├── 004-Dask ├── dask-worker-space │ ├── global.lock │ └── purge.lock └── tutorial-01.py ├── 020-kinesis-single-shard ├── requirements.txt ├── consumer.py └── producer.py ├── 022-kinesis-data-analytics ├── requirements.txt ├── consumer.py └── producer.py ├── 204-Kubernetes-Redis └── redis-custom-config │ ├── redis.conf │ ├── kustomization.yaml │ └── redis-pod.yaml ├── README.md ├── 101-GeoHash ├── images │ ├── lat-lng.jpg │ ├── geohash-size.png │ └── law-of-cosine.svg ├── map.html └── 02 Lat Lng - Addition, Angle.ipynb ├── 200-Kubernetes ├── app │ ├── fasion_model.h5 │ ├── requirements.txt │ └── app.py ├── sample │ ├── sample_1139.jpg │ ├── sample_11518.jpg │ ├── sample_12352.jpg │ ├── sample_12924.jpg │ ├── sample_13180.jpg │ ├── sample_1421.jpg │ ├── sample_14467.jpg │ ├── sample_15419.jpg │ ├── sample_16078.jpg │ ├── sample_16742.jpg │ ├── sample_16975.jpg │ ├── sample_17027.jpg │ ├── sample_17101.jpg │ ├── sample_17764.jpg │ ├── sample_18043.jpg │ ├── sample_18313.jpg │ ├── sample_18788.jpg │ ├── sample_19007.jpg │ ├── sample_19098.jpg │ ├── sample_19177.jpg │ ├── sample_19442.jpg │ ├── sample_19482.jpg │ ├── sample_20091.jpg │ ├── sample_22950.jpg │ ├── sample_23330.jpg │ ├── sample_25021.jpg │ ├── sample_25536.jpg │ ├── sample_25643.jpg │ ├── sample_27647.jpg │ ├── sample_27825.jpg │ ├── sample_27936.jpg │ ├── sample_28251.jpg │ ├── sample_28550.jpg │ ├── sample_29020.jpg │ ├── sample_2919.jpg │ ├── sample_31021.jpg │ ├── sample_3152.jpg │ ├── sample_33108.jpg │ ├── sample_33165.jpg │ ├── sample_33193.jpg │ ├── sample_33271.jpg │ ├── sample_34045.jpg │ ├── sample_3480.jpg │ ├── sample_36605.jpg │ ├── sample_37196.jpg │ ├── sample_3767.jpg │ ├── sample_37793.jpg │ ├── sample_38070.jpg │ ├── sample_38479.jpg │ ├── sample_3880.jpg │ ├── sample_41126.jpg │ ├── sample_41669.jpg │ ├── sample_41819.jpg │ ├── sample_42013.jpg │ ├── sample_42528.jpg │ ├── sample_4256.jpg │ ├── sample_42662.jpg │ ├── sample_43296.jpg │ ├── sample_4354.jpg │ ├── sample_43814.jpg │ ├── 
sample_44314.jpg │ ├── sample_4496.jpg │ ├── sample_45115.jpg │ ├── sample_45910.jpg │ ├── sample_45926.jpg │ ├── sample_46011.jpg │ ├── sample_46449.jpg │ ├── sample_46682.jpg │ ├── sample_47082.jpg │ ├── sample_4712.jpg │ ├── sample_48604.jpg │ ├── sample_49798.jpg │ ├── sample_49844.jpg │ ├── sample_49963.jpg │ ├── sample_50107.jpg │ ├── sample_50488.jpg │ ├── sample_50863.jpg │ ├── sample_51212.jpg │ ├── sample_51335.jpg │ ├── sample_52312.jpg │ ├── sample_52636.jpg │ ├── sample_52970.jpg │ ├── sample_53360.jpg │ ├── sample_53374.jpg │ ├── sample_54723.jpg │ ├── sample_54962.jpg │ ├── sample_55104.jpg │ ├── sample_55535.jpg │ ├── sample_55641.jpg │ ├── sample_56036.jpg │ ├── sample_56392.jpg │ ├── sample_5707.jpg │ ├── sample_58386.jpg │ ├── sample_6135.jpg │ ├── sample_6197.jpg │ ├── sample_6689.jpg │ ├── sample_8744.jpg │ ├── sample_9688.jpg │ ├── sample_9758.jpg │ └── sample_9852.jpg ├── deployment.yaml ├── Dockerfile ├── README.md ├── 01-Kubernetes.ipynb └── 02-Generate-Fashion-MNIST-Sample-Images.ipynb ├── .gitignore ├── 100-PyQT ├── 01-simple-example │ └── main.py ├── 02-widgets │ └── main.py └── 03-QThread │ ├── qthread.py │ ├── signal_with_list.py │ └── signal_with_python_object.py ├── 005-ray ├── script.py ├── submit.py └── 10-ray-serving-tutorial │ └── app.py ├── 002-Pyspark ├── macdonald │ ├── README.md │ └── check_connection.py └── 01 Tutorial.ipynb ├── 202-Kubernetes-deploy-nginx ├── README.md └── deployment.yaml ├── 003-Shared-Memory ├── shared-memory-list.py ├── shared-memory-bytearray.py ├── shared-string.py └── shared_memory_queue.py ├── 203-Kubernetes-Service-MySQL └── deployment.yaml ├── 006-pyarrow ├── test_dataset.py ├── pyarrow_torch.py └── pyarrow-tutorial.ipynb └── 010-Pyspark └── 01 Tutorial.ipynb /004-Dask/dask-worker-space/global.lock: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 
/004-Dask/dask-worker-space/purge.lock: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /020-kinesis-single-shard/requirements.txt: -------------------------------------------------------------------------------- 1 | Faker==4.9.0 -------------------------------------------------------------------------------- /022-kinesis-data-analytics/requirements.txt: -------------------------------------------------------------------------------- 1 | Faker==4.9.0 -------------------------------------------------------------------------------- /204-Kubernetes-Redis/redis-custom-config/redis.conf: -------------------------------------------------------------------------------- 1 | maxmemory 2mb 2 | maxmemory-policy allkeys-lru -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # code-snippet 2 | 3 | 바로 가져가서 사용할 수 있는 코드들을 예제와 함께 제공을 합니다. 4 | 5 | 주로 Python 코드들을 지원하고 있습니다. 
6 | 7 | -------------------------------------------------------------------------------- /101-GeoHash/images/lat-lng.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/101-GeoHash/images/lat-lng.jpg -------------------------------------------------------------------------------- /101-GeoHash/images/geohash-size.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/101-GeoHash/images/geohash-size.png -------------------------------------------------------------------------------- /200-Kubernetes/app/fasion_model.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/app/fasion_model.h5 -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_1139.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_1139.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_11518.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_11518.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_12352.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_12352.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_12924.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_12924.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_13180.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_13180.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_1421.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_1421.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_14467.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_14467.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_15419.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_15419.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_16078.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_16078.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_16742.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_16742.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_16975.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_16975.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_17027.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_17027.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_17101.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_17101.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_17764.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_17764.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_18043.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_18043.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_18313.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_18313.jpg 
-------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_18788.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_18788.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_19007.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_19007.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_19098.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_19098.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_19177.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_19177.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_19442.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_19442.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_19482.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_19482.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_20091.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_20091.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_22950.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_22950.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_23330.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_23330.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_25021.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_25021.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_25536.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_25536.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_25643.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_25643.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_27647.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_27647.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_27825.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_27825.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_27936.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_27936.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_28251.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_28251.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_28550.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_28550.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_29020.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_29020.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_2919.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_2919.jpg 
-------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_31021.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_31021.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_3152.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_3152.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_33108.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_33108.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_33165.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_33165.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_33193.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_33193.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_33271.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_33271.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_34045.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_34045.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_3480.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_3480.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_36605.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_36605.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_37196.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_37196.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_3767.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_3767.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_37793.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_37793.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_38070.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_38070.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_38479.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_38479.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_3880.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_3880.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_41126.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_41126.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_41669.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_41669.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_41819.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_41819.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_42013.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_42013.jpg 
-------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_42528.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_42528.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_4256.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_4256.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_42662.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_42662.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_43296.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_43296.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_4354.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_4354.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_43814.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_43814.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_44314.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_44314.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_4496.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_4496.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_45115.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_45115.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_45910.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_45910.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_45926.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_45926.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_46011.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_46011.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_46449.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_46449.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_46682.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_46682.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_47082.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_47082.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_4712.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_4712.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_48604.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_48604.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_49798.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_49798.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_49844.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_49844.jpg 
-------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_49963.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_49963.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_50107.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_50107.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_50488.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_50488.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_50863.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_50863.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_51212.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_51212.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_51335.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_51335.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_52312.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_52312.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_52636.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_52636.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_52970.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_52970.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_53360.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_53360.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_53374.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_53374.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_54723.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_54723.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_54962.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_54962.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_55104.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_55104.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_55535.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_55535.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_55641.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_55641.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_56036.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_56036.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_56392.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_56392.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_5707.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_5707.jpg 
-------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_58386.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_58386.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_6135.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_6135.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_6197.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_6197.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_6689.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_6689.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_8744.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_8744.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_9688.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_9688.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_9758.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_9758.jpg -------------------------------------------------------------------------------- /200-Kubernetes/sample/sample_9852.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_9852.jpg -------------------------------------------------------------------------------- /200-Kubernetes/app/requirements.txt: -------------------------------------------------------------------------------- 1 | Flask==1.1.2 2 | Keras==2.4.3 3 | h5py==2.10.0 4 | tensorflow-cpu==2.4.0 5 | Pillow==7.2.0 6 | numpy==1.18.5 7 | opencv-python==4.4.0.42 8 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python 2 | *.pyc 3 | 4 | # Project 5 | .idea 6 | .ipynb_checkpoints 7 | 8 | # Useless Files 9 | _SUCCESS 10 | *.crc 11 | 12 | # Data 13 | *.parquet 14 | *.orc 15 | -------------------------------------------------------------------------------- /204-Kubernetes-Redis/redis-custom-config/kustomization.yaml: -------------------------------------------------------------------------------- 1 | configMapGenerator: 2 | - name: example-redis-config 3 | files: 4 | - redis.conf 5 | resources: 6 | - redis-pod.yaml 7 | -------------------------------------------------------------------------------- /100-PyQT/01-simple-example/main.py: -------------------------------------------------------------------------------- 1 | from PyQt5.QtWidgets import QApplication, QLabel 2 | import sys 3 | 4 | app = QApplication(sys.argv) 5 | label = QLabel('Hello World') 6 | label.show() 7 | app.exec() 8 | -------------------------------------------------------------------------------- 
/005-ray/script.py: -------------------------------------------------------------------------------- 1 | import ray 2 | 3 | 4 | @ray.remote 5 | def hello_world(): 6 | print(ray.cluster_resources()) 7 | return "hello world" 8 | 9 | 10 | ray.init() 11 | print(ray.get(hello_world.remote())) 12 | -------------------------------------------------------------------------------- /002-Pyspark/macdonald/README.md: -------------------------------------------------------------------------------- 1 | # 1. Tutorial 2 | 3 | ## 1.1 Preparation 4 | 5 | S3 Bucket 하나 만들고, mcdonalds_dataset.csv 업로드 합니다.
6 | 예제에서의 S3 Bucket 이름은 data-emr-tutorial입니다. 7 | 8 | ```bash 9 | $ aws s3 cp mcdonalds_dataset.csv s3://data-emr-tutorial/data/ 10 | $ aws s3 ls data-emr-tutorial/data/ 11 | ``` 12 | 13 | ## 1.2 Run Script 14 | 15 | -------------------------------------------------------------------------------- /004-Dask/tutorial-01.py: -------------------------------------------------------------------------------- 1 | from dask.distributed import Client, progress 2 | import dask.array as da 3 | 4 | client = Client(processes=False, threads_per_worker=4, n_workers=1, memory_limit='2GB') 5 | x = da.random.random((10000, 10000), chunks=(1000, 1000)) 6 | y = x + x.T 7 | z = y[::2, 5000:].mean(axis=1) 8 | 9 | print(x) 10 | print(z.compute()) 11 | print(x.shape) -------------------------------------------------------------------------------- /202-Kubernetes-deploy-nginx/README.md: -------------------------------------------------------------------------------- 1 | # Tutorial 2 | 3 | ## Deployment 4 | 5 | 먼저 deployment.yaml 파일을 디플로이 시킵니다. 6 | 7 | ```bash 8 | kubectl apply -f deployment.yaml 9 | ``` 10 | 11 | 적용 뒤에 라벨을 확인합니다. 12 | 13 | ```bash 14 | kubectl describe deployments.apps nginx-deployment 15 | kubectl get pods -l app=nginx 16 | ``` 17 | 18 | 이후에 replicas 를 1로 변경해준 다음에 `kubectl apply -f deployment.yaml` 실행해서 업데이트 해줍니다.
19 | pods 을 확인해서 1개가 terminating되고 있는지 확인합니다. 20 | -------------------------------------------------------------------------------- /202-Kubernetes-deploy-nginx/deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 # for versions before 1.9.0 use apps/v1beta2 2 | kind: Deployment 3 | metadata: 4 | name: nginx-deployment 5 | spec: 6 | selector: 7 | matchLabels: 8 | app: nginx 9 | replicas: 2 # tells deployment to run 2 pods matching the template 10 | template: 11 | metadata: 12 | labels: 13 | app: nginx 14 | spec: 15 | containers: 16 | - name: nginx 17 | image: nginx:latest 18 | ports: 19 | - containerPort: 80 -------------------------------------------------------------------------------- /002-Pyspark/macdonald/check_connection.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from random import random 3 | from operator import add 4 | 5 | from pyspark.sql import SparkSession 6 | 7 | spark = SparkSession \ 8 | .builder \ 9 | .appName("PythonPi") \ 10 | .getOrCreate() 11 | 12 | partitions = int(sys.argv[1]) if len(sys.argv) > 1 else 2 13 | n = 100000 * partitions 14 | 15 | 16 | def f(_: int) -> float: 17 | x = random() * 2 - 1 18 | y = random() * 2 - 1 19 | return 1 if x ** 2 + y ** 2 <= 1 else 0 20 | 21 | 22 | count = spark.sparkContext.parallelize(range(1, n + 1), partitions).map(f).reduce(add) 23 | print("Pi is roughly %f" % (4.0 * count / n)) 24 | 25 | spark.stop() 26 | -------------------------------------------------------------------------------- /022-kinesis-data-analytics/consumer.py: -------------------------------------------------------------------------------- 1 | from boto import kinesis as boto_kinesis 2 | 3 | 4 | def main(): 5 | kinesis = boto_kinesis.connect_to_region('us-east-2') 6 | 7 | shard_id = 'shardId-000000000000' # Shard는 1개ch만 갖고 있음 8 | shard_it = kinesis.get_shard_iterator('AndersonStream', shard_id, 
'LATEST')['ShardIterator'] 9 | print('Latest Shard Iterator:', shard_it) 10 | 11 | while True: 12 | _out = kinesis.get_records(shard_it, limit=10) 13 | records = _out['Records'] 14 | 15 | for r in records: 16 | print(r['Data']) 17 | 18 | shard_it = _out['NextShardIterator'] 19 | if not records: 20 | break 21 | 22 | 23 | if __name__ == '__main__': 24 | main() 25 | -------------------------------------------------------------------------------- /020-kinesis-single-shard/consumer.py: -------------------------------------------------------------------------------- 1 | import json 2 | from pprint import pprint 3 | 4 | from boto import kinesis as boto_kinesis 5 | 6 | 7 | def main(): 8 | kinesis = boto_kinesis.connect_to_region('us-east-2') 9 | 10 | shard_id = 'shardId-000000000003' # Shard는 1개ch만 갖고 있음 11 | shard_it = kinesis.get_shard_iterator('AndersonStream', shard_id, 'LATEST')['ShardIterator'] 12 | print('Latest Shard Iterator:', shard_it) 13 | 14 | while True: 15 | _out = kinesis.get_records(shard_it, limit=10) 16 | records = _out['Records'] 17 | 18 | for r in records: 19 | print(r['Data']) 20 | 21 | shard_it = _out['NextShardIterator'] 22 | if not records: 23 | break 24 | 25 | 26 | if __name__ == '__main__': 27 | main() 28 | -------------------------------------------------------------------------------- /200-Kubernetes/deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: myapp 5 | spec: 6 | type: NodePort 7 | ports: 8 | - protocol: TCP 9 | port: 80 10 | targetPort: 5000 11 | selector: 12 | app: myapp 13 | --- 14 | apiVersion: apps/v1 # for versions before 1.9.0 use apps/v1beta2 15 | kind: Deployment 16 | metadata: 17 | name: myapp-deployment 18 | spec: 19 | selector: 20 | matchLabels: 21 | app: myapp 22 | replicas: 1 # tells deployment to run 1 pod matching the template 23 | template: 24 | metadata: 25 | labels: 26 | app: myapp 27 | spec: 28 | containers: 29
| - name: myapp 30 | image: myapp:latest 31 | imagePullPolicy: Never 32 | ports: 33 | - containerPort: 5000 -------------------------------------------------------------------------------- /200-Kubernetes/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:20.04 2 | MAINTAINER Anderson "a141890@gmail.com" 3 | 4 | ARG DEBIAN_FRONTEND=noninteractive 5 | ENV TZ=Asia/Seoul 6 | RUN apt-get update -y && \ 7 | apt-get install -y python3-pip python3-dev libgl1-mesa-dev libgl1-mesa-glx libglib2.0-0 \ 8 | build-essential cmake git pkg-config libgtk-3-dev \ 9 | libavcodec-dev libavformat-dev libswscale-dev libv4l-dev \ 10 | libxvidcore-dev libx264-dev libjpeg-dev libpng-dev libtiff-dev \ 11 | gfortran openexr libatlas-base-dev python3-dev python3-numpy \ 12 | libtbb2 libtbb-dev libdc1394-22-dev 13 | 14 | COPY ./app /app 15 | WORKDIR /app 16 | RUN pip3 install -r requirements.txt 17 | 18 | ENTRYPOINT [ "python3"] 19 | CMD [ "app.py" ] -------------------------------------------------------------------------------- /204-Kubernetes-Redis/redis-custom-config/redis-pod.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: redis 5 | spec: 6 | containers: 7 | - name: redis 8 | image: redis:5.0.4 9 | command: 10 | - redis-server 11 | - "/redis-master/redis.conf" 12 | env: 13 | - name: MASTER 14 | value: "true" 15 | ports: 16 | - containerPort: 6379 17 | resources: 18 | limits: 19 | cpu: "0.1" 20 | volumeMounts: 21 | - mountPath: /redis-master-data 22 | name: data 23 | - mountPath: /redis-master 24 | name: config 25 | volumes: 26 | - name: data 27 | emptyDir: {} 28 | - name: config 29 | configMap: 30 | name: example-redis-config 31 | items: 32 | - key: redis-config 33 | path: redis.conf 34 | -------------------------------------------------------------------------------- /005-ray/submit.py: 
-------------------------------------------------------------------------------- 1 | from ray.job_submission import JobSubmissionClient, JobStatus 2 | import time 3 | 4 | client = JobSubmissionClient("http://localhost:8265") 5 | job_id = client.submit_job( 6 | entrypoint="python script.py", 7 | runtime_env={ 8 | 'working_dir': './' # 이게 있어야지 script.py 파일이 클러스터에 업로드 / 내부적으로 _upload_working_dir_if_needed 함수 호출 9 | } 10 | ) 11 | print(job_id) 12 | 13 | 14 | def wait_until_status(job_id, status_to_wait_for, timeout_seconds=5): 15 | start = time.time() 16 | while time.time() - start <= timeout_seconds: 17 | status = client.get_job_status(job_id) 18 | print(f"status: {status}") 19 | if status in status_to_wait_for: 20 | break 21 | time.sleep(1) 22 | 23 | 24 | wait_until_status(job_id, {JobStatus.SUCCEEDED, JobStatus.STOPPED, JobStatus.FAILED}) 25 | logs = client.get_job_logs(job_id) 26 | print(logs) 27 | -------------------------------------------------------------------------------- /003-Shared-Memory/shared-memory-list.py: -------------------------------------------------------------------------------- 1 | from multiprocessing.shared_memory import ShareableList 2 | import numpy as np 3 | from multiprocessing import Process 4 | 5 | 6 | def daemon_run(name): 7 | a = ShareableList(name=name) 8 | data = np.array(a) 9 | print(f'[Processor] data: {data[:4]} | size: {len(data)}') # [Processor] data: [255 11 0 100] | size: 4096 10 | for i, v in enumerate(['def', -9999999999, 0.123456789123456, 8889999]): 11 | a[i] = v 12 | 13 | 14 | def main(): 15 | # Shared Memory 생성 16 | a = ShareableList(['abc', 9999999, -100, 0.123456789]) 17 | 18 | p = Process(target=daemon_run, args=(a.shm.name,)) # 프로세서를 열고, shared memory 를 읽어서 출력한다. 
19 | p.start() 20 | p.join() 21 | 22 | data = np.array(a) 23 | print(f'[Main] data: {data[:4]} | size: {len(a)}') # [Main] data: [1 2 3 4] | size: 10 24 | 25 | 26 | if __name__ == '__main__': 27 | main() 28 | -------------------------------------------------------------------------------- /200-Kubernetes/app/app.py: -------------------------------------------------------------------------------- 1 | from tempfile import gettempdir 2 | 3 | import cv2 4 | import numpy as np 5 | from flask import Flask, request, jsonify 6 | from keras.models import load_model 7 | 8 | app = Flask(__name__) 9 | model = load_model('fasion_model.h5') 10 | 11 | 12 | @app.route('/') 13 | def hello_world(): 14 | return 'Hello! Anderson!' 15 | 16 | 17 | @app.route('/predict', methods=['POST']) 18 | def predict(): 19 | tmp_dir = gettempdir() 20 | f = request.files["image"] 21 | f.save(tmp_dir + '/img.jpg', cv2.IMREAD_COLOR) 22 | img = cv2.imread(tmp_dir + '/img.jpg', cv2.IMREAD_COLOR)[:, :, 0] 23 | img = np.expand_dims(img, axis=0) 24 | pred_y = model.predict(img)[0] 25 | pred_label = int(np.argmax(pred_y)) 26 | prob = float(pred_y[pred_label]) 27 | return jsonify({'prediction': pred_label, 'prob': prob}) 28 | 29 | 30 | if __name__ == '__main__': 31 | app.run(debug=True, host='0.0.0.0') 32 | -------------------------------------------------------------------------------- /100-PyQT/02-widgets/main.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from typing import Optional 3 | 4 | from PyQt5.QtCore import QObject 5 | from PyQt5.QtWidgets import * 6 | 7 | 8 | class WidgetGallery(QDialog): 9 | 10 | def __init__(self, parent: Optional[QObject] = None): 11 | super(WidgetGallery, self).__init__(parent) 12 | self.originalPalette = QApplication.palette() 13 | styleComboBox = QComboBox() 14 | styleComboBox.addItems(QStyleFactory.keys()) 15 | 16 | styleLabel = QLabel("&Style:") 17 | styleLabel.setBuddy(styleComboBox) 18 | 19 | topLayout = 
QHBoxLayout() 20 | topLayout.addWidget(styleLabel) 21 | 22 | mainLayout = QGridLayout() 23 | mainLayout.addLayout(topLayout, 0, 0, 1, 2) 24 | self.setLayout(mainLayout) 25 | 26 | 27 | if __name__ == '__main__': 28 | app = QApplication(sys.argv) 29 | gallery = WidgetGallery() 30 | gallery.show() 31 | sys.exit(app.exec()) 32 | -------------------------------------------------------------------------------- /003-Shared-Memory/shared-memory-bytearray.py: -------------------------------------------------------------------------------- 1 | from multiprocessing.shared_memory import SharedMemory 2 | import numpy as np 3 | from multiprocessing import Process 4 | 5 | 6 | def daemon_run(name): 7 | shm = SharedMemory(name=name) 8 | data = np.array(shm.buf) 9 | print(f'[Processor] data: {data[:4]} | size: {len(data)}') # [Processor] data: [255 11 0 100] | size: 4096 10 | shm.buf[:4] = bytearray([1, 2, 3, 4]) 11 | 12 | 13 | def main(): 14 | # Shared Memory 생성 15 | shm = SharedMemory(create=True, size=1024*1024*8) 16 | shm.buf[:4] = bytearray([255, 11, 0, 100]) # 값은 [0~256) 사이의 값만 가능 17 | 18 | p = Process(target=daemon_run, args=(shm.name,)) # 프로세서를 열고, shared memory 를 읽어서 출력한다. 
19 | p.start() 20 | p.join() 21 | 22 | data = np.array(shm.buf) 23 | print(f'[Main] data: {data[:4]} | size: {len(shm.buf)}') # [Main] data: [1 2 3 4] | size: 10 24 | 25 | import ipdb 26 | ipdb.set_trace() 27 | 28 | 29 | if __name__ == '__main__': 30 | main() 31 | -------------------------------------------------------------------------------- /203-Kubernetes-Service-MySQL/deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: mysql 5 | spec: 6 | ports: 7 | - port: 3306 8 | selector: 9 | app: mysql 10 | clusterIP: None 11 | --- 12 | apiVersion: apps/v1 # for versions before 1.9.0 use apps/v1beta2 13 | kind: Deployment 14 | metadata: 15 | name: mysql 16 | spec: 17 | selector: 18 | matchLabels: 19 | app: mysql 20 | strategy: 21 | type: Recreate 22 | template: 23 | metadata: 24 | labels: 25 | app: mysql 26 | spec: 27 | containers: 28 | - image: mysql:5.6 29 | name: mysql 30 | env: 31 | # Use secret in real usage 32 | - name: MYSQL_ROOT_PASSWORD 33 | value: 1234 34 | ports: 35 | - containerPort: 3306 36 | name: mysql 37 | volumeMounts: 38 | - name: mysql-persistent-storage 39 | mountPath: /var/lib/mysql 40 | volumes: 41 | - name: mysql-persistent-storage 42 | persistentVolumeClaim: 43 | claimName: mysql-pv-claim -------------------------------------------------------------------------------- /022-kinesis-data-analytics/producer.py: -------------------------------------------------------------------------------- 1 | import json 2 | import random 3 | from time import sleep 4 | 5 | from boto import kinesis as boto_kinesis 6 | from faker import Faker 7 | 8 | 9 | def generate_data(faker): 10 | return {'name': faker.name(), 11 | 'age': random.randint(10, 20), 12 | 'gender': random.choice(['M', 'F']), 13 | 'score': random.choice(range(40, 70, 5)), 14 | 'job': faker.job()} 15 | 16 | 17 | def main(): 18 | faker = Faker() 19 | kinesis = 
boto_kinesis.connect_to_region('us-east-2') 20 | print('Connected') 21 | 22 | if 'AndersonStream' not in kinesis.list_streams()['StreamNames']: 23 | kinesis.create_stream('AndersonStream', 1) 24 | print('AndersonStream Stream has been created') 25 | 26 | while True: 27 | sleep(1) 28 | print(kinesis.list_streams()) 29 | if 'AndersonStream' in kinesis.list_streams()['StreamNames']: 30 | kinesis = boto_kinesis.connect_to_region('us-east-2') 31 | break 32 | 33 | for _ in range(50): 34 | data = generate_data(faker) 35 | res = kinesis.put_record('AndersonStream', json.dumps(data), 'partitionkey' + str(random.choice([0, 1]))) 36 | print('PUT', data) 37 | print(' ', res['SequenceNumber'], '\n') 38 | 39 | # kinesis.delete_stream('AndersonStream') 40 | 41 | 42 | if __name__ == '__main__': 43 | main() 44 | -------------------------------------------------------------------------------- /101-GeoHash/images/law-of-cosine.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | A 7 | C 8 | a 9 | b 10 | c 11 | 12 | 13 | 14 | 15 | 16 | 17 | B 18 | 19 | -------------------------------------------------------------------------------- /003-Shared-Memory/shared-string.py: -------------------------------------------------------------------------------- 1 | from multiprocessing.shared_memory import SharedMemory 2 | import numpy as np 3 | from multiprocessing import Process 4 | 5 | 6 | def daemon_run(in_name, out_name): 7 | in_shm = SharedMemory(name=in_name) 8 | out_shm = SharedMemory(name=out_name) 9 | xb = in_shm.buf[:1024].tobytes() 10 | text = xb.decode('euc-kr').strip() 11 | print(f'[Processor] data: {text} | byte: {len(xb)}') # [Processor] data: [255 11 0 100] | size: 4096 12 | assert text == '한글1234 ABC %^&' 13 | 14 | xb = '프로세서에서 리턴된 스트링 1234!'.encode('euc-kr') 15 | out_shm.buf[:len(xb)] = xb 16 | 17 | 18 | def main(): 19 | # Shared Memory 생성 20 | shm1 = SharedMemory(create=True, size=1024 * 1024 * 64) 21 | shm2 = 
SharedMemory(create=True, size=1024 * 1024 * 64) 22 | 23 | shm1.buf[:] = (' ' * len(shm1.buf)).encode('euc-kr') 24 | shm2.buf[:] = (' ' * len(shm2.buf)).encode('euc-kr') 25 | 26 | x = '한글1234 ABC %^&' 27 | xb = x.encode('euc-kr') 28 | shm1.buf[:len(xb)] = xb # 값은 [0~256) 사이의 값만 가능 29 | p = Process(target=daemon_run, args=(shm1.name, shm2.name)) # 프로세서를 열고, shared memory 를 읽어서 출력한다. 30 | p.start() 31 | 32 | ob = shm2.buf[:1024].tobytes() 33 | text = ob.decode('euc-kr').strip() 34 | print(f'[Main ] data: {text} | byte: {len(ob)}') # [Main] data: [1 2 3 4] | size: 10 35 | 36 | assert text == '프로세서에서 리턴된 스트링 1234!' 37 | 38 | shm1.close() 39 | shm2.close() 40 | p.join() 41 | 42 | 43 | if __name__ == '__main__': 44 | main() 45 | -------------------------------------------------------------------------------- /020-kinesis-single-shard/producer.py: -------------------------------------------------------------------------------- 1 | import json 2 | import random 3 | from time import sleep 4 | 5 | from boto import kinesis as boto_kinesis 6 | from faker import Faker 7 | 8 | 9 | def generate_data(faker): 10 | return {'name': faker.name(), 11 | 'age': random.randint(10, 20), 12 | 'gender': random.choice(['M', 'F']), 13 | 'score': random.choice(range(40, 70, 5)), 14 | 'data': {'id': random.randint(0, 10000), 15 | 'type': random.choice(['a', 'b', 'c'])}} 16 | 17 | 18 | def main(): 19 | faker = Faker() 20 | kinesis = boto_kinesis.connect_to_region('us-east-2') 21 | print('Connected') 22 | 23 | if 'AndersonStream' not in kinesis.list_streams()['StreamNames']: 24 | kinesis.create_stream('AndersonStream', 1) 25 | print('AndersonStream Stream has been created') 26 | 27 | while True: 28 | sleep(1) 29 | print(kinesis.list_streams()) 30 | if 'AndersonStream' in kinesis.list_streams()['StreamNames']: 31 | kinesis = boto_kinesis.connect_to_region('us-east-2') 32 | break 33 | i = 0 34 | while True: 35 | i += 1 36 | data = generate_data(faker) 37 | data['i'] = i 38 | res = 
kinesis.put_record('AndersonStream', json.dumps(data), 'partitionkey' + str(random.randint(0, 10))) 39 | print(f'{i:2}', data) 40 | print(' ', res, '\n') 41 | 42 | # kinesis.delete_stream('AndersonStream') 43 | 44 | 45 | if __name__ == '__main__': 46 | main() 47 | -------------------------------------------------------------------------------- /200-Kubernetes/README.md: -------------------------------------------------------------------------------- 1 | # Tutorial 2 | 3 | ## Preparation 4 | 5 | 1. 먼저 [Kaggle Fashion MNIST](https://www.kaggle.com/zalando-research/fashionmnist) 에서 데이터를 다운로드 받습니다. 6 | 7 | 8 | ## Postman 설정 9 | 10 | - POST 설정 11 | - body -> form-data 12 | - key: image 13 | - key 에서 파일로 변경 14 | - value: 파일 업로드 15 | 16 | ## Docker 확인 17 | 18 | ```bash 19 | docker build -t myapp . 20 | docker run -p 5000:5000 myapp 21 | ``` 22 | 23 | Daemon으로도 실행 24 | 25 | ```bash 26 | docker run -d -p 5000:5000 --name myapp myapp:latest 27 | ``` 28 | 29 | Postman에서 확인 합니다. 30 | 31 | ## Docker Hub로 올리기 32 | 33 | 태그를 걸어주고 Docker Hub에 올립니다. 34 | 35 | ```bash 36 | docker tag myapp andersonjo/myapp 37 | docker push andersonjo/myapp 38 | ``` 39 | 40 | ## Docker Hub에서 Pull 안하고 Minikube Image 사용하는 방법 41 | 42 | ```bash 43 | eval $(minikube docker-env) 44 | ``` 45 | 46 | 그 다음 build 를 해줍니다. 47 | 48 | ```bash 49 | docker build -t myapp . 50 | ``` 51 | 52 | Minikube안에서 실행을 합니다. 53 | 54 | ```bash 55 | kubectl run myapp-kube --image=myapp:latest --image-pull-policy=Never 56 | ``` 57 | 58 | pods을 확인 하고, Postman에서도 확인합니다. 59 | 60 | ```bash 61 | kubectl get pods 62 | kubectl port-forward myapp-kube 5000:5000 63 | ``` 64 | 65 | ## stateful 로 해보기 66 | 67 | 아래의 두 개의 명령이 먼저 실행되어 있어야 합니다. 68 | 69 | ```bash 70 | eval $(minikube docker-env) 71 | docker build -t myapp . 72 | ``` 73 | 74 | 배포합니다. 75 | 76 | ```bash 77 | kubectl apply -f deployment.yaml 78 | 79 | kubectl port-forward svc/myapp 5000:80 80 | ``` 81 | 82 | 최종적으로 Minikube로 서비스 포트를 열 수 있습니다.
83 | 84 | ```bash 85 | minikube service myapp 86 | ``` -------------------------------------------------------------------------------- /005-ray/10-ray-serving-tutorial/app.py: -------------------------------------------------------------------------------- 1 | from ray import serve 2 | from starlette.requests import Request 3 | from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer 4 | 5 | 6 | @serve.deployment(num_replicas=2, ray_actor_options={"num_cpus": 1, "num_gpus": 0}) 7 | class Translator: 8 | def __init__(self): 9 | # Load model 10 | self.tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_1.2B") 11 | self.tokenizer.src_lang = 'en' 12 | 13 | self.model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_1.2B") 14 | self.model.eval() 15 | 16 | def translate(self, text: str) -> str: 17 | dest_lang_id = self.tokenizer.get_lang_id('ko') 18 | encoded_src = self.tokenizer(text, return_tensors="pt") 19 | generated_tokens = self.model.generate(**encoded_src, 20 | forced_bos_token_id=dest_lang_id, 21 | max_length=200, 22 | use_cache=True) 23 | result = self.tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0] 24 | return result 25 | 26 | async def __call__(self, http_request: Request) -> str: 27 | korean_text: str = await http_request.json() 28 | return self.translate(korean_text) 29 | 30 | 31 | translator = Translator.bind() 32 | 33 | # if __name__ == '__main__': 34 | # translator = Translator() 35 | # print(translator.translate('self-belief and hard work will always earn you success')) 36 | -------------------------------------------------------------------------------- /100-PyQT/03-QThread/qthread.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from datetime import datetime 3 | from multiprocessing import Process, Queue 4 | 5 | import numpy as np 6 | from PyQt5.QtCore import QThread, pyqtSignal, pyqtSlot, QCoreApplication 7 | from PyQt5.QtWidgets 
class Consumer(QThread):
    """Worker thread that forwards items from a multiprocessing queue as Qt signals.

    The thread can be stopped cleanly with ``requestInterruption()`` followed
    by ``wait()``; the previous implementation blocked forever in ``get()``.
    """

    # Emitted once per item taken from the queue; the payload is the item itself.
    poped = pyqtSignal(str)

    def __init__(self, que: Queue):
        super().__init__()
        self.que = que

    def run(self):
        """Pump the queue until an interruption is requested."""
        # Local import: multiprocessing.Queue.get raises queue.Empty on timeout.
        from queue import Empty

        while not self.isInterruptionRequested():
            try:
                # A short timeout keeps the loop responsive to interruption
                # requests instead of blocking indefinitely on an empty queue.
                data = self.que.get(timeout=0.1)
            except Empty:
                continue
            self.poped.emit(data)
class Consumer(QThread):
    """Worker thread that forwards queue items to the GUI as ``list`` payloads.

    The thread can be stopped cleanly with ``requestInterruption()`` followed
    by ``wait()``; the previous implementation blocked forever in ``get()``.
    """

    # The signal is declared with ``list`` so a Python list can be emitted.
    # NOTE(review): the original comment here said 'PyQt_PyObject' must be
    # used, which does not match this declaration -- it looks copied from the
    # PyQt_PyObject variant of this script.
    poped = pyqtSignal(list)

    def __init__(self, que: Queue):
        super().__init__()
        self.que = que

    def run(self):
        """Pump the queue until an interruption is requested."""
        # Local import: multiprocessing.Queue.get raises queue.Empty on timeout.
        from queue import Empty

        while not self.isInterruptionRequested():
            try:
                # A short timeout keeps the loop responsive to interruption
                # requests instead of blocking indefinitely on an empty queue.
                data = self.que.get(timeout=0.1)
            except Empty:
                continue
            # Each item is wrapped in a two-element list: [1, item].
            self.poped.emit([1, data])
class Consumer(QThread):
    """Worker thread that forwards queue items to the GUI as Python objects.

    The thread can be stopped cleanly with ``requestInterruption()`` followed
    by ``wait()``; the previous implementation blocked forever in ``get()``.
    """

    # This is the important part: the signal must be declared with
    # 'PyQt_PyObject' so that an arbitrary Python object can be emitted.
    poped = pyqtSignal('PyQt_PyObject')

    def __init__(self, que: Queue):
        super().__init__()
        self.que = que

    def run(self):
        """Pump the queue until an interruption is requested."""
        # Local import: multiprocessing.Queue.get raises queue.Empty on timeout.
        from queue import Empty

        class Data:
            """Minimal wrapper object used as the signal payload."""

            def __init__(self, value):
                self.value = value

        while not self.isInterruptionRequested():
            try:
                # A short timeout keeps the loop responsive to interruption
                # requests instead of blocking indefinitely on an empty queue.
                data = self.que.get(timeout=0.1)
            except Empty:
                continue
            data_object = Data(data)
            self.poped.emit(data_object)
def test_dataloader_random():
    """Shuffled multi-worker loading must yield only distinct values.

    The loader is iterated twice; on each pass the number of distinct values
    seen must equal ``len(loader) * batch_size``.
    """
    batch_size = 10
    loader = DataLoader(
        CustomDataset("./data"),
        batch_size=batch_size,
        shuffle=True,
        num_workers=2,
        pin_memory=True,
    )

    for _ in range(2):
        seen = set()
        for batch in loader:
            seen.update(batch.tolist())
        assert len(seen) == len(loader) * batch_size
class SharedQueue:
    """Queue that moves payload bytes through shared memory.

    Only ``(start, end)`` offsets travel through the wrapped *queue*; the
    bytes themselves are written into a ``SharedMemory`` segment used as a
    ring buffer.

    WARNING: ``put`` wraps back to offset 0 without checking how far the
    consumer has read, so a buffer that is too small for the amount of
    in-flight data lets unread payloads be overwritten.
    """

    def __init__(self, queue: Queue, sq_name=None, shared_size=1024 * 1024 * 8):
        """Create a new segment, or attach to an existing one.

        Args:
            queue: Queue used to pass ``(start, end)`` offsets.
            sq_name: Name of an existing segment to attach to; ``None``
                creates a new segment of *shared_size* bytes.
            shared_size: Size in bytes of the segment to create. Ignored when
                attaching: the real size of the attached segment is used.
        """
        self.queue = queue
        if sq_name is None:
            self.shm = SharedMemory(create=True, size=shared_size)
        else:
            self.shm = SharedMemory(name=sq_name)
            # The attached segment may not have the default size; trust the
            # segment itself rather than the (possibly unrelated) argument.
            shared_size = self.shm.size
        self._shared_size = shared_size - 1
        self._cur_idx = 0

    @property
    def name(self):
        """Name of the underlying segment, to hand to another process."""
        return self.shm.name

    def get(self, block: bool = True, timeout: Optional[float] = None, encoding='euc-kr') -> Optional[bytes]:
        """Return the next payload as ``bytes``, or ``None`` for a ``None`` item.

        *encoding* is accepted for backward compatibility but is not used;
        the raw bytes are returned undecoded.
        """
        r = self.queue.get(block=block, timeout=timeout)
        if r is None:
            return None

        start, end = r
        # Copy out of the shared buffer so the result survives later overwrites.
        return self.shm.buf[start:end].tobytes()

    def put(self, obj: bytes, block: bool = True, timeout: Optional[float] = None):
        """Write *obj* into the ring buffer and enqueue its offsets.

        Raises:
            ValueError: if *obj* cannot fit into the buffer at all.
        """
        size = len(obj)
        if size >= self._shared_size:
            raise ValueError(
                f'payload of {size} bytes does not fit into a shared buffer '
                f'of {self._shared_size} usable bytes'
            )

        start = self._cur_idx
        end = start + size
        if end >= self._shared_size:
            # Not enough room at the tail: wrap around to the beginning.
            start, end = 0, size
        self._cur_idx = end

        self.shm.buf[start:end] = obj
        self.queue.put((start, end), block=block, timeout=timeout)

    def close(self):
        """Detach this process from the shared-memory segment."""
        self.shm.close()

    def unlink(self):
        """Destroy the segment; call once, from the process that created it."""
        self.shm.unlink()
def main():
    """Benchmark ``SharedQueue`` against a plain ``multiprocessing.Queue``.

    Each benchmark sends one million formatted messages plus an ``'end'``
    sentinel to a consumer process and prints the elapsed seconds.
    """
    s = '{0}안녕하세요 파이썬! 123456789 !@#$%^&*() 하하! 제타벨류 가즈아!'

    # SharedQueue benchmark
    queue = Queue()
    # A small buffer causes errors: SharedQueue.put wraps around without
    # waiting for the consumer, so unread payloads get overwritten.
    sq = SharedQueue(queue, shared_size=1024 * 1024 * 128)
    try:
        p = Process(target=daemon_run1, args=(queue, sq.name))
        p.start()

        start = datetime.now()
        for i in tqdm(range(1000000)):
            sq.put(bytes(s.format(i), encoding='euc-kr'), block=False)
        sq.put('end'.encode('euc-kr'), block=True)
        p.join()
        elapsed = (datetime.now() - start).total_seconds()
    finally:
        # Release the segment even if the benchmark fails; otherwise the
        # 128 MiB shared-memory block leaks until the resource tracker
        # (or a reboot) reclaims it.
        sq.shm.close()
        sq.shm.unlink()

    print('SharedQueue:', elapsed)

    # Plain Queue benchmark
    queue = Queue()
    p = Process(target=daemon_run2, args=(queue,))
    p.start()

    start = datetime.now()
    for i in tqdm(range(1000000)):
        queue.put(s.format(i), block=False)
    queue.put('end', block=True)
    p.join()

    print('Queue:', (datetime.now() - start).total_seconds())
class PyArrowDataset(Dataset):
    """Map-style torch ``Dataset`` over the row groups of a Parquet dataset.

    Every row group of every fragment gets a contiguous range of global row
    indices. ``__getitem__`` loads the row group containing the requested
    index into a pandas DataFrame and keeps it cached, so consecutive reads
    from the same row group do not touch the file again.
    """

    def __init__(self, source: str, shuffle: bool = False, seed: int = 123):
        """Open *source* and index its row groups.

        Args:
            source: Path handed to ``ParquetDataset``.
            shuffle: Shuffle the fragment order before indexing.
            seed: Seed for the global ``random`` module (used by the shuffle).
        """
        # NOTE: this seeds the process-wide RNG, as the original code did.
        random.seed(seed)
        self.source = source
        self.seed = seed

        # Pyarrow
        self.dataset = ParquetDataset(source, use_legacy_dataset=False)
        # One entry per row group:
        # (row group position in its fragment, start_idx (inclusive),
        #  end_idx (exclusive), ParquetFile, row_group.id, row_group.num_rows)
        self.parquet_indices: List[Tuple[int, int, int, ParquetFile, int, int]] = []
        self._cur_meta = None
        # Position in ``parquet_indices`` of the cached row group.
        self._cur_meta_idx: Optional[int] = None
        self._df: Optional[pd.DataFrame] = None

        # Debug (Memory Profiling): number of row groups read from disk.
        self.read_cnt = 0

        self.init_parquet_indexing(shuffle)

    def init_parquet_indexing(self, shuffle: bool = False):
        """(Re)build ``parquet_indices`` and drop any cached row group."""
        fragments = self.dataset.fragments
        if shuffle:
            random.shuffle(fragments)

        idx = 0
        parquet_indices = []
        for frag in fragments:
            parquet_file = ParquetFile(frag.path)
            for i, row_group in enumerate(frag.row_groups):
                start_idx = idx  # inclusive
                end_idx = idx + row_group.num_rows  # exclusive
                parquet_indices.append((i, start_idx, end_idx, parquet_file, row_group.id, row_group.num_rows))
                idx = end_idx

        self.parquet_indices = parquet_indices
        # Global indices may now map to different row groups: invalidate the cache.
        self._cur_meta = None
        self._cur_meta_idx = None
        self._df = None

    def __len__(self):
        """Total number of rows across all row groups."""
        if not self.parquet_indices:
            return 0
        # The exclusive end index of the last row group is the total row
        # count (the last tuple element is only that group's own size).
        return self.parquet_indices[-1][2]

    def __getitem__(self, idx: int):
        """Return row *idx* (negative indices count from the end).

        Raises:
            IndexError: if *idx* is outside ``[-len(self), len(self))``.
        """
        total = len(self)
        if idx < 0:
            idx += total
        if not 0 <= idx < total:
            raise IndexError(f'index out of range for dataset of {total} rows')

        meta_idx = self._binary_search(idx)
        # Compare *global* row-group positions: the first tuple element
        # restarts at 0 for every fragment and cannot identify a row group.
        if self._df is None or meta_idx != self._cur_meta_idx:
            self._load_row_group(meta_idx)

        return self._df.iloc[idx - self._cur_meta[1]]

    def _load_row_group(self, meta_idx: int):
        """Read row group *meta_idx* from disk and make it the cached one."""
        # Drop the previous frame first so both are never alive at once.
        # Setting None (not ``del``) keeps the attributes defined if the
        # read below raises.
        self._df = None
        self._cur_meta = None
        self._cur_meta_idx = None

        meta = self.parquet_indices[meta_idx]
        parquet_file, row_id = meta[3], meta[4]
        self.read_cnt += 1
        table = parquet_file.read_row_group(row_id)

        self._df = table.to_pandas()
        self._cur_meta = meta
        self._cur_meta_idx = meta_idx

    def _binary_search(self, target: int):
        """Return the position in ``parquet_indices`` whose range holds *target*.

        Raises:
            IndexError: if no row group covers *target*.
        """
        arr = self.parquet_indices
        left, right = 0, len(arr) - 1

        while left <= right:
            mid = (left + right) // 2
            start_idx, end_idx = arr[mid][1], arr[mid][2]
            if target < start_idx:
                right = mid - 1
            elif target >= end_idx:
                left = mid + 1
            else:
                return mid
        raise IndexError(f'row index {target} is not covered by any row group')
32 | 33 | -------------------------------------------------------------------------------- /200-Kubernetes/01-Kubernetes.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Install Minikube\n", 8 | "\n", 9 | "\n", 10 | "```\n", 11 | "curl -Lo minikube https://storage.googleapis.com/minikube/releases/latest/minikube-linux-amd64 \\\n", 12 | " && chmod +x minikube\n", 13 | " \n", 14 | "sudo install minikube /usr/local/bin/\n", 15 | "```\n", 16 | "\n", 17 | "기본적인 명령어는 다음과 같습니다.\n", 18 | "\n", 19 | " - `minikube start`\n", 20 | " - `minikube status`\n", 21 | " - `minikube stop`\n", 22 | "\n" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "# Kubectl 설치\n", 30 | "\n", 31 | "```\n", 32 | "curl -LO \"https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/linux/amd64/kubectl\"\n", 33 | "\n", 34 | "chmod +x ./kubectl\n", 35 | "sudo mv ./kubectl /usr/local/bin/kubectl\n", 36 | "\n", 37 | "kubectl version --client\n", 38 | "```\n", 39 | "\n", 40 | "Snap이 된다면 다음과 같이 쉽게 설치도 가능합니다.\n", 41 | "\n", 42 | "```\n", 43 | "sudo snap install kubectl --classic\n", 44 | "```\n" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "# Getting Started\n", 52 | "\n", 53 | "```bash\n", 54 | "$ kubectl cluster-info\n", 55 | "Kubernetes master is running at https://172.17.0.3:8443\n", 56 | "KubeDNS is running at https://172.17.0.3:8443/api/v1/namespaces/kube-system/services/kube-dns:dns/proxy\n", 57 | "\n", 58 | "To further debug and diagnose cluster problems, use 'kubectl cluster-info dump'.\n", 59 | "\n", 60 | "```\n", 61 | " - **Kubernetes master**: master\n", 62 | " - **KubeDNS**: DNS\n", 63 | " - **kubernetes-dashboard**: dashboard - UI에서 applications을 확인 가능" 64 | ] 65 | }, 66 | { 67 | 
"cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "# Hello Minikube\n", 71 | "\n", 72 | "```\n", 73 | "$ kubectl create deployment nginx-hello-world --image=nginxdemos/hello\n", 74 | "$ kubectl get deployments\n", 75 | "NAME READY UP-TO-DATE AVAILABLE AGE\n", 76 | "hello-node 1/1 1 1 60s\n", 77 | "\n", 78 | "$ kubectl get pods\n", 79 | "NAME READY STATUS RESTARTS AGE\n", 80 | "hello-node-7bf657c596-glpfj 1/1 Running 0 2m\n", 81 | "\n", 82 | "```\n", 83 | "\n", 84 | "로그 확인은 다음과 같이 합니다.\n", 85 | "\n", 86 | "```\n", 87 | "kubectl get events\n", 88 | "```\n", 89 | "\n", 90 | "\n", 91 | "삭제는 다음과 같이 합니다.\n", 92 | "\n", 93 | "```\n", 94 | "$ kubectl delete deployment nginx-hello-world\n", 95 | "$ kubectl get deployments\n", 96 | "No resources found in default namespace.\n", 97 | "```" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "# Nginx Tutorial\n", 105 | "\n", 106 | "먼저 docker에서 확인합니다.\n", 107 | "\n", 108 | "```\n", 109 | "docker run --name my-nginx -p 5001:80 -d nginx\n", 110 | "```\n", 111 | "\n", 112 | "크롬에서 확인후 삭제 합니다.\n", 113 | "\n", 114 | "```\n", 115 | "docker stop my-nginx \n", 116 | "docker container prune\n", 117 | "```\n", 118 | "\n", 119 | "Nginx 배포합니다.\n", 120 | "\n", 121 | "```\n", 122 | "kubectl create deployment hello-node --image=nginx\n", 123 | "# 또는 이거\n", 124 | "# kubectl create deployment hello-node --image=nginxdemos/hello \n", 125 | "\n", 126 | "kubectl port-forward hello-node-544968b8c4-4kvfh 5001:80 --address 0.0.0.0\n", 127 | "```\n", 128 | "\n", 129 | "크롬에서 확인을 합니다.\n", 130 | "\n", 131 | "로그도 확인합니다.\n", 132 | "\n", 133 | "`logs [Pod 이름]` 을 사용합니다.\n", 134 | "\n", 135 | "```\n", 136 | "kubectl logs my-nginx-66b75b6f6b-29sw6 -f\n", 137 | "```\n", 138 | "\n", 139 | "Pod안의 명령문을 실행시킬수도 있습니다.\n", 140 | "\n", 141 | "```\n", 142 | "kubectl exec hello-node-66b75b6f6b-29sw6 -- env\n", 143 | "kubectl exec hello-node-544968b8c4-tp5pd -it -- bash\n", 144 | "```" 145 | ] 146 | }, 147 | 
{ 148 | "cell_type": "markdown", 149 | "metadata": {}, 150 | "source": [ 151 | "## Expose\n", 152 | "\n", 153 | "먼저 nginx 를 디플로이해줍니다.\n", 154 | "\n", 155 | "```\n", 156 | "kubectl create deployment hello-node --image=nginx\n", 157 | "```\n", 158 | "\n", 159 | "Expose 시킵니다.\n", 160 | "\n", 161 | "```\n", 162 | "kubectl expose deployment hello-node --type=NodePort --port 5001 --target-port 80\n", 163 | "```\n", 164 | "\n", 165 | "이후 NodePort를 확인합니다.
\n", 166 | "이후 `curl $(minikube ip):[Node Port]` 로 확인합니다. \n", 167 | "\n", 168 | "```\n", 169 | "kubectl describe service hello-node | grep NodePort\n", 170 | "curl $(minikube ip):31832\n", 171 | "kubectl port-forward service/hello-node 5002:5001\n", 172 | "```" 173 | ] 174 | } 175 | ], 176 | "metadata": { 177 | "kernelspec": { 178 | "display_name": "Python 3", 179 | "language": "python", 180 | "name": "python3" 181 | }, 182 | "language_info": { 183 | "codemirror_mode": { 184 | "name": "ipython", 185 | "version": 3 186 | }, 187 | "file_extension": ".py", 188 | "mimetype": "text/x-python", 189 | "name": "python", 190 | "nbconvert_exporter": "python", 191 | "pygments_lexer": "ipython3", 192 | "version": "3.8.2" 193 | } 194 | }, 195 | "nbformat": 4, 196 | "nbformat_minor": 4 197 | } 198 | -------------------------------------------------------------------------------- /010-Pyspark/01 Tutorial.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 5, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import findspark\n", 10 | "findspark.init()\n", 11 | "\n", 12 | "import pyspark\n", 13 | "from pyspark import SparkContext\n", 14 | "from datetime import datetime" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Initialize Spark Context" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 2, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "sc = SparkContext(\"local\", \"tutorial\")" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "## Word Count" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 7, 43 | "metadata": {}, 44 | "outputs": [ 45 | { 46 | "name": "stdout", 47 | "output_type": "stream", 48 | "text": [ 49 | "counts: 8\n", 50 | "CPU times: user 69 µs, sys: 57 µs, total: 126 µs\n", 51 | "Wall time: 
132 µs\n" 52 | ] 53 | } 54 | ], 55 | "source": [ 56 | "words = sc.parallelize (\n", 57 | " [\"scala\", \n", 58 | " \"java\", \n", 59 | " \"hadoop\", \n", 60 | " \"spark\", \n", 61 | " \"akka\",\n", 62 | " \"spark vs hadoop\", \n", 63 | " \"pyspark\",\n", 64 | " \"pyspark and spark\"]\n", 65 | ")\n", 66 | "counts = words.count()\n", 67 | "\n", 68 | "%time print(f'counts: {counts}')" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "## Collect" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 8, 81 | "metadata": {}, 82 | "outputs": [ 83 | { 84 | "data": { 85 | "text/plain": [ 86 | "['scala',\n", 87 | " 'java',\n", 88 | " 'hadoop',\n", 89 | " 'spark',\n", 90 | " 'akka',\n", 91 | " 'spark vs hadoop',\n", 92 | " 'pyspark',\n", 93 | " 'pyspark and spark']" 94 | ] 95 | }, 96 | "execution_count": 8, 97 | "metadata": {}, 98 | "output_type": "execute_result" 99 | } 100 | ], 101 | "source": [ 102 | "words.collect()" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "## ForEach" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 23, 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "def f(x):\n", 119 | " print(x)\n", 120 | " \n", 121 | "words.foreach(f)" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "## Filter" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": 26, 134 | "metadata": {}, 135 | "outputs": [ 136 | { 137 | "data": { 138 | "text/plain": [ 139 | "['pyspark', 'pyspark and spark']" 140 | ] 141 | }, 142 | "execution_count": 26, 143 | "metadata": {}, 144 | "output_type": "execute_result" 145 | } 146 | ], 147 | "source": [ 148 | "words.filter(lambda x: 'py' in x).collect()" 149 | ] 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "metadata": {}, 154 | "source": [ 155 | "## Map" 156 | ] 157 | }, 158 | { 159 | "cell_type": 
"code", 160 | "execution_count": 31, 161 | "metadata": {}, 162 | "outputs": [ 163 | { 164 | "data": { 165 | "text/plain": [ 166 | "[('scala', 1, 3),\n", 167 | " ('java', 1, 3),\n", 168 | " ('hadoop', 1, 3),\n", 169 | " ('spark', 1, 3),\n", 170 | " ('akka', 1, 3),\n", 171 | " ('spark vs hadoop', 1, 3),\n", 172 | " ('pyspark', 1, 3),\n", 173 | " ('pyspark and spark', 1, 3)]" 174 | ] 175 | }, 176 | "execution_count": 31, 177 | "metadata": {}, 178 | "output_type": "execute_result" 179 | } 180 | ], 181 | "source": [ 182 | "words.map(lambda x: (x, 1, 3)).collect()" 183 | ] 184 | }, 185 | { 186 | "cell_type": "markdown", 187 | "metadata": {}, 188 | "source": [ 189 | "## Reduce " 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": 33, 195 | "metadata": {}, 196 | "outputs": [ 197 | { 198 | "data": { 199 | "text/plain": [ 200 | "25" 201 | ] 202 | }, 203 | "execution_count": 33, 204 | "metadata": {}, 205 | "output_type": "execute_result" 206 | } 207 | ], 208 | "source": [ 209 | "from operator import add\n", 210 | "\n", 211 | "nums = sc.parallelize([1, 2, 3, 4, 5, 10])\n", 212 | "nums.reduce(add)" 213 | ] 214 | }, 215 | { 216 | "cell_type": "markdown", 217 | "metadata": {}, 218 | "source": [ 219 | "## Join\n", 220 | "\n", 221 | "1. **join**: 두개의 RDD에 모두 존재하는 elements만 join이 되고, 나머지는 제외\n", 222 | "2. 
**fullOuterJoin**: 모든 elements를 join 시킨다 " 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": 37, 228 | "metadata": {}, 229 | "outputs": [ 230 | { 231 | "data": { 232 | "text/plain": [ 233 | "[('ml', (10, 5)), ('spark', (1, 2))]" 234 | ] 235 | }, 236 | "execution_count": 37, 237 | "metadata": {}, 238 | "output_type": "execute_result" 239 | } 240 | ], 241 | "source": [ 242 | "x = sc.parallelize([('spark', 1), ('ml', 10), ('power', 2)])\n", 243 | "y = sc.parallelize([('spark', 2), ('ml', 5), ('happy', 3)])\n", 244 | "joined = x.join(y)\n", 245 | "joined.collect()" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": 39, 251 | "metadata": {}, 252 | "outputs": [ 253 | { 254 | "data": { 255 | "text/plain": [ 256 | "[('ml', (10, 5)),\n", 257 | " ('power', (2, None)),\n", 258 | " ('spark', (1, 2)),\n", 259 | " ('happy', (None, 3))]" 260 | ] 261 | }, 262 | "execution_count": 39, 263 | "metadata": {}, 264 | "output_type": "execute_result" 265 | } 266 | ], 267 | "source": [ 268 | "x.fullOuterJoin(y).collect()" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": 40, 274 | "metadata": {}, 275 | "outputs": [ 276 | { 277 | "data": { 278 | "text/plain": [ 279 | "[('ml', (10, 5)), ('power', (2, None)), ('spark', (1, 2))]" 280 | ] 281 | }, 282 | "execution_count": 40, 283 | "metadata": {}, 284 | "output_type": "execute_result" 285 | } 286 | ], 287 | "source": [ 288 | "x.leftOuterJoin(y).collect()" 289 | ] 290 | }, 291 | { 292 | "cell_type": "markdown", 293 | "metadata": {}, 294 | "source": [ 295 | "## Cache\n", 296 | "\n", 297 | "\"MEMORY_ONLY\" 일경우.. 
메모리에 RDD를 persist시킨다 " 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": 42, 303 | "metadata": {}, 304 | "outputs": [ 305 | { 306 | "data": { 307 | "text/plain": [ 308 | "True" 309 | ] 310 | }, 311 | "execution_count": 42, 312 | "metadata": {}, 313 | "output_type": "execute_result" 314 | } 315 | ], 316 | "source": [ 317 | "words.cache()\n", 318 | "words.persist().is_cached" 319 | ] 320 | } 321 | ], 322 | "metadata": { 323 | "kernelspec": { 324 | "display_name": "Python 3", 325 | "language": "python", 326 | "name": "python3" 327 | }, 328 | "language_info": { 329 | "codemirror_mode": { 330 | "name": "ipython", 331 | "version": 3 332 | }, 333 | "file_extension": ".py", 334 | "mimetype": "text/x-python", 335 | "name": "python", 336 | "nbconvert_exporter": "python", 337 | "pygments_lexer": "ipython3", 338 | "version": "3.6.7" 339 | } 340 | }, 341 | "nbformat": 4, 342 | "nbformat_minor": 2 343 | } 344 | -------------------------------------------------------------------------------- /002-Pyspark/01 Tutorial.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 5, 6 | "metadata": { 7 | "pycharm": { 8 | "name": "#%%\n" 9 | } 10 | }, 11 | "outputs": [], 12 | "source": [ 13 | "import findspark\n", 14 | "findspark.init()\n", 15 | "\n", 16 | "import pyspark\n", 17 | "from pyspark import SparkContext\n", 18 | "from datetime import datetime" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": { 24 | "pycharm": { 25 | "name": "#%% md\n" 26 | } 27 | }, 28 | "source": [ 29 | "## Initialize Spark Context" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 2, 35 | "metadata": { 36 | "pycharm": { 37 | "name": "#%%\n" 38 | } 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "sc = SparkContext(\"local\", \"tutorial\")" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": { 48 | "pycharm": { 49 | 
"name": "#%% md\n" 50 | } 51 | }, 52 | "source": [ 53 | "## Word Count" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 7, 59 | "metadata": { 60 | "pycharm": { 61 | "name": "#%%\n" 62 | } 63 | }, 64 | "outputs": [ 65 | { 66 | "name": "stdout", 67 | "output_type": "stream", 68 | "text": [ 69 | "counts: 8\n", 70 | "CPU times: user 69 µs, sys: 57 µs, total: 126 µs\n", 71 | "Wall time: 132 µs\n" 72 | ] 73 | } 74 | ], 75 | "source": [ 76 | "words = sc.parallelize (\n", 77 | " [\"scala\", \n", 78 | " \"java\", \n", 79 | " \"hadoop\", \n", 80 | " \"spark\", \n", 81 | " \"akka\",\n", 82 | " \"spark vs hadoop\", \n", 83 | " \"pyspark\",\n", 84 | " \"pyspark and spark\"]\n", 85 | ")\n", 86 | "counts = words.count()\n", 87 | "\n", 88 | "%time print(f'counts: {counts}')" 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": { 94 | "pycharm": { 95 | "name": "#%% md\n" 96 | } 97 | }, 98 | "source": [ 99 | "## Collect" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 8, 105 | "metadata": { 106 | "pycharm": { 107 | "name": "#%%\n" 108 | } 109 | }, 110 | "outputs": [ 111 | { 112 | "data": { 113 | "text/plain": [ 114 | "['scala',\n", 115 | " 'java',\n", 116 | " 'hadoop',\n", 117 | " 'spark',\n", 118 | " 'akka',\n", 119 | " 'spark vs hadoop',\n", 120 | " 'pyspark',\n", 121 | " 'pyspark and spark']" 122 | ] 123 | }, 124 | "execution_count": 8, 125 | "metadata": {}, 126 | "output_type": "execute_result" 127 | } 128 | ], 129 | "source": [ 130 | "words.collect()" 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "metadata": { 136 | "pycharm": { 137 | "name": "#%% md\n" 138 | } 139 | }, 140 | "source": [ 141 | "## ForEach" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 23, 147 | "metadata": { 148 | "pycharm": { 149 | "name": "#%%\n" 150 | } 151 | }, 152 | "outputs": [], 153 | "source": [ 154 | "def f(x):\n", 155 | " print(x)\n", 156 | " \n", 157 | "words.foreach(f)" 158 | ] 159 | 
}, 160 | { 161 | "cell_type": "markdown", 162 | "metadata": { 163 | "pycharm": { 164 | "name": "#%% md\n" 165 | } 166 | }, 167 | "source": [ 168 | "## Filter" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 26, 174 | "metadata": { 175 | "pycharm": { 176 | "name": "#%%\n" 177 | } 178 | }, 179 | "outputs": [ 180 | { 181 | "data": { 182 | "text/plain": [ 183 | "['pyspark', 'pyspark and spark']" 184 | ] 185 | }, 186 | "execution_count": 26, 187 | "metadata": {}, 188 | "output_type": "execute_result" 189 | } 190 | ], 191 | "source": [ 192 | "words.filter(lambda x: 'py' in x).collect()" 193 | ] 194 | }, 195 | { 196 | "cell_type": "markdown", 197 | "metadata": { 198 | "pycharm": { 199 | "name": "#%% md\n" 200 | } 201 | }, 202 | "source": [ 203 | "## Map" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": 31, 209 | "metadata": { 210 | "pycharm": { 211 | "name": "#%%\n" 212 | } 213 | }, 214 | "outputs": [ 215 | { 216 | "data": { 217 | "text/plain": [ 218 | "[('scala', 1, 3),\n", 219 | " ('java', 1, 3),\n", 220 | " ('hadoop', 1, 3),\n", 221 | " ('spark', 1, 3),\n", 222 | " ('akka', 1, 3),\n", 223 | " ('spark vs hadoop', 1, 3),\n", 224 | " ('pyspark', 1, 3),\n", 225 | " ('pyspark and spark', 1, 3)]" 226 | ] 227 | }, 228 | "execution_count": 31, 229 | "metadata": {}, 230 | "output_type": "execute_result" 231 | } 232 | ], 233 | "source": [ 234 | "words.map(lambda x: (x, 1, 3)).collect()" 235 | ] 236 | }, 237 | { 238 | "cell_type": "markdown", 239 | "metadata": { 240 | "pycharm": { 241 | "name": "#%% md\n" 242 | } 243 | }, 244 | "source": [ 245 | "## Reduce " 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": 33, 251 | "metadata": { 252 | "pycharm": { 253 | "name": "#%%\n" 254 | } 255 | }, 256 | "outputs": [ 257 | { 258 | "data": { 259 | "text/plain": [ 260 | "25" 261 | ] 262 | }, 263 | "execution_count": 33, 264 | "metadata": {}, 265 | "output_type": "execute_result" 266 | } 267 | ], 268 | 
"source": [ 269 | "from operator import add\n", 270 | "\n", 271 | "nums = sc.parallelize([1, 2, 3, 4, 5, 10])\n", 272 | "nums.reduce(add)" 273 | ] 274 | }, 275 | { 276 | "cell_type": "markdown", 277 | "metadata": { 278 | "pycharm": { 279 | "name": "#%% md\n" 280 | } 281 | }, 282 | "source": [ 283 | "## Join\n", 284 | "\n", 285 | "1. **join**: 두개의 RDD에 모두 존재하는 elements만 join이 되고, 나머지는 제외\n", 286 | "2. **fullOuterJoin**: 모든 elements를 join 시킨다 " 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": 37, 292 | "metadata": { 293 | "pycharm": { 294 | "name": "#%%\n" 295 | } 296 | }, 297 | "outputs": [ 298 | { 299 | "data": { 300 | "text/plain": [ 301 | "[('ml', (10, 5)), ('spark', (1, 2))]" 302 | ] 303 | }, 304 | "execution_count": 37, 305 | "metadata": {}, 306 | "output_type": "execute_result" 307 | } 308 | ], 309 | "source": [ 310 | "x = sc.parallelize([('spark', 1), ('ml', 10), ('power', 2)])\n", 311 | "y = sc.parallelize([('spark', 2), ('ml', 5), ('happy', 3)])\n", 312 | "joined = x.join(y)\n", 313 | "joined.collect()" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": 39, 319 | "metadata": { 320 | "pycharm": { 321 | "name": "#%%\n" 322 | } 323 | }, 324 | "outputs": [ 325 | { 326 | "data": { 327 | "text/plain": [ 328 | "[('ml', (10, 5)),\n", 329 | " ('power', (2, None)),\n", 330 | " ('spark', (1, 2)),\n", 331 | " ('happy', (None, 3))]" 332 | ] 333 | }, 334 | "execution_count": 39, 335 | "metadata": {}, 336 | "output_type": "execute_result" 337 | } 338 | ], 339 | "source": [ 340 | "x.fullOuterJoin(y).collect()" 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": 40, 346 | "metadata": { 347 | "pycharm": { 348 | "name": "#%%\n" 349 | } 350 | }, 351 | "outputs": [ 352 | { 353 | "data": { 354 | "text/plain": [ 355 | "[('ml', (10, 5)), ('power', (2, None)), ('spark', (1, 2))]" 356 | ] 357 | }, 358 | "execution_count": 40, 359 | "metadata": {}, 360 | "output_type": "execute_result" 361 | } 362 | ], 363 
| "source": [ 364 | "x.leftOuterJoin(y).collect()" 365 | ] 366 | }, 367 | { 368 | "cell_type": "markdown", 369 | "metadata": { 370 | "pycharm": { 371 | "name": "#%% md\n" 372 | } 373 | }, 374 | "source": [ 375 | "## Cache\n", 376 | "\n", 377 | "\"MEMORY_ONLY\" 일경우.. 메모리에 RDD를 persist시킨다 " 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": 42, 383 | "metadata": { 384 | "pycharm": { 385 | "name": "#%%\n" 386 | } 387 | }, 388 | "outputs": [ 389 | { 390 | "data": { 391 | "text/plain": [ 392 | "True" 393 | ] 394 | }, 395 | "execution_count": 42, 396 | "metadata": {}, 397 | "output_type": "execute_result" 398 | } 399 | ], 400 | "source": [ 401 | "words.cache()\n", 402 | "words.persist().is_cached" 403 | ] 404 | } 405 | ], 406 | "metadata": { 407 | "kernelspec": { 408 | "display_name": "Python 3", 409 | "language": "python", 410 | "name": "python3" 411 | }, 412 | "language_info": { 413 | "codemirror_mode": { 414 | "name": "ipython", 415 | "version": 3 416 | }, 417 | "file_extension": ".py", 418 | "mimetype": "text/x-python", 419 | "name": "python", 420 | "nbconvert_exporter": "python", 421 | "pygments_lexer": "ipython3", 422 | "version": "3.6.7" 423 | } 424 | }, 425 | "nbformat": 4, 426 | "nbformat_minor": 2 427 | } -------------------------------------------------------------------------------- /006-pyarrow/pyarrow-tutorial.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "b7462c43-d1b7-46ea-9c88-06c078c92e51", 7 | "metadata": { 8 | "tags": [] 9 | }, 10 | "outputs": [], 11 | "source": [ 12 | "%config Completer.use_jedi = False\n", 13 | "\n", 14 | "import sys\n", 15 | "from datetime import datetime, timedelta\n", 16 | "from typing import Generator, Iterator, Optional, Tuple\n", 17 | "\n", 18 | "import pandas as pd\n", 19 | "import pyarrow as pa" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "id": 
"ad43b914-376a-4eac-9216-ebb9f2edc3e9", 25 | "metadata": {}, 26 | "source": [ 27 | "## Data Generation" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 41, 33 | "id": "ead8625c-0956-4a05-9e90-6db7ec7d07d5", 34 | "metadata": {}, 35 | "outputs": [ 36 | { 37 | "data": { 38 | "text/plain": [ 39 | "2380" 40 | ] 41 | }, 42 | "execution_count": 41, 43 | "metadata": {}, 44 | "output_type": "execute_result" 45 | } 46 | ], 47 | "source": [] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 75, 52 | "id": "0ec194e5-54c8-4f14-a653-77924160d1db", 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "from sklearn.datasets import make_classification\n", 57 | "from datetime import datetime, timedelta\n", 58 | "from random import random, randint\n", 59 | "\n", 60 | "cur_date = datetime.now()\n", 61 | "\n", 62 | "for i in range(10):\n", 63 | " x, y = make_classification(n_samples=randint(1000, 10000), n_features=10, weights=(0.9, 0.1))\n", 64 | " df = pd.DataFrame(x)\n", 65 | " df.columns = [f'col_{x}' for x in range(10)]\n", 66 | " df['dt'] = cur_date.strftime('%Y%m%d')\n", 67 | " cur_date += timedelta(days=1)\n", 68 | " df.to_parquet('./data', partition_cols=['dt'])" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "id": "90c4ef72-d516-4797-a6dd-fb9adfd3cb03", 74 | "metadata": {}, 75 | "source": [ 76 | "# ParquetDataset" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "id": "d9469c34-2e8b-4427-a30d-dc4b95243c75", 82 | "metadata": {}, 83 | "source": [ 84 | "## Dataset" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 74, 90 | "id": "6ae8ce10-b595-4dd1-bd61-82fd42645a05", 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "scanner = dataset.scanner()\n" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 87, 100 | "id": "686d0599-130d-48ae-9156-7786e1374bda", 101 | "metadata": {}, 102 | "outputs": [ 103 | { 104 | "name": "stdout", 105 | "output_type": "stream", 
106 | "text": [ 107 | "n_rows: 41851\n", 108 | "0.43440751685326995\n", 109 | "1.1431565798068537\n", 110 | "0.24992460337363664\n", 111 | "-0.34604515354971194\n", 112 | "0.32233619998326285\n", 113 | "0.7595871664144229\n", 114 | "-0.9966609176752007\n", 115 | "-0.5206429227786304\n", 116 | "1.2140122393778143\n", 117 | "-1.599369064413563\n" 118 | ] 119 | } 120 | ], 121 | "source": [ 122 | "import pyarrow.dataset as ds\n", 123 | "\n", 124 | "dataset = ds.dataset('./data', format='parquet', partitioning=['dt'])\n", 125 | "print('n_rows:', dataset.count_rows())\n", 126 | "\n", 127 | "for batch in dataset.to_batches():\n", 128 | " for i in range(batch.num_rows):\n", 129 | " col0 = batch.column('col_0')[0].as_py()\n", 130 | " \n", 131 | " " 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "id": "eb49596a-eee5-4a94-afe9-be2d5183aeb0", 137 | "metadata": {}, 138 | "source": [ 139 | "## ParquetDataset" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 18, 145 | "id": "5e31dc0f-528c-4475-ace4-085a6be8547c", 146 | "metadata": { 147 | "tags": [] 148 | }, 149 | "outputs": [ 150 | { 151 | "name": "stdout", 152 | "output_type": "stream", 153 | "text": [ 154 | "Pandas shape : (50000000, 2)\n", 155 | "Pandas size : 450000734\n", 156 | "Pyarrow size : 64\n", 157 | "files : ['./data/dt=2023-01-01/a300c22cb3554cec95c68957f6ac326f-0.parquet', './data/dt=2023-01-02/a300c22cb3554cec95c68957f6ac326f-0.parquet', './data/dt=2023-01-03/a300c22cb3554cec95c68957f6ac326f-0.parquet']\n", 158 | "fragments : [, , ]\n", 159 | "files rows : [8640000, 8640000, 8640000, 8640000, 8640000, 6800000]\n", 160 | "column size : 2\n" 161 | ] 162 | } 163 | ], 164 | "source": [ 165 | "from pyarrow.parquet import ParquetDataset, ParquetFile\n", 166 | "\n", 167 | "dataset = ParquetDataset(\"./data\", memory_map=True, use_legacy_dataset=False)\n", 168 | "df = pd.read_parquet(\"./data\")\n", 169 | "\n", 170 | "file_rows = [frag.count_rows() for frag in 
dataset.fragments]\n", 171 | "\n", 172 | "print(\"Pandas shape :\", df.shape)\n", 173 | "print(\"Pandas size :\", sys.getsizeof(df))\n", 174 | "print(\"Pyarrow size :\", sys.getsizeof(dataset))\n", 175 | "print(\"files :\", dataset.files[:3])\n", 176 | "print(\"fragments :\", dataset.fragments[:3])\n", 177 | "print(\"files rows :\", file_rows)\n", 178 | "print(\"column size :\", len(dataset.schema))" 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "id": "9f540049-85df-46d9-9815-4384b1df1156", 184 | "metadata": {}, 185 | "source": [ 186 | "## Iteration" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": 4, 192 | "id": "1cf791db-87ef-46b5-925a-93828e5f05b4", 193 | "metadata": { 194 | "tags": [] 195 | }, 196 | "outputs": [ 197 | { 198 | "name": "stdout", 199 | "output_type": "stream", 200 | "text": [ 201 | "frag size : 72\n", 202 | "num rows : 32768\n", 203 | "Pandas shape: (32768, 1)\n" 204 | ] 205 | }, 206 | { 207 | "data": { 208 | "text/html": [ 209 | "
\n", 210 | "\n", 223 | "\n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | "
idx
00
\n", 237 | "
" 238 | ], 239 | "text/plain": [ 240 | " idx\n", 241 | "0 0" 242 | ] 243 | }, 244 | "metadata": {}, 245 | "output_type": "display_data" 246 | } 247 | ], 248 | "source": [ 249 | "for frag in dataset.fragments:\n", 250 | " for batch in frag.to_batches():\n", 251 | " df = batch.to_pandas()\n", 252 | " row = batch.take(pa.array([0]))\n", 253 | "\n", 254 | " print(\"frag size :\", sys.getsizeof(frag))\n", 255 | " print(\"num rows :\", batch.num_rows)\n", 256 | " print(\"Pandas shape:\", df.shape)\n", 257 | " display(row.to_pandas())\n", 258 | " break\n", 259 | " break" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": 57, 265 | "id": "97089466-c70d-4ec8-8fab-8ebd3d6d5e67", 266 | "metadata": {}, 267 | "outputs": [ 268 | { 269 | "data": { 270 | "text/plain": [ 271 | "[(0, 0, 32768, , 0, 32768),\n", 272 | " (1,\n", 273 | " 32768,\n", 274 | " 65536,\n", 275 | " ,\n", 276 | " 1,\n", 277 | " 32768),\n", 278 | " (2,\n", 279 | " 65536,\n", 280 | " 98304,\n", 281 | " ,\n", 282 | " 2,\n", 283 | " 32768),\n", 284 | " (3,\n", 285 | " 98304,\n", 286 | " 131072,\n", 287 | " ,\n", 288 | " 3,\n", 289 | " 32768),\n", 290 | " (4,\n", 291 | " 131072,\n", 292 | " 163840,\n", 293 | " ,\n", 294 | " 4,\n", 295 | " 32768)]" 296 | ] 297 | }, 298 | "execution_count": 57, 299 | "metadata": {}, 300 | "output_type": "execute_result" 301 | } 302 | ], 303 | "source": [ 304 | "\n", 305 | "idx = 0\n", 306 | "parquet_indices = []\n", 307 | "for frag in dataset.fragments:\n", 308 | " parquet_file = ParquetFile(frag.path)\n", 309 | " for i, row_group in enumerate(frag.row_groups):\n", 310 | " start_idx = idx\n", 311 | " end_idx = idx + row_group.num_rows\n", 312 | " parquet_indices.append((i, start_idx, end_idx, parquet_file, row_group.id, row_group.num_rows))\n", 313 | " idx += row_group.num_rows\n", 314 | " \n", 315 | "parquet_indices[:5]" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": 55, 321 | "id": "d3dbd0dc-55c5-41a7-b175-55a210f9036e", 322 | 
"metadata": {}, 323 | "outputs": [ 324 | { 325 | "data": { 326 | "text/plain": [ 327 | "28800" 328 | ] 329 | }, 330 | "execution_count": 55, 331 | "metadata": {}, 332 | "output_type": "execute_result" 333 | } 334 | ], 335 | "source": [ 336 | "parquet_indices[-1][-1]" 337 | ] 338 | }, 339 | { 340 | "cell_type": "code", 341 | "execution_count": 56, 342 | "id": "8ff55871-98ee-4107-bcca-e6d66a519850", 343 | "metadata": {}, 344 | "outputs": [ 345 | { 346 | "data": { 347 | "text/plain": [ 348 | "pyarrow.Table\n", 349 | "idx: int64\n", 350 | "----\n", 351 | "idx: [[43200000,43200001,43200002,43200003,43200004,...,43220987,43220988,43220989,43220990,43220991]]" 352 | ] 353 | }, 354 | "execution_count": 56, 355 | "metadata": {}, 356 | "output_type": "execute_result" 357 | } 358 | ], 359 | "source": [ 360 | "group = frag.row_groups[0]\n", 361 | "group.id\n", 362 | "\n", 363 | "\n", 364 | "pf = ParquetFile(frag.path)\n", 365 | "table = pf.read_row_group(0)\n", 366 | "table" 367 | ] 368 | }, 369 | { 370 | "cell_type": "markdown", 371 | "id": "e8c85fdc-60a9-4970-b325-08fb2b8489d9", 372 | "metadata": {}, 373 | "source": [ 374 | "## Create Parquet Files" 375 | ] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "execution_count": null, 380 | "id": "adac7f0c-3fa1-4b27-8072-dc6e24e724ba", 381 | "metadata": { 382 | "tags": [] 383 | }, 384 | "outputs": [], 385 | "source": [ 386 | "def create_data():\n", 387 | " df = pd.DataFrame({\"idx\": range(50000000)})\n", 388 | " dt = datetime(2023, 1, 1)\n", 389 | " df[\"dt\"] = df[\"idx\"].apply(\n", 390 | " lambda x: (dt + timedelta(milliseconds=x * 10)).date()\n", 391 | " )\n", 392 | " pa.parquet.write_to_dataset(\n", 393 | " pa.Table.from_pandas(df),\n", 394 | " root_path=\"data\",\n", 395 | " partition_cols=[\"dt\"],\n", 396 | " use_legacy_dataset=False,\n", 397 | " )\n", 398 | "\n", 399 | "\n", 400 | "# create_data()" 401 | ] 402 | }, 403 | { 404 | "cell_type": "markdown", 405 | "id": "86634ed3-5b5f-4cdb-9a7b-1a7de80c9962", 406 | 
"metadata": {}, 407 | "source": [ 408 | "## Pytorch Dataset" 409 | ] 410 | }, 411 | { 412 | "cell_type": "code", 413 | "execution_count": null, 414 | "id": "af4538a8-0733-4862-8e84-205490cb9358", 415 | "metadata": {}, 416 | "outputs": [], 417 | "source": [ 418 | "class PyArrowDataset(Dataset):\n", 419 | " def __init__(self, source:str, seed:int =123):\n", 420 | " pass\n", 421 | " \n", 422 | " def init_indexing(self, shuffle:bool=False)" 423 | ] 424 | }, 425 | { 426 | "cell_type": "code", 427 | "execution_count": null, 428 | "id": "cb7d7752-ae4b-4e39-8a8e-9074e627e6e6", 429 | "metadata": { 430 | "tags": [] 431 | }, 432 | "outputs": [], 433 | "source": [ 434 | "import random\n", 435 | "import tracemalloc\n", 436 | "from bisect import bisect_right\n", 437 | "\n", 438 | "from pyarrow.dataset import ParquetFileFragment\n", 439 | "from pyarrow.lib import RecordBatch\n", 440 | "from torch.utils.data import Dataset\n", 441 | "import gc\n", 442 | "\n", 443 | "class PyArrowDataset(Dataset):\n", 444 | " \"\"\"\n", 445 | " Restriction\n", 446 | " - Don't shuffle in Dataloader. this is for efficiency to precess large dataset.\n", 447 | " If you need to shuffle, do it before this custom dataset. 
(like in SparkSQL)\n", 448 | " But the algorithm supports random access.\n", 449 | " \"\"\"\n", 450 | "\n", 451 | " def __init__(self, source: str, seed: int = 123):\n", 452 | " self.source = source\n", 453 | " self.seed = seed\n", 454 | "\n", 455 | " # Pyarrow\n", 456 | " self.dataset = ParquetDataset(source, use_legacy_dataset=False)\n", 457 | " self.fragments: List[ParquetFileFragment] = self.dataset.fragments\n", 458 | " self._batches: Iterator[RecordBatch] = None\n", 459 | " self._batch: Optional[RecordBatch] = None\n", 460 | " self._df: pd.DataFrame = None\n", 461 | "\n", 462 | " # Indexing meta information to make search faster\n", 463 | " self._cumulative_n_rows: List[int] = []\n", 464 | " self._batch_idx: int = 0\n", 465 | "\n", 466 | " # Index\n", 467 | " self._fragment_idx = 0\n", 468 | "\n", 469 | " # Initialization\n", 470 | " self._init()\n", 471 | "\n", 472 | " def _init(self):\n", 473 | " random.seed(self.seed)\n", 474 | " # random.shuffle(self.fragments)\n", 475 | "\n", 476 | " self._cumulative_n_rows = [frag.count_rows() for frag in self.fragments]\n", 477 | " for i in range(1, len(self._cumulative_n_rows)):\n", 478 | " self._cumulative_n_rows[i] += self._cumulative_n_rows[i - 1]\n", 479 | "\n", 480 | " def _get_next(self, idx: int) -> Tuple[int, int]:\n", 481 | " print('_get_next 01', idx)\n", 482 | " def get_prev_cum_frag_size(_fragment_idx):\n", 483 | " if _fragment_idx >= 1:\n", 484 | " return self._cumulative_n_rows[_fragment_idx - 1]\n", 485 | " return 0\n", 486 | "\n", 487 | " # Calculate fragment idx\n", 488 | " fragment_idx = self._fragment_idx\n", 489 | " fragment_changed = False\n", 490 | " _prev_size = get_prev_cum_frag_size(fragment_idx)\n", 491 | " _cur_size = self._cumulative_n_rows[self._fragment_idx]\n", 492 | " if (idx < _prev_size) or (idx >= _cur_size):\n", 493 | " fragment_idx = bisect_right(self._cumulative_n_rows, idx)\n", 494 | " assert fragment_idx < len(self.fragments)\n", 495 | " # fragment_idx %= len(self.fragments)\n", 
496 | " fragment_changed = self._fragment_idx != fragment_idx\n", 497 | " self._fragment_idx = fragment_idx\n", 498 | " self._batch_idx = 0\n", 499 | " \n", 500 | " if self._batches:\n", 501 | " self._batches.clear()\n", 502 | " \n", 503 | " del self._batches\n", 504 | " del self._batch\n", 505 | " del self._df\n", 506 | " self._batches = None\n", 507 | " self._batch = None\n", 508 | " self._df = None\n", 509 | " \n", 510 | " print('_get_next 02', idx)\n", 511 | " # Calculate batch idx\n", 512 | " _prev_size = get_prev_cum_frag_size(fragment_idx)\n", 513 | " batch_idx = idx - _prev_size\n", 514 | " batch_changed = batch_idx < self._batch_idx\n", 515 | "\n", 516 | " # Calculate batches of the fragment\n", 517 | " if self._batches is None or fragment_changed or batch_changed:\n", 518 | " if self._batches:\n", 519 | " self._batches.clear()\n", 520 | " \n", 521 | " self.batches = self.fragments[fragment_idx].to_batches()\n", 522 | " self._batch = None\n", 523 | "\n", 524 | " if self._batch is None:\n", 525 | " self._batch = next(self.batches)\n", 526 | " del self._df\n", 527 | " self._df = self._batch.to_pandas()\n", 528 | " self._batch_idx = 0\n", 529 | " \n", 530 | " print('_get_next 03', idx)\n", 531 | " need_to_load_data = False\n", 532 | " while True:\n", 533 | " print(\n", 534 | " \"ITER:\",\n", 535 | " f\"{self._batch_idx} <= {batch_idx} < {self._batch_idx + self._batch.num_rows} | {sys.getsizeof(self._batch)}\",\n", 536 | " )\n", 537 | " if (\n", 538 | " self._batch_idx\n", 539 | " <= batch_idx\n", 540 | " < self._batch_idx + self._batch.num_rows\n", 541 | " ):\n", 542 | " if need_to_load_data:\n", 543 | " self._df = self._batch.to_pandas()\n", 544 | " break\n", 545 | "\n", 546 | " need_to_load_data = True\n", 547 | " self._batch_idx += self._batch.num_rows\n", 548 | " self._batch = next(self.batches)\n", 549 | " \n", 550 | " print('_get_next 04', idx)\n", 551 | " return fragment_idx, batch_idx - self._batch_idx\n", 552 | " \n", 553 | " def __del__(self):\n", 
554 | " print('Deleted')\n", 555 | " if self.dataset:\n", 556 | " self.dataset.clear()\n", 557 | " \n", 558 | " if self.fragments:\n", 559 | " self.fragments.clearn\n", 560 | " del self.dataset\n", 561 | " del self.fragments\n", 562 | " del self._batches\n", 563 | " del self._batch\n", 564 | " del self._df\n", 565 | "\n", 566 | "\n", 567 | " def __len__(self):\n", 568 | " return self._cumulative_n_rows[-1]\n", 569 | "\n", 570 | " def __getitem__(self, idx):\n", 571 | " print('__getitem__', idx)\n", 572 | " fragment_idx, batch_idx = self._get_next(idx)\n", 573 | "\n", 574 | " row = self._df.iloc[batch_idx][[\"idx\"]]\n", 575 | " row = row.fillna(0)\n", 576 | " row[\"fragment_idx\"] = fragment_idx\n", 577 | " row[\"batch_idx\"] = batch_idx\n", 578 | " return row, idx\n", 579 | " \n", 580 | " \n", 581 | "\n", 582 | "\n", 583 | "tracemalloc.start()\n", 584 | "dataset = PyArrowDataset(\"./data\")\n", 585 | "print(dataset[50000][0].idx)\n", 586 | "print(dataset[0][0].idx)\n", 587 | "print(dataset[500000][0].idx)\n", 588 | "\n", 589 | "print('여기까지')\n", 590 | "del dataset\n", 591 | "print(tracemalloc.get_traced_memory())\n", 592 | "print(gc.get_count())" 593 | ] 594 | }, 595 | { 596 | "cell_type": "code", 597 | "execution_count": null, 598 | "id": "4721df80-e665-40b4-b281-5f5ee285406d", 599 | "metadata": { 600 | "tags": [] 601 | }, 602 | "outputs": [], 603 | "source": [ 604 | "from torch.utils.data import DataLoader\n", 605 | "\n", 606 | "loader = DataLoader(dataset, batch_size=64, shuffle=True)\n", 607 | "data, labels = next(iter(loader))\n", 608 | "a = data[:, 0] - 1\n", 609 | "b = labels % 1000\n", 610 | "\n", 611 | "a == b" 612 | ] 613 | }, 614 | { 615 | "cell_type": "markdown", 616 | "id": "3b078d63-87a2-45b2-8316-1622e5e1d39d", 617 | "metadata": {}, 618 | "source": [ 619 | "\n", 620 | "\n", 621 | "\n", 622 | "\n", 623 | "\n", 624 | "# ParquetFile\n", 625 | "\n", 626 | "## Row 개수 " 627 | ] 628 | }, 629 | { 630 | "cell_type": "code", 631 | "execution_count": null, 632 
| "id": "c9036712-a6f0-47db-bf95-358bcdcabca0", 633 | "metadata": { 634 | "tags": [] 635 | }, 636 | "outputs": [], 637 | "source": [ 638 | "from pyarrow.parquet import ParquetFile\n", 639 | "\n", 640 | "parquet_file = ParquetFile(\"./data/dt=20230101/userdata.parquet\")\n", 641 | "\n", 642 | "print(\"parquet_file size: \", sys.getsizeof(parquet_file))\n", 643 | "parquet_file.metadata" 644 | ] 645 | }, 646 | { 647 | "cell_type": "code", 648 | "execution_count": 25, 649 | "id": "26a0352e-1049-43cf-9f4b-651c2b0a215f", 650 | "metadata": { 651 | "collapsed": true, 652 | "jupyter": { 653 | "outputs_hidden": true 654 | } 655 | }, 656 | "outputs": [ 657 | { 658 | "name": "stdout", 659 | "output_type": "stream", 660 | "text": [ 661 | "dataset size : 64\n" 662 | ] 663 | }, 664 | { 665 | "data": { 666 | "text/plain": [ 667 | "['./data/dt=20230101/userdata.parquet']" 668 | ] 669 | }, 670 | "execution_count": 25, 671 | "metadata": {}, 672 | "output_type": "execute_result" 673 | } 674 | ], 675 | "source": [ 676 | "from pyarrow.parquet import ParquetDataset\n", 677 | "\n", 678 | "dataset = ParquetDataset(\"./data\")\n", 679 | "\n", 680 | "print(\"dataset size :\", sys.getsizeof(dataset))\n", 681 | "\n", 682 | "dataset.files" 683 | ] 684 | }, 685 | { 686 | "cell_type": "code", 687 | "execution_count": null, 688 | "id": "77eda780-7a45-495a-8a9e-aae88ce0cf49", 689 | "metadata": { 690 | "collapsed": true, 691 | "jupyter": { 692 | "outputs_hidden": true 693 | } 694 | }, 695 | "outputs": [], 696 | "source": [] 697 | } 698 | ], 699 | "metadata": { 700 | "kernelspec": { 701 | "display_name": "PyEnv 3.9.18", 702 | "language": "python", 703 | "name": "3.9.18" 704 | }, 705 | "language_info": { 706 | "codemirror_mode": { 707 | "name": "ipython", 708 | "version": 3 709 | }, 710 | "file_extension": ".py", 711 | "mimetype": "text/x-python", 712 | "name": "python", 713 | "nbconvert_exporter": "python", 714 | "pygments_lexer": "ipython3", 715 | "version": "3.9.18" 716 | } 717 | }, 718 | 
"nbformat": 4, 719 | "nbformat_minor": 5 720 | } 721 | -------------------------------------------------------------------------------- /101-GeoHash/02 Lat Lng - Addition, Angle.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "Populating the interactive namespace from numpy and matplotlib\n" 13 | ] 14 | } 15 | ], 16 | "source": [ 17 | "%pylab inline\n", 18 | "import geohash\n", 19 | "import folium\n", 20 | "\n", 21 | "from geopy.distance import distance\n", 22 | "from polygon_geohasher.polygon_geohasher import geohash_to_polygon" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "# Addition" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 373, 35 | "metadata": {}, 36 | "outputs": [ 37 | { 38 | "name": "stdout", 39 | "output_type": "stream", 40 | "text": [ 41 | "500m addition\n", 42 | "Latitude Addition: 500.00005283004964\n", 43 | "Longitude Addition: 500.0049490970048\n", 44 | "Both Addition: 707.0996972853251\n", 45 | "1000m addition\n", 46 | "Latitude Addition: 1000.0004879418325\n", 47 | "Longitude Addition: 1000.0098977428217\n", 48 | "Both Addition: 1414.178421176091\n", 49 | "\n", 50 | "5000m addition\n", 51 | "Latitude Addition: 5000.017733038395\n", 52 | "Longitude Addition: 5000.049416527164\n", 53 | "Both Addition: 7070.052813071016\n", 54 | "\n", 55 | "10000m addition\n", 56 | "Latitude Addition: 10000.073709301136\n", 57 | "Longitude Addition: 10000.098381885806\n", 58 | "Both Addition: 14138.005608237756\n", 59 | "\n", 60 | "50000m addition\n", 61 | "Latitude Addition: 50001.900230356274\n", 62 | "Longitude Addition: 50000.41972205647\n", 63 | "Both Addition: 70605.6694353784\n", 64 | "\n" 65 | ] 66 | }, 67 | { 68 | "data": { 69 | "text/html": [ 70 | "
" 71 | ], 72 | "text/plain": [ 73 | "" 74 | ] 75 | }, 76 | "execution_count": 373, 77 | "metadata": {}, 78 | "output_type": "execute_result" 79 | } 80 | ], 81 | "source": [ 82 | "def add_meter(lat, lng, lat_meter, lng_meter):\n", 83 | " new_lat = lat + (lat_meter/1000/6359.0899) * (180/np.pi)\n", 84 | " new_lng = lng + (lng_meter/1000/6386) * (180/np.pi) / np.cos(lat * np.pi/180)\n", 85 | " return new_lat, new_lng\n", 86 | "\n", 87 | "m = folium.Map(location=(lat, lng), zoom_start=12)\n", 88 | "lat, lng = 37.499402, 127.054207\n", 89 | "\n", 90 | "folium.Marker((lat, lng), popup='A').add_to(m)\n", 91 | "new_lat, new_lng = add_meter(lat, lng, 500, 500)\n", 92 | "folium.Marker((new_lat, new_lng), popup='500m').add_to(m)\n", 93 | "\n", 94 | "print('500m addition')\n", 95 | "print('Latitude Addition:', distance((lat, lng), (new_lat, lng)).m)\n", 96 | "print('Longitude Addition:', distance((lat, lng), (lat, new_lng)).m)\n", 97 | "print('Both Addition:', distance((lat, lng), (new_lat, new_lng)).m)\n", 98 | "\n", 99 | "new_lat, new_lng = add_meter(lat, lng, 1000, 1000)\n", 100 | "folium.Marker((new_lat, new_lng), popup='500m').add_to(m)\n", 101 | "print('1000m addition')\n", 102 | "print('Latitude Addition:', distance((lat, lng), (new_lat, lng)).m)\n", 103 | "print('Longitude Addition:', distance((lat, lng), (lat, new_lng)).m)\n", 104 | "print('Both Addition:', distance((lat, lng), (new_lat, new_lng)).m)\n", 105 | "print()\n", 106 | "\n", 107 | "new_lat, new_lng = add_meter(lat, lng, 5000, 5000)\n", 108 | "folium.Marker((new_lat, new_lng), popup='500m').add_to(m)\n", 109 | "print('5000m addition')\n", 110 | "print('Latitude Addition:', distance((lat, lng), (new_lat, lng)).m)\n", 111 | "print('Longitude Addition:', distance((lat, lng), (lat, new_lng)).m)\n", 112 | "print('Both Addition:', distance((lat, lng), (new_lat, new_lng)).m)\n", 113 | "print()\n", 114 | "\n", 115 | "new_lat, new_lng = add_meter(lat, lng, 10000, 10000)\n", 116 | "folium.Marker((new_lat, new_lng), 
popup='500m').add_to(m)\n", 117 | "print('10000m addition')\n", 118 | "print('Latitude Addition:', distance((lat, lng), (new_lat, lng)).m)\n", 119 | "print('Longitude Addition:', distance((lat, lng), (lat, new_lng)).m)\n", 120 | "print('Both Addition:', distance((lat, lng), (new_lat, new_lng)).m)\n", 121 | "print()\n", 122 | "\n", 123 | "\n", 124 | "new_lat, new_lng = add_meter(lat, lng, 50000, 50000)\n", 125 | "folium.Marker((new_lat, new_lng), popup='500m').add_to(m)\n", 126 | "print('50000m addition')\n", 127 | "print('Latitude Addition:', distance((lat, lng), (new_lat, lng)).m)\n", 128 | "print('Longitude Addition:', distance((lat, lng), (lat, new_lng)).m)\n", 129 | "print('Both Addition:', distance((lat, lng), (new_lat, new_lng)).m)\n", 130 | "print()\n", 131 | "\n", 132 | "# Visualization\n", 133 | "m" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "# 3지점간의 각도 계산\n", 141 | "\n", 142 | "https://medium.com/@manivannan_data/find-the-angle-between-three-points-from-2d-using-python-348c513e2cd\n", 143 | "\n" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 417, 149 | "metadata": {}, 150 | "outputs": [ 151 | { 152 | "name": "stdout", 153 | "output_type": "stream", 154 | "text": [ 155 | "Angle: 90.0\n" 156 | ] 157 | } 158 | ], 159 | "source": [ 160 | "def calculate_angle(cur_location: np.ndarray, p1: np.ndarray, p2: np.ndarray) -> float:\n", 161 | " \"\"\"\n", 162 | " :param cur_location: the current location of the latitude and the longitude.\n", 163 | " :param p1: (latitude, longitude)\n", 164 | " :param p2: (latitude, longitude)\n", 165 | " :return: float\n", 166 | " \"\"\"\n", 167 | " ab = cur_location - p1\n", 168 | " ac = cur_location - p2\n", 169 | " \n", 170 | " _direction = (np.dot(ab, ac) ) / (np.linalg.norm(ab) * np.linalg.norm(ac))\n", 171 | " _direction = min(max(_direction, -1), 1)\n", 172 | " angle = np.arccos(_direction)\n", 173 | " angle = np.degrees(angle)\n", 174 | 
" angle = np.nan_to_num(angle)\n", 175 | " return round(float(angle), 4)\n", 176 | " \n", 177 | "a = np.array([0, 0])\n", 178 | "b = np.array([5, 0])\n", 179 | "c = np.array([0, 5])\n", 180 | "\n", 181 | "print('Angle:', calculate_angle(a, b, c))" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 416, 187 | "metadata": {}, 188 | "outputs": [ 189 | { 190 | "name": "stdout", 191 | "output_type": "stream", 192 | "text": [ 193 | "Angle: 28.53\n" 194 | ] 195 | }, 196 | { 197 | "data": { 198 | "text/html": [ 199 | "
" 200 | ], 201 | "text/plain": [ 202 | "" 203 | ] 204 | }, 205 | "execution_count": 416, 206 | "metadata": {}, 207 | "output_type": "execute_result" 208 | } 209 | ], 210 | "source": [ 211 | "a = np.array([37.388641, 127.092138]) # Current location\n", 212 | "b = np.array([37.393937, 127.112294])\n", 213 | "c = np.array([37.381100, 127.122811])\n", 214 | "\n", 215 | "angle = calculate_angle(a, b, c)\n", 216 | "\n", 217 | "print('Angle:', round(angle, 2))\n", 218 | "\n", 219 | "m = folium.Map(location=(a+b+c)/3, zoom_start=13)\n", 220 | "folium.Marker(a, popup='A Current Location', icon=folium.Icon(color='black')).add_to(m)\n", 221 | "folium.Marker(b, popup='B Favorite Off Location', icon=folium.Icon(color='green')).add_to(m)\n", 222 | "folium.Marker(c, popup=f'C Call {int(angle)} degree', icon=folium.Icon(color='red')).add_to(m)\n", 223 | "folium.PolyLine([a, b], color='green').add_to(m)\n", 224 | "folium.PolyLine([a, c], color='red').add_to(m)\n", 225 | "m" 226 | ] 227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "metadata": {}, 231 | "source": [ 232 | "# Circle" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": 5, 238 | "metadata": {}, 239 | "outputs": [ 240 | { 241 | "name": "stdout", 242 | "output_type": "stream", 243 | "text": [ 244 | "distance in km: 17.393393505034894\n" 245 | ] 246 | }, 247 | { 248 | "data": { 249 | "text/html": [ 250 | "
" 251 | ], 252 | "text/plain": [ 253 | "" 254 | ] 255 | }, 256 | "execution_count": 5, 257 | "metadata": {}, 258 | "output_type": "execute_result" 259 | } 260 | ], 261 | "source": [ 262 | "a = np.array([37.388641, 127.092138])\n", 263 | "b = np.array([37.498641, 126.952138])\n", 264 | "\n", 265 | "print('distance in km:', distance(a, b).km)\n", 266 | "\n", 267 | "m = folium.Map(location=(a+b)/2, zoom_start=11)\n", 268 | "folium.Marker(a, popup='A', icon=folium.Icon(color='black')).add_to(m)\n", 269 | "folium.Marker(b, popup='B', icon=folium.Icon(color='green')).add_to(m)\n", 270 | "folium.Circle((a+b)/2, 10000, tooltip='test').add_to(m)\n", 271 | "folium.Circle((a+b)/2, 5000, tooltip='test').add_to(m)\n", 272 | "m" 273 | ] 274 | } 275 | ], 276 | "metadata": { 277 | "kernelspec": { 278 | "display_name": "Python 3", 279 | "language": "python", 280 | "name": "python3" 281 | }, 282 | "language_info": { 283 | "codemirror_mode": { 284 | "name": "ipython", 285 | "version": 3 286 | }, 287 | "file_extension": ".py", 288 | "mimetype": "text/x-python", 289 | "name": "python", 290 | "nbconvert_exporter": "python", 291 | "pygments_lexer": "ipython3", 292 | "version": "3.6.7" 293 | }, 294 | "toc": { 295 | "base_numbering": 1, 296 | "nav_menu": {}, 297 | "number_sections": true, 298 | "sideBar": true, 299 | "skip_h1_title": false, 300 | "title_cell": "Table of Contents", 301 | "title_sidebar": "Contents", 302 | "toc_cell": false, 303 | "toc_position": {}, 304 | "toc_section_display": true, 305 | "toc_window_display": false 306 | } 307 | }, 308 | "nbformat": 4, 309 | "nbformat_minor": 2 310 | } 311 | -------------------------------------------------------------------------------- /200-Kubernetes/02-Generate-Fashion-MNIST-Sample-Images.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 15, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | 
"output_type": "stream", 11 | "text": [ 12 | "Populating the interactive namespace from numpy and matplotlib\n", 13 | "x_train: (60000, 28, 28)\n", 14 | "y_train: (60000,)\n", 15 | "x_test: (10000, 28, 28)\n", 16 | "y_test: (10000,)\n" 17 | ] 18 | } 19 | ], 20 | "source": [ 21 | "%pylab inline\n", 22 | "import keras\n", 23 | "import imageio\n", 24 | "import os\n", 25 | "\n", 26 | "fashion_mnist = keras.datasets.fashion_mnist\n", 27 | "\n", 28 | "class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',\n", 29 | " 'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']\n", 30 | "(x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()\n", 31 | "\n", 32 | "print('x_train:', x_train.shape)\n", 33 | "print('y_train:', y_train.shape)\n", 34 | "print('x_test:', x_test.shape)\n", 35 | "print('y_test:', y_test.shape)" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "import scipy.misc.ims" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 22, 50 | "metadata": {}, 51 | "outputs": [ 52 | { 53 | "name": "stdout", 54 | "output_type": "stream", 55 | "text": [ 56 | "(28, 28)\n", 57 | "(28, 28)\n", 58 | "(28, 28)\n", 59 | "(28, 28)\n", 60 | "(28, 28)\n", 61 | "(28, 28)\n", 62 | "(28, 28)\n", 63 | "(28, 28)\n", 64 | "(28, 28)\n", 65 | "(28, 28)\n", 66 | "(28, 28)\n", 67 | "(28, 28)\n", 68 | "(28, 28)\n", 69 | "(28, 28)\n", 70 | "(28, 28)\n", 71 | "(28, 28)\n", 72 | "(28, 28)\n", 73 | "(28, 28)\n", 74 | "(28, 28)\n", 75 | "(28, 28)\n", 76 | "(28, 28)\n", 77 | "(28, 28)\n", 78 | "(28, 28)\n", 79 | "(28, 28)\n", 80 | "(28, 28)\n" 81 | ] 82 | }, 83 | { 84 | "data": { 85 | "image/png": "\n", 86 | "text/plain": [ 87 | "
" 88 | ] 89 | }, 90 | "metadata": {}, 91 | "output_type": "display_data" 92 | } 93 | ], 94 | "source": [ 95 | "if not os.path.exists('./sample'):\n", 96 | " os.makedirs('./sample')\n", 97 | "\n", 98 | "plt.figure(figsize=(10,10))\n", 99 | "for i in range(25):\n", 100 | " idx = np.random.randint(0, x_train.shape[0])\n", 101 | " plt.subplot(5,5,i+1)\n", 102 | " plt.xticks([])\n", 103 | " plt.yticks([])\n", 104 | " plt.grid(False)\n", 105 | " plt.imshow(x_train[idx], cmap=plt.cm.binary)\n", 106 | " plt.xlabel(class_names[y_train[idx]])\n", 107 | " \n", 108 | " imageio.imsave(f'./sample/sample_{idx:03}.jpg', x_train[idx])\n", 109 | "plt.show()" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [] 118 | } 119 | ], 120 | "metadata": { 121 | "kernelspec": { 122 | "display_name": "Python 3", 123 | "language": "python", 124 | "name": "python3" 125 | }, 126 | "language_info": { 127 | "codemirror_mode": { 128 | "name": "ipython", 129 | "version": 3 130 | }, 131 | "file_extension": ".py", 132 | "mimetype": "text/x-python", 133 | "name": "python", 134 | "nbconvert_exporter": "python", 135 | "pygments_lexer": "ipython3", 136 | "version": "3.8.2" 137 | } 138 | }, 139 | "nbformat": 4, 140 | "nbformat_minor": 4 141 | } 142 | --------------------------------------------------------------------------------