├── 004-Dask
├── dask-worker-space
│ ├── global.lock
│ └── purge.lock
└── tutorial-01.py
├── 020-kinesis-single-shard
├── requirements.txt
├── consumer.py
└── producer.py
├── 022-kinesis-data-analytics
├── requirements.txt
├── consumer.py
└── producer.py
├── 204-Kubernetes-Redis
└── redis-custom-config
│ ├── redis.conf
│ ├── kustomization.yaml
│ └── redis-pod.yaml
├── README.md
├── 101-GeoHash
├── images
│ ├── lat-lng.jpg
│ ├── geohash-size.png
│ └── law-of-cosine.svg
├── map.html
└── 02 Lat Lng - Addition, Angle.ipynb
├── 200-Kubernetes
├── app
│ ├── fasion_model.h5
│ ├── requirements.txt
│ └── app.py
├── sample
│ ├── sample_1139.jpg
│ ├── sample_11518.jpg
│ ├── sample_12352.jpg
│ ├── sample_12924.jpg
│ ├── sample_13180.jpg
│ ├── sample_1421.jpg
│ ├── sample_14467.jpg
│ ├── sample_15419.jpg
│ ├── sample_16078.jpg
│ ├── sample_16742.jpg
│ ├── sample_16975.jpg
│ ├── sample_17027.jpg
│ ├── sample_17101.jpg
│ ├── sample_17764.jpg
│ ├── sample_18043.jpg
│ ├── sample_18313.jpg
│ ├── sample_18788.jpg
│ ├── sample_19007.jpg
│ ├── sample_19098.jpg
│ ├── sample_19177.jpg
│ ├── sample_19442.jpg
│ ├── sample_19482.jpg
│ ├── sample_20091.jpg
│ ├── sample_22950.jpg
│ ├── sample_23330.jpg
│ ├── sample_25021.jpg
│ ├── sample_25536.jpg
│ ├── sample_25643.jpg
│ ├── sample_27647.jpg
│ ├── sample_27825.jpg
│ ├── sample_27936.jpg
│ ├── sample_28251.jpg
│ ├── sample_28550.jpg
│ ├── sample_29020.jpg
│ ├── sample_2919.jpg
│ ├── sample_31021.jpg
│ ├── sample_3152.jpg
│ ├── sample_33108.jpg
│ ├── sample_33165.jpg
│ ├── sample_33193.jpg
│ ├── sample_33271.jpg
│ ├── sample_34045.jpg
│ ├── sample_3480.jpg
│ ├── sample_36605.jpg
│ ├── sample_37196.jpg
│ ├── sample_3767.jpg
│ ├── sample_37793.jpg
│ ├── sample_38070.jpg
│ ├── sample_38479.jpg
│ ├── sample_3880.jpg
│ ├── sample_41126.jpg
│ ├── sample_41669.jpg
│ ├── sample_41819.jpg
│ ├── sample_42013.jpg
│ ├── sample_42528.jpg
│ ├── sample_4256.jpg
│ ├── sample_42662.jpg
│ ├── sample_43296.jpg
│ ├── sample_4354.jpg
│ ├── sample_43814.jpg
│ ├── sample_44314.jpg
│ ├── sample_4496.jpg
│ ├── sample_45115.jpg
│ ├── sample_45910.jpg
│ ├── sample_45926.jpg
│ ├── sample_46011.jpg
│ ├── sample_46449.jpg
│ ├── sample_46682.jpg
│ ├── sample_47082.jpg
│ ├── sample_4712.jpg
│ ├── sample_48604.jpg
│ ├── sample_49798.jpg
│ ├── sample_49844.jpg
│ ├── sample_49963.jpg
│ ├── sample_50107.jpg
│ ├── sample_50488.jpg
│ ├── sample_50863.jpg
│ ├── sample_51212.jpg
│ ├── sample_51335.jpg
│ ├── sample_52312.jpg
│ ├── sample_52636.jpg
│ ├── sample_52970.jpg
│ ├── sample_53360.jpg
│ ├── sample_53374.jpg
│ ├── sample_54723.jpg
│ ├── sample_54962.jpg
│ ├── sample_55104.jpg
│ ├── sample_55535.jpg
│ ├── sample_55641.jpg
│ ├── sample_56036.jpg
│ ├── sample_56392.jpg
│ ├── sample_5707.jpg
│ ├── sample_58386.jpg
│ ├── sample_6135.jpg
│ ├── sample_6197.jpg
│ ├── sample_6689.jpg
│ ├── sample_8744.jpg
│ ├── sample_9688.jpg
│ ├── sample_9758.jpg
│ └── sample_9852.jpg
├── deployment.yaml
├── Dockerfile
├── README.md
├── 01-Kubernetes.ipynb
└── 02-Generate-Fashion-MNIST-Sample-Images.ipynb
├── .gitignore
├── 100-PyQT
├── 01-simple-example
│ └── main.py
├── 02-widgets
│ └── main.py
└── 03-QThread
│ ├── qthread.py
│ ├── signal_with_list.py
│ └── signal_with_python_object.py
├── 005-ray
├── script.py
├── submit.py
└── 10-ray-serving-tutorial
│ └── app.py
├── 002-Pyspark
├── macdonald
│ ├── README.md
│ └── check_connection.py
└── 01 Tutorial.ipynb
├── 202-Kubernetes-deploy-nginx
├── README.md
└── deployment.yaml
├── 003-Shared-Memory
├── shared-memory-list.py
├── shared-memory-bytearray.py
├── shared-string.py
└── shared_memory_queue.py
├── 203-Kubernetes-Service-MySQL
└── deployment.yaml
├── 006-pyarrow
├── test_dataset.py
├── pyarrow_torch.py
└── pyarrow-tutorial.ipynb
└── 010-Pyspark
└── 01 Tutorial.ipynb
/004-Dask/dask-worker-space/global.lock:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/004-Dask/dask-worker-space/purge.lock:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/020-kinesis-single-shard/requirements.txt:
--------------------------------------------------------------------------------
1 | Faker==4.9.0
--------------------------------------------------------------------------------
/022-kinesis-data-analytics/requirements.txt:
--------------------------------------------------------------------------------
1 | Faker==4.9.0
--------------------------------------------------------------------------------
/204-Kubernetes-Redis/redis-custom-config/redis.conf:
--------------------------------------------------------------------------------
1 | maxmemory 2mb
2 | maxmemory-policy allkeys-lru
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # code-snippet
2 |
3 | 바로 가져가서 사용할 수 있는 코드들을 예제와 함께 제공을 합니다.
4 |
5 | 주로 Python 코드들을 지원하고 있습니다.
6 |
7 |
--------------------------------------------------------------------------------
/101-GeoHash/images/lat-lng.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/101-GeoHash/images/lat-lng.jpg
--------------------------------------------------------------------------------
/101-GeoHash/images/geohash-size.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/101-GeoHash/images/geohash-size.png
--------------------------------------------------------------------------------
/200-Kubernetes/app/fasion_model.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/app/fasion_model.h5
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_1139.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_1139.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_11518.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_11518.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_12352.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_12352.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_12924.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_12924.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_13180.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_13180.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_1421.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_1421.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_14467.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_14467.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_15419.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_15419.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_16078.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_16078.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_16742.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_16742.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_16975.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_16975.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_17027.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_17027.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_17101.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_17101.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_17764.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_17764.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_18043.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_18043.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_18313.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_18313.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_18788.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_18788.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_19007.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_19007.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_19098.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_19098.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_19177.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_19177.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_19442.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_19442.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_19482.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_19482.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_20091.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_20091.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_22950.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_22950.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_23330.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_23330.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_25021.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_25021.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_25536.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_25536.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_25643.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_25643.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_27647.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_27647.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_27825.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_27825.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_27936.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_27936.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_28251.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_28251.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_28550.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_28550.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_29020.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_29020.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_2919.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_2919.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_31021.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_31021.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_3152.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_3152.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_33108.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_33108.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_33165.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_33165.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_33193.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_33193.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_33271.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_33271.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_34045.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_34045.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_3480.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_3480.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_36605.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_36605.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_37196.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_37196.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_3767.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_3767.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_37793.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_37793.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_38070.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_38070.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_38479.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_38479.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_3880.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_3880.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_41126.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_41126.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_41669.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_41669.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_41819.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_41819.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_42013.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_42013.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_42528.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_42528.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_4256.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_4256.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_42662.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_42662.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_43296.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_43296.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_4354.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_4354.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_43814.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_43814.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_44314.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_44314.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_4496.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_4496.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_45115.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_45115.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_45910.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_45910.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_45926.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_45926.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_46011.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_46011.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_46449.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_46449.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_46682.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_46682.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_47082.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_47082.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_4712.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_4712.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_48604.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_48604.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_49798.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_49798.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_49844.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_49844.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_49963.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_49963.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_50107.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_50107.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_50488.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_50488.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_50863.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_50863.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_51212.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_51212.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_51335.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_51335.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_52312.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_52312.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_52636.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_52636.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_52970.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_52970.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_53360.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_53360.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_53374.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_53374.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_54723.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_54723.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_54962.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_54962.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_55104.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_55104.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_55535.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_55535.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_55641.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_55641.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_56036.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_56036.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_56392.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_56392.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_5707.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_5707.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_58386.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_58386.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_6135.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_6135.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_6197.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_6197.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_6689.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_6689.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_8744.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_8744.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_9688.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_9688.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_9758.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_9758.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/sample/sample_9852.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndersonJo/code-snippet/master/200-Kubernetes/sample/sample_9852.jpg
--------------------------------------------------------------------------------
/200-Kubernetes/app/requirements.txt:
--------------------------------------------------------------------------------
1 | Flask==1.1.2
2 | Keras==2.4.3
3 | h5py==2.10.0
4 | tensorflow-cpu==2.4.0
5 | Pillow==7.2.0
6 | numpy==1.18.5
7 | opencv-python==4.4.0.42
8 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Python
2 | *.pyc
3 |
4 | # Project
5 | .idea
6 | .ipynb_checkpoints
7 |
8 | # Useless Files
9 | _SUCCESS
10 | *.crc
11 |
12 | # Data
13 | *.parquet
14 | *.orc
15 |
--------------------------------------------------------------------------------
/204-Kubernetes-Redis/redis-custom-config/kustomization.yaml:
--------------------------------------------------------------------------------
1 | configMapGenerator:
2 | - name: example-redis-config
3 | files:
4 | - redis.conf
5 | resources:
6 | - redis-pod.yaml
7 |
--------------------------------------------------------------------------------
/100-PyQT/01-simple-example/main.py:
--------------------------------------------------------------------------------
1 | from PyQt5.QtWidgets import QApplication, QLabel
2 | import sys
3 |
# Minimal PyQt5 program: one top-level label plus the Qt event loop.
app = QApplication(sys.argv)
label = QLabel('Hello World')
label.show()
# Hand the event loop's return code to the shell instead of discarding it.
sys.exit(app.exec())
8 |
--------------------------------------------------------------------------------
/005-ray/script.py:
--------------------------------------------------------------------------------
1 | import ray
2 |
3 |
@ray.remote
def hello_world():
    """Print the cluster resources from inside a Ray task and return a greeting."""
    resources = ray.cluster_resources()
    print(resources)
    return "hello world"


# Initialise Ray, run the task remotely, then print the value it returned.
ray.init()
greeting_ref = hello_world.remote()
print(ray.get(greeting_ref))
12 |
--------------------------------------------------------------------------------
/002-Pyspark/macdonald/README.md:
--------------------------------------------------------------------------------
1 | # 1. Tutorial
2 |
3 | ## 1.1 Preparation
4 |
5 | S3 Bucket 하나 만들고, mcdonalds_dataset.csv 업로드 합니다.
6 | 예제에서의 S3 Bucket 이름은 data-emr-tutorial입니다.
7 |
8 | ```bash
9 | $ aws s3 cp mcdonalds_dataset.csv s3://data-emr-tutorial/data/
10 | $ aws s3 ls data-emr-tutorial/data/
11 | ```
12 |
13 | ## 1.2 Run Script
14 |
15 |
--------------------------------------------------------------------------------
/004-Dask/tutorial-01.py:
--------------------------------------------------------------------------------
1 | from dask.distributed import Client, progress
2 | import dask.array as da
3 |
# In-process (threads only) Dask client. Using it as a context manager closes
# the client when the script is done instead of leaving it open until exit.
with Client(processes=False, threads_per_worker=4, n_workers=1, memory_limit='2GB') as client:
    # 10000 x 10000 random array split into 1000 x 1000 chunks.
    x = da.random.random((10000, 10000), chunks=(1000, 1000))
    y = x + x.T
    # Every second row, right half of the columns, averaged per row.
    z = y[::2, 5000:].mean(axis=1)

    print(x)
    print(z.compute())
    print(x.shape)
--------------------------------------------------------------------------------
/202-Kubernetes-deploy-nginx/README.md:
--------------------------------------------------------------------------------
1 | # Tutorial
2 |
3 | ## Deployment
4 |
5 | 먼저 deployment.yaml 파일을 디플로이 시킵니다.
6 |
7 | ```bash
8 | kubectl apply -f deployment.yaml
9 | ```
10 |
11 | 적용뒤에 라벨을 확인합니다.
12 |
13 | ```bash
14 | kubectl describe deployments.apps nginx-deployment
15 | k get pods -l app=nginx
16 | ```
17 |
18 | 이후에 replicas 를 1로 변경해준다음에 `kubectl apply -f deployment.yaml` 실행해서 업데이트 해줍니다.
19 | pods 을 확인해서 1개가 terminating되고 있는지 확인합니다.
20 |
--------------------------------------------------------------------------------
/202-Kubernetes-deploy-nginx/deployment.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: apps/v1 # for versions before 1.9.0 use apps/v1beta2
2 | kind: Deployment
3 | metadata:
4 | name: nginx-deployment
5 | spec:
6 | selector:
7 | matchLabels:
8 | app: nginx
9 | replicas: 2 # tells deployment to run 2 pods matching the template
10 | template:
11 | metadata:
12 | labels:
13 | app: nginx
14 | spec:
15 | containers:
16 | - name: nginx
17 | image: nginx:latest
18 | ports:
19 | - containerPort: 80
--------------------------------------------------------------------------------
/002-Pyspark/macdonald/check_connection.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from random import random
3 | from operator import add
4 |
5 | from pyspark.sql import SparkSession
6 |
# Get (or reuse) the Spark session for the Pi estimation job.
spark = SparkSession.builder.appName("PythonPi").getOrCreate()

# Partition count comes from the first CLI argument; 2 when none is given.
partitions = 2 if len(sys.argv) <= 1 else int(sys.argv[1])
# 100000 samples per partition.
n = partitions * 100000
14 |
15 |
def f(_: int) -> int:
    """Draw one random point in [-1, 1) x [-1, 1) and test it against the unit circle.

    The argument is ignored; it only exists so the function can be mapped
    over a range.

    Returns:
        1 if the point lies inside or on the unit circle, otherwise 0.
    """
    x = random() * 2 - 1
    y = random() * 2 - 1
    return 1 if x ** 2 + y ** 2 <= 1 else 0
20 |
21 |
try:
    # Sum the hit indicators over n samples spread across the partitions.
    count = spark.sparkContext.parallelize(range(1, n + 1), partitions).map(f).reduce(add)
    print("Pi is roughly %f" % (4.0 * count / n))
finally:
    # Stop the session even when the job above raises.
    spark.stop()
26 |
--------------------------------------------------------------------------------
/022-kinesis-data-analytics/consumer.py:
--------------------------------------------------------------------------------
1 | from boto import kinesis as boto_kinesis
2 |
3 |
def main():
    """Tail one shard of ``AndersonStream`` and print the data of each record.

    Reading starts at the ``LATEST`` position. An empty batch does not mean
    the stream is finished (a LATEST iterator usually returns nothing on the
    first call), so the loop keeps polling with a one-second pause and only
    ends when the service stops returning a next iterator, or when the
    process is interrupted.
    """
    from time import sleep  # local import: this module does not import time

    kinesis = boto_kinesis.connect_to_region('us-east-2')

    shard_id = 'shardId-000000000000'  # the stream has only one shard
    shard_it = kinesis.get_shard_iterator('AndersonStream', shard_id, 'LATEST')['ShardIterator']
    print('Latest Shard Iterator:', shard_it)

    while shard_it is not None:
        _out = kinesis.get_records(shard_it, limit=10)
        records = _out['Records']

        for r in records:
            print(r['Data'])

        # A missing/None next iterator means the shard has been closed.
        shard_it = _out.get('NextShardIterator')
        if not records:
            sleep(1)  # nothing new yet; avoid hammering the API


if __name__ == '__main__':
    main()
25 |
--------------------------------------------------------------------------------
/020-kinesis-single-shard/consumer.py:
--------------------------------------------------------------------------------
1 | import json
2 | from pprint import pprint
3 |
4 | from boto import kinesis as boto_kinesis
5 |
6 |
def main():
    """Tail one shard of ``AndersonStream`` and print the data of each record.

    Reading starts at the ``LATEST`` position. An empty batch does not mean
    the stream is finished (a LATEST iterator usually returns nothing on the
    first call), so the loop keeps polling with a one-second pause and only
    ends when the service stops returning a next iterator, or when the
    process is interrupted.
    """
    from time import sleep  # local import: this module does not import time

    kinesis = boto_kinesis.connect_to_region('us-east-2')

    # NOTE(review): the original comment says the stream has a single shard,
    # yet the id ends in 3 - confirm this id against the actual stream.
    shard_id = 'shardId-000000000003'
    shard_it = kinesis.get_shard_iterator('AndersonStream', shard_id, 'LATEST')['ShardIterator']
    print('Latest Shard Iterator:', shard_it)

    while shard_it is not None:
        _out = kinesis.get_records(shard_it, limit=10)
        records = _out['Records']

        for r in records:
            print(r['Data'])

        # A missing/None next iterator means the shard has been closed.
        shard_it = _out.get('NextShardIterator')
        if not records:
            sleep(1)  # nothing new yet; avoid hammering the API


if __name__ == '__main__':
    main()
28 |
--------------------------------------------------------------------------------
/200-Kubernetes/deployment.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Service
3 | metadata:
4 | name: myapp
5 | spec:
6 | type: NodePort
7 | ports:
8 | - protocol: TCP
9 | port: 80
10 | targetPort: 5000
11 | selector:
12 | app: myapp
13 | ---
14 | apiVersion: apps/v1 # for versions before 1.9.0 use apps/v1beta2
15 | kind: Deployment
16 | metadata:
17 | name: myapp-deployment
18 | spec:
19 | selector:
20 | matchLabels:
21 | app: myapp
22 | replicas: 1 # tells deployment to run 1 pod matching the template
23 | template:
24 | metadata:
25 | labels:
26 | app: myapp
27 | spec:
28 | containers:
29 | - name: myapp
30 | image: myapp:latest
31 | imagePullPolicy: Never
32 | ports:
33 | - containerPort: 5000
--------------------------------------------------------------------------------
/200-Kubernetes/Dockerfile:
--------------------------------------------------------------------------------
FROM ubuntu:20.04
# MAINTAINER is deprecated; a maintainer label carries the same information.
LABEL maintainer="Anderson <a141890@gmail.com>"

# Build-time only: stops apt from prompting interactively during installation.
ARG DEBIAN_FRONTEND=noninteractive
ENV TZ=Asia/Seoul
# System packages for Python and the OpenCV/GUI/codec libraries; the apt lists
# are removed in the same layer so they do not end up in the image.
RUN apt-get update -y && \
    apt-get install -y python3-pip python3-dev libgl1-mesa-dev libgl1-mesa-glx libglib2.0-0 \
    build-essential cmake git pkg-config libgtk-3-dev \
    libavcodec-dev libavformat-dev libswscale-dev libv4l-dev \
    libxvidcore-dev libx264-dev libjpeg-dev libpng-dev libtiff-dev \
    gfortran openexr libatlas-base-dev python3-numpy \
    libtbb2 libtbb-dev libdc1394-22-dev && \
    rm -rf /var/lib/apt/lists/*

COPY ./app /app
WORKDIR /app
RUN pip3 install -r requirements.txt

# Runs "python3 app.py" by default; CMD can be overridden at run time.
ENTRYPOINT [ "python3"]
CMD [ "app.py" ]
--------------------------------------------------------------------------------
/204-Kubernetes-Redis/redis-custom-config/redis-pod.yaml:
--------------------------------------------------------------------------------
apiVersion: v1
kind: Pod
metadata:
  name: redis
spec:
  containers:
  - name: redis
    image: redis:5.0.4
    # Start redis with the configuration file mounted from the ConfigMap.
    command:
      - redis-server
      - "/redis-master/redis.conf"
    env:
    - name: MASTER
      value: "true"
    ports:
    - containerPort: 6379
    resources:
      limits:
        cpu: "0.1"
    volumeMounts:
    - mountPath: /redis-master-data
      name: data
    - mountPath: /redis-master
      name: config
  volumes:
    - name: data
      emptyDir: {}
    - name: config
      configMap:
        name: example-redis-config
        items:
        # The key must match the ConfigMap entry. kustomization.yaml generates
        # it from "files: - redis.conf", so the key is the file name.
        - key: redis.conf
          path: redis.conf
34 |
--------------------------------------------------------------------------------
/005-ray/submit.py:
--------------------------------------------------------------------------------
1 | from ray.job_submission import JobSubmissionClient, JobStatus
2 | import time
3 |
client = JobSubmissionClient("http://localhost:8265")
job_id = client.submit_job(
    entrypoint="python script.py",
    runtime_env={
        # working_dir is needed so that script.py is uploaded to the cluster
        # (handled internally by _upload_working_dir_if_needed).
        'working_dir': './'
    }
)
print(job_id)


def wait_until_status(job_id, status_to_wait_for, timeout_seconds=5):
    """Poll the job once per second until it reaches one of the given statuses.

    Args:
        job_id: Id returned by ``client.submit_job``.
        status_to_wait_for: Collection of statuses that end the wait.
        timeout_seconds: Give up after roughly this many seconds.

    Returns:
        The last status observed, so the caller can tell a timeout from
        completion; ``None`` if no poll happened at all.
    """
    start = time.time()
    status = None
    while time.time() - start <= timeout_seconds:
        status = client.get_job_status(job_id)
        print(f"status: {status}")
        if status in status_to_wait_for:
            break
        time.sleep(1)
    return status


terminal_statuses = {JobStatus.SUCCEEDED, JobStatus.STOPPED, JobStatus.FAILED}
final_status = wait_until_status(job_id, terminal_statuses)
if final_status not in terminal_statuses:
    # The wait timed out; the logs fetched below may be incomplete.
    print(f"job {job_id} is still {final_status} after the timeout")
logs = client.get_job_logs(job_id)
print(logs)
27 |
--------------------------------------------------------------------------------
/003-Shared-Memory/shared-memory-list.py:
--------------------------------------------------------------------------------
1 | from multiprocessing.shared_memory import ShareableList
2 | import numpy as np
3 | from multiprocessing import Process
4 |
5 |
def daemon_run(name):
    """Attach to the ShareableList called *name*, print it, and overwrite its first four items.

    Args:
        name: Name of the shared-memory block backing an existing ShareableList.

    The attached handle is closed before returning; the creator remains
    responsible for unlinking the block.
    """
    a = ShareableList(name=name)
    try:
        data = np.array(a)
        print(f'[Processor] data: {data[:4]} | size: {len(data)}')
        # Replace the first four slots in place; the creator sees the new values.
        for i, v in enumerate(['def', -9999999999, 0.123456789123456, 8889999]):
            a[i] = v
    finally:
        a.shm.close()
12 |
13 |
def main():
    """Create a ShareableList, let a child process rewrite it, then print the result.

    The shared-memory block is closed and unlinked on the way out, so nothing
    is left behind even if the child or the printing fails.
    """
    # Create the shared memory.
    a = ShareableList(['abc', 9999999, -100, 0.123456789])
    try:
        # The child attaches by name, prints the list and overwrites its items.
        p = Process(target=daemon_run, args=(a.shm.name,))
        p.start()
        p.join()

        data = np.array(a)
        print(f'[Main] data: {data[:4]} | size: {len(a)}')
    finally:
        a.shm.close()
        a.shm.unlink()


if __name__ == '__main__':
    main()
28 |
--------------------------------------------------------------------------------
/200-Kubernetes/app/app.py:
--------------------------------------------------------------------------------
1 | from tempfile import gettempdir
2 |
3 | import cv2
4 | import numpy as np
5 | from flask import Flask, request, jsonify
6 | from keras.models import load_model
7 |
# Flask application object and the Keras classifier, both created at import
# time. The model file name is spelled 'fasion_model.h5' on disk.
app = Flask(__name__)
model = load_model('fasion_model.h5')


@app.route('/')
def hello_world():
    """Serve a fixed greeting at the site root."""
    greeting = 'Hello! Anderson!'
    return greeting
15 |
16 |
@app.route('/predict', methods=['POST'])
def predict():
    """Classify the image uploaded in the ``image`` form field.

    The upload is decoded in memory rather than through a shared temp file,
    so concurrent requests cannot overwrite each other's image. The first
    channel of the decoded image is fed to the model as a batch of one.

    Returns:
        JSON ``{'prediction': <class index>, 'prob': <score>}``, or a 400
        response with an ``error`` message when the upload cannot be decoded
        as an image.
    """
    f = request.files["image"]
    raw = np.frombuffer(f.read(), dtype=np.uint8)
    # imdecode rejects an empty buffer and returns None for non-image data.
    decoded = cv2.imdecode(raw, cv2.IMREAD_COLOR) if raw.size else None
    if decoded is None:
        return jsonify({'error': 'uploaded file is not a readable image'}), 400

    img = decoded[:, :, 0]
    img = np.expand_dims(img, axis=0)
    pred_y = model.predict(img)[0]
    pred_label = int(np.argmax(pred_y))
    prob = float(pred_y[pred_label])
    return jsonify({'prediction': pred_label, 'prob': prob})
28 |
29 |
if __name__ == '__main__':
    import os

    # The server listens on every interface, so the interactive debugger must
    # not be on by default; opt in explicitly with FLASK_DEBUG=1.
    debug = os.environ.get('FLASK_DEBUG', '').lower() in ('1', 'true', 'yes')
    app.run(debug=debug, host='0.0.0.0')
32 |
--------------------------------------------------------------------------------
/100-PyQT/02-widgets/main.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from typing import Optional
3 |
4 | from PyQt5.QtCore import QObject
5 | from PyQt5.QtWidgets import *
6 |
7 |
class WidgetGallery(QDialog):
    """Dialog with a single row: a "Style" label and a combo box of Qt styles."""

    def __init__(self, parent: Optional[QWidget] = None):
        """Build the dialog's widgets and layouts.

        Args:
            parent: Optional parent widget passed on to ``QDialog``.
        """
        super().__init__(parent)
        self.originalPalette = QApplication.palette()

        styleComboBox = QComboBox()
        styleComboBox.addItems(QStyleFactory.keys())

        styleLabel = QLabel("&Style:")
        styleLabel.setBuddy(styleComboBox)

        topLayout = QHBoxLayout()
        topLayout.addWidget(styleLabel)
        # The combo box has to be in a layout too; otherwise it is never
        # shown and the label's buddy has nothing to refer to.
        topLayout.addWidget(styleComboBox)

        mainLayout = QGridLayout()
        mainLayout.addLayout(topLayout, 0, 0, 1, 2)
        self.setLayout(mainLayout)
25 |
26 |
if __name__ == '__main__':
    # Create the application, show the gallery dialog, and exit with the
    # event loop's return code.
    app = QApplication(sys.argv)
    gallery = WidgetGallery()
    gallery.show()
    exit_code = app.exec()
    sys.exit(exit_code)
32 |
--------------------------------------------------------------------------------
/003-Shared-Memory/shared-memory-bytearray.py:
--------------------------------------------------------------------------------
1 | from multiprocessing.shared_memory import SharedMemory
2 | import numpy as np
3 | from multiprocessing import Process
4 |
5 |
def daemon_run(name):
    """Attach to the shared-memory block *name*, print it, and overwrite its first four bytes.

    Args:
        name: Name of an existing ``SharedMemory`` block.

    The attached handle is closed before returning; the creator remains
    responsible for unlinking the block.
    """
    shm = SharedMemory(name=name)
    try:
        data = np.array(shm.buf)  # copy of the buffer contents
        print(f'[Processor] data: {data[:4]} | size: {len(data)}')
        shm.buf[:4] = bytearray([1, 2, 3, 4])
    finally:
        shm.close()
11 |
12 |
def main():
    """Create an 8 MiB shared-memory block, let a child process modify it, and print the result.

    The block is closed and unlinked on the way out. The leftover
    ``ipdb.set_trace()`` breakpoint has been removed so the script runs to
    completion without a debugger installed.
    """
    # Create the shared memory.
    shm = SharedMemory(create=True, size=1024*1024*8)
    try:
        shm.buf[:4] = bytearray([255, 11, 0, 100])  # each value must be in [0, 256)

        # The child attaches by name, prints the buffer and rewrites 4 bytes.
        p = Process(target=daemon_run, args=(shm.name,))
        p.start()
        p.join()

        data = np.array(shm.buf)
        print(f'[Main] data: {data[:4]} | size: {len(shm.buf)}')
    finally:
        shm.close()
        shm.unlink()


if __name__ == '__main__':
    main()
31 |
--------------------------------------------------------------------------------
/203-Kubernetes-Service-MySQL/deployment.yaml:
--------------------------------------------------------------------------------
apiVersion: v1
kind: Service
metadata:
  name: mysql
spec:
  ports:
  - port: 3306
  selector:
    app: mysql
  clusterIP: None
---
apiVersion: apps/v1 # for versions before 1.9.0 use apps/v1beta2
kind: Deployment
metadata:
  name: mysql
spec:
  selector:
    matchLabels:
      app: mysql
  strategy:
    type: Recreate
  template:
    metadata:
      labels:
        app: mysql
    spec:
      containers:
      - image: mysql:5.6
        name: mysql
        env:
          # Use secret in real usage
        - name: MYSQL_ROOT_PASSWORD
          # Env values must be strings; an unquoted 1234 is parsed as an
          # integer and rejected by the API server.
          value: "1234"
        ports:
        - containerPort: 3306
          name: mysql
        volumeMounts:
        - name: mysql-persistent-storage
          mountPath: /var/lib/mysql
      volumes:
      - name: mysql-persistent-storage
        persistentVolumeClaim:
          claimName: mysql-pv-claim
--------------------------------------------------------------------------------
/022-kinesis-data-analytics/producer.py:
--------------------------------------------------------------------------------
1 | import json
2 | import random
3 | from time import sleep
4 |
5 | from boto import kinesis as boto_kinesis
6 | from faker import Faker
7 |
8 |
def generate_data(faker):
    """Build one random person record.

    Args:
        faker: Object providing ``name()`` and ``job()``.

    Returns:
        dict with keys ``name``, ``age`` (10..20), ``gender`` ('M'/'F'),
        ``score`` (40..65 in steps of 5) and ``job``.
    """
    record = {}
    record['name'] = faker.name()
    record['age'] = random.randint(10, 20)
    record['gender'] = random.choice(['M', 'F'])
    record['score'] = random.choice(range(40, 70, 5))
    record['job'] = faker.job()
    return record
15 |
16 |
def main():
    """Make sure ``AndersonStream`` exists and is usable, then put 50 random records.

    A stream is listed as soon as creation starts, but records can only be
    written once its status is ACTIVE, so readiness is checked with
    ``describe_stream`` rather than ``list_streams``.
    """
    stream_name = 'AndersonStream'
    faker = Faker()
    kinesis = boto_kinesis.connect_to_region('us-east-2')
    print('Connected')

    if stream_name not in kinesis.list_streams()['StreamNames']:
        kinesis.create_stream(stream_name, 1)
        print('AndersonStream Stream has been created')

    # Poll once per second until the stream accepts writes.
    while True:
        status = kinesis.describe_stream(stream_name)['StreamDescription']['StreamStatus']
        print(stream_name, status)
        if status == 'ACTIVE':
            break
        sleep(1)

    for _ in range(50):
        data = generate_data(faker)
        # Two partition keys: 'partitionkey0' and 'partitionkey1'.
        res = kinesis.put_record(stream_name, json.dumps(data), 'partitionkey' + str(random.choice([0, 1])))
        print('PUT', data)
        print(' ', res['SequenceNumber'], '\n')

    # kinesis.delete_stream('AndersonStream')


if __name__ == '__main__':
    main()
44 |
--------------------------------------------------------------------------------
/101-GeoHash/images/law-of-cosine.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
--------------------------------------------------------------------------------
/003-Shared-Memory/shared-string.py:
--------------------------------------------------------------------------------
1 | from multiprocessing.shared_memory import SharedMemory
2 | import numpy as np
3 | from multiprocessing import Process
4 |
5 |
def daemon_run(in_name, out_name):
    """Read an EUC-KR string from one shared block and write a reply into another.

    Args:
        in_name: Name of the shared-memory block holding the request text.
        out_name: Name of the shared-memory block that receives the reply.

    Raises:
        AssertionError: If the request text is not the expected sample string.

    Both attached handles are closed before returning; the creator remains
    responsible for unlinking the blocks.
    """
    in_shm = SharedMemory(name=in_name)
    out_shm = SharedMemory(name=out_name)
    try:
        # Only the first 1024 bytes are read; strip() drops the padding.
        xb = in_shm.buf[:1024].tobytes()
        text = xb.decode('euc-kr').strip()
        print(f'[Processor] data: {text} | byte: {len(xb)}')
        assert text == '한글1234 ABC %^&'

        xb = '프로세서에서 리턴된 스트링 1234!'.encode('euc-kr')
        out_shm.buf[:len(xb)] = xb
    finally:
        in_shm.close()
        out_shm.close()
16 |
17 |
def main():
    """Exchange EUC-KR strings with a child process through two shared-memory blocks.

    The parent writes a request into the first block, waits for the child to
    finish, and only then reads the reply from the second block. Both blocks
    are closed and unlinked on the way out.

    Raises:
        AssertionError: If the reply is not the expected string.
    """
    # Create the shared memory.
    shm1 = SharedMemory(create=True, size=1024 * 1024 * 64)
    shm2 = SharedMemory(create=True, size=1024 * 1024 * 64)
    try:
        # Fill both blocks with spaces so unused bytes decode and strip cleanly.
        shm1.buf[:] = (' ' * len(shm1.buf)).encode('euc-kr')
        shm2.buf[:] = (' ' * len(shm2.buf)).encode('euc-kr')

        x = '한글1234 ABC %^&'
        xb = x.encode('euc-kr')
        shm1.buf[:len(xb)] = xb
        p = Process(target=daemon_run, args=(shm1.name, shm2.name))
        p.start()
        # Wait for the child before reading; otherwise the reply may not
        # have been written yet.
        p.join()

        ob = shm2.buf[:1024].tobytes()
        text = ob.decode('euc-kr').strip()
        print(f'[Main ] data: {text} | byte: {len(ob)}')

        assert text == '프로세서에서 리턴된 스트링 1234!'
    finally:
        for shm in (shm1, shm2):
            shm.close()
            shm.unlink()


if __name__ == '__main__':
    main()
45 |
--------------------------------------------------------------------------------
/020-kinesis-single-shard/producer.py:
--------------------------------------------------------------------------------
1 | import json
2 | import random
3 | from time import sleep
4 |
5 | from boto import kinesis as boto_kinesis
6 | from faker import Faker
7 |
8 |
def generate_data(faker):
    """Build one random person record with a nested ``data`` payload.

    Args:
        faker: Object providing ``name()``.

    Returns:
        dict with keys ``name``, ``age`` (10..20), ``gender`` ('M'/'F'),
        ``score`` (40..65 in steps of 5) and ``data`` (``id`` 0..10000 and
        ``type`` 'a'/'b'/'c').
    """
    record = {}
    record['name'] = faker.name()
    record['age'] = random.randint(10, 20)
    record['gender'] = random.choice(['M', 'F'])
    record['score'] = random.choice(range(40, 70, 5))
    payload = {}
    payload['id'] = random.randint(0, 10000)
    payload['type'] = random.choice(['a', 'b', 'c'])
    record['data'] = payload
    return record
16 |
17 |
def main():
    """Make sure ``AndersonStream`` exists and is usable, then put random records forever.

    A stream is listed as soon as creation starts, but records can only be
    written once its status is ACTIVE, so readiness is checked with
    ``describe_stream`` rather than ``list_streams``. The producing loop has
    no exit condition; stop the script by interrupting it.
    """
    stream_name = 'AndersonStream'
    faker = Faker()
    kinesis = boto_kinesis.connect_to_region('us-east-2')
    print('Connected')

    if stream_name not in kinesis.list_streams()['StreamNames']:
        kinesis.create_stream(stream_name, 1)
        print('AndersonStream Stream has been created')

    # Poll once per second until the stream accepts writes.
    while True:
        status = kinesis.describe_stream(stream_name)['StreamDescription']['StreamStatus']
        print(stream_name, status)
        if status == 'ACTIVE':
            break
        sleep(1)

    i = 0
    while True:
        i += 1
        data = generate_data(faker)
        data['i'] = i  # running sequence number of the record
        # Eleven partition keys: 'partitionkey0' .. 'partitionkey10'.
        res = kinesis.put_record(stream_name, json.dumps(data), 'partitionkey' + str(random.randint(0, 10)))
        print(f'{i:2}', data)
        print(' ', res, '\n')

    # kinesis.delete_stream('AndersonStream')


if __name__ == '__main__':
    main()
47 |
--------------------------------------------------------------------------------
/200-Kubernetes/README.md:
--------------------------------------------------------------------------------
1 | # Tutorial
2 |
3 | ## Preparation
4 |
5 | 1. 먼저 [Kaggle Fashion MNIST](https://www.kaggle.com/zalando-research/fashionmnist) 에서 데이터를 다운로드 받습니다.
6 |
7 |
8 | ## Postman 설정
9 |
10 | - POST 설정
11 | - body -> form-data
12 | - key: image
13 | - key 에서 파일로 변경
14 | - value: 파일 업로드
15 |
16 | ## Docker 확인
17 |
18 | ```bash
19 | docker build -t myapp .
20 | docker run -p 5000:5000 myapp
21 | ```
22 |
23 | Daemon으로도 실행
24 |
25 | ```bash
26 | docker run -d -p 5000:5000 --name myapp myapp:latest
27 | ```
28 |
29 | Postman에서 확인 합니다.
30 |
31 | ## Docker Hub로 올리기
32 |
33 | 태그걸어주고 Docker Hub에 올립니다.
34 |
35 | ```bash
36 | docker tag myapp andersonjo/myapp
37 | docker push andersonjo/myapp
38 | ```
39 |
40 | ## Docker Hub에서 Pull 안하고 Minikube Image 사용하는 방법
41 |
42 | ```bash
43 | eval $(minikube docker-env)
44 | ```
45 |
46 | 그 다음 build 를 해줍니다.
47 |
48 | ```bash
49 | docker build -t myapp .
50 | ```
51 |
52 | Minikube안에서 실행을 합니다.
53 |
54 | ```bash
55 | kubectl run myapp-kube --image=myapp:latest --image-pull-policy=Never
56 | ```
57 |
58 | pods을 확인 하고, Postman에서도 확인합니다.
59 |
60 | ```bash
61 | kubectl get pods
62 | kubectl port-forward myapp-kube 5000:5000
63 | ```
64 |
65 | ## stateful 로 해보기
66 |
67 | 아래의 두개의 명령이 되어 있어야 합니다.
68 |
69 | ```bash
70 | eval $(minikube docker-env)
71 | docker build -t myapp .
72 | ```
73 |
74 | 배포합니다.
75 |
76 | ```bash
77 | kubectl apply -f deployment.yaml
78 |
79 | kubectl port-forward svc/myapp 5000:80
80 | ```
81 |
82 | 최종적으로 Minikube로 서비스 포트를 열수 있습니다.
83 |
84 | ```bash
85 | minikube service myapp
86 | ```
--------------------------------------------------------------------------------
/005-ray/10-ray-serving-tutorial/app.py:
--------------------------------------------------------------------------------
1 | from ray import serve
2 | from starlette.requests import Request
3 | from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
4 |
5 |
@serve.deployment(num_replicas=2, ray_actor_options={"num_cpus": 1, "num_gpus": 0})
class Translator:
    """Ray Serve deployment that translates text with the M2M100 model.

    The tokenizer's source language is set to 'en' and the generated output
    is forced to start with the 'ko' language token.
    """

    def __init__(self):
        """Load the tokenizer and the model, and put the model in eval mode."""
        checkpoint = "facebook/m2m100_1.2B"

        tokenizer = M2M100Tokenizer.from_pretrained(checkpoint)
        tokenizer.src_lang = 'en'
        self.tokenizer = tokenizer

        model = M2M100ForConditionalGeneration.from_pretrained(checkpoint)
        model.eval()
        self.model = model

    def translate(self, text: str) -> str:
        """Return the translation of *text* (first decoded sequence)."""
        target_id = self.tokenizer.get_lang_id('ko')
        inputs = self.tokenizer(text, return_tensors="pt")
        output_ids = self.model.generate(
            **inputs,
            forced_bos_token_id=target_id,
            max_length=200,
            use_cache=True,
        )
        decoded = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)
        return decoded[0]

    async def __call__(self, http_request: Request) -> str:
        """Handle an HTTP request whose JSON body is the text to translate."""
        source_text: str = await http_request.json()
        return self.translate(source_text)


# Bound deployment for Ray Serve to run.
translator = Translator.bind()
32 |
33 | # if __name__ == '__main__':
34 | # translator = Translator()
35 | # print(translator.translate('self-belief and hard work will always earn you success'))
36 |
--------------------------------------------------------------------------------
/100-PyQT/03-QThread/qthread.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from datetime import datetime
3 | from multiprocessing import Process, Queue
4 |
5 | import numpy as np
6 | from PyQt5.QtCore import QThread, pyqtSignal, pyqtSlot, QCoreApplication
7 | from PyQt5.QtWidgets import QMainWindow, QApplication
8 |
9 |
def producer(que: Queue):
    """Put the same random digit string on *que* 100000 times without blocking.

    The payload is the concatenation of 20 ``np.random.rand()`` values
    rendered as text; it is built once and reused for every put.
    """
    payload = ''.join(str(np.random.rand()) for _ in range(20))
    remaining = 100000
    while remaining > 0:
        que.put(payload, block=False)
        remaining -= 1
14 |
15 |
class Consumer(QThread):
    """Thread that takes items off a queue and emits each one on ``poped``."""

    # Carries one queue item as a string.
    poped = pyqtSignal(str)

    def __init__(self, que: Queue):
        """Remember the queue to read from."""
        super().__init__()
        self.que = que

    def run(self):
        """Block on the queue forever, emitting every item received."""
        # NOTE(review): there is no exit path, so the thread only ends with
        # the process - confirm that is intended.
        while True:
            self.poped.emit(self.que.get())
27 |
28 |
class MyWindow(QMainWindow):
    """Main window that counts received items in its status bar."""

    def __init__(self, que):
        """Set up the window and start a ``Consumer`` thread reading *que*."""
        super().__init__()
        self.cnt = 0

        self.setWindowTitle('Test Haha')
        self.setGeometry(200, 200, 300, 200)
        status_bar = self.statusBar()
        status_bar.showMessage('Hello!')
        status_bar.setStyleSheet('border:1px solid #333333;')

        consumer = Consumer(que)
        consumer.poped.connect(self.process_data)
        self.consumer = consumer
        consumer.start()

    @pyqtSlot(str)
    def process_data(self, data):
        """Count one item, show the count, and quit the app at 100000 items."""
        self.cnt += 1
        self.statusBar().showMessage(str(self.cnt))
        if self.cnt < 100000:
            return
        QCoreApplication.instance().quit()
48 |
49 |
if __name__ == '__main__':
    # Time the whole run: producer process start-up through event loop exit.
    start_dt = datetime.now()

    que = Queue()
    p = Process(name='producer', target=producer, args=(que,), daemon=True)
    p.start()

    # Main Application
    app = QApplication(sys.argv)
    window = MyWindow(que)
    window.show()
    app.exec_()

    elapsed = datetime.now() - start_dt
    print(elapsed.total_seconds())
63 |
--------------------------------------------------------------------------------
/100-PyQT/03-QThread/signal_with_list.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from datetime import datetime
3 | from multiprocessing import Process, Queue
4 |
5 | import numpy as np
6 | from PyQt5.QtCore import QThread, pyqtSignal, pyqtSlot, QCoreApplication
7 | from PyQt5.QtWidgets import QMainWindow, QApplication
8 |
9 |
def producer(que: Queue):
    """Put the same random digit string on *que* 100000 times without blocking.

    The payload is the concatenation of 20 ``np.random.rand()`` values
    rendered as text; it is built once and reused for every put.
    """
    pieces = (str(np.random.rand()) for _ in range(20))
    message = ''.join(pieces)
    sent = 0
    while sent < 100000:
        que.put(message, block=False)
        sent += 1
14 |
15 |
class Consumer(QThread):
    """Thread that takes items off a queue and emits ``[1, item]`` on ``poped``."""

    # Key point of this example: the signal is declared with the Python
    # ``list`` type, so a list can be passed through it.
    poped = pyqtSignal(list)

    def __init__(self, que: Queue):
        """Remember the queue to read from."""
        super().__init__()
        self.que = que

    def run(self):
        """Block on the queue forever, emitting a two-element list per item."""
        # NOTE(review): there is no exit path, so the thread only ends with
        # the process - confirm that is intended.
        while True:
            item = self.que.get()
            self.poped.emit([1, item])
27 |
28 |
class MyWindow(QMainWindow):
    """Main window that counts received lists in its status bar."""

    def __init__(self, que):
        """Set up the window and start a ``Consumer`` thread reading *que*."""
        super().__init__()
        self.cnt = 0

        self.setWindowTitle('Test Haha')
        self.setGeometry(200, 200, 300, 200)
        bar = self.statusBar()
        bar.showMessage('Hello!')
        bar.setStyleSheet('border:1px solid #333333;')

        worker = Consumer(que)
        worker.poped.connect(self.process_data)
        self.consumer = worker
        worker.start()

    @pyqtSlot(list)
    def process_data(self, data):
        """Count one item, show the count, and quit the app at 100000 items."""
        self.cnt += 1
        self.statusBar().showMessage(str(self.cnt))
        if self.cnt < 100000:
            return
        QCoreApplication.instance().quit()
48 |
49 |
if __name__ == '__main__':
    # Time the whole run: producer process start-up through event loop exit.
    start_dt = datetime.now()

    que = Queue()
    p = Process(name='producer', target=producer, args=(que,), daemon=True)
    p.start()

    # Main Application
    app = QApplication(sys.argv)
    window = MyWindow(que)
    window.show()
    app.exec_()

    elapsed = datetime.now() - start_dt
    print(elapsed.total_seconds())
63 |
--------------------------------------------------------------------------------
/100-PyQT/03-QThread/signal_with_python_object.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from datetime import datetime
3 | from multiprocessing import Process, Queue
4 |
5 | import numpy as np
6 | from PyQt5.QtCore import QThread, pyqtSignal, pyqtSlot, QCoreApplication
7 | from PyQt5.QtWidgets import QMainWindow, QApplication
8 |
9 |
def producer(que: Queue):
    """Push the same random payload string onto ``que`` 100000 times.

    The payload joins the string form of 20 ``np.random.rand()`` draws and is
    created once, before the first ``put``; puts never block.
    """
    draws = [np.random.rand() for _ in range(20)]
    data = ''.join(map(str, draws))
    for _ in range(100000):
        que.put(data, block=False)
14 |
15 |
class Consumer(QThread):
    """Thread that wraps each queue item in an object and emits it via ``poped``."""

    # This is the important part: the signal has to be declared with
    # 'PyQt_PyObject' to carry an arbitrary Python object.
    poped = pyqtSignal('PyQt_PyObject')

    def __init__(self, que: Queue):
        super().__init__()
        self.que = que

    def run(self):
        """Block on the queue forever, emitting one ``Data`` wrapper per item."""

        class Data:
            # Minimal holder exposing the received item as ``value``.
            def __init__(self, value):
                self.value = value

        while True:
            self.poped.emit(Data(self.que.get()))
32 |
33 |
class MyWindow(QMainWindow):
    """Window showing how many objects the ``Consumer`` thread has delivered."""

    def __init__(self, que):
        super().__init__()
        self.setWindowTitle('Test Haha')
        self.setGeometry(200, 200, 300, 200)
        bar = self.statusBar()
        bar.showMessage('Hello!')
        bar.setStyleSheet('border:1px solid #333333;')

        self.consumer = Consumer(que)
        self.consumer.poped.connect(self.process_data)
        self.consumer.start()
        # Running total of received objects.
        self.cnt = 0

    @pyqtSlot('PyQt_PyObject')
    def process_data(self, data):
        """Increase the counter, display it, and quit once 100000 is reached."""
        self.cnt = self.cnt + 1
        self.statusBar().showMessage(f'{self.cnt}')
        finished = self.cnt >= 100000
        if finished:
            QCoreApplication.instance().quit()
53 |
54 |
if __name__ == '__main__':
    # Timing starts before the producer process is spawned.
    start_dt = datetime.now()
    que = Queue()

    p = Process(name='producer', target=producer, args=(que,), daemon=True)
    p.start()

    # Main Application
    app = QApplication(sys.argv)
    window = MyWindow(que)
    window.show()
    app.exec_()

    # Seconds elapsed until the event loop returned.
    duration = (datetime.now() - start_dt).total_seconds()
    print(duration)
68 |
--------------------------------------------------------------------------------
/006-pyarrow/test_dataset.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timedelta
2 | from pathlib import Path
3 | from plistlib import Data
4 |
5 | import pandas as pd
6 | import pyarrow as pa
7 | from torch.utils.data import DataLoader
8 |
9 | from pyarrow_torch import PyArrowDataset
10 |
11 |
def test_dataset():
    """Check random access and the row-group read counter of ``CustomDataset``."""
    create_data()
    dataset = CustomDataset("./data")

    # (index to fetch, cumulative ``read_cnt`` expected right after the fetch)
    expectations = (
        (50000, 1),
        (0, 2),
        (500000, 3),
        (100, 4),
        (32768, 5),
        (32767, 6),
        (32769, 7),
        (32770, 7),
        (32771, 7),
        (8000000, 8),
        (8000001, 8),
    )
    for idx, reads in expectations:
        assert dataset[idx] == idx
        assert dataset.read_cnt == reads
48 |
49 |
def test_dataloader_batch():
    """Iterate a sequential ``DataLoader`` twice and check every batch of two.

    The loader is consumed two times on purpose, so that re-iteration is
    covered; the second pass also checks the number of batches against
    ``len(loader)``.
    """
    # Idempotent: builds ./data only when missing, so this test no longer
    # depends on test_dataset() having run first.
    create_data()

    dataset = CustomDataset("./data")
    loader = DataLoader(dataset, batch_size=2)
    for i, row in enumerate(loader):
        assert [i * 2, i * 2 + 1] == row.tolist()

    i = 0
    for i, row in enumerate(loader):
        assert [i * 2, i * 2 + 1] == row.tolist()
    assert (i + 1) == len(loader)
61 |
62 |
def test_dataloader_workers():
    """Check that batches keep their sequential order with 8 worker processes."""
    # Idempotent: builds ./data only when missing, so this test can run alone.
    create_data()

    dataset = CustomDataset("./data")
    loader = DataLoader(dataset, batch_size=2, num_workers=8)
    for i, row in enumerate(loader):
        assert [i * 2, i * 2 + 1] == row.tolist()
68 |
69 |
def test_dataloader_random():
    """Check that a shuffled loader yields only distinct values, on two passes.

    Each pass collects every value into a set; the set size must equal
    ``len(loader) * 10``, i.e. no value may appear twice within a pass.
    """
    # Idempotent: builds ./data only when missing, so this test can run alone.
    create_data()

    dataset = CustomDataset("./data")
    loader = DataLoader(dataset, batch_size=10, shuffle=True, num_workers=2, pin_memory=True)

    for _ in range(2):
        total = set()
        for row in loader:
            total |= set(row.tolist())
        assert len(total) == len(loader) * 10
82 |
83 |
class CustomDataset(PyArrowDataset):
    """``PyArrowDataset`` that exposes only the ``idx`` field of each row."""

    def __getitem__(self, idx):
        """Return the ``idx`` value of the row stored at position ``idx``."""
        return super().__getitem__(idx)['idx']
89 |
90 |
def create_data():
    """Create the partitioned Parquet fixture under ``./data`` if it is missing.

    The table has 50,000,000 rows with an ``idx`` column (0..N-1) and a ``dt``
    column holding the calendar date of ``2023-01-01 + idx * 10 ms``; the
    dataset is partitioned by ``dt``. Nothing happens when ``./data`` exists.
    """
    # ``import pyarrow`` alone does not load the ``parquet`` submodule, so
    # import it explicitly instead of relying on another module doing so.
    import pyarrow.parquet as pq

    if Path('./data').exists():
        return

    df = pd.DataFrame({"idx": range(50000000)})
    dt = datetime(2023, 1, 1)
    # Vectorised equivalent of applying
    # ``(dt + timedelta(milliseconds=x * 10)).date()`` to every row.
    timestamps = pd.Timestamp(dt) + pd.to_timedelta(df["idx"] * 10, unit="ms")
    df["dt"] = timestamps.dt.date
    pq.write_to_dataset(
        pa.Table.from_pandas(df),
        root_path="data",
        partition_cols=["dt"],
        use_legacy_dataset=False,
    )
104 |
--------------------------------------------------------------------------------
/003-Shared-Memory/shared_memory_queue.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime
2 | from multiprocessing import get_context, Process
3 | from multiprocessing import Queue
4 | from multiprocessing.shared_memory import SharedMemory
5 | from typing import Optional
6 |
7 | from tqdm import tqdm
8 |
9 |
class SharedQueue:
    """Queue facade that moves payload bytes through a shared-memory segment.

    ``put`` copies the payload into the segment and sends only its
    ``(start, end)`` offsets through ``queue``; ``get`` receives the offsets
    and copies the bytes back out. Writes advance linearly and restart at
    offset 0 when the next payload would not fit. Nothing checks whether the
    region being overwritten has already been read.
    """

    def __init__(self, queue: Queue, sq_name=None, shared_size=1024 * 1024 * 8):
        """Create a new segment, or attach to an existing one.

        Args:
            queue: Queue used to transport ``(start, end)`` offsets.
            sq_name: Name of an existing segment to attach to; ``None``
                creates a new segment of ``shared_size`` bytes.
            shared_size: Size in bytes of a newly created segment. Ignored
                when attaching, where the real segment size is used.
        """
        self.queue = queue
        if sq_name is None:
            self.shm = SharedMemory(create=True, size=shared_size)
        else:
            self.shm = SharedMemory(name=sq_name)
            # The segment already exists: derive the wrap limit from its real
            # size, not from the default argument, so writes cannot run past
            # the end of the buffer.
            shared_size = self.shm.size
        self._shared_size = shared_size - 1
        self._cur_idx = 0

    @property
    def name(self):
        """Name of the underlying shared-memory segment."""
        return self.shm.name

    def get(self, block: bool = True, timeout: Optional[float] = None, encoding='euc-kr') -> Optional[bytes]:
        """Receive the next payload as ``bytes``.

        ``block`` and ``timeout`` are forwarded to ``queue.get``. ``encoding``
        is accepted for compatibility and is not used. Returns ``None`` when
        the queue entry itself is ``None``.
        """
        r = self.queue.get(block=block, timeout=timeout)
        if r is None:
            return None

        start, end = r
        return self.shm.buf[start:end].tobytes()

    def put(self, obj: bytes, block: bool = True, timeout: Optional[float] = None):
        """Copy ``obj`` into the segment and enqueue its offsets.

        ``block`` and ``timeout`` are forwarded to ``queue.put``.

        Raises:
            ValueError: If ``obj`` is larger than the whole segment.
        """
        size = len(obj)
        if size > self.shm.size:
            raise ValueError(
                f'payload of {size} bytes does not fit into the '
                f'{self.shm.size}-byte shared memory segment'
            )

        start = self._cur_idx
        end = start + size
        if end >= self._shared_size:
            # Not enough room behind the cursor: restart at the beginning.
            start, end = 0, size
        self._cur_idx = end

        self.shm.buf[start:end] = obj
        self.queue.put((start, end), block=block, timeout=timeout)

    def close(self):
        """Close this object's access to the shared-memory segment."""
        self.shm.close()

    def unlink(self):
        """Request destruction of the segment (call once, from the creator)."""
        self.shm.unlink()
42 |
43 |
def daemon_run1(queue: Queue, sq_name: str):
    """Consume payloads from a ``SharedQueue`` and verify them in order.

    Attaches to the shared-memory segment named ``sq_name``, decodes every
    payload as EUC-KR and asserts that it equals the expected numbered
    message. Stops when the decoded text is ``'end'``. Payloads that fail to
    decode are printed and skipped without advancing the expected number.

    Args:
        queue: Queue carrying the payload offsets.
        sq_name: Name of the existing shared-memory segment.
    """
    answer = '{0}안녕하세요 파이썬! 123456789 !@#$%^&*() 하하! 제타벨류 가즈아!'
    cnt = 0

    sq = SharedQueue(queue, sq_name=sq_name)
    try:
        while True:
            btext = sq.get()
            try:
                text = btext.decode('euc-kr')
            except UnicodeDecodeError as e:
                print(e)
                print(btext)
                continue

            if text == 'end':
                break
            assert text == answer.format(cnt), f'ERROR: text:{text}'
            cnt += 1
    finally:
        # Release this process's mapping of the segment on every exit path.
        sq.shm.close()
62 |
63 |
def daemon_run2(queue: Queue):
    """Consume strings from ``queue`` and verify them in order.

    Reads until the item ``'end'`` arrives; the k-th item read before that
    (counting from 0) must equal the expected message numbered k.
    """
    answer = '{0}안녕하세요 파이썬! 123456789 !@#$%^&*() 하하! 제타벨류 가즈아!'

    # ``iter`` with a sentinel keeps calling ``queue.get()`` until it
    # returns a value equal to 'end'.
    for expected_no, text in enumerate(iter(queue.get, 'end')):
        assert text == answer.format(expected_no), f'ERROR: text:{text}'
74 |
75 |
def main():
    """Time 1,000,000 messages through ``SharedQueue`` and through a plain ``Queue``.

    Each benchmark starts a consumer process, sends the numbered messages
    followed by an ``'end'`` marker, waits for the consumer to finish and
    prints the elapsed seconds.
    """
    s = '{0}안녕하세요 파이썬! 123456789 !@#$%^&*() 하하! 제타벨류 가즈아!'

    # SharedQueue benchmark
    queue = Queue()
    # A small segment size causes errors here.
    # NOTE(review): presumably because wrapped writes overwrite payloads the
    # consumer has not read yet -- confirm.
    sq = SharedQueue(queue, shared_size=1024 * 1024 * 128)
    try:
        p = Process(target=daemon_run1, args=(queue, sq.name))
        p.start()

        start = datetime.now()
        for i in tqdm(range(1000000)):
            sq.put(bytes(s.format(i), encoding='euc-kr'), block=False)
        sq.put('end'.encode('euc-kr'), block=True)
        p.join()

        print('SharedQueue:', (datetime.now() - start).total_seconds())
    finally:
        # Release and destroy the segment so it does not outlive the run.
        sq.shm.close()
        sq.shm.unlink()

    # Plain multiprocessing Queue benchmark
    queue = Queue()
    p = Process(target=daemon_run2, args=(queue,))
    p.start()

    start = datetime.now()
    for i in tqdm(range(1000000)):
        queue.put(s.format(i), block=False)
    queue.put('end', block=True)
    p.join()

    print('Queue:', (datetime.now() - start).total_seconds())


if __name__ == '__main__':
    main()
109 |
--------------------------------------------------------------------------------
/006-pyarrow/pyarrow_torch.py:
--------------------------------------------------------------------------------
1 | import random
2 | import sys
3 | from bisect import bisect_right
4 | from typing import Iterator, Optional, Tuple, List
5 |
6 | import pyarrow as pa
7 | import pandas as pd
8 | from pyarrow.dataset import ParquetFileFragment
9 | from pyarrow.lib import RecordBatch
10 | from pyarrow.parquet import ParquetDataset, ParquetFile
11 | from torch.utils.data import Dataset
12 |
13 |
class PyArrowDataset(Dataset):
    """Map-style torch ``Dataset`` over the rows of a Parquet dataset.

    All row groups of all fragments are indexed once with their global row
    range. ``__getitem__`` locates the row group containing the requested
    row, reads that row group into a pandas DataFrame and keeps it as a
    one-entry cache, so consecutive accesses to the same row group do not
    read again. ``read_cnt`` counts the row-group reads.
    """

    def __init__(self, source: str, shuffle: bool = False, seed: int = 123):
        """Open ``source`` and build the row-group index.

        Args:
            source: Path handed to ``ParquetDataset``.
            shuffle: Shuffle the fragment order before indexing.
            seed: Seed applied to the global ``random`` module.
        """
        random.seed(seed)
        self.source = source
        self.seed = seed

        # Pyarrow
        self.dataset = ParquetDataset(source, use_legacy_dataset=False)
        # One entry per row group: (position of the row group within its
        # fragment, first global row (inclusive), last global row
        # (exclusive), ParquetFile, row_group.id, number of rows).
        self.parquet_indices: List[Tuple[int, int, int, ParquetFile, int, int]] = []
        # Index entry and DataFrame of the row group currently cached.
        self._cur_meta = None
        self._df: Optional[pd.DataFrame] = None

        # Debug (Memory Profiling)
        self.read_cnt = 0

        self.init_parquet_indexing(shuffle)

    def init_parquet_indexing(self, shuffle: bool = False):
        """(Re)build ``parquet_indices`` and drop the cached row group."""
        fragments = self.dataset.fragments
        if shuffle:
            random.shuffle(fragments)

        idx = 0
        parquet_indices = []
        for frag in fragments:
            parquet_file = ParquetFile(frag.path)
            for i, row_group in enumerate(frag.row_groups):
                start_idx = idx  # inclusive
                end_idx = idx + row_group.num_rows  # exclusive
                parquet_indices.append((i, start_idx, end_idx, parquet_file, row_group.id, row_group.num_rows))
                idx = end_idx

        self.parquet_indices = parquet_indices
        # The cached frame belongs to the previous index.
        self._cur_meta = None
        self._df = None

    def __len__(self):
        """Return the total number of rows over all row groups."""
        if not self.parquet_indices:
            return 0
        # Exclusive end of the last row group == total row count (the last
        # tuple element is only that single row group's size).
        return self.parquet_indices[-1][2]

    def __getitem__(self, idx: int):
        """Return row ``idx`` as the result of ``DataFrame.iloc``.

        Negative indices count from the end.

        Raises:
            IndexError: If ``idx`` is outside the dataset.
        """
        total = len(self)
        position = idx + total if idx < 0 else idx
        if not 0 <= position < total:
            raise IndexError(f'index {idx} is out of range for dataset of size {total}')

        meta = self.parquet_indices[self._binary_search(position)]

        # Compare the index entry itself: its first element is only the
        # row group's position inside one file and repeats across files.
        if self._cur_meta is not meta:
            # Drop the old frame first so two row groups are never held at
            # the same time; assigning None keeps the attributes defined
            # even if the read below fails.
            self._df = None
            self._cur_meta = None
            self.read_cnt += 1
            # Read a new row group
            parquet_file, row_id = meta[3], meta[4]
            table = parquet_file.read_row_group(row_id)
            self._df = table.to_pandas()
            self._cur_meta = meta

        return self._df.iloc[position - meta[1]]

    def _binary_search(self, target: int):
        """Return the position in ``parquet_indices`` whose range holds ``target``.

        Raises:
            IndexError: If no row group contains ``target``.
        """
        arr = self.parquet_indices
        left, right = 0, len(arr) - 1

        while left <= right:
            mid = (left + right) // 2
            start_idx, end_idx = arr[mid][1], arr[mid][2]
            if target < start_idx:
                right = mid - 1
            elif target >= end_idx:
                left = mid + 1
            else:
                return mid
        raise IndexError(f'row {target} is not covered by any row group')
96 |
--------------------------------------------------------------------------------
/101-GeoHash/map.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
20 |
28 |
29 |
30 |
31 |
32 |
33 |
--------------------------------------------------------------------------------
/200-Kubernetes/01-Kubernetes.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Install Minikube\n",
8 | "\n",
9 | "\n",
10 | "```\n",
11 | "curl -Lo minikube https://storage.googleapis.com/minikube/releases/latest/minikube-linux-amd64 \\\n",
12 | " && chmod +x minikube\n",
13 | " \n",
14 | "sudo install minikube /usr/local/bin/\n",
15 | "```\n",
16 | "\n",
17 | "기본적인 명령어는 다음과 같습니다.\n",
18 | "\n",
19 | " - `minikube start`\n",
20 | " - `minikube status`\n",
21 | " - `minikube stop`\n",
22 | "\n"
23 | ]
24 | },
25 | {
26 | "cell_type": "markdown",
27 | "metadata": {},
28 | "source": [
29 | "# Kubectl 설치\n",
30 | "\n",
31 | "```\n",
32 | "curl -LO \"https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/linux/amd64/kubectl\"\n",
33 | "\n",
34 | "chmod +x ./kubectl\n",
35 | "sudo mv ./kubectl /usr/local/bin/kubectl\n",
36 | "\n",
37 | "kubectl version --client\n",
38 | "```\n",
39 | "\n",
40 | "Snap이 된다면 다음과 같이 쉽게 설치도 가능합니다.\n",
41 | "\n",
42 | "```\n",
43 | "sudo snap install kubectl --classic\n",
44 | "```\n"
45 | ]
46 | },
47 | {
48 | "cell_type": "markdown",
49 | "metadata": {},
50 | "source": [
51 | "# Getting Started\n",
52 | "\n",
53 | "```bash\n",
54 | "$ kubectl cluster-info\n",
55 | "Kubernetes master is running at https://172.17.0.3:8443\n",
56 | "KubeDNS is running at https://172.17.0.3:8443/api/v1/namespaces/kube-system/services/kube-dns:dns/proxy\n",
57 | "\n",
58 | "To further debug and diagnose cluster problems, use 'kubectl cluster-info dump'.\n",
59 | "\n",
60 | "```\n",
61 | " - **Kubernetes master**: master\n",
62 | " - **KubeDNS**: DNS\n",
63 | " - **kubernetes-dashboard**: dashboard - UI에서 applications을 확인 가능"
64 | ]
65 | },
66 | {
67 | "cell_type": "markdown",
68 | "metadata": {},
69 | "source": [
70 | "# Hello Minikube\n",
71 | "\n",
72 | "```\n",
73 | "$ kubectl create deployment nginx-hello-world --image=nginxdemos/hello\n",
74 | "$ kubectl get deployments\n",
75 | "NAME READY UP-TO-DATE AVAILABLE AGE\n",
76 | "hello-node 1/1 1 1 60s\n",
77 | "\n",
78 | "$ kubectl get pods\n",
79 | "NAME READY STATUS RESTARTS AGE\n",
80 | "hello-node-7bf657c596-glpfj 1/1 Running 0 2m\n",
81 | "\n",
82 | "```\n",
83 | "\n",
84 | "로그 확인은 다음과 같이 합니다.\n",
85 | "\n",
86 | "```\n",
87 | "kubectl get events\n",
88 | "```\n",
89 | "\n",
90 | "\n",
91 | "삭제는 다음과 같이 합니다.\n",
92 | "\n",
93 | "```\n",
94 | "$ kubectl delete deployment nginx-hello-world\n",
95 | "$ kubectl get deployments\n",
96 | "No resources found in default namespace.\n",
97 | "```"
98 | ]
99 | },
100 | {
101 | "cell_type": "markdown",
102 | "metadata": {},
103 | "source": [
104 | "# Nginx Tutorial\n",
105 | "\n",
106 | "먼저 docker에서 확인합니다.\n",
107 | "\n",
108 | "```\n",
109 | "docker run --name my-nginx -p 5001:80 -d nginx\n",
110 | "```\n",
111 | "\n",
112 | "크롬에서 확인후 삭제 합니다.\n",
113 | "\n",
114 | "```\n",
115 | "docker stop my-nginx \n",
116 | "docker container prune\n",
117 | "```\n",
118 | "\n",
119 | "Nginx 배포합니다.\n",
120 | "\n",
121 | "```\n",
122 | "kubectl create deployment hello-node --image=nginx\n",
123 | "# 또는 이거\n",
124 | "# kubectl create deployment hello-node --image=nginxdemos/hello \n",
125 | "\n",
126 | "kubectl port-forward hello-node-544968b8c4-4kvfh 5001:80 --address 0.0.0.0\n",
127 | "```\n",
128 | "\n",
129 | "크롬에서 확인을 합니다.\n",
130 | "\n",
131 | "로그도 확인합니다.\n",
132 | "\n",
133 | "`logs [Pod 이름]` 을 사용합니다.\n",
134 | "\n",
135 | "```\n",
136 | "kubectl logs my-nginx-66b75b6f6b-29sw6 -f\n",
137 | "```\n",
138 | "\n",
139 | "Pod안의 명령문을 실행시킬수도 있습니다.\n",
140 | "\n",
141 | "```\n",
142 | "kubectl exec hello-node-66b75b6f6b-29sw6 -- env\n",
143 | "kubectl exec hello-node-544968b8c4-tp5pd -it -- bash\n",
144 | "```"
145 | ]
146 | },
147 | {
148 | "cell_type": "markdown",
149 | "metadata": {},
150 | "source": [
151 | "## Expose\n",
152 | "\n",
153 | "먼저 nginx 를 디플로이해줍니다.\n",
154 | "\n",
155 | "```\n",
156 | "kubectl create deployment hello-node --image=nginx\n",
157 | "```\n",
158 | "\n",
159 | "Expose 시킵니다.\n",
160 | "\n",
161 | "```\n",
162 | "kubectl expose deployment hello-node --type=NodePort --port 5001 --target-port 80\n",
163 | "```\n",
164 | "\n",
165 | "이후 NodePort를 확인합니다. \n",
166 | "이후 `curl $(minikube ip):[Node Port]` 로 확인합니다. \n",
167 | "\n",
168 | "```\n",
169 | "kubectl describe service hello-node | grep NodePort\n",
170 | "curl $(minikube ip):31832\n",
171 | "kubectl port-forward service/hello-node 5002:5001\n",
172 | "```"
173 | ]
174 | }
175 | ],
176 | "metadata": {
177 | "kernelspec": {
178 | "display_name": "Python 3",
179 | "language": "python",
180 | "name": "python3"
181 | },
182 | "language_info": {
183 | "codemirror_mode": {
184 | "name": "ipython",
185 | "version": 3
186 | },
187 | "file_extension": ".py",
188 | "mimetype": "text/x-python",
189 | "name": "python",
190 | "nbconvert_exporter": "python",
191 | "pygments_lexer": "ipython3",
192 | "version": "3.8.2"
193 | }
194 | },
195 | "nbformat": 4,
196 | "nbformat_minor": 4
197 | }
198 |
--------------------------------------------------------------------------------
/010-Pyspark/01 Tutorial.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 5,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import findspark\n",
10 | "findspark.init()\n",
11 | "\n",
12 | "import pyspark\n",
13 | "from pyspark import SparkContext\n",
14 | "from datetime import datetime"
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {},
20 | "source": [
21 | "## Initialize Spark Context"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": 2,
27 | "metadata": {},
28 | "outputs": [],
29 | "source": [
30 | "sc = SparkContext(\"local\", \"tutorial\")"
31 | ]
32 | },
33 | {
34 | "cell_type": "markdown",
35 | "metadata": {},
36 | "source": [
37 | "## Word Count"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": 7,
43 | "metadata": {},
44 | "outputs": [
45 | {
46 | "name": "stdout",
47 | "output_type": "stream",
48 | "text": [
49 | "counts: 8\n",
50 | "CPU times: user 69 µs, sys: 57 µs, total: 126 µs\n",
51 | "Wall time: 132 µs\n"
52 | ]
53 | }
54 | ],
55 | "source": [
56 | "words = sc.parallelize (\n",
57 | " [\"scala\", \n",
58 | " \"java\", \n",
59 | " \"hadoop\", \n",
60 | " \"spark\", \n",
61 | " \"akka\",\n",
62 | " \"spark vs hadoop\", \n",
63 | " \"pyspark\",\n",
64 | " \"pyspark and spark\"]\n",
65 | ")\n",
66 | "counts = words.count()\n",
67 | "\n",
68 | "%time print(f'counts: {counts}')"
69 | ]
70 | },
71 | {
72 | "cell_type": "markdown",
73 | "metadata": {},
74 | "source": [
75 | "## Collect"
76 | ]
77 | },
78 | {
79 | "cell_type": "code",
80 | "execution_count": 8,
81 | "metadata": {},
82 | "outputs": [
83 | {
84 | "data": {
85 | "text/plain": [
86 | "['scala',\n",
87 | " 'java',\n",
88 | " 'hadoop',\n",
89 | " 'spark',\n",
90 | " 'akka',\n",
91 | " 'spark vs hadoop',\n",
92 | " 'pyspark',\n",
93 | " 'pyspark and spark']"
94 | ]
95 | },
96 | "execution_count": 8,
97 | "metadata": {},
98 | "output_type": "execute_result"
99 | }
100 | ],
101 | "source": [
102 | "words.collect()"
103 | ]
104 | },
105 | {
106 | "cell_type": "markdown",
107 | "metadata": {},
108 | "source": [
109 | "## ForEach"
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": 23,
115 | "metadata": {},
116 | "outputs": [],
117 | "source": [
118 | "def f(x):\n",
119 | " print(x)\n",
120 | " \n",
121 | "words.foreach(f)"
122 | ]
123 | },
124 | {
125 | "cell_type": "markdown",
126 | "metadata": {},
127 | "source": [
128 | "## Filter"
129 | ]
130 | },
131 | {
132 | "cell_type": "code",
133 | "execution_count": 26,
134 | "metadata": {},
135 | "outputs": [
136 | {
137 | "data": {
138 | "text/plain": [
139 | "['pyspark', 'pyspark and spark']"
140 | ]
141 | },
142 | "execution_count": 26,
143 | "metadata": {},
144 | "output_type": "execute_result"
145 | }
146 | ],
147 | "source": [
148 | "words.filter(lambda x: 'py' in x).collect()"
149 | ]
150 | },
151 | {
152 | "cell_type": "markdown",
153 | "metadata": {},
154 | "source": [
155 | "## Map"
156 | ]
157 | },
158 | {
159 | "cell_type": "code",
160 | "execution_count": 31,
161 | "metadata": {},
162 | "outputs": [
163 | {
164 | "data": {
165 | "text/plain": [
166 | "[('scala', 1, 3),\n",
167 | " ('java', 1, 3),\n",
168 | " ('hadoop', 1, 3),\n",
169 | " ('spark', 1, 3),\n",
170 | " ('akka', 1, 3),\n",
171 | " ('spark vs hadoop', 1, 3),\n",
172 | " ('pyspark', 1, 3),\n",
173 | " ('pyspark and spark', 1, 3)]"
174 | ]
175 | },
176 | "execution_count": 31,
177 | "metadata": {},
178 | "output_type": "execute_result"
179 | }
180 | ],
181 | "source": [
182 | "words.map(lambda x: (x, 1, 3)).collect()"
183 | ]
184 | },
185 | {
186 | "cell_type": "markdown",
187 | "metadata": {},
188 | "source": [
189 | "## Reduce "
190 | ]
191 | },
192 | {
193 | "cell_type": "code",
194 | "execution_count": 33,
195 | "metadata": {},
196 | "outputs": [
197 | {
198 | "data": {
199 | "text/plain": [
200 | "25"
201 | ]
202 | },
203 | "execution_count": 33,
204 | "metadata": {},
205 | "output_type": "execute_result"
206 | }
207 | ],
208 | "source": [
209 | "from operator import add\n",
210 | "\n",
211 | "nums = sc.parallelize([1, 2, 3, 4, 5, 10])\n",
212 | "nums.reduce(add)"
213 | ]
214 | },
215 | {
216 | "cell_type": "markdown",
217 | "metadata": {},
218 | "source": [
219 | "## Join\n",
220 | "\n",
221 | "1. **join**: 두개의 RDD에 모두 존재하는 elements만 join이 되고, 나머지는 제외\n",
222 | "2. **fullOuterJoin**: 모든 elements를 join 시킨다 "
223 | ]
224 | },
225 | {
226 | "cell_type": "code",
227 | "execution_count": 37,
228 | "metadata": {},
229 | "outputs": [
230 | {
231 | "data": {
232 | "text/plain": [
233 | "[('ml', (10, 5)), ('spark', (1, 2))]"
234 | ]
235 | },
236 | "execution_count": 37,
237 | "metadata": {},
238 | "output_type": "execute_result"
239 | }
240 | ],
241 | "source": [
242 | "x = sc.parallelize([('spark', 1), ('ml', 10), ('power', 2)])\n",
243 | "y = sc.parallelize([('spark', 2), ('ml', 5), ('happy', 3)])\n",
244 | "joined = x.join(y)\n",
245 | "joined.collect()"
246 | ]
247 | },
248 | {
249 | "cell_type": "code",
250 | "execution_count": 39,
251 | "metadata": {},
252 | "outputs": [
253 | {
254 | "data": {
255 | "text/plain": [
256 | "[('ml', (10, 5)),\n",
257 | " ('power', (2, None)),\n",
258 | " ('spark', (1, 2)),\n",
259 | " ('happy', (None, 3))]"
260 | ]
261 | },
262 | "execution_count": 39,
263 | "metadata": {},
264 | "output_type": "execute_result"
265 | }
266 | ],
267 | "source": [
268 | "x.fullOuterJoin(y).collect()"
269 | ]
270 | },
271 | {
272 | "cell_type": "code",
273 | "execution_count": 40,
274 | "metadata": {},
275 | "outputs": [
276 | {
277 | "data": {
278 | "text/plain": [
279 | "[('ml', (10, 5)), ('power', (2, None)), ('spark', (1, 2))]"
280 | ]
281 | },
282 | "execution_count": 40,
283 | "metadata": {},
284 | "output_type": "execute_result"
285 | }
286 | ],
287 | "source": [
288 | "x.leftOuterJoin(y).collect()"
289 | ]
290 | },
291 | {
292 | "cell_type": "markdown",
293 | "metadata": {},
294 | "source": [
295 | "## Cache\n",
296 | "\n",
297 | "\"MEMORY_ONLY\" 일경우.. 메모리에 RDD를 persist시킨다 "
298 | ]
299 | },
300 | {
301 | "cell_type": "code",
302 | "execution_count": 42,
303 | "metadata": {},
304 | "outputs": [
305 | {
306 | "data": {
307 | "text/plain": [
308 | "True"
309 | ]
310 | },
311 | "execution_count": 42,
312 | "metadata": {},
313 | "output_type": "execute_result"
314 | }
315 | ],
316 | "source": [
317 | "words.cache()\n",
318 | "words.persist().is_cached"
319 | ]
320 | }
321 | ],
322 | "metadata": {
323 | "kernelspec": {
324 | "display_name": "Python 3",
325 | "language": "python",
326 | "name": "python3"
327 | },
328 | "language_info": {
329 | "codemirror_mode": {
330 | "name": "ipython",
331 | "version": 3
332 | },
333 | "file_extension": ".py",
334 | "mimetype": "text/x-python",
335 | "name": "python",
336 | "nbconvert_exporter": "python",
337 | "pygments_lexer": "ipython3",
338 | "version": "3.6.7"
339 | }
340 | },
341 | "nbformat": 4,
342 | "nbformat_minor": 2
343 | }
344 |
--------------------------------------------------------------------------------
/002-Pyspark/01 Tutorial.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 5,
6 | "metadata": {
7 | "pycharm": {
8 | "name": "#%%\n"
9 | }
10 | },
11 | "outputs": [],
12 | "source": [
13 | "import findspark\n",
14 | "findspark.init()\n",
15 | "\n",
16 | "import pyspark\n",
17 | "from pyspark import SparkContext\n",
18 | "from datetime import datetime"
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "metadata": {
24 | "pycharm": {
25 | "name": "#%% md\n"
26 | }
27 | },
28 | "source": [
29 | "## Initialize Spark Context"
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": 2,
35 | "metadata": {
36 | "pycharm": {
37 | "name": "#%%\n"
38 | }
39 | },
40 | "outputs": [],
41 | "source": [
42 | "sc = SparkContext(\"local\", \"tutorial\")"
43 | ]
44 | },
45 | {
46 | "cell_type": "markdown",
47 | "metadata": {
48 | "pycharm": {
49 | "name": "#%% md\n"
50 | }
51 | },
52 | "source": [
53 | "## Word Count"
54 | ]
55 | },
56 | {
57 | "cell_type": "code",
58 | "execution_count": 7,
59 | "metadata": {
60 | "pycharm": {
61 | "name": "#%%\n"
62 | }
63 | },
64 | "outputs": [
65 | {
66 | "name": "stdout",
67 | "output_type": "stream",
68 | "text": [
69 | "counts: 8\n",
70 | "CPU times: user 69 µs, sys: 57 µs, total: 126 µs\n",
71 | "Wall time: 132 µs\n"
72 | ]
73 | }
74 | ],
75 | "source": [
76 | "words = sc.parallelize (\n",
77 | " [\"scala\", \n",
78 | " \"java\", \n",
79 | " \"hadoop\", \n",
80 | " \"spark\", \n",
81 | " \"akka\",\n",
82 | " \"spark vs hadoop\", \n",
83 | " \"pyspark\",\n",
84 | " \"pyspark and spark\"]\n",
85 | ")\n",
86 | "counts = words.count()\n",
87 | "\n",
88 | "%time print(f'counts: {counts}')"
89 | ]
90 | },
91 | {
92 | "cell_type": "markdown",
93 | "metadata": {
94 | "pycharm": {
95 | "name": "#%% md\n"
96 | }
97 | },
98 | "source": [
99 | "## Collect"
100 | ]
101 | },
102 | {
103 | "cell_type": "code",
104 | "execution_count": 8,
105 | "metadata": {
106 | "pycharm": {
107 | "name": "#%%\n"
108 | }
109 | },
110 | "outputs": [
111 | {
112 | "data": {
113 | "text/plain": [
114 | "['scala',\n",
115 | " 'java',\n",
116 | " 'hadoop',\n",
117 | " 'spark',\n",
118 | " 'akka',\n",
119 | " 'spark vs hadoop',\n",
120 | " 'pyspark',\n",
121 | " 'pyspark and spark']"
122 | ]
123 | },
124 | "execution_count": 8,
125 | "metadata": {},
126 | "output_type": "execute_result"
127 | }
128 | ],
129 | "source": [
130 | "words.collect()"
131 | ]
132 | },
133 | {
134 | "cell_type": "markdown",
135 | "metadata": {
136 | "pycharm": {
137 | "name": "#%% md\n"
138 | }
139 | },
140 | "source": [
141 | "## ForEach"
142 | ]
143 | },
144 | {
145 | "cell_type": "code",
146 | "execution_count": 23,
147 | "metadata": {
148 | "pycharm": {
149 | "name": "#%%\n"
150 | }
151 | },
152 | "outputs": [],
153 | "source": [
154 | "def f(x):\n",
155 | " print(x)\n",
156 | " \n",
157 | "words.foreach(f)"
158 | ]
159 | },
160 | {
161 | "cell_type": "markdown",
162 | "metadata": {
163 | "pycharm": {
164 | "name": "#%% md\n"
165 | }
166 | },
167 | "source": [
168 | "## Filter"
169 | ]
170 | },
171 | {
172 | "cell_type": "code",
173 | "execution_count": 26,
174 | "metadata": {
175 | "pycharm": {
176 | "name": "#%%\n"
177 | }
178 | },
179 | "outputs": [
180 | {
181 | "data": {
182 | "text/plain": [
183 | "['pyspark', 'pyspark and spark']"
184 | ]
185 | },
186 | "execution_count": 26,
187 | "metadata": {},
188 | "output_type": "execute_result"
189 | }
190 | ],
191 | "source": [
192 | "words.filter(lambda x: 'py' in x).collect()"
193 | ]
194 | },
195 | {
196 | "cell_type": "markdown",
197 | "metadata": {
198 | "pycharm": {
199 | "name": "#%% md\n"
200 | }
201 | },
202 | "source": [
203 | "## Map"
204 | ]
205 | },
206 | {
207 | "cell_type": "code",
208 | "execution_count": 31,
209 | "metadata": {
210 | "pycharm": {
211 | "name": "#%%\n"
212 | }
213 | },
214 | "outputs": [
215 | {
216 | "data": {
217 | "text/plain": [
218 | "[('scala', 1, 3),\n",
219 | " ('java', 1, 3),\n",
220 | " ('hadoop', 1, 3),\n",
221 | " ('spark', 1, 3),\n",
222 | " ('akka', 1, 3),\n",
223 | " ('spark vs hadoop', 1, 3),\n",
224 | " ('pyspark', 1, 3),\n",
225 | " ('pyspark and spark', 1, 3)]"
226 | ]
227 | },
228 | "execution_count": 31,
229 | "metadata": {},
230 | "output_type": "execute_result"
231 | }
232 | ],
233 | "source": [
234 | "words.map(lambda x: (x, 1, 3)).collect()"
235 | ]
236 | },
237 | {
238 | "cell_type": "markdown",
239 | "metadata": {
240 | "pycharm": {
241 | "name": "#%% md\n"
242 | }
243 | },
244 | "source": [
245 | "## Reduce "
246 | ]
247 | },
248 | {
249 | "cell_type": "code",
250 | "execution_count": 33,
251 | "metadata": {
252 | "pycharm": {
253 | "name": "#%%\n"
254 | }
255 | },
256 | "outputs": [
257 | {
258 | "data": {
259 | "text/plain": [
260 | "25"
261 | ]
262 | },
263 | "execution_count": 33,
264 | "metadata": {},
265 | "output_type": "execute_result"
266 | }
267 | ],
268 | "source": [
269 | "from operator import add\n",
270 | "\n",
271 | "nums = sc.parallelize([1, 2, 3, 4, 5, 10])\n",
272 | "nums.reduce(add)"
273 | ]
274 | },
275 | {
276 | "cell_type": "markdown",
277 | "metadata": {
278 | "pycharm": {
279 | "name": "#%% md\n"
280 | }
281 | },
282 | "source": [
283 | "## Join\n",
284 | "\n",
285 | "1. **join**: 두개의 RDD에 모두 존재하는 elements만 join이 되고, 나머지는 제외\n",
286 | "2. **fullOuterJoin**: 모든 elements를 join 시킨다 "
287 | ]
288 | },
289 | {
290 | "cell_type": "code",
291 | "execution_count": 37,
292 | "metadata": {
293 | "pycharm": {
294 | "name": "#%%\n"
295 | }
296 | },
297 | "outputs": [
298 | {
299 | "data": {
300 | "text/plain": [
301 | "[('ml', (10, 5)), ('spark', (1, 2))]"
302 | ]
303 | },
304 | "execution_count": 37,
305 | "metadata": {},
306 | "output_type": "execute_result"
307 | }
308 | ],
309 | "source": [
310 | "x = sc.parallelize([('spark', 1), ('ml', 10), ('power', 2)])\n",
311 | "y = sc.parallelize([('spark', 2), ('ml', 5), ('happy', 3)])\n",
312 | "joined = x.join(y)\n",
313 | "joined.collect()"
314 | ]
315 | },
316 | {
317 | "cell_type": "code",
318 | "execution_count": 39,
319 | "metadata": {
320 | "pycharm": {
321 | "name": "#%%\n"
322 | }
323 | },
324 | "outputs": [
325 | {
326 | "data": {
327 | "text/plain": [
328 | "[('ml', (10, 5)),\n",
329 | " ('power', (2, None)),\n",
330 | " ('spark', (1, 2)),\n",
331 | " ('happy', (None, 3))]"
332 | ]
333 | },
334 | "execution_count": 39,
335 | "metadata": {},
336 | "output_type": "execute_result"
337 | }
338 | ],
339 | "source": [
340 | "x.fullOuterJoin(y).collect()"
341 | ]
342 | },
343 | {
344 | "cell_type": "code",
345 | "execution_count": 40,
346 | "metadata": {
347 | "pycharm": {
348 | "name": "#%%\n"
349 | }
350 | },
351 | "outputs": [
352 | {
353 | "data": {
354 | "text/plain": [
355 | "[('ml', (10, 5)), ('power', (2, None)), ('spark', (1, 2))]"
356 | ]
357 | },
358 | "execution_count": 40,
359 | "metadata": {},
360 | "output_type": "execute_result"
361 | }
362 | ],
363 | "source": [
364 | "x.leftOuterJoin(y).collect()"
365 | ]
366 | },
367 | {
368 | "cell_type": "markdown",
369 | "metadata": {
370 | "pycharm": {
371 | "name": "#%% md\n"
372 | }
373 | },
374 | "source": [
375 | "## Cache\n",
376 | "\n",
377 | "\"MEMORY_ONLY\" 일경우.. 메모리에 RDD를 persist시킨다 "
378 | ]
379 | },
380 | {
381 | "cell_type": "code",
382 | "execution_count": 42,
383 | "metadata": {
384 | "pycharm": {
385 | "name": "#%%\n"
386 | }
387 | },
388 | "outputs": [
389 | {
390 | "data": {
391 | "text/plain": [
392 | "True"
393 | ]
394 | },
395 | "execution_count": 42,
396 | "metadata": {},
397 | "output_type": "execute_result"
398 | }
399 | ],
400 | "source": [
401 | "words.cache()\n",
402 | "words.persist().is_cached"
403 | ]
404 | }
405 | ],
406 | "metadata": {
407 | "kernelspec": {
408 | "display_name": "Python 3",
409 | "language": "python",
410 | "name": "python3"
411 | },
412 | "language_info": {
413 | "codemirror_mode": {
414 | "name": "ipython",
415 | "version": 3
416 | },
417 | "file_extension": ".py",
418 | "mimetype": "text/x-python",
419 | "name": "python",
420 | "nbconvert_exporter": "python",
421 | "pygments_lexer": "ipython3",
422 | "version": "3.6.7"
423 | }
424 | },
425 | "nbformat": 4,
426 | "nbformat_minor": 2
427 | }
--------------------------------------------------------------------------------
/006-pyarrow/pyarrow-tutorial.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "id": "b7462c43-d1b7-46ea-9c88-06c078c92e51",
7 | "metadata": {
8 | "tags": []
9 | },
10 | "outputs": [],
11 | "source": [
12 | "%config Completer.use_jedi = False\n",
13 | "\n",
14 | "import sys\n",
15 | "from datetime import datetime, timedelta\n",
16 | "from typing import Generator, Iterator, Optional, Tuple\n",
17 | "\n",
18 | "import pandas as pd\n",
19 | "import pyarrow as pa"
20 | ]
21 | },
22 | {
23 | "cell_type": "markdown",
24 | "id": "ad43b914-376a-4eac-9216-ebb9f2edc3e9",
25 | "metadata": {},
26 | "source": [
27 | "## Data Generation"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": 41,
33 | "id": "ead8625c-0956-4a05-9e90-6db7ec7d07d5",
34 | "metadata": {},
35 | "outputs": [
36 | {
37 | "data": {
38 | "text/plain": [
39 | "2380"
40 | ]
41 | },
42 | "execution_count": 41,
43 | "metadata": {},
44 | "output_type": "execute_result"
45 | }
46 | ],
47 | "source": []
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": 75,
52 | "id": "0ec194e5-54c8-4f14-a653-77924160d1db",
53 | "metadata": {},
54 | "outputs": [],
55 | "source": [
56 | "from sklearn.datasets import make_classification\n",
57 | "from datetime import datetime, timedelta\n",
58 | "from random import random, randint\n",
59 | "\n",
60 | "cur_date = datetime.now()\n",
61 | "\n",
62 | "for i in range(10):\n",
63 | " x, y = make_classification(n_samples=randint(1000, 10000), n_features=10, weights=(0.9, 0.1))\n",
64 | " df = pd.DataFrame(x)\n",
65 | " df.columns = [f'col_{x}' for x in range(10)]\n",
66 | " df['dt'] = cur_date.strftime('%Y%m%d')\n",
67 | " cur_date += timedelta(days=1)\n",
68 | " df.to_parquet('./data', partition_cols=['dt'])"
69 | ]
70 | },
71 | {
72 | "cell_type": "markdown",
73 | "id": "90c4ef72-d516-4797-a6dd-fb9adfd3cb03",
74 | "metadata": {},
75 | "source": [
76 | "# ParquetDataset"
77 | ]
78 | },
79 | {
80 | "cell_type": "markdown",
81 | "id": "d9469c34-2e8b-4427-a30d-dc4b95243c75",
82 | "metadata": {},
83 | "source": [
84 | "## Dataset"
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": 74,
90 | "id": "6ae8ce10-b595-4dd1-bd61-82fd42645a05",
91 | "metadata": {},
92 | "outputs": [],
93 | "source": [
94 | "scanner = dataset.scanner()\n"
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": 87,
100 | "id": "686d0599-130d-48ae-9156-7786e1374bda",
101 | "metadata": {},
102 | "outputs": [
103 | {
104 | "name": "stdout",
105 | "output_type": "stream",
106 | "text": [
107 | "n_rows: 41851\n",
108 | "0.43440751685326995\n",
109 | "1.1431565798068537\n",
110 | "0.24992460337363664\n",
111 | "-0.34604515354971194\n",
112 | "0.32233619998326285\n",
113 | "0.7595871664144229\n",
114 | "-0.9966609176752007\n",
115 | "-0.5206429227786304\n",
116 | "1.2140122393778143\n",
117 | "-1.599369064413563\n"
118 | ]
119 | }
120 | ],
121 | "source": [
122 | "import pyarrow.dataset as ds\n",
123 | "\n",
124 | "dataset = ds.dataset('./data', format='parquet', partitioning=['dt'])\n",
125 | "print('n_rows:', dataset.count_rows())\n",
126 | "\n",
127 | "for batch in dataset.to_batches():\n",
128 | " for i in range(batch.num_rows):\n",
129 | " col0 = batch.column('col_0')[0].as_py()\n",
130 | " \n",
131 | " "
132 | ]
133 | },
134 | {
135 | "cell_type": "markdown",
136 | "id": "eb49596a-eee5-4a94-afe9-be2d5183aeb0",
137 | "metadata": {},
138 | "source": [
139 | "## ParquetDataset"
140 | ]
141 | },
142 | {
143 | "cell_type": "code",
144 | "execution_count": 18,
145 | "id": "5e31dc0f-528c-4475-ace4-085a6be8547c",
146 | "metadata": {
147 | "tags": []
148 | },
149 | "outputs": [
150 | {
151 | "name": "stdout",
152 | "output_type": "stream",
153 | "text": [
154 | "Pandas shape : (50000000, 2)\n",
155 | "Pandas size : 450000734\n",
156 | "Pyarrow size : 64\n",
157 | "files : ['./data/dt=2023-01-01/a300c22cb3554cec95c68957f6ac326f-0.parquet', './data/dt=2023-01-02/a300c22cb3554cec95c68957f6ac326f-0.parquet', './data/dt=2023-01-03/a300c22cb3554cec95c68957f6ac326f-0.parquet']\n",
158 | "fragments : [, , ]\n",
159 | "files rows : [8640000, 8640000, 8640000, 8640000, 8640000, 6800000]\n",
160 | "column size : 2\n"
161 | ]
162 | }
163 | ],
164 | "source": [
165 | "from pyarrow.parquet import ParquetDataset, ParquetFile\n",
166 | "\n",
167 | "dataset = ParquetDataset(\"./data\", memory_map=True, use_legacy_dataset=False)\n",
168 | "df = pd.read_parquet(\"./data\")\n",
169 | "\n",
170 | "file_rows = [frag.count_rows() for frag in dataset.fragments]\n",
171 | "\n",
172 | "print(\"Pandas shape :\", df.shape)\n",
173 | "print(\"Pandas size :\", sys.getsizeof(df))\n",
174 | "print(\"Pyarrow size :\", sys.getsizeof(dataset))\n",
175 | "print(\"files :\", dataset.files[:3])\n",
176 | "print(\"fragments :\", dataset.fragments[:3])\n",
177 | "print(\"files rows :\", file_rows)\n",
178 | "print(\"column size :\", len(dataset.schema))"
179 | ]
180 | },
181 | {
182 | "cell_type": "markdown",
183 | "id": "9f540049-85df-46d9-9815-4384b1df1156",
184 | "metadata": {},
185 | "source": [
186 | "## Iteration"
187 | ]
188 | },
189 | {
190 | "cell_type": "code",
191 | "execution_count": 4,
192 | "id": "1cf791db-87ef-46b5-925a-93828e5f05b4",
193 | "metadata": {
194 | "tags": []
195 | },
196 | "outputs": [
197 | {
198 | "name": "stdout",
199 | "output_type": "stream",
200 | "text": [
201 | "frag size : 72\n",
202 | "num rows : 32768\n",
203 | "Pandas shape: (32768, 1)\n"
204 | ]
205 | },
206 | {
207 | "data": {
208 | "text/html": [
209 | "\n",
210 | "\n",
223 | "
\n",
224 | " \n",
225 | " \n",
226 | " | \n",
227 | " idx | \n",
228 | "
\n",
229 | " \n",
230 | " \n",
231 | " \n",
232 | " | 0 | \n",
233 | " 0 | \n",
234 | "
\n",
235 | " \n",
236 | "
\n",
237 | "
"
238 | ],
239 | "text/plain": [
240 | " idx\n",
241 | "0 0"
242 | ]
243 | },
244 | "metadata": {},
245 | "output_type": "display_data"
246 | }
247 | ],
248 | "source": [
249 | "for frag in dataset.fragments:\n",
250 | " for batch in frag.to_batches():\n",
251 | " df = batch.to_pandas()\n",
252 | " row = batch.take(pa.array([0]))\n",
253 | "\n",
254 | " print(\"frag size :\", sys.getsizeof(frag))\n",
255 | " print(\"num rows :\", batch.num_rows)\n",
256 | " print(\"Pandas shape:\", df.shape)\n",
257 | " display(row.to_pandas())\n",
258 | " break\n",
259 | " break"
260 | ]
261 | },
262 | {
263 | "cell_type": "code",
264 | "execution_count": 57,
265 | "id": "97089466-c70d-4ec8-8fab-8ebd3d6d5e67",
266 | "metadata": {},
267 | "outputs": [
268 | {
269 | "data": {
270 | "text/plain": [
271 | "[(0, 0, 32768, , 0, 32768),\n",
272 | " (1,\n",
273 | " 32768,\n",
274 | " 65536,\n",
275 | " ,\n",
276 | " 1,\n",
277 | " 32768),\n",
278 | " (2,\n",
279 | " 65536,\n",
280 | " 98304,\n",
281 | " ,\n",
282 | " 2,\n",
283 | " 32768),\n",
284 | " (3,\n",
285 | " 98304,\n",
286 | " 131072,\n",
287 | " ,\n",
288 | " 3,\n",
289 | " 32768),\n",
290 | " (4,\n",
291 | " 131072,\n",
292 | " 163840,\n",
293 | " ,\n",
294 | " 4,\n",
295 | " 32768)]"
296 | ]
297 | },
298 | "execution_count": 57,
299 | "metadata": {},
300 | "output_type": "execute_result"
301 | }
302 | ],
303 | "source": [
304 | "\n",
305 | "idx = 0\n",
306 | "parquet_indices = []\n",
307 | "for frag in dataset.fragments:\n",
308 | " parquet_file = ParquetFile(frag.path)\n",
309 | " for i, row_group in enumerate(frag.row_groups):\n",
310 | " start_idx = idx\n",
311 | " end_idx = idx + row_group.num_rows\n",
312 | " parquet_indices.append((i, start_idx, end_idx, parquet_file, row_group.id, row_group.num_rows))\n",
313 | " idx += row_group.num_rows\n",
314 | " \n",
315 | "parquet_indices[:5]"
316 | ]
317 | },
318 | {
319 | "cell_type": "code",
320 | "execution_count": 55,
321 | "id": "d3dbd0dc-55c5-41a7-b175-55a210f9036e",
322 | "metadata": {},
323 | "outputs": [
324 | {
325 | "data": {
326 | "text/plain": [
327 | "28800"
328 | ]
329 | },
330 | "execution_count": 55,
331 | "metadata": {},
332 | "output_type": "execute_result"
333 | }
334 | ],
335 | "source": [
336 | "parquet_indices[-1][-1]"
337 | ]
338 | },
339 | {
340 | "cell_type": "code",
341 | "execution_count": 56,
342 | "id": "8ff55871-98ee-4107-bcca-e6d66a519850",
343 | "metadata": {},
344 | "outputs": [
345 | {
346 | "data": {
347 | "text/plain": [
348 | "pyarrow.Table\n",
349 | "idx: int64\n",
350 | "----\n",
351 | "idx: [[43200000,43200001,43200002,43200003,43200004,...,43220987,43220988,43220989,43220990,43220991]]"
352 | ]
353 | },
354 | "execution_count": 56,
355 | "metadata": {},
356 | "output_type": "execute_result"
357 | }
358 | ],
359 | "source": [
360 | "group = frag.row_groups[0]\n",
361 | "group.id\n",
362 | "\n",
363 | "\n",
364 | "pf = ParquetFile(frag.path)\n",
365 | "table = pf.read_row_group(0)\n",
366 | "table"
367 | ]
368 | },
369 | {
370 | "cell_type": "markdown",
371 | "id": "e8c85fdc-60a9-4970-b325-08fb2b8489d9",
372 | "metadata": {},
373 | "source": [
374 | "## Create Parquet Files"
375 | ]
376 | },
377 | {
378 | "cell_type": "code",
379 | "execution_count": null,
380 | "id": "adac7f0c-3fa1-4b27-8072-dc6e24e724ba",
381 | "metadata": {
382 | "tags": []
383 | },
384 | "outputs": [],
385 | "source": [
386 | "def create_data():\n",
387 | " df = pd.DataFrame({\"idx\": range(50000000)})\n",
388 | " dt = datetime(2023, 1, 1)\n",
389 | " df[\"dt\"] = df[\"idx\"].apply(\n",
390 | " lambda x: (dt + timedelta(milliseconds=x * 10)).date()\n",
391 | " )\n",
392 | " pa.parquet.write_to_dataset(\n",
393 | " pa.Table.from_pandas(df),\n",
394 | " root_path=\"data\",\n",
395 | " partition_cols=[\"dt\"],\n",
396 | " use_legacy_dataset=False,\n",
397 | " )\n",
398 | "\n",
399 | "\n",
400 | "# create_data()"
401 | ]
402 | },
403 | {
404 | "cell_type": "markdown",
405 | "id": "86634ed3-5b5f-4cdb-9a7b-1a7de80c9962",
406 | "metadata": {},
407 | "source": [
408 | "## Pytorch Dataset"
409 | ]
410 | },
411 | {
412 | "cell_type": "code",
413 | "execution_count": null,
414 | "id": "af4538a8-0733-4862-8e84-205490cb9358",
415 | "metadata": {},
416 | "outputs": [],
417 | "source": [
418 | "class PyArrowDataset(Dataset):\n",
419 | " def __init__(self, source:str, seed:int =123):\n",
420 | " pass\n",
421 | " \n",
422 | " def init_indexing(self, shuffle:bool=False)"
423 | ]
424 | },
425 | {
426 | "cell_type": "code",
427 | "execution_count": null,
428 | "id": "cb7d7752-ae4b-4e39-8a8e-9074e627e6e6",
429 | "metadata": {
430 | "tags": []
431 | },
432 | "outputs": [],
433 | "source": [
434 | "import random\n",
435 | "import tracemalloc\n",
436 | "from bisect import bisect_right\n",
437 | "\n",
438 | "from pyarrow.dataset import ParquetFileFragment\n",
439 | "from pyarrow.lib import RecordBatch\n",
440 | "from torch.utils.data import Dataset\n",
441 | "import gc\n",
442 | "\n",
443 | "class PyArrowDataset(Dataset):\n",
444 | " \"\"\"\n",
445 | " Restriction\n",
446 | " - Don't shuffle in the DataLoader: this restriction exists so that large datasets can be processed efficiently.\n",
447 | " If you need shuffled data, shuffle it before it reaches this custom dataset (e.g. in Spark SQL).\n",
448 | " The algorithm itself still supports random access.\n",
449 | " \"\"\"\n",
450 | "\n",
451 | " def __init__(self, source: str, seed: int = 123):\n",
452 | " self.source = source\n",
453 | " self.seed = seed\n",
454 | "\n",
455 | " # Pyarrow\n",
456 | " self.dataset = ParquetDataset(source, use_legacy_dataset=False)\n",
457 | " self.fragments: List[ParquetFileFragment] = self.dataset.fragments\n",
458 | " self._batches: Iterator[RecordBatch] = None\n",
459 | " self._batch: Optional[RecordBatch] = None\n",
460 | " self._df: pd.DataFrame = None\n",
461 | "\n",
462 | " # Indexing meta information to make search faster\n",
463 | " self._cumulative_n_rows: List[int] = []\n",
464 | " self._batch_idx: int = 0\n",
465 | "\n",
466 | " # Index\n",
467 | " self._fragment_idx = 0\n",
468 | "\n",
469 | " # Initialization\n",
470 | " self._init()\n",
471 | "\n",
472 | " def _init(self):\n",
473 | " random.seed(self.seed)\n",
474 | " # random.shuffle(self.fragments)\n",
475 | "\n",
476 | " self._cumulative_n_rows = [frag.count_rows() for frag in self.fragments]\n",
477 | " for i in range(1, len(self._cumulative_n_rows)):\n",
478 | " self._cumulative_n_rows[i] += self._cumulative_n_rows[i - 1]\n",
479 | "\n",
480 | " def _get_next(self, idx: int) -> Tuple[int, int]:\n",
481 | " print('_get_next 01', idx)\n",
482 | " def get_prev_cum_frag_size(_fragment_idx):\n",
483 | " if _fragment_idx >= 1:\n",
484 | " return self._cumulative_n_rows[_fragment_idx - 1]\n",
485 | " return 0\n",
486 | "\n",
487 | " # Calculate fragment idx\n",
488 | " fragment_idx = self._fragment_idx\n",
489 | " fragment_changed = False\n",
490 | " _prev_size = get_prev_cum_frag_size(fragment_idx)\n",
491 | " _cur_size = self._cumulative_n_rows[self._fragment_idx]\n",
492 | " if (idx < _prev_size) or (idx >= _cur_size):\n",
493 | " fragment_idx = bisect_right(self._cumulative_n_rows, idx)\n",
494 | " assert fragment_idx < len(self.fragments)\n",
495 | " # fragment_idx %= len(self.fragments)\n",
496 | " fragment_changed = self._fragment_idx != fragment_idx\n",
497 | " self._fragment_idx = fragment_idx\n",
498 | " self._batch_idx = 0\n",
499 | " \n",
500 | " if self._batches:\n",
501 | " self._batches.clear()\n",
502 | " \n",
503 | " del self._batches\n",
504 | " del self._batch\n",
505 | " del self._df\n",
506 | " self._batches = None\n",
507 | " self._batch = None\n",
508 | " self._df = None\n",
509 | " \n",
510 | " print('_get_next 02', idx)\n",
511 | " # Calculate batch idx\n",
512 | " _prev_size = get_prev_cum_frag_size(fragment_idx)\n",
513 | " batch_idx = idx - _prev_size\n",
514 | " batch_changed = batch_idx < self._batch_idx\n",
515 | "\n",
516 | " # Calculate batches of the fragment\n",
517 | " if self._batches is None or fragment_changed or batch_changed:\n",
518 | " if self._batches:\n",
519 | " self._batches.clear()\n",
520 | " \n",
521 | " self.batches = self.fragments[fragment_idx].to_batches()\n",
522 | " self._batch = None\n",
523 | "\n",
524 | " if self._batch is None:\n",
525 | " self._batch = next(self.batches)\n",
526 | " del self._df\n",
527 | " self._df = self._batch.to_pandas()\n",
528 | " self._batch_idx = 0\n",
529 | " \n",
530 | " print('_get_next 03', idx)\n",
531 | " need_to_load_data = False\n",
532 | " while True:\n",
533 | " print(\n",
534 | " \"ITER:\",\n",
535 | " f\"{self._batch_idx} <= {batch_idx} < {self._batch_idx + self._batch.num_rows} | {sys.getsizeof(self._batch)}\",\n",
536 | " )\n",
537 | " if (\n",
538 | " self._batch_idx\n",
539 | " <= batch_idx\n",
540 | " < self._batch_idx + self._batch.num_rows\n",
541 | " ):\n",
542 | " if need_to_load_data:\n",
543 | " self._df = self._batch.to_pandas()\n",
544 | " break\n",
545 | "\n",
546 | " need_to_load_data = True\n",
547 | " self._batch_idx += self._batch.num_rows\n",
548 | " self._batch = next(self.batches)\n",
549 | " \n",
550 | " print('_get_next 04', idx)\n",
551 | " return fragment_idx, batch_idx - self._batch_idx\n",
552 | " \n",
553 | " def __del__(self):\n",
554 | " print('Deleted')\n",
555 | " if self.dataset:\n",
556 | " self.dataset.clear()\n",
557 | " \n",
558 | " if self.fragments:\n",
559 | " self.fragments.clearn\n",
560 | " del self.dataset\n",
561 | " del self.fragments\n",
562 | " del self._batches\n",
563 | " del self._batch\n",
564 | " del self._df\n",
565 | "\n",
566 | "\n",
567 | " def __len__(self):\n",
568 | " return self._cumulative_n_rows[-1]\n",
569 | "\n",
570 | " def __getitem__(self, idx):\n",
571 | " print('__getitem__', idx)\n",
572 | " fragment_idx, batch_idx = self._get_next(idx)\n",
573 | "\n",
574 | " row = self._df.iloc[batch_idx][[\"idx\"]]\n",
575 | " row = row.fillna(0)\n",
576 | " row[\"fragment_idx\"] = fragment_idx\n",
577 | " row[\"batch_idx\"] = batch_idx\n",
578 | " return row, idx\n",
579 | " \n",
580 | " \n",
581 | "\n",
582 | "\n",
583 | "tracemalloc.start()\n",
584 | "dataset = PyArrowDataset(\"./data\")\n",
585 | "print(dataset[50000][0].idx)\n",
586 | "print(dataset[0][0].idx)\n",
587 | "print(dataset[500000][0].idx)\n",
588 | "\n",
589 | "print('여기까지')\n",
590 | "del dataset\n",
591 | "print(tracemalloc.get_traced_memory())\n",
592 | "print(gc.get_count())"
593 | ]
594 | },
595 | {
596 | "cell_type": "code",
597 | "execution_count": null,
598 | "id": "4721df80-e665-40b4-b281-5f5ee285406d",
599 | "metadata": {
600 | "tags": []
601 | },
602 | "outputs": [],
603 | "source": [
604 | "from torch.utils.data import DataLoader\n",
605 | "\n",
606 | "loader = DataLoader(dataset, batch_size=64, shuffle=True)\n",
607 | "data, labels = next(iter(loader))\n",
608 | "a = data[:, 0] - 1\n",
609 | "b = labels % 1000\n",
610 | "\n",
611 | "a == b"
612 | ]
613 | },
614 | {
615 | "cell_type": "markdown",
616 | "id": "3b078d63-87a2-45b2-8316-1622e5e1d39d",
617 | "metadata": {},
618 | "source": [
619 | "\n",
620 | "\n",
621 | "\n",
622 | "\n",
623 | "\n",
624 | "# ParquetFile\n",
625 | "\n",
626 | "## Row 개수"
627 | ]
628 | },
629 | {
630 | "cell_type": "code",
631 | "execution_count": null,
632 | "id": "c9036712-a6f0-47db-bf95-358bcdcabca0",
633 | "metadata": {
634 | "tags": []
635 | },
636 | "outputs": [],
637 | "source": [
638 | "from pyarrow.parquet import ParquetFile\n",
639 | "\n",
640 | "parquet_file = ParquetFile(\"./data/dt=20230101/userdata.parquet\")\n",
641 | "\n",
642 | "print(\"parquet_file size: \", sys.getsizeof(parquet_file))\n",
643 | "parquet_file.metadata"
644 | ]
645 | },
646 | {
647 | "cell_type": "code",
648 | "execution_count": 25,
649 | "id": "26a0352e-1049-43cf-9f4b-651c2b0a215f",
650 | "metadata": {
651 | "collapsed": true,
652 | "jupyter": {
653 | "outputs_hidden": true
654 | }
655 | },
656 | "outputs": [
657 | {
658 | "name": "stdout",
659 | "output_type": "stream",
660 | "text": [
661 | "dataset size : 64\n"
662 | ]
663 | },
664 | {
665 | "data": {
666 | "text/plain": [
667 | "['./data/dt=20230101/userdata.parquet']"
668 | ]
669 | },
670 | "execution_count": 25,
671 | "metadata": {},
672 | "output_type": "execute_result"
673 | }
674 | ],
675 | "source": [
676 | "from pyarrow.parquet import ParquetDataset\n",
677 | "\n",
678 | "dataset = ParquetDataset(\"./data\")\n",
679 | "\n",
680 | "print(\"dataset size :\", sys.getsizeof(dataset))\n",
681 | "\n",
682 | "dataset.files"
683 | ]
684 | },
685 | {
686 | "cell_type": "code",
687 | "execution_count": null,
688 | "id": "77eda780-7a45-495a-8a9e-aae88ce0cf49",
689 | "metadata": {
690 | "collapsed": true,
691 | "jupyter": {
692 | "outputs_hidden": true
693 | }
694 | },
695 | "outputs": [],
696 | "source": []
697 | }
698 | ],
699 | "metadata": {
700 | "kernelspec": {
701 | "display_name": "PyEnv 3.9.18",
702 | "language": "python",
703 | "name": "3.9.18"
704 | },
705 | "language_info": {
706 | "codemirror_mode": {
707 | "name": "ipython",
708 | "version": 3
709 | },
710 | "file_extension": ".py",
711 | "mimetype": "text/x-python",
712 | "name": "python",
713 | "nbconvert_exporter": "python",
714 | "pygments_lexer": "ipython3",
715 | "version": "3.9.18"
716 | }
717 | },
718 | "nbformat": 4,
719 | "nbformat_minor": 5
720 | }
721 |
--------------------------------------------------------------------------------
/101-GeoHash/02 Lat Lng - Addition, Angle.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 2,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "name": "stdout",
10 | "output_type": "stream",
11 | "text": [
12 | "Populating the interactive namespace from numpy and matplotlib\n"
13 | ]
14 | }
15 | ],
16 | "source": [
17 | "%pylab inline\n",
18 | "import geohash\n",
19 | "import folium\n",
20 | "\n",
21 | "from geopy.distance import distance\n",
22 | "from polygon_geohasher.polygon_geohasher import geohash_to_polygon"
23 | ]
24 | },
25 | {
26 | "cell_type": "markdown",
27 | "metadata": {},
28 | "source": [
29 | "# Addition"
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": 373,
35 | "metadata": {},
36 | "outputs": [
37 | {
38 | "name": "stdout",
39 | "output_type": "stream",
40 | "text": [
41 | "500m addition\n",
42 | "Latitude Addition: 500.00005283004964\n",
43 | "Longitude Addition: 500.0049490970048\n",
44 | "Both Addition: 707.0996972853251\n",
45 | "1000m addition\n",
46 | "Latitude Addition: 1000.0004879418325\n",
47 | "Longitude Addition: 1000.0098977428217\n",
48 | "Both Addition: 1414.178421176091\n",
49 | "\n",
50 | "5000m addition\n",
51 | "Latitude Addition: 5000.017733038395\n",
52 | "Longitude Addition: 5000.049416527164\n",
53 | "Both Addition: 7070.052813071016\n",
54 | "\n",
55 | "10000m addition\n",
56 | "Latitude Addition: 10000.073709301136\n",
57 | "Longitude Addition: 10000.098381885806\n",
58 | "Both Addition: 14138.005608237756\n",
59 | "\n",
60 | "50000m addition\n",
61 | "Latitude Addition: 50001.900230356274\n",
62 | "Longitude Addition: 50000.41972205647\n",
63 | "Both Addition: 70605.6694353784\n",
64 | "\n"
65 | ]
66 | },
67 | {
68 | "data": {
69 | "text/html": [
70 | ""
71 | ],
72 | "text/plain": [
73 | ""
74 | ]
75 | },
76 | "execution_count": 373,
77 | "metadata": {},
78 | "output_type": "execute_result"
79 | }
80 | ],
81 | "source": [
82 | "def add_meter(lat, lng, lat_meter, lng_meter):\n",
83 | " new_lat = lat + (lat_meter/1000/6359.0899) * (180/np.pi)\n",
84 | " new_lng = lng + (lng_meter/1000/6386) * (180/np.pi) / np.cos(lat * np.pi/180)\n",
85 | " return new_lat, new_lng\n",
86 | "\n",
87 | "m = folium.Map(location=(lat, lng), zoom_start=12)\n",
88 | "lat, lng = 37.499402, 127.054207\n",
89 | "\n",
90 | "folium.Marker((lat, lng), popup='A').add_to(m)\n",
91 | "new_lat, new_lng = add_meter(lat, lng, 500, 500)\n",
92 | "folium.Marker((new_lat, new_lng), popup='500m').add_to(m)\n",
93 | "\n",
94 | "print('500m addition')\n",
95 | "print('Latitude Addition:', distance((lat, lng), (new_lat, lng)).m)\n",
96 | "print('Longitude Addition:', distance((lat, lng), (lat, new_lng)).m)\n",
97 | "print('Both Addition:', distance((lat, lng), (new_lat, new_lng)).m)\n",
98 | "\n",
99 | "new_lat, new_lng = add_meter(lat, lng, 1000, 1000)\n",
100 | "folium.Marker((new_lat, new_lng), popup='500m').add_to(m)\n",
101 | "print('1000m addition')\n",
102 | "print('Latitude Addition:', distance((lat, lng), (new_lat, lng)).m)\n",
103 | "print('Longitude Addition:', distance((lat, lng), (lat, new_lng)).m)\n",
104 | "print('Both Addition:', distance((lat, lng), (new_lat, new_lng)).m)\n",
105 | "print()\n",
106 | "\n",
107 | "new_lat, new_lng = add_meter(lat, lng, 5000, 5000)\n",
108 | "folium.Marker((new_lat, new_lng), popup='500m').add_to(m)\n",
109 | "print('5000m addition')\n",
110 | "print('Latitude Addition:', distance((lat, lng), (new_lat, lng)).m)\n",
111 | "print('Longitude Addition:', distance((lat, lng), (lat, new_lng)).m)\n",
112 | "print('Both Addition:', distance((lat, lng), (new_lat, new_lng)).m)\n",
113 | "print()\n",
114 | "\n",
115 | "new_lat, new_lng = add_meter(lat, lng, 10000, 10000)\n",
116 | "folium.Marker((new_lat, new_lng), popup='500m').add_to(m)\n",
117 | "print('10000m addition')\n",
118 | "print('Latitude Addition:', distance((lat, lng), (new_lat, lng)).m)\n",
119 | "print('Longitude Addition:', distance((lat, lng), (lat, new_lng)).m)\n",
120 | "print('Both Addition:', distance((lat, lng), (new_lat, new_lng)).m)\n",
121 | "print()\n",
122 | "\n",
123 | "\n",
124 | "new_lat, new_lng = add_meter(lat, lng, 50000, 50000)\n",
125 | "folium.Marker((new_lat, new_lng), popup='500m').add_to(m)\n",
126 | "print('50000m addition')\n",
127 | "print('Latitude Addition:', distance((lat, lng), (new_lat, lng)).m)\n",
128 | "print('Longitude Addition:', distance((lat, lng), (lat, new_lng)).m)\n",
129 | "print('Both Addition:', distance((lat, lng), (new_lat, new_lng)).m)\n",
130 | "print()\n",
131 | "\n",
132 | "# Visualization\n",
133 | "m"
134 | ]
135 | },
136 | {
137 | "cell_type": "markdown",
138 | "metadata": {},
139 | "source": [
140 | "# 세 지점 간의 각도 계산\n",
141 | "\n",
142 | "https://medium.com/@manivannan_data/find-the-angle-between-three-points-from-2d-using-python-348c513e2cd\n",
143 | "\n"
144 | ]
145 | },
146 | {
147 | "cell_type": "code",
148 | "execution_count": 417,
149 | "metadata": {},
150 | "outputs": [
151 | {
152 | "name": "stdout",
153 | "output_type": "stream",
154 | "text": [
155 | "Angle: 90.0\n"
156 | ]
157 | }
158 | ],
159 | "source": [
160 | "def calculate_angle(cur_location: np.ndarray, p1: np.ndarray, p2: np.ndarray) -> float:\n",
161 | " \"\"\"\n",
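162 | " :param cur_location: the current location as (latitude, longitude); the vertex of the angle.\n",
163 | " :param p1: first end point as (latitude, longitude)\n",
164 | " :param p2: second end point as (latitude, longitude)\n",
165 | " :return: the angle at cur_location between p1 and p2, in degrees, rounded to 4 decimal places\n",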
166 | " \"\"\"\n",
167 | " ab = cur_location - p1\n",
168 | " ac = cur_location - p2\n",
169 | " \n",
170 | " _direction = (np.dot(ab, ac) ) / (np.linalg.norm(ab) * np.linalg.norm(ac))\n",
171 | " _direction = min(max(_direction, -1), 1)\n",
172 | " angle = np.arccos(_direction)\n",
173 | " angle = np.degrees(angle)\n",
174 | " angle = np.nan_to_num(angle)\n",
175 | " return round(float(angle), 4)\n",
176 | " \n",
177 | "a = np.array([0, 0])\n",
178 | "b = np.array([5, 0])\n",
179 | "c = np.array([0, 5])\n",
180 | "\n",
181 | "print('Angle:', calculate_angle(a, b, c))"
182 | ]
183 | },
184 | {
185 | "cell_type": "code",
186 | "execution_count": 416,
187 | "metadata": {},
188 | "outputs": [
189 | {
190 | "name": "stdout",
191 | "output_type": "stream",
192 | "text": [
193 | "Angle: 28.53\n"
194 | ]
195 | },
196 | {
197 | "data": {
198 | "text/html": [
199 | ""
200 | ],
201 | "text/plain": [
202 | ""
203 | ]
204 | },
205 | "execution_count": 416,
206 | "metadata": {},
207 | "output_type": "execute_result"
208 | }
209 | ],
210 | "source": [
211 | "a = np.array([37.388641, 127.092138]) # Current location\n",
212 | "b = np.array([37.393937, 127.112294])\n",
213 | "c = np.array([37.381100, 127.122811])\n",
214 | "\n",
215 | "angle = calculate_angle(a, b, c)\n",
216 | "\n",
217 | "print('Angle:', round(angle, 2))\n",
218 | "\n",
219 | "m = folium.Map(location=(a+b+c)/3, zoom_start=13)\n",
220 | "folium.Marker(a, popup='A Current Location', icon=folium.Icon(color='black')).add_to(m)\n",
221 | "folium.Marker(b, popup='B Favorite Off Location', icon=folium.Icon(color='green')).add_to(m)\n",
222 | "folium.Marker(c, popup=f'C Call {int(angle)} degree', icon=folium.Icon(color='red')).add_to(m)\n",
223 | "folium.PolyLine([a, b], color='green').add_to(m)\n",
224 | "folium.PolyLine([a, c], color='red').add_to(m)\n",
225 | "m"
226 | ]
227 | },
228 | {
229 | "cell_type": "markdown",
230 | "metadata": {},
231 | "source": [
232 | "# Circle"
233 | ]
234 | },
235 | {
236 | "cell_type": "code",
237 | "execution_count": 5,
238 | "metadata": {},
239 | "outputs": [
240 | {
241 | "name": "stdout",
242 | "output_type": "stream",
243 | "text": [
244 | "distance in km: 17.393393505034894\n"
245 | ]
246 | },
247 | {
248 | "data": {
249 | "text/html": [
250 | ""
251 | ],
252 | "text/plain": [
253 | ""
254 | ]
255 | },
256 | "execution_count": 5,
257 | "metadata": {},
258 | "output_type": "execute_result"
259 | }
260 | ],
261 | "source": [
262 | "a = np.array([37.388641, 127.092138])\n",
263 | "b = np.array([37.498641, 126.952138])\n",
264 | "\n",
265 | "print('distance in km:', distance(a, b).km)\n",
266 | "\n",
267 | "m = folium.Map(location=(a+b)/2, zoom_start=11)\n",
268 | "folium.Marker(a, popup='A', icon=folium.Icon(color='black')).add_to(m)\n",
269 | "folium.Marker(b, popup='B', icon=folium.Icon(color='green')).add_to(m)\n",
270 | "folium.Circle((a+b)/2, 10000, tooltip='test').add_to(m)\n",
271 | "folium.Circle((a+b)/2, 5000, tooltip='test').add_to(m)\n",
272 | "m"
273 | ]
274 | }
275 | ],
276 | "metadata": {
277 | "kernelspec": {
278 | "display_name": "Python 3",
279 | "language": "python",
280 | "name": "python3"
281 | },
282 | "language_info": {
283 | "codemirror_mode": {
284 | "name": "ipython",
285 | "version": 3
286 | },
287 | "file_extension": ".py",
288 | "mimetype": "text/x-python",
289 | "name": "python",
290 | "nbconvert_exporter": "python",
291 | "pygments_lexer": "ipython3",
292 | "version": "3.6.7"
293 | },
294 | "toc": {
295 | "base_numbering": 1,
296 | "nav_menu": {},
297 | "number_sections": true,
298 | "sideBar": true,
299 | "skip_h1_title": false,
300 | "title_cell": "Table of Contents",
301 | "title_sidebar": "Contents",
302 | "toc_cell": false,
303 | "toc_position": {},
304 | "toc_section_display": true,
305 | "toc_window_display": false
306 | }
307 | },
308 | "nbformat": 4,
309 | "nbformat_minor": 2
310 | }
311 |
--------------------------------------------------------------------------------
/200-Kubernetes/02-Generate-Fashion-MNIST-Sample-Images.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 15,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "name": "stdout",
10 | "output_type": "stream",
11 | "text": [
12 | "Populating the interactive namespace from numpy and matplotlib\n",
13 | "x_train: (60000, 28, 28)\n",
14 | "y_train: (60000,)\n",
15 | "x_test: (10000, 28, 28)\n",
16 | "y_test: (10000,)\n"
17 | ]
18 | }
19 | ],
20 | "source": [
21 | "%pylab inline\n",
22 | "import keras\n",
23 | "import imageio\n",
24 | "import os\n",
25 | "\n",
26 | "fashion_mnist = keras.datasets.fashion_mnist\n",
27 | "\n",
28 | "class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',\n",
29 | " 'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']\n",
30 | "(x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()\n",
31 | "\n",
32 | "print('x_train:', x_train.shape)\n",
33 | "print('y_train:', y_train.shape)\n",
34 | "print('x_test:', x_test.shape)\n",
35 | "print('y_test:', y_test.shape)"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": null,
41 | "metadata": {},
42 | "outputs": [],
43 | "source": [
44 | "import scipy.misc.ims"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": 22,
50 | "metadata": {},
51 | "outputs": [
52 | {
53 | "name": "stdout",
54 | "output_type": "stream",
55 | "text": [
56 | "(28, 28)\n",
57 | "(28, 28)\n",
58 | "(28, 28)\n",
59 | "(28, 28)\n",
60 | "(28, 28)\n",
61 | "(28, 28)\n",
62 | "(28, 28)\n",
63 | "(28, 28)\n",
64 | "(28, 28)\n",
65 | "(28, 28)\n",
66 | "(28, 28)\n",
67 | "(28, 28)\n",
68 | "(28, 28)\n",
69 | "(28, 28)\n",
70 | "(28, 28)\n",
71 | "(28, 28)\n",
72 | "(28, 28)\n",
73 | "(28, 28)\n",
74 | "(28, 28)\n",
75 | "(28, 28)\n",
76 | "(28, 28)\n",
77 | "(28, 28)\n",
78 | "(28, 28)\n",
79 | "(28, 28)\n",
80 | "(28, 28)\n"
81 | ]
82 | },
83 | {
84 | "data": {
85 | "image/png": "\n",
86 | "text/plain": [
87 | ""
88 | ]
89 | },
90 | "metadata": {},
91 | "output_type": "display_data"
92 | }
93 | ],
94 | "source": [
95 | "if not os.path.exists('./sample'):\n",
96 | " os.makedirs('./sample')\n",
97 | "\n",
98 | "plt.figure(figsize=(10,10))\n",
99 | "for i in range(25):\n",
100 | " idx = np.random.randint(0, x_train.shape[0])\n",
101 | " plt.subplot(5,5,i+1)\n",
102 | " plt.xticks([])\n",
103 | " plt.yticks([])\n",
104 | " plt.grid(False)\n",
105 | " plt.imshow(x_train[idx], cmap=plt.cm.binary)\n",
106 | " plt.xlabel(class_names[y_train[idx]])\n",
107 | " \n",
108 | " imageio.imsave(f'./sample/sample_{idx:03}.jpg', x_train[idx])\n",
109 | "plt.show()"
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": null,
115 | "metadata": {},
116 | "outputs": [],
117 | "source": []
118 | }
119 | ],
120 | "metadata": {
121 | "kernelspec": {
122 | "display_name": "Python 3",
123 | "language": "python",
124 | "name": "python3"
125 | },
126 | "language_info": {
127 | "codemirror_mode": {
128 | "name": "ipython",
129 | "version": 3
130 | },
131 | "file_extension": ".py",
132 | "mimetype": "text/x-python",
133 | "name": "python",
134 | "nbconvert_exporter": "python",
135 | "pygments_lexer": "ipython3",
136 | "version": "3.8.2"
137 | }
138 | },
139 | "nbformat": 4,
140 | "nbformat_minor": 4
141 | }
142 |
--------------------------------------------------------------------------------