├── pyspark ├── study_apache_spark │ ├── README.md │ ├── profile_pyspark_jupyter.sh │ ├── data │ │ └── sample_text.txt │ ├── scala │ │ ├── scala_dataframe.ipynb │ │ └── scala_rdd.ipynb │ └── rdd_co_ban.ipynb └── notebooks │ ├── makers.csv │ ├── sample_text.txt │ ├── structured_data.ipynb │ ├── Intro_DataFrame.ipynb │ └── spark_essentials.ipynb ├── python ├── data │ ├── comma_delimited_stock_prices.csv │ ├── tab_delimited_stock_prices.tsv │ ├── colors.json │ ├── colon_delimited_stock_prices.csv │ ├── terminal.md │ └── markdown_examples.md ├── models │ └── linear_model_v1.pkl ├── matrix_algorithms.py ├── regression_algorithms.py ├── visualizing_data.py ├── recommender │ └── song_recommender.py ├── getting_data.py ├── clustering │ └── document_retrieval.py ├── sentiment_analysis │ └── classification_algorithms.py └── jupyter │ └── Getting started with iPython Notebook.ipynb ├── spark └── notebooks │ └── data │ └── graphx │ ├── followers.txt │ └── users.txt ├── deep_learning ├── figs │ └── dogs.jpg ├── snippets │ ├── multitask_learning.py │ ├── mnist_classifies.py │ ├── sift_cats_vs_dogs.py │ ├── knn_cats_vs_dogs.py │ └── training_network.py ├── src │ ├── image_segmentation.py │ ├── prototype.py │ └── main.py ├── basics.py ├── output │ └── submission_results.csv └── data │ └── stage1_sample_submission.csv ├── word2vec ├── models │ └── first_model ├── brat_tokenize_ann.py ├── parse_xml.py ├── data │ ├── sample_tokenize.txt.sent.tkn.wseg │ └── sample_tokenize.ann └── gensim_test.ipynb ├── computer_vision └── color_clustering │ ├── son_tung │ ├── son_tung_1.png │ ├── son_tung_2.png │ └── son_tung_3.png │ ├── fig_out │ ├── color_pallete_son_tung_1.png │ ├── color_pallete_son_tung_2.png │ └── color_pallete_son_tung_3.png │ └── color_kmeans.py ├── profile_pyspark_jupyter ├── .gitignore └── README.md /pyspark/study_apache_spark/README.md: -------------------------------------------------------------------------------- 1 | # topdev_talks_Jul_2017 -------------------------------------------------------------------------------- /python/data/comma_delimited_stock_prices.csv: -------------------------------------------------------------------------------- 1 | VAF,13.3 2 | VCF,152.4 3 | ATA,0.8 4 | -------------------------------------------------------------------------------- /spark/notebooks/data/graphx/followers.txt: -------------------------------------------------------------------------------- 1 | 2 1 2 | 4 1 3 | 1 2 4 | 6 3 5 | 7 3 6 | 7 6 7 | 6 7 8 | 3 7 -------------------------------------------------------------------------------- /deep_learning/figs/dogs.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ongxuanhong/data-science-works/HEAD/deep_learning/figs/dogs.jpg -------------------------------------------------------------------------------- /word2vec/models/first_model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ongxuanhong/data-science-works/HEAD/word2vec/models/first_model -------------------------------------------------------------------------------- /python/models/linear_model_v1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ongxuanhong/data-science-works/HEAD/python/models/linear_model_v1.pkl -------------------------------------------------------------------------------- /python/data/tab_delimited_stock_prices.tsv: 
-------------------------------------------------------------------------------- 1 | TV2 45,012 147.4 +13.4 (+10.00%) 2 | CTT 100 6.6 +0.6 (+10.00%) 3 | PCE 100 16.6 +1.5 (+9.93%) 4 | HAT 13,300 73.2 +6.6 (+9.87%) -------------------------------------------------------------------------------- /computer_vision/color_clustering/son_tung/son_tung_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ongxuanhong/data-science-works/HEAD/computer_vision/color_clustering/son_tung/son_tung_1.png -------------------------------------------------------------------------------- /computer_vision/color_clustering/son_tung/son_tung_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ongxuanhong/data-science-works/HEAD/computer_vision/color_clustering/son_tung/son_tung_2.png -------------------------------------------------------------------------------- /computer_vision/color_clustering/son_tung/son_tung_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ongxuanhong/data-science-works/HEAD/computer_vision/color_clustering/son_tung/son_tung_3.png -------------------------------------------------------------------------------- /profile_pyspark_jupyter: -------------------------------------------------------------------------------- 1 | export PATH=$PATH:/usr/local/Cellar/apache-spark/2.2.0/bin 2 | export PYSPARK_DRIVER_PYTHON=jupyter 3 | export PYSPARK_DRIVER_PYTHON_OPTS='notebook' pyspark 4 | -------------------------------------------------------------------------------- /python/data/colors.json: -------------------------------------------------------------------------------- 1 | { 2 | "red": "#f00", 3 | "green": "#0f0", 4 | "blue": "#00f", 5 | "cyan": "#0ff", 6 | "magenta": "#f0f", 7 | "yellow": "#ff0", 8 | "black": "#000" 9 | } -------------------------------------------------------------------------------- /python/data/colon_delimited_stock_prices.csv: -------------------------------------------------------------------------------- 1 | MA_CK:KL:GIA:DELTA 2 | TV2:45,012:147.4:+13.4 (+10.00%) 3 | CTT:100:6.6:+0.6 (+10.00%) 4 | PCE:100:16.6:+1.5 (+9.93%) 5 | HAT:13,300:73.2:+6.6 (+9.87%) -------------------------------------------------------------------------------- /computer_vision/color_clustering/fig_out/color_pallete_son_tung_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ongxuanhong/data-science-works/HEAD/computer_vision/color_clustering/fig_out/color_pallete_son_tung_1.png -------------------------------------------------------------------------------- /computer_vision/color_clustering/fig_out/color_pallete_son_tung_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ongxuanhong/data-science-works/HEAD/computer_vision/color_clustering/fig_out/color_pallete_son_tung_2.png -------------------------------------------------------------------------------- /computer_vision/color_clustering/fig_out/color_pallete_son_tung_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ongxuanhong/data-science-works/HEAD/computer_vision/color_clustering/fig_out/color_pallete_son_tung_3.png -------------------------------------------------------------------------------- 
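Reading the delimited stock-price samples: the `python/data` files above come in comma-, tab-, and colon-separated flavours (the colon-delimited file carries a `MA_CK:KL:GIA:DELTA` header row) and are the kind of input handled by `python/getting_data.py` further down. Below is a minimal, self-contained sketch of parsing them with the standard `csv` module; it assumes the snippet is run from the repository root, and the field names are taken directly from the colon-delimited header.

```python
import csv

# comma-delimited rows look like "VAF,13.3": ticker, closing price
with open("python/data/comma_delimited_stock_prices.csv") as f:
    for ticker, price in csv.reader(f, delimiter=","):
        print(ticker + " closed at " + price)

# tab-delimited rows have no header: ticker, volume, price, change
with open("python/data/tab_delimited_stock_prices.tsv") as f:
    for row in csv.reader(f, delimiter="\t"):
        print(row)

# colon-delimited file has a header row (MA_CK:KL:GIA:DELTA), so DictReader applies
with open("python/data/colon_delimited_stock_prices.csv") as f:
    for record in csv.DictReader(f, delimiter=":"):
        print(record["MA_CK"] + ": " + record["GIA"] + " (" + record["DELTA"] + ")")
```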
/pyspark/study_apache_spark/profile_pyspark_jupyter.sh: -------------------------------------------------------------------------------- 1 | export PATH=$PATH:/usr/local/Cellar/apache-spark/2.2.0/bin 2 | export PYSPARK_DRIVER_PYTHON=jupyter 3 | export PYSPARK_DRIVER_PYTHON_OPTS='notebook' pyspark 4 | -------------------------------------------------------------------------------- /spark/notebooks/data/graphx/users.txt: -------------------------------------------------------------------------------- 1 | 1,BarackObama,Barack Obama 2 | 2,ladygaga,Goddess of Love 3 | 3,jeresig,John Resig 4 | 4,justinbieber,Justin Bieber 5 | 6,matei_zaharia,Matei Zaharia 6 | 7,odersky,Martin Odersky 7 | 8,anonsys -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | deep_learning/sample_images/ 3 | deep_learning/snippets/kaggle_dogs_vs_cats/ 4 | python/clustering/people_wiki.csv 5 | deep_learning/processed_segmentation/ 6 | metastore_db/ 7 | *.log 8 | .ipynb_checkpoints/ 9 | .DS_Store 10 | -------------------------------------------------------------------------------- /pyspark/notebooks/makers.csv: -------------------------------------------------------------------------------- 1 | id,maker_name,years 2 | 1,Porsche,2011 3 | 2,Nissan,2011 4 | 3,Dodge,2008 5 | 4,Cadillac,2006 6 | 5,Land Rover,2011 7 | 6,Mazda,1988 8 | 7,Isuzu,1998 9 | 8,Hyundai,2012 10 | 9,Hyundai,2006 11 | 10,Chevrolet,1998 12 | -------------------------------------------------------------------------------- /python/matrix_algorithms.py: -------------------------------------------------------------------------------- 1 | """ 2 | MATRIX FACTORIZATION & DIMENSIONALITY REDUCTION 3 | Case study: Recommending Products 4 | Models: 5 | Collaborative filtering 6 | Matrix factorization 7 | PCA 8 | Algorithms: 9 | Coordinate descent 10 | Eigen decomposition 11 | SVD 12 | Concepts: 13 | Matrix completion, eigenvalues, random projections, cold-start problem, diversity, scaling up 14 | """ 15 | 16 | if __name__ == "__main__": 17 | print "Hello" 18 | -------------------------------------------------------------------------------- /word2vec/brat_tokenize_ann.py: -------------------------------------------------------------------------------- 1 | if __name__ == "__main__": 2 | file_wseg = "data/sample_tokenize.txt.sent.tkn.wseg" 3 | file_ann = "data/sample_tokenize.ann" 4 | out_str = "" 5 | with open(file_wseg, "r") as text_file: 6 | curr_pos = 0 7 | curr_tag_id = 1 8 | lines = text_file.readlines() 9 | for line in lines: 10 | words = line.split(" ") 11 | for word in words: 12 | sub_words = word.split("_") 13 | for idx, sub in enumerate(sub_words): 14 | begin_span = curr_pos 15 | end_span = curr_pos + len(sub.decode("utf-8")) 16 | if idx == 0: 17 | tag_name = "B_W" 18 | else: 19 | tag_name = "I_W" 20 | 21 | out_str += "T" + str(curr_tag_id) + "\t" + tag_name + " " + str(begin_span) + " " + str( 22 | end_span) + "\t" + sub + "\n" 23 | curr_pos = end_span + 1 24 | curr_tag_id += 1 25 | 26 | with open(file_ann, "w") as out_ann: 27 | out_ann.writelines(out_str) 28 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Table of Contents 2 | ## Visualizing 3 | Source: [Visualizing Data](https://github.com/ongxuanhong/data-science-works/blob/master/python/visualizing_data.py) 4 | 5 | Markers types 6 | ``` 7 | 
================ =============================== 8 | character description 9 | ================ =============================== 10 | - solid line style 11 | -- dashed line style 12 | -. dash-dot line style 13 | : dotted line style 14 | . point marker 15 | , pixel marker 16 | o circle marker 17 | v triangle_down marker 18 | ^ triangle_up marker 19 | < triangle_left marker 20 | > triangle_right marker 21 | 1 tri_down marker 22 | 2 tri_up marker 23 | 3 tri_left marker 24 | 4 tri_right marker 25 | s square marker 26 | p pentagon marker 27 | * star marker 28 | h hexagon1 marker 29 | H hexagon2 marker 30 | + plus marker 31 | x x marker 32 | D diamond marker 33 | d thin_diamond marker 34 | | vline marker 35 | _ hline marker 36 | ================ =============================== 37 | ``` -------------------------------------------------------------------------------- /pyspark/notebooks/sample_text.txt: -------------------------------------------------------------------------------- 1 | Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum. 2 | 3 | Contrary to popular belief, Lorem Ipsum is not simply random text. It has roots in a piece of classical Latin literature from 45 BC, making it over 2000 years old. Richard McClintock, a Latin professor at Hampden-Sydney College in Virginia, looked up one of the more obscure Latin words, consectetur, from a Lorem Ipsum passage, and going through the cites of the word in classical literature, discovered the undoubtable source. Lorem Ipsum comes from sections 1.10.32 and 1.10.33 of "de Finibus Bonorum et Malorum" (The Extremes of Good and Evil) by Cicero, written in 45 BC. This book is a treatise on the theory of ethics, very popular during the Renaissance. The first line of Lorem Ipsum, "Lorem ipsum dolor sit amet..", comes from a line in section 1.10.32. 4 | 5 | The standard chunk of Lorem Ipsum used since the 1500s is reproduced below for those interested. Sections 1.10.32 and 1.10.33 from "de Finibus Bonorum et Malorum" by Cicero are also reproduced in their exact original form, accompanied by English versions from the 1914 translation by H. Rackham. 6 | -------------------------------------------------------------------------------- /pyspark/study_apache_spark/data/sample_text.txt: -------------------------------------------------------------------------------- 1 | Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum. 2 | 3 | Contrary to popular belief, Lorem Ipsum is not simply random text. 
It has roots in a piece of classical Latin literature from 45 BC, making it over 2000 years old. Richard McClintock, a Latin professor at Hampden-Sydney College in Virginia, looked up one of the more obscure Latin words, consectetur, from a Lorem Ipsum passage, and going through the cites of the word in classical literature, discovered the undoubtable source. Lorem Ipsum comes from sections 1.10.32 and 1.10.33 of "de Finibus Bonorum et Malorum" (The Extremes of Good and Evil) by Cicero, written in 45 BC. This book is a treatise on the theory of ethics, very popular during the Renaissance. The first line of Lorem Ipsum, "Lorem ipsum dolor sit amet..", comes from a line in section 1.10.32. 4 | 5 | The standard chunk of Lorem Ipsum used since the 1500s is reproduced below for those interested. Sections 1.10.32 and 1.10.33 from "de Finibus Bonorum et Malorum" by Cicero are also reproduced in their exact original form, accompanied by English versions from the 1914 translation by H. Rackham. 6 | -------------------------------------------------------------------------------- /word2vec/parse_xml.py: -------------------------------------------------------------------------------- 1 | import os 2 | from multiprocessing import Pool 3 | from xml.dom import minidom 4 | 5 | from bs4 import BeautifulSoup 6 | 7 | 8 | def save_files(infos): 9 | (xml_path, saved_file) = infos 10 | 11 | if os.path.isfile(saved_file): 12 | os.unlink(saved_file) 13 | 14 | try: 15 | # parse xml file and get all articles 16 | doc = minidom.parse(xml_path) 17 | articles = doc.getElementsByTagName("article") 18 | 19 | # inspecting some stats 20 | total = len(articles) 21 | print "Processing", xml_path, total 22 | 23 | # get content from article and save to new file 24 | for art in articles: 25 | content = art.getElementsByTagName("content")[0] 26 | soup = BeautifulSoup(content.firstChild.data, "html5lib") 27 | text = soup.get_text().strip() 28 | 29 | # save to new file 30 | with open(saved_file, "a") as text_file: 31 | text_file.write(text.encode("utf8") + "\n") 32 | except Exception as e: 33 | print xml_path 34 | print "Error:", e 35 | 36 | 37 | if __name__ == "__main__": 38 | total_articles = 0 39 | total_error = 0 40 | dirname = "/Users/hongong/Downloads/baomoi_articles" 41 | dir_sentences = "/Users/hongong/Downloads/sentences/" 42 | 43 | list_files = [] 44 | for file_name in os.listdir(dirname): 45 | # get xml path, unlink before generating new content 46 | xml_path = os.path.join(dirname, file_name) 47 | saved_file = dir_sentences + file_name.split(".")[0] + ".txt" 48 | list_files.append((xml_path, saved_file)) 49 | 50 | p = Pool(16) 51 | p.map(save_files, list_files) 52 | -------------------------------------------------------------------------------- /deep_learning/snippets/multitask_learning.py: -------------------------------------------------------------------------------- 1 | # GRAPH CODE 2 | # ============ 3 | 4 | # Import Tensorflow and Numpy 5 | import numpy as np 6 | import tensorflow as tf 7 | 8 | # ====================== 9 | # Define the Graph 10 | # ====================== 11 | 12 | # Define the Placeholders 13 | X = tf.placeholder("float", [10, 10], name="X") 14 | Y1 = tf.placeholder("float", [10, 20], name="Y1") 15 | Y2 = tf.placeholder("float", [10, 20], name="Y2") 16 | 17 | # Define the weights for the layers 18 | 19 | initial_shared_layer_weights = np.random.rand(10, 20) 20 | initial_Y1_layer_weights = np.random.rand(20, 20) 21 | initial_Y2_layer_weights = np.random.rand(20, 20) 22 | 23 | 
shared_layer_weights = tf.Variable(initial_shared_layer_weights, name="share_W", dtype="float32") 24 | Y1_layer_weights = tf.Variable(initial_Y1_layer_weights, name="share_Y1", dtype="float32") 25 | Y2_layer_weights = tf.Variable(initial_Y2_layer_weights, name="share_Y2", dtype="float32") 26 | 27 | # Construct the Layers with RELU Activations 28 | shared_layer = tf.nn.relu(tf.matmul(X, shared_layer_weights)) 29 | Y1_layer = tf.nn.relu(tf.matmul(shared_layer, Y1_layer_weights)) 30 | Y2_layer = tf.nn.relu(tf.matmul(shared_layer, Y2_layer_weights)) 31 | 32 | # Calculate Loss 33 | Y1_Loss = tf.nn.l2_loss(Y1 - Y1_layer) 34 | Y2_Loss = tf.nn.l2_loss(Y2 - Y2_layer) 35 | Joint_Loss = Y1_Loss + Y2_Loss 36 | 37 | # optimisers 38 | Optimiser = tf.train.AdamOptimizer().minimize(Joint_Loss) 39 | Y1_op = tf.train.AdamOptimizer().minimize(Y1_Loss) 40 | Y2_op = tf.train.AdamOptimizer().minimize(Y2_Loss) 41 | 42 | # Calculation (Session) Code 43 | # ========================== 44 | 45 | # open the session 46 | 47 | with tf.Session() as session: 48 | session.run(tf.initialize_all_variables()) 49 | _, Joint_Loss = session.run([Optimiser, Joint_Loss], 50 | { 51 | X: np.random.rand(10, 10) * 10, 52 | Y1: np.random.rand(10, 20) * 10, 53 | Y2: np.random.rand(10, 20) * 10 54 | }) 55 | print(Joint_Loss) 56 | -------------------------------------------------------------------------------- /deep_learning/snippets/mnist_classifies.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | from sklearn import datasets, svm, metrics 3 | 4 | if __name__ == "__main__": 5 | # The digits dataset 6 | digits = datasets.load_digits() 7 | 8 | # The data that we are interested in is made of 8x8 images of digits, let's 9 | # have a look at the first 4 images, stored in the `images` attribute of the 10 | # dataset. If we were working from image files, we could load them using 11 | # matplotlib.pyplot.imread. Note that each image must have the same size. For these 12 | # images, we know which digit they represent: it is given in the 'target' of 13 | # the dataset. 
14 | images_and_labels = list(zip(digits.images, digits.target)) 15 | for index, (image, label) in enumerate(images_and_labels[:4]): 16 | plt.subplot(2, 4, index + 1) 17 | plt.axis('off') 18 | plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest') 19 | plt.title('Training: %i' % label) 20 | 21 | # To apply a classifier on this data, we need to flatten the image, to 22 | # turn the data in a (samples, feature) matrix: 23 | n_samples = len(digits.images) 24 | data = digits.images.reshape((n_samples, -1)) 25 | 26 | # Create a classifier: a support vector classifier 27 | classifier = svm.SVC(gamma=0.001) 28 | 29 | # We learn the digits on the first half of the digits 30 | classifier.fit(data[:n_samples / 2], digits.target[:n_samples / 2]) 31 | 32 | # Now predict the value of the digit on the second half: 33 | expected = digits.target[n_samples / 2:] 34 | predicted = classifier.predict(data[n_samples / 2:]) 35 | 36 | print("Classification report for classifier %s:\n%s\n" 37 | % (classifier, metrics.classification_report(expected, predicted))) 38 | print("Confusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted)) 39 | 40 | images_and_predictions = list(zip(digits.images[n_samples / 2:], predicted)) 41 | for index, (image, prediction) in enumerate(images_and_predictions[:4]): 42 | plt.subplot(2, 4, index + 5) 43 | plt.axis('off') 44 | plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest') 45 | plt.title('Prediction: %i' % prediction) 46 | 47 | plt.show() 48 | -------------------------------------------------------------------------------- /word2vec/data/sample_tokenize.txt.sent.tkn.wseg: -------------------------------------------------------------------------------- 1 | Con phà bị lật nghiêng sáng 16.4 khi ca_nô của Cảnh_sát biển đến_nơi trên boong tàu không có hành_khách nào vì hành_khách không_được thông_báo lên khu_vực này và được yêu_cầu ngồi_yên - Ảnh : Cảnh_sát biển Hàn_Quốc / Yonhap Việc thường_xuyên chở hàng_hóa quá_tải trên chiếc phà dùng chở khách cho_thấy lỗ_hổng trong việc quản_lý tàu phà lẫn kiểu kinh_doanh bất_chấp hậu_quả 2 | Cơ_quan đăng_kiểm tàu Hàn_Quốc đầu_năm 2013 xem_xét phà Sewol khi phà đăng_ký cải_tiến để chở thêm nhiều khách 3 | Cơ_quan này cho phà được chở thêm hàng_hóa tối_đa 987 tấn ( tăng 50 % ) với điều_kiện dằn thêm dưới khoang 2.000 tấn nước để cân_bằng 4 | Tuy_nhiên khuyến_cáo này chỉ gửi đến công_ty quản_lý phà mà không được gửi cho Cảnh_sát biển lẫn Hiệp_hội tàu_biển Hàn_Quốc 5 | Phà này sau_đó liên_tục chở hàng_hóa vượt tải_trọng cho_phép 987 tấn như chở hơn 2.000 tấn hàng qua 136 chuyến và trên 3.000 tấn qua 12 lần 6 | Tổng_cộng đến chuyến cuối_cùng ngày 16.4.2014 phà này chở hàng quá_tải đến 246 lần 7 | Và chuyến cuối_cùng phà chở lượng hàng_hóa khủng đến 3.608 tấn cùng 476 người kết_quả là phà lật nghiêng ngoài_khơi đảo Jindo sáng 16.4.2014 làm hơn 300 hành_khách thiệt_mạng 8 | Thuyền_trưởng Lee Joon - seok_khai hàng_hóa chở trên phà ít_hơn các con_số của báo_cáo trên rằng khi chìm phà có chở 657 tấn hàng cùng 150 ô_tô 9 | Tuy_nhiên Cảnh_sát biển tìm thấy đến 180 ô_tô trong_lòng phà dưới biển ! 
10 | Các chuyên_gia tin rằng khi chở quá_tải chỉ cần đảo hướng một_chút cũng có_thể làm phà bị lật vì mất cân_bằng 11 | Và các dữ_liệu hành_trình cho_thấy con phà đã quẹo một góc 45 độ ngay_khi chìm 12 | Lối_vào cảng trên đảo Jindo đầy vòng_hoa tang tưởng_nhớ các nạn_nhân vụ chìm phà_Sewol ngày 28.4.2014 - Ảnh : Reuters Thợ_lặn hiện tìm_kiếm gần_hết các phòng trên phà Sewol chìm dưới biển - Ảnh : Reuters Ngày 4.5 Tổng_thống Hàn_Quốc_Park Geun - hye có chuyến thăm lần 2 các gia_đình nạn_nhân vụ chìm phà đang tạm_trú ở đảo_Jindo 13 | Bà Park nói rằng bà cũng từng đau_khổ vì mất_mát gia_đình nên hiểu_rõ tâm_trạng của mọi_người 14 | Bà hứa sẽ trừng_phạt các cá_nhân liên_quan vụ chìm phà này 15 | Tính đến ngày 4.5 đã có 244 thi_thể được tìm thấy vẫn còn 58 người mất_tích 16 | Số_người được cứu_sống là 174 gồm 22/29 thuyền_viên 17 | Phát_ngôn viên Lực_lượng cứu_hộ phà Sewol ông Ko_Myung - seok cho_biết thợ_lặn đã tìm_kiếm được 60 trong tổng_số 64 phòng của con phà dưới lòng_biển -------------------------------------------------------------------------------- /computer_vision/color_clustering/color_kmeans.py: -------------------------------------------------------------------------------- 1 | # python3 color_kmeans.py --image son_tung.png --clusters 5 2 | import argparse 3 | import os 4 | 5 | import cv2 6 | import numpy as np 7 | from sklearn.cluster import KMeans 8 | 9 | 10 | def get_color_palette(k_cluster, centroids, palette_w=600, palette_h=100): 11 | # initialize the color palette 12 | text_y = int(palette_h / 2) 13 | palette = np.zeros((palette_h, palette_w, 3), dtype="uint8") 14 | startX = 0 15 | 16 | # loop over the color of each cluster 17 | for color in centroids: 18 | # plot the relative percentage of each cluster 19 | endX = startX + (1.0 / k_cluster * palette_w) 20 | text_x = int(startX + 15) 21 | 22 | bgr_code = str(color.astype("uint8").tolist()[0]) + "," 23 | bgr_code += str(color.astype("uint8").tolist()[1]) + "," 24 | bgr_code += str(color.astype("uint8").tolist()[2]) 25 | 26 | cv2.rectangle(palette, (int(startX), 0), (int(endX), palette_h), color.astype("uint8").tolist(), -1) 27 | cv2.putText(palette, bgr_code, (text_x, text_y), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 200)) 28 | startX = endX 29 | 30 | # return the palette 31 | return palette 32 | 33 | 34 | if __name__ == "__main__": 35 | 36 | ap = argparse.ArgumentParser() 37 | ap.add_argument("-i", "--image", required=True, help="Path to the image") 38 | ap.add_argument("-c", "--clusters", required=True, type=int, help="# of clusters") 39 | args = vars(ap.parse_args()) 40 | 41 | for f in os.listdir(args["image"]): 42 | if f.endswith(".png"): 43 | img_path = args["image"] + "/" + f 44 | img_name = os.path.splitext(f)[0] 45 | 46 | # load the image 47 | image = cv2.imread(img_path) 48 | 49 | # reshape the image to be a list of pixels 50 | image = image.reshape((image.shape[0] * image.shape[1], 3)) 51 | 52 | # cluster the pixel intensities 53 | clt = KMeans(n_clusters=args["clusters"]) 54 | clt.fit(image) 55 | 56 | # representing the number of pixels labeled to each color 57 | palette = get_color_palette(args["clusters"], clt.cluster_centers_) 58 | 59 | # save color palette 60 | fig_out = "fig_out/color_pallete_" + img_name + ".png" 61 | cv2.imwrite(fig_out, palette) 62 | print("Done", f) 63 | -------------------------------------------------------------------------------- /deep_learning/src/image_segmentation.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import 
datetime 3 | import glob 4 | import os 5 | import sys 6 | import time 7 | 8 | import numpy as np 9 | 10 | sys.path.append("/usr/local/lib/python2.7/site-packages") 11 | import cv2 12 | import dicom as dicomio 13 | 14 | 15 | def time_diff_str(t1, t2): 16 | """ 17 | Calculates time durations. 18 | """ 19 | diff = t2 - t1 20 | mins = int(diff / 60) 21 | secs = round(diff % 60, 2) 22 | return str(mins) + " mins and " + str(secs) + " seconds" 23 | 24 | 25 | if __name__ == "__main__": 26 | t_start = time.time() 27 | 28 | # construct the argument parse and parse the arguments 29 | ap = argparse.ArgumentParser() 30 | ap.add_argument("-d", "--dataset", required=True, help="path to input dataset") 31 | ap.add_argument("-s", "--saveto", required=True, help="path to saved processed data") 32 | args = vars(ap.parse_args()) 33 | 34 | list_dir = os.listdir(args["dataset"]) 35 | for (i, dir) in enumerate(list_dir): 36 | if os.path.isfile(dir) is False: 37 | basePath = args["dataset"] + "/" + dir 38 | os.chdir(basePath) 39 | images = [] 40 | for f in glob.glob("*.dcm"): 41 | # read dcm file 42 | ds = dicomio.read_file(f) 43 | img = ds.pixel_array 44 | 45 | # normalize image values to [0, 255] 46 | cv2.normalize(img, img, 0, 255, cv2.NORM_MINMAX) 47 | img = cv2.medianBlur(img.astype(np.uint8), 5) 48 | 49 | # image segmentation 50 | thresh = cv2.adaptiveThreshold(img, 51 | 255, 52 | cv2.ADAPTIVE_THRESH_GAUSSIAN_C, 53 | cv2.THRESH_BINARY, 54 | 11, 55 | 2) 56 | images.append(thresh) 57 | 58 | data = np.array(images) 59 | mean_img = np.mean(data, axis=0) 60 | save_name = args["saveto"] + "/" + dir + ".png" 61 | cv2.imwrite(save_name, mean_img) 62 | print "Saved processed image:", save_name 63 | 64 | # show an update every 10 patients 65 | if i > 0 and i % 10 == 0: 66 | print "[INFO] processed {}/{} patients".format(i, len(list_dir)) 67 | print "[INFO] time passed", time_diff_str(t_start, time.time()) 68 | 69 | print "[INFO]", datetime.datetime.now(), "* DONE After *", time_diff_str(t_start, time.time()) 70 | -------------------------------------------------------------------------------- /python/regression_algorithms.py: -------------------------------------------------------------------------------- 1 | """ 2 | REGRESSION 3 | Case study: Predicting house prices 4 | Models: 5 | Linear regression 6 | Regularization: Ridge (L2), Lasso (L1) 7 | Algorithms: 8 | Gradient descent 9 | Coordinate descent 10 | Concepts: 11 | Loss functions, bias-variance tradeoff, cross-validation, sparsity, overfitting, model selection 12 | """ 13 | 14 | import os 15 | 16 | import matplotlib.pyplot as plt 17 | import pandas as pd 18 | from sklearn import linear_model 19 | from sklearn.externals import joblib 20 | from sklearn.linear_model import Ridge 21 | from sklearn.model_selection import train_test_split 22 | from sklearn.pipeline import Pipeline 23 | from sklearn.preprocessing import PolynomialFeatures 24 | 25 | 26 | def get_home_data(): 27 | """Get home data, from local csv.""" 28 | if os.path.exists("data/home_data.csv"): 29 | print("-- home_data.csv found locally") 30 | df = pd.read_csv("data/home_data.csv", index_col=0) 31 | 32 | return df 33 | 34 | 35 | def plotting_features_vs_target(features, x, y): 36 | # define number of subplot 37 | num_feature = len(features) 38 | f, axes = plt.subplots(1, num_feature, sharey=True) 39 | 40 | # plotting 41 | for i in range(0, num_feature): 42 | axes[i].scatter(x[features[i]], y) 43 | axes[i].set_title(features[i]) 44 | 45 | plt.show() 46 | 47 | 48 | if __name__ == "__main__": 49 | df = 
get_home_data() 50 | 51 | # features selection 52 | features = list(["bedrooms", "bathrooms", "grade"]) 53 | print "Features name:", list(df.columns.values) 54 | print "Selected features:", features 55 | y = df["price"] 56 | X = df[features] 57 | 58 | # split data-set into training (70%) and testing set (30%) 59 | x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3) 60 | 61 | # plotting features, target relationships 62 | plotting_features_vs_target(features, x_train, y_train) 63 | 64 | """ 65 | DEFAULT MODEL 66 | """ 67 | # training model 68 | linear = linear_model.LinearRegression() 69 | linear.fit(x_train, y_train) 70 | 71 | # evaluating model 72 | score_trained = linear.score(x_test, y_test) 73 | print "Model scored:", score_trained 74 | 75 | """ 76 | LASSO MODEL 77 | """ 78 | # L1 regularization 79 | lasso_linear = linear_model.Lasso(alpha=1.0) 80 | lasso_linear.fit(x_train, y_train) 81 | 82 | # evaluating L1 regularized model 83 | score_lasso_trained = lasso_linear.score(x_test, y_test) 84 | print "Lasso model scored:", score_lasso_trained 85 | 86 | """ 87 | RIDGE MODEL 88 | """ 89 | # L2 regularization 90 | ridge_linear = Ridge(alpha=1.0) 91 | ridge_linear.fit(x_train, y_train) 92 | 93 | # evaluating L2 regularized model 94 | score_ridge_trained = ridge_linear.score(x_test, y_test) 95 | print "Ridge model scored:", score_ridge_trained 96 | 97 | # saving model 98 | joblib.dump(linear, "models/linear_model_v1.pkl") 99 | 100 | # loading model 101 | clf = joblib.load("models/linear_model_v1.pkl") 102 | predicted = clf.predict(x_test) 103 | print "Predicted test:", predicted 104 | 105 | """ 106 | POLYNOMIAL REGRESSION 107 | """ 108 | poly_model = Pipeline([('poly', PolynomialFeatures(degree=2)), 109 | ('linear', linear_model.LinearRegression(fit_intercept=False))]) 110 | poly_model = poly_model.fit(x_train, y_train) 111 | score_poly_trained = poly_model.score(x_test, y_test) 112 | print "Poly model scored:", score_poly_trained 113 | 114 | poly_model = Pipeline([('poly', PolynomialFeatures(interaction_only=True, degree=2)), 115 | ('linear', linear_model.LinearRegression(fit_intercept=False))]) 116 | poly_model = poly_model.fit(x_train, y_train) 117 | score_poly_trained = poly_model.score(x_test, y_test) 118 | print "Poly model (interaction only) scored:", score_poly_trained 119 | -------------------------------------------------------------------------------- /deep_learning/snippets/sift_cats_vs_dogs.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | import os 4 | import sys 5 | import time 6 | 7 | import numpy as np 8 | from sklearn.model_selection import train_test_split 9 | from sklearn.neighbors import KNeighborsClassifier 10 | 11 | 12 | # returns descriptor of image at pth 13 | def feature_extract(pth): 14 | im = cv2.imread(pth, 1) 15 | gray = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY) 16 | return bowDiction.compute(gray, sift.detect(gray)) 17 | 18 | 19 | def time_diff_str(t1, t2): 20 | """ 21 | Calculates time durations. 
22 | """ 23 | diff = t2 - t1 24 | mins = int(diff / 60) 25 | secs = round(diff % 60, 2) 26 | return str(mins) + " mins and " + str(secs) + " seconds" 27 | 28 | 29 | if __name__ == "__main__": 30 | # Load opencv libraries 31 | sys.path.append('/usr/local/lib/python2.7/site-packages') 32 | import cv2 33 | from imutils import paths 34 | 35 | t_start = time.time() 36 | 37 | # construct the argument parse and parse the arguments 38 | ap = argparse.ArgumentParser() 39 | ap.add_argument("-d", "--dataset", required=True, help="path to input dataset") 40 | ap.add_argument("-k", "--neighbors", type=int, default=1, help="# of nearest neighbors for classification") 41 | ap.add_argument("-j", "--jobs", type=int, default=-1, 42 | help="# of jobs for k-NN distance (-1 uses all available cores)") 43 | args = vars(ap.parse_args()) 44 | 45 | # grab the list of images that we'll be describing 46 | print("[INFO] describing images...") 47 | imagePaths = list(paths.list_images(args["dataset"])) 48 | 49 | # initialize the raw pixel intensities matrix, the features matrix, 50 | # and labels list 51 | features = [] 52 | labels = [] 53 | 54 | dictionarySize = 5 55 | BOW = cv2.BOWKMeansTrainer(dictionarySize) 56 | sift = cv2.xfeatures2d.SIFT_create() 57 | 58 | for (i, imagePath) in enumerate(imagePaths): 59 | image = cv2.imread(imagePath) 60 | gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) 61 | kp, dsc = sift.detectAndCompute(gray, None) 62 | BOW.add(dsc) 63 | print("# kps: {}, descriptors: {}".format(len(kp), dsc.shape)) 64 | 65 | # dictionary created 66 | dictionary = BOW.cluster() 67 | index_params = dict(algorithm=0, trees=5) 68 | search_params = dict(checks=50) # or pass empty dictionary 69 | flann = cv2.FlannBasedMatcher(index_params, search_params) 70 | sift2 = cv2.xfeatures2d.SIFT_create() 71 | bowDiction = cv2.BOWImgDescriptorExtractor(sift2, cv2.BFMatcher(cv2.NORM_L2)) 72 | bowDiction.setVocabulary(dictionary) 73 | print "BOW dictionary", np.shape(dictionary) 74 | 75 | # loop over the input images 76 | for (i, imagePath) in enumerate(imagePaths): 77 | # load the image and extract the class label (assuming that our 78 | # path as the format: /path/to/dataset/{class}.{image_num}.jpg 79 | label = imagePath.split(os.path.sep)[-1].split(".")[0] 80 | 81 | # update the raw images, features, and labels matricies, 82 | # respectively 83 | features.extend(feature_extract(imagePath)) 84 | labels.append(label) 85 | 86 | # show an update every 1,000 images 87 | if i > 0 and i % 1000 == 0: 88 | print("[INFO] processed {}/{}".format(i, len(imagePaths))) 89 | 90 | # show some information on the memory consumed by the features matrix 91 | features = np.array(features) 92 | labels = np.array(labels) 93 | print("[INFO] features matrix: {:.2f}MB".format(features.nbytes / (1024 * 1000.0))) 94 | 95 | # partition the data into training and testing splits, using 75% 96 | # of the data for training and the remaining 25% for testing 97 | (trainFeat, testFeat, trainLabels, testLabels) = train_test_split(features, labels, test_size=0.25, random_state=42) 98 | 99 | # train and evaluate a k-NN classifer on the histogram 100 | # representations 101 | print("[INFO] evaluating accuracy...") 102 | model = KNeighborsClassifier(n_neighbors=args["neighbors"], n_jobs=args["jobs"]) 103 | model.fit(trainFeat, trainLabels) 104 | acc = model.score(testFeat, testLabels) 105 | print("[INFO] accuracy: {:.2f}%".format(acc * 100)) 106 | 107 | print "-- %s * DONE After * %s" % (datetime.datetime.now(), time_diff_str(t_start, time.time())) 108 | 
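# Usage note: given the argparse flags defined above, a typical invocation from the
# deep_learning/snippets directory would look roughly like
#
#   python sift_cats_vs_dogs.py --dataset kaggle_dogs_vs_cats --neighbors 3 --jobs -1
#
# Assumptions: the dataset directory (listed in .gitignore as
# deep_learning/snippets/kaggle_dogs_vs_cats/) holds images named
# {class}.{image_num}.jpg, which is what the label-parsing line above expects,
# and OpenCV is a contrib build, since SIFT is provided by cv2.xfeatures2d.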
-------------------------------------------------------------------------------- /pyspark/notebooks/structured_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# X.join(Y)\n", 8 | "- Return RDD of all pairs of elements with matching keys in X and Y.\n", 9 | "- Each pair is (k, (v1, v2)) tuple, where (k, v1) is in X and (k, v2) is in Y." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "data": { 19 | "text/plain": [ 20 | "[('a', (1, 2)), ('a', (1, 3))]" 21 | ] 22 | }, 23 | "execution_count": 1, 24 | "metadata": {}, 25 | "output_type": "execute_result" 26 | } 27 | ], 28 | "source": [ 29 | "x = sc.parallelize([(\"a\", 1), (\"b\", 4)])\n", 30 | "y = sc.parallelize([(\"a\", 2), (\"a\", 3)])\n", 31 | "sorted(x.join(y).collect())" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": {}, 37 | "source": [ 38 | "# X.leftOuterJoin(Y)\n", 39 | "- For each element (k, v) in X, resulting RDD will either contain\n", 40 | " - All pairs (k, (v, w)) for w in Y.\n", 41 | " - Or the pair (k, (v, None)) if no elements in Y have key k." 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 2, 47 | "metadata": {}, 48 | "outputs": [ 49 | { 50 | "data": { 51 | "text/plain": [ 52 | "[('a', (1, 2)), ('b', (4, None))]" 53 | ] 54 | }, 55 | "execution_count": 2, 56 | "metadata": {}, 57 | "output_type": "execute_result" 58 | } 59 | ], 60 | "source": [ 61 | "x = sc.parallelize([(\"a\", 1), (\"b\", 4)])\n", 62 | "y = sc.parallelize([(\"a\", 2)])\n", 63 | "sorted(x.leftOuterJoin(y).collect())" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "# X.rightOuterJoin(Y)\n", 71 | "- For each element (k, w) in Y, resulting RDD will either contain\n", 72 | " - All pairs (k, (v, w)) for v in X.\n", 73 | " - Or the pair (k, (None, w)) if no elements in X have key k." 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 3, 79 | "metadata": {}, 80 | "outputs": [ 81 | { 82 | "data": { 83 | "text/plain": [ 84 | "[('a', (1, 2)), ('b', (None, 4))]" 85 | ] 86 | }, 87 | "execution_count": 3, 88 | "metadata": {}, 89 | "output_type": "execute_result" 90 | } 91 | ], 92 | "source": [ 93 | "x = sc.parallelize([(\"a\", 1)])\n", 94 | "y = sc.parallelize([(\"a\", 2), (\"b\", 4)])\n", 95 | "sorted(x.rightOuterJoin(y).collect())" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "# X.fullOuterJoin(Y)\n", 103 | "- For each element (k, v) in X, resulting RDD will either contain\n", 104 | " - All pairs (k, (v, w)) for w in Y.\n", 105 | " - Or the pair (k, (v, None)) if no elements in Y have key k.\n", 106 | "- For each element (k, w) in Y, resulting RDD will either contain\n", 107 | " - All pairs (k, (v, w)) for v in X.\n", 108 | " - Or the pair (k, (None, w)) if no elements in X have key k." 
109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 4, 114 | "metadata": {}, 115 | "outputs": [ 116 | { 117 | "data": { 118 | "text/plain": [ 119 | "[('a', (1, 2)), ('b', (4, None)), ('c', (None, 8))]" 120 | ] 121 | }, 122 | "execution_count": 4, 123 | "metadata": {}, 124 | "output_type": "execute_result" 125 | } 126 | ], 127 | "source": [ 128 | "x = sc.parallelize([(\"a\", 1), (\"b\", 4)])\n", 129 | "y = sc.parallelize([(\"a\", 2), (\"c\", 8)])\n", 130 | "sorted(x.fullOuterJoin(y).collect())" 131 | ] 132 | } 133 | ], 134 | "metadata": { 135 | "kernelspec": { 136 | "display_name": "Python 2", 137 | "language": "python", 138 | "name": "python2" 139 | }, 140 | "language_info": { 141 | "codemirror_mode": { 142 | "name": "ipython", 143 | "version": 2 144 | }, 145 | "file_extension": ".py", 146 | "mimetype": "text/x-python", 147 | "name": "python", 148 | "nbconvert_exporter": "python", 149 | "pygments_lexer": "ipython2", 150 | "version": "2.7.10" 151 | }, 152 | "name": "06_Structured_Data", 153 | "notebookId": 3373040177660362 154 | }, 155 | "nbformat": 4, 156 | "nbformat_minor": 1 157 | } 158 | -------------------------------------------------------------------------------- /python/visualizing_data.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | 4 | 5 | def my_line_chart(plt): 6 | years = ["1985", "1986", "1987", "1988", "1989", "1990", "1991", "1992", "1993", "1994", "1995", "1996", "1997", 7 | "1998", "1999", "2000", "2001", "2002", "2003", "2004", "2005", "2006", "2007", "2008", "2009", "2010", 8 | "2011", "2012", "2013", "2014", "2015"] 9 | gdp = [14094688429, 26336617862, 36658108169, 25423812494, 6293304847, 6471740486, 10 | 9613369553, 9866990096, 13180954014, 16286434094, 20736163915, 24657470331, 11 | 26843701136, 27209601995, 28683658004, 33640085727, 35291349277, 37947904054, 12 | 42717072777, 49424107709, 57633255739, 66371664817, 77414425532, 99130304099, 13 | 106014600963, 115931749904, 135539487317, 155820001920, 171222025117, 186204652922, 14 | 193599379094] 15 | 16 | # create a line chart, years on x-axis, gdp on y-axis 17 | plt.plot(years, gdp, color='#f39c12', marker='o', linestyle='solid') 18 | 19 | # add a title 20 | plt.title("Vietnam GDP") 21 | 22 | # add a label to the y-axis 23 | plt.ylabel("Billions of $") 24 | plt.show() 25 | 26 | 27 | def my_bar_chart(plt): 28 | color_names = ["Emerald", "Green Sea", "Midnight Blue", "Carrot", "Peter River"] 29 | colors = ["#2ecc71", "#16a085", "#2c3e50", "#e67e22", "#3498db"] 30 | num_favorite = [5, 11, 3, 8, 10] 31 | 32 | # bars are by default width 0.8, so we'll add 0.1 to the left coordinates 33 | # so that each bar is centered 34 | xs = [i + 0.1 for i, _ in enumerate(color_names)] 35 | 36 | # plot bars with left x-coordinates [xs], heights [num_favorite] 37 | plt.bar(xs, num_favorite, color=colors) 38 | plt.title("My Favorite Colors") 39 | 40 | # label x-axis with color names at bar centers 41 | plt.xticks([i + 0.5 for i, _ in enumerate(color_names)], color_names) 42 | 43 | plt.show() 44 | 45 | 46 | def my_histogram(plt): 47 | data = [] 48 | for i in range(100): 49 | data.append(np.random.randint(1, 11)) 50 | 51 | plt.hist(data, bins=10, facecolor='#bdc3c7') 52 | 53 | plt.xlabel("Points") 54 | plt.ylabel("# of Students") 55 | plt.title("Results of the exam") 56 | plt.show() 57 | 58 | 59 | def my_multi_line_charts(plt): 60 | bears = [10, 58, 85, 115, 139, 182] 61 | dolphins = [150, 75, 32, 14, 8, 5] 62 | 
whales = [80, 50, 100, 75, 90, 70] 63 | x = [0, 1, 2, 3, 4, 5] 64 | years = ["2009", "2010", "2011", "2012", "2013", "2014"] 65 | 66 | # we can make multiple calls to plt.plot 67 | # to show multiple series on the same chart 68 | plt.plot(x, bears, '#16a085', marker='o', linewidth=3.0, label='Bears') 69 | plt.plot(x, dolphins, '#c0392b', marker='s', linewidth=3.0, label='Dolphins') 70 | plt.plot(x, whales, '#3498db', marker='^', linewidth=3.0, label='Whales') 71 | 72 | # because we've assigned labels to each series 73 | # we can get a legend for free 74 | # loc=9 means "top center" 75 | plt.legend(loc=9) 76 | plt.title("Number of animals each year") 77 | plt.xlabel("Years") 78 | plt.xticks(x, years) 79 | plt.show() 80 | 81 | 82 | def my_scatter_plot(plt): 83 | sizes = [700, 650, 720, 630, 710, 640, 600, 640, 670] 84 | prices = [175, 170, 205, 120, 220, 130, 105, 145, 190] 85 | labels = ["$175", "$170", "$205", "$120", "$220", "$130", "$105", "$145", "$190"] 86 | 87 | plt.scatter(sizes, prices, marker='s', s=40, color='#2ecc71') 88 | 89 | # label each point 90 | for label, friend_count, minute_count in zip(labels, sizes, prices): 91 | plt.annotate(label, 92 | xy=(friend_count, minute_count), # put the label with its point 93 | xytext=(5, -5), # but slightly offset 94 | textcoords='offset points') 95 | 96 | plt.title("House prices") 97 | plt.xlabel("Size in m2") 98 | plt.ylabel("Thousand $") 99 | plt.show() 100 | 101 | 102 | def my_pie_chart(plt): 103 | data = [0.5, 0.26, 0.11, 0.04, 0.02, 0.02, 0.01, 0.04] 104 | smart_phone = ["Apple", "Samsung", "LG", "Motorola", "HTC", "Nokia", "Amazon", "Other"] 105 | colors = ["#ecf0f1", "#3498db", "#e67e22", "#1abc9c", "#bdc3c7", "#8e44ad", "#f39c12", "#2c3e50"] 106 | 107 | plt.pie(data, labels=smart_phone, colors=colors, autopct='%1.1f%%', 108 | startangle=-90, pctdistance=0.9, labeldistance=1.2) 109 | 110 | # make sure pie is a circle and not an oval 111 | plt.axis("equal") 112 | plt.show() 113 | 114 | 115 | if __name__ == "__main__": 116 | my_line_chart(plt) 117 | 118 | my_bar_chart(plt) 119 | 120 | my_histogram(plt) 121 | 122 | my_multi_line_charts(plt) 123 | 124 | my_scatter_plot(plt) 125 | 126 | my_pie_chart(plt) 127 | -------------------------------------------------------------------------------- /pyspark/study_apache_spark/scala/scala_dataframe.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "http://10.8.2.1:8089/proxy/application_1515394405830_3970\n" 13 | ] 14 | } 15 | ], 16 | "source": [ 17 | "import org.apache.spark.sql.SparkSession\n", 18 | "\n", 19 | "val spark = SparkSession.builder().\n", 20 | " appName(\"scala_dataframe\").\n", 21 | " config(\"spark.executor.instances\",\"2\").\n", 22 | " config(\"spark.executor.cores\",\"2\").\n", 23 | " config(\"spark.executor.memory\", \"4g\").\n", 24 | " config(\"spark.yarn.executor.memoryOverhead\", \"1g\").\n", 25 | " getOrCreate()\n", 26 | "\n", 27 | "println(\"http://10.8.2.1:8089/proxy/\"+ spark.sparkContext.applicationId)" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 2, 33 | "metadata": {}, 34 | "outputs": [ 35 | { 36 | "data": { 37 | "text/plain": [ 38 | "Array([Alice,1])" 39 | ] 40 | }, 41 | "execution_count": 2, 42 | "metadata": {}, 43 | "output_type": "execute_result" 44 | } 45 | ], 46 | "source": [ 47 | "val l = Seq((\"Alice\", 
1))\n", 48 | "spark.createDataFrame(l).collect()" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 3, 54 | "metadata": {}, 55 | "outputs": [ 56 | { 57 | "name": "stdout", 58 | "output_type": "stream", 59 | "text": [ 60 | "[Stage 1:=======================================> (2 + 1) / 3]+------+-----+\n", 61 | "|number| word|\n", 62 | "+------+-----+\n", 63 | "| 8| bat|\n", 64 | "| 64|mouse|\n", 65 | "| -27|horse|\n", 66 | "+------+-----+\n", 67 | "\n" 68 | ] 69 | } 70 | ], 71 | "source": [ 72 | "// For implicit conversions from RDDs to DataFrames\n", 73 | "import org.apache.spark.sql.Row\n", 74 | "import org.apache.spark.sql.types._\n", 75 | "\n", 76 | "val someData = Seq(\n", 77 | " Row(8, \"bat\"),\n", 78 | " Row(64, \"mouse\"),\n", 79 | " Row(-27, \"horse\")\n", 80 | ")\n", 81 | "\n", 82 | "val someSchema = List(\n", 83 | " StructField(\"number\", IntegerType, true),\n", 84 | " StructField(\"word\", StringType, true)\n", 85 | ")\n", 86 | "\n", 87 | "val someDF = spark.createDataFrame(\n", 88 | " spark.sparkContext.parallelize(someData),\n", 89 | " StructType(someSchema)\n", 90 | ")\n", 91 | "someDF.show()" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 4, 97 | "metadata": { 98 | "scrolled": true 99 | }, 100 | "outputs": [ 101 | { 102 | "data": { 103 | "text/plain": [ 104 | "Array([4])" 105 | ] 106 | }, 107 | "execution_count": 4, 108 | "metadata": {}, 109 | "output_type": "execute_result" 110 | } 111 | ], 112 | "source": [ 113 | "spark.udf.register(\"stringLength\", (s: String) => s.length())\n", 114 | "spark.sql(\"SELECT stringLength('test')\").collect()" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 10, 120 | "metadata": {}, 121 | "outputs": [ 122 | { 123 | "name": "stdout", 124 | "output_type": "stream", 125 | "text": [ 126 | "+------+---+\n", 127 | "| name|age|\n", 128 | "+------+---+\n", 129 | "| Max| 33|\n", 130 | "| Adam| 32|\n", 131 | "|Muller| 62|\n", 132 | "+------+---+\n", 133 | "\n" 134 | ] 135 | } 136 | ], 137 | "source": [ 138 | "import spark.sqlContext.implicits._\n", 139 | "case class Person(name: String, age: Int)\n", 140 | "\n", 141 | "val personDS = Seq(Person(\"Max\", 33), Person(\"Adam\", 32), Person(\"Muller\", 62)).toDS()\n", 142 | "personDS.show()" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 3, 148 | "metadata": {}, 149 | "outputs": [ 150 | { 151 | "name": "stdout", 152 | "output_type": "stream", 153 | "text": [ 154 | "+----------+\n", 155 | "|sum_of_age|\n", 156 | "+----------+\n", 157 | "| 127|\n", 158 | "+----------+\n", 159 | "\n" 160 | ] 161 | } 162 | ], 163 | "source": [ 164 | "import org.apache.spark.sql.functions._\n", 165 | "personDS.groupBy().agg(sum(\"age\").as(\"sum_of_age\")).show()" 166 | ] 167 | } 168 | ], 169 | "metadata": { 170 | "kernelspec": { 171 | "display_name": "Apache Toree - Scala", 172 | "language": "scala", 173 | "name": "apache_toree_scala" 174 | }, 175 | "language_info": { 176 | "file_extension": ".scala", 177 | "name": "scala", 178 | "version": "2.11.8" 179 | } 180 | }, 181 | "nbformat": 4, 182 | "nbformat_minor": 2 183 | } 184 | -------------------------------------------------------------------------------- /python/recommender/song_recommender.py: -------------------------------------------------------------------------------- 1 | """ 2 | MATRIX FACTORIZATION & DIMENSIONALITY REDUCTION 3 | Case study: Recommending Products 4 | Models: 5 | Collaborative filtering 6 | Matrix factorization 7 | PCA 8 | Algorithms: 9 | Coordinate 
descent 10 | Eigen decomposition 11 | SVD 12 | Concepts: 13 | Matrix completion, eigenvalues, random projections, cold-start problem, diversity, scaling up 14 | """ 15 | import os 16 | from math import sqrt 17 | 18 | import numpy as np 19 | import pandas as pd 20 | from scipy.sparse.linalg import svds 21 | from sklearn.metrics import mean_squared_error 22 | from sklearn.metrics.pairwise import pairwise_distances 23 | from sklearn.model_selection import train_test_split 24 | 25 | 26 | def load_music_data(file_name): 27 | """Get reviews data, from local csv.""" 28 | if os.path.exists(file_name): 29 | print("-- " + file_name + " found locally") 30 | df = pd.read_csv(file_name) 31 | 32 | return df 33 | 34 | 35 | def values_to_map_index(values): 36 | map_index = {} 37 | idx = 0 38 | for val in values: 39 | map_index[val] = idx 40 | idx += 1 41 | 42 | return map_index 43 | 44 | 45 | def print_most_popular_songs(song): 46 | # Take a look at the words in the vocabulary 47 | vocab = vectorizer.get_feature_names() 48 | print "Words in vocabulary:", vocab 49 | 50 | # Sum up the counts of each vocabulary word 51 | dist = np.sum(song, axis=0) 52 | 53 | # For each, print the vocabulary word and the number of times it 54 | # appears in the training set 55 | print "Words frequency..." 56 | for tag, count in zip(vocab, dist): 57 | print count, tag 58 | 59 | 60 | def predict(ratings, similarity, type='user'): 61 | if type == 'user': 62 | mean_user_rating = ratings.mean(axis=1) 63 | # You use np.newaxis so that mean_user_rating has same format as ratings 64 | ratings_diff = (ratings - mean_user_rating[:, np.newaxis]) 65 | pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array( 66 | [np.abs(similarity).sum(axis=1)]).T 67 | elif type == 'item': 68 | pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)]) 69 | return pred 70 | 71 | 72 | def rmse(prediction, ground_truth): 73 | prediction = prediction[ground_truth.nonzero()].flatten() 74 | ground_truth = ground_truth[ground_truth.nonzero()].flatten() 75 | return sqrt(mean_squared_error(prediction, ground_truth)) 76 | 77 | 78 | if __name__ == "__main__": 79 | 80 | # Load music data 81 | song_data = load_music_data("song_data.csv") 82 | 83 | # Reduce complexity by getting first n elements 84 | n = 10000 85 | song_data = song_data.head(n) 86 | user_idx = values_to_map_index(song_data.user_id.unique()) 87 | song_idx = values_to_map_index(song_data.song_id.unique()) 88 | 89 | print "-- Explore data" 90 | print song_data.head() 91 | 92 | print "-- Showing the most popular songs in the dataset" 93 | unique, counts = np.unique(song_data["song"], return_counts=True) 94 | popular_songs = dict(zip(unique, counts)) 95 | df_popular_songs = pd.DataFrame(popular_songs.items(), columns=["Song", "Count"]) 96 | df_popular_songs = df_popular_songs.sort_values(by=["Count"], ascending=False) 97 | print df_popular_songs.head() 98 | 99 | n_users = song_data.user_id.unique().shape[0] 100 | n_items = song_data.song_id.unique().shape[0] 101 | print "Number of users = " + str(n_users) + " | Number of songs = " + str(n_items) 102 | 103 | train_data, test_data = train_test_split(song_data, test_size=0.25) 104 | train_data_matrix = np.zeros((n_users, n_items)) 105 | for line in train_data.itertuples(): 106 | train_data_matrix[user_idx[line[1]], song_idx[line[2]]] = line[3] 107 | 108 | test_data_matrix = np.zeros((n_users, n_items)) 109 | for line in test_data.itertuples(): 110 | test_data_matrix[user_idx[line[1]], song_idx[line[2]]] = line[3] 111 
| 112 | user_similarity = pairwise_distances(train_data_matrix, metric='cosine') 113 | item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine') 114 | 115 | item_prediction = predict(train_data_matrix, item_similarity, type='item') 116 | user_prediction = predict(train_data_matrix, user_similarity, type='user') 117 | 118 | print 'User-based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix)) 119 | print 'Item-based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix)) 120 | 121 | sparsity = round(1.0 - len(song_data) / float(n_users * n_items), 3) 122 | print 'The sparsity level is ' + str(sparsity * 100) + '%' 123 | 124 | # get SVD components from train matrix. Choose k. 125 | u, s, vt = svds(train_data_matrix, k=20) 126 | s_diag_matrix = np.diag(s) 127 | X_pred = np.dot(np.dot(u, s_diag_matrix), vt) 128 | print 'User-based CF MSE: ' + str(rmse(X_pred, test_data_matrix)) 129 | -------------------------------------------------------------------------------- /python/data/terminal.md: -------------------------------------------------------------------------------- 1 | # Retrieving a Software Package 2 | ``` 3 | %> wget http://framework.zend.com/releases/ZendFramework-1.10.3/ZendFramework-1.10.3-minimal.tar.gz 4 | ``` 5 | 6 | # Monitoring Server Processes 7 | ``` 8 | %> top 9 | ... 10 | ID COMMAND %CPU TIME #TH #WQ #PORT MEM PURG CMPRS PGRP PPID STATE BOOSTS %CPU_ME %CPU_OTHRS UID FAULTS 11 | 1701 top 2.7 00:00.42 1/1 0 20 2872K+ 0B 0B 1701 604 running *0[1] 0.00000 0.00000 0 3547+ 12 | 1675 com.apple.We 0.8 00:02.27 12 2 188- 41M- 6816K 0B 1675 1 sleeping *0[1061] 0.00000 0.00000 501 22651+ 13 | 1651 fsnotifier 0.0 00:00.02 3 1 30 292K 0B 404K 1644 1644 sleeping *0[1] 0.00000 0.00000 501 1144 14 | 1650 syncdefaults 0.0 00:00.56 4 2 124 8652K 0B 880K 1650 1 sleeping 0[2] 0.00000 0.00000 501 6658 15 | 1649 CVMCompiler 0.0 00:00.47 2 2 25 11M 0B 1572K 1649 1 sleeping *0[1] 0.00000 0.00000 501 4136 16 | 1645 ocspd 0.0 00:00.03 2 1 32 708K 0B 616K 1645 1 sleeping *0[1] 0.00000 0.00000 0 1502 17 | 1644 pycharm 3.9 02:01.61 52 2 344 509M 6372K 29M 1644 1 sleeping *0[50] 0.00000 0.00000 501 239705 18 | 1601 mdworker 0.0 00:00.12 3 1 56 116K 0B 4308K 1601 1 sleeping *0[1] 0.00000 0.00000 501 4105 19 | 20 | %> ps aux 21 | ... 22 | USER PID %CPU %MEM VSZ RSS TT STAT STARTED TIME COMMAND 23 | _windowserver 173 4.1 0.7 4877596 60536 ?? Ss 10:04AM 14:08.75 /System/Library/PrivateFrameworks/SkyLight.framework/Resources/WindowServer -daemon 24 | hongong 1644 2.2 8.2 7245168 688464 ?? S 12:16PM 7:39.61 /Applications/PyCharm CE.app/Contents/MacOS/pycharm 25 | hongong 589 1.2 0.5 2661812 44496 ?? S 10:09AM 0:43.55 /Applications/Utilities/Terminal.app/Contents/MacOS/Terminal 26 | hongong 701 0.8 0.5 2976980 41488 ?? 
S 10:19AM 4:25.12 /Applications/Sublime Text.app/Contents/MacOS/Sublime Text 27 | 28 | kill - Kill a process $ kill -15 24601 29 | pkill - -f Kill matching processes $ pkill -15 -f spring 30 | ``` 31 | 32 | # Reviewing Log Files 33 | ``` 34 | %> tail /var/log/apache/error.log 35 | %> tail -n 100 /var/log/apache/error.log | more 36 | %> tail -f /var/log/apache/error.log 37 | %> cat /var/log/apache/error.log 38 | %> less /var/log/apache/error.log 39 | ``` 40 | 41 | # Copying Files with scp 42 | ``` 43 | %> scp id_rsa.pub webuser@192.168.1.1:/home/webuser/.ssh/id_rsa.pub 44 | ``` 45 | 46 | # Backing Up Your Web Directory 47 | ``` 48 | # backup 49 | %> tar cpzf archive.backup.042710.tgz /var/mywebsite 50 | # restore 51 | %> tar xvpfz archive.backup.042710.tgz -C /var/www/ 52 | ``` 53 | 54 | # Viewing Your Command History 55 | ``` 56 | %> history 57 | ... 58 | 12 sudo ./configure && make 59 | 13 find . | grep config.log 60 | 14 less ./config.log 61 | 15 mongod 62 | 16 asadmin start-domain --debug 63 | ``` 64 | 65 | # Creating Directory Trees 66 | ``` 67 | %> mkdir -p webapp/application/controllers 68 | ``` 69 | 70 | # Creating Command Aliases 71 | You can add them to an account configuration file such as .bashrc. 72 | ``` 73 | %> alias dir='ls -al' 74 | %> dir 75 | ... 76 | drwxr-xr-x@ 6 hongong staff 204 Oct 20 12:00 . 77 | drwxr-xr-x@ 13 hongong staff 442 Sep 5 21:51 .. 78 | -rw-r--r--@ 1 hongong staff 6148 Sep 26 15:29 .DS_Store 79 | drwxr-xr-x 3 hongong staff 102 Sep 5 19:01 server 80 | -rw-r--r-- 1 hongong staff 158814 Oct 20 12:20 server.log 81 | ``` 82 | 83 | # Editing the line 84 | ``` 85 | echo Print string to screen 86 | man Display manual page for command 87 | ⌃C Get out of trouble 88 | ⌃A Move to beginning of line 89 | ⌃E Move to end of line 90 | ⌃U Delete to beginning of line 91 | ⌃K Delete to ending of line 92 | ⌃W Delete word before cursor 93 | ``` 94 | 95 | # Manipulating files 96 | ``` 97 | > Redirect output to filename 98 | >> Append output to filename 99 | diff Diff files 1 & 2 100 | ``` 101 | 102 | # Wordcount and pipes 103 | ``` 104 | wc server.log 105 | 1131 10946 167679 server.log 106 | 1131 lines, 10946 words, 167679 bytes 107 | 108 | head server.log | wc 109 | 10 104 1438 110 | ``` 111 | 112 | # Less is more 113 | ``` 114 | up & down arrow keys Move up or down one line 115 | spacebar Move forward one page 116 | ⌃F Move forward one page 117 | ⌃B Move back one page 118 | G Move to end of file 119 | 1G Move to beginning of file 120 | / Search file for string 121 | n Move to next search result 122 | N Move to previous search result 123 | q Quit less 124 | -N View line number 125 | ``` 126 | 127 | # Grepping 128 | ``` 129 | grep Find string in file 130 | grep -i Find case-insensitively 131 | 132 | ``` -------------------------------------------------------------------------------- /deep_learning/basics.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | 4 | import matplotlib.image as mpimg 5 | import matplotlib.pyplot as plt 6 | import numpy as np 7 | import tensorflow as tf 8 | from pydicom import dicomio 9 | 10 | 11 | def convolution_with_filter(img_4d, filter): 12 | convolved = tf.nn.conv2d(img_4d, filter, strides=[1, 1, 1, 1], padding='SAME') 13 | res = convolved.eval() 14 | 15 | plt.imshow(np.squeeze(res), cmap='gray') 16 | plt.imshow(res[0, :, :, 0], cmap='gray') 17 | plt.show() 18 | 19 | 20 | if __name__ == "__main__": 21 | 22 | ############################## 23 | # Basic read and show images # 24 | 
############################## 25 | img = mpimg.imread("imgs/dogs.jpg") 26 | print "Image data", img.shape 27 | print img 28 | 29 | print "Show image" 30 | plt.style.use("ggplot") 31 | plt.imshow(img) 32 | plt.colorbar() 33 | plt.show() 34 | 35 | print "Show RGB channels" 36 | plt.imshow(img[:, :, 0], cmap="gray") 37 | plt.show() 38 | plt.imshow(img[:, :, 1], cmap="gray") 39 | plt.show() 40 | plt.imshow(img[:, :, 2], cmap="gray") 41 | plt.show() 42 | 43 | ############################ 44 | # Mean/Deviation of Images # 45 | ############################ 46 | root_dir = "sample_images/00cba091fa4ad62cc3200a657aeb957e/" 47 | os.chdir(root_dir) 48 | images = [] 49 | for f in glob.glob("*.dcm"): 50 | ds = dicomio.read_file(f) 51 | img = ds.pixel_array 52 | images.append(img) 53 | 54 | # convert to array 55 | data = np.array(images) 56 | print "Total images:", len(images) 57 | print "Image dimensions:", images[0].shape 58 | print "Combine dimensions:", data.shape 59 | 60 | plt.style.use("ggplot") 61 | print "Calculating mean images" 62 | mean_img = np.mean(data, axis=0) 63 | plt.imshow(mean_img.astype(np.uint8)) 64 | plt.show() 65 | 66 | print "Calculating deviation images" 67 | std_img = np.std(data, axis=0) 68 | plt.imshow(std_img.astype(np.uint8)) 69 | plt.show() 70 | 71 | ############# 72 | # Histogram # 73 | ############# 74 | # convert to flattened array 75 | flattened = data.ravel() 76 | print "First image:", data[:1] 77 | print "First 10 values:", flattened[:10] 78 | 79 | print "Histogram" 80 | plt.hist(flattened, 255) 81 | plt.show() 82 | 83 | print "Histogram Equalization" 84 | plt.hist(mean_img.ravel(), 255) 85 | plt.show() 86 | 87 | print "Normalizing our data" 88 | bins = 20 89 | fig, axs = plt.subplots(1, 3, figsize=(12, 6), sharey=True, sharex=True) 90 | axs[0].hist(data[0].ravel(), bins) 91 | axs[0].set_title("img distribution") 92 | axs[1].hist(mean_img.ravel(), bins) 93 | axs[1].set_title("mean distribution") 94 | axs[2].hist((data[0] - mean_img).ravel(), bins) 95 | axs[2].set_title("(img - mean) distribution") 96 | plt.show() 97 | 98 | #################### 99 | # Tensorflow basic # 100 | #################### 101 | print "Tensors" 102 | x = tf.linspace(-3.0, 3.0, 100) 103 | print x 104 | 105 | print "Graphs and Operations" 106 | g = tf.get_default_graph() 107 | print [op.name for op in g.get_operations()] 108 | 109 | print "Tensor" 110 | print g.get_tensor_by_name('LinSpace' + ':0') 111 | 112 | # Create Session 113 | sess = tf.Session() 114 | 115 | # Tell session to compute 116 | print "Session computes" 117 | computed_x = sess.run(x) 118 | print(computed_x) 119 | 120 | # Evaluate itself using this session 121 | print "Variable evaluates" 122 | computed_x = x.eval(session=sess) 123 | print(computed_x) 124 | 125 | print "Tensor shapes" 126 | print(x.get_shape()) 127 | # convert to list format 128 | print(x.get_shape().as_list()) 129 | 130 | # Close the session 131 | sess.close() 132 | 133 | # explicitly tell the session which graph we want to manage 134 | sess = tf.Session(graph=g) 135 | sess.close() 136 | 137 | # created a new graph 138 | g2 = tf.Graph() 139 | 140 | # interactive with Tensorflow 141 | sess = tf.InteractiveSession() 142 | print x.eval() 143 | 144 | ############### 145 | # Convolution # 146 | ############### 147 | mean = 0.0 148 | sigma = 1.0 149 | 150 | z = (tf.exp(tf.neg(tf.pow(x - mean, 2.0) / 151 | (2.0 * tf.pow(sigma, 2.0)))) * 152 | (1.0 / (sigma * tf.sqrt(2.0 * 3.1415)))) 153 | 154 | res = z.eval() 155 | plt.style.use("ggplot") 156 | plt.plot(res) 157 | 
plt.show() 158 | 159 | # store the number of values in our Gaussian curve. 160 | ksize = z.get_shape().as_list()[0] 161 | 162 | # multiply the two to get a 2d gaussian 163 | z_2d = tf.matmul(tf.reshape(z, [ksize, 1]), tf.reshape(z, [1, ksize])) 164 | 165 | # Execute the graph 166 | plt.imshow(z_2d.eval()) 167 | plt.colorbar() 168 | plt.show() 169 | 170 | # use tensorflow to reshape matrix 171 | img = mean_img.astype(np.float32) 172 | img_4d = tf.reshape(img, [1, img.shape[0], img.shape[1], 1]) 173 | print("Tensorflow image shape:", img_4d.get_shape().as_list()) 174 | 175 | # Reshape with 4d format: H x W x I x O 176 | z_4d = tf.reshape(z_2d, [ksize, ksize, 1, 1]) 177 | print("Tensorflow kernel shape:", z_4d.get_shape().as_list()) 178 | 179 | convolution_with_filter(img_4d, z_4d) 180 | 181 | # apply sharpen filter 182 | sharpen_filter = np.zeros([3, 3, 1, 1]) 183 | sharpen_filter[1, 1, :, :] = 5 184 | sharpen_filter[0, 1, :, :] = -1 185 | sharpen_filter[1, 0, :, :] = -1 186 | sharpen_filter[2, 1, :, :] = -1 187 | sharpen_filter[1, 2, :, :] = -1 188 | 189 | convolution_with_filter(img_4d, sharpen_filter) 190 | 191 | # apply top sobel filter 192 | top_sobel_filter = np.zeros([3, 3, 1, 1]) 193 | top_sobel_filter[0, 0, :, :] = 1 194 | top_sobel_filter[0, 1, :, :] = 2 195 | top_sobel_filter[0, 2, :, :] = 1 196 | top_sobel_filter[2, 0, :, :] = -1 197 | top_sobel_filter[2, 1, :, :] = -2 198 | top_sobel_filter[2, 2, :, :] = -1 199 | 200 | convolution_with_filter(img_4d, top_sobel_filter) 201 | 202 | 203 | -------------------------------------------------------------------------------- /python/getting_data.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import json 3 | import re 4 | from collections import Counter 5 | 6 | import matplotlib.pyplot as plt 7 | import requests 8 | from bs4 import BeautifulSoup 9 | from dateutil.parser import parse 10 | from twython import Twython 11 | 12 | 13 | def print_data(ma_ck, kl, gia, delta): 14 | print ma_ck, "#", kl, "#", gia, "#", delta 15 | 16 | 17 | #### 18 | # 19 | # Oreilly 20 | # 21 | #### 22 | 23 | def is_video(td): 24 | """it's a video if it has exactly one pricelabel, and if 25 | the stripped text inside that pricelabel starts with 'Video'""" 26 | price_labels = td('span', 'pricelabel') 27 | return (len(price_labels) == 1 and 28 | price_labels[0].text.strip().startswith("Video")) 29 | 30 | 31 | def book_info(td): 32 | """given a BeautifulSoup Tag representing a book, 33 | extract the book's details and return a dict""" 34 | 35 | title = td.find("div", "thumbheader").a.text 36 | by_author = td.find('div', 'AuthorName').text 37 | authors = [x.strip() for x in re.sub("^By ", "", by_author).split(",")] 38 | isbn_link = td.find("div", "thumbheader").a.get("href") 39 | isbn = re.match("/product/(.*)\.do", isbn_link).groups()[0] 40 | date = td.find("span", "directorydate").text.strip() 41 | 42 | return { 43 | "title": title, 44 | "authors": authors, 45 | "isbn": isbn, 46 | "date": date 47 | } 48 | 49 | 50 | def scrape(num_pages=10): 51 | base_url = "http://shop.oreilly.com/category/browse-subjects/data.do?sortby=publicationDate&page=" 52 | 53 | books = [] 54 | 55 | for page_num in range(1, num_pages + 1): 56 | print "souping page", page_num 57 | url = base_url + str(page_num) 58 | soup = BeautifulSoup(requests.get(url).text, 'lxml') 59 | 60 | for td in soup('td', 'thumbtext'): 61 | if not is_video(td): 62 | books.append(book_info(td)) 63 | 64 | return books 65 | 66 | 67 | def get_year(book): 68 | 
"""book["date"] looks like 'November 2014' so we need to 69 | split on the space and then take the second piece""" 70 | return int(book["date"].split()[1]) 71 | 72 | 73 | def plot_years(plt, books): 74 | # 2014 is the last complete year of data (when I ran this) 75 | year_counts = Counter(get_year(book) for book in books 76 | if get_year(book) <= 2016) 77 | 78 | years = sorted(year_counts) 79 | book_counts = [year_counts[year] for year in years] 80 | plt.bar([x - 0.5 for x in years], book_counts) 81 | plt.xlabel("year") 82 | plt.ylabel("# of data books") 83 | plt.title("Data is Big!") 84 | plt.show() 85 | 86 | 87 | #### 88 | # 89 | # Twitter 90 | # 91 | #### 92 | 93 | # fill these in if you want to use the code 94 | CONSUMER_KEY = "JeuEwD5RJiBbxiw9jTMBYBEmU" 95 | CONSUMER_SECRET = "xRcmv8AMnSSMwq875HiP1SKFfGw51M97BvVH341yckPY3iilCu" 96 | ACCESS_TOKEN = "47319754-NL1AIh9PBomIVsJe5HXB9vjE5y1rjwZFYUQx0odzo" 97 | ACCESS_TOKEN_SECRET = "kcq7ER8UZSykDomPn9lYdh5DAafndvp73PzSfykTq0Kp7" 98 | 99 | 100 | def call_twitter_search_api(): 101 | twitter = Twython(CONSUMER_KEY, CONSUMER_SECRET) 102 | 103 | # search for tweets containing the phrase "data science" 104 | for status in twitter.search(q='"data science"')["statuses"]: 105 | user = status["user"]["screen_name"].encode('utf-8') 106 | text = status["text"].encode('utf-8') 107 | print user, ":", text 108 | print 109 | 110 | 111 | if __name__ == "__main__": 112 | print "# Data from: http://s.cafef.vn/du-lieu.chn" 113 | print "## TAB delimited stock prices" 114 | 115 | with open('data/tab_delimited_stock_prices.tsv', 'rb') as f: 116 | reader = csv.reader(f, delimiter='\t') 117 | for row in reader: 118 | ma_ck = row[0] 119 | kl = row[1] 120 | gia = float(row[2]) 121 | delta = row[3] 122 | print_data(ma_ck, kl, gia, delta) 123 | 124 | print 125 | 126 | print "## COLON delimited stock prices" 127 | with open('data/colon_delimited_stock_prices.csv', 'rb') as f: 128 | reader = csv.DictReader(f, delimiter=':') 129 | for row in reader: 130 | ma_ck = row["MA_CK"] 131 | kl = row["KL"] 132 | gia = float(row["GIA"]) 133 | delta = row["DELTA"] 134 | print_data(ma_ck, kl, gia, delta) 135 | 136 | print 137 | 138 | print "## WRITING out comma_delimited_stock_prices.csv" 139 | today_prices = {'VCF': 152.4, 'VAF': 13.3, 'ATA': 0.8} 140 | with open('data/comma_delimited_stock_prices.csv', 'wb') as f: 141 | writer = csv.writer(f, delimiter=',') 142 | for stock, price in today_prices.items(): 143 | writer.writerow([stock, price]) 144 | 145 | print "## BeautifulSoup" 146 | html = requests.get("https://www.google.com").text 147 | soup = BeautifulSoup(html, "lxml") 148 | print soup 149 | print 150 | 151 | print "## PARSING json" 152 | # parse the JSON to create a Python object 153 | with open("data/colors.json") as json_data: 154 | document = json.load(json_data) 155 | print "Getting blue value:", document["blue"] 156 | 157 | print 158 | 159 | print "## GitHub API" 160 | endpoint = "https://api.github.com/users/ongxuanhong/repos" 161 | repos = json.loads(requests.get(endpoint).text) 162 | 163 | dates = [parse(repo["created_at"]) for repo in repos] 164 | month_counts = Counter(date.month for date in dates) 165 | weekday_counts = Counter(date.weekday() for date in dates) 166 | 167 | print "dates", [d.strftime("%d/%m/%y") for d in dates] 168 | print "month_counts", month_counts 169 | print "weekday_count", weekday_counts 170 | 171 | last_5_repositories = sorted(repos, 172 | key=lambda r: r["created_at"], 173 | reverse=True)[:5] 174 | 175 | print "last five repos", 
[repo["name"] 176 | for repo in last_5_repositories] 177 | print 178 | 179 | print "## Oreilly books" 180 | books = scrape() 181 | plot_years(plt, books) 182 | print 183 | 184 | print "## Twitter search" 185 | call_twitter_search_api() 186 | -------------------------------------------------------------------------------- /pyspark/notebooks/Intro_DataFrame.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"markdown","source":["# Creating DataFrames with Python"],"metadata":{}},{"cell_type":"code","source":["# import pyspark class Row from module sql\nfrom pyspark.sql import *\n\n# Create Example Data - Departments and Employees\n\n# Create the Departments\ndepartment1 = Row(id='123456', name='Computer Science')\ndepartment2 = Row(id='789012', name='Mechanical Engineering')\ndepartment3 = Row(id='345678', name='Theater and Drama')\ndepartment4 = Row(id='901234', name='Indoor Recreation')\n\n# Create the Employees\nEmployee = Row(\"firstName\", \"lastName\", \"email\", \"salary\")\nemployee1 = Employee('michael', 'armbrust', 'no-reply@berkeley.edu', 100000)\nemployee2 = Employee('xiangrui', 'meng', 'no-reply@stanford.edu', 120000)\nemployee3 = Employee('matei', None, 'no-reply@waterloo.edu', 140000)\nemployee4 = Employee(None, 'wendell', 'no-reply@berkeley.edu', 160000)\n\n# Create the DepartmentWithEmployees instances from Departments and Employees\ndepartmentWithEmployees1 = Row(department=department1, employees=[employee1, employee2])\ndepartmentWithEmployees2 = Row(department=department2, employees=[employee3, employee4])\ndepartmentWithEmployees3 = Row(department=department3, employees=[employee1, employee4])\ndepartmentWithEmployees4 = Row(department=department4, employees=[employee2, employee3])\n\nprint department1\nprint employee2\nprint departmentWithEmployees1.employees[0].email"],"metadata":{},"outputs":[],"execution_count":2},{"cell_type":"code","source":["# Create the first DataFrame from a list of the rows.\ndepartmentsWithEmployeesSeq1 = [departmentWithEmployees1, departmentWithEmployees2]\ndf1 = sqlContext.createDataFrame(departmentsWithEmployeesSeq1)\n\ndisplay(df1)"],"metadata":{},"outputs":[],"execution_count":3},{"cell_type":"code","source":["# Create a second DataFrame from a list of rows.\ndepartmentsWithEmployeesSeq2 = [departmentWithEmployees3, departmentWithEmployees4]\ndf2 = sqlContext.createDataFrame(departmentsWithEmployeesSeq2)\n\ndisplay(df2)"],"metadata":{},"outputs":[],"execution_count":4},{"cell_type":"markdown","source":["# Working with DataFrames"],"metadata":{}},{"cell_type":"code","source":["# Union 2 DataFrames.\nunionDF = df1.unionAll(df2)\ndisplay(unionDF)"],"metadata":{},"outputs":[],"execution_count":6},{"cell_type":"code","source":["# Write the Unioned DataFrame to a Parquet file.\n# Remove the file if it exists\ndbutils.fs.rm(\"/tmp/df-example.parquet\", True)\nunionDF.write.parquet(\"/tmp/df-example.parquet\")"],"metadata":{},"outputs":[],"execution_count":7},{"cell_type":"code","source":["# Read a DataFrame from the Parquet file.\nparquetDF = sqlContext.read.parquet(\"/tmp/df-example.parquet\")\ndisplay(parquetDF)"],"metadata":{},"outputs":[],"execution_count":8},{"cell_type":"code","source":["# Explode the employees column.\nfrom pyspark.sql import Row\nfrom pyspark.sql import functions as F\neDF = sqlContext.createDataFrame([Row(a=1, intlist=[1,2,3], mapfield={\"a\": \"b\"})])\n\nprint 
eDF.select(F.explode(eDF.intlist).alias(\"anInt\")).collect()\neDF.select(F.explode(eDF.mapfield).alias(\"key\", \"value\")).show()"],"metadata":{},"outputs":[],"execution_count":9},{"cell_type":"code","source":["from pyspark.sql.functions import explode\ndf = parquetDF.select(explode(\"employees\").alias(\"e\"))\nexplodeDF = df.selectExpr(\"e.firstName\", \"e.lastName\", \"e.email\", \"e.salary\")\ndisplay(explodeDF)"],"metadata":{},"outputs":[],"execution_count":10},{"cell_type":"code","source":["# Use filter() to return only the rows that match the given predicate.\nfilterDF = explodeDF.filter(explodeDF.firstName == \"xiangrui\").sort(explodeDF.lastName)\ndisplay(filterDF)"],"metadata":{},"outputs":[],"execution_count":11},{"cell_type":"code","source":["from pyspark.sql.functions import col, asc\n# use | instead of or\nfilterDF = explodeDF.filter((col(\"firstName\") == \"xiangrui\") | (col(\"firstName\") == \"michael\"))\ndisplay(filterDF)"],"metadata":{},"outputs":[],"execution_count":12},{"cell_type":"code","source":["# The where() clause is equivalent to filter().\nwhereDF = explodeDF.where((col(\"firstName\") == \"xiangrui\") | (col(\"firstName\") == \"michael\")).sort(asc(\"lastName\"))\ndisplay(whereDF)"],"metadata":{},"outputs":[],"execution_count":13},{"cell_type":"code","source":["# Replace null values with -- using DataFrame Na functions.\nnonNullDF = explodeDF.fillna(\"--\")\ndisplay(nonNullDF)"],"metadata":{},"outputs":[],"execution_count":14},{"cell_type":"code","source":["# Retrieve only rows with missing firstName or lastName.\nfilterNullDF = explodeDF.filter((col(\"firstName\").isNull()) | (col(\"lastName\").isNull())).sort(\"email\")\ndisplay(filterNullDF)"],"metadata":{},"outputs":[],"execution_count":15},{"cell_type":"code","source":["# Example aggregations using agg() and countDistinct().\nfrom pyspark.sql.functions import countDistinct\ncountDistinctDF = explodeDF.select(\"firstName\", \"lastName\").groupBy(\"firstName\", \"lastName\").agg(countDistinct(\"firstName\"))\ndisplay(countDistinctDF)"],"metadata":{},"outputs":[],"execution_count":16},{"cell_type":"code","source":["# Compare the DataFrame and SQL Query Physical Plans (Hint: They should be the same.)\ncountDistinctDF.explain()"],"metadata":{},"outputs":[],"execution_count":17},{"cell_type":"code","source":["explodeDF.registerTempTable(\"table_example\")\ncountDistinctDF_sql = sqlContext.sql(\"SELECT firstName, lastName, count(distinct firstName) as distinct_first_names FROM table_example GROUP BY firstName, lastName\")\ncountDistinctDF_sql.explain()"],"metadata":{},"outputs":[],"execution_count":18},{"cell_type":"code","source":["# Sum up all the salaries\nsalarySumDF = explodeDF.agg({\"salary\": \"sum\"})\ndisplay(salarySumDF)"],"metadata":{},"outputs":[],"execution_count":19},{"cell_type":"code","source":["# Print the summary statistics for the salaries.\nexplodeDF.describe(\"salary\").show()"],"metadata":{},"outputs":[],"execution_count":20},{"cell_type":"code","source":["display(explodeDF.select(\"salary\"))"],"metadata":{},"outputs":[],"execution_count":21},{"cell_type":"code","source":["# An example using Pandas & Matplotlib Integration\nimport pandas as pd\nimport matplotlib.pyplot as plt\nplt.clf()\npdDF = nonNullDF.toPandas()\npdDF.plot(x=\"firstName\", y=\"salary\", kind=\"bar\", rot=45)\ndisplay()"],"metadata":{},"outputs":[],"execution_count":22},{"cell_type":"code","source":["# Cleanup: Remove the parquet file.\ndbutils.fs.rm(\"/tmp/df-example.parquet\", 
True)"],"metadata":{},"outputs":[],"execution_count":23}],"metadata":{"name":"Intro_DataFrame","notebookId":651954930651402},"nbformat":4,"nbformat_minor":0} 2 | -------------------------------------------------------------------------------- /python/data/markdown_examples.md: -------------------------------------------------------------------------------- 1 | # Headers 2 | ``` 3 | # H1 4 | ## H2 5 | ### H3 6 | #### H4 7 | ##### H5 8 | ###### H6 9 | 10 | Alternatively, for H1 and H2, an underline-ish style: 11 | 12 | Alt-H1 13 | ====== 14 | 15 | Alt-H2 16 | ------ 17 | ``` 18 | 19 | # Emphasis 20 | ``` 21 | Emphasis, aka italics, with *asterisks* or _underscores_. 22 | 23 | Strong emphasis, aka bold, with **asterisks** or __underscores__. 24 | 25 | Combined emphasis with **asterisks and _underscores_**. 26 | 27 | Strikethrough uses two tildes. ~~Scratch this.~~ 28 | ``` 29 | Emphasis, aka italics, with *asterisks* or _underscores_. 30 | 31 | Strong emphasis, aka bold, with **asterisks** or __underscores__. 32 | 33 | Combined emphasis with **asterisks and _underscores_**. 34 | 35 | Strikethrough uses two tildes. ~~Scratch this.~~ 36 | 37 | # Lists 38 | ``` 39 | 1. First ordered list item 40 | 2. Another item 41 | * Unordered sub-list. 42 | 1. Actual numbers don't matter, just that it's a number 43 | 1. Ordered sub-list 44 | 4. And another item. 45 | 46 | Some text that should be aligned with the above item. 47 | 48 | * Unordered list can use asterisks 49 | - Or minuses 50 | + Or pluses 51 | ``` 52 | 1. First ordered list item 53 | 2. Another item 54 | * Unordered sub-list. 55 | 1. Actual numbers don't matter, just that it's a number 56 | 1. Ordered sub-list 57 | 4. And another item. 58 | 59 | Some text that should be aligned with the above item. 60 | 61 | * Unordered list can use asterisks 62 | - Or minuses 63 | + Or pluses 64 | 65 | # Links 66 | There are two ways to create links. 67 | ``` 68 | [I'm an inline-style link](https://www.google.com) 69 | 70 | [I'm a reference-style link][Arbitrary case-insensitive reference text] 71 | 72 | [You can use numbers for reference-style link definitions][1] 73 | 74 | Or leave it empty and use the [link text itself] 75 | 76 | URLs and URLs in angle brackets will automatically get turned into links. 77 | http://www.example.com or and sometimes 78 | example.com (but not on Github, for example). 79 | 80 | Some text to show that the reference links can follow later. 81 | 82 | [arbitrary case-insensitive reference text]: https://www.mozilla.org 83 | [1]: http://slashdot.org 84 | [link text itself]: http://www.reddit.com 85 | ``` 86 | [I'm an inline-style link](https://www.google.com) 87 | 88 | [I'm a reference-style link][Arbitrary case-insensitive reference text] 89 | 90 | [You can use numbers for reference-style link definitions][1] 91 | 92 | Or leave it empty and use the [link text itself] 93 | 94 | URLs and URLs in angle brackets will automatically get turned into links. 95 | http://www.example.com or and sometimes 96 | example.com (but not on Github, for example). 97 | 98 | Some text to show that the reference links can follow later. 
99 | 100 | [arbitrary case-insensitive reference text]: https://www.mozilla.org 101 | [1]: http://slashdot.org 102 | [link text itself]: http://www.reddit.com 103 | 104 | # Images 105 | ``` 106 | Here's our logo (hover to see the title text): 107 | 108 | Inline-style: 109 | ![alt text](https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png "Logo Title Text 1") 110 | 111 | Reference-style: 112 | ![alt text][logo] 113 | 114 | [logo]: https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png "Logo Title Text 2" 115 | ``` 116 | Here's our logo (hover to see the title text): 117 | 118 | Inline-style: 119 | ![alt text](https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png "Logo Title Text 1") 120 | 121 | Reference-style: 122 | ![alt text][logo] 123 | 124 | [logo]: https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png "Logo Title Text 2" 125 | 126 | # Code and Syntax Highlighting 127 |
128 | Inline `code` has `back-ticks around` it.
129 | 
130 | 131 | Blocks of code are either fenced by lines with three back-ticks ```, or are indented with four spaces. I recommend only using the fenced code blocks -- they're easier and only they support syntax highlighting. 132 |
133 | ```javascript
134 | var s = "JavaScript syntax highlighting";
135 | alert(s);
136 | ```
137 |  
138 | ```python
139 | s = "Python syntax highlighting"
140 | print s
141 | ```
142 |  
143 | ```
144 | No language indicated, so no syntax highlighting. 
145 | But let's throw in a <b>tag</b>.
146 | ```
147 | 
148 | 149 | # Tables 150 | ``` 151 | Colons can be used to align columns. 152 | 153 | | Tables | Are | Cool | 154 | | ------------- |:-------------:| -----:| 155 | | col 3 is | right-aligned | $1600 | 156 | | col 2 is | centered | $12 | 157 | | zebra stripes | are neat | $1 | 158 | 159 | The outer pipes (|) are optional, and you don't need to make the raw Markdown line up prettily. You can also use inline Markdown. 160 | 161 | Markdown | Less | Pretty 162 | --- | --- | --- 163 | *Still* | `renders` | **nicely** 164 | 1 | 2 | 3 165 | ``` 166 | Colons can be used to align columns. 167 | 168 | | Tables | Are | Cool | 169 | | ------------- |:-------------:| -----:| 170 | | col 3 is | right-aligned | $1600 | 171 | | col 2 is | centered | $12 | 172 | | zebra stripes | are neat | $1 | 173 | 174 | The outer pipes (|) are optional, and you don't need to make the raw Markdown line up prettily. You can also use inline Markdown. 175 | 176 | Markdown | Less | Pretty 177 | --- | --- | --- 178 | *Still* | `renders` | **nicely** 179 | 1 | 2 | 3 180 | 181 | # Blockquotes 182 | ``` 183 | > Blockquotes are very handy in email to emulate reply text. 184 | > This line is part of the same quote. 185 | 186 | Quote break. 187 | 188 | > This is a very long line that will still be quoted properly when it wraps. Oh boy let's keep writing to make sure this is long enough to actually wrap for everyone. Oh, you can *put* **Markdown** into a blockquote. 189 | ``` 190 | > Blockquotes are very handy in email to emulate reply text. 191 | > This line is part of the same quote. 192 | 193 | Quote break. 194 | 195 | > This is a very long line that will still be quoted properly when it wraps. Oh boy let's keep writing to make sure this is long enough to actually wrap for everyone. Oh, you can *put* **Markdown** into a blockquote. 196 | 197 | # Inline HTML 198 | ``` 199 |
<dl>
200 |   <dt>Definition list</dt>
201 |   <dd>Is something people use sometimes.</dd>
202 | 
203 |   <dt>Markdown in HTML</dt>
204 |   <dd>Does *not* work **very** well. Use HTML <em>tags</em>.</dd>
205 | </dl>
206 | ``` 207 | 208 | # Horizontal Rule 209 | ``` 210 | Three or more... 211 | 212 | --- 213 | 214 | Hyphens 215 | 216 | *** 217 | 218 | Asterisks 219 | 220 | ___ 221 | 222 | Underscores 223 | ``` 224 | 225 | # Visualization 226 | Get Vietnam GDP data at 227 | http://data.worldbank.org/country/vietnam -------------------------------------------------------------------------------- /deep_learning/snippets/knn_cats_vs_dogs.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import sys 4 | from operator import itemgetter 5 | 6 | import numpy as np 7 | from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis 8 | from sklearn.ensemble import AdaBoostClassifier 9 | from sklearn.ensemble import RandomForestClassifier 10 | from sklearn.gaussian_process import GaussianProcessClassifier 11 | from sklearn.gaussian_process.kernels import RBF 12 | from sklearn.model_selection import train_test_split 13 | from sklearn.naive_bayes import GaussianNB 14 | from sklearn.neighbors import KNeighborsClassifier 15 | from sklearn.neural_network import MLPClassifier 16 | from sklearn.svm import SVC 17 | from sklearn.tree import DecisionTreeClassifier 18 | 19 | 20 | def image_to_feature_vector(image, size=(32, 32)): 21 | # resize the image to a fixed size, then flatten the image into 22 | # a list of raw pixel intensities 23 | return cv2.resize(image, size).flatten() 24 | 25 | 26 | def extract_color_histogram(image, bins=(8, 8, 8)): 27 | # extract a 3D color histogram from the HSV color space using 28 | # the supplied number of `bins` per channel 29 | hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV) 30 | hist = cv2.calcHist([hsv], [0, 1, 2], None, bins, [0, 180, 0, 256, 0, 256]) 31 | 32 | # handle normalizing the histogram if we are using OpenCV 2.4.X 33 | if imutils.is_cv2(): 34 | hist = cv2.normalize(hist) 35 | 36 | # otherwise, perform "in place" normalization in OpenCV 3 (I 37 | # personally hate the way this is done 38 | else: 39 | cv2.normalize(hist, hist) 40 | 41 | # return the flattened histogram as the feature vector 42 | return hist.flatten() 43 | 44 | 45 | if __name__ == "__main__": 46 | # Load opencv libraries 47 | sys.path.append('/usr/local/lib/python2.7/site-packages') 48 | import cv2 49 | import imutils 50 | from imutils import paths 51 | 52 | # construct the argument parse and parse the arguments 53 | ap = argparse.ArgumentParser() 54 | ap.add_argument("-d", "--dataset", required=True, help="path to input dataset") 55 | ap.add_argument("-k", "--neighbors", type=int, default=1, help="# of nearest neighbors for classification") 56 | ap.add_argument("-j", "--jobs", type=int, default=-1, 57 | help="# of jobs for k-NN distance (-1 uses all available cores)") 58 | args = vars(ap.parse_args()) 59 | 60 | # grab the list of images that we'll be describing 61 | print("[INFO] describing images...") 62 | imagePaths = list(paths.list_images(args["dataset"])) 63 | 64 | # initialize the raw pixel intensities matrix, the features matrix, 65 | # and labels list 66 | rawImages = [] 67 | features = [] 68 | labels = [] 69 | 70 | # loop over the input images 71 | for (i, imagePath) in enumerate(imagePaths): 72 | # load the image and extract the class label (assuming that our 73 | # path as the format: /path/to/dataset/{class}.{image_num}.jpg 74 | image = cv2.imread(imagePath) 75 | label = imagePath.split(os.path.sep)[-1].split(".")[0] 76 | 77 | # extract raw pixel intensity "features", followed by a color 78 | # histogram to characterize 
the color distribution of the pixels 79 | # in the image 80 | pixels = image_to_feature_vector(image) 81 | hist = extract_color_histogram(image) 82 | 83 | # update the raw images, features, and labels matricies, 84 | # respectively 85 | rawImages.append(pixels) 86 | features.append(hist) 87 | labels.append(label) 88 | 89 | # show an update every 1,000 images 90 | if i > 0 and i % 1000 == 0: 91 | print("[INFO] processed {}/{}".format(i, len(imagePaths))) 92 | 93 | # show some information on the memory consumed by the raw images 94 | # matrix and features matrix 95 | rawImages = np.array(rawImages) 96 | features = np.array(features) 97 | labels = np.array(labels) 98 | print("[INFO] pixels matrix: {:.2f}MB".format(rawImages.nbytes / (1024 * 1000.0))) 99 | print("[INFO] features matrix: {:.2f}MB".format(features.nbytes / (1024 * 1000.0))) 100 | 101 | # partition the data into training and testing splits, using 75% 102 | # of the data for training and the remaining 25% for testing 103 | # (trainRI, testRI, trainRL, testRL) = train_test_split(rawImages, labels, test_size=0.25, random_state=42) 104 | (trainFeat, testFeat, trainLabels, testLabels) = train_test_split(features, labels, test_size=0.25, random_state=42) 105 | 106 | # # train and evaluate a k-NN classifer on the raw pixel intensities 107 | # print("[INFO] evaluating raw pixel accuracy...") 108 | # model = KNeighborsClassifier(n_neighbors=args["neighbors"], n_jobs=args["jobs"]) 109 | # model.fit(trainRI, trainRL) 110 | # acc = model.score(testRI, testRL) 111 | # print("[INFO] raw pixel accuracy: {:.2f}%".format(acc * 100)) 112 | # 113 | # # train and evaluate a k-NN classifer on the histogram 114 | # # representations 115 | # print("[INFO] evaluating histogram accuracy...") 116 | # model = KNeighborsClassifier(n_neighbors=args["neighbors"], n_jobs=args["jobs"]) 117 | # model.fit(trainFeat, trainLabels) 118 | # acc = model.score(testFeat, testLabels) 119 | # print("[INFO] histogram accuracy: {:.2f}%".format(acc * 100)) 120 | 121 | print "---------------------------" 122 | print "Training" 123 | print "---------------------------" 124 | 125 | names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Gaussian Process", 126 | "Decision Tree", "Random Forest", "Neural Net", "AdaBoost", 127 | "Naive Bayes", "QDA"] 128 | 129 | classifiers = [ 130 | KNeighborsClassifier(3, n_jobs=args["jobs"]), 131 | SVC(kernel="linear", C=0.025), 132 | SVC(gamma=2, C=1), 133 | GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True, n_jobs=args["jobs"]), 134 | DecisionTreeClassifier(max_depth=5), 135 | RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1, n_jobs=args["jobs"]), 136 | MLPClassifier(alpha=1), 137 | AdaBoostClassifier(), 138 | GaussianNB(), 139 | QuadraticDiscriminantAnalysis()] 140 | 141 | # iterate over classifiers 142 | results = {} 143 | 144 | for name, clf in zip(names, classifiers): 145 | print "Training " + name + " classifier..." 
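# each classifier is fit on the 75% histogram-feature split and scored on the held-out 25% split below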
146 | clf.fit(trainFeat, trainLabels) 147 | score = clf.score(testFeat, testLabels) 148 | results[name] = score 149 | 150 | print "---------------------------" 151 | print "Evaluation results" 152 | print "---------------------------" 153 | 154 | # sorting results and print out 155 | sorted(results.items(), key=itemgetter(1)) 156 | for name in results: 157 | print name + " accuracy: %0.3f" % results[name] 158 | -------------------------------------------------------------------------------- /deep_learning/src/prototype.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import csv 3 | import datetime 4 | import os 5 | import sys 6 | import time 7 | from operator import itemgetter 8 | 9 | import numpy as np 10 | import pandas as pd 11 | from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis 12 | from sklearn.ensemble import AdaBoostClassifier 13 | from sklearn.ensemble import RandomForestClassifier 14 | from sklearn.gaussian_process import GaussianProcessClassifier 15 | from sklearn.gaussian_process.kernels import RBF 16 | from sklearn.model_selection import train_test_split 17 | from sklearn.naive_bayes import GaussianNB 18 | from sklearn.neighbors import KNeighborsClassifier 19 | from sklearn.neural_network import MLPClassifier 20 | from sklearn.svm import SVC 21 | from sklearn.tree import DecisionTreeClassifier 22 | 23 | sys.path.append("/usr/local/lib/python2.7/site-packages") 24 | import cv2 25 | from imutils import paths 26 | import matplotlib 27 | 28 | matplotlib.use("TkAgg") 29 | import matplotlib.pyplot as plt 30 | 31 | 32 | def time_diff_str(t1, t2): 33 | """ 34 | Calculates time durations. 35 | """ 36 | diff = t2 - t1 37 | mins = int(diff / 60) 38 | secs = round(diff % 60, 2) 39 | return str(mins) + " mins and " + str(secs) + " seconds" 40 | 41 | 42 | def load_csv(file_path): 43 | """Get data, from local csv.""" 44 | if os.path.exists(file_path): 45 | print "[INFO] load", file_path, "file..." 
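# note: df is only assigned inside this existence check, so a missing file would leave it unbound at the return below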
46 | df = pd.read_csv(file_path) 47 | 48 | return df 49 | 50 | 51 | def image_to_feature_vector(image, size=(32, 32)): 52 | # resize the image to a fixed size, then flatten the image into 53 | # a list of raw pixel intensities 54 | return cv2.resize(image, size).flatten() 55 | 56 | 57 | def get_simple_feature_labels(patient_df, img_paths): 58 | features = [] 59 | labels = [] 60 | 61 | patient_ids = patient_df["id"].tolist() 62 | 63 | # loop over the input images 64 | for (i, img_path) in enumerate(img_paths): 65 | # get only training labels 66 | base = os.path.basename(img_path) 67 | patient_id = os.path.splitext(base)[0] 68 | if patient_id in patient_ids: 69 | label = patient_df[patient_df["id"] == patient_id].iloc[0]["cancer"] 70 | labels.append(label) 71 | else: 72 | continue 73 | 74 | # load the image 75 | image = cv2.imread(img_path) 76 | feat = image_to_feature_vector(image) 77 | 78 | # update features 79 | features.append(feat) 80 | 81 | # show an update every 100 images 82 | if i > 0 and i % 100 == 0: 83 | print("[INFO] processed {}/{}".format(i, len(img_paths))) 84 | 85 | return features, labels 86 | 87 | 88 | if __name__ == "__main__": 89 | t_start = time.time() 90 | 91 | # construct the argument parse and parse the arguments 92 | ap = argparse.ArgumentParser() 93 | ap.add_argument("-d", "--dataset", required=True, help="path to input dataset") 94 | ap.add_argument("-o", "--out", required=True, help="path to output submission") 95 | ap.add_argument("-j", "--jobs", type=int, default=-1, help="# of jobs (-1 uses all available cores)") 96 | ap.add_argument("-s", "--save", help="path to save features") 97 | args = vars(ap.parse_args()) 98 | 99 | # load train/test labels 100 | stage1_labels = load_csv("../data/stage1_labels.csv") 101 | stage1_sample_submission = load_csv("../data/stage1_sample_submission.csv") 102 | 103 | img_paths = list(paths.list_images(args["dataset"])) 104 | train_features, train_labels = get_simple_feature_labels(stage1_labels, img_paths) 105 | test_features, test_labels = get_simple_feature_labels(stage1_sample_submission, img_paths) 106 | 107 | train_labels = np.array(train_labels) 108 | train_features = np.array(train_features) 109 | print "[INFO] labels vector shape:", train_labels.shape 110 | print "[INFO] features matrix shape:", train_features.shape 111 | print("[INFO] features matrix size: {:.2f}MB".format(train_features.nbytes / (1024 * 1000.0))) 112 | 113 | plt.imshow(train_features) 114 | plt.show() 115 | 116 | print "---------------------------" 117 | print "Training" 118 | print "---------------------------" 119 | 120 | classifiers = { 121 | "Nearest Neighbors": KNeighborsClassifier(3, n_jobs=args["jobs"]), 122 | "Linear SVM": SVC(kernel="linear", C=0.025), 123 | "RBF SVM": SVC(gamma=2, C=1), 124 | "Gaussian Process": GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True, n_jobs=args["jobs"]), 125 | "Decision Tree": DecisionTreeClassifier(max_depth=5), 126 | "Random Forest": RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1, n_jobs=args["jobs"]), 127 | "Neural Net": MLPClassifier(alpha=1), 128 | "AdaBoost": AdaBoostClassifier(), 129 | "Naive Bayes": GaussianNB(), 130 | "QDA": QuadraticDiscriminantAnalysis() 131 | } 132 | 133 | # train/dev split 134 | # X_train, X_dev, Y_train, Y_dev 135 | (for_train_features, dev_features, for_train_labels, dev_labels) = train_test_split(train_features, 136 | train_labels, 137 | test_size=0.25, 138 | random_state=42) 139 | 140 | # iterate over classifiers 141 | results = {} 142 | 143 | for name 
in classifiers: 144 | print "[INFO]" + name + " classifier..." 145 | clf = classifiers[name] 146 | clf.fit(for_train_features, for_train_labels) 147 | score = clf.score(dev_features, dev_labels) 148 | results[name] = score 149 | 150 | print "---------------------------" 151 | print "Evaluation results" 152 | print "---------------------------" 153 | 154 | # sorting results and print out 155 | sorted(results.items(), key=itemgetter(1)) 156 | for name in results: 157 | print "[INFO]", name, "accuracy: %0.3f" % results[name] 158 | 159 | print "---------------------------" 160 | print "Training for submission" 161 | print "---------------------------" 162 | 163 | name = list(results)[0] 164 | clf = classifiers[name] 165 | print "[INFO]" + name + " classifier..." 166 | clf.fit(train_features, train_labels) 167 | predict_submission = clf.predict(test_features) 168 | 169 | # update submission 170 | submission = {} 171 | patient_ids = stage1_sample_submission["id"].tolist() 172 | for (i, patient_id) in enumerate(patient_ids): 173 | submission[patient_id] = predict_submission[i] 174 | 175 | with open(args["out"], "wb") as f: 176 | writer = csv.writer(f, delimiter=',') 177 | writer.writerow(["id", "cancer"]) 178 | for key, value in submission.items(): 179 | writer.writerow([key, value]) 180 | 181 | print "[INFO]", datetime.datetime.now(), "* DONE After *", time_diff_str(t_start, time.time()) 182 | -------------------------------------------------------------------------------- /python/clustering/document_retrieval.py: -------------------------------------------------------------------------------- 1 | """ 2 | CLUSTERING & RETRIEVAL 3 | Case study: Finding documents 4 | Models: 5 | Nearest neighbors 6 | Clustering, mixtures of Gaussians 7 | Latent Dirichlet allocation (LDA) 8 | Algorithms: 9 | KD-trees, locality-sensitive hashing (LSH) 10 | K-means 11 | Expectation-maximization (EM) 12 | Concepts: 13 | Distance metrics, approximation algorithms, hashing, sampling algorithms, scaling up with map-reduce 14 | """ 15 | import datetime 16 | import math 17 | import os 18 | import time 19 | 20 | import pandas as pd 21 | from sklearn.decomposition import LatentDirichletAllocation 22 | from sklearn.decomposition import NMF 23 | from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer 24 | from sklearn.neighbors import NearestNeighbors 25 | 26 | 27 | def time_diff_str(t1, t2): 28 | """ 29 | Calculates time durations. 
30 | """ 31 | diff = t2 - t1 32 | mins = int(diff / 60) 33 | secs = round(diff % 60, 2) 34 | return str(mins) + " mins and " + str(secs) + " seconds" 35 | 36 | 37 | def load_wiki_data(file_name): 38 | """Get reviews data, from local csv.""" 39 | if os.path.exists(file_name): 40 | print("-- " + file_name + " found locally") 41 | df = pd.read_csv(file_name) 42 | 43 | return df 44 | 45 | 46 | def freq(word, doc): 47 | return doc.count(word) 48 | 49 | 50 | def word_count(doc): 51 | return len(doc) 52 | 53 | 54 | def tf(word, doc): 55 | return (freq(word, doc) / float(word_count(doc))) 56 | 57 | 58 | def num_docs_containing(word, list_of_docs): 59 | count = 0 60 | for document in list_of_docs: 61 | if freq(word, document) > 0: 62 | count += 1 63 | return 1 + count 64 | 65 | 66 | def idf(word, list_of_docs): 67 | return math.log(len(list_of_docs) / 68 | float(num_docs_containing(word, list_of_docs))) 69 | 70 | 71 | def tf_idf(word, doc, list_of_docs): 72 | return (tf(word, doc) * idf(word, list_of_docs)) 73 | 74 | 75 | def print_top_words(model, feature_names, n_top_words): 76 | for topic_idx, topic in enumerate(model.components_): 77 | print("Topic #%d:" % topic_idx) 78 | print(" ".join([feature_names[i] 79 | for i in topic.argsort()[:-n_top_words - 1:-1]])) 80 | print() 81 | 82 | 83 | if __name__ == "__main__": 84 | t_start = time.time() 85 | print "-- ----------------------------------------------------------------" 86 | print "-- %s - Start building document retrieval systems" % datetime.datetime.now() 87 | print "-- ----------------------------------------------------------------" 88 | 89 | n_samples = 2000 90 | n_features = 1000 91 | n_topics = 10 92 | n_top_words = 20 93 | 94 | # Load wiki data 95 | people = load_wiki_data("people_wiki.csv") 96 | print people.head() 97 | print len(people) 98 | 99 | # Explore 100 | obama = people[people["name"] == "Barack Obama"] 101 | obama_row_index = obama.index.tolist()[0] 102 | print "-- Obama:", obama 103 | 104 | taylor = people[people["name"] == "Taylor Swift"] 105 | taylor_row_index = taylor.index.tolist()[0] 106 | print "-- Taylor Swift:", taylor 107 | 108 | # Calculate term frequency 109 | txt_obama = obama["text"].tolist()[0] 110 | print "-- Obama term frequence" 111 | for word in txt_obama.split(): 112 | print word, tf(word, txt_obama) 113 | 114 | txt_taylor = taylor["text"].tolist()[0] 115 | print "-- Taylor Swift term frequence" 116 | for word in txt_taylor.split(): 117 | print word, tf(word, txt_taylor) 118 | 119 | # Calculate TF-IDF 120 | print "-- Obama TF-IDF" 121 | for word in txt_obama.split(): 122 | print word, tf_idf(word, txt_obama, people["text"]) 123 | 124 | print "-- Taylor Swift TF-IDF" 125 | for word in txt_taylor.split(): 126 | print word, tf_idf(word, txt_taylor, people["text"]) 127 | 128 | # TF-IDF 129 | count_vect = CountVectorizer() 130 | X_train_counts = count_vect.fit_transform(people["text"]) 131 | print "-- Term frequency matrix:", X_train_counts.shape 132 | 133 | tfidf_transformer = TfidfTransformer() 134 | X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts) 135 | tfidf_matrix = X_train_tfidf.toarray() 136 | print "-- TF-IDF matrix:", X_train_tfidf.shape 137 | 138 | # Build nearest matrix 139 | neigh = NearestNeighbors(n_neighbors=5) 140 | neigh.fit(X_train_tfidf) 141 | 142 | # Looking for some nearest 143 | (distance, found_index) = neigh.kneighbors([tfidf_matrix[obama_row_index]]) 144 | print "-- Who is closest to Obama?" 
145 | print people.iloc[found_index.tolist()[0]] 146 | 147 | (distance, found_index) = neigh.kneighbors([tfidf_matrix[taylor_row_index]]) 148 | print "-- Who is closest to Taylor Swift?" 149 | print people.iloc[found_index.tolist()[0]] 150 | 151 | ####### 152 | # NMF # 153 | ####### 154 | # Use tf-idf features for NMF. 155 | print("Extracting tf-idf features for NMF...") 156 | tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, 157 | max_features=n_features, 158 | stop_words='english') 159 | t0 = time.time() 160 | tfidf = tfidf_vectorizer.fit_transform(people["text"]) 161 | print("done in %0.3fs." % (time.time() - t0)) 162 | 163 | # Fit the NMF model 164 | print("Fitting the NMF model with tf-idf features, " 165 | "n_samples=%d and n_features=%d..." 166 | % (n_samples, n_features)) 167 | t0 = time.time() 168 | nmf = NMF(n_components=n_topics, random_state=1, 169 | alpha=.1, l1_ratio=.5).fit(tfidf) 170 | print("done in %0.3fs." % (time.time() - t0)) 171 | 172 | print("\nTopics in NMF model:") 173 | tfidf_feature_names = tfidf_vectorizer.get_feature_names() 174 | print_top_words(nmf, tfidf_feature_names, n_top_words) 175 | 176 | ####### 177 | # LDA # 178 | ####### 179 | # Use tf (raw term count) features for LDA. 180 | print("Extracting tf features for LDA...") 181 | tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, 182 | max_features=n_features, 183 | stop_words='english') 184 | t0 = time.time() 185 | tf = tf_vectorizer.fit_transform(people["text"]) 186 | print("done in %0.3fs." % (time.time() - t0)) 187 | 188 | print("Fitting LDA models with tf features, " 189 | "n_samples=%d and n_features=%d..." 190 | % (n_samples, n_features)) 191 | lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5, 192 | learning_method='online', 193 | learning_offset=50., 194 | random_state=0) 195 | t0 = time.time() 196 | lda.fit(tf) 197 | print("done in %0.3fs." 
% (time.time() - t0)) 198 | 199 | print("\nTopics in LDA model:") 200 | tf_feature_names = tf_vectorizer.get_feature_names() 201 | print_top_words(lda, tf_feature_names, n_top_words) 202 | 203 | print "-- %s * DONE After * %s" % (datetime.datetime.now(), time_diff_str(t_start, time.time())) 204 | -------------------------------------------------------------------------------- /word2vec/gensim_test.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "# -*- coding: utf-8 -*-\n", 12 | "# import libraries\n", 13 | "import os, logging\n", 14 | "from gensim.models import Word2Vec\n", 15 | "logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 2, 21 | "metadata": { 22 | "collapsed": true 23 | }, 24 | "outputs": [], 25 | "source": [ 26 | "# read sentences\n", 27 | "class MySentences(object):\n", 28 | " def __init__(self, dirname):\n", 29 | " self.dirname = dirname\n", 30 | " \n", 31 | " def __iter__(self):\n", 32 | " for fname in os.listdir(self.dirname):\n", 33 | " for line in open(os.path.join(self.dirname, fname)):\n", 34 | " yield line.split()\n", 35 | " \n", 36 | "sentences = MySentences(\"data\")" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 3, 42 | "metadata": {}, 43 | "outputs": [ 44 | { 45 | "name": "stderr", 46 | "output_type": "stream", 47 | "text": [ 48 | "2017-09-18 15:55:55,940 : INFO : collecting all words and their counts\n", 49 | "2017-09-18 15:55:55,941 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types\n", 50 | "2017-09-18 15:55:55,942 : INFO : collected 236 word types from a corpus of 441 raw words and 17 sentences\n", 51 | "2017-09-18 15:55:55,942 : INFO : Loading a fresh vocabulary\n", 52 | "2017-09-18 15:55:55,943 : INFO : min_count=5 retains 14 unique words (5% of original 236, drops 222)\n", 53 | "2017-09-18 15:55:55,944 : INFO : min_count=5 leaves 126 word corpus (28% of original 441, drops 315)\n", 54 | "2017-09-18 15:55:55,945 : INFO : deleting the raw counts dictionary of 236 items\n", 55 | "2017-09-18 15:55:55,945 : INFO : sample=0.001 downsamples 14 most-common words\n", 56 | "2017-09-18 15:55:55,946 : INFO : downsampling leaves estimated 16 word corpus (12.9% of prior 126)\n", 57 | "2017-09-18 15:55:55,947 : INFO : estimated required memory for 14 words and 100 dimensions: 18200 bytes\n", 58 | "2017-09-18 15:55:55,948 : INFO : resetting layer weights\n", 59 | "2017-09-18 15:55:55,949 : INFO : training model with 4 workers on 14 vocabulary and 100 features, using sg=0 hs=0 sample=0.001 negative=5 window=5\n", 60 | "2017-09-18 15:55:55,952 : INFO : worker thread finished; awaiting finish of 3 more threads\n", 61 | "2017-09-18 15:55:55,953 : INFO : worker thread finished; awaiting finish of 2 more threads\n", 62 | "2017-09-18 15:55:55,953 : INFO : worker thread finished; awaiting finish of 1 more threads\n", 63 | "2017-09-18 15:55:55,954 : INFO : worker thread finished; awaiting finish of 0 more threads\n", 64 | "2017-09-18 15:55:55,955 : INFO : training on 2205 raw words (71 effective words) took 0.0s, 14786 effective words/s\n", 65 | "2017-09-18 15:55:55,956 : WARNING : under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay\n", 66 | "2017-09-18 15:55:55,956 : INFO : saving 
Word2Vec object under models/first_model, separately None\n", 67 | "2017-09-18 15:55:55,957 : INFO : not storing attribute syn0norm\n", 68 | "2017-09-18 15:55:55,958 : INFO : not storing attribute cum_table\n", 69 | "2017-09-18 15:55:55,959 : INFO : saved models/first_model\n" 70 | ] 71 | } 72 | ], 73 | "source": [ 74 | "model = Word2Vec(sentences, size=100, window=5, min_count=5, workers=4)\n", 75 | "model.save(\"models/first_model\")" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 4, 81 | "metadata": {}, 82 | "outputs": [ 83 | { 84 | "name": "stderr", 85 | "output_type": "stream", 86 | "text": [ 87 | "2017-09-18 15:55:55,964 : INFO : loading Word2Vec object from models/first_model\n", 88 | "2017-09-18 15:55:55,965 : INFO : loading wv recursively from models/first_model.wv.* with mmap=None\n", 89 | "2017-09-18 15:55:55,966 : INFO : setting ignored attribute syn0norm to None\n", 90 | "2017-09-18 15:55:55,966 : INFO : setting ignored attribute cum_table to None\n", 91 | "2017-09-18 15:55:55,967 : INFO : loaded models/first_model\n" 92 | ] 93 | } 94 | ], 95 | "source": [ 96 | "new_model = Word2Vec.load(\"models/first_model\")" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 5, 102 | "metadata": {}, 103 | "outputs": [ 104 | { 105 | "name": "stdout", 106 | "output_type": "stream", 107 | "text": [ 108 | "T�nh\n" 109 | ] 110 | } 111 | ], 112 | "source": [ 113 | "u = \"T\\xc3\\xadnh\"\n", 114 | "uu = u.decode('utf8')\n", 115 | "s = uu.encode('cp1250')\n", 116 | "print(s)" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 6, 122 | "metadata": {}, 123 | "outputs": [ 124 | { 125 | "data": { 126 | "text/plain": [ 127 | "{',': ,\n", 128 | " '-': ,\n", 129 | " '.': ,\n", 130 | " 'bi\\xe1\\xbb\\x83n': ,\n", 131 | " 'ch\\xc3\\xacm': ,\n", 132 | " 'ch\\xe1\\xbb\\x9f': ,\n", 133 | " 'c\\xc3\\xa1c': ,\n", 134 | " 'h\\xc3\\xa0ng_h\\xc3\\xb3a': ,\n", 135 | " 'n\\xc3\\xa0y': ,\n", 136 | " 'ph\\xc3\\xa0': ,\n", 137 | " 'tr\\xc3\\xaan': ,\n", 138 | " 't\\xe1\\xba\\xa5n': ,\n", 139 | " '\\xc4\\x91\\xc6\\xb0\\xe1\\xbb\\xa3c': ,\n", 140 | " '\\xc4\\x91\\xe1\\xba\\xbfn': }" 141 | ] 142 | }, 143 | "execution_count": 6, 144 | "metadata": {}, 145 | "output_type": "execute_result" 146 | } 147 | ], 148 | "source": [ 149 | "new_model.wv.vocab" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": { 156 | "collapsed": true 157 | }, 158 | "outputs": [], 159 | "source": [] 160 | } 161 | ], 162 | "metadata": { 163 | "kernelspec": { 164 | "display_name": "Python 2", 165 | "language": "python", 166 | "name": "python2" 167 | }, 168 | "language_info": { 169 | "codemirror_mode": { 170 | "name": "ipython", 171 | "version": 2 172 | }, 173 | "file_extension": ".py", 174 | "mimetype": "text/x-python", 175 | "name": "python", 176 | "nbconvert_exporter": "python", 177 | "pygments_lexer": "ipython2", 178 | "version": "2.7.10" 179 | } 180 | }, 181 | "nbformat": 4, 182 | "nbformat_minor": 2 183 | } 184 | -------------------------------------------------------------------------------- /deep_learning/output/submission_results.csv: -------------------------------------------------------------------------------- 1 | id,cancer 2 | aec5a58fea38b77b964007aa6975c049,0 3 | 6d3b16f2e60c3a1a4246f340dba73676,0 4 | 7027c0b8c8f8dcc76c6e4ba923d60a2e,0 5 | d1a20ef45bb03f93a407b492066f6d88,0 6 | 83728b6eed98845556bfc870b7567883,0 7 | 8bb7dd5fbfa5ecb95552d9c587f2fea5,0 8 | e9a27e2645e1fad9434ce765f678585f,0 9 | 
ea01deecde93cd9503a049d71d46e6d5,0 10 | 03bd22ed5858039af223c04993e9eb22,0 11 | d42c998d037fb3003faba541e2cf649a,0 12 | ac4056071f3cc98489b9db3aebfe2b6a,0 13 | 995fc0581ed0e3ba0f97dbd7fe63db59,0 14 | d654966fd2498de023552b830c07a659,0 15 | 8be7a7cc747365030bee8297221ab5bc,0 16 | e6160ed0ff2eb214abd4df9a3c336c1d,0 17 | 1f6333bc3599f683403d6f0884aefe00,0 18 | 49c88f7cc77341c9ae4e64243f9912fc,0 19 | 174c5f7c33ca31443208ef873b9477e5,0 20 | 33387bea2cacf6127035cc7033036a02,0 21 | a5d7909f14d43f01f44cdcaabed27b84,0 22 | d1131708024b32032ade1ef48d115915,0 23 | b8793dbd40de88c0de0913abbaab0fe7,0 24 | ae2fdcd8daa3fede6ae23cc63a8d9a82,0 25 | 202898fa97c5949fbdc07ae7ff1cd9f0,0 26 | 665c1913d8e90e57af3b745349d19537,0 27 | 901ed0a38aa16933c04ffd531b0aa2cf,0 28 | cc4805e3ebe8621bc94a621b1714fc84,0 29 | eb9db3f740f8e153e85f83c57bc4e522,0 30 | 5ce91933688cc8400105bf640ac11535,0 31 | dbd9c8025907511e965e7abad955547d,0 32 | 61017c23bbae6e17062ff582d1a237b3,0 33 | cdb53f3be6d8cce07fa41c833488d8a5,0 34 | a6c15206edadab0270898f03e770d730,0 35 | c71d0db2086b7e2024ca9c11bd2ca504,0 36 | e60d99ea9648e1ce859eb0b386365e26,0 37 | fcfab3eddbdf0421c39f71d651cc5c56,0 38 | 505405b3e70fb24b92e6a8a5b7ed339c,0 39 | fad57a1078ddbc685e517bd8f24aa8ac,0 40 | 538543b57d0c8fa0b2b6bb7c84df3f33,0 41 | e0aa61b44c33e6a75940a8541c6894c9,0 42 | 21b73c938fd7d346ee77a60bd60aaeac,0 43 | 4b28f147cb82baba3edcdbd34ca19085,0 44 | 7cf1a65bb0f89323668034244a59e725,0 45 | 6e240f23afa2c1b4352cd0db5d4f357d,0 46 | 031b7ec4fe96a3b035a8196264a8c8c3,0 47 | 8fde44df03fb80366c6604db53d3623f,0 48 | d81852bffda09dc8033a45332397c495,0 49 | 6993396b31078993e13cf9c0a6fd470b,0 50 | aa59b7a4aa4dfb2489feea527eda3e4d,0 51 | b82efe72526c59a96257208d95e54baf,0 52 | ff8599dd7c1139be3bad5a0351ab749a,0 53 | fdcd385b0d2d12341661e1abe845be0b,0 54 | a2a4bc7708f6831470d757cd6f32bffe,0 55 | 59af702c21840ec18073b6b56c95e7fe,0 56 | 82b9fb9e238397b2f3bff98975577ff9,0 57 | 5451203688c930484ba1f3c7f1378847,0 58 | c25876fb40d6f8dafd1ecb243193dd3f,0 59 | f0310ffc724faf9f7aef2c418127ee68,0 60 | 38bf066bba822584e14c0af65d4bb5e9,0 61 | 026470d51482c93efc18b9803159c960,0 62 | 9a378249b799bbcefac2a7de46896c0a,0 63 | 96042e205dd3dc055f084aaca245e550,0 64 | 2eb92d17ca91b393765e8acf069763a6,0 65 | be3e35bf8395366d235b8bcfc71a05ee,0 66 | bf6a7a9ab4e18b18f43129c9e22fb448,0 67 | 7191c236cfcfc68cd21143e3a0faac51,0 68 | b4db5b96c65a668a2e63f9a3ed36afe7,0 69 | ab9c7bef62d1ad65b824414087b6f06b,0 70 | 263a1c3bfa43556623e75ed901e3fd8f,0 71 | 50cdacec399071cf70d8badd2511d0b3,0 72 | d3a8fb1da8f7a0dcbd5a8d65f3647757,0 73 | ebcdfabecf4b46b1e55e4a4c75a0afb0,0 74 | 159bc8821a2dc39a1e770cb3559e098d,0 75 | 80938b4f531fa2334c13d829339e1356,0 76 | a0fc609febe3eef5a4713a22996cf8e5,0 77 | 2703df8c469906a06a45c0d7ff501199,0 78 | c3a9046fbe2b0f0a4e43a669c321e472,0 79 | 993f1e68290d591f755669e97b49b4f4,0 80 | bdfb2c23a8c1dca5ea8c1cc3d89efee9,0 81 | c46c3962c10e287f1c1e3af0d309a128,0 82 | 55b06d60e7c0329787f81d1b7cbf9aa0,0 83 | 96544665531e7f59bc2730e3c5f42e65,0 84 | 85ab88f093ca53a4fab5654e24c77ebe,0 85 | c87a713d17522698958de55c97654beb,0 86 | 6d43fdb6eb1bec3a5f4febfd442e8c93,0 87 | c7bdb83b7ca6269fac16ab7cff930a2e,0 88 | 519ad4ead3e61d2d71088ac8e46f25b6,0 89 | 63458b5875a0b223ec21555d17b52fd4,0 90 | b6857d98b7b3dbe84f153617f4dfd14b,0 91 | 84ed26b5d79da321711ed869b3cad2ea,0 92 | e314fd13809db0132443b924401d828b,0 93 | 2004b3f761c3f5dffb02204f1247b211,0 94 | f89e3d0867e27be8e19d7ed50e1eb7e8,0 95 | 9ca18e68b6b8d9c3112b4b69b7d6fad5,0 96 | 5d16819bd78c74448ce852a93bf423ad,0 97 | ae61ec94b0b8de5439180f4776551e42,0 
98 | c2ef34cc347bc224b5a123426009d027,0 99 | bdc2daa372a36f6f7c72abdc0b5639d1,0 100 | af1d0c2fcde369dd1b715460c2f704a2,0 101 | b4d5b618fdf3a5a1bcfb325a3715e99e,0 102 | 68f4dff6dd1f135488e83b8a4ee6e20e,0 103 | 1cf8e778167d20bf769669b4be96592b,0 104 | 52f6d741e674f62fbcf73e6ec4f6a472,0 105 | 7ce310b8431ace09a91ededcc03f7361,0 106 | 26142353f46d20c9fdded93f01e2bff4,0 107 | cbb9bbd994c235b56fb77429291edf99,0 108 | 9cc74e673ec9807ee055973e1b185624,0 109 | 7daeb8ef7307849c715f7f6f3e2dd88e,0 110 | 8a1e5830a16db34b580202f8b6dbbd3d,0 111 | a0e60d7a13f6bb4002cc4a08e60b0776,0 112 | 243038f7bb7787497c59bc17f04c6ed9,0 113 | 88acee40bb9d8cb06898d1c5de01d3c8,0 114 | b17c07114dcf49ce71c8da4b43cf1192,0 115 | c0c5a155e6e59588783c2964975e7e1e,0 116 | 649fd56ef9809019b57261fcf9574d76,0 117 | fb55849cee6473974612c17f094a38cd,0 118 | 95a98df466d4f6c6689908ea9a8f324b,0 119 | e3bc0a970a4af5d52826e06742f90e5b,0 120 | e42065c1145ccf734312cb9edbe5234b,0 121 | 12db1ea8336eafaf7f9e3eda2b4e4fef,0 122 | 4575fe61bf3f536ce6cfeb26fcc2893c,0 123 | 1e62be2c3b6430b78ce31a8f023531ac,0 124 | 9050cf3aa8371bd7088c4bdf967141d4,0 125 | 34037914ceeec5605fc890159dd425c5,0 126 | 8e9002a485cbda2b47cd14014d6f1c36,0 127 | ae4e9d8aab8f8f5ae975bcca923f468d,0 128 | 2a3e6ecf9499607ef4fd14b436136b0c,0 129 | b0599ad2f33276e7cd065eaa8dcec8a2,0 130 | f5ff7734997820b45dafa75dff60ece8,0 131 | 4434e19303b62ebaecef2596583ff351,0 132 | e6d8ae8c3b0817df994a1ce3b37a7efb,0 133 | d753676c2c6c8ac6f97bd61ecab7554a,0 134 | a5bb766ab3b1bc5a8023a50a956595f2,0 135 | 9de48cf43611478ffc1fef051b75dc8c,0 136 | 80bda1afde73204abd74d1ebd2758382,0 137 | cc1b7e34d9eba737c9fb91316463e8f7,0 138 | 7f096cdfbc2fe03ec7f779278416a78c,0 139 | e33c25d0dbca5e54385f2100ce523467,0 140 | 7869cc6bfc3678fec1a81e93b34648cf,0 141 | eaeebb7a63edc8a329a7c5fbc583a507,0 142 | 48ab0b98fc7789304c21430978624f32,0 143 | bbf7a3e138f9353414f2d51f0c363561,0 144 | 567547810a1795b9c8e11c15dfd32c34,0 145 | 07b1defcfae5873ee1f03c90255eb170,0 146 | cd6be62834c72756738935f904ec9c2c,0 147 | 580cffecce8d3d53cde1abb922adf21a,0 148 | 06a90409e4fcea3e634748b967993531,0 149 | f7c387290d7e3074501eac167c849000,0 150 | 89f003dbfbdbd18a5cdeb9b128cb075b,0 151 | 6c71617e2cee498fd3dd20956bb90a3b,0 152 | 5a42f0a0d1e060531c20d04ed23efc02,0 153 | cd68d1a14cc504e3f7434d5cc324744d,0 154 | 616f98dab4db03edbad28c73d22468d2,0 155 | d2ec8f0fc56a9168cda0c707e49974ab,0 156 | 42b2161e43b4dd0ea94604485976c59c,0 157 | 5791c42d317f34592be9a933c50e68ad,0 158 | 5ae9ab473d59cd29262c47a741177b6e,0 159 | 7c2fd0d32df5a2780b4b10fdf2f2cdbe,0 160 | 6d3be6081d76d2365b080e599628d3bc,0 161 | fb5874408966d7c6bebd3d84a5599e20,0 162 | 2d596b6ead89ab35577fe625a9a17cbb,0 163 | 6f229187fe608c9eacc567eb74c1458c,0 164 | 96cca9d8e5764daa4bcb6c0ba07735bc,0 165 | 1753250dab5fc81bab8280df13309733,0 166 | 9b871732b3935661e7639e84a6ab9747,0 167 | 8e60f166f1f1dc0d72f997fe1c9e72b4,0 168 | 70f4eb8201e3155cc3e399f0ff09c5ef,0 169 | 3ee1fd6a0f3f108c3558e6699fb011f2,0 170 | d032116d73789ff9c805f493357b4037,0 171 | 6379e4435f78a5e5c150c32146ece4d4,0 172 | 86ad341b9ac27364f03981f6a775246c,0 173 | f4d23e0272a2ce5bfc7f07033d4f2e7d,0 174 | 94df6d1ae21c5bfaebe6f8daf8fcd85b,0 175 | 8f517521a2ed576e853fab1907fa5ffd,0 176 | a334d15ac8d2d25bce76693b1b2a3ed7,0 177 | 931253c408c440a8494dfaa74251efd3,0 178 | 2f77fd993fbd858dec3c085b9ff1a3a2,0 179 | a2558184e0f4a68e9fb13579d20cb244,0 180 | d5a0333be8795805fc39509f817780ee,0 181 | 763288341ee363a264fe45a28ea28c21,0 182 | 1fdbc07019192de4a114e090389c8330,0 183 | c95f2aa23e6d6702f5b16a3b35f89cf0,0 184 | 
8b494d14d835dd5ae13dab19b9520a55,0 185 | d4a075768abe7fe43ad1caac92515256,0 186 | 3295cec04482210dc6f78c2b4a1d287b,0 187 | 85d6fb4a08853d370935a75de7495a27,0 188 | 81bd0c062bfa8e85616878bab90f2314,0 189 | efcb6def7a2080243052b6046186ab24,0 190 | 8b9a28375988de6ea0b143d48b4a8dc9,0 191 | 0b20184e0cd497028bdd155d9fb42dc9,0 192 | b53d997901eb880c41fbfbc82847204c,0 193 | be9a2df5a16434e581c6a0625c290591,0 194 | 70671fa94231eb377e8ac7cba4650dfb,0 195 | 7fd5be8ec9c236c314f801384bd89c0c,0 196 | 9065f2b133129c5747d42db18a424749,0 197 | d03127f497cae40bcbd9996b4d1f5b90,0 198 | 49433c1588cc078b825a0eff1dc2e816,0 199 | ea3a771ef05e288409e0250ea893cf87,0 200 | -------------------------------------------------------------------------------- /deep_learning/data/stage1_sample_submission.csv: -------------------------------------------------------------------------------- 1 | id,cancer 2 | 026470d51482c93efc18b9803159c960,0.5 3 | 031b7ec4fe96a3b035a8196264a8c8c3,0.5 4 | 03bd22ed5858039af223c04993e9eb22,0.5 5 | 06a90409e4fcea3e634748b967993531,0.5 6 | 07b1defcfae5873ee1f03c90255eb170,0.5 7 | 0b20184e0cd497028bdd155d9fb42dc9,0.5 8 | 12db1ea8336eafaf7f9e3eda2b4e4fef,0.5 9 | 159bc8821a2dc39a1e770cb3559e098d,0.5 10 | 174c5f7c33ca31443208ef873b9477e5,0.5 11 | 1753250dab5fc81bab8280df13309733,0.5 12 | 1cf8e778167d20bf769669b4be96592b,0.5 13 | 1e62be2c3b6430b78ce31a8f023531ac,0.5 14 | 1f6333bc3599f683403d6f0884aefe00,0.5 15 | 1fdbc07019192de4a114e090389c8330,0.5 16 | 2004b3f761c3f5dffb02204f1247b211,0.5 17 | 202898fa97c5949fbdc07ae7ff1cd9f0,0.5 18 | 21b73c938fd7d346ee77a60bd60aaeac,0.5 19 | 243038f7bb7787497c59bc17f04c6ed9,0.5 20 | 26142353f46d20c9fdded93f01e2bff4,0.5 21 | 263a1c3bfa43556623e75ed901e3fd8f,0.5 22 | 2703df8c469906a06a45c0d7ff501199,0.5 23 | 2a3e6ecf9499607ef4fd14b436136b0c,0.5 24 | 2d596b6ead89ab35577fe625a9a17cbb,0.5 25 | 2eb92d17ca91b393765e8acf069763a6,0.5 26 | 2f77fd993fbd858dec3c085b9ff1a3a2,0.5 27 | 3295cec04482210dc6f78c2b4a1d287b,0.5 28 | 33387bea2cacf6127035cc7033036a02,0.5 29 | 34037914ceeec5605fc890159dd425c5,0.5 30 | 38bf066bba822584e14c0af65d4bb5e9,0.5 31 | 3ee1fd6a0f3f108c3558e6699fb011f2,0.5 32 | 42b2161e43b4dd0ea94604485976c59c,0.5 33 | 4434e19303b62ebaecef2596583ff351,0.5 34 | 4575fe61bf3f536ce6cfeb26fcc2893c,0.5 35 | 48ab0b98fc7789304c21430978624f32,0.5 36 | 49433c1588cc078b825a0eff1dc2e816,0.5 37 | 49c88f7cc77341c9ae4e64243f9912fc,0.5 38 | 4b28f147cb82baba3edcdbd34ca19085,0.5 39 | 505405b3e70fb24b92e6a8a5b7ed339c,0.5 40 | 50cdacec399071cf70d8badd2511d0b3,0.5 41 | 519ad4ead3e61d2d71088ac8e46f25b6,0.5 42 | 52f6d741e674f62fbcf73e6ec4f6a472,0.5 43 | 538543b57d0c8fa0b2b6bb7c84df3f33,0.5 44 | 5451203688c930484ba1f3c7f1378847,0.5 45 | 55b06d60e7c0329787f81d1b7cbf9aa0,0.5 46 | 567547810a1795b9c8e11c15dfd32c34,0.5 47 | 5791c42d317f34592be9a933c50e68ad,0.5 48 | 580cffecce8d3d53cde1abb922adf21a,0.5 49 | 59af702c21840ec18073b6b56c95e7fe,0.5 50 | 5a42f0a0d1e060531c20d04ed23efc02,0.5 51 | 5ae9ab473d59cd29262c47a741177b6e,0.5 52 | 5ce91933688cc8400105bf640ac11535,0.5 53 | 5d16819bd78c74448ce852a93bf423ad,0.5 54 | 61017c23bbae6e17062ff582d1a237b3,0.5 55 | 616f98dab4db03edbad28c73d22468d2,0.5 56 | 63458b5875a0b223ec21555d17b52fd4,0.5 57 | 6379e4435f78a5e5c150c32146ece4d4,0.5 58 | 649fd56ef9809019b57261fcf9574d76,0.5 59 | 665c1913d8e90e57af3b745349d19537,0.5 60 | 68f4dff6dd1f135488e83b8a4ee6e20e,0.5 61 | 6993396b31078993e13cf9c0a6fd470b,0.5 62 | 6c71617e2cee498fd3dd20956bb90a3b,0.5 63 | 6d3b16f2e60c3a1a4246f340dba73676,0.5 64 | 6d3be6081d76d2365b080e599628d3bc,0.5 65 | 
6d43fdb6eb1bec3a5f4febfd442e8c93,0.5 66 | 6e240f23afa2c1b4352cd0db5d4f357d,0.5 67 | 6f229187fe608c9eacc567eb74c1458c,0.5 68 | 7027c0b8c8f8dcc76c6e4ba923d60a2e,0.5 69 | 70671fa94231eb377e8ac7cba4650dfb,0.5 70 | 70f4eb8201e3155cc3e399f0ff09c5ef,0.5 71 | 7191c236cfcfc68cd21143e3a0faac51,0.5 72 | 763288341ee363a264fe45a28ea28c21,0.5 73 | 7869cc6bfc3678fec1a81e93b34648cf,0.5 74 | 7c2fd0d32df5a2780b4b10fdf2f2cdbe,0.5 75 | 7ce310b8431ace09a91ededcc03f7361,0.5 76 | 7cf1a65bb0f89323668034244a59e725,0.5 77 | 7daeb8ef7307849c715f7f6f3e2dd88e,0.5 78 | 7f096cdfbc2fe03ec7f779278416a78c,0.5 79 | 7fd5be8ec9c236c314f801384bd89c0c,0.5 80 | 80938b4f531fa2334c13d829339e1356,0.5 81 | 80bda1afde73204abd74d1ebd2758382,0.5 82 | 81bd0c062bfa8e85616878bab90f2314,0.5 83 | 82b9fb9e238397b2f3bff98975577ff9,0.5 84 | 83728b6eed98845556bfc870b7567883,0.5 85 | 84ed26b5d79da321711ed869b3cad2ea,0.5 86 | 85ab88f093ca53a4fab5654e24c77ebe,0.5 87 | 85d6fb4a08853d370935a75de7495a27,0.5 88 | 86ad341b9ac27364f03981f6a775246c,0.5 89 | 88acee40bb9d8cb06898d1c5de01d3c8,0.5 90 | 89f003dbfbdbd18a5cdeb9b128cb075b,0.5 91 | 8a1e5830a16db34b580202f8b6dbbd3d,0.5 92 | 8b494d14d835dd5ae13dab19b9520a55,0.5 93 | 8b9a28375988de6ea0b143d48b4a8dc9,0.5 94 | 8bb7dd5fbfa5ecb95552d9c587f2fea5,0.5 95 | 8be7a7cc747365030bee8297221ab5bc,0.5 96 | 8e60f166f1f1dc0d72f997fe1c9e72b4,0.5 97 | 8e9002a485cbda2b47cd14014d6f1c36,0.5 98 | 8f517521a2ed576e853fab1907fa5ffd,0.5 99 | 8fde44df03fb80366c6604db53d3623f,0.5 100 | 901ed0a38aa16933c04ffd531b0aa2cf,0.5 101 | 9050cf3aa8371bd7088c4bdf967141d4,0.5 102 | 9065f2b133129c5747d42db18a424749,0.5 103 | 931253c408c440a8494dfaa74251efd3,0.5 104 | 94df6d1ae21c5bfaebe6f8daf8fcd85b,0.5 105 | 95a98df466d4f6c6689908ea9a8f324b,0.5 106 | 96042e205dd3dc055f084aaca245e550,0.5 107 | 96544665531e7f59bc2730e3c5f42e65,0.5 108 | 96cca9d8e5764daa4bcb6c0ba07735bc,0.5 109 | 993f1e68290d591f755669e97b49b4f4,0.5 110 | 995fc0581ed0e3ba0f97dbd7fe63db59,0.5 111 | 9a378249b799bbcefac2a7de46896c0a,0.5 112 | 9b871732b3935661e7639e84a6ab9747,0.5 113 | 9ca18e68b6b8d9c3112b4b69b7d6fad5,0.5 114 | 9cc74e673ec9807ee055973e1b185624,0.5 115 | 9de48cf43611478ffc1fef051b75dc8c,0.5 116 | a0e60d7a13f6bb4002cc4a08e60b0776,0.5 117 | a0fc609febe3eef5a4713a22996cf8e5,0.5 118 | a2558184e0f4a68e9fb13579d20cb244,0.5 119 | a2a4bc7708f6831470d757cd6f32bffe,0.5 120 | a334d15ac8d2d25bce76693b1b2a3ed7,0.5 121 | a5bb766ab3b1bc5a8023a50a956595f2,0.5 122 | a5d7909f14d43f01f44cdcaabed27b84,0.5 123 | a6c15206edadab0270898f03e770d730,0.5 124 | aa59b7a4aa4dfb2489feea527eda3e4d,0.5 125 | ab9c7bef62d1ad65b824414087b6f06b,0.5 126 | ac4056071f3cc98489b9db3aebfe2b6a,0.5 127 | ae2fdcd8daa3fede6ae23cc63a8d9a82,0.5 128 | ae4e9d8aab8f8f5ae975bcca923f468d,0.5 129 | ae61ec94b0b8de5439180f4776551e42,0.5 130 | aec5a58fea38b77b964007aa6975c049,0.5 131 | af1d0c2fcde369dd1b715460c2f704a2,0.5 132 | b0599ad2f33276e7cd065eaa8dcec8a2,0.5 133 | b17c07114dcf49ce71c8da4b43cf1192,0.5 134 | b4d5b618fdf3a5a1bcfb325a3715e99e,0.5 135 | b4db5b96c65a668a2e63f9a3ed36afe7,0.5 136 | b53d997901eb880c41fbfbc82847204c,0.5 137 | b6857d98b7b3dbe84f153617f4dfd14b,0.5 138 | b82efe72526c59a96257208d95e54baf,0.5 139 | b8793dbd40de88c0de0913abbaab0fe7,0.5 140 | bbf7a3e138f9353414f2d51f0c363561,0.5 141 | bdc2daa372a36f6f7c72abdc0b5639d1,0.5 142 | bdfb2c23a8c1dca5ea8c1cc3d89efee9,0.5 143 | be3e35bf8395366d235b8bcfc71a05ee,0.5 144 | be9a2df5a16434e581c6a0625c290591,0.5 145 | bf6a7a9ab4e18b18f43129c9e22fb448,0.5 146 | c0c5a155e6e59588783c2964975e7e1e,0.5 147 | c25876fb40d6f8dafd1ecb243193dd3f,0.5 148 | 
c2ef34cc347bc224b5a123426009d027,0.5 149 | c3a9046fbe2b0f0a4e43a669c321e472,0.5 150 | c46c3962c10e287f1c1e3af0d309a128,0.5 151 | c71d0db2086b7e2024ca9c11bd2ca504,0.5 152 | c7bdb83b7ca6269fac16ab7cff930a2e,0.5 153 | c87a713d17522698958de55c97654beb,0.5 154 | c95f2aa23e6d6702f5b16a3b35f89cf0,0.5 155 | cbb9bbd994c235b56fb77429291edf99,0.5 156 | cc1b7e34d9eba737c9fb91316463e8f7,0.5 157 | cc4805e3ebe8621bc94a621b1714fc84,0.5 158 | cd68d1a14cc504e3f7434d5cc324744d,0.5 159 | cd6be62834c72756738935f904ec9c2c,0.5 160 | cdb53f3be6d8cce07fa41c833488d8a5,0.5 161 | d03127f497cae40bcbd9996b4d1f5b90,0.5 162 | d032116d73789ff9c805f493357b4037,0.5 163 | d1131708024b32032ade1ef48d115915,0.5 164 | d1a20ef45bb03f93a407b492066f6d88,0.5 165 | d2ec8f0fc56a9168cda0c707e49974ab,0.5 166 | d3a8fb1da8f7a0dcbd5a8d65f3647757,0.5 167 | d42c998d037fb3003faba541e2cf649a,0.5 168 | d4a075768abe7fe43ad1caac92515256,0.5 169 | d5a0333be8795805fc39509f817780ee,0.5 170 | d654966fd2498de023552b830c07a659,0.5 171 | d753676c2c6c8ac6f97bd61ecab7554a,0.5 172 | d81852bffda09dc8033a45332397c495,0.5 173 | dbd9c8025907511e965e7abad955547d,0.5 174 | e0aa61b44c33e6a75940a8541c6894c9,0.5 175 | e314fd13809db0132443b924401d828b,0.5 176 | e33c25d0dbca5e54385f2100ce523467,0.5 177 | e3bc0a970a4af5d52826e06742f90e5b,0.5 178 | e42065c1145ccf734312cb9edbe5234b,0.5 179 | e60d99ea9648e1ce859eb0b386365e26,0.5 180 | e6160ed0ff2eb214abd4df9a3c336c1d,0.5 181 | e6d8ae8c3b0817df994a1ce3b37a7efb,0.5 182 | e9a27e2645e1fad9434ce765f678585f,0.5 183 | ea01deecde93cd9503a049d71d46e6d5,0.5 184 | ea3a771ef05e288409e0250ea893cf87,0.5 185 | eaeebb7a63edc8a329a7c5fbc583a507,0.5 186 | eb9db3f740f8e153e85f83c57bc4e522,0.5 187 | ebcdfabecf4b46b1e55e4a4c75a0afb0,0.5 188 | efcb6def7a2080243052b6046186ab24,0.5 189 | f0310ffc724faf9f7aef2c418127ee68,0.5 190 | f4d23e0272a2ce5bfc7f07033d4f2e7d,0.5 191 | f5ff7734997820b45dafa75dff60ece8,0.5 192 | f7c387290d7e3074501eac167c849000,0.5 193 | f89e3d0867e27be8e19d7ed50e1eb7e8,0.5 194 | fad57a1078ddbc685e517bd8f24aa8ac,0.5 195 | fb55849cee6473974612c17f094a38cd,0.5 196 | fb5874408966d7c6bebd3d84a5599e20,0.5 197 | fcfab3eddbdf0421c39f71d651cc5c56,0.5 198 | fdcd385b0d2d12341661e1abe845be0b,0.5 199 | ff8599dd7c1139be3bad5a0351ab749a,0.5 200 | -------------------------------------------------------------------------------- /pyspark/study_apache_spark/scala/scala_rdd.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "http://10.8.2.1:8089/proxy/application_1515394405830_3960\n" 13 | ] 14 | } 15 | ], 16 | "source": [ 17 | "import org.apache.spark.sql.SparkSession\n", 18 | "\n", 19 | "val spark = SparkSession.builder().\n", 20 | " appName(\"scala_rdd\").\n", 21 | " config(\"spark.executor.instances\",\"2\").\n", 22 | " config(\"spark.executor.cores\",\"2\").\n", 23 | " config(\"spark.executor.memory\", \"4g\").\n", 24 | " config(\"spark.yarn.executor.memoryOverhead\", \"1g\").\n", 25 | " getOrCreate()\n", 26 | "\n", 27 | "println(\"http://10.8.2.1:8089/proxy/\"+ spark.sparkContext.applicationId)" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "# Transformations" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 2, 40 | "metadata": {}, 41 | "outputs": [ 42 | { 43 | "name": "stdout", 44 | "output_type": "stream", 45 | "text": [ 46 | "Before: 
1,2,3,4\n", 47 | "After * 2: 2,4,6,8\n", 48 | "Filter even: 2,4\n" 49 | ] 50 | } 51 | ], 52 | "source": [ 53 | "var rdd = spark.sparkContext.parallelize(Array(1, 2, 3, 4))\n", 54 | "println(\"Before: \" + rdd.collect().mkString(\",\"))\n", 55 | "println(\"After * 2: \" + rdd.map(_ * 2).collect().mkString(\",\"))\n", 56 | "println(\"Filter even: \" + rdd.filter(_ % 2 == 0).collect().mkString(\",\"))" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 3, 62 | "metadata": {}, 63 | "outputs": [ 64 | { 65 | "name": "stdout", 66 | "output_type": "stream", 67 | "text": [ 68 | "Before: 1,2,2,3,4\n", 69 | "Distinct: 4,1,2,3\n" 70 | ] 71 | } 72 | ], 73 | "source": [ 74 | "rdd = spark.sparkContext.parallelize(Array(1, 2, 2, 3, 4))\n", 75 | "println(\"Before: \" + rdd.collect().mkString(\",\"))\n", 76 | "println(\"Distinct: \" + rdd.distinct().collect().mkString(\",\"))" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 4, 82 | "metadata": {}, 83 | "outputs": [ 84 | { 85 | "name": "stdout", 86 | "output_type": "stream", 87 | "text": [ 88 | "Before: 1,2,3\n", 89 | "To array:\n" 90 | ] 91 | }, 92 | { 93 | "data": { 94 | "text/plain": [ 95 | "Array(Array(1, 6), Array(2, 7), Array(3, 8))" 96 | ] 97 | }, 98 | "execution_count": 4, 99 | "metadata": {}, 100 | "output_type": "execute_result" 101 | } 102 | ], 103 | "source": [ 104 | "rdd = spark.sparkContext.parallelize(Array(1, 2, 3))\n", 105 | "println(\"Before: \" + rdd.collect().mkString(\",\"))\n", 106 | "println(\"To array:\")\n", 107 | "rdd.map(x => Array(x, x + 5)).collect()" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 5, 113 | "metadata": {}, 114 | "outputs": [ 115 | { 116 | "name": "stdout", 117 | "output_type": "stream", 118 | "text": [ 119 | "To flat array:\n" 120 | ] 121 | }, 122 | { 123 | "data": { 124 | "text/plain": [ 125 | "Array(1, 6, 2, 7, 3, 8)" 126 | ] 127 | }, 128 | "execution_count": 5, 129 | "metadata": {}, 130 | "output_type": "execute_result" 131 | } 132 | ], 133 | "source": [ 134 | "println(\"To flat array:\")\n", 135 | "rdd.flatMap(x => Array(x, x + 5)).collect()" 136 | ] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "metadata": { 141 | "collapsed": true 142 | }, 143 | "source": [ 144 | "# Actions" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 6, 150 | "metadata": {}, 151 | "outputs": [ 152 | { 153 | "name": "stdout", 154 | "output_type": "stream", 155 | "text": [ 156 | "Before: 1,2,3\n", 157 | "Reduce: 6\n", 158 | "Take 2: 1,2\n", 159 | "Collect: 1,2,3\n", 160 | "Count: 3\n" 161 | ] 162 | } 163 | ], 164 | "source": [ 165 | "// Python rdd.reduce(lambda a, b: a * b)\n", 166 | "rdd = spark.sparkContext.parallelize(Array(1, 2, 3))\n", 167 | "println(\"Before: \" + rdd.collect().mkString(\",\"))\n", 168 | "println(\"Reduce: \" + rdd.reduce((a, b) => a * b))\n", 169 | "println(\"Take 2: \" + rdd.take(2).mkString(\",\"))\n", 170 | "println(\"Collect: \" + rdd.collect().mkString(\",\"))\n", 171 | "println(\"Count: \" + rdd.count())" 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "metadata": {}, 177 | "source": [ 178 | "# Key-Value RDDs" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": 7, 184 | "metadata": {}, 185 | "outputs": [ 186 | { 187 | "data": { 188 | "text/plain": [ 189 | "Array((1,2), (3,10))" 190 | ] 191 | }, 192 | "execution_count": 7, 193 | "metadata": {}, 194 | "output_type": "execute_result" 195 | } 196 | ], 197 | "source": [ 198 | "val keyValReduceByKey = 
spark.sparkContext.parallelize(Seq((1, 2), (3, 4), (3, 6)))\n", 199 | "keyValReduceByKey.reduceByKey((a, b) => a + b).collect()" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 8, 205 | "metadata": {}, 206 | "outputs": [ 207 | { 208 | "data": { 209 | "text/plain": [ 210 | "Array((1,a), (1,b), (2,c))" 211 | ] 212 | }, 213 | "execution_count": 8, 214 | "metadata": {}, 215 | "output_type": "execute_result" 216 | } 217 | ], 218 | "source": [ 219 | "val keyValSortByKey = spark.sparkContext.parallelize(Seq((1, \"a\"), (2, \"c\"), (1, \"b\")))\n", 220 | "keyValSortByKey.sortByKey().collect()" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": 9, 226 | "metadata": {}, 227 | "outputs": [ 228 | { 229 | "data": { 230 | "text/plain": [ 231 | "Array((1,CompactBuffer(a, b)), (2,CompactBuffer(c)))" 232 | ] 233 | }, 234 | "execution_count": 9, 235 | "metadata": {}, 236 | "output_type": "execute_result" 237 | } 238 | ], 239 | "source": [ 240 | "val keyValGroupByKey = spark.sparkContext.parallelize(Seq((1, \"a\"), (2, \"c\"), (1, \"b\")))\n", 241 | "keyValGroupByKey.groupByKey().collect()" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": 10, 247 | "metadata": {}, 248 | "outputs": [ 249 | { 250 | "data": { 251 | "text/plain": [ 252 | "Array((a,(1,2)), (a,(1,3)))" 253 | ] 254 | }, 255 | "execution_count": 10, 256 | "metadata": {}, 257 | "output_type": "execute_result" 258 | } 259 | ], 260 | "source": [ 261 | "val x = spark.sparkContext.parallelize(Seq((\"a\", 1), (\"b\", 4)))\n", 262 | "val y = spark.sparkContext.parallelize(Seq((\"a\", 2), (\"a\", 3)))\n", 263 | "x.join(y).collect()" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": 11, 269 | "metadata": {}, 270 | "outputs": [ 271 | { 272 | "data": { 273 | "text/plain": [ 274 | "Array((a,(1,Some(2))), (a,(1,Some(3))), (b,(4,None)))" 275 | ] 276 | }, 277 | "execution_count": 11, 278 | "metadata": {}, 279 | "output_type": "execute_result" 280 | } 281 | ], 282 | "source": [ 283 | "x.leftOuterJoin(y).collect()" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": 12, 289 | "metadata": {}, 290 | "outputs": [ 291 | { 292 | "data": { 293 | "text/plain": [ 294 | "Array((a,(Some(1),2)), (a,(Some(1),3)))" 295 | ] 296 | }, 297 | "execution_count": 12, 298 | "metadata": {}, 299 | "output_type": "execute_result" 300 | } 301 | ], 302 | "source": [ 303 | "x.rightOuterJoin(y).collect()" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": 13, 309 | "metadata": {}, 310 | "outputs": [ 311 | { 312 | "data": { 313 | "text/plain": [ 314 | "Array((a,(Some(1),Some(2))), (a,(Some(1),Some(3))), (b,(Some(4),None)))" 315 | ] 316 | }, 317 | "execution_count": 13, 318 | "metadata": {}, 319 | "output_type": "execute_result" 320 | } 321 | ], 322 | "source": [ 323 | "x.fullOuterJoin(y).collect()" 324 | ] 325 | } 326 | ], 327 | "metadata": { 328 | "kernelspec": { 329 | "display_name": "Apache Toree - Scala", 330 | "language": "scala", 331 | "name": "apache_toree_scala" 332 | }, 333 | "language_info": { 334 | "file_extension": ".scala", 335 | "name": "scala", 336 | "version": "2.11.8" 337 | } 338 | }, 339 | "nbformat": 4, 340 | "nbformat_minor": 2 341 | } 342 | -------------------------------------------------------------------------------- /python/sentiment_analysis/classification_algorithms.py: -------------------------------------------------------------------------------- 1 | """ 2 | CLASSIFICATION 3 | Case study: Analyzing 
sentiment 4 | Models: 5 | Linear classifiers (logistic regression, SVMs, perceptron) 6 | Kernels 7 | Decision trees 8 | Algorithms: 9 | Stochastic gradient descent 10 | Boosting 11 | Concepts: 12 | Decision boundaries, MLE, ensemble methods, random forests, CART, online learning 13 | """ 14 | import datetime 15 | import os 16 | import re 17 | import time 18 | from itertools import islice 19 | from operator import itemgetter 20 | 21 | import numpy as np 22 | import pandas as pd 23 | from BeautifulSoup import BeautifulSoup 24 | from nltk.corpus import stopwords 25 | from sklearn.cross_validation import train_test_split 26 | from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis 27 | from sklearn.ensemble import AdaBoostClassifier 28 | from sklearn.ensemble import RandomForestClassifier 29 | from sklearn.feature_extraction.text import CountVectorizer 30 | from sklearn.gaussian_process import GaussianProcessClassifier 31 | from sklearn.gaussian_process.kernels import RBF 32 | from sklearn.naive_bayes import GaussianNB 33 | from sklearn.neighbors import KNeighborsClassifier 34 | from sklearn.neural_network import MLPClassifier 35 | from sklearn.svm import SVC 36 | from sklearn.tree import DecisionTreeClassifier 37 | 38 | 39 | def time_diff_str(t1, t2): 40 | """ 41 | Calculates time durations. 42 | """ 43 | diff = t2 - t1 44 | mins = int(diff / 60) 45 | secs = round(diff % 60, 2) 46 | return str(mins) + " mins and " + str(secs) + " seconds" 47 | 48 | 49 | def clean_sentence(sentence): 50 | # Remove HTML 51 | review_text = BeautifulSoup(sentence).text 52 | 53 | # Remove non-letters 54 | letters_only = re.sub("[^a-zA-Z]", " ", review_text) 55 | return letters_only 56 | 57 | 58 | def convert_plain_to_csv(plain_name, csv_name): 59 | t0 = time.time() 60 | with open(plain_name, "r") as f1, open(csv_name, "w") as f2: 61 | i = 0 62 | f2.write("productId,score,summary,text\n") 63 | while True: 64 | next_n_lines = list(islice(f1, 9)) 65 | if not next_n_lines: 66 | break 67 | 68 | # process next_n_lines: get productId,score,summary,text info 69 | # remove special characters from summary and text 70 | output_line = "" 71 | for line in next_n_lines: 72 | if "product/productId:" in line: 73 | output_line += line.split(":")[1].strip() + "," 74 | elif "review/score:" in line: 75 | output_line += line.split(":")[1].strip() + "," 76 | elif "review/summary:" in line: 77 | summary = clean_sentence(line.split(":")[1].strip()) + "," 78 | output_line += summary 79 | elif "review/text:" in line: 80 | text = clean_sentence(line.split(":")[1].strip()) + "\n" 81 | output_line += text 82 | 83 | f2.write(output_line) 84 | 85 | # print status 86 | i += 1 87 | if i % 10000 == 0: 88 | print "%d reviews converted..." % i 89 | 90 | print " %s - Converting completed %s" % (datetime.datetime.now(), time_diff_str(t0, time.time())) 91 | 92 | 93 | def get_reviews_data(file_name): 94 | """Get reviews data, from local csv.""" 95 | if os.path.exists(file_name): 96 | print("-- " + file_name + " found locally") 97 | df = pd.read_csv(file_name) 98 | 99 | return df 100 | 101 | 102 | def review_to_words(review): 103 | """ 104 | Function to convert a raw review to a string of words 105 | :param review 106 | :return: meaningful_words 107 | """ 108 | # 1. Convert to lower case, split into individual words 109 | words = review.lower().split() 110 | # 111 | # 2. 
In Python, searching a set is much faster than searching 112 | # a list, so convert the stop words to a set 113 | stops = set(stopwords.words("english")) 114 | # 115 | # 3. Remove stop words 116 | meaningful_words = [w for w in words if not w in stops] 117 | # 118 | # 4. Join the words back into one string separated by space, 119 | # and return the result. 120 | return " ".join(meaningful_words) 121 | 122 | 123 | def cleaning_data(dataset, file_name): 124 | t0 = time.time() 125 | 126 | # Get the number of reviews based on the dataframe column size 127 | num_reviews = dataset["text"].size 128 | 129 | # Initialize an empty list to hold the clean reviews 130 | clean_train_reviews = [] 131 | 132 | # Loop over each review 133 | for i in xrange(0, num_reviews): 134 | # If the index is evenly divisible by 1000, print a message 135 | if (i + 1) % 10000 == 0: 136 | print "Review %d of %d\n" % (i + 1, num_reviews) 137 | 138 | # Call our function for each one, and add the result to the list of 139 | # clean reviews 140 | productId = str(dataset["productId"][i]) 141 | score = str(dataset["score"][i]) 142 | summary = str(dataset["summary"][i]) 143 | text = review_to_words(str(dataset["text"][i])) 144 | 145 | clean_train_reviews.append(productId + "," + score + "," + summary + "," + text + "\n") 146 | 147 | print "Writing clean train reviews..." 148 | with open(file_name, "w") as f: 149 | f.write("productId,score,summary,text\n") 150 | for review in clean_train_reviews: 151 | f.write("%s\n" % review) 152 | 153 | print " %s - Write file completed %s" % (datetime.datetime.now(), time_diff_str(t0, time.time())) 154 | 155 | 156 | def print_words_frequency(train_data_features): 157 | # Take a look at the words in the vocabulary 158 | vocab = vectorizer.get_feature_names() 159 | print "Words in vocabulary:", vocab 160 | 161 | # Sum up the counts of each vocabulary word 162 | dist = np.sum(train_data_features, axis=0) 163 | 164 | # For each, print the vocabulary word and the number of times it 165 | # appears in the training set 166 | print "Words frequency..." 
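# note: because the term-document matrix passed in was densified with .toarray(),
# np.sum(..., axis=0) above yields a plain 1-D count vector that zips element-for-element with the vocabulary list below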
167 | for tag, count in zip(vocab, dist): 168 | print count, tag 169 | 170 | 171 | if __name__ == "__main__": 172 | """ 173 | Pre-processing 174 | """ 175 | # converting plain text for next processing 176 | convert_plain_to_csv("foods.txt", "foods.csv") 177 | 178 | # Reading the Data 179 | train = get_reviews_data("foods.csv") 180 | print "Data dimensions:", train.shape 181 | print "List features:", train.columns.values 182 | print "First review:", train["summary"][0], "|", train["text"][0] 183 | 184 | cleaning_data(train, "clean_train_reviews.csv") 185 | 186 | """ 187 | Bag of Words features 188 | """ 189 | 190 | clean_train_reviews = pd.read_csv("clean_train_reviews.csv", nrows=1000) 191 | 192 | # ignore all 3* reviews 193 | clean_train_reviews = clean_train_reviews[clean_train_reviews["score"] != 3] 194 | # positive sentiment = 4* or 5* reviews 195 | clean_train_reviews["sentiment"] = clean_train_reviews["score"] >= 4 196 | 197 | train, test = train_test_split(clean_train_reviews, test_size=0.2) 198 | 199 | print "Creating the bag of words...\n" 200 | vectorizer = CountVectorizer(analyzer="word", 201 | tokenizer=None, 202 | preprocessor=None, 203 | stop_words=None, 204 | max_features=10) 205 | 206 | train_text = train["text"].values.astype('U') 207 | test_text = test["text"].values.astype('U') 208 | 209 | # convert the data sets to term-document matrices; the vocabulary is fit on the training text only 210 | X_train = vectorizer.fit_transform(train_text).toarray() 211 | y_train = train["sentiment"] 212 | 213 | X_test = vectorizer.transform(test_text).toarray() 214 | y_test = test["sentiment"] 215 | 216 | print_words_frequency(X_train) 217 | 218 | """ 219 | Training 220 | """ 221 | 222 | print "---------------------------" 223 | print "Training" 224 | print "---------------------------" 225 | 226 | names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Gaussian Process", 227 | "Decision Tree", "Random Forest", "Neural Net", "AdaBoost", 228 | "Naive Bayes", "QDA"] 229 | 230 | classifiers = [ 231 | KNeighborsClassifier(3), 232 | SVC(kernel="linear", C=0.025), 233 | SVC(gamma=2, C=1), 234 | GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True), 235 | DecisionTreeClassifier(max_depth=5), 236 | RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1), 237 | MLPClassifier(alpha=1), 238 | AdaBoostClassifier(), 239 | GaussianNB(), 240 | QuadraticDiscriminantAnalysis()] 241 | 242 | # iterate over classifiers 243 | results = {} 244 | 245 | for name, clf in zip(names, classifiers): 246 | print "Training " + name + " classifier..." 
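# each classifier is fit on the training term-document matrix; clf.score() below reports
# mean accuracy on the held-out 20% split produced by train_test_split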
247 | clf.fit(X_train, y_train) 248 | score = clf.score(X_test, y_test) 249 | results[name] = score 250 | 251 | print "---------------------------" 252 | print "Evaluation results" 253 | print "---------------------------" 254 | 255 | # sort the results by accuracy and print them out 256 | ranked_results = sorted(results.items(), key=itemgetter(1)) 257 | for name, score in ranked_results: 258 | print name + " accuracy: %0.3f" % score 259 | -------------------------------------------------------------------------------- /deep_learning/snippets/training_network.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | 3 | import matplotlib.image as mpimg 4 | import matplotlib.pyplot as plt 5 | import numpy as np 6 | import tensorflow as tf 7 | from scipy.misc import imresize 8 | from tensorflow.python.framework import ops 9 | 10 | plt.style.use('ggplot') 11 | 12 | 13 | # absolute-difference (L1-norm) cost between prediction and target 14 | def distance(p1, p2): 15 | return tf.abs(p1 - p2) 16 | 17 | 18 | # training routine: stochastic / mini-batch gradient descent 19 | def train(X, Y, Y_pred, n_iterations=100, batch_size=200, learning_rate=0.02): 20 | cost = tf.reduce_mean(distance(Y_pred, Y)) 21 | optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost) 22 | 23 | with tf.Session() as sess: 24 | # tell TensorFlow to initialize every variable in the graph; 25 | # this is the point where `W` and `b` receive their initial values 26 | sess.run(tf.global_variables_initializer()) 27 | 28 | # start the training loop 29 | prev_training_cost = 0.0 30 | for it_i in range(n_iterations): 31 | # shuffle the indices of the x-axis samples 32 | idxs = np.random.permutation(range(len(xs))) 33 | n_batches = len(idxs) // batch_size 34 | for batch_i in range(n_batches): 35 | # take the next batch_size randomly permuted x-axis samples 36 | # for this training step 37 | idxs_i = idxs[batch_i * batch_size: (batch_i + 1) * batch_size] 38 | sess.run(optimizer, feed_dict={X: xs[idxs_i], Y: ys[idxs_i]}) 39 | 40 | # evaluate the current training cost 41 | training_cost = sess.run(cost, feed_dict={X: xs, Y: ys}) 42 | 43 | if it_i % 10 == 0: 44 | # print the current training cost 45 | print "Cost:", training_cost 46 | 47 | # stop training when the cost has stopped changing noticeably 48 | if np.abs(prev_training_cost - training_cost) < 0.000001: 49 | print "Stop training..." 50 | break 51 | 52 | # update the running training cost 53 | prev_training_cost = training_cost 54 | 55 | 56 | def linear(X, n_input, n_output, activation=None, scope=None): 57 | with tf.variable_scope(scope or "linear"): 58 | # initialize the weight matrix W for this layer 59 | W = tf.get_variable( 60 | name='W', 61 | shape=[n_input, n_output], 62 | initializer=tf.random_normal_initializer(mean=0.0, stddev=0.1)) 63 | 64 | # initialize the bias for this layer 65 | b = tf.get_variable( 66 | name='b', 67 | shape=[n_output], 68 | initializer=tf.constant_initializer()) 69 | 70 | # compute the prediction (hypothesis h) and apply the activation if given 71 | h = tf.matmul(X, W) + b 72 | if activation is not None: 73 | h = activation(h) 74 | return h 75 | 76 | 77 | def image_inpainting(X, Y, Y_pred, n_iterations=100, batch_size=200, learning_rate=0.001): 78 | cost = tf.reduce_mean(tf.reduce_sum(distance(Y_pred, Y), 1)) 79 | optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost) 80 | 81 | with tf.Session() as sess: 82 | # tell TensorFlow to initialize every variable in the graph; 83 | # this is the point where the layer weights and biases receive their initial values 84 | sess.run(tf.global_variables_initializer()) 85 | 86 | # start the training loop 87 | prev_training_cost = 0.0 88 | for it_i in range(n_iterations): 89 | # shuffle the indices of the x-axis samples 90 | idxs = np.random.permutation(range(len(xs))) 91 | n_batches = len(idxs) // batch_size 92 | for batch_i in range(n_batches): 93 | # take the next batch_size randomly permuted x-axis samples 94 | # for this training step 95 | idxs_i = idxs[batch_i * batch_size: (batch_i + 1) * batch_size] 96 | sess.run(optimizer, feed_dict={X: xs[idxs_i], Y: ys[idxs_i]}) 97 | 98 | # evaluate the current training cost 99 | training_cost = sess.run(cost, feed_dict={X: xs, Y: ys}) 100 | 101 | # print the current training cost 102 | print "Cost", it_i, training_cost 103 | 104 | if (it_i + 1) % 20 == 0: 105 | # fetch the current predictions and show them as an image 106 | ys_pred = Y_pred.eval(feed_dict={X: xs}, session=sess) 107 | fig, ax = plt.subplots(1, 1) 108 | img_pred = np.clip(ys_pred.reshape(img.shape), 0, 255).astype(np.uint8) 109 | plt.imshow(img_pred) 110 | plt.show() 111 | 112 | # stop training when the cost has stopped changing noticeably 113 | if np.abs(prev_training_cost - training_cost) < 0.000001: 114 | print "Stop training..." 115 | break 116 | 117 | # update the running training cost 118 | prev_training_cost = training_cost 119 | 120 | 121 | if __name__ == "__main__": 122 | # define the number of observations 123 | n_observations = 1000 124 | 125 | # create the input values 126 | xs = np.linspace(-3, 3, n_observations) 127 | 128 | # create the outputs as a noisy sine curve 129 | ys = np.sin(xs) + np.random.uniform(-0.5, 0.5, n_observations) 130 | plt.scatter(xs, ys, alpha=0.15, marker='+') 131 | plt.show() 132 | 133 | # create a placeholder named X for feeding in the x-axis values 134 | # name=`X` makes the operation easy to find in the Graph 135 | X = tf.placeholder(tf.float32, name='X') 136 | 137 | # create a placeholder for feeding in the y-axis values 138 | Y = tf.placeholder(tf.float32, name='Y') 139 | 140 | ######################### 141 | # Simple Neural Network # 142 | ######################### 143 | 144 | # to create a variable we use tf.Variable; unlike a placeholder, it does not 145 | # require a value to be fed at the moment we run/eval. 146 | # we draw the initial value from a normal distribution and pass it to tf.Variable to create the tensor object 147 | W = tf.Variable(tf.random_normal([1], dtype=tf.float32, stddev=0.1), name='weight') 148 | 149 | # initialize the bias variable with zero 150 | B = tf.Variable(tf.constant([0], dtype=tf.float32), name='bias') 151 | 152 | # the predicted value 153 | Y_pred = X * W + B 154 | 155 | # train the model 156 | print "Training linear model..." 157 | train(X, Y, Y_pred, 500, 1000) 158 | 159 | # raise the model to a higher polynomial degree 160 | degree = 3 161 | Y_pred = tf.Variable(tf.random_normal([1]), name='bias') 162 | W = tf.Variable(tf.random_normal([1], stddev=0.1), name='weight_%d' % degree) 163 | Y_pred = tf.add(tf.multiply(tf.pow(X, degree), W), Y_pred) 164 | 165 | # train the model 166 | print "Training polynomial model..." 167 | train(X, Y, Y_pred, 500, 100, 0.01) 168 | 169 | ######################################## 170 | # Nonlinearities / Activation Function # 171 | ######################################## 172 | 173 | sess = tf.InteractiveSession() 174 | x = np.linspace(-6, 6, 1000) 175 | plt.plot(x, tf.nn.tanh(x).eval(), label='tanh') 176 | plt.plot(x, tf.nn.sigmoid(x).eval(), label='sigmoid') 177 | plt.plot(x, tf.nn.relu(x).eval(), label='relu') 178 | plt.legend(loc='lower right') 179 | plt.xlim([-6, 6]) 180 | plt.ylim([-2, 2]) 181 | plt.xlabel('Input') 182 | plt.ylabel('Output') 183 | plt.grid('on') 184 | plt.show() 185 | 186 | # clear the graph 187 | ops.reset_default_graph() 188 | 189 | # get current graph 190 | g = tf.get_default_graph() 191 | 192 | # build a new network 193 | X = tf.placeholder(tf.float32, name='X') 194 | h = linear(X, 2, 10, scope='layer1') 195 | 196 | # make it a Deep Network! 197 | h2 = linear(h, 10, 10, scope='layer2') 198 | 199 | # add one more layer! 200 | h3 = linear(h2, 10, 3, scope='layer3') 201 | 202 | # list the operations currently in the graph 203 | print [op.name for op in tf.get_default_graph().get_operations()] 204 | 205 | #################### 206 | # Image Inpainting # 207 | #################### 208 | img = mpimg.imread("imgs/dogs.jpg") 209 | img = imresize(img, (64, 64)) 210 | plt.imshow(img) 211 | plt.show() 212 | 213 | # store each pixel position as an x-axis input 214 | xs = [] 215 | 216 | # store the colour value that corresponds to each pixel position 217 | ys = [] 218 | 219 | # walk over every pixel 220 | for row_i in range(img.shape[0]): 221 | for col_i in range(img.shape[1]): 222 | # store the input values 223 | xs.append([row_i, col_i]) 224 | 225 | # store the output colour value the network has to predict 226 | ys.append(img[row_i, col_i]) 227 | 228 | # convert lists to arrays for numpy calculation 229 | xs = np.array(xs) 230 | ys = np.array(ys) 231 | 232 | # Normalizing the input by the mean and standard deviation 233 | xs = (xs - np.mean(xs)) / np.std(xs) 234 | 235 | # print the shapes 236 | print xs.shape, ys.shape 237 | 238 | X = tf.placeholder(tf.float32, shape=[None, 2], name='X') 239 | Y = tf.placeholder(tf.float32, shape=[None, 3], name='Y') 240 | 241 | # building networks 242 | n_neurons = [2, 64, 64, 64, 64, 64, 64, 3] 243 | 244 | current_input = X 245 | for layer_i in range(1, len(n_neurons)): 246 | current_input = linear( 247 | X=current_input, 248 | n_input=n_neurons[layer_i - 1], 249 | n_output=n_neurons[layer_i], 250 | activation=tf.nn.relu if (layer_i + 1) < len(n_neurons) else None, 251 | scope='layer_' + str(layer_i)) 252 | 253 | # train the inpainting network 254 | Y_pred = current_input 255 | image_inpainting(X, Y, Y_pred) 256 | --------------------------------------------------------------------------------
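The train() routine above wraps plain mini-batch gradient descent in TensorFlow session machinery. The update it performs is small enough to write out directly; the following is a minimal NumPy-only sketch (not part of the repository; the names w, b, lr are illustrative) of the same L1-cost fit on the noisy sine data.

import numpy as np

rng = np.random.RandomState(0)
xs = np.linspace(-3, 3, 1000)
ys = np.sin(xs) + rng.uniform(-0.5, 0.5, 1000)

w, b, lr, batch_size = 0.0, 0.0, 0.02, 200
for it_i in range(100):
    # shuffle the sample indices, then walk over them batch by batch
    idxs = rng.permutation(len(xs))
    for batch_i in range(len(xs) // batch_size):
        batch = idxs[batch_i * batch_size:(batch_i + 1) * batch_size]
        pred = w * xs[batch] + b
        grad = np.sign(pred - ys[batch])       # subgradient of |pred - y| w.r.t. pred
        w -= lr * np.mean(grad * xs[batch])    # chain rule through pred = w * x + b
        b -= lr * np.mean(grad)
    if it_i % 10 == 0:
        print "Cost:", np.mean(np.abs(w * xs + b - ys))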
/deep_learning/src/main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import csv 3 | import datetime 4 | import os 5 | import sys 6 | import time 7 | from operator import itemgetter 8 | 9 | import numpy as np 10 | import pandas as pd 11 | from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis 12 | from sklearn.ensemble import AdaBoostClassifier 13 | from sklearn.ensemble import RandomForestClassifier 14 | from sklearn.gaussian_process import GaussianProcessClassifier 15 | from sklearn.gaussian_process.kernels import RBF 16 | from sklearn.model_selection import train_test_split 17 | from sklearn.naive_bayes import GaussianNB 18 | from sklearn.neighbors import KNeighborsClassifier 19 | from sklearn.neural_network import MLPClassifier 20 | from sklearn.svm import SVC 21 | from sklearn.tree import DecisionTreeClassifier 22 | 23 | sys.path.append("/usr/local/lib/python2.7/site-packages") 24 | import cv2 25 | import imutils 26 | from imutils import paths 27 | 28 | 29 | def time_diff_str(t1, t2): 30 | """ 31 | Calculates time durations. 32 | """ 33 | diff = t2 - t1 34 | mins = int(diff / 60) 35 | secs = round(diff % 60, 2) 36 | return str(mins) + " mins and " + str(secs) + " seconds" 37 | 38 | 39 | def image_to_feature_vector(image, size=(32, 32)): 40 | # resize the image to a fixed size, then flatten the image into 41 | # a list of raw pixel intensities 42 | return cv2.resize(image, size).flatten() 43 | 44 | 45 | def extract_color_histogram(image): 46 | hist = cv2.calcHist([image], [0], None, [8], [0, 256]) 47 | 48 | # handle normalizing the histogram if we are using OpenCV 2.4.X 49 | if imutils.is_cv2(): 50 | hist = cv2.normalize(hist) 51 | 52 | # otherwise, perform "in place" normalization in OpenCV 3 (I 53 | # personally hate the way this is done 54 | else: 55 | cv2.normalize(hist, hist) 56 | 57 | # return the flattened histogram as the feature vector 58 | return hist.flatten() 59 | 60 | 61 | def load_csv(file_path): 62 | """Get data, from local csv.""" 63 | if os.path.exists(file_path): 64 | print "[INFO] load", file_path, "file..." 
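# index_col=0 makes the patient id the index, so df.to_dict() below returns
# {"cancer": {patient_id: label, ...}}, which is the shape the feature/label extractors in this module expect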
65 | df = pd.read_csv(file_path, index_col=0) 66 | 67 | return df.to_dict() 68 | 69 | 70 | def get_simple_feature_labels(patient_labels, img_paths): 71 | features = [] 72 | labels = [] 73 | 74 | # loop over the input images 75 | for (i, img_path) in enumerate(img_paths): 76 | # get only training labels 77 | base = os.path.basename(img_path) 78 | patient_id = os.path.splitext(base)[0] 79 | if patient_id in patient_labels["cancer"].keys(): 80 | labels.append(patient_labels["cancer"][patient_id]) 81 | else: 82 | continue 83 | 84 | # load the image 85 | image = cv2.imread(img_path) 86 | 87 | # histogram to characterize the color distribution of the pixels 88 | # in the image 89 | feat = image_to_feature_vector(image) 90 | 91 | # update features 92 | features.append(feat) 93 | 94 | # show an update every 100 images 95 | if i > 0 and i % 100 == 0: 96 | print("[INFO] processed {}/{}".format(i, len(img_paths))) 97 | 98 | return features, labels 99 | 100 | 101 | def get_hist_feature_labels(patient_labels, img_paths): 102 | features = [] 103 | labels = [] 104 | 105 | # loop over the input images 106 | for (i, img_path) in enumerate(img_paths): 107 | # get only training labels 108 | base = os.path.basename(img_path) 109 | patient_id = os.path.splitext(base)[0] 110 | if patient_id in patient_labels["cancer"].keys(): 111 | labels.append(patient_labels["cancer"][patient_id]) 112 | else: 113 | continue 114 | 115 | # load the image 116 | image = cv2.imread(img_path) 117 | 118 | # histogram to characterize the color distribution of the pixels 119 | # in the image 120 | hist = extract_color_histogram(image) 121 | 122 | # update features 123 | features.append(hist) 124 | 125 | # show an update every 100 images 126 | if i > 0 and i % 100 == 0: 127 | print("[INFO] processed {}/{}".format(i, len(img_paths))) 128 | 129 | return features, labels 130 | 131 | 132 | def generate_bow_features(img_paths, dictionarySize=5): 133 | BOW = cv2.BOWKMeansTrainer(dictionarySize) 134 | sift = cv2.xfeatures2d.SIFT_create() 135 | 136 | for (i, image_path) in enumerate(img_paths): 137 | gray = cv2.imread(image_path) 138 | kp, dsc = sift.detectAndCompute(gray, None) 139 | BOW.add(dsc) 140 | print("# kps: {}, descriptors: {}".format(len(kp), dsc.shape)) 141 | 142 | # dictionary created 143 | dictionary = BOW.cluster() 144 | index_params = dict(algorithm=0, trees=5) 145 | search_params = dict(checks=50) # or pass empty dictionary 146 | flann = cv2.FlannBasedMatcher(index_params, search_params) 147 | sift2 = cv2.xfeatures2d.SIFT_create() 148 | bowDiction = cv2.BOWImgDescriptorExtractor(sift2, cv2.BFMatcher(cv2.NORM_L2)) 149 | bowDiction.setVocabulary(dictionary) 150 | print "[INFO] Finished create BOW dictionary", time_diff_str(t_start, time.time()) 151 | return bowDiction 152 | 153 | 154 | def sift_feature_extract(img_paths, patient_labels, bow_dict): 155 | features = [] 156 | labels = [] 157 | 158 | # loop over the input images 159 | for (i, img_path) in enumerate(img_paths): 160 | # get only training labels 161 | base = os.path.basename(img_path) 162 | patient_id = os.path.splitext(base)[0] 163 | if patient_id in patient_labels["cancer"]: 164 | labels.append(patient_labels["cancer"][patient_id]) 165 | else: 166 | continue 167 | 168 | # load the image 169 | gray = cv2.imread(img_path) 170 | sift_feature = bow_dict.compute(gray, sift.detect(gray)) 171 | 172 | # update features 173 | features.extend(sift_feature) 174 | 175 | # show an update every 100 images 176 | if i > 0 and i % 100 == 0: 177 | print("[INFO] processed 
{}/{}".format(i, len(img_paths))) 178 | 179 | return features, labels 180 | 181 | 182 | if __name__ == "__main__": 183 | t_start = time.time() 184 | 185 | # construct the argument parse and parse the arguments 186 | ap = argparse.ArgumentParser() 187 | ap.add_argument("-d", "--dataset", required=True, help="path to input dataset") 188 | ap.add_argument("-j", "--jobs", type=int, default=-1, help="# of jobs (-1 uses all available cores)") 189 | args = vars(ap.parse_args()) 190 | 191 | # grab the list of images that we'll be describing 192 | print("[INFO] describing images...") 193 | img_paths = list(paths.list_images(args["dataset"])) 194 | 195 | # load train/test labels 196 | stage1_labels = load_csv("../data/stage1_labels.csv") 197 | stage1_sample_submission = load_csv("../data/stage1_sample_submission.csv") 198 | 199 | # Generating Bag of Words model 200 | # generate_bow_features(img_paths) 201 | 202 | # train_features, train_labels = get_hist_feature_labels(stage1_labels, img_paths) 203 | # test_features, test_labels = get_hist_feature_labels(stage1_sample_submission, img_paths) 204 | # train_features, train_labels = sift_feature_extract(img_paths, stage1_labels, bowDiction) 205 | # test_features, test_labels = sift_feature_extract(img_paths, stage1_sample_submission, bowDiction) 206 | train_features, train_labels = get_simple_feature_labels(stage1_labels, img_paths) 207 | test_features, test_labels = get_simple_feature_labels(stage1_sample_submission, img_paths) 208 | train_features = np.array(train_features) 209 | print("[INFO] features matrix: {:.2f}MB".format(train_features.nbytes / (1024 * 1000.0))) 210 | 211 | (for_train_features, dev_features, for_train_labels, dev_labels) = train_test_split(train_features, 212 | train_labels, 213 | test_size=0.25, 214 | random_state=42) 215 | 216 | print "---------------------------" 217 | print "Training" 218 | print "---------------------------" 219 | 220 | classifiers = { 221 | "Nearest Neighbors": KNeighborsClassifier(3, n_jobs=args["jobs"]), 222 | "Linear SVM": SVC(kernel="linear", C=0.025), 223 | "RBF SVM": SVC(gamma=2, C=1), 224 | "Gaussian Process": GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True, n_jobs=args["jobs"]), 225 | "Decision Tree": DecisionTreeClassifier(max_depth=5), 226 | "Random Forest": RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1, n_jobs=args["jobs"]), 227 | "Neural Net": MLPClassifier(alpha=1), 228 | "AdaBoost": AdaBoostClassifier(), 229 | "Naive Bayes": GaussianNB(), 230 | "QDA": QuadraticDiscriminantAnalysis() 231 | } 232 | 233 | # iterate over classifiers 234 | results = {} 235 | 236 | for name in classifiers: 237 | print "[INFO]" + name + " classifier..." 238 | clf = classifiers[name] 239 | clf.fit(for_train_features, for_train_labels) 240 | score = clf.score(dev_features, dev_labels) 241 | results[name] = score 242 | 243 | print "---------------------------" 244 | print "Evaluation results" 245 | print "---------------------------" 246 | 247 | # sorting results and print out 248 | sorted(results.items(), key=itemgetter(1)) 249 | for name in results: 250 | print "[INFO]", name, "accuracy: %0.3f" % results[name] 251 | 252 | print "---------------------------" 253 | print "Training for submission" 254 | print "---------------------------" 255 | 256 | name = list(results)[0] 257 | clf = classifiers[name] 258 | print "[INFO]" + name + " classifier..." 
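# caution: `name` above is list(results)[0], i.e. whichever key happens to come first in dict order;
# the sorted(...) call earlier returns a new list without reordering `results`, so this is not
# necessarily the best-scoring classifier from the evaluation step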
259 | clf.fit(train_features, train_labels) 260 | predict_submission = clf.predict(test_features) 261 | 262 | # update submission 263 | submission = {} 264 | for (i, patient_id) in enumerate(stage1_sample_submission["cancer"]): 265 | submission[patient_id] = predict_submission[i] 266 | 267 | with open("submission_results.csv", "wb") as f: 268 | writer = csv.writer(f, delimiter=',') 269 | writer.writerow(["id", "cancer"]) 270 | for key, value in submission.items(): 271 | writer.writerow([key, value]) 272 | 273 | print "[INFO]", datetime.datetime.now(), "* DONE After *", time_diff_str(t_start, time.time()) 274 | -------------------------------------------------------------------------------- /python/jupyter/Getting started with iPython Notebook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "#Installing Python and GraphLab Create" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Please follow the installation instructions here before getting started:\n", 15 | "\n", 16 | "\n", 17 | "##We have done\n", 18 | "* Installed Python\n", 19 | "* Started Ipython Notebook" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "#Getting started with Python" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 1, 32 | "metadata": { 33 | "collapsed": false 34 | }, 35 | "outputs": [ 36 | { 37 | "name": "stdout", 38 | "output_type": "stream", 39 | "text": [ 40 | "Hello World!\n" 41 | ] 42 | } 43 | ], 44 | "source": [ 45 | "print 'Hello World!'" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "##Create some variables in Python" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 2, 58 | "metadata": { 59 | "collapsed": true 60 | }, 61 | "outputs": [], 62 | "source": [ 63 | "i = 4 #int" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 3, 69 | "metadata": { 70 | "collapsed": false 71 | }, 72 | "outputs": [ 73 | { 74 | "data": { 75 | "text/plain": [ 76 | "int" 77 | ] 78 | }, 79 | "execution_count": 3, 80 | "metadata": {}, 81 | "output_type": "execute_result" 82 | } 83 | ], 84 | "source": [ 85 | "type(i)" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 4, 91 | "metadata": { 92 | "collapsed": true 93 | }, 94 | "outputs": [], 95 | "source": [ 96 | "f = 4.1 #float" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 5, 102 | "metadata": { 103 | "collapsed": false 104 | }, 105 | "outputs": [ 106 | { 107 | "data": { 108 | "text/plain": [ 109 | "float" 110 | ] 111 | }, 112 | "execution_count": 5, 113 | "metadata": {}, 114 | "output_type": "execute_result" 115 | } 116 | ], 117 | "source": [ 118 | "type(f)" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 6, 124 | "metadata": { 125 | "collapsed": true 126 | }, 127 | "outputs": [], 128 | "source": [ 129 | "b = True #boolean variable" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 7, 135 | "metadata": { 136 | "collapsed": true 137 | }, 138 | "outputs": [], 139 | "source": [ 140 | "s = \"This is a string!\"" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 8, 146 | "metadata": { 147 | "collapsed": false 148 | }, 149 | "outputs": [ 150 | { 151 | "name": "stdout", 152 | "output_type": "stream", 153 | "text": [ 154 | 
"This is a string!\n" 155 | ] 156 | } 157 | ], 158 | "source": [ 159 | "print s" 160 | ] 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "metadata": {}, 165 | "source": [ 166 | "##Advanced python types" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": 9, 172 | "metadata": { 173 | "collapsed": true 174 | }, 175 | "outputs": [], 176 | "source": [ 177 | "l = [3,1,2] #list" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": 10, 183 | "metadata": { 184 | "collapsed": false 185 | }, 186 | "outputs": [ 187 | { 188 | "name": "stdout", 189 | "output_type": "stream", 190 | "text": [ 191 | "[3, 1, 2]\n" 192 | ] 193 | } 194 | ], 195 | "source": [ 196 | "print l" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": 11, 202 | "metadata": { 203 | "collapsed": true 204 | }, 205 | "outputs": [], 206 | "source": [ 207 | "d = {'foo':1, 'bar':2.3, 's':'my first dictionary'} #dictionary" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 12, 213 | "metadata": { 214 | "collapsed": false 215 | }, 216 | "outputs": [ 217 | { 218 | "name": "stdout", 219 | "output_type": "stream", 220 | "text": [ 221 | "{'s': 'my first dictionary', 'foo': 1, 'bar': 2.3}\n" 222 | ] 223 | } 224 | ], 225 | "source": [ 226 | "print d" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": 13, 232 | "metadata": { 233 | "collapsed": false 234 | }, 235 | "outputs": [ 236 | { 237 | "name": "stdout", 238 | "output_type": "stream", 239 | "text": [ 240 | "1\n" 241 | ] 242 | } 243 | ], 244 | "source": [ 245 | "print d['foo'] #element of a dictionary" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": 14, 251 | "metadata": { 252 | "collapsed": false 253 | }, 254 | "outputs": [], 255 | "source": [ 256 | "n = None #Python's null type" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": 15, 262 | "metadata": { 263 | "collapsed": false 264 | }, 265 | "outputs": [ 266 | { 267 | "data": { 268 | "text/plain": [ 269 | "NoneType" 270 | ] 271 | }, 272 | "execution_count": 15, 273 | "metadata": {}, 274 | "output_type": "execute_result" 275 | } 276 | ], 277 | "source": [ 278 | "type(n)" 279 | ] 280 | }, 281 | { 282 | "cell_type": "markdown", 283 | "metadata": {}, 284 | "source": [ 285 | "##Advanced printing" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 16, 291 | "metadata": { 292 | "collapsed": false 293 | }, 294 | "outputs": [ 295 | { 296 | "name": "stdout", 297 | "output_type": "stream", 298 | "text": [ 299 | "Our float value is 4.1. Our int value is 4.\n" 300 | ] 301 | } 302 | ], 303 | "source": [ 304 | "print \"Our float value is %s. 
Our int value is %s.\" % (f,i) #Python is pretty good with strings" 305 | ] 306 | }, 307 | { 308 | "cell_type": "markdown", 309 | "metadata": {}, 310 | "source": [ 311 | "##Conditional statements in python" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": 17, 317 | "metadata": { 318 | "collapsed": false 319 | }, 320 | "outputs": [ 321 | { 322 | "name": "stdout", 323 | "output_type": "stream", 324 | "text": [ 325 | "i or f are both greater than 4.\n" 326 | ] 327 | } 328 | ], 329 | "source": [ 330 | "if i == 1 and f > 4:\n", 331 | " print \"The value of i is 1 and f is greater than 4.\"\n", 332 | "elif i > 4 or f > 4:\n", 333 | " print \"i or f are both greater than 4.\"\n", 334 | "else:\n", 335 | " print \"both i and f are less than or equal to 4\"\n" 336 | ] 337 | }, 338 | { 339 | "cell_type": "markdown", 340 | "metadata": {}, 341 | "source": [ 342 | "##Conditional loops" 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": 18, 348 | "metadata": { 349 | "collapsed": false 350 | }, 351 | "outputs": [ 352 | { 353 | "name": "stdout", 354 | "output_type": "stream", 355 | "text": [ 356 | "[3, 1, 2]\n" 357 | ] 358 | } 359 | ], 360 | "source": [ 361 | "print l" 362 | ] 363 | }, 364 | { 365 | "cell_type": "code", 366 | "execution_count": 19, 367 | "metadata": { 368 | "collapsed": false 369 | }, 370 | "outputs": [ 371 | { 372 | "name": "stdout", 373 | "output_type": "stream", 374 | "text": [ 375 | "3\n", 376 | "1\n", 377 | "2\n" 378 | ] 379 | } 380 | ], 381 | "source": [ 382 | "for e in l:\n", 383 | " print e" 384 | ] 385 | }, 386 | { 387 | "cell_type": "markdown", 388 | "metadata": {}, 389 | "source": [ 390 | "Note that in Python, we don't use {} or other markers to indicate the part of the loop that gets iterated. Instead, we just indent and align each of the iterated statements with spaces or tabs. (You can use as many as you want, as long as the lines are aligned.)" 391 | ] 392 | }, 393 | { 394 | "cell_type": "code", 395 | "execution_count": 20, 396 | "metadata": { 397 | "collapsed": false 398 | }, 399 | "outputs": [ 400 | { 401 | "name": "stdout", 402 | "output_type": "stream", 403 | "text": [ 404 | "6\n", 405 | "7\n", 406 | "8\n", 407 | "9\n" 408 | ] 409 | } 410 | ], 411 | "source": [ 412 | "counter = 6\n", 413 | "while counter < 10:\n", 414 | " print counter\n", 415 | " counter += 1" 416 | ] 417 | }, 418 | { 419 | "cell_type": "markdown", 420 | "metadata": { 421 | "collapsed": true 422 | }, 423 | "source": [ 424 | "#Creating functions in Python\n", 425 | "\n", 426 | "Again, we don't use {}, but just indent the lines that are part of the function." 
427 | ] 428 | }, 429 | { 430 | "cell_type": "code", 431 | "execution_count": 21, 432 | "metadata": { 433 | "collapsed": true 434 | }, 435 | "outputs": [], 436 | "source": [ 437 | "def add2(x):\n", 438 | " y = x + 2\n", 439 | " return y" 440 | ] 441 | }, 442 | { 443 | "cell_type": "code", 444 | "execution_count": 22, 445 | "metadata": { 446 | "collapsed": true 447 | }, 448 | "outputs": [], 449 | "source": [ 450 | "i = 5" 451 | ] 452 | }, 453 | { 454 | "cell_type": "code", 455 | "execution_count": 23, 456 | "metadata": { 457 | "collapsed": false 458 | }, 459 | "outputs": [ 460 | { 461 | "data": { 462 | "text/plain": [ 463 | "7" 464 | ] 465 | }, 466 | "execution_count": 23, 467 | "metadata": {}, 468 | "output_type": "execute_result" 469 | } 470 | ], 471 | "source": [ 472 | "add2(i)" 473 | ] 474 | }, 475 | { 476 | "cell_type": "markdown", 477 | "metadata": {}, 478 | "source": [ 479 | "We can also define simple functions with lambdas:" 480 | ] 481 | }, 482 | { 483 | "cell_type": "code", 484 | "execution_count": 24, 485 | "metadata": { 486 | "collapsed": true 487 | }, 488 | "outputs": [], 489 | "source": [ 490 | "square = lambda x: x*x" 491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": null, 496 | "metadata": { 497 | "collapsed": true 498 | }, 499 | "outputs": [], 500 | "source": [] 501 | } 502 | ], 503 | "metadata": { 504 | "kernelspec": { 505 | "display_name": "Python 2", 506 | "language": "python", 507 | "name": "python2" 508 | }, 509 | "language_info": { 510 | "codemirror_mode": { 511 | "name": "ipython", 512 | "version": 2 513 | }, 514 | "file_extension": ".py", 515 | "mimetype": "text/x-python", 516 | "name": "python", 517 | "nbconvert_exporter": "python", 518 | "pygments_lexer": "ipython2", 519 | "version": "2.7.10" 520 | } 521 | }, 522 | "nbformat": 4, 523 | "nbformat_minor": 0 524 | } 525 | -------------------------------------------------------------------------------- /pyspark/notebooks/spark_essentials.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# RDD overview\n", 8 | "- Programmer specifies number of partitions\n", 9 | "- Driver passes each partition to corresponding Workers\n", 10 | "- Master parameter specifies number of workers.\n", 11 | "- Spark automatically pushes closures to workers.\n", 12 | "\n", 13 | "# Some transformations\n", 14 | "- map(func): return a new distributed dataset formed by passing each element of the source through a function func.\n", 15 | "- filter(func): return a new dataset formed by selecting those elements of the source on which func returns true.\n", 16 | "- distinct([numTasks]): return a new dataset that contains the distinct elements of the source dataset.\n", 17 | "- flatMap(func): similar to map, but each input item can be mapped to 0 or more output items (so func should return a Seq rather than a single item)." 
18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 1, 23 | "metadata": {}, 24 | "outputs": [ 25 | { 26 | "data": { 27 | "text/plain": [ 28 | "[2, 4, 6, 8]" 29 | ] 30 | }, 31 | "execution_count": 1, 32 | "metadata": {}, 33 | "output_type": "execute_result" 34 | } 35 | ], 36 | "source": [ 37 | "rdd = sc.parallelize([1, 2, 3, 4])\n", 38 | "rdd.map(lambda x: x * 2).collect()" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 2, 44 | "metadata": {}, 45 | "outputs": [ 46 | { 47 | "data": { 48 | "text/plain": [ 49 | "[2, 4]" 50 | ] 51 | }, 52 | "execution_count": 2, 53 | "metadata": {}, 54 | "output_type": "execute_result" 55 | } 56 | ], 57 | "source": [ 58 | "rdd.filter(lambda x: x % 2 == 0).collect()" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 3, 64 | "metadata": {}, 65 | "outputs": [ 66 | { 67 | "data": { 68 | "text/plain": [ 69 | "[4, 1, 2, 3]" 70 | ] 71 | }, 72 | "execution_count": 3, 73 | "metadata": {}, 74 | "output_type": "execute_result" 75 | } 76 | ], 77 | "source": [ 78 | "rdd = sc.parallelize([1, 4, 2, 2, 3])\n", 79 | "rdd.distinct().collect()" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 4, 85 | "metadata": {}, 86 | "outputs": [ 87 | { 88 | "data": { 89 | "text/plain": [ 90 | "[[1, 6], [2, 7], [3, 8]]" 91 | ] 92 | }, 93 | "execution_count": 4, 94 | "metadata": {}, 95 | "output_type": "execute_result" 96 | } 97 | ], 98 | "source": [ 99 | "rdd = sc.parallelize([1, 2, 3])\n", 100 | "rdd.map(lambda x: [x, x + 5]).collect()" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 5, 106 | "metadata": {}, 107 | "outputs": [ 108 | { 109 | "data": { 110 | "text/plain": [ 111 | "[1, 6, 2, 7, 3, 8]" 112 | ] 113 | }, 114 | "execution_count": 5, 115 | "metadata": {}, 116 | "output_type": "execute_result" 117 | } 118 | ], 119 | "source": [ 120 | "rdd.flatMap(lambda x: [x, x + 5]).collect()" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "# Some actions\n", 128 | "- reduce(func): aggregate dataset's elements using function func, func takes two arguments and returns one, and is commutative and associative so that it can be computed correctly in parallel.\n", 129 | "- take(n): return an array with the list n elements.\n", 130 | "- collect(): return all the elements as an array. WARNING: make sure will fit in driver program.\n", 131 | "- takeOrdered(n, key=func): return n elements ordred in ascending order or as specified by the optional key function." 
132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 6, 137 | "metadata": {}, 138 | "outputs": [ 139 | { 140 | "data": { 141 | "text/plain": [ 142 | "6" 143 | ] 144 | }, 145 | "execution_count": 6, 146 | "metadata": {}, 147 | "output_type": "execute_result" 148 | } 149 | ], 150 | "source": [ 151 | "rdd = sc.parallelize([1, 2, 3])\n", 152 | "rdd.reduce(lambda a, b: a * b)" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 7, 158 | "metadata": {}, 159 | "outputs": [ 160 | { 161 | "data": { 162 | "text/plain": [ 163 | "[1, 2]" 164 | ] 165 | }, 166 | "execution_count": 7, 167 | "metadata": {}, 168 | "output_type": "execute_result" 169 | } 170 | ], 171 | "source": [ 172 | "rdd.take(2)" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": 8, 178 | "metadata": {}, 179 | "outputs": [ 180 | { 181 | "data": { 182 | "text/plain": [ 183 | "[1, 2, 3]" 184 | ] 185 | }, 186 | "execution_count": 8, 187 | "metadata": {}, 188 | "output_type": "execute_result" 189 | } 190 | ], 191 | "source": [ 192 | "rdd.collect()" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": 9, 198 | "metadata": {}, 199 | "outputs": [ 200 | { 201 | "data": { 202 | "text/plain": [ 203 | "[5, 3, 2]" 204 | ] 205 | }, 206 | "execution_count": 9, 207 | "metadata": {}, 208 | "output_type": "execute_result" 209 | } 210 | ], 211 | "source": [ 212 | "rdd = sc.parallelize([5, 3, 1, 2])\n", 213 | "rdd.takeOrdered(3, lambda s: -1 * s)" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": 10, 219 | "metadata": {}, 220 | "outputs": [ 221 | { 222 | "name": "stdout", 223 | "output_type": "stream", 224 | "text": [ 225 | "5 5\n" 226 | ] 227 | } 228 | ], 229 | "source": [ 230 | "lines = sc.textFile(\"sample_text.txt\", 4)\n", 231 | "lines.cache()\n", 232 | "print lines.count(), lines.count()" 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "metadata": {}, 238 | "source": [ 239 | "# Key-Value RDDs\n", 240 | "- Similar to MapReduce, Spark supports key-value pairs.\n", 241 | "- Each element of a pair RDD is a (key, value) tuple.\n", 242 | "## Some Key-Value transformations\n", 243 | "- reduceByKey(func): return a new distributed dataset of (K, V) pairs where the values for each key are aggregated using the given reduce function func, which must be of type (V, V) -> V.\n", 244 | "- sortByKey(): return a new dataset of (K, V) pairs sorted by keys in ascending order.\n", 245 | "- groupByKey(): return a new dataset of (K, Iterable) pairs." 
246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": 11, 251 | "metadata": {}, 252 | "outputs": [ 253 | { 254 | "data": { 255 | "text/plain": [ 256 | "[(1, 2), (3, 4)]" 257 | ] 258 | }, 259 | "execution_count": 11, 260 | "metadata": {}, 261 | "output_type": "execute_result" 262 | } 263 | ], 264 | "source": [ 265 | "rdd = sc.parallelize([(1, 2), (3, 4)])\n", 266 | "rdd.collect()" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": 12, 272 | "metadata": {}, 273 | "outputs": [ 274 | { 275 | "data": { 276 | "text/plain": [ 277 | "[(1, 2), (3, 10)]" 278 | ] 279 | }, 280 | "execution_count": 12, 281 | "metadata": {}, 282 | "output_type": "execute_result" 283 | } 284 | ], 285 | "source": [ 286 | "rdd = sc.parallelize([(1, 2), (3, 4), (3, 6)])\n", 287 | "rdd.reduceByKey(lambda a, b: a + b).collect()" 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": 13, 293 | "metadata": {}, 294 | "outputs": [ 295 | { 296 | "data": { 297 | "text/plain": [ 298 | "[(1, 'a'), (1, 'b'), (2, 'c')]" 299 | ] 300 | }, 301 | "execution_count": 13, 302 | "metadata": {}, 303 | "output_type": "execute_result" 304 | } 305 | ], 306 | "source": [ 307 | "rdd = sc.parallelize([(1, \"a\"), (2, \"c\"), (1, \"b\")])\n", 308 | "rdd.sortByKey().collect()" 309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": 14, 314 | "metadata": {}, 315 | "outputs": [ 316 | { 317 | "data": { 318 | "text/plain": [ 319 | "[(1, ),\n", 320 | " (2, )]" 321 | ] 322 | }, 323 | "execution_count": 14, 324 | "metadata": {}, 325 | "output_type": "execute_result" 326 | } 327 | ], 328 | "source": [ 329 | "rdd.groupByKey().collect()" 330 | ] 331 | }, 332 | { 333 | "cell_type": "markdown", 334 | "metadata": {}, 335 | "source": [ 336 | "# Broadcast variables\n", 337 | "- Keep read-only variable cached on workers, ship to each worker only once instead of with each task" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": 15, 343 | "metadata": {}, 344 | "outputs": [ 345 | { 346 | "data": { 347 | "text/plain": [ 348 | "[1, 2, 3]" 349 | ] 350 | }, 351 | "execution_count": 15, 352 | "metadata": {}, 353 | "output_type": "execute_result" 354 | } 355 | ], 356 | "source": [ 357 | "# at the driver:\n", 358 | "bcVar = sc.broadcast([1, 2, 3])\n", 359 | "\n", 360 | "# at the worker (in code passed via a closure)\n", 361 | "bcVar.value" 362 | ] 363 | }, 364 | { 365 | "cell_type": "markdown", 366 | "metadata": {}, 367 | "source": [ 368 | "# Accumulators\n", 369 | "- Variables that can only be \"added\" to by associative op\n", 370 | "- Used to efficiently implement parallel counters and sums\n", 371 | "- Only driver can read an accumulator's value, not tasks\n", 372 | "- Tasks at workers cannot access accumulator's values\n", 373 | "- Tasks see accumulators as write-only variables\n", 374 | "- Actions: each task's update to accumulator is applied only once\n", 375 | "- Transformations: no guarantees (use only for debugging)\n", 376 | "- Types: integers, double, long, float" 377 | ] 378 | }, 379 | { 380 | "cell_type": "code", 381 | "execution_count": 16, 382 | "metadata": {}, 383 | "outputs": [ 384 | { 385 | "data": { 386 | "text/plain": [ 387 | "10" 388 | ] 389 | }, 390 | "execution_count": 16, 391 | "metadata": {}, 392 | "output_type": "execute_result" 393 | } 394 | ], 395 | "source": [ 396 | "accum = sc.accumulator(0)\n", 397 | "rdd = sc.parallelize([1, 2, 3, 4])\n", 398 | "def f(x):\n", 399 | " global accum\n", 400 | " accum += x\n", 401 | " \n", 402 | 
"rdd.foreach(f)\n", 403 | "accum.value" 404 | ] 405 | } 406 | ], 407 | "metadata": { 408 | "kernelspec": { 409 | "display_name": "Python 2", 410 | "language": "python", 411 | "name": "python2" 412 | }, 413 | "language_info": { 414 | "codemirror_mode": { 415 | "name": "ipython", 416 | "version": 2 417 | }, 418 | "file_extension": ".py", 419 | "mimetype": "text/x-python", 420 | "name": "python", 421 | "nbconvert_exporter": "python", 422 | "pygments_lexer": "ipython2", 423 | "version": "2.7.10" 424 | }, 425 | "name": "04_spark_essentials", 426 | "notebookId": 1227613790179004 427 | }, 428 | "nbformat": 4, 429 | "nbformat_minor": 1 430 | } 431 | -------------------------------------------------------------------------------- /word2vec/data/sample_tokenize.ann: -------------------------------------------------------------------------------- 1 | T1 B_W 0 3 Con 2 | T2 B_W 4 7 phà 3 | T3 B_W 8 10 bị 4 | T4 B_W 11 14 lật 5 | T5 B_W 15 22 nghiêng 6 | T6 B_W 23 27 sáng 7 | T7 B_W 28 32 16.4 8 | T8 B_W 33 36 khi 9 | T9 B_W 37 39 ca 10 | T10 I_W 40 42 nô 11 | T11 B_W 43 46 của 12 | T12 B_W 47 51 Cảnh 13 | T13 I_W 52 55 sát 14 | T14 B_W 56 60 biển 15 | T15 B_W 61 64 đến 16 | T16 I_W 65 68 nơi 17 | T17 B_W 69 73 trên 18 | T18 B_W 74 79 boong 19 | T19 B_W 80 83 tàu 20 | T20 B_W 84 89 không 21 | T21 B_W 90 92 có 22 | T22 B_W 93 97 hành 23 | T23 I_W 98 103 khách 24 | T24 B_W 104 107 nào 25 | T25 B_W 108 110 vì 26 | T26 B_W 111 115 hành 27 | T27 I_W 116 121 khách 28 | T28 B_W 122 127 không 29 | T29 I_W 128 132 được 30 | T30 B_W 133 138 thông 31 | T31 I_W 139 142 báo 32 | T32 B_W 143 146 lên 33 | T33 B_W 147 150 khu 34 | T34 I_W 151 154 vực 35 | T35 B_W 155 158 này 36 | T36 B_W 159 161 và 37 | T37 B_W 162 166 được 38 | T38 B_W 167 170 yêu 39 | T39 I_W 171 174 cầu 40 | T40 B_W 175 179 ngồi 41 | T41 I_W 180 183 yên 42 | T42 B_W 184 185 - 43 | T43 B_W 186 189 Ảnh 44 | T44 B_W 190 191 : 45 | T45 B_W 192 196 Cảnh 46 | T46 I_W 197 200 sát 47 | T47 B_W 201 205 biển 48 | T48 B_W 206 209 Hàn 49 | T49 I_W 210 214 Quốc 50 | T50 B_W 215 216 / 51 | T51 B_W 217 223 Yonhap 52 | T52 B_W 224 228 Việc 53 | T53 B_W 229 235 thường 54 | T54 I_W 236 241 xuyên 55 | T55 B_W 242 245 chở 56 | T56 B_W 246 250 hàng 57 | T57 I_W 251 254 hóa 58 | T58 B_W 255 258 quá 59 | T59 I_W 259 262 tải 60 | T60 B_W 263 267 trên 61 | T61 B_W 268 273 chiếc 62 | T62 B_W 274 277 phà 63 | T63 B_W 278 282 dùng 64 | T64 B_W 283 286 chở 65 | T65 B_W 287 292 khách 66 | T66 B_W 293 296 cho 67 | T67 I_W 297 301 thấy 68 | T68 B_W 302 304 lỗ 69 | T69 I_W 305 309 hổng 70 | T70 B_W 310 315 trong 71 | T71 B_W 316 320 việc 72 | T72 B_W 321 325 quản 73 | T73 I_W 326 328 lý 74 | T74 B_W 329 332 tàu 75 | T75 B_W 333 336 phà 76 | T76 B_W 337 340 lẫn 77 | T77 B_W 341 345 kiểu 78 | T78 B_W 346 350 kinh 79 | T79 I_W 351 356 doanh 80 | T80 B_W 357 360 bất 81 | T81 I_W 361 365 chấp 82 | T82 B_W 366 369 hậu 83 | T83 I_W 370 374 quả 84 | 85 | T84 B_W 375 377 Cơ 86 | T85 I_W 378 382 quan 87 | T86 B_W 383 387 đăng 88 | T87 I_W 388 392 kiểm 89 | T88 B_W 393 396 tàu 90 | T89 B_W 397 400 Hàn 91 | T90 I_W 401 405 Quốc 92 | T91 B_W 406 409 đầu 93 | T92 I_W 410 413 năm 94 | T93 B_W 414 418 2013 95 | T94 B_W 419 422 xem 96 | T95 I_W 423 426 xét 97 | T96 B_W 427 430 phà 98 | T97 B_W 431 436 Sewol 99 | T98 B_W 437 440 khi 100 | T99 B_W 441 444 phà 101 | T100 B_W 445 449 đăng 102 | T101 I_W 450 452 ký 103 | T102 B_W 453 456 cải 104 | T103 I_W 457 461 tiến 105 | T104 B_W 462 464 để 106 | T105 B_W 465 468 chở 107 | T106 B_W 469 473 thêm 108 | T107 B_W 474 479 nhiều 109 | T108 B_W 480 486 
khách 110 | 111 | T109 B_W 487 489 Cơ 112 | T110 I_W 490 494 quan 113 | T111 B_W 495 498 này 114 | T112 B_W 499 502 cho 115 | T113 B_W 503 506 phà 116 | T114 B_W 507 511 được 117 | T115 B_W 512 515 chở 118 | T116 B_W 516 520 thêm 119 | T117 B_W 521 525 hàng 120 | T118 I_W 526 529 hóa 121 | T119 B_W 530 533 tối 122 | T120 I_W 534 536 đa 123 | T121 B_W 537 540 987 124 | T122 B_W 541 544 tấn 125 | T123 B_W 545 546 ( 126 | T124 B_W 547 551 tăng 127 | T125 B_W 552 554 50 128 | T126 B_W 555 556 % 129 | T127 B_W 557 558 ) 130 | T128 B_W 559 562 với 131 | T129 B_W 563 567 điều 132 | T130 I_W 568 572 kiện 133 | T131 B_W 573 576 dằn 134 | T132 B_W 577 581 thêm 135 | T133 B_W 582 586 dưới 136 | T134 B_W 587 593 khoang 137 | T135 B_W 594 599 2.000 138 | T136 B_W 600 603 tấn 139 | T137 B_W 604 608 nước 140 | T138 B_W 609 611 để 141 | T139 B_W 612 615 cân 142 | T140 I_W 616 621 bằng 143 | 144 | T141 B_W 622 625 Tuy 145 | T142 I_W 626 631 nhiên 146 | T143 B_W 632 638 khuyến 147 | T144 I_W 639 642 cáo 148 | T145 B_W 643 646 này 149 | T146 B_W 647 650 chỉ 150 | T147 B_W 651 654 gửi 151 | T148 B_W 655 658 đến 152 | T149 B_W 659 663 công 153 | T150 I_W 664 666 ty 154 | T151 B_W 667 671 quản 155 | T152 I_W 672 674 lý 156 | T153 B_W 675 678 phà 157 | T154 B_W 679 681 mà 158 | T155 B_W 682 687 không 159 | T156 B_W 688 692 được 160 | T157 B_W 693 696 gửi 161 | T158 B_W 697 700 cho 162 | T159 B_W 701 705 Cảnh 163 | T160 I_W 706 709 sát 164 | T161 B_W 710 714 biển 165 | T162 B_W 715 718 lẫn 166 | T163 B_W 719 723 Hiệp 167 | T164 I_W 724 727 hội 168 | T165 B_W 728 731 tàu 169 | T166 I_W 732 736 biển 170 | T167 B_W 737 740 Hàn 171 | T168 I_W 741 746 Quốc 172 | 173 | T169 B_W 747 750 Phà 174 | T170 B_W 751 754 này 175 | T171 B_W 755 758 sau 176 | T172 I_W 759 761 đó 177 | T173 B_W 762 766 liên 178 | T174 I_W 767 770 tục 179 | T175 B_W 771 774 chở 180 | T176 B_W 775 779 hàng 181 | T177 I_W 780 783 hóa 182 | T178 B_W 784 788 vượt 183 | T179 B_W 789 792 tải 184 | T180 I_W 793 798 trọng 185 | T181 B_W 799 802 cho 186 | T182 I_W 803 807 phép 187 | T183 B_W 808 811 987 188 | T184 B_W 812 815 tấn 189 | T185 B_W 816 819 như 190 | T186 B_W 820 823 chở 191 | T187 B_W 824 827 hơn 192 | T188 B_W 828 833 2.000 193 | T189 B_W 834 837 tấn 194 | T190 B_W 838 842 hàng 195 | T191 B_W 843 846 qua 196 | T192 B_W 847 850 136 197 | T193 B_W 851 857 chuyến 198 | T194 B_W 858 860 và 199 | T195 B_W 861 865 trên 200 | T196 B_W 866 871 3.000 201 | T197 B_W 872 875 tấn 202 | T198 B_W 876 879 qua 203 | T199 B_W 880 882 12 204 | T200 B_W 883 887 lần 205 | 206 | T201 B_W 888 892 Tổng 207 | T202 I_W 893 897 cộng 208 | T203 B_W 898 901 đến 209 | T204 B_W 902 908 chuyến 210 | T205 B_W 909 913 cuối 211 | T206 I_W 914 918 cùng 212 | T207 B_W 919 923 ngày 213 | T208 B_W 924 933 16.4.2014 214 | T209 B_W 934 937 phà 215 | T210 B_W 938 941 này 216 | T211 B_W 942 945 chở 217 | T212 B_W 946 950 hàng 218 | T213 B_W 951 954 quá 219 | T214 I_W 955 958 tải 220 | T215 B_W 959 962 đến 221 | T216 B_W 963 966 246 222 | T217 B_W 967 971 lần 223 | 224 | T218 B_W 972 974 Và 225 | T219 B_W 975 981 chuyến 226 | T220 B_W 982 986 cuối 227 | T221 I_W 987 991 cùng 228 | T222 B_W 992 995 phà 229 | T223 B_W 996 999 chở 230 | T224 B_W 1000 1005 lượng 231 | T225 B_W 1006 1010 hàng 232 | T226 I_W 1011 1014 hóa 233 | T227 B_W 1015 1020 khủng 234 | T228 B_W 1021 1024 đến 235 | T229 B_W 1025 1030 3.608 236 | T230 B_W 1031 1034 tấn 237 | T231 B_W 1035 1039 cùng 238 | T232 B_W 1040 1043 476 239 | T233 B_W 1044 1049 người 240 | T234 B_W 1050 1053 kết 241 | T235 I_W 1054 1057 quả 242 | 
T236 B_W 1058 1060 là 243 | T237 B_W 1061 1064 phà 244 | T238 B_W 1065 1068 lật 245 | T239 B_W 1069 1076 nghiêng 246 | T240 B_W 1077 1082 ngoài 247 | T241 I_W 1083 1087 khơi 248 | T242 B_W 1088 1091 đảo 249 | T243 B_W 1092 1097 Jindo 250 | T244 B_W 1098 1102 sáng 251 | T245 B_W 1103 1112 16.4.2014 252 | T246 B_W 1113 1116 làm 253 | T247 B_W 1117 1120 hơn 254 | T248 B_W 1121 1124 300 255 | T249 B_W 1125 1129 hành 256 | T250 I_W 1130 1135 khách 257 | T251 B_W 1136 1141 thiệt 258 | T252 I_W 1142 1147 mạng 259 | 260 | T253 B_W 1148 1154 Thuyền 261 | T254 I_W 1155 1161 trưởng 262 | T255 B_W 1162 1165 Lee 263 | T256 B_W 1166 1170 Joon 264 | T257 B_W 1171 1172 - 265 | T258 B_W 1173 1177 seok 266 | T259 I_W 1178 1182 khai 267 | T260 B_W 1183 1187 hàng 268 | T261 I_W 1188 1191 hóa 269 | T262 B_W 1192 1195 chở 270 | T263 B_W 1196 1200 trên 271 | T264 B_W 1201 1204 phà 272 | T265 B_W 1205 1207 ít 273 | T266 I_W 1208 1211 hơn 274 | T267 B_W 1212 1215 các 275 | T268 B_W 1216 1219 con 276 | T269 I_W 1220 1222 số 277 | T270 B_W 1223 1226 của 278 | T271 B_W 1227 1230 báo 279 | T272 I_W 1231 1234 cáo 280 | T273 B_W 1235 1239 trên 281 | T274 B_W 1240 1244 rằng 282 | T275 B_W 1245 1248 khi 283 | T276 B_W 1249 1253 chìm 284 | T277 B_W 1254 1257 phà 285 | T278 B_W 1258 1260 có 286 | T279 B_W 1261 1264 chở 287 | T280 B_W 1265 1268 657 288 | T281 B_W 1269 1272 tấn 289 | T282 B_W 1273 1277 hàng 290 | T283 B_W 1278 1282 cùng 291 | T284 B_W 1283 1286 150 292 | T285 B_W 1287 1288 ô 293 | T286 I_W 1289 1292 tô 294 | 295 | T287 B_W 1293 1296 Tuy 296 | T288 I_W 1297 1302 nhiên 297 | T289 B_W 1303 1307 Cảnh 298 | T290 I_W 1308 1311 sát 299 | T291 B_W 1312 1316 biển 300 | T292 B_W 1317 1320 tìm 301 | T293 B_W 1321 1325 thấy 302 | T294 B_W 1326 1329 đến 303 | T295 B_W 1330 1333 180 304 | T296 B_W 1334 1335 ô 305 | T297 I_W 1336 1338 tô 306 | T298 B_W 1339 1344 trong 307 | T299 I_W 1345 1349 lòng 308 | T300 B_W 1350 1353 phà 309 | T301 B_W 1354 1358 dưới 310 | T302 B_W 1359 1363 biển 311 | T303 B_W 1364 1366 ! 
312 | 313 | T304 B_W 1367 1370 Các 314 | T305 B_W 1371 1377 chuyên 315 | T306 I_W 1378 1381 gia 316 | T307 B_W 1382 1385 tin 317 | T308 B_W 1386 1390 rằng 318 | T309 B_W 1391 1394 khi 319 | T310 B_W 1395 1398 chở 320 | T311 B_W 1399 1402 quá 321 | T312 I_W 1403 1406 tải 322 | T313 B_W 1407 1410 chỉ 323 | T314 B_W 1411 1414 cần 324 | T315 B_W 1415 1418 đảo 325 | T316 B_W 1419 1424 hướng 326 | T317 B_W 1425 1428 một 327 | T318 I_W 1429 1433 chút 328 | T319 B_W 1434 1438 cũng 329 | T320 B_W 1439 1441 có 330 | T321 I_W 1442 1445 thể 331 | T322 B_W 1446 1449 làm 332 | T323 B_W 1450 1453 phà 333 | T324 B_W 1454 1456 bị 334 | T325 B_W 1457 1460 lật 335 | T326 B_W 1461 1463 vì 336 | T327 B_W 1464 1467 mất 337 | T328 B_W 1468 1471 cân 338 | T329 I_W 1472 1477 bằng 339 | 340 | T330 B_W 1478 1480 Và 341 | T331 B_W 1481 1484 các 342 | T332 B_W 1485 1487 dữ 343 | T333 I_W 1488 1492 liệu 344 | T334 B_W 1493 1497 hành 345 | T335 I_W 1498 1503 trình 346 | T336 B_W 1504 1507 cho 347 | T337 I_W 1508 1512 thấy 348 | T338 B_W 1513 1516 con 349 | T339 B_W 1517 1520 phà 350 | T340 B_W 1521 1523 đã 351 | T341 B_W 1524 1528 quẹo 352 | T342 B_W 1529 1532 một 353 | T343 B_W 1533 1536 góc 354 | T344 B_W 1537 1539 45 355 | T345 B_W 1540 1542 độ 356 | T346 B_W 1543 1547 ngay 357 | T347 I_W 1548 1551 khi 358 | T348 B_W 1552 1557 chìm 359 | 360 | T349 B_W 1558 1561 Lối 361 | T350 I_W 1562 1565 vào 362 | T351 B_W 1566 1570 cảng 363 | T352 B_W 1571 1575 trên 364 | T353 B_W 1576 1579 đảo 365 | T354 B_W 1580 1585 Jindo 366 | T355 B_W 1586 1589 đầy 367 | T356 B_W 1590 1594 vòng 368 | T357 I_W 1595 1598 hoa 369 | T358 B_W 1599 1603 tang 370 | T359 B_W 1604 1609 tưởng 371 | T360 I_W 1610 1613 nhớ 372 | T361 B_W 1614 1617 các 373 | T362 B_W 1618 1621 nạn 374 | T363 I_W 1622 1626 nhân 375 | T364 B_W 1627 1629 vụ 376 | T365 B_W 1630 1634 chìm 377 | T366 B_W 1635 1638 phà 378 | T367 I_W 1639 1644 Sewol 379 | T368 B_W 1645 1649 ngày 380 | T369 B_W 1650 1659 28.4.2014 381 | T370 B_W 1660 1661 - 382 | T371 B_W 1662 1665 Ảnh 383 | T372 B_W 1666 1667 : 384 | T373 B_W 1668 1675 Reuters 385 | T374 B_W 1676 1679 Thợ 386 | T375 I_W 1680 1683 lặn 387 | T376 B_W 1684 1688 hiện 388 | T377 B_W 1689 1692 tìm 389 | T378 I_W 1693 1697 kiếm 390 | T379 B_W 1698 1701 gần 391 | T380 I_W 1702 1705 hết 392 | T381 B_W 1706 1709 các 393 | T382 B_W 1710 1715 phòng 394 | T383 B_W 1716 1720 trên 395 | T384 B_W 1721 1724 phà 396 | T385 B_W 1725 1730 Sewol 397 | T386 B_W 1731 1735 chìm 398 | T387 B_W 1736 1740 dưới 399 | T388 B_W 1741 1745 biển 400 | T389 B_W 1746 1747 - 401 | T390 B_W 1748 1751 Ảnh 402 | T391 B_W 1752 1753 : 403 | T392 B_W 1754 1761 Reuters 404 | T393 B_W 1762 1766 Ngày 405 | T394 B_W 1767 1770 4.5 406 | T395 B_W 1771 1775 Tổng 407 | T396 I_W 1776 1781 thống 408 | T397 B_W 1782 1785 Hàn 409 | T398 I_W 1786 1790 Quốc 410 | T399 I_W 1791 1795 Park 411 | T400 B_W 1796 1800 Geun 412 | T401 B_W 1801 1802 - 413 | T402 B_W 1803 1806 hye 414 | T403 B_W 1807 1809 có 415 | T404 B_W 1810 1816 chuyến 416 | T405 B_W 1817 1821 thăm 417 | T406 B_W 1822 1825 lần 418 | T407 B_W 1826 1827 2 419 | T408 B_W 1828 1831 các 420 | T409 B_W 1832 1835 gia 421 | T410 I_W 1836 1840 đình 422 | T411 B_W 1841 1844 nạn 423 | T412 I_W 1845 1849 nhân 424 | T413 B_W 1850 1852 vụ 425 | T414 B_W 1853 1857 chìm 426 | T415 B_W 1858 1861 phà 427 | T416 B_W 1862 1866 đang 428 | T417 B_W 1867 1870 tạm 429 | T418 I_W 1871 1874 trú 430 | T419 B_W 1875 1876 ở 431 | T420 B_W 1877 1880 đảo 432 | T421 I_W 1881 1887 Jindo 433 | 434 | T422 B_W 1888 1890 Bà 435 | T423 B_W 1891 1895 Park 436 
| T424 B_W 1896 1899 nói 437 | T425 B_W 1900 1904 rằng 438 | T426 B_W 1905 1907 bà 439 | T427 B_W 1908 1912 cũng 440 | T428 B_W 1913 1917 từng 441 | T429 B_W 1918 1921 đau 442 | T430 I_W 1922 1925 khổ 443 | T431 B_W 1926 1928 vì 444 | T432 B_W 1929 1932 mất 445 | T433 I_W 1933 1936 mát 446 | T434 B_W 1937 1940 gia 447 | T435 I_W 1941 1945 đình 448 | T436 B_W 1946 1949 nên 449 | T437 B_W 1950 1954 hiểu 450 | T438 I_W 1955 1957 rõ 451 | T439 B_W 1958 1961 tâm 452 | T440 I_W 1962 1967 trạng 453 | T441 B_W 1968 1971 của 454 | T442 B_W 1972 1975 mọi 455 | T443 I_W 1976 1982 người 456 | 457 | T444 B_W 1983 1985 Bà 458 | T445 B_W 1986 1989 hứa 459 | T446 B_W 1990 1992 sẽ 460 | T447 B_W 1993 1998 trừng 461 | T448 I_W 1999 2003 phạt 462 | T449 B_W 2004 2007 các 463 | T450 B_W 2008 2010 cá 464 | T451 I_W 2011 2015 nhân 465 | T452 B_W 2016 2020 liên 466 | T453 I_W 2021 2025 quan 467 | T454 B_W 2026 2028 vụ 468 | T455 B_W 2029 2033 chìm 469 | T456 B_W 2034 2037 phà 470 | T457 B_W 2038 2042 này 471 | 472 | T458 B_W 2043 2047 Tính 473 | T459 B_W 2048 2051 đến 474 | T460 B_W 2052 2056 ngày 475 | T461 B_W 2057 2060 4.5 476 | T462 B_W 2061 2063 đã 477 | T463 B_W 2064 2066 có 478 | T464 B_W 2067 2070 244 479 | T465 B_W 2071 2074 thi 480 | T466 I_W 2075 2078 thể 481 | T467 B_W 2079 2083 được 482 | T468 B_W 2084 2087 tìm 483 | T469 B_W 2088 2092 thấy 484 | T470 B_W 2093 2096 vẫn 485 | T471 B_W 2097 2100 còn 486 | T472 B_W 2101 2103 58 487 | T473 B_W 2104 2109 người 488 | T474 B_W 2110 2113 mất 489 | T475 I_W 2114 2119 tích 490 | 491 | T476 B_W 2120 2122 Số 492 | T477 I_W 2123 2128 người 493 | T478 B_W 2129 2133 được 494 | T479 B_W 2134 2137 cứu 495 | T480 I_W 2138 2142 sống 496 | T481 B_W 2143 2145 là 497 | T482 B_W 2146 2149 174 498 | T483 B_W 2150 2153 gồm 499 | T484 B_W 2154 2159 22/29 500 | T485 B_W 2160 2166 thuyền 501 | T486 I_W 2167 2172 viên 502 | 503 | T487 B_W 2173 2177 Phát 504 | T488 I_W 2178 2182 ngôn 505 | T489 B_W 2183 2187 viên 506 | T490 B_W 2188 2191 Lực 507 | T491 I_W 2192 2197 lượng 508 | T492 B_W 2198 2201 cứu 509 | T493 I_W 2202 2204 hộ 510 | T494 B_W 2205 2208 phà 511 | T495 B_W 2209 2214 Sewol 512 | T496 B_W 2215 2218 ông 513 | T497 B_W 2219 2221 Ko 514 | T498 I_W 2222 2227 Myung 515 | T499 B_W 2228 2229 - 516 | T500 B_W 2230 2234 seok 517 | T501 B_W 2235 2238 cho 518 | T502 I_W 2239 2243 biết 519 | T503 B_W 2244 2247 thợ 520 | T504 I_W 2248 2251 lặn 521 | T505 B_W 2252 2254 đã 522 | T506 B_W 2255 2258 tìm 523 | T507 I_W 2259 2263 kiếm 524 | T508 B_W 2264 2268 được 525 | T509 B_W 2269 2271 60 526 | T510 B_W 2272 2277 trong 527 | T511 B_W 2278 2282 tổng 528 | T512 I_W 2283 2285 số 529 | T513 B_W 2286 2288 64 530 | T514 B_W 2289 2294 phòng 531 | T515 B_W 2295 2298 của 532 | T516 B_W 2299 2302 con 533 | T517 B_W 2303 2306 phà 534 | T518 B_W 2307 2311 dưới 535 | T519 B_W 2312 2316 lòng 536 | T520 I_W 2317 2321 biển 537 | -------------------------------------------------------------------------------- /pyspark/study_apache_spark/rdd_co_ban.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# RDD cơ bản\n", 8 | "- Programmer chỉ định số lượng partitions.\n", 9 | "- Driver tự phân chia partition đến các Workers tương ứng.\n", 10 | "- Master parameter chỉ định số lượng workers cụ thể.\n", 11 | "\n", 12 | "# Các hàm transformations\n", 13 | "- map(func): trả về tập dữ liệu phân tán mới bằng cách ánh xạ từng phần tử tập dữ liệu nguồn qua hàm func do programmer 
định nghĩa.\n", 14 | "- filter(func): trả về tập dữ liệu phân tán mới bằng cách lọc ra các phần tử tập dữ liệu nguồn thoả điều kiện hàm func định nghĩa.\n", 15 | "- distinct(): trả về tập dữ liệu phân tán mới chỉ chứa các phần tử riêng biệt từ tập dữ liệu nguồn.\n", 16 | "- flatMap(func): tương tự như map(), nhưng có thể ánh xạ các phần tử nguồn sang 0 hoặc nhiều phần tử ở tập dữ liệu mới. Hàm func thường trả về kiểu Seg thay vì phần tử đơn lẻ." 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 1, 22 | "metadata": {}, 23 | "outputs": [ 24 | { 25 | "name": "stdout", 26 | "output_type": "stream", 27 | "text": [ 28 | "http://localhost:4040/jobs/\n" 29 | ] 30 | } 31 | ], 32 | "source": [ 33 | "print \"http://localhost:4040/jobs/\"" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 2, 39 | "metadata": {}, 40 | "outputs": [ 41 | { 42 | "data": { 43 | "text/plain": [ 44 | "PythonRDD[1] at RDD at PythonRDD.scala:48" 45 | ] 46 | }, 47 | "execution_count": 2, 48 | "metadata": {}, 49 | "output_type": "execute_result" 50 | } 51 | ], 52 | "source": [ 53 | "rdd = sc.parallelize([1, 2, 3, 4])\n", 54 | "rdd.map(lambda x: x * 2)" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 3, 60 | "metadata": {}, 61 | "outputs": [ 62 | { 63 | "data": { 64 | "text/plain": [ 65 | "PythonRDD[2] at RDD at PythonRDD.scala:48" 66 | ] 67 | }, 68 | "execution_count": 3, 69 | "metadata": {}, 70 | "output_type": "execute_result" 71 | } 72 | ], 73 | "source": [ 74 | "rdd.filter(lambda x: x % 2 == 0)" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 4, 80 | "metadata": {}, 81 | "outputs": [ 82 | { 83 | "data": { 84 | "text/plain": [ 85 | "PythonRDD[8] at RDD at PythonRDD.scala:48" 86 | ] 87 | }, 88 | "execution_count": 4, 89 | "metadata": {}, 90 | "output_type": "execute_result" 91 | } 92 | ], 93 | "source": [ 94 | "rdd = sc.parallelize([1, 4, 2, 2, 3])\n", 95 | "rdd.distinct()" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 5, 101 | "metadata": {}, 102 | "outputs": [ 103 | { 104 | "data": { 105 | "text/plain": [ 106 | "PythonRDD[10] at RDD at PythonRDD.scala:48" 107 | ] 108 | }, 109 | "execution_count": 5, 110 | "metadata": {}, 111 | "output_type": "execute_result" 112 | } 113 | ], 114 | "source": [ 115 | "rdd = sc.parallelize([1, 2, 3])\n", 116 | "rdd.map(lambda x: [x, x + 5])" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 6, 122 | "metadata": {}, 123 | "outputs": [ 124 | { 125 | "data": { 126 | "text/plain": [ 127 | "PythonRDD[11] at RDD at PythonRDD.scala:48" 128 | ] 129 | }, 130 | "execution_count": 6, 131 | "metadata": {}, 132 | "output_type": "execute_result" 133 | } 134 | ], 135 | "source": [ 136 | "rdd.flatMap(lambda x: [x, x + 5])" 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "metadata": {}, 142 | "source": [ 143 | "# Các hàm actions\n", 144 | "- reduce(func): aggregate từng phần tử tập dữ liệu thông qua hàm func, hàm func nhận 2 đối số và trả về 1 giá trị.\n", 145 | "- take(n): trả về mảng n phần tử.\n", 146 | "- collect(): trả về tất cả các phần tử. CHÚ Ý: phải đảm bảo máy Driver đủ dung lượng để chứa kết quả trả về.\n", 147 | "- takeOrdered(n, key=func): trả về n phần tử sắp xếp tăng dần hoặc sắp xếp theo hàm key." 
148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": 7, 153 | "metadata": {}, 154 | "outputs": [ 155 | { 156 | "data": { 157 | "text/plain": [ 158 | "6" 159 | ] 160 | }, 161 | "execution_count": 7, 162 | "metadata": {}, 163 | "output_type": "execute_result" 164 | } 165 | ], 166 | "source": [ 167 | "rdd = sc.parallelize([1, 2, 3])\n", 168 | "rdd.reduce(lambda a, b: a * b)" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 8, 174 | "metadata": {}, 175 | "outputs": [ 176 | { 177 | "data": { 178 | "text/plain": [ 179 | "[1, 2]" 180 | ] 181 | }, 182 | "execution_count": 8, 183 | "metadata": {}, 184 | "output_type": "execute_result" 185 | } 186 | ], 187 | "source": [ 188 | "rdd.take(2)" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": 9, 194 | "metadata": {}, 195 | "outputs": [ 196 | { 197 | "data": { 198 | "text/plain": [ 199 | "[1, 2, 3]" 200 | ] 201 | }, 202 | "execution_count": 9, 203 | "metadata": {}, 204 | "output_type": "execute_result" 205 | } 206 | ], 207 | "source": [ 208 | "rdd.collect()" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": 10, 214 | "metadata": {}, 215 | "outputs": [ 216 | { 217 | "data": { 218 | "text/plain": [ 219 | "[5, 3, 2]" 220 | ] 221 | }, 222 | "execution_count": 10, 223 | "metadata": {}, 224 | "output_type": "execute_result" 225 | } 226 | ], 227 | "source": [ 228 | "rdd = sc.parallelize([5, 3, 1, 2])\n", 229 | "rdd.takeOrdered(3, lambda s: -1 * s)" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": 11, 235 | "metadata": {}, 236 | "outputs": [ 237 | { 238 | "data": { 239 | "text/plain": [ 240 | "[1, 2, 3]" 241 | ] 242 | }, 243 | "execution_count": 11, 244 | "metadata": {}, 245 | "output_type": "execute_result" 246 | } 247 | ], 248 | "source": [ 249 | "rdd.takeOrdered(3)" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": 12, 255 | "metadata": {}, 256 | "outputs": [ 257 | { 258 | "name": "stdout", 259 | "output_type": "stream", 260 | "text": [ 261 | "5\n" 262 | ] 263 | } 264 | ], 265 | "source": [ 266 | "lines = sc.textFile(\"sample_text.txt\", 4)\n", 267 | "print lines.count()" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 13, 273 | "metadata": {}, 274 | "outputs": [ 275 | { 276 | "name": "stdout", 277 | "output_type": "stream", 278 | "text": [ 279 | "5\n" 280 | ] 281 | } 282 | ], 283 | "source": [ 284 | "print lines.count()" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": 14, 290 | "metadata": {}, 291 | "outputs": [ 292 | { 293 | "name": "stdout", 294 | "output_type": "stream", 295 | "text": [ 296 | "5\n", 297 | "5\n" 298 | ] 299 | } 300 | ], 301 | "source": [ 302 | "lines = sc.textFile(\"sample_text.txt\", 4)\n", 303 | "lines.cache()\n", 304 | "print lines.count()\n", 305 | "print lines.count()" 306 | ] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "metadata": {}, 311 | "source": [ 312 | "# Key-Value RDDs\n", 313 | "- Tương tự như Map Reduce, Spark hỗ trợ Key-Value pairs.\n", 314 | "- Mỗi phần tử của Pair RDD là một cặp tuple.\n", 315 | "## Some Key-Value transformation\n", 316 | "- reduceByKey(func): trả về tập dữ liệu phân tán mới (K, V). Trong đó, các giá trị cho từng key được tổng hợp bằng hàm reduce func có dạng (V, V) -> V.\n", 317 | "- sortByKey(): trả về tập dữ liệu phân tán mới (K, V) sắp xếp tăng dần theo keys.\n", 318 | "- groupByKey(): trả về tập dữ liệu phân tán mới (K, Iterable)." 
319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": 15, 324 | "metadata": {}, 325 | "outputs": [ 326 | { 327 | "data": { 328 | "text/plain": [ 329 | "[(1, 2), (3, 4)]" 330 | ] 331 | }, 332 | "execution_count": 15, 333 | "metadata": {}, 334 | "output_type": "execute_result" 335 | } 336 | ], 337 | "source": [ 338 | "rdd = sc.parallelize([(1, 2), (3, 4)])\n", 339 | "rdd.collect()" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": 16, 345 | "metadata": {}, 346 | "outputs": [ 347 | { 348 | "data": { 349 | "text/plain": [ 350 | "[(1, 2), (3, 10)]" 351 | ] 352 | }, 353 | "execution_count": 16, 354 | "metadata": {}, 355 | "output_type": "execute_result" 356 | } 357 | ], 358 | "source": [ 359 | "rdd = sc.parallelize([(1, 2), (3, 4), (3, 6)])\n", 360 | "rdd.reduceByKey(lambda a, b: a + b).collect()" 361 | ] 362 | }, 363 | { 364 | "cell_type": "code", 365 | "execution_count": 17, 366 | "metadata": {}, 367 | "outputs": [ 368 | { 369 | "data": { 370 | "text/plain": [ 371 | "[(1, 'a'), (1, 'b'), (2, 'c')]" 372 | ] 373 | }, 374 | "execution_count": 17, 375 | "metadata": {}, 376 | "output_type": "execute_result" 377 | } 378 | ], 379 | "source": [ 380 | "rdd = sc.parallelize([(1, \"a\"), (2, \"c\"), (1, \"b\")])\n", 381 | "rdd.sortByKey().collect()" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": 18, 387 | "metadata": { 388 | "scrolled": true 389 | }, 390 | "outputs": [ 391 | { 392 | "data": { 393 | "text/plain": [ 394 | "[(1, ),\n", 395 | " (2, )]" 396 | ] 397 | }, 398 | "execution_count": 18, 399 | "metadata": {}, 400 | "output_type": "execute_result" 401 | } 402 | ], 403 | "source": [ 404 | "rdd.groupByKey().collect()" 405 | ] 406 | }, 407 | { 408 | "cell_type": "markdown", 409 | "metadata": {}, 410 | "source": [ 411 | "# X.join(Y)\n", 412 | "- Trả về tất cả các phần tử RDD keys khớp với X và Y.\n", 413 | "- Mỗi cặp có định dạng (k, (v1, v2)). Trong đó, (k, v1) thuộc X và (k, v2) thuộc Y." 414 | ] 415 | }, 416 | { 417 | "cell_type": "code", 418 | "execution_count": 19, 419 | "metadata": {}, 420 | "outputs": [ 421 | { 422 | "data": { 423 | "text/plain": [ 424 | "[('a', (1, 2)), ('a', (1, 3))]" 425 | ] 426 | }, 427 | "execution_count": 19, 428 | "metadata": {}, 429 | "output_type": "execute_result" 430 | } 431 | ], 432 | "source": [ 433 | "x = sc.parallelize([(\"a\", 1), (\"b\", 4)])\n", 434 | "y = sc.parallelize([(\"a\", 2), (\"a\", 3)])\n", 435 | "sorted(x.join(y).collect())" 436 | ] 437 | }, 438 | { 439 | "cell_type": "markdown", 440 | "metadata": {}, 441 | "source": [ 442 | "# X.leftOuterJoin(Y)\n", 443 | "- Với mỗi phần tử (k, v) thuộc X, kết quả trả về có thể là:\n", 444 | " - Tất cả các cặp (k, (v, w)) với w thuộc Y.\n", 445 | " - Hoặc các cặp (k, (v, None)) nếu không có phần tử nào thuộc Y có key là k." 
446 | ] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "execution_count": 20, 451 | "metadata": {}, 452 | "outputs": [ 453 | { 454 | "data": { 455 | "text/plain": [ 456 | "[('a', (1, 2)), ('b', (4, None))]" 457 | ] 458 | }, 459 | "execution_count": 20, 460 | "metadata": {}, 461 | "output_type": "execute_result" 462 | } 463 | ], 464 | "source": [ 465 | "x = sc.parallelize([(\"a\", 1), (\"b\", 4)])\n", 466 | "y = sc.parallelize([(\"a\", 2)])\n", 467 | "sorted(x.leftOuterJoin(y).collect())" 468 | ] 469 | }, 470 | { 471 | "cell_type": "markdown", 472 | "metadata": {}, 473 | "source": [ 474 | "# X.rightOuterJoin(Y)\n", 475 | "- Với mỗi phần tử (k, w) thuộc Y, kết quả trả về có thể là:\n", 476 | " - Tất cả các cặp (k, (v, w)) với v thuộc X.\n", 477 | " - Hoặc các cặp (k, (None, w)) nếu không có phần tử nào thuộc X có key là k." 478 | ] 479 | }, 480 | { 481 | "cell_type": "code", 482 | "execution_count": 21, 483 | "metadata": {}, 484 | "outputs": [ 485 | { 486 | "data": { 487 | "text/plain": [ 488 | "[('a', (1, 2)), ('b', (None, 4))]" 489 | ] 490 | }, 491 | "execution_count": 21, 492 | "metadata": {}, 493 | "output_type": "execute_result" 494 | } 495 | ], 496 | "source": [ 497 | "x = sc.parallelize([(\"a\", 1)])\n", 498 | "y = sc.parallelize([(\"a\", 2), (\"b\", 4)])\n", 499 | "sorted(x.rightOuterJoin(y).collect())" 500 | ] 501 | }, 502 | { 503 | "cell_type": "markdown", 504 | "metadata": {}, 505 | "source": [ 506 | "# X.fullOuterJoin(Y)\n", 507 | "- Với mỗi phần tử (k, v) thuộc X, kết quả trả về có thể là:\n", 508 | " - Tất cả các cặp (k, (v, w)) với w thuộc Y.\n", 509 | " - Hoặc các cặp (k, (v, None)) nếu không có phần tử nào thuộc Y có key là k.\n", 510 | "- Với mỗi phần tử (k, w) thuộc Y, kết quả trả về có thể là:\n", 511 | " - Tất cả các cặp (k, (v, w)) với v thuộc X.\n", 512 | " - Hoặc các cặp (k, (None, w)) nếu không có phần tử nào thuộc X có key là k. " 513 | ] 514 | }, 515 | { 516 | "cell_type": "code", 517 | "execution_count": 22, 518 | "metadata": {}, 519 | "outputs": [ 520 | { 521 | "data": { 522 | "text/plain": [ 523 | "[('a', (1, 2)), ('b', (4, None)), ('c', (None, 8))]" 524 | ] 525 | }, 526 | "execution_count": 22, 527 | "metadata": {}, 528 | "output_type": "execute_result" 529 | } 530 | ], 531 | "source": [ 532 | "x = sc.parallelize([(\"a\", 1), (\"b\", 4)])\n", 533 | "y = sc.parallelize([(\"a\", 2), (\"c\", 8)])\n", 534 | "sorted(x.fullOuterJoin(y).collect())" 535 | ] 536 | } 537 | ], 538 | "metadata": { 539 | "kernelspec": { 540 | "display_name": "Python 2", 541 | "language": "python", 542 | "name": "python2" 543 | }, 544 | "language_info": { 545 | "codemirror_mode": { 546 | "name": "ipython", 547 | "version": 2 548 | }, 549 | "file_extension": ".py", 550 | "mimetype": "text/x-python", 551 | "name": "python", 552 | "nbconvert_exporter": "python", 553 | "pygments_lexer": "ipython2", 554 | "version": "2.7.10" 555 | }, 556 | "name": "04_spark_essentials", 557 | "notebookId": 1227613790179004 558 | }, 559 | "nbformat": 4, 560 | "nbformat_minor": 1 561 | } 562 | --------------------------------------------------------------------------------