├── pyspark ├── study_apache_spark │ ├── README.md │ ├── profile_pyspark_jupyter.sh │ ├── data │ │ └── sample_text.txt │ ├── scala │ │ ├── scala_dataframe.ipynb │ │ └── scala_rdd.ipynb │ └── rdd_co_ban.ipynb └── notebooks │ ├── makers.csv │ ├── sample_text.txt │ ├── structured_data.ipynb │ ├── Intro_DataFrame.ipynb │ └── spark_essentials.ipynb ├── python ├── data │ ├── comma_delimited_stock_prices.csv │ ├── tab_delimited_stock_prices.tsv │ ├── colors.json │ ├── colon_delimited_stock_prices.csv │ ├── terminal.md │ └── markdown_examples.md ├── models │ └── linear_model_v1.pkl ├── matrix_algorithms.py ├── regression_algorithms.py ├── visualizing_data.py ├── recommender │ └── song_recommender.py ├── getting_data.py ├── clustering │ └── document_retrieval.py ├── sentiment_analysis │ └── classification_algorithms.py └── jupyter │ └── Getting started with iPython Notebook.ipynb ├── spark └── notebooks │ └── data │ └── graphx │ ├── followers.txt │ └── users.txt ├── deep_learning ├── figs │ └── dogs.jpg ├── snippets │ ├── multitask_learning.py │ ├── mnist_classifies.py │ ├── sift_cats_vs_dogs.py │ ├── knn_cats_vs_dogs.py │ └── training_network.py ├── src │ ├── image_segmentation.py │ ├── prototype.py │ └── main.py ├── basics.py ├── output │ └── submission_results.csv └── data │ └── stage1_sample_submission.csv ├── word2vec ├── models │ └── first_model ├── brat_tokenize_ann.py ├── parse_xml.py ├── data │ ├── sample_tokenize.txt.sent.tkn.wseg │ └── sample_tokenize.ann └── gensim_test.ipynb ├── computer_vision └── color_clustering │ ├── son_tung │ ├── son_tung_1.png │ ├── son_tung_2.png │ └── son_tung_3.png │ ├── fig_out │ ├── color_pallete_son_tung_1.png │ ├── color_pallete_son_tung_2.png │ └── color_pallete_son_tung_3.png │ └── color_kmeans.py ├── profile_pyspark_jupyter ├── .gitignore └── README.md /pyspark/study_apache_spark/README.md: -------------------------------------------------------------------------------- 1 | # topdev_talks_Jul_2017 -------------------------------------------------------------------------------- /python/data/comma_delimited_stock_prices.csv: -------------------------------------------------------------------------------- 1 | VAF,13.3 2 | VCF,152.4 3 | ATA,0.8 4 | -------------------------------------------------------------------------------- /spark/notebooks/data/graphx/followers.txt: -------------------------------------------------------------------------------- 1 | 2 1 2 | 4 1 3 | 1 2 4 | 6 3 5 | 7 3 6 | 7 6 7 | 6 7 8 | 3 7 -------------------------------------------------------------------------------- /deep_learning/figs/dogs.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ongxuanhong/data-science-works/HEAD/deep_learning/figs/dogs.jpg -------------------------------------------------------------------------------- /word2vec/models/first_model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ongxuanhong/data-science-works/HEAD/word2vec/models/first_model -------------------------------------------------------------------------------- /python/models/linear_model_v1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ongxuanhong/data-science-works/HEAD/python/models/linear_model_v1.pkl -------------------------------------------------------------------------------- /python/data/tab_delimited_stock_prices.tsv: 
-------------------------------------------------------------------------------- 1 | TV2 45,012 147.4 +13.4 (+10.00%) 2 | CTT 100 6.6 +0.6 (+10.00%) 3 | PCE 100 16.6 +1.5 (+9.93%) 4 | HAT 13,300 73.2 +6.6 (+9.87%) -------------------------------------------------------------------------------- /computer_vision/color_clustering/son_tung/son_tung_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ongxuanhong/data-science-works/HEAD/computer_vision/color_clustering/son_tung/son_tung_1.png -------------------------------------------------------------------------------- /computer_vision/color_clustering/son_tung/son_tung_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ongxuanhong/data-science-works/HEAD/computer_vision/color_clustering/son_tung/son_tung_2.png -------------------------------------------------------------------------------- /computer_vision/color_clustering/son_tung/son_tung_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ongxuanhong/data-science-works/HEAD/computer_vision/color_clustering/son_tung/son_tung_3.png -------------------------------------------------------------------------------- /profile_pyspark_jupyter: -------------------------------------------------------------------------------- 1 | export PATH=$PATH:/usr/local/Cellar/apache-spark/2.2.0/bin 2 | export PYSPARK_DRIVER_PYTHON=jupyter 3 | export PYSPARK_DRIVER_PYTHON_OPTS='notebook' pyspark 4 | -------------------------------------------------------------------------------- /python/data/colors.json: -------------------------------------------------------------------------------- 1 | { 2 | "red": "#f00", 3 | "green": "#0f0", 4 | "blue": "#00f", 5 | "cyan": "#0ff", 6 | "magenta": "#f0f", 7 | "yellow": "#ff0", 8 | "black": "#000" 9 | } -------------------------------------------------------------------------------- /python/data/colon_delimited_stock_prices.csv: -------------------------------------------------------------------------------- 1 | MA_CK:KL:GIA:DELTA 2 | TV2:45,012:147.4:+13.4 (+10.00%) 3 | CTT:100:6.6:+0.6 (+10.00%) 4 | PCE:100:16.6:+1.5 (+9.93%) 5 | HAT:13,300:73.2:+6.6 (+9.87%) -------------------------------------------------------------------------------- /computer_vision/color_clustering/fig_out/color_pallete_son_tung_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ongxuanhong/data-science-works/HEAD/computer_vision/color_clustering/fig_out/color_pallete_son_tung_1.png -------------------------------------------------------------------------------- /computer_vision/color_clustering/fig_out/color_pallete_son_tung_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ongxuanhong/data-science-works/HEAD/computer_vision/color_clustering/fig_out/color_pallete_son_tung_2.png -------------------------------------------------------------------------------- /computer_vision/color_clustering/fig_out/color_pallete_son_tung_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ongxuanhong/data-science-works/HEAD/computer_vision/color_clustering/fig_out/color_pallete_son_tung_3.png -------------------------------------------------------------------------------- 
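Reading the delimited stock-price samples: the `python/data` files above come in comma-, tab-, and colon-separated flavours (the colon-delimited file carries a `MA_CK:KL:GIA:DELTA` header row) and are the kind of input handled by `python/getting_data.py` further down. Below is a minimal, self-contained sketch of parsing them with the standard `csv` module; it assumes the snippet is run from the repository root, and the field names are taken directly from the colon-delimited header.

```python
import csv

# comma-delimited rows look like "VAF,13.3": ticker, closing price
with open("python/data/comma_delimited_stock_prices.csv") as f:
    for ticker, price in csv.reader(f, delimiter=","):
        print(ticker + " closed at " + price)

# tab-delimited rows have no header: ticker, volume, price, change
with open("python/data/tab_delimited_stock_prices.tsv") as f:
    for row in csv.reader(f, delimiter="\t"):
        print(row)

# colon-delimited file has a header row (MA_CK:KL:GIA:DELTA), so DictReader applies
with open("python/data/colon_delimited_stock_prices.csv") as f:
    for record in csv.DictReader(f, delimiter=":"):
        print(record["MA_CK"] + ": " + record["GIA"] + " (" + record["DELTA"] + ")")
```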
/pyspark/study_apache_spark/profile_pyspark_jupyter.sh: -------------------------------------------------------------------------------- 1 | export PATH=$PATH:/usr/local/Cellar/apache-spark/2.2.0/bin 2 | export PYSPARK_DRIVER_PYTHON=jupyter 3 | export PYSPARK_DRIVER_PYTHON_OPTS='notebook' pyspark 4 | -------------------------------------------------------------------------------- /spark/notebooks/data/graphx/users.txt: -------------------------------------------------------------------------------- 1 | 1,BarackObama,Barack Obama 2 | 2,ladygaga,Goddess of Love 3 | 3,jeresig,John Resig 4 | 4,justinbieber,Justin Bieber 5 | 6,matei_zaharia,Matei Zaharia 6 | 7,odersky,Martin Odersky 7 | 8,anonsys -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | deep_learning/sample_images/ 3 | deep_learning/snippets/kaggle_dogs_vs_cats/ 4 | python/clustering/people_wiki.csv 5 | deep_learning/processed_segmentation/ 6 | metastore_db/ 7 | *.log 8 | .ipynb_checkpoints/ 9 | .DS_Store 10 | -------------------------------------------------------------------------------- /pyspark/notebooks/makers.csv: -------------------------------------------------------------------------------- 1 | id,maker_name,years 2 | 1,Porsche,2011 3 | 2,Nissan,2011 4 | 3,Dodge,2008 5 | 4,Cadillac,2006 6 | 5,Land Rover,2011 7 | 6,Mazda,1988 8 | 7,Isuzu,1998 9 | 8,Hyundai,2012 10 | 9,Hyundai,2006 11 | 10,Chevrolet,1998 12 | -------------------------------------------------------------------------------- /python/matrix_algorithms.py: -------------------------------------------------------------------------------- 1 | """ 2 | MATRIX FACTORIZATION & DIMENSIONALITY REDUCTION 3 | Case study: Recommending Products 4 | Models: 5 | Collaborative filtering 6 | Matrix factorization 7 | PCA 8 | Algorithms: 9 | Coordinate descent 10 | Eigen decomposition 11 | SVD 12 | Concepts: 13 | Matrix completion, eigenvalues, random projections, cold-start problem, diversity, scaling up 14 | """ 15 | 16 | if __name__ == "__main__": 17 | print "Hello" 18 | -------------------------------------------------------------------------------- /word2vec/brat_tokenize_ann.py: -------------------------------------------------------------------------------- 1 | if __name__ == "__main__": 2 | file_wseg = "data/sample_tokenize.txt.sent.tkn.wseg" 3 | file_ann = "data/sample_tokenize.ann" 4 | out_str = "" 5 | with open(file_wseg, "r") as text_file: 6 | curr_pos = 0 7 | curr_tag_id = 1 8 | lines = text_file.readlines() 9 | for line in lines: 10 | words = line.split(" ") 11 | for word in words: 12 | sub_words = word.split("_") 13 | for idx, sub in enumerate(sub_words): 14 | begin_span = curr_pos 15 | end_span = curr_pos + len(sub.decode("utf-8")) 16 | if idx == 0: 17 | tag_name = "B_W" 18 | else: 19 | tag_name = "I_W" 20 | 21 | out_str += "T" + str(curr_tag_id) + "\t" + tag_name + " " + str(begin_span) + " " + str( 22 | end_span) + "\t" + sub + "\n" 23 | curr_pos = end_span + 1 24 | curr_tag_id += 1 25 | 26 | with open(file_ann, "w") as out_ann: 27 | out_ann.writelines(out_str) 28 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Table of Contents 2 | ## Visualizing 3 | Source: [Visualizing Data](https://github.com/ongxuanhong/data-science-works/blob/master/python/visualizing_data.py) 4 | 5 | Markers types 6 | ``` 7 | 
================ =============================== 8 | character description 9 | ================ =============================== 10 | - solid line style 11 | -- dashed line style 12 | -. dash-dot line style 13 | : dotted line style 14 | . point marker 15 | , pixel marker 16 | o circle marker 17 | v triangle_down marker 18 | ^ triangle_up marker 19 | < triangle_left marker 20 | > triangle_right marker 21 | 1 tri_down marker 22 | 2 tri_up marker 23 | 3 tri_left marker 24 | 4 tri_right marker 25 | s square marker 26 | p pentagon marker 27 | * star marker 28 | h hexagon1 marker 29 | H hexagon2 marker 30 | + plus marker 31 | x x marker 32 | D diamond marker 33 | d thin_diamond marker 34 | | vline marker 35 | _ hline marker 36 | ================ =============================== 37 | ``` -------------------------------------------------------------------------------- /pyspark/notebooks/sample_text.txt: -------------------------------------------------------------------------------- 1 | Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum. 2 | 3 | Contrary to popular belief, Lorem Ipsum is not simply random text. It has roots in a piece of classical Latin literature from 45 BC, making it over 2000 years old. Richard McClintock, a Latin professor at Hampden-Sydney College in Virginia, looked up one of the more obscure Latin words, consectetur, from a Lorem Ipsum passage, and going through the cites of the word in classical literature, discovered the undoubtable source. Lorem Ipsum comes from sections 1.10.32 and 1.10.33 of "de Finibus Bonorum et Malorum" (The Extremes of Good and Evil) by Cicero, written in 45 BC. This book is a treatise on the theory of ethics, very popular during the Renaissance. The first line of Lorem Ipsum, "Lorem ipsum dolor sit amet..", comes from a line in section 1.10.32. 4 | 5 | The standard chunk of Lorem Ipsum used since the 1500s is reproduced below for those interested. Sections 1.10.32 and 1.10.33 from "de Finibus Bonorum et Malorum" by Cicero are also reproduced in their exact original form, accompanied by English versions from the 1914 translation by H. Rackham. 6 | -------------------------------------------------------------------------------- /pyspark/study_apache_spark/data/sample_text.txt: -------------------------------------------------------------------------------- 1 | Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum. 2 | 3 | Contrary to popular belief, Lorem Ipsum is not simply random text. 
It has roots in a piece of classical Latin literature from 45 BC, making it over 2000 years old. Richard McClintock, a Latin professor at Hampden-Sydney College in Virginia, looked up one of the more obscure Latin words, consectetur, from a Lorem Ipsum passage, and going through the cites of the word in classical literature, discovered the undoubtable source. Lorem Ipsum comes from sections 1.10.32 and 1.10.33 of "de Finibus Bonorum et Malorum" (The Extremes of Good and Evil) by Cicero, written in 45 BC. This book is a treatise on the theory of ethics, very popular during the Renaissance. The first line of Lorem Ipsum, "Lorem ipsum dolor sit amet..", comes from a line in section 1.10.32. 4 | 5 | The standard chunk of Lorem Ipsum used since the 1500s is reproduced below for those interested. Sections 1.10.32 and 1.10.33 from "de Finibus Bonorum et Malorum" by Cicero are also reproduced in their exact original form, accompanied by English versions from the 1914 translation by H. Rackham. 6 | -------------------------------------------------------------------------------- /word2vec/parse_xml.py: -------------------------------------------------------------------------------- 1 | import os 2 | from multiprocessing import Pool 3 | from xml.dom import minidom 4 | 5 | from bs4 import BeautifulSoup 6 | 7 | 8 | def save_files(infos): 9 | (xml_path, saved_file) = infos 10 | 11 | if os.path.isfile(saved_file): 12 | os.unlink(saved_file) 13 | 14 | try: 15 | # parse xml file and get all articles 16 | doc = minidom.parse(xml_path) 17 | articles = doc.getElementsByTagName("article") 18 | 19 | # inspecting some stats 20 | total = len(articles) 21 | print "Processing", xml_path, total 22 | 23 | # get content from article and save to new file 24 | for art in articles: 25 | content = art.getElementsByTagName("content")[0] 26 | soup = BeautifulSoup(content.firstChild.data, "html5lib") 27 | text = soup.get_text().strip() 28 | 29 | # save to new file 30 | with open(saved_file, "a") as text_file: 31 | text_file.write(text.encode("utf8") + "\n") 32 | except Exception as e: 33 | print xml_path 34 | print "Error:", e 35 | 36 | 37 | if __name__ == "__main__": 38 | total_articles = 0 39 | total_error = 0 40 | dirname = "/Users/hongong/Downloads/baomoi_articles" 41 | dir_sentences = "/Users/hongong/Downloads/sentences/" 42 | 43 | list_files = [] 44 | for file_name in os.listdir(dirname): 45 | # get xml path, unlink before generating new content 46 | xml_path = os.path.join(dirname, file_name) 47 | saved_file = dir_sentences + file_name.split(".")[0] + ".txt" 48 | list_files.append((xml_path, saved_file)) 49 | 50 | p = Pool(16) 51 | p.map(save_files, list_files) 52 | -------------------------------------------------------------------------------- /deep_learning/snippets/multitask_learning.py: -------------------------------------------------------------------------------- 1 | # GRAPH CODE 2 | # ============ 3 | 4 | # Import Tensorflow and Numpy 5 | import numpy as np 6 | import tensorflow as tf 7 | 8 | # ====================== 9 | # Define the Graph 10 | # ====================== 11 | 12 | # Define the Placeholders 13 | X = tf.placeholder("float", [10, 10], name="X") 14 | Y1 = tf.placeholder("float", [10, 20], name="Y1") 15 | Y2 = tf.placeholder("float", [10, 20], name="Y2") 16 | 17 | # Define the weights for the layers 18 | 19 | initial_shared_layer_weights = np.random.rand(10, 20) 20 | initial_Y1_layer_weights = np.random.rand(20, 20) 21 | initial_Y2_layer_weights = np.random.rand(20, 20) 22 | 23 | 
shared_layer_weights = tf.Variable(initial_shared_layer_weights, name="share_W", dtype="float32") 24 | Y1_layer_weights = tf.Variable(initial_Y1_layer_weights, name="share_Y1", dtype="float32") 25 | Y2_layer_weights = tf.Variable(initial_Y2_layer_weights, name="share_Y2", dtype="float32") 26 | 27 | # Construct the Layers with RELU Activations 28 | shared_layer = tf.nn.relu(tf.matmul(X, shared_layer_weights)) 29 | Y1_layer = tf.nn.relu(tf.matmul(shared_layer, Y1_layer_weights)) 30 | Y2_layer = tf.nn.relu(tf.matmul(shared_layer, Y2_layer_weights)) 31 | 32 | # Calculate Loss 33 | Y1_Loss = tf.nn.l2_loss(Y1 - Y1_layer) 34 | Y2_Loss = tf.nn.l2_loss(Y2 - Y2_layer) 35 | Joint_Loss = Y1_Loss + Y2_Loss 36 | 37 | # optimisers 38 | Optimiser = tf.train.AdamOptimizer().minimize(Joint_Loss) 39 | Y1_op = tf.train.AdamOptimizer().minimize(Y1_Loss) 40 | Y2_op = tf.train.AdamOptimizer().minimize(Y2_Loss) 41 | 42 | # Calculation (Session) Code 43 | # ========================== 44 | 45 | # open the session 46 | 47 | with tf.Session() as session: 48 | session.run(tf.initialize_all_variables()) 49 | _, Joint_Loss = session.run([Optimiser, Joint_Loss], 50 | { 51 | X: np.random.rand(10, 10) * 10, 52 | Y1: np.random.rand(10, 20) * 10, 53 | Y2: np.random.rand(10, 20) * 10 54 | }) 55 | print(Joint_Loss) 56 | -------------------------------------------------------------------------------- /deep_learning/snippets/mnist_classifies.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | from sklearn import datasets, svm, metrics 3 | 4 | if __name__ == "__main__": 5 | # The digits dataset 6 | digits = datasets.load_digits() 7 | 8 | # The data that we are interested in is made of 8x8 images of digits, let's 9 | # have a look at the first 4 images, stored in the `images` attribute of the 10 | # dataset. If we were working from image files, we could load them using 11 | # matplotlib.pyplot.imread. Note that each image must have the same size. For these 12 | # images, we know which digit they represent: it is given in the 'target' of 13 | # the dataset. 
14 | images_and_labels = list(zip(digits.images, digits.target)) 15 | for index, (image, label) in enumerate(images_and_labels[:4]): 16 | plt.subplot(2, 4, index + 1) 17 | plt.axis('off') 18 | plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest') 19 | plt.title('Training: %i' % label) 20 | 21 | # To apply a classifier on this data, we need to flatten the image, to 22 | # turn the data in a (samples, feature) matrix: 23 | n_samples = len(digits.images) 24 | data = digits.images.reshape((n_samples, -1)) 25 | 26 | # Create a classifier: a support vector classifier 27 | classifier = svm.SVC(gamma=0.001) 28 | 29 | # We learn the digits on the first half of the digits 30 | classifier.fit(data[:n_samples / 2], digits.target[:n_samples / 2]) 31 | 32 | # Now predict the value of the digit on the second half: 33 | expected = digits.target[n_samples / 2:] 34 | predicted = classifier.predict(data[n_samples / 2:]) 35 | 36 | print("Classification report for classifier %s:\n%s\n" 37 | % (classifier, metrics.classification_report(expected, predicted))) 38 | print("Confusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted)) 39 | 40 | images_and_predictions = list(zip(digits.images[n_samples / 2:], predicted)) 41 | for index, (image, prediction) in enumerate(images_and_predictions[:4]): 42 | plt.subplot(2, 4, index + 5) 43 | plt.axis('off') 44 | plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest') 45 | plt.title('Prediction: %i' % prediction) 46 | 47 | plt.show() 48 | -------------------------------------------------------------------------------- /word2vec/data/sample_tokenize.txt.sent.tkn.wseg: -------------------------------------------------------------------------------- 1 | Con phà bị lật nghiêng sáng 16.4 khi ca_nô của Cảnh_sát biển đến_nơi trên boong tàu không có hành_khách nào vì hành_khách không_được thông_báo lên khu_vực này và được yêu_cầu ngồi_yên - Ảnh : Cảnh_sát biển Hàn_Quốc / Yonhap Việc thường_xuyên chở hàng_hóa quá_tải trên chiếc phà dùng chở khách cho_thấy lỗ_hổng trong việc quản_lý tàu phà lẫn kiểu kinh_doanh bất_chấp hậu_quả 2 | Cơ_quan đăng_kiểm tàu Hàn_Quốc đầu_năm 2013 xem_xét phà Sewol khi phà đăng_ký cải_tiến để chở thêm nhiều khách 3 | Cơ_quan này cho phà được chở thêm hàng_hóa tối_đa 987 tấn ( tăng 50 % ) với điều_kiện dằn thêm dưới khoang 2.000 tấn nước để cân_bằng 4 | Tuy_nhiên khuyến_cáo này chỉ gửi đến công_ty quản_lý phà mà không được gửi cho Cảnh_sát biển lẫn Hiệp_hội tàu_biển Hàn_Quốc 5 | Phà này sau_đó liên_tục chở hàng_hóa vượt tải_trọng cho_phép 987 tấn như chở hơn 2.000 tấn hàng qua 136 chuyến và trên 3.000 tấn qua 12 lần 6 | Tổng_cộng đến chuyến cuối_cùng ngày 16.4.2014 phà này chở hàng quá_tải đến 246 lần 7 | Và chuyến cuối_cùng phà chở lượng hàng_hóa khủng đến 3.608 tấn cùng 476 người kết_quả là phà lật nghiêng ngoài_khơi đảo Jindo sáng 16.4.2014 làm hơn 300 hành_khách thiệt_mạng 8 | Thuyền_trưởng Lee Joon - seok_khai hàng_hóa chở trên phà ít_hơn các con_số của báo_cáo trên rằng khi chìm phà có chở 657 tấn hàng cùng 150 ô_tô 9 | Tuy_nhiên Cảnh_sát biển tìm thấy đến 180 ô_tô trong_lòng phà dưới biển ! 
10 | Các chuyên_gia tin rằng khi chở quá_tải chỉ cần đảo hướng một_chút cũng có_thể làm phà bị lật vì mất cân_bằng 11 | Và các dữ_liệu hành_trình cho_thấy con phà đã quẹo một góc 45 độ ngay_khi chìm 12 | Lối_vào cảng trên đảo Jindo đầy vòng_hoa tang tưởng_nhớ các nạn_nhân vụ chìm phà_Sewol ngày 28.4.2014 - Ảnh : Reuters Thợ_lặn hiện tìm_kiếm gần_hết các phòng trên phà Sewol chìm dưới biển - Ảnh : Reuters Ngày 4.5 Tổng_thống Hàn_Quốc_Park Geun - hye có chuyến thăm lần 2 các gia_đình nạn_nhân vụ chìm phà đang tạm_trú ở đảo_Jindo 13 | Bà Park nói rằng bà cũng từng đau_khổ vì mất_mát gia_đình nên hiểu_rõ tâm_trạng của mọi_người 14 | Bà hứa sẽ trừng_phạt các cá_nhân liên_quan vụ chìm phà này 15 | Tính đến ngày 4.5 đã có 244 thi_thể được tìm thấy vẫn còn 58 người mất_tích 16 | Số_người được cứu_sống là 174 gồm 22/29 thuyền_viên 17 | Phát_ngôn viên Lực_lượng cứu_hộ phà Sewol ông Ko_Myung - seok cho_biết thợ_lặn đã tìm_kiếm được 60 trong tổng_số 64 phòng của con phà dưới lòng_biển -------------------------------------------------------------------------------- /computer_vision/color_clustering/color_kmeans.py: -------------------------------------------------------------------------------- 1 | # python3 color_kmeans.py --image son_tung.png --clusters 5 2 | import argparse 3 | import os 4 | 5 | import cv2 6 | import numpy as np 7 | from sklearn.cluster import KMeans 8 | 9 | 10 | def get_color_palette(k_cluster, centroids, palette_w=600, palette_h=100): 11 | # initialize the color palette 12 | text_y = int(palette_h / 2) 13 | palette = np.zeros((palette_h, palette_w, 3), dtype="uint8") 14 | startX = 0 15 | 16 | # loop over the color of each cluster 17 | for color in centroids: 18 | # plot the relative percentage of each cluster 19 | endX = startX + (1.0 / k_cluster * palette_w) 20 | text_x = int(startX + 15) 21 | 22 | bgr_code = str(color.astype("uint8").tolist()[0]) + "," 23 | bgr_code += str(color.astype("uint8").tolist()[1]) + "," 24 | bgr_code += str(color.astype("uint8").tolist()[2]) 25 | 26 | cv2.rectangle(palette, (int(startX), 0), (int(endX), palette_h), color.astype("uint8").tolist(), -1) 27 | cv2.putText(palette, bgr_code, (text_x, text_y), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 200)) 28 | startX = endX 29 | 30 | # return the palette 31 | return palette 32 | 33 | 34 | if __name__ == "__main__": 35 | 36 | ap = argparse.ArgumentParser() 37 | ap.add_argument("-i", "--image", required=True, help="Path to the image") 38 | ap.add_argument("-c", "--clusters", required=True, type=int, help="# of clusters") 39 | args = vars(ap.parse_args()) 40 | 41 | for f in os.listdir(args["image"]): 42 | if f.endswith(".png"): 43 | img_path = args["image"] + "/" + f 44 | img_name = os.path.splitext(f)[0] 45 | 46 | # load the image 47 | image = cv2.imread(img_path) 48 | 49 | # reshape the image to be a list of pixels 50 | image = image.reshape((image.shape[0] * image.shape[1], 3)) 51 | 52 | # cluster the pixel intensities 53 | clt = KMeans(n_clusters=args["clusters"]) 54 | clt.fit(image) 55 | 56 | # representing the number of pixels labeled to each color 57 | palette = get_color_palette(args["clusters"], clt.cluster_centers_) 58 | 59 | # save color palette 60 | fig_out = "fig_out/color_pallete_" + img_name + ".png" 61 | cv2.imwrite(fig_out, palette) 62 | print("Done", f) 63 | -------------------------------------------------------------------------------- /deep_learning/src/image_segmentation.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import 
datetime 3 | import glob 4 | import os 5 | import sys 6 | import time 7 | 8 | import numpy as np 9 | 10 | sys.path.append("/usr/local/lib/python2.7/site-packages") 11 | import cv2 12 | import dicom as dicomio 13 | 14 | 15 | def time_diff_str(t1, t2): 16 | """ 17 | Calculates time durations. 18 | """ 19 | diff = t2 - t1 20 | mins = int(diff / 60) 21 | secs = round(diff % 60, 2) 22 | return str(mins) + " mins and " + str(secs) + " seconds" 23 | 24 | 25 | if __name__ == "__main__": 26 | t_start = time.time() 27 | 28 | # construct the argument parse and parse the arguments 29 | ap = argparse.ArgumentParser() 30 | ap.add_argument("-d", "--dataset", required=True, help="path to input dataset") 31 | ap.add_argument("-s", "--saveto", required=True, help="path to saved processed data") 32 | args = vars(ap.parse_args()) 33 | 34 | list_dir = os.listdir(args["dataset"]) 35 | for (i, dir) in enumerate(list_dir): 36 | if os.path.isfile(dir) is False: 37 | basePath = args["dataset"] + "/" + dir 38 | os.chdir(basePath) 39 | images = [] 40 | for f in glob.glob("*.dcm"): 41 | # read dcm file 42 | ds = dicomio.read_file(f) 43 | img = ds.pixel_array 44 | 45 | # normalize image values to [0, 255] 46 | cv2.normalize(img, img, 0, 255, cv2.NORM_MINMAX) 47 | img = cv2.medianBlur(img.astype(np.uint8), 5) 48 | 49 | # image segmentation 50 | thresh = cv2.adaptiveThreshold(img, 51 | 255, 52 | cv2.ADAPTIVE_THRESH_GAUSSIAN_C, 53 | cv2.THRESH_BINARY, 54 | 11, 55 | 2) 56 | images.append(thresh) 57 | 58 | data = np.array(images) 59 | mean_img = np.mean(data, axis=0) 60 | save_name = args["saveto"] + "/" + dir + ".png" 61 | cv2.imwrite(save_name, mean_img) 62 | print "Saved processed image:", save_name 63 | 64 | # show an update every 10 patients 65 | if i > 0 and i % 10 == 0: 66 | print "[INFO] processed {}/{} patients".format(i, len(list_dir)) 67 | print "[INFO] time passed", time_diff_str(t_start, time.time()) 68 | 69 | print "[INFO]", datetime.datetime.now(), "* DONE After *", time_diff_str(t_start, time.time()) 70 | -------------------------------------------------------------------------------- /python/regression_algorithms.py: -------------------------------------------------------------------------------- 1 | """ 2 | REGRESSION 3 | Case study: Predicting house prices 4 | Models: 5 | Linear regression 6 | Regularization: Ridge (L2), Lasso (L1) 7 | Algorithms: 8 | Gradient descent 9 | Coordinate descent 10 | Concepts: 11 | Loss functions, bias-variance tradeoff, cross-validation, sparsity, overfitting, model selection 12 | """ 13 | 14 | import os 15 | 16 | import matplotlib.pyplot as plt 17 | import pandas as pd 18 | from sklearn import linear_model 19 | from sklearn.externals import joblib 20 | from sklearn.linear_model import Ridge 21 | from sklearn.model_selection import train_test_split 22 | from sklearn.pipeline import Pipeline 23 | from sklearn.preprocessing import PolynomialFeatures 24 | 25 | 26 | def get_home_data(): 27 | """Get home data, from local csv.""" 28 | if os.path.exists("data/home_data.csv"): 29 | print("-- home_data.csv found locally") 30 | df = pd.read_csv("data/home_data.csv", index_col=0) 31 | 32 | return df 33 | 34 | 35 | def plotting_features_vs_target(features, x, y): 36 | # define number of subplot 37 | num_feature = len(features) 38 | f, axes = plt.subplots(1, num_feature, sharey=True) 39 | 40 | # plotting 41 | for i in range(0, num_feature): 42 | axes[i].scatter(x[features[i]], y) 43 | axes[i].set_title(features[i]) 44 | 45 | plt.show() 46 | 47 | 48 | if __name__ == "__main__": 49 | df = 
get_home_data() 50 | 51 | # features selection 52 | features = list(["bedrooms", "bathrooms", "grade"]) 53 | print "Features name:", list(df.columns.values) 54 | print "Selected features:", features 55 | y = df["price"] 56 | X = df[features] 57 | 58 | # split data-set into training (70%) and testing set (30%) 59 | x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3) 60 | 61 | # plotting features, target relationships 62 | plotting_features_vs_target(features, x_train, y_train) 63 | 64 | """ 65 | DEFAULT MODEL 66 | """ 67 | # training model 68 | linear = linear_model.LinearRegression() 69 | linear.fit(x_train, y_train) 70 | 71 | # evaluating model 72 | score_trained = linear.score(x_test, y_test) 73 | print "Model scored:", score_trained 74 | 75 | """ 76 | LASSO MODEL 77 | """ 78 | # L1 regularization 79 | lasso_linear = linear_model.Lasso(alpha=1.0) 80 | lasso_linear.fit(x_train, y_train) 81 | 82 | # evaluating L1 regularized model 83 | score_lasso_trained = lasso_linear.score(x_test, y_test) 84 | print "Lasso model scored:", score_lasso_trained 85 | 86 | """ 87 | RIDGE MODEL 88 | """ 89 | # L2 regularization 90 | ridge_linear = Ridge(alpha=1.0) 91 | ridge_linear.fit(x_train, y_train) 92 | 93 | # evaluating L2 regularized model 94 | score_ridge_trained = ridge_linear.score(x_test, y_test) 95 | print "Ridge model scored:", score_ridge_trained 96 | 97 | # saving model 98 | joblib.dump(linear, "models/linear_model_v1.pkl") 99 | 100 | # loading model 101 | clf = joblib.load("models/linear_model_v1.pkl") 102 | predicted = clf.predict(x_test) 103 | print "Predicted test:", predicted 104 | 105 | """ 106 | POLYNOMIAL REGRESSION 107 | """ 108 | poly_model = Pipeline([('poly', PolynomialFeatures(degree=2)), 109 | ('linear', linear_model.LinearRegression(fit_intercept=False))]) 110 | poly_model = poly_model.fit(x_train, y_train) 111 | score_poly_trained = poly_model.score(x_test, y_test) 112 | print "Poly model scored:", score_poly_trained 113 | 114 | poly_model = Pipeline([('poly', PolynomialFeatures(interaction_only=True, degree=2)), 115 | ('linear', linear_model.LinearRegression(fit_intercept=False))]) 116 | poly_model = poly_model.fit(x_train, y_train) 117 | score_poly_trained = poly_model.score(x_test, y_test) 118 | print "Poly model (interaction only) scored:", score_poly_trained 119 | -------------------------------------------------------------------------------- /deep_learning/snippets/sift_cats_vs_dogs.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | import os 4 | import sys 5 | import time 6 | 7 | import numpy as np 8 | from sklearn.model_selection import train_test_split 9 | from sklearn.neighbors import KNeighborsClassifier 10 | 11 | 12 | # returns descriptor of image at pth 13 | def feature_extract(pth): 14 | im = cv2.imread(pth, 1) 15 | gray = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY) 16 | return bowDiction.compute(gray, sift.detect(gray)) 17 | 18 | 19 | def time_diff_str(t1, t2): 20 | """ 21 | Calculates time durations. 
22 | """ 23 | diff = t2 - t1 24 | mins = int(diff / 60) 25 | secs = round(diff % 60, 2) 26 | return str(mins) + " mins and " + str(secs) + " seconds" 27 | 28 | 29 | if __name__ == "__main__": 30 | # Load opencv libraries 31 | sys.path.append('/usr/local/lib/python2.7/site-packages') 32 | import cv2 33 | from imutils import paths 34 | 35 | t_start = time.time() 36 | 37 | # construct the argument parse and parse the arguments 38 | ap = argparse.ArgumentParser() 39 | ap.add_argument("-d", "--dataset", required=True, help="path to input dataset") 40 | ap.add_argument("-k", "--neighbors", type=int, default=1, help="# of nearest neighbors for classification") 41 | ap.add_argument("-j", "--jobs", type=int, default=-1, 42 | help="# of jobs for k-NN distance (-1 uses all available cores)") 43 | args = vars(ap.parse_args()) 44 | 45 | # grab the list of images that we'll be describing 46 | print("[INFO] describing images...") 47 | imagePaths = list(paths.list_images(args["dataset"])) 48 | 49 | # initialize the raw pixel intensities matrix, the features matrix, 50 | # and labels list 51 | features = [] 52 | labels = [] 53 | 54 | dictionarySize = 5 55 | BOW = cv2.BOWKMeansTrainer(dictionarySize) 56 | sift = cv2.xfeatures2d.SIFT_create() 57 | 58 | for (i, imagePath) in enumerate(imagePaths): 59 | image = cv2.imread(imagePath) 60 | gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) 61 | kp, dsc = sift.detectAndCompute(gray, None) 62 | BOW.add(dsc) 63 | print("# kps: {}, descriptors: {}".format(len(kp), dsc.shape)) 64 | 65 | # dictionary created 66 | dictionary = BOW.cluster() 67 | index_params = dict(algorithm=0, trees=5) 68 | search_params = dict(checks=50) # or pass empty dictionary 69 | flann = cv2.FlannBasedMatcher(index_params, search_params) 70 | sift2 = cv2.xfeatures2d.SIFT_create() 71 | bowDiction = cv2.BOWImgDescriptorExtractor(sift2, cv2.BFMatcher(cv2.NORM_L2)) 72 | bowDiction.setVocabulary(dictionary) 73 | print "BOW dictionary", np.shape(dictionary) 74 | 75 | # loop over the input images 76 | for (i, imagePath) in enumerate(imagePaths): 77 | # load the image and extract the class label (assuming that our 78 | # path as the format: /path/to/dataset/{class}.{image_num}.jpg 79 | label = imagePath.split(os.path.sep)[-1].split(".")[0] 80 | 81 | # update the raw images, features, and labels matricies, 82 | # respectively 83 | features.extend(feature_extract(imagePath)) 84 | labels.append(label) 85 | 86 | # show an update every 1,000 images 87 | if i > 0 and i % 1000 == 0: 88 | print("[INFO] processed {}/{}".format(i, len(imagePaths))) 89 | 90 | # show some information on the memory consumed by the features matrix 91 | features = np.array(features) 92 | labels = np.array(labels) 93 | print("[INFO] features matrix: {:.2f}MB".format(features.nbytes / (1024 * 1000.0))) 94 | 95 | # partition the data into training and testing splits, using 75% 96 | # of the data for training and the remaining 25% for testing 97 | (trainFeat, testFeat, trainLabels, testLabels) = train_test_split(features, labels, test_size=0.25, random_state=42) 98 | 99 | # train and evaluate a k-NN classifer on the histogram 100 | # representations 101 | print("[INFO] evaluating accuracy...") 102 | model = KNeighborsClassifier(n_neighbors=args["neighbors"], n_jobs=args["jobs"]) 103 | model.fit(trainFeat, trainLabels) 104 | acc = model.score(testFeat, testLabels) 105 | print("[INFO] accuracy: {:.2f}%".format(acc * 100)) 106 | 107 | print "-- %s * DONE After * %s" % (datetime.datetime.now(), time_diff_str(t_start, time.time())) 108 | 
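# Usage note: given the argparse flags defined above, a typical invocation from the
# deep_learning/snippets directory would look roughly like
#
#   python sift_cats_vs_dogs.py --dataset kaggle_dogs_vs_cats --neighbors 3 --jobs -1
#
# Assumptions: the dataset directory (listed in .gitignore as
# deep_learning/snippets/kaggle_dogs_vs_cats/) holds images named
# {class}.{image_num}.jpg, which is what the label-parsing line above expects,
# and OpenCV is a contrib build, since SIFT is provided by cv2.xfeatures2d.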
-------------------------------------------------------------------------------- /pyspark/notebooks/structured_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# X.join(Y)\n", 8 | "- Return RDD of all pairs of elements with matching keys in X and Y.\n", 9 | "- Each pair is (k, (v1, v2)) tuple, where (k, v1) is in X and (k, v2) is in Y." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "data": { 19 | "text/plain": [ 20 | "[('a', (1, 2)), ('a', (1, 3))]" 21 | ] 22 | }, 23 | "execution_count": 1, 24 | "metadata": {}, 25 | "output_type": "execute_result" 26 | } 27 | ], 28 | "source": [ 29 | "x = sc.parallelize([(\"a\", 1), (\"b\", 4)])\n", 30 | "y = sc.parallelize([(\"a\", 2), (\"a\", 3)])\n", 31 | "sorted(x.join(y).collect())" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": {}, 37 | "source": [ 38 | "# X.leftOuterJoin(Y)\n", 39 | "- For each element (k, v) in X, resulting RDD will either contain\n", 40 | " - All pairs (k, (v, w)) for w in Y.\n", 41 | " - Or the pair (k, (v, None)) if no elements in Y have key k." 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 2, 47 | "metadata": {}, 48 | "outputs": [ 49 | { 50 | "data": { 51 | "text/plain": [ 52 | "[('a', (1, 2)), ('b', (4, None))]" 53 | ] 54 | }, 55 | "execution_count": 2, 56 | "metadata": {}, 57 | "output_type": "execute_result" 58 | } 59 | ], 60 | "source": [ 61 | "x = sc.parallelize([(\"a\", 1), (\"b\", 4)])\n", 62 | "y = sc.parallelize([(\"a\", 2)])\n", 63 | "sorted(x.leftOuterJoin(y).collect())" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "# X.rightOuterJoin(Y)\n", 71 | "- For each element (k, w) in Y, resulting RDD will either contain\n", 72 | " - All pairs (k, (v, w)) for v in X.\n", 73 | " - Or the pair (k, (None, w)) if no elements in X have key k." 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 3, 79 | "metadata": {}, 80 | "outputs": [ 81 | { 82 | "data": { 83 | "text/plain": [ 84 | "[('a', (1, 2)), ('b', (None, 4))]" 85 | ] 86 | }, 87 | "execution_count": 3, 88 | "metadata": {}, 89 | "output_type": "execute_result" 90 | } 91 | ], 92 | "source": [ 93 | "x = sc.parallelize([(\"a\", 1)])\n", 94 | "y = sc.parallelize([(\"a\", 2), (\"b\", 4)])\n", 95 | "sorted(x.rightOuterJoin(y).collect())" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "# X.fullOuterJoin(Y)\n", 103 | "- For each element (k, v) in X, resulting RDD will either contain\n", 104 | " - All pairs (k, (v, w)) for w in Y.\n", 105 | " - Or the pair (k, (v, None)) if no elements in Y have key k.\n", 106 | "- For each element (k, w) in Y, resulting RDD will either contain\n", 107 | " - All pairs (k, (v, w)) for v in X.\n", 108 | " - Or the pair (k, (None, w)) if no elements in X have key k." 
109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 4, 114 | "metadata": {}, 115 | "outputs": [ 116 | { 117 | "data": { 118 | "text/plain": [ 119 | "[('a', (1, 2)), ('b', (4, None)), ('c', (None, 8))]" 120 | ] 121 | }, 122 | "execution_count": 4, 123 | "metadata": {}, 124 | "output_type": "execute_result" 125 | } 126 | ], 127 | "source": [ 128 | "x = sc.parallelize([(\"a\", 1), (\"b\", 4)])\n", 129 | "y = sc.parallelize([(\"a\", 2), (\"c\", 8)])\n", 130 | "sorted(x.fullOuterJoin(y).collect())" 131 | ] 132 | } 133 | ], 134 | "metadata": { 135 | "kernelspec": { 136 | "display_name": "Python 2", 137 | "language": "python", 138 | "name": "python2" 139 | }, 140 | "language_info": { 141 | "codemirror_mode": { 142 | "name": "ipython", 143 | "version": 2 144 | }, 145 | "file_extension": ".py", 146 | "mimetype": "text/x-python", 147 | "name": "python", 148 | "nbconvert_exporter": "python", 149 | "pygments_lexer": "ipython2", 150 | "version": "2.7.10" 151 | }, 152 | "name": "06_Structured_Data", 153 | "notebookId": 3373040177660362 154 | }, 155 | "nbformat": 4, 156 | "nbformat_minor": 1 157 | } 158 | -------------------------------------------------------------------------------- /python/visualizing_data.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | 4 | 5 | def my_line_chart(plt): 6 | years = ["1985", "1986", "1987", "1988", "1989", "1990", "1991", "1992", "1993", "1994", "1995", "1996", "1997", 7 | "1998", "1999", "2000", "2001", "2002", "2003", "2004", "2005", "2006", "2007", "2008", "2009", "2010", 8 | "2011", "2012", "2013", "2014", "2015"] 9 | gdp = [14094688429, 26336617862, 36658108169, 25423812494, 6293304847, 6471740486, 10 | 9613369553, 9866990096, 13180954014, 16286434094, 20736163915, 24657470331, 11 | 26843701136, 27209601995, 28683658004, 33640085727, 35291349277, 37947904054, 12 | 42717072777, 49424107709, 57633255739, 66371664817, 77414425532, 99130304099, 13 | 106014600963, 115931749904, 135539487317, 155820001920, 171222025117, 186204652922, 14 | 193599379094] 15 | 16 | # create a line chart, years on x-axis, gdp on y-axis 17 | plt.plot(years, gdp, color='#f39c12', marker='o', linestyle='solid') 18 | 19 | # add a title 20 | plt.title("Vietnam GDP") 21 | 22 | # add a label to the y-axis 23 | plt.ylabel("Billions of $") 24 | plt.show() 25 | 26 | 27 | def my_bar_chart(plt): 28 | color_names = ["Emerald", "Green Sea", "Midnight Blue", "Carrot", "Peter River"] 29 | colors = ["#2ecc71", "#16a085", "#2c3e50", "#e67e22", "#3498db"] 30 | num_favorite = [5, 11, 3, 8, 10] 31 | 32 | # bars are by default width 0.8, so we'll add 0.1 to the left coordinates 33 | # so that each bar is centered 34 | xs = [i + 0.1 for i, _ in enumerate(color_names)] 35 | 36 | # plot bars with left x-coordinates [xs], heights [num_favorite] 37 | plt.bar(xs, num_favorite, color=colors) 38 | plt.title("My Favorite Colors") 39 | 40 | # label x-axis with color names at bar centers 41 | plt.xticks([i + 0.5 for i, _ in enumerate(color_names)], color_names) 42 | 43 | plt.show() 44 | 45 | 46 | def my_histogram(plt): 47 | data = [] 48 | for i in range(100): 49 | data.append(np.random.randint(1, 11)) 50 | 51 | plt.hist(data, bins=10, facecolor='#bdc3c7') 52 | 53 | plt.xlabel("Points") 54 | plt.ylabel("# of Students") 55 | plt.title("Results of the exam") 56 | plt.show() 57 | 58 | 59 | def my_multi_line_charts(plt): 60 | bears = [10, 58, 85, 115, 139, 182] 61 | dolphins = [150, 75, 32, 14, 8, 5] 62 | 
whales = [80, 50, 100, 75, 90, 70] 63 | x = [0, 1, 2, 3, 4, 5] 64 | years = ["2009", "2010", "2011", "2012", "2013", "2014"] 65 | 66 | # we can make multiple calls to plt.plot 67 | # to show multiple series on the same chart 68 | plt.plot(x, bears, '#16a085', marker='o', linewidth=3.0, label='Bears') 69 | plt.plot(x, dolphins, '#c0392b', marker='s', linewidth=3.0, label='Dolphins') 70 | plt.plot(x, whales, '#3498db', marker='^', linewidth=3.0, label='Whales') 71 | 72 | # because we've assigned labels to each series 73 | # we can get a legend for free 74 | # loc=9 means "top center" 75 | plt.legend(loc=9) 76 | plt.title("Number of animals each year") 77 | plt.xlabel("Years") 78 | plt.xticks(x, years) 79 | plt.show() 80 | 81 | 82 | def my_scatter_plot(plt): 83 | sizes = [700, 650, 720, 630, 710, 640, 600, 640, 670] 84 | prices = [175, 170, 205, 120, 220, 130, 105, 145, 190] 85 | labels = ["$175", "$170", "$205", "$120", "$220", "$130", "$105", "$145", "$190"] 86 | 87 | plt.scatter(sizes, prices, marker='s', s=40, color='#2ecc71') 88 | 89 | # label each point 90 | for label, friend_count, minute_count in zip(labels, sizes, prices): 91 | plt.annotate(label, 92 | xy=(friend_count, minute_count), # put the label with its point 93 | xytext=(5, -5), # but slightly offset 94 | textcoords='offset points') 95 | 96 | plt.title("House prices") 97 | plt.xlabel("Size in m2") 98 | plt.ylabel("Thousand $") 99 | plt.show() 100 | 101 | 102 | def my_pie_chart(plt): 103 | data = [0.5, 0.26, 0.11, 0.04, 0.02, 0.02, 0.01, 0.04] 104 | smart_phone = ["Apple", "Samsung", "LG", "Motorola", "HTC", "Nokia", "Amazon", "Other"] 105 | colors = ["#ecf0f1", "#3498db", "#e67e22", "#1abc9c", "#bdc3c7", "#8e44ad", "#f39c12", "#2c3e50"] 106 | 107 | plt.pie(data, labels=smart_phone, colors=colors, autopct='%1.1f%%', 108 | startangle=-90, pctdistance=0.9, labeldistance=1.2) 109 | 110 | # make sure pie is a circle and not an oval 111 | plt.axis("equal") 112 | plt.show() 113 | 114 | 115 | if __name__ == "__main__": 116 | my_line_chart(plt) 117 | 118 | my_bar_chart(plt) 119 | 120 | my_histogram(plt) 121 | 122 | my_multi_line_charts(plt) 123 | 124 | my_scatter_plot(plt) 125 | 126 | my_pie_chart(plt) 127 | -------------------------------------------------------------------------------- /pyspark/study_apache_spark/scala/scala_dataframe.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "http://10.8.2.1:8089/proxy/application_1515394405830_3970\n" 13 | ] 14 | } 15 | ], 16 | "source": [ 17 | "import org.apache.spark.sql.SparkSession\n", 18 | "\n", 19 | "val spark = SparkSession.builder().\n", 20 | " appName(\"scala_dataframe\").\n", 21 | " config(\"spark.executor.instances\",\"2\").\n", 22 | " config(\"spark.executor.cores\",\"2\").\n", 23 | " config(\"spark.executor.memory\", \"4g\").\n", 24 | " config(\"spark.yarn.executor.memoryOverhead\", \"1g\").\n", 25 | " getOrCreate()\n", 26 | "\n", 27 | "println(\"http://10.8.2.1:8089/proxy/\"+ spark.sparkContext.applicationId)" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 2, 33 | "metadata": {}, 34 | "outputs": [ 35 | { 36 | "data": { 37 | "text/plain": [ 38 | "Array([Alice,1])" 39 | ] 40 | }, 41 | "execution_count": 2, 42 | "metadata": {}, 43 | "output_type": "execute_result" 44 | } 45 | ], 46 | "source": [ 47 | "val l = Seq((\"Alice\", 
1))\n", 48 | "spark.createDataFrame(l).collect()" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 3, 54 | "metadata": {}, 55 | "outputs": [ 56 | { 57 | "name": "stdout", 58 | "output_type": "stream", 59 | "text": [ 60 | "[Stage 1:=======================================> (2 + 1) / 3]+------+-----+\n", 61 | "|number| word|\n", 62 | "+------+-----+\n", 63 | "| 8| bat|\n", 64 | "| 64|mouse|\n", 65 | "| -27|horse|\n", 66 | "+------+-----+\n", 67 | "\n" 68 | ] 69 | } 70 | ], 71 | "source": [ 72 | "// For implicit conversions from RDDs to DataFrames\n", 73 | "import org.apache.spark.sql.Row\n", 74 | "import org.apache.spark.sql.types._\n", 75 | "\n", 76 | "val someData = Seq(\n", 77 | " Row(8, \"bat\"),\n", 78 | " Row(64, \"mouse\"),\n", 79 | " Row(-27, \"horse\")\n", 80 | ")\n", 81 | "\n", 82 | "val someSchema = List(\n", 83 | " StructField(\"number\", IntegerType, true),\n", 84 | " StructField(\"word\", StringType, true)\n", 85 | ")\n", 86 | "\n", 87 | "val someDF = spark.createDataFrame(\n", 88 | " spark.sparkContext.parallelize(someData),\n", 89 | " StructType(someSchema)\n", 90 | ")\n", 91 | "someDF.show()" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 4, 97 | "metadata": { 98 | "scrolled": true 99 | }, 100 | "outputs": [ 101 | { 102 | "data": { 103 | "text/plain": [ 104 | "Array([4])" 105 | ] 106 | }, 107 | "execution_count": 4, 108 | "metadata": {}, 109 | "output_type": "execute_result" 110 | } 111 | ], 112 | "source": [ 113 | "spark.udf.register(\"stringLength\", (s: String) => s.length())\n", 114 | "spark.sql(\"SELECT stringLength('test')\").collect()" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 10, 120 | "metadata": {}, 121 | "outputs": [ 122 | { 123 | "name": "stdout", 124 | "output_type": "stream", 125 | "text": [ 126 | "+------+---+\n", 127 | "| name|age|\n", 128 | "+------+---+\n", 129 | "| Max| 33|\n", 130 | "| Adam| 32|\n", 131 | "|Muller| 62|\n", 132 | "+------+---+\n", 133 | "\n" 134 | ] 135 | } 136 | ], 137 | "source": [ 138 | "import spark.sqlContext.implicits._\n", 139 | "case class Person(name: String, age: Int)\n", 140 | "\n", 141 | "val personDS = Seq(Person(\"Max\", 33), Person(\"Adam\", 32), Person(\"Muller\", 62)).toDS()\n", 142 | "personDS.show()" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 3, 148 | "metadata": {}, 149 | "outputs": [ 150 | { 151 | "name": "stdout", 152 | "output_type": "stream", 153 | "text": [ 154 | "+----------+\n", 155 | "|sum_of_age|\n", 156 | "+----------+\n", 157 | "| 127|\n", 158 | "+----------+\n", 159 | "\n" 160 | ] 161 | } 162 | ], 163 | "source": [ 164 | "import org.apache.spark.sql.functions._\n", 165 | "personDS.groupBy().agg(sum(\"age\").as(\"sum_of_age\")).show()" 166 | ] 167 | } 168 | ], 169 | "metadata": { 170 | "kernelspec": { 171 | "display_name": "Apache Toree - Scala", 172 | "language": "scala", 173 | "name": "apache_toree_scala" 174 | }, 175 | "language_info": { 176 | "file_extension": ".scala", 177 | "name": "scala", 178 | "version": "2.11.8" 179 | } 180 | }, 181 | "nbformat": 4, 182 | "nbformat_minor": 2 183 | } 184 | -------------------------------------------------------------------------------- /python/recommender/song_recommender.py: -------------------------------------------------------------------------------- 1 | """ 2 | MATRIX FACTORIZATION & DIMENSIONALITY REDUCTION 3 | Case study: Recommending Products 4 | Models: 5 | Collaborative filtering 6 | Matrix factorization 7 | PCA 8 | Algorithms: 9 | Coordinate 
descent 10 | Eigen decomposition 11 | SVD 12 | Concepts: 13 | Matrix completion, eigenvalues, random projections, cold-start problem, diversity, scaling up 14 | """ 15 | import os 16 | from math import sqrt 17 | 18 | import numpy as np 19 | import pandas as pd 20 | from scipy.sparse.linalg import svds 21 | from sklearn.metrics import mean_squared_error 22 | from sklearn.metrics.pairwise import pairwise_distances 23 | from sklearn.model_selection import train_test_split 24 | 25 | 26 | def load_music_data(file_name): 27 | """Get reviews data, from local csv.""" 28 | if os.path.exists(file_name): 29 | print("-- " + file_name + " found locally") 30 | df = pd.read_csv(file_name) 31 | 32 | return df 33 | 34 | 35 | def values_to_map_index(values): 36 | map_index = {} 37 | idx = 0 38 | for val in values: 39 | map_index[val] = idx 40 | idx += 1 41 | 42 | return map_index 43 | 44 | 45 | def print_most_popular_songs(song): 46 | # Take a look at the words in the vocabulary 47 | vocab = vectorizer.get_feature_names() 48 | print "Words in vocabulary:", vocab 49 | 50 | # Sum up the counts of each vocabulary word 51 | dist = np.sum(song, axis=0) 52 | 53 | # For each, print the vocabulary word and the number of times it 54 | # appears in the training set 55 | print "Words frequency..." 56 | for tag, count in zip(vocab, dist): 57 | print count, tag 58 | 59 | 60 | def predict(ratings, similarity, type='user'): 61 | if type == 'user': 62 | mean_user_rating = ratings.mean(axis=1) 63 | # You use np.newaxis so that mean_user_rating has same format as ratings 64 | ratings_diff = (ratings - mean_user_rating[:, np.newaxis]) 65 | pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array( 66 | [np.abs(similarity).sum(axis=1)]).T 67 | elif type == 'item': 68 | pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)]) 69 | return pred 70 | 71 | 72 | def rmse(prediction, ground_truth): 73 | prediction = prediction[ground_truth.nonzero()].flatten() 74 | ground_truth = ground_truth[ground_truth.nonzero()].flatten() 75 | return sqrt(mean_squared_error(prediction, ground_truth)) 76 | 77 | 78 | if __name__ == "__main__": 79 | 80 | # Load music data 81 | song_data = load_music_data("song_data.csv") 82 | 83 | # Reduce complexity by getting first n elements 84 | n = 10000 85 | song_data = song_data.head(n) 86 | user_idx = values_to_map_index(song_data.user_id.unique()) 87 | song_idx = values_to_map_index(song_data.song_id.unique()) 88 | 89 | print "-- Explore data" 90 | print song_data.head() 91 | 92 | print "-- Showing the most popular songs in the dataset" 93 | unique, counts = np.unique(song_data["song"], return_counts=True) 94 | popular_songs = dict(zip(unique, counts)) 95 | df_popular_songs = pd.DataFrame(popular_songs.items(), columns=["Song", "Count"]) 96 | df_popular_songs = df_popular_songs.sort_values(by=["Count"], ascending=False) 97 | print df_popular_songs.head() 98 | 99 | n_users = song_data.user_id.unique().shape[0] 100 | n_items = song_data.song_id.unique().shape[0] 101 | print "Number of users = " + str(n_users) + " | Number of songs = " + str(n_items) 102 | 103 | train_data, test_data = train_test_split(song_data, test_size=0.25) 104 | train_data_matrix = np.zeros((n_users, n_items)) 105 | for line in train_data.itertuples(): 106 | train_data_matrix[user_idx[line[1]], song_idx[line[2]]] = line[3] 107 | 108 | test_data_matrix = np.zeros((n_users, n_items)) 109 | for line in test_data.itertuples(): 110 | test_data_matrix[user_idx[line[1]], song_idx[line[2]]] = line[3] 111 
| 112 | user_similarity = pairwise_distances(train_data_matrix, metric='cosine') 113 | item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine') 114 | 115 | item_prediction = predict(train_data_matrix, item_similarity, type='item') 116 | user_prediction = predict(train_data_matrix, user_similarity, type='user') 117 | 118 | print 'User-based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix)) 119 | print 'Item-based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix)) 120 | 121 | sparsity = round(1.0 - len(song_data) / float(n_users * n_items), 3) 122 | print 'The sparsity level is ' + str(sparsity * 100) + '%' 123 | 124 | # get SVD components from train matrix. Choose k. 125 | u, s, vt = svds(train_data_matrix, k=20) 126 | s_diag_matrix = np.diag(s) 127 | X_pred = np.dot(np.dot(u, s_diag_matrix), vt) 128 | print 'User-based CF MSE: ' + str(rmse(X_pred, test_data_matrix)) 129 | -------------------------------------------------------------------------------- /python/data/terminal.md: -------------------------------------------------------------------------------- 1 | # Retrieving a Software Package 2 | ``` 3 | %> wget http://framework.zend.com/releases/ZendFramework-1.10.3/ZendFramework-1.10.3-minimal.tar.gz 4 | ``` 5 | 6 | # Monitoring Server Processes 7 | ``` 8 | %> top 9 | ... 10 | ID COMMAND %CPU TIME #TH #WQ #PORT MEM PURG CMPRS PGRP PPID STATE BOOSTS %CPU_ME %CPU_OTHRS UID FAULTS 11 | 1701 top 2.7 00:00.42 1/1 0 20 2872K+ 0B 0B 1701 604 running *0[1] 0.00000 0.00000 0 3547+ 12 | 1675 com.apple.We 0.8 00:02.27 12 2 188- 41M- 6816K 0B 1675 1 sleeping *0[1061] 0.00000 0.00000 501 22651+ 13 | 1651 fsnotifier 0.0 00:00.02 3 1 30 292K 0B 404K 1644 1644 sleeping *0[1] 0.00000 0.00000 501 1144 14 | 1650 syncdefaults 0.0 00:00.56 4 2 124 8652K 0B 880K 1650 1 sleeping 0[2] 0.00000 0.00000 501 6658 15 | 1649 CVMCompiler 0.0 00:00.47 2 2 25 11M 0B 1572K 1649 1 sleeping *0[1] 0.00000 0.00000 501 4136 16 | 1645 ocspd 0.0 00:00.03 2 1 32 708K 0B 616K 1645 1 sleeping *0[1] 0.00000 0.00000 0 1502 17 | 1644 pycharm 3.9 02:01.61 52 2 344 509M 6372K 29M 1644 1 sleeping *0[50] 0.00000 0.00000 501 239705 18 | 1601 mdworker 0.0 00:00.12 3 1 56 116K 0B 4308K 1601 1 sleeping *0[1] 0.00000 0.00000 501 4105 19 | 20 | %> ps aux 21 | ... 22 | USER PID %CPU %MEM VSZ RSS TT STAT STARTED TIME COMMAND 23 | _windowserver 173 4.1 0.7 4877596 60536 ?? Ss 10:04AM 14:08.75 /System/Library/PrivateFrameworks/SkyLight.framework/Resources/WindowServer -daemon 24 | hongong 1644 2.2 8.2 7245168 688464 ?? S 12:16PM 7:39.61 /Applications/PyCharm CE.app/Contents/MacOS/pycharm 25 | hongong 589 1.2 0.5 2661812 44496 ?? S 10:09AM 0:43.55 /Applications/Utilities/Terminal.app/Contents/MacOS/Terminal 26 | hongong 701 0.8 0.5 2976980 41488 ?? 
S 10:19AM 4:25.12 /Applications/Sublime Text.app/Contents/MacOS/Sublime Text 27 | 28 | kill - Kill a process $ kill -15 24601 29 | pkill - -f Kill matching processes $ pkill -15 -f spring 30 | ``` 31 | 32 | # Reviewing Log Files 33 | ``` 34 | %> tail /var/log/apache/error.log 35 | %> tail -n 100 /var/log/apache/error.log | more 36 | %> tail -f /var/log/apache/error.log 37 | %> cat /var/log/apache/error.log 38 | %> less /var/log/apache/error.log 39 | ``` 40 | 41 | # Copying Files with scp 42 | ``` 43 | %> scp id_rsa.pub webuser@192.168.1.1:/home/webuser/.ssh/id_rsa.pub 44 | ``` 45 | 46 | # Backing Up Your Web Directory 47 | ``` 48 | # backup 49 | %> tar cpzf archive.backup.042710.tgz /var/mywebsite 50 | # restore 51 | %> tar xvpfz archive.backup.042710.tgz -C /var/www/ 52 | ``` 53 | 54 | # Viewing Your Command History 55 | ``` 56 | %> history 57 | ... 58 | 12 sudo ./configure && make 59 | 13 find . | grep config.log 60 | 14 less ./config.log 61 | 15 mongod 62 | 16 asadmin start-domain --debug 63 | ``` 64 | 65 | # Creating Directory Trees 66 | ``` 67 | %> mkdir -p webapp/application/controllers 68 | ``` 69 | 70 | # Creating Command Aliases 71 | You can add them to an account configuration file such as .bashrc. 72 | ``` 73 | %> alias dir='ls -al' 74 | %> dir 75 | ... 76 | drwxr-xr-x@ 6 hongong staff 204 Oct 20 12:00 . 77 | drwxr-xr-x@ 13 hongong staff 442 Sep 5 21:51 .. 78 | -rw-r--r--@ 1 hongong staff 6148 Sep 26 15:29 .DS_Store 79 | drwxr-xr-x 3 hongong staff 102 Sep 5 19:01 server 80 | -rw-r--r-- 1 hongong staff 158814 Oct 20 12:20 server.log 81 | ``` 82 | 83 | # Editing the line 84 | ``` 85 | echo Print string to screen 86 | man Display manual page for command 87 | ⌃C Get out of trouble 88 | ⌃A Move to beginning of line 89 | ⌃E Move to end of line 90 | ⌃U Delete to beginning of line 91 | ⌃K Delete to ending of line 92 | ⌃W Delete word before cursor 93 | ``` 94 | 95 | # Manipulating files 96 | ``` 97 | > Redirect output to filename 98 | >> Append output to filename 99 | diff Diff files 1 & 2 100 | ``` 101 | 102 | # Wordcount and pipes 103 | ``` 104 | wc server.log 105 | 1131 10946 167679 server.log 106 | 1131 lines, 10946 words, 167679 bytes 107 | 108 | head server.log | wc 109 | 10 104 1438 110 | ``` 111 | 112 | # Less is more 113 | ``` 114 | up & down arrow keys Move up or down one line 115 | spacebar Move forward one page 116 | ⌃F Move forward one page 117 | ⌃B Move back one page 118 | G Move to end of file 119 | 1G Move to beginning of file 120 | / Search file for string 121 | n Move to next search result 122 | N Move to previous search result 123 | q Quit less 124 | -N View line number 125 | ``` 126 | 127 | # Grepping 128 | ``` 129 | grep Find string in file 130 | grep -i Find case-insensitively 131 | 132 | ``` -------------------------------------------------------------------------------- /deep_learning/basics.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | 4 | import matplotlib.image as mpimg 5 | import matplotlib.pyplot as plt 6 | import numpy as np 7 | import tensorflow as tf 8 | from pydicom import dicomio 9 | 10 | 11 | def convolution_with_filter(img_4d, filter): 12 | convolved = tf.nn.conv2d(img_4d, filter, strides=[1, 1, 1, 1], padding='SAME') 13 | res = convolved.eval() 14 | 15 | plt.imshow(np.squeeze(res), cmap='gray') 16 | plt.imshow(res[0, :, :, 0], cmap='gray') 17 | plt.show() 18 | 19 | 20 | if __name__ == "__main__": 21 | 22 | ############################## 23 | # Basic read and show images # 24 | 
############################## 25 | img = mpimg.imread("imgs/dogs.jpg") 26 | print "Image data", img.shape 27 | print img 28 | 29 | print "Show image" 30 | plt.style.use("ggplot") 31 | plt.imshow(img) 32 | plt.colorbar() 33 | plt.show() 34 | 35 | print "Show RGB channels" 36 | plt.imshow(img[:, :, 0], cmap="gray") 37 | plt.show() 38 | plt.imshow(img[:, :, 1], cmap="gray") 39 | plt.show() 40 | plt.imshow(img[:, :, 2], cmap="gray") 41 | plt.show() 42 | 43 | ############################ 44 | # Mean/Deviation of Images # 45 | ############################ 46 | root_dir = "sample_images/00cba091fa4ad62cc3200a657aeb957e/" 47 | os.chdir(root_dir) 48 | images = [] 49 | for f in glob.glob("*.dcm"): 50 | ds = dicomio.read_file(f) 51 | img = ds.pixel_array 52 | images.append(img) 53 | 54 | # convert to array 55 | data = np.array(images) 56 | print "Total images:", len(images) 57 | print "Image dimensions:", images[0].shape 58 | print "Combine dimensions:", data.shape 59 | 60 | plt.style.use("ggplot") 61 | print "Calculating mean images" 62 | mean_img = np.mean(data, axis=0) 63 | plt.imshow(mean_img.astype(np.uint8)) 64 | plt.show() 65 | 66 | print "Calculating deviation images" 67 | std_img = np.std(data, axis=0) 68 | plt.imshow(std_img.astype(np.uint8)) 69 | plt.show() 70 | 71 | ############# 72 | # Histogram # 73 | ############# 74 | # convert to flattened array 75 | flattened = data.ravel() 76 | print "First image:", data[:1] 77 | print "First 10 values:", flattened[:10] 78 | 79 | print "Histogram" 80 | plt.hist(flattened, 255) 81 | plt.show() 82 | 83 | print "Histogram Equalization" 84 | plt.hist(mean_img.ravel(), 255) 85 | plt.show() 86 | 87 | print "Normalizing our data" 88 | bins = 20 89 | fig, axs = plt.subplots(1, 3, figsize=(12, 6), sharey=True, sharex=True) 90 | axs[0].hist(data[0].ravel(), bins) 91 | axs[0].set_title("img distribution") 92 | axs[1].hist(mean_img.ravel(), bins) 93 | axs[1].set_title("mean distribution") 94 | axs[2].hist((data[0] - mean_img).ravel(), bins) 95 | axs[2].set_title("(img - mean) distribution") 96 | plt.show() 97 | 98 | #################### 99 | # Tensorflow basic # 100 | #################### 101 | print "Tensors" 102 | x = tf.linspace(-3.0, 3.0, 100) 103 | print x 104 | 105 | print "Graphs and Operations" 106 | g = tf.get_default_graph() 107 | print [op.name for op in g.get_operations()] 108 | 109 | print "Tensor" 110 | print g.get_tensor_by_name('LinSpace' + ':0') 111 | 112 | # Create Session 113 | sess = tf.Session() 114 | 115 | # Tell session to compute 116 | print "Session computes" 117 | computed_x = sess.run(x) 118 | print(computed_x) 119 | 120 | # Evaluate itself using this session 121 | print "Variable evaluates" 122 | computed_x = x.eval(session=sess) 123 | print(computed_x) 124 | 125 | print "Tensor shapes" 126 | print(x.get_shape()) 127 | # convert to list format 128 | print(x.get_shape().as_list()) 129 | 130 | # Close the session 131 | sess.close() 132 | 133 | # explicitly tell the session which graph we want to manage 134 | sess = tf.Session(graph=g) 135 | sess.close() 136 | 137 | # created a new graph 138 | g2 = tf.Graph() 139 | 140 | # interactive with Tensorflow 141 | sess = tf.InteractiveSession() 142 | print x.eval() 143 | 144 | ############### 145 | # Convolution # 146 | ############### 147 | mean = 0.0 148 | sigma = 1.0 149 | 150 | z = (tf.exp(tf.neg(tf.pow(x - mean, 2.0) / 151 | (2.0 * tf.pow(sigma, 2.0)))) * 152 | (1.0 / (sigma * tf.sqrt(2.0 * 3.1415)))) 153 | 154 | res = z.eval() 155 | plt.style.use("ggplot") 156 | plt.plot(res) 157 | 
plt.show() 158 | 159 | # store the number of values in our Gaussian curve. 160 | ksize = z.get_shape().as_list()[0] 161 | 162 | # multiply the two to get a 2d gaussian 163 | z_2d = tf.matmul(tf.reshape(z, [ksize, 1]), tf.reshape(z, [1, ksize])) 164 | 165 | # Execute the graph 166 | plt.imshow(z_2d.eval()) 167 | plt.colorbar() 168 | plt.show() 169 | 170 | # use tensorflow to reshape matrix 171 | img = mean_img.astype(np.float32) 172 | img_4d = tf.reshape(img, [1, img.shape[0], img.shape[1], 1]) 173 | print("Tensorflow image shape:", img_4d.get_shape().as_list()) 174 | 175 | # Reshape with 4d format: H x W x I x O 176 | z_4d = tf.reshape(z_2d, [ksize, ksize, 1, 1]) 177 | print("Tensorflow kernel shape:", z_4d.get_shape().as_list()) 178 | 179 | convolution_with_filter(img_4d, z_4d) 180 | 181 | # apply sharpen filter 182 | sharpen_filter = np.zeros([3, 3, 1, 1]) 183 | sharpen_filter[1, 1, :, :] = 5 184 | sharpen_filter[0, 1, :, :] = -1 185 | sharpen_filter[1, 0, :, :] = -1 186 | sharpen_filter[2, 1, :, :] = -1 187 | sharpen_filter[1, 2, :, :] = -1 188 | 189 | convolution_with_filter(img_4d, sharpen_filter) 190 | 191 | # apply top sobel filter 192 | top_sobel_filter = np.zeros([3, 3, 1, 1]) 193 | top_sobel_filter[0, 0, :, :] = 1 194 | top_sobel_filter[0, 1, :, :] = 2 195 | top_sobel_filter[0, 2, :, :] = 1 196 | top_sobel_filter[2, 0, :, :] = -1 197 | top_sobel_filter[2, 1, :, :] = -2 198 | top_sobel_filter[2, 2, :, :] = -1 199 | 200 | convolution_with_filter(img_4d, top_sobel_filter) 201 | 202 | 203 | -------------------------------------------------------------------------------- /python/getting_data.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import json 3 | import re 4 | from collections import Counter 5 | 6 | import matplotlib.pyplot as plt 7 | import requests 8 | from bs4 import BeautifulSoup 9 | from dateutil.parser import parse 10 | from twython import Twython 11 | 12 | 13 | def print_data(ma_ck, kl, gia, delta): 14 | print ma_ck, "#", kl, "#", gia, "#", delta 15 | 16 | 17 | #### 18 | # 19 | # Oreilly 20 | # 21 | #### 22 | 23 | def is_video(td): 24 | """it's a video if it has exactly one pricelabel, and if 25 | the stripped text inside that pricelabel starts with 'Video'""" 26 | price_labels = td('span', 'pricelabel') 27 | return (len(price_labels) == 1 and 28 | price_labels[0].text.strip().startswith("Video")) 29 | 30 | 31 | def book_info(td): 32 | """given a BeautifulSoup Tag representing a book, 33 | extract the book's details and return a dict""" 34 | 35 | title = td.find("div", "thumbheader").a.text 36 | by_author = td.find('div', 'AuthorName').text 37 | authors = [x.strip() for x in re.sub("^By ", "", by_author).split(",")] 38 | isbn_link = td.find("div", "thumbheader").a.get("href") 39 | isbn = re.match("/product/(.*)\.do", isbn_link).groups()[0] 40 | date = td.find("span", "directorydate").text.strip() 41 | 42 | return { 43 | "title": title, 44 | "authors": authors, 45 | "isbn": isbn, 46 | "date": date 47 | } 48 | 49 | 50 | def scrape(num_pages=10): 51 | base_url = "http://shop.oreilly.com/category/browse-subjects/data.do?sortby=publicationDate&page=" 52 | 53 | books = [] 54 | 55 | for page_num in range(1, num_pages + 1): 56 | print "souping page", page_num 57 | url = base_url + str(page_num) 58 | soup = BeautifulSoup(requests.get(url).text, 'lxml') 59 | 60 | for td in soup('td', 'thumbtext'): 61 | if not is_video(td): 62 | books.append(book_info(td)) 63 | 64 | return books 65 | 66 | 67 | def get_year(book): 68 | 
"""book["date"] looks like 'November 2014' so we need to 69 | split on the space and then take the second piece""" 70 | return int(book["date"].split()[1]) 71 | 72 | 73 | def plot_years(plt, books): 74 | # 2014 is the last complete year of data (when I ran this) 75 | year_counts = Counter(get_year(book) for book in books 76 | if get_year(book) <= 2016) 77 | 78 | years = sorted(year_counts) 79 | book_counts = [year_counts[year] for year in years] 80 | plt.bar([x - 0.5 for x in years], book_counts) 81 | plt.xlabel("year") 82 | plt.ylabel("# of data books") 83 | plt.title("Data is Big!") 84 | plt.show() 85 | 86 | 87 | #### 88 | # 89 | # Twitter 90 | # 91 | #### 92 | 93 | # fill these in if you want to use the code 94 | CONSUMER_KEY = "JeuEwD5RJiBbxiw9jTMBYBEmU" 95 | CONSUMER_SECRET = "xRcmv8AMnSSMwq875HiP1SKFfGw51M97BvVH341yckPY3iilCu" 96 | ACCESS_TOKEN = "47319754-NL1AIh9PBomIVsJe5HXB9vjE5y1rjwZFYUQx0odzo" 97 | ACCESS_TOKEN_SECRET = "kcq7ER8UZSykDomPn9lYdh5DAafndvp73PzSfykTq0Kp7" 98 | 99 | 100 | def call_twitter_search_api(): 101 | twitter = Twython(CONSUMER_KEY, CONSUMER_SECRET) 102 | 103 | # search for tweets containing the phrase "data science" 104 | for status in twitter.search(q='"data science"')["statuses"]: 105 | user = status["user"]["screen_name"].encode('utf-8') 106 | text = status["text"].encode('utf-8') 107 | print user, ":", text 108 | print 109 | 110 | 111 | if __name__ == "__main__": 112 | print "# Data from: http://s.cafef.vn/du-lieu.chn" 113 | print "## TAB delimited stock prices" 114 | 115 | with open('data/tab_delimited_stock_prices.tsv', 'rb') as f: 116 | reader = csv.reader(f, delimiter='\t') 117 | for row in reader: 118 | ma_ck = row[0] 119 | kl = row[1] 120 | gia = float(row[2]) 121 | delta = row[3] 122 | print_data(ma_ck, kl, gia, delta) 123 | 124 | print 125 | 126 | print "## COLON delimited stock prices" 127 | with open('data/colon_delimited_stock_prices.csv', 'rb') as f: 128 | reader = csv.DictReader(f, delimiter=':') 129 | for row in reader: 130 | ma_ck = row["MA_CK"] 131 | kl = row["KL"] 132 | gia = float(row["GIA"]) 133 | delta = row["DELTA"] 134 | print_data(ma_ck, kl, gia, delta) 135 | 136 | print 137 | 138 | print "## WRITING out comma_delimited_stock_prices.csv" 139 | today_prices = {'VCF': 152.4, 'VAF': 13.3, 'ATA': 0.8} 140 | with open('data/comma_delimited_stock_prices.csv', 'wb') as f: 141 | writer = csv.writer(f, delimiter=',') 142 | for stock, price in today_prices.items(): 143 | writer.writerow([stock, price]) 144 | 145 | print "## BeautifulSoup" 146 | html = requests.get("https://www.google.com").text 147 | soup = BeautifulSoup(html, "lxml") 148 | print soup 149 | print 150 | 151 | print "## PARSING json" 152 | # parse the JSON to create a Python object 153 | with open("data/colors.json") as json_data: 154 | document = json.load(json_data) 155 | print "Getting blue value:", document["blue"] 156 | 157 | print 158 | 159 | print "## GitHub API" 160 | endpoint = "https://api.github.com/users/ongxuanhong/repos" 161 | repos = json.loads(requests.get(endpoint).text) 162 | 163 | dates = [parse(repo["created_at"]) for repo in repos] 164 | month_counts = Counter(date.month for date in dates) 165 | weekday_counts = Counter(date.weekday() for date in dates) 166 | 167 | print "dates", [d.strftime("%d/%m/%y") for d in dates] 168 | print "month_counts", month_counts 169 | print "weekday_count", weekday_counts 170 | 171 | last_5_repositories = sorted(repos, 172 | key=lambda r: r["created_at"], 173 | reverse=True)[:5] 174 | 175 | print "last five repos", 
[repo["name"] 176 | for repo in last_5_repositories] 177 | print 178 | 179 | print "## Oreilly books" 180 | books = scrape() 181 | plot_years(plt, books) 182 | print 183 | 184 | print "## Twitter search" 185 | call_twitter_search_api() 186 | -------------------------------------------------------------------------------- /pyspark/notebooks/Intro_DataFrame.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"markdown","source":["# Creating DataFrames with Python"],"metadata":{}},{"cell_type":"code","source":["# import pyspark class Row from module sql\nfrom pyspark.sql import *\n\n# Create Example Data - Departments and Employees\n\n# Create the Departments\ndepartment1 = Row(id='123456', name='Computer Science')\ndepartment2 = Row(id='789012', name='Mechanical Engineering')\ndepartment3 = Row(id='345678', name='Theater and Drama')\ndepartment4 = Row(id='901234', name='Indoor Recreation')\n\n# Create the Employees\nEmployee = Row(\"firstName\", \"lastName\", \"email\", \"salary\")\nemployee1 = Employee('michael', 'armbrust', 'no-reply@berkeley.edu', 100000)\nemployee2 = Employee('xiangrui', 'meng', 'no-reply@stanford.edu', 120000)\nemployee3 = Employee('matei', None, 'no-reply@waterloo.edu', 140000)\nemployee4 = Employee(None, 'wendell', 'no-reply@berkeley.edu', 160000)\n\n# Create the DepartmentWithEmployees instances from Departments and Employees\ndepartmentWithEmployees1 = Row(department=department1, employees=[employee1, employee2])\ndepartmentWithEmployees2 = Row(department=department2, employees=[employee3, employee4])\ndepartmentWithEmployees3 = Row(department=department3, employees=[employee1, employee4])\ndepartmentWithEmployees4 = Row(department=department4, employees=[employee2, employee3])\n\nprint department1\nprint employee2\nprint departmentWithEmployees1.employees[0].email"],"metadata":{},"outputs":[],"execution_count":2},{"cell_type":"code","source":["# Create the first DataFrame from a list of the rows.\ndepartmentsWithEmployeesSeq1 = [departmentWithEmployees1, departmentWithEmployees2]\ndf1 = sqlContext.createDataFrame(departmentsWithEmployeesSeq1)\n\ndisplay(df1)"],"metadata":{},"outputs":[],"execution_count":3},{"cell_type":"code","source":["# Create a second DataFrame from a list of rows.\ndepartmentsWithEmployeesSeq2 = [departmentWithEmployees3, departmentWithEmployees4]\ndf2 = sqlContext.createDataFrame(departmentsWithEmployeesSeq2)\n\ndisplay(df2)"],"metadata":{},"outputs":[],"execution_count":4},{"cell_type":"markdown","source":["# Working with DataFrames"],"metadata":{}},{"cell_type":"code","source":["# Union 2 DataFrames.\nunionDF = df1.unionAll(df2)\ndisplay(unionDF)"],"metadata":{},"outputs":[],"execution_count":6},{"cell_type":"code","source":["# Write the Unioned DataFrame to a Parquet file.\n# Remove the file if it exists\ndbutils.fs.rm(\"/tmp/df-example.parquet\", True)\nunionDF.write.parquet(\"/tmp/df-example.parquet\")"],"metadata":{},"outputs":[],"execution_count":7},{"cell_type":"code","source":["# Read a DataFrame from the Parquet file.\nparquetDF = sqlContext.read.parquet(\"/tmp/df-example.parquet\")\ndisplay(parquetDF)"],"metadata":{},"outputs":[],"execution_count":8},{"cell_type":"code","source":["# Explode the employees column.\nfrom pyspark.sql import Row\nfrom pyspark.sql import functions as F\neDF = sqlContext.createDataFrame([Row(a=1, intlist=[1,2,3], mapfield={\"a\": \"b\"})])\n\nprint 
eDF.select(F.explode(eDF.intlist).alias(\"anInt\")).collect()\neDF.select(F.explode(eDF.mapfield).alias(\"key\", \"value\")).show()"],"metadata":{},"outputs":[],"execution_count":9},{"cell_type":"code","source":["from pyspark.sql.functions import explode\ndf = parquetDF.select(explode(\"employees\").alias(\"e\"))\nexplodeDF = df.selectExpr(\"e.firstName\", \"e.lastName\", \"e.email\", \"e.salary\")\ndisplay(explodeDF)"],"metadata":{},"outputs":[],"execution_count":10},{"cell_type":"code","source":["# Use filter() to return only the rows that match the given predicate.\nfilterDF = explodeDF.filter(explodeDF.firstName == \"xiangrui\").sort(explodeDF.lastName)\ndisplay(filterDF)"],"metadata":{},"outputs":[],"execution_count":11},{"cell_type":"code","source":["from pyspark.sql.functions import col, asc\n# use | instead of or\nfilterDF = explodeDF.filter((col(\"firstName\") == \"xiangrui\") | (col(\"firstName\") == \"michael\"))\ndisplay(filterDF)"],"metadata":{},"outputs":[],"execution_count":12},{"cell_type":"code","source":["# The where() clause is equivalent to filter().\nwhereDF = explodeDF.where((col(\"firstName\") == \"xiangrui\") | (col(\"firstName\") == \"michael\")).sort(asc(\"lastName\"))\ndisplay(whereDF)"],"metadata":{},"outputs":[],"execution_count":13},{"cell_type":"code","source":["# Replace null values with -- using DataFrame Na functions.\nnonNullDF = explodeDF.fillna(\"--\")\ndisplay(nonNullDF)"],"metadata":{},"outputs":[],"execution_count":14},{"cell_type":"code","source":["# Retrieve only rows with missing firstName or lastName.\nfilterNullDF = explodeDF.filter((col(\"firstName\").isNull()) | (col(\"lastName\").isNull())).sort(\"email\")\ndisplay(filterNullDF)"],"metadata":{},"outputs":[],"execution_count":15},{"cell_type":"code","source":["# Example aggregations using agg() and countDistinct().\nfrom pyspark.sql.functions import countDistinct\ncountDistinctDF = explodeDF.select(\"firstName\", \"lastName\").groupBy(\"firstName\", \"lastName\").agg(countDistinct(\"firstName\"))\ndisplay(countDistinctDF)"],"metadata":{},"outputs":[],"execution_count":16},{"cell_type":"code","source":["# Compare the DataFrame and SQL Query Physical Plans (Hint: They should be the same.)\ncountDistinctDF.explain()"],"metadata":{},"outputs":[],"execution_count":17},{"cell_type":"code","source":["explodeDF.registerTempTable(\"table_example\")\ncountDistinctDF_sql = sqlContext.sql(\"SELECT firstName, lastName, count(distinct firstName) as distinct_first_names FROM table_example GROUP BY firstName, lastName\")\ncountDistinctDF_sql.explain()"],"metadata":{},"outputs":[],"execution_count":18},{"cell_type":"code","source":["# Sum up all the salaries\nsalarySumDF = explodeDF.agg({\"salary\": \"sum\"})\ndisplay(salarySumDF)"],"metadata":{},"outputs":[],"execution_count":19},{"cell_type":"code","source":["# Print the summary statistics for the salaries.\nexplodeDF.describe(\"salary\").show()"],"metadata":{},"outputs":[],"execution_count":20},{"cell_type":"code","source":["display(explodeDF.select(\"salary\"))"],"metadata":{},"outputs":[],"execution_count":21},{"cell_type":"code","source":["# An example using Pandas & Matplotlib Integration\nimport pandas as pd\nimport matplotlib.pyplot as plt\nplt.clf()\npdDF = nonNullDF.toPandas()\npdDF.plot(x=\"firstName\", y=\"salary\", kind=\"bar\", rot=45)\ndisplay()"],"metadata":{},"outputs":[],"execution_count":22},{"cell_type":"code","source":["# Cleanup: Remove the parquet file.\ndbutils.fs.rm(\"/tmp/df-example.parquet\", 
True)"],"metadata":{},"outputs":[],"execution_count":23}],"metadata":{"name":"Intro_DataFrame","notebookId":651954930651402},"nbformat":4,"nbformat_minor":0} 2 | -------------------------------------------------------------------------------- /python/data/markdown_examples.md: -------------------------------------------------------------------------------- 1 | # Headers 2 | ``` 3 | # H1 4 | ## H2 5 | ### H3 6 | #### H4 7 | ##### H5 8 | ###### H6 9 | 10 | Alternatively, for H1 and H2, an underline-ish style: 11 | 12 | Alt-H1 13 | ====== 14 | 15 | Alt-H2 16 | ------ 17 | ``` 18 | 19 | # Emphasis 20 | ``` 21 | Emphasis, aka italics, with *asterisks* or _underscores_. 22 | 23 | Strong emphasis, aka bold, with **asterisks** or __underscores__. 24 | 25 | Combined emphasis with **asterisks and _underscores_**. 26 | 27 | Strikethrough uses two tildes. ~~Scratch this.~~ 28 | ``` 29 | Emphasis, aka italics, with *asterisks* or _underscores_. 30 | 31 | Strong emphasis, aka bold, with **asterisks** or __underscores__. 32 | 33 | Combined emphasis with **asterisks and _underscores_**. 34 | 35 | Strikethrough uses two tildes. ~~Scratch this.~~ 36 | 37 | # Lists 38 | ``` 39 | 1. First ordered list item 40 | 2. Another item 41 | * Unordered sub-list. 42 | 1. Actual numbers don't matter, just that it's a number 43 | 1. Ordered sub-list 44 | 4. And another item. 45 | 46 | Some text that should be aligned with the above item. 47 | 48 | * Unordered list can use asterisks 49 | - Or minuses 50 | + Or pluses 51 | ``` 52 | 1. First ordered list item 53 | 2. Another item 54 | * Unordered sub-list. 55 | 1. Actual numbers don't matter, just that it's a number 56 | 1. Ordered sub-list 57 | 4. And another item. 58 | 59 | Some text that should be aligned with the above item. 60 | 61 | * Unordered list can use asterisks 62 | - Or minuses 63 | + Or pluses 64 | 65 | # Links 66 | There are two ways to create links. 67 | ``` 68 | [I'm an inline-style link](https://www.google.com) 69 | 70 | [I'm a reference-style link][Arbitrary case-insensitive reference text] 71 | 72 | [You can use numbers for reference-style link definitions][1] 73 | 74 | Or leave it empty and use the [link text itself] 75 | 76 | URLs and URLs in angle brackets will automatically get turned into links. 77 | http://www.example.com or and sometimes 78 | example.com (but not on Github, for example). 79 | 80 | Some text to show that the reference links can follow later. 81 | 82 | [arbitrary case-insensitive reference text]: https://www.mozilla.org 83 | [1]: http://slashdot.org 84 | [link text itself]: http://www.reddit.com 85 | ``` 86 | [I'm an inline-style link](https://www.google.com) 87 | 88 | [I'm a reference-style link][Arbitrary case-insensitive reference text] 89 | 90 | [You can use numbers for reference-style link definitions][1] 91 | 92 | Or leave it empty and use the [link text itself] 93 | 94 | URLs and URLs in angle brackets will automatically get turned into links. 95 | http://www.example.com or and sometimes 96 | example.com (but not on Github, for example). 97 | 98 | Some text to show that the reference links can follow later. 
99 | 100 | [arbitrary case-insensitive reference text]: https://www.mozilla.org 101 | [1]: http://slashdot.org 102 | [link text itself]: http://www.reddit.com 103 | 104 | # Images 105 | ``` 106 | Here's our logo (hover to see the title text): 107 | 108 | Inline-style: 109 | ![alt text](https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png "Logo Title Text 1") 110 | 111 | Reference-style: 112 | ![alt text][logo] 113 | 114 | [logo]: https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png "Logo Title Text 2" 115 | ``` 116 | Here's our logo (hover to see the title text): 117 | 118 | Inline-style: 119 | ![alt text](https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png "Logo Title Text 1") 120 | 121 | Reference-style: 122 | ![alt text][logo] 123 | 124 | [logo]: https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png "Logo Title Text 2" 125 | 126 | # Code and Syntax Highlighting 127 |
128 | Inline `code` has `back-ticks around` it.
129 | 
130 | 131 | Blocks of code are either fenced by lines with three back-ticks ```, or are indented with four spaces. I recommend only using the fenced code blocks -- they're easier and only they support syntax highlighting. 132 |
133 | ```javascript
134 | var s = "JavaScript syntax highlighting";
135 | alert(s);
136 | ```
137 |  
138 | ```python
139 | s = "Python syntax highlighting"
140 | print s
141 | ```
142 |  
143 | ```
144 | No language indicated, so no syntax highlighting. 
145 | But let's throw in a <b>tag</b>.
146 | ```
147 | 
148 | 149 | # Tables 150 | ``` 151 | Colons can be used to align columns. 152 | 153 | | Tables | Are | Cool | 154 | | ------------- |:-------------:| -----:| 155 | | col 3 is | right-aligned | $1600 | 156 | | col 2 is | centered | $12 | 157 | | zebra stripes | are neat | $1 | 158 | 159 | The outer pipes (|) are optional, and you don't need to make the raw Markdown line up prettily. You can also use inline Markdown. 160 | 161 | Markdown | Less | Pretty 162 | --- | --- | --- 163 | *Still* | `renders` | **nicely** 164 | 1 | 2 | 3 165 | ``` 166 | Colons can be used to align columns. 167 | 168 | | Tables | Are | Cool | 169 | | ------------- |:-------------:| -----:| 170 | | col 3 is | right-aligned | $1600 | 171 | | col 2 is | centered | $12 | 172 | | zebra stripes | are neat | $1 | 173 | 174 | The outer pipes (|) are optional, and you don't need to make the raw Markdown line up prettily. You can also use inline Markdown. 175 | 176 | Markdown | Less | Pretty 177 | --- | --- | --- 178 | *Still* | `renders` | **nicely** 179 | 1 | 2 | 3 180 | 181 | # Blockquotes 182 | ``` 183 | > Blockquotes are very handy in email to emulate reply text. 184 | > This line is part of the same quote. 185 | 186 | Quote break. 187 | 188 | > This is a very long line that will still be quoted properly when it wraps. Oh boy let's keep writing to make sure this is long enough to actually wrap for everyone. Oh, you can *put* **Markdown** into a blockquote. 189 | ``` 190 | > Blockquotes are very handy in email to emulate reply text. 191 | > This line is part of the same quote. 192 | 193 | Quote break. 194 | 195 | > This is a very long line that will still be quoted properly when it wraps. Oh boy let's keep writing to make sure this is long enough to actually wrap for everyone. Oh, you can *put* **Markdown** into a blockquote. 196 | 197 | # Inline HTML 198 | ``` 199 |
<dl>
200 |   <dt>Definition list</dt>
201 |   <dd>Is something people use sometimes.</dd>
202 | 
203 |   <dt>Markdown in HTML</dt>
204 |   <dd>Does *not* work **very** well. Use HTML <em>tags</em>.</dd>
205 | </dl>
206 | ``` 207 | 208 | # Horizontal Rule 209 | ``` 210 | Three or more... 211 | 212 | --- 213 | 214 | Hyphens 215 | 216 | *** 217 | 218 | Asterisks 219 | 220 | ___ 221 | 222 | Underscores 223 | ``` 224 | 225 | # Visualization 226 | Get Vietnam GDP data at 227 | http://data.worldbank.org/country/vietnam -------------------------------------------------------------------------------- /deep_learning/snippets/knn_cats_vs_dogs.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import sys 4 | from operator import itemgetter 5 | 6 | import numpy as np 7 | from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis 8 | from sklearn.ensemble import AdaBoostClassifier 9 | from sklearn.ensemble import RandomForestClassifier 10 | from sklearn.gaussian_process import GaussianProcessClassifier 11 | from sklearn.gaussian_process.kernels import RBF 12 | from sklearn.model_selection import train_test_split 13 | from sklearn.naive_bayes import GaussianNB 14 | from sklearn.neighbors import KNeighborsClassifier 15 | from sklearn.neural_network import MLPClassifier 16 | from sklearn.svm import SVC 17 | from sklearn.tree import DecisionTreeClassifier 18 | 19 | 20 | def image_to_feature_vector(image, size=(32, 32)): 21 | # resize the image to a fixed size, then flatten the image into 22 | # a list of raw pixel intensities 23 | return cv2.resize(image, size).flatten() 24 | 25 | 26 | def extract_color_histogram(image, bins=(8, 8, 8)): 27 | # extract a 3D color histogram from the HSV color space using 28 | # the supplied number of `bins` per channel 29 | hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV) 30 | hist = cv2.calcHist([hsv], [0, 1, 2], None, bins, [0, 180, 0, 256, 0, 256]) 31 | 32 | # handle normalizing the histogram if we are using OpenCV 2.4.X 33 | if imutils.is_cv2(): 34 | hist = cv2.normalize(hist) 35 | 36 | # otherwise, perform "in place" normalization in OpenCV 3 (I 37 | # personally hate the way this is done 38 | else: 39 | cv2.normalize(hist, hist) 40 | 41 | # return the flattened histogram as the feature vector 42 | return hist.flatten() 43 | 44 | 45 | if __name__ == "__main__": 46 | # Load opencv libraries 47 | sys.path.append('/usr/local/lib/python2.7/site-packages') 48 | import cv2 49 | import imutils 50 | from imutils import paths 51 | 52 | # construct the argument parse and parse the arguments 53 | ap = argparse.ArgumentParser() 54 | ap.add_argument("-d", "--dataset", required=True, help="path to input dataset") 55 | ap.add_argument("-k", "--neighbors", type=int, default=1, help="# of nearest neighbors for classification") 56 | ap.add_argument("-j", "--jobs", type=int, default=-1, 57 | help="# of jobs for k-NN distance (-1 uses all available cores)") 58 | args = vars(ap.parse_args()) 59 | 60 | # grab the list of images that we'll be describing 61 | print("[INFO] describing images...") 62 | imagePaths = list(paths.list_images(args["dataset"])) 63 | 64 | # initialize the raw pixel intensities matrix, the features matrix, 65 | # and labels list 66 | rawImages = [] 67 | features = [] 68 | labels = [] 69 | 70 | # loop over the input images 71 | for (i, imagePath) in enumerate(imagePaths): 72 | # load the image and extract the class label (assuming that our 73 | # path as the format: /path/to/dataset/{class}.{image_num}.jpg 74 | image = cv2.imread(imagePath) 75 | label = imagePath.split(os.path.sep)[-1].split(".")[0] 76 | 77 | # extract raw pixel intensity "features", followed by a color 78 | # histogram to characterize 
the color distribution of the pixels 79 | # in the image 80 | pixels = image_to_feature_vector(image) 81 | hist = extract_color_histogram(image) 82 | 83 | # update the raw images, features, and labels matricies, 84 | # respectively 85 | rawImages.append(pixels) 86 | features.append(hist) 87 | labels.append(label) 88 | 89 | # show an update every 1,000 images 90 | if i > 0 and i % 1000 == 0: 91 | print("[INFO] processed {}/{}".format(i, len(imagePaths))) 92 | 93 | # show some information on the memory consumed by the raw images 94 | # matrix and features matrix 95 | rawImages = np.array(rawImages) 96 | features = np.array(features) 97 | labels = np.array(labels) 98 | print("[INFO] pixels matrix: {:.2f}MB".format(rawImages.nbytes / (1024 * 1000.0))) 99 | print("[INFO] features matrix: {:.2f}MB".format(features.nbytes / (1024 * 1000.0))) 100 | 101 | # partition the data into training and testing splits, using 75% 102 | # of the data for training and the remaining 25% for testing 103 | # (trainRI, testRI, trainRL, testRL) = train_test_split(rawImages, labels, test_size=0.25, random_state=42) 104 | (trainFeat, testFeat, trainLabels, testLabels) = train_test_split(features, labels, test_size=0.25, random_state=42) 105 | 106 | # # train and evaluate a k-NN classifer on the raw pixel intensities 107 | # print("[INFO] evaluating raw pixel accuracy...") 108 | # model = KNeighborsClassifier(n_neighbors=args["neighbors"], n_jobs=args["jobs"]) 109 | # model.fit(trainRI, trainRL) 110 | # acc = model.score(testRI, testRL) 111 | # print("[INFO] raw pixel accuracy: {:.2f}%".format(acc * 100)) 112 | # 113 | # # train and evaluate a k-NN classifer on the histogram 114 | # # representations 115 | # print("[INFO] evaluating histogram accuracy...") 116 | # model = KNeighborsClassifier(n_neighbors=args["neighbors"], n_jobs=args["jobs"]) 117 | # model.fit(trainFeat, trainLabels) 118 | # acc = model.score(testFeat, testLabels) 119 | # print("[INFO] histogram accuracy: {:.2f}%".format(acc * 100)) 120 | 121 | print "---------------------------" 122 | print "Training" 123 | print "---------------------------" 124 | 125 | names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Gaussian Process", 126 | "Decision Tree", "Random Forest", "Neural Net", "AdaBoost", 127 | "Naive Bayes", "QDA"] 128 | 129 | classifiers = [ 130 | KNeighborsClassifier(3, n_jobs=args["jobs"]), 131 | SVC(kernel="linear", C=0.025), 132 | SVC(gamma=2, C=1), 133 | GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True, n_jobs=args["jobs"]), 134 | DecisionTreeClassifier(max_depth=5), 135 | RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1, n_jobs=args["jobs"]), 136 | MLPClassifier(alpha=1), 137 | AdaBoostClassifier(), 138 | GaussianNB(), 139 | QuadraticDiscriminantAnalysis()] 140 | 141 | # iterate over classifiers 142 | results = {} 143 | 144 | for name, clf in zip(names, classifiers): 145 | print "Training " + name + " classifier..." 
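# each classifier is fit on the 75% histogram-feature split and scored on the held-out 25% split below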
146 | clf.fit(trainFeat, trainLabels) 147 | score = clf.score(testFeat, testLabels) 148 | results[name] = score 149 | 150 | print "---------------------------" 151 | print "Evaluation results" 152 | print "---------------------------" 153 | 154 | # sorting results and print out 155 | sorted(results.items(), key=itemgetter(1)) 156 | for name in results: 157 | print name + " accuracy: %0.3f" % results[name] 158 | -------------------------------------------------------------------------------- /deep_learning/src/prototype.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import csv 3 | import datetime 4 | import os 5 | import sys 6 | import time 7 | from operator import itemgetter 8 | 9 | import numpy as np 10 | import pandas as pd 11 | from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis 12 | from sklearn.ensemble import AdaBoostClassifier 13 | from sklearn.ensemble import RandomForestClassifier 14 | from sklearn.gaussian_process import GaussianProcessClassifier 15 | from sklearn.gaussian_process.kernels import RBF 16 | from sklearn.model_selection import train_test_split 17 | from sklearn.naive_bayes import GaussianNB 18 | from sklearn.neighbors import KNeighborsClassifier 19 | from sklearn.neural_network import MLPClassifier 20 | from sklearn.svm import SVC 21 | from sklearn.tree import DecisionTreeClassifier 22 | 23 | sys.path.append("/usr/local/lib/python2.7/site-packages") 24 | import cv2 25 | from imutils import paths 26 | import matplotlib 27 | 28 | matplotlib.use("TkAgg") 29 | import matplotlib.pyplot as plt 30 | 31 | 32 | def time_diff_str(t1, t2): 33 | """ 34 | Calculates time durations. 35 | """ 36 | diff = t2 - t1 37 | mins = int(diff / 60) 38 | secs = round(diff % 60, 2) 39 | return str(mins) + " mins and " + str(secs) + " seconds" 40 | 41 | 42 | def load_csv(file_path): 43 | """Get data, from local csv.""" 44 | if os.path.exists(file_path): 45 | print "[INFO] load", file_path, "file..." 
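# note: df is only assigned inside this existence check, so a missing file would leave it unbound at the return below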
46 | df = pd.read_csv(file_path) 47 | 48 | return df 49 | 50 | 51 | def image_to_feature_vector(image, size=(32, 32)): 52 | # resize the image to a fixed size, then flatten the image into 53 | # a list of raw pixel intensities 54 | return cv2.resize(image, size).flatten() 55 | 56 | 57 | def get_simple_feature_labels(patient_df, img_paths): 58 | features = [] 59 | labels = [] 60 | 61 | patient_ids = patient_df["id"].tolist() 62 | 63 | # loop over the input images 64 | for (i, img_path) in enumerate(img_paths): 65 | # get only training labels 66 | base = os.path.basename(img_path) 67 | patient_id = os.path.splitext(base)[0] 68 | if patient_id in patient_ids: 69 | label = patient_df[patient_df["id"] == patient_id].iloc[0]["cancer"] 70 | labels.append(label) 71 | else: 72 | continue 73 | 74 | # load the image 75 | image = cv2.imread(img_path) 76 | feat = image_to_feature_vector(image) 77 | 78 | # update features 79 | features.append(feat) 80 | 81 | # show an update every 100 images 82 | if i > 0 and i % 100 == 0: 83 | print("[INFO] processed {}/{}".format(i, len(img_paths))) 84 | 85 | return features, labels 86 | 87 | 88 | if __name__ == "__main__": 89 | t_start = time.time() 90 | 91 | # construct the argument parse and parse the arguments 92 | ap = argparse.ArgumentParser() 93 | ap.add_argument("-d", "--dataset", required=True, help="path to input dataset") 94 | ap.add_argument("-o", "--out", required=True, help="path to output submission") 95 | ap.add_argument("-j", "--jobs", type=int, default=-1, help="# of jobs (-1 uses all available cores)") 96 | ap.add_argument("-s", "--save", help="path to save features") 97 | args = vars(ap.parse_args()) 98 | 99 | # load train/test labels 100 | stage1_labels = load_csv("../data/stage1_labels.csv") 101 | stage1_sample_submission = load_csv("../data/stage1_sample_submission.csv") 102 | 103 | img_paths = list(paths.list_images(args["dataset"])) 104 | train_features, train_labels = get_simple_feature_labels(stage1_labels, img_paths) 105 | test_features, test_labels = get_simple_feature_labels(stage1_sample_submission, img_paths) 106 | 107 | train_labels = np.array(train_labels) 108 | train_features = np.array(train_features) 109 | print "[INFO] labels vector shape:", train_labels.shape 110 | print "[INFO] features matrix shape:", train_features.shape 111 | print("[INFO] features matrix size: {:.2f}MB".format(train_features.nbytes / (1024 * 1000.0))) 112 | 113 | plt.imshow(train_features) 114 | plt.show() 115 | 116 | print "---------------------------" 117 | print "Training" 118 | print "---------------------------" 119 | 120 | classifiers = { 121 | "Nearest Neighbors": KNeighborsClassifier(3, n_jobs=args["jobs"]), 122 | "Linear SVM": SVC(kernel="linear", C=0.025), 123 | "RBF SVM": SVC(gamma=2, C=1), 124 | "Gaussian Process": GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True, n_jobs=args["jobs"]), 125 | "Decision Tree": DecisionTreeClassifier(max_depth=5), 126 | "Random Forest": RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1, n_jobs=args["jobs"]), 127 | "Neural Net": MLPClassifier(alpha=1), 128 | "AdaBoost": AdaBoostClassifier(), 129 | "Naive Bayes": GaussianNB(), 130 | "QDA": QuadraticDiscriminantAnalysis() 131 | } 132 | 133 | # train/dev split 134 | # X_train, X_dev, Y_train, Y_dev 135 | (for_train_features, dev_features, for_train_labels, dev_labels) = train_test_split(train_features, 136 | train_labels, 137 | test_size=0.25, 138 | random_state=42) 139 | 140 | # iterate over classifiers 141 | results = {} 142 | 143 | for name 
in classifiers: 144 | print "[INFO]" + name + " classifier..." 145 | clf = classifiers[name] 146 | clf.fit(for_train_features, for_train_labels) 147 | score = clf.score(dev_features, dev_labels) 148 | results[name] = score 149 | 150 | print "---------------------------" 151 | print "Evaluation results" 152 | print "---------------------------" 153 | 154 | # sorting results and print out 155 | sorted(results.items(), key=itemgetter(1)) 156 | for name in results: 157 | print "[INFO]", name, "accuracy: %0.3f" % results[name] 158 | 159 | print "---------------------------" 160 | print "Training for submission" 161 | print "---------------------------" 162 | 163 | name = list(results)[0] 164 | clf = classifiers[name] 165 | print "[INFO]" + name + " classifier..." 166 | clf.fit(train_features, train_labels) 167 | predict_submission = clf.predict(test_features) 168 | 169 | # update submission 170 | submission = {} 171 | patient_ids = stage1_sample_submission["id"].tolist() 172 | for (i, patient_id) in enumerate(patient_ids): 173 | submission[patient_id] = predict_submission[i] 174 | 175 | with open(args["out"], "wb") as f: 176 | writer = csv.writer(f, delimiter=',') 177 | writer.writerow(["id", "cancer"]) 178 | for key, value in submission.items(): 179 | writer.writerow([key, value]) 180 | 181 | print "[INFO]", datetime.datetime.now(), "* DONE After *", time_diff_str(t_start, time.time()) 182 | -------------------------------------------------------------------------------- /python/clustering/document_retrieval.py: -------------------------------------------------------------------------------- 1 | """ 2 | CLUSTERING & RETRIEVAL 3 | Case study: Finding documents 4 | Models: 5 | Nearest neighbors 6 | Clustering, mixtures of Gaussians 7 | Latent Dirichlet allocation (LDA) 8 | Algorithms: 9 | KD-trees, locality-sensitive hashing (LSH) 10 | K-means 11 | Expectation-maximization (EM) 12 | Concepts: 13 | Distance metrics, approximation algorithms, hashing, sampling algorithms, scaling up with map-reduce 14 | """ 15 | import datetime 16 | import math 17 | import os 18 | import time 19 | 20 | import pandas as pd 21 | from sklearn.decomposition import LatentDirichletAllocation 22 | from sklearn.decomposition import NMF 23 | from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer 24 | from sklearn.neighbors import NearestNeighbors 25 | 26 | 27 | def time_diff_str(t1, t2): 28 | """ 29 | Calculates time durations. 
30 | """ 31 | diff = t2 - t1 32 | mins = int(diff / 60) 33 | secs = round(diff % 60, 2) 34 | return str(mins) + " mins and " + str(secs) + " seconds" 35 | 36 | 37 | def load_wiki_data(file_name): 38 | """Get reviews data, from local csv.""" 39 | if os.path.exists(file_name): 40 | print("-- " + file_name + " found locally") 41 | df = pd.read_csv(file_name) 42 | 43 | return df 44 | 45 | 46 | def freq(word, doc): 47 | return doc.count(word) 48 | 49 | 50 | def word_count(doc): 51 | return len(doc) 52 | 53 | 54 | def tf(word, doc): 55 | return (freq(word, doc) / float(word_count(doc))) 56 | 57 | 58 | def num_docs_containing(word, list_of_docs): 59 | count = 0 60 | for document in list_of_docs: 61 | if freq(word, document) > 0: 62 | count += 1 63 | return 1 + count 64 | 65 | 66 | def idf(word, list_of_docs): 67 | return math.log(len(list_of_docs) / 68 | float(num_docs_containing(word, list_of_docs))) 69 | 70 | 71 | def tf_idf(word, doc, list_of_docs): 72 | return (tf(word, doc) * idf(word, list_of_docs)) 73 | 74 | 75 | def print_top_words(model, feature_names, n_top_words): 76 | for topic_idx, topic in enumerate(model.components_): 77 | print("Topic #%d:" % topic_idx) 78 | print(" ".join([feature_names[i] 79 | for i in topic.argsort()[:-n_top_words - 1:-1]])) 80 | print() 81 | 82 | 83 | if __name__ == "__main__": 84 | t_start = time.time() 85 | print "-- ----------------------------------------------------------------" 86 | print "-- %s - Start building document retrieval systems" % datetime.datetime.now() 87 | print "-- ----------------------------------------------------------------" 88 | 89 | n_samples = 2000 90 | n_features = 1000 91 | n_topics = 10 92 | n_top_words = 20 93 | 94 | # Load wiki data 95 | people = load_wiki_data("people_wiki.csv") 96 | print people.head() 97 | print len(people) 98 | 99 | # Explore 100 | obama = people[people["name"] == "Barack Obama"] 101 | obama_row_index = obama.index.tolist()[0] 102 | print "-- Obama:", obama 103 | 104 | taylor = people[people["name"] == "Taylor Swift"] 105 | taylor_row_index = taylor.index.tolist()[0] 106 | print "-- Taylor Swift:", taylor 107 | 108 | # Calculate term frequency 109 | txt_obama = obama["text"].tolist()[0] 110 | print "-- Obama term frequence" 111 | for word in txt_obama.split(): 112 | print word, tf(word, txt_obama) 113 | 114 | txt_taylor = taylor["text"].tolist()[0] 115 | print "-- Taylor Swift term frequence" 116 | for word in txt_taylor.split(): 117 | print word, tf(word, txt_taylor) 118 | 119 | # Calculate TF-IDF 120 | print "-- Obama TF-IDF" 121 | for word in txt_obama.split(): 122 | print word, tf_idf(word, txt_obama, people["text"]) 123 | 124 | print "-- Taylor Swift TF-IDF" 125 | for word in txt_taylor.split(): 126 | print word, tf_idf(word, txt_taylor, people["text"]) 127 | 128 | # TF-IDF 129 | count_vect = CountVectorizer() 130 | X_train_counts = count_vect.fit_transform(people["text"]) 131 | print "-- Term frequency matrix:", X_train_counts.shape 132 | 133 | tfidf_transformer = TfidfTransformer() 134 | X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts) 135 | tfidf_matrix = X_train_tfidf.toarray() 136 | print "-- TF-IDF matrix:", X_train_tfidf.shape 137 | 138 | # Build nearest matrix 139 | neigh = NearestNeighbors(n_neighbors=5) 140 | neigh.fit(X_train_tfidf) 141 | 142 | # Looking for some nearest 143 | (distance, found_index) = neigh.kneighbors([tfidf_matrix[obama_row_index]]) 144 | print "-- Who is closest to Obama?" 
145 | print people.iloc[found_index.tolist()[0]] 146 | 147 | (distance, found_index) = neigh.kneighbors([tfidf_matrix[taylor_row_index]]) 148 | print "-- Who is closest to Taylor Swift?" 149 | print people.iloc[found_index.tolist()[0]] 150 | 151 | ####### 152 | # NMF # 153 | ####### 154 | # Use tf-idf features for NMF. 155 | print("Extracting tf-idf features for NMF...") 156 | tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, 157 | max_features=n_features, 158 | stop_words='english') 159 | t0 = time.time() 160 | tfidf = tfidf_vectorizer.fit_transform(people["text"]) 161 | print("done in %0.3fs." % (time.time() - t0)) 162 | 163 | # Fit the NMF model 164 | print("Fitting the NMF model with tf-idf features, " 165 | "n_samples=%d and n_features=%d..." 166 | % (n_samples, n_features)) 167 | t0 = time.time() 168 | nmf = NMF(n_components=n_topics, random_state=1, 169 | alpha=.1, l1_ratio=.5).fit(tfidf) 170 | print("done in %0.3fs." % (time.time() - t0)) 171 | 172 | print("\nTopics in NMF model:") 173 | tfidf_feature_names = tfidf_vectorizer.get_feature_names() 174 | print_top_words(nmf, tfidf_feature_names, n_top_words) 175 | 176 | ####### 177 | # LDA # 178 | ####### 179 | # Use tf (raw term count) features for LDA. 180 | print("Extracting tf features for LDA...") 181 | tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, 182 | max_features=n_features, 183 | stop_words='english') 184 | t0 = time.time() 185 | tf = tf_vectorizer.fit_transform(people["text"]) 186 | print("done in %0.3fs." % (time.time() - t0)) 187 | 188 | print("Fitting LDA models with tf features, " 189 | "n_samples=%d and n_features=%d..." 190 | % (n_samples, n_features)) 191 | lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5, 192 | learning_method='online', 193 | learning_offset=50., 194 | random_state=0) 195 | t0 = time.time() 196 | lda.fit(tf) 197 | print("done in %0.3fs." 
% (time.time() - t0)) 198 | 199 | print("\nTopics in LDA model:") 200 | tf_feature_names = tf_vectorizer.get_feature_names() 201 | print_top_words(lda, tf_feature_names, n_top_words) 202 | 203 | print "-- %s * DONE After * %s" % (datetime.datetime.now(), time_diff_str(t_start, time.time())) 204 | -------------------------------------------------------------------------------- /word2vec/gensim_test.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "# -*- coding: utf-8 -*-\n", 12 | "# import libraries\n", 13 | "import os, logging\n", 14 | "from gensim.models import Word2Vec\n", 15 | "logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 2, 21 | "metadata": { 22 | "collapsed": true 23 | }, 24 | "outputs": [], 25 | "source": [ 26 | "# read sentences\n", 27 | "class MySentences(object):\n", 28 | " def __init__(self, dirname):\n", 29 | " self.dirname = dirname\n", 30 | " \n", 31 | " def __iter__(self):\n", 32 | " for fname in os.listdir(self.dirname):\n", 33 | " for line in open(os.path.join(self.dirname, fname)):\n", 34 | " yield line.split()\n", 35 | " \n", 36 | "sentences = MySentences(\"data\")" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 3, 42 | "metadata": {}, 43 | "outputs": [ 44 | { 45 | "name": "stderr", 46 | "output_type": "stream", 47 | "text": [ 48 | "2017-09-18 15:55:55,940 : INFO : collecting all words and their counts\n", 49 | "2017-09-18 15:55:55,941 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types\n", 50 | "2017-09-18 15:55:55,942 : INFO : collected 236 word types from a corpus of 441 raw words and 17 sentences\n", 51 | "2017-09-18 15:55:55,942 : INFO : Loading a fresh vocabulary\n", 52 | "2017-09-18 15:55:55,943 : INFO : min_count=5 retains 14 unique words (5% of original 236, drops 222)\n", 53 | "2017-09-18 15:55:55,944 : INFO : min_count=5 leaves 126 word corpus (28% of original 441, drops 315)\n", 54 | "2017-09-18 15:55:55,945 : INFO : deleting the raw counts dictionary of 236 items\n", 55 | "2017-09-18 15:55:55,945 : INFO : sample=0.001 downsamples 14 most-common words\n", 56 | "2017-09-18 15:55:55,946 : INFO : downsampling leaves estimated 16 word corpus (12.9% of prior 126)\n", 57 | "2017-09-18 15:55:55,947 : INFO : estimated required memory for 14 words and 100 dimensions: 18200 bytes\n", 58 | "2017-09-18 15:55:55,948 : INFO : resetting layer weights\n", 59 | "2017-09-18 15:55:55,949 : INFO : training model with 4 workers on 14 vocabulary and 100 features, using sg=0 hs=0 sample=0.001 negative=5 window=5\n", 60 | "2017-09-18 15:55:55,952 : INFO : worker thread finished; awaiting finish of 3 more threads\n", 61 | "2017-09-18 15:55:55,953 : INFO : worker thread finished; awaiting finish of 2 more threads\n", 62 | "2017-09-18 15:55:55,953 : INFO : worker thread finished; awaiting finish of 1 more threads\n", 63 | "2017-09-18 15:55:55,954 : INFO : worker thread finished; awaiting finish of 0 more threads\n", 64 | "2017-09-18 15:55:55,955 : INFO : training on 2205 raw words (71 effective words) took 0.0s, 14786 effective words/s\n", 65 | "2017-09-18 15:55:55,956 : WARNING : under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay\n", 66 | "2017-09-18 15:55:55,956 : INFO : saving 
Word2Vec object under models/first_model, separately None\n", 67 | "2017-09-18 15:55:55,957 : INFO : not storing attribute syn0norm\n", 68 | "2017-09-18 15:55:55,958 : INFO : not storing attribute cum_table\n", 69 | "2017-09-18 15:55:55,959 : INFO : saved models/first_model\n" 70 | ] 71 | } 72 | ], 73 | "source": [ 74 | "model = Word2Vec(sentences, size=100, window=5, min_count=5, workers=4)\n", 75 | "model.save(\"models/first_model\")" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 4, 81 | "metadata": {}, 82 | "outputs": [ 83 | { 84 | "name": "stderr", 85 | "output_type": "stream", 86 | "text": [ 87 | "2017-09-18 15:55:55,964 : INFO : loading Word2Vec object from models/first_model\n", 88 | "2017-09-18 15:55:55,965 : INFO : loading wv recursively from models/first_model.wv.* with mmap=None\n", 89 | "2017-09-18 15:55:55,966 : INFO : setting ignored attribute syn0norm to None\n", 90 | "2017-09-18 15:55:55,966 : INFO : setting ignored attribute cum_table to None\n", 91 | "2017-09-18 15:55:55,967 : INFO : loaded models/first_model\n" 92 | ] 93 | } 94 | ], 95 | "source": [ 96 | "new_model = Word2Vec.load(\"models/first_model\")" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 5, 102 | "metadata": {}, 103 | "outputs": [ 104 | { 105 | "name": "stdout", 106 | "output_type": "stream", 107 | "text": [ 108 | "T�nh\n" 109 | ] 110 | } 111 | ], 112 | "source": [ 113 | "u = \"T\\xc3\\xadnh\"\n", 114 | "uu = u.decode('utf8')\n", 115 | "s = uu.encode('cp1250')\n", 116 | "print(s)" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 6, 122 | "metadata": {}, 123 | "outputs": [ 124 | { 125 | "data": { 126 | "text/plain": [ 127 | "{',': ,\n", 128 | " '-': ,\n", 129 | " '.': ,\n", 130 | " 'bi\\xe1\\xbb\\x83n': ,\n", 131 | " 'ch\\xc3\\xacm': ,\n", 132 | " 'ch\\xe1\\xbb\\x9f': ,\n", 133 | " 'c\\xc3\\xa1c': ,\n", 134 | " 'h\\xc3\\xa0ng_h\\xc3\\xb3a': ,\n", 135 | " 'n\\xc3\\xa0y': ,\n", 136 | " 'ph\\xc3\\xa0': ,\n", 137 | " 'tr\\xc3\\xaan': ,\n", 138 | " 't\\xe1\\xba\\xa5n': ,\n", 139 | " '\\xc4\\x91\\xc6\\xb0\\xe1\\xbb\\xa3c': ,\n", 140 | " '\\xc4\\x91\\xe1\\xba\\xbfn': }" 141 | ] 142 | }, 143 | "execution_count": 6, 144 | "metadata": {}, 145 | "output_type": "execute_result" 146 | } 147 | ], 148 | "source": [ 149 | "new_model.wv.vocab" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": { 156 | "collapsed": true 157 | }, 158 | "outputs": [], 159 | "source": [] 160 | } 161 | ], 162 | "metadata": { 163 | "kernelspec": { 164 | "display_name": "Python 2", 165 | "language": "python", 166 | "name": "python2" 167 | }, 168 | "language_info": { 169 | "codemirror_mode": { 170 | "name": "ipython", 171 | "version": 2 172 | }, 173 | "file_extension": ".py", 174 | "mimetype": "text/x-python", 175 | "name": "python", 176 | "nbconvert_exporter": "python", 177 | "pygments_lexer": "ipython2", 178 | "version": "2.7.10" 179 | } 180 | }, 181 | "nbformat": 4, 182 | "nbformat_minor": 2 183 | } 184 | -------------------------------------------------------------------------------- /deep_learning/output/submission_results.csv: -------------------------------------------------------------------------------- 1 | id,cancer 2 | aec5a58fea38b77b964007aa6975c049,0 3 | 6d3b16f2e60c3a1a4246f340dba73676,0 4 | 7027c0b8c8f8dcc76c6e4ba923d60a2e,0 5 | d1a20ef45bb03f93a407b492066f6d88,0 6 | 83728b6eed98845556bfc870b7567883,0 7 | 8bb7dd5fbfa5ecb95552d9c587f2fea5,0 8 | e9a27e2645e1fad9434ce765f678585f,0 9 | 
ea01deecde93cd9503a049d71d46e6d5,0 10 | 03bd22ed5858039af223c04993e9eb22,0 11 | d42c998d037fb3003faba541e2cf649a,0 12 | ac4056071f3cc98489b9db3aebfe2b6a,0 13 | 995fc0581ed0e3ba0f97dbd7fe63db59,0 14 | d654966fd2498de023552b830c07a659,0 15 | 8be7a7cc747365030bee8297221ab5bc,0 16 | e6160ed0ff2eb214abd4df9a3c336c1d,0 17 | 1f6333bc3599f683403d6f0884aefe00,0 18 | 49c88f7cc77341c9ae4e64243f9912fc,0 19 | 174c5f7c33ca31443208ef873b9477e5,0 20 | 33387bea2cacf6127035cc7033036a02,0 21 | a5d7909f14d43f01f44cdcaabed27b84,0 22 | d1131708024b32032ade1ef48d115915,0 23 | b8793dbd40de88c0de0913abbaab0fe7,0 24 | ae2fdcd8daa3fede6ae23cc63a8d9a82,0 25 | 202898fa97c5949fbdc07ae7ff1cd9f0,0 26 | 665c1913d8e90e57af3b745349d19537,0 27 | 901ed0a38aa16933c04ffd531b0aa2cf,0 28 | cc4805e3ebe8621bc94a621b1714fc84,0 29 | eb9db3f740f8e153e85f83c57bc4e522,0 30 | 5ce91933688cc8400105bf640ac11535,0 31 | dbd9c8025907511e965e7abad955547d,0 32 | 61017c23bbae6e17062ff582d1a237b3,0 33 | cdb53f3be6d8cce07fa41c833488d8a5,0 34 | a6c15206edadab0270898f03e770d730,0 35 | c71d0db2086b7e2024ca9c11bd2ca504,0 36 | e60d99ea9648e1ce859eb0b386365e26,0 37 | fcfab3eddbdf0421c39f71d651cc5c56,0 38 | 505405b3e70fb24b92e6a8a5b7ed339c,0 39 | fad57a1078ddbc685e517bd8f24aa8ac,0 40 | 538543b57d0c8fa0b2b6bb7c84df3f33,0 41 | e0aa61b44c33e6a75940a8541c6894c9,0 42 | 21b73c938fd7d346ee77a60bd60aaeac,0 43 | 4b28f147cb82baba3edcdbd34ca19085,0 44 | 7cf1a65bb0f89323668034244a59e725,0 45 | 6e240f23afa2c1b4352cd0db5d4f357d,0 46 | 031b7ec4fe96a3b035a8196264a8c8c3,0 47 | 8fde44df03fb80366c6604db53d3623f,0 48 | d81852bffda09dc8033a45332397c495,0 49 | 6993396b31078993e13cf9c0a6fd470b,0 50 | aa59b7a4aa4dfb2489feea527eda3e4d,0 51 | b82efe72526c59a96257208d95e54baf,0 52 | ff8599dd7c1139be3bad5a0351ab749a,0 53 | fdcd385b0d2d12341661e1abe845be0b,0 54 | a2a4bc7708f6831470d757cd6f32bffe,0 55 | 59af702c21840ec18073b6b56c95e7fe,0 56 | 82b9fb9e238397b2f3bff98975577ff9,0 57 | 5451203688c930484ba1f3c7f1378847,0 58 | c25876fb40d6f8dafd1ecb243193dd3f,0 59 | f0310ffc724faf9f7aef2c418127ee68,0 60 | 38bf066bba822584e14c0af65d4bb5e9,0 61 | 026470d51482c93efc18b9803159c960,0 62 | 9a378249b799bbcefac2a7de46896c0a,0 63 | 96042e205dd3dc055f084aaca245e550,0 64 | 2eb92d17ca91b393765e8acf069763a6,0 65 | be3e35bf8395366d235b8bcfc71a05ee,0 66 | bf6a7a9ab4e18b18f43129c9e22fb448,0 67 | 7191c236cfcfc68cd21143e3a0faac51,0 68 | b4db5b96c65a668a2e63f9a3ed36afe7,0 69 | ab9c7bef62d1ad65b824414087b6f06b,0 70 | 263a1c3bfa43556623e75ed901e3fd8f,0 71 | 50cdacec399071cf70d8badd2511d0b3,0 72 | d3a8fb1da8f7a0dcbd5a8d65f3647757,0 73 | ebcdfabecf4b46b1e55e4a4c75a0afb0,0 74 | 159bc8821a2dc39a1e770cb3559e098d,0 75 | 80938b4f531fa2334c13d829339e1356,0 76 | a0fc609febe3eef5a4713a22996cf8e5,0 77 | 2703df8c469906a06a45c0d7ff501199,0 78 | c3a9046fbe2b0f0a4e43a669c321e472,0 79 | 993f1e68290d591f755669e97b49b4f4,0 80 | bdfb2c23a8c1dca5ea8c1cc3d89efee9,0 81 | c46c3962c10e287f1c1e3af0d309a128,0 82 | 55b06d60e7c0329787f81d1b7cbf9aa0,0 83 | 96544665531e7f59bc2730e3c5f42e65,0 84 | 85ab88f093ca53a4fab5654e24c77ebe,0 85 | c87a713d17522698958de55c97654beb,0 86 | 6d43fdb6eb1bec3a5f4febfd442e8c93,0 87 | c7bdb83b7ca6269fac16ab7cff930a2e,0 88 | 519ad4ead3e61d2d71088ac8e46f25b6,0 89 | 63458b5875a0b223ec21555d17b52fd4,0 90 | b6857d98b7b3dbe84f153617f4dfd14b,0 91 | 84ed26b5d79da321711ed869b3cad2ea,0 92 | e314fd13809db0132443b924401d828b,0 93 | 2004b3f761c3f5dffb02204f1247b211,0 94 | f89e3d0867e27be8e19d7ed50e1eb7e8,0 95 | 9ca18e68b6b8d9c3112b4b69b7d6fad5,0 96 | 5d16819bd78c74448ce852a93bf423ad,0 97 | ae61ec94b0b8de5439180f4776551e42,0 
98 | c2ef34cc347bc224b5a123426009d027,0 99 | bdc2daa372a36f6f7c72abdc0b5639d1,0 100 | af1d0c2fcde369dd1b715460c2f704a2,0 101 | b4d5b618fdf3a5a1bcfb325a3715e99e,0 102 | 68f4dff6dd1f135488e83b8a4ee6e20e,0 103 | 1cf8e778167d20bf769669b4be96592b,0 104 | 52f6d741e674f62fbcf73e6ec4f6a472,0 105 | 7ce310b8431ace09a91ededcc03f7361,0 106 | 26142353f46d20c9fdded93f01e2bff4,0 107 | cbb9bbd994c235b56fb77429291edf99,0 108 | 9cc74e673ec9807ee055973e1b185624,0 109 | 7daeb8ef7307849c715f7f6f3e2dd88e,0 110 | 8a1e5830a16db34b580202f8b6dbbd3d,0 111 | a0e60d7a13f6bb4002cc4a08e60b0776,0 112 | 243038f7bb7787497c59bc17f04c6ed9,0 113 | 88acee40bb9d8cb06898d1c5de01d3c8,0 114 | b17c07114dcf49ce71c8da4b43cf1192,0 115 | c0c5a155e6e59588783c2964975e7e1e,0 116 | 649fd56ef9809019b57261fcf9574d76,0 117 | fb55849cee6473974612c17f094a38cd,0 118 | 95a98df466d4f6c6689908ea9a8f324b,0 119 | e3bc0a970a4af5d52826e06742f90e5b,0 120 | e42065c1145ccf734312cb9edbe5234b,0 121 | 12db1ea8336eafaf7f9e3eda2b4e4fef,0 122 | 4575fe61bf3f536ce6cfeb26fcc2893c,0 123 | 1e62be2c3b6430b78ce31a8f023531ac,0 124 | 9050cf3aa8371bd7088c4bdf967141d4,0 125 | 34037914ceeec5605fc890159dd425c5,0 126 | 8e9002a485cbda2b47cd14014d6f1c36,0 127 | ae4e9d8aab8f8f5ae975bcca923f468d,0 128 | 2a3e6ecf9499607ef4fd14b436136b0c,0 129 | b0599ad2f33276e7cd065eaa8dcec8a2,0 130 | f5ff7734997820b45dafa75dff60ece8,0 131 | 4434e19303b62ebaecef2596583ff351,0 132 | e6d8ae8c3b0817df994a1ce3b37a7efb,0 133 | d753676c2c6c8ac6f97bd61ecab7554a,0 134 | a5bb766ab3b1bc5a8023a50a956595f2,0 135 | 9de48cf43611478ffc1fef051b75dc8c,0 136 | 80bda1afde73204abd74d1ebd2758382,0 137 | cc1b7e34d9eba737c9fb91316463e8f7,0 138 | 7f096cdfbc2fe03ec7f779278416a78c,0 139 | e33c25d0dbca5e54385f2100ce523467,0 140 | 7869cc6bfc3678fec1a81e93b34648cf,0 141 | eaeebb7a63edc8a329a7c5fbc583a507,0 142 | 48ab0b98fc7789304c21430978624f32,0 143 | bbf7a3e138f9353414f2d51f0c363561,0 144 | 567547810a1795b9c8e11c15dfd32c34,0 145 | 07b1defcfae5873ee1f03c90255eb170,0 146 | cd6be62834c72756738935f904ec9c2c,0 147 | 580cffecce8d3d53cde1abb922adf21a,0 148 | 06a90409e4fcea3e634748b967993531,0 149 | f7c387290d7e3074501eac167c849000,0 150 | 89f003dbfbdbd18a5cdeb9b128cb075b,0 151 | 6c71617e2cee498fd3dd20956bb90a3b,0 152 | 5a42f0a0d1e060531c20d04ed23efc02,0 153 | cd68d1a14cc504e3f7434d5cc324744d,0 154 | 616f98dab4db03edbad28c73d22468d2,0 155 | d2ec8f0fc56a9168cda0c707e49974ab,0 156 | 42b2161e43b4dd0ea94604485976c59c,0 157 | 5791c42d317f34592be9a933c50e68ad,0 158 | 5ae9ab473d59cd29262c47a741177b6e,0 159 | 7c2fd0d32df5a2780b4b10fdf2f2cdbe,0 160 | 6d3be6081d76d2365b080e599628d3bc,0 161 | fb5874408966d7c6bebd3d84a5599e20,0 162 | 2d596b6ead89ab35577fe625a9a17cbb,0 163 | 6f229187fe608c9eacc567eb74c1458c,0 164 | 96cca9d8e5764daa4bcb6c0ba07735bc,0 165 | 1753250dab5fc81bab8280df13309733,0 166 | 9b871732b3935661e7639e84a6ab9747,0 167 | 8e60f166f1f1dc0d72f997fe1c9e72b4,0 168 | 70f4eb8201e3155cc3e399f0ff09c5ef,0 169 | 3ee1fd6a0f3f108c3558e6699fb011f2,0 170 | d032116d73789ff9c805f493357b4037,0 171 | 6379e4435f78a5e5c150c32146ece4d4,0 172 | 86ad341b9ac27364f03981f6a775246c,0 173 | f4d23e0272a2ce5bfc7f07033d4f2e7d,0 174 | 94df6d1ae21c5bfaebe6f8daf8fcd85b,0 175 | 8f517521a2ed576e853fab1907fa5ffd,0 176 | a334d15ac8d2d25bce76693b1b2a3ed7,0 177 | 931253c408c440a8494dfaa74251efd3,0 178 | 2f77fd993fbd858dec3c085b9ff1a3a2,0 179 | a2558184e0f4a68e9fb13579d20cb244,0 180 | d5a0333be8795805fc39509f817780ee,0 181 | 763288341ee363a264fe45a28ea28c21,0 182 | 1fdbc07019192de4a114e090389c8330,0 183 | c95f2aa23e6d6702f5b16a3b35f89cf0,0 184 | 
8b494d14d835dd5ae13dab19b9520a55,0 185 | d4a075768abe7fe43ad1caac92515256,0 186 | 3295cec04482210dc6f78c2b4a1d287b,0 187 | 85d6fb4a08853d370935a75de7495a27,0 188 | 81bd0c062bfa8e85616878bab90f2314,0 189 | efcb6def7a2080243052b6046186ab24,0 190 | 8b9a28375988de6ea0b143d48b4a8dc9,0 191 | 0b20184e0cd497028bdd155d9fb42dc9,0 192 | b53d997901eb880c41fbfbc82847204c,0 193 | be9a2df5a16434e581c6a0625c290591,0 194 | 70671fa94231eb377e8ac7cba4650dfb,0 195 | 7fd5be8ec9c236c314f801384bd89c0c,0 196 | 9065f2b133129c5747d42db18a424749,0 197 | d03127f497cae40bcbd9996b4d1f5b90,0 198 | 49433c1588cc078b825a0eff1dc2e816,0 199 | ea3a771ef05e288409e0250ea893cf87,0 200 | -------------------------------------------------------------------------------- /deep_learning/data/stage1_sample_submission.csv: -------------------------------------------------------------------------------- 1 | id,cancer 2 | 026470d51482c93efc18b9803159c960,0.5 3 | 031b7ec4fe96a3b035a8196264a8c8c3,0.5 4 | 03bd22ed5858039af223c04993e9eb22,0.5 5 | 06a90409e4fcea3e634748b967993531,0.5 6 | 07b1defcfae5873ee1f03c90255eb170,0.5 7 | 0b20184e0cd497028bdd155d9fb42dc9,0.5 8 | 12db1ea8336eafaf7f9e3eda2b4e4fef,0.5 9 | 159bc8821a2dc39a1e770cb3559e098d,0.5 10 | 174c5f7c33ca31443208ef873b9477e5,0.5 11 | 1753250dab5fc81bab8280df13309733,0.5 12 | 1cf8e778167d20bf769669b4be96592b,0.5 13 | 1e62be2c3b6430b78ce31a8f023531ac,0.5 14 | 1f6333bc3599f683403d6f0884aefe00,0.5 15 | 1fdbc07019192de4a114e090389c8330,0.5 16 | 2004b3f761c3f5dffb02204f1247b211,0.5 17 | 202898fa97c5949fbdc07ae7ff1cd9f0,0.5 18 | 21b73c938fd7d346ee77a60bd60aaeac,0.5 19 | 243038f7bb7787497c59bc17f04c6ed9,0.5 20 | 26142353f46d20c9fdded93f01e2bff4,0.5 21 | 263a1c3bfa43556623e75ed901e3fd8f,0.5 22 | 2703df8c469906a06a45c0d7ff501199,0.5 23 | 2a3e6ecf9499607ef4fd14b436136b0c,0.5 24 | 2d596b6ead89ab35577fe625a9a17cbb,0.5 25 | 2eb92d17ca91b393765e8acf069763a6,0.5 26 | 2f77fd993fbd858dec3c085b9ff1a3a2,0.5 27 | 3295cec04482210dc6f78c2b4a1d287b,0.5 28 | 33387bea2cacf6127035cc7033036a02,0.5 29 | 34037914ceeec5605fc890159dd425c5,0.5 30 | 38bf066bba822584e14c0af65d4bb5e9,0.5 31 | 3ee1fd6a0f3f108c3558e6699fb011f2,0.5 32 | 42b2161e43b4dd0ea94604485976c59c,0.5 33 | 4434e19303b62ebaecef2596583ff351,0.5 34 | 4575fe61bf3f536ce6cfeb26fcc2893c,0.5 35 | 48ab0b98fc7789304c21430978624f32,0.5 36 | 49433c1588cc078b825a0eff1dc2e816,0.5 37 | 49c88f7cc77341c9ae4e64243f9912fc,0.5 38 | 4b28f147cb82baba3edcdbd34ca19085,0.5 39 | 505405b3e70fb24b92e6a8a5b7ed339c,0.5 40 | 50cdacec399071cf70d8badd2511d0b3,0.5 41 | 519ad4ead3e61d2d71088ac8e46f25b6,0.5 42 | 52f6d741e674f62fbcf73e6ec4f6a472,0.5 43 | 538543b57d0c8fa0b2b6bb7c84df3f33,0.5 44 | 5451203688c930484ba1f3c7f1378847,0.5 45 | 55b06d60e7c0329787f81d1b7cbf9aa0,0.5 46 | 567547810a1795b9c8e11c15dfd32c34,0.5 47 | 5791c42d317f34592be9a933c50e68ad,0.5 48 | 580cffecce8d3d53cde1abb922adf21a,0.5 49 | 59af702c21840ec18073b6b56c95e7fe,0.5 50 | 5a42f0a0d1e060531c20d04ed23efc02,0.5 51 | 5ae9ab473d59cd29262c47a741177b6e,0.5 52 | 5ce91933688cc8400105bf640ac11535,0.5 53 | 5d16819bd78c74448ce852a93bf423ad,0.5 54 | 61017c23bbae6e17062ff582d1a237b3,0.5 55 | 616f98dab4db03edbad28c73d22468d2,0.5 56 | 63458b5875a0b223ec21555d17b52fd4,0.5 57 | 6379e4435f78a5e5c150c32146ece4d4,0.5 58 | 649fd56ef9809019b57261fcf9574d76,0.5 59 | 665c1913d8e90e57af3b745349d19537,0.5 60 | 68f4dff6dd1f135488e83b8a4ee6e20e,0.5 61 | 6993396b31078993e13cf9c0a6fd470b,0.5 62 | 6c71617e2cee498fd3dd20956bb90a3b,0.5 63 | 6d3b16f2e60c3a1a4246f340dba73676,0.5 64 | 6d3be6081d76d2365b080e599628d3bc,0.5 65 | 
6d43fdb6eb1bec3a5f4febfd442e8c93,0.5 66 | 6e240f23afa2c1b4352cd0db5d4f357d,0.5 67 | 6f229187fe608c9eacc567eb74c1458c,0.5 68 | 7027c0b8c8f8dcc76c6e4ba923d60a2e,0.5 69 | 70671fa94231eb377e8ac7cba4650dfb,0.5 70 | 70f4eb8201e3155cc3e399f0ff09c5ef,0.5 71 | 7191c236cfcfc68cd21143e3a0faac51,0.5 72 | 763288341ee363a264fe45a28ea28c21,0.5 73 | 7869cc6bfc3678fec1a81e93b34648cf,0.5 74 | 7c2fd0d32df5a2780b4b10fdf2f2cdbe,0.5 75 | 7ce310b8431ace09a91ededcc03f7361,0.5 76 | 7cf1a65bb0f89323668034244a59e725,0.5 77 | 7daeb8ef7307849c715f7f6f3e2dd88e,0.5 78 | 7f096cdfbc2fe03ec7f779278416a78c,0.5 79 | 7fd5be8ec9c236c314f801384bd89c0c,0.5 80 | 80938b4f531fa2334c13d829339e1356,0.5 81 | 80bda1afde73204abd74d1ebd2758382,0.5 82 | 81bd0c062bfa8e85616878bab90f2314,0.5 83 | 82b9fb9e238397b2f3bff98975577ff9,0.5 84 | 83728b6eed98845556bfc870b7567883,0.5 85 | 84ed26b5d79da321711ed869b3cad2ea,0.5 86 | 85ab88f093ca53a4fab5654e24c77ebe,0.5 87 | 85d6fb4a08853d370935a75de7495a27,0.5 88 | 86ad341b9ac27364f03981f6a775246c,0.5 89 | 88acee40bb9d8cb06898d1c5de01d3c8,0.5 90 | 89f003dbfbdbd18a5cdeb9b128cb075b,0.5 91 | 8a1e5830a16db34b580202f8b6dbbd3d,0.5 92 | 8b494d14d835dd5ae13dab19b9520a55,0.5 93 | 8b9a28375988de6ea0b143d48b4a8dc9,0.5 94 | 8bb7dd5fbfa5ecb95552d9c587f2fea5,0.5 95 | 8be7a7cc747365030bee8297221ab5bc,0.5 96 | 8e60f166f1f1dc0d72f997fe1c9e72b4,0.5 97 | 8e9002a485cbda2b47cd14014d6f1c36,0.5 98 | 8f517521a2ed576e853fab1907fa5ffd,0.5 99 | 8fde44df03fb80366c6604db53d3623f,0.5 100 | 901ed0a38aa16933c04ffd531b0aa2cf,0.5 101 | 9050cf3aa8371bd7088c4bdf967141d4,0.5 102 | 9065f2b133129c5747d42db18a424749,0.5 103 | 931253c408c440a8494dfaa74251efd3,0.5 104 | 94df6d1ae21c5bfaebe6f8daf8fcd85b,0.5 105 | 95a98df466d4f6c6689908ea9a8f324b,0.5 106 | 96042e205dd3dc055f084aaca245e550,0.5 107 | 96544665531e7f59bc2730e3c5f42e65,0.5 108 | 96cca9d8e5764daa4bcb6c0ba07735bc,0.5 109 | 993f1e68290d591f755669e97b49b4f4,0.5 110 | 995fc0581ed0e3ba0f97dbd7fe63db59,0.5 111 | 9a378249b799bbcefac2a7de46896c0a,0.5 112 | 9b871732b3935661e7639e84a6ab9747,0.5 113 | 9ca18e68b6b8d9c3112b4b69b7d6fad5,0.5 114 | 9cc74e673ec9807ee055973e1b185624,0.5 115 | 9de48cf43611478ffc1fef051b75dc8c,0.5 116 | a0e60d7a13f6bb4002cc4a08e60b0776,0.5 117 | a0fc609febe3eef5a4713a22996cf8e5,0.5 118 | a2558184e0f4a68e9fb13579d20cb244,0.5 119 | a2a4bc7708f6831470d757cd6f32bffe,0.5 120 | a334d15ac8d2d25bce76693b1b2a3ed7,0.5 121 | a5bb766ab3b1bc5a8023a50a956595f2,0.5 122 | a5d7909f14d43f01f44cdcaabed27b84,0.5 123 | a6c15206edadab0270898f03e770d730,0.5 124 | aa59b7a4aa4dfb2489feea527eda3e4d,0.5 125 | ab9c7bef62d1ad65b824414087b6f06b,0.5 126 | ac4056071f3cc98489b9db3aebfe2b6a,0.5 127 | ae2fdcd8daa3fede6ae23cc63a8d9a82,0.5 128 | ae4e9d8aab8f8f5ae975bcca923f468d,0.5 129 | ae61ec94b0b8de5439180f4776551e42,0.5 130 | aec5a58fea38b77b964007aa6975c049,0.5 131 | af1d0c2fcde369dd1b715460c2f704a2,0.5 132 | b0599ad2f33276e7cd065eaa8dcec8a2,0.5 133 | b17c07114dcf49ce71c8da4b43cf1192,0.5 134 | b4d5b618fdf3a5a1bcfb325a3715e99e,0.5 135 | b4db5b96c65a668a2e63f9a3ed36afe7,0.5 136 | b53d997901eb880c41fbfbc82847204c,0.5 137 | b6857d98b7b3dbe84f153617f4dfd14b,0.5 138 | b82efe72526c59a96257208d95e54baf,0.5 139 | b8793dbd40de88c0de0913abbaab0fe7,0.5 140 | bbf7a3e138f9353414f2d51f0c363561,0.5 141 | bdc2daa372a36f6f7c72abdc0b5639d1,0.5 142 | bdfb2c23a8c1dca5ea8c1cc3d89efee9,0.5 143 | be3e35bf8395366d235b8bcfc71a05ee,0.5 144 | be9a2df5a16434e581c6a0625c290591,0.5 145 | bf6a7a9ab4e18b18f43129c9e22fb448,0.5 146 | c0c5a155e6e59588783c2964975e7e1e,0.5 147 | c25876fb40d6f8dafd1ecb243193dd3f,0.5 148 | 
c2ef34cc347bc224b5a123426009d027,0.5 149 | c3a9046fbe2b0f0a4e43a669c321e472,0.5 150 | c46c3962c10e287f1c1e3af0d309a128,0.5 151 | c71d0db2086b7e2024ca9c11bd2ca504,0.5 152 | c7bdb83b7ca6269fac16ab7cff930a2e,0.5 153 | c87a713d17522698958de55c97654beb,0.5 154 | c95f2aa23e6d6702f5b16a3b35f89cf0,0.5 155 | cbb9bbd994c235b56fb77429291edf99,0.5 156 | cc1b7e34d9eba737c9fb91316463e8f7,0.5 157 | cc4805e3ebe8621bc94a621b1714fc84,0.5 158 | cd68d1a14cc504e3f7434d5cc324744d,0.5 159 | cd6be62834c72756738935f904ec9c2c,0.5 160 | cdb53f3be6d8cce07fa41c833488d8a5,0.5 161 | d03127f497cae40bcbd9996b4d1f5b90,0.5 162 | d032116d73789ff9c805f493357b4037,0.5 163 | d1131708024b32032ade1ef48d115915,0.5 164 | d1a20ef45bb03f93a407b492066f6d88,0.5 165 | d2ec8f0fc56a9168cda0c707e49974ab,0.5 166 | d3a8fb1da8f7a0dcbd5a8d65f3647757,0.5 167 | d42c998d037fb3003faba541e2cf649a,0.5 168 | d4a075768abe7fe43ad1caac92515256,0.5 169 | d5a0333be8795805fc39509f817780ee,0.5 170 | d654966fd2498de023552b830c07a659,0.5 171 | d753676c2c6c8ac6f97bd61ecab7554a,0.5 172 | d81852bffda09dc8033a45332397c495,0.5 173 | dbd9c8025907511e965e7abad955547d,0.5 174 | e0aa61b44c33e6a75940a8541c6894c9,0.5 175 | e314fd13809db0132443b924401d828b,0.5 176 | e33c25d0dbca5e54385f2100ce523467,0.5 177 | e3bc0a970a4af5d52826e06742f90e5b,0.5 178 | e42065c1145ccf734312cb9edbe5234b,0.5 179 | e60d99ea9648e1ce859eb0b386365e26,0.5 180 | e6160ed0ff2eb214abd4df9a3c336c1d,0.5 181 | e6d8ae8c3b0817df994a1ce3b37a7efb,0.5 182 | e9a27e2645e1fad9434ce765f678585f,0.5 183 | ea01deecde93cd9503a049d71d46e6d5,0.5 184 | ea3a771ef05e288409e0250ea893cf87,0.5 185 | eaeebb7a63edc8a329a7c5fbc583a507,0.5 186 | eb9db3f740f8e153e85f83c57bc4e522,0.5 187 | ebcdfabecf4b46b1e55e4a4c75a0afb0,0.5 188 | efcb6def7a2080243052b6046186ab24,0.5 189 | f0310ffc724faf9f7aef2c418127ee68,0.5 190 | f4d23e0272a2ce5bfc7f07033d4f2e7d,0.5 191 | f5ff7734997820b45dafa75dff60ece8,0.5 192 | f7c387290d7e3074501eac167c849000,0.5 193 | f89e3d0867e27be8e19d7ed50e1eb7e8,0.5 194 | fad57a1078ddbc685e517bd8f24aa8ac,0.5 195 | fb55849cee6473974612c17f094a38cd,0.5 196 | fb5874408966d7c6bebd3d84a5599e20,0.5 197 | fcfab3eddbdf0421c39f71d651cc5c56,0.5 198 | fdcd385b0d2d12341661e1abe845be0b,0.5 199 | ff8599dd7c1139be3bad5a0351ab749a,0.5 200 | -------------------------------------------------------------------------------- /pyspark/study_apache_spark/scala/scala_rdd.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "http://10.8.2.1:8089/proxy/application_1515394405830_3960\n" 13 | ] 14 | } 15 | ], 16 | "source": [ 17 | "import org.apache.spark.sql.SparkSession\n", 18 | "\n", 19 | "val spark = SparkSession.builder().\n", 20 | " appName(\"scala_rdd\").\n", 21 | " config(\"spark.executor.instances\",\"2\").\n", 22 | " config(\"spark.executor.cores\",\"2\").\n", 23 | " config(\"spark.executor.memory\", \"4g\").\n", 24 | " config(\"spark.yarn.executor.memoryOverhead\", \"1g\").\n", 25 | " getOrCreate()\n", 26 | "\n", 27 | "println(\"http://10.8.2.1:8089/proxy/\"+ spark.sparkContext.applicationId)" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "# Transformations" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 2, 40 | "metadata": {}, 41 | "outputs": [ 42 | { 43 | "name": "stdout", 44 | "output_type": "stream", 45 | "text": [ 46 | "Before: 
1,2,3,4\n", 47 | "After * 2: 2,4,6,8\n", 48 | "Filter even: 2,4\n" 49 | ] 50 | } 51 | ], 52 | "source": [ 53 | "var rdd = spark.sparkContext.parallelize(Array(1, 2, 3, 4))\n", 54 | "println(\"Before: \" + rdd.collect().mkString(\",\"))\n", 55 | "println(\"After * 2: \" + rdd.map(_ * 2).collect().mkString(\",\"))\n", 56 | "println(\"Filter even: \" + rdd.filter(_ % 2 == 0).collect().mkString(\",\"))" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 3, 62 | "metadata": {}, 63 | "outputs": [ 64 | { 65 | "name": "stdout", 66 | "output_type": "stream", 67 | "text": [ 68 | "Before: 1,2,2,3,4\n", 69 | "Distinct: 4,1,2,3\n" 70 | ] 71 | } 72 | ], 73 | "source": [ 74 | "rdd = spark.sparkContext.parallelize(Array(1, 2, 2, 3, 4))\n", 75 | "println(\"Before: \" + rdd.collect().mkString(\",\"))\n", 76 | "println(\"Distinct: \" + rdd.distinct().collect().mkString(\",\"))" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 4, 82 | "metadata": {}, 83 | "outputs": [ 84 | { 85 | "name": "stdout", 86 | "output_type": "stream", 87 | "text": [ 88 | "Before: 1,2,3\n", 89 | "To array:\n" 90 | ] 91 | }, 92 | { 93 | "data": { 94 | "text/plain": [ 95 | "Array(Array(1, 6), Array(2, 7), Array(3, 8))" 96 | ] 97 | }, 98 | "execution_count": 4, 99 | "metadata": {}, 100 | "output_type": "execute_result" 101 | } 102 | ], 103 | "source": [ 104 | "rdd = spark.sparkContext.parallelize(Array(1, 2, 3))\n", 105 | "println(\"Before: \" + rdd.collect().mkString(\",\"))\n", 106 | "println(\"To array:\")\n", 107 | "rdd.map(x => Array(x, x + 5)).collect()" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 5, 113 | "metadata": {}, 114 | "outputs": [ 115 | { 116 | "name": "stdout", 117 | "output_type": "stream", 118 | "text": [ 119 | "To flat array:\n" 120 | ] 121 | }, 122 | { 123 | "data": { 124 | "text/plain": [ 125 | "Array(1, 6, 2, 7, 3, 8)" 126 | ] 127 | }, 128 | "execution_count": 5, 129 | "metadata": {}, 130 | "output_type": "execute_result" 131 | } 132 | ], 133 | "source": [ 134 | "println(\"To flat array:\")\n", 135 | "rdd.flatMap(x => Array(x, x + 5)).collect()" 136 | ] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "metadata": { 141 | "collapsed": true 142 | }, 143 | "source": [ 144 | "# Actions" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 6, 150 | "metadata": {}, 151 | "outputs": [ 152 | { 153 | "name": "stdout", 154 | "output_type": "stream", 155 | "text": [ 156 | "Before: 1,2,3\n", 157 | "Reduce: 6\n", 158 | "Take 2: 1,2\n", 159 | "Collect: 1,2,3\n", 160 | "Count: 3\n" 161 | ] 162 | } 163 | ], 164 | "source": [ 165 | "// Python rdd.reduce(lambda a, b: a * b)\n", 166 | "rdd = spark.sparkContext.parallelize(Array(1, 2, 3))\n", 167 | "println(\"Before: \" + rdd.collect().mkString(\",\"))\n", 168 | "println(\"Reduce: \" + rdd.reduce((a, b) => a * b))\n", 169 | "println(\"Take 2: \" + rdd.take(2).mkString(\",\"))\n", 170 | "println(\"Collect: \" + rdd.collect().mkString(\",\"))\n", 171 | "println(\"Count: \" + rdd.count())" 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "metadata": {}, 177 | "source": [ 178 | "# Key-Value RDDs" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": 7, 184 | "metadata": {}, 185 | "outputs": [ 186 | { 187 | "data": { 188 | "text/plain": [ 189 | "Array((1,2), (3,10))" 190 | ] 191 | }, 192 | "execution_count": 7, 193 | "metadata": {}, 194 | "output_type": "execute_result" 195 | } 196 | ], 197 | "source": [ 198 | "val keyValReduceByKey = 
spark.sparkContext.parallelize(Seq((1, 2), (3, 4), (3, 6)))\n", 199 | "keyValReduceByKey.reduceByKey((a, b) => a + b).collect()" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 8, 205 | "metadata": {}, 206 | "outputs": [ 207 | { 208 | "data": { 209 | "text/plain": [ 210 | "Array((1,a), (1,b), (2,c))" 211 | ] 212 | }, 213 | "execution_count": 8, 214 | "metadata": {}, 215 | "output_type": "execute_result" 216 | } 217 | ], 218 | "source": [ 219 | "val keyValSortByKey = spark.sparkContext.parallelize(Seq((1, \"a\"), (2, \"c\"), (1, \"b\")))\n", 220 | "keyValSortByKey.sortByKey().collect()" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": 9, 226 | "metadata": {}, 227 | "outputs": [ 228 | { 229 | "data": { 230 | "text/plain": [ 231 | "Array((1,CompactBuffer(a, b)), (2,CompactBuffer(c)))" 232 | ] 233 | }, 234 | "execution_count": 9, 235 | "metadata": {}, 236 | "output_type": "execute_result" 237 | } 238 | ], 239 | "source": [ 240 | "val keyValGroupByKey = spark.sparkContext.parallelize(Seq((1, \"a\"), (2, \"c\"), (1, \"b\")))\n", 241 | "keyValGroupByKey.groupByKey().collect()" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": 10, 247 | "metadata": {}, 248 | "outputs": [ 249 | { 250 | "data": { 251 | "text/plain": [ 252 | "Array((a,(1,2)), (a,(1,3)))" 253 | ] 254 | }, 255 | "execution_count": 10, 256 | "metadata": {}, 257 | "output_type": "execute_result" 258 | } 259 | ], 260 | "source": [ 261 | "val x = spark.sparkContext.parallelize(Seq((\"a\", 1), (\"b\", 4)))\n", 262 | "val y = spark.sparkContext.parallelize(Seq((\"a\", 2), (\"a\", 3)))\n", 263 | "x.join(y).collect()" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": 11, 269 | "metadata": {}, 270 | "outputs": [ 271 | { 272 | "data": { 273 | "text/plain": [ 274 | "Array((a,(1,Some(2))), (a,(1,Some(3))), (b,(4,None)))" 275 | ] 276 | }, 277 | "execution_count": 11, 278 | "metadata": {}, 279 | "output_type": "execute_result" 280 | } 281 | ], 282 | "source": [ 283 | "x.leftOuterJoin(y).collect()" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": 12, 289 | "metadata": {}, 290 | "outputs": [ 291 | { 292 | "data": { 293 | "text/plain": [ 294 | "Array((a,(Some(1),2)), (a,(Some(1),3)))" 295 | ] 296 | }, 297 | "execution_count": 12, 298 | "metadata": {}, 299 | "output_type": "execute_result" 300 | } 301 | ], 302 | "source": [ 303 | "x.rightOuterJoin(y).collect()" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": 13, 309 | "metadata": {}, 310 | "outputs": [ 311 | { 312 | "data": { 313 | "text/plain": [ 314 | "Array((a,(Some(1),Some(2))), (a,(Some(1),Some(3))), (b,(Some(4),None)))" 315 | ] 316 | }, 317 | "execution_count": 13, 318 | "metadata": {}, 319 | "output_type": "execute_result" 320 | } 321 | ], 322 | "source": [ 323 | "x.fullOuterJoin(y).collect()" 324 | ] 325 | } 326 | ], 327 | "metadata": { 328 | "kernelspec": { 329 | "display_name": "Apache Toree - Scala", 330 | "language": "scala", 331 | "name": "apache_toree_scala" 332 | }, 333 | "language_info": { 334 | "file_extension": ".scala", 335 | "name": "scala", 336 | "version": "2.11.8" 337 | } 338 | }, 339 | "nbformat": 4, 340 | "nbformat_minor": 2 341 | } 342 | -------------------------------------------------------------------------------- /python/sentiment_analysis/classification_algorithms.py: -------------------------------------------------------------------------------- 1 | """ 2 | CLASSIFICATION 3 | Case study: Analyzing 
sentiment 4 | Models: 5 | Linear classifiers (logistic regression, SVMs, perceptron) 6 | Kernels 7 | Decision trees 8 | Algorithms: 9 | Stochastic gradient descent 10 | Boosting 11 | Concepts: 12 | Decision boundaries, MLE, ensemble methods, random forests, CART, online learning 13 | """ 14 | import datetime 15 | import os 16 | import re 17 | import time 18 | from itertools import islice 19 | from operator import itemgetter 20 | 21 | import numpy as np 22 | import pandas as pd 23 | from BeautifulSoup import BeautifulSoup 24 | from nltk.corpus import stopwords 25 | from sklearn.cross_validation import train_test_split 26 | from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis 27 | from sklearn.ensemble import AdaBoostClassifier 28 | from sklearn.ensemble import RandomForestClassifier 29 | from sklearn.feature_extraction.text import CountVectorizer 30 | from sklearn.gaussian_process import GaussianProcessClassifier 31 | from sklearn.gaussian_process.kernels import RBF 32 | from sklearn.naive_bayes import GaussianNB 33 | from sklearn.neighbors import KNeighborsClassifier 34 | from sklearn.neural_network import MLPClassifier 35 | from sklearn.svm import SVC 36 | from sklearn.tree import DecisionTreeClassifier 37 | 38 | 39 | def time_diff_str(t1, t2): 40 | """ 41 | Calculates time durations. 42 | """ 43 | diff = t2 - t1 44 | mins = int(diff / 60) 45 | secs = round(diff % 60, 2) 46 | return str(mins) + " mins and " + str(secs) + " seconds" 47 | 48 | 49 | def clean_sentence(sentence): 50 | # Remove HTML 51 | review_text = BeautifulSoup(sentence).text 52 | 53 | # Remove non-letters 54 | letters_only = re.sub("[^a-zA-Z]", " ", review_text) 55 | return letters_only 56 | 57 | 58 | def convert_plain_to_csv(plain_name, csv_name): 59 | t0 = time.time() 60 | with open(plain_name, "r") as f1, open(csv_name, "w") as f2: 61 | i = 0 62 | f2.write("productId,score,summary,text\n") 63 | while True: 64 | next_n_lines = list(islice(f1, 9)) 65 | if not next_n_lines: 66 | break 67 | 68 | # process next_n_lines: get productId,score,summary,text info 69 | # remove special characters from summary and text 70 | output_line = "" 71 | for line in next_n_lines: 72 | if "product/productId:" in line: 73 | output_line += line.split(":")[1].strip() + "," 74 | elif "review/score:" in line: 75 | output_line += line.split(":")[1].strip() + "," 76 | elif "review/summary:" in line: 77 | summary = clean_sentence(line.split(":")[1].strip()) + "," 78 | output_line += summary 79 | elif "review/text:" in line: 80 | text = clean_sentence(line.split(":")[1].strip()) + "\n" 81 | output_line += text 82 | 83 | f2.write(output_line) 84 | 85 | # print status 86 | i += 1 87 | if i % 10000 == 0: 88 | print "%d reviews converted..." % i 89 | 90 | print " %s - Converting completed %s" % (datetime.datetime.now(), time_diff_str(t0, time.time())) 91 | 92 | 93 | def get_reviews_data(file_name): 94 | """Get reviews data, from local csv.""" 95 | if os.path.exists(file_name): 96 | print("-- " + file_name + " found locally") 97 | df = pd.read_csv(file_name) 98 | 99 | return df 100 | 101 | 102 | def review_to_words(review): 103 | """ 104 | Function to convert a raw review to a string of words 105 | :param review 106 | :return: meaningful_words 107 | """ 108 | # 1. Convert to lower case, split into individual words 109 | words = review.lower().split() 110 | # 111 | # 2. 
In Python, searching a set is much faster than searching 112 | # a list, so convert the stop words to a set 113 | stops = set(stopwords.words("english")) 114 | # 115 | # 3. Remove stop words 116 | meaningful_words = [w for w in words if not w in stops] 117 | # 118 | # 4. Join the words back into one string separated by space, 119 | # and return the result. 120 | return " ".join(meaningful_words) 121 | 122 | 123 | def cleaning_data(dataset, file_name): 124 | t0 = time.time() 125 | 126 | # Get the number of reviews based on the dataframe column size 127 | num_reviews = dataset["text"].size 128 | 129 | # Initialize an empty list to hold the clean reviews 130 | clean_train_reviews = [] 131 | 132 | # Loop over each review 133 | for i in xrange(0, num_reviews): 134 | # If the index is evenly divisible by 1000, print a message 135 | if (i + 1) % 10000 == 0: 136 | print "Review %d of %d\n" % (i + 1, num_reviews) 137 | 138 | # Call our function for each one, and add the result to the list of 139 | # clean reviews 140 | productId = str(dataset["productId"][i]) 141 | score = str(dataset["score"][i]) 142 | summary = str(dataset["summary"][i]) 143 | text = review_to_words(str(dataset["text"][i])) 144 | 145 | clean_train_reviews.append(productId + "," + score + "," + summary + "," + text + "\n") 146 | 147 | print "Writing clean train reviews..." 148 | with open(file_name, "w") as f: 149 | f.write("productId,score,summary,text\n") 150 | for review in clean_train_reviews: 151 | f.write("%s\n" % review) 152 | 153 | print " %s - Write file completed %s" % (datetime.datetime.now(), time_diff_str(t0, time.time())) 154 | 155 | 156 | def print_words_frequency(train_data_features): 157 | # Take a look at the words in the vocabulary 158 | vocab = vectorizer.get_feature_names() 159 | print "Words in vocabulary:", vocab 160 | 161 | # Sum up the counts of each vocabulary word 162 | dist = np.sum(train_data_features, axis=0) 163 | 164 | # For each, print the vocabulary word and the number of times it 165 | # appears in the training set 166 | print "Words frequency..." 
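# note: because the term-document matrix passed in was densified with .toarray(),
# np.sum(..., axis=0) above yields a plain 1-D count vector that zips element-for-element with the vocabulary list below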
167 | for tag, count in zip(vocab, dist): 168 | print count, tag 169 | 170 | 171 | if __name__ == "__main__": 172 | """ 173 | Pre-processing 174 | """ 175 | # converting plain text for next processing 176 | convert_plain_to_csv("foods.txt", "foods.csv") 177 | 178 | # Reading the Data 179 | train = get_reviews_data("foods.csv") 180 | print "Data dimensions:", train.shape 181 | print "List features:", train.columns.values 182 | print "First review:", train["summary"][0], "|", train["text"][0] 183 | 184 | cleaning_data(train, "clean_train_reviews.csv") 185 | 186 | """ 187 | Bag of Words features 188 | """ 189 | 190 | clean_train_reviews = pd.read_csv("clean_train_reviews.csv", nrows=1000) 191 | 192 | # ignore all 3* reviews 193 | clean_train_reviews = clean_train_reviews[clean_train_reviews["score"] != 3] 194 | # positive sentiment = 4* or 5* reviews 195 | clean_train_reviews["sentiment"] = clean_train_reviews["score"] >= 4 196 | 197 | train, test = train_test_split(clean_train_reviews, test_size=0.2) 198 | 199 | print "Creating the bag of words...\n" 200 | vectorizer = CountVectorizer(analyzer="word", 201 | tokenizer=None, 202 | preprocessor=None, 203 | stop_words=None, 204 | max_features=10) 205 | 206 | train_text = train["text"].values.astype('U') 207 | test_text = test["text"].values.astype('U') 208 | 209 | # convert the data sets to term-document matrices; the vocabulary is fit on the training text only 210 | X_train = vectorizer.fit_transform(train_text).toarray() 211 | y_train = train["sentiment"] 212 | 213 | X_test = vectorizer.transform(test_text).toarray() 214 | y_test = test["sentiment"] 215 | 216 | print_words_frequency(X_train) 217 | 218 | """ 219 | Training 220 | """ 221 | 222 | print "---------------------------" 223 | print "Training" 224 | print "---------------------------" 225 | 226 | names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Gaussian Process", 227 | "Decision Tree", "Random Forest", "Neural Net", "AdaBoost", 228 | "Naive Bayes", "QDA"] 229 | 230 | classifiers = [ 231 | KNeighborsClassifier(3), 232 | SVC(kernel="linear", C=0.025), 233 | SVC(gamma=2, C=1), 234 | GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True), 235 | DecisionTreeClassifier(max_depth=5), 236 | RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1), 237 | MLPClassifier(alpha=1), 238 | AdaBoostClassifier(), 239 | GaussianNB(), 240 | QuadraticDiscriminantAnalysis()] 241 | 242 | # iterate over classifiers 243 | results = {} 244 | 245 | for name, clf in zip(names, classifiers): 246 | print "Training " + name + " classifier..." 
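# each classifier is fit on the training term-document matrix; clf.score() below reports
# mean accuracy on the held-out 20% split produced by train_test_split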
247 | clf.fit(X_train, y_train) 248 | score = clf.score(X_test, y_test) 249 | results[name] = score 250 | 251 | print "---------------------------" 252 | print "Evaluation results" 253 | print "---------------------------" 254 | 255 | # sort the results by accuracy and print them out 256 | ranked_results = sorted(results.items(), key=itemgetter(1)) 257 | for name, score in ranked_results: 258 | print name + " accuracy: %0.3f" % score 259 | -------------------------------------------------------------------------------- /deep_learning/snippets/training_network.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | 3 | import matplotlib.image as mpimg 4 | import matplotlib.pyplot as plt 5 | import numpy as np 6 | import tensorflow as tf 7 | from scipy.misc import imresize 8 | from tensorflow.python.framework import ops 9 | 10 | plt.style.use('ggplot') 11 | 12 | 13 | # absolute-difference (L1-norm) cost between prediction and target 14 | def distance(p1, p2): 15 | return tf.abs(p1 - p2) 16 | 17 | 18 | # training routine: stochastic / mini-batch gradient descent 19 | def train(X, Y, Y_pred, n_iterations=100, batch_size=200, learning_rate=0.02): 20 | cost = tf.reduce_mean(distance(Y_pred, Y)) 21 | optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost) 22 | 23 | with tf.Session() as sess: 24 | # tell TensorFlow to initialize every variable in the graph; 25 | # this is the point where `W` and `b` receive their initial values 26 | sess.run(tf.global_variables_initializer()) 27 | 28 | # start the training loop 29 | prev_training_cost = 0.0 30 | for it_i in range(n_iterations): 31 | # shuffle the indices of the x-axis samples 32 | idxs = np.random.permutation(range(len(xs))) 33 | n_batches = len(idxs) // batch_size 34 | for batch_i in range(n_batches): 35 | # take the next batch_size randomly permuted x-axis samples 36 | # for this training step 37 | idxs_i = idxs[batch_i * batch_size: (batch_i + 1) * batch_size] 38 | sess.run(optimizer, feed_dict={X: xs[idxs_i], Y: ys[idxs_i]}) 39 | 40 | # evaluate the current training cost 41 | training_cost = sess.run(cost, feed_dict={X: xs, Y: ys}) 42 | 43 | if it_i % 10 == 0: 44 | # print the current training cost 45 | print "Cost:", training_cost 46 | 47 | # stop training when the cost has stopped changing noticeably 48 | if np.abs(prev_training_cost - training_cost) < 0.000001: 49 | print "Stop training..." 50 | break 51 | 52 | # update the running training cost 53 | prev_training_cost = training_cost 54 | 55 | 56 | def linear(X, n_input, n_output, activation=None, scope=None): 57 | with tf.variable_scope(scope or "linear"): 58 | # initialize the weight matrix W for this layer 59 | W = tf.get_variable( 60 | name='W', 61 | shape=[n_input, n_output], 62 | initializer=tf.random_normal_initializer(mean=0.0, stddev=0.1)) 63 | 64 | # initialize the bias for this layer 65 | b = tf.get_variable( 66 | name='b', 67 | shape=[n_output], 68 | initializer=tf.constant_initializer()) 69 | 70 | # compute the prediction (hypothesis h) and apply the activation if given 71 | h = tf.matmul(X, W) + b 72 | if activation is not None: 73 | h = activation(h) 74 | return h 75 | 76 | 77 | def image_inpainting(X, Y, Y_pred, n_iterations=100, batch_size=200, learning_rate=0.001): 78 | cost = tf.reduce_mean(tf.reduce_sum(distance(Y_pred, Y), 1)) 79 | optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost) 80 | 81 | with tf.Session() as sess: 82 | # tell TensorFlow to initialize every variable in the graph; 83 | # this is the point where the layer weights and biases receive their initial values 84 | sess.run(tf.global_variables_initializer()) 85 | 86 | # start the training loop 87 | prev_training_cost = 0.0 88 | for it_i in range(n_iterations): 89 | # shuffle the indices of the x-axis samples 90 | idxs = np.random.permutation(range(len(xs))) 91 | n_batches = len(idxs) // batch_size 92 | for batch_i in range(n_batches): 93 | # take the next batch_size randomly permuted x-axis samples 94 | # for this training step 95 | idxs_i = idxs[batch_i * batch_size: (batch_i + 1) * batch_size] 96 | sess.run(optimizer, feed_dict={X: xs[idxs_i], Y: ys[idxs_i]}) 97 | 98 | # evaluate the current training cost 99 | training_cost = sess.run(cost, feed_dict={X: xs, Y: ys}) 100 | 101 | # print the current training cost 102 | print "Cost", it_i, training_cost 103 | 104 | if (it_i + 1) % 20 == 0: 105 | # fetch the current predictions and show them as an image 106 | ys_pred = Y_pred.eval(feed_dict={X: xs}, session=sess) 107 | fig, ax = plt.subplots(1, 1) 108 | img_pred = np.clip(ys_pred.reshape(img.shape), 0, 255).astype(np.uint8) 109 | plt.imshow(img_pred) 110 | plt.show() 111 | 112 | # stop training when the cost has stopped changing noticeably 113 | if np.abs(prev_training_cost - training_cost) < 0.000001: 114 | print "Stop training..." 115 | break 116 | 117 | # update the running training cost 118 | prev_training_cost = training_cost 119 | 120 | 121 | if __name__ == "__main__": 122 | # define the number of observations 123 | n_observations = 1000 124 | 125 | # create the input values 126 | xs = np.linspace(-3, 3, n_observations) 127 | 128 | # create the outputs as a noisy sine curve 129 | ys = np.sin(xs) + np.random.uniform(-0.5, 0.5, n_observations) 130 | plt.scatter(xs, ys, alpha=0.15, marker='+') 131 | plt.show() 132 | 133 | # create a placeholder named X for feeding in the x-axis values 134 | # name=`X` makes the operation easy to find in the Graph 135 | X = tf.placeholder(tf.float32, name='X') 136 | 137 | # create a placeholder for feeding in the y-axis values 138 | Y = tf.placeholder(tf.float32, name='Y') 139 | 140 | ######################### 141 | # Simple Neural Network # 142 | ######################### 143 | 144 | # to create a variable we use tf.Variable; unlike a placeholder, it does not 145 | # require a value to be fed at the moment we run/eval. 146 | # we draw the initial value from a normal distribution and pass it to tf.Variable to create the tensor object 147 | W = tf.Variable(tf.random_normal([1], dtype=tf.float32, stddev=0.1), name='weight') 148 | 149 | # initialize the bias variable with zero 150 | B = tf.Variable(tf.constant([0], dtype=tf.float32), name='bias') 151 | 152 | # the predicted value 153 | Y_pred = X * W + B 154 | 155 | # train the model 156 | print "Training linear model..." 157 | train(X, Y, Y_pred, 500, 1000) 158 | 159 | # raise the model to a higher polynomial degree 160 | degree = 3 161 | Y_pred = tf.Variable(tf.random_normal([1]), name='bias') 162 | W = tf.Variable(tf.random_normal([1], stddev=0.1), name='weight_%d' % degree) 163 | Y_pred = tf.add(tf.multiply(tf.pow(X, degree), W), Y_pred) 164 | 165 | # train the model 166 | print "Training polynomial model..." 167 | train(X, Y, Y_pred, 500, 100, 0.01) 168 | 169 | ######################################## 170 | # Nonlinearities / Activation Function # 171 | ######################################## 172 | 173 | sess = tf.InteractiveSession() 174 | x = np.linspace(-6, 6, 1000) 175 | plt.plot(x, tf.nn.tanh(x).eval(), label='tanh') 176 | plt.plot(x, tf.nn.sigmoid(x).eval(), label='sigmoid') 177 | plt.plot(x, tf.nn.relu(x).eval(), label='relu') 178 | plt.legend(loc='lower right') 179 | plt.xlim([-6, 6]) 180 | plt.ylim([-2, 2]) 181 | plt.xlabel('Input') 182 | plt.ylabel('Output') 183 | plt.grid('on') 184 | plt.show() 185 | 186 | # clear the graph 187 | ops.reset_default_graph() 188 | 189 | # get current graph 190 | g = tf.get_default_graph() 191 | 192 | # build a new network 193 | X = tf.placeholder(tf.float32, name='X') 194 | h = linear(X, 2, 10, scope='layer1') 195 | 196 | # make it a Deep Network! 197 | h2 = linear(h, 10, 10, scope='layer2') 198 | 199 | # add one more layer! 200 | h3 = linear(h2, 10, 3, scope='layer3') 201 | 202 | # list the operations currently in the graph 203 | print [op.name for op in tf.get_default_graph().get_operations()] 204 | 205 | #################### 206 | # Image Inpainting # 207 | #################### 208 | img = mpimg.imread("imgs/dogs.jpg") 209 | img = imresize(img, (64, 64)) 210 | plt.imshow(img) 211 | plt.show() 212 | 213 | # store each pixel position as an x-axis input 214 | xs = [] 215 | 216 | # store the colour value that corresponds to each pixel position 217 | ys = [] 218 | 219 | # walk over every pixel 220 | for row_i in range(img.shape[0]): 221 | for col_i in range(img.shape[1]): 222 | # store the input values 223 | xs.append([row_i, col_i]) 224 | 225 | # store the output colour value the network has to predict 226 | ys.append(img[row_i, col_i]) 227 | 228 | # convert lists to arrays for numpy calculation 229 | xs = np.array(xs) 230 | ys = np.array(ys) 231 | 232 | # Normalizing the input by the mean and standard deviation 233 | xs = (xs - np.mean(xs)) / np.std(xs) 234 | 235 | # print the shapes 236 | print xs.shape, ys.shape 237 | 238 | X = tf.placeholder(tf.float32, shape=[None, 2], name='X') 239 | Y = tf.placeholder(tf.float32, shape=[None, 3], name='Y') 240 | 241 | # building networks 242 | n_neurons = [2, 64, 64, 64, 64, 64, 64, 3] 243 | 244 | current_input = X 245 | for layer_i in range(1, len(n_neurons)): 246 | current_input = linear( 247 | X=current_input, 248 | n_input=n_neurons[layer_i - 1], 249 | n_output=n_neurons[layer_i], 250 | activation=tf.nn.relu if (layer_i + 1) < len(n_neurons) else None, 251 | scope='layer_' + str(layer_i)) 252 | 253 | # train the inpainting network 254 | Y_pred = current_input 255 | image_inpainting(X, Y, Y_pred) 256 | --------------------------------------------------------------------------------
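The train() routine above wraps plain mini-batch gradient descent in TensorFlow session machinery. The update it performs is small enough to write out directly; the following is a minimal NumPy-only sketch (not part of the repository; the names w, b, lr are illustrative) of the same L1-cost fit on the noisy sine data.

import numpy as np

rng = np.random.RandomState(0)
xs = np.linspace(-3, 3, 1000)
ys = np.sin(xs) + rng.uniform(-0.5, 0.5, 1000)

w, b, lr, batch_size = 0.0, 0.0, 0.02, 200
for it_i in range(100):
    # shuffle the sample indices, then walk over them batch by batch
    idxs = rng.permutation(len(xs))
    for batch_i in range(len(xs) // batch_size):
        batch = idxs[batch_i * batch_size:(batch_i + 1) * batch_size]
        pred = w * xs[batch] + b
        grad = np.sign(pred - ys[batch])       # subgradient of |pred - y| w.r.t. pred
        w -= lr * np.mean(grad * xs[batch])    # chain rule through pred = w * x + b
        b -= lr * np.mean(grad)
    if it_i % 10 == 0:
        print "Cost:", np.mean(np.abs(w * xs + b - ys))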
/deep_learning/src/main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import csv 3 | import datetime 4 | import os 5 | import sys 6 | import time 7 | from operator import itemgetter 8 | 9 | import numpy as np 10 | import pandas as pd 11 | from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis 12 | from sklearn.ensemble import AdaBoostClassifier 13 | from sklearn.ensemble import RandomForestClassifier 14 | from sklearn.gaussian_process import GaussianProcessClassifier 15 | from sklearn.gaussian_process.kernels import RBF 16 | from sklearn.model_selection import train_test_split 17 | from sklearn.naive_bayes import GaussianNB 18 | from sklearn.neighbors import KNeighborsClassifier 19 | from sklearn.neural_network import MLPClassifier 20 | from sklearn.svm import SVC 21 | from sklearn.tree import DecisionTreeClassifier 22 | 23 | sys.path.append("/usr/local/lib/python2.7/site-packages") 24 | import cv2 25 | import imutils 26 | from imutils import paths 27 | 28 | 29 | def time_diff_str(t1, t2): 30 | """ 31 | Calculates time durations. 32 | """ 33 | diff = t2 - t1 34 | mins = int(diff / 60) 35 | secs = round(diff % 60, 2) 36 | return str(mins) + " mins and " + str(secs) + " seconds" 37 | 38 | 39 | def image_to_feature_vector(image, size=(32, 32)): 40 | # resize the image to a fixed size, then flatten the image into 41 | # a list of raw pixel intensities 42 | return cv2.resize(image, size).flatten() 43 | 44 | 45 | def extract_color_histogram(image): 46 | hist = cv2.calcHist([image], [0], None, [8], [0, 256]) 47 | 48 | # handle normalizing the histogram if we are using OpenCV 2.4.X 49 | if imutils.is_cv2(): 50 | hist = cv2.normalize(hist) 51 | 52 | # otherwise, perform "in place" normalization in OpenCV 3 (I 53 | # personally hate the way this is done 54 | else: 55 | cv2.normalize(hist, hist) 56 | 57 | # return the flattened histogram as the feature vector 58 | return hist.flatten() 59 | 60 | 61 | def load_csv(file_path): 62 | """Get data, from local csv.""" 63 | if os.path.exists(file_path): 64 | print "[INFO] load", file_path, "file..." 
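# index_col=0 makes the patient id the index, so df.to_dict() below returns
# {"cancer": {patient_id: label, ...}}, which is the shape the feature/label extractors in this module expect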
65 | df = pd.read_csv(file_path, index_col=0) 66 | 67 | return df.to_dict() 68 | 69 | 70 | def get_simple_feature_labels(patient_labels, img_paths): 71 | features = [] 72 | labels = [] 73 | 74 | # loop over the input images 75 | for (i, img_path) in enumerate(img_paths): 76 | # get only training labels 77 | base = os.path.basename(img_path) 78 | patient_id = os.path.splitext(base)[0] 79 | if patient_id in patient_labels["cancer"].keys(): 80 | labels.append(patient_labels["cancer"][patient_id]) 81 | else: 82 | continue 83 | 84 | # load the image 85 | image = cv2.imread(img_path) 86 | 87 | # histogram to characterize the color distribution of the pixels 88 | # in the image 89 | feat = image_to_feature_vector(image) 90 | 91 | # update features 92 | features.append(feat) 93 | 94 | # show an update every 100 images 95 | if i > 0 and i % 100 == 0: 96 | print("[INFO] processed {}/{}".format(i, len(img_paths))) 97 | 98 | return features, labels 99 | 100 | 101 | def get_hist_feature_labels(patient_labels, img_paths): 102 | features = [] 103 | labels = [] 104 | 105 | # loop over the input images 106 | for (i, img_path) in enumerate(img_paths): 107 | # get only training labels 108 | base = os.path.basename(img_path) 109 | patient_id = os.path.splitext(base)[0] 110 | if patient_id in patient_labels["cancer"].keys(): 111 | labels.append(patient_labels["cancer"][patient_id]) 112 | else: 113 | continue 114 | 115 | # load the image 116 | image = cv2.imread(img_path) 117 | 118 | # histogram to characterize the color distribution of the pixels 119 | # in the image 120 | hist = extract_color_histogram(image) 121 | 122 | # update features 123 | features.append(hist) 124 | 125 | # show an update every 100 images 126 | if i > 0 and i % 100 == 0: 127 | print("[INFO] processed {}/{}".format(i, len(img_paths))) 128 | 129 | return features, labels 130 | 131 | 132 | def generate_bow_features(img_paths, dictionarySize=5): 133 | BOW = cv2.BOWKMeansTrainer(dictionarySize) 134 | sift = cv2.xfeatures2d.SIFT_create() 135 | 136 | for (i, image_path) in enumerate(img_paths): 137 | gray = cv2.imread(image_path) 138 | kp, dsc = sift.detectAndCompute(gray, None) 139 | BOW.add(dsc) 140 | print("# kps: {}, descriptors: {}".format(len(kp), dsc.shape)) 141 | 142 | # dictionary created 143 | dictionary = BOW.cluster() 144 | index_params = dict(algorithm=0, trees=5) 145 | search_params = dict(checks=50) # or pass empty dictionary 146 | flann = cv2.FlannBasedMatcher(index_params, search_params) 147 | sift2 = cv2.xfeatures2d.SIFT_create() 148 | bowDiction = cv2.BOWImgDescriptorExtractor(sift2, cv2.BFMatcher(cv2.NORM_L2)) 149 | bowDiction.setVocabulary(dictionary) 150 | print "[INFO] Finished create BOW dictionary", time_diff_str(t_start, time.time()) 151 | return bowDiction 152 | 153 | 154 | def sift_feature_extract(img_paths, patient_labels, bow_dict): 155 | features = [] 156 | labels = [] 157 | 158 | # loop over the input images 159 | for (i, img_path) in enumerate(img_paths): 160 | # get only training labels 161 | base = os.path.basename(img_path) 162 | patient_id = os.path.splitext(base)[0] 163 | if patient_id in patient_labels["cancer"]: 164 | labels.append(patient_labels["cancer"][patient_id]) 165 | else: 166 | continue 167 | 168 | # load the image 169 | gray = cv2.imread(img_path) 170 | sift_feature = bow_dict.compute(gray, sift.detect(gray)) 171 | 172 | # update features 173 | features.extend(sift_feature) 174 | 175 | # show an update every 100 images 176 | if i > 0 and i % 100 == 0: 177 | print("[INFO] processed 
{}/{}".format(i, len(img_paths))) 178 | 179 | return features, labels 180 | 181 | 182 | if __name__ == "__main__": 183 | t_start = time.time() 184 | 185 | # construct the argument parse and parse the arguments 186 | ap = argparse.ArgumentParser() 187 | ap.add_argument("-d", "--dataset", required=True, help="path to input dataset") 188 | ap.add_argument("-j", "--jobs", type=int, default=-1, help="# of jobs (-1 uses all available cores)") 189 | args = vars(ap.parse_args()) 190 | 191 | # grab the list of images that we'll be describing 192 | print("[INFO] describing images...") 193 | img_paths = list(paths.list_images(args["dataset"])) 194 | 195 | # load train/test labels 196 | stage1_labels = load_csv("../data/stage1_labels.csv") 197 | stage1_sample_submission = load_csv("../data/stage1_sample_submission.csv") 198 | 199 | # Generating Bag of Words model 200 | # generate_bow_features(img_paths) 201 | 202 | # train_features, train_labels = get_hist_feature_labels(stage1_labels, img_paths) 203 | # test_features, test_labels = get_hist_feature_labels(stage1_sample_submission, img_paths) 204 | # train_features, train_labels = sift_feature_extract(img_paths, stage1_labels, bowDiction) 205 | # test_features, test_labels = sift_feature_extract(img_paths, stage1_sample_submission, bowDiction) 206 | train_features, train_labels = get_simple_feature_labels(stage1_labels, img_paths) 207 | test_features, test_labels = get_simple_feature_labels(stage1_sample_submission, img_paths) 208 | train_features = np.array(train_features) 209 | print("[INFO] features matrix: {:.2f}MB".format(train_features.nbytes / (1024 * 1000.0))) 210 | 211 | (for_train_features, dev_features, for_train_labels, dev_labels) = train_test_split(train_features, 212 | train_labels, 213 | test_size=0.25, 214 | random_state=42) 215 | 216 | print "---------------------------" 217 | print "Training" 218 | print "---------------------------" 219 | 220 | classifiers = { 221 | "Nearest Neighbors": KNeighborsClassifier(3, n_jobs=args["jobs"]), 222 | "Linear SVM": SVC(kernel="linear", C=0.025), 223 | "RBF SVM": SVC(gamma=2, C=1), 224 | "Gaussian Process": GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True, n_jobs=args["jobs"]), 225 | "Decision Tree": DecisionTreeClassifier(max_depth=5), 226 | "Random Forest": RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1, n_jobs=args["jobs"]), 227 | "Neural Net": MLPClassifier(alpha=1), 228 | "AdaBoost": AdaBoostClassifier(), 229 | "Naive Bayes": GaussianNB(), 230 | "QDA": QuadraticDiscriminantAnalysis() 231 | } 232 | 233 | # iterate over classifiers 234 | results = {} 235 | 236 | for name in classifiers: 237 | print "[INFO]" + name + " classifier..." 238 | clf = classifiers[name] 239 | clf.fit(for_train_features, for_train_labels) 240 | score = clf.score(dev_features, dev_labels) 241 | results[name] = score 242 | 243 | print "---------------------------" 244 | print "Evaluation results" 245 | print "---------------------------" 246 | 247 | # sorting results and print out 248 | sorted(results.items(), key=itemgetter(1)) 249 | for name in results: 250 | print "[INFO]", name, "accuracy: %0.3f" % results[name] 251 | 252 | print "---------------------------" 253 | print "Training for submission" 254 | print "---------------------------" 255 | 256 | name = list(results)[0] 257 | clf = classifiers[name] 258 | print "[INFO]" + name + " classifier..." 
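# caution: `name` above is list(results)[0], i.e. whichever key happens to come first in dict order;
# the sorted(...) call earlier returns a new list without reordering `results`, so this is not
# necessarily the best-scoring classifier from the evaluation step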
259 | clf.fit(train_features, train_labels) 260 | predict_submission = clf.predict(test_features) 261 | 262 | # update submission 263 | submission = {} 264 | for (i, patient_id) in enumerate(stage1_sample_submission["cancer"]): 265 | submission[patient_id] = predict_submission[i] 266 | 267 | with open("submission_results.csv", "wb") as f: 268 | writer = csv.writer(f, delimiter=',') 269 | writer.writerow(["id", "cancer"]) 270 | for key, value in submission.items(): 271 | writer.writerow([key, value]) 272 | 273 | print "[INFO]", datetime.datetime.now(), "* DONE After *", time_diff_str(t_start, time.time()) 274 | -------------------------------------------------------------------------------- /python/jupyter/Getting started with iPython Notebook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "#Installing Python and GraphLab Create" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Please follow the installation instructions here before getting started:\n", 15 | "\n", 16 | "\n", 17 | "##We have done\n", 18 | "* Installed Python\n", 19 | "* Started Ipython Notebook" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "#Getting started with Python" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 1, 32 | "metadata": { 33 | "collapsed": false 34 | }, 35 | "outputs": [ 36 | { 37 | "name": "stdout", 38 | "output_type": "stream", 39 | "text": [ 40 | "Hello World!\n" 41 | ] 42 | } 43 | ], 44 | "source": [ 45 | "print 'Hello World!'" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "##Create some variables in Python" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 2, 58 | "metadata": { 59 | "collapsed": true 60 | }, 61 | "outputs": [], 62 | "source": [ 63 | "i = 4 #int" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 3, 69 | "metadata": { 70 | "collapsed": false 71 | }, 72 | "outputs": [ 73 | { 74 | "data": { 75 | "text/plain": [ 76 | "int" 77 | ] 78 | }, 79 | "execution_count": 3, 80 | "metadata": {}, 81 | "output_type": "execute_result" 82 | } 83 | ], 84 | "source": [ 85 | "type(i)" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 4, 91 | "metadata": { 92 | "collapsed": true 93 | }, 94 | "outputs": [], 95 | "source": [ 96 | "f = 4.1 #float" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 5, 102 | "metadata": { 103 | "collapsed": false 104 | }, 105 | "outputs": [ 106 | { 107 | "data": { 108 | "text/plain": [ 109 | "float" 110 | ] 111 | }, 112 | "execution_count": 5, 113 | "metadata": {}, 114 | "output_type": "execute_result" 115 | } 116 | ], 117 | "source": [ 118 | "type(f)" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 6, 124 | "metadata": { 125 | "collapsed": true 126 | }, 127 | "outputs": [], 128 | "source": [ 129 | "b = True #boolean variable" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 7, 135 | "metadata": { 136 | "collapsed": true 137 | }, 138 | "outputs": [], 139 | "source": [ 140 | "s = \"This is a string!\"" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 8, 146 | "metadata": { 147 | "collapsed": false 148 | }, 149 | "outputs": [ 150 | { 151 | "name": "stdout", 152 | "output_type": "stream", 153 | "text": [ 154 | 
"This is a string!\n" 155 | ] 156 | } 157 | ], 158 | "source": [ 159 | "print s" 160 | ] 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "metadata": {}, 165 | "source": [ 166 | "##Advanced python types" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": 9, 172 | "metadata": { 173 | "collapsed": true 174 | }, 175 | "outputs": [], 176 | "source": [ 177 | "l = [3,1,2] #list" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": 10, 183 | "metadata": { 184 | "collapsed": false 185 | }, 186 | "outputs": [ 187 | { 188 | "name": "stdout", 189 | "output_type": "stream", 190 | "text": [ 191 | "[3, 1, 2]\n" 192 | ] 193 | } 194 | ], 195 | "source": [ 196 | "print l" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": 11, 202 | "metadata": { 203 | "collapsed": true 204 | }, 205 | "outputs": [], 206 | "source": [ 207 | "d = {'foo':1, 'bar':2.3, 's':'my first dictionary'} #dictionary" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 12, 213 | "metadata": { 214 | "collapsed": false 215 | }, 216 | "outputs": [ 217 | { 218 | "name": "stdout", 219 | "output_type": "stream", 220 | "text": [ 221 | "{'s': 'my first dictionary', 'foo': 1, 'bar': 2.3}\n" 222 | ] 223 | } 224 | ], 225 | "source": [ 226 | "print d" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": 13, 232 | "metadata": { 233 | "collapsed": false 234 | }, 235 | "outputs": [ 236 | { 237 | "name": "stdout", 238 | "output_type": "stream", 239 | "text": [ 240 | "1\n" 241 | ] 242 | } 243 | ], 244 | "source": [ 245 | "print d['foo'] #element of a dictionary" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": 14, 251 | "metadata": { 252 | "collapsed": false 253 | }, 254 | "outputs": [], 255 | "source": [ 256 | "n = None #Python's null type" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": 15, 262 | "metadata": { 263 | "collapsed": false 264 | }, 265 | "outputs": [ 266 | { 267 | "data": { 268 | "text/plain": [ 269 | "NoneType" 270 | ] 271 | }, 272 | "execution_count": 15, 273 | "metadata": {}, 274 | "output_type": "execute_result" 275 | } 276 | ], 277 | "source": [ 278 | "type(n)" 279 | ] 280 | }, 281 | { 282 | "cell_type": "markdown", 283 | "metadata": {}, 284 | "source": [ 285 | "##Advanced printing" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 16, 291 | "metadata": { 292 | "collapsed": false 293 | }, 294 | "outputs": [ 295 | { 296 | "name": "stdout", 297 | "output_type": "stream", 298 | "text": [ 299 | "Our float value is 4.1. Our int value is 4.\n" 300 | ] 301 | } 302 | ], 303 | "source": [ 304 | "print \"Our float value is %s. 
Our int value is %s.\" % (f,i) #Python is pretty good with strings" 305 | ] 306 | }, 307 | { 308 | "cell_type": "markdown", 309 | "metadata": {}, 310 | "source": [ 311 | "##Conditional statements in python" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": 17, 317 | "metadata": { 318 | "collapsed": false 319 | }, 320 | "outputs": [ 321 | { 322 | "name": "stdout", 323 | "output_type": "stream", 324 | "text": [ 325 | "i or f are both greater than 4.\n" 326 | ] 327 | } 328 | ], 329 | "source": [ 330 | "if i == 1 and f > 4:\n", 331 | " print \"The value of i is 1 and f is greater than 4.\"\n", 332 | "elif i > 4 or f > 4:\n", 333 | " print \"i or f are both greater than 4.\"\n", 334 | "else:\n", 335 | " print \"both i and f are less than or equal to 4\"\n" 336 | ] 337 | }, 338 | { 339 | "cell_type": "markdown", 340 | "metadata": {}, 341 | "source": [ 342 | "##Conditional loops" 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": 18, 348 | "metadata": { 349 | "collapsed": false 350 | }, 351 | "outputs": [ 352 | { 353 | "name": "stdout", 354 | "output_type": "stream", 355 | "text": [ 356 | "[3, 1, 2]\n" 357 | ] 358 | } 359 | ], 360 | "source": [ 361 | "print l" 362 | ] 363 | }, 364 | { 365 | "cell_type": "code", 366 | "execution_count": 19, 367 | "metadata": { 368 | "collapsed": false 369 | }, 370 | "outputs": [ 371 | { 372 | "name": "stdout", 373 | "output_type": "stream", 374 | "text": [ 375 | "3\n", 376 | "1\n", 377 | "2\n" 378 | ] 379 | } 380 | ], 381 | "source": [ 382 | "for e in l:\n", 383 | " print e" 384 | ] 385 | }, 386 | { 387 | "cell_type": "markdown", 388 | "metadata": {}, 389 | "source": [ 390 | "Note that in Python, we don't use {} or other markers to indicate the part of the loop that gets iterated. Instead, we just indent and align each of the iterated statements with spaces or tabs. (You can use as many as you want, as long as the lines are aligned.)" 391 | ] 392 | }, 393 | { 394 | "cell_type": "code", 395 | "execution_count": 20, 396 | "metadata": { 397 | "collapsed": false 398 | }, 399 | "outputs": [ 400 | { 401 | "name": "stdout", 402 | "output_type": "stream", 403 | "text": [ 404 | "6\n", 405 | "7\n", 406 | "8\n", 407 | "9\n" 408 | ] 409 | } 410 | ], 411 | "source": [ 412 | "counter = 6\n", 413 | "while counter < 10:\n", 414 | " print counter\n", 415 | " counter += 1" 416 | ] 417 | }, 418 | { 419 | "cell_type": "markdown", 420 | "metadata": { 421 | "collapsed": true 422 | }, 423 | "source": [ 424 | "#Creating functions in Python\n", 425 | "\n", 426 | "Again, we don't use {}, but just indent the lines that are part of the function." 
427 | ] 428 | }, 429 | { 430 | "cell_type": "code", 431 | "execution_count": 21, 432 | "metadata": { 433 | "collapsed": true 434 | }, 435 | "outputs": [], 436 | "source": [ 437 | "def add2(x):\n", 438 | " y = x + 2\n", 439 | " return y" 440 | ] 441 | }, 442 | { 443 | "cell_type": "code", 444 | "execution_count": 22, 445 | "metadata": { 446 | "collapsed": true 447 | }, 448 | "outputs": [], 449 | "source": [ 450 | "i = 5" 451 | ] 452 | }, 453 | { 454 | "cell_type": "code", 455 | "execution_count": 23, 456 | "metadata": { 457 | "collapsed": false 458 | }, 459 | "outputs": [ 460 | { 461 | "data": { 462 | "text/plain": [ 463 | "7" 464 | ] 465 | }, 466 | "execution_count": 23, 467 | "metadata": {}, 468 | "output_type": "execute_result" 469 | } 470 | ], 471 | "source": [ 472 | "add2(i)" 473 | ] 474 | }, 475 | { 476 | "cell_type": "markdown", 477 | "metadata": {}, 478 | "source": [ 479 | "We can also define simple functions with lambdas:" 480 | ] 481 | }, 482 | { 483 | "cell_type": "code", 484 | "execution_count": 24, 485 | "metadata": { 486 | "collapsed": true 487 | }, 488 | "outputs": [], 489 | "source": [ 490 | "square = lambda x: x*x" 491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": null, 496 | "metadata": { 497 | "collapsed": true 498 | }, 499 | "outputs": [], 500 | "source": [] 501 | } 502 | ], 503 | "metadata": { 504 | "kernelspec": { 505 | "display_name": "Python 2", 506 | "language": "python", 507 | "name": "python2" 508 | }, 509 | "language_info": { 510 | "codemirror_mode": { 511 | "name": "ipython", 512 | "version": 2 513 | }, 514 | "file_extension": ".py", 515 | "mimetype": "text/x-python", 516 | "name": "python", 517 | "nbconvert_exporter": "python", 518 | "pygments_lexer": "ipython2", 519 | "version": "2.7.10" 520 | } 521 | }, 522 | "nbformat": 4, 523 | "nbformat_minor": 0 524 | } 525 | -------------------------------------------------------------------------------- /pyspark/notebooks/spark_essentials.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# RDD overview\n", 8 | "- Programmer specifies number of partitions\n", 9 | "- Driver passes each partition to corresponding Workers\n", 10 | "- Master parameter specifies number of workers.\n", 11 | "- Spark automatically pushes closures to workers.\n", 12 | "\n", 13 | "# Some transformations\n", 14 | "- map(func): return a new distributed dataset formed by passing each element of the source through a function func.\n", 15 | "- filter(func): return a new dataset formed by selecting those elements of the source on which func returns true.\n", 16 | "- distinct([numTasks]): return a new dataset that contains the distinct elements of the source dataset.\n", 17 | "- flatMap(func): similar to map, but each input item can be mapped to 0 or more output items (so func should return a Seq rather than a single item)." 
18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 1, 23 | "metadata": {}, 24 | "outputs": [ 25 | { 26 | "data": { 27 | "text/plain": [ 28 | "[2, 4, 6, 8]" 29 | ] 30 | }, 31 | "execution_count": 1, 32 | "metadata": {}, 33 | "output_type": "execute_result" 34 | } 35 | ], 36 | "source": [ 37 | "rdd = sc.parallelize([1, 2, 3, 4])\n", 38 | "rdd.map(lambda x: x * 2).collect()" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 2, 44 | "metadata": {}, 45 | "outputs": [ 46 | { 47 | "data": { 48 | "text/plain": [ 49 | "[2, 4]" 50 | ] 51 | }, 52 | "execution_count": 2, 53 | "metadata": {}, 54 | "output_type": "execute_result" 55 | } 56 | ], 57 | "source": [ 58 | "rdd.filter(lambda x: x % 2 == 0).collect()" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 3, 64 | "metadata": {}, 65 | "outputs": [ 66 | { 67 | "data": { 68 | "text/plain": [ 69 | "[4, 1, 2, 3]" 70 | ] 71 | }, 72 | "execution_count": 3, 73 | "metadata": {}, 74 | "output_type": "execute_result" 75 | } 76 | ], 77 | "source": [ 78 | "rdd = sc.parallelize([1, 4, 2, 2, 3])\n", 79 | "rdd.distinct().collect()" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 4, 85 | "metadata": {}, 86 | "outputs": [ 87 | { 88 | "data": { 89 | "text/plain": [ 90 | "[[1, 6], [2, 7], [3, 8]]" 91 | ] 92 | }, 93 | "execution_count": 4, 94 | "metadata": {}, 95 | "output_type": "execute_result" 96 | } 97 | ], 98 | "source": [ 99 | "rdd = sc.parallelize([1, 2, 3])\n", 100 | "rdd.map(lambda x: [x, x + 5]).collect()" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 5, 106 | "metadata": {}, 107 | "outputs": [ 108 | { 109 | "data": { 110 | "text/plain": [ 111 | "[1, 6, 2, 7, 3, 8]" 112 | ] 113 | }, 114 | "execution_count": 5, 115 | "metadata": {}, 116 | "output_type": "execute_result" 117 | } 118 | ], 119 | "source": [ 120 | "rdd.flatMap(lambda x: [x, x + 5]).collect()" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "# Some actions\n", 128 | "- reduce(func): aggregate dataset's elements using function func, func takes two arguments and returns one, and is commutative and associative so that it can be computed correctly in parallel.\n", 129 | "- take(n): return an array with the list n elements.\n", 130 | "- collect(): return all the elements as an array. WARNING: make sure will fit in driver program.\n", 131 | "- takeOrdered(n, key=func): return n elements ordred in ascending order or as specified by the optional key function." 
132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 6, 137 | "metadata": {}, 138 | "outputs": [ 139 | { 140 | "data": { 141 | "text/plain": [ 142 | "6" 143 | ] 144 | }, 145 | "execution_count": 6, 146 | "metadata": {}, 147 | "output_type": "execute_result" 148 | } 149 | ], 150 | "source": [ 151 | "rdd = sc.parallelize([1, 2, 3])\n", 152 | "rdd.reduce(lambda a, b: a * b)" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 7, 158 | "metadata": {}, 159 | "outputs": [ 160 | { 161 | "data": { 162 | "text/plain": [ 163 | "[1, 2]" 164 | ] 165 | }, 166 | "execution_count": 7, 167 | "metadata": {}, 168 | "output_type": "execute_result" 169 | } 170 | ], 171 | "source": [ 172 | "rdd.take(2)" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": 8, 178 | "metadata": {}, 179 | "outputs": [ 180 | { 181 | "data": { 182 | "text/plain": [ 183 | "[1, 2, 3]" 184 | ] 185 | }, 186 | "execution_count": 8, 187 | "metadata": {}, 188 | "output_type": "execute_result" 189 | } 190 | ], 191 | "source": [ 192 | "rdd.collect()" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": 9, 198 | "metadata": {}, 199 | "outputs": [ 200 | { 201 | "data": { 202 | "text/plain": [ 203 | "[5, 3, 2]" 204 | ] 205 | }, 206 | "execution_count": 9, 207 | "metadata": {}, 208 | "output_type": "execute_result" 209 | } 210 | ], 211 | "source": [ 212 | "rdd = sc.parallelize([5, 3, 1, 2])\n", 213 | "rdd.takeOrdered(3, lambda s: -1 * s)" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": 10, 219 | "metadata": {}, 220 | "outputs": [ 221 | { 222 | "name": "stdout", 223 | "output_type": "stream", 224 | "text": [ 225 | "5 5\n" 226 | ] 227 | } 228 | ], 229 | "source": [ 230 | "lines = sc.textFile(\"sample_text.txt\", 4)\n", 231 | "lines.cache()\n", 232 | "print lines.count(), lines.count()" 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "metadata": {}, 238 | "source": [ 239 | "# Key-Value RDDs\n", 240 | "- Similar to MapReduce, Spark supports key-value pairs.\n", 241 | "- Each element of a pair RDD is a (key, value) tuple.\n", 242 | "## Some Key-Value transformations\n", 243 | "- reduceByKey(func): return a new distributed dataset of (K, V) pairs where the values for each key are aggregated using the given reduce function func, which must be of type (V, V) -> V.\n", 244 | "- sortByKey(): return a new dataset of (K, V) pairs sorted by keys in ascending order.\n", 245 | "- groupByKey(): return a new dataset of (K, Iterable) pairs." 
246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": 11, 251 | "metadata": {}, 252 | "outputs": [ 253 | { 254 | "data": { 255 | "text/plain": [ 256 | "[(1, 2), (3, 4)]" 257 | ] 258 | }, 259 | "execution_count": 11, 260 | "metadata": {}, 261 | "output_type": "execute_result" 262 | } 263 | ], 264 | "source": [ 265 | "rdd = sc.parallelize([(1, 2), (3, 4)])\n", 266 | "rdd.collect()" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": 12, 272 | "metadata": {}, 273 | "outputs": [ 274 | { 275 | "data": { 276 | "text/plain": [ 277 | "[(1, 2), (3, 10)]" 278 | ] 279 | }, 280 | "execution_count": 12, 281 | "metadata": {}, 282 | "output_type": "execute_result" 283 | } 284 | ], 285 | "source": [ 286 | "rdd = sc.parallelize([(1, 2), (3, 4), (3, 6)])\n", 287 | "rdd.reduceByKey(lambda a, b: a + b).collect()" 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": 13, 293 | "metadata": {}, 294 | "outputs": [ 295 | { 296 | "data": { 297 | "text/plain": [ 298 | "[(1, 'a'), (1, 'b'), (2, 'c')]" 299 | ] 300 | }, 301 | "execution_count": 13, 302 | "metadata": {}, 303 | "output_type": "execute_result" 304 | } 305 | ], 306 | "source": [ 307 | "rdd = sc.parallelize([(1, \"a\"), (2, \"c\"), (1, \"b\")])\n", 308 | "rdd.sortByKey().collect()" 309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": 14, 314 | "metadata": {}, 315 | "outputs": [ 316 | { 317 | "data": { 318 | "text/plain": [ 319 | "[(1, ),\n", 320 | " (2, )]" 321 | ] 322 | }, 323 | "execution_count": 14, 324 | "metadata": {}, 325 | "output_type": "execute_result" 326 | } 327 | ], 328 | "source": [ 329 | "rdd.groupByKey().collect()" 330 | ] 331 | }, 332 | { 333 | "cell_type": "markdown", 334 | "metadata": {}, 335 | "source": [ 336 | "# Broadcast variables\n", 337 | "- Keep read-only variable cached on workers, ship to each worker only once instead of with each task" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": 15, 343 | "metadata": {}, 344 | "outputs": [ 345 | { 346 | "data": { 347 | "text/plain": [ 348 | "[1, 2, 3]" 349 | ] 350 | }, 351 | "execution_count": 15, 352 | "metadata": {}, 353 | "output_type": "execute_result" 354 | } 355 | ], 356 | "source": [ 357 | "# at the driver:\n", 358 | "bcVar = sc.broadcast([1, 2, 3])\n", 359 | "\n", 360 | "# at the worker (in code passed via a closure)\n", 361 | "bcVar.value" 362 | ] 363 | }, 364 | { 365 | "cell_type": "markdown", 366 | "metadata": {}, 367 | "source": [ 368 | "# Accumulators\n", 369 | "- Variables that can only be \"added\" to by associative op\n", 370 | "- Used to efficiently implement parallel counters and sums\n", 371 | "- Only driver can read an accumulator's value, not tasks\n", 372 | "- Tasks at workers cannot access accumulator's values\n", 373 | "- Tasks see accumulators as write-only variables\n", 374 | "- Actions: each task's update to accumulator is applied only once\n", 375 | "- Transformations: no guarantees (use only for debugging)\n", 376 | "- Types: integers, double, long, float" 377 | ] 378 | }, 379 | { 380 | "cell_type": "code", 381 | "execution_count": 16, 382 | "metadata": {}, 383 | "outputs": [ 384 | { 385 | "data": { 386 | "text/plain": [ 387 | "10" 388 | ] 389 | }, 390 | "execution_count": 16, 391 | "metadata": {}, 392 | "output_type": "execute_result" 393 | } 394 | ], 395 | "source": [ 396 | "accum = sc.accumulator(0)\n", 397 | "rdd = sc.parallelize([1, 2, 3, 4])\n", 398 | "def f(x):\n", 399 | " global accum\n", 400 | " accum += x\n", 401 | " \n", 402 | 
"rdd.foreach(f)\n", 403 | "accum.value" 404 | ] 405 | } 406 | ], 407 | "metadata": { 408 | "kernelspec": { 409 | "display_name": "Python 2", 410 | "language": "python", 411 | "name": "python2" 412 | }, 413 | "language_info": { 414 | "codemirror_mode": { 415 | "name": "ipython", 416 | "version": 2 417 | }, 418 | "file_extension": ".py", 419 | "mimetype": "text/x-python", 420 | "name": "python", 421 | "nbconvert_exporter": "python", 422 | "pygments_lexer": "ipython2", 423 | "version": "2.7.10" 424 | }, 425 | "name": "04_spark_essentials", 426 | "notebookId": 1227613790179004 427 | }, 428 | "nbformat": 4, 429 | "nbformat_minor": 1 430 | } 431 | -------------------------------------------------------------------------------- /word2vec/data/sample_tokenize.ann: -------------------------------------------------------------------------------- 1 | T1 B_W 0 3 Con 2 | T2 B_W 4 7 phà 3 | T3 B_W 8 10 bị 4 | T4 B_W 11 14 lật 5 | T5 B_W 15 22 nghiêng 6 | T6 B_W 23 27 sáng 7 | T7 B_W 28 32 16.4 8 | T8 B_W 33 36 khi 9 | T9 B_W 37 39 ca 10 | T10 I_W 40 42 nô 11 | T11 B_W 43 46 của 12 | T12 B_W 47 51 Cảnh 13 | T13 I_W 52 55 sát 14 | T14 B_W 56 60 biển 15 | T15 B_W 61 64 đến 16 | T16 I_W 65 68 nơi 17 | T17 B_W 69 73 trên 18 | T18 B_W 74 79 boong 19 | T19 B_W 80 83 tàu 20 | T20 B_W 84 89 không 21 | T21 B_W 90 92 có 22 | T22 B_W 93 97 hành 23 | T23 I_W 98 103 khách 24 | T24 B_W 104 107 nào 25 | T25 B_W 108 110 vì 26 | T26 B_W 111 115 hành 27 | T27 I_W 116 121 khách 28 | T28 B_W 122 127 không 29 | T29 I_W 128 132 được 30 | T30 B_W 133 138 thông 31 | T31 I_W 139 142 báo 32 | T32 B_W 143 146 lên 33 | T33 B_W 147 150 khu 34 | T34 I_W 151 154 vực 35 | T35 B_W 155 158 này 36 | T36 B_W 159 161 và 37 | T37 B_W 162 166 được 38 | T38 B_W 167 170 yêu 39 | T39 I_W 171 174 cầu 40 | T40 B_W 175 179 ngồi 41 | T41 I_W 180 183 yên 42 | T42 B_W 184 185 - 43 | T43 B_W 186 189 Ảnh 44 | T44 B_W 190 191 : 45 | T45 B_W 192 196 Cảnh 46 | T46 I_W 197 200 sát 47 | T47 B_W 201 205 biển 48 | T48 B_W 206 209 Hàn 49 | T49 I_W 210 214 Quốc 50 | T50 B_W 215 216 / 51 | T51 B_W 217 223 Yonhap 52 | T52 B_W 224 228 Việc 53 | T53 B_W 229 235 thường 54 | T54 I_W 236 241 xuyên 55 | T55 B_W 242 245 chở 56 | T56 B_W 246 250 hàng 57 | T57 I_W 251 254 hóa 58 | T58 B_W 255 258 quá 59 | T59 I_W 259 262 tải 60 | T60 B_W 263 267 trên 61 | T61 B_W 268 273 chiếc 62 | T62 B_W 274 277 phà 63 | T63 B_W 278 282 dùng 64 | T64 B_W 283 286 chở 65 | T65 B_W 287 292 khách 66 | T66 B_W 293 296 cho 67 | T67 I_W 297 301 thấy 68 | T68 B_W 302 304 lỗ 69 | T69 I_W 305 309 hổng 70 | T70 B_W 310 315 trong 71 | T71 B_W 316 320 việc 72 | T72 B_W 321 325 quản 73 | T73 I_W 326 328 lý 74 | T74 B_W 329 332 tàu 75 | T75 B_W 333 336 phà 76 | T76 B_W 337 340 lẫn 77 | T77 B_W 341 345 kiểu 78 | T78 B_W 346 350 kinh 79 | T79 I_W 351 356 doanh 80 | T80 B_W 357 360 bất 81 | T81 I_W 361 365 chấp 82 | T82 B_W 366 369 hậu 83 | T83 I_W 370 374 quả 84 | 85 | T84 B_W 375 377 Cơ 86 | T85 I_W 378 382 quan 87 | T86 B_W 383 387 đăng 88 | T87 I_W 388 392 kiểm 89 | T88 B_W 393 396 tàu 90 | T89 B_W 397 400 Hàn 91 | T90 I_W 401 405 Quốc 92 | T91 B_W 406 409 đầu 93 | T92 I_W 410 413 năm 94 | T93 B_W 414 418 2013 95 | T94 B_W 419 422 xem 96 | T95 I_W 423 426 xét 97 | T96 B_W 427 430 phà 98 | T97 B_W 431 436 Sewol 99 | T98 B_W 437 440 khi 100 | T99 B_W 441 444 phà 101 | T100 B_W 445 449 đăng 102 | T101 I_W 450 452 ký 103 | T102 B_W 453 456 cải 104 | T103 I_W 457 461 tiến 105 | T104 B_W 462 464 để 106 | T105 B_W 465 468 chở 107 | T106 B_W 469 473 thêm 108 | T107 B_W 474 479 nhiều 109 | T108 B_W 480 486 
khách 110 | 111 | T109 B_W 487 489 Cơ 112 | T110 I_W 490 494 quan 113 | T111 B_W 495 498 này 114 | T112 B_W 499 502 cho 115 | T113 B_W 503 506 phà 116 | T114 B_W 507 511 được 117 | T115 B_W 512 515 chở 118 | T116 B_W 516 520 thêm 119 | T117 B_W 521 525 hàng 120 | T118 I_W 526 529 hóa 121 | T119 B_W 530 533 tối 122 | T120 I_W 534 536 đa 123 | T121 B_W 537 540 987 124 | T122 B_W 541 544 tấn 125 | T123 B_W 545 546 ( 126 | T124 B_W 547 551 tăng 127 | T125 B_W 552 554 50 128 | T126 B_W 555 556 % 129 | T127 B_W 557 558 ) 130 | T128 B_W 559 562 với 131 | T129 B_W 563 567 điều 132 | T130 I_W 568 572 kiện 133 | T131 B_W 573 576 dằn 134 | T132 B_W 577 581 thêm 135 | T133 B_W 582 586 dưới 136 | T134 B_W 587 593 khoang 137 | T135 B_W 594 599 2.000 138 | T136 B_W 600 603 tấn 139 | T137 B_W 604 608 nước 140 | T138 B_W 609 611 để 141 | T139 B_W 612 615 cân 142 | T140 I_W 616 621 bằng 143 | 144 | T141 B_W 622 625 Tuy 145 | T142 I_W 626 631 nhiên 146 | T143 B_W 632 638 khuyến 147 | T144 I_W 639 642 cáo 148 | T145 B_W 643 646 này 149 | T146 B_W 647 650 chỉ 150 | T147 B_W 651 654 gửi 151 | T148 B_W 655 658 đến 152 | T149 B_W 659 663 công 153 | T150 I_W 664 666 ty 154 | T151 B_W 667 671 quản 155 | T152 I_W 672 674 lý 156 | T153 B_W 675 678 phà 157 | T154 B_W 679 681 mà 158 | T155 B_W 682 687 không 159 | T156 B_W 688 692 được 160 | T157 B_W 693 696 gửi 161 | T158 B_W 697 700 cho 162 | T159 B_W 701 705 Cảnh 163 | T160 I_W 706 709 sát 164 | T161 B_W 710 714 biển 165 | T162 B_W 715 718 lẫn 166 | T163 B_W 719 723 Hiệp 167 | T164 I_W 724 727 hội 168 | T165 B_W 728 731 tàu 169 | T166 I_W 732 736 biển 170 | T167 B_W 737 740 Hàn 171 | T168 I_W 741 746 Quốc 172 | 173 | T169 B_W 747 750 Phà 174 | T170 B_W 751 754 này 175 | T171 B_W 755 758 sau 176 | T172 I_W 759 761 đó 177 | T173 B_W 762 766 liên 178 | T174 I_W 767 770 tục 179 | T175 B_W 771 774 chở 180 | T176 B_W 775 779 hàng 181 | T177 I_W 780 783 hóa 182 | T178 B_W 784 788 vượt 183 | T179 B_W 789 792 tải 184 | T180 I_W 793 798 trọng 185 | T181 B_W 799 802 cho 186 | T182 I_W 803 807 phép 187 | T183 B_W 808 811 987 188 | T184 B_W 812 815 tấn 189 | T185 B_W 816 819 như 190 | T186 B_W 820 823 chở 191 | T187 B_W 824 827 hơn 192 | T188 B_W 828 833 2.000 193 | T189 B_W 834 837 tấn 194 | T190 B_W 838 842 hàng 195 | T191 B_W 843 846 qua 196 | T192 B_W 847 850 136 197 | T193 B_W 851 857 chuyến 198 | T194 B_W 858 860 và 199 | T195 B_W 861 865 trên 200 | T196 B_W 866 871 3.000 201 | T197 B_W 872 875 tấn 202 | T198 B_W 876 879 qua 203 | T199 B_W 880 882 12 204 | T200 B_W 883 887 lần 205 | 206 | T201 B_W 888 892 Tổng 207 | T202 I_W 893 897 cộng 208 | T203 B_W 898 901 đến 209 | T204 B_W 902 908 chuyến 210 | T205 B_W 909 913 cuối 211 | T206 I_W 914 918 cùng 212 | T207 B_W 919 923 ngày 213 | T208 B_W 924 933 16.4.2014 214 | T209 B_W 934 937 phà 215 | T210 B_W 938 941 này 216 | T211 B_W 942 945 chở 217 | T212 B_W 946 950 hàng 218 | T213 B_W 951 954 quá 219 | T214 I_W 955 958 tải 220 | T215 B_W 959 962 đến 221 | T216 B_W 963 966 246 222 | T217 B_W 967 971 lần 223 | 224 | T218 B_W 972 974 Và 225 | T219 B_W 975 981 chuyến 226 | T220 B_W 982 986 cuối 227 | T221 I_W 987 991 cùng 228 | T222 B_W 992 995 phà 229 | T223 B_W 996 999 chở 230 | T224 B_W 1000 1005 lượng 231 | T225 B_W 1006 1010 hàng 232 | T226 I_W 1011 1014 hóa 233 | T227 B_W 1015 1020 khủng 234 | T228 B_W 1021 1024 đến 235 | T229 B_W 1025 1030 3.608 236 | T230 B_W 1031 1034 tấn 237 | T231 B_W 1035 1039 cùng 238 | T232 B_W 1040 1043 476 239 | T233 B_W 1044 1049 người 240 | T234 B_W 1050 1053 kết 241 | T235 I_W 1054 1057 quả 242 | 
T236 B_W 1058 1060 là 243 | T237 B_W 1061 1064 phà 244 | T238 B_W 1065 1068 lật 245 | T239 B_W 1069 1076 nghiêng 246 | T240 B_W 1077 1082 ngoài 247 | T241 I_W 1083 1087 khơi 248 | T242 B_W 1088 1091 đảo 249 | T243 B_W 1092 1097 Jindo 250 | T244 B_W 1098 1102 sáng 251 | T245 B_W 1103 1112 16.4.2014 252 | T246 B_W 1113 1116 làm 253 | T247 B_W 1117 1120 hơn 254 | T248 B_W 1121 1124 300 255 | T249 B_W 1125 1129 hành 256 | T250 I_W 1130 1135 khách 257 | T251 B_W 1136 1141 thiệt 258 | T252 I_W 1142 1147 mạng 259 | 260 | T253 B_W 1148 1154 Thuyền 261 | T254 I_W 1155 1161 trưởng 262 | T255 B_W 1162 1165 Lee 263 | T256 B_W 1166 1170 Joon 264 | T257 B_W 1171 1172 - 265 | T258 B_W 1173 1177 seok 266 | T259 I_W 1178 1182 khai 267 | T260 B_W 1183 1187 hàng 268 | T261 I_W 1188 1191 hóa 269 | T262 B_W 1192 1195 chở 270 | T263 B_W 1196 1200 trên 271 | T264 B_W 1201 1204 phà 272 | T265 B_W 1205 1207 ít 273 | T266 I_W 1208 1211 hơn 274 | T267 B_W 1212 1215 các 275 | T268 B_W 1216 1219 con 276 | T269 I_W 1220 1222 số 277 | T270 B_W 1223 1226 của 278 | T271 B_W 1227 1230 báo 279 | T272 I_W 1231 1234 cáo 280 | T273 B_W 1235 1239 trên 281 | T274 B_W 1240 1244 rằng 282 | T275 B_W 1245 1248 khi 283 | T276 B_W 1249 1253 chìm 284 | T277 B_W 1254 1257 phà 285 | T278 B_W 1258 1260 có 286 | T279 B_W 1261 1264 chở 287 | T280 B_W 1265 1268 657 288 | T281 B_W 1269 1272 tấn 289 | T282 B_W 1273 1277 hàng 290 | T283 B_W 1278 1282 cùng 291 | T284 B_W 1283 1286 150 292 | T285 B_W 1287 1288 ô 293 | T286 I_W 1289 1292 tô 294 | 295 | T287 B_W 1293 1296 Tuy 296 | T288 I_W 1297 1302 nhiên 297 | T289 B_W 1303 1307 Cảnh 298 | T290 I_W 1308 1311 sát 299 | T291 B_W 1312 1316 biển 300 | T292 B_W 1317 1320 tìm 301 | T293 B_W 1321 1325 thấy 302 | T294 B_W 1326 1329 đến 303 | T295 B_W 1330 1333 180 304 | T296 B_W 1334 1335 ô 305 | T297 I_W 1336 1338 tô 306 | T298 B_W 1339 1344 trong 307 | T299 I_W 1345 1349 lòng 308 | T300 B_W 1350 1353 phà 309 | T301 B_W 1354 1358 dưới 310 | T302 B_W 1359 1363 biển 311 | T303 B_W 1364 1366 ! 
312 | 313 | T304 B_W 1367 1370 Các 314 | T305 B_W 1371 1377 chuyên 315 | T306 I_W 1378 1381 gia 316 | T307 B_W 1382 1385 tin 317 | T308 B_W 1386 1390 rằng 318 | T309 B_W 1391 1394 khi 319 | T310 B_W 1395 1398 chở 320 | T311 B_W 1399 1402 quá 321 | T312 I_W 1403 1406 tải 322 | T313 B_W 1407 1410 chỉ 323 | T314 B_W 1411 1414 cần 324 | T315 B_W 1415 1418 đảo 325 | T316 B_W 1419 1424 hướng 326 | T317 B_W 1425 1428 một 327 | T318 I_W 1429 1433 chút 328 | T319 B_W 1434 1438 cũng 329 | T320 B_W 1439 1441 có 330 | T321 I_W 1442 1445 thể 331 | T322 B_W 1446 1449 làm 332 | T323 B_W 1450 1453 phà 333 | T324 B_W 1454 1456 bị 334 | T325 B_W 1457 1460 lật 335 | T326 B_W 1461 1463 vì 336 | T327 B_W 1464 1467 mất 337 | T328 B_W 1468 1471 cân 338 | T329 I_W 1472 1477 bằng 339 | 340 | T330 B_W 1478 1480 Và 341 | T331 B_W 1481 1484 các 342 | T332 B_W 1485 1487 dữ 343 | T333 I_W 1488 1492 liệu 344 | T334 B_W 1493 1497 hành 345 | T335 I_W 1498 1503 trình 346 | T336 B_W 1504 1507 cho 347 | T337 I_W 1508 1512 thấy 348 | T338 B_W 1513 1516 con 349 | T339 B_W 1517 1520 phà 350 | T340 B_W 1521 1523 đã 351 | T341 B_W 1524 1528 quẹo 352 | T342 B_W 1529 1532 một 353 | T343 B_W 1533 1536 góc 354 | T344 B_W 1537 1539 45 355 | T345 B_W 1540 1542 độ 356 | T346 B_W 1543 1547 ngay 357 | T347 I_W 1548 1551 khi 358 | T348 B_W 1552 1557 chìm 359 | 360 | T349 B_W 1558 1561 Lối 361 | T350 I_W 1562 1565 vào 362 | T351 B_W 1566 1570 cảng 363 | T352 B_W 1571 1575 trên 364 | T353 B_W 1576 1579 đảo 365 | T354 B_W 1580 1585 Jindo 366 | T355 B_W 1586 1589 đầy 367 | T356 B_W 1590 1594 vòng 368 | T357 I_W 1595 1598 hoa 369 | T358 B_W 1599 1603 tang 370 | T359 B_W 1604 1609 tưởng 371 | T360 I_W 1610 1613 nhớ 372 | T361 B_W 1614 1617 các 373 | T362 B_W 1618 1621 nạn 374 | T363 I_W 1622 1626 nhân 375 | T364 B_W 1627 1629 vụ 376 | T365 B_W 1630 1634 chìm 377 | T366 B_W 1635 1638 phà 378 | T367 I_W 1639 1644 Sewol 379 | T368 B_W 1645 1649 ngày 380 | T369 B_W 1650 1659 28.4.2014 381 | T370 B_W 1660 1661 - 382 | T371 B_W 1662 1665 Ảnh 383 | T372 B_W 1666 1667 : 384 | T373 B_W 1668 1675 Reuters 385 | T374 B_W 1676 1679 Thợ 386 | T375 I_W 1680 1683 lặn 387 | T376 B_W 1684 1688 hiện 388 | T377 B_W 1689 1692 tìm 389 | T378 I_W 1693 1697 kiếm 390 | T379 B_W 1698 1701 gần 391 | T380 I_W 1702 1705 hết 392 | T381 B_W 1706 1709 các 393 | T382 B_W 1710 1715 phòng 394 | T383 B_W 1716 1720 trên 395 | T384 B_W 1721 1724 phà 396 | T385 B_W 1725 1730 Sewol 397 | T386 B_W 1731 1735 chìm 398 | T387 B_W 1736 1740 dưới 399 | T388 B_W 1741 1745 biển 400 | T389 B_W 1746 1747 - 401 | T390 B_W 1748 1751 Ảnh 402 | T391 B_W 1752 1753 : 403 | T392 B_W 1754 1761 Reuters 404 | T393 B_W 1762 1766 Ngày 405 | T394 B_W 1767 1770 4.5 406 | T395 B_W 1771 1775 Tổng 407 | T396 I_W 1776 1781 thống 408 | T397 B_W 1782 1785 Hàn 409 | T398 I_W 1786 1790 Quốc 410 | T399 I_W 1791 1795 Park 411 | T400 B_W 1796 1800 Geun 412 | T401 B_W 1801 1802 - 413 | T402 B_W 1803 1806 hye 414 | T403 B_W 1807 1809 có 415 | T404 B_W 1810 1816 chuyến 416 | T405 B_W 1817 1821 thăm 417 | T406 B_W 1822 1825 lần 418 | T407 B_W 1826 1827 2 419 | T408 B_W 1828 1831 các 420 | T409 B_W 1832 1835 gia 421 | T410 I_W 1836 1840 đình 422 | T411 B_W 1841 1844 nạn 423 | T412 I_W 1845 1849 nhân 424 | T413 B_W 1850 1852 vụ 425 | T414 B_W 1853 1857 chìm 426 | T415 B_W 1858 1861 phà 427 | T416 B_W 1862 1866 đang 428 | T417 B_W 1867 1870 tạm 429 | T418 I_W 1871 1874 trú 430 | T419 B_W 1875 1876 ở 431 | T420 B_W 1877 1880 đảo 432 | T421 I_W 1881 1887 Jindo 433 | 434 | T422 B_W 1888 1890 Bà 435 | T423 B_W 1891 1895 Park 436 
| T424 B_W 1896 1899 nói 437 | T425 B_W 1900 1904 rằng 438 | T426 B_W 1905 1907 bà 439 | T427 B_W 1908 1912 cũng 440 | T428 B_W 1913 1917 từng 441 | T429 B_W 1918 1921 đau 442 | T430 I_W 1922 1925 khổ 443 | T431 B_W 1926 1928 vì 444 | T432 B_W 1929 1932 mất 445 | T433 I_W 1933 1936 mát 446 | T434 B_W 1937 1940 gia 447 | T435 I_W 1941 1945 đình 448 | T436 B_W 1946 1949 nên 449 | T437 B_W 1950 1954 hiểu 450 | T438 I_W 1955 1957 rõ 451 | T439 B_W 1958 1961 tâm 452 | T440 I_W 1962 1967 trạng 453 | T441 B_W 1968 1971 của 454 | T442 B_W 1972 1975 mọi 455 | T443 I_W 1976 1982 người 456 | 457 | T444 B_W 1983 1985 Bà 458 | T445 B_W 1986 1989 hứa 459 | T446 B_W 1990 1992 sẽ 460 | T447 B_W 1993 1998 trừng 461 | T448 I_W 1999 2003 phạt 462 | T449 B_W 2004 2007 các 463 | T450 B_W 2008 2010 cá 464 | T451 I_W 2011 2015 nhân 465 | T452 B_W 2016 2020 liên 466 | T453 I_W 2021 2025 quan 467 | T454 B_W 2026 2028 vụ 468 | T455 B_W 2029 2033 chìm 469 | T456 B_W 2034 2037 phà 470 | T457 B_W 2038 2042 này 471 | 472 | T458 B_W 2043 2047 Tính 473 | T459 B_W 2048 2051 đến 474 | T460 B_W 2052 2056 ngày 475 | T461 B_W 2057 2060 4.5 476 | T462 B_W 2061 2063 đã 477 | T463 B_W 2064 2066 có 478 | T464 B_W 2067 2070 244 479 | T465 B_W 2071 2074 thi 480 | T466 I_W 2075 2078 thể 481 | T467 B_W 2079 2083 được 482 | T468 B_W 2084 2087 tìm 483 | T469 B_W 2088 2092 thấy 484 | T470 B_W 2093 2096 vẫn 485 | T471 B_W 2097 2100 còn 486 | T472 B_W 2101 2103 58 487 | T473 B_W 2104 2109 người 488 | T474 B_W 2110 2113 mất 489 | T475 I_W 2114 2119 tích 490 | 491 | T476 B_W 2120 2122 Số 492 | T477 I_W 2123 2128 người 493 | T478 B_W 2129 2133 được 494 | T479 B_W 2134 2137 cứu 495 | T480 I_W 2138 2142 sống 496 | T481 B_W 2143 2145 là 497 | T482 B_W 2146 2149 174 498 | T483 B_W 2150 2153 gồm 499 | T484 B_W 2154 2159 22/29 500 | T485 B_W 2160 2166 thuyền 501 | T486 I_W 2167 2172 viên 502 | 503 | T487 B_W 2173 2177 Phát 504 | T488 I_W 2178 2182 ngôn 505 | T489 B_W 2183 2187 viên 506 | T490 B_W 2188 2191 Lực 507 | T491 I_W 2192 2197 lượng 508 | T492 B_W 2198 2201 cứu 509 | T493 I_W 2202 2204 hộ 510 | T494 B_W 2205 2208 phà 511 | T495 B_W 2209 2214 Sewol 512 | T496 B_W 2215 2218 ông 513 | T497 B_W 2219 2221 Ko 514 | T498 I_W 2222 2227 Myung 515 | T499 B_W 2228 2229 - 516 | T500 B_W 2230 2234 seok 517 | T501 B_W 2235 2238 cho 518 | T502 I_W 2239 2243 biết 519 | T503 B_W 2244 2247 thợ 520 | T504 I_W 2248 2251 lặn 521 | T505 B_W 2252 2254 đã 522 | T506 B_W 2255 2258 tìm 523 | T507 I_W 2259 2263 kiếm 524 | T508 B_W 2264 2268 được 525 | T509 B_W 2269 2271 60 526 | T510 B_W 2272 2277 trong 527 | T511 B_W 2278 2282 tổng 528 | T512 I_W 2283 2285 số 529 | T513 B_W 2286 2288 64 530 | T514 B_W 2289 2294 phòng 531 | T515 B_W 2295 2298 của 532 | T516 B_W 2299 2302 con 533 | T517 B_W 2303 2306 phà 534 | T518 B_W 2307 2311 dưới 535 | T519 B_W 2312 2316 lòng 536 | T520 I_W 2317 2321 biển 537 | -------------------------------------------------------------------------------- /pyspark/study_apache_spark/rdd_co_ban.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# RDD cơ bản\n", 8 | "- Programmer chỉ định số lượng partitions.\n", 9 | "- Driver tự phân chia partition đến các Workers tương ứng.\n", 10 | "- Master parameter chỉ định số lượng workers cụ thể.\n", 11 | "\n", 12 | "# Các hàm transformations\n", 13 | "- map(func): trả về tập dữ liệu phân tán mới bằng cách ánh xạ từng phần tử tập dữ liệu nguồn qua hàm func do programmer 
định nghĩa.\n", 14 | "- filter(func): trả về tập dữ liệu phân tán mới bằng cách lọc ra các phần tử tập dữ liệu nguồn thoả điều kiện hàm func định nghĩa.\n", 15 | "- distinct(): trả về tập dữ liệu phân tán mới chỉ chứa các phần tử riêng biệt từ tập dữ liệu nguồn.\n", 16 | "- flatMap(func): tương tự như map(), nhưng có thể ánh xạ các phần tử nguồn sang 0 hoặc nhiều phần tử ở tập dữ liệu mới. Hàm func thường trả về kiểu Seg thay vì phần tử đơn lẻ." 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 1, 22 | "metadata": {}, 23 | "outputs": [ 24 | { 25 | "name": "stdout", 26 | "output_type": "stream", 27 | "text": [ 28 | "http://localhost:4040/jobs/\n" 29 | ] 30 | } 31 | ], 32 | "source": [ 33 | "print \"http://localhost:4040/jobs/\"" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 2, 39 | "metadata": {}, 40 | "outputs": [ 41 | { 42 | "data": { 43 | "text/plain": [ 44 | "PythonRDD[1] at RDD at PythonRDD.scala:48" 45 | ] 46 | }, 47 | "execution_count": 2, 48 | "metadata": {}, 49 | "output_type": "execute_result" 50 | } 51 | ], 52 | "source": [ 53 | "rdd = sc.parallelize([1, 2, 3, 4])\n", 54 | "rdd.map(lambda x: x * 2)" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 3, 60 | "metadata": {}, 61 | "outputs": [ 62 | { 63 | "data": { 64 | "text/plain": [ 65 | "PythonRDD[2] at RDD at PythonRDD.scala:48" 66 | ] 67 | }, 68 | "execution_count": 3, 69 | "metadata": {}, 70 | "output_type": "execute_result" 71 | } 72 | ], 73 | "source": [ 74 | "rdd.filter(lambda x: x % 2 == 0)" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 4, 80 | "metadata": {}, 81 | "outputs": [ 82 | { 83 | "data": { 84 | "text/plain": [ 85 | "PythonRDD[8] at RDD at PythonRDD.scala:48" 86 | ] 87 | }, 88 | "execution_count": 4, 89 | "metadata": {}, 90 | "output_type": "execute_result" 91 | } 92 | ], 93 | "source": [ 94 | "rdd = sc.parallelize([1, 4, 2, 2, 3])\n", 95 | "rdd.distinct()" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 5, 101 | "metadata": {}, 102 | "outputs": [ 103 | { 104 | "data": { 105 | "text/plain": [ 106 | "PythonRDD[10] at RDD at PythonRDD.scala:48" 107 | ] 108 | }, 109 | "execution_count": 5, 110 | "metadata": {}, 111 | "output_type": "execute_result" 112 | } 113 | ], 114 | "source": [ 115 | "rdd = sc.parallelize([1, 2, 3])\n", 116 | "rdd.map(lambda x: [x, x + 5])" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 6, 122 | "metadata": {}, 123 | "outputs": [ 124 | { 125 | "data": { 126 | "text/plain": [ 127 | "PythonRDD[11] at RDD at PythonRDD.scala:48" 128 | ] 129 | }, 130 | "execution_count": 6, 131 | "metadata": {}, 132 | "output_type": "execute_result" 133 | } 134 | ], 135 | "source": [ 136 | "rdd.flatMap(lambda x: [x, x + 5])" 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "metadata": {}, 142 | "source": [ 143 | "# Các hàm actions\n", 144 | "- reduce(func): aggregate từng phần tử tập dữ liệu thông qua hàm func, hàm func nhận 2 đối số và trả về 1 giá trị.\n", 145 | "- take(n): trả về mảng n phần tử.\n", 146 | "- collect(): trả về tất cả các phần tử. CHÚ Ý: phải đảm bảo máy Driver đủ dung lượng để chứa kết quả trả về.\n", 147 | "- takeOrdered(n, key=func): trả về n phần tử sắp xếp tăng dần hoặc sắp xếp theo hàm key." 
148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": 7, 153 | "metadata": {}, 154 | "outputs": [ 155 | { 156 | "data": { 157 | "text/plain": [ 158 | "6" 159 | ] 160 | }, 161 | "execution_count": 7, 162 | "metadata": {}, 163 | "output_type": "execute_result" 164 | } 165 | ], 166 | "source": [ 167 | "rdd = sc.parallelize([1, 2, 3])\n", 168 | "rdd.reduce(lambda a, b: a * b)" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 8, 174 | "metadata": {}, 175 | "outputs": [ 176 | { 177 | "data": { 178 | "text/plain": [ 179 | "[1, 2]" 180 | ] 181 | }, 182 | "execution_count": 8, 183 | "metadata": {}, 184 | "output_type": "execute_result" 185 | } 186 | ], 187 | "source": [ 188 | "rdd.take(2)" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": 9, 194 | "metadata": {}, 195 | "outputs": [ 196 | { 197 | "data": { 198 | "text/plain": [ 199 | "[1, 2, 3]" 200 | ] 201 | }, 202 | "execution_count": 9, 203 | "metadata": {}, 204 | "output_type": "execute_result" 205 | } 206 | ], 207 | "source": [ 208 | "rdd.collect()" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": 10, 214 | "metadata": {}, 215 | "outputs": [ 216 | { 217 | "data": { 218 | "text/plain": [ 219 | "[5, 3, 2]" 220 | ] 221 | }, 222 | "execution_count": 10, 223 | "metadata": {}, 224 | "output_type": "execute_result" 225 | } 226 | ], 227 | "source": [ 228 | "rdd = sc.parallelize([5, 3, 1, 2])\n", 229 | "rdd.takeOrdered(3, lambda s: -1 * s)" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": 11, 235 | "metadata": {}, 236 | "outputs": [ 237 | { 238 | "data": { 239 | "text/plain": [ 240 | "[1, 2, 3]" 241 | ] 242 | }, 243 | "execution_count": 11, 244 | "metadata": {}, 245 | "output_type": "execute_result" 246 | } 247 | ], 248 | "source": [ 249 | "rdd.takeOrdered(3)" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": 12, 255 | "metadata": {}, 256 | "outputs": [ 257 | { 258 | "name": "stdout", 259 | "output_type": "stream", 260 | "text": [ 261 | "5\n" 262 | ] 263 | } 264 | ], 265 | "source": [ 266 | "lines = sc.textFile(\"sample_text.txt\", 4)\n", 267 | "print lines.count()" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 13, 273 | "metadata": {}, 274 | "outputs": [ 275 | { 276 | "name": "stdout", 277 | "output_type": "stream", 278 | "text": [ 279 | "5\n" 280 | ] 281 | } 282 | ], 283 | "source": [ 284 | "print lines.count()" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": 14, 290 | "metadata": {}, 291 | "outputs": [ 292 | { 293 | "name": "stdout", 294 | "output_type": "stream", 295 | "text": [ 296 | "5\n", 297 | "5\n" 298 | ] 299 | } 300 | ], 301 | "source": [ 302 | "lines = sc.textFile(\"sample_text.txt\", 4)\n", 303 | "lines.cache()\n", 304 | "print lines.count()\n", 305 | "print lines.count()" 306 | ] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "metadata": {}, 311 | "source": [ 312 | "# Key-Value RDDs\n", 313 | "- Tương tự như Map Reduce, Spark hỗ trợ Key-Value pairs.\n", 314 | "- Mỗi phần tử của Pair RDD là một cặp tuple.\n", 315 | "## Some Key-Value transformation\n", 316 | "- reduceByKey(func): trả về tập dữ liệu phân tán mới (K, V). Trong đó, các giá trị cho từng key được tổng hợp bằng hàm reduce func có dạng (V, V) -> V.\n", 317 | "- sortByKey(): trả về tập dữ liệu phân tán mới (K, V) sắp xếp tăng dần theo keys.\n", 318 | "- groupByKey(): trả về tập dữ liệu phân tán mới (K, Iterable)." 
319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": 15, 324 | "metadata": {}, 325 | "outputs": [ 326 | { 327 | "data": { 328 | "text/plain": [ 329 | "[(1, 2), (3, 4)]" 330 | ] 331 | }, 332 | "execution_count": 15, 333 | "metadata": {}, 334 | "output_type": "execute_result" 335 | } 336 | ], 337 | "source": [ 338 | "rdd = sc.parallelize([(1, 2), (3, 4)])\n", 339 | "rdd.collect()" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": 16, 345 | "metadata": {}, 346 | "outputs": [ 347 | { 348 | "data": { 349 | "text/plain": [ 350 | "[(1, 2), (3, 10)]" 351 | ] 352 | }, 353 | "execution_count": 16, 354 | "metadata": {}, 355 | "output_type": "execute_result" 356 | } 357 | ], 358 | "source": [ 359 | "rdd = sc.parallelize([(1, 2), (3, 4), (3, 6)])\n", 360 | "rdd.reduceByKey(lambda a, b: a + b).collect()" 361 | ] 362 | }, 363 | { 364 | "cell_type": "code", 365 | "execution_count": 17, 366 | "metadata": {}, 367 | "outputs": [ 368 | { 369 | "data": { 370 | "text/plain": [ 371 | "[(1, 'a'), (1, 'b'), (2, 'c')]" 372 | ] 373 | }, 374 | "execution_count": 17, 375 | "metadata": {}, 376 | "output_type": "execute_result" 377 | } 378 | ], 379 | "source": [ 380 | "rdd = sc.parallelize([(1, \"a\"), (2, \"c\"), (1, \"b\")])\n", 381 | "rdd.sortByKey().collect()" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": 18, 387 | "metadata": { 388 | "scrolled": true 389 | }, 390 | "outputs": [ 391 | { 392 | "data": { 393 | "text/plain": [ 394 | "[(1, ),\n", 395 | " (2, )]" 396 | ] 397 | }, 398 | "execution_count": 18, 399 | "metadata": {}, 400 | "output_type": "execute_result" 401 | } 402 | ], 403 | "source": [ 404 | "rdd.groupByKey().collect()" 405 | ] 406 | }, 407 | { 408 | "cell_type": "markdown", 409 | "metadata": {}, 410 | "source": [ 411 | "# X.join(Y)\n", 412 | "- Trả về tất cả các phần tử RDD keys khớp với X và Y.\n", 413 | "- Mỗi cặp có định dạng (k, (v1, v2)). Trong đó, (k, v1) thuộc X và (k, v2) thuộc Y." 414 | ] 415 | }, 416 | { 417 | "cell_type": "code", 418 | "execution_count": 19, 419 | "metadata": {}, 420 | "outputs": [ 421 | { 422 | "data": { 423 | "text/plain": [ 424 | "[('a', (1, 2)), ('a', (1, 3))]" 425 | ] 426 | }, 427 | "execution_count": 19, 428 | "metadata": {}, 429 | "output_type": "execute_result" 430 | } 431 | ], 432 | "source": [ 433 | "x = sc.parallelize([(\"a\", 1), (\"b\", 4)])\n", 434 | "y = sc.parallelize([(\"a\", 2), (\"a\", 3)])\n", 435 | "sorted(x.join(y).collect())" 436 | ] 437 | }, 438 | { 439 | "cell_type": "markdown", 440 | "metadata": {}, 441 | "source": [ 442 | "# X.leftOuterJoin(Y)\n", 443 | "- Với mỗi phần tử (k, v) thuộc X, kết quả trả về có thể là:\n", 444 | " - Tất cả các cặp (k, (v, w)) với w thuộc Y.\n", 445 | " - Hoặc các cặp (k, (v, None)) nếu không có phần tử nào thuộc Y có key là k." 
446 | ] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "execution_count": 20, 451 | "metadata": {}, 452 | "outputs": [ 453 | { 454 | "data": { 455 | "text/plain": [ 456 | "[('a', (1, 2)), ('b', (4, None))]" 457 | ] 458 | }, 459 | "execution_count": 20, 460 | "metadata": {}, 461 | "output_type": "execute_result" 462 | } 463 | ], 464 | "source": [ 465 | "x = sc.parallelize([(\"a\", 1), (\"b\", 4)])\n", 466 | "y = sc.parallelize([(\"a\", 2)])\n", 467 | "sorted(x.leftOuterJoin(y).collect())" 468 | ] 469 | }, 470 | { 471 | "cell_type": "markdown", 472 | "metadata": {}, 473 | "source": [ 474 | "# X.rightOuterJoin(Y)\n", 475 | "- Với mỗi phần tử (k, w) thuộc Y, kết quả trả về có thể là:\n", 476 | " - Tất cả các cặp (k, (v, w)) với v thuộc X.\n", 477 | " - Hoặc các cặp (k, (None, w)) nếu không có phần tử nào thuộc X có key là k." 478 | ] 479 | }, 480 | { 481 | "cell_type": "code", 482 | "execution_count": 21, 483 | "metadata": {}, 484 | "outputs": [ 485 | { 486 | "data": { 487 | "text/plain": [ 488 | "[('a', (1, 2)), ('b', (None, 4))]" 489 | ] 490 | }, 491 | "execution_count": 21, 492 | "metadata": {}, 493 | "output_type": "execute_result" 494 | } 495 | ], 496 | "source": [ 497 | "x = sc.parallelize([(\"a\", 1)])\n", 498 | "y = sc.parallelize([(\"a\", 2), (\"b\", 4)])\n", 499 | "sorted(x.rightOuterJoin(y).collect())" 500 | ] 501 | }, 502 | { 503 | "cell_type": "markdown", 504 | "metadata": {}, 505 | "source": [ 506 | "# X.fullOuterJoin(Y)\n", 507 | "- Với mỗi phần tử (k, v) thuộc X, kết quả trả về có thể là:\n", 508 | " - Tất cả các cặp (k, (v, w)) với w thuộc Y.\n", 509 | " - Hoặc các cặp (k, (v, None)) nếu không có phần tử nào thuộc Y có key là k.\n", 510 | "- Với mỗi phần tử (k, w) thuộc Y, kết quả trả về có thể là:\n", 511 | " - Tất cả các cặp (k, (v, w)) với v thuộc X.\n", 512 | " - Hoặc các cặp (k, (None, w)) nếu không có phần tử nào thuộc X có key là k. " 513 | ] 514 | }, 515 | { 516 | "cell_type": "code", 517 | "execution_count": 22, 518 | "metadata": {}, 519 | "outputs": [ 520 | { 521 | "data": { 522 | "text/plain": [ 523 | "[('a', (1, 2)), ('b', (4, None)), ('c', (None, 8))]" 524 | ] 525 | }, 526 | "execution_count": 22, 527 | "metadata": {}, 528 | "output_type": "execute_result" 529 | } 530 | ], 531 | "source": [ 532 | "x = sc.parallelize([(\"a\", 1), (\"b\", 4)])\n", 533 | "y = sc.parallelize([(\"a\", 2), (\"c\", 8)])\n", 534 | "sorted(x.fullOuterJoin(y).collect())" 535 | ] 536 | } 537 | ], 538 | "metadata": { 539 | "kernelspec": { 540 | "display_name": "Python 2", 541 | "language": "python", 542 | "name": "python2" 543 | }, 544 | "language_info": { 545 | "codemirror_mode": { 546 | "name": "ipython", 547 | "version": 2 548 | }, 549 | "file_extension": ".py", 550 | "mimetype": "text/x-python", 551 | "name": "python", 552 | "nbconvert_exporter": "python", 553 | "pygments_lexer": "ipython2", 554 | "version": "2.7.10" 555 | }, 556 | "name": "04_spark_essentials", 557 | "notebookId": 1227613790179004 558 | }, 559 | "nbformat": 4, 560 | "nbformat_minor": 1 561 | } 562 | --------------------------------------------------------------------------------