├── .gitignore ├── README.md ├── Schedule_Chinese.md ├── license.md ├── spark-summit-15.md ├── w1 └── lecture_note.md ├── w2 └── lecture_note.md └── w3 └── w3demo ├── demo1-python └── HDR │ ├── README.md │ ├── papers │ ├── 00b495249dfb217637000000.pdf │ ├── 10.1.1.110.4774.pdf │ ├── 1412.8307v1.pdf │ ├── Lecture21.pdf │ ├── Margulis-OpticalCharacterRecognition.pdf │ └── mdp_paper.pdf │ └── src │ ├── train_flow.py │ ├── train_flow_sklearn.py │ ├── train_knn.py │ ├── train_lr.py │ ├── train_rbm.py │ ├── train_rf.py │ └── train_svm.py ├── demo1 └── readme.txt ├── demo2 └── custom-serving │ ├── .gitignore │ ├── build.sbt │ ├── data │ └── sample_disabled_items.txt │ ├── engine.json │ ├── project │ ├── assembly.sbt │ └── plugins.sbt │ └── src │ └── main │ └── scala │ ├── ALSAlgorithm.scala │ ├── ALSModel.scala │ ├── DataSource.scala │ ├── Engine.scala │ ├── Preparator.scala │ └── Serving.scala └── demo3 └── recommend-dish ├── core └── src │ ├── main │ └── scala │ │ └── com │ │ └── amli │ │ └── w3 │ │ └── recommend │ │ ├── DataSource.scala │ │ ├── Engine.scala │ │ ├── ItemToItem.scala │ │ ├── ItemToItemModel.scala │ │ ├── Preparator.scala │ │ └── Serving.scala │ └── test │ └── scala │ └── com │ └── amli │ └── w3 │ └── recommend │ └── DataSourceSuite.scala ├── datalyze └── src │ ├── main │ ├── resources │ │ └── log4j.properties │ └── scala │ │ └── com │ │ └── amli │ │ └── w3 │ │ └── recommend │ │ ├── Logging.scala │ │ ├── io │ │ ├── DataLoader.scala │ │ ├── DataTransfer.scala │ │ ├── FeaturedData.scala │ │ ├── MySQLLoader.scala │ │ └── Utils.scala │ │ ├── model │ │ ├── ItemSimilarity.scala │ │ ├── Model.scala │ │ └── PopularityModel.scala │ │ └── package.scala │ └── test │ └── scala │ └── com │ └── amli │ └── w3 │ └── recommend │ ├── io │ ├── CliqueStatSuite.scala │ ├── FeaturedDataSuite.scala │ └── MySQLLoaderSuite.scala │ └── model │ ├── CommunityModelSuite.scala │ ├── ItemSimilaritySuite.scala │ └── PregelUnfoldingSuite.scala ├── engine.json ├── make-distribution.sh ├── project ├── build.scala └── plugins.sbt └── requests.sh /.gitignore: -------------------------------------------------------------------------------- 1 | *.tgz 2 | manifest.json 3 | *.tar.gz 4 | *.csv 5 | *~ 6 | *.swp 7 | *.ipr 8 | *.iml 9 | *.iws 10 | .idea/ 11 | sbt/*.jar 12 | .settings 13 | .cache 14 | .generated-mima* 15 | /build/ 16 | work/ 17 | out/ 18 | .DS_Store 19 | third_party/libmesos.so 20 | third_party/libmesos.dylib 21 | conf/java-opts 22 | conf/spark-env.sh 23 | conf/streaming-env.sh 24 | conf/log4j.properties 25 | conf/spark-defaults.conf 26 | conf/hive-site.xml 27 | docs/_site 28 | docs/api 29 | target/ 30 | reports/ 31 | .project 32 | .classpath 33 | .scala_dependencies 34 | lib_managed/ 35 | src_managed/ 36 | project/boot/ 37 | project/plugins/project/build.properties 38 | project/build/target/ 39 | project/plugins/target/ 40 | project/plugins/lib_managed/ 41 | project/plugins/src_managed/ 42 | logs/ 43 | log/ 44 | spark-tests.log 45 | streaming-tests.log 46 | dependency-reduced-pom.xml 47 | .ensime 48 | .ensime_lucene 49 | checkpoint 50 | derby.log 51 | dist/ 52 | spark-*-bin.tar.gz 53 | unit-tests.log 54 | /lib/ 55 | rat-results.txt 56 | scalastyle.txt 57 | conf/*.conf 58 | scalastyle-output.xml 59 | 60 | # For Hive 61 | metastore_db/ 62 | metastore/ 63 | warehouse/ 64 | TempStatsStore/ 65 | sql/hive-thriftserver/test_warehouses 66 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 
# Applied Machine Learning and Implementation 2 | 3 | ## Overview 4 | 5 | 12 weeks, 2 hours per week. 6 | 7 | 20 min per episode, so six episodes per week. 8 | 9 | This course will cover: 10 | 11 | \*\*\*\*\* **Spark MLlib** 12 | 13 | \*\*\*\* **ML Pipeline and GraphX** 14 | 15 | \*\*\* **Spark Core and Spark SQL** 16 | 17 | \*\* **Spark Streaming** 18 | 19 | \* **Scikit-learn for reference.** 20 | 21 | ## Textbooks 22 | 23 | 1. Advanced Analytics with Spark 24 | 2. Machine Learning with Spark 25 | 3. The Lion Way: Machine Learning plus Intelligent Optimization 26 | 4. Others... 27 | 28 | ## week 1 Introduction 29 | 30 | 1. Spark ABC 31 | 2. Machine learning ABC 32 | 3. Graph Computing ABC 33 | 4. Demos for Spark, MLlib, and GraphX 34 | 35 | ## week 2 Generalized Linear Model 36 | 37 | 1. Logistic regression 38 | 2. Linear regression 39 | 3. SVM 40 | 4. LASSO 41 | 5. Ridge regression 42 | 6. Applied demos such as handwritten digit recognition 43 | 44 | ## week 3 Recommendation 45 | 46 | 1. Recommendation with ALS 47 | 2. Singular Value Decomposition 48 | 3. The implementation in both MLlib and Mahout 49 | 4. Applied demo of recommendation with PredictionIO. 50 | 51 | ## week 4 Clustering 52 | 53 | 1. k-means 54 | 2. LDA 55 | 3. Applied demo of geo-location clustering and topic modeling 56 | 57 | ## week 5 Streaming Machine Learning 58 | 59 | 1. Lambda Architecture 60 | 2. Parameter Server 61 | 3. Several streaming algorithms from the Freeman Lab 62 | 4. Applied demo such as the zebrafish experiment 63 | 64 | ## week 6 ML Pipeline 65 | 66 | 1. Pipeline of Scikit-learn 67 | 2. Pipeline of Spark (DataFrame, ML Pipeline, etc.) 68 | 3. Applied demo (TBD) 69 | 70 | ## week 7 Scientific Computing 71 | 72 | 1. Scientific computing and caveats of matrix computation 73 | 2. Matrix libs (in C/Fortran and Java) 74 | 3. Matrices in MLlib 75 | 4. Applied demo (TBD) 76 | 77 | ## week 8 The Graph Computation Model 78 | 79 | 1. Graph computing and libs 80 | 2. Revisit LDA and ALS 81 | 3. Applied demo such as community detection for food network/recommendation. 82 | 83 | ## week 9 Tree Model and Boosting 84 | 85 | 1. Tree model 86 | 2. Random forest 87 | 3. Ensembles in Kaggle and in practice 88 | 4. Applied demo for ensembles 89 | 90 | ## week 10 Evaluation 91 | 92 | 1. Evaluation methods 93 | 2. Implementations in MLlib 94 | 3. Online / offline evaluations 95 | 96 | ## week 11 Optimization in Parallel 97 | 98 | 1. Commonly used optimization algorithms 99 | 2. The inherently sequential nature of optimization algorithms 100 | 3. Computation models: from BSP to BSP+ to SSP 101 | 4. Future directions 102 | 103 | ## week 12 Rethinking practical machine learning and how to build a good system 104 | 105 | 1. The one-two-three of practical ML 106 | 2. Rethinking practical machine learning 107 | 3. How to build a great machine learning system? 108 | 4. Compare with Mahout / Oryx2 / VM / ...
109 | 110 | ## Survey of Advanced Analytics with Spark 111 | 112 | | Chapter | Topic | Algorithms | Dataset | Source | 113 | |:-----:|:-----:|:-----:|:-----:|:-----:| 114 | | 2 | Record Linkage | Entity resolution, record dedup, merge-and-purge, list washing | Some business data such as TCPDS | UCI ML repo | 115 | | 3 | Recommending | ALS | Who plays what or who rates what | Audioscrobbler | 116 | | 4 | Predicting Forest Cover | Decision Tree | The type of forest covering parcels of land in Colorado | UCI ML repo | 117 | | 5 | Anomaly detection in network traffic | K-means | Network intrusion data | KDD Cup 1999 Dataset | 118 | | 6 | Understanding Wikipedia | Latent Semantic Analysis, SVD, TF-IDF, etc. | Wikipedia texts | Wikipedia | 119 | | 7 | Analyzing Co-occurrence Networks | Massive graph algorithms in GraphX | MEDLINE citation index | US National Library of Medicine | 120 | | 8 | Geo and Temporal data analysis | Building sessions | New York Taxicab Data | New York City Taxi and Limousine Commission | 121 | | 9 | Estimating Financial Risk | Monte Carlo Simulation | Stock Data | Yahoo! | 122 | | 10 | Analyzing Genomic Data | Massive genome analysis algorithms | Genome data | NCBI | 123 | | 11 | Analyzing Neuroimaging Data | Thunder | Images of zebrafish brains | Thunder repository | 124 | 125 | ## Structure of directories 126 | 127 | /src/chapterx --> The code snippets of each chapter 128 | 129 | /src/chapterx/{java, python, scala} --> Code snippets written with Mahout, Scikit-learn, and Spark 130 | 131 | ## Spark VS Scikit-learn 132 | 133 | ### Algorithms 134 | 135 | | Type | Algorithm | Scikit-learn | Spark | 136 | |:-----------:|:----------:|:----------:|:----------:| 137 | |Classification| Logistic Regression | YES | YES 138 | |Classification| Perceptron | YES | 139 | |Classification| Passive Aggressive Algorithms | YES 140 | |Classification| SVM | YES | YES 141 | |Classification| Naive Bayes | YES | YES 142 | |Classification| Decision Tree | YES | YES 143 | |Classification| Ensemble methods | YES | YES 144 | |Classification| Label Propagation | YES | YES (in GraphX) 145 | |Classification| LDA and QDA | YES | 146 | |Regression| Ordinary Least Squares | YES | YES 147 | |Regression| Ridge Regression | YES | YES 148 | |Regression| LASSO | YES | YES 149 | |Regression| Elastic Net | YES 150 | |Regression| Multi-task LASSO | YES 151 | |Regression| Least Angle Regression | YES 152 | |Regression| LARS LASSO | YES 153 | |Regression| Orthogonal Matching Pursuit | YES 154 | |Regression| Bayesian Regression | YES 155 | |Regression| Polynomial Regression | YES 156 | |Regression| Nearest Neighbor | YES | YES 157 | |Regression| Gaussian Process | YES 158 | |Regression| Isotonic Regression | YES 159 | |Clustering| K-means | YES | YES 160 | |Clustering| Affinity Propagation | YES 161 | |Clustering| Mean shift | YES 162 | |Clustering| Spectral Clustering | YES 163 | |Clustering| Ward | YES 164 | |Clustering| Agglomerative clustering | YES 165 | |Clustering| DBSCAN | YES 166 | |Clustering| Gaussian Mixtures | YES 167 | |Dimension Reduction| PCA | YES | YES 168 | |Dimension Reduction| SVD / LSA | YES | YES 169 | |Dimension Reduction| Dictionary Learning | YES 170 | |Dimension Reduction| Factor Analysis | YES 171 | |Dimension Reduction| ICA | YES 172 | |Dimension Reduction| NMF | YES 173 | |Model Selection| Cross Validation | YES | YES 174 | |Model Selection| Grid Search | YES 175 | |Model Selection| Pipeline | YES | YES 176 | |Model Selection| Feature Union | YES | YES 177 | |Model Selection| 
Model Evaluation | YES | YES 178 | |Model Selection| Model Presistence | YES 179 | |Model Selection| Validation Curves | YES 180 | |Preprocessing| Standardization | YES | YES 181 | |Preprocessing| Encoding categorical features | YES | YES (dependency) 182 | |Preprocessing| Binarization | YES 183 | |Preprocessing| Normalization | YES | YES 184 | |Preprocessing| Label preprocessing | YES 185 | |Preprocessing| Imputation of missing values | YES 186 | |Preprocessing| Unsupervised data reduction | YES 187 | -------------------------------------------------------------------------------- /Schedule_Chinese.md: -------------------------------------------------------------------------------- 1 | # 实践机器学习算法详解及工程实现 2 | 3 | ## 概述 4 | 5 | 本课程共12周,每周两小时。每个知识片段20分钟,每周共计6个视频片段。 6 | 7 | 本课程包含下列开源产品/组件(星号代表重要程度): 8 | 9 | \*\*\*\*\* **Spark MLlib** 10 | 11 | \*\*\*\* **ML Pipeline and GraphX** 12 | 13 | \*\*\* **Spark Core and Spark SQL** 14 | 15 | \*\* **Spark Streaming** 16 | 17 | \* **Scikit-learn for reference.** 18 | 19 | ## 参考书 20 | 21 | 1. Advanced Analytics with Spark 22 | 2. Machine Learning with Spark 23 | 3. The Lion Way: Machine Learning plus Intelligent Optimization 24 | 4. Others... 25 | 26 | ## week 1 课程简介及入门基础 27 | 28 | 1. Spark基础知识 29 | 2. 机器学习基础知识 30 | 3. 图计算基础知识 31 | 4. Spark,MLlib,以及GraphX的操作示例 32 | 33 | ## week 2 广义线性模型 34 | 35 | 2. 逻辑回归 36 | 3. 线性回归 37 | 4. SVM 38 | 5. LASSO 39 | 6. 岭回归 40 | 7. 广义线性模型代码及示例(如手写数字识别) 41 | 42 | ## week 3 推荐算法及系统 43 | 44 | 1. ALS算法 45 | 2. 奇异值分解 46 | 3. Mahout与MLlib的对比分析 47 | 4. 推荐系统的搭建示例(依赖PredictionIO) 48 | 49 | ## week 4 聚类算法 50 | 51 | 1. k-means 52 | 2. LDA 53 | 3. 高斯混合模型 54 | 4. Power Iteration聚类 55 | 5. 聚类算法应用示例(如主题建模及地理位置聚类) 56 | 57 | ## week 5 流式机器学习 58 | 59 | 1. Lambda架构 60 | 2. 参数服务器 61 | 3. from Freeman labs提供的流式算法 62 | 4. 应用示例(如斑马鱼实验) 63 | 64 | ## week 6 机器学习流水线 65 | 66 | 1. Scikit-learn的流水线(包括Pandas等对比) 67 | 2. Spark的流水线(如DataFrame以及ML组件) 68 | 3. 特征提取与变换 69 | 4. 应用示例及对比(待定) 70 | 71 | ## week 7 机器学习中的科学计算 72 | 73 | 1. 矩阵计算中的注意事项 74 | 2. 矩阵计算的组件(in C/Fortran and Java) 75 | 3. MLlib中的矩阵计算 76 | 4. MLlib中的统计方法 77 | 5. 科学计算的示例(待定) 78 | 79 | ## week 8 图计算模型 80 | 81 | 1. GraphX进阶 82 | 2. GraphX中的图算法 83 | 3. 再议LDA与ALS算法 84 | 4. 图模型的示例(如网络中的社团聚类) 85 | 86 | ## week 9 决策树与组合学习 87 | 88 | 1. MLlib中的决策树 89 | 2. 随机森林算法 90 | 3. Gradient-Boosted Trees 91 | 3. 实践中的组合学习(如Kaggle) 92 | 4. 组合模型的示例(待定) 93 | 94 | ## week 10 机器学习算法评测 95 | 96 | 1. 评测方法 97 | 2. Cross validation与Grid Search 98 | 2. MLlib中的实现 99 | 3. 在线、离线测评方法 100 | 101 | ## week 11 优化算法并行化 102 | 103 | 1. 常用的优化算法 104 | 2. 优化算法的串行基因 105 | 3. 计算模型:从BSP到BSP+再到SSP 106 | 4. 未来的趋势 107 | 108 | ## week 12 课程拾遗以及框架再思考 109 | 110 | 1. 课程拾遗 111 | 2. 机器学习/数据分析的一般步骤 112 | 3. 实践机器学习的再思考 113 | 4. 多系统对比(Mahout、Oryx、VM以及一些python的包,SparkR,PySpark等) 114 | 5. 总结 115 | -------------------------------------------------------------------------------- /license.md: -------------------------------------------------------------------------------- 1 | All rights reserved. 2 | 3 | 此处代码与文档均为《实践机器学习算法详解及工程实现》(英文名称 Applied Machine Learning and Implementation)课程所用,版权归作者所有。 4 | 5 | 未经授权的转载和传播被禁止,违者将追究法律责任。 6 | -------------------------------------------------------------------------------- /spark-summit-15.md: -------------------------------------------------------------------------------- 1 | # Spark上的数据分析简介 2 | 3 | 总时长 = 2小时 4 | 5 | 话题数目 = 6个 6 | 7 | 大约每个话题 20分钟 8 | 9 | 0. MLlib最新进展简介 10 | 1. 模型表示 --> MLlib的向量模型与矩阵模型 11 | 2. 优化并行 --> 同步方式、优化调度、以及模型存储 12 | 3. 计算模式 --> MLlib与GraphX 13 | 4. 数据承载 --> MLlib与SparkSQL 14 | 5. 
实例分析 --> MLlib与Scikit-learn 15 | -------------------------------------------------------------------------------- /w1/lecture_note.md: -------------------------------------------------------------------------------- 1 | Lecture note 2 | Xusen Yin 3 | April 4, 2015 4 | 5 | # Machine Learning on Spark 6 | 7 | # week1 讲义 8 | 9 | 由于各位学员背景不同,基础知识的情况也不太一致,因此第一周的内容主要分为三个overviews,旨在首先可以统一本课程的一些大致脉络,其次对于已经懂得这些基础知识的同学算作一个回顾,对于还不知道这些内容的同学作为一个学习的起点和指导。需要注意的是,学习是个自我进步和不断回顾的过程,讲师在短短的数小时内所起到的职能是启发和引导,真正学习的过程是在课外所花费的时间上。讲师能力有限,在设计课程的过程中参考了多人的资料,随后会附上致谢列表。 10 | 11 | ## Spark overview 12 | 13 | 本周课程不打算在一个小时的时间内事无巨细的讲解Spark,也不打算向大家介绍Spark ABC。具体的使用,API的操作请查看Spark官方文档。Spark overview的主要目的是让大家能够尽量明晰Spark的自身特点、计算范式、编程模型、以及运行调度,以便以后在面临一些看似奇怪的代码时能分析它这么做背后的原因。 14 | 15 | Spark自身的特点,诚如其名,“轻快灵巧”。轻指的是Spark core的设计精巧,代码量少,同时得益于Scala语言丰富的表达力。快说的是Spark上手快,对于初学者完全可以看作单机Scala的分布式版本,RDD的抽象容易掌握;其次是运行快,亚秒级延迟,使其完全可以胜任交互式操作。灵指的是不同层面的灵活性,实现层基于Scala trait实现不同策略的定制和mixin,原语层简单的算子扩展和数据源扩展,以及多语言绑定(Java,Python)。范式层从多迭代的批量操作到流处理,即席查询和图计算等。巧是指其实现借巧力,站在巨人的肩膀上,避免一些费力不讨好的设计。 16 | 17 | 从计算范式的角度来看,Spark时典型的数据并行,因此具有数据并行的优势和局限。粗粒度数据并行的RDD中每个数据元素要过相同的代码序列,因此其完全不能胜任细粒度异步的数据更新,这点从GraphX的实现可见端倪,从MLlib现有的一些算法实现也可管中窥豹。Spark采用函数式语义,包括RDD的不可修改,UDF的使用,transformation不会产生副作用,而是产生一个新的RDD等。这样导致每个stage内部的计算是幂等的,失败时可以简单通过重放来容错。 18 | 19 | 从编程模型的角度来看,Spark就是简单的“数据加变换”。其计算空间分为Spark空间和Scala空间,前者的计算是分布式的,在各个worker节点上执行,后者的计算是在单节点执行,具体说来实在Driver节点执行。Scala空间到Spark空间需要输入算子,如textFile,parallel等,Spark空间回到Scala空间需要action算子。 20 | 21 | 从运行调度的角度来看,主要分为两大部分。第一部分是从Spark代码到切好的DAG stage,第二个部分是从stage到一个个执行的task。前者发生在Driver和代码之间,由DAG调度起负责,后者发生在各个Driver和各个具体执行的Executor之间,由task调度器负责。 22 | 23 | 对每个RDD而言,action算子触发job的投递。transformation算子中的宽依赖算子触发新的stage切分,而窄依赖算子最终会被pipeline到一起,或者说叫做operator fusion亦可。每两个相互依赖的stages之间通过shuffle传递数据。 24 | 25 | ## Machine Learning overview 26 | 27 | T. G. 
Dietterich在MLSS 2014的讲座中讲了机器学习的三个问题,通俗易懂的介绍了机器学习的一般问题。此处借用这三个问题,向大家阐明机器学习的来由,问题,和基本方法。 28 | 29 | 机器学习的原动力就是来改变软件的流程,由之前“推导”式的方式转变为“归纳总结”。传统的算法流程是领域专家给出解题思路,按照这种解题思路设计算法求解问题,即expert => logic => function(input): output。Machine learning的方法是收集问题领域的输入输出,在某种假设下总结出其背后的逻辑,模拟领域专家,即 => logic => expert。 30 | 31 | 此处以有监督学习为例。有监督学习的简单例子包括手写识别、疾病监测、人脸识别等。其解决问题的一般框架是 32 | 33 | - 由某一未知分布(the expert)随机且独立的采样,作为训练样本: 34 | 35 | - 学习算法(the assumption)分析样本数据并生成分类器; 36 | 37 | - 从相同的数据采样得到新数据,由分类器给出分类结果; 38 | 39 | - 以某种方式评估误差; 40 | 41 | Machine learning的目标就是,找到可以最小化expected loss的函数。 42 | 43 | 以垃圾邮件检测为例,这里未知的分布就是给定的邮件及其标注(“是”垃圾邮件或者“不是”垃圾邮件)其背后隐含的分布。训练样本是用户标注好的一堆邮件。学习算法会在后续的课程中讲到,最简单的方法就是Naive Bayes。输出的分类器,对于Naive Bayes而言,是一个条件概率,即在给定邮件x的条件下,它是垃圾邮件的概率。测试样本为一封新的且有ground truth的邮件,用一个损失函数(loss function)评估误差。最直观的损失函数就是分对损失为0,分错损失为1,叫做0-1 loss。 44 | 45 | 就学习方法而言,machine learning有三种基本方法。一是学习一个分类器y=f(x)(如Perceptron),二是学习一个条件概率分布p(y|x)(如Logistic regression),三是学习一个联合概率分布p(x, y)(如Linear discriminate analysis)。第一周的课里只会讲到第一种,其他的在后续课程中会涉及。 46 | 47 | 以perceptron为例,我们过一下机器学习算法的基本流程。(待补充) 48 | 49 | ## Graph computing overview 50 | 51 | 本周最后一节介绍一下图计算的基础知识。首先界定图计算和机器学习的关系。现实生活中的很多问题都可以用图来概括(依赖关系),像twitter friendship网络,call graph等等。而很多科学问题的结构也是图结构,如PageRank的问题,网页之间存在相互的链接。由此,许多科学问题的解法也是一种图算法。针对PageRank,它本身就是一种通过图上的迭代计算特征向量的过程。不仅如此,像协同过滤、梯度下降、置信传播等等多种常见的machine learning算法都能用图的模型概括出来。后面我们也会看到,图的编程模型极大简化了大规模机器学习算法的实现。 52 | 53 | 图计算最重要两个组成部分就是编程模型和计算引擎,这周的课程中我们只会简要介绍编程模型的内容,计算引擎的部分我们会在专门研讨GraphX的一周内详细分析。 54 | 55 | 我们以PageRank为例来看不同的编程模型对程序的影响。最简单的方法是直接用MapReduce来表达图计算,我们在Map中完成每个节点的计算任务,在Reduce中进行shuffle,为每个节点聚合其邻居节点送过来的消息。从代码上来看,计算部分和消息传递以及图的结构相互耦合在一起。 56 | 57 | BSP(Bulk Synchronous Parallel)首先是各个节点并行完成自己的顶点程序(vertex program),之后每个节点向其邻居节点广播其内容更新,此处会有一个大同步等待所有节点完成信息交换。之后再用最新的消息重新进行顶点程序的计算,并以此类推。BSP模型较好的分离了顶点计算、消息传递、图的结构三部分内容,程序看起来清晰易懂。 58 | 59 | BSP模型的barrier会导致很多问题。原理上来讲最佳的并行策略是全异步的执行。每个节点收集到自己计算足够多的消息之后便可执行,执行结束即可向其邻居节点广播更新。但是这种情况下时序问题难以确定,例如节点A的邻居为B和C,节点A在某时刻收到了节点B的消息(messageB, timeB), 以及节点C的消息(messageC, timeC),那么节点A的计算到底是以B的逻辑时钟为准还是以C的逻辑时钟为准?抑或是要求逻辑时钟慢者追上逻辑时钟快者之后才开始计算?这里还需要很多内容要仔细考量。 60 | 61 | -------------------------------------------------------------------------------- /w2/lecture_note.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 | 5 | # week 2 lecture note 6 | 7 | 第二周主要介绍广义线性模型（Generalized Linear Model）。由于GLM在MLlib中实现比较简单，因此本周会将大部分时间花在GLM的理论推导上。这会与下周的课有比较明显的区别：第三周的课会偏重MLlib中ALS的实现部分。 8 | 9 | MLlib中实现的广义线性模型有5种，分别是 10 | 11 | - Logistic regression 12 | 13 | - Linear regression 14 | 15 | - SVM 16 | 17 | - Lasso 18 | 19 | - Ridge regression 20 | 21 | 再算上我们上周学到的Perceptron，目前我们一共会了解6种GLM。这次课程的目的一是让大家了解GLM的特点，总结出规律，二是了解其在MLlib中的实现方法。更进一步，如果大家能对MLlib中的GLM实现提出自己的看法那就最好了。 22 | 23 | ## Logistic regression 24 | 25 | 首先介绍逻辑回归。跟之前的Perceptron不同，逻辑回归不再是学习一个分类函数，而是学习一个用于分类的条件概率$P(y|x)$，即在给定$x$的条件下，求解$y$取值的概率。 26 | 27 | 更新一下我们的符号，我们令$p_y(x;w)$代表我们条件概率的取值，其中$w$是我们模型的参数。并假设我们的$y$只有两个取值$\{0, 1\}$。给出两个不同取值的概率分别为 28 | 29 | $$p_1(x;w) = \frac{1}{1+exp(-w^Tx)}$$ 30 | 31 | $$p_0(x;w) = \frac{exp(-w^Tx)}{1+exp(-w^Tx)}$$ 32 | 33 | 从定义中容易得出 34 | 35 | $$log\frac{p_0}{p_1} = -w^Tx$$ 36 | 37 | 在某种程度上说明了“线性”的来源。 38 | 39 | 那么为什么会选择这种形式的函数作为我们概率的取值呢？从logistic函数曲线的形状可见端倪。线性函数$-w^Tx$的取值范围是正负无穷，而logistic函数可以把正负无穷的区间映射到$[0, 1]$之间，比较符合概率的取值范围。 40 | 41 | 给定了假设和模型，下面要通过优化损失函数得到参数的最优解。在这里我们使用log loss（即负的log似然）: 42 | 43 | $$L(\hat{P}(y|x), y) = \left\{ 44 | \begin{array}{c} 45 | -log P(y=1|x_i)\qquad if ~y_i = 1, \\ 46 | -logP(y=0|x_i)\qquad if ~ y_i = 0. \\ 47 | \end{array} 48 | \right.$$ 49 | 50 | 为了最小化损失函数，我们要做的是最大化正的log概率，其实等价于最大似然，即 51 | 52 | $$max_w\sum_ilog\hat{P}(y_i|x_i)$$ 53 | 54 | 对于最大似然最直观的理解就是，如果一个样本$x_i$的类别为1，那就让$\hat{P}(y_i=1|x_i)$的概率远大于$\hat{P}(y_i=0|x_i)$的概率，反之亦然。 55 | 56 | 但是上文中的函数是分段函数，不便求梯度，因此我们利用$p_0 = 1 - p_1$将分段函数合并成一个函数（即log似然）： 57 | 58 | $$l(w) = \sum_i y_i~log~p_1(x_i;w) + (1-y_i)~log(1-p_1(x_i;w))$$ 59 | 60 | 大家可以将$y_i=1$和$y_i=0$代入到上面的式子中自行验证其正确性。 61 | 
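为了便于验证上面的推导，这里补充一个极简的numpy示例（仅作示意，并非MLlib中的实现，函数名均为本讲义自拟）：按上式直接计算log似然$l(w)$及其梯度$\nabla l(w)=\sum_i(y_i-p_1(x_i;w))x_i$，并用最朴素的梯度上升求解$w$。

```python
import numpy as np

def p1(X, w):
    # p_1(x; w) = 1 / (1 + exp(-w^T x))，逐样本计算
    return 1.0 / (1.0 + np.exp(-X.dot(w)))

def log_likelihood(X, y, w):
    # l(w) = sum_i y_i*log(p_1) + (1 - y_i)*log(1 - p_1)
    p = p1(X, w)
    return np.sum(y * np.log(p) + (1 - y) * np.log(1 - p))

def gradient(X, y, w):
    # 对 l(w) 求导可得梯度 X^T (y - p)
    return X.T.dot(y - p1(X, w))

def fit(X, y, step=0.1, iters=200):
    # 梯度上升最大化 l(w)，等价于对负log似然做梯度下降
    w = np.zeros(X.shape[1])
    for _ in range(iters):
        w += step * gradient(X, y, w) / len(y)
    return w
```

MLlib中的求解器（SGD、L-BFGS等）优化的也是同一个目标函数，只是在分布式计算和数值细节上做了更多处理。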
-------------------------------------------------------------------------------- /w3/w3demo/demo1-python/HDR/README.md: -------------------------------------------------------------------------------- 1 | # Classification of Handwritten Digits 2 | 3 | ## Summary 4 | 5 | In the past few days, I tried several classification algorithms (and their combinations) to classify handwritten digits. For your convenience, I put the summary here and the details in the following sections. 6 | 7 | There are three kinds of methods that I used: 8 | 9 | - Single algorithms, e.g. SVM and Logistic Regression. 10 | 11 | - Ensemble algorithm, i.e. Random Forest. 12 | 13 | - Pipelined algorithms. 14 | 15 | The results are listed below, and the following sections explain them in detail, including why I chose each method and how I trained it.
16 | 17 | | Method | Precision Avg | Recall Avg | F-measure Avg | Algorithms | 18 | |:----------:|:----------:|:----------:|:----------:|:----------:| 19 | | SVM | 0.99 | 0.99 | 0.99 | Single SVM + 5-fold CV | 20 | | Logistic Regression | 0.96 | 0.96 | 0.96 | Single LR + 5-fold CV | 21 | |Pipelined SVM | 0.99 | 0.99 | 0.99 | PCA + Polynomial Expansions + PCA + FDA + SVM | 22 | |Pipelined Logistic Regression | 0.99 | 0.99 | 0.99 | PCA + Polynomial Expansions + PCA + FDA + LR | 23 | |Random Forest | 0.98 | 0.98 | 0.98 | Single RF + 3-fold CV | 24 | |Pipelined Random Forest | 0.96 | 0.96 | 0.96 | Polynomial Expansion + RF | 25 | |Layered Neural Network I | 0.95 | 0.95 | 0.95 | 2-layer RBM + LR + 3-fold CV | 26 | |Layered Neural Network II | 0.90 | 0.90 | 0.90 | 3-layer RBM + LR + 3-fold CV | 27 | |Layered Neural Network III | 0.83 | 0.83 | 0.83 | 4-layer RBM + LR + 3-fold CV | 28 | |K-Nearest Neighbors | 0.98 | 0.98 | 0.98 | Single KNN + 3-fold CV | 29 | 30 | ## Conclusion 31 | 32 | In the handwritten digit recognition scenario, SVM is the best candidate. Logistic regression with manual polynomial expansions can compete with SVM, and theoretically they are very similar. Random forest gets a result similar to KNN, because from the *classification hyperplane* point of view they are alike: both KNN and random forest can draw an irregular decision surface. I also tried non-linear embeddings (such as Isomap and MDS) before KNN in order to find better *neighbors*, but that did not help in this scenario. The neural network does not meet my expectation, but the result is understandable: a 2-layer RBM plays the *nonlinear transformation* role, so I get a reasonably good result, but for the 3-layer and 4-layer networks I would need to fine-tune the parameters of the RBM layers jointly with the LR layer to get better results, which scikit-learn does not support. 33 | 34 | # Details 35 | 36 | ## Some statistics 37 | 38 | ### The distribution of 0-9 samples 39 | 40 | `awk -F "," '{print $65}' optdigits.tra | sort -n | uniq -c` 41 | 42 | > 376 0 43 | 44 | > 389 1 45 | 46 | > 380 2 47 | 48 | > 389 3 49 | 50 | > 387 4 51 | 52 | > 376 5 53 | 54 | > 377 6 55 | 56 | > 387 7 57 | 58 | > 380 8 59 | 60 | > 382 9 61 | 62 | This looks good: the distribution of the digits 0-9 is close to uniform. 63 | 64 | ## Methodology 65 | 66 | ### How to choose models 67 | 68 | This dataset is quite small, so I will not try **heavy** classifiers such as a **Deep Neural Network**, which could over-fit and perform poorly on the test set. But the traditional **Shallow Neural Network** is a good idea, say, a network with two or three layers of neurons. 69 | 70 | Before NN, I will try **Logistic Regression** and **SVM** first, to see whether I can get a good result. To keep the dimensionality low, I might use some **dimension reduction** method such as **PCA** to filter the dataset. 71 | 72 | If time permits, I will also try some **less common** methods, such as **random forest** and **k nearest neighbors**. 73 | 74 | ### How to do multi-classification 75 | 76 | There are three common ways to solve a multi-class classification problem: 77 | 78 | 1. 1 vs. (k-1) classification, namely, transforming a k-class problem into k binary classification problems (see the sketch after this list). 79 | 80 | 2. A direct k-class classifier, such as **softmax regression** instead of **logistic regression**. 81 | 82 | 3. Error-correcting output codes, which is a less common way to solve a multi-class problem. 
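As a quick illustration of the first strategy, here is a minimal sketch (an assumption for illustration only, not the code behind the reported numbers) that makes the one-vs-rest decomposition explicit with scikit-learn's `OneVsRestClassifier`, using the same `optdigits.tra` file and two-thirds split as the scripts under `src/`. Note that scikit-learn's `LogisticRegression` already handles multi-class targets internally, so the wrapper is only there to show the decomposition.

```python
import numpy
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

digits = numpy.loadtxt(fname="optdigits.tra", delimiter=',')
data, target = digits[:, :-1], digits[:, -1]
n_trains = len(digits) / 3 * 2  # same split as the other scripts (Python 2 integer division)

# Fit k one-vs-rest binary classifiers, one per digit class.
ovr = OneVsRestClassifier(LogisticRegression(penalty='l2', C=0.1))
ovr.fit(data[:n_trains], target[:n_trains])

predicted = ovr.predict(data[n_trains:])
print(metrics.classification_report(target[n_trains:], predicted))
```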
83 | 84 | ### How to do ETL of the dataset 85 | 86 | In order to use k-fold cross validation, I will let open source tool do it. Scikit-learn is a good choice. Moreover, I will try to use manual feature expansions such as polynomial expansions, and dimension reduction methods such as PCA and FDA. 87 | 88 | ### How to do the grid search for hyper-parameters 89 | 90 | Scikit-learn provides `GridSearchCV` methods to do the search. It is stable for single algorithms. But for pipelined methods, the search space is very large, it may cause OOM. 91 | 92 | ### How to choose open source tools 93 | 94 | - Spark/MLlib is the most familiar tool of me, but it is too heavy and no necessary in the scenario; 95 | 96 | - Scikit-learn seems the most suitable tool, I will try to use ETL part and classification part of it; 97 | 98 | - MDP (Modular Data Process) is useful for a DAG style data process `Flow`, but scikit-learn also has the similar kind of API called `Pipeline`. 99 | 100 | - LibSVM and libLinear are much faster than scikit-learn, but for a demo project, I prefer Python, because the scale-out and scale-up capabilities are not my first consideration. 101 | 102 | ## Details 103 | 104 | ### Install scikit-learn 105 | 106 | `sudo apt-get install python-sklearn` 107 | 108 | I try to use scikit-learn, with its SVM and Logistic Regression, and get good results. 109 | 110 | For SVM, I get 111 | 112 | > \>\>\> print(metrics.classification_report(expected, predicted)) 113 | 114 | > precision recall f1-score support 115 | 116 | > 0.0 1.00 1.00 1.00 190 117 | 118 | > 1.0 0.98 0.99 0.99 194 119 | 120 | > 2.0 0.99 1.00 0.99 186 121 | 122 | > 3.0 0.99 0.96 0.97 192 123 | 124 | > 4.0 0.99 0.99 0.99 202 125 | 126 | > 5.0 0.98 0.99 0.99 194 127 | 128 | > 6.0 0.99 0.99 0.99 184 129 | 130 | > 7.0 0.99 0.99 0.99 188 131 | 132 | > 8.0 0.99 0.99 0.99 201 133 | 134 | > 9.0 0.97 0.98 0.98 181 135 | 136 | > avg / total 0.99 0.99 0.99 1912 137 | 138 | For Logistic Regression, I get 139 | 140 | > \>\>\> print(metrics.classification_report(expected, lrpredicted)) 141 | 142 | > precision recall f1-score support 143 | 144 | > 0.0 0.99 1.00 0.99 190 145 | 146 | > 1.0 0.93 0.95 0.94 194 147 | 148 | > 2.0 0.98 0.97 0.98 186 149 | 150 | > 3.0 0.98 0.93 0.96 192 151 | 152 | > 4.0 0.98 0.97 0.97 202 153 | 154 | > 5.0 0.97 0.97 0.97 194 155 | 156 | > 6.0 0.98 0.99 0.98 184 157 | 158 | > 7.0 0.99 0.99 0.99 188 159 | 160 | > 8.0 0.93 0.93 0.93 201 161 | 162 | > 9.0 0.91 0.94 0.93 181 163 | 164 | > avg / total 0.96 0.96 0.96 1912 165 | 166 | ### Install MDP 167 | 168 | `sudo aptitude install python-mdp` 169 | 170 | Let's try something of the **flow**. I love this kind of **pipeline**. Here is the result: 171 | 172 | > precision recall f1-score support 173 | 174 | > 0.0 1.00 0.99 1.00 130 175 | 176 | > 1.0 0.99 0.98 0.98 130 177 | 178 | > 2.0 1.00 0.99 1.00 119 179 | 180 | > 3.0 0.98 1.00 0.99 129 181 | 182 | > 4.0 0.99 0.98 0.99 130 183 | 184 | > 5.0 0.99 1.00 1.00 128 185 | 186 | > 6.0 0.99 1.00 1.00 124 187 | 188 | > 7.0 0.99 0.98 0.99 126 189 | 190 | > 8.0 0.97 0.99 0.98 139 191 | 192 | > 9.0 0.98 0.98 0.98 120 193 | 194 | > avg / total 0.99 0.99 0.99 1275 195 | 196 | We can see that it is even better than the former SVM result. 
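For reference, the plain single-model SVM and Logistic Regression reports quoted in the "Install scikit-learn" section above come from a simple fit-and-predict run without grid search. A minimal sketch of that kind of run is below; the 50/50 holdout is an assumption that matches the support counts in those reports, and the SVM `gamma` value is likewise an assumption here. The grid-searched versions live in `src/train_svm.py` and `src/train_lr.py`.

```python
import numpy
from sklearn import svm, metrics
from sklearn.linear_model import LogisticRegression

digits = numpy.loadtxt(fname="optdigits.tra", delimiter=',')
data, target = digits[:, :-1], digits[:, -1]
n_half = len(digits) / 2  # 50/50 holdout (Python 2 integer division)

for name, clf in [("SVM", svm.SVC(gamma=0.001)),
                  ("Logistic Regression", LogisticRegression())]:
    clf.fit(data[:n_half], target[:n_half])      # train on the first half
    predicted = clf.predict(data[n_half:])       # evaluate on the second half
    print(name)
    print(metrics.classification_report(target[n_half:], predicted))
```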
197 | 198 | To testify my assumption, I substitude `SVCScikitLearnNode` with `LogisticRegressionScikitLearnNode`, and get similar result: 199 | 200 | > precision recall f1-score support 201 | 202 | > 0.0 1.00 0.99 1.00 130 203 | 204 | > 1.0 0.98 0.98 0.98 130 205 | 206 | > 2.0 1.00 1.00 1.00 119 207 | 208 | > 3.0 0.98 0.99 0.99 129 209 | 210 | > 4.0 0.99 0.98 0.99 130 211 | 212 | > 5.0 0.98 1.00 0.99 128 213 | 214 | > 6.0 0.99 1.00 1.00 124 215 | 216 | > 7.0 0.99 0.98 0.99 126 217 | 218 | > 8.0 0.99 0.98 0.98 139 219 | 220 | > 9.0 0.99 0.98 0.99 120 221 | 222 | > avg / total 0.99 0.99 0.99 1275 223 | 224 | So, in the handwritten digits recognition scenario, logistic regression with some feature expansion and transformation can compete SVM. SVM uses **kernel trick** to substitute the manual feature expansion and transformation. 225 | 226 | ### Logistic Regression 227 | 228 | Add k-fold cross validation and grid search in Logistic Regression. Result: 229 | 230 | > Best score: 0.964 231 | 232 | > Best parameters set: 233 | 234 | > {'C': 0.1, 'intercept_scaling': 1, 'fit_intercept': True, 'penalty': 'l2', 'random_state': None, 'dual': False, 'tol': 0.0001, 'class_weight': None} 235 | 236 | > precision recall f1-score support 237 | 238 | > 0.0 1.00 1.00 1.00 130 239 | 240 | > 1.0 0.93 0.95 0.94 130 241 | 242 | > 2.0 0.99 0.93 0.96 119 243 | 244 | > 3.0 0.96 0.97 0.97 129 245 | 246 | > 4.0 0.98 0.95 0.96 130 247 | 248 | > 5.0 0.97 0.99 0.98 128 249 | 250 | > 6.0 0.99 0.99 0.99 124 251 | 252 | > 7.0 0.98 0.98 0.98 126 253 | 254 | > 8.0 0.91 0.92 0.91 139 255 | 256 | > 9.0 0.94 0.94 0.94 120 257 | 258 | > avg / total 0.96 0.96 0.96 1275 259 | 260 | It looks better than the previous single Logistic Regression result. 261 | 262 | ### SVM 263 | 264 | Let's add k-fold cross validation and grid search in SVM. Here is the result: 265 | 266 | > Best score: 0.990 267 | 268 | > Best parameters set: 269 | 270 | > {'kernel': 'rbf', 'C': 10, 'verbose': False, 'probability': False, 'degree': 3, 'shrinking': True, 'max_iter': -1, 'random_state': None, 'tol': 0.001, 'cache_size': 200, 'coef0': 0.0, 'gamma': 0.001, 'class_weight': None} 271 | 272 | > precision recall f1-score support 273 | 274 | > 0.0 1.00 1.00 1.00 130 275 | 276 | > 1.0 0.98 0.98 0.98 130 277 | 278 | > 2.0 1.00 1.00 1.00 119 279 | 280 | > 3.0 0.99 0.98 0.99 129 281 | 282 | > 4.0 0.98 0.99 0.99 130 283 | 284 | > 5.0 0.99 1.00 1.00 128 285 | 286 | > 6.0 0.99 0.99 0.99 124 287 | 288 | > 7.0 0.99 0.98 0.99 126 289 | 290 | > 8.0 0.99 0.99 0.99 139 291 | 292 | > 9.0 0.98 0.99 0.99 120 293 | 294 | > avg / total 0.99 0.99 0.99 1275 295 | 296 | ### Neural Network 297 | 298 | With the help of RBM and `GridSearchCV`, I can get the following result on LR. (To use `BernoulliRBM`, we should transform the features into [0,1].) 
299 | 300 | > Best score: 0.955 301 | 302 | > Best parameters set: 303 | 304 | > {'rbm1__batch_size': 10, 'lr__dual': False, 'rbm1__verbose': False, 'rbm1__n_iter': 10, 'rbm1': BernoulliRBM(batch_size=10, learning_rate=0.1, n_components=36, n_iter=10, 305 | 306 | > random_state=None, verbose=False), 'rbm1__n_components': 36, 'lr__tol': 0.0001, 'lr__class_weight': None, 'lr': LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True, 307 | 308 | > intercept_scaling=1, penalty='l1', random_state=None, tol=0.0001), 'rbm1__learning_rate': 0.1, 'rbm1__random_state': None, 'lr__fit_intercept': True, 'lr__penalty': 'l1', 'lr__random_state': None, 'lr__intercept_scaling': 1, 'lr__C': 100} 309 | 310 | > precision recall f1-score support 311 | 312 | > 0.0 0.98 1.00 0.99 130 313 | 314 | > 1.0 0.98 0.96 0.97 130 315 | 316 | > 2.0 0.99 0.95 0.97 119 317 | 318 | > 3.0 0.91 0.94 0.92 129 319 | 320 | > 4.0 0.99 0.97 0.98 130 321 | 322 | > 5.0 0.93 0.96 0.95 128 323 | 324 | > 6.0 0.98 0.97 0.97 124 325 | 326 | > 7.0 0.92 0.95 0.93 126 327 | 328 | > 8.0 0.90 0.93 0.91 139 329 | 330 | > 9.0 0.90 0.83 0.87 120 331 | 332 | > avg / total 0.95 0.95 0.95 1275 333 | 334 | It is not very exciting, but it is a good solution. I also try to use three-layer Neural Network, but the result is not very well. 335 | 336 | ### Random Forest 337 | 338 | It seems that the '9' is always hard to tell than '0'. How about random forest? 339 | 340 | From random forest, I can get my best score here: 341 | 342 | > Best score: 0.972 343 | 344 | > Best parameters set: 345 | 346 | > {'rf__bootstrap': True, 'rf__max_depth': None, 'rf__n_estimators': 90, 'rf__verbose': 0, 'rf__criterion': 'gini', 'rf__min_density': None, 'rf__min_samples_split': 2, 'rf__compute_importances': None, 'rf': RandomForestClassifier(bootstrap=True, compute_importances=None, 347 | 348 | > criterion='gini', max_depth=None, max_features='auto', 349 | 350 | > min_density=None, min_samples_leaf=1, min_samples_split=2, 351 | 352 | > n_estimators=90, n_jobs=1, oob_score=False, random_state=None, 353 | 354 | > verbose=0), 'rf__max_features': 'auto', 'rf__n_jobs': 1, 'rf__random_state': None, 'rf__oob_score': False, 'rf__min_samples_leaf': 1} 355 | 356 | > precision recall f1-score support 357 | 358 | > 0.0 0.99 0.99 0.99 130 359 | 360 | > 1.0 0.98 0.98 0.98 130 361 | 362 | > 2.0 1.00 0.98 0.99 119 363 | 364 | > 3.0 0.95 0.97 0.96 129 365 | 366 | > 4.0 0.98 0.99 0.99 130 367 | 368 | > 5.0 0.98 0.99 0.98 128 369 | 370 | > 6.0 0.98 0.99 0.99 124 371 | 372 | > 7.0 0.98 0.98 0.98 126 373 | 374 | > 8.0 0.99 0.97 0.98 139 375 | 376 | > 9.0 0.96 0.93 0.94 120 377 | 378 | > avg / total 0.98 0.98 0.98 1275 379 | 380 | 381 | ### K Nearest Neighbors 382 | 383 | I try to use a non-linear embedding method before KNN, but barely no promotion. So I only provide the single KNN result here. 
384 | 385 | > Best score: 0.982 386 | 387 | > Best parameters set: 388 | 389 | > {'knn': KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski', 390 | 391 | > n_neighbors=5, p=2, weights='distance'), 'knn__p': 2, 'knn__metric': 'minkowski', 'knn__weights': 'distance', 'knn__leaf_size': 30, 'knn__algorithm': 'auto', 'knn__n_neighbors': 5} 392 | 393 | > precision recall f1-score support 394 | 395 | > 0.0 1.00 1.00 1.00 130 396 | 397 | > 1.0 0.97 0.98 0.98 130 398 | 399 | > 2.0 1.00 0.99 1.00 119 400 | 401 | > 3.0 0.98 0.97 0.97 129 402 | 403 | > 4.0 0.99 0.99 0.99 130 404 | 405 | > 5.0 0.98 0.99 0.98 128 406 | 407 | > 6.0 0.99 1.00 1.00 124 408 | 409 | > 7.0 0.96 0.98 0.97 126 410 | 411 | > 8.0 0.99 0.96 0.98 139 412 | 413 | > 9.0 0.95 0.94 0.95 120 414 | 415 | > avg / total 0.98 0.98 0.98 1275 416 | 417 | ## References 418 | 419 | 1. [Comparing Classification Algorithms for Handwritten Digits](http://blog.quantitations.com/machine%20learning/2013/02/27/comparing-classification-algorithms-for-handwritten-digits/) 420 | 421 | 2. [Example: Handwritten Digit Classification](http://pythonhosted.org/bob.learn.boosting/example.html) 422 | 423 | 3. [Classification of handwritten digits using a SVM](http://nbviewer.ipython.org/url/www.hdm-stuttgart.de/~maucher/ipnotebooks/MachineLearning/svmDigitRecognition.ipynb) 424 | 425 | 4. [Using neural nets to recognize handwritten digits](http://neuralnetworksanddeeplearning.com/chap1.html) 426 | 427 | 5. [Recognizing hand-written digits](http://scikit-learn.org/stable/auto_examples/plot_digits_classification.html) 428 | 429 | 6. [The MNIST Database of Handwritten Digits](http://yann.lecun.com/exdb/mnist/) 430 | 431 | 7. [Modular Toolkit for Data Processing](http://mdp-toolkit.sourceforge.net/documentation.html) 432 | 433 | 8. [Scikit-learn document](http://scikit-learn.org/stable/) 434 | 435 | 9. 
[Handwritten digits classification with MDP and scikits.learn](http://mdp-toolkit.sourceforge.net/examples/scikits_learn/digit_classification.html) 436 | -------------------------------------------------------------------------------- /w3/w3demo/demo1-python/HDR/papers/00b495249dfb217637000000.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinxusen/AMLI/72619ed24344ad3798d8ef60f2b7352e5b9bcd05/w3/w3demo/demo1-python/HDR/papers/00b495249dfb217637000000.pdf -------------------------------------------------------------------------------- /w3/w3demo/demo1-python/HDR/papers/10.1.1.110.4774.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinxusen/AMLI/72619ed24344ad3798d8ef60f2b7352e5b9bcd05/w3/w3demo/demo1-python/HDR/papers/10.1.1.110.4774.pdf -------------------------------------------------------------------------------- /w3/w3demo/demo1-python/HDR/papers/1412.8307v1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinxusen/AMLI/72619ed24344ad3798d8ef60f2b7352e5b9bcd05/w3/w3demo/demo1-python/HDR/papers/1412.8307v1.pdf -------------------------------------------------------------------------------- /w3/w3demo/demo1-python/HDR/papers/Lecture21.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinxusen/AMLI/72619ed24344ad3798d8ef60f2b7352e5b9bcd05/w3/w3demo/demo1-python/HDR/papers/Lecture21.pdf -------------------------------------------------------------------------------- /w3/w3demo/demo1-python/HDR/papers/Margulis-OpticalCharacterRecognition.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinxusen/AMLI/72619ed24344ad3798d8ef60f2b7352e5b9bcd05/w3/w3demo/demo1-python/HDR/papers/Margulis-OpticalCharacterRecognition.pdf -------------------------------------------------------------------------------- /w3/w3demo/demo1-python/HDR/papers/mdp_paper.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinxusen/AMLI/72619ed24344ad3798d8ef60f2b7352e5b9bcd05/w3/w3demo/demo1-python/HDR/papers/mdp_paper.pdf -------------------------------------------------------------------------------- /w3/w3demo/demo1-python/HDR/src/train_flow.py: -------------------------------------------------------------------------------- 1 | import mdp 2 | import numpy 3 | from sklearn import metrics 4 | 5 | digits = numpy.loadtxt(fname="optdigits.tra", delimiter=',') 6 | n_samples = len(digits) 7 | 8 | data = digits[:,:-1] 9 | target = digits[:,-1] 10 | 11 | n_trains = n_samples / 3 * 2 12 | 13 | train_data = [data[:n_trains, :]] 14 | train_data_with_labels = [(data[:n_trains, :], target[:n_trains])] 15 | 16 | test_data = data[n_trains:, :] 17 | test_labels = target[n_trains:] 18 | 19 | flow = mdp.Flow([mdp.nodes.PCANode(output_dim=25, dtype='f'), 20 | mdp.nodes.PolynomialExpansionNode(3), 21 | mdp.nodes.PCANode(output_dim=0.99), 22 | mdp.nodes.FDANode(output_dim=9), 23 | mdp.nodes.LogisticRegressionScikitsLearnNode()], verbose=True) 24 | 25 | flow.train([train_data, None, train_data, train_data_with_labels, train_data_with_labels]) 26 | 27 | flow[-1].execute = flow[-1].label 28 | 29 | prediction = flow(test_data) 30 | 31 | print metrics.classification_report(test_labels, prediction) 32 | 
-------------------------------------------------------------------------------- /w3/w3demo/demo1-python/HDR/src/train_flow_sklearn.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | from sklearn import datasets, svm, metrics 3 | from sklearn.grid_search import GridSearchCV 4 | from sklearn.linear_model import LogisticRegression 5 | from sklearn.pipeline import Pipeline 6 | from sklearn.decomposition import PCA 7 | from sklearn.preprocessing import PolynomialFeatures 8 | from sklearn.lda import LDA 9 | 10 | digits = numpy.loadtxt(fname="optdigits.tra", delimiter=',') 11 | n_samples = len(digits) 12 | 13 | data = digits[:,:-1] 14 | target = digits[:,-1] 15 | 16 | param_grid = { 17 | 'pca1__n_components': [16], 18 | 'poly__degree': [2], 19 | 'pca2__n_components': [0.8], 20 | 'lda__n_components': [9], 21 | 'lr__penalty': ['l2'], 22 | 'lr__C': [0.1, 1] 23 | } 24 | 25 | steps = [('pca1', PCA()), 26 | ('poly', PolynomialFeatures()), 27 | ('pca2', PCA()), 28 | ('lda', LDA()), 29 | ('lr', LogisticRegression())] 30 | 31 | pipeline = Pipeline(steps) 32 | 33 | grid_search = GridSearchCV(pipeline, param_grid, n_jobs = -1, verbose = 1, cv = 2) 34 | 35 | n_trains = n_samples / 3 * 2 36 | 37 | # We learn the digits on the first half of the digits 38 | grid_search.fit(data[:n_trains], target[:n_trains]) 39 | 40 | print("Best score: %0.3f" % grid_search.best_score_) 41 | print("Best parameters set:") 42 | best_parameters = grid_search.best_estimator_.get_params() 43 | print best_parameters 44 | 45 | # Now predict the value of the digit on the second half: 46 | expected = target[n_trains:] 47 | predicted = grid_search.best_estimator_.predict(data[n_trains:]) 48 | 49 | print(metrics.classification_report(expected, predicted)) 50 | -------------------------------------------------------------------------------- /w3/w3demo/demo1-python/HDR/src/train_knn.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | from sklearn import metrics 3 | from sklearn.grid_search import GridSearchCV 4 | from sklearn.pipeline import Pipeline 5 | from sklearn.neighbors import KNeighborsClassifier 6 | 7 | digits = numpy.loadtxt(fname="optdigits.tra", delimiter=',') 8 | n_samples = len(digits) 9 | 10 | data = digits[:,:-1] 11 | target = digits[:,-1] 12 | 13 | param_grid = { 14 | 'knn__weights': ['uniform', 'distance'] 15 | } 16 | 17 | steps = [ 18 | ('knn', KNeighborsClassifier()) 19 | ] 20 | 21 | pipeline = Pipeline(steps) 22 | 23 | grid_search = GridSearchCV(pipeline, param_grid, n_jobs = -1, verbose = 1, cv = 3) 24 | 25 | n_trains = n_samples / 3 * 2 26 | 27 | # We learn the digits on the first half of the digits 28 | grid_search.fit(data[:n_trains], target[:n_trains]) 29 | 30 | print("Best score: %0.3f" % grid_search.best_score_) 31 | print("Best parameters set:") 32 | best_parameters = grid_search.best_estimator_.get_params() 33 | print best_parameters 34 | 35 | # Now predict the value of the digit on the second half: 36 | expected = target[n_trains:] 37 | predicted = grid_search.best_estimator_.predict(data[n_trains:]) 38 | 39 | print(metrics.classification_report(expected, predicted)) 40 | -------------------------------------------------------------------------------- /w3/w3demo/demo1-python/HDR/src/train_lr.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | from sklearn import datasets, svm, metrics 3 | from sklearn.grid_search import GridSearchCV 4 | from 
sklearn.linear_model import LogisticRegression 5 | 6 | digits = numpy.loadtxt(fname="optdigits.tra", delimiter=',') 7 | n_samples = len(digits) 8 | 9 | data = digits[:,:-1] 10 | target = digits[:,-1] 11 | 12 | param_grid = [ 13 | {'penalty': ['l1', 'l2'], 'C': [0.1, 1.0, 10.0, 100.0]} 14 | ] 15 | 16 | # Create a classifier: a support vector classifier 17 | classifier = LogisticRegression() 18 | 19 | grid_search = GridSearchCV(classifier, param_grid, n_jobs = -1, verbose = 1, cv = 5) 20 | 21 | n_trains = n_samples / 3 * 2 22 | 23 | # We learn the digits on the first half of the digits 24 | grid_search.fit(data[:n_trains], target[:n_trains]) 25 | 26 | print("Best score: %0.3f" % grid_search.best_score_) 27 | print("Best parameters set:") 28 | best_parameters = grid_search.best_estimator_.get_params() 29 | print best_parameters 30 | 31 | # Now predict the value of the digit on the second half: 32 | expected = target[n_trains:] 33 | predicted = grid_search.best_estimator_.predict(data[n_trains:]) 34 | 35 | print(metrics.classification_report(expected, predicted)) 36 | -------------------------------------------------------------------------------- /w3/w3demo/demo1-python/HDR/src/train_rbm.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | from sklearn import metrics 3 | from sklearn.grid_search import GridSearchCV 4 | from sklearn.linear_model import LogisticRegression 5 | from sklearn.pipeline import Pipeline 6 | from sklearn.neural_network import BernoulliRBM 7 | 8 | digits = numpy.loadtxt(fname="optdigits.tra", delimiter=',') 9 | n_samples = len(digits) 10 | 11 | data = digits[:,:-1] / 16.0 12 | target = digits[:,-1] 13 | 14 | param_grid = { 15 | 'rbm1__n_components': [36, 25, 16], 16 | 'rbm2__n_components': [16], 17 | 'rbm3__n_components': [9], 18 | 'lr__penalty': ['l2', 'l1'], 19 | 'lr__C': [1, 10, 100] 20 | } 21 | 22 | steps = [ 23 | ('rbm1', BernoulliRBM()), 24 | ('rbm2', BernoulliRBM()), 25 | ('rbm3', BernoulliRBM()), 26 | ('lr', LogisticRegression()) 27 | ] 28 | 29 | pipeline = Pipeline(steps) 30 | 31 | grid_search = GridSearchCV(pipeline, param_grid, n_jobs = -1, verbose = 1, cv = 3) 32 | 33 | n_trains = n_samples / 3 * 2 34 | 35 | # We learn the digits on the first half of the digits 36 | grid_search.fit(data[:n_trains], target[:n_trains]) 37 | 38 | print("Best score: %0.3f" % grid_search.best_score_) 39 | print("Best parameters set:") 40 | best_parameters = grid_search.best_estimator_.get_params() 41 | print best_parameters 42 | 43 | # Now predict the value of the digit on the second half: 44 | expected = target[n_trains:] 45 | predicted = grid_search.best_estimator_.predict(data[n_trains:]) 46 | 47 | print(metrics.classification_report(expected, predicted)) 48 | -------------------------------------------------------------------------------- /w3/w3demo/demo1-python/HDR/src/train_rf.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | from sklearn import metrics 3 | from sklearn.grid_search import GridSearchCV 4 | from sklearn.pipeline import Pipeline 5 | from sklearn.ensemble import RandomForestClassifier 6 | 7 | digits = numpy.loadtxt(fname="optdigits.tra", delimiter=',') 8 | n_samples = len(digits) 9 | 10 | data = digits[:,:-1] 11 | target = digits[:,-1] 12 | 13 | param_grid = { 14 | 'rf__n_estimators': [40, 50, 60, 70, 80, 90] 15 | } 16 | 17 | steps = [ 18 | ('rf', RandomForestClassifier()) 19 | ] 20 | 21 | pipeline = Pipeline(steps) 22 | 23 | grid_search = 
GridSearchCV(pipeline, param_grid, n_jobs = -1, verbose = 1, cv = 3) 24 | 25 | n_trains = n_samples / 3 * 2 26 | 27 | # We learn the digits on the first half of the digits 28 | grid_search.fit(data[:n_trains], target[:n_trains]) 29 | 30 | print("Best score: %0.3f" % grid_search.best_score_) 31 | print("Best parameters set:") 32 | best_parameters = grid_search.best_estimator_.get_params() 33 | print best_parameters 34 | 35 | # Now predict the value of the digit on the second half: 36 | expected = target[n_trains:] 37 | predicted = grid_search.best_estimator_.predict(data[n_trains:]) 38 | 39 | print(metrics.classification_report(expected, predicted)) 40 | -------------------------------------------------------------------------------- /w3/w3demo/demo1-python/HDR/src/train_svm.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | from sklearn import datasets, svm, metrics 3 | from sklearn.grid_search import GridSearchCV 4 | 5 | digits = numpy.loadtxt(fname="optdigits.tra", delimiter=',') 6 | n_samples = len(digits) 7 | 8 | data = digits[:,:-1] 9 | target = digits[:,-1] 10 | 11 | param_grid = [ 12 | {'C': [1, 10, 100, 1000], 'kernel': ['linear']}, 13 | {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']}, 14 | ] 15 | 16 | # Create a classifier: a support vector classifier 17 | classifier = svm.SVC() 18 | 19 | grid_search = GridSearchCV(classifier, param_grid, n_jobs = -1, verbose = 1, cv = 5) 20 | 21 | n_trains = n_samples / 3 * 2 22 | 23 | # We learn the digits on the first half of the digits 24 | grid_search.fit(data[:n_trains], target[:n_trains]) 25 | 26 | print("Best score: %0.3f" % grid_search.best_score_) 27 | print("Best parameters set:") 28 | best_parameters = grid_search.best_estimator_.get_params() 29 | print best_parameters 30 | 31 | # Now predict the value of the digit on the second half: 32 | expected = target[n_trains:] 33 | predicted = grid_search.best_estimator_.predict(data[n_trains:]) 34 | 35 | print(metrics.classification_report(expected, predicted)) 36 | -------------------------------------------------------------------------------- /w3/w3demo/demo1/readme.txt: -------------------------------------------------------------------------------- 1 | 大家可以到spark下载页面获取第一次的demo:https://spark.apache.org/downloads.html 2 | 3 | 第一次demo直接用的spark源码。 4 | -------------------------------------------------------------------------------- /w3/w3demo/demo2/custom-serving/.gitignore: -------------------------------------------------------------------------------- 1 | data/sample_movielens_data.txt 2 | manifest.json 3 | target/ 4 | /pio.sbt 5 | pio.log 6 | -------------------------------------------------------------------------------- /w3/w3demo/demo2/custom-serving/build.sbt: -------------------------------------------------------------------------------- 1 | import AssemblyKeys._ 2 | 3 | assemblySettings 4 | 5 | name := "template-scala-parallel-recommendation" 6 | 7 | organization := "io.prediction" 8 | 9 | libraryDependencies ++= Seq( 10 | "io.prediction" %% "core" % "0.9.2" % "provided", 11 | "org.apache.spark" %% "spark-core" % "1.2.0" % "provided", 12 | "org.apache.spark" %% "spark-mllib" % "1.2.0" % "provided") 13 | -------------------------------------------------------------------------------- /w3/w3demo/demo2/custom-serving/data/sample_disabled_items.txt: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yinxusen/AMLI/72619ed24344ad3798d8ef60f2b7352e5b9bcd05/w3/w3demo/demo2/custom-serving/data/sample_disabled_items.txt -------------------------------------------------------------------------------- /w3/w3demo/demo2/custom-serving/engine.json: -------------------------------------------------------------------------------- 1 | { 2 | "id": "default", 3 | "description": "Default settings", 4 | "engineFactory": "org.template.recommendation.RecommendationEngine", 5 | "datasource": { 6 | "params": { 7 | "trainSet": "/Users/panda/data/demo/trainset" 8 | } 9 | }, 10 | "algorithms": [ 11 | { 12 | "name": "als", 13 | "params": { 14 | "rank": 10, 15 | "numIterations": 20, 16 | "lambda": 0.01, 17 | "seed": 3 18 | } 19 | } 20 | ], 21 | "serving": { 22 | "params": { 23 | "filepath": "./data/sample_disabled_items.txt" 24 | } 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /w3/w3demo/demo2/custom-serving/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.11.2") 2 | -------------------------------------------------------------------------------- /w3/w3demo/demo2/custom-serving/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.github.mpeltonen" % "sbt-idea" % "1.6.0") 2 | -------------------------------------------------------------------------------- /w3/w3demo/demo2/custom-serving/src/main/scala/ALSAlgorithm.scala: -------------------------------------------------------------------------------- 1 | package org.template.recommendation 2 | 3 | import io.prediction.controller.PAlgorithm 4 | import io.prediction.controller.Params 5 | import io.prediction.data.storage.BiMap 6 | 7 | import org.apache.spark.SparkContext 8 | import org.apache.spark.SparkContext._ 9 | import org.apache.spark.rdd.RDD 10 | import org.apache.spark.mllib.recommendation.ALS 11 | import org.apache.spark.mllib.recommendation.{Rating => MLlibRating} 12 | import org.apache.spark.mllib.recommendation.ALSModel 13 | 14 | import grizzled.slf4j.Logger 15 | 16 | case class ALSAlgorithmParams( 17 | rank: Int, 18 | numIterations: Int, 19 | lambda: Double, 20 | seed: Option[Long]) extends Params 21 | 22 | class ALSAlgorithm(val ap: ALSAlgorithmParams) 23 | extends PAlgorithm[PreparedData, ALSModel, Query, PredictedResult] { 24 | 25 | @transient lazy val logger = Logger[this.type] 26 | 27 | def train(sc: SparkContext, data: PreparedData): ALSModel = { 28 | // MLLib ALS cannot handle empty training data. 29 | require(!data.ratings.take(1).isEmpty, 30 | s"RDD[Rating] in PreparedData cannot be empty." + 31 | " Please check if DataSource generates TrainingData" + 32 | " and Preprator generates PreparedData correctly.") 33 | // Convert user and item String IDs to Int index for MLlib 34 | val userStringIntMap = BiMap.stringInt(data.ratings.map(_.user)) 35 | val itemStringIntMap = BiMap.stringInt(data.ratings.map(_.item)) 36 | val mllibRatings = data.ratings.map( r => 37 | // MLlibRating requires integer index for user and item 38 | MLlibRating(userStringIntMap(r.user), itemStringIntMap(r.item), r.rating) 39 | ) 40 | 41 | // seed for MLlib ALS 42 | val seed = ap.seed.getOrElse(System.nanoTime) 43 | 44 | // If you only have one type of implicit event (Eg. "view" event only), 45 | // replace ALS.train(...) 
with 46 | //val m = ALS.trainImplicit( 47 | //ratings = mllibRatings, 48 | //rank = ap.rank, 49 | //iterations = ap.numIterations, 50 | //lambda = ap.lambda, 51 | //blocks = -1, 52 | //alpha = 1.0, 53 | //seed = seed) 54 | 55 | val m = ALS.train( 56 | ratings = mllibRatings, 57 | rank = ap.rank, 58 | iterations = ap.numIterations, 59 | lambda = ap.lambda, 60 | blocks = -1, 61 | seed = seed) 62 | 63 | new ALSModel( 64 | rank = m.rank, 65 | userFeatures = m.userFeatures, 66 | productFeatures = m.productFeatures, 67 | userStringIntMap = userStringIntMap, 68 | itemStringIntMap = itemStringIntMap) 69 | } 70 | 71 | def predict(model: ALSModel, query: Query): PredictedResult = { 72 | // Convert String ID to Int index for Mllib 73 | model.userStringIntMap.get(query.user).map { userInt => 74 | // create inverse view of itemStringIntMap 75 | val itemIntStringMap = model.itemStringIntMap.inverse 76 | // recommendProducts() returns Array[MLlibRating], which uses item Int 77 | // index. Convert it to String ID for returning PredictedResult 78 | val itemScores = model.recommendProducts(userInt, query.num) 79 | .map (r => ItemScore(itemIntStringMap(r.product), r.rating)) 80 | new PredictedResult(itemScores) 81 | }.getOrElse{ 82 | logger.info(s"No prediction for unknown user ${query.user}.") 83 | new PredictedResult(Array.empty) 84 | } 85 | } 86 | 87 | } 88 | -------------------------------------------------------------------------------- /w3/w3demo/demo2/custom-serving/src/main/scala/ALSModel.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.mllib.recommendation 2 | // This must be the same package as Spark's MatrixFactorizationModel because 3 | // MatrixFactorizationModel's constructor is private and we are using 4 | // its constructor in order to save and load the model 5 | 6 | import org.template.recommendation.ALSAlgorithmParams 7 | 8 | import io.prediction.controller.IPersistentModel 9 | import io.prediction.controller.IPersistentModelLoader 10 | import io.prediction.data.storage.BiMap 11 | 12 | import org.apache.spark.SparkContext 13 | import org.apache.spark.SparkContext._ 14 | import org.apache.spark.rdd.RDD 15 | 16 | class ALSModel( 17 | override val rank: Int, 18 | override val userFeatures: RDD[(Int, Array[Double])], 19 | override val productFeatures: RDD[(Int, Array[Double])], 20 | val userStringIntMap: BiMap[String, Int], 21 | val itemStringIntMap: BiMap[String, Int]) 22 | extends MatrixFactorizationModel(rank, userFeatures, productFeatures) 23 | with IPersistentModel[ALSAlgorithmParams] { 24 | 25 | def save(id: String, params: ALSAlgorithmParams, 26 | sc: SparkContext): Boolean = { 27 | 28 | sc.parallelize(Seq(rank)).saveAsObjectFile(s"/tmp/${id}/rank") 29 | userFeatures.saveAsObjectFile(s"/tmp/${id}/userFeatures") 30 | productFeatures.saveAsObjectFile(s"/tmp/${id}/productFeatures") 31 | sc.parallelize(Seq(userStringIntMap)) 32 | .saveAsObjectFile(s"/tmp/${id}/userStringIntMap") 33 | sc.parallelize(Seq(itemStringIntMap)) 34 | .saveAsObjectFile(s"/tmp/${id}/itemStringIntMap") 35 | true 36 | } 37 | 38 | override def toString = { 39 | s"userFeatures: [${userFeatures.count()}]" + 40 | s"(${userFeatures.take(2).toList}...)" + 41 | s" productFeatures: [${productFeatures.count()}]" + 42 | s"(${productFeatures.take(2).toList}...)" + 43 | s" userStringIntMap: [${userStringIntMap.size}]" + 44 | s"(${userStringIntMap.take(2)}...)" + 45 | s" itemStringIntMap: [${itemStringIntMap.size}]" + 46 | s"(${itemStringIntMap.take(2)}...)" 47 | } 
48 | } 49 | 50 | object ALSModel 51 | extends IPersistentModelLoader[ALSAlgorithmParams, ALSModel] { 52 | def apply(id: String, params: ALSAlgorithmParams, 53 | sc: Option[SparkContext]) = { 54 | new ALSModel( 55 | rank = sc.get.objectFile[Int](s"/tmp/${id}/rank").first, 56 | userFeatures = sc.get.objectFile(s"/tmp/${id}/userFeatures"), 57 | productFeatures = sc.get.objectFile(s"/tmp/${id}/productFeatures"), 58 | userStringIntMap = sc.get 59 | .objectFile[BiMap[String, Int]](s"/tmp/${id}/userStringIntMap").first, 60 | itemStringIntMap = sc.get 61 | .objectFile[BiMap[String, Int]](s"/tmp/${id}/itemStringIntMap").first) 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /w3/w3demo/demo2/custom-serving/src/main/scala/DataSource.scala: -------------------------------------------------------------------------------- 1 | package org.template.recommendation 2 | 3 | import io.prediction.controller.PDataSource 4 | import io.prediction.controller.EmptyEvaluationInfo 5 | import io.prediction.controller.EmptyActualResult 6 | import io.prediction.controller.Params 7 | import io.prediction.data.storage.Event 8 | import io.prediction.data.storage.Storage 9 | 10 | import org.apache.spark.SparkContext 11 | import org.apache.spark.SparkContext._ 12 | import org.apache.spark.rdd.RDD 13 | 14 | import grizzled.slf4j.Logger 15 | 16 | case class DataSourceParams(trainSet: String) extends Params 17 | case class TrainData(mid: String, uid: String, score: Double, date: String) 18 | 19 | class DataSource(val dsp: DataSourceParams) 20 | extends PDataSource[TrainingData, 21 | EmptyEvaluationInfo, Query, EmptyActualResult] { 22 | 23 | @transient lazy val logger = Logger[this.type] 24 | 25 | override 26 | def readTraining(sc: SparkContext): TrainingData = { 27 | val trainSet = sc.textFile(dsp.trainSet).map(_.split(",")).map { 28 | m => TrainData(m(0), m(1), m(2).toDouble, m(3)) 29 | } 30 | val ratingsRDD = trainSet.map(td => Rating(td.uid, td.mid, td.score)) 31 | 32 | new TrainingData(ratingsRDD) 33 | } 34 | } 35 | 36 | case class Rating( 37 | user: String, 38 | item: String, 39 | rating: Double 40 | ) 41 | 42 | class TrainingData( 43 | val ratings: RDD[Rating] 44 | ) extends Serializable { 45 | override def toString = { 46 | s"ratings: [${ratings.count()}] (${ratings.take(2).toList}...)" 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /w3/w3demo/demo2/custom-serving/src/main/scala/Engine.scala: -------------------------------------------------------------------------------- 1 | package org.template.recommendation 2 | 3 | import io.prediction.controller.IEngineFactory 4 | import io.prediction.controller.Engine 5 | 6 | case class Query( 7 | user: String, 8 | num: Int 9 | ) extends Serializable 10 | 11 | case class PredictedResult( 12 | itemScores: Array[ItemScore] 13 | ) extends Serializable 14 | 15 | case class ItemScore( 16 | item: String, 17 | score: Double 18 | ) extends Serializable 19 | 20 | object RecommendationEngine extends IEngineFactory { 21 | def apply() = { 22 | new Engine( 23 | classOf[DataSource], 24 | classOf[Preparator], 25 | Map("als" -> classOf[ALSAlgorithm]), 26 | classOf[Serving]) 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /w3/w3demo/demo2/custom-serving/src/main/scala/Preparator.scala: -------------------------------------------------------------------------------- 1 | package org.template.recommendation 2 | 3 | import 
io.prediction.controller.PPreparator 4 | 5 | import org.apache.spark.SparkContext 6 | import org.apache.spark.SparkContext._ 7 | import org.apache.spark.rdd.RDD 8 | 9 | class Preparator 10 | extends PPreparator[TrainingData, PreparedData] { 11 | 12 | def prepare(sc: SparkContext, trainingData: TrainingData): PreparedData = { 13 | new PreparedData(ratings = trainingData.ratings) 14 | } 15 | } 16 | 17 | class PreparedData( 18 | val ratings: RDD[Rating] 19 | ) extends Serializable 20 | -------------------------------------------------------------------------------- /w3/w3demo/demo2/custom-serving/src/main/scala/Serving.scala: -------------------------------------------------------------------------------- 1 | package org.template.recommendation 2 | 3 | import io.prediction.controller.LServing 4 | 5 | import scala.io.Source 6 | 7 | import io.prediction.controller.Params // ADDED 8 | 9 | // ADDED ServingParams to specify the blacklisting file location. 10 | case class ServingParams(filepath: String) extends Params 11 | 12 | class Serving(val params: ServingParams) 13 | extends LServing[Query, PredictedResult] { 14 | 15 | override 16 | def serve(query: Query, predictedResults: Seq[PredictedResult]) 17 | : PredictedResult = { 18 | val disabledProducts: Set[String] = Source 19 | .fromFile(params.filepath) 20 | .getLines() 21 | .toSet 22 | 23 | val itemScores = predictedResults.head.itemScores 24 | PredictedResult(itemScores.filter(ps => !disabledProducts(ps.item))) 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /w3/w3demo/demo3/recommend-dish/core/src/main/scala/com/amli/w3/recommend/DataSource.scala: -------------------------------------------------------------------------------- 1 | package com.amli.w3.recommend 2 | 3 | import _root_.io.prediction.controller._ 4 | import org.apache.spark.SparkContext 5 | 6 | import com.amli.w3.recommend.io.{DataLoader, MySQLLoader} 7 | 8 | case class DataSourceParamUnit(sources: Seq[String], users: Seq[String], passwords: Seq[String]) 9 | case class DataSourceParams(params: Seq[DataSourceParamUnit]) extends Params 10 | 11 | class DataSource(val dataSourceParams: DataSourceParams) 12 | extends PDataSource[DataSourceParams, EmptyDataParams, TrainingData, Query, EmptyActualResult] { 13 | 14 | override def readTraining(sc: SparkContext): TrainingData = { 15 | TrainingData { 16 | dataSourceParams.params.map { case DataSourceParamUnit(sources, users, passwords) => 17 | val (mergedRestaurants, mergedData) = sources.zip(users.zip(passwords)) 18 | .foldLeft((Seq.empty[Int], TrainingUnit.empty)) { 19 | case ((restaurantIds, trainingData), (source, (user, password))) => 20 | val sqlLoader = MySQLLoader(sc, source, user, password) 21 | val trans = DataLoader.getTransactions(sqlLoader).collect() 22 | val restaurantId = DataLoader.getRestaurantId(sqlLoader) 23 | (restaurantIds :+ restaurantId, trainingData + TrainingUnit(trans)) 24 | } 25 | (mergedRestaurants, mergedData) 26 | } 27 | } 28 | } 29 | } 30 | 31 | case class TrainingUnit(trans: Seq[(Int, Int)]) extends Serializable { 32 | def +(other: TrainingUnit) = new TrainingUnit(this.trans ++ other.trans) 33 | } 34 | 35 | object TrainingUnit { 36 | def empty = new TrainingUnit(Nil) 37 | } 38 | 39 | case class TrainingData(data: Seq[(Seq[Int], TrainingUnit)]) extends Serializable { 40 | override def toString = s"Training data contains ${data.size} training units. " + 41 | s"The first training unit contains ${data.head._2.trans.size} training data." 
42 | } 43 | -------------------------------------------------------------------------------- /w3/w3demo/demo3/recommend-dish/core/src/main/scala/com/amli/w3/recommend/Engine.scala: -------------------------------------------------------------------------------- 1 | package com.amli.w3.recommend 2 | 3 | import _root_.io.prediction.controller._ 4 | 5 | case class Query( 6 | restaurantId: Int, 7 | users: Array[Int], 8 | personAmount: Int, 9 | expectedConsumePerHead: Int, 10 | mealType: Int, 11 | itemsInCart: Array[Int], 12 | clickedItems: Array[Int], 13 | num: Int 14 | ) extends Serializable 15 | 16 | case class PredictedResult( 17 | recommendItems: Array[RecommendItem] 18 | ) extends Serializable 19 | 20 | case class RecommendItem( 21 | id: Int, 22 | score: Double, 23 | reason: String 24 | ) extends Serializable 25 | 26 | object RecommendationEngine extends IEngineFactory { 27 | override def apply() = { 28 | new Engine( 29 | classOf[DataSource], 30 | classOf[Preparator], 31 | Map("itemtoitem" -> classOf[ItemToItem]), 32 | classOf[Serving]) 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /w3/w3demo/demo3/recommend-dish/core/src/main/scala/com/amli/w3/recommend/ItemToItem.scala: -------------------------------------------------------------------------------- 1 | package com.amli.w3.recommend 2 | 3 | import _root_.io.prediction.controller._ 4 | 5 | import com.amli.w3.recommend.io.DataTransfer._ 6 | import com.amli.w3.recommend.model.ItemSimilarity 7 | 8 | case class ItemToItemParamUnit(k: Int, modelFile: String) 9 | case class ItemToItemParams(params: Seq[ItemToItemParamUnit]) extends Params 10 | 11 | class ItemToItem(val itemToItemParams: ItemToItemParams) 12 | extends PAlgorithm[ItemToItemParams, PreparedData, ItemToItemModel, Query, PredictedResult] { 13 | 14 | override def train(data: PreparedData): ItemToItemModel = { 15 | val models = itemToItemParams.params.zip(data.data).flatMap { 16 | case (ItemToItemParamUnit(k, modelFile), (restaurantIds, TrainingUnit(trans))) => 17 | val model = new ItemSimilarity() 18 | model.configModel(Map("k" -> k.toString, "comment" -> restaurantIds.mkString(","))) 19 | model.trainModel(addVirtualUserToTransaction(trans)) 20 | var i = 0 21 | var res: Seq[(Int, ItemSimilarity, Option[String])] = Nil 22 | while (i < restaurantIds.size) { 23 | if (i == 0) res = res.+:((restaurantIds(i), model, Option(modelFile))) 24 | else res = res.+:((restaurantIds(i), model, None)) 25 | i += 1 26 | } 27 | res 28 | } 29 | ItemToItemModel( 30 | models.map { case (id, model, store) => (id, model) }.toMap, 31 | models.map { case (id, model, store) => (id, store) }.toMap 32 | ) 33 | } 34 | 35 | override def predict(model: ItemToItemModel, query: Query): PredictedResult = { 36 | val results = { 37 | if (!model.models.contains(query.restaurantId)) Nil 38 | else model.models(query.restaurantId) 39 | .recommend(query.users.toSeq, query.itemsInCart.toSeq, query.num) 40 | } 41 | new PredictedResult(results.map (r => RecommendItem(r.item, r.score, r.reason)).toArray) 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /w3/w3demo/demo3/recommend-dish/core/src/main/scala/com/amli/w3/recommend/ItemToItemModel.scala: -------------------------------------------------------------------------------- 1 | package com.amli.w3.recommend 2 | 3 | import _root_.io.prediction.controller._ 4 | import org.apache.spark.SparkContext 5 | 6 | import com.amli.w3.recommend.model.ItemSimilarity 7 | 8 | class 
ItemToItemModel(val models: Map[Int, ItemSimilarity], stores: Map[Int, Option[String]]) 9 | extends IPersistentModel[ItemToItemParams] { 10 | 11 | override def save(id: String, params: ItemToItemParams, sc: SparkContext): Boolean = { 12 | for (restaurantId <- models.keySet) { 13 | val model = models(restaurantId) 14 | val store = stores(restaurantId) 15 | store match { 16 | case Some(something) => model.exportModel(something) 17 | case None => // do nothing 18 | } 19 | } 20 | true 21 | } 22 | } 23 | 24 | object ItemToItemModel extends IPersistentModelLoader[ItemToItemParams, ItemToItemModel] { 25 | 26 | def apply(models: Map[Int, ItemSimilarity], stores: Map[Int, Option[String]] = null) = { 27 | new ItemToItemModel(models, stores) 28 | } 29 | 30 | def parseComment(comment: String): Seq[Int] = { 31 | comment.split(",").map(_.toInt) 32 | } 33 | 34 | override def apply(id: String, params: ItemToItemParams, sc: Option[SparkContext]) = { 35 | ItemToItemModel( 36 | params.params.flatMap { case ItemToItemParamUnit(_, modelFile) => 37 | val model = new ItemSimilarity() 38 | model.importModel(modelFile) 39 | val restaurantIds = parseComment(model.comment) 40 | restaurantIds.map(_ -> model) 41 | }.toMap 42 | ) 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /w3/w3demo/demo3/recommend-dish/core/src/main/scala/com/amli/w3/recommend/Preparator.scala: -------------------------------------------------------------------------------- 1 | package com.amli.w3.recommend 2 | 3 | import _root_.io.prediction.controller._ 4 | import org.apache.spark.SparkContext 5 | 6 | class Preparator extends PPreparator[EmptyPreparatorParams, TrainingData, PreparedData] { 7 | 8 | override def prepare(sc: SparkContext, trainingData: TrainingData): PreparedData = { 9 | PreparedData(trainingData.data) 10 | } 11 | } 12 | 13 | case class PreparedData(data: Seq[(Seq[Int], TrainingUnit)]) extends Serializable { 14 | override def toString = s"Prepared data contains ${data.size} training units." 
15 | } 16 | -------------------------------------------------------------------------------- /w3/w3demo/demo3/recommend-dish/core/src/main/scala/com/amli/w3/recommend/Serving.scala: -------------------------------------------------------------------------------- 1 | package com.amli.w3.recommend 2 | 3 | import _root_.io.prediction.controller._ 4 | 5 | class Serving extends LServing[EmptyServingParams, Query, PredictedResult] { 6 | 7 | override def serve(query: Query, PredictedResults: Seq[PredictedResult]): PredictedResult = { 8 | PredictedResults.head 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /w3/w3demo/demo3/recommend-dish/core/src/test/scala/com/amli/w3/recommend/DataSourceSuite.scala: -------------------------------------------------------------------------------- 1 | package com.amli.w3.recommend 2 | 3 | class DataSourceSuite { 4 | 5 | } 6 | -------------------------------------------------------------------------------- /w3/w3demo/demo3/recommend-dish/datalyze/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Set everything to be logged to the console 2 | log4j.rootCategory=ALL, console 3 | log4j.appender.console=org.apache.log4j.ConsoleAppender 4 | log4j.appender.console.target=System.err 5 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 6 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 7 | 8 | # Settings to quiet third party logs that are too verbose 9 | log4j.logger.org.eclipse.jetty=WARN 10 | log4j.logger.org.eclipse.jetty.util.component.AbstractLifeCycle=ERROR 11 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO 12 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO 13 | -------------------------------------------------------------------------------- /w3/w3demo/demo3/recommend-dish/datalyze/src/main/scala/com/amli/w3/recommend/Logging.scala: -------------------------------------------------------------------------------- 1 | package com.amli.w3.recommend 2 | 3 | import org.apache.log4j.{LogManager, PropertyConfigurator} 4 | import org.slf4j.impl.StaticLoggerBinder 5 | import org.slf4j.{Logger, LoggerFactory} 6 | 7 | /** 8 | * :: DeveloperApi :: 9 | * Utility trait for classes that want to log data. Creates a SLF4J logger for the class and allows 10 | * logging messages at different levels using methods that only evaluate parameters lazily if the 11 | * log level is enabled. 12 | * 13 | * NOTE: DO NOT USE this class outside of Spark. It is intended as an internal utility. 14 | * This will likely be changed or removed in future releases. 
15 | */ 16 | trait Logging { 17 | // Make the log field transient so that objects with Logging can 18 | // be serialized and used on another machine 19 | @transient private var log_ : Logger = null 20 | 21 | // Method to get the logger name for this object 22 | protected def logName = { 23 | // Ignore trailing $'s in the class names for Scala objects 24 | this.getClass.getName.stripSuffix("$") 25 | } 26 | 27 | // Method to get or create the logger for this object 28 | protected def log: Logger = { 29 | if (log_ == null) { 30 | initializeIfNecessary() 31 | log_ = LoggerFactory.getLogger(logName) 32 | } 33 | log_ 34 | } 35 | 36 | // Log methods that take only a String 37 | protected def logInfo(msg: => String) { 38 | if (log.isInfoEnabled) log.info(msg) 39 | } 40 | 41 | protected def logDebug(msg: => String) { 42 | if (log.isDebugEnabled) log.debug(msg) 43 | } 44 | 45 | protected def logTrace(msg: => String) { 46 | if (log.isTraceEnabled) log.trace(msg) 47 | } 48 | 49 | protected def logWarning(msg: => String) { 50 | if (log.isWarnEnabled) log.warn(msg) 51 | } 52 | 53 | protected def logError(msg: => String) { 54 | if (log.isErrorEnabled) log.error(msg) 55 | } 56 | 57 | // Log methods that take Throwables (Exceptions/Errors) too 58 | protected def logInfo(msg: => String, throwable: Throwable) { 59 | if (log.isInfoEnabled) log.info(msg, throwable) 60 | } 61 | 62 | protected def logDebug(msg: => String, throwable: Throwable) { 63 | if (log.isDebugEnabled) log.debug(msg, throwable) 64 | } 65 | 66 | protected def logTrace(msg: => String, throwable: Throwable) { 67 | if (log.isTraceEnabled) log.trace(msg, throwable) 68 | } 69 | 70 | protected def logWarning(msg: => String, throwable: Throwable) { 71 | if (log.isWarnEnabled) log.warn(msg, throwable) 72 | } 73 | 74 | protected def logError(msg: => String, throwable: Throwable) { 75 | if (log.isErrorEnabled) log.error(msg, throwable) 76 | } 77 | 78 | protected def isTraceEnabled(): Boolean = { 79 | log.isTraceEnabled 80 | } 81 | 82 | private def initializeIfNecessary() { 83 | if (!Logging.initialized) { 84 | Logging.initLock.synchronized { 85 | if (!Logging.initialized) { 86 | initializeLogging() 87 | } 88 | } 89 | } 90 | } 91 | 92 | private def initializeLogging() { 93 | // Don't use a logger in here, as this is itself occurring during initialization of a logger 94 | // If Log4j 1.2 is being used, but is not initialized, load a default properties file 95 | val binderClass = StaticLoggerBinder.getSingleton.getLoggerFactoryClassStr 96 | // This distinguishes the log4j 1.2 binding, currently 97 | // org.slf4j.impl.Log4jLoggerFactory, from the log4j 2.0 binding, currently 98 | // org.apache.logging.slf4j.Log4jLoggerFactory 99 | val usingLog4j12 = "org.slf4j.impl.Log4jLoggerFactory".equals(binderClass) 100 | val log4j12Initialized = LogManager.getRootLogger.getAllAppenders.hasMoreElements 101 | if (!log4j12Initialized && usingLog4j12) { 102 | val defaultLogProps = "com/ilc/dec/log4j-defaults.properties" 103 | Option(getClass.getClassLoader.getResource(defaultLogProps)) match { 104 | case Some(url) => 105 | PropertyConfigurator.configure(url) 106 | System.err.println(s"Using Spark's default log4j profile: $defaultLogProps") 107 | case None => 108 | System.err.println(s"Spark was unable to load $defaultLogProps") 109 | } 110 | } 111 | Logging.initialized = true 112 | 113 | // Force a call into slf4j to initialize it. 
Avoids this happening from multiple threads 114 | // and triggering this: http://mailman.qos.ch/pipermail/slf4j-dev/2010-April/002956.html 115 | log 116 | } 117 | } 118 | 119 | private object Logging { 120 | @volatile private var initialized = false 121 | val initLock = new Object() 122 | try { 123 | // We use reflection here to handle the case where users remove the 124 | // slf4j-to-jul bridge order to route their logs to JUL. 125 | val bridgeClass = Class.forName("org.slf4j.bridge.SLF4JBridgeHandler") 126 | bridgeClass.getMethod("removeHandlersForRootLogger").invoke(null) 127 | val installed = bridgeClass.getMethod("isInstalled").invoke(null).asInstanceOf[Boolean] 128 | if (!installed) { 129 | bridgeClass.getMethod("install").invoke(null) 130 | } 131 | } catch { 132 | case e: ClassNotFoundException => // can't log anything yet so just fail silently 133 | } 134 | } 135 | -------------------------------------------------------------------------------- /w3/w3demo/demo3/recommend-dish/datalyze/src/main/scala/com/amli/w3/recommend/io/DataLoader.scala: -------------------------------------------------------------------------------- 1 | package com.amli.w3.recommend.io 2 | 3 | import java.sql.ResultSet 4 | import scala.io.Source._ 5 | 6 | import org.apache.spark.rdd.RDD 7 | 8 | import com.amli.w3.recommend.Logging 9 | import com.amli.w3.recommend.model.{EmptyTransaction, Transaction} 10 | 11 | /** 12 | * Load data from a kind of Loader. Currently only `MySQLLoader` is implemented. Other loaders such 13 | * as `HDFSLoader` will be implemented in next release. 14 | */ 15 | object DataLoader extends Logging { 16 | 17 | /** 18 | * Count all food orders from loader. 19 | */ 20 | def getOrderCount(sqlLoader: MySQLLoader): Int = { 21 | val mapRow: (ResultSet) => Int = rs => rs.getInt(1) 22 | val qStatement = "SELECT count(*) FROM foodorder;" 23 | sqlLoader.query[Int](qStatement, mapRow).first() 24 | } 25 | 26 | /** 27 | * Get all transactions from loader. 28 | */ 29 | def getTransactions(sqlLoader: MySQLLoader): RDD[(Int, Int)] = { 30 | val mapRow: (ResultSet) => (Int, Int) = rs => (rs.getInt(1), rs.getInt(2)) 31 | val qStatement = "SELECT foodid, foodorderid FROM foodorderdetail;" 32 | sqlLoader.query[(Int, Int)](qStatement, mapRow) 33 | } 34 | 35 | /** 36 | * Get the mapping from foodid to foodname. 37 | */ 38 | def getDictionary(sqlLoader: MySQLLoader): RDD[(Int, String)] = { 39 | val mapRow: (ResultSet) => (Int, String) = rs => (rs.getInt(1), rs.getString(2)) 40 | val qStatement = " SELECT foodid, foodname FROM food;" 41 | sqlLoader.query[(Int, String)](qStatement, mapRow) 42 | } 43 | 44 | /** 45 | * Get current restaurant Id. 46 | */ 47 | def getRestaurantId(sqlLoader: MySQLLoader): Int = { 48 | val mapRow: (ResultSet) => Int = rs => rs.getInt(1) 49 | val qStatement = "SELECT companyid FROM company;" 50 | sqlLoader.query[Int](qStatement, mapRow).first() 51 | } 52 | 53 | /** 54 | * Get all tables from a given database. 55 | */ 56 | def getTables(sqlLoader: MySQLLoader): RDD[String] = { 57 | val mapRow: (ResultSet) => String = rs => rs.getString(1) 58 | val qStatement = "SHOW TABLES;" 59 | sqlLoader.query[String](qStatement, mapRow) 60 | } 61 | 62 | /** 63 | * Get a function of `tableName`, which gets all column names of the table. 
64 | */ 65 | def getColumns(sqlLoader: MySQLLoader): (String) => RDD[String] = { 66 | tableName => 67 | val mapRow: (ResultSet) => String = rs => rs.getString(1) 68 | val qStatement = s"SELECT column_name FROM information_schema.columns " + 69 | s"WHERE table_name = $tableName AND table_schema = ${sqlLoader.dbName};" 70 | sqlLoader.query[String](qStatement, mapRow) 71 | } 72 | 73 | /** 74 | * Get a function of `(tableName, columnName)`, which count the non-null elements in this column. 75 | */ 76 | def getColumnNotNullCount(sqlLoader: MySQLLoader): (String, String) => Int = { 77 | (tableName, columnName) => 78 | val mapRow: (ResultSet) => Int = rs => rs.getInt(1) 79 | val qStatement = s"SELECT COUNT($columnName) FROM $tableName WHERE $columnName IS NOT NULL;" 80 | sqlLoader.query[Int](qStatement, mapRow).first() 81 | } 82 | 83 | /** 84 | * Get a function of `(tableName, columnName)`, which count all elements in this column. 85 | */ 86 | def getColumnCount(sqlLoader: MySQLLoader): (String, String) => Int = { 87 | (tableName, columnName) => 88 | val mapRow: (ResultSet) => Int = rs => rs.getInt(1) 89 | val qStatement = s"SELECT COUNT($columnName) FROM $tableName;" 90 | sqlLoader.query[Int](qStatement, mapRow).first() 91 | } 92 | 93 | /** 94 | * Get `(itemId, transactionId)` or `(itemId, transactionId, userId)` from a file. 95 | */ 96 | def getTransDataFromFile(transFile: String): Seq[Transaction] = { 97 | fromFile(transFile).getLines().map { line => 98 | val words = line.split("\\s+") 99 | if (words.size < 2) { 100 | logError(s"Broken line of transaction file: $line") 101 | EmptyTransaction 102 | } else if (words.size == 2) { 103 | Transaction(words(1).toInt, words(1).toInt, words(0).toInt) 104 | } else { 105 | Transaction(words(1).toInt, words(2).toInt, words(0).toInt) 106 | } 107 | }.toSeq 108 | } 109 | 110 | /** 111 | * Get all orders of each user. 112 | */ 113 | def getOrderSetOfUsers(trans: Seq[Transaction]): Map[Int, Seq[Int]] = { 114 | trans.map(tran => (tran.user, tran.item)) 115 | .groupBy(_._1).map { case (u, orders) => 116 | (u, orders.unzip._2) 117 | } 118 | } 119 | 120 | /** 121 | * Get all orders of each transaction. 122 | */ 123 | def getOrderSetOfTransactions(trans: Seq[Transaction]): Map[Int, (Seq[Int], Seq[Int])] = { 124 | trans.map { case tran => (tran.transaction, (tran.user, tran.item)) } 125 | .groupBy(_._1).map { case (t, orders) => 126 | val ui = orders.unzip._2 127 | val users = ui.unzip._1.toSet.toSeq 128 | val items = ui.unzip._2.toSet.toSeq 129 | (t, (users, items)) 130 | } 131 | } 132 | } 133 | -------------------------------------------------------------------------------- /w3/w3demo/demo3/recommend-dish/datalyze/src/main/scala/com/amli/w3/recommend/io/DataTransfer.scala: -------------------------------------------------------------------------------- 1 | package com.amli.w3.recommend.io 2 | 3 | import com.amli.w3.recommend.model.Transaction 4 | 5 | /** 6 | * Transfer data from one format to another. 7 | */ 8 | object DataTransfer { 9 | // TODO: Combine `DataTransfer` with `SchemaRDD` and `ML` package for unified transformation. 10 | 11 | /** 12 | * Map a transaction to a virtual user. 13 | */ 14 | def transactionToVirtualUser(transaction: Int): Int = { 15 | // TODO: make sure that the virtualUser will not be conflict to real user 16 | transaction 17 | } 18 | 19 | /** 20 | * Add virtual users to `(itemId, transactionId)` pairs. 
21 | */ 22 | def addVirtualUserToTransaction(trans: Seq[(Int, Int)]): Seq[Transaction] = { 23 | trans.map { case (item, transaction) => 24 | Transaction(transaction, transactionToVirtualUser(transaction), item) 25 | } 26 | } 27 | 28 | /** 29 | * Remove `userId`s from a sequence of `Transaction`. 30 | */ 31 | def removeUserFromTransaction(trans: Seq[Transaction]): Seq[(Int, Int)] = { 32 | trans.map(tran => (tran.item, tran.transaction)).distinct 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /w3/w3demo/demo3/recommend-dish/datalyze/src/main/scala/com/amli/w3/recommend/io/FeaturedData.scala: -------------------------------------------------------------------------------- 1 | package com.amli.w3.recommend.io 2 | 3 | import org.apache.spark.sql._ 4 | 5 | /** 6 | * Generate a feature data given an RDD of `Row`, which is used for data transformation. 7 | */ 8 | case class FeaturedData(sqlCtx: SQLContext, dat: SchemaRDD, tableName: String) { 9 | dat.registerTempTable(tableName) 10 | 11 | /** 12 | * Given a SQL transformation (with UDF), transform a `FeaturedData` to another one. 13 | */ 14 | def transform(transformer: String, otherTableName: String): FeaturedData = { 15 | new FeaturedData(sqlCtx, sqlCtx.sql(transformer), otherTableName) 16 | } 17 | } 18 | 19 | -------------------------------------------------------------------------------- /w3/w3demo/demo3/recommend-dish/datalyze/src/main/scala/com/amli/w3/recommend/io/MySQLLoader.scala: -------------------------------------------------------------------------------- 1 | package com.amli.w3.recommend.io 2 | 3 | import java.sql.{DriverManager, ResultSet} 4 | import scala.collection.mutable 5 | import scala.reflect.ClassTag 6 | 7 | import org.apache.spark.SparkContext 8 | import org.apache.spark.rdd.RDD 9 | 10 | /** 11 | * Interface that reading data from MySQL. 12 | */ 13 | import com.amli.w3.recommend.Logging 14 | 15 | class MySQLLoader(sc: SparkContext, connStr: String, user: String, password: String) 16 | extends Logging { 17 | 18 | val dbName = connStr.split("/").last 19 | 20 | /** 21 | * Query from a mysql database with a row mapping function. 22 | */ 23 | def query[T: ClassTag]( 24 | sql: String, 25 | mapRow: (ResultSet) => T = MySQLLoader.resultSetToObjectArray _): RDD[T] = { 26 | val conn = DriverManager.getConnection(connStr, user, password) 27 | val stmt = conn.prepareStatement(sql, ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY) 28 | if (conn.getMetaData.getURL.matches("jdbc:mysql:.*")) { 29 | stmt.setFetchSize(Integer.MIN_VALUE) 30 | } 31 | val rs = stmt.executeQuery() 32 | 33 | val arrayBuilder = mutable.ListBuffer[T]() 34 | while(rs.next()) { 35 | arrayBuilder += mapRow(rs) 36 | } 37 | try { 38 | if (null != rs && ! rs.isClosed) { 39 | rs.close() 40 | } 41 | } catch { 42 | case e: Exception => logWarning("Exception closing resultset", e) 43 | } 44 | try { 45 | if (null != stmt && ! stmt.isClosed) { 46 | stmt.close() 47 | } 48 | } catch { 49 | case e: Exception => logWarning("Exception closing statement", e) 50 | } 51 | try { 52 | if (null != conn && ! 
conn.isClosed) { 53 | conn.close() 54 | } 55 | logInfo("closed connection") 56 | } catch { 57 | case e: Exception => logWarning("Exception closing connection", e) 58 | } 59 | sc.parallelize[T](arrayBuilder) 60 | } 61 | } 62 | 63 | object MySQLLoader { 64 | def apply(sc: SparkContext, conn: String, user: String = null, password: String = null) = { 65 | new MySQLLoader(sc, conn, user, password) 66 | } 67 | 68 | def apply(sc: SparkContext, dbMetaData: (String, String, String)) = { 69 | new MySQLLoader(sc, dbMetaData._1, dbMetaData._2, dbMetaData._3) 70 | } 71 | 72 | /** 73 | * Default row mapping function, which maps a row into an array of `Object`. 74 | */ 75 | def resultSetToObjectArray(rs: ResultSet): Array[Object] = { 76 | Array.tabulate[Object](rs.getMetaData.getColumnCount)(i => rs.getObject(i + 1)) 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /w3/w3demo/demo3/recommend-dish/datalyze/src/main/scala/com/amli/w3/recommend/io/Utils.scala: -------------------------------------------------------------------------------- 1 | package com.amli.w3.recommend.io 2 | 3 | object Utils { 4 | val dbConnStr: (String) => String = dbName => s"jdbc:mysql://localhost:3306/$dbName" 5 | val dbNames = Array("jts", "hbo", "qyf", "qjlx", "gwx") 6 | val dbMap = dbNames.map( dbName => dbName -> (dbConnStr(dbName), "root", "root")).toMap 7 | } 8 | -------------------------------------------------------------------------------- /w3/w3demo/demo3/recommend-dish/datalyze/src/main/scala/com/amli/w3/recommend/model/ItemSimilarity.scala: -------------------------------------------------------------------------------- 1 | package com.amli.w3.recommend.model 2 | 3 | import scala.collection.mutable.{Map => MutableMap} 4 | import scala.io.Source.fromFile 5 | import java.io.PrintWriter 6 | import collection.immutable.ListMap 7 | 8 | import com.amli.w3.recommend.io.DataTransfer._ 9 | import com.amli.w3.recommend.io.CliqueStat 10 | 11 | /** 12 | * Item similarity model. 13 | */ 14 | class ItemSimilarity extends Model with ModelLike { 15 | 16 | /** 17 | * Number of items referenced when computing similarity. 18 | */ 19 | var k: Int = 20 20 | 21 | /** 22 | * Similarity matrix with weights. 23 | */ 24 | var similarities = Map[Int, Map[Int, Double]]() 25 | 26 | /** 27 | * A mix-in popularity model for recommendation call-back in case of cold start. 
28 | */ 29 | val popularityModel = new PopularityModel() 30 | 31 | override def trainModel(trans: Seq[Transaction]): ModelLike = { 32 | popularityModel.trainModel(trans) 33 | import popularityModel._ 34 | 35 | val clique = new CliqueStat(removeUserFromTransaction(trans)) 36 | val coMatrix = clique.getNClique(2).flatMap { case (Seq(u, v), c) => 37 | Map((u, v) -> c, (v, u) -> c) 38 | } 39 | 40 | similarities = coMatrix.map { case ((u, v), c) => 41 | ((u, v), c / math.sqrt(popularities(u) * popularities(v))) 42 | }.groupBy(_._1._1).map { case (k, l) => 43 | (k, ListMap(l.map { case ((u, v), c) => 44 | (v, c) 45 | }.toSeq.sortBy(_._2).reverse: _*)) 46 | }.map { case (k, l) => 47 | (k, l.map { case (v, c) => (v, c / l.head._2) }) 48 | } 49 | 50 | this 51 | } 52 | 53 | override def exportModel(outFile: String): ModelLike = { 54 | val out = new PrintWriter(outFile) 55 | out.println(s"comment: $comment") 56 | out.println(s"k: $k") 57 | for ((itemA, relatedItems) <- similarities) { 58 | for ((itemB, sim) <- similarities(itemA)) { 59 | out.println(s"similarity: $itemA $itemB $sim") 60 | } 61 | } 62 | out.close() 63 | popularityModel.exportModel(outFile) 64 | this 65 | } 66 | 67 | override def importModel(inFile: String): ModelLike = { 68 | val tmpSimilarities = MutableMap[Int, Map[Int, Double]]() 69 | 70 | val lineIterator = fromFile(inFile).getLines() 71 | for (line <- lineIterator) { 72 | val words = line.split("\\s+") 73 | if (words(0) == "comment:") comment = words(1) 74 | if (words(0) == "k:") k = words(1).toInt 75 | if (words(0) == "similarity:") { 76 | val (itemA, itemB, sim) = 77 | (words(1).toInt, words(2).toInt, words(3).toDouble) 78 | if (tmpSimilarities.contains(itemA)) { 79 | tmpSimilarities(itemA) += (itemB -> sim) 80 | } else { 81 | tmpSimilarities += (itemA -> Map(itemB -> sim)) 82 | } 83 | } 84 | } 85 | val sortedSimilarities = tmpSimilarities.map { case (user, items) => 86 | (user, ListMap(items.toSeq.sortBy(_._2).reverse:_*)) 87 | } 88 | similarities = sortedSimilarities.toMap 89 | popularityModel.importModel(inFile) 90 | this 91 | } 92 | 93 | override def configModel(params: Map[String, String]): ModelLike = { 94 | params.map { 95 | case ("k", v) => 96 | val tmp = scala.util.Try(v.toInt).toOption 97 | if (tmp.isDefined) k = tmp.get 98 | case ("comment", c) => 99 | comment = c 100 | case _ => 101 | } 102 | this 103 | } 104 | 105 | override def recommend(users: Seq[Int], itemsInCart: Seq[Int], num: Int): Seq[RecommendResult] = { 106 | import popularityModel._ 107 | 108 | var rank = MutableMap[Int, (Double, Int, Double)]() 109 | val allRefItems = (orderHistoryOfUsers(users) ++ itemsInCart).toSet 110 | 111 | for (i <- allRefItems) { 112 | if (similarities.contains(i)) { 113 | for ((j, sim) <- similarities(i).slice(0, k)) { 114 | if (!itemsInCart.contains(j)) { 115 | if (rank.contains(j)) { 116 | if (sim > rank(j)._3) rank(j) = (rank(j)._1 + sim, i, sim) 117 | else rank(j) = (rank(j)._1 + sim, rank(j)._2, rank(j)._3) 118 | } else { 119 | rank += (j -> (sim, i, sim)) 120 | } 121 | } 122 | } 123 | } 124 | } 125 | val results = ListMap(rank.toSeq.sortBy(_._2._1).reverse:_*) 126 | .slice(0, num).map { case (j, (weight, i, wi)) => 127 | if (itemsInCart.contains(i)) { 128 | new RecommendResult(j, weight, s"You ordered $i") 129 | } else { 130 | new RecommendResult(j, weight, s"You ever ordered $i") 131 | } 132 | }.toSeq 133 | results ++ popularityModel.recommend(users, itemsInCart, num - results.size) 134 | } 135 | } 136 | 
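A minimal sketch of driving the `ItemSimilarity` model above on its own, outside the PredictionIO engine; the item/transaction ids, the restaurant id passed as `comment`, and the model path are made-up illustrative values:

    import com.amli.w3.recommend.io.DataTransfer._
    import com.amli.w3.recommend.model.ItemSimilarity

    // (itemId, transactionId) pairs, the same shape DataLoader.getTransactions(...).collect() returns
    val trans: Seq[(Int, Int)] = Seq((7454, 1), (7455, 1), (7454, 2), (7456, 2))

    val model = new ItemSimilarity()
    model.configModel(Map("k" -> "20", "comment" -> "1003")) // neighbourhood size and restaurant id
    model.trainModel(addVirtualUserToTransaction(trans))     // co-occurrence counts come from CliqueStat
    model.exportModel("/tmp/item_to_item.model")             // plain-text dump, read back with importModel

    // top-5 suggestions for an anonymous basket that already holds item 7454
    model.recommend(users = Seq.empty, itemsInCart = Seq(7454), num = 5)
      .foreach(r => println(s"${r.item} ${r.score} ${r.reason}"))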
-------------------------------------------------------------------------------- /w3/w3demo/demo3/recommend-dish/datalyze/src/main/scala/com/amli/w3/recommend/model/Model.scala: -------------------------------------------------------------------------------- 1 | package com.amli.w3.recommend.model 2 | 3 | case class RecommendResult(item: Int, score: Double, reason: String) 4 | case class Transaction(transaction: Int, user: Int, item: Int) 5 | object EmptyTransaction extends Transaction(-1, -1, -1) 6 | 7 | /** 8 | * Defines the behaviors of a `Model`. Any class that implements these interfaces is a `Model`. 9 | */ 10 | trait ModelLike { 11 | def trainModel(trans: Seq[Transaction]): ModelLike 12 | def exportModel(outFile: String): ModelLike 13 | def importModel(inFile: String): ModelLike 14 | def configModel(params: Map[String, String]): ModelLike 15 | def recommend(users: Seq[Int], itemsInCart: Seq[Int], num: Int): Seq[RecommendResult] 16 | } 17 | 18 | /** 19 | * Defines all common attributes of a `Model`. 20 | */ 21 | abstract class Model { 22 | var comment: String = "fake comment" 23 | } 24 | -------------------------------------------------------------------------------- /w3/w3demo/demo3/recommend-dish/datalyze/src/main/scala/com/amli/w3/recommend/model/PopularityModel.scala: -------------------------------------------------------------------------------- 1 | package com.amli.w3.recommend.model 2 | 3 | import java.io.{File, FileWriter, PrintWriter} 4 | import com.amli.w3.recommend.io.CliqueStat 5 | 6 | import scala.io.Source.fromFile 7 | import scala.collection.mutable.{Map => MutableMap} 8 | import scala.collection.immutable.ListMap 9 | 10 | import com.amli.w3.recommend.io.DataTransfer._ 11 | import com.amli.w3.recommend.io.DataLoader._ 12 | 13 | class PopularityModel extends Model with ModelLike { 14 | /** 15 | * Popularity of all items. 16 | */ 17 | var popularities = Map[Int, Int]() 18 | 19 | /** 20 | * Order history of each user or transaction.
21 | */ 22 | var orderHistory = Map[Int, Seq[Int]]() 23 | 24 | override def trainModel(trans: Seq[Transaction]): ModelLike = { 25 | val clique = new CliqueStat(removeUserFromTransaction(trans)) 26 | popularities = ListMap(clique.getNClique(1).map { 27 | case (Seq(u), c) => (u, c) 28 | }.toSeq.sortBy(_._2).reverse:_*).toMap 29 | orderHistory = getOrderSetOfUsers(trans) 30 | this 31 | } 32 | 33 | override def exportModel(outFile: String): ModelLike = { 34 | val out = new PrintWriter(new FileWriter(new File(outFile), true)) 35 | for ((user, ordered) <- orderHistory) { 36 | for (item <- ordered) out.println(s"order: $user $item") 37 | } 38 | for ((item, pop) <- popularities) { 39 | out.println(s"popularity: $item $pop") 40 | } 41 | out.close() 42 | this 43 | } 44 | 45 | override def importModel(inFile: String): ModelLike = { 46 | val tmpOrderHistory = MutableMap[Int, Seq[Int]]() 47 | val tmpPopularities = MutableMap[Int, Int]() 48 | 49 | val lineIterator = fromFile(inFile).getLines() 50 | for (line <- lineIterator) { 51 | val words = line.split("\\s+") 52 | if (words(0) == "order:") { 53 | val (user, item) = (words(1).toInt, words(2).toInt) 54 | if (tmpOrderHistory.contains(user)) tmpOrderHistory(user) ++= Seq(item) 55 | else tmpOrderHistory += (user -> Seq(item)) 56 | } 57 | if (words(0) == "popularity:") { 58 | val (item, pop) = (words(1).toInt, words(2).toInt) 59 | tmpPopularities += (item -> pop) 60 | } 61 | } 62 | val sorted = ListMap(tmpPopularities.toSeq.sortBy(_._2).reverse:_*) 63 | orderHistory = tmpOrderHistory.toMap 64 | popularities = sorted.toMap 65 | this 66 | } 67 | 68 | override def configModel(params: Map[String, String]): ModelLike = { this } 69 | 70 | override def recommend(users: Seq[Int], 71 | itemsInCart: Seq[Int], 72 | num: Int): Seq[RecommendResult] = { 73 | popularities.slice(0, num).map { case (item, pop) => 74 | new RecommendResult(item, pop.toDouble, "Most popular") 75 | }.toSeq 76 | } 77 | 78 | def orderHistoryOfUsers(users: Seq[Int]): Seq[Int] = { 79 | users.flatMap { case user => orderHistory.getOrElse(user, Nil) } 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /w3/w3demo/demo3/recommend-dish/datalyze/src/main/scala/com/amli/w3/recommend/package.scala: -------------------------------------------------------------------------------- 1 | package com.amli.w3 2 | 3 | import org.apache.spark.sql.SQLContext 4 | import org.apache.spark.{SparkContext, SparkConf} 5 | 6 | package object recommend { 7 | 8 | object scLocal { 9 | val scConf = new SparkConf().setMaster("local[4]").setAppName("Assist Spark Context") 10 | val sc = new SparkContext(scConf) 11 | val sqlCtx = new SQLContext(sc) 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /w3/w3demo/demo3/recommend-dish/datalyze/src/test/scala/com/amli/w3/recommend/io/CliqueStatSuite.scala: -------------------------------------------------------------------------------- 1 | package com.amli.w3.recommend.io 2 | 3 | import org.scalatest.{Matchers, FunSuite} 4 | 5 | class CliqueStatSuite extends FunSuite with Matchers { 6 | test("test get n cliques") { 7 | val transactions = Seq( 8 | 1 -> 1, 9 | 1 -> 2, 10 | 1 -> 3, 11 | 1 -> 4, 12 | 2 -> 2, 13 | 2 -> 3, 14 | 2 -> 4, 15 | 3 -> 3, 16 | 3 -> 4, 17 | 4 -> 4 18 | ) 19 | val cliques = new CliqueStat(transactions) 20 | val twoCliques = cliques.getNClique(2) 21 | val realTwoCliquesResult = Map( 22 | (List(3, 4),1), 23 | (List(1, 2),3), 24 | (List(2, 3),2), 25 | (List(1, 4),1), 26 | 
(List(2, 4),1), 27 | (List(1, 3),2) 28 | ) 29 | twoCliques should equal (realTwoCliquesResult) 30 | 31 | val threeCliques = cliques.getNClique(3) 32 | val realThreeCliquesResult = Map( 33 | (List(1, 2, 4),1), 34 | (List(2, 3, 4),1), 35 | (List(1, 2, 3),2), 36 | (List(1, 3, 4),1) 37 | ) 38 | threeCliques should equal (realThreeCliquesResult) 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /w3/w3demo/demo3/recommend-dish/datalyze/src/test/scala/com/amli/w3/recommend/io/FeaturedDataSuite.scala: -------------------------------------------------------------------------------- 1 | package com.amli.w3.recommend.io 2 | 3 | import org.scalatest.{Matchers, FunSuite} 4 | 5 | import com.amli.w3.recommend._ 6 | 7 | case class DataPoint(xPoint: Int, yPoint: Int) 8 | 9 | class FeaturedDataSuite extends FunSuite with Matchers { 10 | test("test the functionality of SchemaRDD in FeaturedData") { 11 | val dataLocal = Seq( 12 | DataPoint(1, 2), 13 | DataPoint(3, 4), 14 | DataPoint(5, 6), 15 | DataPoint(7, 8) 16 | ) 17 | 18 | import scLocal._ 19 | import sqlCtx._ 20 | 21 | val data = sc.parallelize[DataPoint](dataLocal) 22 | val feature = FeaturedData(sqlCtx, data.toSchemaRDD, "Graph") 23 | 24 | val avgPoint: (Int, Int) => Double = (l, r) => (l + r) / 2.0 25 | sqlCtx.registerFunction("avgPoint", avgPoint) 26 | val avg = feature.transform(s"select avgPoint(xPoint, yPoint) from Graph", "AVG") 27 | avg.dat.map(_.getDouble(0)).collect() should contain allOf (1.5, 3.5, 5.5, 7.5) 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /w3/w3demo/demo3/recommend-dish/datalyze/src/test/scala/com/amli/w3/recommend/io/MySQLLoaderSuite.scala: -------------------------------------------------------------------------------- 1 | package com.amli.w3.recommend.io 2 | 3 | import java.sql.{SQLException, DriverManager, ResultSet} 4 | import org.scalatest.{BeforeAndAfter, Matchers, FunSuite} 5 | 6 | import com.amli.w3.recommend._ 7 | 8 | class MySQLLoaderSuite extends FunSuite with Matchers with BeforeAndAfter { 9 | before { 10 | Class.forName("org.apache.derby.jdbc.EmbeddedDriver") 11 | val conn = DriverManager.getConnection("jdbc:derby:target/MySQLLoaderSuiteDB;create=true") 12 | try { 13 | val create = conn.createStatement 14 | create.execute(""" 15 | CREATE TABLE FOO( 16 | ID INTEGER NOT NULL GENERATED ALWAYS AS IDENTITY (START WITH 1, INCREMENT BY 1), 17 | DATA INTEGER 18 | )""") 19 | create.close() 20 | val insert = conn.prepareStatement("INSERT INTO FOO(DATA) VALUES(?)") 21 | (1 to 100).foreach { i => 22 | insert.setInt(1, i * 2) 23 | insert.executeUpdate 24 | } 25 | insert.close() 26 | } catch { 27 | case e: SQLException if e.getSQLState == "X0Y32" => 28 | // table exists 29 | } finally { 30 | conn.close() 31 | } 32 | } 33 | 34 | test("test mysql data loader") { 35 | import scLocal._ 36 | val mapRow: (ResultSet) => (Int, Int) = rs => (rs.getInt(1), rs.getInt(2)) 37 | val conn = "jdbc:derby:target/MySQLLoaderSuiteDB;create=true" 38 | val qStatement = "SELECT ID, DATA FROM FOO WHERE ID=10" 39 | val data = MySQLLoader(sc, conn).query[(Int, Int)](qStatement, mapRow) 40 | data.collect() should contain only ((10, 20)) 41 | } 42 | 43 | after { 44 | try { 45 | DriverManager.getConnection("jdbc:derby:;shutdown=true") 46 | } catch { 47 | case se: SQLException if se.getSQLState == "XJ015" => 48 | // normal shutdown 49 | } 50 | } 51 | } 52 | -------------------------------------------------------------------------------- 
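For context, a sketch of how the `MySQLLoader` exercised above is wired together with `Utils` and `DataLoader` against the project's real databases; this assumes the hard-coded localhost databases and root/root credentials in `io.Utils` are actually reachable:

    import com.amli.w3.recommend.scLocal._
    import com.amli.w3.recommend.io.{DataLoader, MySQLLoader, Utils}

    // one loader per configured database; dbMap maps a name to (jdbc url, user, password)
    Utils.dbNames.foreach { name =>
      val loader = MySQLLoader(sc, Utils.dbMap(name))
      val restaurantId = DataLoader.getRestaurantId(loader)
      val trans = DataLoader.getTransactions(loader).collect() // (foodid, foodorderid) pairs
      println(s"$name: restaurant $restaurantId, ${trans.length} order lines")
    }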
/w3/w3demo/demo3/recommend-dish/datalyze/src/test/scala/com/amli/w3/recommend/model/CommunityModelSuite.scala: -------------------------------------------------------------------------------- 1 | package com.amli.w3.recommend.model 2 | 3 | import java.io.PrintWriter 4 | 5 | import org.scalatest.{BeforeAndAfter, FunSuite} 6 | 7 | import com.amli.w3.recommend.io.DataLoader._ 8 | import com.amli.w3.recommend.scLocal._ 9 | 10 | class CommunityModelSuite extends FunSuite with BeforeAndAfter { 11 | val lambda = 0.8 12 | val numIterations = 10 13 | val transFile = "/tmp/transaction.txt" 14 | val modelFile = "/tmp/model.txt" 15 | 16 | before { 17 | val out = new PrintWriter(transFile) 18 | for (i <- 1 to 100) { 19 | out.println(s"${i % 20} ${i % 80}") 20 | } 21 | out.close() 22 | } 23 | 24 | test("test train model") { 25 | val model = new CommunityModel(sc) 26 | val trans = getTransDataFromFile(transFile) 27 | model.configModel(Map(("lambda", lambda.toString), ("numIterations", numIterations.toString))) 28 | assert(model.lambda === lambda) 29 | assert(model.numIterations === numIterations) 30 | 31 | model.trainModel(trans) 32 | 33 | model.exportModel(modelFile) 34 | val readModel = new CommunityModel(sc) 35 | readModel.importModel(modelFile) 36 | 37 | assert(model.lambda === readModel.lambda) 38 | // TODO: how to test model training? 39 | } 40 | 41 | after { 42 | /** Not need to delete temporal files manually */ 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /w3/w3demo/demo3/recommend-dish/datalyze/src/test/scala/com/amli/w3/recommend/model/ItemSimilaritySuite.scala: -------------------------------------------------------------------------------- 1 | package com.amli.w3.recommend.model 2 | 3 | import java.io.PrintWriter 4 | import org.scalatest.{BeforeAndAfter, FunSuite, Matchers} 5 | import com.amli.w3.recommend.io.DataLoader._ 6 | import com.amli.w3.recommend.model.Utils._ 7 | 8 | class ItemSimilaritySuite extends FunSuite with Matchers with BeforeAndAfter { 9 | 10 | val config = Config(transFile = "/tmp/transaction.txt", modelFile = "/tmp/model.txt") 11 | 12 | before { 13 | val out = new PrintWriter(config.transFile) 14 | for (i <- 1 to 100) { 15 | out.println(s"${i % 20} ${i % 80}") 16 | } 17 | out.close() 18 | } 19 | 20 | test("test train model") { 21 | val model = new ItemSimilarity() 22 | val trans = getTransDataFromFile(config.transFile) 23 | model.configModel(Map(("K", config.k.toString))) 24 | assert(model.k === config.k) 25 | 26 | model.trainModel(trans) 27 | 28 | model.exportModel(config.modelFile) 29 | val readModel = new ItemSimilarity() 30 | readModel.importModel(config.modelFile) 31 | 32 | assert(model.k === readModel.k) 33 | model.similarities should equal(readModel.similarities) 34 | model.popularityModel.popularities should equal (readModel.popularityModel.popularities) 35 | model.popularityModel.orderHistory should equal (readModel.popularityModel.orderHistory) 36 | } 37 | 38 | after { 39 | /** Not need to delete temporal files manually */ 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /w3/w3demo/demo3/recommend-dish/datalyze/src/test/scala/com/amli/w3/recommend/model/PregelUnfoldingSuite.scala: -------------------------------------------------------------------------------- 1 | package com.amli.w3.recommend.model 2 | 3 | import java.io.PrintWriter 4 | import org.scalatest.{BeforeAndAfter, Matchers, FunSuite} 5 | 6 | import org.apache.spark.graphx.Graph 7 | 8 | import 
com.amli.w3.recommend.scLocal._ 9 | 10 | class PregelUnfoldingSuite extends FunSuite with Matchers with BeforeAndAfter { 11 | 12 | val transFile = "/tmp/transaction.txt" 13 | 14 | before { 15 | val out = new PrintWriter(transFile) 16 | for (i <- 1 to 100) { 17 | out.println(s"${i % 20} ${i % 80}") 18 | } 19 | out.close() 20 | } 21 | 22 | test("test pregel unfolding") { 23 | val rawEdges = sc.textFile(transFile, 2).map(s => 24 | s.split("\\s+").head.toLong -> s.split("\\s+").last.toLong) 25 | val graph = Graph.fromEdgeTuples(rawEdges, -1) 26 | val puGraph = PregelUnfolding.run(graph, 5) 27 | 28 | // TODO: What should I do for test its result? 29 | for ((a, NodeAttr(b, c, d, e, f)) <- puGraph.vertices.collect()) { 30 | println(s"my id is $a") 31 | println(s"my neighbors are ${b.mkString(",")}") 32 | println(s"my community is ${c.mkString(",")}") 33 | println(s"my outer links count is $d") 34 | println(s"my inner links count is $e") 35 | println(s"largest mod gain is $f") 36 | println() 37 | } 38 | } 39 | 40 | after { /** do nothing*/ } 41 | } 42 | -------------------------------------------------------------------------------- /w3/w3demo/demo3/recommend-dish/engine.json: -------------------------------------------------------------------------------- 1 | { 2 | "id": "default", 3 | "description": "Default settings", 4 | "engineFactory": "com.amli.w3.recommend.RecommendationEngine", 5 | "datasource": { 6 | "params": [ 7 | { 8 | "sources": ["jdbc:mysql://localhost:3306/jts", "jdbc:mysql://localhost:3306/qyf"], 9 | "users": ["root", "root"], 10 | "passwords": ["root", "root"] 11 | }, 12 | { 13 | "sources": ["jdbc:mysql://localhost:3306/hbo"], 14 | "users": ["root"], 15 | "passwords": ["root"] 16 | } 17 | ] 18 | }, 19 | "algorithms": [ 20 | { 21 | "name": "itemtoitem", 22 | "params": { 23 | "params": [ 24 | { 25 | "k": 20, 26 | "modelFile": "./model/ItemToItem_jts_20141031" 27 | }, 28 | { 29 | "k": 10, 30 | "modelFile": "./model/ItemToItem_hbo_20141031" 31 | } 32 | ] 33 | } 34 | 35 | } 36 | ] 37 | } 38 | -------------------------------------------------------------------------------- /w3/w3demo/demo3/recommend-dish/make-distribution.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | FWDIR="$(cd `dirname $0`; pwd)" 6 | DISTDIR="$FWDIR/dist" 7 | mkdir -p $DISTDIR/model 8 | 9 | echo "Building binary distribution for Inspeed" 10 | 11 | cd $FWDIR 12 | 13 | pio build --sbt-extra assembly 14 | 15 | cp -r target $DISTDIR 16 | cp engine.json $DISTDIR 17 | 18 | echo "Inspeed binary distribution created at $DISTDIR" 19 | -------------------------------------------------------------------------------- /w3/w3demo/demo3/recommend-dish/project/build.scala: -------------------------------------------------------------------------------- 1 | import sbt._ 2 | import Keys._ 3 | import sbtassembly.Plugin._ 4 | 5 | object MyBuild extends Build { 6 | 7 | val versionOfScala = "2.10.4" 8 | 9 | lazy val demo = project.in(file(".")) 10 | .settings(name := "demo", version := "0.1", scalaVersion := versionOfScala) 11 | .settings(assemblySettings: _*) 12 | .aggregate(core, datalyze) 13 | .dependsOn(core, datalyze) 14 | 15 | val commonSharingLibs = Seq( 16 | "org.apache.spark" %% "spark-core" % "1.1.0" % "provided", 17 | "org.apache.spark" %% "spark-mllib" % "1.1.0" % "provided" 18 | ) 19 | 20 | lazy val core = project.in(file("core")).settings( 21 | scalaVersion := versionOfScala, 22 | libraryDependencies ++= Seq( 23 | "io.prediction" %% "core" 
% "0.8.1" % "provided" 24 | ) ++ commonSharingLibs 25 | ).dependsOn(datalyze) 26 | 27 | lazy val datalyze = project.in(file("datalyze")).settings( 28 | scalaVersion := versionOfScala, 29 | libraryDependencies ++= Seq( 30 | "org.scalatest" %% "scalatest" % "2.2.0" % "test", 31 | "org.apache.derby" % "derby" % "10.11.1.1" % "test", 32 | "org.apache.spark" % "spark-graphx_2.10" % "1.1.0" % "provided", 33 | "org.apache.spark" % "spark-sql_2.10" % "1.1.0" % "provided", 34 | "mysql" % "mysql-connector-java" % "5.1.28", 35 | "com.github.scopt" %% "scopt" % "3.2.0" 36 | ) ++ commonSharingLibs 37 | ) 38 | } 39 | 40 | -------------------------------------------------------------------------------- /w3/w3demo/demo3/recommend-dish/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.11.2") 2 | 3 | addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "2.2.0") 4 | 5 | addSbtPlugin("com.github.mpeltonen" % "sbt-idea" % "1.6.0") 6 | -------------------------------------------------------------------------------- /w3/w3demo/demo3/recommend-dish/requests.sh: -------------------------------------------------------------------------------- 1 | curl -H "Content-Type: application/json" -X POST -d '{"restaurantId": 1003, "users": [123], "personAmount": 1, "expectedConsumePerHead": 20, "mealType": 0, "itemsInCart": [7454, 7455], "clickedItems": [], "num": 5}' http://localhost:8000/queries.json 2 | --------------------------------------------------------------------------------
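The JSON body in requests.sh mirrors, field for field, the `Query` case class in core/src/main/scala/com/amli/w3/recommend/Engine.scala, and the response wraps `RecommendItem(id, score, reason)` values in a `PredictedResult`. For reference, the same query expressed directly against the Scala API, with the values copied from the curl command above:

    import com.amli.w3.recommend.Query

    val query = Query(
      restaurantId = 1003,
      users = Array(123),
      personAmount = 1,
      expectedConsumePerHead = 20,
      mealType = 0,
      itemsInCart = Array(7454, 7455),
      clickedItems = Array.empty[Int],
      num = 5)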