├── .gitignore ├── README.md ├── Schedule_Chinese.md ├── license.md ├── spark-summit-15.md ├── w1 └── lecture_note.md ├── w2 └── lecture_note.md └── w3 └── w3demo ├── demo1-python └── HDR │ ├── README.md │ ├── papers │ ├── 00b495249dfb217637000000.pdf │ ├── 10.1.1.110.4774.pdf │ ├── 1412.8307v1.pdf │ ├── Lecture21.pdf │ ├── Margulis-OpticalCharacterRecognition.pdf │ └── mdp_paper.pdf │ └── src │ ├── train_flow.py │ ├── train_flow_sklearn.py │ ├── train_knn.py │ ├── train_lr.py │ ├── train_rbm.py │ ├── train_rf.py │ └── train_svm.py ├── demo1 └── readme.txt ├── demo2 └── custom-serving │ ├── .gitignore │ ├── build.sbt │ ├── data │ └── sample_disabled_items.txt │ ├── engine.json │ ├── project │ ├── assembly.sbt │ └── plugins.sbt │ └── src │ └── main │ └── scala │ ├── ALSAlgorithm.scala │ ├── ALSModel.scala │ ├── DataSource.scala │ ├── Engine.scala │ ├── Preparator.scala │ └── Serving.scala └── demo3 └── recommend-dish ├── core └── src │ ├── main │ └── scala │ │ └── com │ │ └── amli │ │ └── w3 │ │ └── recommend │ │ ├── DataSource.scala │ │ ├── Engine.scala │ │ ├── ItemToItem.scala │ │ ├── ItemToItemModel.scala │ │ ├── Preparator.scala │ │ └── Serving.scala │ └── test │ └── scala │ └── com │ └── amli │ └── w3 │ └── recommend │ └── DataSourceSuite.scala ├── datalyze └── src │ ├── main │ ├── resources │ │ └── log4j.properties │ └── scala │ │ └── com │ │ └── amli │ │ └── w3 │ │ └── recommend │ │ ├── Logging.scala │ │ ├── io │ │ ├── DataLoader.scala │ │ ├── DataTransfer.scala │ │ ├── FeaturedData.scala │ │ ├── MySQLLoader.scala │ │ └── Utils.scala │ │ ├── model │ │ ├── ItemSimilarity.scala │ │ ├── Model.scala │ │ └── PopularityModel.scala │ │ └── package.scala │ └── test │ └── scala │ └── com │ └── amli │ └── w3 │ └── recommend │ ├── io │ ├── CliqueStatSuite.scala │ ├── FeaturedDataSuite.scala │ └── MySQLLoaderSuite.scala │ └── model │ ├── CommunityModelSuite.scala │ ├── ItemSimilaritySuite.scala │ └── PregelUnfoldingSuite.scala ├── engine.json ├── make-distribution.sh ├── project ├── build.scala └── plugins.sbt └── requests.sh /.gitignore: -------------------------------------------------------------------------------- 1 | *.tgz 2 | manifest.json 3 | *.tar.gz 4 | *.csv 5 | *~ 6 | *.swp 7 | *.ipr 8 | *.iml 9 | *.iws 10 | .idea/ 11 | sbt/*.jar 12 | .settings 13 | .cache 14 | .generated-mima* 15 | /build/ 16 | work/ 17 | out/ 18 | .DS_Store 19 | third_party/libmesos.so 20 | third_party/libmesos.dylib 21 | conf/java-opts 22 | conf/spark-env.sh 23 | conf/streaming-env.sh 24 | conf/log4j.properties 25 | conf/spark-defaults.conf 26 | conf/hive-site.xml 27 | docs/_site 28 | docs/api 29 | target/ 30 | reports/ 31 | .project 32 | .classpath 33 | .scala_dependencies 34 | lib_managed/ 35 | src_managed/ 36 | project/boot/ 37 | project/plugins/project/build.properties 38 | project/build/target/ 39 | project/plugins/target/ 40 | project/plugins/lib_managed/ 41 | project/plugins/src_managed/ 42 | logs/ 43 | log/ 44 | spark-tests.log 45 | streaming-tests.log 46 | dependency-reduced-pom.xml 47 | .ensime 48 | .ensime_lucene 49 | checkpoint 50 | derby.log 51 | dist/ 52 | spark-*-bin.tar.gz 53 | unit-tests.log 54 | /lib/ 55 | rat-results.txt 56 | scalastyle.txt 57 | conf/*.conf 58 | scalastyle-output.xml 59 | 60 | # For Hive 61 | metastore_db/ 62 | metastore/ 63 | warehouse/ 64 | TempStatsStore/ 65 | sql/hive-thriftserver/test_warehouses 66 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 
# Applied Machine Learning and Implementation 2 | 3 | ## Overview 4 | 5 | 12 weeks, 2 hours per week. 6 | 7 | 20 min per episode, so six episodes per week. 8 | 9 | This course will cover: 10 | 11 | \*\*\*\*\* **Spark MLlib** 12 | 13 | \*\*\*\* **ML Pipeline and GraphX** 14 | 15 | \*\*\* **Spark Core and Spark SQL** 16 | 17 | \*\* **Spark Streaming** 18 | 19 | \* **Scikit-learn for reference.** 20 | 21 | ## Textbooks 22 | 23 | 1. Advanced Analytics with Spark 24 | 2. Machine Learning with Spark 25 | 3. The Lion Way: Machine Learning plus Intelligent Optimization 26 | 4. Others... 27 | 28 | ## week 1 Introduction 29 | 30 | 1. Spark ABC 31 | 2. Machine learning ABC 32 | 3. Graph Computing ABC 33 | 4. Demos for Spark, MLlib, and GraphX 34 | 35 | ## week 2 Generalized Linear Model 36 | 37 | 1. Logistic regression 38 | 2. Linear regression 39 | 3. SVM 40 | 4. LASSO 41 | 5. Ridge regression 42 | 6. Applied demos such as handwritten digit recognition 43 | 44 | ## week 3 Recommendation 45 | 46 | 1. Recommendation with ALS 47 | 2. Singular Value Decomposition 48 | 3. The implementation in both MLlib and Mahout 49 | 4. Applied demo of recommendation with PredictionIO. 50 | 51 | ## week 4 Clustering 52 | 53 | 1. k-means 54 | 2. LDA 55 | 3. Applied demo of geo-location clustering and topic modeling 56 | 57 | ## week 5 Streaming Machine Learning 58 | 59 | 1. Lambda Architecture 60 | 2. Parameter Server 61 | 3. Several streaming algorithms from the Freeman Lab 62 | 4. Applied demo such as the zebrafish experiment 63 | 64 | ## week 6 ML Pipeline 65 | 66 | 1. Pipeline of Scikit-learn 67 | 2. Pipeline of Spark (DataFrame, ML Pipeline, etc.) 68 | 3. Applied demo (TBD) 69 | 70 | ## week 7 Scientific Computing 71 | 72 | 1. Scientific computing and caveats of matrix computation 73 | 2. Matrix libs (in C/Fortran and Java) 74 | 3. Matrices in MLlib 75 | 4. Applied demo (TBD) 76 | 77 | ## week 8 The Graph Computation Model 78 | 79 | 1. Graph computing and libs 80 | 2. Revisit LDA and ALS 81 | 3. Applied demo such as community detection for food network/recommendation. 82 | 83 | ## week 9 Tree Model and Boosting 84 | 85 | 1. Tree model 86 | 2. Random forest 87 | 3. Ensembles in Kaggle and in practice 88 | 4. Applied demo for ensembles 89 | 90 | ## week 10 Evaluation 91 | 92 | 1. Evaluation methods 93 | 2. Implementations in MLlib 94 | 3. Online / offline evaluations 95 | 96 | ## week 11 Optimization in Parallel 97 | 98 | 1. Commonly used optimization algorithms 99 | 2. The inherently sequential nature of optimization algorithms 100 | 3. Computation models: from BSP to BSP+ to SSP 101 | 4. Future directions 102 | 103 | ## week 12 Rethinking practical machine learning and how to build a good system 104 | 105 | 1. The one-two-three of practical ML 106 | 2. Rethinking practical machine learning 107 | 3. How to build a great machine learning system? 108 | 4. Compare with Mahout / Oryx2 / VM / ...
109 | 110 | ## Survey of Advanced Analytics with Spark 111 | 112 | | Chapter | Topic | Algorithms | Dataset | Source | 113 | |:-----:|:-----:|:-----:|:-----:|:-----:| 114 | | 2 | Record Linkage | Entity resolution, record dedup, merge-and-purge, list washing | Some business data such as TCPDS | UCI ML repo | 115 | | 3 | Recommending | ALS | Who plays what or who rates what | Audioscrobbler | 116 | | 4 | Predicting Forest Cover | Decision Tree | The type of forest covering parcels of land in Colorado | UCI ML repo | 117 | | 5 | Anomaly detection in network traffic | K-means | Network intrusion data | KDD Cup 1999 Dataset | 118 | | 6 | Understanding Wikipedia | Latent Semantic Analysis, SVD, TF-IDF, etc. | Wikipedia texts | Wikipedia | 119 | | 7 | Analyzing Co-occurrence Networks | Massive graph algorithms in GraphX | MEDLINE citation index | US National Library of Medicine | 120 | | 8 | Geo and Temporal data analysis | Building sessions | New York Taxicab Data | New York City Taxi and Limousine Commission | 121 | | 9 | Estimating Financial Risk | Monte Carlo Simulation | Stock Data | Yahoo! | 122 | | 10 | Analyzing Genomic Data | Massive genome analysis algorithms | Genome data | NCBI | 123 | | 11 | Analyzing Neuroimaging Data | Thunder | Images of zebrafish brains | Thunder repository | 124 | 125 | ## Structure of directories 126 | 127 | /src/chapterx --> The code snippets of each chapter 128 | 129 | /src/chapterx/{java, python, scala} --> Code snippets written with Mahout, Scikit-learn, and Spark 130 | 131 | ## Spark VS Scikit-learn 132 | 133 | ### Algorithms 134 | 135 | | Type | Algorithm | Scikit-learn | Spark | 136 | |:-----------:|:----------:|:----------:|:----------:| 137 | |Classification| Logistic Regression | YES | YES 138 | |Classification| Perceptron | YES | 139 | |Classification| Passive Aggressive Algorithms | YES 140 | |Classification| SVM | YES | YES 141 | |Classification| Naive Bayes | YES | YES 142 | |Classification| Decision Tree | YES | YES 143 | |Classification| Ensemble methods | YES | YES 144 | |Classification| Label Propagation | YES | YES (in GraphX) 145 | |Classification| LDA and QDA | YES | 146 | |Regression| Ordinary Least Squares | YES | YES 147 | |Regression| Ridge Regression | YES | YES 148 | |Regression| LASSO | YES | YES 149 | |Regression| Elastic Net | YES 150 | |Regression| Multi-task LASSO | YES 151 | |Regression| Least Angle Regression | YES 152 | |Regression| LARS LASSO | YES 153 | |Regression| Orthogonal Matching Pursuit | YES 154 | |Regression| Bayesian Regression | YES 155 | |Regression| Polynomial Regression | YES 156 | |Regression| Nearest Neighbor | YES | YES 157 | |Regression| Gaussian Process | YES 158 | |Regression| Isotonic Regression | YES 159 | |Clustering| K-means | YES | YES 160 | |Clustering| Affinity Propagation | YES 161 | |Clustering| Mean shift | YES 162 | |Clustering| Spectral Clustering | YES 163 | |Clustering| Ward | YES 164 | |Clustering| Agglomerative clustering | YES 165 | |Clustering| DBSCAN | YES 166 | |Clustering| Gaussian Mixtures | YES 167 | |Dimension Reduction| PCA | YES | YES 168 | |Dimension Reduction| SVD / LSA | YES | YES 169 | |Dimension Reduction| Dictionary Learning | YES 170 | |Dimension Reduction| Factor Analysis | YES 171 | |Dimension Reduction| ICA | YES 172 | |Dimension Reduction| NMF | YES 173 | |Model Selection| Cross Validation | YES | YES 174 | |Model Selection| Grid Search | YES 175 | |Model Selection| Pipeline | YES | YES 176 | |Model Selection| Feature Union | YES | YES 177 | |Model Selection| 
Model Evaluation | YES | YES 178 | |Model Selection| Model Presistence | YES 179 | |Model Selection| Validation Curves | YES 180 | |Preprocessing| Standardization | YES | YES 181 | |Preprocessing| Encoding categorical features | YES | YES (dependency) 182 | |Preprocessing| Binarization | YES 183 | |Preprocessing| Normalization | YES | YES 184 | |Preprocessing| Label preprocessing | YES 185 | |Preprocessing| Imputation of missing values | YES 186 | |Preprocessing| Unsupervised data reduction | YES 187 | -------------------------------------------------------------------------------- /Schedule_Chinese.md: -------------------------------------------------------------------------------- 1 | # 实践机器学习算法详解及工程实现 2 | 3 | ## 概述 4 | 5 | 本课程共12周,每周两小时。每个知识片段20分钟,每周共计6个视频片段。 6 | 7 | 本课程包含下列开源产品/组件(星号代表重要程度): 8 | 9 | \*\*\*\*\* **Spark MLlib** 10 | 11 | \*\*\*\* **ML Pipeline and GraphX** 12 | 13 | \*\*\* **Spark Core and Spark SQL** 14 | 15 | \*\* **Spark Streaming** 16 | 17 | \* **Scikit-learn for reference.** 18 | 19 | ## 参考书 20 | 21 | 1. Advanced Analytics with Spark 22 | 2. Machine Learning with Spark 23 | 3. The Lion Way: Machine Learning plus Intelligent Optimization 24 | 4. Others... 25 | 26 | ## week 1 课程简介及入门基础 27 | 28 | 1. Spark基础知识 29 | 2. 机器学习基础知识 30 | 3. 图计算基础知识 31 | 4. Spark,MLlib,以及GraphX的操作示例 32 | 33 | ## week 2 广义线性模型 34 | 35 | 2. 逻辑回归 36 | 3. 线性回归 37 | 4. SVM 38 | 5. LASSO 39 | 6. 岭回归 40 | 7. 广义线性模型代码及示例(如手写数字识别) 41 | 42 | ## week 3 推荐算法及系统 43 | 44 | 1. ALS算法 45 | 2. 奇异值分解 46 | 3. Mahout与MLlib的对比分析 47 | 4. 推荐系统的搭建示例(依赖PredictionIO) 48 | 49 | ## week 4 聚类算法 50 | 51 | 1. k-means 52 | 2. LDA 53 | 3. 高斯混合模型 54 | 4. Power Iteration聚类 55 | 5. 聚类算法应用示例(如主题建模及地理位置聚类) 56 | 57 | ## week 5 流式机器学习 58 | 59 | 1. Lambda架构 60 | 2. 参数服务器 61 | 3. from Freeman labs提供的流式算法 62 | 4. 应用示例(如斑马鱼实验) 63 | 64 | ## week 6 机器学习流水线 65 | 66 | 1. Scikit-learn的流水线(包括Pandas等对比) 67 | 2. Spark的流水线(如DataFrame以及ML组件) 68 | 3. 特征提取与变换 69 | 4. 应用示例及对比(待定) 70 | 71 | ## week 7 机器学习中的科学计算 72 | 73 | 1. 矩阵计算中的注意事项 74 | 2. 矩阵计算的组件(in C/Fortran and Java) 75 | 3. MLlib中的矩阵计算 76 | 4. MLlib中的统计方法 77 | 5. 科学计算的示例(待定) 78 | 79 | ## week 8 图计算模型 80 | 81 | 1. GraphX进阶 82 | 2. GraphX中的图算法 83 | 3. 再议LDA与ALS算法 84 | 4. 图模型的示例(如网络中的社团聚类) 85 | 86 | ## week 9 决策树与组合学习 87 | 88 | 1. MLlib中的决策树 89 | 2. 随机森林算法 90 | 3. Gradient-Boosted Trees 91 | 3. 实践中的组合学习(如Kaggle) 92 | 4. 组合模型的示例(待定) 93 | 94 | ## week 10 机器学习算法评测 95 | 96 | 1. 评测方法 97 | 2. Cross validation与Grid Search 98 | 2. MLlib中的实现 99 | 3. 在线、离线测评方法 100 | 101 | ## week 11 优化算法并行化 102 | 103 | 1. 常用的优化算法 104 | 2. 优化算法的串行基因 105 | 3. 计算模型:从BSP到BSP+再到SSP 106 | 4. 未来的趋势 107 | 108 | ## week 12 课程拾遗以及框架再思考 109 | 110 | 1. 课程拾遗 111 | 2. 机器学习/数据分析的一般步骤 112 | 3. 实践机器学习的再思考 113 | 4. 多系统对比(Mahout、Oryx、VM以及一些python的包,SparkR,PySpark等) 114 | 5. 总结 115 | -------------------------------------------------------------------------------- /license.md: -------------------------------------------------------------------------------- 1 | All rights reserved. 2 | 3 | 此处代码与文档均为《实践机器学习算法详解及工程实现》(英文名称 Applied Machine Learning and Implementation)课程所用,版权归作者所有。 4 | 5 | 未经授权的转载和传播被禁止,违者将追究法律责任。 6 | -------------------------------------------------------------------------------- /spark-summit-15.md: -------------------------------------------------------------------------------- 1 | # Spark上的数据分析简介 2 | 3 | 总时长 = 2小时 4 | 5 | 话题数目 = 6个 6 | 7 | 大约每个话题 20分钟 8 | 9 | 0. MLlib最新进展简介 10 | 1. 模型表示 --> MLlib的向量模型与矩阵模型 11 | 2. 优化并行 --> 同步方式、优化调度、以及模型存储 12 | 3. 计算模式 --> MLlib与GraphX 13 | 4. 数据承载 --> MLlib与SparkSQL 14 | 5. 
实例分析 --> MLlib与Scikit-learn 15 | -------------------------------------------------------------------------------- /w1/lecture_note.md: -------------------------------------------------------------------------------- 1 | Lecture note 2 | Xusen Yin 3 | April 4, 2015 4 | 5 | # Machine Learning on Spark 6 | 7 | # week1 讲义 8 | 9 | 由于各位学员背景不同,基础知识的情况也不太一致,因此第一周的内容主要分为三个overviews,旨在首先可以统一本课程的一些大致脉络,其次对于已经懂得这些基础知识的同学算作一个回顾,对于还不知道这些内容的同学作为一个学习的起点和指导。需要注意的是,学习是个自我进步和不断回顾的过程,讲师在短短的数小时内所起到的职能是启发和引导,真正学习的过程是在课外所花费的时间上。讲师能力有限,在设计课程的过程中参考了多人的资料,随后会附上致谢列表。 10 | 11 | ## Spark overview 12 | 13 | 本周课程不打算在一个小时的时间内事无巨细的讲解Spark,也不打算向大家介绍Spark ABC。具体的使用,API的操作请查看Spark官方文档。Spark overview的主要目的是让大家能够尽量明晰Spark的自身特点、计算范式、编程模型、以及运行调度,以便以后在面临一些看似奇怪的代码时能分析它这么做背后的原因。 14 | 15 | Spark自身的特点,诚如其名,“轻快灵巧”。轻指的是Spark core的设计精巧,代码量少,同时得益于Scala语言丰富的表达力。快说的是Spark上手快,对于初学者完全可以看作单机Scala的分布式版本,RDD的抽象容易掌握;其次是运行快,亚秒级延迟,使其完全可以胜任交互式操作。灵指的是不同层面的灵活性,实现层基于Scala trait实现不同策略的定制和mixin,原语层简单的算子扩展和数据源扩展,以及多语言绑定(Java,Python)。范式层从多迭代的批量操作到流处理,即席查询和图计算等。巧是指其实现借巧力,站在巨人的肩膀上,避免一些费力不讨好的设计。 16 | 17 | 从计算范式的角度来看,Spark时典型的数据并行,因此具有数据并行的优势和局限。粗粒度数据并行的RDD中每个数据元素要过相同的代码序列,因此其完全不能胜任细粒度异步的数据更新,这点从GraphX的实现可见端倪,从MLlib现有的一些算法实现也可管中窥豹。Spark采用函数式语义,包括RDD的不可修改,UDF的使用,transformation不会产生副作用,而是产生一个新的RDD等。这样导致每个stage内部的计算是幂等的,失败时可以简单通过重放来容错。 18 | 19 | 从编程模型的角度来看,Spark就是简单的“数据加变换”。其计算空间分为Spark空间和Scala空间,前者的计算是分布式的,在各个worker节点上执行,后者的计算是在单节点执行,具体说来实在Driver节点执行。Scala空间到Spark空间需要输入算子,如textFile,parallel等,Spark空间回到Scala空间需要action算子。 20 | 21 | 从运行调度的角度来看,主要分为两大部分。第一部分是从Spark代码到切好的DAG stage,第二个部分是从stage到一个个执行的task。前者发生在Driver和代码之间,由DAG调度起负责,后者发生在各个Driver和各个具体执行的Executor之间,由task调度器负责。 22 | 23 | 对每个RDD而言,action算子触发job的投递。transformation算子中的宽依赖算子触发新的stage切分,而窄依赖算子最终会被pipeline到一起,或者说叫做operator fusion亦可。每两个相互依赖的stages之间通过shuffle传递数据。 24 | 25 | ## Machine Learning overview 26 | 27 | T. G. 
Dietterich在MLSS 2014的讲座中讲了机器学习的三个问题,通俗易懂的介绍了机器学习的一般问题。此处借用这三个问题,向大家阐明机器学习的来由,问题,和基本方法。 28 | 29 | 机器学习的原动力就是来改变软件的流程,由之前“推导”式的方式转变为“归纳总结”。传统的算法流程是领域专家给出解题思路,按照这种解题思路设计算法求解问题,即expert => logic => function(input): output。Machine learning的方法是收集问题领域的输入输出,在某种假设下总结出其背后的逻辑,模拟领域专家,即 => logic => expert。 30 | 31 | 此处以有监督学习为例。有监督学习的简单例子包括手写识别、疾病监测、人脸识别等。其解决问题的一般框架是 32 | 33 | - 由某一未知分布(the expert)随机且独立的采样,作为训练样本: 34 | 35 | - 学习算法(the assumption)分析样本数据并生成分类器; 36 | 37 | - 从相同的数据采样得到新数据,由分类器给出分类结果; 38 | 39 | - 以某种方式评估误差; 40 | 41 | Machine learning的目标就是,找到可以最小化expected loss的函数。 42 | 43 | 以垃圾邮件检测为例,这里未知的分布就是给定的邮件及其标注(“是”垃圾邮件或者“不是”垃圾邮件)其背后隐含的分布。训练样本是用户标注好的一堆邮件。学习算法会在后续的课程中讲到,最简单的方法就是Naive Bayes。输出的分类器,对于Naive Bayes而言,是一个条件概率,即在给定邮件x的条件下,它是垃圾邮件的概率。测试样本为一封新的且有ground truth的邮件,用一个损失函数(loss function)评估误差。最直观的损失函数就是分对损失为0,分错损失为1,叫做0-1 loss。 44 | 45 | 就学习方法而言,machine learning有三种基本方法。一是学习一个分类器y=f(x)(如Perceptron),二是学习一个条件概率分布p(y|x)(如Logistic regression),三是学习一个联合概率分布p(x, y)(如Linear discriminate analysis)。第一周的课里只会讲到第一种,其他的在后续课程中会涉及。 46 | 47 | 以perceptron为例,我们过一下机器学习算法的基本流程。(待补充) 48 | 49 | ## Graph computing overview 50 | 51 | 本周最后一节介绍一下图计算的基础知识。首先界定图计算和机器学习的关系。现实生活中的很多问题都可以用图来概括(依赖关系),像twitter friendship网络,call graph等等。而很多科学问题的结构也是图结构,如PageRank的问题,网页之间存在相互的链接。由此,许多科学问题的解法也是一种图算法。针对PageRank,它本身就是一种通过图上的迭代计算特征向量的过程。不仅如此,像协同过滤、梯度下降、置信传播等等多种常见的machine learning算法都能用图的模型概括出来。后面我们也会看到,图的编程模型极大简化了大规模机器学习算法的实现。 52 | 53 | 图计算最重要两个组成部分就是编程模型和计算引擎,这周的课程中我们只会简要介绍编程模型的内容,计算引擎的部分我们会在专门研讨GraphX的一周内详细分析。 54 | 55 | 我们以PageRank为例来看不同的编程模型对程序的影响。最简单的方法是直接用MapReduce来表达图计算,我们在Map中完成每个节点的计算任务,在Reduce中进行shuffle,为每个节点聚合其邻居节点送过来的消息。从代码上来看,计算部分和消息传递以及图的结构相互耦合在一起。 56 | 57 | BSP(Bulk Synchronous Parallel)首先是各个节点并行完成自己的顶点程序(vertex program),之后每个节点向其邻居节点广播其内容更新,此处会有一个大同步等待所有节点完成信息交换。之后再用最新的消息重新进行顶点程序的计算,并以此类推。BSP模型较好的分离了顶点计算、消息传递、图的结构三部分内容,程序看起来清晰易懂。 58 | 59 | BSP模型的barrier会导致很多问题。原理上来讲最佳的并行策略是全异步的执行。每个节点收集到自己计算足够多的消息之后便可执行,执行结束即可向其邻居节点广播更新。但是这种情况下时序问题难以确定,例如节点A的邻居为B和C,节点A在某时刻收到了节点B的消息(messageB, timeB), 以及节点C的消息(messageC, timeC),那么节点A的计算到底是以B的逻辑时钟为准还是以C的逻辑时钟为准?抑或是要求逻辑时钟慢者追上逻辑时钟快者之后才开始计算?这里还需要很多内容要仔细考量。 60 | 61 | -------------------------------------------------------------------------------- /w2/lecture_note.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 | 5 | # week 2 lecture note 6 | 7 | 第二周主要介绍广义线性模型（Generalized Linear Model）。由于GLM在MLlib中实现比较简单，因此本周会将大部分时间花在GLM的理论推导上。这会与下周的课有比较明显的区别：第三周的课会偏重MLlib中ALS的实现部分。 8 | 9 | MLlib中实现的广义线性模型有5种，分别是 10 | 11 | - Logistic regression 12 | 13 | - Linear regression 14 | 15 | - SVM 16 | 17 | - Lasso 18 | 19 | - Ridge regression 20 | 21 | 再算上我们上周学到的Perceptron，目前我们一共会了解6种GLM。这次课程的目的一是让大家了解GLM的特点，总结出规律，二是了解其在MLlib中的实现方法。更进一步，如果大家能对MLlib中的GLM实现提出自己的看法那就最好了。 22 | 23 | ## Logistic regression 24 | 25 | 首先介绍逻辑回归。跟之前的Perceptron不同，逻辑回归不再是学习一个分类函数，而是学习一个用于分类的条件概率$P(y|x)$，即在给定$x$的条件下，求解$y$取值的概率。 26 | 27 | 更新一下我们的符号，我们令$p_y(x;w)$代表我们条件概率的取值，其中$w$是我们模型的参数。并假设我们的$y$只有两个取值$\{0, 1\}$。给出两个不同取值的概率分别为 28 | 29 | $$p_1(x;w) = \frac{1}{1+exp(-w^Tx)}$$ 30 | 31 | $$p_0(x;w) = \frac{exp(-w^Tx)}{1+exp(-w^Tx)}$$ 32 | 33 | 从定义中容易得出 34 | 35 | $$log\frac{p_0}{p_1} = -w^Tx$$ 36 | 37 | 在某种程度上说明了“线性”的来源。 38 | 39 | 那么为什么会选择这种形式的函数作为我们概率的取值呢？从logistic函数曲线的形状可见端倪。线性函数$-w^Tx$的取值范围是正负无穷，而logistic函数可以把正负无穷的区间映射到$[0, 1]$之间，比较符合概率的取值范围。 40 | 41 | 给定了假设和模型，下面要通过优化损失函数得到参数的最优解。在这里我们使用log loss（即负的log似然）: 42 | 43 | $$L(\hat{P}(y|x), y) = \left\{ 44 | \begin{array}{c} 45 | -log P(y=1|x_i)\qquad if ~y_i = 1, \\ 46 | -logP(y=0|x_i)\qquad if ~ y_i = 0. \\ 47 | \end{array} 48 | \right.$$ 49 | 50 | 为了最小化损失函数，我们要做的是最大化正的log概率，其实等价于最大似然，即 51 | 52 | $$max_w\sum_ilog\hat{P}(y_i|x_i)$$ 53 | 54 | 对于最大似然最直观的理解就是，如果一个样本$x_i$的类别为1，那就让$\hat{P}(y_i=1|x_i)$的概率远大于$\hat{P}(y_i=0|x_i)$的概率，反之亦然。 55 | 56 | 但是上文中的函数是分段函数，不便求梯度，因此我们利用$p_0 = 1 - p_1$将分段函数合并成一个函数（即log似然）： 57 | 58 | $$l(w) = \sum_i y_i~log~p_1(x_i;w) + (1-y_i)~log(1-p_1(x_i;w))$$ 59 | 60 | 大家可以将$y_i=1$和$y_i=0$代入到上面的式子中自行验证其正确性。 61 | 
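为了便于验证上面的推导，这里补充一个极简的numpy示例（仅作示意，并非MLlib中的实现，函数名均为本讲义自拟）：按上式直接计算log似然$l(w)$及其梯度$\nabla l(w)=\sum_i(y_i-p_1(x_i;w))x_i$，并用最朴素的梯度上升求解$w$。

```python
import numpy as np

def p1(X, w):
    # p_1(x; w) = 1 / (1 + exp(-w^T x))，逐样本计算
    return 1.0 / (1.0 + np.exp(-X.dot(w)))

def log_likelihood(X, y, w):
    # l(w) = sum_i y_i*log(p_1) + (1 - y_i)*log(1 - p_1)
    p = p1(X, w)
    return np.sum(y * np.log(p) + (1 - y) * np.log(1 - p))

def gradient(X, y, w):
    # 对 l(w) 求导可得梯度 X^T (y - p)
    return X.T.dot(y - p1(X, w))

def fit(X, y, step=0.1, iters=200):
    # 梯度上升最大化 l(w)，等价于对负log似然做梯度下降
    w = np.zeros(X.shape[1])
    for _ in range(iters):
        w += step * gradient(X, y, w) / len(y)
    return w
```

MLlib中的求解器（SGD、L-BFGS等）优化的也是同一个目标函数，只是在分布式计算和数值细节上做了更多处理。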
-------------------------------------------------------------------------------- /w3/w3demo/demo1-python/HDR/README.md: -------------------------------------------------------------------------------- 1 | # Classification of Handwritten Digits 2 | 3 | ## Summary 4 | 5 | In the past few days, I tried several classification algorithms (and their combinations) to classify handwritten digits. For your convenience, I put the summary here and the details in the following sections. 6 | 7 | There are three kinds of methods that I used: 8 | 9 | - Single algorithms, e.g. SVM and Logistic Regression. 10 | 11 | - Ensemble algorithm, i.e. Random Forest. 12 | 13 | - Pipelined algorithms. 14 | 15 | The results are listed below, and the following sections explain them in detail, including why I chose each method and how I trained it.
16 | 17 | | Method | Precision Avg | Recall Avg | F-measure Avg | Algorithms | 18 | |:----------:|:----------:|:----------:|:----------:|:----------:| 19 | | SVM | 0.99 | 0.99 | 0.99 | Single SVM + 5-fold CV | 20 | | Logistic Regression | 0.96 | 0.96 | 0.96 | Single LR + 5-fold CV | 21 | |Pipelined SVM | 0.99 | 0.99 | 0.99 | PCA + Polynomial Expansions + PCA + FDA + SVM | 22 | |Pipelined Logistic Regression | 0.99 | 0.99 | 0.99 | PCA + Polynomial Expansions + PCA + FDA + LR | 23 | |Random Forest | 0.98 | 0.98 | 0.98 | Single RF + 3-fold CV | 24 | |Pipelined Random Forest | 0.96 | 0.96 | 0.96 | Polynomial Expansion + RF | 25 | |Layered Neural Network I | 0.95 | 0.95 | 0.95 | 2-layer RBM + LR + 3-fold CV | 26 | |Layered Neural Network II | 0.90 | 0.90 | 0.90 | 3-layer RBM + LR + 3-fold CV | 27 | |Layered Neural Network III | 0.83 | 0.83 | 0.83 | 4-layer RBM + LR + 3-fold CV | 28 | |K-Nearest Neighbors | 0.98 | 0.98 | 0.98 | Single KNN + 3-fold CV | 29 | 30 | ## Conclusion 31 | 32 | In the handwritten digit recognition scenario, SVM is the best candidate. Logistic regression with manual polynomial expansions can compete with SVM, and theoretically they are very similar. Random forest gets a result similar to KNN, because from the *classification hyperplane* point of view they are alike: both KNN and random forest can draw an irregular decision surface. I also tried non-linear embeddings (such as Isomap and MDS) before KNN in order to find better *neighbors*, but that did not help in this scenario. The neural network does not meet my expectation, but the result is understandable: a 2-layer RBM plays the *nonlinear transformation* role, so I get a reasonably good result, but for the 3-layer and 4-layer networks I would need to fine-tune the parameters of the RBM layers jointly with the LR layer to get better results, which scikit-learn does not support. 33 | 34 | # Details 35 | 36 | ## Some statistics 37 | 38 | ### The distribution of 0-9 samples 39 | 40 | `awk -F "," '{print $65}' optdigits.tra | sort -n | uniq -c` 41 | 42 | > 376 0 43 | 44 | > 389 1 45 | 46 | > 380 2 47 | 48 | > 389 3 49 | 50 | > 387 4 51 | 52 | > 376 5 53 | 54 | > 377 6 55 | 56 | > 387 7 57 | 58 | > 380 8 59 | 60 | > 382 9 61 | 62 | This looks good: the distribution of the digits 0-9 is close to uniform. 63 | 64 | ## Methodology 65 | 66 | ### How to choose models 67 | 68 | This dataset is quite small, so I will not try **heavy** classifiers such as a **Deep Neural Network**, which could over-fit and perform poorly on the test set. But the traditional **Shallow Neural Network** is a good idea, say, a network with two or three layers of neurons. 69 | 70 | Before NN, I will try **Logistic Regression** and **SVM** first, to see whether I can get a good result. To keep the dimensionality low, I might use some **dimension reduction** method such as **PCA** to filter the dataset. 71 | 72 | If time permits, I will also try some **less common** methods, such as **random forest** and **k nearest neighbors**. 73 | 74 | ### How to do multi-classification 75 | 76 | There are three common ways to solve a multi-class classification problem: 77 | 78 | 1. 1 vs. (k-1) classification, namely, transforming a k-class problem into k binary classification problems (see the sketch after this list). 79 | 80 | 2. A direct k-class classifier, such as **softmax regression** instead of **logistic regression**. 81 | 82 | 3. Error-correcting output codes, which is a less common way to solve a multi-class problem. 
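As a quick illustration of the first strategy, here is a minimal sketch (an assumption for illustration only, not the code behind the reported numbers) that makes the one-vs-rest decomposition explicit with scikit-learn's `OneVsRestClassifier`, using the same `optdigits.tra` file and two-thirds split as the scripts under `src/`. Note that scikit-learn's `LogisticRegression` already handles multi-class targets internally, so the wrapper is only there to show the decomposition.

```python
import numpy
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

digits = numpy.loadtxt(fname="optdigits.tra", delimiter=',')
data, target = digits[:, :-1], digits[:, -1]
n_trains = len(digits) / 3 * 2  # same split as the other scripts (Python 2 integer division)

# Fit k one-vs-rest binary classifiers, one per digit class.
ovr = OneVsRestClassifier(LogisticRegression(penalty='l2', C=0.1))
ovr.fit(data[:n_trains], target[:n_trains])

predicted = ovr.predict(data[n_trains:])
print(metrics.classification_report(target[n_trains:], predicted))
```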
83 | 84 | ### How to do ETL of the dataset 85 | 86 | In order to use k-fold cross validation, I will let open source tool do it. Scikit-learn is a good choice. Moreover, I will try to use manual feature expansions such as polynomial expansions, and dimension reduction methods such as PCA and FDA. 87 | 88 | ### How to do the grid search for hyper-parameters 89 | 90 | Scikit-learn provides `GridSearchCV` methods to do the search. It is stable for single algorithms. But for pipelined methods, the search space is very large, it may cause OOM. 91 | 92 | ### How to choose open source tools 93 | 94 | - Spark/MLlib is the most familiar tool of me, but it is too heavy and no necessary in the scenario; 95 | 96 | - Scikit-learn seems the most suitable tool, I will try to use ETL part and classification part of it; 97 | 98 | - MDP (Modular Data Process) is useful for a DAG style data process `Flow`, but scikit-learn also has the similar kind of API called `Pipeline`. 99 | 100 | - LibSVM and libLinear are much faster than scikit-learn, but for a demo project, I prefer Python, because the scale-out and scale-up capabilities are not my first consideration. 101 | 102 | ## Details 103 | 104 | ### Install scikit-learn 105 | 106 | `sudo apt-get install python-sklearn` 107 | 108 | I try to use scikit-learn, with its SVM and Logistic Regression, and get good results. 109 | 110 | For SVM, I get 111 | 112 | > \>\>\> print(metrics.classification_report(expected, predicted)) 113 | 114 | > precision recall f1-score support 115 | 116 | > 0.0 1.00 1.00 1.00 190 117 | 118 | > 1.0 0.98 0.99 0.99 194 119 | 120 | > 2.0 0.99 1.00 0.99 186 121 | 122 | > 3.0 0.99 0.96 0.97 192 123 | 124 | > 4.0 0.99 0.99 0.99 202 125 | 126 | > 5.0 0.98 0.99 0.99 194 127 | 128 | > 6.0 0.99 0.99 0.99 184 129 | 130 | > 7.0 0.99 0.99 0.99 188 131 | 132 | > 8.0 0.99 0.99 0.99 201 133 | 134 | > 9.0 0.97 0.98 0.98 181 135 | 136 | > avg / total 0.99 0.99 0.99 1912 137 | 138 | For Logistic Regression, I get 139 | 140 | > \>\>\> print(metrics.classification_report(expected, lrpredicted)) 141 | 142 | > precision recall f1-score support 143 | 144 | > 0.0 0.99 1.00 0.99 190 145 | 146 | > 1.0 0.93 0.95 0.94 194 147 | 148 | > 2.0 0.98 0.97 0.98 186 149 | 150 | > 3.0 0.98 0.93 0.96 192 151 | 152 | > 4.0 0.98 0.97 0.97 202 153 | 154 | > 5.0 0.97 0.97 0.97 194 155 | 156 | > 6.0 0.98 0.99 0.98 184 157 | 158 | > 7.0 0.99 0.99 0.99 188 159 | 160 | > 8.0 0.93 0.93 0.93 201 161 | 162 | > 9.0 0.91 0.94 0.93 181 163 | 164 | > avg / total 0.96 0.96 0.96 1912 165 | 166 | ### Install MDP 167 | 168 | `sudo aptitude install python-mdp` 169 | 170 | Let's try something of the **flow**. I love this kind of **pipeline**. Here is the result: 171 | 172 | > precision recall f1-score support 173 | 174 | > 0.0 1.00 0.99 1.00 130 175 | 176 | > 1.0 0.99 0.98 0.98 130 177 | 178 | > 2.0 1.00 0.99 1.00 119 179 | 180 | > 3.0 0.98 1.00 0.99 129 181 | 182 | > 4.0 0.99 0.98 0.99 130 183 | 184 | > 5.0 0.99 1.00 1.00 128 185 | 186 | > 6.0 0.99 1.00 1.00 124 187 | 188 | > 7.0 0.99 0.98 0.99 126 189 | 190 | > 8.0 0.97 0.99 0.98 139 191 | 192 | > 9.0 0.98 0.98 0.98 120 193 | 194 | > avg / total 0.99 0.99 0.99 1275 195 | 196 | We can see that it is even better than the former SVM result. 
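For reference, the plain single-model SVM and Logistic Regression reports quoted in the "Install scikit-learn" section above come from a simple fit-and-predict run without grid search. A minimal sketch of that kind of run is below; the 50/50 holdout is an assumption that matches the support counts in those reports, and the SVM `gamma` value is likewise an assumption here. The grid-searched versions live in `src/train_svm.py` and `src/train_lr.py`.

```python
import numpy
from sklearn import svm, metrics
from sklearn.linear_model import LogisticRegression

digits = numpy.loadtxt(fname="optdigits.tra", delimiter=',')
data, target = digits[:, :-1], digits[:, -1]
n_half = len(digits) / 2  # 50/50 holdout (Python 2 integer division)

for name, clf in [("SVM", svm.SVC(gamma=0.001)),
                  ("Logistic Regression", LogisticRegression())]:
    clf.fit(data[:n_half], target[:n_half])      # train on the first half
    predicted = clf.predict(data[n_half:])       # evaluate on the second half
    print(name)
    print(metrics.classification_report(target[n_half:], predicted))
```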
197 | 198 | To testify my assumption, I substitude `SVCScikitLearnNode` with `LogisticRegressionScikitLearnNode`, and get similar result: 199 | 200 | > precision recall f1-score support 201 | 202 | > 0.0 1.00 0.99 1.00 130 203 | 204 | > 1.0 0.98 0.98 0.98 130 205 | 206 | > 2.0 1.00 1.00 1.00 119 207 | 208 | > 3.0 0.98 0.99 0.99 129 209 | 210 | > 4.0 0.99 0.98 0.99 130 211 | 212 | > 5.0 0.98 1.00 0.99 128 213 | 214 | > 6.0 0.99 1.00 1.00 124 215 | 216 | > 7.0 0.99 0.98 0.99 126 217 | 218 | > 8.0 0.99 0.98 0.98 139 219 | 220 | > 9.0 0.99 0.98 0.99 120 221 | 222 | > avg / total 0.99 0.99 0.99 1275 223 | 224 | So, in the handwritten digits recognition scenario, logistic regression with some feature expansion and transformation can compete SVM. SVM uses **kernel trick** to substitute the manual feature expansion and transformation. 225 | 226 | ### Logistic Regression 227 | 228 | Add k-fold cross validation and grid search in Logistic Regression. Result: 229 | 230 | > Best score: 0.964 231 | 232 | > Best parameters set: 233 | 234 | > {'C': 0.1, 'intercept_scaling': 1, 'fit_intercept': True, 'penalty': 'l2', 'random_state': None, 'dual': False, 'tol': 0.0001, 'class_weight': None} 235 | 236 | > precision recall f1-score support 237 | 238 | > 0.0 1.00 1.00 1.00 130 239 | 240 | > 1.0 0.93 0.95 0.94 130 241 | 242 | > 2.0 0.99 0.93 0.96 119 243 | 244 | > 3.0 0.96 0.97 0.97 129 245 | 246 | > 4.0 0.98 0.95 0.96 130 247 | 248 | > 5.0 0.97 0.99 0.98 128 249 | 250 | > 6.0 0.99 0.99 0.99 124 251 | 252 | > 7.0 0.98 0.98 0.98 126 253 | 254 | > 8.0 0.91 0.92 0.91 139 255 | 256 | > 9.0 0.94 0.94 0.94 120 257 | 258 | > avg / total 0.96 0.96 0.96 1275 259 | 260 | It looks better than the previous single Logistic Regression result. 261 | 262 | ### SVM 263 | 264 | Let's add k-fold cross validation and grid search in SVM. Here is the result: 265 | 266 | > Best score: 0.990 267 | 268 | > Best parameters set: 269 | 270 | > {'kernel': 'rbf', 'C': 10, 'verbose': False, 'probability': False, 'degree': 3, 'shrinking': True, 'max_iter': -1, 'random_state': None, 'tol': 0.001, 'cache_size': 200, 'coef0': 0.0, 'gamma': 0.001, 'class_weight': None} 271 | 272 | > precision recall f1-score support 273 | 274 | > 0.0 1.00 1.00 1.00 130 275 | 276 | > 1.0 0.98 0.98 0.98 130 277 | 278 | > 2.0 1.00 1.00 1.00 119 279 | 280 | > 3.0 0.99 0.98 0.99 129 281 | 282 | > 4.0 0.98 0.99 0.99 130 283 | 284 | > 5.0 0.99 1.00 1.00 128 285 | 286 | > 6.0 0.99 0.99 0.99 124 287 | 288 | > 7.0 0.99 0.98 0.99 126 289 | 290 | > 8.0 0.99 0.99 0.99 139 291 | 292 | > 9.0 0.98 0.99 0.99 120 293 | 294 | > avg / total 0.99 0.99 0.99 1275 295 | 296 | ### Neural Network 297 | 298 | With the help of RBM and `GridSearchCV`, I can get the following result on LR. (To use `BernoulliRBM`, we should transform the features into [0,1].) 
299 | 300 | > Best score: 0.955 301 | 302 | > Best parameters set: 303 | 304 | > {'rbm1__batch_size': 10, 'lr__dual': False, 'rbm1__verbose': False, 'rbm1__n_iter': 10, 'rbm1': BernoulliRBM(batch_size=10, learning_rate=0.1, n_components=36, n_iter=10, 305 | 306 | > random_state=None, verbose=False), 'rbm1__n_components': 36, 'lr__tol': 0.0001, 'lr__class_weight': None, 'lr': LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True, 307 | 308 | > intercept_scaling=1, penalty='l1', random_state=None, tol=0.0001), 'rbm1__learning_rate': 0.1, 'rbm1__random_state': None, 'lr__fit_intercept': True, 'lr__penalty': 'l1', 'lr__random_state': None, 'lr__intercept_scaling': 1, 'lr__C': 100} 309 | 310 | > precision recall f1-score support 311 | 312 | > 0.0 0.98 1.00 0.99 130 313 | 314 | > 1.0 0.98 0.96 0.97 130 315 | 316 | > 2.0 0.99 0.95 0.97 119 317 | 318 | > 3.0 0.91 0.94 0.92 129 319 | 320 | > 4.0 0.99 0.97 0.98 130 321 | 322 | > 5.0 0.93 0.96 0.95 128 323 | 324 | > 6.0 0.98 0.97 0.97 124 325 | 326 | > 7.0 0.92 0.95 0.93 126 327 | 328 | > 8.0 0.90 0.93 0.91 139 329 | 330 | > 9.0 0.90 0.83 0.87 120 331 | 332 | > avg / total 0.95 0.95 0.95 1275 333 | 334 | It is not very exciting, but it is a good solution. I also try to use three-layer Neural Network, but the result is not very well. 335 | 336 | ### Random Forest 337 | 338 | It seems that the '9' is always hard to tell than '0'. How about random forest? 339 | 340 | From random forest, I can get my best score here: 341 | 342 | > Best score: 0.972 343 | 344 | > Best parameters set: 345 | 346 | > {'rf__bootstrap': True, 'rf__max_depth': None, 'rf__n_estimators': 90, 'rf__verbose': 0, 'rf__criterion': 'gini', 'rf__min_density': None, 'rf__min_samples_split': 2, 'rf__compute_importances': None, 'rf': RandomForestClassifier(bootstrap=True, compute_importances=None, 347 | 348 | > criterion='gini', max_depth=None, max_features='auto', 349 | 350 | > min_density=None, min_samples_leaf=1, min_samples_split=2, 351 | 352 | > n_estimators=90, n_jobs=1, oob_score=False, random_state=None, 353 | 354 | > verbose=0), 'rf__max_features': 'auto', 'rf__n_jobs': 1, 'rf__random_state': None, 'rf__oob_score': False, 'rf__min_samples_leaf': 1} 355 | 356 | > precision recall f1-score support 357 | 358 | > 0.0 0.99 0.99 0.99 130 359 | 360 | > 1.0 0.98 0.98 0.98 130 361 | 362 | > 2.0 1.00 0.98 0.99 119 363 | 364 | > 3.0 0.95 0.97 0.96 129 365 | 366 | > 4.0 0.98 0.99 0.99 130 367 | 368 | > 5.0 0.98 0.99 0.98 128 369 | 370 | > 6.0 0.98 0.99 0.99 124 371 | 372 | > 7.0 0.98 0.98 0.98 126 373 | 374 | > 8.0 0.99 0.97 0.98 139 375 | 376 | > 9.0 0.96 0.93 0.94 120 377 | 378 | > avg / total 0.98 0.98 0.98 1275 379 | 380 | 381 | ### K Nearest Neighbors 382 | 383 | I try to use a non-linear embedding method before KNN, but barely no promotion. So I only provide the single KNN result here. 
384 | 385 | > Best score: 0.982 386 | 387 | > Best parameters set: 388 | 389 | > {'knn': KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski', 390 | 391 | > n_neighbors=5, p=2, weights='distance'), 'knn__p': 2, 'knn__metric': 'minkowski', 'knn__weights': 'distance', 'knn__leaf_size': 30, 'knn__algorithm': 'auto', 'knn__n_neighbors': 5} 392 | 393 | > precision recall f1-score support 394 | 395 | > 0.0 1.00 1.00 1.00 130 396 | 397 | > 1.0 0.97 0.98 0.98 130 398 | 399 | > 2.0 1.00 0.99 1.00 119 400 | 401 | > 3.0 0.98 0.97 0.97 129 402 | 403 | > 4.0 0.99 0.99 0.99 130 404 | 405 | > 5.0 0.98 0.99 0.98 128 406 | 407 | > 6.0 0.99 1.00 1.00 124 408 | 409 | > 7.0 0.96 0.98 0.97 126 410 | 411 | > 8.0 0.99 0.96 0.98 139 412 | 413 | > 9.0 0.95 0.94 0.95 120 414 | 415 | > avg / total 0.98 0.98 0.98 1275 416 | 417 | ## References 418 | 419 | 1. [Comparing Classification Algorithms for Handwritten Digits](http://blog.quantitations.com/machine%20learning/2013/02/27/comparing-classification-algorithms-for-handwritten-digits/) 420 | 421 | 2. [Example: Handwritten Digit Classification](http://pythonhosted.org/bob.learn.boosting/example.html) 422 | 423 | 3. [Classification of handwritten digits using a SVM](http://nbviewer.ipython.org/url/www.hdm-stuttgart.de/~maucher/ipnotebooks/MachineLearning/svmDigitRecognition.ipynb) 424 | 425 | 4. [Using neural nets to recognize handwritten digits](http://neuralnetworksanddeeplearning.com/chap1.html) 426 | 427 | 5. [Recognizing hand-written digits](http://scikit-learn.org/stable/auto_examples/plot_digits_classification.html) 428 | 429 | 6. [The MNIST Database of Handwritten Digits](http://yann.lecun.com/exdb/mnist/) 430 | 431 | 7. [Modular Toolkit for Data Processing](http://mdp-toolkit.sourceforge.net/documentation.html) 432 | 433 | 8. [Scikit-learn document](http://scikit-learn.org/stable/) 434 | 435 | 9. 
[Handwritten digits classification with MDP and scikits.learn](http://mdp-toolkit.sourceforge.net/examples/scikits_learn/digit_classification.html) 436 | -------------------------------------------------------------------------------- /w3/w3demo/demo1-python/HDR/papers/00b495249dfb217637000000.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinxusen/AMLI/72619ed24344ad3798d8ef60f2b7352e5b9bcd05/w3/w3demo/demo1-python/HDR/papers/00b495249dfb217637000000.pdf -------------------------------------------------------------------------------- /w3/w3demo/demo1-python/HDR/papers/10.1.1.110.4774.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinxusen/AMLI/72619ed24344ad3798d8ef60f2b7352e5b9bcd05/w3/w3demo/demo1-python/HDR/papers/10.1.1.110.4774.pdf -------------------------------------------------------------------------------- /w3/w3demo/demo1-python/HDR/papers/1412.8307v1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinxusen/AMLI/72619ed24344ad3798d8ef60f2b7352e5b9bcd05/w3/w3demo/demo1-python/HDR/papers/1412.8307v1.pdf -------------------------------------------------------------------------------- /w3/w3demo/demo1-python/HDR/papers/Lecture21.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinxusen/AMLI/72619ed24344ad3798d8ef60f2b7352e5b9bcd05/w3/w3demo/demo1-python/HDR/papers/Lecture21.pdf -------------------------------------------------------------------------------- /w3/w3demo/demo1-python/HDR/papers/Margulis-OpticalCharacterRecognition.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinxusen/AMLI/72619ed24344ad3798d8ef60f2b7352e5b9bcd05/w3/w3demo/demo1-python/HDR/papers/Margulis-OpticalCharacterRecognition.pdf -------------------------------------------------------------------------------- /w3/w3demo/demo1-python/HDR/papers/mdp_paper.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinxusen/AMLI/72619ed24344ad3798d8ef60f2b7352e5b9bcd05/w3/w3demo/demo1-python/HDR/papers/mdp_paper.pdf -------------------------------------------------------------------------------- /w3/w3demo/demo1-python/HDR/src/train_flow.py: -------------------------------------------------------------------------------- 1 | import mdp 2 | import numpy 3 | from sklearn import metrics 4 | 5 | digits = numpy.loadtxt(fname="optdigits.tra", delimiter=',') 6 | n_samples = len(digits) 7 | 8 | data = digits[:,:-1] 9 | target = digits[:,-1] 10 | 11 | n_trains = n_samples / 3 * 2 12 | 13 | train_data = [data[:n_trains, :]] 14 | train_data_with_labels = [(data[:n_trains, :], target[:n_trains])] 15 | 16 | test_data = data[n_trains:, :] 17 | test_labels = target[n_trains:] 18 | 19 | flow = mdp.Flow([mdp.nodes.PCANode(output_dim=25, dtype='f'), 20 | mdp.nodes.PolynomialExpansionNode(3), 21 | mdp.nodes.PCANode(output_dim=0.99), 22 | mdp.nodes.FDANode(output_dim=9), 23 | mdp.nodes.LogisticRegressionScikitsLearnNode()], verbose=True) 24 | 25 | flow.train([train_data, None, train_data, train_data_with_labels, train_data_with_labels]) 26 | 27 | flow[-1].execute = flow[-1].label 28 | 29 | prediction = flow(test_data) 30 | 31 | print metrics.classification_report(test_labels, prediction) 32 | 
-------------------------------------------------------------------------------- /w3/w3demo/demo1-python/HDR/src/train_flow_sklearn.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | from sklearn import datasets, svm, metrics 3 | from sklearn.grid_search import GridSearchCV 4 | from sklearn.linear_model import LogisticRegression 5 | from sklearn.pipeline import Pipeline 6 | from sklearn.decomposition import PCA 7 | from sklearn.preprocessing import PolynomialFeatures 8 | from sklearn.lda import LDA 9 | 10 | digits = numpy.loadtxt(fname="optdigits.tra", delimiter=',') 11 | n_samples = len(digits) 12 | 13 | data = digits[:,:-1] 14 | target = digits[:,-1] 15 | 16 | param_grid = { 17 | 'pca1__n_components': [16], 18 | 'poly__degree': [2], 19 | 'pca2__n_components': [0.8], 20 | 'lda__n_components': [9], 21 | 'lr__penalty': ['l2'], 22 | 'lr__C': [0.1, 1] 23 | } 24 | 25 | steps = [('pca1', PCA()), 26 | ('poly', PolynomialFeatures()), 27 | ('pca2', PCA()), 28 | ('lda', LDA()), 29 | ('lr', LogisticRegression())] 30 | 31 | pipeline = Pipeline(steps) 32 | 33 | grid_search = GridSearchCV(pipeline, param_grid, n_jobs = -1, verbose = 1, cv = 2) 34 | 35 | n_trains = n_samples / 3 * 2 36 | 37 | # We learn the digits on the first half of the digits 38 | grid_search.fit(data[:n_trains], target[:n_trains]) 39 | 40 | print("Best score: %0.3f" % grid_search.best_score_) 41 | print("Best parameters set:") 42 | best_parameters = grid_search.best_estimator_.get_params() 43 | print best_parameters 44 | 45 | # Now predict the value of the digit on the second half: 46 | expected = target[n_trains:] 47 | predicted = grid_search.best_estimator_.predict(data[n_trains:]) 48 | 49 | print(metrics.classification_report(expected, predicted)) 50 | -------------------------------------------------------------------------------- /w3/w3demo/demo1-python/HDR/src/train_knn.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | from sklearn import metrics 3 | from sklearn.grid_search import GridSearchCV 4 | from sklearn.pipeline import Pipeline 5 | from sklearn.neighbors import KNeighborsClassifier 6 | 7 | digits = numpy.loadtxt(fname="optdigits.tra", delimiter=',') 8 | n_samples = len(digits) 9 | 10 | data = digits[:,:-1] 11 | target = digits[:,-1] 12 | 13 | param_grid = { 14 | 'knn__weights': ['uniform', 'distance'] 15 | } 16 | 17 | steps = [ 18 | ('knn', KNeighborsClassifier()) 19 | ] 20 | 21 | pipeline = Pipeline(steps) 22 | 23 | grid_search = GridSearchCV(pipeline, param_grid, n_jobs = -1, verbose = 1, cv = 3) 24 | 25 | n_trains = n_samples / 3 * 2 26 | 27 | # We learn the digits on the first half of the digits 28 | grid_search.fit(data[:n_trains], target[:n_trains]) 29 | 30 | print("Best score: %0.3f" % grid_search.best_score_) 31 | print("Best parameters set:") 32 | best_parameters = grid_search.best_estimator_.get_params() 33 | print best_parameters 34 | 35 | # Now predict the value of the digit on the second half: 36 | expected = target[n_trains:] 37 | predicted = grid_search.best_estimator_.predict(data[n_trains:]) 38 | 39 | print(metrics.classification_report(expected, predicted)) 40 | -------------------------------------------------------------------------------- /w3/w3demo/demo1-python/HDR/src/train_lr.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | from sklearn import datasets, svm, metrics 3 | from sklearn.grid_search import GridSearchCV 4 | from 
sklearn.linear_model import LogisticRegression 5 | 6 | digits = numpy.loadtxt(fname="optdigits.tra", delimiter=',') 7 | n_samples = len(digits) 8 | 9 | data = digits[:,:-1] 10 | target = digits[:,-1] 11 | 12 | param_grid = [ 13 | {'penalty': ['l1', 'l2'], 'C': [0.1, 1.0, 10.0, 100.0]} 14 | ] 15 | 16 | # Create a classifier: a support vector classifier 17 | classifier = LogisticRegression() 18 | 19 | grid_search = GridSearchCV(classifier, param_grid, n_jobs = -1, verbose = 1, cv = 5) 20 | 21 | n_trains = n_samples / 3 * 2 22 | 23 | # We learn the digits on the first half of the digits 24 | grid_search.fit(data[:n_trains], target[:n_trains]) 25 | 26 | print("Best score: %0.3f" % grid_search.best_score_) 27 | print("Best parameters set:") 28 | best_parameters = grid_search.best_estimator_.get_params() 29 | print best_parameters 30 | 31 | # Now predict the value of the digit on the second half: 32 | expected = target[n_trains:] 33 | predicted = grid_search.best_estimator_.predict(data[n_trains:]) 34 | 35 | print(metrics.classification_report(expected, predicted)) 36 | -------------------------------------------------------------------------------- /w3/w3demo/demo1-python/HDR/src/train_rbm.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | from sklearn import metrics 3 | from sklearn.grid_search import GridSearchCV 4 | from sklearn.linear_model import LogisticRegression 5 | from sklearn.pipeline import Pipeline 6 | from sklearn.neural_network import BernoulliRBM 7 | 8 | digits = numpy.loadtxt(fname="optdigits.tra", delimiter=',') 9 | n_samples = len(digits) 10 | 11 | data = digits[:,:-1] / 16.0 12 | target = digits[:,-1] 13 | 14 | param_grid = { 15 | 'rbm1__n_components': [36, 25, 16], 16 | 'rbm2__n_components': [16], 17 | 'rbm3__n_components': [9], 18 | 'lr__penalty': ['l2', 'l1'], 19 | 'lr__C': [1, 10, 100] 20 | } 21 | 22 | steps = [ 23 | ('rbm1', BernoulliRBM()), 24 | ('rbm2', BernoulliRBM()), 25 | ('rbm3', BernoulliRBM()), 26 | ('lr', LogisticRegression()) 27 | ] 28 | 29 | pipeline = Pipeline(steps) 30 | 31 | grid_search = GridSearchCV(pipeline, param_grid, n_jobs = -1, verbose = 1, cv = 3) 32 | 33 | n_trains = n_samples / 3 * 2 34 | 35 | # We learn the digits on the first half of the digits 36 | grid_search.fit(data[:n_trains], target[:n_trains]) 37 | 38 | print("Best score: %0.3f" % grid_search.best_score_) 39 | print("Best parameters set:") 40 | best_parameters = grid_search.best_estimator_.get_params() 41 | print best_parameters 42 | 43 | # Now predict the value of the digit on the second half: 44 | expected = target[n_trains:] 45 | predicted = grid_search.best_estimator_.predict(data[n_trains:]) 46 | 47 | print(metrics.classification_report(expected, predicted)) 48 | -------------------------------------------------------------------------------- /w3/w3demo/demo1-python/HDR/src/train_rf.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | from sklearn import metrics 3 | from sklearn.grid_search import GridSearchCV 4 | from sklearn.pipeline import Pipeline 5 | from sklearn.ensemble import RandomForestClassifier 6 | 7 | digits = numpy.loadtxt(fname="optdigits.tra", delimiter=',') 8 | n_samples = len(digits) 9 | 10 | data = digits[:,:-1] 11 | target = digits[:,-1] 12 | 13 | param_grid = { 14 | 'rf__n_estimators': [40, 50, 60, 70, 80, 90] 15 | } 16 | 17 | steps = [ 18 | ('rf', RandomForestClassifier()) 19 | ] 20 | 21 | pipeline = Pipeline(steps) 22 | 23 | grid_search = 
GridSearchCV(pipeline, param_grid, n_jobs = -1, verbose = 1, cv = 3) 24 | 25 | n_trains = n_samples / 3 * 2 26 | 27 | # We learn the digits on the first half of the digits 28 | grid_search.fit(data[:n_trains], target[:n_trains]) 29 | 30 | print("Best score: %0.3f" % grid_search.best_score_) 31 | print("Best parameters set:") 32 | best_parameters = grid_search.best_estimator_.get_params() 33 | print best_parameters 34 | 35 | # Now predict the value of the digit on the second half: 36 | expected = target[n_trains:] 37 | predicted = grid_search.best_estimator_.predict(data[n_trains:]) 38 | 39 | print(metrics.classification_report(expected, predicted)) 40 | -------------------------------------------------------------------------------- /w3/w3demo/demo1-python/HDR/src/train_svm.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | from sklearn import datasets, svm, metrics 3 | from sklearn.grid_search import GridSearchCV 4 | 5 | digits = numpy.loadtxt(fname="optdigits.tra", delimiter=',') 6 | n_samples = len(digits) 7 | 8 | data = digits[:,:-1] 9 | target = digits[:,-1] 10 | 11 | param_grid = [ 12 | {'C': [1, 10, 100, 1000], 'kernel': ['linear']}, 13 | {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']}, 14 | ] 15 | 16 | # Create a classifier: a support vector classifier 17 | classifier = svm.SVC() 18 | 19 | grid_search = GridSearchCV(classifier, param_grid, n_jobs = -1, verbose = 1, cv = 5) 20 | 21 | n_trains = n_samples / 3 * 2 22 | 23 | # We learn the digits on the first half of the digits 24 | grid_search.fit(data[:n_trains], target[:n_trains]) 25 | 26 | print("Best score: %0.3f" % grid_search.best_score_) 27 | print("Best parameters set:") 28 | best_parameters = grid_search.best_estimator_.get_params() 29 | print best_parameters 30 | 31 | # Now predict the value of the digit on the second half: 32 | expected = target[n_trains:] 33 | predicted = grid_search.best_estimator_.predict(data[n_trains:]) 34 | 35 | print(metrics.classification_report(expected, predicted)) 36 | -------------------------------------------------------------------------------- /w3/w3demo/demo1/readme.txt: -------------------------------------------------------------------------------- 1 | 大家可以到spark下载页面获取第一次的demo:https://spark.apache.org/downloads.html 2 | 3 | 第一次demo直接用的spark源码。 4 | -------------------------------------------------------------------------------- /w3/w3demo/demo2/custom-serving/.gitignore: -------------------------------------------------------------------------------- 1 | data/sample_movielens_data.txt 2 | manifest.json 3 | target/ 4 | /pio.sbt 5 | pio.log 6 | -------------------------------------------------------------------------------- /w3/w3demo/demo2/custom-serving/build.sbt: -------------------------------------------------------------------------------- 1 | import AssemblyKeys._ 2 | 3 | assemblySettings 4 | 5 | name := "template-scala-parallel-recommendation" 6 | 7 | organization := "io.prediction" 8 | 9 | libraryDependencies ++= Seq( 10 | "io.prediction" %% "core" % "0.9.2" % "provided", 11 | "org.apache.spark" %% "spark-core" % "1.2.0" % "provided", 12 | "org.apache.spark" %% "spark-mllib" % "1.2.0" % "provided") 13 | -------------------------------------------------------------------------------- /w3/w3demo/demo2/custom-serving/data/sample_disabled_items.txt: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yinxusen/AMLI/72619ed24344ad3798d8ef60f2b7352e5b9bcd05/w3/w3demo/demo2/custom-serving/data/sample_disabled_items.txt -------------------------------------------------------------------------------- /w3/w3demo/demo2/custom-serving/engine.json: -------------------------------------------------------------------------------- 1 | { 2 | "id": "default", 3 | "description": "Default settings", 4 | "engineFactory": "org.template.recommendation.RecommendationEngine", 5 | "datasource": { 6 | "params": { 7 | "trainSet": "/Users/panda/data/demo/trainset" 8 | } 9 | }, 10 | "algorithms": [ 11 | { 12 | "name": "als", 13 | "params": { 14 | "rank": 10, 15 | "numIterations": 20, 16 | "lambda": 0.01, 17 | "seed": 3 18 | } 19 | } 20 | ], 21 | "serving": { 22 | "params": { 23 | "filepath": "./data/sample_disabled_items.txt" 24 | } 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /w3/w3demo/demo2/custom-serving/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.11.2") 2 | -------------------------------------------------------------------------------- /w3/w3demo/demo2/custom-serving/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.github.mpeltonen" % "sbt-idea" % "1.6.0") 2 | -------------------------------------------------------------------------------- /w3/w3demo/demo2/custom-serving/src/main/scala/ALSAlgorithm.scala: -------------------------------------------------------------------------------- 1 | package org.template.recommendation 2 | 3 | import io.prediction.controller.PAlgorithm 4 | import io.prediction.controller.Params 5 | import io.prediction.data.storage.BiMap 6 | 7 | import org.apache.spark.SparkContext 8 | import org.apache.spark.SparkContext._ 9 | import org.apache.spark.rdd.RDD 10 | import org.apache.spark.mllib.recommendation.ALS 11 | import org.apache.spark.mllib.recommendation.{Rating => MLlibRating} 12 | import org.apache.spark.mllib.recommendation.ALSModel 13 | 14 | import grizzled.slf4j.Logger 15 | 16 | case class ALSAlgorithmParams( 17 | rank: Int, 18 | numIterations: Int, 19 | lambda: Double, 20 | seed: Option[Long]) extends Params 21 | 22 | class ALSAlgorithm(val ap: ALSAlgorithmParams) 23 | extends PAlgorithm[PreparedData, ALSModel, Query, PredictedResult] { 24 | 25 | @transient lazy val logger = Logger[this.type] 26 | 27 | def train(sc: SparkContext, data: PreparedData): ALSModel = { 28 | // MLLib ALS cannot handle empty training data. 29 | require(!data.ratings.take(1).isEmpty, 30 | s"RDD[Rating] in PreparedData cannot be empty." + 31 | " Please check if DataSource generates TrainingData" + 32 | " and Preprator generates PreparedData correctly.") 33 | // Convert user and item String IDs to Int index for MLlib 34 | val userStringIntMap = BiMap.stringInt(data.ratings.map(_.user)) 35 | val itemStringIntMap = BiMap.stringInt(data.ratings.map(_.item)) 36 | val mllibRatings = data.ratings.map( r => 37 | // MLlibRating requires integer index for user and item 38 | MLlibRating(userStringIntMap(r.user), itemStringIntMap(r.item), r.rating) 39 | ) 40 | 41 | // seed for MLlib ALS 42 | val seed = ap.seed.getOrElse(System.nanoTime) 43 | 44 | // If you only have one type of implicit event (Eg. "view" event only), 45 | // replace ALS.train(...) 
with 46 | //val m = ALS.trainImplicit( 47 | //ratings = mllibRatings, 48 | //rank = ap.rank, 49 | //iterations = ap.numIterations, 50 | //lambda = ap.lambda, 51 | //blocks = -1, 52 | //alpha = 1.0, 53 | //seed = seed) 54 | 55 | val m = ALS.train( 56 | ratings = mllibRatings, 57 | rank = ap.rank, 58 | iterations = ap.numIterations, 59 | lambda = ap.lambda, 60 | blocks = -1, 61 | seed = seed) 62 | 63 | new ALSModel( 64 | rank = m.rank, 65 | userFeatures = m.userFeatures, 66 | productFeatures = m.productFeatures, 67 | userStringIntMap = userStringIntMap, 68 | itemStringIntMap = itemStringIntMap) 69 | } 70 | 71 | def predict(model: ALSModel, query: Query): PredictedResult = { 72 | // Convert String ID to Int index for Mllib 73 | model.userStringIntMap.get(query.user).map { userInt => 74 | // create inverse view of itemStringIntMap 75 | val itemIntStringMap = model.itemStringIntMap.inverse 76 | // recommendProducts() returns Array[MLlibRating], which uses item Int 77 | // index. Convert it to String ID for returning PredictedResult 78 | val itemScores = model.recommendProducts(userInt, query.num) 79 | .map (r => ItemScore(itemIntStringMap(r.product), r.rating)) 80 | new PredictedResult(itemScores) 81 | }.getOrElse{ 82 | logger.info(s"No prediction for unknown user ${query.user}.") 83 | new PredictedResult(Array.empty) 84 | } 85 | } 86 | 87 | } 88 | -------------------------------------------------------------------------------- /w3/w3demo/demo2/custom-serving/src/main/scala/ALSModel.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.mllib.recommendation 2 | // This must be the same package as Spark's MatrixFactorizationModel because 3 | // MatrixFactorizationModel's constructor is private and we are using 4 | // its constructor in order to save and load the model 5 | 6 | import org.template.recommendation.ALSAlgorithmParams 7 | 8 | import io.prediction.controller.IPersistentModel 9 | import io.prediction.controller.IPersistentModelLoader 10 | import io.prediction.data.storage.BiMap 11 | 12 | import org.apache.spark.SparkContext 13 | import org.apache.spark.SparkContext._ 14 | import org.apache.spark.rdd.RDD 15 | 16 | class ALSModel( 17 | override val rank: Int, 18 | override val userFeatures: RDD[(Int, Array[Double])], 19 | override val productFeatures: RDD[(Int, Array[Double])], 20 | val userStringIntMap: BiMap[String, Int], 21 | val itemStringIntMap: BiMap[String, Int]) 22 | extends MatrixFactorizationModel(rank, userFeatures, productFeatures) 23 | with IPersistentModel[ALSAlgorithmParams] { 24 | 25 | def save(id: String, params: ALSAlgorithmParams, 26 | sc: SparkContext): Boolean = { 27 | 28 | sc.parallelize(Seq(rank)).saveAsObjectFile(s"/tmp/${id}/rank") 29 | userFeatures.saveAsObjectFile(s"/tmp/${id}/userFeatures") 30 | productFeatures.saveAsObjectFile(s"/tmp/${id}/productFeatures") 31 | sc.parallelize(Seq(userStringIntMap)) 32 | .saveAsObjectFile(s"/tmp/${id}/userStringIntMap") 33 | sc.parallelize(Seq(itemStringIntMap)) 34 | .saveAsObjectFile(s"/tmp/${id}/itemStringIntMap") 35 | true 36 | } 37 | 38 | override def toString = { 39 | s"userFeatures: [${userFeatures.count()}]" + 40 | s"(${userFeatures.take(2).toList}...)" + 41 | s" productFeatures: [${productFeatures.count()}]" + 42 | s"(${productFeatures.take(2).toList}...)" + 43 | s" userStringIntMap: [${userStringIntMap.size}]" + 44 | s"(${userStringIntMap.take(2)}...)" + 45 | s" itemStringIntMap: [${itemStringIntMap.size}]" + 46 | s"(${itemStringIntMap.take(2)}...)" 47 | } 
48 | } 49 | 50 | object ALSModel 51 | extends IPersistentModelLoader[ALSAlgorithmParams, ALSModel] { 52 | def apply(id: String, params: ALSAlgorithmParams, 53 | sc: Option[SparkContext]) = { 54 | new ALSModel( 55 | rank = sc.get.objectFile[Int](s"/tmp/${id}/rank").first, 56 | userFeatures = sc.get.objectFile(s"/tmp/${id}/userFeatures"), 57 | productFeatures = sc.get.objectFile(s"/tmp/${id}/productFeatures"), 58 | userStringIntMap = sc.get 59 | .objectFile[BiMap[String, Int]](s"/tmp/${id}/userStringIntMap").first, 60 | itemStringIntMap = sc.get 61 | .objectFile[BiMap[String, Int]](s"/tmp/${id}/itemStringIntMap").first) 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /w3/w3demo/demo2/custom-serving/src/main/scala/DataSource.scala: -------------------------------------------------------------------------------- 1 | package org.template.recommendation 2 | 3 | import io.prediction.controller.PDataSource 4 | import io.prediction.controller.EmptyEvaluationInfo 5 | import io.prediction.controller.EmptyActualResult 6 | import io.prediction.controller.Params 7 | import io.prediction.data.storage.Event 8 | import io.prediction.data.storage.Storage 9 | 10 | import org.apache.spark.SparkContext 11 | import org.apache.spark.SparkContext._ 12 | import org.apache.spark.rdd.RDD 13 | 14 | import grizzled.slf4j.Logger 15 | 16 | case class DataSourceParams(trainSet: String) extends Params 17 | case class TrainData(mid: String, uid: String, score: Double, date: String) 18 | 19 | class DataSource(val dsp: DataSourceParams) 20 | extends PDataSource[TrainingData, 21 | EmptyEvaluationInfo, Query, EmptyActualResult] { 22 | 23 | @transient lazy val logger = Logger[this.type] 24 | 25 | override 26 | def readTraining(sc: SparkContext): TrainingData = { 27 | val trainSet = sc.textFile(dsp.trainSet).map(_.split(",")).map { 28 | m => TrainData(m(0), m(1), m(2).toDouble, m(3)) 29 | } 30 | val ratingsRDD = trainSet.map(td => Rating(td.uid, td.mid, td.score)) 31 | 32 | new TrainingData(ratingsRDD) 33 | } 34 | } 35 | 36 | case class Rating( 37 | user: String, 38 | item: String, 39 | rating: Double 40 | ) 41 | 42 | class TrainingData( 43 | val ratings: RDD[Rating] 44 | ) extends Serializable { 45 | override def toString = { 46 | s"ratings: [${ratings.count()}] (${ratings.take(2).toList}...)" 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /w3/w3demo/demo2/custom-serving/src/main/scala/Engine.scala: -------------------------------------------------------------------------------- 1 | package org.template.recommendation 2 | 3 | import io.prediction.controller.IEngineFactory 4 | import io.prediction.controller.Engine 5 | 6 | case class Query( 7 | user: String, 8 | num: Int 9 | ) extends Serializable 10 | 11 | case class PredictedResult( 12 | itemScores: Array[ItemScore] 13 | ) extends Serializable 14 | 15 | case class ItemScore( 16 | item: String, 17 | score: Double 18 | ) extends Serializable 19 | 20 | object RecommendationEngine extends IEngineFactory { 21 | def apply() = { 22 | new Engine( 23 | classOf[DataSource], 24 | classOf[Preparator], 25 | Map("als" -> classOf[ALSAlgorithm]), 26 | classOf[Serving]) 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /w3/w3demo/demo2/custom-serving/src/main/scala/Preparator.scala: -------------------------------------------------------------------------------- 1 | package org.template.recommendation 2 | 3 | import 
io.prediction.controller.PPreparator 4 | 5 | import org.apache.spark.SparkContext 6 | import org.apache.spark.SparkContext._ 7 | import org.apache.spark.rdd.RDD 8 | 9 | class Preparator 10 | extends PPreparator[TrainingData, PreparedData] { 11 | 12 | def prepare(sc: SparkContext, trainingData: TrainingData): PreparedData = { 13 | new PreparedData(ratings = trainingData.ratings) 14 | } 15 | } 16 | 17 | class PreparedData( 18 | val ratings: RDD[Rating] 19 | ) extends Serializable 20 | -------------------------------------------------------------------------------- /w3/w3demo/demo2/custom-serving/src/main/scala/Serving.scala: -------------------------------------------------------------------------------- 1 | package org.template.recommendation 2 | 3 | import io.prediction.controller.LServing 4 | 5 | import scala.io.Source 6 | 7 | import io.prediction.controller.Params // ADDED 8 | 9 | // ADDED ServingParams to specify the blacklisting file location. 10 | case class ServingParams(filepath: String) extends Params 11 | 12 | class Serving(val params: ServingParams) 13 | extends LServing[Query, PredictedResult] { 14 | 15 | override 16 | def serve(query: Query, predictedResults: Seq[PredictedResult]) 17 | : PredictedResult = { 18 | val disabledProducts: Set[String] = Source 19 | .fromFile(params.filepath) 20 | .getLines() 21 | .toSet 22 | 23 | val itemScores = predictedResults.head.itemScores 24 | PredictedResult(itemScores.filter(ps => !disabledProducts(ps.item))) 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /w3/w3demo/demo3/recommend-dish/core/src/main/scala/com/amli/w3/recommend/DataSource.scala: -------------------------------------------------------------------------------- 1 | package com.amli.w3.recommend 2 | 3 | import _root_.io.prediction.controller._ 4 | import org.apache.spark.SparkContext 5 | 6 | import com.amli.w3.recommend.io.{DataLoader, MySQLLoader} 7 | 8 | case class DataSourceParamUnit(sources: Seq[String], users: Seq[String], passwords: Seq[String]) 9 | case class DataSourceParams(params: Seq[DataSourceParamUnit]) extends Params 10 | 11 | class DataSource(val dataSourceParams: DataSourceParams) 12 | extends PDataSource[DataSourceParams, EmptyDataParams, TrainingData, Query, EmptyActualResult] { 13 | 14 | override def readTraining(sc: SparkContext): TrainingData = { 15 | TrainingData { 16 | dataSourceParams.params.map { case DataSourceParamUnit(sources, users, passwords) => 17 | val (mergedRestaurants, mergedData) = sources.zip(users.zip(passwords)) 18 | .foldLeft((Seq.empty[Int], TrainingUnit.empty)) { 19 | case ((restaurantIds, trainingData), (source, (user, password))) => 20 | val sqlLoader = MySQLLoader(sc, source, user, password) 21 | val trans = DataLoader.getTransactions(sqlLoader).collect() 22 | val restaurantId = DataLoader.getRestaurantId(sqlLoader) 23 | (restaurantIds :+ restaurantId, trainingData + TrainingUnit(trans)) 24 | } 25 | (mergedRestaurants, mergedData) 26 | } 27 | } 28 | } 29 | } 30 | 31 | case class TrainingUnit(trans: Seq[(Int, Int)]) extends Serializable { 32 | def +(other: TrainingUnit) = new TrainingUnit(this.trans ++ other.trans) 33 | } 34 | 35 | object TrainingUnit { 36 | def empty = new TrainingUnit(Nil) 37 | } 38 | 39 | case class TrainingData(data: Seq[(Seq[Int], TrainingUnit)]) extends Serializable { 40 | override def toString = s"Training data contains ${data.size} training units. " + 41 | s"The first training unit contains ${data.head._2.trans.size} training data." 
42 | } 43 | -------------------------------------------------------------------------------- /w3/w3demo/demo3/recommend-dish/core/src/main/scala/com/amli/w3/recommend/Engine.scala: -------------------------------------------------------------------------------- 1 | package com.amli.w3.recommend 2 | 3 | import _root_.io.prediction.controller._ 4 | 5 | case class Query( 6 | restaurantId: Int, 7 | users: Array[Int], 8 | personAmount: Int, 9 | expectedConsumePerHead: Int, 10 | mealType: Int, 11 | itemsInCart: Array[Int], 12 | clickedItems: Array[Int], 13 | num: Int 14 | ) extends Serializable 15 | 16 | case class PredictedResult( 17 | recommendItems: Array[RecommendItem] 18 | ) extends Serializable 19 | 20 | case class RecommendItem( 21 | id: Int, 22 | score: Double, 23 | reason: String 24 | ) extends Serializable 25 | 26 | object RecommendationEngine extends IEngineFactory { 27 | override def apply() = { 28 | new Engine( 29 | classOf[DataSource], 30 | classOf[Preparator], 31 | Map("itemtoitem" -> classOf[ItemToItem]), 32 | classOf[Serving]) 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /w3/w3demo/demo3/recommend-dish/core/src/main/scala/com/amli/w3/recommend/ItemToItem.scala: -------------------------------------------------------------------------------- 1 | package com.amli.w3.recommend 2 | 3 | import _root_.io.prediction.controller._ 4 | 5 | import com.amli.w3.recommend.io.DataTransfer._ 6 | import com.amli.w3.recommend.model.ItemSimilarity 7 | 8 | case class ItemToItemParamUnit(k: Int, modelFile: String) 9 | case class ItemToItemParams(params: Seq[ItemToItemParamUnit]) extends Params 10 | 11 | class ItemToItem(val itemToItemParams: ItemToItemParams) 12 | extends PAlgorithm[ItemToItemParams, PreparedData, ItemToItemModel, Query, PredictedResult] { 13 | 14 | override def train(data: PreparedData): ItemToItemModel = { 15 | val models = itemToItemParams.params.zip(data.data).flatMap { 16 | case (ItemToItemParamUnit(k, modelFile), (restaurantIds, TrainingUnit(trans))) => 17 | val model = new ItemSimilarity() 18 | model.configModel(Map("k" -> k.toString, "comment" -> restaurantIds.mkString(","))) 19 | model.trainModel(addVirtualUserToTransaction(trans)) 20 | var i = 0 21 | var res: Seq[(Int, ItemSimilarity, Option[String])] = Nil 22 | while (i < restaurantIds.size) { 23 | if (i == 0) res = res.+:((restaurantIds(i), model, Option(modelFile))) 24 | else res = res.+:((restaurantIds(i), model, None)) 25 | i += 1 26 | } 27 | res 28 | } 29 | ItemToItemModel( 30 | models.map { case (id, model, store) => (id, model) }.toMap, 31 | models.map { case (id, model, store) => (id, store) }.toMap 32 | ) 33 | } 34 | 35 | override def predict(model: ItemToItemModel, query: Query): PredictedResult = { 36 | val results = { 37 | if (!model.models.contains(query.restaurantId)) Nil 38 | else model.models(query.restaurantId) 39 | .recommend(query.users.toSeq, query.itemsInCart.toSeq, query.num) 40 | } 41 | new PredictedResult(results.map (r => RecommendItem(r.item, r.score, r.reason)).toArray) 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /w3/w3demo/demo3/recommend-dish/core/src/main/scala/com/amli/w3/recommend/ItemToItemModel.scala: -------------------------------------------------------------------------------- 1 | package com.amli.w3.recommend 2 | 3 | import _root_.io.prediction.controller._ 4 | import org.apache.spark.SparkContext 5 | 6 | import com.amli.w3.recommend.model.ItemSimilarity 7 | 8 | class 
ItemToItemModel(val models: Map[Int, ItemSimilarity], stores: Map[Int, Option[String]]) 9 | extends IPersistentModel[ItemToItemParams] { 10 | 11 | override def save(id: String, params: ItemToItemParams, sc: SparkContext): Boolean = { 12 | for (restaurantId <- models.keySet) { 13 | val model = models(restaurantId) 14 | val store = stores(restaurantId) 15 | store match { 16 | case Some(something) => model.exportModel(something) 17 | case None => // do nothing 18 | } 19 | } 20 | true 21 | } 22 | } 23 | 24 | object ItemToItemModel extends IPersistentModelLoader[ItemToItemParams, ItemToItemModel] { 25 | 26 | def apply(models: Map[Int, ItemSimilarity], stores: Map[Int, Option[String]] = null) = { 27 | new ItemToItemModel(models, stores) 28 | } 29 | 30 | def parseComment(comment: String): Seq[Int] = { 31 | comment.split(",").map(_.toInt) 32 | } 33 | 34 | override def apply(id: String, params: ItemToItemParams, sc: Option[SparkContext]) = { 35 | ItemToItemModel( 36 | params.params.flatMap { case ItemToItemParamUnit(_, modelFile) => 37 | val model = new ItemSimilarity() 38 | model.importModel(modelFile) 39 | val restaurantIds = parseComment(model.comment) 40 | restaurantIds.map(_ -> model) 41 | }.toMap 42 | ) 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /w3/w3demo/demo3/recommend-dish/core/src/main/scala/com/amli/w3/recommend/Preparator.scala: -------------------------------------------------------------------------------- 1 | package com.amli.w3.recommend 2 | 3 | import _root_.io.prediction.controller._ 4 | import org.apache.spark.SparkContext 5 | 6 | class Preparator extends PPreparator[EmptyPreparatorParams, TrainingData, PreparedData] { 7 | 8 | override def prepare(sc: SparkContext, trainingData: TrainingData): PreparedData = { 9 | PreparedData(trainingData.data) 10 | } 11 | } 12 | 13 | case class PreparedData(data: Seq[(Seq[Int], TrainingUnit)]) extends Serializable { 14 | override def toString = s"Prepared data contains ${data.size} training units." 
15 | } 16 | -------------------------------------------------------------------------------- /w3/w3demo/demo3/recommend-dish/core/src/main/scala/com/amli/w3/recommend/Serving.scala: -------------------------------------------------------------------------------- 1 | package com.amli.w3.recommend 2 | 3 | import _root_.io.prediction.controller._ 4 | 5 | class Serving extends LServing[EmptyServingParams, Query, PredictedResult] { 6 | 7 | override def serve(query: Query, PredictedResults: Seq[PredictedResult]): PredictedResult = { 8 | PredictedResults.head 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /w3/w3demo/demo3/recommend-dish/core/src/test/scala/com/amli/w3/recommend/DataSourceSuite.scala: -------------------------------------------------------------------------------- 1 | package com.amli.w3.recommend 2 | 3 | class DataSourceSuite { 4 | 5 | } 6 | -------------------------------------------------------------------------------- /w3/w3demo/demo3/recommend-dish/datalyze/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Set everything to be logged to the console 2 | log4j.rootCategory=ALL, console 3 | log4j.appender.console=org.apache.log4j.ConsoleAppender 4 | log4j.appender.console.target=System.err 5 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 6 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 7 | 8 | # Settings to quiet third party logs that are too verbose 9 | log4j.logger.org.eclipse.jetty=WARN 10 | log4j.logger.org.eclipse.jetty.util.component.AbstractLifeCycle=ERROR 11 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO 12 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO 13 | -------------------------------------------------------------------------------- /w3/w3demo/demo3/recommend-dish/datalyze/src/main/scala/com/amli/w3/recommend/Logging.scala: -------------------------------------------------------------------------------- 1 | package com.amli.w3.recommend 2 | 3 | import org.apache.log4j.{LogManager, PropertyConfigurator} 4 | import org.slf4j.impl.StaticLoggerBinder 5 | import org.slf4j.{Logger, LoggerFactory} 6 | 7 | /** 8 | * :: DeveloperApi :: 9 | * Utility trait for classes that want to log data. Creates a SLF4J logger for the class and allows 10 | * logging messages at different levels using methods that only evaluate parameters lazily if the 11 | * log level is enabled. 12 | * 13 | * NOTE: DO NOT USE this class outside of Spark. It is intended as an internal utility. 14 | * This will likely be changed or removed in future releases. 
15 | */ 16 | trait Logging { 17 | // Make the log field transient so that objects with Logging can 18 | // be serialized and used on another machine 19 | @transient private var log_ : Logger = null 20 | 21 | // Method to get the logger name for this object 22 | protected def logName = { 23 | // Ignore trailing $'s in the class names for Scala objects 24 | this.getClass.getName.stripSuffix("$") 25 | } 26 | 27 | // Method to get or create the logger for this object 28 | protected def log: Logger = { 29 | if (log_ == null) { 30 | initializeIfNecessary() 31 | log_ = LoggerFactory.getLogger(logName) 32 | } 33 | log_ 34 | } 35 | 36 | // Log methods that take only a String 37 | protected def logInfo(msg: => String) { 38 | if (log.isInfoEnabled) log.info(msg) 39 | } 40 | 41 | protected def logDebug(msg: => String) { 42 | if (log.isDebugEnabled) log.debug(msg) 43 | } 44 | 45 | protected def logTrace(msg: => String) { 46 | if (log.isTraceEnabled) log.trace(msg) 47 | } 48 | 49 | protected def logWarning(msg: => String) { 50 | if (log.isWarnEnabled) log.warn(msg) 51 | } 52 | 53 | protected def logError(msg: => String) { 54 | if (log.isErrorEnabled) log.error(msg) 55 | } 56 | 57 | // Log methods that take Throwables (Exceptions/Errors) too 58 | protected def logInfo(msg: => String, throwable: Throwable) { 59 | if (log.isInfoEnabled) log.info(msg, throwable) 60 | } 61 | 62 | protected def logDebug(msg: => String, throwable: Throwable) { 63 | if (log.isDebugEnabled) log.debug(msg, throwable) 64 | } 65 | 66 | protected def logTrace(msg: => String, throwable: Throwable) { 67 | if (log.isTraceEnabled) log.trace(msg, throwable) 68 | } 69 | 70 | protected def logWarning(msg: => String, throwable: Throwable) { 71 | if (log.isWarnEnabled) log.warn(msg, throwable) 72 | } 73 | 74 | protected def logError(msg: => String, throwable: Throwable) { 75 | if (log.isErrorEnabled) log.error(msg, throwable) 76 | } 77 | 78 | protected def isTraceEnabled(): Boolean = { 79 | log.isTraceEnabled 80 | } 81 | 82 | private def initializeIfNecessary() { 83 | if (!Logging.initialized) { 84 | Logging.initLock.synchronized { 85 | if (!Logging.initialized) { 86 | initializeLogging() 87 | } 88 | } 89 | } 90 | } 91 | 92 | private def initializeLogging() { 93 | // Don't use a logger in here, as this is itself occurring during initialization of a logger 94 | // If Log4j 1.2 is being used, but is not initialized, load a default properties file 95 | val binderClass = StaticLoggerBinder.getSingleton.getLoggerFactoryClassStr 96 | // This distinguishes the log4j 1.2 binding, currently 97 | // org.slf4j.impl.Log4jLoggerFactory, from the log4j 2.0 binding, currently 98 | // org.apache.logging.slf4j.Log4jLoggerFactory 99 | val usingLog4j12 = "org.slf4j.impl.Log4jLoggerFactory".equals(binderClass) 100 | val log4j12Initialized = LogManager.getRootLogger.getAllAppenders.hasMoreElements 101 | if (!log4j12Initialized && usingLog4j12) { 102 | val defaultLogProps = "com/ilc/dec/log4j-defaults.properties" 103 | Option(getClass.getClassLoader.getResource(defaultLogProps)) match { 104 | case Some(url) => 105 | PropertyConfigurator.configure(url) 106 | System.err.println(s"Using Spark's default log4j profile: $defaultLogProps") 107 | case None => 108 | System.err.println(s"Spark was unable to load $defaultLogProps") 109 | } 110 | } 111 | Logging.initialized = true 112 | 113 | // Force a call into slf4j to initialize it. 
Avoids this happening from multiple threads 114 | // and triggering this: http://mailman.qos.ch/pipermail/slf4j-dev/2010-April/002956.html 115 | log 116 | } 117 | } 118 | 119 | private object Logging { 120 | @volatile private var initialized = false 121 | val initLock = new Object() 122 | try { 123 | // We use reflection here to handle the case where users remove the 124 | // slf4j-to-jul bridge order to route their logs to JUL. 125 | val bridgeClass = Class.forName("org.slf4j.bridge.SLF4JBridgeHandler") 126 | bridgeClass.getMethod("removeHandlersForRootLogger").invoke(null) 127 | val installed = bridgeClass.getMethod("isInstalled").invoke(null).asInstanceOf[Boolean] 128 | if (!installed) { 129 | bridgeClass.getMethod("install").invoke(null) 130 | } 131 | } catch { 132 | case e: ClassNotFoundException => // can't log anything yet so just fail silently 133 | } 134 | } 135 | -------------------------------------------------------------------------------- /w3/w3demo/demo3/recommend-dish/datalyze/src/main/scala/com/amli/w3/recommend/io/DataLoader.scala: -------------------------------------------------------------------------------- 1 | package com.amli.w3.recommend.io 2 | 3 | import java.sql.ResultSet 4 | import scala.io.Source._ 5 | 6 | import org.apache.spark.rdd.RDD 7 | 8 | import com.amli.w3.recommend.Logging 9 | import com.amli.w3.recommend.model.{EmptyTransaction, Transaction} 10 | 11 | /** 12 | * Load data from a kind of Loader. Currently only `MySQLLoader` is implemented. Other loaders such 13 | * as `HDFSLoader` will be implemented in next release. 14 | */ 15 | object DataLoader extends Logging { 16 | 17 | /** 18 | * Count all food orders from loader. 19 | */ 20 | def getOrderCount(sqlLoader: MySQLLoader): Int = { 21 | val mapRow: (ResultSet) => Int = rs => rs.getInt(1) 22 | val qStatement = "SELECT count(*) FROM foodorder;" 23 | sqlLoader.query[Int](qStatement, mapRow).first() 24 | } 25 | 26 | /** 27 | * Get all transactions from loader. 28 | */ 29 | def getTransactions(sqlLoader: MySQLLoader): RDD[(Int, Int)] = { 30 | val mapRow: (ResultSet) => (Int, Int) = rs => (rs.getInt(1), rs.getInt(2)) 31 | val qStatement = "SELECT foodid, foodorderid FROM foodorderdetail;" 32 | sqlLoader.query[(Int, Int)](qStatement, mapRow) 33 | } 34 | 35 | /** 36 | * Get the mapping from foodid to foodname. 37 | */ 38 | def getDictionary(sqlLoader: MySQLLoader): RDD[(Int, String)] = { 39 | val mapRow: (ResultSet) => (Int, String) = rs => (rs.getInt(1), rs.getString(2)) 40 | val qStatement = " SELECT foodid, foodname FROM food;" 41 | sqlLoader.query[(Int, String)](qStatement, mapRow) 42 | } 43 | 44 | /** 45 | * Get current restaurant Id. 46 | */ 47 | def getRestaurantId(sqlLoader: MySQLLoader): Int = { 48 | val mapRow: (ResultSet) => Int = rs => rs.getInt(1) 49 | val qStatement = "SELECT companyid FROM company;" 50 | sqlLoader.query[Int](qStatement, mapRow).first() 51 | } 52 | 53 | /** 54 | * Get all tables from a given database. 55 | */ 56 | def getTables(sqlLoader: MySQLLoader): RDD[String] = { 57 | val mapRow: (ResultSet) => String = rs => rs.getString(1) 58 | val qStatement = "SHOW TABLES;" 59 | sqlLoader.query[String](qStatement, mapRow) 60 | } 61 | 62 | /** 63 | * Get a function of `tableName`, which gets all column names of the table. 
64 | */ 65 | def getColumns(sqlLoader: MySQLLoader): (String) => RDD[String] = { 66 | tableName => 67 | val mapRow: (ResultSet) => String = rs => rs.getString(1) 68 | val qStatement = s"SELECT column_name FROM information_schema.columns " + 69 | s"WHERE table_name = $tableName AND table_schema = ${sqlLoader.dbName};" 70 | sqlLoader.query[String](qStatement, mapRow) 71 | } 72 | 73 | /** 74 | * Get a function of `(tableName, columnName)`, which count the non-null elements in this column. 75 | */ 76 | def getColumnNotNullCount(sqlLoader: MySQLLoader): (String, String) => Int = { 77 | (tableName, columnName) => 78 | val mapRow: (ResultSet) => Int = rs => rs.getInt(1) 79 | val qStatement = s"SELECT COUNT($columnName) FROM $tableName WHERE $columnName IS NOT NULL;" 80 | sqlLoader.query[Int](qStatement, mapRow).first() 81 | } 82 | 83 | /** 84 | * Get a function of `(tableName, columnName)`, which count all elements in this column. 85 | */ 86 | def getColumnCount(sqlLoader: MySQLLoader): (String, String) => Int = { 87 | (tableName, columnName) => 88 | val mapRow: (ResultSet) => Int = rs => rs.getInt(1) 89 | val qStatement = s"SELECT COUNT($columnName) FROM $tableName;" 90 | sqlLoader.query[Int](qStatement, mapRow).first() 91 | } 92 | 93 | /** 94 | * Get `(itemId, transactionId)` or `(itemId, transactionId, userId)` from a file. 95 | */ 96 | def getTransDataFromFile(transFile: String): Seq[Transaction] = { 97 | fromFile(transFile).getLines().map { line => 98 | val words = line.split("\\s+") 99 | if (words.size < 2) { 100 | logError(s"Broken line of transaction file: $line") 101 | EmptyTransaction 102 | } else if (words.size == 2) { 103 | Transaction(words(1).toInt, words(1).toInt, words(0).toInt) 104 | } else { 105 | Transaction(words(1).toInt, words(2).toInt, words(0).toInt) 106 | } 107 | }.toSeq 108 | } 109 | 110 | /** 111 | * Get all orders of each user. 112 | */ 113 | def getOrderSetOfUsers(trans: Seq[Transaction]): Map[Int, Seq[Int]] = { 114 | trans.map(tran => (tran.user, tran.item)) 115 | .groupBy(_._1).map { case (u, orders) => 116 | (u, orders.unzip._2) 117 | } 118 | } 119 | 120 | /** 121 | * Get all orders of each transaction. 122 | */ 123 | def getOrderSetOfTransactions(trans: Seq[Transaction]): Map[Int, (Seq[Int], Seq[Int])] = { 124 | trans.map { case tran => (tran.transaction, (tran.user, tran.item)) } 125 | .groupBy(_._1).map { case (t, orders) => 126 | val ui = orders.unzip._2 127 | val users = ui.unzip._1.toSet.toSeq 128 | val items = ui.unzip._2.toSet.toSeq 129 | (t, (users, items)) 130 | } 131 | } 132 | } 133 | -------------------------------------------------------------------------------- /w3/w3demo/demo3/recommend-dish/datalyze/src/main/scala/com/amli/w3/recommend/io/DataTransfer.scala: -------------------------------------------------------------------------------- 1 | package com.amli.w3.recommend.io 2 | 3 | import com.amli.w3.recommend.model.Transaction 4 | 5 | /** 6 | * Transfer data from one format to another. 7 | */ 8 | object DataTransfer { 9 | // TODO: Combine `DataTransfer` with `SchemaRDD` and `ML` package for unified transformation. 10 | 11 | /** 12 | * Map a transaction to a virtual user. 13 | */ 14 | def transactionToVirtualUser(transaction: Int): Int = { 15 | // TODO: make sure that the virtualUser will not be conflict to real user 16 | transaction 17 | } 18 | 19 | /** 20 | * Add virtual users to `(itemId, transactionId)` pairs. 
21 | */ 22 | def addVirtualUserToTransaction(trans: Seq[(Int, Int)]): Seq[Transaction] = { 23 | trans.map { case (item, transaction) => 24 | Transaction(transaction, transactionToVirtualUser(transaction), item) 25 | } 26 | } 27 | 28 | /** 29 | * Remove `userId`s from a sequence of `Transaction`. 30 | */ 31 | def removeUserFromTransaction(trans: Seq[Transaction]): Seq[(Int, Int)] = { 32 | trans.map(tran => (tran.item, tran.transaction)).distinct 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /w3/w3demo/demo3/recommend-dish/datalyze/src/main/scala/com/amli/w3/recommend/io/FeaturedData.scala: -------------------------------------------------------------------------------- 1 | package com.amli.w3.recommend.io 2 | 3 | import org.apache.spark.sql._ 4 | 5 | /** 6 | * Generate a feature data given an RDD of `Row`, which is used for data transformation. 7 | */ 8 | case class FeaturedData(sqlCtx: SQLContext, dat: SchemaRDD, tableName: String) { 9 | dat.registerTempTable(tableName) 10 | 11 | /** 12 | * Given a SQL transformation (with UDF), transform a `FeaturedData` to another one. 13 | */ 14 | def transform(transformer: String, otherTableName: String): FeaturedData = { 15 | new FeaturedData(sqlCtx, sqlCtx.sql(transformer), otherTableName) 16 | } 17 | } 18 | 19 | -------------------------------------------------------------------------------- /w3/w3demo/demo3/recommend-dish/datalyze/src/main/scala/com/amli/w3/recommend/io/MySQLLoader.scala: -------------------------------------------------------------------------------- 1 | package com.amli.w3.recommend.io 2 | 3 | import java.sql.{DriverManager, ResultSet} 4 | import scala.collection.mutable 5 | import scala.reflect.ClassTag 6 | 7 | import org.apache.spark.SparkContext 8 | import org.apache.spark.rdd.RDD 9 | 10 | /** 11 | * Interface that reading data from MySQL. 12 | */ 13 | import com.amli.w3.recommend.Logging 14 | 15 | class MySQLLoader(sc: SparkContext, connStr: String, user: String, password: String) 16 | extends Logging { 17 | 18 | val dbName = connStr.split("/").last 19 | 20 | /** 21 | * Query from a mysql database with a row mapping function. 22 | */ 23 | def query[T: ClassTag]( 24 | sql: String, 25 | mapRow: (ResultSet) => T = MySQLLoader.resultSetToObjectArray _): RDD[T] = { 26 | val conn = DriverManager.getConnection(connStr, user, password) 27 | val stmt = conn.prepareStatement(sql, ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY) 28 | if (conn.getMetaData.getURL.matches("jdbc:mysql:.*")) { 29 | stmt.setFetchSize(Integer.MIN_VALUE) 30 | } 31 | val rs = stmt.executeQuery() 32 | 33 | val arrayBuilder = mutable.ListBuffer[T]() 34 | while(rs.next()) { 35 | arrayBuilder += mapRow(rs) 36 | } 37 | try { 38 | if (null != rs && ! rs.isClosed) { 39 | rs.close() 40 | } 41 | } catch { 42 | case e: Exception => logWarning("Exception closing resultset", e) 43 | } 44 | try { 45 | if (null != stmt && ! stmt.isClosed) { 46 | stmt.close() 47 | } 48 | } catch { 49 | case e: Exception => logWarning("Exception closing statement", e) 50 | } 51 | try { 52 | if (null != conn && ! 
conn.isClosed) { 53 | conn.close() 54 | } 55 | logInfo("closed connection") 56 | } catch { 57 | case e: Exception => logWarning("Exception closing connection", e) 58 | } 59 | sc.parallelize[T](arrayBuilder) 60 | } 61 | } 62 | 63 | object MySQLLoader { 64 | def apply(sc: SparkContext, conn: String, user: String = null, password: String = null) = { 65 | new MySQLLoader(sc, conn, user, password) 66 | } 67 | 68 | def apply(sc: SparkContext, dbMetaData: (String, String, String)) = { 69 | new MySQLLoader(sc, dbMetaData._1, dbMetaData._2, dbMetaData._3) 70 | } 71 | 72 | /** 73 | * Default row mapping function, which maps a row into an array of `Object`. 74 | */ 75 | def resultSetToObjectArray(rs: ResultSet): Array[Object] = { 76 | Array.tabulate[Object](rs.getMetaData.getColumnCount)(i => rs.getObject(i + 1)) 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /w3/w3demo/demo3/recommend-dish/datalyze/src/main/scala/com/amli/w3/recommend/io/Utils.scala: -------------------------------------------------------------------------------- 1 | package com.amli.w3.recommend.io 2 | 3 | object Utils { 4 | val dbConnStr: (String) => String = dbName => s"jdbc:mysql://localhost:3306/$dbName" 5 | val dbNames = Array("jts", "hbo", "qyf", "qjlx", "gwx") 6 | val dbMap = dbNames.map( dbName => dbName -> (dbConnStr(dbName), "root", "root")).toMap 7 | } 8 | -------------------------------------------------------------------------------- /w3/w3demo/demo3/recommend-dish/datalyze/src/main/scala/com/amli/w3/recommend/model/ItemSimilarity.scala: -------------------------------------------------------------------------------- 1 | package com.amli.w3.recommend.model 2 | 3 | import scala.collection.mutable.{Map => MutableMap} 4 | import scala.io.Source.fromFile 5 | import java.io.PrintWriter 6 | import collection.immutable.ListMap 7 | 8 | import com.amli.w3.recommend.io.DataTransfer._ 9 | import com.amli.w3.recommend.io.CliqueStat 10 | 11 | /** 12 | * Item similarity model. 13 | */ 14 | class ItemSimilarity extends Model with ModelLike { 15 | 16 | /** 17 | * Number of items referenced when computing similarity. 18 | */ 19 | var k: Int = 20 20 | 21 | /** 22 | * Similarity matrix with weights. 23 | */ 24 | var similarities = Map[Int, Map[Int, Double]]() 25 | 26 | /** 27 | * A mix-in popularity model for recommendation call-back in case of cold start. 
28 | */ 29 | val popularityModel = new PopularityModel() 30 | 31 | override def trainModel(trans: Seq[Transaction]): ModelLike = { 32 | popularityModel.trainModel(trans) 33 | import popularityModel._ 34 | 35 | val clique = new CliqueStat(removeUserFromTransaction(trans)) 36 | val coMatrix = clique.getNClique(2).flatMap { case (Seq(u, v), c) => 37 | Map((u, v) -> c, (v, u) -> c) 38 | } 39 | 40 | similarities = coMatrix.map { case ((u, v), c) => 41 | ((u, v), c / math.sqrt(popularities(u) * popularities(v))) 42 | }.groupBy(_._1._1).map { case (k, l) => 43 | (k, ListMap(l.map { case ((u, v), c) => 44 | (v, c) 45 | }.toSeq.sortBy(_._2).reverse: _*)) 46 | }.map { case (k, l) => 47 | (k, l.map { case (v, c) => (v, c / l.head._2) }) 48 | } 49 | 50 | this 51 | } 52 | 53 | override def exportModel(outFile: String): ModelLike = { 54 | val out = new PrintWriter(outFile) 55 | out.println(s"comment: $comment") 56 | out.println(s"k: $k") 57 | for ((itemA, relatedItems) <- similarities) { 58 | for ((itemB, sim) <- similarities(itemA)) { 59 | out.println(s"similarity: $itemA $itemB $sim") 60 | } 61 | } 62 | out.close() 63 | popularityModel.exportModel(outFile) 64 | this 65 | } 66 | 67 | override def importModel(inFile: String): ModelLike = { 68 | val tmpSimilarities = MutableMap[Int, Map[Int, Double]]() 69 | 70 | val lineIterator = fromFile(inFile).getLines() 71 | for (line <- lineIterator) { 72 | val words = line.split("\\s+") 73 | if (words(0) == "comment:") comment = words(1) 74 | if (words(0) == "k:") k = words(1).toInt 75 | if (words(0) == "similarity:") { 76 | val (itemA, itemB, sim) = 77 | (words(1).toInt, words(2).toInt, words(3).toDouble) 78 | if (tmpSimilarities.contains(itemA)) { 79 | tmpSimilarities(itemA) += (itemB -> sim) 80 | } else { 81 | tmpSimilarities += (itemA -> Map(itemB -> sim)) 82 | } 83 | } 84 | } 85 | val sortedSimilarities = tmpSimilarities.map { case (user, items) => 86 | (user, ListMap(items.toSeq.sortBy(_._2).reverse:_*)) 87 | } 88 | similarities = sortedSimilarities.toMap 89 | popularityModel.importModel(inFile) 90 | this 91 | } 92 | 93 | override def configModel(params: Map[String, String]): ModelLike = { 94 | params.map { 95 | case ("k", v) => 96 | val tmp = scala.util.Try(v.toInt).toOption 97 | if (tmp.isDefined) k = tmp.get 98 | case ("comment", c) => 99 | comment = c 100 | case _ => 101 | } 102 | this 103 | } 104 | 105 | override def recommend(users: Seq[Int], itemsInCart: Seq[Int], num: Int): Seq[RecommendResult] = { 106 | import popularityModel._ 107 | 108 | var rank = MutableMap[Int, (Double, Int, Double)]() 109 | val allRefItems = (orderHistoryOfUsers(users) ++ itemsInCart).toSet 110 | 111 | for (i <- allRefItems) { 112 | if (similarities.contains(i)) { 113 | for ((j, sim) <- similarities(i).slice(0, k)) { 114 | if (!itemsInCart.contains(j)) { 115 | if (rank.contains(j)) { 116 | if (sim > rank(j)._3) rank(j) = (rank(j)._1 + sim, i, sim) 117 | else rank(j) = (rank(j)._1 + sim, rank(j)._2, rank(j)._3) 118 | } else { 119 | rank += (j -> (sim, i, sim)) 120 | } 121 | } 122 | } 123 | } 124 | } 125 | val results = ListMap(rank.toSeq.sortBy(_._2._1).reverse:_*) 126 | .slice(0, num).map { case (j, (weight, i, wi)) => 127 | if (itemsInCart.contains(i)) { 128 | new RecommendResult(j, weight, s"You ordered $i") 129 | } else { 130 | new RecommendResult(j, weight, s"You ever ordered $i") 131 | } 132 | }.toSeq 133 | results ++ popularityModel.recommend(users, itemsInCart, num - results.size) 134 | } 135 | } 136 | 
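A minimal sketch of driving the `ItemSimilarity` model above on its own, outside the PredictionIO engine; the item/transaction ids, the restaurant id passed as `comment`, and the model path are made-up illustrative values:

    import com.amli.w3.recommend.io.DataTransfer._
    import com.amli.w3.recommend.model.ItemSimilarity

    // (itemId, transactionId) pairs, the same shape DataLoader.getTransactions(...).collect() returns
    val trans: Seq[(Int, Int)] = Seq((7454, 1), (7455, 1), (7454, 2), (7456, 2))

    val model = new ItemSimilarity()
    model.configModel(Map("k" -> "20", "comment" -> "1003")) // neighbourhood size and restaurant id
    model.trainModel(addVirtualUserToTransaction(trans))     // co-occurrence counts come from CliqueStat
    model.exportModel("/tmp/item_to_item.model")             // plain-text dump, read back with importModel

    // top-5 suggestions for an anonymous basket that already holds item 7454
    model.recommend(users = Seq.empty, itemsInCart = Seq(7454), num = 5)
      .foreach(r => println(s"${r.item} ${r.score} ${r.reason}"))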
-------------------------------------------------------------------------------- /w3/w3demo/demo3/recommend-dish/datalyze/src/main/scala/com/amli/w3/recommend/model/Model.scala: -------------------------------------------------------------------------------- 1 | package com.amli.w3.recommend.model 2 | 3 | case class RecommendResult(item: Int, score: Double, reason: String) 4 | case class Transaction(transaction: Int, user: Int, item: Int) 5 | object EmptyTransaction extends Transaction(-1, -1, -1) 6 | 7 | /** 8 | * Defines the behaviors of a `Model`. Any class that implements these interfaces is a `Model`. 9 | */ 10 | trait ModelLike { 11 | def trainModel(trans: Seq[Transaction]): ModelLike 12 | def exportModel(outFile: String): ModelLike 13 | def importModel(inFile: String): ModelLike 14 | def configModel(params: Map[String, String]): ModelLike 15 | def recommend(users: Seq[Int], itemsInCart: Seq[Int], num: Int): Seq[RecommendResult] 16 | } 17 | 18 | /** 19 | * Defines all common attributes of a `Model`. 20 | */ 21 | abstract class Model { 22 | var comment: String = "fake comment" 23 | } 24 | -------------------------------------------------------------------------------- /w3/w3demo/demo3/recommend-dish/datalyze/src/main/scala/com/amli/w3/recommend/model/PopularityModel.scala: -------------------------------------------------------------------------------- 1 | package com.amli.w3.recommend.model 2 | 3 | import java.io.{File, FileWriter, PrintWriter} 4 | import com.amli.w3.recommend.io.CliqueStat 5 | 6 | import scala.io.Source.fromFile 7 | import scala.collection.mutable.{Map => MutableMap} 8 | import scala.collection.immutable.ListMap 9 | 10 | import com.amli.w3.recommend.io.DataTransfer._ 11 | import com.amli.w3.recommend.io.DataLoader._ 12 | 13 | class PopularityModel extends Model with ModelLike { 14 | /** 15 | * Popularity of all items. 16 | */ 17 | var popularities = Map[Int, Int]() 18 | 19 | /** 20 | * Order history of each user or transaction.
21 | */ 22 | var orderHistory = Map[Int, Seq[Int]]() 23 | 24 | override def trainModel(trans: Seq[Transaction]): ModelLike = { 25 | val clique = new CliqueStat(removeUserFromTransaction(trans)) 26 | popularities = ListMap(clique.getNClique(1).map { 27 | case (Seq(u), c) => (u, c) 28 | }.toSeq.sortBy(_._2).reverse:_*).toMap 29 | orderHistory = getOrderSetOfUsers(trans) 30 | this 31 | } 32 | 33 | override def exportModel(outFile: String): ModelLike = { 34 | val out = new PrintWriter(new FileWriter(new File(outFile), true)) 35 | for ((user, ordered) <- orderHistory) { 36 | for (item <- ordered) out.println(s"order: $user $item") 37 | } 38 | for ((item, pop) <- popularities) { 39 | out.println(s"popularity: $item $pop") 40 | } 41 | out.close() 42 | this 43 | } 44 | 45 | override def importModel(inFile: String): ModelLike = { 46 | val tmpOrderHistory = MutableMap[Int, Seq[Int]]() 47 | val tmpPopularities = MutableMap[Int, Int]() 48 | 49 | val lineIterator = fromFile(inFile).getLines() 50 | for (line <- lineIterator) { 51 | val words = line.split("\\s+") 52 | if (words(0) == "order:") { 53 | val (user, item) = (words(1).toInt, words(2).toInt) 54 | if (tmpOrderHistory.contains(user)) tmpOrderHistory(user) ++= Seq(item) 55 | else tmpOrderHistory += (user -> Seq(item)) 56 | } 57 | if (words(0) == "popularity:") { 58 | val (item, pop) = (words(1).toInt, words(2).toInt) 59 | tmpPopularities += (item -> pop) 60 | } 61 | } 62 | val sorted = ListMap(tmpPopularities.toSeq.sortBy(_._2).reverse:_*) 63 | orderHistory = tmpOrderHistory.toMap 64 | popularities = sorted.toMap 65 | this 66 | } 67 | 68 | override def configModel(params: Map[String, String]): ModelLike = { this } 69 | 70 | override def recommend(users: Seq[Int], 71 | itemsInCart: Seq[Int], 72 | num: Int): Seq[RecommendResult] = { 73 | popularities.slice(0, num).map { case (item, pop) => 74 | new RecommendResult(item, pop.toDouble, "Most popular") 75 | }.toSeq 76 | } 77 | 78 | def orderHistoryOfUsers(users: Seq[Int]): Seq[Int] = { 79 | users.flatMap { case user => orderHistory.getOrElse(user, Nil) } 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /w3/w3demo/demo3/recommend-dish/datalyze/src/main/scala/com/amli/w3/recommend/package.scala: -------------------------------------------------------------------------------- 1 | package com.amli.w3 2 | 3 | import org.apache.spark.sql.SQLContext 4 | import org.apache.spark.{SparkContext, SparkConf} 5 | 6 | package object recommend { 7 | 8 | object scLocal { 9 | val scConf = new SparkConf().setMaster("local[4]").setAppName("Assist Spark Context") 10 | val sc = new SparkContext(scConf) 11 | val sqlCtx = new SQLContext(sc) 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /w3/w3demo/demo3/recommend-dish/datalyze/src/test/scala/com/amli/w3/recommend/io/CliqueStatSuite.scala: -------------------------------------------------------------------------------- 1 | package com.amli.w3.recommend.io 2 | 3 | import org.scalatest.{Matchers, FunSuite} 4 | 5 | class CliqueStatSuite extends FunSuite with Matchers { 6 | test("test get n cliques") { 7 | val transactions = Seq( 8 | 1 -> 1, 9 | 1 -> 2, 10 | 1 -> 3, 11 | 1 -> 4, 12 | 2 -> 2, 13 | 2 -> 3, 14 | 2 -> 4, 15 | 3 -> 3, 16 | 3 -> 4, 17 | 4 -> 4 18 | ) 19 | val cliques = new CliqueStat(transactions) 20 | val twoCliques = cliques.getNClique(2) 21 | val realTwoCliquesResult = Map( 22 | (List(3, 4),1), 23 | (List(1, 2),3), 24 | (List(2, 3),2), 25 | (List(1, 4),1), 26 | 
(List(2, 4),1), 27 | (List(1, 3),2) 28 | ) 29 | twoCliques should equal (realTwoCliquesResult) 30 | 31 | val threeCliques = cliques.getNClique(3) 32 | val realThreeCliquesResult = Map( 33 | (List(1, 2, 4),1), 34 | (List(2, 3, 4),1), 35 | (List(1, 2, 3),2), 36 | (List(1, 3, 4),1) 37 | ) 38 | threeCliques should equal (realThreeCliquesResult) 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /w3/w3demo/demo3/recommend-dish/datalyze/src/test/scala/com/amli/w3/recommend/io/FeaturedDataSuite.scala: -------------------------------------------------------------------------------- 1 | package com.amli.w3.recommend.io 2 | 3 | import org.scalatest.{Matchers, FunSuite} 4 | 5 | import com.amli.w3.recommend._ 6 | 7 | case class DataPoint(xPoint: Int, yPoint: Int) 8 | 9 | class FeaturedDataSuite extends FunSuite with Matchers { 10 | test("test the functionality of SchemaRDD in FeaturedData") { 11 | val dataLocal = Seq( 12 | DataPoint(1, 2), 13 | DataPoint(3, 4), 14 | DataPoint(5, 6), 15 | DataPoint(7, 8) 16 | ) 17 | 18 | import scLocal._ 19 | import sqlCtx._ 20 | 21 | val data = sc.parallelize[DataPoint](dataLocal) 22 | val feature = FeaturedData(sqlCtx, data.toSchemaRDD, "Graph") 23 | 24 | val avgPoint: (Int, Int) => Double = (l, r) => (l + r) / 2.0 25 | sqlCtx.registerFunction("avgPoint", avgPoint) 26 | val avg = feature.transform(s"select avgPoint(xPoint, yPoint) from Graph", "AVG") 27 | avg.dat.map(_.getDouble(0)).collect() should contain allOf (1.5, 3.5, 5.5, 7.5) 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /w3/w3demo/demo3/recommend-dish/datalyze/src/test/scala/com/amli/w3/recommend/io/MySQLLoaderSuite.scala: -------------------------------------------------------------------------------- 1 | package com.amli.w3.recommend.io 2 | 3 | import java.sql.{SQLException, DriverManager, ResultSet} 4 | import org.scalatest.{BeforeAndAfter, Matchers, FunSuite} 5 | 6 | import com.amli.w3.recommend._ 7 | 8 | class MySQLLoaderSuite extends FunSuite with Matchers with BeforeAndAfter { 9 | before { 10 | Class.forName("org.apache.derby.jdbc.EmbeddedDriver") 11 | val conn = DriverManager.getConnection("jdbc:derby:target/MySQLLoaderSuiteDB;create=true") 12 | try { 13 | val create = conn.createStatement 14 | create.execute(""" 15 | CREATE TABLE FOO( 16 | ID INTEGER NOT NULL GENERATED ALWAYS AS IDENTITY (START WITH 1, INCREMENT BY 1), 17 | DATA INTEGER 18 | )""") 19 | create.close() 20 | val insert = conn.prepareStatement("INSERT INTO FOO(DATA) VALUES(?)") 21 | (1 to 100).foreach { i => 22 | insert.setInt(1, i * 2) 23 | insert.executeUpdate 24 | } 25 | insert.close() 26 | } catch { 27 | case e: SQLException if e.getSQLState == "X0Y32" => 28 | // table exists 29 | } finally { 30 | conn.close() 31 | } 32 | } 33 | 34 | test("test mysql data loader") { 35 | import scLocal._ 36 | val mapRow: (ResultSet) => (Int, Int) = rs => (rs.getInt(1), rs.getInt(2)) 37 | val conn = "jdbc:derby:target/MySQLLoaderSuiteDB;create=true" 38 | val qStatement = "SELECT ID, DATA FROM FOO WHERE ID=10" 39 | val data = MySQLLoader(sc, conn).query[(Int, Int)](qStatement, mapRow) 40 | data.collect() should contain only ((10, 20)) 41 | } 42 | 43 | after { 44 | try { 45 | DriverManager.getConnection("jdbc:derby:;shutdown=true") 46 | } catch { 47 | case se: SQLException if se.getSQLState == "XJ015" => 48 | // normal shutdown 49 | } 50 | } 51 | } 52 | -------------------------------------------------------------------------------- 
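For context, a sketch of how the `MySQLLoader` exercised above is wired together with `Utils` and `DataLoader` against the project's real databases; this assumes the hard-coded localhost databases and root/root credentials in `io.Utils` are actually reachable:

    import com.amli.w3.recommend.scLocal._
    import com.amli.w3.recommend.io.{DataLoader, MySQLLoader, Utils}

    // one loader per configured database; dbMap maps a name to (jdbc url, user, password)
    Utils.dbNames.foreach { name =>
      val loader = MySQLLoader(sc, Utils.dbMap(name))
      val restaurantId = DataLoader.getRestaurantId(loader)
      val trans = DataLoader.getTransactions(loader).collect() // (foodid, foodorderid) pairs
      println(s"$name: restaurant $restaurantId, ${trans.length} order lines")
    }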
/w3/w3demo/demo3/recommend-dish/datalyze/src/test/scala/com/amli/w3/recommend/model/CommunityModelSuite.scala: -------------------------------------------------------------------------------- 1 | package com.amli.w3.recommend.model 2 | 3 | import java.io.PrintWriter 4 | 5 | import org.scalatest.{BeforeAndAfter, FunSuite} 6 | 7 | import com.amli.w3.recommend.io.DataLoader._ 8 | import com.amli.w3.recommend.scLocal._ 9 | 10 | class CommunityModelSuite extends FunSuite with BeforeAndAfter { 11 | val lambda = 0.8 12 | val numIterations = 10 13 | val transFile = "/tmp/transaction.txt" 14 | val modelFile = "/tmp/model.txt" 15 | 16 | before { 17 | val out = new PrintWriter(transFile) 18 | for (i <- 1 to 100) { 19 | out.println(s"${i % 20} ${i % 80}") 20 | } 21 | out.close() 22 | } 23 | 24 | test("test train model") { 25 | val model = new CommunityModel(sc) 26 | val trans = getTransDataFromFile(transFile) 27 | model.configModel(Map(("lambda", lambda.toString), ("numIterations", numIterations.toString))) 28 | assert(model.lambda === lambda) 29 | assert(model.numIterations === numIterations) 30 | 31 | model.trainModel(trans) 32 | 33 | model.exportModel(modelFile) 34 | val readModel = new CommunityModel(sc) 35 | readModel.importModel(modelFile) 36 | 37 | assert(model.lambda === readModel.lambda) 38 | // TODO: how to test model training? 39 | } 40 | 41 | after { 42 | /** Not need to delete temporal files manually */ 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /w3/w3demo/demo3/recommend-dish/datalyze/src/test/scala/com/amli/w3/recommend/model/ItemSimilaritySuite.scala: -------------------------------------------------------------------------------- 1 | package com.amli.w3.recommend.model 2 | 3 | import java.io.PrintWriter 4 | import org.scalatest.{BeforeAndAfter, FunSuite, Matchers} 5 | import com.amli.w3.recommend.io.DataLoader._ 6 | import com.amli.w3.recommend.model.Utils._ 7 | 8 | class ItemSimilaritySuite extends FunSuite with Matchers with BeforeAndAfter { 9 | 10 | val config = Config(transFile = "/tmp/transaction.txt", modelFile = "/tmp/model.txt") 11 | 12 | before { 13 | val out = new PrintWriter(config.transFile) 14 | for (i <- 1 to 100) { 15 | out.println(s"${i % 20} ${i % 80}") 16 | } 17 | out.close() 18 | } 19 | 20 | test("test train model") { 21 | val model = new ItemSimilarity() 22 | val trans = getTransDataFromFile(config.transFile) 23 | model.configModel(Map(("K", config.k.toString))) 24 | assert(model.k === config.k) 25 | 26 | model.trainModel(trans) 27 | 28 | model.exportModel(config.modelFile) 29 | val readModel = new ItemSimilarity() 30 | readModel.importModel(config.modelFile) 31 | 32 | assert(model.k === readModel.k) 33 | model.similarities should equal(readModel.similarities) 34 | model.popularityModel.popularities should equal (readModel.popularityModel.popularities) 35 | model.popularityModel.orderHistory should equal (readModel.popularityModel.orderHistory) 36 | } 37 | 38 | after { 39 | /** Not need to delete temporal files manually */ 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /w3/w3demo/demo3/recommend-dish/datalyze/src/test/scala/com/amli/w3/recommend/model/PregelUnfoldingSuite.scala: -------------------------------------------------------------------------------- 1 | package com.amli.w3.recommend.model 2 | 3 | import java.io.PrintWriter 4 | import org.scalatest.{BeforeAndAfter, Matchers, FunSuite} 5 | 6 | import org.apache.spark.graphx.Graph 7 | 8 | import 
com.amli.w3.recommend.scLocal._ 9 | 10 | class PregelUnfoldingSuite extends FunSuite with Matchers with BeforeAndAfter { 11 | 12 | val transFile = "/tmp/transaction.txt" 13 | 14 | before { 15 | val out = new PrintWriter(transFile) 16 | for (i <- 1 to 100) { 17 | out.println(s"${i % 20} ${i % 80}") 18 | } 19 | out.close() 20 | } 21 | 22 | test("test pregel unfolding") { 23 | val rawEdges = sc.textFile(transFile, 2).map(s => 24 | s.split("\\s+").head.toLong -> s.split("\\s+").last.toLong) 25 | val graph = Graph.fromEdgeTuples(rawEdges, -1) 26 | val puGraph = PregelUnfolding.run(graph, 5) 27 | 28 | // TODO: What should I do for test its result? 29 | for ((a, NodeAttr(b, c, d, e, f)) <- puGraph.vertices.collect()) { 30 | println(s"my id is $a") 31 | println(s"my neighbors are ${b.mkString(",")}") 32 | println(s"my community is ${c.mkString(",")}") 33 | println(s"my outer links count is $d") 34 | println(s"my inner links count is $e") 35 | println(s"largest mod gain is $f") 36 | println() 37 | } 38 | } 39 | 40 | after { /** do nothing*/ } 41 | } 42 | -------------------------------------------------------------------------------- /w3/w3demo/demo3/recommend-dish/engine.json: -------------------------------------------------------------------------------- 1 | { 2 | "id": "default", 3 | "description": "Default settings", 4 | "engineFactory": "com.amli.w3.recommend.RecommendationEngine", 5 | "datasource": { 6 | "params": [ 7 | { 8 | "sources": ["jdbc:mysql://localhost:3306/jts", "jdbc:mysql://localhost:3306/qyf"], 9 | "users": ["root", "root"], 10 | "passwords": ["root", "root"] 11 | }, 12 | { 13 | "sources": ["jdbc:mysql://localhost:3306/hbo"], 14 | "users": ["root"], 15 | "passwords": ["root"] 16 | } 17 | ] 18 | }, 19 | "algorithms": [ 20 | { 21 | "name": "itemtoitem", 22 | "params": { 23 | "params": [ 24 | { 25 | "k": 20, 26 | "modelFile": "./model/ItemToItem_jts_20141031" 27 | }, 28 | { 29 | "k": 10, 30 | "modelFile": "./model/ItemToItem_hbo_20141031" 31 | } 32 | ] 33 | } 34 | 35 | } 36 | ] 37 | } 38 | -------------------------------------------------------------------------------- /w3/w3demo/demo3/recommend-dish/make-distribution.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | FWDIR="$(cd `dirname $0`; pwd)" 6 | DISTDIR="$FWDIR/dist" 7 | mkdir -p $DISTDIR/model 8 | 9 | echo "Building binary distribution for Inspeed" 10 | 11 | cd $FWDIR 12 | 13 | pio build --sbt-extra assembly 14 | 15 | cp -r target $DISTDIR 16 | cp engine.json $DISTDIR 17 | 18 | echo "Inspeed binary distribution created at $DISTDIR" 19 | -------------------------------------------------------------------------------- /w3/w3demo/demo3/recommend-dish/project/build.scala: -------------------------------------------------------------------------------- 1 | import sbt._ 2 | import Keys._ 3 | import sbtassembly.Plugin._ 4 | 5 | object MyBuild extends Build { 6 | 7 | val versionOfScala = "2.10.4" 8 | 9 | lazy val demo = project.in(file(".")) 10 | .settings(name := "demo", version := "0.1", scalaVersion := versionOfScala) 11 | .settings(assemblySettings: _*) 12 | .aggregate(core, datalyze) 13 | .dependsOn(core, datalyze) 14 | 15 | val commonSharingLibs = Seq( 16 | "org.apache.spark" %% "spark-core" % "1.1.0" % "provided", 17 | "org.apache.spark" %% "spark-mllib" % "1.1.0" % "provided" 18 | ) 19 | 20 | lazy val core = project.in(file("core")).settings( 21 | scalaVersion := versionOfScala, 22 | libraryDependencies ++= Seq( 23 | "io.prediction" %% "core" 
% "0.8.1" % "provided" 24 | ) ++ commonSharingLibs 25 | ).dependsOn(datalyze) 26 | 27 | lazy val datalyze = project.in(file("datalyze")).settings( 28 | scalaVersion := versionOfScala, 29 | libraryDependencies ++= Seq( 30 | "org.scalatest" %% "scalatest" % "2.2.0" % "test", 31 | "org.apache.derby" % "derby" % "10.11.1.1" % "test", 32 | "org.apache.spark" % "spark-graphx_2.10" % "1.1.0" % "provided", 33 | "org.apache.spark" % "spark-sql_2.10" % "1.1.0" % "provided", 34 | "mysql" % "mysql-connector-java" % "5.1.28", 35 | "com.github.scopt" %% "scopt" % "3.2.0" 36 | ) ++ commonSharingLibs 37 | ) 38 | } 39 | 40 | -------------------------------------------------------------------------------- /w3/w3demo/demo3/recommend-dish/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.11.2") 2 | 3 | addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "2.2.0") 4 | 5 | addSbtPlugin("com.github.mpeltonen" % "sbt-idea" % "1.6.0") 6 | -------------------------------------------------------------------------------- /w3/w3demo/demo3/recommend-dish/requests.sh: -------------------------------------------------------------------------------- 1 | curl -H "Content-Type: application/json" -X POST -d '{"restaurantId": 1003, "users": [123], "personAmount": 1, "expectedConsumePerHead": 20, "mealType": 0, "itemsInCart": [7454, 7455], "clickedItems": [], "num": 5}' http://localhost:8000/queries.json 2 | --------------------------------------------------------------------------------
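The JSON body in requests.sh mirrors, field for field, the `Query` case class in core/src/main/scala/com/amli/w3/recommend/Engine.scala, and the response wraps `RecommendItem(id, score, reason)` values in a `PredictedResult`. For reference, the same query expressed directly against the Scala API, with the values copied from the curl command above:

    import com.amli.w3.recommend.Query

    val query = Query(
      restaurantId = 1003,
      users = Array(123),
      personAmount = 1,
      expectedConsumePerHead = 20,
      mealType = 0,
      itemsInCart = Array(7454, 7455),
      clickedItems = Array.empty[Int],
      num = 5)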