├── LICENSE
├── README.md
├── SUMMARY.md
├── _config.yml
├── 分类和回归
├── readme.md
├── 保序回归
│ ├── imgs
│ │ ├── 1.1.png
│ │ ├── 1.10.png
│ │ ├── 1.11.png
│ │ ├── 1.12.png
│ │ ├── 1.2.png
│ │ ├── 1.3.png
│ │ ├── 1.4.png
│ │ ├── 1.5.png
│ │ ├── 1.6.png
│ │ ├── 1.7.png
│ │ ├── 1.8.png
│ │ └── 1.9.png
│ └── isotonic-regression.md
├── 决策树
│ ├── decision-tree.md
│ └── imgs
│ │ ├── 1.1.png
│ │ ├── 1.2.png
│ │ ├── 1.3.png
│ │ └── 1.4.png
├── 朴素贝叶斯
│ ├── imgs
│ │ ├── 1.1.png
│ │ ├── 1.2.png
│ │ ├── 1.3.png
│ │ ├── 1.4.png
│ │ ├── 1.5.png
│ │ ├── 1.6.png
│ │ ├── 1.7.png
│ │ ├── 1.8.png
│ │ ├── 2.1.png
│ │ ├── 2.2.png
│ │ ├── 2.3.png
│ │ ├── 3.1.png
│ │ └── 3.2.png
│ └── nb.md
├── 线性模型
│ ├── imgs
│ │ ├── 1.1.png
│ │ ├── 1.2.png
│ │ ├── 1.3.png
│ │ ├── 1.4.png
│ │ ├── 1.5.png
│ │ ├── 1.6.png
│ │ ├── 1.7.png
│ │ ├── 1.8.png
│ │ └── introduce1.png
│ ├── readme.md
│ ├── 回归
│ │ ├── imgs
│ │ │ ├── 1.1.png
│ │ │ └── 1.2.png
│ │ └── regression.md
│ ├── 广义线性回归
│ │ └── glr.md
│ ├── 支持向量机
│ │ ├── imgs
│ │ │ └── 1.1.png
│ │ └── lsvm.md
│ └── 逻辑回归
│ │ ├── imgs
│ │ ├── 1.1.png
│ │ ├── 1.2.png
│ │ ├── 1.3.png
│ │ ├── 1.4.png
│ │ ├── 1.5.png
│ │ ├── 1.6.png
│ │ ├── 1.7.png
│ │ ├── 2.1.png
│ │ ├── 2.2.png
│ │ ├── 2.3.png
│ │ ├── 2.4.png
│ │ └── 2.5.png
│ │ └── logic-regression.md
└── 组合树
│ ├── readme.md
│ ├── 梯度提升树
│ ├── gbts.md
│ └── imgs
│ │ ├── 1.1.png
│ │ ├── 1.2.png
│ │ └── 1.3.png
│ └── 随机森林
│ ├── imgs
│ ├── 1.1.png
│ ├── 1.2.png
│ ├── 1.3.png
│ └── 1.4.png
│ └── random-forests.md
├── 基本统计
├── correlations.md
├── hypothesis-testing.md
├── imgs
│ ├── 1.1.png
│ ├── 1.2.png
│ ├── 1.3.png
│ ├── 1.4.png
│ ├── 1.5.png
│ ├── 1.6.png
│ ├── 2.1.png
│ ├── 2.2.png
│ ├── 2.3.png
│ ├── 5.1.png
│ └── 5.2.png
├── kernel-density-estimation.md
├── random-data-generation.md
├── summary-statistics.md
└── tratified-sampling.md
├── 推荐
├── ALS.md
├── imgs
│ ├── ALS.1.1.png
│ ├── ALS.2.1.png
│ ├── ALS.3.1.png
│ ├── ALS.3.2.png
│ ├── math.1.1.png
│ ├── math.2.1.png
│ ├── math.2.2.png
│ ├── math.2.3.png
│ └── math.2.4.png
└── papers
│ ├── Collaborative Filtering for Implicit Feedback Datasets.pdf
│ ├── Large-scale Parallel Collaborative Filtering the Netflix Prize.pdf
│ └── Matrix Factorization Techniques for Recommender Systems.pdf
├── 数据类型
├── data-type.md
└── imgs
│ └── 1.1.png
├── 最优化算法
├── IRLS.md
├── L-BFGS
│ ├── docs
│ │ ├── On the Limited Memory BFGS Method for Large Scale Optimization.pdf
│ │ └── Updating Quasi-Newton Matrices with Limited Storage.pdf
│ ├── imgs
│ │ ├── 1.1.png
│ │ ├── 1.2.png
│ │ ├── 1.3.png
│ │ ├── 1.4.png
│ │ ├── 1.5.png
│ │ ├── 2.1.png
│ │ ├── 2.10.png
│ │ ├── 2.11.png
│ │ ├── 2.12.png
│ │ ├── 2.13.png
│ │ ├── 2.14.png
│ │ ├── 2.15.png
│ │ ├── 2.16.png
│ │ ├── 2.17.png
│ │ ├── 2.18.png
│ │ ├── 2.19.png
│ │ ├── 2.2.png
│ │ ├── 2.20.png
│ │ ├── 2.21.png
│ │ ├── 2.22.png
│ │ ├── 2.23.jpeg
│ │ ├── 2.24.png
│ │ ├── 2.25.png
│ │ ├── 2.26.png
│ │ ├── 2.27.png
│ │ ├── 2.28.png
│ │ ├── 2.29.png
│ │ ├── 2.3.png
│ │ ├── 2.30.jpg
│ │ ├── 2.31.png
│ │ ├── 2.32.png
│ │ ├── 2.33.png
│ │ ├── 2.34.png
│ │ ├── 2.35.gif
│ │ ├── 2.36.gif
│ │ ├── 2.37.gif
│ │ ├── 2.38.gif
│ │ ├── 2.39.png
│ │ ├── 2.4.png
│ │ ├── 2.5.png
│ │ ├── 2.6.png
│ │ ├── 2.7.png
│ │ ├── 2.8.png
│ │ └── 2.9.png
│ └── lbfgs.md
├── WeightsLeastSquares.md
├── 梯度下降
│ ├── gradient-descent.md
│ └── imgs
│ │ ├── 1.1.png
│ │ ├── 1.2.png
│ │ ├── 1.3.png
│ │ ├── 1.4.png
│ │ └── 1.5.png
└── 非负最小二乘
│ ├── NNLS.md
│ ├── imgs
│ ├── NNLS.2.1.png
│ ├── math.1.1.png
│ ├── math.1.10.append1.png
│ ├── math.1.10.png
│ ├── math.1.11.png
│ ├── math.1.12.png
│ ├── math.1.13.png
│ ├── math.1.14.png
│ ├── math.1.15.png
│ ├── math.1.16.png
│ ├── math.1.17.png
│ ├── math.1.2.png
│ ├── math.1.3.1.png
│ ├── math.1.3.png
│ ├── math.1.4.png
│ ├── math.1.5.png
│ ├── math.1.6.png
│ ├── math.1.7.png
│ ├── math.1.8.png
│ ├── math.1.9.png
│ ├── math.2.1.png
│ ├── math.2.10.png
│ ├── math.2.11.a1.png
│ ├── math.2.11.png
│ ├── math.2.12.png
│ ├── math.2.2.a1.png
│ ├── math.2.2.a2.png
│ ├── math.2.2.png
│ ├── math.2.3.png
│ ├── math.2.4.png
│ ├── math.2.5.a1.png
│ ├── math.2.5.a2.png
│ ├── math.2.5.png
│ ├── math.2.6.png
│ ├── math.2.7.png
│ ├── math.2.8.png
│ ├── math.2.9.png
│ ├── math.3.1.png
│ └── math.3.2.png
│ └── papers
│ └── The conjugate gradient method in extreme problems.pdf
├── 特征抽取和转换
├── Binarizer.md
├── Bucketizer.md
├── CountVectorizer.md
├── DCT.md
├── IndexToString.md
├── MaxAbsScaler.md
├── MinMaxScaler.md
├── OneHotEncoder.md
├── PolynomialExpansion.md
├── QuantileDiscretizer.md
├── RFormula.md
├── SQLTransformer.md
├── StandardScaler.md
├── StopWordsRemover.md
├── StringIndexer.md
├── TF-IDF.md
├── Tokenizer.md
├── VectorAssembler.md
├── VectorIndexer.md
├── VectorSlicer.md
├── Word2Vector.md
├── chi-square-selector.md
├── docs
│ └── word2vec.pdf
├── element-wise-product.md
├── imgs
│ ├── 1.1.png
│ ├── 1.2.png
│ ├── 2.1.png
│ ├── 2.2.png
│ ├── 3.1.png
│ ├── 4.1.png
│ ├── 5.1.png
│ └── 6.1.png
├── n_gram.md
└── normalizer.md
├── 聚类
├── LDA
│ ├── docs
│ │ ├── Latent Dirichlet Allocation.pdf
│ │ ├── On Smoothing and Inference for Topic Models.pdf
│ │ ├── Online Learning for Latent Dirichlet Allocation.pdf
│ │ └── dirichlet.pdf
│ ├── imgs
│ │ ├── 1.1.1.png
│ │ ├── 1.1.2.png
│ │ ├── 1.1.3.png
│ │ ├── 1.2.1.png
│ │ ├── 1.2.2.png
│ │ ├── 1.3.1.png
│ │ ├── 1.3.2.png
│ │ ├── 1.4.1.png
│ │ ├── 1.4.2.png
│ │ ├── 1.5.1.png
│ │ ├── 1.5.2.png
│ │ ├── 1.5.3.png
│ │ ├── 1.5.4.png
│ │ ├── 1.5.5.png
│ │ ├── 1.5.6.png
│ │ ├── 1.5.7.png
│ │ ├── 1.6.1.png
│ │ ├── 1.6.2.png
│ │ ├── 1.6.3.png
│ │ ├── 1.6.4.png
│ │ ├── 1.6.5.png
│ │ ├── 1.6.6.png
│ │ ├── 1.6.7.png
│ │ ├── 1.6.8.png
│ │ ├── 1.7.1.png
│ │ ├── 1.7.2.png
│ │ ├── 1.7.3.png
│ │ ├── 1.7.4.png
│ │ ├── 2.1.1.png
│ │ ├── 2.1.2.png
│ │ ├── 2.2.1.png
│ │ ├── 2.2.2.png
│ │ ├── 2.3.1.png
│ │ ├── 2.3.2.png
│ │ ├── 2.3.3.png
│ │ ├── 2.3.4.png
│ │ ├── 2.3.5.png
│ │ ├── 3.1.1.png
│ │ ├── 3.1.2.png
│ │ ├── 3.1.3.png
│ │ ├── 3.1.4.png
│ │ ├── 3.1.5.png
│ │ ├── 3.1.6.png
│ │ ├── 3.1.7.png
│ │ ├── 3.2.1.png
│ │ ├── 3.2.2.png
│ │ ├── 3.2.3.png
│ │ ├── 3.2.4.png
│ │ ├── 3.2.5.png
│ │ ├── 3.2.6.png
│ │ ├── 3.2.7.png
│ │ ├── 3.2.8.png
│ │ ├── 3.2.9.png
│ │ ├── 3.3.1.png
│ │ ├── 3.3.2.png
│ │ ├── 3.3.3.png
│ │ ├── 3.3.4.png
│ │ ├── 3.3.5.png
│ │ ├── 3.3.6.png
│ │ ├── 3.3.7.png
│ │ ├── LDA.png
│ │ ├── alg1.png
│ │ ├── alg2.png
│ │ ├── docs.png
│ │ ├── question1.png
│ │ ├── question2.png
│ │ ├── question3.png
│ │ ├── question4.png
│ │ └── topic_words.png
│ └── lda.md
├── PIC
│ ├── imgs
│ │ ├── PIC.1.1.png
│ │ ├── PIC.1.2.png
│ │ ├── PIC.1.3.png
│ │ ├── PIC.1.4.png
│ │ ├── PIC.1.5.png
│ │ └── PIC.1.6.png
│ ├── papers
│ │ └── Power Iteration Clustering.pdf
│ └── pic.md
├── bis-k-means
│ ├── bisecting-k-means.md
│ ├── imgs
│ │ ├── dis-k-means.1.1.png
│ │ └── dis-k-means.1.2.png
│ └── papers
│ │ └── A Comparison of Document Clustering Techniques.pdf
├── gaussian-mixture
│ ├── gaussian-mixture.md
│ └── imgs
│ │ ├── 1.1.png
│ │ ├── 1.2.png
│ │ ├── 1.3.png
│ │ ├── 1.4.png
│ │ ├── 1.5.png
│ │ ├── 1.6.png
│ │ ├── 1.7.png
│ │ ├── 1.8.png
│ │ └── 1.9.png
├── k-means
│ ├── imgs
│ │ ├── 1.1.png
│ │ ├── math.1.1.png
│ │ ├── math.1.2.png
│ │ └── math.1.3.png
│ ├── k-means.md
│ └── papers
│ │ ├── Scalable K-Means++.pdf
│ │ └── k-means++ The Advantages of Careful Seeding.pdf
├── readme.md
└── streaming-k-means
│ ├── imgs
│ ├── streaming-k-means.1.1.png
│ └── streaming-k-means.1.2.png
│ └── streaming-k-means.md
└── 降维
├── EVD
├── evd.md
└── imgs
│ ├── 1.1.png
│ └── 1.2.png
├── PCA
├── imgs
│ ├── 1.1.png
│ ├── 1.2.png
│ ├── 1.3.png
│ ├── 1.4.png
│ ├── 1.5.png
│ ├── 1.6.png
│ ├── 1.7.png
│ └── 1.8.png
└── pca.md
└── SVD
├── imgs
├── 1.10.png
├── 1.11.png
├── 1.3.png
├── 1.4.png
├── 1.5.png
├── 1.6.png
├── 1.7.png
├── 1.8.png
└── 1.9.png
└── svd.md
/README.md:
--------------------------------------------------------------------------------
1 | # Spark machine learning algorithms: study and source code analysis
2 | 
3 | This project explains the principles behind the algorithms in the `spark ml` package and analyzes their implementations in detail, with the aim of deepening the author's understanding of machine learning algorithms and of how they are implemented in a distributed setting.
4 | 
5 | ## Spark version covered by this series
6 | 
7 | - **spark 2.x**
8 | 
9 | ## Table of contents
10 | 
11 | The contents of the series are as follows:
12 | * [Data types](数据类型/data-type.md)
13 | * [Basic statistics](基本统计/summary-statistics.md)
14 |     * [Summary statistics](基本统计/summary-statistics.md)
15 |     * [Correlations](基本统计/correlations.md)
16 |     * [Stratified sampling](基本统计/tratified-sampling.md)
17 |     * [Hypothesis testing](基本统计/hypothesis-testing.md)
18 |     * [Random data generation](基本统计/random-data-generation.md)
19 |     * [Kernel density estimation](基本统计/kernel-density-estimation.md)
20 | * [Collaborative filtering](推荐/ALS.md)
21 |     * [Alternating least squares (ALS)](推荐/ALS.md)
22 | * [Classification and regression](分类和回归/readme.md)
23 |     * [Linear models](分类和回归/线性模型/readme.md)
24 |         * [SVMs (support vector machines)](分类和回归/线性模型/支持向量机/lsvm.md)
25 |         * [Logistic regression](分类和回归/线性模型/逻辑回归/logic-regression.md)
26 |         * [Linear regression](分类和回归/线性模型/回归/regression.md)
27 |         * [Generalized linear regression](分类和回归/线性模型/广义线性回归/glr.md)
28 |     * [Naive Bayes](分类和回归/朴素贝叶斯/nb.md)
29 |     * [Decision trees](分类和回归/决策树/decision-tree.md)
30 |     * [Tree ensembles](分类和回归/组合树/readme.md)
31 |         * [Random forests](分类和回归/组合树/随机森林/random-forests.md)
32 |         * [Gradient-boosted trees](分类和回归/组合树/梯度提升树/gbts.md)
33 |     * [Isotonic regression](分类和回归/保序回归/isotonic-regression.md)
34 | * [Clustering](聚类/readme.md)
35 |     * [The k-means|| algorithm](聚类/k-means/k-means.md)
36 |     * [GMM (Gaussian mixture model)](聚类/gaussian-mixture/gaussian-mixture.md)
37 |     * [PIC (power iteration clustering)](聚类/PIC/pic.md)
38 |     * [LDA (latent Dirichlet allocation)](聚类/LDA/lda.md)
39 |     * [Bisecting k-means](聚类/bis-k-means/bisecting-k-means.md)
40 |     * [Streaming k-means](聚类/streaming-k-means/streaming-k-means.md)
41 | * [Optimization algorithms](最优化算法/梯度下降/gradient-descent.md)
42 |     * [Gradient descent](最优化算法/梯度下降/gradient-descent.md)
43 |     * [Quasi-Newton methods (L-BFGS)](最优化算法/L-BFGS/lbfgs.md)
44 |     * [NNLS (non-negative least squares)](最优化算法/非负最小二乘/NNLS.md)
45 |     * [Weighted least squares](最优化算法/WeightsLeastSquares.md)
46 |     * [Iteratively reweighted least squares](最优化算法/IRLS.md)
47 | * [Dimensionality reduction](降维/SVD/svd.md)
48 |     * [EVD (eigenvalue decomposition)](降维/EVD/evd.md)
49 |     * [SVD (singular value decomposition)](降维/SVD/svd.md)
50 |     * [PCA (principal component analysis)](降维/PCA/pca.md)
51 | * [Feature extraction and transformation](特征抽取和转换/TF-IDF.md)
52 |     * [Feature extraction](特征抽取和转换/TF-IDF.md)
53 | * [TF-IDF](特征抽取和转换/TF-IDF.md)
54 | * [Word2Vec](特征抽取和转换/Word2Vector.md)
55 | * [CountVectorizer](特征抽取和转换/CountVectorizer.md)
56 |     * [Feature transformation](特征抽取和转换/normalizer.md)
57 | * [Tokenizer](特征抽取和转换/Tokenizer.md)
58 | * [StopWordsRemover](特征抽取和转换/StopWordsRemover.md)
59 | * [n-gram](特征抽取和转换/n_gram.md)
60 | * [Binarizer](特征抽取和转换/Binarizer.md)
61 | * [PolynomialExpansion](特征抽取和转换/PolynomialExpansion.md)
62 | * [Discrete Cosine Transform (DCT)](特征抽取和转换/DCT.md)
63 | * [StringIndexer](特征抽取和转换/StringIndexer.md)
64 | * [IndexToString](特征抽取和转换/IndexToString.md)
65 | * [OneHotEncoder](特征抽取和转换/OneHotEncoder.md)
66 | * [VectorIndexer](特征抽取和转换/VectorIndexer.md)
67 |         * [Normalizer (normalization)](特征抽取和转换/normalizer.md)
68 |         * [StandardScaler (standardization)](特征抽取和转换/StandardScaler.md)
69 | * [MinMaxScaler](特征抽取和转换/MinMaxScaler.md)
70 | * [MaxAbsScaler](特征抽取和转换/MaxAbsScaler.md)
71 | * [Bucketizer](特征抽取和转换/Bucketizer.md)
72 |         * [ElementwiseProduct (element-wise product)](特征抽取和转换/element-wise-product.md)
73 | * [SQLTransformer](特征抽取和转换/SQLTransformer.md)
74 | * [VectorAssembler](特征抽取和转换/VectorAssembler.md)
75 | * [QuantileDiscretizer](特征抽取和转换/QuantileDiscretizer.md)
76 |     * [Feature selection](特征抽取和转换/VectorSlicer.md)
77 | * [VectorSlicer](特征抽取和转换/VectorSlicer.md)
78 | * [RFormula](特征抽取和转换/RFormula.md)
79 |         * [ChiSqSelector (chi-squared selector)](特征抽取和转换/chi-square-selector.md)
80 |
81 |
82 | ## Notes
83 | 
84 | Most of the material in this series is drawn from the [Spark source code](https://github.com/apache/spark) and the [official Spark documentation](https://spark.apache.org/docs/latest) and is not intended for commercial use. Please credit this project when republishing it.
85 | Wherever content from others is quoted, it is listed in the references of the corresponding article; if anything infringes your rights, please notify the author by email at `endymecy@sina.cn`.
86 | 
87 | Some articles in this series use LaTeX for mathematical formulas; install a `MathJax` browser plugin to render them.
88 | 
89 | My own understanding is limited, so the analysis inevitably contains mistakes and misunderstandings; corrections are very much appreciated. Questions can be discussed at [codewake](https://www.codewake.com/p/spark-ml-source-analysis).
90 | 
91 | ## License
92 | 
93 | This work is licensed under the terms in [LICENSE](LICENSE).
94 |
--------------------------------------------------------------------------------
/SUMMARY.md:
--------------------------------------------------------------------------------
1 | * [Data types](数据类型/data-type.md)
2 | * [Basic statistics](基本统计/summary-statistics.md)
3 |     * [Summary statistics](基本统计/summary-statistics.md)
4 |     * [Correlations](基本统计/correlations.md)
5 |     * [Stratified sampling](基本统计/tratified-sampling.md)
6 |     * [Hypothesis testing](基本统计/hypothesis-testing.md)
7 |     * [Random data generation](基本统计/random-data-generation.md)
8 |     * [Kernel density estimation](基本统计/kernel-density-estimation.md)
9 | * [Collaborative filtering](推荐/ALS.md)
10 |     * [Alternating least squares (ALS)](推荐/ALS.md)
11 | * [Classification and regression](分类和回归/readme.md)
12 |     * [Linear models](分类和回归/线性模型/readme.md)
13 |         * [SVMs (support vector machines)](分类和回归/线性模型/支持向量机/lsvm.md)
14 |         * [Logistic regression](分类和回归/线性模型/逻辑回归/logic-regression.md)
15 |         * [Linear regression](分类和回归/线性模型/回归/regression.md)
16 |         * [Generalized linear regression](分类和回归/线性模型/广义线性回归/glr.md)
17 |     * [Naive Bayes](分类和回归/朴素贝叶斯/nb.md)
18 |     * [Decision trees](分类和回归/决策树/decision-tree.md)
19 |     * [Tree ensembles](分类和回归/组合树/readme.md)
20 |         * [Random forests](分类和回归/组合树/随机森林/random-forests.md)
21 |         * [Gradient-boosted trees](分类和回归/组合树/梯度提升树/gbts.md)
22 |     * [Isotonic regression](分类和回归/保序回归/isotonic-regression.md)
23 | * [Clustering](聚类/readme.md)
24 |     * [The k-means algorithm](聚类/k-means/k-means.md)
25 |     * [GMM (Gaussian mixture model)](聚类/gaussian-mixture/gaussian-mixture.md)
26 |     * [PIC (power iteration clustering)](聚类/PIC/pic.md)
27 |     * [LDA (latent Dirichlet allocation)](聚类/LDA/lda.md)
28 |     * [Bisecting k-means](聚类/bis-k-means/bisecting-k-means.md)
29 |     * [Streaming k-means](聚类/streaming-k-means/streaming-k-means.md)
30 | * [Optimization algorithms](最优化算法/梯度下降/gradient-descent.md)
31 |     * [Gradient descent](最优化算法/梯度下降/gradient-descent.md)
32 |     * [Quasi-Newton methods (L-BFGS)](最优化算法/L-BFGS/lbfgs.md)
33 |     * [NNLS (non-negative least squares)](最优化算法/非负最小二乘/NNLS.md)
34 |     * [Weighted least squares](最优化算法/WeightsLeastSquares.md)
35 |     * [Iteratively reweighted least squares](最优化算法/IRLS.md)
36 | * [Dimensionality reduction](降维/SVD/svd.md)
37 |     * [EVD (eigenvalue decomposition)](降维/EVD/evd.md)
38 |     * [SVD (singular value decomposition)](降维/SVD/svd.md)
39 |     * [PCA (principal component analysis)](降维/PCA/pca.md)
40 | * [Feature extraction and transformation](特征抽取和转换/TF-IDF.md)
41 |     * [Feature extraction](特征抽取和转换/TF-IDF.md)
42 | * [TF-IDF](特征抽取和转换/TF-IDF.md)
43 | * [Word2Vec](特征抽取和转换/Word2Vector.md)
44 | * [CountVectorizer](特征抽取和转换/CountVectorizer.md)
45 |     * [Feature transformation](特征抽取和转换/normalizer.md)
46 | * [Tokenizer](特征抽取和转换/Tokenizer.md)
47 | * [StopWordsRemover](特征抽取和转换/StopWordsRemover.md)
48 | * [n-gram](特征抽取和转换/n_gram.md)
49 | * [Binarizer](特征抽取和转换/Binarizer.md)
50 | * [PolynomialExpansion](特征抽取和转换/PolynomialExpansion.md)
51 | * [Discrete Cosine Transform (DCT)](特征抽取和转换/DCT.md)
52 | * [StringIndexer](特征抽取和转换/StringIndexer.md)
53 | * [IndexToString](特征抽取和转换/IndexToString.md)
54 | * [OneHotEncoder](特征抽取和转换/OneHotEncoder.md)
55 | * [VectorIndexer](特征抽取和转换/VectorIndexer.md)
56 |         * [Normalizer (normalization)](特征抽取和转换/normalizer.md)
57 |         * [StandardScaler (standardization)](特征抽取和转换/StandardScaler.md)
58 | * [MinMaxScaler](特征抽取和转换/MinMaxScaler.md)
59 | * [MaxAbsScaler](特征抽取和转换/MaxAbsScaler.md)
60 | * [Bucketizer](特征抽取和转换/Bucketizer.md)
61 |         * [ElementwiseProduct (element-wise product)](特征抽取和转换/element-wise-product.md)
62 | * [SQLTransformer](特征抽取和转换/SQLTransformer.md)
63 | * [VectorAssembler](特征抽取和转换/VectorAssembler.md)
64 | * [QuantileDiscretizer](特征抽取和转换/QuantileDiscretizer.md)
65 |     * [Feature selection](特征抽取和转换/VectorSlicer.md)
66 | * [VectorSlicer](特征抽取和转换/VectorSlicer.md)
67 | * [RFormula](特征抽取和转换/RFormula.md)
68 |         * [ChiSqSelector (chi-squared selector)](特征抽取和转换/chi-square-selector.md)
--------------------------------------------------------------------------------
/_config.yml:
--------------------------------------------------------------------------------
1 | theme: jekyll-theme-slate
--------------------------------------------------------------------------------
/分类和回归/readme.md:
--------------------------------------------------------------------------------
1 | # Classification and regression
2 | 
3 | `spark.mllib` provides a variety of methods for [binary classification](http://en.wikipedia.org/wiki/Binary_classification), [multiclass classification](http://en.wikipedia.org/wiki/Multiclass_classification), and [regression analysis](http://en.wikipedia.org/wiki/Regression_analysis).
4 | The table below lists the algorithms supported for each problem type.
5 | 
6 | | Problem type | Supported methods |
7 | | ------------- |:-------------:|
8 | | Binary classification | linear SVMs, logistic regression, decision trees, random forests, gradient-boosted trees, naive Bayes |
9 | | Multiclass classification | logistic regression, decision trees, random forests, naive Bayes |
10 | | Regression | linear least squares, decision trees, random forests, gradient-boosted trees, isotonic regression |
11 | 
12 | Follow the links below for the details of each algorithm's implementation.
13 | 
14 | * Classification and regression
15 |     * [Linear models](线性模型/readme.md)
16 |         * [SVMs (support vector machines)](线性模型/支持向量机/lsvm.md)
17 |         * [Logistic regression](线性模型/逻辑回归/logic-regression.md)
18 |         * [Linear regression](线性模型/回归/regression.md)
19 |     * [Naive Bayes](朴素贝叶斯/nb.md)
20 |     * [Decision trees](决策树/decision-tree.md)
21 |     * [Tree ensembles](组合树/readme.md)
22 |         * [Random forests](组合树/随机森林/random-forests.md)
23 |         * [Gradient-boosted trees](组合树/梯度提升树/gbts.md)
24 |     * [Isotonic regression](保序回归/isotonic-regression.md)
--------------------------------------------------------------------------------
/分类和回归/保序回归/imgs/1.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/分类和回归/保序回归/imgs/1.1.png
--------------------------------------------------------------------------------
/分类和回归/保序回归/imgs/1.10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/分类和回归/保序回归/imgs/1.10.png
--------------------------------------------------------------------------------
/分类和回归/保序回归/imgs/1.11.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/分类和回归/保序回归/imgs/1.11.png
--------------------------------------------------------------------------------
/分类和回归/保序回归/imgs/1.12.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/分类和回归/保序回归/imgs/1.12.png
--------------------------------------------------------------------------------
/分类和回归/保序回归/imgs/1.2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/分类和回归/保序回归/imgs/1.2.png
--------------------------------------------------------------------------------
/分类和回归/保序回归/imgs/1.3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/分类和回归/保序回归/imgs/1.3.png
--------------------------------------------------------------------------------
/分类和回归/保序回归/imgs/1.4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/分类和回归/保序回归/imgs/1.4.png
--------------------------------------------------------------------------------
/分类和回归/保序回归/imgs/1.5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/分类和回归/保序回归/imgs/1.5.png
--------------------------------------------------------------------------------
/分类和回归/保序回归/imgs/1.6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/分类和回归/保序回归/imgs/1.6.png
--------------------------------------------------------------------------------
/分类和回归/保序回归/imgs/1.7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/分类和回归/保序回归/imgs/1.7.png
--------------------------------------------------------------------------------
/分类和回归/保序回归/imgs/1.8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/分类和回归/保序回归/imgs/1.8.png
--------------------------------------------------------------------------------
/分类和回归/保序回归/imgs/1.9.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/分类和回归/保序回归/imgs/1.9.png
--------------------------------------------------------------------------------
/分类和回归/决策树/imgs/1.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/分类和回归/决策树/imgs/1.1.png
--------------------------------------------------------------------------------
/分类和回归/决策树/imgs/1.2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/分类和回归/决策树/imgs/1.2.png
--------------------------------------------------------------------------------
/分类和回归/决策树/imgs/1.3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/分类和回归/决策树/imgs/1.3.png
--------------------------------------------------------------------------------
/分类和回归/决策树/imgs/1.4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/分类和回归/决策树/imgs/1.4.png
--------------------------------------------------------------------------------
/分类和回归/朴素贝叶斯/imgs/1.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/分类和回归/朴素贝叶斯/imgs/1.1.png
--------------------------------------------------------------------------------
/分类和回归/朴素贝叶斯/imgs/1.2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/分类和回归/朴素贝叶斯/imgs/1.2.png
--------------------------------------------------------------------------------
/分类和回归/朴素贝叶斯/imgs/1.3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/分类和回归/朴素贝叶斯/imgs/1.3.png
--------------------------------------------------------------------------------
/分类和回归/朴素贝叶斯/imgs/1.4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/分类和回归/朴素贝叶斯/imgs/1.4.png
--------------------------------------------------------------------------------
/分类和回归/朴素贝叶斯/imgs/1.5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/分类和回归/朴素贝叶斯/imgs/1.5.png
--------------------------------------------------------------------------------
/分类和回归/朴素贝叶斯/imgs/1.6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/分类和回归/朴素贝叶斯/imgs/1.6.png
--------------------------------------------------------------------------------
/分类和回归/朴素贝叶斯/imgs/1.7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/分类和回归/朴素贝叶斯/imgs/1.7.png
--------------------------------------------------------------------------------
/分类和回归/朴素贝叶斯/imgs/1.8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/分类和回归/朴素贝叶斯/imgs/1.8.png
--------------------------------------------------------------------------------
/分类和回归/朴素贝叶斯/imgs/2.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/分类和回归/朴素贝叶斯/imgs/2.1.png
--------------------------------------------------------------------------------
/分类和回归/朴素贝叶斯/imgs/2.2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/分类和回归/朴素贝叶斯/imgs/2.2.png
--------------------------------------------------------------------------------
/分类和回归/朴素贝叶斯/imgs/2.3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/分类和回归/朴素贝叶斯/imgs/2.3.png
--------------------------------------------------------------------------------
/分类和回归/朴素贝叶斯/imgs/3.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/分类和回归/朴素贝叶斯/imgs/3.1.png
--------------------------------------------------------------------------------
/分类和回归/朴素贝叶斯/imgs/3.2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/分类和回归/朴素贝叶斯/imgs/3.2.png
--------------------------------------------------------------------------------
/分类和回归/线性模型/imgs/1.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/分类和回归/线性模型/imgs/1.1.png
--------------------------------------------------------------------------------
/分类和回归/线性模型/imgs/1.2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/分类和回归/线性模型/imgs/1.2.png
--------------------------------------------------------------------------------
/分类和回归/线性模型/imgs/1.3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/分类和回归/线性模型/imgs/1.3.png
--------------------------------------------------------------------------------
/分类和回归/线性模型/imgs/1.4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/分类和回归/线性模型/imgs/1.4.png
--------------------------------------------------------------------------------
/分类和回归/线性模型/imgs/1.5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/分类和回归/线性模型/imgs/1.5.png
--------------------------------------------------------------------------------
/分类和回归/线性模型/imgs/1.6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/分类和回归/线性模型/imgs/1.6.png
--------------------------------------------------------------------------------
/分类和回归/线性模型/imgs/1.7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/分类和回归/线性模型/imgs/1.7.png
--------------------------------------------------------------------------------
/分类和回归/线性模型/imgs/1.8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/分类和回归/线性模型/imgs/1.8.png
--------------------------------------------------------------------------------
/分类和回归/线性模型/imgs/introduce1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/分类和回归/线性模型/imgs/introduce1.png
--------------------------------------------------------------------------------
/分类和回归/线性模型/readme.md:
--------------------------------------------------------------------------------
1 | # Linear models
2 | 
3 | # 1 Mathematical formulation
4 | 
5 | Many standard machine learning algorithms can be cast as convex optimization problems, i.e., the task of finding a minimizer of a convex function `f` that depends on a variable vector `w` (usually called `weights` in the Spark source code).
6 | Formally, we can write this as the convex optimization problem ${min}_{w}f(w)$, whose objective function can be expressed as formula **(1)** below:
7 |
8 | 
9 |
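Since formula (1) is provided only as an image, here is a LaTeX transcription of the objective, following the corresponding formula in the Spark MLlib documentation (an assumption about what the image shows):

$$f(w) := \lambda\, R(w) + \frac{1}{n}\sum_{i=1}^{n} L(w; x_i, y_i) \qquad (1)$$

where the regularization parameter $\lambda \geq 0$ is the `gamma` (`regParam`) referred to below.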
10 | In the formula above, the vectors `x` are the training examples and `y` are their corresponding labels, which we want to predict. We call a method linear if `L(w;x,y)` can be expressed as a function of ${w}^{T}x$ and `y`.
11 | Several of the classification and regression algorithms in `spark.mllib` fall into this category.
12 | 
13 | The objective function `f` has two parts: the regularizer, which controls the complexity of the model, and the loss, which measures the error of the model on the training data. The loss function `L(w;.)` is typically a convex function in `w`. The fixed regularization parameter `gamma` defines the trade-off between the two goals:
14 | minimizing the loss (i.e., the training error) and minimizing the model complexity (i.e., avoiding overfitting).
15 |
16 | ## 1.1 Loss functions
17 | 
18 | The loss functions supported by `spark.mllib`, together with their gradients or sub-gradients, are listed below; a LaTeX transcription of all three follows the list.
19 |
20 | - **hinge loss**
21 |
22 | The `hinge` loss `L(w;x,y)` and its (sub-)gradient are:
23 |
24 | 
25 | 
26 |
27 | - **logistic loss**
28 |
29 | The `logistic` loss `L(w;x,y)` and its gradient are:
30 |
31 | 
32 | 
33 |
34 | - **squared loss**
35 |
36 | The `squared` loss `L(w;x,y)` and its gradient are:
37 |
38 | 
39 | 
40 |
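Since the losses and gradients above are given only as images, the following LaTeX transcription (based on the corresponding formulas in the Spark MLlib documentation, with labels $y \in \{-1,+1\}$) is provided for reference:

$$\text{hinge:}\quad L(w;x,y)=\max\{0,\,1-y\,w^{T}x\},\qquad \frac{\partial L}{\partial w}=\begin{cases}-y\,x & \text{if } y\,w^{T}x<1\\ 0 & \text{otherwise}\end{cases}$$

$$\text{logistic:}\quad L(w;x,y)=\log\left(1+e^{-y\,w^{T}x}\right),\qquad \frac{\partial L}{\partial w}=-y\left(1-\frac{1}{1+e^{-y\,w^{T}x}}\right)x$$

$$\text{squared:}\quad L(w;x,y)=\frac{1}{2}\left(w^{T}x-y\right)^{2},\qquad \frac{\partial L}{\partial w}=\left(w^{T}x-y\right)x$$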
41 | ## 1.2 Regularization
42 | 
43 | The purpose of regularization is to encourage simple models and avoid overfitting. `spark.mllib` supports the following regularizers.
44 |
45 | | Problem | Regularizer R(w) | Gradient |
46 | | ------------- |:-------------:|:-------------:|
47 | | Zero (unregularized) | 0 | 0 |
48 | | L2 | formula (1) below | w |
49 | | L1 | formula (2) below | sign(w) |
50 | | elastic net | alpha * L1 + (1-alpha) * L2 | alpha * sign(w) + (1-alpha) * w |
51 |
52 | 
53 | 
54 |
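For reference, formulas (1) and (2) referenced in the table are, in LaTeX (again following the Spark MLlib documentation, as a transcription of the images above):

$$\text{(1)}\quad R(w)=\frac{1}{2}\|w\|_{2}^{2} \qquad\qquad \text{(2)}\quad R(w)=\|w\|_{1}$$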
55 | In the table above, `sign(w)` is the vector consisting of the signs `(+1, -1)` of all the entries of `w`. `L2`-regularized problems are generally easier to solve than `L1`-regularized ones because the `L2` penalty is smooth. However, `L1` regularization can make the weight vector sparse,
56 | leading to smaller and more interpretable models, which is useful for feature selection.
57 |
58 | # 2 Classification
59 | 
60 | Classification aims to assign data items to different categories. The most common case is binary classification with two categories, usually called positive and negative; with more than two categories it is called multiclass classification. `spark.ml` provides two linear methods for classification: linear support vector machines and logistic regression.
61 | Linear SVMs support only binary classification, while logistic regression supports both binary and multiclass classification. For both methods, `spark.ml` supports `L1` and `L2` regularization. The individual classification algorithms are described at the links below.
62 |
63 | * [SVMs (support vector machines)](支持向量机/lsvm.md)
64 | * [Logistic regression](逻辑回归/logic-regression.md)
65 | * [Linear regression](回归/regression.md)
66 |
67 |
--------------------------------------------------------------------------------
/分类和回归/线性模型/回归/imgs/1.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/分类和回归/线性模型/回归/imgs/1.1.png
--------------------------------------------------------------------------------
/分类和回归/线性模型/回归/imgs/1.2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/分类和回归/线性模型/回归/imgs/1.2.png
--------------------------------------------------------------------------------
/分类和回归/线性模型/广义线性回归/glr.md:
--------------------------------------------------------------------------------
1 | # Generalized linear regression
2 | 
3 | ## 1 The ordinary linear model
4 | 
5 | The ordinary linear model can be written as:
6 |
7 | $$Y = \beta_0 + \beta_1 x_1 + \beta_2 x_2 + … + \beta_{p-1} x_{p-1} + \epsilon$$
8 |
9 | Here $\beta$ are unknown parameters and $\epsilon$ is the error term. The ordinary linear model rests on the following assumptions:
10 | 
11 | - The response variable $Y$ and the error term $\epsilon$ are both normally distributed, with $\epsilon \sim N(0,{{\sigma }^{2}})$ and $Y\sim N({{\beta }^{T}}x,{{\sigma }^{2}})$.
12 | - The predictors $x_i$ and the unknown parameters $\beta_i$ are non-random: the predictors $x_i$ are deterministic, measurable, and free of measurement error, and the parameters $\beta_i$ are unknown but fixed constants.
13 | - The output of the model is the random variable $Y$; the model is mainly concerned with the expectation $E[Y]$ of the response.
14 | - Link: under the three assumptions above, taking expectations on both sides of the equation gives
15 |
16 | $$E[Y] = \beta_0 + \beta_1 x_1 + \beta_2 x_2 + … + \beta_{p-1} x_{p-1}$$
17 |
18 | In the ordinary linear model, the mean of the response $E[Y]$ is connected to the linear combination of the predictors $\beta_0 + \beta_1 x_1 + \beta_2 x_2 + … + \beta_{p-1} x_{p-1}$ through the identity,
19 | that is, through the link function $f(x)=x$.
20 |
21 | ## 2 The generalized linear model
22 | 
23 | The generalized linear model extends the ordinary linear model by relaxing the assumptions above, yielding a regression model that is more widely applicable and more practical.
24 | It differs in two main respects:
25 | 
26 | - The distribution of the response $Y$ and the error term $\epsilon$ is generalized to the exponential dispersion family. In `spark ml`, generalized linear regression supports the Gaussian, Poisson, binomial, and gamma distributions.
27 | - Link: the link function used in a generalized linear model may, in principle, be arbitrary instead of being restricted to $f(x)=x$.
28 |
29 | The link function deserves special attention. It describes the relationship between the linear predictor $X\beta$ and the expected value of the distribution: $E[Y] = \mu = g^{-1}(X\beta)$, where $g$ is the link function and $\mu$ is the mean.
30 | Typically the Gaussian distribution is paired with the identity link, the Poisson distribution with the natural-log link, and so on. The table below lists the link functions provided by `spark ml` and the exponential-family distributions each can be used with; a worked example follows the table.
31 |
32 | | Link name | Link function | Mean function | Supported distributions |
33 | |------------|-------|-----------|-------------|
34 | | identity | $\mu = X\beta$ | $\mu = X\beta$ | Gaussian, Poisson, gamma |
35 | | inverse | $\mu^{-1} = X\beta$ | $\mu = (X\beta)^{-1}$ | Gaussian, gamma |
36 | | sqrt | $\mu^{1/2} = X\beta$ | $\mu = (X\beta)^{2}$ | Poisson |
37 | | log | $ln(\mu) = X\beta$ | $\mu = exp(X\beta)$ | Gaussian, Poisson, gamma |
38 | | logit | $ln(\frac{\mu }{1-\mu }) = X\beta$ | $\mu = \frac{exp(X\beta)}{1 + exp(X\beta)}$ | binomial |
39 | | cloglog | $ln(- ln(1-\mu)) = X\beta$ | $\mu = 1 - exp(- exp(X\beta))$ | binomial |
40 | | probit | inverse CDF of the standard Gaussian evaluated at $\mu$ | CDF of the standard Gaussian | binomial |
41 |
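As a concrete illustration of the table (an example added here, not taken from the Spark documentation): for a Poisson-distributed response with the log link,

$$g(\mu)=\ln(\mu)=X\beta \quad\Longrightarrow\quad E[Y]=\mu=g^{-1}(X\beta)=e^{X\beta},$$

so the predicted mean is always positive, matching the support of the Poisson distribution.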
42 | ## 3 Source code analysis
43 | 
44 | ### 3.1 Usage example
45 |
46 | ```scala
47 | import org.apache.spark.ml.regression.GeneralizedLinearRegression
48 |
49 | // Load training data
50 | val dataset = spark.read.format("libsvm")
51 | .load("data/mllib/sample_linear_regression_data.txt")
52 |
53 | val glr = new GeneralizedLinearRegression()
54 | .setFamily("gaussian")
55 | .setLink("identity")
56 | .setMaxIter(10)
57 | .setRegParam(0.3)
58 |
59 | // Fit the model
60 | val model = glr.fit(dataset)
61 |
62 | // Print the coefficients and intercept for generalized linear regression model
63 | println(s"Coefficients: ${model.coefficients}")
64 | println(s"Intercept: ${model.intercept}")
65 | ```
66 |
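The same estimator supports the other family/link pairs listed in the table in Chapter 2. Below is a minimal sketch for a Poisson family with the log link; the DataFrame `countData` and its column layout (a non-negative `label` column of counts plus a `features` vector column) are hypothetical and only stand in for suitable training data:

```scala
import org.apache.spark.ml.regression.GeneralizedLinearRegression

// Poisson regression with the canonical log link: ln(mu) = X * beta, so mu = exp(X * beta) > 0
val poissonGlr = new GeneralizedLinearRegression()
  .setFamily("poisson")
  .setLink("log")
  .setMaxIter(25)
  .setRegParam(0.0)

// countData: assumed DataFrame with "label" (non-negative counts) and "features" columns
val poissonModel = poissonGlr.fit(countData)
println(s"Coefficients: ${poissonModel.coefficients}, Intercept: ${poissonModel.intercept}")
```

Because this family/link pair is not Gaussian with the identity link, the model is trained through the IRLS path described in the next section.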
67 | ### 3.2 Training the model
68 | 
69 | Training a generalized linear regression model is fairly straightforward. When the family is Gaussian and the link is the identity, the problem reduces to ordinary linear regression and can be solved with weighted least squares.
70 |
71 | ```scala
72 | val model = if (familyObj == Gaussian && linkObj == Identity) {
73 | val optimizer = new WeightedLeastSquares($(fitIntercept), $(regParam), elasticNetParam = 0.0,
74 | standardizeFeatures = true, standardizeLabel = true)
75 | val wlsModel = optimizer.fit(instances)
76 | val model = copyValues(
77 | new GeneralizedLinearRegressionModel(uid, wlsModel.coefficients, wlsModel.intercept)
78 | .setParent(this))
79 | val trainingSummary = new GeneralizedLinearRegressionTrainingSummary(dataset, model,
80 | wlsModel.diagInvAtWA.toArray, 1, getSolver)
81 | model.setSummary(Some(trainingSummary))
82 | }
83 | ```
84 | In all other cases, the model is fit with iteratively reweighted least squares (IRLS).
85 |
86 | ```scala
87 | // Fit Generalized Linear Model by iteratively reweighted least squares (IRLS).
88 | val initialModel = familyAndLink.initialize(instances, $(fitIntercept), $(regParam))
89 | val optimizer = new IterativelyReweightedLeastSquares(initialModel,
90 | familyAndLink.reweightFunc, $(fitIntercept), $(regParam), $(maxIter), $(tol))
91 | val irlsModel = optimizer.fit(instances)
92 | val model = copyValues(
93 | new GeneralizedLinearRegressionModel(uid, irlsModel.coefficients, irlsModel.intercept)
94 | .setParent(this))
95 | val trainingSummary = new GeneralizedLinearRegressionTrainingSummary(dataset, model,
96 | irlsModel.diagInvAtWA.toArray, irlsModel.numIterations, getSolver)
97 | model.setSummary(Some(trainingSummary))
98 | ```
99 | ```
100 | The analysis of iteratively reweighted least squares can be found in the optimization chapter: [Iteratively reweighted least squares](../../../最优化算法/IRLS.md).
100 |
101 | ### 3.3 Link functions
102 | 
103 | Given the link and mean functions described in the table in Chapter 2, the link function implementations are straightforward. The values of the link function and the mean function are used to update the samples,
104 | i.e., to recompute the corresponding working labels and weights during IRLS.
105 |
106 | - Identity
107 |
108 | ```scala
109 | private[regression] object Identity extends Link("identity") {
110 |   override def link(mu: Double): Double = mu // link function
111 |   override def deriv(mu: Double): Double = 1.0 // derivative of the link function
112 |   override def unlink(eta: Double): Double = eta // mean (inverse link) function
113 | }
114 | ```
115 | - Logit
116 |
117 | ```scala
118 | private[regression] object Logit extends Link("logit") {
119 |   override def link(mu: Double): Double = math.log(mu / (1.0 - mu)) // link function
120 |   override def deriv(mu: Double): Double = 1.0 / (mu * (1.0 - mu)) // derivative of the link function
121 |   override def unlink(eta: Double): Double = 1.0 / (1.0 + math.exp(-1.0 * eta)) // mean (inverse link) function
122 | }
123 | ```
124 |
125 | - Log
126 |
127 | ```scala
128 | private[regression] object Log extends Link("log") {
129 |   override def link(mu: Double): Double = math.log(mu) // link function
130 |   override def deriv(mu: Double): Double = 1.0 / mu // derivative of the link function
131 |   override def unlink(eta: Double): Double = math.exp(eta) // mean (inverse link) function
132 | }
133 | ```
134 |
135 | - Inverse
136 |
137 | ```scala
138 | private[regression] object Inverse extends Link("inverse") {
139 |   override def link(mu: Double): Double = 1.0 / mu // link function
140 |   override def deriv(mu: Double): Double = -1.0 * math.pow(mu, -2.0) // derivative of the link function
141 |   override def unlink(eta: Double): Double = 1.0 / eta // mean (inverse link) function
142 | }
143 | ```
144 |
145 | - Probit
146 |
147 | ```scala
148 | private[regression] object Probit extends Link("probit") {
149 |   override def link(mu: Double): Double = dist.Gaussian(0.0, 1.0).icdf(mu) // link function
150 |   override def deriv(mu: Double): Double = {
151 |     1.0 / dist.Gaussian(0.0, 1.0).pdf(dist.Gaussian(0.0, 1.0).icdf(mu)) // derivative of the link function
152 |   }
153 |   override def unlink(eta: Double): Double = dist.Gaussian(0.0, 1.0).cdf(eta) // mean (inverse link) function
154 | }
155 | ```
156 | - CLogLog
157 |
158 | ```scala
159 | private[regression] object CLogLog extends Link("cloglog") {
160 |   override def link(mu: Double): Double = math.log(-1.0 * math.log(1 - mu)) // link function
161 |   override def deriv(mu: Double): Double = 1.0 / ((mu - 1.0) * math.log(1.0 - mu)) // derivative of the link function
162 |   override def unlink(eta: Double): Double = 1.0 - math.exp(-1.0 * math.exp(eta)) // mean (inverse link) function
163 | }
164 | ```
165 | - Sqrt
166 |
167 | ```scala
168 | private[regression] object Sqrt extends Link("sqrt") {
169 |   override def link(mu: Double): Double = math.sqrt(mu) // link function
170 |   override def deriv(mu: Double): Double = 1.0 / (2.0 * math.sqrt(mu)) // derivative of the link function
171 |   override def unlink(eta: Double): Double = eta * eta // mean (inverse link) function
172 | }
173 | ```
174 |
175 | ## References
176 |
177 | 【1】[从线性模型到广义线性模型](http://cos.name/2011/01/how-does-glm-generalize-lm-assumption/)
178 |
179 | 【2】[广义线性模型-维基百科](https://zh.wikipedia.org/wiki/%E5%BB%A3%E7%BE%A9%E7%B7%9A%E6%80%A7%E6%A8%A1%E5%9E%8B)
--------------------------------------------------------------------------------
/分类和回归/线性模型/支持向量机/imgs/1.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/分类和回归/线性模型/支持向量机/imgs/1.1.png
--------------------------------------------------------------------------------
/分类和回归/线性模型/支持向量机/lsvm.md:
--------------------------------------------------------------------------------
1 | # Linear support vector machines
2 | 
3 | ## 1 Introduction
4 | 
5 | The linear support vector machine is a standard method for large-scale classification tasks. Its objective function is formula (1) in [Linear models](../readme.md), and its loss function is the hinge loss, shown below:
6 |
7 | 
8 |
9 | By default, linear SVMs are trained with `L2` regularization and produce an `SVM` model. Given a new data point `x`, the model makes its prediction from the value of `w^Tx`: if it is greater than 0 the output is positive, otherwise it is negative.
10 | 
11 | The linear SVM does not require a kernel function; for an in-depth treatment of support vector machines, see reference 【1】.
12 |
13 | ## 2 Source code analysis
14 | 
15 | ### 2.1 Example
16 |
17 | ```scala
18 | import org.apache.spark.mllib.classification.{SVMModel, SVMWithSGD}
19 | import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
20 | import org.apache.spark.mllib.util.MLUtils
21 | // Load training data in LIBSVM format.
22 | val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
23 | // Split data into training (60%) and test (40%).
24 | val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L)
25 | val training = splits(0).cache()
26 | val test = splits(1)
27 | // Run training algorithm to build the model
28 | val numIterations = 100
29 | val model = SVMWithSGD.train(training, numIterations)
30 | // Clear the default threshold.
31 | model.clearThreshold()
32 | // Compute raw scores on the test set.
33 | val scoreAndLabels = test.map { point =>
34 | val score = model.predict(point.features)
35 | (score, point.label)
36 | }
37 | // Get evaluation metrics.
38 | val metrics = new BinaryClassificationMetrics(scoreAndLabels)
39 | val auROC = metrics.areaUnderROC()
40 | println("Area under ROC = " + auROC)
41 | ```
42 |
43 | ### 2.2 Training
44 | 
45 | As with logistic regression, training runs through the `run` method of `GeneralizedLinearModel`; only the `Gradient` and `Updater` used differ. The linear SVM uses `HingeGradient` to compute the gradient and `SquaredL2Updater` to update the weights.
46 | The training procedure follows the same steps described in [Logistic regression](../逻辑回归/logic-regression.md); refer to that chapter for the details. Here we only need to understand the implementations of `HingeGradient` and `SquaredL2Updater`.
47 |
48 | ```scala
49 | class HingeGradient extends Gradient {
50 | override def compute(data: Vector, label: Double, weights: Vector): (Vector, Double) = {
51 | val dotProduct = dot(data, weights)
52 |     // With {0, 1} labels, our loss function is max(0, 1 - (2y - 1) (f_w(x)))
53 |     // so the (sub-)gradient is -(2y - 1)*x
54 | val labelScaled = 2 * label - 1.0
55 | if (1.0 > labelScaled * dotProduct) {
56 | val gradient = data.copy
57 | scal(-labelScaled, gradient)
58 | (gradient, 1.0 - labelScaled * dotProduct)
59 | } else {
60 | (Vectors.sparse(weights.size, Array.empty, Array.empty), 0.0)
61 | }
62 | }
63 |
64 | override def compute(
65 | data: Vector,
66 | label: Double,
67 | weights: Vector,
68 | cumGradient: Vector): Double = {
69 | val dotProduct = dot(data, weights)
70 |     // With {0, 1} labels, our loss function is max(0, 1 - (2y - 1) (f_w(x)))
71 |     // so the (sub-)gradient is -(2y - 1)*x
72 | val labelScaled = 2 * label - 1.0
73 | if (1.0 > labelScaled * dotProduct) {
74 | //cumGradient -= labelScaled * data
75 | axpy(-labelScaled, data, cumGradient)
76 |       // loss value
77 | 1.0 - labelScaled * dotProduct
78 | } else {
79 | 0.0
80 | }
81 | }
82 | }
83 | ```
84 |
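Restating the comments in `HingeGradient` as formulas (my own summary of the code above): the labels arrive as $y \in \{0,1\}$ and are rescaled to $\tilde{y}=2y-1 \in \{-1,+1\}$, so that

$$L(w;x,y)=\max\{0,\,1-\tilde{y}\,w^{T}x\},\qquad \frac{\partial L}{\partial w}=\begin{cases}-\tilde{y}\,x & \text{if } \tilde{y}\,w^{T}x<1\\ 0 & \text{otherwise,}\end{cases}$$

which is exactly the branch structure of the two `compute` methods.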
85 | Linear SVM training uses `L2` regularization.
86 |
87 | ```scala
88 | class SquaredL2Updater extends Updater {
89 | override def compute(
90 | weightsOld: Vector,
91 | gradient: Vector,
92 | stepSize: Double,
93 | iter: Int,
94 | regParam: Double): (Vector, Double) = {
95 | // w' = w - thisIterStepSize * (gradient + regParam * w)
96 | // w' = (1 - thisIterStepSize * regParam) * w - thisIterStepSize * gradient
97 |     // step size, i.e., how far to move in the negative gradient direction
98 | val thisIterStepSize = stepSize / math.sqrt(iter)
99 | val brzWeights: BV[Double] = weightsOld.toBreeze.toDenseVector
100 |     // regularization: scale every entry of brzWeights by (1.0 - thisIterStepSize * regParam)
101 | brzWeights :*= (1.0 - thisIterStepSize * regParam)
102 |     // y += x * a, i.e. brzWeights -= gradient * thisIterStepSize
103 | brzAxpy(-thisIterStepSize, gradient.toBreeze, brzWeights)
104 |     // L2 norm ||w||_2 used for the regularization term
105 | val norm = brzNorm(brzWeights, 2.0)
106 | (Vectors.fromBreeze(brzWeights), 0.5 * regParam * norm * norm)
107 | }
108 | }
109 | ```
110 | The update rule implemented by this function is:
111 |
112 | ```scala
113 | w' = w - thisIterStepSize * (gradient + regParam * w)
114 | w' = (1 - thisIterStepSize * regParam) * w - thisIterStepSize * gradient
115 | ```
116 | Here `thisIterStepSize` is the rate at which the weights move along the negative gradient direction; it shrinks as the number of iterations grows.
117 |
118 | ### 2.3 Prediction
119 |
120 | ```scala
121 | override protected def predictPoint(
122 | dataMatrix: Vector,
123 | weightMatrix: Vector,
124 | intercept: Double) = {
125 | //w^Tx
126 | val margin = weightMatrix.toBreeze.dot(dataMatrix.toBreeze) + intercept
127 | threshold match {
128 | case Some(t) => if (margin > t) 1.0 else 0.0
129 | case None => margin
130 | }
131 | }
132 | ```
133 |
134 | # References
135 |
136 | 【1】[支持向量机通俗导论(理解SVM的三层境界)](http://blog.csdn.net/macyang/article/details/38782399)
137 |
138 |
139 |
--------------------------------------------------------------------------------
/分类和回归/线性模型/逻辑回归/imgs/1.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/分类和回归/线性模型/逻辑回归/imgs/1.1.png
--------------------------------------------------------------------------------
/分类和回归/线性模型/逻辑回归/imgs/1.2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/分类和回归/线性模型/逻辑回归/imgs/1.2.png
--------------------------------------------------------------------------------
/分类和回归/线性模型/逻辑回归/imgs/1.3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/分类和回归/线性模型/逻辑回归/imgs/1.3.png
--------------------------------------------------------------------------------
/分类和回归/线性模型/逻辑回归/imgs/1.4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/分类和回归/线性模型/逻辑回归/imgs/1.4.png
--------------------------------------------------------------------------------
/分类和回归/线性模型/逻辑回归/imgs/1.5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/分类和回归/线性模型/逻辑回归/imgs/1.5.png
--------------------------------------------------------------------------------
/分类和回归/线性模型/逻辑回归/imgs/1.6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/分类和回归/线性模型/逻辑回归/imgs/1.6.png
--------------------------------------------------------------------------------
/分类和回归/线性模型/逻辑回归/imgs/1.7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/分类和回归/线性模型/逻辑回归/imgs/1.7.png
--------------------------------------------------------------------------------
/分类和回归/线性模型/逻辑回归/imgs/2.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/分类和回归/线性模型/逻辑回归/imgs/2.1.png
--------------------------------------------------------------------------------
/分类和回归/线性模型/逻辑回归/imgs/2.2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/分类和回归/线性模型/逻辑回归/imgs/2.2.png
--------------------------------------------------------------------------------
/分类和回归/线性模型/逻辑回归/imgs/2.3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/分类和回归/线性模型/逻辑回归/imgs/2.3.png
--------------------------------------------------------------------------------
/分类和回归/线性模型/逻辑回归/imgs/2.4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/分类和回归/线性模型/逻辑回归/imgs/2.4.png
--------------------------------------------------------------------------------
/分类和回归/线性模型/逻辑回归/imgs/2.5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/分类和回归/线性模型/逻辑回归/imgs/2.5.png
--------------------------------------------------------------------------------
/分类和回归/组合树/readme.md:
--------------------------------------------------------------------------------
1 | # Ensemble learning
2 | 
3 | Ensemble learning accomplishes a learning task by building and combining multiple learners, and is sometimes called a multi-classifier system. By combining several learners, an ensemble often achieves significantly better generalization than any single learner.
4 | 
5 | Based on how the individual learners are generated, current ensemble methods fall roughly into two categories: sequential methods, in which the learners depend strongly on one another and must be generated one after another, and parallel methods, in which the learners have no strong dependencies and can be generated simultaneously.
6 | `Boosting` is the representative of the former; `Bagging` and random forests represent the latter. The random forests chapter covers `Bagging` and random forests in detail, and the gradient-boosted trees chapter covers `Boosting` and gradient-boosted trees.
--------------------------------------------------------------------------------
/分类和回归/组合树/梯度提升树/imgs/1.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/分类和回归/组合树/梯度提升树/imgs/1.1.png
--------------------------------------------------------------------------------
/分类和回归/组合树/梯度提升树/imgs/1.2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/分类和回归/组合树/梯度提升树/imgs/1.2.png
--------------------------------------------------------------------------------
/分类和回归/组合树/梯度提升树/imgs/1.3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/分类和回归/组合树/梯度提升树/imgs/1.3.png
--------------------------------------------------------------------------------
/分类和回归/组合树/随机森林/imgs/1.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/分类和回归/组合树/随机森林/imgs/1.1.png
--------------------------------------------------------------------------------
/分类和回归/组合树/随机森林/imgs/1.2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/分类和回归/组合树/随机森林/imgs/1.2.png
--------------------------------------------------------------------------------
/分类和回归/组合树/随机森林/imgs/1.3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/分类和回归/组合树/随机森林/imgs/1.3.png
--------------------------------------------------------------------------------
/分类和回归/组合树/随机森林/imgs/1.4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/分类和回归/组合树/随机森林/imgs/1.4.png
--------------------------------------------------------------------------------
/基本统计/correlations.md:
--------------------------------------------------------------------------------
1 | # Correlation coefficients
2 | 
3 | Computing the correlation between two data series is a common statistical task. `MLlib` provides methods for computing pairwise correlations among multiple series. The correlation methods currently supported are Pearson and Spearman correlation.
4 | 
5 | `Statistics` provides methods to compute correlations between series. Depending on the type of input, two `RDD[Double]`s or one `RDD[Vector]`, the output is a single `Double` value or a correlation matrix, respectively. Here is an example.
6 |
7 | ```scala
8 | import org.apache.spark.SparkContext
9 | import org.apache.spark.mllib.linalg._
10 | import org.apache.spark.mllib.stat.Statistics
11 | val sc: SparkContext = ...
12 | val seriesX: RDD[Double] = ... // a series
13 | val seriesY: RDD[Double] = ... // must have the same number of partitions and cardinality as seriesX
14 | // compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. If a
15 | // method is not specified, Pearson's method will be used by default.
16 | val correlation: Double = Statistics.corr(seriesX, seriesY, "pearson")
17 | val data: RDD[Vector] = ... // note that each Vector is a row and not a column
18 | // calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method.
19 | // If a method is not specified, Pearson's method will be used by default.
20 | val correlMatrix: Matrix = Statistics.corr(data, "pearson")
21 | ```
22 | As the example shows, the entry point for computing correlations is `Statistics.corr`. When the input is two `RDD[Double]`s the actual implementation is `Correlations.corr`; when the input is an `RDD[Vector]` it is `Correlations.corrMatrix`.
23 | Both functions are analyzed below.
24 |
25 | ```scala
26 | def corr(x: RDD[Double],
27 | y: RDD[Double],
28 | method: String = CorrelationNames.defaultCorrName): Double = {
29 | val correlation = getCorrelationFromName(method)
30 | correlation.computeCorrelation(x, y)
31 | }
32 | def corrMatrix(X: RDD[Vector],
33 | method: String = CorrelationNames.defaultCorrName): Matrix = {
34 | val correlation = getCorrelationFromName(method)
35 | correlation.computeCorrelationMatrix(X)
36 | }
37 | ```
38 | The first step of both functions is to look up the correlation implementation corresponding to the method name. Moreover, if the input consists of two `RDD[Double]`s, `MLlib` zips them into a single `RDD[Vector]` so that both cases are handled uniformly.
39 |
40 | ```scala
41 | def computeCorrelationWithMatrixImpl(x: RDD[Double], y: RDD[Double]): Double = {
42 | val mat: RDD[Vector] = x.zip(y).map { case (xi, yi) => new DenseVector(Array(xi, yi)) }
43 | computeCorrelationMatrix(mat)(0, 1)
44 | }
45 | ```
46 | Each correlation method provides its own implementation of `computeCorrelationMatrix`. The Pearson and Spearman implementations are described separately below.
47 | 
48 | ## 1 Pearson correlation coefficient
49 | 
50 | The Pearson correlation coefficient, also called the Pearson product-moment correlation coefficient, is a statistic that reflects how strongly two variables are related. It can equally be viewed as a similarity measure between two vectors (used, for example, in vector-space-model text classification and in recommender systems). It is computed with the following formula:
51 |
52 | 
53 |
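For readers who cannot view the image, the standard definition is (my transcription):

$$\rho_{X,Y}=\frac{\operatorname{cov}(X,Y)}{\sigma_{X}\,\sigma_{Y}}=\frac{E\left[(X-\mu_{X})(Y-\mu_{Y})\right]}{\sigma_{X}\,\sigma_{Y}}$$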
54 | As the linear relationship between the two variables strengthens, the coefficient approaches 1 or -1: it tends to 1 for a positive relationship and to -1 for a negative one. When the two variables are independent the coefficient is 0, but the converse does not hold; only when `X` and `Y` follow a joint normal distribution are independence and zero correlation equivalent.
55 | The Pearson coefficient is computed by the following code.
56 |
57 | ```scala
58 | override def computeCorrelationMatrix(X: RDD[Vector]): Matrix = {
59 | val rowMatrix = new RowMatrix(X)
60 |     // compute the covariance matrix
61 | val cov = rowMatrix.computeCovariance()
62 | computeCorrelationMatrixFromCovariance(cov)
63 | }
64 | def computeCorrelationMatrixFromCovariance(covarianceMatrix: Matrix): Matrix = {
65 | val cov = covarianceMatrix.toBreeze.asInstanceOf[BDM[Double]]
66 | val n = cov.cols
67 |   // replace the diagonal entries with their standard deviations
68 | var i = 0
69 | while (i < n) {
70 | cov(i, i) = if (closeToZero(cov(i, i))) 0.0 else math.sqrt(cov(i, i))
71 | i +=1
72 | }
73 | // Loop through columns since cov is column major
74 | var j = 0
75 | var sigma = 0.0
76 | var containNaN = false
77 | while (j < n) {
78 | sigma = cov(j, j)
79 | i = 0
80 | while (i < j) {
81 | val corr = if (sigma == 0.0 || cov(i, i) == 0.0) {
82 | containNaN = true
83 | Double.NaN
84 | } else {
85 |         // apply the formula above, i.e. cov(x,y) / (sigma_x * sigma_y)
86 | cov(i, j) / (sigma * cov(i, i))
87 | }
88 | cov(i, j) = corr
89 | cov(j, i) = corr
90 | i += 1
91 | }
92 | j += 1
93 | }
94 | // put 1.0 on the diagonals
95 | i = 0
96 | while (i < n) {
97 | cov(i, i) = 1.0
98 | i +=1
99 | }
100 | Matrices.fromBreeze(cov)
101 | }
102 | ```
103 |
104 | ## 2 Spearman rank correlation coefficient
105 | 
106 | The Pearson correlation coefficient has two limitations: the data must be assumed to be drawn in pairs from normal distributions, and the data must be at least interval-scaled. For data that do not follow a normal distribution, a product-moment correlation coefficient is not an appropriate measure of association.
107 | In such cases rank correlation, also called order correlation, can be used to describe the strength and direction of the association between two variables. The Spearman rank correlation coefficient is one such measure.
108 | 
109 | The Spearman rank correlation coefficient is defined as the Pearson correlation coefficient between the ranked variables. For a sample of size `n`, the raw values `X_i` and `Y_i` are converted into ranks `rgX_i` and `rgY_i`, and the Pearson formula is then applied to the ranks.
110 |
111 | 
112 |
113 | The following code converts the raw data into ranks.
114 |
115 | ```scala
116 | override def computeCorrelationMatrix(X: RDD[Vector]): Matrix = {
117 | // ((columnIndex, value), rowUid)
118 |     // the rowUid produced by zipWithUniqueId is globally unique
119 | val colBased = X.zipWithUniqueId().flatMap { case (vec, uid) =>
120 | vec.toArray.view.zipWithIndex.map { case (v, j) =>
121 | ((j, v), uid)
122 | }
123 | }
124 |     // sort globally by (columnIndex, value); sorting means the data below only has to be traversed once
125 | val sorted = colBased.sortByKey()
126 |     // assign global ranks (using average ranks for tied values)
127 | val globalRanks = sorted.zipWithIndex().mapPartitions { iter =>
128 | var preCol = -1
129 | var preVal = Double.NaN
130 | var startRank = -1.0
131 | var cachedUids = ArrayBuffer.empty[Long]
132 | val flush: () => Iterable[(Long, (Int, Double))] = () => {
133 | val averageRank = startRank + (cachedUids.size - 1) / 2.0
134 | val output = cachedUids.map { uid =>
135 | (uid, (preCol, averageRank))
136 | }
137 | cachedUids.clear()
138 | output
139 | }
140 | iter.flatMap { case (((j, v), uid), rank) =>
141 |         // flush when a new value appears or cachedUids grows too large
142 | if (j != preCol || v != preVal || cachedUids.size >= 10000000) {
143 | val output = flush()
144 | preCol = j
145 | preVal = v
146 | startRank = rank
147 | cachedUids += uid
148 | output
149 | } else {
150 | cachedUids += uid
151 | Iterator.empty
152 | }
153 | } ++ flush()
154 | }
155 |     // replace the original values with their ranks
156 | val groupedRanks = globalRanks.groupByKey().map { case (uid, iter) =>
157 |       // sort by column index
158 | Vectors.dense(iter.toSeq.sortBy(_._1).map(_._2).toArray)
159 | }
160 | ```
161 | Within each partition, entries that share the same column index and the same value are assigned their average `rank`. The average `rank` is computed as shown in the formula below:
162 |
163 | 
164 |
165 | where `rank_start` is the position at which the group of entries with the same column index and value first appears in the partition, and `n` is the number of such entries.
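Written out, the `averageRank` computed in the code above is

$$rank_{avg}=rank_{start}+\frac{n-1}{2},$$

which is simply the mean of the `n` consecutive global ranks occupied by the tied group.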
166 |
167 | ## 3 References
168 |
169 | 【1】[Pearson product-moment correlation coefficient](https://en.wikipedia.org/wiki/Pearson_product-moment_correlation_coefficient)
170 |
171 | 【2】[Spearman's rank correlation coefficient](https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient)
172 |
173 | 【3】[相关性检验--Spearman秩相关系数和皮尔森相关系数](http://www.cnblogs.com/zhangchaoyang/articles/2631907.html)
174 |
175 |
176 |
177 |
--------------------------------------------------------------------------------
/基本统计/hypothesis-testing.md:
--------------------------------------------------------------------------------
1 | # Hypothesis testing
2 | 
3 | Hypothesis testing is a powerful statistical tool for judging whether a result is statistically significant, i.e., whether it could reasonably have occurred by chance. `spark.mllib` currently supports Pearson's chi-squared test. The type of the input data determines whether a goodness-of-fit test or a test of independence is performed:
4 | a goodness-of-fit test takes a `Vector` as input, whereas an independence test takes a `Matrix`.
6 | `spark.mllib`也支持输入数据类型为`RDD[LabeledPoint]`,它用来通过卡方独立性检测作特征选择。`Statistics`提供方法用来作皮尔森卡方检测。下面是一个例子:
7 |
8 | ```scala
9 | import org.apache.spark.SparkContext
10 | import org.apache.spark.mllib.linalg._
11 | import org.apache.spark.mllib.regression.LabeledPoint
12 | import org.apache.spark.mllib.stat.Statistics._
13 | val sc: SparkContext = ...
14 | val vec: Vector = ... // a vector composed of the frequencies of events
15 | // Pearson goodness-of-fit test
16 | val goodnessOfFitTestResult = Statistics.chiSqTest(vec)
17 | println(goodnessOfFitTestResult)
18 | val mat: Matrix = ... // a contingency matrix
19 | // Pearson independence test
20 | val independenceTestResult = Statistics.chiSqTest(mat)
21 | println(independenceTestResult) // summary of the test including the p-value, degrees of freedom...
22 | val obs: RDD[LabeledPoint] = ... // (feature, label) pairs.
23 | // independence test used for feature selection
24 | val featureTestResults: Array[ChiSqTestResult] = Statistics.chiSqTest(obs)
25 | var i = 1
26 | featureTestResults.foreach { result =>
27 | println(s"Column $i:\n$result")
28 | i += 1
29 | }
30 | ```
31 | In addition, `spark.mllib` provides a `1-sample, 2-sided` implementation of the `Kolmogorov-Smirnov (KS)` test for equality of probability distributions. By providing the name of a theoretical distribution (currently only the normal distribution is supported) together with its parameters,
32 | or a function that computes the cumulative distribution (`cumulative distribution`), the user can test the null hypothesis (`null hypothesis`) that the sample is drawn from that distribution. If the user tests against the normal distribution without providing the distribution parameters, the test defaults to the standard normal distribution.
33 |
34 | `Statistics` provides a method to run a `1-sample, 2-sided KS` test; an example of its use follows.
35 |
36 | ```scala
37 | import org.apache.spark.mllib.stat.Statistics
38 | val data: RDD[Double] = ... // an RDD of sample data
39 | // run a KS test for the sample versus a standard normal distribution
40 | val testResult = Statistics.kolmogorovSmirnovTest(data, "norm", 0, 1)
41 | println(testResult)
42 | // perform a KS test using a cumulative distribution function of our making
43 | val myCDF: Double => Double = ...
44 | val testResult2 = Statistics.kolmogorovSmirnovTest(data, myCDF)
45 | ```
46 |
47 | ## Streaming significance testing
48 |
49 | A significance test is used to decide whether the effects of a treatment group and a control group (or of two different treatments) differ, and whether that difference is significant.
50 |
51 | The hypothesis to be tested is usually denoted `H0` and called the null hypothesis (`null hypothesis`); the hypothesis opposed to `H0` is denoted `H1` and called the alternative hypothesis (`alternative hypothesis`).
52 |
53 | - Rejecting the null hypothesis when it is in fact true is a type I error; its probability is usually denoted `alpha`.
54 |
55 | - Accepting the null hypothesis when it is in fact false is a type II error; its probability is usually denoted `beta`.
56 |
57 | Usually only the maximum probability `alpha` of a type I error is constrained, and the probability `beta` of a type II error is left unconsidered. Such a test is called a significance test, and `alpha` is called the significance level.
58 |
59 | `MLlib` provides online implementations of some tests to support use cases such as `A/B` testing. These tests run on a `Spark Streaming` `DStream[(Boolean,Double)]`, where the first element of each tuple marks the control group (`control group (false)`) or the treatment group (`treatment group (true)`)
60 | and the second element is the observed value.
61 |
62 | Streaming significance testing supports the following parameters:
63 |
64 | - `peacePeriod`: the number of initial data points from the stream to ignore, used to mitigate `novelty effects`;
65 |
66 | - `windowSize`: the number of past batches over which the hypothesis test is performed. If set to 0, all previous batches are accumulated.
67 |
68 | `StreamingTest` supports streaming hypothesis testing. Here is an example of its use.
69 |
70 | ```scala
71 | val data = ssc.textFileStream(dataDir).map(line => line.split(",") match {
72 | case Array(label, value) => BinarySample(label.toBoolean, value.toDouble)
73 | })
74 | val streamingTest = new StreamingTest()
75 | .setPeacePeriod(0)
76 | .setWindowSize(0)
77 | .setTestMethod("welch")
78 | val out = streamingTest.registerStream(data)
79 | out.print()
80 | ```
81 |
82 | # References
83 |
84 | 【1】[显著性检验](http://wiki.mbalib.com/wiki/Significance_Testing)
--------------------------------------------------------------------------------
/基本统计/imgs/1.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/基本统计/imgs/1.1.png
--------------------------------------------------------------------------------
/基本统计/imgs/1.2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/基本统计/imgs/1.2.png
--------------------------------------------------------------------------------
/基本统计/imgs/1.3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/基本统计/imgs/1.3.png
--------------------------------------------------------------------------------
/基本统计/imgs/1.4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/基本统计/imgs/1.4.png
--------------------------------------------------------------------------------
/基本统计/imgs/1.5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/基本统计/imgs/1.5.png
--------------------------------------------------------------------------------
/基本统计/imgs/1.6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/基本统计/imgs/1.6.png
--------------------------------------------------------------------------------
/基本统计/imgs/2.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/基本统计/imgs/2.1.png
--------------------------------------------------------------------------------
/基本统计/imgs/2.2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/基本统计/imgs/2.2.png
--------------------------------------------------------------------------------
/基本统计/imgs/2.3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/基本统计/imgs/2.3.png
--------------------------------------------------------------------------------
/基本统计/imgs/5.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/基本统计/imgs/5.1.png
--------------------------------------------------------------------------------
/基本统计/imgs/5.2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/基本统计/imgs/5.2.png
--------------------------------------------------------------------------------
/基本统计/kernel-density-estimation.md:
--------------------------------------------------------------------------------
1 | # Kernel density estimation
2 |
3 | ## 1 Theory
4 |
5 | Kernel density estimation is a non-parametric method in probability theory for estimating an unknown density function. Suppose we have `n` samples $x_{1},x_{2},...,x_{n}$ and want to know the probability density at some point `X`;
6 | it can be estimated with the kernel density estimator below.
7 |
8 | ![1.1](imgs/1.1.png)
9 |
10 | In the formula above, `K` is the kernel function and `h` is the bandwidth. The idea behind kernel density estimation is simple: if a value appears among the observations, we assume the probability density at that value is high, values close to it also have fairly high density, and values far from it have low density.
11 |
12 | Based on this idea, for each observation we use `K` to model this "large near, small far" density. The density functions fitted around the individual observations are then averaged, or weighted-averaged if some observations are more important than others. Note that kernel density estimation does not recover the true distribution function.
13 |
14 | `MLlib` only supports kernel density estimation with a Gaussian kernel. With a Gaussian kernel, the kernel density estimate, formula **(1)**, is:
15 |
16 | 
17 |
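A minimal local sketch of formula **(1)** (illustrative, not the MLlib code), estimating the density at a point `x` from a sample with bandwidth `h`:

```scala
// Gaussian-kernel density estimate at x: (1/n) * sum over samples xi of N(x; xi, h^2).
def gaussianKde(samples: Array[Double], h: Double)(x: Double): Double = {
  val norm = 1.0 / (h * math.sqrt(2 * math.Pi))
  samples.map { xi =>
    val u = (x - xi) / h
    norm * math.exp(-0.5 * u * u)
  }.sum / samples.length
}

val samples = Array(-1.0, 0.0, 0.5, 2.0)
val densityAtZero = gaussianKde(samples, h = 1.0)(0.0)
```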
18 | ## 2 Example
19 |
20 | `KernelDensity` provides a method to compute kernel density estimates from an `RDD` of samples. The following example shows how to use it.
21 |
22 | ```scala
23 | import org.apache.spark.mllib.stat.KernelDensity
24 | import org.apache.spark.rdd.RDD
25 | val data: RDD[Double] = ... // an RDD of sample data
26 | // Construct the density estimator with the sample data and a standard deviation for the Gaussian
27 | // kernels
28 | val kd = new KernelDensity()
29 | .setSample(data)
30 | .setBandwidth(3.0)
31 | // Find density estimates for the given values
32 | val densities = kd.estimate(Array(-1.0, 2.0, 5.0))
33 | ```
34 |
35 | ## 3 Implementation
36 |
37 | Kernel density estimation is implemented in the `KernelDensity.estimate` method, shown below.
38 |
39 | ```scala
40 | def estimate(points: Array[Double]): Array[Double] = {
41 | val sample = this.sample
42 | val bandwidth = this.bandwidth
43 | val n = points.length
44 | // this value is needed in every Gaussian density evaluation, so it is computed up front
45 | val logStandardDeviationPlusHalfLog2Pi = math.log(bandwidth) + 0.5 * math.log(2 * math.Pi)
46 | val (densities, count) = sample.aggregate((new Array[Double](n), 0L))(
47 | (x, y) => {
48 | var i = 0
49 | while (i < n) {
50 | x._1(i) += normPdf(y, bandwidth, logStandardDeviationPlusHalfLog2Pi, points(i))
51 | i += 1
52 | }
53 | (x._1, x._2 + 1)
54 | },
55 | (x, y) => {
56 | // daxpy adds a scaled vector to another vector: dy[i] += da * dx[i], where da is a constant
57 | blas.daxpy(n, 1.0, y._1, 1, x._1, 1)
58 | (x._1, x._2 + y._2)
59 | })
60 | // scale the vector by a constant
61 | blas.dscal(n, 1.0 / count, densities, 1)
62 | densities
63 | }
64 | }
65 | ```
66 | The `seqOp` function above calls `normPdf`, which evaluates the probability density of the Gaussian kernel; see formula **(1)** above. Formula **(1)** is implemented by the following code.
67 |
68 | ```scala
69 | def normPdf(
70 | mean: Double,
71 | standardDeviation: Double,
72 | logStandardDeviationPlusHalfLog2Pi: Double,
73 | x: Double): Double = {
74 | val x0 = x - mean
75 | val x1 = x0 / standardDeviation
76 | val logDensity = -0.5 * x1 * x1 - logStandardDeviationPlusHalfLog2Pi
77 | math.exp(logDensity)
78 | }
79 | ```
80 | This method first computes the logarithm of formula **(1)** and then exponentiates the result.
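As a sanity check, a direct (non-log) evaluation of the Gaussian density gives the same value up to floating-point error (a sketch, assuming the same parameterization as `normPdf`):

```scala
// Direct evaluation of the Gaussian density N(mean, sd^2) at x.
def gaussianPdf(mean: Double, sd: Double, x: Double): Double = {
  val z = (x - mean) / sd
  math.exp(-0.5 * z * z) / (sd * math.sqrt(2 * math.Pi))
}
// normPdf(mean, sd, math.log(sd) + 0.5 * math.log(2 * math.Pi), x) ≈ gaussianPdf(mean, sd, x)
```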
81 |
82 | ## References
83 |
84 | 【1】[核密度估计](http://blog.163.com/zhuandi_h/blog/static/1802702882012111092743556/)
85 |
86 | 【2】[R语言与非参数统计(核密度估计)](http://blog.sina.com.cn/s/blog_62b37bfe0101homb.html)
--------------------------------------------------------------------------------
/基本统计/random-data-generation.md:
--------------------------------------------------------------------------------
1 | # Random data generation
2 |
3 | Random data generation is useful in randomized algorithms and performance testing. `spark.mllib` supports generating random `RDD`s whose independent and identically distributed (`iid`) values are drawn from a given distribution: uniform, standard normal, or Poisson.
4 |
5 | `RandomRDDs` provides factory methods to generate random double `RDD`s or vector `RDD`s. The following example generates a random double `RDD` whose values are drawn from the standard normal distribution `N(0,1)`.
6 |
7 | ```scala
8 | import org.apache.spark.SparkContext
9 | import org.apache.spark.mllib.random.RandomRDDs._
10 | val sc: SparkContext = ...
11 | // Generate a random double RDD that contains 1 million i.i.d. values drawn from the
12 | // standard normal distribution `N(0, 1)`, evenly distributed in 10 partitions.
13 | val u = normalRDD(sc, 1000000L, 10)
14 | // Apply a transform to get a random double RDD following `N(1, 4)`.
15 | val v = u.map(x => 1.0 + 2.0 * x)
16 | ```
17 |
18 | The implementation of `normalRDD` is shown below.
19 |
20 | ```scala
21 | def normalRDD(
22 | sc: SparkContext,
23 | size: Long,
24 | numPartitions: Int = 0,
25 | seed: Long = Utils.random.nextLong()): RDD[Double] = {
26 | val normal = new StandardNormalGenerator()
27 | randomRDD(sc, normal, size, numPartitionsOrDefault(sc, numPartitions), seed)
28 | }
29 | def randomRDD[T: ClassTag](
30 | sc: SparkContext,
31 | generator: RandomDataGenerator[T],
32 | size: Long,
33 | numPartitions: Int = 0,
34 | seed: Long = Utils.random.nextLong()): RDD[T] = {
35 | new RandomRDD[T](sc, size, numPartitionsOrDefault(sc, numPartitions), generator, seed)
36 | }
37 | private[mllib] class RandomRDD[T: ClassTag](sc: SparkContext,
38 | size: Long,
39 | numPartitions: Int,
40 | @transient private val rng: RandomDataGenerator[T],
41 | @transient private val seed: Long = Utils.random.nextLong) extends RDD[T](sc, Nil)
42 | ```
--------------------------------------------------------------------------------
/基本统计/summary-statistics.md:
--------------------------------------------------------------------------------
1 | # Summary statistics
2 |
3 | `MLlib` supports column-wise summary statistics for `RDD[Vector]` via the `colStats` method of `Statistics`. `colStats` returns a `MultivariateStatisticalSummary` object, which contains the column-wise maximum, minimum, mean, variance, and so on.
4 | Here is an example:
5 |
6 | ```scala
7 | import org.apache.spark.mllib.linalg.Vector
8 | import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics}
9 | val observations: RDD[Vector] = ... // an RDD of Vectors
10 | // Compute column summary statistics.
11 | val summary: MultivariateStatisticalSummary = Statistics.colStats(observations)
12 | println(summary.mean) // a dense vector containing the mean value for each column
13 | println(summary.variance) // column-wise variance
14 | println(summary.numNonzeros) // number of nonzeros in each column
15 | ```
16 | Let us now look at the implementation of `colStats`.
17 |
18 | ```scala
19 | def colStats(X: RDD[Vector]): MultivariateStatisticalSummary = {
20 | new RowMatrix(X).computeColumnSummaryStatistics()
21 | }
22 | ```
23 | The code above is straightforward: it builds a `RowMatrix` from the input `RDD` and computes the statistics with its `computeColumnSummaryStatistics` method.
24 |
25 | ```scala
26 | def computeColumnSummaryStatistics(): MultivariateStatisticalSummary = {
27 | val summary = rows.treeAggregate(new MultivariateOnlineSummarizer)(
28 | (aggregator, data) => aggregator.add(data),
29 | (aggregator1, aggregator2) => aggregator1.merge(aggregator2))
30 | updateNumRows(summary.count)
31 | summary
32 | }
33 | ```
34 | The code above calls the `RDD` method `treeAggregate`, an aggregation that iterates over the data in the `RDD`: `(aggregator, data) => aggregator.add(data)` processes each row by adding it to a `MultivariateOnlineSummarizer`,
35 | and `(aggregator1, aggregator2) => aggregator1.merge(aggregator2)` combines the `MultivariateOnlineSummarizer` objects from different partitions. The core of the implementation is therefore the `add` and `merge` methods, both defined in `MultivariateOnlineSummarizer`.
36 | Let us look at `add` first.
37 |
38 | ```scala
39 | @Since("1.1.0")
40 | def add(sample: Vector): this.type = add(sample, 1.0)
41 | private[spark] def add(instance: Vector, weight: Double): this.type = {
42 | if (weight == 0.0) return this
43 | if (n == 0) {
44 | n = instance.size
45 | currMean = Array.ofDim[Double](n)
46 | currM2n = Array.ofDim[Double](n)
47 | currM2 = Array.ofDim[Double](n)
48 | currL1 = Array.ofDim[Double](n)
49 | nnz = Array.ofDim[Double](n)
50 | currMax = Array.fill[Double](n)(Double.MinValue)
51 | currMin = Array.fill[Double](n)(Double.MaxValue)
52 | }
53 | val localCurrMean = currMean
54 | val localCurrM2n = currM2n
55 | val localCurrM2 = currM2
56 | val localCurrL1 = currL1
57 | val localNnz = nnz
58 | val localCurrMax = currMax
59 | val localCurrMin = currMin
60 | instance.foreachActive { (index, value) =>
61 | if (value != 0.0) {
62 | if (localCurrMax(index) < value) {
63 | localCurrMax(index) = value
64 | }
65 | if (localCurrMin(index) > value) {
66 | localCurrMin(index) = value
67 | }
68 | val prevMean = localCurrMean(index)
69 | val diff = value - prevMean
70 | localCurrMean(index) = prevMean + weight * diff / (localNnz(index) + weight)
71 | localCurrM2n(index) += weight * (value - localCurrMean(index)) * diff
72 | localCurrM2(index) += weight * value * value
73 | localCurrL1(index) += weight * math.abs(value)
74 | localNnz(index) += weight
75 | }
76 | }
77 | weightSum += weight
78 | weightSquareSum += weight * weight
79 | totalCnt += 1
80 | this
81 | }
82 | ```
83 | This code uses an online algorithm to compute the mean and variance. Following reference 【1】, the mean and variance obey the iterative update formulas below:
84 |
85 | 
86 |
87 | 
88 |
89 | In the formulas above, `x` denotes the sample mean, `s` the sample variance, and `delta` the population variance. `MLlib` implements a weighted version, so the update formulas it uses differ slightly; see reference 【2】.
90 |
91 | 
92 |
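A minimal single-column sketch of this weighted online update (illustrative, not the MLlib code):

```scala
// Running weighted mean and sum of squared deviations (m2n) for one column.
var mean = 0.0
var m2n = 0.0
var nnz = 0.0
def addValue(value: Double, w: Double = 1.0): Unit = {
  val diff = value - mean
  mean += w * diff / (nnz + w)              // mean' = mean + w * (value - mean) / (nnz + w)
  m2n += w * (value - mean) * diff          // m2n'  = m2n + w * (value - mean') * (value - mean)
  nnz += w
}
Seq(1.0, 2.0, 4.0).foreach(v => addValue(v))
// mean == 7.0 / 3, and m2n / (nnz - 1) == 7.0 / 3, the unweighted sample variance of {1, 2, 4}
```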
93 | The `merge` method is comparatively simple: it merges the statistics of two `MultivariateOnlineSummarizer` objects.
94 |
95 | ```scala
96 | def merge(other: MultivariateOnlineSummarizer): this.type = {
97 | if (this.weightSum != 0.0 && other.weightSum != 0.0) {
98 | totalCnt += other.totalCnt
99 | weightSum += other.weightSum
100 | weightSquareSum += other.weightSquareSum
101 | var i = 0
102 | while (i < n) {
103 | val thisNnz = nnz(i)
104 | val otherNnz = other.nnz(i)
105 | val totalNnz = thisNnz + otherNnz
106 | if (totalNnz != 0.0) {
107 | val deltaMean = other.currMean(i) - currMean(i)
108 | // merge mean together
109 | currMean(i) += deltaMean * otherNnz / totalNnz
110 | // merge m2n together; this is not a plain sum
111 | currM2n(i) += other.currM2n(i) + deltaMean * deltaMean * thisNnz * otherNnz / totalNnz
112 | // merge m2 together
113 | currM2(i) += other.currM2(i)
114 | // merge l1 together
115 | currL1(i) += other.currL1(i)
116 | // merge max and min
117 | currMax(i) = math.max(currMax(i), other.currMax(i))
118 | currMin(i) = math.min(currMin(i), other.currMin(i))
119 | }
120 | nnz(i) = totalNnz
121 | i += 1
122 | }
123 | } else if (weightSum == 0.0 && other.weightSum != 0.0) {
124 | this.n = other.n
125 | this.currMean = other.currMean.clone()
126 | this.currM2n = other.currM2n.clone()
127 | this.currM2 = other.currM2.clone()
128 | this.currL1 = other.currL1.clone()
129 | this.totalCnt = other.totalCnt
130 | this.weightSum = other.weightSum
131 | this.weightSquareSum = other.weightSquareSum
132 | this.nnz = other.nnz.clone()
133 | this.currMax = other.currMax.clone()
134 | this.currMin = other.currMin.clone()
135 | }
136 | this
137 | }
138 | ```
139 | Note that parallelizing the online algorithm is a special case. For example, if the sample set `X` is split across two partitions `X_A` and `X_B`, merging them must satisfy the formula below:
140 |
141 | 
142 |
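A minimal sketch of that merge rule for a single column (illustrative, not the MLlib code): given the count, mean, and sum of squared deviations of each partition, the combined statistics are

```scala
case class Stats(n: Double, mean: Double, m2n: Double)

def mergeStats(a: Stats, b: Stats): Stats = {
  val n = a.n + b.n
  val delta = b.mean - a.mean
  Stats(n,
    a.mean + delta * b.n / n,                           // merged mean
    a.m2n + b.m2n + delta * delta * a.n * b.n / n)      // merged sum of squared deviations
}

// Merging Stats(2, 1.5, 0.5) (from {1, 2}) with Stats(1, 4.0, 0.0) (from {4}) gives
// mean 7/3 and m2n 14/3 -- identical to processing 1, 2, 4 in a single pass.
```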
143 | From reference 【3】 we know that the unbiased estimate of the sample variance is given by the formulas below:
144 |
145 | 
146 |
147 | 
148 |
149 | The actual sample mean and sample variance are therefore computed by the following code.
150 |
151 | ```scala
152 | override def mean: Vector = {
153 | val realMean = Array.ofDim[Double](n)
154 | var i = 0
155 | while (i < n) {
156 | realMean(i) = currMean(i) * (nnz(i) / weightSum)
157 | i += 1
158 | }
159 | Vectors.dense(realMean)
160 | }
161 | override def variance: Vector = {
162 | val realVariance = Array.ofDim[Double](n)
163 | val denominator = weightSum - (weightSquareSum / weightSum)
164 | // Sample variance is computed, if the denominator is less than 0, the variance is just 0.
165 | if (denominator > 0.0) {
166 | val deltaMean = currMean
167 | var i = 0
168 | val len = currM2n.length
169 | while (i < len) {
170 | realVariance(i) = (currM2n(i) + deltaMean(i) * deltaMean(i) * nnz(i) *
171 | (weightSum - nnz(i)) / weightSum) / denominator
172 | i += 1
173 | }
174 | }
175 | Vectors.dense(realVariance)
176 | }
177 | ```
178 |
179 | # References
180 |
181 | 【1】[Algorithms for calculating variance](https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance)
182 |
183 | 【2】[Updating mean and variance estimates: an improved method](http://people.xiph.org/~tterribe/tmp/homs/West79-_Updating_Mean_and_Variance_Estimates-_An_Improved_Method.pdf)
184 |
185 | 【3】[Weighted arithmetic mean](https://en.wikipedia.org/wiki/Weighted_arithmetic_mean)
186 |
--------------------------------------------------------------------------------
/基本统计/tratified-sampling.md:
--------------------------------------------------------------------------------
1 | # Stratified sampling
2 |
3 | Stratified sampling first divides the population into sub-populations (strata) according to some characteristic and then draws a simple random sample from each stratum to form the overall sample. In `spark.mllib`, the strata are identified by `key`.
4 |
5 | Unlike the other statistics functions in `spark.mllib`, the stratified sampling methods `sampleByKey` and `sampleByKeyExact` operate on `RDD`s of `key-value` pairs. The `key` can be thought of as a label and the
6 | `value` as a specific attribute. For example, the `key` could be man or woman, or a document `id`, and the corresponding `value` could be an age or the list of words in the document. `sampleByKey` decides by a coin flip whether an observation is sampled,
7 | so it requires only one pass (`pass over`) over the data and yields an expected sample size. `sampleByKeyExact` needs significantly more resources than per-stratum simple random sampling with `sampleByKey`, but it produces the exact sample size with `99.99%` confidence.
8 |
9 | [sampleByKeyExact()](https://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.rdd.PairRDDFunctions) allows the user to sample exactly `f_k * n_k` items,
10 | where `f_k` is the desired fraction for key `k` and `n_k` is the number of key-value pairs with key `k`. Here is an example of its use:
11 |
12 | ```scala
13 | import org.apache.spark.SparkContext
14 | import org.apache.spark.SparkContext._
15 | import org.apache.spark.rdd.PairRDDFunctions
16 | val sc: SparkContext = ...
17 | val data = ... // an RDD[(K, V)] of any key value pairs
18 | val fractions: Map[K, Double] = ... // specify the exact fraction desired from each key
19 | // Get an exact sample from each stratum
20 | val approxSample = data.sampleByKey(withReplacement = false, fractions)
21 | val exactSample = data.sampleByKeyExact(withReplacement = false, fractions)
22 | ```
23 |
24 | When `withReplacement` is `true`, a `PoissonSampler` is used; when `withReplacement` is `false`, a `BernoulliSampler` is used.
25 |
26 | ```scala
27 | def sampleByKey(withReplacement: Boolean,
28 | fractions: Map[K, Double],
29 | seed: Long = Utils.random.nextLong): RDD[(K, V)] = self.withScope {
30 | val samplingFunc = if (withReplacement) {
31 | StratifiedSamplingUtils.getPoissonSamplingFunction(self, fractions, false, seed)
32 | } else {
33 | StratifiedSamplingUtils.getBernoulliSamplingFunction(self, fractions, false, seed)
34 | }
35 | self.mapPartitionsWithIndex(samplingFunc, preservesPartitioning = true)
36 | }
37 | def sampleByKeyExact(
38 | withReplacement: Boolean,
39 | fractions: Map[K, Double],
40 | seed: Long = Utils.random.nextLong): RDD[(K, V)] = self.withScope {
41 | val samplingFunc = if (withReplacement) {
42 | StratifiedSamplingUtils.getPoissonSamplingFunction(self, fractions, true, seed)
43 | } else {
44 | StratifiedSamplingUtils.getBernoulliSamplingFunction(self, fractions, true, seed)
45 | }
46 | self.mapPartitionsWithIndex(samplingFunc, preservesPartitioning = true)
47 | }
48 | ```
49 | Let us now look at the implementations of `sampleByKey` and `sampleByKeyExact`.
50 |
51 | ## 1 Implementation of `sampleByKey`
52 |
53 | When sampling with replacement, a Poisson sampler is used; when sampling without replacement, a Bernoulli sampler is used. The implementation of `sampleByKey` is straightforward: it is plain uniform random sampling within each stratum.
54 |
55 | ### 1.1 Poisson sampler
56 |
57 | Let us first look at the implementation of the Poisson sampler.
58 |
59 | ```scala
60 | def getPoissonSamplingFunction[K: ClassTag, V: ClassTag](rdd: RDD[(K, V)],
61 | fractions: Map[K, Double],
62 | exact: Boolean,
63 | seed: Long): (Int, Iterator[(K, V)]) => Iterator[(K, V)] = {
64 | (idx: Int, iter: Iterator[(K, V)]) => {
65 | // initialize the random data generator
66 | val rng = new RandomDataGenerator()
67 | rng.reSeed(seed + idx)
68 | iter.flatMap { item =>
69 | // draw the next Poisson value
70 | val count = rng.nextPoisson(fractions(item._1))
71 | if (count == 0) {
72 | Iterator.empty
73 | } else {
74 | Iterator.fill(count)(item)
75 | }
76 | }
77 | }
78 | }
79 | ```
80 | `getPoissonSamplingFunction` returns a function that is passed to `mapPartitionsWithIndex` to process each partition. Here `RandomDataGenerator` is a random generator that can produce both uniform values (`uniform values`) and Poisson values (`Poisson values`).
81 |
82 | ### 1.2 Bernoulli sampler
83 |
84 | ```scala
85 | def getBernoulliSamplingFunction[K, V](rdd: RDD[(K, V)],
86 | fractions: Map[K, Double],
87 | exact: Boolean,
88 | seed: Long): (Int, Iterator[(K, V)]) => Iterator[(K, V)] = {
89 | var samplingRateByKey = fractions
90 | (idx: Int, iter: Iterator[(K, V)]) => {
91 | // initialize the random data generator
92 | val rng = new RandomDataGenerator()
93 | rng.reSeed(seed + idx)
94 | // Must use the same invoke pattern on the rng as in getSeqOp for without replacement
95 | // in order to generate the same sequence of random numbers when creating the sample
96 | iter.filter(t => rng.nextUniform() < samplingRateByKey(t._1))
97 | }
98 | }
99 | ```
100 |
101 | ## 2 Implementation of `sampleByKeyExact`
102 |
103 | `sampleByKeyExact` produces a more accurate sample. Its implementation again distinguishes two cases: sampling with replacement uses a Poisson sampler, and sampling without replacement uses a Bernoulli sampler.
104 |
105 | ### 2.1 Poisson sampler
106 |
107 | ```scala
108 | val counts = Some(rdd.countByKey())
109 | // compute the number of items accepted outright and build a waitlist for each stratum
110 | val finalResult = getAcceptanceResults(rdd, true, fractions, counts, seed)
111 | // determine the acceptance threshold that yields the exact sample size
112 | val thresholdByKey = computeThresholdByKey(finalResult, fractions)
113 | (idx: Int, iter: Iterator[(K, V)]) => {
114 | val rng = new RandomDataGenerator()
115 | rng.reSeed(seed + idx)
116 | iter.flatMap { item =>
117 | val key = item._1
118 | val acceptBound = finalResult(key).acceptBound
119 | // Must use the same invoke pattern on the rng as in getSeqOp for with replacement
120 | // in order to generate the same sequence of random numbers when creating the sample
121 | val copiesAccepted = if (acceptBound == 0) 0L else rng.nextPoisson(acceptBound)
122 | // waitlist
123 | val copiesWaitlisted = rng.nextPoisson(finalResult(key).waitListBound)
124 | val copiesInSample = copiesAccepted +
125 | (0 until copiesWaitlisted).count(i => rng.nextUniform() < thresholdByKey(key))
126 | if (copiesInSample > 0) {
127 | Iterator.fill(copiesInSample.toInt)(item)
128 | } else {
129 | Iterator.empty
130 | }
131 | }
132 | }
133 | ```
134 |
135 | ### 2.2 Bernoulli sampler
136 |
137 | ```scala
138 | def getBernoulliSamplingFunction[K, V](rdd: RDD[(K, V)],
139 | fractions: Map[K, Double],
140 | exact: Boolean,
141 | seed: Long): (Int, Iterator[(K, V)]) => Iterator[(K, V)] = {
142 | var samplingRateByKey = fractions
143 | // compute the number of items accepted outright and build a waitlist for each stratum
144 | val finalResult = getAcceptanceResults(rdd, false, fractions, None, seed)
145 | // determine the acceptance threshold that yields the exact sample size
146 | samplingRateByKey = computeThresholdByKey(finalResult, fractions)
147 | (idx: Int, iter: Iterator[(K, V)]) => {
148 | val rng = new RandomDataGenerator()
149 | rng.reSeed(seed + idx)
150 | // Must use the same invoke pattern on the rng as in getSeqOp for without replacement
151 | // in order to generate the same sequence of random numbers when creating the sample
152 | iter.filter(t => rng.nextUniform() < samplingRateByKey(t._1))
153 | }
154 | }
155 | ```
--------------------------------------------------------------------------------
/推荐/imgs/ALS.1.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/推荐/imgs/ALS.1.1.png
--------------------------------------------------------------------------------
/推荐/imgs/ALS.2.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/推荐/imgs/ALS.2.1.png
--------------------------------------------------------------------------------
/推荐/imgs/ALS.3.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/推荐/imgs/ALS.3.1.png
--------------------------------------------------------------------------------
/推荐/imgs/ALS.3.2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/推荐/imgs/ALS.3.2.png
--------------------------------------------------------------------------------
/推荐/imgs/math.1.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/推荐/imgs/math.1.1.png
--------------------------------------------------------------------------------
/推荐/imgs/math.2.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/推荐/imgs/math.2.1.png
--------------------------------------------------------------------------------
/推荐/imgs/math.2.2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/推荐/imgs/math.2.2.png
--------------------------------------------------------------------------------
/推荐/imgs/math.2.3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/推荐/imgs/math.2.3.png
--------------------------------------------------------------------------------
/推荐/imgs/math.2.4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/推荐/imgs/math.2.4.png
--------------------------------------------------------------------------------
/推荐/papers/Collaborative Filtering for Implicit Feedback Datasets.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/推荐/papers/Collaborative Filtering for Implicit Feedback Datasets.pdf
--------------------------------------------------------------------------------
/推荐/papers/Large-scale Parallel Collaborative Filtering the Netflix Prize.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/推荐/papers/Large-scale Parallel Collaborative Filtering the Netflix Prize.pdf
--------------------------------------------------------------------------------
/推荐/papers/Matrix Factorization Techniques for Recommender Systems.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/推荐/papers/Matrix Factorization Techniques for Recommender Systems.pdf
--------------------------------------------------------------------------------
/数据类型/imgs/1.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/数据类型/imgs/1.1.png
--------------------------------------------------------------------------------
/最优化算法/IRLS.md:
--------------------------------------------------------------------------------
1 | # Iteratively reweighted least squares
2 |
3 | ## 1 Principle
4 |
5 | Iteratively reweighted least squares (`IRLS`) is used to solve optimization problems whose objective function has the form:
6 |
7 | $$arg min_{\beta} \sum_{i=1}^{n}|y_{i} - f_{i}(\beta)|^{p}$$
8 |
9 | This objective can be minimized iteratively. Each iteration solves a weighted least squares problem of the form:
10 |
11 | $$\beta ^{t+1} = argmin_{\beta} \sum_{i=1}^{n} w_{i}(\beta^{(t)})|y_{i} - f_{i}(\beta)|^{2} = (X^{T}W^{(t)}X)^{-1}X^{T}W^{(t)}y$$
12 |
13 | Here $W^{(t)}$ is a diagonal weight matrix whose entries are all initialized to 1 and updated at every iteration by:
14 |
15 | $$W_{i}^{(t)} = |y_{i} - X_{i}\beta^{(t)}|^{p-2}$$
16 |
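A minimal sketch of this iteration for `p = 1` (least absolute deviations) with a single feature and no intercept (illustrative values, not the spark ml code):

```scala
// IRLS for p = 1: each iteration solves weighted least squares with w_i = |y_i - x_i * beta|^(p - 2).
val x = Array(1.0, 2.0, 3.0, 4.0)
val y = Array(1.1, 1.9, 3.2, 8.0)          // the last point is an outlier
var beta = 1.0
val eps = 1e-8                             // guards the weights against division by zero
for (_ <- 0 until 50) {
  val w = x.indices.map(i => 1.0 / math.max(math.abs(y(i) - x(i) * beta), eps))
  // closed-form weighted least squares for a single feature: beta = (X^T W y) / (X^T W X)
  val num = x.indices.map(i => w(i) * x(i) * y(i)).sum
  val den = x.indices.map(i => w(i) * x(i) * x(i)).sum
  beta = num / den
}
// beta settles in the least-absolute-deviations range (roughly 1.07 - 1.1),
// while ordinary least squares would give about 1.55, pulled up by the outlier.
```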
17 | ## 2 Source code analysis
18 |
19 | In `spark ml`, iteratively reweighted least squares is mainly used to solve generalized linear regression. Let us look at the implementation.
20 |
21 | ### 2.1 Updating the weights
22 |
23 | ```scala
24 | // Update offsets and weights using reweightFunc
25 | val newInstances = instances.map { instance =>
26 | val (newOffset, newWeight) = reweightFunc(instance, oldModel)
27 | Instance(newOffset, newWeight, instance.features)
28 | }
29 | ```
30 | The weights are updated here with the `reweightFunc` method, whose concrete implementation lives in the generalized linear regression code.
31 |
32 | ```scala
33 | /**
34 | * The reweight function used to update offsets and weights
35 | * at each iteration of [[IterativelyReweightedLeastSquares]].
36 | */
37 | val reweightFunc: (Instance, WeightedLeastSquaresModel) => (Double, Double) = {
38 | (instance: Instance, model: WeightedLeastSquaresModel) => {
39 | val eta = model.predict(instance.features)
40 | val mu = fitted(eta)
41 | val offset = eta + (instance.label - mu) * link.deriv(mu)
42 | val weight = instance.weight / (math.pow(this.link.deriv(mu), 2.0) * family.variance(mu))
43 | (offset, weight)
44 | }
45 | }
46 |
47 | def fitted(eta: Double): Double = family.project(link.unlink(eta))
48 | ```
49 | Here `model.predict` uses the weighted least squares model to predict a value for the sample, and `fitted` then computes the mean function $\mu$. `offset` is the
50 | updated label value and `weight` the updated weight. For the link-function computations, see the analysis of [generalized linear regression](../分类和回归/线性模型/广义线性回归/glr.md).
51 |
52 | One thing worth noting is that the way the labels and weights are updated in this code does not follow the principle described above exactly (or my understanding of it is mistaken).
53 |
54 | ### 2.2 Training a new model
55 |
56 | ```scala
57 | // train a new model on the reweighted samples
58 | model = new WeightedLeastSquares(fitIntercept, regParam, elasticNetParam = 0.0,
59 | standardizeFeatures = false, standardizeLabel = false).fit(newInstances)
60 |
61 | // check for convergence
62 | val oldCoefficients = oldModel.coefficients
63 | val coefficients = model.coefficients
64 | BLAS.axpy(-1.0, coefficients, oldCoefficients)
65 | val maxTolOfCoefficients = oldCoefficients.toArray.reduce { (x, y) =>
66 | math.max(math.abs(x), math.abs(y))
67 | }
68 | val maxTol = math.max(maxTolOfCoefficients, math.abs(oldModel.intercept - model.intercept))
69 | if (maxTol < tol) {
70 | converged = true
71 | }
72 | ```
73 | After the new model is trained, step 2.1 is repeated until the parameters converge or the maximum number of iterations is reached.
74 |
75 | ## 3 References
76 |
77 | 【1】[Iteratively reweighted least squares](https://en.wikipedia.org/wiki/Iteratively_reweighted_least_squares)
78 |
--------------------------------------------------------------------------------
/最优化算法/L-BFGS/docs/On the Limited Memory BFGS Method for Large Scale Optimization.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/L-BFGS/docs/On the Limited Memory BFGS Method for Large Scale Optimization.pdf
--------------------------------------------------------------------------------
/最优化算法/L-BFGS/docs/Updating Quasi-Newton Matrices with Limited Storage.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/L-BFGS/docs/Updating Quasi-Newton Matrices with Limited Storage.pdf
--------------------------------------------------------------------------------
/最优化算法/L-BFGS/imgs/1.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/L-BFGS/imgs/1.1.png
--------------------------------------------------------------------------------
/最优化算法/L-BFGS/imgs/1.2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/L-BFGS/imgs/1.2.png
--------------------------------------------------------------------------------
/最优化算法/L-BFGS/imgs/1.3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/L-BFGS/imgs/1.3.png
--------------------------------------------------------------------------------
/最优化算法/L-BFGS/imgs/1.4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/L-BFGS/imgs/1.4.png
--------------------------------------------------------------------------------
/最优化算法/L-BFGS/imgs/1.5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/L-BFGS/imgs/1.5.png
--------------------------------------------------------------------------------
/最优化算法/L-BFGS/imgs/2.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/L-BFGS/imgs/2.1.png
--------------------------------------------------------------------------------
/最优化算法/L-BFGS/imgs/2.10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/L-BFGS/imgs/2.10.png
--------------------------------------------------------------------------------
/最优化算法/L-BFGS/imgs/2.11.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/L-BFGS/imgs/2.11.png
--------------------------------------------------------------------------------
/最优化算法/L-BFGS/imgs/2.12.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/L-BFGS/imgs/2.12.png
--------------------------------------------------------------------------------
/最优化算法/L-BFGS/imgs/2.13.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/L-BFGS/imgs/2.13.png
--------------------------------------------------------------------------------
/最优化算法/L-BFGS/imgs/2.14.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/L-BFGS/imgs/2.14.png
--------------------------------------------------------------------------------
/最优化算法/L-BFGS/imgs/2.15.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/L-BFGS/imgs/2.15.png
--------------------------------------------------------------------------------
/最优化算法/L-BFGS/imgs/2.16.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/L-BFGS/imgs/2.16.png
--------------------------------------------------------------------------------
/最优化算法/L-BFGS/imgs/2.17.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/L-BFGS/imgs/2.17.png
--------------------------------------------------------------------------------
/最优化算法/L-BFGS/imgs/2.18.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/L-BFGS/imgs/2.18.png
--------------------------------------------------------------------------------
/最优化算法/L-BFGS/imgs/2.19.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/L-BFGS/imgs/2.19.png
--------------------------------------------------------------------------------
/最优化算法/L-BFGS/imgs/2.2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/L-BFGS/imgs/2.2.png
--------------------------------------------------------------------------------
/最优化算法/L-BFGS/imgs/2.20.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/L-BFGS/imgs/2.20.png
--------------------------------------------------------------------------------
/最优化算法/L-BFGS/imgs/2.21.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/L-BFGS/imgs/2.21.png
--------------------------------------------------------------------------------
/最优化算法/L-BFGS/imgs/2.22.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/L-BFGS/imgs/2.22.png
--------------------------------------------------------------------------------
/最优化算法/L-BFGS/imgs/2.23.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/L-BFGS/imgs/2.23.jpeg
--------------------------------------------------------------------------------
/最优化算法/L-BFGS/imgs/2.24.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/L-BFGS/imgs/2.24.png
--------------------------------------------------------------------------------
/最优化算法/L-BFGS/imgs/2.25.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/L-BFGS/imgs/2.25.png
--------------------------------------------------------------------------------
/最优化算法/L-BFGS/imgs/2.26.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/L-BFGS/imgs/2.26.png
--------------------------------------------------------------------------------
/最优化算法/L-BFGS/imgs/2.27.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/L-BFGS/imgs/2.27.png
--------------------------------------------------------------------------------
/最优化算法/L-BFGS/imgs/2.28.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/L-BFGS/imgs/2.28.png
--------------------------------------------------------------------------------
/最优化算法/L-BFGS/imgs/2.29.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/L-BFGS/imgs/2.29.png
--------------------------------------------------------------------------------
/最优化算法/L-BFGS/imgs/2.3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/L-BFGS/imgs/2.3.png
--------------------------------------------------------------------------------
/最优化算法/L-BFGS/imgs/2.30.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/L-BFGS/imgs/2.30.jpg
--------------------------------------------------------------------------------
/最优化算法/L-BFGS/imgs/2.31.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/L-BFGS/imgs/2.31.png
--------------------------------------------------------------------------------
/最优化算法/L-BFGS/imgs/2.32.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/L-BFGS/imgs/2.32.png
--------------------------------------------------------------------------------
/最优化算法/L-BFGS/imgs/2.33.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/L-BFGS/imgs/2.33.png
--------------------------------------------------------------------------------
/最优化算法/L-BFGS/imgs/2.34.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/L-BFGS/imgs/2.34.png
--------------------------------------------------------------------------------
/最优化算法/L-BFGS/imgs/2.35.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/L-BFGS/imgs/2.35.gif
--------------------------------------------------------------------------------
/最优化算法/L-BFGS/imgs/2.36.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/L-BFGS/imgs/2.36.gif
--------------------------------------------------------------------------------
/最优化算法/L-BFGS/imgs/2.37.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/L-BFGS/imgs/2.37.gif
--------------------------------------------------------------------------------
/最优化算法/L-BFGS/imgs/2.38.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/L-BFGS/imgs/2.38.gif
--------------------------------------------------------------------------------
/最优化算法/L-BFGS/imgs/2.39.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/L-BFGS/imgs/2.39.png
--------------------------------------------------------------------------------
/最优化算法/L-BFGS/imgs/2.4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/L-BFGS/imgs/2.4.png
--------------------------------------------------------------------------------
/最优化算法/L-BFGS/imgs/2.5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/L-BFGS/imgs/2.5.png
--------------------------------------------------------------------------------
/最优化算法/L-BFGS/imgs/2.6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/L-BFGS/imgs/2.6.png
--------------------------------------------------------------------------------
/最优化算法/L-BFGS/imgs/2.7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/L-BFGS/imgs/2.7.png
--------------------------------------------------------------------------------
/最优化算法/L-BFGS/imgs/2.8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/L-BFGS/imgs/2.8.png
--------------------------------------------------------------------------------
/最优化算法/L-BFGS/imgs/2.9.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/L-BFGS/imgs/2.9.png
--------------------------------------------------------------------------------
/最优化算法/梯度下降/gradient-descent.md:
--------------------------------------------------------------------------------
1 | # Gradient descent
2 |
3 | Gradient descent (`GD`) is a common method for minimizing a risk or loss function; stochastic gradient descent and batch gradient descent are two iterative ways of solving it.
4 |
5 | ## 1 Batch gradient descent
6 |
7 | Suppose `h(theta)` is the function to fit and `J(theta)` is the loss function, where `theta` is the value to solve for iteratively. The two functions are given below, where `m` is the number of training records and `j` is the number of parameters:
8 |
9 | 
10 |
11 | The goal of gradient descent is to find the `theta` that minimizes the loss function. Batch gradient descent proceeds as follows:
12 |
13 | - Take the partial derivative of the loss function with respect to `theta` to obtain the gradient for each `theta`:
14 |
15 | 
16 |
17 | - Update each parameter `theta` along the negative direction of its gradient:
18 |
19 | 
20 |
21 | As the formulas show, although batch gradient descent reaches a global optimum, every iteration step (i.e. every update of one of the `j` parameters `theta`) uses the entire training set; if `m` is large, iteration is very slow.
22 |
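A minimal sketch of these two steps for least squares with a single parameter (illustrative values, not the MLlib code):

```scala
// Batch gradient descent for h(theta) = theta * x: every iteration uses the whole training set.
val data = Seq((1.0, 2.1), (2.0, 3.9), (3.0, 6.2))   // (x, y) pairs, y is roughly 2 * x
var theta = 0.0
val alpha = 0.05                                      // step size
for (_ <- 0 until 200) {
  // gradient of (1 / 2m) * sum (theta * x - y)^2 with respect to theta
  val grad = data.map { case (x, y) => (theta * x - y) * x }.sum / data.size
  theta -= alpha * grad                               // move along the negative gradient
}
// theta converges to about 2.04, the least-squares slope for this data
```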
23 | ## 2 Stochastic gradient descent
24 |
25 | Stochastic gradient descent updates `theta` once per sample, which speeds up iteration considerably. The update formula for `theta` is shown below.
26 |
27 | 
28 |
29 | Batch gradient descent minimizes the loss function over all training samples, so the solution it finds is the global optimum, i.e. the parameters minimize the risk function. Stochastic gradient descent minimizes the loss of each individual sample;
30 | not every iteration moves the loss toward the global optimum, but the overall direction does, and the final result usually ends up near the global optimum.
31 |
32 | ## 3 Mini-batch stochastic gradient descent
33 |
34 | `MLlib` implements neither batch gradient descent nor stochastic gradient descent in their strict forms, but a combination of the two: in each iteration, neither the full data set nor a single sample is used; instead, a small batch of samples is drawn for the computation.
35 |
36 | 
37 |
38 | Let us analyze the implementation of this algorithm. First, the definition of `GradientDescent`:
39 |
40 | ```scala
41 | class GradientDescent private[spark] (private var gradient: Gradient, private var updater: Updater)
42 | extends Optimizer with Logging
43 | ```
44 | Here the `Gradient` class computes the gradient of the loss function at a given data point, and the `Updater` class performs the parameter update, i.e. it updates the `theta` above. The gradient descent algorithm itself is implemented in `runMiniBatchSGD`.
45 |
46 | ```scala
47 | def runMiniBatchSGD(
48 | data: RDD[(Double, Vector)],
49 | gradient: Gradient,
50 | updater: Updater,
51 | stepSize: Double,
52 | numIterations: Int,
53 | regParam: Double,
54 | miniBatchFraction: Double,
55 | initialWeights: Vector,
56 | convergenceTol: Double): (Vector, Array[Double])
57 | ```
58 | Here `stepSize` is the step size of the update, `regParam` is the regularization parameter, and `miniBatchFraction` is the sampling fraction. Each iteration consists of two steps.
59 |
60 | - 1 Sample and compute the gradient
61 |
62 | ```scala
63 | val (gradientSum, lossSum, miniBatchSize) = data.sample(false, miniBatchFraction, 42 + i)
64 | .treeAggregate((BDV.zeros[Double](n), 0.0, 0L))(
65 | seqOp = (c, v) => {
66 | // c: (grad, loss, count), v: (label, features)
67 | val l = gradient.compute(v._2, v._1, bcWeights.value, Vectors.fromBreeze(c._1))
68 | (c._1, c._2 + l, c._3 + 1)
69 | },
70 | combOp = (c1, c2) => {
71 | // c: (grad, loss, count)
72 | (c1._1 += c2._1, c1._2 + c2._2, c1._3 + c2._3)
73 | })
74 | ```
75 |
76 | Here `treeAggregate` is similar to `aggregate`, except that partial results are merged in a multi-level tree (two levels by default) instead of all local values being shipped back to the `driver` at once.
77 |
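A minimal `treeAggregate` sketch with the same accumulator shape reduced to scalars (assumes an existing `SparkContext` named `sc`):

```scala
// (sum, count) accumulator, mirroring the (gradientSum, lossSum, miniBatchSize) triple above.
val rdd = sc.parallelize(1 to 1000000, numSlices = 100)
val (sum, count) = rdd.treeAggregate((0.0, 0L))(
  seqOp = (acc, v) => (acc._1 + v, acc._2 + 1),    // fold each element into the partition-local accumulator
  combOp = (a, b) => (a._1 + b._1, a._2 + b._2),   // merge accumulators in a tree instead of all at the driver
  depth = 2)                                       // default tree depth
```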
78 | This step computes the parameter gradients according to the partial-derivative formula above; the result depends on the `h` function provided. The gradient implementations currently provided by `MLlib` are `HingeGradient`, `LeastSquaresGradient`, `LogisticGradient`, and
79 | `ANNGradient`. These classes are described in the chapters on the individual algorithms.
80 |
81 | - 2 Update the weight parameters
82 |
83 | ```scala
84 | val update = updater.compute(
85 | weights, Vectors.fromBreeze(gradientSum / miniBatchSize.toDouble),
86 | stepSize, i, regParam)
87 | weights = update._1
88 | regVal = update._2
89 | ```
90 | Once the gradient is known, the existing weight parameters can be updated with it. The `Updater`s currently provided by `MLlib` include `SquaredL2Updater`, `L1Updater`, and `SimpleUpdater`; these classes are described in the chapters on the individual algorithms.
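As a sketch of the simplest case (no regularization), an update in the spirit of `SimpleUpdater` scales the step size by the inverse square root of the iteration number (an illustration, not the MLlib code):

```scala
// newWeights = weights - (stepSize / sqrt(iter)) * gradient
def simpleUpdate(weights: Array[Double], gradient: Array[Double],
                 stepSize: Double, iter: Int): Array[Double] = {
  val thisIterStepSize = stepSize / math.sqrt(iter)
  weights.zip(gradient).map { case (w, g) => w - thisIterStepSize * g }
}
```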
91 |
92 |
93 | # References
94 |
95 | 【1】[随机梯度下降和批量梯度下降的公式对比、实现对比](http://blog.csdn.net/lilyth_lilyth/article/details/8973972)
96 |
97 |
--------------------------------------------------------------------------------
/最优化算法/梯度下降/imgs/1.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/梯度下降/imgs/1.1.png
--------------------------------------------------------------------------------
/最优化算法/梯度下降/imgs/1.2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/梯度下降/imgs/1.2.png
--------------------------------------------------------------------------------
/最优化算法/梯度下降/imgs/1.3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/梯度下降/imgs/1.3.png
--------------------------------------------------------------------------------
/最优化算法/梯度下降/imgs/1.4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/梯度下降/imgs/1.4.png
--------------------------------------------------------------------------------
/最优化算法/梯度下降/imgs/1.5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/梯度下降/imgs/1.5.png
--------------------------------------------------------------------------------
/最优化算法/非负最小二乘/imgs/NNLS.2.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/非负最小二乘/imgs/NNLS.2.1.png
--------------------------------------------------------------------------------
/最优化算法/非负最小二乘/imgs/math.1.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/非负最小二乘/imgs/math.1.1.png
--------------------------------------------------------------------------------
/最优化算法/非负最小二乘/imgs/math.1.10.append1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/非负最小二乘/imgs/math.1.10.append1.png
--------------------------------------------------------------------------------
/最优化算法/非负最小二乘/imgs/math.1.10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/非负最小二乘/imgs/math.1.10.png
--------------------------------------------------------------------------------
/最优化算法/非负最小二乘/imgs/math.1.11.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/非负最小二乘/imgs/math.1.11.png
--------------------------------------------------------------------------------
/最优化算法/非负最小二乘/imgs/math.1.12.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/非负最小二乘/imgs/math.1.12.png
--------------------------------------------------------------------------------
/最优化算法/非负最小二乘/imgs/math.1.13.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/非负最小二乘/imgs/math.1.13.png
--------------------------------------------------------------------------------
/最优化算法/非负最小二乘/imgs/math.1.14.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/非负最小二乘/imgs/math.1.14.png
--------------------------------------------------------------------------------
/最优化算法/非负最小二乘/imgs/math.1.15.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/非负最小二乘/imgs/math.1.15.png
--------------------------------------------------------------------------------
/最优化算法/非负最小二乘/imgs/math.1.16.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/非负最小二乘/imgs/math.1.16.png
--------------------------------------------------------------------------------
/最优化算法/非负最小二乘/imgs/math.1.17.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/非负最小二乘/imgs/math.1.17.png
--------------------------------------------------------------------------------
/最优化算法/非负最小二乘/imgs/math.1.2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/非负最小二乘/imgs/math.1.2.png
--------------------------------------------------------------------------------
/最优化算法/非负最小二乘/imgs/math.1.3.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/非负最小二乘/imgs/math.1.3.1.png
--------------------------------------------------------------------------------
/最优化算法/非负最小二乘/imgs/math.1.3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/非负最小二乘/imgs/math.1.3.png
--------------------------------------------------------------------------------
/最优化算法/非负最小二乘/imgs/math.1.4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/非负最小二乘/imgs/math.1.4.png
--------------------------------------------------------------------------------
/最优化算法/非负最小二乘/imgs/math.1.5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/非负最小二乘/imgs/math.1.5.png
--------------------------------------------------------------------------------
/最优化算法/非负最小二乘/imgs/math.1.6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/非负最小二乘/imgs/math.1.6.png
--------------------------------------------------------------------------------
/最优化算法/非负最小二乘/imgs/math.1.7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/非负最小二乘/imgs/math.1.7.png
--------------------------------------------------------------------------------
/最优化算法/非负最小二乘/imgs/math.1.8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/非负最小二乘/imgs/math.1.8.png
--------------------------------------------------------------------------------
/最优化算法/非负最小二乘/imgs/math.1.9.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/非负最小二乘/imgs/math.1.9.png
--------------------------------------------------------------------------------
/最优化算法/非负最小二乘/imgs/math.2.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/非负最小二乘/imgs/math.2.1.png
--------------------------------------------------------------------------------
/最优化算法/非负最小二乘/imgs/math.2.10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/非负最小二乘/imgs/math.2.10.png
--------------------------------------------------------------------------------
/最优化算法/非负最小二乘/imgs/math.2.11.a1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/非负最小二乘/imgs/math.2.11.a1.png
--------------------------------------------------------------------------------
/最优化算法/非负最小二乘/imgs/math.2.11.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/非负最小二乘/imgs/math.2.11.png
--------------------------------------------------------------------------------
/最优化算法/非负最小二乘/imgs/math.2.12.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/非负最小二乘/imgs/math.2.12.png
--------------------------------------------------------------------------------
/最优化算法/非负最小二乘/imgs/math.2.2.a1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/非负最小二乘/imgs/math.2.2.a1.png
--------------------------------------------------------------------------------
/最优化算法/非负最小二乘/imgs/math.2.2.a2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/非负最小二乘/imgs/math.2.2.a2.png
--------------------------------------------------------------------------------
/最优化算法/非负最小二乘/imgs/math.2.2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/非负最小二乘/imgs/math.2.2.png
--------------------------------------------------------------------------------
/最优化算法/非负最小二乘/imgs/math.2.3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/非负最小二乘/imgs/math.2.3.png
--------------------------------------------------------------------------------
/最优化算法/非负最小二乘/imgs/math.2.4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/非负最小二乘/imgs/math.2.4.png
--------------------------------------------------------------------------------
/最优化算法/非负最小二乘/imgs/math.2.5.a1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/非负最小二乘/imgs/math.2.5.a1.png
--------------------------------------------------------------------------------
/最优化算法/非负最小二乘/imgs/math.2.5.a2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/非负最小二乘/imgs/math.2.5.a2.png
--------------------------------------------------------------------------------
/最优化算法/非负最小二乘/imgs/math.2.5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/非负最小二乘/imgs/math.2.5.png
--------------------------------------------------------------------------------
/最优化算法/非负最小二乘/imgs/math.2.6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/非负最小二乘/imgs/math.2.6.png
--------------------------------------------------------------------------------
/最优化算法/非负最小二乘/imgs/math.2.7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/非负最小二乘/imgs/math.2.7.png
--------------------------------------------------------------------------------
/最优化算法/非负最小二乘/imgs/math.2.8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/非负最小二乘/imgs/math.2.8.png
--------------------------------------------------------------------------------
/最优化算法/非负最小二乘/imgs/math.2.9.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/非负最小二乘/imgs/math.2.9.png
--------------------------------------------------------------------------------
/最优化算法/非负最小二乘/imgs/math.3.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/非负最小二乘/imgs/math.3.1.png
--------------------------------------------------------------------------------
/最优化算法/非负最小二乘/imgs/math.3.2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/非负最小二乘/imgs/math.3.2.png
--------------------------------------------------------------------------------
/最优化算法/非负最小二乘/papers/The conjugate gradient method in extreme problems.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/最优化算法/非负最小二乘/papers/The conjugate gradient method in extreme problems.pdf
--------------------------------------------------------------------------------
/特征抽取和转换/Binarizer.md:
--------------------------------------------------------------------------------
1 | # Binarizer
2 |
3 | `Binarization`是一个将数值特征转换为二值特征的处理过程。`threshold`参数指定二值化使用的阈值。
4 | 值大于阈值的特征被二值化为1.0,小于等于阈值的特征被二值化为0.0。下面是代码调用的例子。
5 |
6 | ```scala
7 | import org.apache.spark.ml.feature.Binarizer
8 |
9 | val data = Array((0, 0.1), (1, 0.8), (2, 0.2))
10 | val dataFrame = spark.createDataFrame(data).toDF("label", "feature")
11 |
12 | val binarizer: Binarizer = new Binarizer()
13 | .setInputCol("feature")
14 | .setOutputCol("binarized_feature")
15 | .setThreshold(0.5)
16 |
17 | val binarizedDataFrame = binarizer.transform(dataFrame)
18 | val binarizedFeatures = binarizedDataFrame.select("binarized_feature")
19 | binarizedFeatures.collect().foreach(println)
20 | ```
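按照上面对`threshold`的规则手工推算(仅作示意,实际输出的格式以运行结果为准),阈值为0.5时,三个特征值的二值化结果如下:

```
label | feature | binarized_feature
------|---------|-------------------
0     | 0.1     | 0.0
1     | 0.8     | 1.0
2     | 0.2     | 0.0
```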
--------------------------------------------------------------------------------
/特征抽取和转换/Bucketizer.md:
--------------------------------------------------------------------------------
1 | # Bucketizer
2 |
3 | `Bucketizer`将连续的特征列转换成特征桶(`buckets`)列。这些桶由用户指定。它拥有一个`splits`参数。
4 |
5 | - `splits`:如果有`n+1`个`split`,那么将有`n`个桶。每个桶由相邻的两个`split`值`x`和`y`确定,取值范围为`[x,y)`,只有最后
6 | 一个桶的范围是`[x,y]`。`splits`必须严格递增。若要覆盖所有的双精度值,必须显式地提供负无穷和正无穷作为边界;否则,超出`splits`范围的值将被视为错误。
7 | `splits`的两个例子是`Array(Double.NegativeInfinity, 0.0, 1.0, Double.PositiveInfinity)` 和 `Array(0.0, 1.0, 2.0)`。
8 |
9 | 注意,如果你并不知道目标列的上界和下界,你应该添加`Double.NegativeInfinity`和`Double.PositiveInfinity`作为边界从而防止潜在的
10 | 超过边界的异常。下面是程序调用的例子。
11 |
12 | ```scala
13 | import org.apache.spark.ml.feature.Bucketizer
14 |
15 | val splits = Array(Double.NegativeInfinity, -0.5, 0.0, 0.5, Double.PositiveInfinity)
16 |
17 | val data = Array(-0.5, -0.3, 0.0, 0.2)
18 | val dataFrame = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features")
19 |
20 | val bucketizer = new Bucketizer()
21 | .setInputCol("features")
22 | .setOutputCol("bucketedFeatures")
23 | .setSplits(splits)
24 |
25 | // Transform original data into its bucket index.
26 | val bucketedData = bucketizer.transform(dataFrame)
27 | bucketedData.show()
28 | ```
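按照上面对`splits`的定义手工推算(仅作示意),例子中的`splits`把实数轴划分为4个桶,下标依次为0到3,因此各样本的分桶结果应为:

```
features | bucketedFeatures
---------|------------------
-0.5     | 1.0
-0.3     | 1.0
 0.0     | 2.0
 0.2     | 2.0
```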
--------------------------------------------------------------------------------
/特征抽取和转换/CountVectorizer.md:
--------------------------------------------------------------------------------
1 | # CountVectorizer
2 |
3 | `CountVectorizer`和`CountVectorizerModel`的目的是帮助我们将文本文档集转换为词频(`token counts`)向量。
4 | 当事先没有可用的词典时,`CountVectorizer`可以被当做一个`Estimator`去抽取词汇,并且生成`CountVectorizerModel`。
5 | 这个模型通过词汇集为文档生成一个稀疏的表示,这个表示可以作为其它算法的输入,比如`LDA`。
6 | 在训练的过程中,`CountVectorizer`会选取语料中按词频排序的前`vocabSize`个词来构建词典。一个可选的参数`minDF`也
7 | 会影响训练过程。这个参数表示一个词必须至少出现在多少个文档中才能被收入词典(如果该参数小于1,则表示文档比例)。另外一个可选的`boolean`参数控制着输出向量:
8 | 如果将它设置为`true`,那么所有的非0词频都会置为1,这对离散概率模型非常有用(见下面的小例子)。
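文中提到的这个`boolean`参数在`ml`包的`CountVectorizer`中对应`binary`参数。下面是一个最小示意(假设所用版本提供`setBinary`方法,列名沿用后文例子中的`words`和`features`):

```scala
import org.apache.spark.ml.feature.CountVectorizer

// 将所有非0词频置为1,只保留"词是否出现"的信息
val binaryCv = new CountVectorizer()
  .setInputCol("words")
  .setOutputCol("features")
  .setBinary(true)
```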
9 |
10 | ## 举例
11 |
12 | 假设我们有下面的`DataFrame`,它的列名分别是`id`和`texts`.
13 |
14 | ```
15 | id | texts
16 | ----|-------------------------------
17 | 0 | Array("a", "b", "c")
18 | 1 | Array("a", "b", "b", "c", "a")
19 | ```
20 |
21 | `texts`列的每一行表示一个类型为`Array[String]`的文档。`CountVectorizer`生成了一个带有词典`(a, b, c)`的`CountVectorizerModel`。
22 | 经过转换之后,输出的列为`vector`。
23 |
24 | ```
25 | id | texts | vector
26 | ----|---------------------------------|---------------
27 | 0 | Array("a", "b", "c") | (3,[0,1,2],[1.0,1.0,1.0])
28 | 1 | Array("a", "b", "b", "c", "a") | (3,[0,1,2],[2.0,2.0,1.0])
29 | ```
30 | 下面是代码调用的方法。
31 |
32 | ```scala
33 | import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel}
34 |
35 | val df = spark.createDataFrame(Seq(
36 | (0, Array("a", "b", "c")),
37 | (1, Array("a", "b", "b", "c", "a"))
38 | )).toDF("id", "words")
39 |
40 | // fit a CountVectorizerModel from the corpus
41 | val cvModel: CountVectorizerModel = new CountVectorizer()
42 | .setInputCol("words")
43 | .setOutputCol("features")
44 | .setVocabSize(3)
45 | .setMinDF(2)
46 | .fit(df)
47 |
48 | // alternatively, define CountVectorizerModel with a-priori vocabulary
49 | val cvm = new CountVectorizerModel(Array("a", "b", "c"))
50 | .setInputCol("words")
51 | .setOutputCol("features")
52 |
53 | cvModel.transform(df).select("features").show()
54 | ```
--------------------------------------------------------------------------------
/特征抽取和转换/DCT.md:
--------------------------------------------------------------------------------
1 | # Discrete Cosine Transform (DCT)
2 |
3 | [Discrete Cosine Transform](https://en.wikipedia.org/wiki/Discrete_cosine_transform)将一个在时间域(`time domain`)内长度为`N`的实值序列转换为另外一个
4 | 在频率域(`frequency domain`)内的长度为`N`的实值序列。下面是程序调用的例子。
5 |
6 | ```scala
7 | import org.apache.spark.ml.feature.DCT
8 | import org.apache.spark.ml.linalg.Vectors
9 |
10 | val data = Seq(
11 | Vectors.dense(0.0, 1.0, -2.0, 3.0),
12 | Vectors.dense(-1.0, 2.0, 4.0, -7.0),
13 | Vectors.dense(14.0, -2.0, -5.0, 1.0))
14 |
15 | val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features")
16 |
17 | val dct = new DCT()
18 | .setInputCol("features")
19 | .setOutputCol("featuresDCT")
20 | .setInverse(false)
21 |
22 | val dctDf = dct.transform(df)
23 | dctDf.select("featuresDCT").show(3)
24 | ```
--------------------------------------------------------------------------------
/特征抽取和转换/IndexToString.md:
--------------------------------------------------------------------------------
1 | # IndexToString
2 |
3 | 与`StringIndexer`相对的是,`IndexToString`将标签索引列映射回原来的字符串标签。一个通用的使用案例是使用
4 | `StringIndexer`将标签转换为索引,然后通过索引训练模型,最后通过`IndexToString`将预测的标签索引恢复成字符串标签。
5 |
6 | ## 例子
7 |
8 | 假设我们有下面的`DataFrame`,它的列名为`id`和`categoryIndex`。
9 |
10 | ```
11 | id | categoryIndex
12 | ----|---------------
13 | 0 | 0.0
14 | 1 | 2.0
15 | 2 | 1.0
16 | 3 | 0.0
17 | 4 | 0.0
18 | 5 | 1.0
19 | ```
20 | 把`categoryIndex`作为输入列,`originalCategory`作为输出列,使用`IndexToString`我们可以恢复原来的标签。
21 |
22 | ```
23 | id | categoryIndex | originalCategory
24 | ----|---------------|-----------------
25 | 0 | 0.0 | a
26 | 1 | 2.0 | b
27 | 2 | 1.0 | c
28 | 3 | 0.0 | a
29 | 4 | 0.0 | a
30 | 5 | 1.0 | c
31 | ```
32 | 下面是程序调用的例子。
33 |
34 | ```scala
35 | import org.apache.spark.ml.feature.{IndexToString, StringIndexer}
36 |
37 | val df = spark.createDataFrame(Seq(
38 | (0, "a"),
39 | (1, "b"),
40 | (2, "c"),
41 | (3, "a"),
42 | (4, "a"),
43 | (5, "c")
44 | )).toDF("id", "category")
45 |
46 | val indexer = new StringIndexer()
47 | .setInputCol("category")
48 | .setOutputCol("categoryIndex")
49 | .fit(df)
50 | val indexed = indexer.transform(df)
51 |
52 | val converter = new IndexToString()
53 | .setInputCol("categoryIndex")
54 | .setOutputCol("originalCategory")
55 |
56 | val converted = converter.transform(indexed)
57 | converted.select("id", "originalCategory").show()
58 | ```
--------------------------------------------------------------------------------
/特征抽取和转换/MaxAbsScaler.md:
--------------------------------------------------------------------------------
1 | # MaxAbsScaler
2 |
3 | `MaxAbsScaler`转换由向量列组成的数据集,将每个特征的取值除以该特征的最大绝对值,从而把取值调整到`[-1,1]`的范围。
4 | 它不会对数据进行平移或中心化,因此不会破坏任何稀疏性。
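用公式表示上面的缩放规则(符号是本文补充的记号):对特征$E$中的取值$e_{i}$,

$$Rescaled(e_{i}) = \frac{e_{i}}{\max_{i}|e_{i}|}$$

由于只做除法而不做平移,0值仍然是0,因此稀疏性得以保留。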
5 |
6 | `MaxAbsScaler`计算数据集上的统计数据,生成`MaxAbsScalerModel`,然后使用生成的模型将每个特征分别缩放到范围`[-1,1]`。下面是程序调用的例子。
7 |
8 | ```scala
9 | import org.apache.spark.ml.feature.MaxAbsScaler
10 |
11 | val dataFrame = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
12 | val scaler = new MaxAbsScaler()
13 | .setInputCol("features")
14 | .setOutputCol("scaledFeatures")
15 |
16 | // Compute summary statistics and generate MaxAbsScalerModel
17 | val scalerModel = scaler.fit(dataFrame)
18 |
19 | // rescale each feature to range [-1, 1]
20 | val scaledData = scalerModel.transform(dataFrame)
21 | scaledData.show()
22 | ```
--------------------------------------------------------------------------------
/特征抽取和转换/MinMaxScaler.md:
--------------------------------------------------------------------------------
1 | # MinMaxScaler
2 |
3 | `MinMaxScaler`转换由向量行组成的数据集,将每个特征调整到一个特定的范围(通常是`[0,1]`)。它有下面两个参数:
4 |
5 | - `min`:默认是0。转换的下界,被所有的特征共享。
6 | - `max`:默认是1。转换的上界,被所有特征共享。
7 |
8 | `MinMaxScaler`计算数据集上的概要统计数据,产生一个`MinMaxScalerModel`。然后就可以用这个模型分别将每个特征缩放到特定的范围。
9 | 特征`E`被转换后的值可以用下面的公式计算:
10 |
11 | $$\frac{e_{i} - E_{min}}{E_{max} - E_{min}} * (max - min) + min$$
12 |
13 | 对于`E_{max} == E_{min}`的情况,`Rescaled(e_i) = 0.5 * (max + min)`。
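举一个简单的数值例子(数字为本文补充的示意):若某特征在数据集中的$E_{min}=2$、$E_{max}=6$,目标范围取默认的$min=0$、$max=1$,则$e_{i}=4$被缩放为$\frac{4-2}{6-2} * (1 - 0) + 0 = 0.5$。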
14 |
15 | 注意,由于0值有可能转换成非0的值,所以转换的输出为`DenseVector`,即使输入为稀疏的数据也一样。下面的例子展示了如何将特征转换到`[0,1]`。
16 |
17 | ```scala
18 | import org.apache.spark.ml.feature.MinMaxScaler
19 |
20 | val dataFrame = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
21 |
22 | val scaler = new MinMaxScaler()
23 | .setInputCol("features")
24 | .setOutputCol("scaledFeatures")
25 |
26 | // Compute summary statistics and generate MinMaxScalerModel
27 | val scalerModel = scaler.fit(dataFrame)
28 |
29 | // rescale each feature to range [min, max].
30 | val scaledData = scalerModel.transform(dataFrame)
31 | scaledData.show()
32 | ```
--------------------------------------------------------------------------------
/特征抽取和转换/OneHotEncoder.md:
--------------------------------------------------------------------------------
1 | # OneHotEncoder
2 |
3 | [One-hot encoding](http://en.wikipedia.org/wiki/One-hot)将标签索引列映射为二值向量,这个向量至多只有一个值为1。
4 | 这种编码使得期望连续型特征的算法(如逻辑回归)也能够使用类别特征。下面是程序调用的例子。
5 |
6 | ```scala
7 | import org.apache.spark.ml.feature.{OneHotEncoder, StringIndexer}
8 |
9 | val df = spark.createDataFrame(Seq(
10 | (0, "a"),
11 | (1, "b"),
12 | (2, "c"),
13 | (3, "a"),
14 | (4, "a"),
15 | (5, "c")
16 | )).toDF("id", "category")
17 |
18 | val indexer = new StringIndexer()
19 | .setInputCol("category")
20 | .setOutputCol("categoryIndex")
21 | .fit(df)
22 | val indexed = indexer.transform(df)
23 |
24 | val encoder = new OneHotEncoder()
25 | .setInputCol("categoryIndex")
26 | .setOutputCol("categoryVec")
27 | val encoded = encoder.transform(indexed)
28 | encoded.select("id", "categoryVec").show()
29 | ```
--------------------------------------------------------------------------------
/特征抽取和转换/PolynomialExpansion.md:
--------------------------------------------------------------------------------
1 | # PolynomialExpansion(多项式展开)
2 |
3 | [Polynomial expansion](http://en.wikipedia.org/wiki/Polynomial_expansion)是一个将特征展开到多项式空间的处理过程,多项式空间由原始维度的`n-degree`组合构成。
4 | 比如设置`degree`为2,就可以将`(x, y)`展开为`(x, x*x, y, x*y, y*y)`。`PolynomialExpansion`提供了这个功能。
5 | 下面的例子展示了如何将特征展开为一个`3-degree`多项式空间。
6 |
7 | ```scala
8 | import org.apache.spark.ml.feature.PolynomialExpansion
9 | import org.apache.spark.ml.linalg.Vectors
10 |
11 | val data = Array(
12 | Vectors.dense(-2.0, 2.3),
13 | Vectors.dense(0.0, 0.0),
14 | Vectors.dense(0.6, -1.1)
15 | )
16 | val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features")
17 | val polynomialExpansion = new PolynomialExpansion()
18 | .setInputCol("features")
19 | .setOutputCol("polyFeatures")
20 | .setDegree(3)
21 | val polyDF = polynomialExpansion.transform(df)
22 | polyDF.select("polyFeatures").take(3).foreach(println)
23 | ```
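作为补充说明(按多项式展开的一般定义推算,具体的输出顺序以实际运行结果为准):对二维输入`(x, y)`做`3-degree`展开,会得到所有满足`1 <= i+j <= 3`的单项式`x^i * y^j`,即`x, x*x, x*x*x, y, x*y, x*x*y, y*y, x*y*y, y*y*y`共9个特征。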
--------------------------------------------------------------------------------
/特征抽取和转换/QuantileDiscretizer.md:
--------------------------------------------------------------------------------
1 | # QuantileDiscretizer
2 |
3 | `QuantileDiscretizer`输入连续的特征列,输出分箱的类别特征。分箱数是通过参数`numBuckets`来指定的。
4 | 箱的范围是通过使用近似算法(见[approxQuantile ](http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.sql.DataFrameStatFunctions))来得到的。
5 | 近似的精度可以通过`relativeError`参数来控制。当这个参数设置为0时,将会计算精确的分位数。分箱的下边界和上边界分别是负无穷和正无穷,
6 | 因此可以覆盖所有的实数值。
7 |
8 | ## 例子
9 |
10 | 假设我们有下面的`DataFrame`,它的列名是`id,hour`。
11 |
12 | ```
13 | id | hour
14 | ----|------
15 | 0 | 18.0
16 | ----|------
17 | 1 | 19.0
18 | ----|------
19 | 2 | 8.0
20 | ----|------
21 | 3 | 5.0
22 | ----|------
23 | 4 | 2.2
24 | ```
25 |
26 | `hour`是类型为`DoubleType`的连续特征。我们想将连续特征转换为一个分类特征。给定`numBuckets`为3,我们可以得到下面的结果。
27 |
28 | ```
29 | id | hour | result
30 | ----|------|------
31 | 0 | 18.0 | 2.0
32 | ----|------|------
33 | 1 | 19.0 | 2.0
34 | ----|------|------
35 | 2 | 8.0 | 1.0
36 | ----|------|------
37 | 3 | 5.0 | 1.0
38 | ----|------|------
39 | 4 | 2.2 | 0.0
40 | ```
41 | 下面是代码实现的例子。
42 |
43 | ```scala
44 | import org.apache.spark.ml.feature.QuantileDiscretizer
45 |
46 | val data = Array((0, 18.0), (1, 19.0), (2, 8.0), (3, 5.0), (4, 2.2))
47 | var df = spark.createDataFrame(data).toDF("id", "hour")
48 |
49 | val discretizer = new QuantileDiscretizer()
50 | .setInputCol("hour")
51 | .setOutputCol("result")
52 | .setNumBuckets(3)
53 |
54 | val result = discretizer.fit(df).transform(df)
55 | result.show()
56 | ```
57 |
--------------------------------------------------------------------------------
/特征抽取和转换/RFormula.md:
--------------------------------------------------------------------------------
1 | # RFormula
2 |
3 | `RFormula`根据一个[R model formula](https://stat.ethz.ch/R-manual/R-devel/library/stats/html/formula.html)来选择列。
4 | 目前我们支持`R`算子的一个受限子集,包括`~`,`.`,`:`,`+`,`-`。这些基本算子的含义是:
5 |
6 | - `~` 分开`target`和`terms`
7 | - `+` 连接`term`,`+ 0`表示删除截距(`intercept`)
8 | - `-` 删除`term`,`- 1`表示删除截距
9 | - `:` 交互(`interaction`,对数值列做乘积)
10 | - `.` 除了`target`之外的所有列
11 |
12 | 假设`a`和`b`是`double`列,我们用下面两个简单的例子来说明`RFormula`算子的作用。
13 |
14 | - `y ~ a + b` 表示模型 `y ~ w0 + w1 * a + w2 * b`,其中`w0`是截距,`w1`和`w2`是系数
15 | - `y ~ a + b + a:b - 1`表示模型`y ~ w1 * a + w2 * b + w3 * a * b`,其中`w1`,`w2`,`w3`是系数
16 |
17 | `RFormula`产生一个特征向量列和一个`double`或`string`类型的标签列。比如在线性回归中使用`R`中的公式时,
18 | 字符串输入列会进行`one-hot`编码,数值列会强制转换为`double`类型。如果标签列是字符串类型,它将先用`StringIndexer`转换为`double`
19 | 类型。如果`DataFrame`中不存在标签列,则会根据公式中指定的响应变量来创建输出的标签列。
20 |
21 | ## 例子
22 |
23 | 假设我们有一个`DataFrame`,它的列名是`id`, `country`, `hour`和`clicked`。
24 |
25 | ```
26 | id | country | hour | clicked
27 | ---|---------|------|---------
28 | 7 | "US" | 18 | 1.0
29 | 8 | "CA" | 12 | 0.0
30 | 9 | "NZ" | 15 | 0.0
31 | ```
32 | 如果我们用`clicked ~ country + hour`(基于`country`和`hour`来预测`clicked`)来作用于`RFormula`,将会得到下面的结果。
33 |
34 | ```
35 | id | country | hour | clicked | features | label
36 | ---|---------|------|---------|------------------|-------
37 | 7 | "US" | 18 | 1.0 | [0.0, 0.0, 18.0] | 1.0
38 | 8 | "CA" | 12 | 0.0 | [0.0, 1.0, 12.0] | 0.0
39 | 9 | "NZ" | 15 | 0.0 | [1.0, 0.0, 15.0] | 0.0
40 | ```
41 | 下面是代码调用的例子。
42 |
43 | ```scala
44 | import org.apache.spark.ml.feature.RFormula
45 |
46 | val dataset = spark.createDataFrame(Seq(
47 | (7, "US", 18, 1.0),
48 | (8, "CA", 12, 0.0),
49 | (9, "NZ", 15, 0.0)
50 | )).toDF("id", "country", "hour", "clicked")
51 | val formula = new RFormula()
52 | .setFormula("clicked ~ country + hour")
53 | .setFeaturesCol("features")
54 | .setLabelCol("label")
55 | val output = formula.fit(dataset).transform(dataset)
56 | output.select("features", "label").show()
57 | ```
--------------------------------------------------------------------------------
/特征抽取和转换/SQLTransformer.md:
--------------------------------------------------------------------------------
1 | # SQLTransformer
2 |
3 | `SQLTransformer`实现了一种由`SQL`语句定义的转换。目前仅支持形如`SELECT ... FROM __THIS__ ...`的`SQL`语法,
4 | 这里`__THIS__`表示输入数据集对应的表。例如,`SQLTransformer`支持的语句如下:
5 |
6 | - `SELECT a, a + b AS a_b FROM __THIS__`
7 | - `SELECT a, SQRT(b) AS b_sqrt FROM __THIS__ where a > 5`
8 | - `SELECT a, b, SUM(c) AS c_sum FROM __THIS__ GROUP BY a, b`
9 |
10 | ## 例子
11 |
12 | 假设我们拥有下面的`DataFrame`,它的列名是`id,v1,v2`。
13 |
14 | ```
15 | id | v1 | v2
16 | ----|-----|-----
17 | 0 | 1.0 | 3.0
18 | 2 | 2.0 | 5.0
19 | ```
20 | 下面是语句`SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__`的输出结果。
21 |
22 | ```
23 | id | v1 | v2 | v3 | v4
24 | ----|-----|-----|-----|-----
25 | 0 | 1.0 | 3.0 | 4.0 | 3.0
26 | 2 | 2.0 | 5.0 | 7.0 |10.0
27 | ```
28 | 下面是程序调用的例子。
29 |
30 | ```scala
31 | import org.apache.spark.ml.feature.SQLTransformer
32 |
33 | val df = spark.createDataFrame(
34 | Seq((0, 1.0, 3.0), (2, 2.0, 5.0))).toDF("id", "v1", "v2")
35 |
36 | val sqlTrans = new SQLTransformer().setStatement(
37 | "SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__")
38 |
39 | sqlTrans.transform(df).show()
40 | ```
41 |
--------------------------------------------------------------------------------
/特征抽取和转换/StandardScaler.md:
--------------------------------------------------------------------------------
1 | # 特征缩放
2 |
3 | 特征缩放是用来统一资料中的自变项或特征范围的方法,在资料处理中,通常会被使用在资料前处理这个步骤。
4 |
5 | ## 1 动机
6 |
7 | 因为在原始的资料中,各变数的范围大不相同。对于某些机器学习的算法,若没有做过标准化,目标函数会无法适当的运作。举例来说,多数的分类器利用两点间的距离计算两点的差异,
8 | 若其中一个特征具有非常广的范围,那两点间的差异就会被该特征左右,因此,所有的特征都该被标准化,这样才能大略的使各特征依比例影响距离。另外一个做特征缩放的理由是它能加速梯度下降法的收敛。
9 |
10 | ## 2 方法
11 |
12 | ## 2.1 重新缩放
13 |
14 | 最简单的方式是重新缩放特征的范围到`[0, 1]`或`[-1, 1]`, 依据原始的资料选择目标范围,通式如下:
15 |
16 | 
17 |
18 | ## 2.2 标准化
19 |
20 | 在机器学习中,我们可能要处理不同种类的资料,例如,音讯和图片上的像素值,这些资料可能是高维度的,资料标准化后会使每个特征中的数值平均变为0(将每个特征的值都减掉原始资料中该特征的平均)、标准差变为1,这个方法被广泛的使用在许多机器学习算法中。
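用公式表示(记号为本文补充):设特征$x$在原始资料中的平均值为$\mu$、标准差为$\sigma$,标准化后的取值为

$$x' = \frac{x - \mu}{\sigma}$$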
21 |
22 | ## 3 实例
23 |
24 | ```scala
25 | import org.apache.spark.SparkContext._
26 | import org.apache.spark.mllib.feature.StandardScaler
27 | import org.apache.spark.mllib.linalg.Vectors
28 | import org.apache.spark.mllib.util.MLUtils
29 | val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
30 |
31 | val scaler1 = new StandardScaler().fit(data.map(x => x.features))
32 | val scaler2 = new StandardScaler(withMean = true, withStd = true).fit(data.map(x => x.features))
33 | // scaler3 is an identical model to scaler2, and will produce identical transformations
34 | val scaler3 = new StandardScalerModel(scaler2.std, scaler2.mean)
35 | // data1 will be unit variance.
36 | val data1 = data.map(x => (x.label, scaler1.transform(x.features)))
37 | // Without converting the features into dense vectors, transformation with zero mean will raise
38 | // exception on sparse vector.
39 | // data2 will be unit variance and zero mean.
40 | val data2 = data.map(x => (x.label, scaler2.transform(Vectors.dense(x.features.toArray))))
41 | ```
42 |
43 | ## 4 源代码实现
44 |
45 | 在`MLlib`中,`StandardScaler`类用于标准化特征。
46 |
47 | ```scala
48 | class StandardScaler @Since("1.1.0") (withMean: Boolean, withStd: Boolean)
49 | ```
50 | `StandardScaler`的实现中提供了两个参数`withMean`和`withStd`。在介绍这两个参数之前,我们先了解`fit`方法的实现。
51 |
52 | ```scala
53 | def fit(data: RDD[Vector]): StandardScalerModel = {
54 | // TODO: skip computation if both withMean and withStd are false
55 | val summary = data.treeAggregate(new MultivariateOnlineSummarizer)(
56 | (aggregator, data) => aggregator.add(data),
57 | (aggregator1, aggregator2) => aggregator1.merge(aggregator2))
58 | new StandardScalerModel(
59 | Vectors.dense(summary.variance.toArray.map(v => math.sqrt(v))),
60 | summary.mean,
61 | withStd,
62 | withMean)
63 | }
64 | ```
65 | 该方法计算数据集的均值和方差(查看[概括统计](../基本统计/summary-statistics.md)以了解更多信息),并用均值和标准差(即方差的平方根)初始化`StandardScalerModel`。初始化`StandardScalerModel`之后,我们就可以调用`transform`方法转换特征了。
66 |
67 | 当`withMean`参数为`true`时,`transform`的实现如下。
68 |
69 | ```scala
70 | private lazy val shift: Array[Double] = mean.toArray
71 | val localShift = shift
72 | vector match {
73 | case DenseVector(vs) =>
74 | val values = vs.clone()
75 | val size = values.size
76 | if (withStd) {
77 | var i = 0
78 | while (i < size) {
79 | values(i) = if (std(i) != 0.0) (values(i) - localShift(i)) * (1.0 / std(i)) else 0.0
80 | i += 1
81 | }
82 | } else {
83 | var i = 0
84 | while (i < size) {
85 | values(i) -= localShift(i)
86 | i += 1
87 | }
88 | }
89 | Vectors.dense(values)
90 | case v => throw new IllegalArgumentException("Do not support vector type " + v.getClass)
91 | }
92 | ```
93 | 以上代码显示,当`withMean`为`true`,`withStd`为`false`时,向量中的各元素均减去它相应的均值。当`withMean`和`withStd`均为`true`时,各元素在减去相应的均值之后,还要除以它们相应的标准差。
94 | 当`withMean`为`true`时,程序只能处理稠密的向量,不能处理稀疏向量。
95 |
96 | 当`withMean`为`false`时,`transform`的实现如下。
97 |
98 | ```scala
99 | vector match {
100 | case DenseVector(vs) =>
101 | val values = vs.clone()
102 | val size = values.size
103 | var i = 0
104 | while(i < size) {
105 | values(i) *= (if (std(i) != 0.0) 1.0 / std(i) else 0.0)
106 | i += 1
107 | }
108 | Vectors.dense(values)
109 | case SparseVector(size, indices, vs) =>
110 | // For sparse vector, the `index` array inside sparse vector object will not be changed,
111 | // so we can re-use it to save memory.
112 | val values = vs.clone()
113 | val nnz = values.size
114 | var i = 0
115 | while (i < nnz) {
116 | values(i) *= (if (std(indices(i)) != 0.0) 1.0 / std(indices(i)) else 0.0)
117 | i += 1
118 | }
119 | Vectors.sparse(size, indices, values)
120 | case v => throw new IllegalArgumentException("Do not support vector type " + v.getClass)
121 | }
122 | ```
123 |
124 | 这里的处理很简单,就是将数据集的列的标准差归一化为1。
125 |
126 | # 参考文献
127 |
128 | 【1】[特征缩放](https://zh.wikipedia.org/wiki/%E7%89%B9%E5%BE%B5%E7%B8%AE%E6%94%BE)
129 |
130 |
131 |
132 |
--------------------------------------------------------------------------------
/特征抽取和转换/StopWordsRemover.md:
--------------------------------------------------------------------------------
1 | # StopWordsRemover
2 |
3 | [Stop words](https://en.wikipedia.org/wiki/Stop_words)是那些需要从输入数据中排除掉的词。删除这些词的原因是,
4 | 这些词出现频繁,并没有携带太多有意义的信息。
5 |
6 | `StopWordsRemover`的输入是字符串序列(例如`Tokenizer`的输出),它会从输入中过滤掉所有的停用词。停用词列表是通过`stopWords`参数来指定的。
7 | 一些语言的默认停用词可以通过调用`StopWordsRemover.loadDefaultStopWords(language)`来获得。可以用的语言选项有`danish`, `dutch`, `english`, `finnish`, `french`, `german`,
8 | `hungarian`, `italian`, `norwegian`, `portuguese`, `russian`, `spanish`, `swedish`以及 `turkish`。参数`caseSensitive`表示是否对大小写敏感,默认为`false`。
9 |
10 | ## 例子
11 |
12 | 假设我们有下面的`DataFrame`,列名为`id`和`raw`。
13 |
14 | ```
15 | id | raw
16 | ----|----------
17 | 0 | [I, saw, the, red, baloon]
18 | 1 | [Mary, had, a, little, lamb]
19 | ```
20 | 把`raw`作为输入列,`filtered`作为输出列,通过应用`StopWordsRemover`我们可以得到下面的结果。
21 |
22 | ```
23 | id | raw | filtered
24 | ----|-----------------------------|--------------------
25 | 0 | [I, saw, the, red, baloon] | [saw, red, baloon]
26 | 1 | [Mary, had, a, little, lamb]|[Mary, little, lamb]
27 | ```
28 |
29 | 下面是代码调用的例子。
30 |
31 | ```scala
32 | import org.apache.spark.ml.feature.StopWordsRemover
33 |
34 | val remover = new StopWordsRemover()
35 | .setInputCol("raw")
36 | .setOutputCol("filtered")
37 |
38 | val dataSet = spark.createDataFrame(Seq(
39 | (0, Seq("I", "saw", "the", "red", "baloon")),
40 | (1, Seq("Mary", "had", "a", "little", "lamb"))
41 | )).toDF("id", "raw")
42 |
43 | remover.transform(dataSet).show()
44 | ```
--------------------------------------------------------------------------------
/特征抽取和转换/StringIndexer.md:
--------------------------------------------------------------------------------
1 | # StringIndexer
2 |
3 | `StringIndexer`将标签列的字符串编码为标签索引。索引的取值范围是`[0,numLabels)`,按标签出现的频率排序,所以频率最高的标签的索引为0。
4 | 如果输入列是数字,我们会先把它强转为字符串,然后再进行编码。
5 |
6 | ## 例子
7 |
8 | 假设我们有下面的`DataFrame`,它的列名是`id`和`category`。
9 |
10 | ```
11 | id | category
12 | ----|----------
13 | 0 | a
14 | 1 | b
15 | 2 | c
16 | 3 | a
17 | 4 | a
18 | 5 | c
19 | ```
20 | `category`是字符串列,拥有三个标签`a,b,c`。把`category`作为输入列,`categoryIndex`作为输出列,使用`StringIndexer`我们可以得到下面的结果。
21 |
22 | ```
23 | id | category | categoryIndex
24 | ----|----------|---------------
25 | 0 | a | 0.0
26 | 1 | b | 2.0
27 | 2 | c | 1.0
28 | 3 | a | 0.0
29 | 4 | a | 0.0
30 | 5 | c | 1.0
31 | ```
32 | `a`的索引号为0是因为它的频率最高,c次之,b最后。
33 |
34 | 另外,`StringIndexer`处理未出现的标签的策略有两个:
35 |
36 | - 抛出一个异常(默认情况,对应`error`)
37 | - 跳过包含该标签的整行(对应`skip`)
38 |
39 | 让我们回到上面的例子,但是这次我们重用上面的`StringIndexer`到下面的数据集。
40 |
41 | ```
42 | id | category
43 | ----|----------
44 | 0 | a
45 | 1 | b
46 | 2 | c
47 | 3 | d
48 | ```
49 | 如果我们没有为`StringIndexer`设置怎么处理未见过的标签或者设置为`error`,它将抛出异常,否则若设置为`skip`,它将得到下面的结果。
50 |
51 | ```
52 | id | category | categoryIndex
53 | ----|----------|---------------
54 | 0 | a | 0.0
55 | 1 | b | 2.0
56 | 2 | c | 1.0
57 | ```
58 |
59 | 下面是程序调用的例子。
60 |
61 | ```scala
62 | import org.apache.spark.ml.feature.StringIndexer
63 |
64 | val df = spark.createDataFrame(
65 | Seq((0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c"))
66 | ).toDF("id", "category")
67 |
68 | val indexer = new StringIndexer()
69 | .setInputCol("category")
70 | .setOutputCol("categoryIndex")
71 |
72 | val indexed = indexer.fit(df).transform(df)
73 | indexed.show()
74 | ```
--------------------------------------------------------------------------------
/特征抽取和转换/TF-IDF.md:
--------------------------------------------------------------------------------
1 | # TF-IDF
2 |
3 | ## 1 介绍
4 |
5 | [词频-逆文档频率法](http://en.wikipedia.org/wiki/Tf%E2%80%93idf)(`Term frequency-inverse document frequency,TF-IDF`)是在文本挖掘中广泛使用的特征向量化方法。
6 | 它反映语料中词对文档的重要程度。假设用`t`表示词,`d`表示文档,`D`表示语料。词频`TF(t,d)`表示词`t`在文档`d`中出现的次数。文档频率`DF(t,D)`表示语料中出现词`t`的文档的个数。
7 | 如果我们仅仅用词频去衡量重要程度,这很容易过分强调出现频繁但携带较少文档信息的词,如`of`、`the`等。如果一个词在语料中出现很频繁,这意味着它不携带特定文档的特殊信息。逆文档频率数值衡量一个词提供多少信息。
8 |
9 | 
10 |
11 | 如果某个词出现在所有的文档中,它的`IDF`值为0。注意,上式有个平滑项,这是为了避免分母为0的情况发生。`TF-IDF`就是`TF`和`IDF`简单的相乘。
12 |
13 | 
14 |
15 | 词频和文档频率的定义有很多种不同的变种。在`Mllib`中,分别提供了`TF`和`IDF`的实现,以便有更好的灵活性。
16 |
17 | `MLlib`使用`hashing trick`实现词频。原始特征通过一个`hash`函数映射到一个索引(即词),再通过这个索引来统计词频。这个方法避免了计算全局的词-索引映射,因为全局映射在大规模语料中花费较大。
18 | 但是,它会出现哈希冲突,这是因为不同的原始特征可能得到相同的哈希值。为了减少碰撞冲突,我们可以增加目标特征的维度,例如哈希表的桶数量。默认的特征维度是1048576。
19 |
20 | ## 2 实例
21 |
22 | - TF的计算
23 |
24 | ```scala
25 | import org.apache.spark.rdd.RDD
26 | import org.apache.spark.SparkContext
27 | import org.apache.spark.mllib.feature.HashingTF
28 | import org.apache.spark.mllib.linalg.Vector
29 | val sc: SparkContext = ...
30 | // Load documents (one per line).
31 | val documents: RDD[Seq[String]] = sc.textFile("...").map(_.split(" ").toSeq)
32 | val hashingTF = new HashingTF()
33 | val tf: RDD[Vector] = hashingTF.transform(documents)
34 | ```
35 | - IDF的计算
36 |
37 | ```scala
38 | import org.apache.spark.mllib.feature.IDF
39 | // ... continue from the previous example
40 | tf.cache()
41 | val idf = new IDF().fit(tf)
42 | val tfidf: RDD[Vector] = idf.transform(tf)
43 | //或者
44 | val idf = new IDF(minDocFreq = 2).fit(tf)
45 | val tfidf: RDD[Vector] = idf.transform(tf)
46 | ```
47 |
48 | ## 3 源码实现
49 |
50 | 下面分别分析`HashingTF`和`IDF`的实现。
51 |
52 | ### 3.1 HashingTF
53 |
54 | ```scala
55 | def transform(document: Iterable[_]): Vector = {
56 | val termFrequencies = mutable.HashMap.empty[Int, Double]
57 | document.foreach { term =>
58 | val i = indexOf(term)
59 | termFrequencies.put(i, termFrequencies.getOrElse(i, 0.0) + 1.0)
60 | }
61 | Vectors.sparse(numFeatures, termFrequencies.toSeq)
62 | }
63 | ```
64 | 以上代码中,`indexOf`方法使用哈希获得索引。
65 |
66 | ```scala
67 | //为了减少碰撞,将numFeatures设置为1048576
68 | def indexOf(term: Any): Int = Utils.nonNegativeMod(term.##, numFeatures)
69 | def nonNegativeMod(x: Int, mod: Int): Int = {
70 | val rawMod = x % mod
71 | rawMod + (if (rawMod < 0) mod else 0)
72 | }
73 | ```
74 | 这里的`term.##`等价于`term.hashCode`,得到哈希值之后,作取余操作得到相应的索引。
75 |
76 | ### 3.2 IDF
77 |
78 | 我们先看`IDF`的`fit`方法。
79 |
80 | ```scala
81 | def fit(dataset: RDD[Vector]): IDFModel = {
82 | val idf = dataset.treeAggregate(new IDF.DocumentFrequencyAggregator(
83 | minDocFreq = minDocFreq))(
84 | seqOp = (df, v) => df.add(v),
85 | combOp = (df1, df2) => df1.merge(df2)
86 | ).idf()
87 | new IDFModel(idf)
88 | }
89 | ```
90 | 该函数使用`treeAggregate`处理数据集,生成一个`DocumentFrequencyAggregator`对象,它用于计算文档频率。重点看`add`和`merge`方法。
91 |
92 | ```scala
93 | def add(doc: Vector): this.type = {
94 | if (isEmpty) {
95 | df = BDV.zeros(doc.size)
96 | }
97 | //计算
98 | doc match {
99 | case SparseVector(size, indices, values) =>
100 | val nnz = indices.size
101 | var k = 0
102 | while (k < nnz) {
103 | if (values(k) > 0) {
104 | df(indices(k)) += 1L
105 | }
106 | k += 1
107 | }
108 | case DenseVector(values) =>
109 | val n = values.size
110 | var j = 0
111 | while (j < n) {
112 | if (values(j) > 0.0) {
113 | df(j) += 1L
114 | }
115 | j += 1
116 | }
117 | case other =>
118 | throw new UnsupportedOperationException
119 | }
120 | m += 1L
121 | this
122 | }
123 | ```
124 | `df`这个向量的每个元素都表示该索引对应的词出现的文档数。`m`表示文档总数。
125 |
126 | ```scala
127 | def merge(other: DocumentFrequencyAggregator): this.type = {
128 | if (!other.isEmpty) {
129 | m += other.m
130 | if (df == null) {
131 | df = other.df.copy
132 | } else {
133 | //简单的向量相加
134 | df += other.df
135 | }
136 | }
137 | this
138 | }
139 | ```
140 | `treeAggregate`方法处理完数据之后,调用`idf`方法将文档频率低于给定值的词的`idf`置为0,其它的按照上面的公式计算。
141 |
142 | ```scala
143 | def idf(): Vector = {
144 | val n = df.length
145 | val inv = new Array[Double](n)
146 | var j = 0
147 | while (j < n) {
148 | if (df(j) >= minDocFreq) {
149 | //计算得到idf
150 | inv(j) = math.log((m + 1.0) / (df(j) + 1.0))
151 | }
152 | j += 1
153 | }
154 | Vectors.dense(inv)
155 | }
156 | ```
157 | 最后使用`transform`方法计算`tfidf`值。
158 |
159 | ```scala
160 | //这里的dataset指tf
161 | def transform(dataset: RDD[Vector]): RDD[Vector] = {
162 | val bcIdf = dataset.context.broadcast(idf)
163 | dataset.mapPartitions(iter => iter.map(v => IDFModel.transform(bcIdf.value, v)))
164 | }
165 | def transform(idf: Vector, v: Vector): Vector = {
166 | val n = v.size
167 | v match {
168 | case SparseVector(size, indices, values) =>
169 | val nnz = indices.size
170 | val newValues = new Array[Double](nnz)
171 | var k = 0
172 | while (k < nnz) {
173 | //tf-idf = tf * idf
174 | newValues(k) = values(k) * idf(indices(k))
175 | k += 1
176 | }
177 | Vectors.sparse(n, indices, newValues)
178 | case DenseVector(values) =>
179 | val newValues = new Array[Double](n)
180 | var j = 0
181 | while (j < n) {
182 | newValues(j) = values(j) * idf(j)
183 | j += 1
184 | }
185 | Vectors.dense(newValues)
186 | case other =>
187 | throw new UnsupportedOperationException
188 | }
189 | }
190 | ```
--------------------------------------------------------------------------------
/特征抽取和转换/Tokenizer.md:
--------------------------------------------------------------------------------
1 | # Tokenizer
2 |
3 | [Tokenization](http://en.wikipedia.org/wiki/Lexical_analysis#Tokenization)是一个将文本(如一个句子)切分为个体单元(如词)的处理过程。
4 | 一个简单的`Tokenizer`类就提供了这个功能。下面的例子展示了如何将句子切分为词序列。
5 |
6 | `RegexTokenizer`基于正则表达式匹配提供了更高级的断词(`tokenization`)功能。默认情况下,参数`pattern`(默认是`\s+`)作为分隔符,
7 | 用来切分输入文本。用户也可以将`gaps`参数设置为`false`,表示正则表达式`pattern`匹配的是`tokens`本身而不是分隔符,这时会把所有匹配到的内容作为分词结果返回。下面是调用的例子。
8 |
9 | ```scala
10 | import org.apache.spark.ml.feature.{RegexTokenizer, Tokenizer}
11 |
12 | val sentenceDataFrame = spark.createDataFrame(Seq(
13 | (0, "Hi I heard about Spark"),
14 | (1, "I wish Java could use case classes"),
15 | (2, "Logistic,regression,models,are,neat")
16 | )).toDF("label", "sentence")
17 |
18 | val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
19 | val regexTokenizer = new RegexTokenizer()
20 | .setInputCol("sentence")
21 | .setOutputCol("words")
22 | .setPattern("\\W") // alternatively .setPattern("\\w+").setGaps(false)
23 |
24 | val tokenized = tokenizer.transform(sentenceDataFrame)
25 | tokenized.select("words", "label").take(3).foreach(println)
26 | val regexTokenized = regexTokenizer.transform(sentenceDataFrame)
27 | regexTokenized.select("words", "label").take(3).foreach(println)
28 | ```
--------------------------------------------------------------------------------
/特征抽取和转换/VectorAssembler.md:
--------------------------------------------------------------------------------
1 | # VectorAssembler
2 |
3 | `VectorAssembler`是一个转换器,它可以将给定的多列合并为一个向量列。当需要把原始特征和由不同转换器生成的特征合并成一个特征向量来训练机器学习模型时,
4 | `VectorAssembler`是非常有用的。`VectorAssembler`接受的输入列类型包括:所有的数值类型、`boolean`类型以及`vector`类型。
5 |
6 | ## 例子
7 |
8 | 假设我们有下面的`DataFrame`,它的列名分别是`id, hour, mobile, userFeatures, clicked`。
9 |
10 | ```
11 | id | hour | mobile | userFeatures | clicked
12 | ----|------|--------|------------------|---------
13 | 0 | 18 | 1.0 | [0.0, 10.0, 0.5] | 1.0
14 | ```
15 |
16 | `userFeatures`是一个向量列,包含三个用户特征。我们想合并`hour`, `mobile`和`userFeatures`到一个名为`features`的特征列。
17 | 通过转换之后,我们可以得到下面的结果。
18 |
19 | ```
20 | id | hour | mobile | userFeatures | clicked | features
21 | ----|------|--------|------------------|---------|-----------------------------
22 | 0 | 18 | 1.0 | [0.0, 10.0, 0.5] | 1.0 | [18.0, 1.0, 0.0, 10.0, 0.5]
23 | ```
24 |
25 | 下面是程序调用的例子。
26 |
27 | ```scala
28 | import org.apache.spark.ml.feature.VectorAssembler
29 | import org.apache.spark.ml.linalg.Vectors
30 |
31 | val dataset = spark.createDataFrame(
32 | Seq((0, 18, 1.0, Vectors.dense(0.0, 10.0, 0.5), 1.0))
33 | ).toDF("id", "hour", "mobile", "userFeatures", "clicked")
34 |
35 | val assembler = new VectorAssembler()
36 | .setInputCols(Array("hour", "mobile", "userFeatures"))
37 | .setOutputCol("features")
38 |
39 | val output = assembler.transform(dataset)
40 | println(output.select("features", "clicked").first())
41 | ```
--------------------------------------------------------------------------------
/特征抽取和转换/VectorIndexer.md:
--------------------------------------------------------------------------------
1 | # VectorIndexer
2 |
3 | `VectorIndexer`为向量数据集中的类别特征建立索引。它既可以自动判断哪些特征是类别特征,也能将原始特征值转换为类别索引。
4 | 通常情况下,它的过程如下:
5 |
6 | - 1 拿到类型为`vector`的输入列和参数`maxCategories`
7 | - 2 根据不同取值的数量,判断哪些特征是类别特征。不同取值数量不超过`maxCategories`的特征才会被判定为类别特征。
8 | - 3 对每一个可以类别化的特征计算基于0的类别索引。
9 | - 4 为类别特征建立索引,将原有的特征值转换为索引。
10 |
11 | 对类别特征建立索引,可以让决策树和树集成(`Tree Ensembles`)等算法适当地处理类别特征,从而提升效果。
12 |
13 | 在下面的例子中,我们从数据集中读取标签点,然后利用`VectorIndexer`去判断哪些特征可以被认为是可分类化的。
14 | 我们将可分类特征的值转换为索引。转换后的数据可以传递给`DecisionTreeRegressor`等可以操作分类特征的算法。
15 |
16 | ```scala
17 | import org.apache.spark.ml.feature.VectorIndexer
18 |
19 | val data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
20 |
21 | val indexer = new VectorIndexer()
22 | .setInputCol("features")
23 | .setOutputCol("indexed")
24 | .setMaxCategories(10)
25 |
26 | val indexerModel = indexer.fit(data)
27 |
28 | val categoricalFeatures: Set[Int] = indexerModel.categoryMaps.keys.toSet
29 | println(s"Chose ${categoricalFeatures.size} categorical features: " +
30 | categoricalFeatures.mkString(", "))
31 |
32 | // Create new column "indexed" with categorical values transformed to indices
33 | val indexedData = indexerModel.transform(data)
34 | indexedData.show()
35 | ```
--------------------------------------------------------------------------------
/特征抽取和转换/VectorSlicer.md:
--------------------------------------------------------------------------------
1 | # VectorSlicer
2 |
3 | `VectorSlicer`是一个转换器,它输入一个特征向量,输出一个新的特征向量,新向量是原始特征的一个子集。这在从向量列中抽取特征时非常有用。
4 | `VectorSlicer`接收一个带有指定索引的向量列,输出一个新的向量列,其值通过这些索引从输入中选出。有两种类型的索引:
5 |
6 | - 1、整数索引,表示向量中特征的下标,通过`setIndices()`设置
7 | - 2、字符串索引,表示向量中特征的名字,通过`setNames()`设置。这种情况要求向量列带有`AttributeGroup`,因为实现是按属性的名字来匹配的。
8 |
9 | 整数和字符串都是可以使用的,并且,整数和字符串可以同时使用。至少需要选择一个特征,而且重复的特征是不被允许的。
10 |
11 | 输出向量中,先排列通过索引选择的特征(按给定的顺序),然后是通过特征名选择的特征(按给定的顺序)。
12 |
13 | ## 例子
14 |
15 | 假设我们有下面的`DataFrame`,它的列名是`userFeatures`。
16 |
17 | ```
18 | userFeatures
19 | ------------------
20 | [0.0, 10.0, 0.5]
21 | ```
22 | `userFeatures`是一个向量列,它包含三个用户特征。假设用户特征的第一列均为0,所以我们想删除它,仅仅选择后面的两列。
23 | `VectorSlicer`通过`setIndices(1,2)`选择后面的两项,产生下面新的名为`features`的向量列。
24 |
25 | ```
26 | userFeatures | features
27 | ------------------|-----------------------------
28 | [0.0, 10.0, 0.5] | [10.0, 0.5]
29 | ```
30 | 假设我们还有潜在的输入特性,如`["f1", "f2", "f3"]`,我们还可以通过`setNames("f2", "f3")`来选择。
31 |
32 | ```
33 | userFeatures | features
34 | ------------------|-----------------------------
35 | [0.0, 10.0, 0.5] | [10.0, 0.5]
36 | ["f1", "f2", "f3"] | ["f2", "f3"]
37 | ```
38 | 下面是程序调用的例子。
39 |
40 | ```scala
41 | import java.util.Arrays
42 |
43 | import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute}
44 | import org.apache.spark.ml.feature.VectorSlicer
45 | import org.apache.spark.ml.linalg.Vectors
46 | import org.apache.spark.sql.Row
47 | import org.apache.spark.sql.types.StructType
48 |
49 | val data = Arrays.asList(Row(Vectors.dense(-2.0, 2.3, 0.0)))
50 |
51 | val defaultAttr = NumericAttribute.defaultAttr
52 | val attrs = Array("f1", "f2", "f3").map(defaultAttr.withName)
53 | val attrGroup = new AttributeGroup("userFeatures", attrs.asInstanceOf[Array[Attribute]])
54 |
55 | val dataset = spark.createDataFrame(data, StructType(Array(attrGroup.toStructField())))
56 |
57 | val slicer = new VectorSlicer().setInputCol("userFeatures").setOutputCol("features")
58 |
59 | slicer.setIndices(Array(1)).setNames(Array("f3"))
60 | // or slicer.setIndices(Array(1, 2)), or slicer.setNames(Array("f2", "f3"))
61 |
62 | val output = slicer.transform(dataset)
63 | println(output.select("userFeatures", "features").first())
64 | ```
65 |
--------------------------------------------------------------------------------
/特征抽取和转换/Word2Vector.md:
--------------------------------------------------------------------------------
1 | # Word2Vector
2 |
3 | [Word2Vector](https://code.google.com/p/word2vec/)将词转换成分布式向量。分布式表示的主要优势是相似的词在向量空间距离较近,这使我们更容易泛化新的模式并且使模型估计更加健壮。
4 | 分布式的向量表示在许多自然语言处理应用(如命名实体识别、消歧、词法分析、机器翻译)中非常有用。
5 |
6 | ## 1 模型
7 |
8 | 在`MLlib`中,`Word2Vector`使用`skip-gram`模型来实现。`skip-gram`的训练目标是学习词向量表示,这个表示可以很好的预测它在相同句子中的上下文。数学上,给定训练词`w_1,w_2,...,w_T`,
9 | `skip-gram`模型的目标是最大化下面的平均对数似然。
10 |
11 | 
12 |
13 | 其中`k`是训练窗口的大小。在`skip-gram`模型中,每个词`w`和两个向量`u_w`和`v_w`相关联,这两个向量分别表示词本身和它作为上下文时的表示。在给定词`w_j`的条件下,正确预测词`w_i`的概率由`softmax`模型给出。
14 |
15 | 
16 |
17 | 其中`V`表示词汇表的大小。在`skip-gram`模型中使用`softmax`是非常昂贵的,因为计算`log p(w_i|w_j)`的代价与`V`成正比。为了加快`Word2Vec`的训练速度,`MLlib`使用了分层`softmax`(`hierarchical softmax`),这样可以将计算`log p(w_i|w_j)`的复杂度降低到`O(log(V))`。
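把上面两幅图中的公式按上文的文字描述重写一遍(记号为本文补充,常见写法如下):`skip-gram`的训练目标是最大化平均对数似然

$$\frac{1}{T}\sum_{t=1}^{T}\sum_{j=-k,\, j\neq 0}^{k}\log p(w_{t+j}\mid w_{t})$$

其中`softmax`形式的条件概率为

$$p(w_i\mid w_j)=\frac{\exp(u_{w_i}^{\top}v_{w_j})}{\sum_{l=1}^{V}\exp(u_{l}^{\top}v_{w_j})}$$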
18 |
19 | ## 2 实例
20 |
21 | 下面的例子展示了怎样加载文本数据、切分数据、构造`Word2Vec`实例、训练模型。最后,我们打印某个词的40个同义词。
22 |
23 | ```scala
24 | import org.apache.spark._
25 | import org.apache.spark.rdd._
26 | import org.apache.spark.SparkContext._
27 | import org.apache.spark.mllib.feature.{Word2Vec, Word2VecModel}
28 | val input = sc.textFile("text8").map(line => line.split(" ").toSeq)
29 | val word2vec = new Word2Vec()
30 | val model = word2vec.fit(input)
31 | val synonyms = model.findSynonyms("china", 40)
32 | for((synonym, cosineSimilarity) <- synonyms) {
33 | println(s"$synonym $cosineSimilarity")
34 | }
35 | ```
36 |
37 | ## 3 源码分析
38 |
39 | 由于涉及神经网络相关的知识,这里先不作分析,后续会补上。要更详细了解`Word2Vector`可以阅读文献【2】。
40 |
41 | # 参考文献
42 |
43 | 【1】[哈夫曼树与哈夫曼编码](http://www.cnblogs.com/Jezze/archive/2011/12/23/2299884.html)
44 |
45 | 【2】[Deep Learning 实战之 word2vec](docs/word2vec.pdf)
46 |
47 | 【3】[Word2Vector谷歌实现](https://code.google.com/p/word2vec/)
--------------------------------------------------------------------------------
/特征抽取和转换/chi-square-selector.md:
--------------------------------------------------------------------------------
1 | # 卡方选择器
2 |
3 | [特征选择](http://en.wikipedia.org/wiki/Feature_selection)试图识别出与模型构建相关的特征。它缩减特征空间的大小,既可以提高速度,也可以改善统计学习的效果。`ChiSqSelector`实现卡方特征选择,它作用于带有类别特征的标注数据。
4 | `ChiSqSelector`根据与类标签的卡方独立性检验对特征进行排序,然后选择排序最高的特征。下面是一个使用的例子。
5 |
6 | ```scala
7 | import org.apache.spark.SparkContext._
8 | import org.apache.spark.mllib.linalg.Vectors
9 | import org.apache.spark.mllib.regression.LabeledPoint
10 | import org.apache.spark.mllib.util.MLUtils
11 | import org.apache.spark.mllib.feature.ChiSqSelector
12 | // 加载数据
13 | val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
14 | // 卡方检验需要类别特征,所以把特征值除以一个整数进行离散化。虽然特征是double类型,
15 | //但是ChiSqSelector会将每个不同的取值当做一个类别
16 | val discretizedData = data.map { lp =>
17 | LabeledPoint(lp.label, Vectors.dense(lp.features.toArray.map { x => (x / 16).floor } ) )
18 | }
19 | // Create ChiSqSelector that will select top 50 of 692 features
20 | val selector = new ChiSqSelector(50)
21 | // Create ChiSqSelector model (selecting features)
22 | val transformer = selector.fit(discretizedData)
23 | // Filter the top 50 features from each feature vector
24 | val filteredData = discretizedData.map { lp =>
25 | LabeledPoint(lp.label, transformer.transform(lp.features))
26 | }
27 | ```
28 | 下面看看选择特征的实现,入口函数是`fit`。
29 |
30 | ```scala
31 | def fit(data: RDD[LabeledPoint]): ChiSqSelectorModel = {
32 | //计算数据卡方值
33 | val indices = Statistics.chiSqTest(data)
34 | .zipWithIndex.sortBy { case (res, _) => -res.statistic }
35 | .take(numTopFeatures)
36 | .map { case (_, indices) => indices }
37 | .sorted
38 | new ChiSqSelectorModel(indices)
39 | }
40 | ```
41 |
42 | 这里通过`Statistics.chiSqTest`计算卡方检测的值。下面需要了解卡方检测的理论基础。
43 |
44 | ## 1 卡方检测
45 |
46 | ### 1.1 什么是卡方检测
47 |
48 | 卡方检验是一种用途很广的计数资料的假设检验方法。它属于非参数检验的范畴,主要是比较两个及两个以上样本率( 构成比)以及两个分类变量的关联性分析。
49 | 其根本思想就是在于比较理论频数和实际频数的吻合程度或拟合优度问题。
50 |
51 | ### 1.2 卡方检测的基本思想
52 |
53 | 卡方检验是以${X}^{2}$分布为基础的一种常用假设检验方法,它的无效假设`H0`是:观察频数与期望频数没有差别。
54 |
55 | 该检验的基本思想是:首先假设`H0`成立,基于此前提计算出${X}^{2}$值,它表示观察值与理论值之间的偏离程度。根据${X}^{2}$分布及自由度可以确定在`H0`假设成立的情况下获得当前统计量及更极端情况的概率`P`。
56 | 如果P值很小,说明观察值与理论值偏离程度太大,应当拒绝无效假设,表示比较资料之间有显著差异;否则就不能拒绝无效假设,尚不能认为样本所代表的实际情况和理论假设有差别。
57 |
58 | ### 1.3 卡方值的计算与意义
59 |
60 | 卡方值表示观察值与理论值之间的偏离程度。计算这种偏离程度的基本思路如下。
61 |
62 | - 设`A`代表某个类别的观察频数,`E`代表基于`H0`计算出的期望频数,`A`与`E`之差称为残差。
63 |
64 | - 残差可以表示某一个类别观察值和理论值的偏离程度,但如果将残差简单相加以表示各类别观察频数与期望频数的差别,则有一定的不足之处。
65 | 因为残差有正有负,相加后会彼此抵消,总和仍然为0,为此可以将残差平方后求和。
66 |
67 | - 另一方面,残差大小是一个相对的概念。同样是20的残差,相对于期望频数为10时就非常大,但相对于期望频数为1000时就很小了。
68 | 考虑到这一点,人们又将残差平方除以期望频数再求和,以估计观察频数与期望频数的差别。
69 |
70 | 进行上述操作之后,就得到了常用的${X}^{2}$统计量。其计算公式是:
71 |
72 | 
73 |
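结合上面的推导以及后文`PEARSON`方法中`dev * dev / expected`的实现,这个统计量也可以写作(记号为本文补充):

$${X}^{2}=\sum_{i}\frac{(A_{i}-E_{i})^{2}}{E_{i}}$$

其中$A_{i}$为第$i$个类别的观察频数,$E_{i}$为相应的期望频数。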
74 | 当`n`比较大时,卡方统计量近似服从`k-1`(计算`E_i`时用到的参数个数)个自由度的卡方分布。由卡方的计算公式可知,当观察频数与期望频数完全一致时,卡方值为0;观察频数与期望频数越接近,两者之间的差异越小,卡方值越小;
75 | 反之,观察频数与期望频数差别越大,两者之间的差异越大,卡方值越大。
76 |
77 | ## 2 卡方检测的源码实现
78 |
79 | 在`MLlib`中,使用`chiSquaredFeatures`方法实现卡方检验。它对每个特征进行皮尔森独立性检验。下面看它的代码实现。
80 |
81 | ```scala
82 | def chiSquaredFeatures(data: RDD[LabeledPoint],
83 | methodName: String = PEARSON.name): Array[ChiSqTestResult] = {
84 | val maxCategories = 10000
85 | val numCols = data.first().features.size
86 | val results = new Array[ChiSqTestResult](numCols)
87 | var labels: Map[Double, Int] = null
88 | // 每次最多处理1000列
89 | val batchSize = 1000
90 | var batch = 0
91 | while (batch * batchSize < numCols) {
92 | val startCol = batch * batchSize
93 | val endCol = startCol + math.min(batchSize, numCols - startCol)
94 | val pairCounts = data.mapPartitions { iter =>
95 | val distinctLabels = mutable.HashSet.empty[Double]
96 | val allDistinctFeatures: Map[Int, mutable.HashSet[Double]] =
97 | Map((startCol until endCol).map(col => (col, mutable.HashSet.empty[Double])): _*)
98 | var i = 1
99 | iter.flatMap { case LabeledPoint(label, features) =>
100 | if (i % 1000 == 0) {
101 | if (distinctLabels.size > maxCategories) {
102 | throw new SparkException
103 | }
104 | allDistinctFeatures.foreach { case (col, distinctFeatures) =>
105 | if (distinctFeatures.size > maxCategories) {
106 | throw new SparkException
107 | }
108 | }
109 | }
110 | i += 1
111 | distinctLabels += label
112 | features.toArray.view.zipWithIndex.slice(startCol, endCol).map { case (feature, col) =>
113 | allDistinctFeatures(col) += feature
114 | (col, feature, label)
115 | }
116 | }
117 | }.countByValue()
118 | if (labels == null) {
119 | // Do this only once for the first column since labels are invariant across features.
120 | labels =
121 | pairCounts.keys.filter(_._1 == startCol).map(_._3).toArray.distinct.zipWithIndex.toMap
122 | }
123 | val numLabels = labels.size
124 | pairCounts.keys.groupBy(_._1).map { case (col, keys) =>
125 | val features = keys.map(_._2).toArray.distinct.zipWithIndex.toMap
126 | val numRows = features.size
127 | val contingency = new BDM(numRows, numLabels, new Array[Double](numRows * numLabels))
128 | keys.foreach { case (_, feature, label) =>
129 | val i = features(feature)
130 | val j = labels(label)
131 | //带有标签的特征的出现次数
132 | contingency(i, j) += pairCounts((col, feature, label))
133 | }
134 | results(col) = chiSquaredMatrix(Matrices.fromBreeze(contingency), methodName)
135 | }
136 | batch += 1
137 | }
138 | results
139 | }
140 | ```
141 | 上述代码主要对数据进行处理,获取带有标签的特征的出现次数,并用这个次数计算卡方值。真正获取卡方值的函数是`chiSquaredMatrix`。
142 |
143 | ```scala
144 | def chiSquaredMatrix(counts: Matrix, methodName: String = PEARSON.name): ChiSqTestResult = {
145 | val method = methodFromString(methodName)
146 | val numRows = counts.numRows
147 | val numCols = counts.numCols
148 | // get row and column sums
149 | val colSums = new Array[Double](numCols)
150 | val rowSums = new Array[Double](numRows)
151 | val colMajorArr = counts.toArray
152 | val colMajorArrLen = colMajorArr.length
153 | var i = 0
154 | while (i < colMajorArrLen) {
155 | val elem = colMajorArr(i)
156 | if (elem < 0.0) {
157 | throw new IllegalArgumentException("Contingency table cannot contain negative entries.")
158 | }
159 | //每列的总数
160 | colSums(i / numRows) += elem
161 | //每行的总数
162 | rowSums(i % numRows) += elem
163 | i += 1
164 | }
165 | //所有元素的总和
166 | val total = colSums.sum
167 | // second pass to collect statistic
168 | var statistic = 0.0
169 | var j = 0
170 | while (j < colMajorArrLen) {
171 | val col = j / numRows
172 | val colSum = colSums(col)
173 | if (colSum == 0.0) {
174 | throw new IllegalArgumentException("Chi-squared statistic undefined for input matrix due to"
175 | + s"0 sum in column [$col].")
176 | }
177 | val row = j % numRows
178 | val rowSum = rowSums(row)
179 | if (rowSum == 0.0) {
180 | throw new IllegalArgumentException("Chi-squared statistic undefined for input matrix due to"
181 | + s"0 sum in row [$row].")
182 | }
183 | //期望值
184 | val expected = colSum * rowSum / total
185 | //PEARSON
186 | statistic += method.chiSqFunc(colMajorArr(j), expected)
187 | j += 1
188 | }
189 | //自由度
190 | val df = (numCols - 1) * (numRows - 1)
191 | if (df == 0) {
192 | // 1 column or 1 row. Constant distribution is independent of anything.
193 | // pValue = 1.0 and statistic = 0.0 in this case.
194 | new ChiSqTestResult(1.0, 0, 0.0, methodName, NullHypothesis.independence.toString)
195 | } else {
196 | //计算累积概率
197 | val pValue = 1.0 - new ChiSquaredDistribution(df).cumulativeProbability(statistic)
198 | new ChiSqTestResult(pValue, df, statistic, methodName, NullHypothesis.independence.toString)
199 | }
200 | }
201 | //上述代码中的method.chiSqFunc(colMajorArr(j), expected),调用下面的代码
202 | val PEARSON = new Method("pearson", (observed: Double, expected: Double) => {
203 | val dev = observed - expected
204 | dev * dev / expected
205 | })
206 | ```
207 | 上述代码的实现和参考文献【2】中`Test of independence`的描述一致。
208 |
209 | ## 参考文献
210 |
211 | 【1】[卡方检验](http://wiki.mbalib.com/wiki/%E5%8D%A1%E6%96%B9%E6%A3%80%E9%AA%8C)
212 |
213 | 【2】[Pearson's chi-squared test](https://en.wikipedia.org/wiki/Pearson%27s_chi-squared_test)
--------------------------------------------------------------------------------
/特征抽取和转换/docs/word2vec.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/特征抽取和转换/docs/word2vec.pdf
--------------------------------------------------------------------------------
/特征抽取和转换/element-wise-product.md:
--------------------------------------------------------------------------------
1 | # 元素智能乘积
2 |
3 | `ElementwiseProduct`以逐元素相乘的方式,将每一个输入向量乘以一个给定的“权重”向量。换句话说,就是用一个标量乘子对数据集的每一列进行缩放,相当于输入向量与变换向量的`Hadamard`积。这个转换可以表示为如下的形式:
4 |
5 | 
6 |
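按照上面的文字描述,这个变换就是输入向量$v$与变换向量$w$的`Hadamard`积(记号为本文补充):

$$\begin{pmatrix}v_1\\ \vdots\\ v_N\end{pmatrix}\circ\begin{pmatrix}w_1\\ \vdots\\ w_N\end{pmatrix}=\begin{pmatrix}v_1 w_1\\ \vdots\\ v_N w_N\end{pmatrix}$$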
7 | 下面是一个使用的实例。
8 |
9 | ```scala
10 | import org.apache.spark.SparkContext._
11 | import org.apache.spark.mllib.feature.ElementwiseProduct
12 | import org.apache.spark.mllib.linalg.Vectors
13 | // Create some vector data; also works for sparse vectors
14 | val data = sc.parallelize(Array(Vectors.dense(1.0, 2.0, 3.0), Vectors.dense(4.0, 5.0, 6.0)))
15 | val transformingVector = Vectors.dense(0.0, 1.0, 2.0)
16 | val transformer = new ElementwiseProduct(transformingVector)
17 |
18 | // Batch transform and per-row transform give the same results:
19 | val transformedData = transformer.transform(data)
20 | val transformedData2 = data.map(x => transformer.transform(x))
21 | ```
22 |
23 | 下面看`transform`的实现。
24 |
25 | ```scala
26 | override def transform(vector: Vector): Vector = {
27 | vector match {
28 | case dv: DenseVector =>
29 | val values: Array[Double] = dv.values.clone()
30 | val dim = scalingVec.size
31 | var i = 0
32 | while (i < dim) {
33 | //相对应的值相乘
34 | values(i) *= scalingVec(i)
35 | i += 1
36 | }
37 | Vectors.dense(values)
38 | case SparseVector(size, indices, vs) =>
39 | val values = vs.clone()
40 | val dim = values.length
41 | var i = 0
42 | while (i < dim) {
43 | //相对应的值相乘
44 | values(i) *= scalingVec(indices(i))
45 | i += 1
46 | }
47 | Vectors.sparse(size, indices, values)
48 | case v => throw new IllegalArgumentException("Does not support vector type " + v.getClass)
49 | }
50 | }
51 | ```
52 |
--------------------------------------------------------------------------------
/特征抽取和转换/imgs/1.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/特征抽取和转换/imgs/1.1.png
--------------------------------------------------------------------------------
/特征抽取和转换/imgs/1.2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/特征抽取和转换/imgs/1.2.png
--------------------------------------------------------------------------------
/特征抽取和转换/imgs/2.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/特征抽取和转换/imgs/2.1.png
--------------------------------------------------------------------------------
/特征抽取和转换/imgs/2.2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/特征抽取和转换/imgs/2.2.png
--------------------------------------------------------------------------------
/特征抽取和转换/imgs/3.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/特征抽取和转换/imgs/3.1.png
--------------------------------------------------------------------------------
/特征抽取和转换/imgs/4.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/特征抽取和转换/imgs/4.1.png
--------------------------------------------------------------------------------
/特征抽取和转换/imgs/5.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/特征抽取和转换/imgs/5.1.png
--------------------------------------------------------------------------------
/特征抽取和转换/imgs/6.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/特征抽取和转换/imgs/6.1.png
--------------------------------------------------------------------------------
/特征抽取和转换/n_gram.md:
--------------------------------------------------------------------------------
1 | # n-gram
2 |
3 | 一个[n-gram](https://en.wikipedia.org/wiki/N-gram)是一个包含`n`个`tokens`(如词)的序列。`NGram`可以将输入特征
4 | 转换为`n-grams`。
5 |
6 | `NGram`的输入是字符串序列(例如`Tokenizer`的输出),参数`n`用来决定每个`n-gram`包含的词的个数。输出是一个`n-grams`序列,其中每个`n-gram`是由空格分隔的`n`个连续词组成的字符串。
7 | 如果输入序列包含的词少于`n`个,将不会有输出。
8 |
9 | ```scala
10 | import org.apache.spark.ml.feature.NGram
11 |
12 | val wordDataFrame = spark.createDataFrame(Seq(
13 | (0, Array("Hi", "I", "heard", "about", "Spark")),
14 | (1, Array("I", "wish", "Java", "could", "use", "case", "classes")),
15 | (2, Array("Logistic", "regression", "models", "are", "neat"))
16 | )).toDF("label", "words")
17 |
18 | val ngram = new NGram().setInputCol("words").setOutputCol("ngrams")
19 | val ngramDataFrame = ngram.transform(wordDataFrame)
20 | ngramDataFrame.take(3).map(_.getAs[Stream[String]]("ngrams").toList).foreach(println)
21 | ```
--------------------------------------------------------------------------------
/特征抽取和转换/normalizer.md:
--------------------------------------------------------------------------------
1 | # Normalization
2 |
3 | The normalizer scales individual samples so that they have unit $L^{p}$ norm. This is a common operation in text classification and clustering. For example, the dot product of two $L^{2}$-normalized `TF-IDF` vectors is the `cosine` similarity of the two vectors.
4 |
5 | `Normalizer` implements `VectorTransformer`, which can normalize a single vector into a transformed vector, or normalize an `RDD` into a transformed `RDD`. Below is an example.
6 |
7 | ```scala
8 | import org.apache.spark.SparkContext._
9 | import org.apache.spark.mllib.feature.Normalizer
10 | import org.apache.spark.mllib.linalg.Vectors
11 | import org.apache.spark.mllib.util.MLUtils
12 | val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
13 | // by default p = 2, i.e. the L2 norm is used
14 | val normalizer1 = new Normalizer()
15 | val normalizer2 = new Normalizer(p = Double.PositiveInfinity)
16 | // Each sample in data1 will be normalized using $L^2$ norm.
17 | val data1 = data.map(x => (x.label, normalizer1.transform(x.features)))
18 | // Each sample in data2 will be normalized using $L^\infty$ norm.
19 | val data2 = data.map(x => (x.label, normalizer2.transform(x.features)))
20 | ```
21 | The implementation is straightforward; let us look at its `transform` method.
22 |
23 | ```scala
24 | override def transform(vector: Vector): Vector = {
25 | // compute the p-norm
26 | val norm = Vectors.norm(vector, p)
27 | if (norm != 0.0) {
28 | // a sparse vector can reuse its indices
29 | vector match {
30 | case DenseVector(vs) =>
31 | val values = vs.clone()
32 | val size = values.size
33 | var i = 0
34 | while (i < size) {
35 | values(i) /= norm
36 | i += 1
37 | }
38 | Vectors.dense(values)
39 | case SparseVector(size, ids, vs) =>
40 | val values = vs.clone()
41 | val nnz = values.size
42 | var i = 0
43 | while (i < nnz) {
44 | values(i) /= norm
45 | i += 1
46 | }
47 | Vectors.sparse(size, ids, values)
48 | case v => throw new IllegalArgumentException("Do not support vector type " + v.getClass)
49 | }
50 | } else {
51 | vector
52 | }
53 | }
54 | ```
55 | The norm is computed by the `Vectors.norm` method, whose implementation is shown below.
56 |
57 | ```scala
58 | def norm(vector: Vector, p: Double): Double = {
59 | val values = vector match {
60 | case DenseVector(vs) => vs
61 | case SparseVector(n, ids, vs) => vs
62 | case v => throw new IllegalArgumentException("Do not support vector type " + v.getClass)
63 | }
64 | val size = values.length
65 | if (p == 1) {
66 | var sum = 0.0
67 | var i = 0
68 | while (i < size) {
69 | sum += math.abs(values(i))
70 | i += 1
71 | }
72 | sum
73 | } else if (p == 2) {
74 | var sum = 0.0
75 | var i = 0
76 | while (i < size) {
77 | sum += values(i) * values(i)
78 | i += 1
79 | }
80 | math.sqrt(sum)
81 | } else if (p == Double.PositiveInfinity) {
82 | var max = 0.0
83 | var i = 0
84 | while (i < size) {
85 | val value = math.abs(values(i))
86 | if (value > max) max = value
87 | i += 1
88 | }
89 | max
90 | } else {
91 | var sum = 0.0
92 | var i = 0
93 | while (i < size) {
94 | sum += math.pow(math.abs(values(i)), p)
95 | i += 1
96 | }
97 | math.pow(sum, 1.0 / p)
98 | }
99 | }
100 | ```
101 | There are four cases. When `p = 1` (the L1 norm), the value is the sum of the absolute values of all elements. When `p = 2`, the value is the square root of the sum of squares of all elements. When `p == Double.PositiveInfinity`, the value is the maximum absolute value of the elements.
102 | If none of the three cases above applies, the norm is computed with the following formula.
103 |
104 | 
105 |
106 |
107 |
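As a quick check of the claim above that the dot product of two $L^{2}$-normalized vectors equals their cosine similarity, here is a minimal sketch using the same public `Normalizer` and `Vectors.norm` API (the vector values and the `dot` helper are made up for illustration):

```scala
import org.apache.spark.mllib.feature.Normalizer
import org.apache.spark.mllib.linalg.{Vector, Vectors}

val a = Vectors.dense(1.0, 2.0, 3.0)
val b = Vectors.dense(4.0, 5.0, 6.0)
val l2 = new Normalizer() // p = 2 by default

// plain dot product of two dense vectors
def dot(x: Vector, y: Vector): Double =
  x.toArray.zip(y.toArray).map { case (xi, yi) => xi * yi }.sum

// cosine similarity computed directly from the definition
val cos = dot(a, b) / (Vectors.norm(a, 2) * Vectors.norm(b, 2))
// dot product of the two L2-normalized vectors
val cosViaNormalizer = dot(l2.transform(a), l2.transform(b))
// the two values agree up to floating-point error
```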
--------------------------------------------------------------------------------
/聚类/LDA/docs/Latent Dirichlet Allocation.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/docs/Latent Dirichlet Allocation.pdf
--------------------------------------------------------------------------------
/聚类/LDA/docs/On Smoothing and Inference for Topic Models.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/docs/On Smoothing and Inference for Topic Models.pdf
--------------------------------------------------------------------------------
/聚类/LDA/docs/Online Learning for Latent Dirichlet Allocation.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/docs/Online Learning for Latent Dirichlet Allocation.pdf
--------------------------------------------------------------------------------
/聚类/LDA/docs/dirichlet.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/docs/dirichlet.pdf
--------------------------------------------------------------------------------
/聚类/LDA/imgs/1.1.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/imgs/1.1.1.png
--------------------------------------------------------------------------------
/聚类/LDA/imgs/1.1.2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/imgs/1.1.2.png
--------------------------------------------------------------------------------
/聚类/LDA/imgs/1.1.3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/imgs/1.1.3.png
--------------------------------------------------------------------------------
/聚类/LDA/imgs/1.2.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/imgs/1.2.1.png
--------------------------------------------------------------------------------
/聚类/LDA/imgs/1.2.2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/imgs/1.2.2.png
--------------------------------------------------------------------------------
/聚类/LDA/imgs/1.3.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/imgs/1.3.1.png
--------------------------------------------------------------------------------
/聚类/LDA/imgs/1.3.2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/imgs/1.3.2.png
--------------------------------------------------------------------------------
/聚类/LDA/imgs/1.4.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/imgs/1.4.1.png
--------------------------------------------------------------------------------
/聚类/LDA/imgs/1.4.2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/imgs/1.4.2.png
--------------------------------------------------------------------------------
/聚类/LDA/imgs/1.5.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/imgs/1.5.1.png
--------------------------------------------------------------------------------
/聚类/LDA/imgs/1.5.2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/imgs/1.5.2.png
--------------------------------------------------------------------------------
/聚类/LDA/imgs/1.5.3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/imgs/1.5.3.png
--------------------------------------------------------------------------------
/聚类/LDA/imgs/1.5.4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/imgs/1.5.4.png
--------------------------------------------------------------------------------
/聚类/LDA/imgs/1.5.5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/imgs/1.5.5.png
--------------------------------------------------------------------------------
/聚类/LDA/imgs/1.5.6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/imgs/1.5.6.png
--------------------------------------------------------------------------------
/聚类/LDA/imgs/1.5.7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/imgs/1.5.7.png
--------------------------------------------------------------------------------
/聚类/LDA/imgs/1.6.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/imgs/1.6.1.png
--------------------------------------------------------------------------------
/聚类/LDA/imgs/1.6.2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/imgs/1.6.2.png
--------------------------------------------------------------------------------
/聚类/LDA/imgs/1.6.3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/imgs/1.6.3.png
--------------------------------------------------------------------------------
/聚类/LDA/imgs/1.6.4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/imgs/1.6.4.png
--------------------------------------------------------------------------------
/聚类/LDA/imgs/1.6.5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/imgs/1.6.5.png
--------------------------------------------------------------------------------
/聚类/LDA/imgs/1.6.6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/imgs/1.6.6.png
--------------------------------------------------------------------------------
/聚类/LDA/imgs/1.6.7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/imgs/1.6.7.png
--------------------------------------------------------------------------------
/聚类/LDA/imgs/1.6.8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/imgs/1.6.8.png
--------------------------------------------------------------------------------
/聚类/LDA/imgs/1.7.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/imgs/1.7.1.png
--------------------------------------------------------------------------------
/聚类/LDA/imgs/1.7.2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/imgs/1.7.2.png
--------------------------------------------------------------------------------
/聚类/LDA/imgs/1.7.3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/imgs/1.7.3.png
--------------------------------------------------------------------------------
/聚类/LDA/imgs/1.7.4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/imgs/1.7.4.png
--------------------------------------------------------------------------------
/聚类/LDA/imgs/2.1.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/imgs/2.1.1.png
--------------------------------------------------------------------------------
/聚类/LDA/imgs/2.1.2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/imgs/2.1.2.png
--------------------------------------------------------------------------------
/聚类/LDA/imgs/2.2.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/imgs/2.2.1.png
--------------------------------------------------------------------------------
/聚类/LDA/imgs/2.2.2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/imgs/2.2.2.png
--------------------------------------------------------------------------------
/聚类/LDA/imgs/2.3.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/imgs/2.3.1.png
--------------------------------------------------------------------------------
/聚类/LDA/imgs/2.3.2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/imgs/2.3.2.png
--------------------------------------------------------------------------------
/聚类/LDA/imgs/2.3.3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/imgs/2.3.3.png
--------------------------------------------------------------------------------
/聚类/LDA/imgs/2.3.4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/imgs/2.3.4.png
--------------------------------------------------------------------------------
/聚类/LDA/imgs/2.3.5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/imgs/2.3.5.png
--------------------------------------------------------------------------------
/聚类/LDA/imgs/3.1.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/imgs/3.1.1.png
--------------------------------------------------------------------------------
/聚类/LDA/imgs/3.1.2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/imgs/3.1.2.png
--------------------------------------------------------------------------------
/聚类/LDA/imgs/3.1.3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/imgs/3.1.3.png
--------------------------------------------------------------------------------
/聚类/LDA/imgs/3.1.4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/imgs/3.1.4.png
--------------------------------------------------------------------------------
/聚类/LDA/imgs/3.1.5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/imgs/3.1.5.png
--------------------------------------------------------------------------------
/聚类/LDA/imgs/3.1.6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/imgs/3.1.6.png
--------------------------------------------------------------------------------
/聚类/LDA/imgs/3.1.7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/imgs/3.1.7.png
--------------------------------------------------------------------------------
/聚类/LDA/imgs/3.2.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/imgs/3.2.1.png
--------------------------------------------------------------------------------
/聚类/LDA/imgs/3.2.2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/imgs/3.2.2.png
--------------------------------------------------------------------------------
/聚类/LDA/imgs/3.2.3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/imgs/3.2.3.png
--------------------------------------------------------------------------------
/聚类/LDA/imgs/3.2.4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/imgs/3.2.4.png
--------------------------------------------------------------------------------
/聚类/LDA/imgs/3.2.5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/imgs/3.2.5.png
--------------------------------------------------------------------------------
/聚类/LDA/imgs/3.2.6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/imgs/3.2.6.png
--------------------------------------------------------------------------------
/聚类/LDA/imgs/3.2.7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/imgs/3.2.7.png
--------------------------------------------------------------------------------
/聚类/LDA/imgs/3.2.8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/imgs/3.2.8.png
--------------------------------------------------------------------------------
/聚类/LDA/imgs/3.2.9.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/imgs/3.2.9.png
--------------------------------------------------------------------------------
/聚类/LDA/imgs/3.3.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/imgs/3.3.1.png
--------------------------------------------------------------------------------
/聚类/LDA/imgs/3.3.2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/imgs/3.3.2.png
--------------------------------------------------------------------------------
/聚类/LDA/imgs/3.3.3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/imgs/3.3.3.png
--------------------------------------------------------------------------------
/聚类/LDA/imgs/3.3.4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/imgs/3.3.4.png
--------------------------------------------------------------------------------
/聚类/LDA/imgs/3.3.5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/imgs/3.3.5.png
--------------------------------------------------------------------------------
/聚类/LDA/imgs/3.3.6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/imgs/3.3.6.png
--------------------------------------------------------------------------------
/聚类/LDA/imgs/3.3.7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/imgs/3.3.7.png
--------------------------------------------------------------------------------
/聚类/LDA/imgs/LDA.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/imgs/LDA.png
--------------------------------------------------------------------------------
/聚类/LDA/imgs/alg1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/imgs/alg1.png
--------------------------------------------------------------------------------
/聚类/LDA/imgs/alg2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/imgs/alg2.png
--------------------------------------------------------------------------------
/聚类/LDA/imgs/docs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/imgs/docs.png
--------------------------------------------------------------------------------
/聚类/LDA/imgs/question1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/imgs/question1.png
--------------------------------------------------------------------------------
/聚类/LDA/imgs/question2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/imgs/question2.png
--------------------------------------------------------------------------------
/聚类/LDA/imgs/question3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/imgs/question3.png
--------------------------------------------------------------------------------
/聚类/LDA/imgs/question4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/imgs/question4.png
--------------------------------------------------------------------------------
/聚类/LDA/imgs/topic_words.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/LDA/imgs/topic_words.png
--------------------------------------------------------------------------------
/聚类/PIC/imgs/PIC.1.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/PIC/imgs/PIC.1.1.png
--------------------------------------------------------------------------------
/聚类/PIC/imgs/PIC.1.2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/PIC/imgs/PIC.1.2.png
--------------------------------------------------------------------------------
/聚类/PIC/imgs/PIC.1.3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/PIC/imgs/PIC.1.3.png
--------------------------------------------------------------------------------
/聚类/PIC/imgs/PIC.1.4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/PIC/imgs/PIC.1.4.png
--------------------------------------------------------------------------------
/聚类/PIC/imgs/PIC.1.5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/PIC/imgs/PIC.1.5.png
--------------------------------------------------------------------------------
/聚类/PIC/imgs/PIC.1.6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/PIC/imgs/PIC.1.6.png
--------------------------------------------------------------------------------
/聚类/PIC/papers/Power Iteration Clustering.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/PIC/papers/Power Iteration Clustering.pdf
--------------------------------------------------------------------------------
/聚类/bis-k-means/imgs/dis-k-means.1.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/bis-k-means/imgs/dis-k-means.1.1.png
--------------------------------------------------------------------------------
/聚类/bis-k-means/imgs/dis-k-means.1.2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/bis-k-means/imgs/dis-k-means.1.2.png
--------------------------------------------------------------------------------
/聚类/bis-k-means/papers/A Comparison of Document Clustering Techniques.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/bis-k-means/papers/A Comparison of Document Clustering Techniques.pdf
--------------------------------------------------------------------------------
/聚类/gaussian-mixture/imgs/1.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/gaussian-mixture/imgs/1.1.png
--------------------------------------------------------------------------------
/聚类/gaussian-mixture/imgs/1.2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/gaussian-mixture/imgs/1.2.png
--------------------------------------------------------------------------------
/聚类/gaussian-mixture/imgs/1.3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/gaussian-mixture/imgs/1.3.png
--------------------------------------------------------------------------------
/聚类/gaussian-mixture/imgs/1.4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/gaussian-mixture/imgs/1.4.png
--------------------------------------------------------------------------------
/聚类/gaussian-mixture/imgs/1.5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/gaussian-mixture/imgs/1.5.png
--------------------------------------------------------------------------------
/聚类/gaussian-mixture/imgs/1.6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/gaussian-mixture/imgs/1.6.png
--------------------------------------------------------------------------------
/聚类/gaussian-mixture/imgs/1.7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/gaussian-mixture/imgs/1.7.png
--------------------------------------------------------------------------------
/聚类/gaussian-mixture/imgs/1.8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/gaussian-mixture/imgs/1.8.png
--------------------------------------------------------------------------------
/聚类/gaussian-mixture/imgs/1.9.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/gaussian-mixture/imgs/1.9.png
--------------------------------------------------------------------------------
/聚类/k-means/imgs/1.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/k-means/imgs/1.1.png
--------------------------------------------------------------------------------
/聚类/k-means/imgs/math.1.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/k-means/imgs/math.1.1.png
--------------------------------------------------------------------------------
/聚类/k-means/imgs/math.1.2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/k-means/imgs/math.1.2.png
--------------------------------------------------------------------------------
/聚类/k-means/imgs/math.1.3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/k-means/imgs/math.1.3.png
--------------------------------------------------------------------------------
/聚类/k-means/papers/Scalable K-Means++.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/k-means/papers/Scalable K-Means++.pdf
--------------------------------------------------------------------------------
/聚类/k-means/papers/k-means++ The Advantages of Careful Seeding.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/k-means/papers/k-means++ The Advantages of Careful Seeding.pdf
--------------------------------------------------------------------------------
/聚类/readme.md:
--------------------------------------------------------------------------------
1 | # Clustering
2 |
3 | [Clustering](https://en.wikipedia.org/wiki/Cluster_analysis) is an unsupervised learning problem whose goal is to group similar subsets of entities together based on some notion of similarity. Clustering is often used for exploratory analysis or as a component of a hierarchical supervised-learning pipeline.
4 | The `spark.mllib` package supports the following models.
5 |
6 | * [The k-means algorithm](k-means/k-means.md)
7 | * [GMM (Gaussian mixture model)](gaussian-mixture/gaussian-mixture.md)
8 | * [PIC (power iteration clustering)](PIC/pic.md)
9 | * [LDA (latent Dirichlet allocation)](LDA/lda.md)
10 | * [Bisecting k-means](bis-k-means/bisecting-k-means.md)
11 | * [Streaming k-means](streaming-k-means/streaming-k-means.md)
12 |
--------------------------------------------------------------------------------
/聚类/streaming-k-means/imgs/streaming-k-means.1.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/streaming-k-means/imgs/streaming-k-means.1.1.png
--------------------------------------------------------------------------------
/聚类/streaming-k-means/imgs/streaming-k-means.1.2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/聚类/streaming-k-means/imgs/streaming-k-means.1.2.png
--------------------------------------------------------------------------------
/聚类/streaming-k-means/streaming-k-means.md:
--------------------------------------------------------------------------------
1 | # The streaming `k-means` algorithm
2 |
3 | When data arrive in a stream, we may want to estimate the clusters dynamically, updating them as new data come in. `spark.mllib` supports streaming `k-means` clustering, with parameters to control the decay (`decay`, or "forgetfulness") of the estimates.
4 | The algorithm uses a generalized mini-batch update rule to update the clusters.
5 |
6 | ## 1 How streaming `k-means` works
7 |
8 | For each new batch of data, we first assign the points to their nearest clusters, then compute the new centers from the batch, and finally update each cluster. The formulas used are shown below:
9 |
10 | 
11 |
12 | 
13 |
14 | In the formulas above, $c_{t}$ is the previous center of the cluster and $n_{t}$ is the number of points assigned to the cluster so far,
15 | while $x_{t}$ is the center computed from the current batch and $m_{t}$ is the number of points added to the cluster in the current batch.
16 | When new data are evaluated, the decay factor `alpha` acts as a discount applied to the contribution of the past estimate: when `alpha` equals 1, all batches are weighted equally from the beginning; when `alpha` equals 0, the centers are determined entirely by the current data.
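Written out explicitly (this is the update rule as given in the Spark documentation, reproduced here in LaTeX for readability):

$$c_{t+1} = \frac{c_{t}\,n_{t}\,\alpha + x_{t}\,m_{t}}{n_{t}\,\alpha + m_{t}}, \qquad n_{t+1} = n_{t} + m_{t}$$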
17 |
18 | The decay factor `alpha` can also be specified through a `halfLife` parameter together with a time unit (`time unit`), where the time unit is either a batch of data or a single data point. If data arrive at time `t` and the `halfLife` is `h`,
19 | then at time `t+h` the discount (`discount`) applied to the data from time `t` is 0.5.
20 |
21 | The streaming `k-means` algorithm proceeds as follows:
22 |
23 | - (1) Assign new data points to their nearest clusters;
24 |
25 | - (2) Compute the discount (`discount`) from the time unit (`time unit`) and update the cluster weights;
26 |
27 | - (3) Apply the update rule;
28 |
29 | - (4) After applying the update rule, some clusters may be dying; if so, split the largest cluster into two.
30 |
31 | ## 2 Source code analysis of streaming `k-means`
32 |
33 | Before analyzing the source step by step, let us look at what the `StreamingKMeans` parameters mean.
34 |
35 | ```scala
36 | class StreamingKMeans(
37 | var k: Int, // number of clusters
38 | var decayFactor: Double, // decay factor
39 | var timeUnit: String // time unit
40 | )
41 | ```
42 | In the definition above, `k` is the number of clusters, `decayFactor` is the decay factor used to compute the discount, and `timeUnit` is the time unit, which is either a batch of data (`StreamingKMeans.BATCHES`) or a single point (`StreamingKMeans.POINTS`).
43 |
44 | Since we are working with streaming data, the model must be initialized before the stream starts. There are two ways to do this: specify the initial centers and cluster weights directly, or initialize them randomly.
45 |
46 | ```scala
47 | // initialize centers and cluster weights directly
48 | def setInitialCenters(centers: Array[Vector], weights: Array[Double]): this.type = {
49 | model = new StreamingKMeansModel(centers, weights)
50 | this
51 | }
52 | // initialize centers and cluster weights randomly
53 | def setRandomCenters(dim: Int, weight: Double, seed: Long = Utils.random.nextLong): this.type = {
54 | val random = new XORShiftRandom(seed)
55 | val centers = Array.fill(k)(Vectors.dense(Array.fill(dim)(random.nextGaussian())))
56 | val weights = Array.fill(k)(weight)
57 | model = new StreamingKMeansModel(centers, weights)
58 | this
59 | }
60 | ```
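Before looking at the update logic, here is a minimal, hypothetical usage sketch built on the public `StreamingKMeans` API; the `DStream`s `trainingData` and `testData` are placeholders, following the convention of the other examples in this document:

```scala
import org.apache.spark.mllib.clustering.StreamingKMeans
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.streaming.dstream.DStream

val trainingData: DStream[Vector] = ...      // stream of feature vectors
val testData: DStream[LabeledPoint] = ...    // stream of labeled points

val model = new StreamingKMeans()
  .setK(3)                                   // number of clusters
  .setDecayFactor(1.0)                       // weight all batches equally
  .setRandomCenters(dim = 2, weight = 0.0)   // random initial centers in 2 dimensions

model.trainOn(trainingData)                  // update the centers with every new batch
model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print()
```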
61 | Once the centers and cluster weights are initialized, each newly arrived batch is used to modify the centers and weights through the update rule, adjusting the clustering. The update is implemented in the `update` method, which we analyze step by step below.
62 |
63 | - (1) Assign the new data to their nearest clusters, and compute each cluster's updated vector sum and point count
64 |
65 | ```scala
66 | // assign each point to its closest cluster
67 | val closest = data.map(point => (this.predict(point), (point, 1L)))
68 | def predict(point: Vector): Int = {
69 | // return the index of the center closest to the given point
70 | KMeans.findClosest(clusterCentersWithNorm, new VectorWithNorm(point))._1
71 | }
72 | // merge the per-cluster vector sums and point counts
73 | val mergeContribs: ((Vector, Long), (Vector, Long)) => (Vector, Long) = (p1, p2) => {
74 | // y += a * x, vector addition
75 | BLAS.axpy(1.0, p2._1, p1._1)
76 | (p1._1, p1._2 + p2._2)
77 | }
78 | val pointStats: Array[(Int, (Vector, Long))] = closest
79 | .aggregateByKey((Vectors.zeros(dim), 0L))(mergeContribs, mergeContribs)
80 | .collect()
81 | ```
82 |
83 | - (2) Compute the discount and apply it to the cluster weights
84 |
85 | ```scala
86 | // discount
87 | val discount = timeUnit match {
88 | case StreamingKMeans.BATCHES => decayFactor
89 | case StreamingKMeans.POINTS =>
90 | // total number of newly added points
91 | val numNewPoints = pointStats.view.map { case (_, (_, n)) =>
92 | n
93 | }.sum
94 | // x^y
95 | math.pow(decayFactor, numNewPoints)
96 | }
97 | // apply the discount to the weights
98 | //x = a * x
99 | BLAS.scal(discount, Vectors.dense(clusterWeights))
100 | ```
101 | The code above derives the discount from the time unit. When the time unit is `StreamingKMeans.BATCHES`, the discount is simply the decay factor; when it is `StreamingKMeans.POINTS`, the discount is determined jointly by the number of new points `n` and the decay factor `decay`:
102 | the discount is `decay` multiplied by itself `n` times, i.e. `decay^n` (for example, `decay = 0.9` with 10 new points gives a discount of roughly 0.35).
103 |
104 | - (3) Apply the update rule
105 |
106 | ```scala
107 | // apply the update rule
108 | pointStats.foreach { case (label, (sum, count)) =>
109 | // current center of this cluster
110 | val centroid = clusterCenters(label)
111 | // update the cluster weight
112 | val updatedWeight = clusterWeights(label) + count
113 | val lambda = count / math.max(updatedWeight, 1e-16)
114 | clusterWeights(label) = updatedWeight
115 | // x = a * x, i.e. centroid = (1 - lambda) * centroid
116 | BLAS.scal(1.0 - lambda, centroid)
117 | // y += a * x, i.e. centroid += sum * lambda / count
118 | BLAS.axpy(lambda / count, sum, centroid)
119 | }
120 | ```
121 | For each cluster, the code above first updates the cluster weight: the new weight is the old weight plus the number of newly added points. It then computes `lambda`, the ratio of the number of new points to the updated weight, and uses it to update the center.
122 | If the center before the update is `c1` and after the update is `c2`, then `c2 = (1 - lambda) * c1 + lambda * (sum / count)`, where `sum / count` is the mean of the new points.
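This is consistent with the update rule from section 1: with `alpha = 1` (the `BATCHES` case with `decayFactor = 1`), writing $\lambda = m_{t}/(n_{t}+m_{t})$ gives

$$c_{t+1} = (1-\lambda)\,c_{t} + \lambda\,x_{t} = \frac{c_{t}\,n_{t} + x_{t}\,m_{t}}{n_{t} + m_{t}},$$

which is exactly the weighted-average formula above with no decay.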
123 |
124 | - (4) Adjust the clusters with the smallest and largest weights
125 |
126 | ```scala
127 | val weightsWithIndex = clusterWeights.view.zipWithIndex
128 | // cluster with the largest weight
129 | val (maxWeight, largest) = weightsWithIndex.maxBy(_._1)
130 | // cluster with the smallest weight
131 | val (minWeight, smallest) = weightsWithIndex.minBy(_._1)
132 | // if the smallest cluster's weight is negligible relative to the largest (the cluster is dying), split the largest cluster into two, giving both clusters the average of the two weights
133 | if (minWeight < 1e-8 * maxWeight) {
134 | logInfo(s"Cluster $smallest is dying. Split the largest cluster $largest into two.")
135 | val weight = (maxWeight + minWeight) / 2.0
136 | clusterWeights(largest) = weight
137 | clusterWeights(smallest) = weight
138 | val largestClusterCenter = clusterCenters(largest)
139 | val smallestClusterCenter = clusterCenters(smallest)
140 | var j = 0
141 | while (j < dim) {
142 | val x = largestClusterCenter(j)
143 | val p = 1e-14 * math.max(math.abs(x), 1.0)
144 | largestClusterCenter.toBreeze(j) = x + p
145 | smallestClusterCenter.toBreeze(j) = x - p
146 | j += 1
147 | }
148 | }
149 | ```
--------------------------------------------------------------------------------
/降维/EVD/evd.md:
--------------------------------------------------------------------------------
1 | # Eigenvalue decomposition
2 |
3 | Suppose the vector `v` is an eigenvector of the square matrix `A`; this can be written in the following form:
4 |
5 | 
6 |
7 | Here `lambda` is the eigenvalue corresponding to the eigenvector `v`. For a symmetric matrix, a full set of eigenvectors can be chosen to be mutually orthogonal. Eigenvalue decomposition factorizes a matrix into the following form:
8 |
9 | 
10 |
11 | Here `Q` is the matrix whose columns are the eigenvectors of `A`, and `sigma` is a diagonal matrix whose diagonal entries are the eigenvalues.
12 |
13 | Eigenvalue decomposition is a good way to extract the characteristics of a matrix, but it only applies to square matrices. For non-square matrices we need singular value decomposition instead.
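To make the decomposition concrete, here is a minimal sketch using Breeze (a library MLlib already depends on) on a small symmetric matrix; this is only an illustration, not the MLlib code path, which uses ARPACK as shown in the next section:

```scala
import breeze.linalg.{DenseMatrix, DenseVector, diag, eigSym}

// a small symmetric matrix A
val A = DenseMatrix((2.0, 1.0), (1.0, 2.0))

val es = eigSym(A)
val lambda: DenseVector[Double] = es.eigenvalues   // eigenvalues (1.0 and 3.0 here)
val q: DenseMatrix[Double] = es.eigenvectors       // columns are the corresponding eigenvectors

// reconstruction: A ≈ Q * diag(lambda) * Q^T
val reconstructed = q * diag(lambda) * q.t
```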
14 |
15 | ## 1 Source code analysis
16 |
17 | `MLlib` uses `ARPACK` to solve the symmetric eigenvalue problem. The implementation is as follows.
18 |
19 | ```scala
20 | def symmetricEigs(
21 | mul: BDV[Double] => BDV[Double],
22 | n: Int,
23 | k: Int,
24 | tol: Double,
25 | maxIterations: Int): (BDV[Double], BDM[Double]) = {
26 | val arpack = ARPACK.getInstance()
27 | // tolerance used in stopping criterion
28 | val tolW = new doubleW(tol)
29 | // number of desired eigenvalues, 0 < nev < n
30 | val nev = new intW(k)
31 | // nev Lanczos vectors are generated in the first iteration
32 | // ncv-nev Lanczos vectors are generated in each subsequent iteration
33 | // ncv must be smaller than n
34 | val ncv = math.min(2 * k, n)
35 | // "I" for standard eigenvalue problem, "G" for generalized eigenvalue problem
36 | val bmat = "I"
37 | // "LM" : compute the NEV largest (in magnitude) eigenvalues
38 | val which = "LM"
39 | var iparam = new Array[Int](11)
40 | // use exact shift in each iteration
41 | iparam(0) = 1
42 | // maximum number of Arnoldi update iterations, or the actual number of iterations on output
43 | iparam(2) = maxIterations
44 | // Mode 1: A*x = lambda*x, A symmetric
45 | iparam(6) = 1
46 |
47 | var ido = new intW(0)
48 | var info = new intW(0)
49 | var resid = new Array[Double](n)
50 | var v = new Array[Double](n * ncv)
51 | var workd = new Array[Double](n * 3)
52 | var workl = new Array[Double](ncv * (ncv + 8))
53 | var ipntr = new Array[Int](11)
54 |
55 | // call ARPACK's reverse communication, first iteration with ido = 0
56 | arpack.dsaupd(ido, bmat, n, which, nev.`val`, tolW, resid, ncv, v, n, iparam, ipntr, workd,
57 | workl, workl.length, info)
58 | val w = BDV(workd)
59 | // ido = 99 : done flag in reverse communication
60 | while (ido.`val` != 99) {
61 | if (ido.`val` != -1 && ido.`val` != 1) {
62 | throw new IllegalStateException("ARPACK returns ido = " + ido.`val` +
63 | " This flag is not compatible with Mode 1: A*x = lambda*x, A symmetric.")
64 | }
65 | // multiply working vector with the matrix
66 | val inputOffset = ipntr(0) - 1
67 | val outputOffset = ipntr(1) - 1
68 | val x = w.slice(inputOffset, inputOffset + n)
69 | val y = w.slice(outputOffset, outputOffset + n)
70 | y := mul(x)
71 | // call ARPACK's reverse communication
72 | arpack.dsaupd(ido, bmat, n, which, nev.`val`, tolW, resid, ncv, v, n, iparam, ipntr,
73 | workd, workl, workl.length, info)
74 | }
75 |
76 | val d = new Array[Double](nev.`val`)
77 | val select = new Array[Boolean](ncv)
78 | // copy the Ritz vectors
79 | val z = java.util.Arrays.copyOfRange(v, 0, nev.`val` * n)
80 |
81 | // call ARPACK's post-processing for eigenvectors
82 | arpack.dseupd(true, "A", select, d, z, n, 0.0, bmat, n, which, nev, tol, resid, ncv, v, n,
83 | iparam, ipntr, workd, workl, workl.length, info)
84 |
85 | // number of computed eigenvalues, might be smaller than k
86 | val computed = iparam(4)
87 |
88 | val eigenPairs = java.util.Arrays.copyOfRange(d, 0, computed).zipWithIndex.map { r =>
89 | (r._1, java.util.Arrays.copyOfRange(z, r._2 * n, r._2 * n + n))
90 | }
91 |
92 | // sort the eigen-pairs in descending order
93 | val sortedEigenPairs = eigenPairs.sortBy(- _._1)
94 |
95 | // copy eigenvectors in descending order of eigenvalues
96 | val sortedU = BDM.zeros[Double](n, computed)
97 | sortedEigenPairs.zipWithIndex.foreach { r =>
98 | val b = r._2 * n
99 | var i = 0
100 | while (i < n) {
101 | sortedU.data(b + i) = r._1._2(i)
102 | i += 1
103 | }
104 | }
105 | (BDV[Double](sortedEigenPairs.map(_._1)), sortedU)
106 | }
107 | ```
108 | The `ARPACK` documentation describes in detail what the `dsaupd` and `dseupd` routines do.
--------------------------------------------------------------------------------
/降维/EVD/imgs/1.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/降维/EVD/imgs/1.1.png
--------------------------------------------------------------------------------
/降维/EVD/imgs/1.2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/降维/EVD/imgs/1.2.png
--------------------------------------------------------------------------------
/降维/PCA/imgs/1.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/降维/PCA/imgs/1.1.png
--------------------------------------------------------------------------------
/降维/PCA/imgs/1.2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/降维/PCA/imgs/1.2.png
--------------------------------------------------------------------------------
/降维/PCA/imgs/1.3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/降维/PCA/imgs/1.3.png
--------------------------------------------------------------------------------
/降维/PCA/imgs/1.4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/降维/PCA/imgs/1.4.png
--------------------------------------------------------------------------------
/降维/PCA/imgs/1.5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/降维/PCA/imgs/1.5.png
--------------------------------------------------------------------------------
/降维/PCA/imgs/1.6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/降维/PCA/imgs/1.6.png
--------------------------------------------------------------------------------
/降维/PCA/imgs/1.7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/降维/PCA/imgs/1.7.png
--------------------------------------------------------------------------------
/降维/PCA/imgs/1.8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/降维/PCA/imgs/1.8.png
--------------------------------------------------------------------------------
/降维/PCA/pca.md:
--------------------------------------------------------------------------------
1 | # Principal component analysis
2 |
3 | ## 1 Principles of principal component analysis
4 |
5 | Principal component analysis (PCA) is the most commonly used dimensionality reduction method. Consider the following question first: for sample points in an orthogonal attribute space, how can a single hyperplane represent all the samples appropriately? Intuitively, if such a hyperplane exists, it should have the following properties.
6 |
7 | - Minimum reconstruction error: every sample point is sufficiently close to the hyperplane
8 |
9 | - Maximum separability: the projections of the sample points onto the hyperplane are as spread out as possible
10 |
11 | Starting from minimum reconstruction error and maximum separability, we can obtain two equivalent derivations of PCA.
12 |
13 | ### 1.1 Minimum reconstruction error
14 |
15 | Assume the sample points have been centered, i.e. the samples sum to zero. Further assume that the new coordinate system obtained after the projection is:
16 |
17 | 
18 |
19 | If we discard some of the coordinates in the new system, reducing the dimension to `d'`, then the projection of sample point $x_{i}$ in the low-dimensional coordinate system is $z_{i}$:
20 |
21 | 
22 |
23 | Here $z_{ij}$ is the coordinate of $x_{i}$ along the `j`-th dimension of the low-dimensional system. Reconstructing $x_{i}$ from $z_{i}$ gives
24 |
25 | 
26 |
27 | Over the whole training set, the distance between the original sample points and the points reconstructed from the projection is
28 |
29 | 
30 |
31 | By the minimum-reconstruction-error principle, minimizing the expression above yields the optimization objective of PCA
32 |
33 | 
34 |
35 | ### 1.2 Maximum separability
36 |
37 | Starting from maximum separability we obtain another interpretation of PCA. The projection of sample point $x_{i}$ onto the hyperplane in the new space is $W^{T}x_{i}$;
38 | if the projections of all sample points are to be as spread out as possible, the variance of the projected points should be maximized. That variance is
39 |
40 | 
41 |
42 | The optimization objective can therefore be written as
43 |
44 | 
45 |
46 | This objective is equivalent to the one above. Applying the method of Lagrange multipliers to it gives
47 |
48 | 
49 |
50 | Hence it suffices to perform an eigenvalue decomposition of the covariance matrix, sort the resulting eigenvalues, and take the eigenvectors corresponding to the top `d'` eigenvalues; these form the solution of PCA.
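For readability, the objective shown in the images above and its solution can be written in the standard form (where $X$ denotes the matrix whose columns are the centered samples):

$$\max_{W}\ \operatorname{tr}\left(W^{T}XX^{T}W\right) \quad \text{s.t.}\quad W^{T}W=I \qquad\Longrightarrow\qquad XX^{T}w_{i}=\lambda_{i}w_{i}$$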
51 |
52 | ## 2 Source code analysis
53 |
54 | ### 2.1 Example
55 |
56 | ```scala
57 | import org.apache.spark.mllib.linalg.Matrix
58 | import org.apache.spark.mllib.linalg.distributed.RowMatrix
59 | val mat: RowMatrix = ...
60 | // Compute the top 10 principal components.
61 | val pc: Matrix = mat.computePrincipalComponents(10) // Principal components are stored in a local dense matrix.
62 | // Project the rows to the linear space spanned by the top 10 principal components.
63 | val projected: RowMatrix = mat.multiply(pc)
64 | ```
65 |
66 | ### 2.2 Implementation
67 |
68 | PCA is implemented in `RowMatrix`. The source is as follows:
69 |
70 | ```scala
71 | def computePrincipalComponents(k: Int): Matrix = {
72 | val n = numCols().toInt
73 | // compute the covariance matrix
74 | val Cov = computeCovariance().toBreeze.asInstanceOf[BDM[Double]]
75 | // eigen-decomposition, via SVD of the symmetric covariance matrix
76 | val brzSvd.SVD(u: BDM[Double], _, _) = brzSvd(Cov)
77 | if (k == n) {
78 | Matrices.dense(n, k, u.data)
79 | } else {
80 | Matrices.dense(n, k, Arrays.copyOfRange(u.data, 0, n * k))
81 | }
82 | }
83 | ```
84 | This code first computes the covariance matrix of the samples and then performs a singular value decomposition with `breeze`'s `svd` method. Because the covariance matrix is symmetric positive semi-definite, its singular value decomposition coincides with its eigenvalue decomposition. The code that computes the covariance is shown below.
85 |
86 | ```scala
87 | def computeCovariance(): Matrix = {
88 | val n = numCols().toInt
89 | checkNumColumns(n)
90 | val (m, mean) = rows.treeAggregate[(Long, BDV[Double])]((0L, BDV.zeros[Double](n)))(
91 | seqOp = (s: (Long, BDV[Double]), v: Vector) => (s._1 + 1L, s._2 += v.toBreeze),
92 | combOp = (s1: (Long, BDV[Double]), s2: (Long, BDV[Double])) =>
93 | (s1._1 + s2._1, s1._2 += s2._2)
94 | )
95 | updateNumRows(m)
96 | mean :/= m.toDouble
97 | // We use the formula Cov(X, Y) = E[X * Y] - E[X] E[Y], which is not accurate if E[X * Y] is
98 | // large but Cov(X, Y) is small, but it is good for sparse computation.
99 | // TODO: find a fast and stable way for sparse data.
100 | val G = computeGramianMatrix().toBreeze.asInstanceOf[BDM[Double]]
101 | var i = 0
102 | var j = 0
103 | val m1 = m - 1.0
104 | var alpha = 0.0
105 | while (i < n) {
106 | alpha = m / m1 * mean(i)
107 | j = i
108 | while (j < n) {
109 | val Gij = G(i, j) / m1 - alpha * mean(j)
110 | G(i, j) = Gij
111 | G(j, i) = Gij
112 | j += 1
113 | }
114 | i += 1
115 | }
116 | Matrices.fromBreeze(G)
117 | }
118 | ```
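The double loop implements the shifted formula mentioned in the comment. With $G=\sum_{i=1}^{m}x_{i}x_{i}^{T}$ the Gramian matrix, $\mu$ the column-mean vector and $m$ the number of rows, the result is the sample covariance

$$\operatorname{Cov} = \frac{G}{m-1} - \frac{m}{m-1}\,\mu\mu^{T},$$

which is exactly `G(i, j) / m1 - alpha * mean(j)` with `alpha = m / m1 * mean(i)` in the loop above.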
119 |
120 | # References
121 |
122 | [1] Zhou Zhihua. Machine Learning.
123 |
124 |
--------------------------------------------------------------------------------
/降维/SVD/imgs/1.10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/降维/SVD/imgs/1.10.png
--------------------------------------------------------------------------------
/降维/SVD/imgs/1.11.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/降维/SVD/imgs/1.11.png
--------------------------------------------------------------------------------
/降维/SVD/imgs/1.3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/降维/SVD/imgs/1.3.png
--------------------------------------------------------------------------------
/降维/SVD/imgs/1.4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/降维/SVD/imgs/1.4.png
--------------------------------------------------------------------------------
/降维/SVD/imgs/1.5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/降维/SVD/imgs/1.5.png
--------------------------------------------------------------------------------
/降维/SVD/imgs/1.6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/降维/SVD/imgs/1.6.png
--------------------------------------------------------------------------------
/降维/SVD/imgs/1.7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/降维/SVD/imgs/1.7.png
--------------------------------------------------------------------------------
/降维/SVD/imgs/1.8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/降维/SVD/imgs/1.8.png
--------------------------------------------------------------------------------
/降维/SVD/imgs/1.9.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jquanlee/spark-ml-analysis/c93e36a3c49b2ae3c25daa4b26a2d9cb3aee8836/降维/SVD/imgs/1.9.png
--------------------------------------------------------------------------------
/降维/SVD/svd.md:
--------------------------------------------------------------------------------
1 | # Singular value decomposition
2 |
3 | ## 1 Singular value decomposition
4 |
5 | From [eigenvalue decomposition](../EVD/evd.md) we know that it only applies to square matrices, and a matrix `A` is not necessarily square. To obtain a square matrix, we can multiply `A` by its transpose, which gives the formula:
6 |
7 | 
8 |
9 | Now suppose we have an `M*N` matrix `A`. Our goal is to find a set of orthogonal basis vectors in the `n`-dimensional space that remain orthogonal after being transformed by `A`. Suppose such an orthogonal basis has been found:
10 |
11 | 
12 |
13 | The matrix `A` maps this orthogonal basis to the following form.
14 |
15 | 
16 |
17 | For the vectors above to also form an orthogonal basis, i.e. to be pairwise orthogonal, the following condition must hold.
18 |
19 | 
20 |
21 | If the orthogonal basis `v` is chosen to be the eigenvectors of $A^{T}A$, then since $A^{T}A$ is symmetric the vectors `v` are pairwise orthogonal, and therefore
22 |
23 | 
24 |
25 | Since the following holds
26 |
27 | 
28 |
29 | we take the unit vectors
30 |
31 | 
32 |
33 | and obtain
34 |
35 | 
36 |
37 | Singular value decomposition is a factorization that applies to any matrix. It has the following form:
38 |
39 | 
40 |
41 | Here `U` is an `M*M` square matrix whose columns are orthogonal, called the left singular vectors (the `u` above). `sigma` is an `M*N` diagonal matrix whose diagonal entries are the singular values. `V` is an `N*N` matrix whose columns are orthogonal, called the right singular vectors (the `v` above).
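Putting the pieces above together, the relationships can be summarized in standard notation, where $\lambda_{i}$ are the eigenvalues of $A^{T}A$:

$$A v_{i} = \sigma_{i} u_{i}, \qquad \sigma_{i} = \sqrt{\lambda_{i}}, \qquad A = U\Sigma V^{T}$$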
42 |
43 |
44 | ## 2 Source code analysis
45 |
46 | `MLlib` implements singular value decomposition in the `RowMatrix` class. Below is an example of using it.
47 |
48 | ```scala
49 | import org.apache.spark.mllib.linalg.Matrix
50 | import org.apache.spark.mllib.linalg.distributed.RowMatrix
51 | import org.apache.spark.mllib.linalg.SingularValueDecomposition
52 | val mat: RowMatrix = ...
53 | // Compute the top 20 singular values and corresponding singular vectors.
54 | val svd: SingularValueDecomposition[RowMatrix, Matrix] = mat.computeSVD(20, computeU = true)
55 | val U: RowMatrix = svd.U // The U factor is a RowMatrix.
56 | val s: Vector = svd.s // The singular values are stored in a local dense vector.
57 | val V: Matrix = svd.V // The V factor is a local dense matrix.
58 | ```
59 |
60 | ### 2.1 Performance
61 |
62 | Assume `n` is smaller than `m`. The singular values and right singular vectors can be obtained from the eigenvalues and eigenvectors of the square matrix $A^{T}A$. The left singular vectors are then computed as $AVS^{-1}$.
63 | The method actually used by `ml` depends on the computational cost.
64 |
65 | - When `n` is small (`n < 100`), or `k` is large relative to `n` (`k > n/2`), we first compute the square matrix $A^{T}A$ and then compute its top eigenvalues and eigenvectors locally on the `driver`. The space complexity is `O(n*n)` and the time complexity is `O(n*n*k)`.
66 |
67 | - Otherwise, we compute $A^{T}Av$ in a distributed fashion and pass it to [ARPACK](http://www.caam.rice.edu/software/ARPACK/), which computes the top eigenvalues and eigenvectors on the `driver`. This requires `O(k)` passes over the data, `O(n)` storage on each `executor`, and `O(nk)` storage on the `driver`.
68 |
69 | ### 2.2 代码实现
70 |
71 | ```scala
72 | def computeSVD(
73 | k: Int,
74 | computeU: Boolean = false,
75 | rCond: Double = 1e-9): SingularValueDecomposition[RowMatrix, Matrix] = {
76 | // maximum number of iterations
77 | val maxIter = math.max(300, k * 3)
78 | // 阈值
79 | val tol = 1e-10
80 | computeSVD(k, computeU, rCond, maxIter, tol, "auto")
81 | }
82 | ```
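
This three-argument overload is the public entry point used in the example in section 2; `rCond` is the relative threshold below which singular values (compared with the largest one) are discarded, as step 3 below shows. A brief usage sketch, reusing the `mat` from the earlier example:

```scala
// a smaller rCond keeps more of the small singular values
val svdDefault = mat.computeSVD(20)                                  // computeU = false, rCond = 1e-9
val svdWithU   = mat.computeSVD(20, computeU = true, rCond = 1e-12)  // also materializes U
```
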
83 | The implementation of `computeSVD(k, computeU, rCond, maxIter, tol, "auto")` proceeds in three steps: choosing the computation mode, the eigenvalue decomposition of $A^{T}A$, and computing `V`, `U`, and `Sigma`.
84 | These three steps are described in turn below.
85 |
86 | - **1** Choose the computation mode
87 |
88 | ```scala
89 | val computeMode = mode match {
90 |   case "auto" =>
91 |     if (k > 5000) {
92 |       logWarning(s"computing svd with k=$k and n=$n, please check necessity")
93 |     }
94 |     if (n < 100 || (k > n / 2 && n <= 15000)) {
95 |       // under these conditions, compute the Gramian matrix first and do the eigendecomposition locally, avoiding extra passes over the data
96 |       if (k < n / 3) {
97 |         SVDMode.LocalARPACK
98 |       } else {
99 |         SVDMode.LocalLAPACK
100 |       }
101 |     } else {
102 |       // distributed implementation
103 |       SVDMode.DistARPACK
104 |     }
105 |   case "local-svd" => SVDMode.LocalLAPACK
106 |   case "local-eigs" => SVDMode.LocalARPACK
107 |   case "dist-eigs" => SVDMode.DistARPACK
108 | }
109 | ```
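
To make the `"auto"` thresholds above concrete, the following standalone sketch (not MLlib code) reproduces the same decision rule; the returned strings match the explicit mode options:

```scala
// standalone sketch of the "auto" rule above
def chooseMode(n: Int, k: Int): String =
  if (n < 100 || (k > n / 2 && n <= 15000)) {
    if (k < n / 3) "local-eigs"   // Gramian on the driver + ARPACK eigendecomposition
    else "local-svd"              // Gramian on the driver + full LAPACK svd
  } else {
    "dist-eigs"                   // distributed Gramian products + ARPACK on the driver
  }

chooseMode(n = 50, k = 10)      // "local-eigs": n is small and k < n / 3
chooseMode(n = 50, k = 30)      // "local-svd":  n is small but k >= n / 3
chooseMode(n = 100000, k = 20)  // "dist-eigs":  an n*n Gramian would not fit on the driver
```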
110 |
111 | - **2** Eigenvalue decomposition
112 |
113 | ```scala
114 | val (sigmaSquares: BDV[Double], u: BDM[Double]) = computeMode match {
115 |   case SVDMode.LocalARPACK =>
116 |     val G = computeGramianMatrix().toBreeze.asInstanceOf[BDM[Double]]
117 |     EigenValueDecomposition.symmetricEigs(v => G * v, n, k, tol, maxIter)
118 |   case SVDMode.LocalLAPACK =>
119 |     // breeze (v0.10) svd latent constraint, 7 * n * n + 4 * n < Int.MaxValue
120 |     val G = computeGramianMatrix().toBreeze.asInstanceOf[BDM[Double]]
121 |     val brzSvd.SVD(uFull: BDM[Double], sigmaSquaresFull: BDV[Double], _) = brzSvd(G)
122 |     (sigmaSquaresFull, uFull)
123 |   case SVDMode.DistARPACK =>
124 |     if (rows.getStorageLevel == StorageLevel.NONE) {
125 |       logWarning("The input data is not directly cached, which may hurt performance if its"
126 |         + " parent RDDs are also uncached.")
127 |     }
128 |     EigenValueDecomposition.symmetricEigs(multiplyGramianMatrixBy, n, k, tol, maxIter)
129 | }
130 | ```
131 | When the computation mode is `SVDMode.LocalARPACK` or `SVDMode.LocalLAPACK`, the program first builds the Gramian matrix $A^{T}A$ and then computes its eigenvalues and eigenvectors.
132 | Building the Gramian matrix needs no further explanation; just note that it cannot handle matrices with more than 65535 columns. Let us look at how the eigenvalues and eigenvectors are obtained in each of the two modes.
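
This column limit stems from how the Gramian is accumulated in the Spark source, namely as a packed upper-triangular array of $n(n+1)/2$ doubles, whose length must fit in a 32-bit `Int`:

$$\frac{65535 \times 65536}{2} = 2\,147\,450\,880 < 2^{31} - 1 = 2\,147\,483\,647.$$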
133 |
134 | In `SVDMode.LocalARPACK` mode, `EigenValueDecomposition.symmetricEigs(v => G * v, n, k, tol, maxIter)` computes the eigenvalues and eigenvectors. In `SVDMode.LocalLAPACK` mode, `breeze`'s `svd` is called directly.
135 |
136 | In `SVDMode.DistARPACK` mode, the Gramian matrix is not computed up front; instead, a different function is passed to `EigenValueDecomposition.symmetricEigs`.
137 |
138 | ```scala
139 | private[mllib] def multiplyGramianMatrixBy(v: BDV[Double]): BDV[Double] = {
140 |   val n = numCols().toInt
141 |   // broadcast v so every executor has a local copy
142 |   val vbr = rows.context.broadcast(v)
143 |   rows.treeAggregate(BDV.zeros[Double](n))(
144 |     seqOp = (U, r) => {
145 |       val rBrz = r.toBreeze
146 |       val a = rBrz.dot(vbr.value)
147 |       rBrz match {
148 |         // axpy: U += a * rBrz
149 |         case _: BDV[_] => brzAxpy(a, rBrz.asInstanceOf[BDV[Double]], U)
150 |         case _: BSV[_] => brzAxpy(a, rBrz.asInstanceOf[BSV[Double]], U)
151 |         case _ => throw new UnsupportedOperationException
152 |       }
153 |       U
154 |     }, combOp = (U1, U2) => U1 += U2)
155 | }
156 | ```
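
The aggregation above works because the Gramian-vector product decomposes into a sum over the rows $r_i$ of $A$:

$$A^{T}Av = A^{T}(Av) = \sum_{i=1}^{m} r_i \,(r_i^{T} v),$$

so each `seqOp` computes the scalar $a = r_i^{T} v$ and adds $a \cdot r_i$ into the running vector `U` via `axpy`, while `combOp` simply sums the partial vectors from different partitions.
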
157 | The eigenvalue decomposition step itself is analyzed in detail in [eigenvalue decomposition](../EVD/evd.md); refer to that article for the details.
158 |
159 | - **3** Compute `U`, `V`, and `Sigma`
160 |
161 | ```scala
162 | // the singular values are the square roots of the eigenvalues of A^T A
163 | val sigmas: BDV[Double] = brzSqrt(sigmaSquares)
164 | val sigma0 = sigmas(0)
165 | val threshold = rCond * sigma0
166 | var i = 0
167 | // sigmas might be shorter than k if some values did not converge,
168 | // so use i < min(k, sigmas.length) instead of i < k.
169 | if (sigmas.length < k) {
170 |   logWarning(s"Requested $k singular values but only found ${sigmas.length} converged.")
171 | }
172 | while (i < math.min(k, sigmas.length) && sigmas(i) >= threshold) {
173 |   i += 1
174 | }
175 | val sk = i
176 | if (sk < k) {
177 |   logWarning(s"Requested $k singular values but only found $sk nonzeros.")
178 | }
179 | // compute s, i.e. Sigma: the sk singular values above the threshold
180 | val s = Vectors.dense(Arrays.copyOfRange(sigmas.data, 0, sk))
181 | // compute V: the first sk eigenvectors of A^T A (right singular vectors)
182 | val V = Matrices.dense(n, sk, Arrays.copyOfRange(u.data, 0, n * sk))
183 | // compute U (in the Spark source this part runs only when computeU is true)
184 | // N = Vk * Sk^{-1}
185 | val N = new BDM[Double](n, sk, Arrays.copyOfRange(u.data, 0, n * sk))
186 | i = 0  // reuse the counter from above; the excerpt flattens the if (computeU) block
187 | var j = 0
188 | while (j < sk) {
189 |   i = 0
190 |   val sigma = sigmas(j)
191 |   while (i < n) {
192 |     // the inverse of a diagonal matrix is the reciprocal of each diagonal entry
193 |     N(i, j) /= sigma
194 |     i += 1
195 |   }
196 |   j += 1
197 | }
198 | // U = A * N
199 | val U = this.multiply(Matrices.fromBreeze(N))
200 | ```
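
As a quick sanity check on the result (a sketch reusing the `svd` value from the usage example in section 2, not part of the MLlib source): multiplying the three factors back together should approximately reproduce the original matrix when `k` captures most of the spectrum.

```scala
import org.apache.spark.mllib.linalg.Matrices
import org.apache.spark.mllib.linalg.distributed.RowMatrix

// A ~ U * diag(s) * V^T ; `svd` comes from mat.computeSVD(20, computeU = true) above
val approx: RowMatrix = svd.U.multiply(Matrices.diag(svd.s)).multiply(svd.V.transpose)
```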
201 |
202 | ## References
203 |
204 | 【1】[强大的矩阵奇异值分解(SVD)及其应用](http://www.cnblogs.com/LeftNotEasy/archive/2011/01/19/svd-and-applications.html)
205 |
206 | 【2】[奇异值分解(SVD)原理详解及推导](http://blog.csdn.net/zhongkejingwang/article/details/43053513)
207 |
208 | 【3】[A Singularly Valuable Decomposition: The SVD of a Matrix](http://www-users.math.umn.edu/~lerman/math5467/svd.pdf)
209 |
210 |
211 |
212 |
213 |
214 |
215 |
216 |
217 |
218 |
--------------------------------------------------------------------------------