├── .gitattributes
├── .gitignore
├── README.md
├── data
├── aco
│ └── input.txt
├── adaboost
│ └── input.txt
├── ann
│ ├── testInput.txt
│ └── trainInput.txt
├── apriori
│ └── testInput.txt
├── bayesnetwork
│ ├── attach.txt
│ └── input.txt
├── birch
│ ├── realData.txt
│ └── testInput.txt
├── cabddcc
│ └── graphData.txt
├── cart
│ └── input.txt
├── cba
│ └── input.txt
├── chameleon
│ └── graphData.txt
├── dbscan
│ └── input.txt
├── em
│ └── input.txt
├── fptree
│ └── testInput.txt
├── gsp
│ └── testInput.txt
├── gspan
│ ├── input.txt
│ └── reallyData.txt
├── hits
│ └── input.txt
├── id3
│ └── input.txt
├── kdtree
│ └── input.txt
├── kmeans
│ └── input.txt
├── knn
│ ├── testInput.txt
│ └── trainInput.txt
├── maze
│ └── mapData.txt
├── msapriori
│ ├── testInput.txt
│ └── testInput2.txt
├── naivebayes
│ └── input.txt
├── pagerank
│ └── input.txt
├── pca
│ ├── Makefile
│ ├── basilevsy.data
│ ├── compressor_1_day_detail.data
│ ├── compressor_per_day_kwh.data
│ └── simple.data
├── prefixspan
│ └── input.txt
├── randomforest
│ └── input.txt
├── roughsets
│ └── input.txt
├── tan
│ └── input.txt
├── viterbi
│ ├── humidity-matrix.txt
│ └── stmatrix.txt
├── pom.xml
├── src
├── main
│ ├── assembly
│ │ └── distribution.xml
│ ├── bin
│ │ └── ctl.sh
│ ├── java
│ │ └── com
│ │ │ └── jusdt
│ │ │ └── datamining
│ │ │ ├── association
│ │ │ └── analysis
│ │ │ │ ├── apriori
│ │ │ │ ├── AprioriCore.java
│ │ │ │ ├── AprioriExample.java
│ │ │ │ └── FrequentItem.java
│ │ │ │ └── fptree
│ │ │ │ ├── FPTreeCore.java
│ │ │ │ ├── FPTreeExample.java
│ │ │ │ └── TreeNode.java
│ │ │ ├── bagging
│ │ │ └── boosting
│ │ │ │ └── adaboost
│ │ │ │ ├── AdaBoostCore.java
│ │ │ │ ├── AdaBoostExample.java
│ │ │ │ └── Point.java
│ │ │ ├── classification
│ │ │ ├── cart
│ │ │ │ ├── AttrNode.java
│ │ │ │ ├── CARTCore.java
│ │ │ │ └── CARTExample.java
│ │ │ ├── id3
│ │ │ │ ├── AttrNode.java
│ │ │ │ ├── DataNode.java
│ │ │ │ ├── ID3Core.java
│ │ │ │ └── ID3Example.java
│ │ │ ├── knn
│ │ │ │ ├── KNNCore.java
│ │ │ │ ├── KNNExample.java
│ │ │ │ └── Sample.java
│ │ │ └── naivebayes
│ │ │ │ ├── NaiveBayesCore.java
│ │ │ │ └── NaiveBayesExample.java
│ │ │ ├── clustering
│ │ │ ├── birch
│ │ │ │ ├── BIRCHCore.java
│ │ │ │ ├── BIRCHExample.java
│ │ │ │ ├── Cluster.java
│ │ │ │ ├── ClusteringFeature.java
│ │ │ │ ├── LeafNode.java
│ │ │ │ └── NonLeafNode.java
│ │ │ └── kmeans
│ │ │ │ ├── KMeansCore.java
│ │ │ │ ├── KMeansExample.java
│ │ │ │ └── Point.java
│ │ │ ├── dimensionality
│ │ │ └── reduction
│ │ │ │ └── pca
│ │ │ │ ├── DataReader.java
│ │ │ │ ├── EVD.java
│ │ │ │ ├── Main.java
│ │ │ │ ├── Matrix.java
│ │ │ │ ├── MatrixException.java
│ │ │ │ ├── MatrixHelper.java
│ │ │ │ ├── PCACore.java
│ │ │ │ ├── PCACoreHandler.java
│ │ │ │ ├── PCAExample.java
│ │ │ │ ├── SVD.java
│ │ │ │ ├── ToeplitzMatrix.java
│ │ │ │ └── TrajectoryMatrix.java
│ │ │ ├── graph
│ │ │ └── gspan
│ │ │ │ ├── DFSCodeTraveler.java
│ │ │ │ ├── Edge.java
│ │ │ │ ├── EdgeFrequency.java
│ │ │ │ ├── GSpanExample.java
│ │ │ │ ├── GSpanTool.java
│ │ │ │ ├── Graph.java
│ │ │ │ ├── GraphCode.java
│ │ │ │ ├── GraphData.java
│ │ │ │ └── SubChildTraveler.java
│ │ │ ├── integrated
│ │ │ └── cba
│ │ │ │ ├── AprioriCore.java
│ │ │ │ ├── CBACore.java
│ │ │ │ ├── CBAExample.java
│ │ │ │ └── FrequentItem.java
│ │ │ ├── link
│ │ │ ├── hits
│ │ │ │ ├── HITSCore.java
│ │ │ │ └── HITSExample.java
│ │ │ └── pagerank
│ │ │ │ ├── PageRankCore.java
│ │ │ │ └── PageRankExample.java
│ │ │ ├── others
│ │ │ ├── aco
│ │ │ │ ├── ACOCore.java
│ │ │ │ ├── ACOExample.java
│ │ │ │ └── Ant.java
│ │ │ ├── bayesnetwork
│ │ │ │ ├── BayesNetWorkCore.java
│ │ │ │ ├── BayesNetWorkExample.java
│ │ │ │ └── Node.java
│ │ │ ├── cabddcc
│ │ │ │ ├── CABDDCCCore.java
│ │ │ │ ├── CABDDCCExample.java
│ │ │ │ ├── Graph.java
│ │ │ │ └── Point.java
│ │ │ ├── chameleon
│ │ │ │ ├── ChameleonCore.java
│ │ │ │ ├── ChameleonExample.java
│ │ │ │ ├── Cluster.java
│ │ │ │ └── Point.java
│ │ │ ├── dbscan
│ │ │ │ ├── DBSCANCore.java
│ │ │ │ ├── DBSCANExample.java
│ │ │ │ └── Point.java
│ │ │ ├── ga
│ │ │ │ ├── GACore.java
│ │ │ │ ├── GAExample.java
│ │ │ │ └── maze
│ │ │ │ │ ├── GAMazeCore.java
│ │ │ │ │ └── GAMazeExample.java
│ │ │ ├── kdtree
│ │ │ │ ├── KDTreeCore.java
│ │ │ │ ├── KDTreeExample.java
│ │ │ │ ├── Point.java
│ │ │ │ ├── Range.java
│ │ │ │ └── TreeNode.java
│ │ │ ├── msapriori
│ │ │ │ ├── FrequentItem.java
│ │ │ │ ├── MSAprioriCore.java
│ │ │ │ └── MSAprioriExample.java
│ │ │ ├── randomforest
│ │ │ │ ├── CARTCore.java
│ │ │ │ ├── DecisionTree.java
│ │ │ │ ├── RandomForestCore.java
│ │ │ │ ├── RandomForestExample.java
│ │ │ │ └── TreeNode.java
│ │ │ ├── tan
│ │ │ │ ├── AttrMutualInfo.java
│ │ │ │ ├── Node.java
│ │ │ │ ├── TANCore.java
│ │ │ │ └── TanExample.java
│ │ │ └── viterbi
│ │ │ │ ├── BaseNames.java
│ │ │ │ ├── ViterbiCore.java
│ │ │ │ └── ViterbiExample.java
│ │ │ ├── roughsets
│ │ │ ├── KnowledgeSystem.java
│ │ │ ├── Record.java
│ │ │ ├── RecordCollection.java
│ │ │ ├── RoughSetsCore.java
│ │ │ └── RoughSetsExample.java
│ │ │ ├── sequential
│ │ │ └── patterns
│ │ │ │ ├── gsp
│ │ │ │ ├── GSPCore.java
│ │ │ │ ├── GSPExample.java
│ │ │ │ ├── ItemSet.java
│ │ │ │ └── Sequence.java
│ │ │ │ └── prefixspan
│ │ │ │ ├── ItemSet.java
│ │ │ │ ├── PrefixSpanCore.java
│ │ │ │ ├── PrefixSpanExample.java
│ │ │ │ └── Sequence.java
│ │ │ └── statistical
│ │ │ └── learning
│ │ │ ├── ann
│ │ │ ├── ANN.java
│ │ │ ├── ANNCore.java
│ │ │ ├── ANNExample.java
│ │ │ ├── ANNModel.java
│ │ │ ├── ANNNode.java
│ │ │ ├── ANNParameter.java
│ │ │ ├── ANNPrintInterface.java
│ │ │ └── ANNProblem.java
│ │ │ └── em
│ │ │ ├── EMCore.java
│ │ │ ├── EMExample.java
│ │ │ └── Point.java
│ └── resources
│ │ └── logback.xml
└── test
│ ├── java
│ └── com
│ │ └── jusdt
│ │ └── datamining
│ │ ├── demo
│ │ └── MainDemo.java
│ │ └── dimensionality
│ │ └── reduction
│ │ └── pca
│ │ └── ToeplitzMatrixTest.java
│ └── resources
│ └── logback-test.xml
└── 需要验收的算法
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 |
4 | # Custom for Visual Studio
5 | *.cs diff=csharp
6 | *.sln merge=union
7 | *.csproj merge=union
8 | *.vbproj merge=union
9 | *.fsproj merge=union
10 | *.dbproj merge=union
11 |
12 | # Standard to msysgit
13 | *.doc diff=astextplain
14 | *.DOC diff=astextplain
15 | *.docx diff=astextplain
16 | *.DOCX diff=astextplain
17 | *.dot diff=astextplain
18 | *.DOT diff=astextplain
19 | *.pdf diff=astextplain
20 | *.PDF diff=astextplain
21 | *.rtf diff=astextplain
22 | *.RTF diff=astextplain
23 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .classpath
2 | .project
3 | .settings/
4 | target/
5 | logs/
6 |
--------------------------------------------------------------------------------
/data/aco/input.txt:
--------------------------------------------------------------------------------
1 | # CityName
2 | 1
3 | 2
4 | 3
5 | 4
6 | # Distance
7 | 1 2 1
8 | 1 3 1.4
9 | 1 4 1
10 | 2 3 1
11 | 2 4 1
12 | 3 4 1
--------------------------------------------------------------------------------
/data/adaboost/input.txt:
--------------------------------------------------------------------------------
1 | 1 5 1
2 | 2 3 1
3 | 3 1 -1
4 | 4 5 -1
5 | 5 6 1
6 | 6 4 -1
7 | 6 7 1
8 | 7 6 1
9 | 8 7 -1
10 | 8 2 -1
--------------------------------------------------------------------------------
/data/ann/testInput.txt:
--------------------------------------------------------------------------------
1 | 18.7,18.9,19.1,19.3,19.6
2 | 18.9,19.1,19.3,19.6,19.9
3 | 19.1,19.3,19.6,19.9,20.2
4 | 19.3,19.6,19.9,20.2,20.6
5 | 19.6,19.9,20.2,20.6,21
6 | 19.9,20.2,20.6,21,21.5
7 | 20.2,20.6,21,21.5,22
--------------------------------------------------------------------------------
/data/ann/trainInput.txt:
--------------------------------------------------------------------------------
1 | 17.6,17.7,17.7,17.7,17.8
2 | 17.7,17.7,17.7,17.8,17.8
3 | 17.7,17.7,17.8,17.8,17.9
4 | 17.7,17.8,17.8,17.9,18
5 | 17.8,17.8,17.9,18,18.1
6 | 17.8,17.9,18,18.1,18.2
7 | 17.9,18,18.1,18.2,18.4
8 | 18,18.1,18.2,18.4,18.6
9 | 18.1,18.2,18.4,18.6,18.7
10 | 18.2,18.4,18.6,18.7,18.9
11 | 18.4,18.6,18.7,18.9,19.1
12 | 18.6,18.7,18.9,19.1,19.3
--------------------------------------------------------------------------------
/data/apriori/testInput.txt:
--------------------------------------------------------------------------------
1 | T1 1 2 5
2 | T2 2 4
3 | T3 2 3
4 | T4 1 2 4
5 | T5 1 3
6 | T6 2 3
7 | T7 1 3
8 | T8 1 2 3 5
9 | T9 1 2 3
--------------------------------------------------------------------------------
/data/bayesnetwork/attach.txt:
--------------------------------------------------------------------------------
1 | B A
2 | E A
3 | A M
4 | A J
--------------------------------------------------------------------------------
/data/bayesnetwork/input.txt:
--------------------------------------------------------------------------------
1 | B E A M J P
2 | y y y y y 0.00012
3 | y y y y n 0.000051
4 | y y y n y 0.000013
5 | y y y n n 0.0000057
6 | y y n y y 0.000000005
7 | y y n y n 0.00000049
8 | y y n n y 0.000000095
9 | y y n n n 0.0000094
10 | y n y y y 0.0058
11 | y n y y n 0.0025
12 | y n y n y 0.00065
13 | y n y n n 0.00028
14 | y n n y y 0.00000029
15 | y n n y n 0.000029
16 | y n n n y 0.0000056
17 | y n n n n 0.00055
18 | n y y y y 0.0036
19 | n y y y n 0.0016
20 | n y y n y 0.0004
21 | n y y n n 0.00017
22 | n y n y y 0.000007
23 | n y n y n 0.00069
24 | n y n n y 0.00013
25 | n y n n n 0.013
26 | n n y y y 0.00061
27 | n n y y n 0.00026
28 | n n y n y 0.000068
29 | n n y n n 0.000029
30 | n n n y y 0.00048
31 | n n n y n 0.048
32 | n n n n y 0.0092
33 | n n n n n 0.91
--------------------------------------------------------------------------------
/data/birch/realData.txt:
--------------------------------------------------------------------------------
1 | 5.1 3.5 1.4 0.2
2 | 4.9 3.0 1.4 0.2
3 | 4.7 3.2 1.3 0.2
4 | 4.6 3.1 1.5 0.2
5 | 5.0 3.6 1.4 0.2
6 | 5.4 3.9 1.7 0.4
7 | 4.6 3.4 1.4 0.3
8 | 5.0 3.4 1.5 0.2
9 | 4.4 2.9 1.4 0.2
10 | 4.9 3.1 1.5 0.1
11 | 5.4 3.7 1.5 0.2
12 | 4.8 3.4 1.6 0.2
13 | 4.8 3.0 1.4 0.1
14 | 4.3 3.0 1.1 0.1
15 | 5.8 4.0 1.2 0.2
16 | 5.7 4.4 1.5 0.4
17 | 5.4 3.9 1.3 0.4
18 | 5.1 3.5 1.4 0.3
19 | 5.7 3.8 1.7 0.3
20 | 5.1 3.8 1.5 0.3
21 | 5.4 3.4 1.7 0.2
22 | 5.1 3.7 1.5 0.4
23 | 4.6 3.6 1.0 0.2
24 | 5.1 3.3 1.7 0.5
25 | 4.8 3.4 1.9 0.2
26 | 5.0 3.0 1.6 0.2
27 | 5.0 3.4 1.6 0.4
28 | 5.2 3.5 1.5 0.2
29 | 5.2 3.4 1.4 0.2
30 | 4.7 3.2 1.6 0.2
31 | 4.8 3.1 1.6 0.2
32 | 5.4 3.4 1.5 0.4
33 | 5.2 4.1 1.5 0.1
34 | 5.5 4.2 1.4 0.2
35 | 4.9 3.1 1.5 0.1
36 | 5.0 3.2 1.2 0.2
37 | 5.5 3.5 1.3 0.2
38 | 4.9 3.1 1.5 0.1
39 | 4.4 3.0 1.3 0.2
40 | 5.1 3.4 1.5 0.2
41 | 5.0 3.5 1.3 0.3
42 | 4.5 2.3 1.3 0.3
43 | 4.4 3.2 1.3 0.2
44 | 5.0 3.5 1.6 0.6
45 | 5.1 3.8 1.9 0.4
46 | 4.8 3.0 1.4 0.3
47 | 5.1 3.8 1.6 0.2
48 | 4.6 3.2 1.4 0.2
49 | 5.3 3.7 1.5 0.2
50 | 5.0 3.3 1.4 0.2
51 | 7.0 3.2 4.7 1.4
52 | 6.4 3.2 4.5 1.5
53 | 6.9 3.1 4.9 1.5
54 | 5.5 2.3 4.0 1.3
55 | 6.5 2.8 4.6 1.5
56 | 5.7 2.8 4.5 1.3
57 | 6.3 3.3 4.7 1.6
58 | 4.9 2.4 3.3 1.0
59 | 6.6 2.9 4.6 1.3
60 | 5.2 2.7 3.9 1.4
61 | 5.0 2.0 3.5 1.0
62 | 5.9 3.0 4.2 1.5
63 | 6.0 2.2 4.0 1.0
64 | 6.1 2.9 4.7 1.4
65 | 5.6 2.9 3.6 1.3
66 | 6.7 3.1 4.4 1.4
67 | 5.6 3.0 4.5 1.5
68 | 5.8 2.7 4.1 1.0
69 | 6.2 2.2 4.5 1.5
70 | 5.6 2.5 3.9 1.1
71 | 5.9 3.2 4.8 1.8
72 | 6.1 2.8 4.0 1.3
73 | 6.3 2.5 4.9 1.5
74 | 6.1 2.8 4.7 1.2
75 | 6.4 2.9 4.3 1.3
76 | 6.6 3.0 4.4 1.4
77 | 6.8 2.8 4.8 1.4
78 | 6.7 3.0 5.0 1.7
79 | 6.0 2.9 4.5 1.5
80 | 5.7 2.6 3.5 1.0
81 | 5.5 2.4 3.8 1.1
82 | 5.5 2.4 3.7 1.0
83 | 5.8 2.7 3.9 1.2
84 | 6.0 2.7 5.1 1.6
85 | 5.4 3.0 4.5 1.5
86 | 6.0 3.4 4.5 1.6
87 | 6.7 3.1 4.7 1.5
88 | 6.3 2.3 4.4 1.3
89 | 5.6 3.0 4.1 1.3
90 | 5.5 2.5 4.0 1.3
91 | 5.5 2.6 4.4 1.2
92 | 6.1 3.0 4.6 1.4
93 | 5.8 2.6 4.0 1.2
94 | 5.0 2.3 3.3 1.0
95 | 5.6 2.7 4.2 1.3
96 | 5.7 3.0 4.2 1.2
97 | 5.7 2.9 4.2 1.3
98 | 6.2 2.9 4.3 1.3
99 | 5.1 2.5 3.0 1.1
100 | 5.7 2.8 4.1 1.3
101 | 6.3 3.3 6.0 2.5
102 | 5.8 2.7 5.1 1.9
103 | 7.1 3.0 5.9 2.1
104 | 6.3 2.9 5.6 1.8
105 | 6.5 3.0 5.8 2.2
106 | 7.6 3.0 6.6 2.1
107 | 4.9 2.5 4.5 1.7
108 | 7.3 2.9 6.3 1.8
109 | 6.7 2.5 5.8 1.8
110 | 7.2 3.6 6.1 2.5
111 | 6.5 3.2 5.1 2.0
112 | 6.4 2.7 5.3 1.9
113 | 6.8 3.0 5.5 2.1
114 | 5.7 2.5 5.0 2.0
115 | 5.8 2.8 5.1 2.4
116 | 6.4 3.2 5.3 2.3
117 | 6.5 3.0 5.5 1.8
118 | 7.7 3.8 6.7 2.2
119 | 7.7 2.6 6.9 2.3
120 | 6.0 2.2 5.0 1.5
121 | 6.9 3.2 5.7 2.3
122 | 5.6 2.8 4.9 2.0
123 | 7.7 2.8 6.7 2.0
124 | 6.3 2.7 4.9 1.8
125 | 6.7 3.3 5.7 2.1
126 | 7.2 3.2 6.0 1.8
127 | 6.2 2.8 4.8 1.8
128 | 6.1 3.0 4.9 1.8
129 | 6.4 2.8 5.6 2.1
130 | 7.2 3.0 5.8 1.6
131 | 7.4 2.8 6.1 1.9
132 | 7.9 3.8 6.4 2.0
133 | 6.4 2.8 5.6 2.2
134 | 6.3 2.8 5.1 1.5
135 | 6.1 2.6 5.6 1.4
136 | 7.7 3.0 6.1 2.3
137 | 6.3 3.4 5.6 2.4
138 | 6.4 3.1 5.5 1.8
139 | 6.0 3.0 4.8 1.8
140 | 6.9 3.1 5.4 2.1
141 | 6.7 3.1 5.6 2.4
142 | 6.9 3.1 5.1 2.3
143 | 5.8 2.7 5.1 1.9
144 | 6.8 3.2 5.9 2.3
145 | 6.7 3.3 5.7 2.5
146 | 6.7 3.0 5.2 2.3
147 | 6.3 2.5 5.0 1.9
148 | 6.5 3.0 5.2 2.0
149 | 6.2 3.4 5.4 2.3
150 | 5.9 3.0 5.1 1.8
--------------------------------------------------------------------------------
/data/birch/testInput.txt:
--------------------------------------------------------------------------------
1 | 5.1 3.5 1.4 0.2
2 | 4.9 3.0 1.4 0.2
3 | 4.7 3.2 1.3 0.8
4 | 4.6 3.1 1.5 0.8
5 | 5.0 3.6 1.8 0.6
6 | 4.7 3.2 1.4 0.8
--------------------------------------------------------------------------------
/data/cabddcc/graphData.txt:
--------------------------------------------------------------------------------
1 | 0 1 12
2 | 1 3 9
3 | 2 3 12
4 | 3 4 10
5 | 4 4 4
6 | 5 4 1
7 | 6 6 1
8 | 7 6 3
9 | 8 6 9
10 | 9 8 3
11 | 10 8 10
12 | 11 9 2
13 | 12 9 11
14 | 13 10 9
15 | 14 11 12
--------------------------------------------------------------------------------
/data/cart/input.txt:
--------------------------------------------------------------------------------
1 | Rid Age Income Student CreditRating BuysComputer
2 | 1 Youth High No Fair No
3 | 2 Youth High No Excellent No
4 | 3 MiddleAged High No Fair Yes
5 | 4 Senior Medium No Fair Yes
6 | 5 Senior Low Yes Fair Yes
7 | 6 Senior Low Yes Excellent No
8 | 7 MiddleAged Low Yes Excellent Yes
9 | 8 Youth Medium No Fair No
10 | 9 Youth Low Yes Fair Yes
11 | 10 Senior Medium Yes Fair Yes
12 | 11 Youth Medium Yes Excellent Yes
13 | 12 MiddleAged Medium No Excellent Yes
14 | 13 MiddleAged High Yes Fair Yes
15 | 14 Senior Medium No Excellent No
--------------------------------------------------------------------------------
/data/cba/input.txt:
--------------------------------------------------------------------------------
1 | Rid Age Income Student CreditRating BuysComputer
2 | 1 13 High No Fair CLassNo
3 | 2 11 High No Excellent CLassNo
4 | 3 25 High No Fair CLassYes
5 | 4 45 Medium No Fair CLassYes
6 | 5 50 Low Yes Fair CLassYes
7 | 6 51 Low Yes Excellent CLassNo
8 | 7 30 Low Yes Excellent CLassYes
9 | 8 13 Medium No Fair CLassNo
10 | 9 9 Low Yes Fair CLassYes
11 | 10 55 Medium Yes Fair CLassYes
12 | 11 14 Medium Yes Excellent CLassYes
13 | 12 33 Medium No Excellent CLassYes
14 | 13 33 High Yes Fair CLassYes
15 | 14 41 Medium No Excellent CLassNo
--------------------------------------------------------------------------------
/data/chameleon/graphData.txt:
--------------------------------------------------------------------------------
1 | 0 2 2
2 | 1 3 1
3 | 2 3 4
4 | 3 3 14
5 | 4 5 3
6 | 5 8 3
7 | 6 8 6
8 | 7 9 8
9 | 8 10 4
10 | 9 10 7
11 | 10 10 10
12 | 11 10 14
13 | 12 11 13
14 | 13 12 8
15 | 14 12 15
16 | 15 14 7
17 | 16 14 9
18 | 17 14 15
19 | 18 15 8
--------------------------------------------------------------------------------
/data/dbscan/input.txt:
--------------------------------------------------------------------------------
1 | 2 2
2 | 3 1
3 | 3 4
4 | 3 14
5 | 5 3
6 | 8 3
7 | 8 6
8 | 9 8
9 | 10 4
10 | 10 7
11 | 10 10
12 | 10 14
13 | 11 13
14 | 12 8
15 | 12 15
16 | 14 7
17 | 14 9
18 | 14 15
19 | 15 8
--------------------------------------------------------------------------------
/data/em/input.txt:
--------------------------------------------------------------------------------
1 | 3 3
2 | 4 10
3 | 9 6
4 | 14 8
5 | 18 11
6 | 21 7
--------------------------------------------------------------------------------
/data/fptree/testInput.txt:
--------------------------------------------------------------------------------
1 | T1 1 2 5
2 | T2 2 4
3 | T3 2 3
4 | T4 1 2 4
5 | T5 1 3
6 | T6 2 3
7 | T7 1 3
8 | T8 1 2 3 5
9 | T9 1 2 3
--------------------------------------------------------------------------------
/data/gsp/testInput.txt:
--------------------------------------------------------------------------------
1 | 1 2 1 5
2 | 1 1 2
3 | 1 1 3
4 | 1 1 4
5 | 2 1 1
6 | 2 1 3
7 | 2 1 4
8 | 2 2 3 5
9 | 3 1 1
10 | 3 1 2
11 | 3 1 3
12 | 3 1 4
13 | 3 1 5
14 | 4 1 1
15 | 4 1 3
16 | 4 1 5
17 | 5 1 4
18 | 5 1 5
--------------------------------------------------------------------------------
/data/gspan/input.txt:
--------------------------------------------------------------------------------
1 | t # 0
2 | v 0 0
3 | v 1 1
4 | v 2 0
5 | v 3 0
6 | v 4 0
7 | v 5 1
8 | e 0 1 0
9 | e 1 2 0
10 | e 1 3 0
11 | e 2 4 0
12 | e 3 5 1
--------------------------------------------------------------------------------
/data/hits/input.txt:
--------------------------------------------------------------------------------
1 | 1 2
2 | 1 3
3 | 2 3
4 | 3 1
--------------------------------------------------------------------------------
/data/id3/input.txt:
--------------------------------------------------------------------------------
1 | Day OutLook Temperature Humidity Wind PlayTennis
2 | 1 Sunny Hot High Weak No
3 | 2 Sunny Hot High Strong No
4 | 3 Overcast Hot High Weak Yes
5 | 4 Rainy Mild High Weak Yes
6 | 5 Rainy Cool Normal Weak Yes
7 | 6 Rainy Cool Normal Strong No
8 | 7 Overcast Cool Normal Strong Yes
9 | 8 Sunny Mild High Weak No
10 | 9 Sunny Cool Normal Weak Yes
11 | 10 Rainy Mild Normal Weak Yes
12 | 11 Sunny Mild Normal Strong Yes
13 | 12 Overcast Mild High Strong Yes
14 | 13 Overcast Hot Normal Weak Yes
15 | 14 Rainy Mild High Strong No
--------------------------------------------------------------------------------
/data/kdtree/input.txt:
--------------------------------------------------------------------------------
1 | 4 7
2 | 5 4
3 | 9 6
4 | 7 2
5 | 2 3
6 | 8 1
--------------------------------------------------------------------------------
/data/kmeans/input.txt:
--------------------------------------------------------------------------------
1 | 3 3
2 | 4 10
3 | 9 6
4 | 14 8
5 | 18 11
6 | 21 7
--------------------------------------------------------------------------------
/data/knn/testInput.txt:
--------------------------------------------------------------------------------
1 | 1 2 3 2 4
2 | 2 3 4 2 1
3 | 8 7 2 3 5
4 | -3 -2 2 4 0
5 | -4 -4 -4 -4 -4
6 | 1 2 3 4 4
7 | 4 4 3 2 1
8 | 3 3 3 2 4
9 | 0 0 1 1 -2
--------------------------------------------------------------------------------
/data/knn/trainInput.txt:
--------------------------------------------------------------------------------
1 | a 1 2 3 4 5
2 | b 5 4 3 2 1
3 | c 3 3 3 3 3
4 | d -3 -3 -3 -3 -3
5 | a 1 2 3 4 4
6 | b 4 4 3 2 1
7 | c 3 3 3 2 4
8 | d 0 0 1 1 -2
9 |
--------------------------------------------------------------------------------
/data/maze/mapData.txt:
--------------------------------------------------------------------------------
1 | 0 0 0 0 0
2 | 2 0 0 -1 0
3 | 0 0 0 0 0
4 | 0 -1 0 0 -1
5 | 0 0 0 0 1
--------------------------------------------------------------------------------
/data/msapriori/testInput.txt:
--------------------------------------------------------------------------------
1 | T1 1 2 5
2 | T2 2 4
3 | T3 2 3
4 | T4 1 2 4
5 | T5 1 3
6 | T6 2 3
7 | T7 1 3
8 | T8 1 2 3 5
9 | T9 1 2 3
--------------------------------------------------------------------------------
/data/msapriori/testInput2.txt:
--------------------------------------------------------------------------------
1 | Rid Age Income Student CreditRating BuysComputer
2 | 1 Youth High No Fair No
3 | 2 Youth High No Excellent No
4 | 3 MiddleAged High No Fair Yes
5 | 4 Senior Medium No Fair Yes
6 | 5 Senior Low Yes Fair Yes
7 | 6 Senior Low Yes Excellent No
8 | 7 MiddleAged Low Yes Excellent Yes
9 | 8 Youth Medium No Fair No
10 | 9 Youth Low Yes Fair Yes
11 | 10 Senior Medium Yes Fair Yes
12 | 11 Youth Medium Yes Excellent Yes
13 | 12 MiddleAged Medium No Excellent Yes
14 | 13 MiddleAged High Yes Fair Yes
15 | 14 Senior Medium No Excellent No
--------------------------------------------------------------------------------
/data/naivebayes/input.txt:
--------------------------------------------------------------------------------
1 | Day OutLook Temperature Humidity Wind PlayTennis
2 | 1 Sunny Hot High Weak No
3 | 2 Sunny Hot High Strong No
4 | 3 Overcast Hot High Weak Yes
5 | 4 Rainy Mild High Weak Yes
6 | 5 Rainy Cool Normal Weak Yes
7 | 6 Rainy Cool Normal Strong No
8 | 7 Overcast Cool Normal Strong Yes
9 | 8 Sunny Mild High Weak No
10 | 9 Sunny Cool Normal Weak Yes
11 | 10 Rainy Mild Normal Weak Yes
12 | 11 Sunny Mild Normal Strong Yes
13 | 12 Overcast Mild High Strong Yes
14 | 13 Overcast Hot Normal Weak Yes
15 | 14 Rainy Mild High Strong No
--------------------------------------------------------------------------------
/data/pagerank/input.txt:
--------------------------------------------------------------------------------
1 | 1 2
2 | 1 3
3 | 2 3
4 | 3 1
--------------------------------------------------------------------------------
/data/pca/Makefile:
--------------------------------------------------------------------------------
1 | INPUTS= simple basilevsy compressor_per_day_kwh compressor_1_day_detail
2 |
3 | all:
4 | @for i in $(INPUTS) ; do \
5 | java -cp ../target/pca-1.0.jar com.uwemeding.pca.Main $$i ; \
6 | done
7 |
8 |
9 | clean:; rm -f *lambda* *pcomps* *pfacs* *_cc* *_cumcon*
10 |
11 |
--------------------------------------------------------------------------------
/data/pca/basilevsy.data:
--------------------------------------------------------------------------------
1 | 335.6
2 | 245.3
3 | 226
4 | 318.5
5 | 450.8
6 | 508.6
7 | 445.7
8 | 445.1
9 | 472.6
10 | 376.
11 | 319.4
12 | 352.2
13 | 408.5
14 | 314.5
15 | 262.0
16 | 287.8
17 | 320.3
18 | 265.1
19 | 224.7
20 | 248
21 | 304.9
22 | 266.3
23 | 276.5
24 | 300.9
25 | 415.6
26 | 341.5
27 | 289.8
28 | 342.1
29 | 465.5
30 | 488.6
31 | 483.2
32 | 566.2
33 | 636.8
34 | 511
35 | 442.7
36 | 456.7
37 | 478.1
38 | 378.1
39 | 334.6
40 | 360.3
41 | 424.7
42 | 336.5
43 | 328.9
44 | 417.2
45 | 493.4
46 | 457.2
47 | 477.5
48 | 571.5
49 | 847.1
50 | 584.4
51 | 514.2
52 | 503.4
53 | 501.7
54 | 402.0
55 | 373
56 | 376.7
57 | 405.7
58 | 340.3
59 | 341.0
60 | 352.3
61 | 366.0
62 | 312.7
63 | 336.7
64 | 549.
65 | 632
66 | 577
67 | 574.7
68 | 612.7
69 | 651.7
70 | 584.7
71 | 577.3
72 | 591.7
73 | 632.3
74 | 562.7
75 | 581.7
76 | 608.7
77 | 662.3
78 | 614.3
79 | 639.3
80 | 643.3
81 | 761.7
82 | 789.7
83 | 887.6
84 | 956.2
85 |
--------------------------------------------------------------------------------
/data/pca/compressor_per_day_kwh.data:
--------------------------------------------------------------------------------
1 | 49.71
2 | 49.71
3 | 66.85
4 | 63
5 | 50.83
6 | 56.32
7 | 72.6
8 | 57.32
9 | 62.59
10 | 63.77
11 | 52.3
12 | 61.13
13 | 51.95
14 | 52.88
15 | 82.31
16 | 78.95
17 | 48.6
18 | 59.39
19 | 53.15
20 | 51.07
21 | 69.49
22 | 59.64
23 | 69.42
24 | 63.53
25 | 45.46
26 | 49.7
27 | 66.45
28 | 59.93
29 | 49.16
30 | 57.46
31 | 73.2
32 | 73.96
33 | 75.21
34 | 69.14
35 | 71.74
36 | 71.56
37 | 65.69
38 | 78.28
39 | 81.58
40 | 79.3
41 | 87.15
42 | 84.37
43 | 64.88
44 | 74.96
45 | 83.11
46 | 79.55
47 | 74.98
48 | 70.58
49 | 51.26
50 | 60.05
51 | 78.74
52 | 66.67
53 | 54.14
54 | 61.11
55 | 79.62
56 | 73.98
57 | 76.75
58 | 70.02
59 | 71.36
60 | 76.58
61 | 88.28
62 | 84.84
63 | 86.02
64 | 83.16
65 | 85.33
66 | 72.7
67 | 86.17
68 | 85.18
69 | 82.75
70 | 68.04
71 | 77.58
72 | 72.35
73 | 54.76
74 | 64.33
75 | 76.18
76 | 63.9
77 | 51.22
78 | 61.4
79 | 80.38
80 | 73.94
81 | 75.65
82 | 67.71
83 | 71.52
84 | 69.39
85 | 82.43
86 | 85.62
87 | 86.32
88 | 84.03
89 | 86.84
90 | 91.22
91 | 74.63
92 | 74.21
93 | 81.11
94 | 73.22
95 | 70.19
96 | 68.52
97 | 50.24
98 | 49.13
99 | 68.15
100 | 65.49
101 | 58.94
102 | 62.41
103 | 78.54
104 | 75.25
105 | 77.07
106 | 85.93
107 | 74.82
108 | 71.71
109 | 83.85
110 | 86.4
111 | 82.57
112 | 79.8
113 | 83.25
114 | 71.62
115 | 80.17
116 | 80.73
117 | 84.12
118 | 79.11
119 | 76.92
120 | 65.89
121 | 52.59
122 | 50.17
123 | 70.89
124 | 67.02
125 | 54.84
126 | 62.24
127 | 80.07
128 | 76.92
129 | 75.2
130 | 69
131 | 69.17
132 | 69.82
133 | 83.9
134 | 82.93
135 | 85.61
136 | 81.07
137 | 82.83
138 | 69.2
139 | 70.42
140 | 67.16
141 | 82.06
142 | 75.12
143 | 75.48
144 | 67.02
145 | 51.69
146 | 63.76
147 | 76.22
148 | 65.95
149 | 49.61
150 | 60.49
151 | 77.93
152 | 67.68
153 | 72.95
154 | 65.82
155 | 50.18
156 | 61.66
157 | 51.29
158 | 50.5
159 | 83.48
160 | 73.63
161 | 60.27
162 | 62.76
163 | 52.41
164 | 52.61
165 | 67.24
166 | 64.84
167 | 72.24
168 | 64.17
169 |
--------------------------------------------------------------------------------
/data/pca/simple.data:
--------------------------------------------------------------------------------
1 | 1
2 | 2
3 | 3
4 | 4
5 | 5
6 | 6
7 |
--------------------------------------------------------------------------------
/data/prefixspan/input.txt:
--------------------------------------------------------------------------------
1 | bd c b ac
2 | bf ce b fg
3 | ah bf a b f
4 | be ce d
5 | a bd b c b ade
--------------------------------------------------------------------------------
/data/randomforest/input.txt:
--------------------------------------------------------------------------------
1 | Rid Age Income Student CreditRating BuysComputer
2 | 1 Youth High No Fair No
3 | 2 Youth High No Excellent No
4 | 3 MiddleAged High No Fair Yes
5 | 4 Senior Medium No Fair Yes
6 | 5 Senior Low Yes Fair Yes
7 | 6 Senior Low Yes Excellent No
8 | 7 MiddleAged Low Yes Excellent Yes
9 | 8 Youth Medium No Fair No
10 | 9 Youth Low Yes Fair Yes
11 | 10 Senior Medium Yes Fair Yes
12 | 11 Youth Medium Yes Excellent Yes
13 | 12 MiddleAged Medium No Excellent Yes
14 | 13 MiddleAged High Yes Fair Yes
15 | 14 Senior Medium No Excellent No
--------------------------------------------------------------------------------
/data/roughsets/input.txt:
--------------------------------------------------------------------------------
1 | Element Color Shape Size Stability
2 | x1 Red Triangle Large Stable
3 | x2 Red Triangle Large Stable
4 | x3 Yellow Circle Small UnStable
5 | x4 Yellow Circle Small UnStable
6 | x5 Blue Rectangle Large Stable
7 | x6 Red Circle Middle UnStable
8 | x7 Blue Circle Small UnStable
9 | x8 Blue Rectangle Middle UnStable
--------------------------------------------------------------------------------
/data/tan/input.txt:
--------------------------------------------------------------------------------
1 | OutLook Temperature Humidity Wind PlayTennis
2 | Sunny Hot High Weak No
3 | Sunny Hot High Strong No
4 | Overcast Hot High Weak Yes
5 | Rainy Mild High Weak Yes
6 | Rainy Cool Normal Weak Yes
7 | Rainy Cool Normal Strong No
8 | Overcast Cool Normal Strong Yes
9 | Sunny Mild High Weak No
10 | Sunny Cool Normal Weak Yes
11 | Rainy Mild Normal Weak Yes
12 | Sunny Mild Normal Strong Yes
13 | Overcast Mild High Strong Yes
14 | Overcast Hot Normal Weak Yes
15 | Rainy Mild High Strong No
--------------------------------------------------------------------------------
/data/viterbi/humidity-matrix.txt:
--------------------------------------------------------------------------------
1 | # Dry Dryish Damp Soggy
2 | Sunny 0.6 0.2 0.15 0.05
3 | Cloudy 0.25 0.25 0.25 0.25
4 | Rainy 0.05 0.10 0.35 0.50
--------------------------------------------------------------------------------
/data/viterbi/stmatrix.txt:
--------------------------------------------------------------------------------
1 | # Sunny Cloudy Rainy
2 | Sunny 0.5 0.375 0.125
3 | Cloudy 0.25 0.125 0.625
4 | Rainy 0.25 0.375 0.375
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
3 | 4.0.0
4 |
5 | com.jusdt
6 | datamining-18algorithms
7 | 1.0.0
8 | DataMining 18 Algorithms
9 |
10 |
11 | UTF-8
12 | 1.1.7
13 |
14 |
15 |
16 |
17 |
18 | info.bbd
19 | common-utils
20 | 1.0.0
21 |
22 |
23 | com.github.jnr
24 | jnr-posix
25 |
26 |
27 |
28 |
29 |
30 | ch.qos.logback
31 | logback-classic
32 | ${logback.version}
33 |
34 |
35 | ch.qos.logback
36 | logback-core
37 | ${logback.version}
38 |
39 |
40 | ch.qos.logback
41 | logback-access
42 | ${logback.version}
43 |
44 |
45 | org.slf4j
46 | slf4j-api
47 | 1.7.21
48 |
49 |
50 |
51 | com.google.guava
52 | guava
53 | 14.0.1
54 |
55 |
56 |
57 | junit
58 | junit
59 | 4.10
60 | test
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 | org.apache.maven.plugins
70 | maven-compiler-plugin
71 | 3.6.1
72 |
73 | true
74 | 1.8
75 | 1.8
76 | UTF-8
77 | 1.8
78 |
79 |
80 |
81 | org.apache.maven.plugins
82 | maven-source-plugin
83 | 3.0.1
84 |
85 |
86 | attach-sources
87 | verify
88 |
89 | jar-no-fork
90 |
91 |
92 |
93 |
94 |
95 | org.apache.maven.plugins
96 | maven-resources-plugin
97 | 3.0.2
98 |
99 | UTF-8
100 |
101 |
102 |
103 |
108 |
109 | org.apache.maven.plugins
110 | maven-assembly-plugin
111 | 2.4
112 |
113 |
114 |
115 | com.jusdt.zcm.mapred.driver.ZcmDriver
116 |
117 |
118 |
119 | jar-with-dependencies
120 |
121 |
122 |
123 |
124 | make-assembly
125 | package
126 |
127 | single
128 |
129 |
130 |
131 |
132 |
133 |
134 |
135 |
136 |
137 | src/main/resources
138 |
139 | *.*
140 |
141 |
142 |
143 |
144 |
145 |
146 |
147 |
148 | ${project.artifactId}-${project.version}
149 |
150 |
151 |
152 |
--------------------------------------------------------------------------------
/src/main/assembly/distribution.xml:
--------------------------------------------------------------------------------
1 |
2 |
6 | distribution
7 |
8 | tar.gz
9 |
10 | ${project.artifactId}
11 |
12 |
13 | src/main/resources
14 |
15 | logback.xml
16 | conf.properties
17 | utils.properties
18 |
19 | /conf
20 | true
21 |
22 |
23 | src/main/bin
24 |
25 | *
26 |
27 | /bin
28 | 0755
29 |
30 |
31 |
32 |
33 | /lib
34 |
35 |
36 |
--------------------------------------------------------------------------------
/src/main/bin/ctl.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | mainClass=com.jusdt.zcm.mapred.driver.ZcmDriver
4 |
5 | # resolve links - $0 may be a softlink
6 | PRG="$0"
7 |
8 | while [ -h "$PRG" ]; do
9 | ls=`ls -ld "$PRG"`
10 | link=`expr "$ls" : '.*-> \(.*\)$'`
11 | if expr "$link" : '/.*' > /dev/null; then
12 | PRG="$link"
13 | else
14 | PRG=`dirname "$PRG"`/"$link"
15 | fi
16 | done
17 |
18 | # Get standard environment variables
19 | PRGDIR=`dirname "$PRG"`
20 |
21 | PROJECT_DIR=`cd "$PRGDIR/.." >/dev/null; pwd`
22 | echo PROJECT_DIR=$PROJECT_DIR
23 |
 24 | CLASSPATH="$CLASSPATH:$PROJECT_DIR/conf"
25 |
26 | for jar in "$PROJECT_DIR/lib"/*.jar; do
27 | CLASSPATH="$CLASSPATH:$jar"
28 | done
29 | echo CLASSPATH=$CLASSPATH
30 |
31 | JVMARGS="${JVMARGS} -Dproject_dir=${PROJECT_DIR} -Djava.net.preferIPv4Stack=true"
32 | echo JVMARGS=$JVMARGS
33 |
34 | usage() {
35 | echo >&2 "usage: $PRG [args]"
36 | echo 'Valid commands: start, stop'
37 | exit 1
38 | }
39 |
40 | start() {
41 | JAVA=${JAVA-'java'}
42 | exec $JAVA $JVMARGS -classpath "$CLASSPATH" $mainClass "$@" &
43 | echo $! > main.pid
44 | }
45 |
46 | stop() {
47 | kill `cat main.pid` > /dev/null
48 | }
49 |
50 | case $1 in
51 | (start)
52 | shift
53 | start $@
54 | ;;
55 | (stop)
56 | stop
57 | ;;
58 | (restart)
59 | stop
60 | shift
61 | start $@
62 | ;;
63 | (*)
64 | echo >&2 "$PRG: error: unknown command '$1'"
65 | usage
66 | ;;
67 | esac
68 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/association/analysis/apriori/AprioriExample.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.association.analysis.apriori;
2 |
3 | /**
4 | * apriori关联规则挖掘算法调用类
5 | */
6 | public class AprioriExample {
7 |
8 | public static void main(String[] args) {
9 | String filePath = "data/apriori/testInput.txt";
10 |
11 | AprioriCore tool = new AprioriCore(filePath, 2);
12 | tool.printAttachRule(0.7);
13 | }
14 |
15 | }
16 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/association/analysis/apriori/FrequentItem.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.association.analysis.apriori;
2 |
3 | /**
4 | * 频繁项集
5 | */
6 | public class FrequentItem implements Comparable {
7 |
8 | // 频繁项集的集合ID
9 | private String[] idArray;
10 | // 频繁项集的支持度计数
11 | private int count;
12 | //频繁项集的长度,1项集或是2项集,亦或是3项集
13 | private int length;
14 |
15 | public FrequentItem(String[] idArray, int count) {
16 | this.idArray = idArray;
17 | this.count = count;
18 | length = idArray.length;
19 | }
20 |
21 | public String[] getIdArray() {
22 | return idArray;
23 | }
24 |
25 | public void setIdArray(String[] idArray) {
26 | this.idArray = idArray;
27 | }
28 |
29 | public int getCount() {
30 | return count;
31 | }
32 |
33 | public void setCount(int count) {
34 | this.count = count;
35 | }
36 |
37 | public int getLength() {
38 | return length;
39 | }
40 |
41 | public void setLength(int length) {
42 | this.length = length;
43 | }
44 |
45 | @Override
46 | public int compareTo(FrequentItem o) {
47 | // TODO Auto-generated method stub
48 | Integer int1 = Integer.parseInt(this.getIdArray()[0]);
49 | Integer int2 = Integer.parseInt(o.getIdArray()[0]);
50 |
51 | return int1.compareTo(int2);
52 | }
53 |
54 | }
55 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/association/analysis/fptree/FPTreeExample.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.association.analysis.fptree;
2 |
3 | /**
4 | * FPTree频繁模式树算法
5 | */
6 | public class FPTreeExample {
7 |
8 | public static void main(String[] args) {
9 | String filePath = "data/fptree/testInput.txt";
10 | //最小支持度阈值
11 | int minSupportCount = 2;
12 |
13 | FPTreeCore tool = new FPTreeCore(filePath, minSupportCount);
14 | tool.startBuildingTree();
15 | }
16 |
17 | }
18 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/association/analysis/fptree/TreeNode.java:
--------------------------------------------------------------------------------
package com.jusdt.datamining.association.analysis.fptree;

import java.util.ArrayList;

/**
 * A node of the FP tree: an item name, its count, and links to parent and
 * children.
 *
 * Fix: declared {@code Comparable<TreeNode>} and typed the child list as
 * {@code ArrayList<TreeNode>} — the original raw {@code Comparable} did not
 * match {@code compareTo(TreeNode)} and failed to compile.
 */
public class TreeNode implements Comparable<TreeNode>, Cloneable {

    // Item (category) name of this node
    private String name;
    // Occurrence count
    private Integer count;
    // Parent node (null for the root)
    private TreeNode parentNode;
    // Child nodes; may be null when the node has no children
    private ArrayList<TreeNode> childNodes;

    public TreeNode(String name, int count) {
        this.name = name;
        this.count = count;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public Integer getCount() {
        return count;
    }

    public void setCount(Integer count) {
        this.count = count;
    }

    public TreeNode getParentNode() {
        return parentNode;
    }

    public void setParentNode(TreeNode parentNode) {
        this.parentNode = parentNode;
    }

    public ArrayList<TreeNode> getChildNodes() {
        return childNodes;
    }

    public void setChildNodes(ArrayList<TreeNode> childNodes) {
        this.childNodes = childNodes;
    }

    /**
     * Descending order by count (higher counts sort first).
     */
    @Override
    public int compareTo(TreeNode o) {
        return o.getCount().compareTo(this.getCount());
    }

    /**
     * Copies this node. The parent chain is cloned recursively; the child
     * list is copied as a new list, but NOTE(review): the child elements
     * themselves are shared with the original (ArrayList.clone is shallow) —
     * confirm callers do not rely on fully independent children.
     */
    @Override
    protected Object clone() throws CloneNotSupportedException {
        TreeNode node = (TreeNode) super.clone();
        if (this.getParentNode() != null) {
            node.setParentNode((TreeNode) this.getParentNode().clone());
        }

        if (this.getChildNodes() != null) {
            @SuppressWarnings("unchecked")
            ArrayList<TreeNode> copiedChildren = (ArrayList<TreeNode>) this.getChildNodes().clone();
            node.setChildNodes(copiedChildren);
        }

        return node;
    }

}
79 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/bagging/boosting/adaboost/AdaBoostExample.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.bagging.boosting.adaboost;
2 |
3 | /**
4 | * AdaBoost提升算法调用类
5 | */
6 | public class AdaBoostExample {
7 |
8 | public static void main(String[] agrs) {
9 | String filePath = "data/adaboost/input.txt";
10 | //误差率阈值
11 | double errorValue = 0.2;
12 |
13 | AdaBoostCore tool = new AdaBoostCore(filePath, errorValue);
14 | tool.adaBoostClassify();
15 | }
16 |
17 | }
18 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/bagging/boosting/adaboost/Point.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.bagging.boosting.adaboost;
2 |
3 | /**
4 | * 坐标点类
5 | */
6 | public class Point {
7 |
8 | // 坐标点x坐标
9 | private int x;
10 | // 坐标点y坐标
11 | private int y;
12 | // 坐标点的分类类别
13 | private int classType;
14 | //如果此节点被划错,他的误差率,不能用个数除以总数,因为不同坐标点的权重不一定相等
15 | private double probably;
16 |
17 | public Point(int x, int y, int classType) {
18 | this.x = x;
19 | this.y = y;
20 | this.classType = classType;
21 | }
22 |
23 | public Point(String x, String y, String classType) {
24 | this.x = Integer.parseInt(x);
25 | this.y = Integer.parseInt(y);
26 | this.classType = Integer.parseInt(classType);
27 | }
28 |
29 | public int getX() {
30 | return x;
31 | }
32 |
33 | public void setX(int x) {
34 | this.x = x;
35 | }
36 |
37 | public int getY() {
38 | return y;
39 | }
40 |
41 | public void setY(int y) {
42 | this.y = y;
43 | }
44 |
45 | public int getClassType() {
46 | return classType;
47 | }
48 |
49 | public void setClassType(int classType) {
50 | this.classType = classType;
51 | }
52 |
53 | public double getProbably() {
54 | return probably;
55 | }
56 |
57 | public void setProbably(double probably) {
58 | this.probably = probably;
59 | }
60 | }
61 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/classification/cart/AttrNode.java:
--------------------------------------------------------------------------------
package com.jusdt.datamining.classification.cart;

import java.util.ArrayList;

/**
 * A node of the CART classification/regression tree.
 */
public class AttrNode {

    // Attribute name this node splits on
    private String attrName;
    // Index label of the node
    private int nodeIndex;
    // Number of leaves under this node
    private int leafNum;
    // Error rate of the node
    private double alpha;
    // Attribute value of the parent split leading to this node
    private String parentAttrValue;
    // Children of this node
    private AttrNode[] childAttrNode;
    // Indices of the data records held at this node
    private ArrayList dataIndex;

    public String getAttrName() {
        return attrName;
    }

    public void setAttrName(String attrName) {
        this.attrName = attrName;
    }

    public int getNodeIndex() {
        return nodeIndex;
    }

    public void setNodeIndex(int nodeIndex) {
        this.nodeIndex = nodeIndex;
    }

    public int getLeafNum() {
        return leafNum;
    }

    public void setLeafNum(int leafNum) {
        this.leafNum = leafNum;
    }

    public double getAlpha() {
        return alpha;
    }

    public void setAlpha(double alpha) {
        this.alpha = alpha;
    }

    public String getParentAttrValue() {
        return parentAttrValue;
    }

    public void setParentAttrValue(String parentAttrValue) {
        this.parentAttrValue = parentAttrValue;
    }

    public AttrNode[] getChildAttrNode() {
        return childAttrNode;
    }

    public void setChildAttrNode(AttrNode[] childAttrNode) {
        this.childAttrNode = childAttrNode;
    }

    public ArrayList getDataIndex() {
        return dataIndex;
    }

    public void setDataIndex(ArrayList dataIndex) {
        this.dataIndex = dataIndex;
    }

}
82 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/classification/cart/CARTExample.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.classification.cart;
2 |
3 | public class CARTExample {
4 |
5 | public static void main(String[] args) {
6 | String filePath = "data/cart/input.txt";
7 |
8 | CARTCore tool = new CARTCore(filePath);
9 |
10 | tool.startBuildingTree();
11 | }
12 |
13 | }
14 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/classification/id3/AttrNode.java:
--------------------------------------------------------------------------------
package com.jusdt.datamining.classification.id3;

import java.util.ArrayList;

/**
 * An internal (non-leaf) attribute node of the ID3 decision tree.
 */
public class AttrNode {

    // Name of the attribute this node represents
    private String attrName;
    // Attribute value of the parent split leading to this node
    private String parentAttrValue;
    // Child attribute nodes
    private AttrNode[] childAttrNode;
    // Indices of the leaf data records below this node
    private ArrayList childDataIndex;

    public String getAttrName() {
        return attrName;
    }

    public void setAttrName(String attrName) {
        this.attrName = attrName;
    }

    public String getParentAttrValue() {
        return parentAttrValue;
    }

    public void setParentAttrValue(String parentAttrValue) {
        this.parentAttrValue = parentAttrValue;
    }

    public AttrNode[] getChildAttrNode() {
        return childAttrNode;
    }

    public void setChildAttrNode(AttrNode[] childAttrNode) {
        this.childAttrNode = childAttrNode;
    }

    public ArrayList getChildDataIndex() {
        return childDataIndex;
    }

    public void setChildDataIndex(ArrayList childDataIndex) {
        this.childDataIndex = childDataIndex;
    }
}
51 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/classification/id3/DataNode.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.classification.id3;
2 |
3 | /**
4 | * 存放数据的叶子节点
5 | */
6 | public class DataNode {
7 |
8 | /**
9 | * 数据的标号
10 | */
11 | private int dataIndex;
12 |
13 | public DataNode(int dataIndex) {
14 | this.dataIndex = dataIndex;
15 | }
16 |
17 | }
18 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/classification/id3/ID3Example.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.classification.id3;
2 |
3 | /**
4 | * ID3决策树分类算法测试场景类
5 | */
6 | public class ID3Example {
7 |
8 | public static void main(String[] args) {
9 | String filePath = "data/id3/input.txt";
10 |
11 | ID3Core tool = new ID3Core(filePath);
12 | tool.startBuildingTree(true);
13 | }
14 |
15 | }
16 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/classification/knn/KNNCore.java:
--------------------------------------------------------------------------------
package com.jusdt.datamining.classification.knn;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;

/**
 * K-nearest-neighbour (KNN) classification tool.
 *
 * Training records have a class label in column 0 followed by integer
 * features; test records contain features only. Each test sample is labelled
 * by a weighted majority vote among its k nearest training samples, ranked
 * by squared Euclidean distance.
 *
 * Fixes vs. original: generic type parameters restored (the raw collections
 * did not compile), the no-op {@code e.getStackTrace()} replaced with
 * {@code printStackTrace()}, and the class-weight lookup guarded against
 * more classes than configured weights.
 */
public class KNNCore {

    // Vote weight per class, in the order classes are discovered in the
    // training file; defaults to four equally-weighted classes.
    public int[] classWeightArray = new int[] { 1, 1, 1, 1 };
    // Path of the test data file
    private String testDataPath;
    // Path of the training data file
    private String trainDataPath;
    // Distinct class labels found in the training data
    private ArrayList<String> classTypes;
    // Classified test samples (the output)
    private ArrayList<Sample> resultSamples;
    // Training samples
    private ArrayList<Sample> trainSamples;
    // Raw training records (label followed by features)
    private String[][] trainData;
    // Raw test records (features only)
    private String[][] testData;

    public KNNCore(String trainDataPath, String testDataPath) {
        this.trainDataPath = trainDataPath;
        this.testDataPath = testDataPath;
        readDataFormFile();
    }

    /**
     * Reads the training and test data sets from their files and collects
     * the distinct class labels from the training data.
     */
    private void readDataFormFile() {
        ArrayList<String[]> tempArray;

        tempArray = fileDataToArray(trainDataPath);
        trainData = new String[tempArray.size()][];
        tempArray.toArray(trainData);

        classTypes = new ArrayList<>();
        for (String[] record : tempArray) {
            // Column 0 of a training record is its class label
            if (!classTypes.contains(record[0])) {
                classTypes.add(record[0]);
            }
        }

        tempArray = fileDataToArray(testDataPath);
        testData = new String[tempArray.size()][];
        tempArray.toArray(testData);
    }

    /**
     * Loads a space-separated data file into a list of records.
     *
     * @param filePath path of the data file
     * @return one String[] per line, split on single spaces
     */
    private ArrayList<String[]> fileDataToArray(String filePath) {
        File file = new File(filePath);
        ArrayList<String[]> dataArray = new ArrayList<>();

        try {
            BufferedReader in = new BufferedReader(new FileReader(file));
            String str;
            while ((str = in.readLine()) != null) {
                dataArray.add(str.split(" "));
            }
            in.close();
        } catch (IOException e) {
            // Best-effort: an unreadable file yields an empty data set
            e.printStackTrace();
        }

        return dataArray;
    }

    /**
     * Computes the squared Euclidean distance between two samples' feature
     * vectors (the square root is omitted: only the ordering matters).
     *
     * @param s1 first sample
     * @param s2 second sample
     * @return squared Euclidean distance
     */
    private int computeEuclideanDistance(Sample s1, Sample s2) {
        String[] f1 = s1.getFeatures();
        String[] f2 = s2.getFeatures();
        int distance = 0;

        for (int i = 0; i < f1.length; i++) {
            int subF1 = Integer.parseInt(f1[i]);
            int subF2 = Integer.parseInt(f2[i]);

            distance += (subF1 - subF2) * (subF1 - subF2);
        }

        return distance;
    }

    /**
     * Classifies every test sample by a weighted vote among its k nearest
     * training samples and prints the result.
     *
     * @param k number of neighbours to consult
     */
    public void knnCompute(int k) {
        resultSamples = new ArrayList<>();
        trainSamples = new ArrayList<>();
        // Votes per class for the sample currently being classified
        HashMap<String, Integer> classCount;
        // Vote weight per class
        HashMap<String, Integer> classWeight = new HashMap<>();

        // Wrap the raw test records as (not yet labelled) result samples
        for (String[] s : testData) {
            resultSamples.add(new Sample(s));
        }

        // Wrap the raw training records, splitting label from features
        for (String[] s : trainData) {
            String className = s[0];
            String[] features = new String[s.length - 1];
            System.arraycopy(s, 1, features, 0, s.length - 1);
            trainSamples.add(new Sample(className, features));
        }

        // The k training samples nearest to the current test sample
        ArrayList<Sample> kNNSample = new ArrayList<>();
        for (Sample s : resultSamples) {
            classCount = new HashMap<>();
            int index = 0;
            for (String type : classTypes) {
                classCount.put(type, 0);
                // Guard: if the training data has more classes than
                // configured weights, fall back to weight 1 instead of
                // throwing ArrayIndexOutOfBoundsException
                int weight = index < classWeightArray.length ? classWeightArray[index] : 1;
                classWeight.put(type, weight);
                index++;
            }
            for (Sample tS : trainSamples) {
                tS.setDistance(computeEuclideanDistance(s, tS));
            }

            Collections.sort(trainSamples);
            kNNSample.clear();
            // Keep only the k nearest training samples
            for (int i = 0; i < trainSamples.size() && i < k; i++) {
                kNNSample.add(trainSamples.get(i));
            }

            // Weighted vote: each neighbour adds its class's weight
            for (Sample neighbour : kNNSample) {
                int num = classCount.get(neighbour.getClassName());
                num += classWeight.get(neighbour.getClassName());
                classCount.put(neighbour.getClassName(), num);
            }

            int maxCount = 0;
            // The class with the most votes wins
            for (Map.Entry<String, Integer> entry : classCount.entrySet()) {
                if (entry.getValue() > maxCount) {
                    maxCount = entry.getValue();
                    s.setClassName(entry.getKey());
                }
            }

            System.out.print("测试数据特征:");
            for (String feature : s.getFeatures()) {
                System.out.print(feature + " ");
            }
            System.out.println("分类:" + s.getClassName());
        }
    }
}
196 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/classification/knn/KNNExample.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.classification.knn;
2 |
3 | /**
4 | * k最近邻算法场景类型
5 | */
6 | public class KNNExample {
7 |
8 | public static void main(String[] args) {
9 | String trainDataPath = "data/knn/trainInput.txt";
10 | String testDataPath = "data/knn/testinput.txt";
11 |
12 | KNNCore tool = new KNNCore(trainDataPath, testDataPath);
13 | tool.knnCompute(3);
14 | }
15 |
16 | }
17 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/classification/knn/Sample.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.classification.knn;
2 |
3 | /**
4 | * 样本数据类
5 | */
6 | public class Sample implements Comparable {
7 |
8 | // 样本数据的分类名称
9 | private String className;
10 | // 样本数据的特征向量
11 | private String[] features;
12 | // 测试样本之间的间距值,以此做排序
13 | private Integer distance;
14 |
15 | public Sample(String[] features) {
16 | this.features = features;
17 | }
18 |
19 | public Sample(String className, String[] features) {
20 | this.className = className;
21 | this.features = features;
22 | }
23 |
24 | public String getClassName() {
25 | return className;
26 | }
27 |
28 | public void setClassName(String className) {
29 | this.className = className;
30 | }
31 |
32 | public String[] getFeatures() {
33 | return features;
34 | }
35 |
36 | public void setFeatures(String[] features) {
37 | this.features = features;
38 | }
39 |
40 | public Integer getDistance() {
41 | return distance;
42 | }
43 |
44 | public void setDistance(int distance) {
45 | this.distance = distance;
46 | }
47 |
48 | @Override
49 | public int compareTo(Sample o) {
50 | return this.getDistance().compareTo(o.getDistance());
51 | }
52 |
53 | }
54 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/classification/naivebayes/NaiveBayesCore.java:
--------------------------------------------------------------------------------
package com.jusdt.datamining.classification.naivebayes;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;

/**
 * Naive Bayes classifier for a binary (Yes/No) class over a space-separated
 * training file whose first line is the attribute header and whose last
 * column is the class label.
 *
 * Fixes vs. original: generic type parameters restored (the raw collections
 * did not compile) and the no-op {@code e.getStackTrace()} replaced with
 * {@code printStackTrace()}.
 */
public class NaiveBayesCore {

    // Class labels; this implementation is strictly binary: Yes / No
    private String YES = "Yes";
    private String NO = "No";

    // Path of the labelled training data file
    private String filePath;
    // Attribute names (the header row of the training file)
    private String[] attrNames;
    // Training records, including the header row at index 0
    private String[][] data;

    // All distinct observed values per attribute, keyed by attribute name
    private HashMap<String, ArrayList<String>> attrValue;

    public NaiveBayesCore(String filePath) {
        this.filePath = filePath;

        readDataFile();
        initAttrValue();
    }

    /**
     * Reads the training data from the file; the first line is the header.
     */
    private void readDataFile() {
        File file = new File(filePath);
        ArrayList<String[]> dataArray = new ArrayList<>();

        try {
            BufferedReader in = new BufferedReader(new FileReader(file));
            String str;
            while ((str = in.readLine()) != null) {
                dataArray.add(str.split(" "));
            }
            in.close();
        } catch (IOException e) {
            // Best-effort: an unreadable file yields an empty data set
            e.printStackTrace();
        }

        data = new String[dataArray.size()][];
        dataArray.toArray(data);
        attrNames = data[0];
    }

    /**
     * Collects, per attribute column (column 1 onward), the distinct values
     * that column takes in the training data.
     */
    private void initAttrValue() {
        attrValue = new HashMap<>();
        ArrayList<String> tempValues;

        // Scan column by column, left to right
        for (int j = 1; j < attrNames.length; j++) {
            // Walk down the column collecting unseen values
            tempValues = new ArrayList<>();
            for (int i = 1; i < data.length; i++) {
                if (!tempValues.contains(data[i][j])) {
                    tempValues.add(data[i][j]);
                }
            }

            // Column done: record its value set under the attribute name
            attrValue.put(data[0][j], tempValues);
        }
    }

    /**
     * P(condition | classType): the probability of the attribute value
     * {@code condition} among records of the given class. A null condition
     * yields the plain class prior P(classType).
     *
     * NOTE(review): no Laplace smoothing — a value unseen within a class
     * yields probability 0 and zeroes out the whole product.
     *
     * @param condition attribute value, or null for the class prior
     * @param classType class label (Yes or No)
     * @return the estimated conditional probability
     */
    private double computeConditionProbably(String condition, String classType) {
        int count = 0;
        int attrIndex = 1;
        // Records labelled Yes
        ArrayList<String[]> yClassData = new ArrayList<>();
        // Records labelled No
        ArrayList<String[]> nClassData = new ArrayList<>();
        ArrayList<String[]> classData;

        // Partition the records by their class label (last column)
        for (int i = 1; i < data.length; i++) {
            if (data[i][attrNames.length - 1].equals(YES)) {
                yClassData.add(data[i]);
            } else {
                nClassData.add(data[i]);
            }
        }

        classData = classType.equals(YES) ? yClassData : nClassData;

        // No condition given: compute the pure class prior
        if (condition == null) {
            return 1.0 * classData.size() / (data.length - 1);
        }

        // Locate the attribute column this condition value belongs to
        attrIndex = getConditionAttrName(condition);

        for (String[] record : classData) {
            if (record[attrIndex].equals(condition)) {
                count++;
            }
        }

        return 1.0 * count / classData.size();
    }

    /**
     * Returns the column index of the attribute that the given condition
     * value belongs to.
     *
     * NOTE(review): the class column is excluded via the hard-coded name
     * "BuysComputer", tying this class to one specific data set — confirm
     * before reusing with other data.
     *
     * @param condition attribute value to look up
     * @return column index of the owning attribute
     */
    private int getConditionAttrName(String condition) {
        // Name of the attribute owning the condition value
        String attrName = "";
        // Column index of that attribute
        int attrIndex = 1;
        ArrayList<String> valueTypes;

        for (Map.Entry<String, ArrayList<String>> entry : attrValue.entrySet()) {
            valueTypes = entry.getValue();
            if (valueTypes.contains(condition) && !entry.getKey().equals("BuysComputer")) {
                attrName = entry.getKey();
            }
        }

        for (int i = 0; i < attrNames.length - 1; i++) {
            if (attrNames[i].equals(attrName)) {
                attrIndex = i;
                break;
            }
        }

        return attrIndex;
    }

    /**
     * Classifies a space-separated feature string with naive Bayes: picks
     * the class maximizing P(X|Ci) * P(Ci), multiplying the per-feature
     * conditionals under the class-conditional independence assumption.
     *
     * @param data space-separated attribute values of the record to classify
     * @return the predicted class label (Yes or No)
     */
    public String naiveBayesClassificate(String data) {
        String[] dataFeatures = data.split(" ");
        // P(X | Yes) and P(X | No), accumulated feature by feature
        double xWhenYes = 1.0;
        double xWhenNo = 1.0;

        for (String feature : dataFeatures) {
            xWhenYes *= computeConditionProbably(feature, YES);
            xWhenNo *= computeConditionProbably(feature, NO);
        }

        // Final posteriors (up to the common normalizing constant)
        double pYes = xWhenYes * computeConditionProbably(null, YES);
        double pNo = xWhenNo * computeConditionProbably(null, NO);

        return (pYes > pNo ? YES : NO);
    }

}
207 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/classification/naivebayes/NaiveBayesExample.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.classification.naivebayes;
2 |
3 | /**
4 | * 朴素贝叶斯算法场景调用类
5 | */
6 | public class NaiveBayesExample {
7 |
8 | public static void main(String[] args) {
9 | //训练集数据
10 | String filePath = "data/naivebayes/input.txt";
11 | String testData = "Youth Medium Yes Fair";
12 | NaiveBayesCore tool = new NaiveBayesCore(filePath);
13 | System.out.println(testData + " 数据的分类为:" + tool.naiveBayesClassificate(testData));
14 | }
15 |
16 | }
17 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/clustering/birch/BIRCHCore.java:
--------------------------------------------------------------------------------
package com.jusdt.datamining.clustering.birch;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.text.MessageFormat;
import java.util.ArrayList;
import java.util.LinkedList;

/**
 * BIRCH clustering tool: incrementally inserts data records into a CF
 * (clustering feature) tree, then prints the tree and the final clusters.
 *
 * Fixes vs. original: generic type parameters restored (the raw collections
 * did not compile) and an empty input file no longer causes a
 * NullPointerException in buildCFTree.
 */
public class BIRCHCore {

    // Display names for the node types
    public static final String NON_LEAFNODE = "【NonLeafNode】";
    public static final String LEAFNODE = "【LeafNode】";
    public static final String CLUSTER = "【Cluster】";

    // Path of the test data file
    private String filePath;
    // Internal-node branching factor B.
    // NOTE(review): B, L and T are static but assigned per instance in the
    // constructor; the rest of the birch package reads them globally, so
    // constructing a second BIRCHCore overwrites the first one's parameters.
    public static int B;
    // Leaf-node branching factor L
    public static int L;
    // Cluster diameter threshold T
    public static double T;
    // All data records read from the input file
    private ArrayList<String[]> totalDataRecords;

    public BIRCHCore(String filePath, int B, int L, double T) {
        this.filePath = filePath;
        BIRCHCore.B = B;
        BIRCHCore.L = L;
        BIRCHCore.T = T;
        readDataFile();
    }

    /**
     * Reads the whitespace-separated data records from the file.
     */
    private void readDataFile() {
        File file = new File(filePath);
        ArrayList<String[]> dataArray = new ArrayList<>();

        try {
            BufferedReader in = new BufferedReader(new FileReader(file));
            String str;
            while ((str = in.readLine()) != null) {
                dataArray.add(str.split(" "));
            }
            in.close();
        } catch (IOException e) {
            // Best-effort: an unreadable file yields an empty data set
            e.printStackTrace();
        }

        totalDataRecords = new ArrayList<>(dataArray);
    }

    /**
     * Builds the CF tree by inserting every record as a single-point cluster.
     *
     * @return the topmost node of the resulting tree, or null when there is
     *         no input data
     */
    private ClusteringFeature buildCFTree() {
        NonLeafNode rootNode = null;
        LeafNode leafNode = null;
        Cluster cluster = null;

        for (String[] record : totalDataRecords) {
            cluster = new Cluster(record);

            if (rootNode == null) {
                // While the tree has no internal node yet, insert into the
                // single leaf directly
                if (leafNode == null) {
                    leafNode = new LeafNode();
                }
                leafNode.addingCluster(cluster);
                if (leafNode.getParentNode() != null) {
                    rootNode = leafNode.getParentNode();
                }
            } else {
                // Splits may have created a new root above the current one
                if (rootNode.getParentNode() != null) {
                    rootNode = rootNode.getParentNode();
                }

                // Descend from the root to the closest leaf and insert there
                LeafNode temp = rootNode.findedClosestNode(cluster);
                temp.addingCluster(cluster);
            }
        }

        // Robustness: an empty data set yields no tree
        if (cluster == null) {
            return null;
        }

        // Climb from the last inserted cluster up to the topmost node
        LeafNode node = cluster.getParentNode();
        NonLeafNode upNode = node.getParentNode();
        if (upNode == null) {
            return node;
        } else {
            while (upNode.getParentNode() != null) {
                upNode = upNode.getParentNode();
            }

            return upNode;
        }
    }

    /**
     * Builds the CF tree, assigns levels, and prints it.
     */
    public void startBuilding() {
        // Tree depth starts at 1 for the root
        int level = 1;
        ClusteringFeature rootNode = buildCFTree();
        if (rootNode == null) {
            // Nothing to show for an empty input
            return;
        }

        setTreeLevel(rootNode, level);
        showCFTree(rootNode);
    }

    /**
     * Recursively assigns a depth value to every node in the tree.
     *
     * @param clusteringFeature current node
     * @param level             depth of the current node
     */
    private void setTreeLevel(ClusteringFeature clusteringFeature, int level) {
        LeafNode leafNode = null;
        NonLeafNode nonLeafNode = null;

        if (clusteringFeature instanceof LeafNode) {
            leafNode = (LeafNode) clusteringFeature;
        } else if (clusteringFeature instanceof NonLeafNode) {
            nonLeafNode = (NonLeafNode) clusteringFeature;
        }

        if (nonLeafNode != null) {
            nonLeafNode.setLevel(level);
            level++;
            // Recurse into whichever child list this node carries
            if (nonLeafNode.getNonLeafChilds() != null) {
                for (NonLeafNode n1 : nonLeafNode.getNonLeafChilds()) {
                    setTreeLevel(n1, level);
                }
            } else {
                for (LeafNode n2 : nonLeafNode.getLeafChilds()) {
                    setTreeLevel(n2, level);
                }
            }
        } else {
            leafNode.setLevel(level);
            level++;
            // Leaf children are the final clusters
            for (Cluster c : leafNode.getClusterChilds()) {
                c.setLevel(level);
            }
        }
    }

    /**
     * Prints the CF tree breadth-first, then the final clusters' points.
     *
     * @param rootNode root of the CF tree
     */
    private void showCFTree(ClusteringFeature rootNode) {
        // Number of separator dashes between siblings
        int blankNum = 5;
        // Depth currently being printed
        int currentLevel = 1;
        LinkedList<ClusteringFeature> nodeQueue = new LinkedList<>();
        ClusteringFeature cf;
        LeafNode leafNode;
        NonLeafNode nonLeafNode;
        ArrayList<Cluster> clusterList = new ArrayList<>();
        String typeName;

        nodeQueue.add(rootNode);
        while (nodeQueue.size() > 0) {
            cf = nodeQueue.poll();

            if (cf instanceof LeafNode) {
                leafNode = (LeafNode) cf;
                typeName = LEAFNODE;

                if (leafNode.getClusterChilds() != null) {
                    for (Cluster c : leafNode.getClusterChilds()) {
                        nodeQueue.add(c);
                    }
                }
            } else if (cf instanceof NonLeafNode) {
                nonLeafNode = (NonLeafNode) cf;
                typeName = NON_LEAFNODE;

                if (nonLeafNode.getNonLeafChilds() != null) {
                    for (NonLeafNode n1 : nonLeafNode.getNonLeafChilds()) {
                        nodeQueue.add(n1);
                    }
                } else {
                    for (LeafNode n2 : nonLeafNode.getLeafChilds()) {
                        nodeQueue.add(n2);
                    }
                }
            } else {
                clusterList.add((Cluster) cf);
                typeName = CLUSTER;
            }

            if (currentLevel != cf.getLevel()) {
                currentLevel = cf.getLevel();
                System.out.println();
                System.out.println("|");
                System.out.println("|");
            } else if (currentLevel == cf.getLevel() && currentLevel != 1) {
                for (int i = 0; i < blankNum; i++) {
                    System.out.print("-");
                }
            }

            System.out.print(typeName);
            System.out.print("N:" + cf.getN() + ", LS:");
            System.out.print("[");
            for (double d : cf.getLS()) {
                System.out.print(MessageFormat.format("{0}, ", d));
            }
            System.out.print("]");
        }

        System.out.println();
        System.out.println("*******最终分好的聚簇****");
        // Print the data points of each final cluster
        for (int i = 0; i < clusterList.size(); i++) {
            System.out.println("Cluster" + (i + 1) + ":");
            for (double[] point : clusterList.get(i).getData()) {
                System.out.print("[");
                for (double d : point) {
                    System.out.print(MessageFormat.format("{0}, ", d));
                }
                System.out.println("]");
            }
        }
    }

}
251 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/clustering/birch/BIRCHExample.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.clustering.birch;
2 |
3 | /**
4 | * BIRCH聚类算法调用类
5 | */
6 | public class BIRCHExample {
7 |
8 | public static void main(String[] args) {
9 | String filePath = "data/birch/testInput.txt";
10 | //内部节点平衡因子B
11 | int B = 2;
12 | //叶子节点平衡因子L
13 | int L = 2;
14 | //簇直径阈值T
15 | double T = 0.6;
16 |
17 | BIRCHCore tool = new BIRCHCore(filePath, B, L, T);
18 | tool.startBuilding();
19 | }
20 |
21 | }
22 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/clustering/birch/Cluster.java:
--------------------------------------------------------------------------------
package com.jusdt.datamining.clustering.birch;

import java.util.ArrayList;

/**
 * A micro-cluster held inside a leaf node of the CF tree; carries the raw
 * data points it contains in addition to the inherited CF statistics.
 *
 * Fix: the data list is typed {@code ArrayList<double[]>} — the raw list in
 * the original did not compile against the typed iteration in BIRCHCore.
 */
public class Cluster extends ClusteringFeature {

    // Raw data points belonging to this cluster
    private ArrayList<double[]> data;
    // Leaf node owning this cluster
    private LeafNode parentNode;

    public Cluster(String[] record) {
        double[] point = new double[record.length];
        data = new ArrayList<>();
        for (int i = 0; i < record.length; i++) {
            point[i] = Double.parseDouble(record[i]);
        }
        data.add(point);
        // Derive the CF statistics (LS, SS, N) from the single point
        this.setLS(data);
        this.setSS(data);
        this.setN(data);
    }

    public ArrayList<double[]> getData() {
        return data;
    }

    public void setData(ArrayList<double[]> data) {
        this.data = data;
    }

    /**
     * Merges another cluster into this one: the raw data records are
     * concatenated before the CF statistics are merged by the superclass.
     */
    @Override
    protected void directAddCluster(ClusteringFeature node) {
        Cluster c = (Cluster) node;
        ArrayList<double[]> dataRecords = c.getData();
        this.data.addAll(dataRecords);

        super.directAddCluster(node);
    }

    public LeafNode getParentNode() {
        return parentNode;
    }

    public void setParentNode(LeafNode parentNode) {
        this.parentNode = parentNode;
    }

    /** A cluster is a tree leaf in the CF sense: nothing is inserted below it. */
    @Override
    public void addingCluster(ClusteringFeature clusteringFeature) {
        // Intentionally empty
    }

}
60 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/clustering/birch/ClusteringFeature.java:
--------------------------------------------------------------------------------
package com.jusdt.datamining.clustering.birch;

import java.util.ArrayList;

/**
 * Base class holding the clustering feature (CF) of a BIRCH tree entry.
 * A CF summarises a set of data records by the triple (N, LS, SS): the
 * record count, the per-dimension linear sum and the per-dimension
 * squared sum. CF triples can be added together, which makes the CF tree
 * incrementally updatable.
 */
public abstract class ClusteringFeature {

	// number of data records summarised by this CF
	protected int N;
	// per-dimension linear sum of the N records
	protected double[] LS;
	// per-dimension squared sum of the N records
	protected double[] SS;
	// depth of this node, used when printing the CF tree
	protected int level;

	public int getN() {
		return N;
	}

	public void setN(int n) {
		N = n;
	}

	public double[] getLS() {
		return LS;
	}

	public void setLS(double[] lS) {
		LS = lS;
	}

	public double[] getSS() {
		return SS;
	}

	public void setSS(double[] sS) {
		SS = sS;
	}

	/**
	 * Sets N from the number of data records.
	 *
	 * @param dataRecords the records summarised by this CF
	 */
	protected void setN(ArrayList<double[]> dataRecords) {
		this.N = dataRecords.size();
	}

	public int getLevel() {
		return level;
	}

	public void setLevel(int level) {
		this.level = level;
	}

	/**
	 * Computes the linear sum LS from the given records.
	 *
	 * @param dataRecords the records summarised by this CF
	 */
	protected void setLS(ArrayList<double[]> dataRecords) {
		int num = dataRecords.get(0).length;
		// Java arrays are zero-initialised, so no explicit reset is needed
		LS = new double[num];
		for (double[] record : dataRecords) {
			for (int j = 0; j < record.length; j++) {
				LS[j] += record[j];
			}
		}
	}

	/**
	 * Computes the squared sum SS from the given records.
	 *
	 * @param dataRecords the records summarised by this CF
	 */
	protected void setSS(ArrayList<double[]> dataRecords) {
		int num = dataRecords.get(0).length;
		SS = new double[num];
		for (double[] record : dataRecords) {
			for (int j = 0; j < record.length; j++) {
				SS[j] += record[j] * record[j];
			}
		}
	}

	/**
	 * Adds another CF into this one by plain component-wise addition of
	 * the (N, LS, SS) triples; splitting is not considered here.
	 *
	 * @param node the CF to merge into this one
	 */
	protected void directAddCluster(ClusteringFeature node) {
		double[] otherLS = node.getLS();
		double[] otherSS = node.getSS();

		// lazily initialise this CF the first time something is added
		if (LS == null) {
			this.N = 0;
			LS = new double[otherLS.length];
			SS = new double[otherLS.length];
		}

		for (int i = 0; i < LS.length; i++) {
			LS[i] += otherLS[i];
			SS[i] += otherSS[i];
		}
		this.N += node.getN();
	}

	/**
	 * Euclidean distance between the centroids of this CF and the given
	 * one; the centroid of a CF is LS / N.
	 *
	 * @param cluster the other CF
	 * @return the centroid-to-centroid distance
	 */
	protected double computerClusterDistance(ClusteringFeature cluster) {
		double distance = 0;
		double[] otherLS = cluster.LS;
		int num = N;
		int otherNum = cluster.N;

		for (int i = 0; i < LS.length; i++) {
			double diff = LS[i] / num - otherLS[i] / otherNum;
			distance += diff * diff;
		}

		return Math.sqrt(distance);
	}

	/**
	 * Average pairwise distance of the records inside one cluster (root of
	 * the mean squared pairwise Euclidean distance over unordered pairs).
	 *
	 * @param records the data records of the cluster
	 * @return the average intra-cluster distance, or 0 for fewer than 2 records
	 */
	protected double computerInClusterDistance(ArrayList<double[]> records) {
		// total number of records
		int totalNum = records.size();
		// fewer than 2 records have no pairwise distances; the original
		// code divided by zero here
		if (totalNum < 2) {
			return 0;
		}

		double sumDistance = 0;
		for (int i = 0; i < totalNum - 1; i++) {
			double[] data1 = records.get(i);
			for (int j = i + 1; j < totalNum; j++) {
				sumDistance += computeOuDistance(data1, records.get(j));
			}
		}

		// divide by the number of unordered pairs n*(n-1)/2
		return Math.sqrt(sumDistance / (totalNum * (totalNum - 1) / 2));
	}

	/**
	 * Squared Euclidean distance between two vectors. The square root is
	 * intentionally not taken; the caller works with squared distances.
	 *
	 * @param record1 first vector
	 * @param record2 second vector
	 */
	private double computeOuDistance(double[] record1, double[] record2) {
		double distance = 0;

		for (int i = 0; i < record1.length; i++) {
			distance += (record1[i] - record2[i]) * (record1[i] - record2[i]);
		}

		return distance;
	}

	/**
	 * Adds a cluster to this node, splitting the node when a threshold or
	 * balance factor is exceeded (implemented by subclasses).
	 *
	 * @param clusteringFeature the cluster to add
	 */
	public abstract void addingCluster(ClusteringFeature clusteringFeature);
}
200 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/clustering/birch/LeafNode.java:
--------------------------------------------------------------------------------
package com.jusdt.datamining.clustering.birch;

import java.util.ArrayList;

/**
 * Leaf node of the CF tree. A leaf holds up to BIRCHCore.L child
 * clusters; when that balance factor would be exceeded the leaf splits
 * in two and pushes both halves up to its parent.
 */
public class LeafNode extends ClusteringFeature {

	// child clusters of this leaf
	private ArrayList<Cluster> clusterChilds;
	// parent node in the CF tree
	private NonLeafNode parentNode;

	public ArrayList<Cluster> getClusterChilds() {
		return clusterChilds;
	}

	public void setClusterChilds(ArrayList<Cluster> clusterChilds) {
		this.clusterChilds = clusterChilds;
	}

	/**
	 * Splits this leaf into 2 leaves: the two clusters whose centroids are
	 * farthest apart become the seeds, and every remaining cluster joins
	 * the seed it is closest to.
	 *
	 * @return the two new leaf nodes
	 */
	public LeafNode[] divideLeafNode() {
		LeafNode[] leafNodeArray = new LeafNode[2];
		// the 2 clusters with the largest centroid distance; the rest are
		// assigned by the nearest-seed rule below
		Cluster cluster1 = null;
		Cluster cluster2 = null;
		Cluster tempCluster = null;
		double maxValue = 0;
		double temp = 0;

		// find the pair of clusters with maximal centroid distance
		for (int i = 0; i < clusterChilds.size() - 1; i++) {
			tempCluster = clusterChilds.get(i);
			for (int j = i + 1; j < clusterChilds.size(); j++) {
				temp = tempCluster.computerClusterDistance(clusterChilds.get(j));

				if (temp > maxValue) {
					maxValue = temp;
					cluster1 = tempCluster;
					cluster2 = clusterChilds.get(j);
				}
			}
		}

		leafNodeArray[0] = new LeafNode();
		leafNodeArray[0].addingCluster(cluster1);
		cluster1.setParentNode(leafNodeArray[0]);
		leafNodeArray[1] = new LeafNode();
		leafNodeArray[1].addingCluster(cluster2);
		cluster2.setParentNode(leafNodeArray[1]);
		clusterChilds.remove(cluster1);
		clusterChilds.remove(cluster2);
		// assign the remaining clusters to the nearest seed
		for (Cluster c : clusterChilds) {
			if (cluster1.computerClusterDistance(c) < cluster2.computerClusterDistance(c)) {
				// closer to seed 1, so it joins seed 1's leaf
				leafNodeArray[0].addingCluster(c);
				c.setParentNode(leafNodeArray[0]);
			} else {
				leafNodeArray[1].addingCluster(c);
				c.setParentNode(leafNodeArray[1]);
			}
		}

		return leafNodeArray;
	}

	public NonLeafNode getParentNode() {
		return parentNode;
	}

	public void setParentNode(NonLeafNode parentNode) {
		this.parentNode = parentNode;
	}

	/**
	 * Adds a cluster to this leaf. The cluster is merged into its nearest
	 * existing child when the merged intra-cluster distance stays within
	 * the threshold BIRCHCore.T; otherwise it becomes a new child, and the
	 * leaf splits when the child count exceeds the balance factor
	 * BIRCHCore.L.
	 */
	@Override
	public void addingCluster(ClusteringFeature clusteringFeature) {
		// update the CF triple first (propagates to all ancestors)
		directAddCluster(clusteringFeature);

		// the nearest existing child cluster
		Cluster findedCluster = null;
		Cluster cluster = (Cluster) clusteringFeature;
		// smallest centroid distance seen so far
		double disance = Integer.MAX_VALUE;
		// centroid distance of the current candidate
		double errorDistance = 0;
		boolean needDivided = false;
		if (clusterChilds == null) {
			// first cluster in this leaf
			clusterChilds = new ArrayList<>();
			clusterChilds.add(cluster);
			cluster.setParentNode(this);
		} else {
			for (Cluster c : clusterChilds) {
				errorDistance = c.computerClusterDistance(cluster);
				if (disance > errorDistance) {
					// keep the child with the smallest centroid distance
					disance = errorDistance;
					findedCluster = c;
				}
			}

			@SuppressWarnings("unchecked")
			ArrayList<double[]> data1 = (ArrayList<double[]>) findedCluster.getData().clone();
			ArrayList<double[]> data2 = cluster.getData();
			data1.addAll(data2);
			// if merging would push the intra-cluster distance over the
			// threshold T, keep the new cluster as a separate child
			if (findedCluster.computerInClusterDistance(data1) > BIRCHCore.T) {
				// a leaf may hold at most BIRCHCore.L children
				if (clusterChilds.size() + 1 > BIRCHCore.L) {
					needDivided = true;
				}
				clusterChilds.add(cluster);
				cluster.setParentNode(this);
			} else {
				findedCluster.directAddCluster(cluster);
				cluster.setParentNode(this);
			}
		}

		if (needDivided) {
			if (parentNode == null) {
				// splitting the root leaf creates a new parent node
				parentNode = new NonLeafNode();
			} else {
				parentNode.getLeafChilds().remove(this);
			}

			LeafNode[] nodeArray = divideLeafNode();
			for (LeafNode n : nodeArray) {
				parentNode.addingCluster(n);
			}
		}
	}

	/**
	 * CF addition that also propagates the update to every ancestor,
	 * keeping the CF triples on the path to the root consistent.
	 */
	@Override
	protected void directAddCluster(ClusteringFeature node) {
		if (parentNode != null) {
			parentNode.directAddCluster(node);
		}

		super.directAddCluster(node);
	}

}
151 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/clustering/kmeans/KMeansCore.java:
--------------------------------------------------------------------------------
package com.jusdt.datamining.clustering.kmeans;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.text.MessageFormat;
import java.util.ArrayList;
import java.util.Collections;

/**
 * K-means clustering tool. Reads "x y" pairs from an input file, seeds
 * the first classNum points as cluster centers and iterates until the
 * centers converge.
 */
public class KMeansCore {

	// path of the input data file
	private String filePath;
	// number of clusters to produce
	private int classNum;
	// cluster (class) names
	private ArrayList<String> classNames;
	// current cluster center points
	private ArrayList<Point> classPoints;
	// all data points read from the input file
	private ArrayList<Point> totalPoints;

	/**
	 * @param filePath path of the input file, one "x y" pair per line
	 * @param classNum number of clusters
	 */
	public KMeansCore(String filePath, int classNum) {
		this.filePath = filePath;
		this.classNum = classNum;
		readDataFile();
	}

	/**
	 * Reads the data file and seeds the first classNum points as the
	 * initial cluster centers, named "1".."classNum".
	 */
	private void readDataFile() {
		File file = new File(filePath);
		ArrayList<String[]> dataArray = new ArrayList<>();

		// try-with-resources guarantees the reader is closed
		try (BufferedReader in = new BufferedReader(new FileReader(file))) {
			String str;
			while ((str = in.readLine()) != null) {
				dataArray.add(str.split(" "));
			}
		} catch (IOException e) {
			// the original called e.getStackTrace(), which silently
			// discards the error; report it instead
			e.printStackTrace();
		}

		classPoints = new ArrayList<>();
		totalPoints = new ArrayList<>();
		classNames = new ArrayList<>();
		for (int i = 0, j = 1; i < dataArray.size(); i++) {
			if (j <= classNum) {
				classPoints.add(new Point(dataArray.get(i)[0], dataArray.get(i)[1], j + ""));
				// store the same name the center point carries (the
				// original stored the row index i instead)
				classNames.add(j + "");
				j++;
			}
			totalPoints.add(new Point(dataArray.get(i)[0], dataArray.get(i)[1]));
		}
	}

	/**
	 * Runs the K-means iteration until the total movement of the cluster
	 * centers falls below 0.01 per cluster, printing the centers after
	 * every round.
	 */
	public void kMeansClustering() {
		double error = Integer.MAX_VALUE;
		Point temp;

		while (error > 0.01 * classNum) {
			// assign every data point to its nearest center
			for (Point p1 : totalPoints) {
				for (Point p2 : classPoints) {
					p2.computerDistance(p1);
				}
				Collections.sort(classPoints);

				// after sorting, the first center is the nearest one
				p1.setClassName(classPoints.get(0).getClassName());
			}

			error = 0;
			// move every center to the mean of its assigned points
			for (Point p1 : classPoints) {
				int count = 0;
				double tempX = 0;
				double tempY = 0;
				for (Point p : totalPoints) {
					if (p.getClassName().equals(p1.getClassName())) {
						count++;
						tempX += p.getX();
						tempY += p.getY();
					}
				}
				if (count == 0) {
					// no point was assigned to this cluster; leave its
					// center unchanged instead of dividing by zero
					continue;
				}
				tempX /= count;
				tempY /= count;

				// accumulate how far this center moved in this round
				error += Math.abs(tempX - p1.getX());
				error += Math.abs(tempY - p1.getY());
				p1.setX(tempX);
				p1.setY(tempY);
			}

			for (int i = 0; i < classPoints.size(); i++) {
				temp = classPoints.get(i);
				System.out.println(MessageFormat.format("聚类中心点{0},x={1},y={2}", (i + 1), temp.getX(), temp.getY()));
			}
			System.out.println("----------");
		}

		System.out.println("结果值收敛");
		for (int i = 0; i < classPoints.size(); i++) {
			temp = classPoints.get(i);
			System.out.println(MessageFormat.format("聚类中心点{0},x={1},y={2}", (i + 1), temp.getX(), temp.getY()));
		}

	}

}
128 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/clustering/kmeans/KMeansExample.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.clustering.kmeans;
2 |
3 | /**
4 | * K-means(K均值)算法调用类
5 | */
6 | public class KMeansExample {
7 |
8 | public static void main(String[] args) {
9 | String filePath = "data/kmeans/input.txt";
10 | // 聚类中心数量设定
11 | int classNum = 3;
12 |
13 | KMeansCore tool = new KMeansCore(filePath, classNum);
14 | tool.kMeansClustering();
15 | }
16 |
17 | }
18 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/clustering/kmeans/Point.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.clustering.kmeans;
2 |
3 | /**
4 | * 坐标点类
5 | */
6 | public class Point implements Comparable {
7 |
8 | // 坐标点横坐标
9 | private double x;
10 | // 坐标点纵坐标
11 | private double y;
12 | //以此点作为聚类中心的类的类名称
13 | private String className;
14 | // 坐标点之间的欧式距离
15 | private Double distance;
16 |
17 | public Point(double x, double y) {
18 | this.x = x;
19 | this.y = y;
20 | }
21 |
22 | public Point(String x, String y) {
23 | this.x = Double.parseDouble(x);
24 | this.y = Double.parseDouble(y);
25 | }
26 |
27 | public Point(String x, String y, String className) {
28 | this.x = Double.parseDouble(x);
29 | this.y = Double.parseDouble(y);
30 | this.className = className;
31 | }
32 |
33 | /**
34 | * 距离目标点p的欧几里得距离
35 | *
36 | * @param p
37 | */
38 | public void computerDistance(Point p) {
39 | if (p == null) {
40 | return;
41 | }
42 |
43 | this.distance = (this.x - p.x) * (this.x - p.x) + (this.y - p.y) * (this.y - p.y);
44 | }
45 |
46 | public double getX() {
47 | return x;
48 | }
49 |
50 | public void setX(double x) {
51 | this.x = x;
52 | }
53 |
54 | public double getY() {
55 | return y;
56 | }
57 |
58 | public void setY(double y) {
59 | this.y = y;
60 | }
61 |
62 | public String getClassName() {
63 | return className;
64 | }
65 |
66 | public void setClassName(String className) {
67 | this.className = className;
68 | }
69 |
70 | public double getDistance() {
71 | return distance;
72 | }
73 |
74 | public void setDistance(double distance) {
75 | this.distance = distance;
76 | }
77 |
78 | @Override
79 | public int compareTo(Point o) {
80 | return this.distance.compareTo(o.distance);
81 | }
82 |
83 | }
84 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/dimensionality/reduction/pca/DataReader.java:
--------------------------------------------------------------------------------
package com.jusdt.datamining.dimensionality.reduction.pca;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;

/**
 * Simple reader for vector data stored one value per line.
 */
public class DataReader extends BufferedReader {

	public DataReader(Reader in, int sz) {
		super(in, sz);
	}

	public DataReader(Reader in) {
		super(in);
	}

	/**
	 * Get the (vector) data contained in the file. The data is stored one
	 * value per line. Empty lines are ignored.
	 *
	 * @return the data
	 * @throws IOException if reading fails
	 * @throws NumberFormatException if a non-blank line is not a valid double
	 */
	public double[] getData() throws IOException {
		// fix: the list was raw, which cannot be iterated as Double below
		List<Double> dataList = new ArrayList<>();
		String line;
		while ((line = readLine()) != null) {
			line = line.trim();
			if (line.isEmpty()) {
				continue;
			}
			dataList.add(Double.valueOf(line));
		}

		double[] vector = new double[dataList.size()];
		int i = 0;
		for (double d : dataList) {
			vector[i++] = d;
		}

		return vector;
	}

}
49 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/dimensionality/reduction/pca/Main.java:
--------------------------------------------------------------------------------
package com.jusdt.datamining.dimensionality.reduction.pca;

import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;

/**
 * Command-line driver: runs a PCA over each vector data file named on
 * the command line and writes the results next to the input file.
 */
public class Main {

	/**
	 * Run a PCA on vector data.
	 *
	 * @param av are file references containing vector data
	 * @throws Exception if reading, analysing or writing fails
	 */
	public static void main(String... av) throws Exception {

		if (av.length == 0) {
			throw new IllegalArgumentException("Usage: pca FILES...");
		}

		PCACoreHandler handler = new PCACoreHandler();

		for (String filename : av) {
			// the vector lives in "<name>.data"; results go to "<name>_*.data"
			try (DataReader dr = new DataReader(new FileReader(filename + ".data"))) {
				double[] data = dr.getData();
				System.out.println(filename + ": vector length = " + data.length);

				PCACore pca = handler.fromSimpleTimeSeries(data);

				log(filename + "_pcomps.data", filename + ": principle components", pca.getPrincipalComponents());
				log(filename + "_lambda.data", filename + ": lambda", pca.getLambda());
				log(filename + "_pfacs.data", filename + ": principle factors", pca.getPrinicipalFactors());
				log(filename + "_cc.data", filename + ": correlation circle", handler.correlationCircle(pca));
				log(filename + "_cumcon.data", filename + ": cumulative contributions",
						handler.cumulativeContribution(pca));
			}
		}
	}

	/**
	 * Writes the matrix to the given file and echoes the tag to stdout.
	 */
	private static void log(String filename, String tag, Matrix m) throws IOException {
		try (PrintWriter fp = new PrintWriter(new FileWriter(filename))) {
			System.out.println(tag + ":");
			MatrixHelper.print(m, fp, 1, 4);
		}
	}
}
53 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/dimensionality/reduction/pca/MatrixException.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.dimensionality.reduction.pca;
2 |
3 | /**
4 | * 矩阵异常
5 | */
6 | public class MatrixException extends RuntimeException {
7 |
8 | private static final long serialVersionUID = -65073227556727585L;
9 |
10 | public MatrixException(String s) {
11 | super(s);
12 | }
13 |
14 | }
15 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/dimensionality/reduction/pca/MatrixHelper.java:
--------------------------------------------------------------------------------
package com.jusdt.datamining.dimensionality.reduction.pca;

import java.io.PrintWriter;
import java.text.DecimalFormat;
import java.text.DecimalFormatSymbols;
import java.text.NumberFormat;
import java.util.Locale;

/**
 * Helpers for printing matrices in fixed-width columns.
 */
public class MatrixHelper {

	/**
	 * Print the matrix to stdout. Line the elements up in columns with a
	 * Fortran-like 'Fw.d' style format.
	 *
	 * @param a the matrix
	 * @param w Column width.
	 * @param d Number of digits after the decimal.
	 */
	public static void print(Matrix a, int w, int d) {
		print(a, new PrintWriter(System.out, true), w, d);
	}

	/**
	 * Print the matrix to the output stream. Line the elements up in
	 * columns with a Fortran-like 'Fw.d' style format.
	 *
	 * @param a the matrix
	 * @param output Output stream.
	 * @param w Column width.
	 * @param d Number of digits after the decimal.
	 */
	public static void print(Matrix a, PrintWriter output, int w, int d) {
		// fixed-point, US-locale format with exactly d fraction digits
		DecimalFormat format = new DecimalFormat();
		format.setDecimalFormatSymbols(new DecimalFormatSymbols(Locale.US));
		format.setMinimumIntegerDigits(1);
		format.setMaximumFractionDigits(d);
		format.setMinimumFractionDigits(d);
		format.setGroupingUsed(false);
		print(a, output, format, w + 2);
	}

	/**
	 * Print the matrix to stdout, right-justified within columns of the
	 * given width using the supplied format. If the matrix is to be read
	 * back in, use a NumberFormat set to the US locale.
	 *
	 * @param a the matrix
	 * @param format A Formatting object for individual elements.
	 * @param width Field width for each column.
	 * @see java.text.DecimalFormat#setDecimalFormatSymbols
	 */
	public static void print(Matrix a, NumberFormat format, int width) {
		print(a, new PrintWriter(System.out, true), format, width);
	}

	// DecimalFormat cannot left-pad, so the column width is enforced here
	// by writing the padding spaces ourselves.
	/**
	 * Print the matrix to the output stream, right-justified within
	 * columns of the given width using the supplied format. If the matrix
	 * is to be read back in, use a NumberFormat set to the US locale.
	 *
	 * @param a the matrix
	 * @param output the output stream.
	 * @param format A formatting object to format the matrix elements
	 * @param width Column width.
	 * @see java.text.DecimalFormat#setDecimalFormatSymbols
	 */
	public static void print(Matrix a, PrintWriter output, NumberFormat format, int width) {
		output.println(); // start on a new line
		double[][] cells = a.getArray();
		int rows = a.getNRows();
		int cols = a.getNCols();
		for (int row = 0; row < rows; row++) {
			for (int col = 0; col < cols; col++) {
				String text = format.format(cells[row][col]);
				// right-justify with at least one leading space
				int padding = Math.max(1, width - text.length());
				while (padding-- > 0) {
					output.print(' ');
				}
				output.print(text);
			}
			output.println();
		}
		output.println(); // end with a blank line
	}
}
91 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/dimensionality/reduction/pca/PCACore.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.dimensionality.reduction.pca;
2 |
3 | /**
4 | * PCA核心算法类
5 | */
6 | public class PCACore {
7 |
8 | // The incoming matrix
9 | private final Matrix m;
10 | // the principal components
11 | private final Matrix pc;
12 | // facpr
13 | private final Matrix facpr;
14 | // lambda
15 | private final Matrix lambda;
16 |
17 | public PCACore(Matrix x) {
18 |
19 | // Weight and center the matrix
20 | this.m = x.wcenter();
21 | // compute the eigenvectors of y'*y using svd
22 | SVD svd = new SVD(this.m);
23 |
24 | // calculate the lambda
25 | this.lambda = calculateLambda(svd.getS());
26 | // get the principle factors
27 | this.facpr = svd.getV();
28 |
29 | // calculate the principle components
30 | this.pc = this.m.times(svd.getV());
31 | }
32 |
33 | private Matrix calculateLambda(Matrix s) {
34 |
35 | Matrix d = s.diag();
36 | double[][] D = d.getArray();
37 |
38 | int size = d.getNRows();
39 | for (int i = 0; i < size; i++) {
40 | D[i][0] = (D[i][0] * D[i][0]) / (size - 1);
41 | }
42 |
43 | return d;
44 | }
45 |
46 | public Matrix getPrincipalComponents() {
47 | return pc;
48 | }
49 |
50 | public Matrix getLambda() {
51 | return lambda;
52 | }
53 |
54 | public Matrix getPrinicipalFactors() {
55 | return facpr;
56 | }
57 |
58 | }
59 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/dimensionality/reduction/pca/PCACoreHandler.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.dimensionality.reduction.pca;
2 |
3 | /**
4 | * 封装PCACore的调用类
5 | */
6 | public class PCACoreHandler {
7 |
8 | public PCACoreHandler() {
9 | }
10 |
11 | /**
12 | * Run a principal component analysis of a matrix.
13 | *
14 | * @param m the matrix
15 | * @return the principle components
16 | */
17 | public PCACore fromMatrix(Matrix m) {
18 | return new PCACore(m);
19 | }
20 |
21 | /**
22 | * Run a principal component analysis from a simple time series vector. We
23 | * are converting the data into a Toeplitz style matrix before running the
24 | * PCA.
25 | *
26 | * @param data the time series vector
27 | * @return the principle components
28 | */
29 | public PCACore fromSimpleTimeSeries(double[] data) {
30 | Matrix m = new ToeplitzMatrix(data);
31 | PCACore pca = new PCACore(m);
32 | return pca;
33 | }
34 |
35 | /**
36 | * Calculate the correlations circle for two components. This is quick and
37 | * dirty we are not doing any validity checks to make sure the PCA has
38 | * completed successfully.
39 | *
40 | * @param pca the PCA
41 | * @param compare the principal factor columns to compare
42 | * @return the correlations circle
43 | */
44 | public Matrix correlationCircle(PCACore pca, int[] compare) {
45 | double[][] F = pca.getPrinicipalFactors().getArray();
46 | double[][] L = pca.getLambda().getArray();
47 |
48 | // calculate the correlation circle
49 | Matrix cc = new Matrix(F.length, compare.length);
50 | double[][] CC = cc.getArray();
51 |
52 | for (int n = 0; n < compare.length; n++) {
53 | int index = compare[n];
54 | double s = Math.sqrt(L[index][0]);
55 | for (int m = 0; m < F.length; m++) {
56 | double f = F[m][index];
57 |
58 | CC[m][n] = s * f;
59 | }
60 | }
61 | return cc;
62 | }
63 |
64 | /**
65 | * Calculate the correlations circle for the two largest eigenvalues.
66 | *
67 | * @param pca the pca
68 | * @return the correlations circle
69 | */
70 | public Matrix correlationCircle(PCACore pca) {
71 | return correlationCircle(pca, new int[] { 0, 1 });
72 | }
73 |
74 | /**
75 | * Normalize the eigenvalues so we can create a scree plot.
76 | *
77 | * @param pca the pca
78 | * @return the normalized eigenvalues;
79 | */
80 | public Matrix normalizeLambda(PCACore pca) {
81 |
82 | double[][] L = pca.getLambda().getArrayCopy();
83 | Matrix nl = new Matrix(L);
84 | double sum = 0;
85 | for (int n = 0; n < L.length; n++) {
86 | sum += L[n][0];
87 | }
88 | for (int n = 0; n < L.length; n++) {
89 | L[n][0] = L[n][0] / sum;
90 | }
91 | return nl;
92 | }
93 |
94 | /**
95 | * Calculate the cumulative contribution of the eigenvectors
96 | *
97 | * @param pca is the pca
98 | * @return the cumulative contributions of the eigenvectors
99 | */
100 | public Matrix cumulativeContribution(PCACore pca) {
101 | Matrix nl = normalizeLambda(pca);
102 | double[][] CC = nl.getArrayCopy();
103 | Matrix cc = new Matrix(CC);
104 | double cum = 0;
105 | for (int n = 0; n < CC.length; n++) {
106 | cum = CC[n][0] = CC[n][0] + cum;
107 | }
108 | return cc;
109 | }
110 | }
111 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/dimensionality/reduction/pca/PCAExample.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.dimensionality.reduction.pca;
2 |
3 | public class PCAExample {
4 |
5 | // expected results
6 | private static int SIZE = 6;
7 | private static double[] data = new double[] { 1, 2, 3, 4, 5, 6 };
8 |
9 | private static double[][] pcomps = new double[][] { { -2.4530, -1.4869, -0.2693, -0.1327, -0.1141, -0.0000 },
10 | { -1.9344, 0.1112, 0.1731, 0.3003, 0.2126, 0.0000 }, { -0.7959, 1.3757, 0.4091, -0.1677, -0.1651, -0.0000 },
11 | { 0.7959, 1.3757, -0.4091, -0.1677, 0.1651, -0.0000 }, { 1.9344, 0.1112, -0.1731, 0.3003, -0.2126, 0.0000 },
12 | { 2.4530, -1.4869, 0.2693, -0.1327, 0.1141, 0.0000 } };
13 |
14 | private static double[] plambda = new double[] { 4.1572, 1.6463, 0.1080, 0.0544, 0.0342, 0.0000 };
15 |
16 | private static double[][] pfacs = { { 0.4851, -0.0000, 0.4138, 0.0000, 0.3056, 0.7071 },
17 | { 0.4562, -0.2454, -0.1519, -0.6631, -0.5185, -0.0000 },
18 | { 0.2378, -0.6631, -0.5529, 0.2454, 0.3712, -0.0000 },
19 | { -0.2378, -0.6631, 0.5529, 0.2454, -0.3712, 0.0000 },
20 | { -0.4562, -0.2454, 0.1519, -0.6631, 0.5185, -0.0000 },
21 | { -0.4851, -0.0000, -0.4138, -0.0000, -0.3056, 0.7071 } };
22 |
23 | public static void main(String[] args) {
24 | PCACoreHandler instance = new PCACoreHandler();
25 | PCACore result = instance.fromSimpleTimeSeries(data);
26 |
27 | // compare the principal components
28 | System.out.println("compare the principal components:");
29 | double[][] res_pcomp = result.getPrincipalComponents().getArray();
30 | for (int i = 0; i < SIZE; i++) {
31 | for (int j = 0; j < SIZE; j++) {
32 | System.out.println(pcomps[i][j] + " , " + res_pcomp[i][j]);
33 | }
34 | }
35 |
36 | // compare the lambdas
37 | System.out.println("compare the lambdas:");
38 | double[] res_plambda = result.getLambda().transpose().getArray()[0];
39 | for (int i = 0; i < SIZE; i++) {
40 | System.out.println(plambda[i] + " , " + res_plambda[i]);
41 | }
42 |
43 | // compare the principle factors
44 | System.out.println("compare the principle factors:");
45 | double[][] res_pfacs = result.getPrinicipalFactors().getArray();
46 | for (int i = 0; i < SIZE; i++) {
47 | for (int j = 0; j < SIZE; j++) {
48 | System.out.println(pfacs[i][j] + " , " + res_pfacs[i][j]);
49 | }
50 | }
51 | }
52 |
53 | }
54 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/dimensionality/reduction/pca/ToeplitzMatrix.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.dimensionality.reduction.pca;
2 |
3 | /**
4 | * Toeplitz matrix
5 | */
6 | public class ToeplitzMatrix extends Matrix {
7 |
8 | /**
9 | * Toeplitz matrix styles
10 | */
11 | public static enum Type {
12 |
13 | Triangular, Symmetrical, Circulant
14 | };
15 |
16 | /**
17 | * Create a symmetrical Toeplitz-style matrix from a vector.
18 | *
19 | * @param v
20 | */
21 | public ToeplitzMatrix(double[] v) {
22 | this(v, Type.Symmetrical);
23 | }
24 |
25 | /**
26 | * Create a Toeplitz matrix from a vector.
27 | *
28 | * @param v the vector
29 | * @param type the matrix style
30 | */
31 | public ToeplitzMatrix(double[] v, Type type) {
32 | super(v.length, v.length);
33 | int n = v.length;
34 | double[][] arr = getArray();
35 |
36 | for (int i = 0; i < v.length; i++) {
37 | for (int j = 0; j <= i; j++) {
38 | int index = i - j;
39 | arr[i][j] = v[i - j];
40 | switch (type) {
41 | default:
42 | case Triangular:
43 | // do nothing
44 | break;
45 | case Symmetrical:
46 | arr[j][i] = v[i - j];
47 | break;
48 | case Circulant:
49 | if (j != i) {
50 | arr[j][i] = v[n - index];
51 | }
52 | break;
53 | }
54 | }
55 | }
56 | }
57 | }
58 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/dimensionality/reduction/pca/TrajectoryMatrix.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.dimensionality.reduction.pca;
2 |
3 | /**
4 | * Create a trajectory style matrix from a vector.
5 | */
6 | public class TrajectoryMatrix extends Matrix {
7 |
8 | public TrajectoryMatrix(double[] v, int ncols) {
9 | super(v.length - ncols + 1, ncols);
10 | double[][] arr = getArray();
11 | int nrows = getNRows();
12 | int pos = 0; // position in vector
13 |
14 | for (int i = 0; i < nrows; i++) {
15 | double value = v[pos++];
16 | int availCols = i < ncols ? i + 1 : ncols;
17 | for (int j = 0, m = i; j < availCols && m >= 0; j++, m--) {
18 | arr[m][j] = value;
19 | }
20 | }
21 | for (int i = 1; i < ncols; i++) {
22 | double value = v[pos++];
23 | for (int j = i, m = nrows - 1; j < ncols && m > 0; j++, m--) {
24 | arr[m][j] = value;
25 | }
26 | }
27 | }
28 |
29 | }
30 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/graph/gspan/DFSCodeTraveler.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.graph.gspan;
2 |
3 | import java.util.ArrayList;
4 | import java.util.Stack;
5 |
6 | /**
7 | * 图编码深度优先搜索类,判断当前编码在给定图中是否为最小编码
8 | */
9 | public class DFSCodeTraveler {
10 |
11 | // 当前的编码是否为最下编码标识
12 | boolean isMin;
13 | // 当前挖掘的图的边五元组编码组
14 | ArrayList edgeSeqs;
15 | // 当前的图结构
16 | Graph graph;
17 | // 图节点id对应的边五元组中的id标识
18 | int[] g2s;
19 | // 代表图中的边是否被用到了
20 | boolean f[][];
21 |
22 | public DFSCodeTraveler(ArrayList edgeSeqs, Graph graph) {
23 | this.isMin = true;
24 | this.edgeSeqs = edgeSeqs;
25 | this.graph = graph;
26 | }
27 |
28 | public void traveler() {
29 | int nodeLNums = graph.nodeLabels.size();
30 | g2s = new int[nodeLNums];
31 | for (int i = 0; i < nodeLNums; i++) {
32 | // 设置-1代表此点还未被计入编码
33 | g2s[i] = -1;
34 | }
35 |
36 | f = new boolean[nodeLNums][nodeLNums];
37 | for (int i = 0; i < nodeLNums; i++) {
38 | for (int j = 0; j < nodeLNums; j++) {
39 | f[i][j] = false;
40 | }
41 | }
42 |
43 | // 从每个点开始寻找最小编码五元组
44 | for (int i = 0; i < nodeLNums; i++) {
45 | //对选择的第一个点的标号做判断
46 | if (graph.getNodeLabels().get(i) > edgeSeqs.get(0).x) {
47 | continue;
48 | }
49 | // 五元组id从0开始设置
50 | g2s[i] = 0;
51 |
52 | Stack s = new Stack<>();
53 | s.push(i);
54 | dfsSearch(s, 0, 1);
55 | if (!isMin) {
56 | return;
57 | }
58 | g2s[i] = -1;
59 | }
60 | }
61 |
62 | /**
63 | * 深度优先搜索最小编码组
64 | *
65 | * @param stack
66 | * 加入的节点id栈
67 | * @param currentPosition
68 | * 当前进行的层次,代表找到的第几条边
69 | * @param next
70 | * 五元组边下一条边的点的临时标识
71 | */
72 | private void dfsSearch(Stack stack, int currentPosition, int next) {
73 | if (currentPosition >= edgeSeqs.size()) {
74 | stack.pop();
75 | // 比较到底了则返回
76 | return;
77 | }
78 |
79 | while (!stack.isEmpty()) {
80 | int x = stack.pop();
81 | for (int i = 0; i < graph.edgeNexts.get(x).size(); i++) {
82 | // 从此id节点所连接的点中选取1个点作为下一个点
83 | int y = graph.edgeNexts.get(x).get(i);
84 | // 如果这2个点所构成的边已经被用过,则继续
85 | if (f[x][y] || f[y][x]) {
86 | continue;
87 | }
88 |
89 | // 如果y这个点未被用过
90 | if (g2s[y] < 0) {
91 | // 新建这条边五元组
92 | Edge e = new Edge(g2s[x], next, graph.nodeLabels.get(x), graph.edgeLabels.get(x).get(i),
93 | graph.nodeLabels.get(y));
94 |
95 | // 与相应位置的边做比较,如果不是最小则失败
96 | int compareResult = e.compareWith(edgeSeqs.get(currentPosition));
97 | if (compareResult == Edge.EDGE_SMALLER) {
98 | isMin = false;
99 | return;
100 | } else if (compareResult == Edge.EDGE_LARGER) {
101 | continue;
102 | }
103 | // 如果相等则继续比
104 | g2s[y] = next;
105 | f[x][y] = true;
106 | f[y][x] = true;
107 | stack.push(y);
108 | dfsSearch(stack, currentPosition + 1, next + 1);
109 | if (!isMin) {
110 | return;
111 | }
112 | f[x][y] = false;
113 | f[y][x] = false;
114 | g2s[y] = -1;
115 | } else {
116 | // 这个点已经被用过的时候,不需要再设置五元组id标识
117 | // 新建这条边五元组
118 | Edge e = new Edge(g2s[x], g2s[y], graph.nodeLabels.get(x), graph.edgeLabels.get(x).get(i),
119 | graph.nodeLabels.get(y));
120 |
121 | // 与相应位置的边做比较,如果不是最小则失败
122 | int compareResult = e.compareWith(edgeSeqs.get(currentPosition));
123 | if (compareResult == Edge.EDGE_SMALLER) {
124 | isMin = false;
125 | return;
126 | } else if (compareResult == Edge.EDGE_LARGER) {
127 | continue;
128 | }
129 | // 如果相等则继续比
130 | g2s[y] = next;
131 | f[x][y] = true;
132 | f[y][x] = true;
133 | stack.push(y);
134 | dfsSearch(stack, currentPosition + 1, next);
135 | if (!isMin) {
136 | return;
137 | }
138 | f[x][y] = false;
139 | f[y][x] = false;
140 | }
141 | }
142 | }
143 | }
144 | }
145 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/graph/gspan/Edge.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.graph.gspan;
2 |
/**
 * An edge of a DFS code, represented as the five-tuple (ix, iy, x, a, y):
 * the ids of both endpoints, the label of the first endpoint, the edge
 * label, and the label of the second endpoint.
 */
public class Edge {

	// outcomes of comparing two five-tuples
	public static final int EDGE_EQUAL = 0;
	public static final int EDGE_SMALLER = 1;
	public static final int EDGE_LARGER = 2;

	// id of one endpoint of the edge
	int ix;
	// id of the other endpoint of the edge
	int iy;
	// label of the first endpoint
	int x;
	// label of the edge itself
	int a;
	// label of the second endpoint
	int y;

	public Edge(int ix, int iy, int x, int a, int y) {
		this.ix = ix;
		this.iy = iy;
		this.x = x;
		this.a = a;
		this.y = y;
	}

	/**
	 * Compare this edge with another, field by field, in the fixed order
	 * ix, iy, x, y, a.
	 *
	 * @param e the edge to compare against
	 * @return EDGE_SMALLER, EDGE_LARGER or EDGE_EQUAL
	 */
	public int compareWith(Edge e) {
		int[] mine = { ix, iy, x, y, a };
		int[] theirs = { e.ix, e.iy, e.x, e.y, e.a };

		// first differing field decides the ordering
		for (int k = 0; k < mine.length; k++) {
			if (mine[k] != theirs[k]) {
				return mine[k] < theirs[k] ? EDGE_SMALLER : EDGE_LARGER;
			}
		}
		return EDGE_EQUAL;
	}

}
61 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/graph/gspan/EdgeFrequency.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.graph.gspan;
2 |
/**
 * Support counter for edges, keyed by (node label, edge label, node label).
 */
public class EdgeFrequency {

	// number of distinct node labels
	private int nodeLabelNum;
	// number of distinct edge labels
	private int edgeLabelNum;
	// 3-d support counts: edgeFreqCount[x][a][y] counts edges whose endpoint
	// labels are x and y and whose edge label is a
	public int[][][] edgeFreqCount;

	/**
	 * @param nodeLabelNum number of distinct node labels
	 * @param edgeLabelNum number of distinct edge labels
	 */
	public EdgeFrequency(int nodeLabelNum, int edgeLabelNum) {
		this.nodeLabelNum = nodeLabelNum;
		this.edgeLabelNum = edgeLabelNum;

		// Java zero-initializes new int arrays, so the explicit clearing
		// loop of the original was redundant and has been removed
		edgeFreqCount = new int[nodeLabelNum][edgeLabelNum][nodeLabelNum];
	}

}
31 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/graph/gspan/GSpanExample.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.graph.gspan;
2 |
3 | /**
4 | * gSpan频繁子图挖掘算法
5 | */
6 | public class GSpanExample {
7 |
8 | public static void main(String[] args) {
9 | //测试数据文件地址
10 | String filePath = "data/gspan/input.txt";
11 | //最小支持度率
12 | double minSupportRate = 0.2;
13 |
14 | GSpanTool tool = new GSpanTool(filePath, minSupportRate);
15 | tool.freqGraphMining();
16 | }
17 |
18 | }
19 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/graph/gspan/Graph.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.graph.gspan;
2 |
3 | import java.util.ArrayList;
4 |
5 | /**
6 | * 图结构类
7 | */
8 | public class Graph {
9 |
10 | // 图节点标号组
11 | ArrayList nodeLabels;
12 | // 图的边标号组
13 | ArrayList> edgeLabels;
14 | // 边2头的节点id号,在这里可以理解为下标号
15 | ArrayList> edgeNexts;
16 |
17 | public Graph() {
18 | nodeLabels = new ArrayList<>();
19 | edgeLabels = new ArrayList<>();
20 | edgeNexts = new ArrayList<>();
21 | }
22 |
23 | public ArrayList getNodeLabels() {
24 | return nodeLabels;
25 | }
26 |
27 | public void setNodeLabels(ArrayList nodeLabels) {
28 | this.nodeLabels = nodeLabels;
29 | }
30 |
31 | /**
32 | * 判断图中是否存在某条边
33 | *
34 | * @param x
35 | * 边的一端的节点标号
36 | * @param a
37 | * 边的标号
38 | * @param y
39 | * 边的另外一端节点标号
40 | * @return
41 | */
42 | public boolean hasEdge(int x, int a, int y) {
43 | boolean isContained = false;
44 | int t;
45 |
46 | for (int i = 0; i < nodeLabels.size(); i++) {
47 | // 先寻找2个端点标号,t代表找到的点的另外一个端点标号
48 | if (nodeLabels.get(i) == x) {
49 | t = y;
50 | } else if (nodeLabels.get(i) == y) {
51 | t = x;
52 | } else {
53 | continue;
54 | }
55 |
56 | for (int j = 0; j < edgeNexts.get(i).size(); j++) {
57 | // 从此端点的所连接的点去比较对应的点和边
58 | if (edgeLabels.get(i).get(j) == a && nodeLabels.get(edgeNexts.get(i).get(j)) == t) {
59 | isContained = true;
60 | return isContained;
61 | }
62 | }
63 | }
64 |
65 | return isContained;
66 | }
67 |
68 | /**
69 | * 在图中移除某个边
70 | *
71 | * @param x
72 | * 边的某端的一个点标号
73 | * @param a
74 | * 边的标号
75 | * @param y
76 | * 边的另一端的一个点标号
77 | */
78 | public void removeEdge(int x, int a, int y) {
79 | int t;
80 |
81 | for (int i = 0; i < nodeLabels.size(); i++) {
82 | // 先寻找2个端点标号,t代表找到的点的另外一个端点标号
83 | if (nodeLabels.get(i) == x) {
84 | t = y;
85 | } else if (nodeLabels.get(i) == y) {
86 | t = x;
87 | } else {
88 | continue;
89 | }
90 |
91 | for (int j = 0; j < edgeNexts.get(i).size(); j++) {
92 | // 从此端点的所连接的点去比较对应的点和边
93 | if (edgeLabels.get(i).get(j) == a && nodeLabels.get(edgeNexts.get(i).get(j)) == t) {
94 | int id;
95 | // 在连接的点中去除该点
96 | edgeLabels.get(i).remove(j);
97 |
98 | id = edgeNexts.get(i).get(j);
99 | edgeNexts.get(i).remove(j);
100 | for (int k = 0; k < edgeNexts.get(id).size(); k++) {
101 | if (edgeNexts.get(id).get(k) == i) {
102 | edgeNexts.get(id).remove(k);
103 | break;
104 | }
105 | }
106 | break;
107 | }
108 | }
109 | }
110 |
111 | }
112 |
113 | /**
114 | * 根据图数据构造一个图
115 | *
116 | * @param gd
117 | * 图数据
118 | * @return
119 | */
120 | public Graph constructGraph(GraphData gd) {
121 | Graph graph = new Graph();
122 |
123 | // 构造一个图需要知道3点,1.图中有哪些点2.图中的每个点周围连着哪些点3.每个点周围连着哪些边
124 | for (int i = 0; i < gd.getNodeVisibles().size(); i++) {
125 | if (gd.getNodeVisibles().get(i)) {
126 | graph.getNodeLabels().add(gd.getNodeLabels().get(i));
127 | }
128 |
129 | // 添加对应id下的集合
130 | // id节点后有多少相连的边的标号
131 | graph.edgeLabels.add(new ArrayList());
132 | // id节点后有多少相连的节点的id
133 | graph.edgeNexts.add(new ArrayList());
134 | }
135 |
136 | for (int i = 0; i < gd.getEdgeLabels().size(); i++) {
137 | if (gd.getEdgeVisibles().get(i)) {
138 | // 在此后面添加一个边标号
139 | graph.edgeLabels.get(gd.getEdgeX().get(i)).add(gd.getEdgeLabels().get(i));
140 | graph.edgeLabels.get(gd.getEdgeY().get(i)).add(gd.getEdgeLabels().get(i));
141 | graph.edgeNexts.get(gd.getEdgeX().get(i)).add(gd.getEdgeY().get(i));
142 | graph.edgeNexts.get(gd.getEdgeY().get(i)).add(gd.getEdgeX().get(i));
143 | }
144 | }
145 |
146 | return graph;
147 | }
148 | }
149 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/graph/gspan/GraphCode.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.graph.gspan;
2 |
3 | import java.util.ArrayList;
4 |
5 | /**
6 | * 图编码类
7 | */
8 | public class GraphCode {
9 |
10 | //边的集合,边的排序代表着边的添加次序
11 | ArrayList edgeSeq;
12 | //拥有这些边的图的id
13 | ArrayList gs;
14 |
15 | public GraphCode() {
16 | this.edgeSeq = new ArrayList<>();
17 | this.gs = new ArrayList<>();
18 | }
19 |
20 | public ArrayList getEdgeSeq() {
21 | return edgeSeq;
22 | }
23 |
24 | public void setEdgeSeq(ArrayList edgeSeq) {
25 | this.edgeSeq = edgeSeq;
26 | }
27 |
28 | public ArrayList getGs() {
29 | return gs;
30 | }
31 |
32 | public void setGs(ArrayList gs) {
33 | this.gs = gs;
34 | }
35 |
36 | }
37 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/graph/gspan/GraphData.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.graph.gspan;
2 |
3 | import java.util.ArrayList;
4 |
/**
 * Raw graph data: parallel lists describing nodes and edges, with visibility
 * flags so that infrequent elements can be masked out.
 * (Generic type parameters were restored — the raw-typed version could not
 * compile — and comments were translated to English.)
 */
public class GraphData {

	// node labels, indexed by node id
	private ArrayList<Integer> nodeLabels;
	// whether each node is still usable (false once removed)
	private ArrayList<Boolean> nodeVisibles;
	// edge labels
	private ArrayList<Integer> edgeLabels;
	// node id of one endpoint of each edge
	private ArrayList<Integer> edgeX;
	// node id of the other endpoint of each edge
	private ArrayList<Integer> edgeY;
	// whether each edge is still usable
	private ArrayList<Boolean> edgeVisibles;

	public GraphData() {
		nodeLabels = new ArrayList<>();
		nodeVisibles = new ArrayList<>();

		edgeLabels = new ArrayList<>();
		edgeX = new ArrayList<>();
		edgeY = new ArrayList<>();
		edgeVisibles = new ArrayList<>();
	}

	public ArrayList<Integer> getNodeLabels() {
		return nodeLabels;
	}

	public void setNodeLabels(ArrayList<Integer> nodeLabels) {
		this.nodeLabels = nodeLabels;
	}

	public ArrayList<Boolean> getNodeVisibles() {
		return nodeVisibles;
	}

	public void setNodeVisibles(ArrayList<Boolean> nodeVisibles) {
		this.nodeVisibles = nodeVisibles;
	}

	public ArrayList<Integer> getEdgeLabels() {
		return edgeLabels;
	}

	public void setEdgeLabels(ArrayList<Integer> edgeLabels) {
		this.edgeLabels = edgeLabels;
	}

	public ArrayList<Integer> getEdgeX() {
		return edgeX;
	}

	public void setEdgeX(ArrayList<Integer> edgeX) {
		this.edgeX = edgeX;
	}

	public ArrayList<Integer> getEdgeY() {
		return edgeY;
	}

	public void setEdgeY(ArrayList<Integer> edgeY) {
		this.edgeY = edgeY;
	}

	public ArrayList<Boolean> getEdgeVisibles() {
		return edgeVisibles;
	}

	public void setEdgeVisibles(ArrayList<Boolean> edgeVisibles) {
		this.edgeVisibles = edgeVisibles;
	}

	/**
	 * Mask out nodes and edges whose labels are not frequent enough.
	 *
	 * @param freqNodeLabel   support count per node label
	 * @param freqEdgeLabel   support count per edge label
	 * @param minSupportCount minimum support count
	 */
	public void removeInFreqNodeAndEdge(int[] freqNodeLabel, int[] freqEdgeLabel, int minSupportCount) {
		for (int i = 0; i < nodeLabels.size(); i++) {
			int label = nodeLabels.get(i);
			if (freqNodeLabel[label] < minSupportCount) {
				// below the support threshold: the node becomes unusable
				nodeVisibles.set(i, false);
			}
		}

		for (int i = 0; i < edgeLabels.size(); i++) {
			int label = edgeLabels.get(i);

			if (freqEdgeLabel[label] < minSupportCount) {
				// below the support threshold: the edge becomes unusable
				edgeVisibles.set(i, false);
				continue;
			}

			// an edge is also unusable if either endpoint was removed;
			// x and y are node ids
			int x = edgeX.get(i);
			int y = edgeY.get(i);
			if (!nodeVisibles.get(x) || !nodeVisibles.get(y)) {
				edgeVisibles.set(i, false);
			}
		}
	}

	/**
	 * Re-number the surviving nodes and edges according to their label ranks.
	 *
	 * @param nodeLabel2Rank rank of each node label
	 * @param edgeLabel2Rank rank of each edge label
	 */
	public void reLabelByRank(int[] nodeLabel2Rank, int[] edgeLabel2Rank) {
		int count = 0;
		// maps old node ids to the new, compacted ids
		int[] oldId2New = new int[nodeLabels.size()];
		for (int i = 0; i < nodeLabels.size(); i++) {
			int label = nodeLabels.get(i);

			// a visible node takes its label's rank as its new label
			if (nodeVisibles.get(i)) {
				nodeLabels.set(i, nodeLabel2Rank[label]);
				oldId2New[i] = count;
				count++;
			}
		}

		for (int i = 0; i < edgeLabels.size(); i++) {
			int label = edgeLabels.get(i);

			// a visible edge takes its label's rank as its new label
			if (edgeVisibles.get(i)) {
				edgeLabels.set(i, edgeLabel2Rank[label]);

				// remap both endpoint ids to the compacted ids
				int temp = edgeX.get(i);
				edgeX.set(i, oldId2New[temp]);
				temp = edgeY.get(i);
				edgeY.set(i, oldId2New[temp]);
			}
		}
	}
}
163 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/graph/gspan/SubChildTraveler.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.graph.gspan;
2 |
3 | import java.util.ArrayList;
4 |
5 | /**
6 | * 孩子图搜寻类,在当前边的基础上寻找可能的孩子边
7 | */
8 | public class SubChildTraveler {
9 |
10 | // 当前的五元组边
11 | ArrayList edgeSeq;
12 | // 当前的图
13 | Graph graph;
14 | // 结果数据,孩子边对所属的图id组
15 | ArrayList childEdge;
16 | // 图的点id对五元组id标识的映射
17 | int[] g2s;
18 | // 五元组id标识对图的点id的映射
19 | int[] s2g;
20 | // 图中边是否被用的情况
21 | boolean f[][];
22 | // 最右路径,rm[id]表示的是此id节点在最右路径中的下一个节点id
23 | int[] rm;
24 | // 下一个五元组的id
25 | int next;
26 |
27 | public SubChildTraveler(ArrayList edgeSeq, Graph graph) {
28 | this.edgeSeq = edgeSeq;
29 | this.graph = graph;
30 | this.childEdge = new ArrayList<>();
31 | }
32 |
33 | /**
34 | * 在图中搜索可能存在的孩子边
35 | *
36 | * @param next
37 | * 新加入边的节点将设置的id
38 | */
39 | public void traveler() {
40 | this.next = edgeSeq.size() + 1;
41 | int size = graph.nodeLabels.size();
42 | // 做id映射的初始化操作
43 | g2s = new int[size];
44 | s2g = new int[size];
45 | f = new boolean[size][size];
46 |
47 | for (int i = 0; i < size; i++) {
48 | g2s[i] = -1;
49 | s2g[i] = -1;
50 |
51 | for (int j = 0; j < size; j++) {
52 | // 代表点id为i到id为j点此边没有被用过
53 | f[i][j] = false;
54 | }
55 | }
56 |
57 | rm = new int[edgeSeq.size() + 1];
58 | for (int i = 0; i < edgeSeq.size() + 1; i++) {
59 | rm[i] = -1;
60 | }
61 | // 寻找最右路径
62 | for (Edge e : edgeSeq) {
63 | if (e.ix < e.iy && e.iy > rm[e.ix]) {
64 | rm[e.ix] = e.iy;
65 | }
66 | }
67 |
68 | for (int i = 0; i < size; i++) {
69 | // 寻找第一个标号相等的点
70 | if (edgeSeq.get(0).x != graph.nodeLabels.get(i)) {
71 | continue;
72 | }
73 |
74 | g2s[i] = 0;
75 | s2g[0] = i;
76 | dfsSearchEdge(0);
77 | g2s[i] = -1;
78 | s2g[0] = -1;
79 | }
80 |
81 | }
82 |
83 | /**
84 | * 在当前图中深度优先寻找正确的子图
85 | *
86 | * @param currentPosition
87 | * 当前找到的位置
88 | */
89 | public void dfsSearchEdge(int currentPosition) {
90 | int rmPosition = 0;
91 | // 如果找到底了,则在当前的子图的最右路径中寻找可能的边
92 | if (currentPosition >= edgeSeq.size()) {
93 | rmPosition = 0;
94 | while (rmPosition >= 0) {
95 | int gId = s2g[rmPosition];
96 | // 在此点附近寻找可能的边
97 | for (int i = 0; i < graph.edgeNexts.get(gId).size(); i++) {
98 | int gId2 = graph.edgeNexts.get(gId).get(i);
99 | // 如果这条边已经被用过
100 | if (f[gId][gId2] || f[gId][gId2]) {
101 | continue;
102 | }
103 |
104 | // 在最右路径中添加边分为2种情况,第一种为在最右节点上添加,第二中为在最右路径上 的点添加
105 | // 如果找到的点没有被用过,可以进行边的拓展
106 | if (g2s[gId2] < 0) {
107 | g2s[gId2] = next;
108 | Edge e = new Edge(g2s[gId], g2s[gId2], graph.nodeLabels.get(gId),
109 | graph.edgeLabels.get(gId).get(i), graph.nodeLabels.get(gId2));
110 | // 将新建的子边加入集合
111 | childEdge.add(e);
112 | } else {
113 | boolean flag = true;
114 | // 如果这点已经存在,判断他是不是最右的点
115 | for (int j = 0; j < graph.edgeNexts.get(gId2).size(); j++) {
116 | int tempId = graph.edgeNexts.get(gId2).get(j);
117 | if (g2s[gId2] < g2s[tempId]) {
118 | flag = false;
119 | break;
120 | }
121 | }
122 |
123 | if (flag) {
124 | Edge e = new Edge(g2s[gId], g2s[gId2], graph.nodeLabels.get(gId),
125 | graph.edgeLabels.get(gId).get(i), graph.nodeLabels.get(gId2));
126 | // 将新建的子边加入集合
127 | childEdge.add(e);
128 | }
129 | }
130 | }
131 | // 一个最右路径上点找完,继续下一个
132 | rmPosition = rm[rmPosition];
133 | }
134 | return;
135 | }
136 |
137 | Edge e = edgeSeq.get(currentPosition);
138 | // 所连接的点标号
139 | int y = e.y;
140 | // 所连接的边标号
141 | int a = e.a;
142 | int gId1 = s2g[e.ix];
143 | int gId2 = 0;
144 |
145 | for (int i = 0; i < graph.edgeLabels.get(gId1).size(); i++) {
146 | // 判断所连接的边对应的标号
147 | if (graph.edgeLabels.get(gId1).get(i) != a) {
148 | continue;
149 | }
150 |
151 | // 判断所连接的点的标号
152 | int tempId = graph.edgeNexts.get(gId1).get(i);
153 | if (graph.nodeLabels.get(tempId) != y) {
154 | continue;
155 | }
156 |
157 | gId2 = tempId;
158 | // 如果这两点是没有设置过的
159 | if (g2s[gId2] == -1 && s2g[e.iy] == -1) {
160 | g2s[gId2] = e.iy;
161 | s2g[e.iy] = gId2;
162 | f[gId1][gId2] = true;
163 | f[gId2][gId1] = true;
164 | dfsSearchEdge(currentPosition + 1);
165 | f[gId1][gId2] = false;
166 | f[gId2][gId1] = false;
167 | g2s[gId2] = -1;
168 | s2g[e.iy] = -1;
169 | } else {
170 | if (g2s[gId2] != e.iy) {
171 | continue;
172 | }
173 | if (s2g[e.iy] != gId2) {
174 | continue;
175 | }
176 | f[gId1][gId2] = true;
177 | f[gId2][gId1] = true;
178 | dfsSearchEdge(currentPosition);
179 | f[gId1][gId2] = false;
180 | f[gId2][gId1] = false;
181 | }
182 | }
183 |
184 | }
185 |
186 | /**
187 | * 获取结果数据对
188 | *
189 | * @return
190 | */
191 | public ArrayList getResultChildEdge() {
192 | return this.childEdge;
193 | }
194 |
195 | }
196 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/integrated/cba/CBAExample.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.integrated.cba;
2 |
3 | import java.text.MessageFormat;
4 |
5 | /**
6 | * CBA算法--基于关联规则的分类算法
7 | */
8 | public class CBAExample {
9 |
10 | public static void main(String[] args) {
11 | String filePath = "data/cba/input.txt";
12 | String attrDesc = "Age=Senior,CreditRating=Fair";
13 | String classification = null;
14 |
15 | //最小支持度阈值率
16 | double minSupportRate = 0.2;
17 | //最小置信度阈值
18 | double minConf = 0.7;
19 |
20 | CBACore tool = new CBACore(filePath, minSupportRate, minConf);
21 | classification = tool.CBAJudge(attrDesc);
22 | System.out.println(MessageFormat.format("{0}的关联分类结果为{1}", attrDesc, classification));
23 | }
24 |
25 | }
26 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/integrated/cba/FrequentItem.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.integrated.cba;
2 |
3 | /**
4 | * 频繁项集
5 | */
6 | public class FrequentItem implements Comparable {
7 |
8 | // 频繁项集的集合ID
9 | private String[] idArray;
10 | // 频繁项集的支持度计数
11 | private int count;
12 | //频繁项集的长度,1项集或是2项集,亦或是3项集
13 | private int length;
14 |
15 | public FrequentItem(String[] idArray, int count) {
16 | this.idArray = idArray;
17 | this.count = count;
18 | length = idArray.length;
19 | }
20 |
21 | public String[] getIdArray() {
22 | return idArray;
23 | }
24 |
25 | public void setIdArray(String[] idArray) {
26 | this.idArray = idArray;
27 | }
28 |
29 | public int getCount() {
30 | return count;
31 | }
32 |
33 | public void setCount(int count) {
34 | this.count = count;
35 | }
36 |
37 | public int getLength() {
38 | return length;
39 | }
40 |
41 | public void setLength(int length) {
42 | this.length = length;
43 | }
44 |
45 | @Override
46 | public int compareTo(FrequentItem o) {
47 | // TODO Auto-generated method stub
48 | Integer int1 = Integer.parseInt(this.getIdArray()[0]);
49 | Integer int2 = Integer.parseInt(o.getIdArray()[0]);
50 |
51 | return int1.compareTo(int2);
52 | }
53 |
54 | }
55 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/link/hits/HITSCore.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.link.hits;
2 |
3 | import java.io.BufferedReader;
4 | import java.io.File;
5 | import java.io.FileReader;
6 | import java.io.IOException;
7 | import java.util.ArrayList;
8 |
/**
 * HITS link-analysis algorithm.
 * (Generic type parameters were restored — the raw-typed version could not
 * compile — and comments were translated to English.)
 */
public class HITSCore {

	// input data file path
	private String filePath;
	// number of pages
	private int pageNum;
	// authority score of each page
	private double[] authority;
	// hub score of each page
	private double[] hub;
	// adjacency matrix: linkMatrix[i][j] == 1 means page i links to page j
	private int[][] linkMatrix;
	// distinct page names, in first-seen order
	private ArrayList<String> pageClass;

	public HITSCore(String filePath) {
		this.filePath = filePath;
		readDataFile();
	}

	/**
	 * Read the link pairs from the input file and build the link matrix.
	 * Each input line holds two space-separated page numbers: "from to".
	 */
	private void readDataFile() {
		File file = new File(filePath);
		ArrayList<String[]> dataArray = new ArrayList<>();

		// try-with-resources closes the reader even on failure; the original
		// leaked it and discarded errors via e.getStackTrace()
		try (BufferedReader in = new BufferedReader(new FileReader(file))) {
			String str;
			while ((str = in.readLine()) != null) {
				dataArray.add(str.split(" "));
			}
		} catch (IOException e) {
			e.printStackTrace();
		}

		pageClass = new ArrayList<>();
		// collect the distinct page names
		for (String[] array : dataArray) {
			for (String s : array) {
				if (!pageClass.contains(s)) {
					pageClass.add(s);
				}
			}
		}

		pageNum = pageClass.size();
		linkMatrix = new int[pageNum][pageNum];
		authority = new double[pageNum];
		hub = new double[pageNum];
		for (int k = 0; k < pageNum; k++) {
			// both scores start at 1
			authority[k] = 1;
			hub[k] = 1;
		}

		for (String[] array : dataArray) {
			// NOTE(review): assumes pages are numbered 1..pageNum in the
			// input — confirm the data format
			int i = Integer.parseInt(array[0]);
			int j = Integer.parseInt(array[1]);

			// 1 means page i contains a link pointing to page j
			linkMatrix[i - 1][j - 1] = 1;
		}
	}

	/**
	 * Iterate the HITS mutual update until convergence, printing the
	 * normalized scores each round, then print the page with the highest
	 * authority score.
	 */
	public void printResultPage() {
		// maxima of the new hub/authority scores, used for normalization
		double maxHub = 0;
		double maxAuthority = 0;
		int maxAuthorityIndex = 0;
		// accumulated change between iterations, used as convergence test
		double error = Integer.MAX_VALUE;
		double[] newHub = new double[pageNum];
		double[] newAuthority = new double[pageNum];

		while (error > 0.01 * pageNum) {
			for (int k = 0; k < pageNum; k++) {
				newHub[k] = 0;
				newAuthority[k] = 0;
			}

			// mutual update: a page's hub score sums the authorities of the
			// pages it links to; a page's authority sums the hub scores of
			// the pages linking to it
			for (int i = 0; i < pageNum; i++) {
				for (int j = 0; j < pageNum; j++) {
					if (linkMatrix[i][j] == 1) {
						newHub[i] += authority[j];
						newAuthority[j] += hub[i];
					}
				}
			}

			maxHub = 0;
			maxAuthority = 0;
			for (int k = 0; k < pageNum; k++) {
				if (newHub[k] > maxHub) {
					maxHub = newHub[k];
				}

				if (newAuthority[k] > maxAuthority) {
					maxAuthority = newAuthority[k];
					maxAuthorityIndex = k;
				}
			}

			error = 0;
			// normalize by the maxima and accumulate the hub-score change
			for (int k = 0; k < pageNum; k++) {
				newHub[k] /= maxHub;
				newAuthority[k] /= maxAuthority;

				error += Math.abs(newHub[k] - hub[k]);
				System.out.println(newAuthority[k] + ":" + newHub[k]);

				hub[k] = newHub[k];
				authority[k] = newAuthority[k];
			}
			System.out.println("---------");
		}

		System.out.println("****最终收敛的网页的权威值和中心值****");
		for (int k = 0; k < pageNum; k++) {
			System.out.println("网页" + pageClass.get(k) + ":" + authority[k] + ":" + hub[k]);
		}
		System.out.println("权威值最高的网页为:网页" + pageClass.get(maxAuthorityIndex));
	}

}
149 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/link/hits/HITSExample.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.link.hits;
2 |
3 | /**
4 | * HITS链接分析算法
5 | */
6 | public class HITSExample {
7 |
8 | public static void main(String[] args) {
9 | String filePath = "data/hits/input.txt";
10 |
11 | HITSCore tool = new HITSCore(filePath);
12 | tool.printResultPage();
13 | }
14 |
15 | }
16 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/link/pagerank/PageRankCore.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.link.pagerank;
2 |
3 | import java.io.BufferedReader;
4 | import java.io.File;
5 | import java.io.FileReader;
6 | import java.io.IOException;
7 | import java.text.MessageFormat;
8 | import java.util.ArrayList;
9 |
/**
 * PageRank page-ranking algorithm.
 * (Generic type parameters were restored — the raw-typed version could not
 * compile — and comments were translated to English.)
 */
public class PageRankCore {

	// input data file path
	private String filePath;
	// total number of pages
	private int pageNum;
	// link relation matrix
	private double[][] linkMatrix;
	// current PageRank value of every page
	private double[] pageRankVector;

	// distinct page names, in first-seen order
	ArrayList<String> pageClass;

	public PageRankCore(String filePath) {
		this.filePath = filePath;
		readDataFile();
	}

	/**
	 * Read the link pairs from the input file and build the link matrix.
	 * Each input line holds two space-separated page numbers: "from to".
	 */
	private void readDataFile() {
		File file = new File(filePath);
		ArrayList<String[]> dataArray = new ArrayList<>();

		// try-with-resources closes the reader even on failure; the original
		// leaked it and discarded errors via e.getStackTrace()
		try (BufferedReader in = new BufferedReader(new FileReader(file))) {
			String str;
			while ((str = in.readLine()) != null) {
				dataArray.add(str.split(" "));
			}
		} catch (IOException e) {
			e.printStackTrace();
		}

		pageClass = new ArrayList<>();
		// collect the distinct page names
		for (String[] array : dataArray) {
			for (String s : array) {
				if (!pageClass.contains(s)) {
					pageClass.add(s);
				}
			}
		}

		pageNum = pageClass.size();
		linkMatrix = new double[pageNum][pageNum];
		pageRankVector = new double[pageNum];
		for (int k = 0; k < pageNum; k++) {
			// every page starts with PageRank 1
			pageRankVector[k] = 1.0;
		}
		for (String[] array : dataArray) {
			// NOTE(review): assumes pages are numbered 1..pageNum in the
			// input — confirm the data format
			int i = Integer.parseInt(array[0]);
			int j = Integer.parseInt(array[1]);

			// 1 means page i contains a link pointing to page j
			linkMatrix[i - 1][j - 1] = 1;
		}
	}

	/**
	 * Turn the 0/1 link matrix into the transposed transition-probability
	 * matrix: each row is divided by its out-degree, then transposed.
	 */
	private void transferMatrix() {
		for (double[] array : linkMatrix) {
			// count the outgoing links of this page
			int count = 0;
			for (double d : array) {
				if (d == 1) {
					count++;
				}
			}
			// split the probability evenly over the outgoing links
			for (int i = 0; i < array.length; i++) {
				if (array[i] == 1) {
					array[i] /= count;
				}
			}
		}

		// transpose, giving the probability transition matrix
		for (int i = 0; i < linkMatrix.length; i++) {
			for (int j = i + 1; j < linkMatrix[0].length; j++) {
				double t = linkMatrix[i][j];
				linkMatrix[i][j] = linkMatrix[j][i];
				linkMatrix[j][i] = t;
			}
		}
	}

	/**
	 * Compute the PageRank values with the power method, print them per
	 * iteration, and finally print the top-ranked page.
	 */
	public void printPageRankValue() {
		transferMatrix();
		// damping factor
		double damp = 0.5;
		// link probability matrix
		double[][] A = new double[pageNum][pageNum];
		double[][] e = new double[pageNum][pageNum];

		// A = d*q + (1-d)*e/m, where m is the page count and d the damping
		double temp = (1 - damp) / pageNum;
		for (int i = 0; i < e.length; i++) {
			for (int j = 0; j < e[0].length; j++) {
				e[i][j] = temp;
			}
		}

		for (int i = 0; i < pageNum; i++) {
			for (int j = 0; j < pageNum; j++) {
				A[i][j] = damp * linkMatrix[i][j] + e[i][j];
			}
		}

		// accumulated change between iterations, used as convergence test
		double errorValue = Integer.MAX_VALUE;
		double[] newPRVector = new double[pageNum];
		// converged once the average PR change drops below 0.001
		while (errorValue > 0.001 * pageNum) {
			System.out.println("**********");
			for (int i = 0; i < pageNum; i++) {
				temp = 0;
				// multiply A by the current PageRank vector (power method)
				for (int j = 0; j < pageNum; j++) {
					// accumulate the PageRank flowing from page j to page i
					temp += A[i][j] * pageRankVector[j];
				}

				// temp is now the total PageRank of page i
				newPRVector[i] = temp;
				System.out.println(temp);
			}

			errorValue = 0;
			for (int i = 0; i < pageNum; i++) {
				errorValue += Math.abs(pageRankVector[i] - newPRVector[i]);
				// the new vector replaces the old one
				pageRankVector[i] = newPRVector[i];
			}
		}

		String name = null;
		temp = 0;
		System.out.println("--------------------");
		for (int i = 0; i < pageNum; i++) {
			System.out.println(MessageFormat.format("网页{0}的pageRank值:{1}", pageClass.get(i), pageRankVector[i]));
			if (pageRankVector[i] > temp) {
				temp = pageRankVector[i];
				name = pageClass.get(i);
			}
		}
		System.out.println(MessageFormat.format("等级最高的网页为:{0}", name));
	}

}
181 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/link/pagerank/PageRankExample.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.link.pagerank;
2 |
3 | /**
4 | * PageRank计算网页重要性/排名算法
5 | */
6 | public class PageRankExample {
7 |
8 | public static void main(String[] args) {
9 | String filePath = "data/pagerank/input.txt";
10 |
11 | PageRankCore tool = new PageRankCore(filePath);
12 | tool.printPageRankValue();
13 | }
14 |
15 | }
16 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/others/aco/ACOExample.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.others.aco;
2 |
3 | /**
4 | * 蚁群算法测试类
5 | */
6 | public class ACOExample {
7 |
8 | public static void main(String[] args) {
9 | //测试数据
10 | String filePath = "data/aco/input.txt";
11 | //蚂蚁数量
12 | int antNum;
13 | //蚁群算法迭代次数
14 | int loopCount;
15 | //控制参数
16 | double alpha;
17 | double beita;
18 | double p;
19 | double Q;
20 |
21 | antNum = 3;
22 | alpha = 0.5;
23 | beita = 1;
24 | p = 0.5;
25 | Q = 5;
26 | loopCount = 5;
27 |
28 | ACOCore tool = new ACOCore(filePath, antNum, alpha, beita, p, Q);
29 | tool.antStartSearching(loopCount);
30 | }
31 |
32 | }
33 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/others/aco/Ant.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.others.aco;
2 |
3 | import java.util.ArrayList;
4 |
5 | /**
6 | * 蚂蚁类,进行路径搜索的载体
7 | */
8 | public class Ant implements Comparable {
9 |
10 | // 蚂蚁当前所在城市
11 | String currentPos;
12 | // 蚂蚁遍历完回到原点所用的总距离
13 | Double sumDistance;
14 | // 城市间的信息素浓度矩阵,随着时间的增多而减少
15 | double[][] pheromoneMatrix;
16 | // 蚂蚁已经走过的城市集合
17 | ArrayList visitedCitys;
18 | // 还未走过的城市集合
19 | ArrayList nonVisitedCitys;
20 | // 蚂蚁当前走过的路径
21 | ArrayList currentPath;
22 |
23 | public Ant(double[][] pheromoneMatrix, ArrayList nonVisitedCitys) {
24 | this.pheromoneMatrix = pheromoneMatrix;
25 | this.nonVisitedCitys = nonVisitedCitys;
26 |
27 | this.visitedCitys = new ArrayList<>();
28 | this.currentPath = new ArrayList<>();
29 | }
30 |
31 | /**
32 | * 计算路径的总成本(距离)
33 | *
34 | * @return
35 | */
36 | public double calSumDistance() {
37 | sumDistance = 0.0;
38 | String lastCity;
39 | String currentCity;
40 |
41 | for (int i = 0; i < currentPath.size() - 1; i++) {
42 | lastCity = currentPath.get(i);
43 | currentCity = currentPath.get(i + 1);
44 |
45 | // 通过距离矩阵进行计算
46 | sumDistance += ACOCore.disMatrix[Integer.parseInt(lastCity)][Integer.parseInt(currentCity)];
47 | }
48 |
49 | return sumDistance;
50 | }
51 |
52 | /**
53 | * 蚂蚁选择前往下一个城市
54 | *
55 | * @param city
56 | * 所选的城市
57 | */
58 | public void goToNextCity(String city) {
59 | this.currentPath.add(city);
60 | this.currentPos = city;
61 | this.nonVisitedCitys.remove(city);
62 | this.visitedCitys.add(city);
63 | }
64 |
65 | /**
66 | * 判断蚂蚁是否已经又重新回到起点
67 | *
68 | * @return
69 | */
70 | public boolean isBack() {
71 | boolean isBack = false;
72 | String startPos;
73 | String endPos;
74 |
75 | if (currentPath.size() == 0) {
76 | return isBack;
77 | }
78 |
79 | startPos = currentPath.get(0);
80 | endPos = currentPath.get(currentPath.size() - 1);
81 | if (currentPath.size() > 1 && startPos.equals(endPos)) {
82 | isBack = true;
83 | }
84 |
85 | return isBack;
86 | }
87 |
88 | /**
89 | * 判断蚂蚁在本次的走过的路径中是否包含从城市i到城市j
90 | *
91 | * @param cityI
92 | * 城市I
93 | * @param cityJ
94 | * 城市J
95 | * @return
96 | */
97 | public boolean pathContained(String cityI, String cityJ) {
98 | String lastCity;
99 | String currentCity;
100 | boolean isContained = false;
101 |
102 | for (int i = 0; i < currentPath.size() - 1; i++) {
103 | lastCity = currentPath.get(i);
104 | currentCity = currentPath.get(i + 1);
105 |
106 | // 如果某一段路径的始末位置一致,则认为有经过此城市
107 | if ((lastCity.equals(cityI) && currentCity.equals(cityJ))
108 | || (lastCity.equals(cityJ) && currentCity.equals(cityI))) {
109 | isContained = true;
110 | break;
111 | }
112 | }
113 |
114 | return isContained;
115 | }
116 |
117 | @Override
118 | public int compareTo(Ant o) {
119 | // TODO Auto-generated method stub
120 | return this.sumDistance.compareTo(o.sumDistance);
121 | }
122 | }
123 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/others/bayesnetwork/BayesNetWorkExample.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.others.bayesnetwork;
2 |
3 | /**
4 | * 贝叶斯网络场景测试类
5 | */
6 | public class BayesNetWorkExample {
7 |
8 | public static void main(String[] args) {
9 | String dataFilePath = "data/bayesnetwork/input.txt";
10 | String attachFilePath = "data/bayesnetwork/attach.txt";
11 | // 查询串语句
12 | String queryStr;
13 | // 结果概率
14 | double result;
15 |
16 | // 查询语句的描述的事件是地震发生了,导致响铃响了,导致接到Mary的电话
17 | queryStr = "E=y,A=y,M=y";
18 | BayesNetWorkCore tool = new BayesNetWorkCore(dataFilePath, attachFilePath);
19 | result = tool.calProByNetWork(queryStr);
20 |
21 | if (result == -1) {
22 | System.out.println("所描述的事件不满足贝叶斯网络的结构,无法求其概率");
23 | } else {
24 | System.out.println(String.format("事件%s发生的概率为%s", queryStr, result));
25 | }
26 | }
27 |
28 | }
29 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/others/bayesnetwork/Node.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.others.bayesnetwork;
2 |
3 | import java.util.ArrayList;
4 |
5 | /**
6 | * 贝叶斯网络节点类
7 | */
8 | public class Node {
9 |
10 | // 节点的属性名称
11 | String name;
12 | // 节点的父亲节点,也就是上游节点,可能多个
13 | ArrayList parentNodes;
14 | // 节点的子节点,也就是下游节点,可能多个
15 | ArrayList childNodes;
16 |
17 | public Node(String name) {
18 | this.name = name;
19 |
20 | // 初始化变量
21 | this.parentNodes = new ArrayList<>();
22 | this.childNodes = new ArrayList<>();
23 | }
24 |
25 | /**
26 | * 将自身节点连接到目标给定的节点
27 | *
28 | * @param node
29 | * 下游节点
30 | */
31 | public void connectNode(Node node) {
32 | // 将下游节点加入自身节点的孩子节点中
33 | this.childNodes.add(node);
34 | // 将自身节点加入到下游节点的父节点中
35 | node.parentNodes.add(this);
36 | }
37 |
38 | /**
39 | * 判断与目标节点是否相同,主要比较名称是否相同即可
40 | *
41 | * @param node
42 | * 目标结点
43 | * @return
44 | */
45 | public boolean isEqual(Node node) {
46 | boolean isEqual;
47 |
48 | isEqual = false;
49 | // 节点名称相同则视为相等
50 | if (this.name.equals(node.name)) {
51 | isEqual = true;
52 | }
53 |
54 | return isEqual;
55 | }
56 | }
57 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/others/cabddcc/CABDDCCCore.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.others.cabddcc;
2 |
3 | import java.io.BufferedReader;
4 | import java.io.File;
5 | import java.io.FileReader;
6 | import java.io.IOException;
7 | import java.text.MessageFormat;
8 | import java.util.ArrayList;
9 |
10 | /**
11 | * 基于连通图的分裂聚类算法
12 | */
13 | public class CABDDCCCore {
14 |
15 | // 测试数据点数据
16 | private String filePath;
17 | // 连通图距离阈值l
18 | private int length;
19 | // 原始坐标点
20 | public static ArrayList totalPoints;
21 | // 聚类结果坐标点集合
22 | private ArrayList> resultClusters;
23 | // 连通图
24 | private Graph graph;
25 |
26 | public CABDDCCCore(String filePath, int length) {
27 | this.filePath = filePath;
28 | this.length = length;
29 |
30 | readDataFile();
31 | }
32 |
33 | /**
34 | * 从文件中读取数据
35 | */
36 | public void readDataFile() {
37 | File file = new File(filePath);
38 | ArrayList dataArray = new ArrayList();
39 |
40 | try {
41 | BufferedReader in = new BufferedReader(new FileReader(file));
42 | String str;
43 | String[] tempArray;
44 | while ((str = in.readLine()) != null) {
45 | tempArray = str.split(" ");
46 | dataArray.add(tempArray);
47 | }
48 | in.close();
49 | } catch (IOException e) {
50 | e.getStackTrace();
51 | }
52 |
53 | Point p;
54 | totalPoints = new ArrayList<>();
55 | for (String[] array : dataArray) {
56 | p = new Point(array[0], array[1], array[2]);
57 | totalPoints.add(p);
58 | }
59 |
60 | // 用边和点构造图
61 | graph = new Graph(null, totalPoints);
62 | }
63 |
64 | /**
65 | * 分裂连通图得到聚类
66 | */
67 | public void splitCluster() {
68 | // 获取形成连通子图
69 | ArrayList subGraphs;
70 | ArrayList> pointList;
71 | resultClusters = new ArrayList<>();
72 |
73 | subGraphs = graph.splitGraphByLength(length);
74 |
75 | for (Graph g : subGraphs) {
76 | // 获取每个连通子图分裂后的聚类结果
77 | pointList = g.getClusterByDivding();
78 | resultClusters.addAll(pointList);
79 | }
80 |
81 | printResultCluster();
82 | }
83 |
84 | /**
85 | * 输出结果聚簇
86 | */
87 | private void printResultCluster() {
88 | int i = 1;
89 | for (ArrayList cluster : resultClusters) {
90 | System.out.print("聚簇" + i + ":");
91 | for (Point p : cluster) {
92 | System.out.print(MessageFormat.format("({0}, {1}) ", p.x, p.y));
93 | }
94 | System.out.println();
95 | i++;
96 | }
97 |
98 | }
99 |
100 | }
101 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/others/cabddcc/CABDDCCExample.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.others.cabddcc;
2 |
3 | /**
4 | * 基于连通图的分裂聚类算法
5 | */
6 | public class CABDDCCExample {
7 |
8 | public static void main(String[] agrs) {
9 | String filePath = "data/cabddcc/graphData.txt";
10 | //连通距离阈值
11 | int length = 3;
12 |
13 | CABDDCCCore tool = new CABDDCCCore(filePath, length);
14 | tool.splitCluster();
15 | }
16 |
17 | }
18 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/others/cabddcc/Point.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.others.cabddcc;
2 |
/**
 * A 2-D coordinate point with a unique id, used by the CABDDCC algorithm.
 */
public class Point implements Comparable<Point> {

	// unique id of the point
	int id;
	// x coordinate
	Integer x;
	// y coordinate
	Integer y;
	// whether the point has been visited; used while building connected subgraphs
	boolean isVisited;

	public Point(String id, String x, String y) {
		this.id = Integer.parseInt(id);
		this.x = Integer.parseInt(x);
		this.y = Integer.parseInt(y);
	}

	/**
	 * Computes the Euclidean distance between this point and the given point.
	 *
	 * @param p
	 *            the point to measure against
	 * @return the Euclidean distance
	 */
	public double ouDistance(Point p) {
		double distance = 0;

		distance = (this.x - p.x) * (this.x - p.x) + (this.y - p.y) * (this.y - p.y);
		distance = Math.sqrt(distance);

		return distance;
	}

	/**
	 * Checks whether two points share the same coordinates.
	 *
	 * @param p
	 *            the point to compare with
	 * @return true if both coordinates are equal
	 */
	public boolean isTheSame(Point p) {
		// compare values, not boxed references: Integer == only works for the
		// cached range [-128, 127]
		return this.x.equals(p.x) && this.y.equals(p.y);
	}

	@Override
	public int compareTo(Point p) {
		if (this.x.compareTo(p.x) != 0) {
			return this.x.compareTo(p.x);
		} else {
			// break ties on x by comparing y
			return this.y.compareTo(p.y);
		}
	}
}
66 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/others/chameleon/ChameleonExample.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.others.chameleon;
2 |
3 | /**
4 | * Chameleon(变色龙)两阶段聚类算法
5 | */
6 | public class ChameleonExample {
7 |
8 | public static void main(String[] args) {
9 | String filePath = "data/chameleon/graphData.txt";
10 | //k-近邻的k设置
11 | int k = 1;
12 | //度量函数阈值
13 | double minMetric = 0.1;
14 |
15 | ChameleonCore tool = new ChameleonCore(filePath, k, minMetric);
16 | tool.buildCluster();
17 | }
18 |
19 | }
20 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/others/chameleon/Cluster.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.others.chameleon;
2 |
3 | import java.util.ArrayList;
4 |
5 | /**
6 | * 聚簇类
7 | */
8 | public class Cluster implements Cloneable {
9 |
10 | //簇唯一id标识号
11 | int id;
12 | // 聚簇内的坐标点集合
13 | ArrayList points;
14 | // 聚簇内的所有边的权重和
15 | double weightSum = 0;
16 |
17 | public Cluster(int id, ArrayList points) {
18 | this.id = id;
19 | this.points = points;
20 | }
21 |
22 | /**
23 | * 计算聚簇的内部的边权重和
24 | *
25 | * @return
26 | */
27 | public double calEC() {
28 | int id1 = 0;
29 | int id2 = 0;
30 | weightSum = 0;
31 |
32 | for (Point p1 : points) {
33 | for (Point p2 : points) {
34 | id1 = p1.id;
35 | id2 = p2.id;
36 |
37 | // 为了避免重复计算,取id1小的对应大的
38 | if (id1 < id2 && ChameleonCore.edges[id1][id2] == 1) {
39 | weightSum += ChameleonCore.weights[id1][id2];
40 | }
41 | }
42 | }
43 |
44 | return weightSum;
45 | }
46 |
47 | /**
48 | * 计算2个簇之间最近的n条边
49 | *
50 | * @param otherCluster
51 | * 待比较的簇
52 | * @param n
53 | * 最近的边的数目
54 | * @return
55 | */
56 | public ArrayList calNearestEdge(Cluster otherCluster, int n) {
57 | int count = 0;
58 | double distance = 0;
59 | double minDistance = Integer.MAX_VALUE;
60 | Point point1 = null;
61 | Point point2 = null;
62 | ArrayList edgeList = new ArrayList<>();
63 | ArrayList pointList1 = (ArrayList) points.clone();
64 | ArrayList pointList2 = null;
65 | Cluster c2 = null;
66 |
67 | try {
68 | c2 = (Cluster) otherCluster.clone();
69 | pointList2 = c2.points;
70 | } catch (CloneNotSupportedException e) {
71 | // TODO Auto-generated catch block
72 | e.printStackTrace();
73 | }
74 |
75 | int[] tempEdge;
76 | // 循环计算出每次的最近距离
77 | while (count < n) {
78 | tempEdge = new int[2];
79 | minDistance = Integer.MAX_VALUE;
80 |
81 | for (Point p1 : pointList1) {
82 | for (Point p2 : pointList2) {
83 | distance = p1.ouDistance(p2);
84 | if (distance < minDistance) {
85 | point1 = p1;
86 | point2 = p2;
87 | tempEdge[0] = p1.id;
88 | tempEdge[1] = p2.id;
89 |
90 | minDistance = distance;
91 | }
92 | }
93 | }
94 |
95 | pointList1.remove(point1);
96 | pointList2.remove(point2);
97 | edgeList.add(tempEdge);
98 | count++;
99 | }
100 |
101 | return edgeList;
102 | }
103 |
104 | @Override
105 | protected Object clone() throws CloneNotSupportedException {
106 | // TODO Auto-generated method stub
107 |
108 | //引用需要再次复制,实现深拷贝
109 | ArrayList pointList = (ArrayList) this.points.clone();
110 | Cluster cluster = new Cluster(id, pointList);
111 |
112 | return cluster;
113 | }
114 |
115 | }
116 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/others/chameleon/Point.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.others.chameleon;
2 |
3 | /**
4 | * 坐标点类
5 | */
6 | public class Point {
7 |
8 | //坐标点id号,id号唯一
9 | int id;
10 | //坐标横坐标
11 | Integer x;
12 | //坐标纵坐标
13 | Integer y;
14 | //是否已经被访问过
15 | boolean isVisited;
16 |
17 | public Point(String id, String x, String y) {
18 | this.id = Integer.parseInt(id);
19 | this.x = Integer.parseInt(x);
20 | this.y = Integer.parseInt(y);
21 | }
22 |
23 | /**
24 | * 计算当前点与制定点之间的欧式距离
25 | *
26 | * @param p
27 | * 待计算聚类的p点
28 | * @return
29 | */
30 | public double ouDistance(Point p) {
31 | double distance = 0;
32 |
33 | distance = (this.x - p.x) * (this.x - p.x) + (this.y - p.y) * (this.y - p.y);
34 | distance = Math.sqrt(distance);
35 |
36 | return distance;
37 | }
38 |
39 | /**
40 | * 判断2个坐标点是否为用个坐标点
41 | *
42 | * @param p
43 | * 待比较坐标点
44 | * @return
45 | */
46 | public boolean isTheSame(Point p) {
47 | boolean isSamed = false;
48 |
49 | if (this.x == p.x && this.y == p.y) {
50 | isSamed = true;
51 | }
52 |
53 | return isSamed;
54 | }
55 | }
56 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/others/dbscan/DBSCANCore.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.others.dbscan;
2 |
3 | import java.io.BufferedReader;
4 | import java.io.File;
5 | import java.io.FileReader;
6 | import java.io.IOException;
7 | import java.text.MessageFormat;
8 | import java.util.ArrayList;
9 |
10 | /**
11 | * DBSCAN基于密度聚类算法工具类
12 | */
13 | public class DBSCANCore {
14 |
15 | // 测试数据文件地址
16 | private String filePath;
17 | // 簇扫描半径
18 | private double eps;
19 | // 最小包含点数阈值
20 | private int minPts;
21 | // 所有的数据坐标点
22 | private ArrayList totalPoints;
23 | // 聚簇结果
24 | private ArrayList> resultClusters;
25 | //噪声数据
26 | private ArrayList noisePoint;
27 |
28 | public DBSCANCore(String filePath, double eps, int minPts) {
29 | this.filePath = filePath;
30 | this.eps = eps;
31 | this.minPts = minPts;
32 | readDataFile();
33 | }
34 |
35 | /**
36 | * 从文件中读取数据
37 | */
38 | public void readDataFile() {
39 | File file = new File(filePath);
40 | ArrayList dataArray = new ArrayList();
41 |
42 | try {
43 | BufferedReader in = new BufferedReader(new FileReader(file));
44 | String str;
45 | String[] tempArray;
46 | while ((str = in.readLine()) != null) {
47 | tempArray = str.split(" ");
48 | dataArray.add(tempArray);
49 | }
50 | in.close();
51 | } catch (IOException e) {
52 | e.getStackTrace();
53 | }
54 |
55 | Point p;
56 | totalPoints = new ArrayList<>();
57 | for (String[] array : dataArray) {
58 | p = new Point(array[0], array[1]);
59 | totalPoints.add(p);
60 | }
61 | }
62 |
63 | /**
64 | * 递归的寻找聚簇
65 | *
66 | * @param pointList
67 | * 当前的点列表
68 | * @param parentCluster
69 | * 父聚簇
70 | */
71 | private void recursiveCluster(Point point, ArrayList parentCluster) {
72 | double distance = 0;
73 | ArrayList cluster;
74 |
75 | // 如果已经访问过了,则跳过
76 | if (point.isVisited) {
77 | return;
78 | }
79 |
80 | point.isVisited = true;
81 | cluster = new ArrayList<>();
82 | for (Point p2 : totalPoints) {
83 | // 过滤掉自身的坐标点
84 | if (point.isTheSame(p2)) {
85 | continue;
86 | }
87 |
88 | distance = point.ouDistance(p2);
89 | if (distance <= eps) {
90 | // 如果聚类小于给定的半径,则加入簇中
91 | cluster.add(p2);
92 | }
93 | }
94 |
95 | if (cluster.size() >= minPts) {
96 | // 将自己也加入到聚簇中
97 | cluster.add(point);
98 | // 如果附近的节点个数超过最下值,则加入到父聚簇中,同时去除重复的点
99 | addCluster(parentCluster, cluster);
100 |
101 | for (Point p : cluster) {
102 | recursiveCluster(p, parentCluster);
103 | }
104 | }
105 | }
106 |
107 | /**
108 | * 往父聚簇中添加局部簇坐标点
109 | *
110 | * @param parentCluster
111 | * 原始父聚簇坐标点
112 | * @param cluster
113 | * 待合并的聚簇
114 | */
115 | private void addCluster(ArrayList parentCluster, ArrayList cluster) {
116 | boolean isCotained = false;
117 | ArrayList addPoints = new ArrayList<>();
118 |
119 | for (Point p : cluster) {
120 | isCotained = false;
121 | for (Point p2 : parentCluster) {
122 | if (p.isTheSame(p2)) {
123 | isCotained = true;
124 | break;
125 | }
126 | }
127 |
128 | if (!isCotained) {
129 | addPoints.add(p);
130 | }
131 | }
132 |
133 | parentCluster.addAll(addPoints);
134 | }
135 |
136 | /**
137 | * dbScan算法基于密度的聚类
138 | */
139 | public void dbScanCluster() {
140 | ArrayList cluster = null;
141 | resultClusters = new ArrayList<>();
142 | noisePoint = new ArrayList<>();
143 |
144 | for (Point p : totalPoints) {
145 | if (p.isVisited) {
146 | continue;
147 | }
148 |
149 | cluster = new ArrayList<>();
150 | recursiveCluster(p, cluster);
151 |
152 | if (cluster.size() > 0) {
153 | resultClusters.add(cluster);
154 | } else {
155 | noisePoint.add(p);
156 | }
157 | }
158 | removeFalseNoise();
159 |
160 | printClusters();
161 | }
162 |
163 | /**
164 | * 移除被错误分类的噪声点数据
165 | */
166 | private void removeFalseNoise() {
167 | ArrayList totalCluster = new ArrayList<>();
168 | ArrayList deletePoints = new ArrayList<>();
169 |
170 | //将聚簇合并
171 | for (ArrayList list : resultClusters) {
172 | totalCluster.addAll(list);
173 | }
174 |
175 | for (Point p : noisePoint) {
176 | for (Point p2 : totalCluster) {
177 | if (p2.isTheSame(p)) {
178 | deletePoints.add(p);
179 | }
180 | }
181 | }
182 |
183 | noisePoint.removeAll(deletePoints);
184 | }
185 |
186 | /**
187 | * 输出聚类结果
188 | */
189 | private void printClusters() {
190 | int i = 1;
191 | for (ArrayList pList : resultClusters) {
192 | System.out.print("聚簇" + (i++) + ":");
193 | for (Point p : pList) {
194 | System.out.print(MessageFormat.format("({0},{1}) ", p.x, p.y));
195 | }
196 | System.out.println();
197 | }
198 |
199 | System.out.println();
200 | System.out.print("噪声数据:");
201 | for (Point p : noisePoint) {
202 | System.out.print(MessageFormat.format("({0},{1}) ", p.x, p.y));
203 | }
204 | System.out.println();
205 | }
206 | }
207 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/others/dbscan/DBSCANExample.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.others.dbscan;
2 |
3 | /**
4 | * Dbscan基于密度的聚类算法测试类
5 | */
6 | public class DBSCANExample {
7 |
8 | public static void main(String[] args) {
9 | String filePath = "data/dbscan/input.txt";
10 | //簇扫描半径
11 | double eps = 3;
12 | //最小包含点数阈值
13 | int minPts = 3;
14 |
15 | DBSCANCore tool = new DBSCANCore(filePath, eps, minPts);
16 | tool.dbScanCluster();
17 | }
18 |
19 | }
20 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/others/dbscan/Point.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.others.dbscan;
2 |
/**
 * A 2-D coordinate point used by the DBSCAN algorithm.
 */
public class Point {

	// x coordinate
	int x;
	// y coordinate
	int y;
	// whether this point has already been processed
	boolean isVisited;

	public Point(String x, String y) {
		this.x = Integer.parseInt(x);
		this.y = Integer.parseInt(y);
		this.isVisited = false;
	}

	/**
	 * Computes the Euclidean distance between this point and the given point.
	 *
	 * @param p
	 *            the point to measure against
	 * @return the Euclidean distance
	 */
	public double ouDistance(Point p) {
		int dx = this.x - p.x;
		int dy = this.y - p.y;
		return Math.sqrt(dx * dx + dy * dy);
	}

	/**
	 * Checks whether two points share the same coordinates.
	 *
	 * @param p
	 *            the point to compare with
	 * @return true if both coordinates are equal
	 */
	public boolean isTheSame(Point p) {
		return this.x == p.x && this.y == p.y;
	}
}
54 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/others/ga/GAExample.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.others.ga;
2 |
3 | /**
4 | * Genetic遗传算法测试类
5 | */
6 | public class GAExample {
7 |
8 | public static void main(String[] args) {
9 | //变量最小值和最大值
10 | int minNum = 1;
11 | int maxNum = 7;
12 | //初始群体规模
13 | int initSetsNum = 4;
14 |
15 | GACore tool = new GACore(minNum, maxNum, initSetsNum);
16 | tool.geneticCal();
17 | }
18 |
19 | }
20 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/others/ga/maze/GAMazeExample.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.others.ga.maze;
2 |
3 | /**
4 | * 遗传算法在走迷宫游戏的应用
5 | */
6 | public class GAMazeExample {
7 |
8 | public static void main(String[] args) {
9 | //迷宫地图文件数据地址
10 | String filePath = "data/maze/mapData.txt";
11 | //初始个体数量
12 | int initSetsNum = 10;
13 |
14 | GAMazeCore tool = new GAMazeCore(filePath, initSetsNum);
15 | tool.goOutMaze();
16 | }
17 |
18 | }
19 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/others/kdtree/KDTreeExample.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.others.kdtree;
2 |
3 | import java.text.MessageFormat;
4 |
5 | /**
6 | * KD树算法测试类
7 | */
8 | public class KDTreeExample {
9 |
10 | public static void main(String[] args) {
11 | String filePath = "data/kdtree/input.txt";
12 | Point queryNode;
13 | Point searchedNode;
14 | KDTreeCore tool = new KDTreeCore(filePath);
15 |
16 | // 进行KD树的构建
17 | tool.createKDTree();
18 |
19 | // 通过KD树进行数据点的最近点查询
20 | queryNode = new Point(2.1, 3.1);
21 | searchedNode = tool.searchNearestData(queryNode);
22 | System.out.println(MessageFormat.format("距离查询点({0}, {1})最近的坐标点为({2}, {3})", queryNode.x, queryNode.y,
23 | searchedNode.x, searchedNode.y));
24 |
25 | //重新构造KD树,去除之前的访问记录
26 | tool.createKDTree();
27 | queryNode = new Point(2, 4.5);
28 | searchedNode = tool.searchNearestData(queryNode);
29 | System.out.println(MessageFormat.format("距离查询点({0}, {1})最近的坐标点为({2}, {3})", queryNode.x, queryNode.y,
30 | searchedNode.x, searchedNode.y));
31 | }
32 |
33 | }
34 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/others/kdtree/Point.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.others.kdtree;
2 |
3 | /**
4 | * 坐标点类
5 | */
6 | public class Point {
7 |
8 | // 坐标点横坐标
9 | Double x;
10 | // 坐标点纵坐标
11 | Double y;
12 |
13 | public Point(double x, double y) {
14 | this.x = x;
15 | this.y = y;
16 | }
17 |
18 | public Point(String x, String y) {
19 | this.x = (Double.parseDouble(x));
20 | this.y = (Double.parseDouble(y));
21 | }
22 |
23 | /**
24 | * 计算当前点与制定点之间的欧式距离
25 | *
26 | * @param p
27 | * 待计算聚类的p点
28 | * @return
29 | */
30 | public double ouDistance(Point p) {
31 | double distance = 0;
32 |
33 | distance = (this.x - p.x) * (this.x - p.x) + (this.y - p.y) * (this.y - p.y);
34 | distance = Math.sqrt(distance);
35 |
36 | return distance;
37 | }
38 |
39 | /**
40 | * 判断2个坐标点是否为用个坐标点
41 | *
42 | * @param p
43 | * 待比较坐标点
44 | * @return
45 | */
46 | public boolean isTheSame(Point p) {
47 | boolean isSamed = false;
48 |
49 | if (this.x == p.x && this.y == p.y) {
50 | isSamed = true;
51 | }
52 |
53 | return isSamed;
54 | }
55 | }
56 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/others/kdtree/Range.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.others.kdtree;
2 |
3 | /**
4 | * 空间矢量,表示所代表的空间范围
5 | */
6 | public class Range {
7 |
8 | // 边界左边界
9 | double left;
10 | // 边界右边界
11 | double right;
12 | // 边界上边界
13 | double top;
14 | // 边界下边界
15 | double bottom;
16 |
17 | public Range() {
18 | this.left = -Integer.MAX_VALUE;
19 | this.right = Integer.MAX_VALUE;
20 | this.top = Integer.MAX_VALUE;
21 | this.bottom = -Integer.MAX_VALUE;
22 | }
23 |
24 | public Range(int left, int right, int top, int bottom) {
25 | this.left = left;
26 | this.right = right;
27 | this.top = top;
28 | this.bottom = bottom;
29 | }
30 |
31 | /**
32 | * 空间矢量进行并操作
33 | *
34 | * @param range
35 | * @return
36 | */
37 | public Range crossOperation(Range r) {
38 | Range range = new Range();
39 |
40 | // 取靠近右侧的左边界
41 | if (r.left > this.left) {
42 | range.left = r.left;
43 | } else {
44 | range.left = this.left;
45 | }
46 |
47 | // 取靠近左侧的右边界
48 | if (r.right < this.right) {
49 | range.right = r.right;
50 | } else {
51 | range.right = this.right;
52 | }
53 |
54 | // 取靠近下侧的上边界
55 | if (r.top < this.top) {
56 | range.top = r.top;
57 | } else {
58 | range.top = this.top;
59 | }
60 |
61 | // 取靠近上侧的下边界
62 | if (r.bottom > this.bottom) {
63 | range.bottom = r.bottom;
64 | } else {
65 | range.bottom = this.bottom;
66 | }
67 |
68 | return range;
69 | }
70 |
71 | /**
72 | * 根据坐标点分割方向确定左侧空间矢量
73 | *
74 | * @param p
75 | * 数据矢量
76 | * @param dir
77 | * 分割方向
78 | * @return
79 | */
80 | public static Range initLeftRange(Point p, int dir) {
81 | Range range = new Range();
82 |
83 | if (dir == KDTreeCore.DIRECTION_X) {
84 | range.right = p.x;
85 | } else {
86 | range.bottom = p.y;
87 | }
88 |
89 | return range;
90 | }
91 |
92 | /**
93 | * 根据坐标点分割方向确定右侧空间矢量
94 | *
95 | * @param p
96 | * 数据矢量
97 | * @param dir
98 | * 分割方向
99 | * @return
100 | */
101 | public static Range initRightRange(Point p, int dir) {
102 | Range range = new Range();
103 |
104 | if (dir == KDTreeCore.DIRECTION_X) {
105 | range.left = p.x;
106 | } else {
107 | range.top = p.y;
108 | }
109 |
110 | return range;
111 | }
112 | }
113 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/others/kdtree/TreeNode.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.others.kdtree;
2 |
/**
 * A node of the KD-tree.
 */
public class TreeNode {

	// the data vector (point) stored at this node
	Point nodeData;
	// the splitting dimension/line of the partition plane
	int spilt;
	// the spatial range this node represents
	Range range;
	// parent node
	TreeNode parentNode;
	// child node on the left side of the splitting hyperplane
	TreeNode leftNode;
	// child node on the right side of the splitting hyperplane
	TreeNode rightNode;
	// whether the node has been visited; used during backtracking
	boolean isVisited;

	public TreeNode() {
		this.isVisited = false;
	}

}
28 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/others/msapriori/FrequentItem.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.others.msapriori;
2 |
3 | /**
4 | * 频繁项集
5 | */
6 | public class FrequentItem implements Comparable {
7 |
8 | // 频繁项集的集合ID
9 | private String[] idArray;
10 | // 频繁项集的支持度计数
11 | private int count;
12 | //频繁项集的长度,1项集或是2项集,亦或是3项集
13 | private int length;
14 |
15 | public FrequentItem(String[] idArray, int count) {
16 | this.idArray = idArray;
17 | this.count = count;
18 | length = idArray.length;
19 | }
20 |
21 | public String[] getIdArray() {
22 | return idArray;
23 | }
24 |
25 | public void setIdArray(String[] idArray) {
26 | this.idArray = idArray;
27 | }
28 |
29 | public int getCount() {
30 | return count;
31 | }
32 |
33 | public void setCount(int count) {
34 | this.count = count;
35 | }
36 |
37 | public int getLength() {
38 | return length;
39 | }
40 |
41 | public void setLength(int length) {
42 | this.length = length;
43 | }
44 |
45 | @Override
46 | public int compareTo(FrequentItem o) {
47 | // TODO Auto-generated method stub
48 | Integer int1 = Integer.parseInt(this.getIdArray()[0]);
49 | Integer int2 = Integer.parseInt(o.getIdArray()[0]);
50 |
51 | return int1.compareTo(int2);
52 | }
53 |
54 | }
55 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/others/msapriori/MSAprioriExample.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.others.msapriori;
2 |
3 | /**
4 | * 基于多支持度的Apriori算法测试类
5 | */
6 | public class MSAprioriExample {
7 |
8 | public static void main(String[] args) {
9 | //是否是事务型数据
10 | boolean isTransaction;
11 | //测试数据文件地址
12 | String filePath = "data/msapriori/testInput.txt";
13 | //关系表型数据文件地址
14 | String tableFilePath = "data/msapriori/testInput2.txt";
15 | //最小支持度阈值
16 | double minSup;
17 | // 最小置信度率
18 | double minConf;
19 | //最大支持度差别阈值
20 | double delta;
21 | //多项目的最小支持度数,括号中的下标代表的是商品的ID
22 | double[] mis;
23 | //msApriori算法工具类
24 | MSAprioriCore tool;
25 |
26 | //为了测试的方便,取一个偏低的置信度值0.3
27 | minConf = 0.3;
28 | minSup = 0.1;
29 | delta = 0.5;
30 | //每项的支持度率都默认为0.1,第一项不使用
31 | mis = new double[] { -1, 0.1, 0.1, 0.1, 0.1, 0.1 };
32 | isTransaction = true;
33 |
34 | isTransaction = true;
35 | tool = new MSAprioriCore(filePath, minConf, delta, mis, isTransaction);
36 | tool.calFItems();
37 | System.out.println();
38 |
39 | isTransaction = false;
40 | //重新初始化数据
41 | tool = new MSAprioriCore(tableFilePath, minConf, minSup, isTransaction);
42 | tool.calFItems();
43 | }
44 |
45 | }
46 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/others/randomforest/DecisionTree.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.others.randomforest;
2 |
3 | import java.util.ArrayList;
4 | import java.util.HashMap;
5 | import java.util.Map;
6 |
7 | /**
8 | * 决策树
9 | */
10 | public class DecisionTree {
11 |
12 | // 树的根节点
13 | TreeNode rootNode;
14 | // 数据的属性列名称
15 | String[] featureNames;
16 | // 这棵树所包含的数据
17 | ArrayList datas;
18 | // 决策树构造的的工具类
19 | CARTCore tool;
20 |
21 | public DecisionTree(ArrayList datas) {
22 | this.datas = datas;
23 | this.featureNames = datas.get(0);
24 |
25 | tool = new CARTCore(datas);
26 | // 通过CART工具类进行决策树的构建,并返回树的根节点
27 | rootNode = tool.startBuildingTree();
28 | }
29 |
30 | /**
31 | * 根据给定的数据特征描述进行类别的判断
32 | *
33 | * @param features
34 | * @return
35 | */
36 | public String decideClassType(String features) {
37 | String classType = "";
38 | // 查询属性组
39 | String[] queryFeatures;
40 | // 在本决策树中对应的查询的属性值描述
41 | ArrayList featureStrs;
42 |
43 | featureStrs = new ArrayList<>();
44 | queryFeatures = features.split(",");
45 |
46 | String[] array;
47 | for (String name : featureNames) {
48 | for (String featureValue : queryFeatures) {
49 | array = featureValue.split("=");
50 | // 将对应的属性值加入到列表中
51 | if (array[0].equals(name)) {
52 | featureStrs.add(array);
53 | }
54 | }
55 | }
56 |
57 | // 开始从根据节点往下递归搜索
58 | classType = recusiveSearchClassType(rootNode, featureStrs);
59 |
60 | return classType;
61 | }
62 |
63 | /**
64 | * 递归搜索树,查询属性的分类类别
65 | *
66 | * @param node
67 | * 当前搜索到的节点
68 | * @param remainFeatures
69 | * 剩余未判断的属性
70 | * @return
71 | */
72 | private String recusiveSearchClassType(TreeNode node, ArrayList remainFeatures) {
73 | String classType = null;
74 |
75 | // 如果节点包含了数据的id索引,说明已经分类到底了
76 | if (node.getDataIndex() != null && node.getDataIndex().size() > 0) {
77 | classType = judgeClassType(node.getDataIndex());
78 |
79 | return classType;
80 | }
81 |
82 | // 取出剩余属性中的一个匹配属性作为当前的判断属性名称
83 | String[] currentFeature = null;
84 | for (String[] featureValue : remainFeatures) {
85 | if (node.getAttrName().equals(featureValue[0])) {
86 | currentFeature = featureValue;
87 | break;
88 | }
89 | }
90 |
91 | for (TreeNode childNode : node.getChildAttrNode()) {
92 | // 寻找子节点中属于此属性值的分支
93 | if (childNode.getParentAttrValue().equals(currentFeature[1])) {
94 | remainFeatures.remove(currentFeature);
95 | classType = recusiveSearchClassType(childNode, remainFeatures);
96 |
97 | // 如果找到了分类结果,则直接挑出循环
98 | break;
99 | } else {
100 | //进行第二种情况的判断加上!符号的情况
101 | String value = childNode.getParentAttrValue();
102 |
103 | if (value.charAt(0) == '!') {
104 | //去掉第一个!字符
105 | value = value.substring(1, value.length());
106 |
107 | if (!value.equals(currentFeature[1])) {
108 | remainFeatures.remove(currentFeature);
109 | classType = recusiveSearchClassType(childNode, remainFeatures);
110 |
111 | break;
112 | }
113 | }
114 | }
115 | }
116 |
117 | return classType;
118 | }
119 |
120 | /**
121 | * 根据得到的数据行分类进行类别的决策
122 | *
123 | * @param dataIndex
124 | * 根据分类的数据索引号
125 | * @return
126 | */
127 | public String judgeClassType(ArrayList dataIndex) {
128 | // 结果类型值
129 | String resultClassType = "";
130 | String classType = "";
131 | int count = 0;
132 | int temp = 0;
133 | Map type2Num = new HashMap();
134 |
135 | for (String index : dataIndex) {
136 | temp = Integer.parseInt(index);
137 | // 取最后一列的决策类别数据
138 | classType = datas.get(temp)[featureNames.length - 1];
139 |
140 | if (type2Num.containsKey(classType)) {
141 | // 如果类别已经存在,则使其计数加1
142 | count = type2Num.get(classType);
143 | count++;
144 | } else {
145 | count = 1;
146 | }
147 |
148 | type2Num.put(classType, count);
149 | }
150 |
151 | // 选出其中类别支持计数最多的一个类别值
152 | count = -1;
153 | for (Map.Entry entry : type2Num.entrySet()) {
154 | if ((int) entry.getValue() > count) {
155 | count = (int) entry.getValue();
156 | resultClassType = (String) entry.getKey();
157 | }
158 | }
159 |
160 | return resultClassType;
161 | }
162 | }
163 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/others/randomforest/RandomForestCore.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.others.randomforest;
2 |
3 | import java.io.BufferedReader;
4 | import java.io.File;
5 | import java.io.FileReader;
6 | import java.io.IOException;
7 | import java.util.ArrayList;
8 | import java.util.HashMap;
9 | import java.util.Map;
10 | import java.util.Random;
11 |
12 | /**
13 | * 随机森林算法工具类
14 | */
15 | public class RandomForestCore {
16 |
17 | // 测试数据文件地址
18 | private String filePath;
19 | // 决策树的样本占总数的占比率
20 | private double sampleNumRatio;
21 | // 样本数据的采集特征数量占总特征的比例
22 | private double featureNumRatio;
23 | // 决策树的采样样本数
24 | private int sampleNum;
25 | // 样本数据的采集采样特征数
26 | private int featureNum;
27 | // 随机森林中的决策树的数目,等于总的数据数/用于构造每棵树的数据的数量
28 | private int treeNum;
29 | // 随机数产生器
30 | private Random random;
31 | // 样本数据列属性名称行
32 | private String[] featureNames;
33 | // 原始的总的数据
34 | private ArrayList totalDatas;
35 | // 决策树森林
36 | private ArrayList decisionForest;
37 |
38 | public RandomForestCore(String filePath, double sampleNumRatio, double featureNumRatio) {
39 | this.filePath = filePath;
40 | this.sampleNumRatio = sampleNumRatio;
41 | this.featureNumRatio = featureNumRatio;
42 |
43 | readDataFile();
44 | }
45 |
46 | /**
47 | * 从文件中读取数据
48 | */
49 | private void readDataFile() {
50 | File file = new File(filePath);
51 | ArrayList dataArray = new ArrayList();
52 |
53 | try {
54 | BufferedReader in = new BufferedReader(new FileReader(file));
55 | String str;
56 | String[] tempArray;
57 | while ((str = in.readLine()) != null) {
58 | tempArray = str.split(" ");
59 | dataArray.add(tempArray);
60 | }
61 | in.close();
62 | } catch (IOException e) {
63 | e.getStackTrace();
64 | }
65 |
66 | totalDatas = dataArray;
67 | featureNames = totalDatas.get(0);
68 | sampleNum = (int) ((totalDatas.size() - 1) * sampleNumRatio);
69 | //算属性数量的时候需要去掉id属性和决策属性,用条件属性计算
70 | featureNum = (int) ((featureNames.length - 2) * featureNumRatio);
71 | // 算数量的时候需要去掉首行属性名称行
72 | treeNum = (totalDatas.size() - 1) / sampleNum;
73 | }
74 |
75 | /**
76 | * 产生决策树
77 | */
78 | private DecisionTree produceDecisionTree() {
79 | int temp = 0;
80 | DecisionTree tree;
81 | String[] tempData;
82 | //采样数据的随机行号组
83 | ArrayList sampleRandomNum;
84 | //采样属性特征的随机列号组
85 | ArrayList featureRandomNum;
86 | ArrayList datas;
87 |
88 | sampleRandomNum = new ArrayList<>();
89 | featureRandomNum = new ArrayList<>();
90 | datas = new ArrayList<>();
91 |
92 | for (int i = 0; i < sampleNum;) {
93 | temp = random.nextInt(totalDatas.size());
94 |
95 | //如果是行首属性名称行,则跳过
96 | if (temp == 0) {
97 | continue;
98 | }
99 |
100 | if (!sampleRandomNum.contains(temp)) {
101 | sampleRandomNum.add(temp);
102 | i++;
103 | }
104 | }
105 |
106 | for (int i = 0; i < featureNum;) {
107 | temp = random.nextInt(featureNames.length);
108 |
109 | //如果是第一列的数据id号或者是决策属性列,则跳过
110 | if (temp == 0 || temp == featureNames.length - 1) {
111 | continue;
112 | }
113 |
114 | if (!featureRandomNum.contains(temp)) {
115 | featureRandomNum.add(temp);
116 | i++;
117 | }
118 | }
119 |
120 | String[] singleRecord;
121 | String[] headCulumn = null;
122 | // 获取随机数据行
123 | for (int dataIndex : sampleRandomNum) {
124 | singleRecord = totalDatas.get(dataIndex);
125 |
126 | //每行的列数=所选的特征数+id号
127 | tempData = new String[featureNum + 2];
128 | headCulumn = new String[featureNum + 2];
129 |
130 | for (int i = 0, k = 1; i < featureRandomNum.size(); i++, k++) {
131 | temp = featureRandomNum.get(i);
132 |
133 | headCulumn[k] = featureNames[temp];
134 | tempData[k] = singleRecord[temp];
135 | }
136 |
137 | //加上id列的信息
138 | headCulumn[0] = featureNames[0];
139 | //加上决策分类列的信息
140 | headCulumn[featureNum + 1] = featureNames[featureNames.length - 1];
141 | tempData[featureNum + 1] = singleRecord[featureNames.length - 1];
142 |
143 | //加入此行数据
144 | datas.add(tempData);
145 | }
146 |
147 | //加入行首列出现名称
148 | datas.add(0, headCulumn);
149 | //对筛选出的数据重新做id分配
150 | temp = 0;
151 | for (String[] array : datas) {
152 | //从第2行开始赋值
153 | if (temp > 0) {
154 | array[0] = temp + "";
155 | }
156 |
157 | temp++;
158 | }
159 |
160 | tree = new DecisionTree(datas);
161 |
162 | return tree;
163 | }
164 |
165 | /**
166 | * 构造随机森林
167 | */
168 | public void constructRandomTree() {
169 | DecisionTree tree;
170 | random = new Random();
171 | decisionForest = new ArrayList<>();
172 |
173 | System.out.println("下面是随机森林中的决策树:");
174 | // 构造决策树加入森林中
175 | for (int i = 0; i < treeNum; i++) {
176 | System.out.println("\n决策树" + (i + 1));
177 | tree = produceDecisionTree();
178 | decisionForest.add(tree);
179 | }
180 | }
181 |
182 | /**
183 | * 根据给定的属性条件进行类别的决策
184 | *
185 | * @param features
186 | * 给定的已知的属性描述
187 | * @return
188 | */
189 | public String judgeClassType(String features) {
190 | // 结果类型值
191 | String resultClassType = "";
192 | String classType = "";
193 | int count = 0;
194 | Map type2Num = new HashMap();
195 |
196 | for (DecisionTree tree : decisionForest) {
197 | classType = tree.decideClassType(features);
198 | if (type2Num.containsKey(classType)) {
199 | // 如果类别已经存在,则使其计数加1
200 | count = type2Num.get(classType);
201 | count++;
202 | } else {
203 | count = 1;
204 | }
205 |
206 | type2Num.put(classType, count);
207 | }
208 |
209 | // 选出其中类别支持计数最多的一个类别值
210 | count = -1;
211 | for (Map.Entry entry : type2Num.entrySet()) {
212 | if ((int) entry.getValue() > count) {
213 | count = (int) entry.getValue();
214 | resultClassType = (String) entry.getKey();
215 | }
216 | }
217 |
218 | return resultClassType;
219 | }
220 | }
221 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/others/randomforest/RandomForestExample.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.others.randomforest;
2 |
3 | import java.text.MessageFormat;
4 |
5 | /**
6 | * 随机森林算法测试场景
7 | */
8 | public class RandomForestExample {
9 |
10 | public static void main(String[] args) {
11 | String filePath = "data/randomforest/input.txt";
12 | String queryStr = "Age=Youth,Income=Low,Student=No,CreditRating=Fair";
13 | String resultClassType = "";
14 | // 决策树的样本占总数的占比率
15 | double sampleNumRatio = 0.4;
16 | // 样本数据的采集特征数量占总特征的比例
17 | double featureNumRatio = 0.5;
18 |
19 | RandomForestCore tool = new RandomForestCore(filePath, sampleNumRatio, featureNumRatio);
20 | tool.constructRandomTree();
21 |
22 | resultClassType = tool.judgeClassType(queryStr);
23 |
24 | System.out.println();
25 | System.out.println(MessageFormat.format("查询属性描述{0},预测的分类结果为BuysCompute:{1}", queryStr, resultClassType));
26 | }
27 |
28 | }
29 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/others/randomforest/TreeNode.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.others.randomforest;
2 |
3 | import java.util.ArrayList;
4 |
/**
 * Node of a classification/regression tree.
 */
public class TreeNode {

	// Name of the attribute this node splits on
	private String attrName;
	// The attribute value of the parent split that leads to this node
	private String parentAttrValue;
	// Index number of this node
	private int nodeIndex;
	// Error rate of this node
	private double alpha;
	// Number of leaves under this node
	private int leafNum;
	// Child nodes
	private TreeNode[] childAttrNode;
	// Indices of the data records held at this node
	private ArrayList dataIndex;

	public String getAttrName() {
		return attrName;
	}

	public void setAttrName(String attrName) {
		this.attrName = attrName;
	}

	public String getParentAttrValue() {
		return parentAttrValue;
	}

	public void setParentAttrValue(String parentAttrValue) {
		this.parentAttrValue = parentAttrValue;
	}

	public int getNodeIndex() {
		return nodeIndex;
	}

	public void setNodeIndex(int nodeIndex) {
		this.nodeIndex = nodeIndex;
	}

	public double getAlpha() {
		return alpha;
	}

	public void setAlpha(double alpha) {
		this.alpha = alpha;
	}

	public int getLeafNum() {
		return leafNum;
	}

	public void setLeafNum(int leafNum) {
		this.leafNum = leafNum;
	}

	public TreeNode[] getChildAttrNode() {
		return childAttrNode;
	}

	public void setChildAttrNode(TreeNode[] childAttrNode) {
		this.childAttrNode = childAttrNode;
	}

	public ArrayList getDataIndex() {
		return dataIndex;
	}

	public void setDataIndex(ArrayList dataIndex) {
		this.dataIndex = dataIndex;
	}

}
82 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/others/tan/AttrMutualInfo.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.others.tan;
2 |
3 | /**
4 | * 属性之间的互信息值,表示属性之间的关联性大小
5 | */
6 | public class AttrMutualInfo implements Comparable {
7 |
8 | //互信息值
9 | Double value;
10 | //关联属性值对
11 | Node[] nodeArray;
12 |
13 | public AttrMutualInfo(double value, Node node1, Node node2) {
14 | this.value = value;
15 |
16 | this.nodeArray = new Node[2];
17 | this.nodeArray[0] = node1;
18 | this.nodeArray[1] = node2;
19 | }
20 |
21 | @Override
22 | public int compareTo(AttrMutualInfo o) {
23 | return o.value.compareTo(this.value);
24 | }
25 |
26 | }
27 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/others/tan/Node.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.others.tan;
2 |
3 | import java.util.ArrayList;
4 |
5 | /**
6 | * 贝叶斯网络节点类
7 | */
8 | public class Node {
9 |
10 | //节点唯一id,方便后面节点连接方向的确定
11 | int id;
12 | // 节点的属性名称
13 | String name;
14 | // 该节点所连续的节点
15 | ArrayList connectedNodes;
16 |
17 | public Node(int id, String name) {
18 | this.id = id;
19 | this.name = name;
20 |
21 | // 初始化变量
22 | this.connectedNodes = new ArrayList<>();
23 | }
24 |
25 | /**
26 | * 将自身节点连接到目标给定的节点
27 | *
28 | * @param node
29 | * 下游节点
30 | */
31 | public void connectNode(Node node) {
32 | //避免连接自身
33 | if (this.id == node.id) {
34 | return;
35 | }
36 |
37 | // 将节点加入自身节点的节点列表中
38 | this.connectedNodes.add(node);
39 | // 将自身节点加入到目标节点的列表中
40 | node.connectedNodes.add(this);
41 | }
42 |
43 | /**
44 | * 判断与目标节点是否相同,主要比较名称是否相同即可
45 | *
46 | * @param node
47 | * 目标结点
48 | * @return
49 | */
50 | public boolean isEqual(Node node) {
51 | boolean isEqual;
52 |
53 | isEqual = false;
54 | // 节点名称相同则视为相等
55 | if (this.id == node.id) {
56 | isEqual = true;
57 | }
58 |
59 | return isEqual;
60 | }
61 | }
62 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/others/tan/TanExample.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.others.tan;
2 |
3 | /**
4 | * TAN树型朴素贝叶斯算法
5 | */
6 | public class TanExample {
7 |
8 | public static void main(String[] args) {
9 | String filePath = "data/tan/input.txt";
10 | // 条件查询语句
11 | String queryStr;
12 | // 分类结果概率1
13 | double classResult1;
14 | // 分类结果概率2
15 | double classResult2;
16 |
17 | TANCore tool = new TANCore(filePath);
18 | queryStr = "OutLook=Sunny,Temperature=Hot,Humidity=High,Wind=Weak,PlayTennis=No";
19 | classResult1 = tool.calHappenedPro(queryStr);
20 |
21 | queryStr = "OutLook=Sunny,Temperature=Hot,Humidity=High,Wind=Weak,PlayTennis=Yes";
22 | classResult2 = tool.calHappenedPro(queryStr);
23 |
24 | System.out.println(String.format("类别为%s所求得的概率为%s", "PlayTennis=No", classResult1));
25 | System.out.println(String.format("类别为%s所求得的概率为%s", "PlayTennis=Yes", classResult2));
26 | if (classResult1 > classResult2) {
27 | System.out.println("分类类别为PlayTennis=No");
28 | } else {
29 | System.out.println("分类类别为PlayTennis=Yes");
30 | }
31 | }
32 |
33 | }
34 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/others/viterbi/BaseNames.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.others.viterbi;
2 |
/**
 * Shared constant definitions for the Viterbi example.
 */
public class BaseNames {

	// Day indices
	public static final int DAY1 = 0;
	public static final int DAY2 = 1;
	public static final int DAY3 = 2;

	// Weather (hidden state) categories
	public static final int WEATHER_SUNNY = 0;
	public static final int WEATHER_CLOUDY = 1;
	public static final int WEATHER_RAINY = 2;

	// Humidity (observation) categories.
	// Fixed: DAMP and SOGGY previously duplicated DRYISH's value (1), making
	// three of the four categories indistinguishable; distinct categories
	// need distinct codes.
	public static final int HUMIDITY_DRY = 0;
	public static final int HUMIDITY_DRYISH = 1;
	public static final int HUMIDITY_DAMP = 2;
	public static final int HUMIDITY_SOGGY = 3;

}
25 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/others/viterbi/ViterbiCore.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.others.viterbi;
2 |
3 | import java.io.BufferedReader;
4 | import java.io.File;
5 | import java.io.FileReader;
6 | import java.io.IOException;
7 | import java.util.ArrayList;
8 | import java.util.HashMap;
9 |
10 | /**
11 | * 维特比算法工具类
12 | */
13 | public class ViterbiCore {
14 |
15 | // 状态转移概率矩阵文件地址
16 | private String stmFilePath;
17 | // 混淆矩阵文件地址
18 | private String confusionFilePath;
19 | // 初始状态概率
20 | private double[] initStatePro;
21 | // 观察到的状态序列
22 | public String[] observeStates;
23 | // 状态转移矩阵值
24 | private double[][] stMatrix;
25 | // 混淆矩阵值
26 | private double[][] confusionMatrix;
27 | // 各个条件下的潜在特征概率值
28 | private double[][] potentialValues;
29 | // 潜在特征
30 | private ArrayList potentialAttrs;
31 | // 属性值列坐标映射图
32 | private HashMap name2Index;
33 | // 列坐标属性值映射图
34 | private HashMap index2name;
35 |
36 | public ViterbiCore(String stmFilePath, String confusionFilePath, double[] initStatePro, String[] observeStates) {
37 | this.stmFilePath = stmFilePath;
38 | this.confusionFilePath = confusionFilePath;
39 | this.initStatePro = initStatePro;
40 | this.observeStates = observeStates;
41 |
42 | initOperation();
43 | }
44 |
45 | /**
46 | * 初始化数据操作
47 | */
48 | private void initOperation() {
49 | double[] temp;
50 | int index;
51 | ArrayList smtDatas;
52 | ArrayList cfDatas;
53 |
54 | smtDatas = readDataFile(stmFilePath);
55 | cfDatas = readDataFile(confusionFilePath);
56 |
57 | index = 0;
58 | this.stMatrix = new double[smtDatas.size()][];
59 | for (String[] array : smtDatas) {
60 | temp = new double[array.length];
61 | for (int i = 0; i < array.length; i++) {
62 | try {
63 | temp[i] = Double.parseDouble(array[i]);
64 | } catch (NumberFormatException e) {
65 | temp[i] = -1;
66 | }
67 | }
68 |
69 | // 将转换后的值赋给数组中
70 | this.stMatrix[index] = temp;
71 | index++;
72 | }
73 |
74 | index = 0;
75 | this.confusionMatrix = new double[cfDatas.size()][];
76 | for (String[] array : cfDatas) {
77 | temp = new double[array.length];
78 | for (int i = 0; i < array.length; i++) {
79 | try {
80 | temp[i] = Double.parseDouble(array[i]);
81 | } catch (NumberFormatException e) {
82 | temp[i] = -1;
83 | }
84 | }
85 |
86 | // 将转换后的值赋给数组中
87 | this.confusionMatrix[index] = temp;
88 | index++;
89 | }
90 |
91 | this.potentialAttrs = new ArrayList<>();
92 | // 添加潜在特征属性
93 | for (String s : smtDatas.get(0)) {
94 | this.potentialAttrs.add(s);
95 | }
96 | // 去除首列无效列
97 | potentialAttrs.remove(0);
98 |
99 | this.name2Index = new HashMap<>();
100 | this.index2name = new HashMap<>();
101 |
102 | // 添加名称下标映射关系
103 | for (int i = 1; i < smtDatas.get(0).length; i++) {
104 | this.name2Index.put(smtDatas.get(0)[i], i);
105 | // 添加下标到名称的映射
106 | this.index2name.put(i, smtDatas.get(0)[i]);
107 | }
108 |
109 | for (int i = 1; i < cfDatas.get(0).length; i++) {
110 | this.name2Index.put(cfDatas.get(0)[i], i);
111 | }
112 | }
113 |
114 | /**
115 | * 从文件中读取数据
116 | */
117 | private ArrayList readDataFile(String filePath) {
118 | File file = new File(filePath);
119 | ArrayList dataArray = new ArrayList();
120 |
121 | try {
122 | BufferedReader in = new BufferedReader(new FileReader(file));
123 | String str;
124 | String[] tempArray;
125 | while ((str = in.readLine()) != null) {
126 | tempArray = str.split(" ");
127 | dataArray.add(tempArray);
128 | }
129 | in.close();
130 | } catch (IOException e) {
131 | e.getStackTrace();
132 | }
133 |
134 | return dataArray;
135 | }
136 |
137 | /**
138 | * 根据观察特征计算隐藏的特征概率矩阵
139 | */
140 | private void calPotencialProMatrix() {
141 | String curObserveState;
142 | // 观察特征和潜在特征的下标
143 | int osIndex;
144 | int psIndex;
145 | double temp;
146 | double maxPro;
147 | // 混淆矩阵概率值,就是相关影响的因素概率
148 | double confusionPro;
149 |
150 | this.potentialValues = new double[observeStates.length][potentialAttrs.size() + 1];
151 | for (int i = 0; i < this.observeStates.length; i++) {
152 | curObserveState = this.observeStates[i];
153 | osIndex = this.name2Index.get(curObserveState);
154 | maxPro = -1;
155 |
156 | // 因为是第一个观察特征,没有前面的影响,根据初始状态计算
157 | if (i == 0) {
158 | for (String attr : this.potentialAttrs) {
159 | psIndex = this.name2Index.get(attr);
160 | confusionPro = this.confusionMatrix[psIndex][osIndex];
161 |
162 | temp = this.initStatePro[psIndex - 1] * confusionPro;
163 | this.potentialValues[BaseNames.DAY1][psIndex] = temp;
164 | }
165 | } else {
166 | // 后面的潜在特征受前一个特征的影响,以及当前的混淆因素影响
167 | for (String toDayAttr : this.potentialAttrs) {
168 | psIndex = this.name2Index.get(toDayAttr);
169 | confusionPro = this.confusionMatrix[psIndex][osIndex];
170 |
171 | int index;
172 | maxPro = -1;
173 | // 通过昨天的概率计算今天此特征的最大概率
174 | for (String yAttr : this.potentialAttrs) {
175 | index = this.name2Index.get(yAttr);
176 | temp = this.potentialValues[i - 1][index] * this.stMatrix[index][psIndex];
177 |
178 | // 计算得到今天此潜在特征的最大概率
179 | if (temp > maxPro) {
180 | maxPro = temp;
181 | }
182 | }
183 |
184 | this.potentialValues[i][psIndex] = maxPro * confusionPro;
185 | }
186 | }
187 | }
188 | }
189 |
190 | /**
191 | * 根据同时期最大概率值输出潜在特征值
192 | */
193 | private void outputResultAttr() {
194 | double maxPro;
195 | int maxIndex;
196 | ArrayList psValues;
197 |
198 | psValues = new ArrayList<>();
199 | for (int i = 0; i < this.potentialValues.length; i++) {
200 | maxPro = -1;
201 | maxIndex = 0;
202 |
203 | for (int j = 0; j < potentialValues[i].length; j++) {
204 | if (this.potentialValues[i][j] > maxPro) {
205 | maxPro = potentialValues[i][j];
206 | maxIndex = j;
207 | }
208 | }
209 |
210 | // 取出最大概率下标对应的潜在特征
211 | psValues.add(this.index2name.get(maxIndex));
212 | }
213 |
214 | System.out.println("观察特征为:");
215 | for (String s : this.observeStates) {
216 | System.out.print(s + ", ");
217 | }
218 | System.out.println();
219 |
220 | System.out.println("潜在特征为:");
221 | for (String s : psValues) {
222 | System.out.print(s + ", ");
223 | }
224 | System.out.println();
225 | }
226 |
227 | /**
228 | * 根据观察属性,得到潜在属性信息
229 | */
230 | public void calHMMObserve() {
231 | calPotencialProMatrix();
232 | outputResultAttr();
233 | }
234 | }
235 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/others/viterbi/ViterbiExample.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.others.viterbi;
2 |
3 | /**
4 | * 维特比算法
5 | */
6 | public class ViterbiExample {
7 |
8 | public static void main(String[] args) {
9 | // 状态转移概率矩阵路径
10 | String stmFilePath;
11 | // 混淆矩阵路径
12 | String cfFilePath;
13 | // 观察到的状态
14 | String[] observeStates;
15 | // 初始状态
16 | double[] initStatePro;
17 | ViterbiCore tool;
18 |
19 | stmFilePath = "data/viterbi/stmatrix.txt";
20 | cfFilePath = "data/viterbi/humidity-matrix.txt";
21 |
22 | initStatePro = new double[] { 0.63, 0.17, 0.20 };
23 | observeStates = new String[] { "Dry", "Damp", "Soggy" };
24 |
25 | tool = new ViterbiCore(stmFilePath, cfFilePath, initStatePro, observeStates);
26 | tool.calHMMObserve();
27 | }
28 |
29 | }
30 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/roughsets/KnowledgeSystem.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.roughsets;
2 |
3 | import java.util.ArrayList;
4 |
5 | /**
6 | * 知识系统
7 | */
8 | public class KnowledgeSystem {
9 |
10 | // 知识系统内的集合
11 | ArrayList ksCollections;
12 |
13 | public KnowledgeSystem(ArrayList ksCollections) {
14 | this.ksCollections = ksCollections;
15 | }
16 |
17 | /**
18 | * 获取集合的上近似集合
19 | *
20 | * @param rc
21 | * 原始集合
22 | * @return
23 | */
24 | public RecordCollection getUpSimilarRC(RecordCollection rc) {
25 | RecordCollection resultRc = null;
26 | ArrayList nameArray;
27 | ArrayList targetArray;
28 | ArrayList copyRcs = new ArrayList<>();
29 | ArrayList deleteRcs = new ArrayList<>();
30 | targetArray = rc.getRecordNames();
31 |
32 | // 做一个集合拷贝
33 | for (RecordCollection recordCollection : ksCollections) {
34 | copyRcs.add(recordCollection);
35 | }
36 |
37 | for (RecordCollection recordCollection : copyRcs) {
38 | nameArray = recordCollection.getRecordNames();
39 |
40 | if (strIsContained(targetArray, nameArray)) {
41 | removeOverLaped(targetArray, nameArray);
42 | deleteRcs.add(recordCollection);
43 |
44 | if (resultRc == null) {
45 | resultRc = recordCollection;
46 | } else {
47 | // 进行并运算
48 | resultRc = resultRc.unionCal(recordCollection);
49 | }
50 |
51 | if (targetArray.size() == 0) {
52 | break;
53 | }
54 | }
55 | }
56 | //去除已经添加过的集合
57 | copyRcs.removeAll(deleteRcs);
58 |
59 | if (targetArray.size() > 0) {
60 | // 说明已经完全还未找全上近似的集合
61 | for (RecordCollection recordCollection : copyRcs) {
62 | nameArray = recordCollection.getRecordNames();
63 |
64 | if (strHasOverlap(targetArray, nameArray)) {
65 | removeOverLaped(targetArray, nameArray);
66 |
67 | if (resultRc == null) {
68 | resultRc = recordCollection;
69 | } else {
70 | // 进行并运算
71 | resultRc = resultRc.unionCal(recordCollection);
72 | }
73 |
74 | if (targetArray.size() == 0) {
75 | break;
76 | }
77 | }
78 | }
79 | }
80 |
81 | return resultRc;
82 | }
83 |
84 | /**
85 | * 获取集合的下近似集合
86 | *
87 | * @param rc
88 | * 原始集合
89 | * @return
90 | */
91 | public RecordCollection getDownSimilarRC(RecordCollection rc) {
92 | RecordCollection resultRc = null;
93 | ArrayList nameArray;
94 | ArrayList targetArray;
95 | targetArray = rc.getRecordNames();
96 |
97 | for (RecordCollection recordCollection : ksCollections) {
98 | nameArray = recordCollection.getRecordNames();
99 |
100 | if (strIsContained(targetArray, nameArray)) {
101 | removeOverLaped(targetArray, nameArray);
102 |
103 | if (resultRc == null) {
104 | resultRc = recordCollection;
105 | } else {
106 | // 进行并运算
107 | resultRc = resultRc.unionCal(recordCollection);
108 | }
109 |
110 | if (targetArray.size() == 0) {
111 | break;
112 | }
113 | }
114 | }
115 |
116 | return resultRc;
117 | }
118 |
119 | /**
120 | * 判断2个字符数组之间是否有交集
121 | *
122 | * @param str1
123 | * 字符列表1
124 | * @param str2
125 | * 字符列表2
126 | * @return
127 | */
128 | public boolean strHasOverlap(ArrayList str1, ArrayList str2) {
129 | boolean hasOverlap = false;
130 |
131 | for (String s1 : str1) {
132 | for (String s2 : str2) {
133 | if (s1.equals(s2)) {
134 | hasOverlap = true;
135 | break;
136 | }
137 | }
138 |
139 | if (hasOverlap) {
140 | break;
141 | }
142 | }
143 |
144 | return hasOverlap;
145 | }
146 |
147 | /**
148 | * 判断字符集str2是否完全包含于str1中
149 | *
150 | * @param str1
151 | * @param str2
152 | * @return
153 | */
154 | public boolean strIsContained(ArrayList str1, ArrayList str2) {
155 | boolean isContained = false;
156 | int count = 0;
157 |
158 | for (String s : str2) {
159 | if (str1.contains(s)) {
160 | count++;
161 | }
162 | }
163 |
164 | if (count == str2.size()) {
165 | isContained = true;
166 | }
167 |
168 | return isContained;
169 | }
170 |
171 | /**
172 | * 字符列表移除公共元素
173 | *
174 | * @param str1
175 | * @param str2
176 | */
177 | public void removeOverLaped(ArrayList str1, ArrayList str2) {
178 | ArrayList deleteStrs = new ArrayList<>();
179 |
180 | for (String s1 : str1) {
181 | for (String s2 : str2) {
182 | if (s1.equals(s2)) {
183 | deleteStrs.add(s1);
184 | break;
185 | }
186 | }
187 | }
188 |
189 | // 进行公共元素的移除
190 | str1.removeAll(deleteStrs);
191 | }
192 | }
193 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/roughsets/Record.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.roughsets;
2 |
3 | import java.text.MessageFormat;
4 | import java.util.ArrayList;
5 | import java.util.HashMap;
6 | import java.util.Map;
7 |
8 | /**
9 | * 数据记录,包含这条记录所有属性
10 | */
11 | public class Record {
12 |
13 | // 记录名称
14 | private String name;
15 | // 记录属性键值对
16 | private HashMap attrValues;
17 |
18 | public Record(String name, HashMap attrValues) {
19 | this.name = name;
20 | this.attrValues = attrValues;
21 | }
22 |
23 | public String getName() {
24 | return this.name;
25 | }
26 |
27 | /**
28 | * 此数据是否包含此属性值
29 | *
30 | * @param attr
31 | * 待判断属性值
32 | * @return
33 | */
34 | public boolean isContainedAttr(String attr) {
35 | boolean isContained = false;
36 |
37 | if (attrValues.containsValue(attr)) {
38 | isContained = true;
39 | }
40 |
41 | return isContained;
42 | }
43 |
44 | /**
45 | * 判断数据记录是否是同一条记录,根据数据名称来判断
46 | *
47 | * @param record
48 | * 目标比较对象
49 | * @return
50 | */
51 | public boolean isRecordSame(Record record) {
52 | boolean isSame = false;
53 |
54 | if (this.name.equals(record.name)) {
55 | isSame = true;
56 | }
57 |
58 | return isSame;
59 | }
60 |
61 | /**
62 | * 数据的决策属性分类
63 | *
64 | * @return
65 | */
66 | public String getRecordDecisionClass() {
67 | String value = null;
68 |
69 | value = attrValues.get(RoughSetsCore.DECISION_ATTR_NAME);
70 |
71 | return value;
72 | }
73 |
74 | /**
75 | * 根据约简属性输出决策规则
76 | *
77 | * @param reductAttr
78 | * 约简属性集合
79 | */
80 | public String getDecisionRule(ArrayList reductAttr) {
81 | String ruleStr = "";
82 | String attrName = null;
83 | String value = null;
84 | String decisionValue;
85 |
86 | decisionValue = attrValues.get(RoughSetsCore.DECISION_ATTR_NAME);
87 | ruleStr += "属性";
88 | for (Map.Entry entry : this.attrValues.entrySet()) {
89 | attrName = (String) entry.getKey();
90 | value = (String) entry.getValue();
91 |
92 | if (attrName.equals(RoughSetsCore.DECISION_ATTR_NAME) || reductAttr.contains(attrName)
93 | || value.equals(name)) {
94 | continue;
95 | }
96 |
97 | ruleStr += MessageFormat.format("{0}={1},", attrName, value);
98 | }
99 | ruleStr += "他的分类为" + decisionValue;
100 |
101 | return ruleStr;
102 | }
103 | }
104 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/roughsets/RecordCollection.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.roughsets;
2 |
3 | import java.util.ArrayList;
4 | import java.util.HashMap;
5 | import java.util.Map;
6 |
7 | /**
8 | * 数据记录集合,包含一些共同的属性
9 | */
10 | public class RecordCollection {
11 |
12 | // 集合包含的属性
13 | private HashMap attrValues;
14 | // 数据记录列表
15 | private ArrayList recordList;
16 |
/** Creates an empty collection with no shared attributes or records. */
public RecordCollection() {
	this.attrValues = new HashMap<>();
	this.recordList = new ArrayList<>();
}
21 |
/**
 * Creates a collection over the given shared attributes and records.
 *
 * @param attrValues attribute name/value pairs shared by the records
 * @param recordList the member records
 */
public RecordCollection(HashMap attrValues, ArrayList recordList) {
	this.attrValues = attrValues;
	this.recordList = recordList;
}
26 |
27 | public ArrayList getRecord() {
28 | return this.recordList;
29 | }
30 |
31 | /**
32 | * 返回集合的字符名称数组
33 | *
34 | * @return
35 | */
36 | public ArrayList getRecordNames() {
37 | ArrayList names = new ArrayList<>();
38 |
39 | for (int i = 0; i < recordList.size(); i++) {
40 | names.add(recordList.get(i).getName());
41 | }
42 |
43 | return names;
44 | }
45 |
46 | /**
47 | * 判断集合是否包含此属性名称对应的属性值
48 | *
49 | * @param attrName
50 | * 属性名
51 | * @return
52 | */
53 | public boolean isContainedAttrName(String attrName) {
54 | boolean isContained = false;
55 |
56 | if (this.attrValues.containsKey(attrName)) {
57 | isContained = true;
58 | }
59 |
60 | return isContained;
61 | }
62 |
63 | /**
64 | * 判断2个集合是否相等,比较包含的数据记录是否完全一致
65 | *
66 | * @param rc
67 | * 待比较集合
68 | * @return
69 | */
70 | public boolean isCollectionSame(RecordCollection rc) {
71 | boolean isSame = false;
72 |
73 | for (Record r : recordList) {
74 | isSame = false;
75 |
76 | for (Record r2 : rc.recordList) {
77 | if (r.isRecordSame(r2)) {
78 | isSame = true;
79 | break;
80 | }
81 | }
82 |
83 | // 如果有1个记录不包含,就算集合不相等
84 | if (!isSame) {
85 | break;
86 | }
87 | }
88 |
89 | return isSame;
90 | }
91 |
92 | /**
93 | * 集合之间的交运算
94 | *
95 | * @param rc
96 | * 交运算的参与运算的另外一集合
97 | * @return
98 | */
99 | public RecordCollection overlapCalculate(RecordCollection rc) {
100 | String key;
101 | String value;
102 | RecordCollection resultCollection = null;
103 | HashMap resultAttrValues = new HashMap<>();
104 | ArrayList resultRecords = new ArrayList<>();
105 |
106 | // 进行集合的交运算,有相同的记录的则进行添加
107 | for (Record record : this.recordList) {
108 | for (Record record2 : rc.recordList) {
109 | if (record.isRecordSame(record2)) {
110 | resultRecords.add(record);
111 | break;
112 | }
113 | }
114 | }
115 |
116 | // 如果没有交集,则直接返回
117 | if (resultRecords.size() == 0) {
118 | return null;
119 | }
120 |
121 | // 将2个集合的属性进行合并
122 | for (Map.Entry entry : this.attrValues.entrySet()) {
123 | key = (String) entry.getKey();
124 | value = (String) entry.getValue();
125 |
126 | resultAttrValues.put(key, value);
127 | }
128 |
129 | for (Map.Entry entry : rc.attrValues.entrySet()) {
130 | key = (String) entry.getKey();
131 | value = (String) entry.getValue();
132 |
133 | resultAttrValues.put(key, value);
134 | }
135 |
136 | resultCollection = new RecordCollection(resultAttrValues, resultRecords);
137 | return resultCollection;
138 | }
139 |
140 | /**
141 | * 求集合的并集,各自保留各自的属性
142 | *
143 | * @param rc
144 | * 待合并的集合
145 | * @return
146 | */
147 | public RecordCollection unionCal(RecordCollection rc) {
148 | RecordCollection resultRc = null;
149 | ArrayList records = new ArrayList<>();
150 |
151 | for (Record r1 : this.recordList) {
152 | records.add(r1);
153 | }
154 |
155 | for (Record r2 : rc.recordList) {
156 | records.add(r2);
157 | }
158 |
159 | resultRc = new RecordCollection(null, records);
160 | return resultRc;
161 | }
162 |
163 | /**
164 | * 输出集合中包含的元素
165 | */
166 | public void printRc() {
167 | System.out.print("{");
168 | for (Record r : this.getRecord()) {
169 | System.out.print(r.getName() + ", ");
170 | }
171 | System.out.println("}");
172 | }
173 | }
174 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/roughsets/RoughSetsExample.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.roughsets;
2 |
3 | /**
4 | * 粗糙集约简算法
5 | */
6 | public class RoughSetsExample {
7 |
8 | public static void main(String[] args) {
9 | String filePath = "data/roughsets/input.txt";
10 |
11 | RoughSetsCore tool = new RoughSetsCore(filePath);
12 | tool.findingReduct();
13 | }
14 |
15 | }
16 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/sequential/patterns/gsp/GSPExample.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.sequential.patterns.gsp;
2 |
3 | /**
4 | * GSP序列模式分析算法
5 | */
6 | public class GSPExample {
7 |
8 | public static void main(String[] args) {
9 | String filePath = "data/gsp/testInput.txt";
10 | //最小支持度阈值
11 | int minSupportCount = 2;
12 | //时间最小间隔
13 | int min_gap = 1;
14 | //施加最大间隔
15 | int max_gap = 5;
16 |
17 | GSPCore tool = new GSPCore(filePath, minSupportCount, min_gap, max_gap);
18 | tool.gspCalculate();
19 | }
20 |
21 | }
22 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/sequential/patterns/gsp/ItemSet.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.sequential.patterns.gsp;
2 |
3 | import java.util.ArrayList;
4 |
5 | /**
6 | * 序列中的子项集
7 | */
8 | public class ItemSet {
9 |
10 | /**
11 | * 项集中保存的是数字项数组
12 | */
13 | private ArrayList items;
14 |
15 | public ItemSet(String[] itemStr) {
16 | items = new ArrayList<>();
17 | for (String s : itemStr) {
18 | items.add(Integer.parseInt(s));
19 | }
20 | }
21 |
22 | public ItemSet(int[] itemNum) {
23 | items = new ArrayList<>();
24 | for (int num : itemNum) {
25 | items.add(num);
26 | }
27 | }
28 |
29 | public ItemSet(ArrayList itemNum) {
30 | this.items = itemNum;
31 | }
32 |
33 | public ArrayList getItems() {
34 | return items;
35 | }
36 |
37 | public void setItems(ArrayList items) {
38 | this.items = items;
39 | }
40 |
41 | /**
42 | * 判断2个项集是否相等
43 | *
44 | * @param itemSet
45 | * 比较对象
46 | * @return
47 | */
48 | public boolean compareIsSame(ItemSet itemSet) {
49 | boolean result = true;
50 |
51 | if (this.items.size() != itemSet.items.size()) {
52 | return false;
53 | }
54 |
55 | for (int i = 0; i < itemSet.items.size(); i++) {
56 | if (this.items.get(i) != itemSet.items.get(i)) {
57 | // 只要有值不相等,直接算作不相等
58 | result = false;
59 | break;
60 | }
61 | }
62 |
63 | return result;
64 | }
65 |
66 | /**
67 | * 拷贝项集中同样的数据一份
68 | *
69 | * @return
70 | */
71 | public ArrayList copyItems() {
72 | ArrayList copyItems = new ArrayList<>();
73 |
74 | for (int num : this.items) {
75 | copyItems.add(num);
76 | }
77 |
78 | return copyItems;
79 | }
80 | }
81 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/sequential/patterns/gsp/Sequence.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.sequential.patterns.gsp;
2 |
3 | import java.util.ArrayList;
4 |
5 | /**
6 | * 序列,每个序列内部包含多组ItemSet项集
7 | */
8 | public class Sequence implements Comparable, Cloneable {
9 |
10 | // 序列所属事务ID
11 | private int trsanctionID;
12 | // 项集列表
13 | private ArrayList itemSetList;
14 |
15 | public Sequence(int trsanctionID) {
16 | this.trsanctionID = trsanctionID;
17 | this.itemSetList = new ArrayList<>();
18 | }
19 |
20 | public Sequence() {
21 | this.itemSetList = new ArrayList<>();
22 | }
23 |
24 | public int getTrsanctionID() {
25 | return trsanctionID;
26 | }
27 |
28 | public void setTrsanctionID(int trsanctionID) {
29 | this.trsanctionID = trsanctionID;
30 | }
31 |
32 | public ArrayList getItemSetList() {
33 | return itemSetList;
34 | }
35 |
36 | public void setItemSetList(ArrayList itemSetList) {
37 | this.itemSetList = itemSetList;
38 | }
39 |
40 | /**
41 | * 取出序列中第一个项集的第一个元素
42 | *
43 | * @return
44 | */
45 | public Integer getFirstItemSetNum() {
46 | return this.getItemSetList().get(0).getItems().get(0);
47 | }
48 |
49 | /**
50 | * 获取序列中最后一个项集
51 | *
52 | * @return
53 | */
54 | public ItemSet getLastItemSet() {
55 | return getItemSetList().get(getItemSetList().size() - 1);
56 | }
57 |
58 | /**
59 | * 获取序列中最后一个项集的最后一个一个元素
60 | *
61 | * @return
62 | */
63 | public Integer getLastItemSetNum() {
64 | ItemSet lastItemSet = getItemSetList().get(getItemSetList().size() - 1);
65 | int lastItemNum = lastItemSet.getItems().get(lastItemSet.getItems().size() - 1);
66 |
67 | return lastItemNum;
68 | }
69 |
70 | /**
71 | * 判断序列中最后一个项集是否为单一的值
72 | *
73 | * @return
74 | */
75 | public boolean isLastItemSetSingleNum() {
76 | ItemSet lastItemSet = getItemSetList().get(getItemSetList().size() - 1);
77 | int size = lastItemSet.getItems().size();
78 |
79 | return size == 1 ? true : false;
80 | }
81 |
82 | @Override
83 | public int compareTo(Sequence o) {
84 | // TODO Auto-generated method stub
85 | return this.getFirstItemSetNum().compareTo(o.getFirstItemSetNum());
86 | }
87 |
88 | @Override
89 | protected Object clone() throws CloneNotSupportedException {
90 | // TODO Auto-generated method stub
91 | return super.clone();
92 | }
93 |
94 | /**
95 | * 拷贝一份一模一样的序列
96 | */
97 | public Sequence copySeqence() {
98 | Sequence copySeq = new Sequence();
99 | for (ItemSet itemSet : this.itemSetList) {
100 | copySeq.getItemSetList().add(new ItemSet(itemSet.copyItems()));
101 | }
102 |
103 | return copySeq;
104 | }
105 |
106 | /**
107 | * 比较2个序列是否相等,需要判断内部的每个项集是否完全一致
108 | *
109 | * @param seq
110 | * 比较的序列对象
111 | * @return
112 | */
113 | public boolean compareIsSame(Sequence seq) {
114 | boolean result = true;
115 | ArrayList itemSetList2 = seq.getItemSetList();
116 | ItemSet tempItemSet1;
117 | ItemSet tempItemSet2;
118 |
119 | if (itemSetList2.size() != this.itemSetList.size()) {
120 | return false;
121 | }
122 | for (int i = 0; i < itemSetList2.size(); i++) {
123 | tempItemSet1 = this.itemSetList.get(i);
124 | tempItemSet2 = itemSetList2.get(i);
125 |
126 | if (!tempItemSet1.compareIsSame(tempItemSet2)) {
127 | // 只要不相等,直接退出函数
128 | result = false;
129 | break;
130 | }
131 | }
132 |
133 | return result;
134 | }
135 |
136 | /**
137 | * 生成此序列的所有子序列
138 | *
139 | * @return
140 | */
141 | public ArrayList createChildSeqs() {
142 | ArrayList childSeqs = new ArrayList<>();
143 | ArrayList tempItems;
144 | Sequence tempSeq = null;
145 | ItemSet tempItemSet;
146 |
147 | for (int i = 0; i < this.itemSetList.size(); i++) {
148 | tempItemSet = itemSetList.get(i);
149 | if (tempItemSet.getItems().size() == 1) {
150 | tempSeq = this.copySeqence();
151 |
152 | // 如果只有项集中只有1个元素,则直接移除
153 | tempSeq.itemSetList.remove(i);
154 | childSeqs.add(tempSeq);
155 | } else {
156 | tempItems = tempItemSet.getItems();
157 | for (int j = 0; j < tempItems.size(); j++) {
158 | tempSeq = this.copySeqence();
159 |
160 | // 在拷贝的序列中移除一个数字
161 | tempSeq.getItemSetList().get(i).getItems().remove(j);
162 | childSeqs.add(tempSeq);
163 | }
164 | }
165 | }
166 |
167 | return childSeqs;
168 | }
169 |
170 | }
171 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/sequential/patterns/prefixspan/ItemSet.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.sequential.patterns.prefixspan;
2 |
3 | import java.util.ArrayList;
4 |
5 | /**
6 | * 字符项集类
7 | */
8 | public class ItemSet {
9 |
10 | // 项集内的字符
11 | private ArrayList items;
12 |
13 | public ItemSet(String[] str) {
14 | items = new ArrayList<>();
15 | for (String s : str) {
16 | items.add(s);
17 | }
18 | }
19 |
20 | public ItemSet(ArrayList itemsList) {
21 | this.items = itemsList;
22 | }
23 |
24 | public ItemSet(String s) {
25 | items = new ArrayList<>();
26 | for (int i = 0; i < s.length(); i++) {
27 | items.add(s.charAt(i) + "");
28 | }
29 | }
30 |
31 | public ArrayList getItems() {
32 | return items;
33 | }
34 |
35 | public void setItems(ArrayList items) {
36 | this.items = items;
37 | }
38 |
39 | /**
40 | * 获取项集最后1个元素
41 | *
42 | * @return
43 | */
44 | public String getLastValue() {
45 | int size = this.items.size();
46 |
47 | return this.items.get(size - 1);
48 | }
49 | }
50 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/sequential/patterns/prefixspan/PrefixSpanExample.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.sequential.patterns.prefixspan;
2 |
3 | /**
4 | * PrefixSpan序列模式挖掘算法
5 | */
6 | public class PrefixSpanExample {
7 |
8 | public static void main(String[] agrs) {
9 | String filePath = "data/prefixspan/input.txt";
10 | //最小支持度阈值率
11 | double minSupportRate = 0.4;
12 |
13 | PrefixSpanCore tool = new PrefixSpanCore(filePath, minSupportRate);
14 | tool.prefixSpanCalculate();
15 | }
16 |
17 | }
18 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/statistical/learning/ann/ANNCore.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.statistical.learning.ann;
2 |
3 | import java.io.BufferedReader;
4 | import java.io.File;
5 | import java.io.FileReader;
6 | import java.util.ArrayList;
7 | import java.util.List;
8 |
9 | /**
10 | * SVM支持向量机工具类
11 | */
12 | public class ANNCore {
13 |
14 | // 训练集数据文件路径
15 | private String trainDataPath;
16 | // svm_problem对象,用于构造svm model模型
17 | private ANNProblem sProblem;
18 | // svm参数,里面有svm支持向量机的类型和不同 的svm的核函数类型
19 | private ANNParameter sParam;
20 |
21 | public ANNCore(String trainDataPath) {
22 | this.trainDataPath = trainDataPath;
23 |
24 | // 初始化svm相关变量
25 | sProblem = initSvmProblem();
26 | sParam = initSvmParam();
27 | }
28 |
29 | /**
30 | * 初始化操作,根据训练集数据构造分类模型
31 | */
32 | private void initOperation() {
33 |
34 | }
35 |
36 | /**
37 | * svm_problem对象,训练集数据的相关信息配置
38 | *
39 | * @return
40 | */
41 | private ANNProblem initSvmProblem() {
42 | List label = new ArrayList();
43 | List nodeSet = new ArrayList();
44 | getData(nodeSet, label, trainDataPath);
45 |
46 | int dataRange = nodeSet.get(0).length;
47 | ANNNode[][] datas = new ANNNode[nodeSet.size()][dataRange]; // 训练集的向量表
48 | for (int i = 0; i < datas.length; i++) {
49 | for (int j = 0; j < dataRange; j++) {
50 | datas[i][j] = nodeSet.get(i)[j];
51 | }
52 | }
53 | double[] lables = new double[label.size()]; // a,b 对应的lable
54 | for (int i = 0; i < lables.length; i++) {
55 | lables[i] = label.get(i);
56 | }
57 |
58 | // 定义svm_problem对象
59 | ANNProblem problem = new ANNProblem();
60 | problem.l = nodeSet.size(); // 向量个数
61 | problem.x = datas; // 训练集向量表
62 | problem.y = lables; // 对应的lable数组
63 |
64 | return problem;
65 | }
66 |
67 | /**
68 | * 初始化svm支持向量机的参数,包括svm的类型和核函数的类型
69 | *
70 | * @return
71 | */
72 | private ANNParameter initSvmParam() {
73 | // 定义svm_parameter对象
74 | ANNParameter param = new ANNParameter();
75 | param.svm_type = ANNParameter.EPSILON_SVR;
76 | // 设置svm的核函数类型为线型
77 | param.kernel_type = ANNParameter.LINEAR;
78 | // 后面的参数配置只针对训练集的数据
79 | param.cache_size = 100;
80 | param.eps = 0.00001;
81 | param.C = 1.9;
82 |
83 | return param;
84 | }
85 |
86 | /**
87 | * 通过svm方式预测数据的类型
88 | *
89 | * @param testDataPath
90 | */
91 | public void svmPredictData(String testDataPath) {
92 | // 获取测试数据
93 | List testlabel = new ArrayList();
94 | List testnodeSet = new ArrayList();
95 | getData(testnodeSet, testlabel, testDataPath);
96 | int dataRange = testnodeSet.get(0).length;
97 |
98 | ANNNode[][] testdatas = new ANNNode[testnodeSet.size()][dataRange]; // 训练集的向量表
99 | for (int i = 0; i < testdatas.length; i++) {
100 | for (int j = 0; j < dataRange; j++) {
101 | testdatas[i][j] = testnodeSet.get(i)[j];
102 | }
103 | }
104 | // 测试数据的真实值,在后面将会与svm的预测值做比较
105 | double[] testlables = new double[testlabel.size()]; // a,b 对应的lable
106 | for (int i = 0; i < testlables.length; i++) {
107 | testlables[i] = testlabel.get(i);
108 | }
109 |
110 | // 如果参数没有问题,则svm.svm_check_parameter()函数返回null,否则返回error描述。
111 | // 对svm的配置参数叫验证,因为有些参数只针对部分的支持向量机的类型
112 | System.out.println(ANN.ann_check_parameter(sProblem, sParam));
113 | System.out.println("------------检验参数-----------");
114 | // 训练SVM分类模型
115 | ANNModel model = ANN.ann_train(sProblem, sParam);
116 |
117 | // 预测测试数据的lable
118 | double err = 0.0;
119 | for (int i = 0; i < testdatas.length; i++) {
120 | double truevalue = testlables[i];
121 | // 测试数据真实值
122 | System.out.print(truevalue + " ");
123 | double predictValue = ANN.ann_predict(model, testdatas[i]);
124 | // 测试数据预测值
125 | System.out.println(predictValue);
126 | }
127 | }
128 |
129 | /**
130 | * 从文件中获取数据
131 | *
132 | * @param nodeSet
133 | * 向量节点
134 | * @param label
135 | * 节点值类型值
136 | * @param filename
137 | * 数据文件地址
138 | */
139 | private void getData(List nodeSet, List label, String filename) {
140 | try {
141 |
142 | FileReader fr = new FileReader(new File(filename));
143 | BufferedReader br = new BufferedReader(fr);
144 | String line = null;
145 | while ((line = br.readLine()) != null) {
146 | String[] datas = line.split(",");
147 | ANNNode[] vector = new ANNNode[datas.length - 1];
148 | for (int i = 0; i < datas.length - 1; i++) {
149 | ANNNode node = new ANNNode();
150 | node.index = i + 1;
151 | node.value = Double.parseDouble(datas[i]);
152 | vector[i] = node;
153 | }
154 | nodeSet.add(vector);
155 | double lablevalue = Double.parseDouble(datas[datas.length - 1]);
156 | label.add(lablevalue);
157 | }
158 | } catch (Exception e) {
159 | e.printStackTrace();
160 | }
161 |
162 | }
163 |
164 | }
165 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/statistical/learning/ann/ANNExample.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.statistical.learning.ann;
2 |
3 | public class ANNExample {
4 |
5 | public static void main(String[] args) {
6 | // 训练集数据文件路径
7 | String trainDataPath = "data/ann/trainInput.txt";
8 | // 测试数据文件路径
9 | String testDataPath = "data/ann/testInput.txt";
10 |
11 | ANNCore tool = new ANNCore(trainDataPath);
12 | // 对测试数据进行ANN分类
13 | tool.svmPredictData(testDataPath);
14 | }
15 |
16 | }
17 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/statistical/learning/ann/ANNModel.java:
--------------------------------------------------------------------------------
1 | //
2 | // svm_model
3 | //
4 | package com.jusdt.datamining.statistical.learning.ann;
5 |
6 | import java.io.Serializable;
7 |
public class ANNModel implements Serializable {

	private static final long serialVersionUID = 1L;

	// SVM parameters the model was trained with
	ANNParameter param; // parameter
	// Number of distinct classes
	int nr_class; // number of classes, = 2 in regression/one class svm
	int l; // total #SV
	ANNNode[][] SV; // SVs (SV[l])
	double[][] sv_coef; // coefficients for SVs in decision functions (sv_coef[k-1][l])
	double[] rho; // constants in decision functions (rho[k*(k-1)/2])
	double[] probA; // pairwise probability information
	double[] probB;

	// for classification only

	// Label value of each class
	int[] label; // label of each class (label[k])
	int[] nSV; // number of SVs for each class (nSV[k])
	// nSV[0] + nSV[1] + ... + nSV[k-1] = l

};
31 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/statistical/learning/ann/ANNNode.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.statistical.learning.ann;
2 |
3 | import java.io.Serializable;
4 |
/**
 * A single feature of an SVM input vector.
 *
 * @author lyq
 */
public class ANNNode implements Serializable {

	private static final long serialVersionUID = 1L;

	// Feature index (assigned 1-based when read from file)
	public int index;
	// Feature value
	public double value;

}
21 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/statistical/learning/ann/ANNParameter.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.statistical.learning.ann;
2 |
3 | import java.io.Serializable;
4 |
public class ANNParameter implements Cloneable, Serializable {

	private static final long serialVersionUID = 1L;

	/* svm_type: type of the support vector machine */
	public static final int C_SVC = 0;
	public static final int NU_SVC = 1;
	// one-class svm
	public static final int ONE_CLASS = 2;
	public static final int EPSILON_SVR = 3;
	public static final int NU_SVR = 4;

	/* kernel_type: type of the kernel function */
	// linear kernel
	public static final int LINEAR = 0;
	// polynomial kernel
	public static final int POLY = 1;
	// RBF (radial basis function) kernel
	public static final int RBF = 2;
	// sigmoid (two-layer neural network) kernel
	public static final int SIGMOID = 3;
	public static final int PRECOMPUTED = 4;

	public int svm_type;
	public int kernel_type;
	public int degree; // for poly
	public double gamma; // for poly/rbf/sigmoid
	public double coef0; // for poly/sigmoid

	// these are for training only
	public double cache_size; // in MB
	public double eps; // stopping criteria
	public double C; // for C_SVC, EPSILON_SVR and NU_SVR
	public int nr_weight; // for C_SVC
	public int[] weight_label; // for C_SVC
	public double[] weight; // for C_SVC
	public double nu; // for NU_SVC, ONE_CLASS, and NU_SVR
	public double p; // for EPSILON_SVR
	public int shrinking; // use the shrinking heuristics
	public int probability; // do probability estimates

	// Returns a field-by-field copy; null when cloning is not supported
	@Override
	public Object clone() {
		try {
			return super.clone();
		} catch (CloneNotSupportedException e) {
			return null;
		}
	}

}
56 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/statistical/learning/ann/ANNPrintInterface.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.statistical.learning.ann;
2 |
/**
 * Callback interface for printing a message.
 */
public interface ANNPrintInterface {

	public void print(String s);

}
8 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/statistical/learning/ann/ANNProblem.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.statistical.learning.ann;
2 |
3 | import java.io.Serializable;
4 |
/**
 * Basic description of the training data set.
 */
public class ANNProblem implements Serializable {

	private static final long serialVersionUID = 1L;

	// total number of vectors
	public int l;
	// label value of each vector
	public double[] y;
	// the training vector table
	public ANNNode[][] x;

}
20 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/statistical/learning/em/EMCore.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.statistical.learning.em;
2 |
3 | import java.io.BufferedReader;
4 | import java.io.File;
5 | import java.io.FileReader;
6 | import java.io.IOException;
7 | import java.text.MessageFormat;
8 | import java.util.ArrayList;
9 |
10 | /**
11 | * EM最大期望算法工具类
12 | */
13 | public class EMCore {
14 |
15 | // 测试数据文件地址
16 | private String dataFilePath;
17 | // 测试坐标点数据
18 | private String[][] data;
19 | // 测试坐标点数据列表
20 | private ArrayList pointArray;
21 | // 目标C1点
22 | private Point p1;
23 | // 目标C2点
24 | private Point p2;
25 |
26 | public EMCore(String dataFilePath) {
27 | this.dataFilePath = dataFilePath;
28 | pointArray = new ArrayList<>();
29 | }
30 |
31 | /**
32 | * 从文件中读取数据
33 | */
34 | public void readDataFile() {
35 | File file = new File(dataFilePath);
36 | ArrayList dataArray = new ArrayList();
37 |
38 | try {
39 | BufferedReader in = new BufferedReader(new FileReader(file));
40 | String str;
41 | String[] tempArray;
42 | while ((str = in.readLine()) != null) {
43 | tempArray = str.split(" ");
44 | dataArray.add(tempArray);
45 | }
46 | in.close();
47 | } catch (IOException e) {
48 | e.getStackTrace();
49 | }
50 |
51 | data = new String[dataArray.size()][];
52 | dataArray.toArray(data);
53 |
54 | // 开始时默认取头2个点作为2个簇中心
55 | p1 = new Point(Integer.parseInt(data[0][0]), Integer.parseInt(data[0][1]));
56 | p2 = new Point(Integer.parseInt(data[1][0]), Integer.parseInt(data[1][1]));
57 |
58 | Point p;
59 | for (String[] array : data) {
60 | // 将数据转换为对象加入列表方便计算
61 | p = new Point(Integer.parseInt(array[0]), Integer.parseInt(array[1]));
62 | pointArray.add(p);
63 | }
64 | }
65 |
66 | /**
67 | * 计算坐标点对于2个簇中心点的隶属度
68 | *
69 | * @param p
70 | * 待测试坐标点
71 | */
72 | private void computeMemberShip(Point p) {
73 | // p点距离第一个簇中心点的距离
74 | double distance1 = 0;
75 | // p距离第二个中心点的距离
76 | double distance2 = 0;
77 |
78 | // 用欧式距离计算
79 | distance1 = Math.pow(p.getX() - p1.getX(), 2) + Math.pow(p.getY() - p1.getY(), 2);
80 | distance2 = Math.pow(p.getX() - p2.getX(), 2) + Math.pow(p.getY() - p2.getY(), 2);
81 |
82 | // 计算对于p1点的隶属度,与距离成反比关系,距离靠近越小,隶属度越大,所以要用大的distance2另外的距离来表示
83 | p.setMemberShip1(distance2 / (distance1 + distance2));
84 | // 计算对于p2点的隶属度
85 | p.setMemberShip2(distance1 / (distance1 + distance2));
86 | }
87 |
88 | /**
89 | * 执行期望最大化步骤
90 | */
91 | public void exceptMaxStep() {
92 | // 新的优化过的簇中心点
93 | double p1X = 0;
94 | double p1Y = 0;
95 | double p2X = 0;
96 | double p2Y = 0;
97 | double temp1 = 0;
98 | double temp2 = 0;
99 | // 误差值
100 | double errorValue1 = 0;
101 | double errorValue2 = 0;
102 | // 上次更新的簇点坐标
103 | Point lastP1 = null;
104 | Point lastP2 = null;
105 |
106 | // 当开始计算的时候,或是中心点的误差值超过1的时候都需要再次迭代计算
107 | while (lastP1 == null || errorValue1 > 1.0 || errorValue2 > 1.0) {
108 | for (Point p : pointArray) {
109 | computeMemberShip(p);
110 | p1X += p.getMemberShip1() * p.getMemberShip1() * p.getX();
111 | p1Y += p.getMemberShip1() * p.getMemberShip1() * p.getY();
112 | temp1 += p.getMemberShip1() * p.getMemberShip1();
113 |
114 | p2X += p.getMemberShip2() * p.getMemberShip2() * p.getX();
115 | p2Y += p.getMemberShip2() * p.getMemberShip2() * p.getY();
116 | temp2 += p.getMemberShip2() * p.getMemberShip2();
117 | }
118 |
119 | lastP1 = new Point(p1.getX(), p1.getY());
120 | lastP2 = new Point(p2.getX(), p2.getY());
121 |
122 | // 套公式计算新的簇中心点坐标,最最大化处理
123 | p1.setX(p1X / temp1);
124 | p1.setY(p1Y / temp1);
125 | p2.setX(p2X / temp2);
126 | p2.setY(p2Y / temp2);
127 |
128 | errorValue1 = Math.abs(lastP1.getX() - p1.getX()) + Math.abs(lastP1.getY() - p1.getY());
129 | errorValue2 = Math.abs(lastP2.getX() - p2.getX()) + Math.abs(lastP2.getY() - p2.getY());
130 | }
131 |
132 | System.out.println(
133 | MessageFormat.format("簇中心节点p1({0}, {1}), p2({2}, {3})", p1.getX(), p1.getY(), p2.getX(), p2.getY()));
134 | }
135 |
136 | }
137 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/statistical/learning/em/EMExample.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.statistical.learning.em;
2 |
3 | /**
4 | * EM期望最大化算法场景调用类
5 | */
6 | public class EMExample {
7 |
8 | public static void main(String[] args) {
9 | String filePath = "data/em/input.txt";
10 |
11 | EMCore tool = new EMCore(filePath);
12 | tool.readDataFile();
13 | tool.exceptMaxStep();
14 | }
15 |
16 | }
17 |
--------------------------------------------------------------------------------
/src/main/java/com/jusdt/datamining/statistical/learning/em/Point.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.statistical.learning.em;
2 |
/**
 * A 2D point carrying its membership degrees towards the two cluster centers.
 */
public class Point {

	// x coordinate
	private double x;
	// y coordinate
	private double y;
	// membership degree towards center P1
	private double memberShip1;
	// membership degree towards center P2
	private double memberShip2;

	public Point(double x, double y) {
		this.x = x;
		this.y = y;
	}

	public double getX() {
		return x;
	}

	public void setX(double x) {
		this.x = x;
	}

	public double getY() {
		return y;
	}

	public void setY(double y) {
		this.y = y;
	}

	public double getMemberShip1() {
		return memberShip1;
	}

	public void setMemberShip1(double memberShip1) {
		this.memberShip1 = memberShip1;
	}

	public double getMemberShip2() {
		return memberShip2;
	}

	public void setMemberShip2(double memberShip2) {
		this.memberShip2 = memberShip2;
	}

}
55 |
--------------------------------------------------------------------------------
/src/main/resources/logback.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | %d{ISO8601} [%thread] %-5level %logger{36} [Line:%-3L] - %msg%n
8 |
9 |
10 |
11 | INFO
12 | ACCEPT
13 | DENY
14 |
15 |
16 |
17 |
19 | logs/datamining.log
20 |
21 | %d{ISO8601} [%thread] %-5level %logger{36} [Line:%-3L] - %msg%n
22 |
23 |
24 |
25 | INFO
26 |
27 |
28 | logs/datamining.log.%d{yyyy-MM-dd}.gz
29 | 30
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
--------------------------------------------------------------------------------
/src/test/java/com/jusdt/datamining/demo/MainDemo.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.demo;
2 |
public class MainDemo {

	public static void main(String[] args) {
		// Intentionally empty demo entry point
	}

}
11 |
--------------------------------------------------------------------------------
/src/test/java/com/jusdt/datamining/dimensionality/reduction/pca/ToeplitzMatrixTest.java:
--------------------------------------------------------------------------------
1 | package com.jusdt.datamining.dimensionality.reduction.pca;
2 |
3 | import com.jusdt.datamining.dimensionality.reduction.pca.ToeplitzMatrix;
4 |
5 | import junit.framework.TestCase;
6 |
7 | public class ToeplitzMatrixTest extends TestCase {
8 |
9 | public ToeplitzMatrixTest(String testName) {
10 | super(testName);
11 | }
12 |
13 | @Override
14 | protected void setUp() throws Exception {
15 | super.setUp();
16 | }
17 |
18 | @Override
19 | protected void tearDown() throws Exception {
20 | super.tearDown();
21 | }
22 |
23 | public void testToeplitz() {
24 | double[] data = new double[] { 1, 2, 3, 4, 5, 6 };
25 |
26 | ToeplitzMatrix m = new ToeplitzMatrix(data);
27 | // MatrixHelper.print(m, 1, 3);
28 | assertTrue("nrows wrong", m.getNRows() == 6);
29 | assertTrue("ncols wrong", m.getNCols() == 6);
30 | double[][] a = m.getArray();
31 |
32 | assertEquals("0,0", 1., a[0][0]);
33 | assertEquals("0,1", 2., a[0][1]);
34 | assertEquals("0,2", 3., a[0][2]);
35 | assertEquals("0,3", 4., a[0][3]);
36 | assertEquals("0,4", 5., a[0][4]);
37 | assertEquals("0,5", 6., a[0][5]);
38 |
39 | assertEquals("1,0", 2., a[1][0]);
40 | assertEquals("1,1", 1., a[1][1]);
41 | assertEquals("1,2", 2., a[1][2]);
42 | assertEquals("1,3", 3., a[1][3]);
43 | assertEquals("1,4", 4., a[1][4]);
44 | assertEquals("1,5", 5., a[1][5]);
45 |
46 | assertEquals("2,0", 3., a[2][0]);
47 | assertEquals("2,1", 2., a[2][1]);
48 | assertEquals("2,2", 1., a[2][2]);
49 | assertEquals("2,3", 2., a[2][3]);
50 | assertEquals("2,4", 3., a[2][4]);
51 | assertEquals("2,5", 4., a[2][5]);
52 |
53 | assertEquals("3,0", 4., a[3][0]);
54 | assertEquals("3,1", 3., a[3][1]);
55 | assertEquals("3,2", 2., a[3][2]);
56 | assertEquals("3,3", 1., a[3][3]);
57 | assertEquals("3,4", 2., a[3][4]);
58 | assertEquals("3,5", 3., a[3][5]);
59 |
60 | assertEquals("4,0", 5., a[4][0]);
61 | assertEquals("4,1", 4., a[4][1]);
62 | assertEquals("4,2", 3., a[4][2]);
63 | assertEquals("4,3", 2., a[4][3]);
64 | assertEquals("4,4", 1., a[4][4]);
65 | assertEquals("4,5", 2., a[4][5]);
66 |
67 | assertEquals("5,0", 6., a[5][0]);
68 | assertEquals("5,1", 5., a[5][1]);
69 | assertEquals("5,2", 4., a[5][2]);
70 | assertEquals("5,3", 3., a[5][3]);
71 | assertEquals("5,4", 2., a[5][4]);
72 | assertEquals("5,5", 1., a[5][5]);
73 | }
74 |
75 | }
76 |
--------------------------------------------------------------------------------
/src/test/resources/logback-test.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | %d{MMdd.HHmmss.SSS} [%-20t] [%-5p] [%-20c] [L:%-3L] - %m%n
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
--------------------------------------------------------------------------------
/需要验收的算法:
--------------------------------------------------------------------------------
1 | 1、朴素贝叶斯
2 | 2、KMeans
3 | 3、KNN
4 | 4、PCA
5 | 5、ANN
6 | 6、决策树
7 | 7、层次聚类
8 | 及其他辅助算法 ok
9 | 多媒体数据处理算法 ok
--------------------------------------------------------------------------------