├── .gitattributes ├── .gitignore ├── README.md ├── data ├── aco │ └── input.txt ├── adaboost │ └── input.txt ├── ann │ ├── testInput.txt │ └── trainInput.txt ├── apriori │ └── testInput.txt ├── bayesnetwork │ ├── attach.txt │ └── input.txt ├── birch │ ├── realData.txt │ └── testInput.txt ├── cabddcc │ └── graphData.txt ├── cart │ └── input.txt ├── cba │ └── input.txt ├── chameleon │ └── graphData.txt ├── dbscan │ └── input.txt ├── em │ └── input.txt ├── fptree │ └── testInput.txt ├── gsp │ └── testInput.txt ├── gspan │ ├── input.txt │ └── reallyData.txt ├── hits │ └── input.txt ├── id3 │ └── input.txt ├── kdtree │ └── input.txt ├── kmeans │ └── input.txt ├── knn │ ├── testInput.txt │ └── trainInput.txt ├── maze │ └── mapData.txt ├── msapriori │ ├── testInput.txt │ └── testInput2.txt ├── naivebayes │ └── input.txt ├── pagerank │ └── input.txt ├── pca │ ├── Makefile │ ├── basilevsy.data │ ├── compressor_1_day_detail.data │ ├── compressor_per_day_kwh.data │ └── simple.data ├── prefixspan │ └── input.txt ├── randomforest │ └── input.txt ├── roughsets │ └── input.txt ├── tan │ └── input.txt └── viterbi │ ├── humidity-matrix.txt │ └── stmatrix.txt ├── pom.xml ├── src ├── main │ ├── assembly │ │ └── distribution.xml │ ├── bin │ │ └── ctl.sh │ ├── java │ │ └── com │ │ │ └── jusdt │ │ │ └── datamining │ │ │ ├── association │ │ │ └── analysis │ │ │ │ ├── apriori │ │ │ │ ├── AprioriCore.java │ │ │ │ ├── AprioriExample.java │ │ │ │ └── FrequentItem.java │ │ │ │ └── fptree │ │ │ │ ├── FPTreeCore.java │ │ │ │ ├── FPTreeExample.java │ │ │ │ └── TreeNode.java │ │ │ ├── bagging │ │ │ └── boosting │ │ │ │ └── adaboost │ │ │ │ ├── AdaBoostCore.java │ │ │ │ ├── AdaBoostExample.java │ │ │ │ └── Point.java │ │ │ ├── classification │ │ │ ├── cart │ │ │ │ ├── AttrNode.java │ │ │ │ ├── CARTCore.java │ │ │ │ └── CARTExample.java │ │ │ ├── id3 │ │ │ │ ├── AttrNode.java │ │ │ │ ├── DataNode.java │ │ │ │ ├── ID3Core.java │ │ │ │ └── ID3Example.java │ │ │ ├── knn │ │ │ │ ├── KNNCore.java │ │ │ 
│ ├── KNNExample.java │ │ │ │ └── Sample.java │ │ │ └── naivebayes │ │ │ │ ├── NaiveBayesCore.java │ │ │ │ └── NaiveBayesExample.java │ │ │ ├── clustering │ │ │ ├── birch │ │ │ │ ├── BIRCHCore.java │ │ │ │ ├── BIRCHExample.java │ │ │ │ ├── Cluster.java │ │ │ │ ├── ClusteringFeature.java │ │ │ │ ├── LeafNode.java │ │ │ │ └── NonLeafNode.java │ │ │ └── kmeans │ │ │ │ ├── KMeansCore.java │ │ │ │ ├── KMeansExample.java │ │ │ │ └── Point.java │ │ │ ├── dimensionality │ │ │ └── reduction │ │ │ │ └── pca │ │ │ │ ├── DataReader.java │ │ │ │ ├── EVD.java │ │ │ │ ├── Main.java │ │ │ │ ├── Matrix.java │ │ │ │ ├── MatrixException.java │ │ │ │ ├── MatrixHelper.java │ │ │ │ ├── PCACore.java │ │ │ │ ├── PCACoreHandler.java │ │ │ │ ├── PCAExample.java │ │ │ │ ├── SVD.java │ │ │ │ ├── ToeplitzMatrix.java │ │ │ │ └── TrajectoryMatrix.java │ │ │ ├── graph │ │ │ └── gspan │ │ │ │ ├── DFSCodeTraveler.java │ │ │ │ ├── Edge.java │ │ │ │ ├── EdgeFrequency.java │ │ │ │ ├── GSpanExample.java │ │ │ │ ├── GSpanTool.java │ │ │ │ ├── Graph.java │ │ │ │ ├── GraphCode.java │ │ │ │ ├── GraphData.java │ │ │ │ └── SubChildTraveler.java │ │ │ ├── integrated │ │ │ └── cba │ │ │ │ ├── AprioriCore.java │ │ │ │ ├── CBACore.java │ │ │ │ ├── CBAExample.java │ │ │ │ └── FrequentItem.java │ │ │ ├── link │ │ │ ├── hits │ │ │ │ ├── HITSCore.java │ │ │ │ └── HITSExample.java │ │ │ └── pagerank │ │ │ │ ├── PageRankCore.java │ │ │ │ └── PageRankExample.java │ │ │ ├── others │ │ │ ├── aco │ │ │ │ ├── ACOCore.java │ │ │ │ ├── ACOExample.java │ │ │ │ └── Ant.java │ │ │ ├── bayesnetwork │ │ │ │ ├── BayesNetWorkCore.java │ │ │ │ ├── BayesNetWorkExample.java │ │ │ │ └── Node.java │ │ │ ├── cabddcc │ │ │ │ ├── CABDDCCCore.java │ │ │ │ ├── CABDDCCExample.java │ │ │ │ ├── Graph.java │ │ │ │ └── Point.java │ │ │ ├── chameleon │ │ │ │ ├── ChameleonCore.java │ │ │ │ ├── ChameleonExample.java │ │ │ │ ├── Cluster.java │ │ │ │ └── Point.java │ │ │ ├── dbscan │ │ │ │ ├── DBSCANCore.java │ │ │ │ ├── DBSCANExample.java │ │ │ │ └── 
Point.java │ │ │ ├── ga │ │ │ │ ├── GACore.java │ │ │ │ ├── GAExample.java │ │ │ │ └── maze │ │ │ │ │ ├── GAMazeCore.java │ │ │ │ │ └── GAMazeExample.java │ │ │ ├── kdtree │ │ │ │ ├── KDTreeCore.java │ │ │ │ ├── KDTreeExample.java │ │ │ │ ├── Point.java │ │ │ │ ├── Range.java │ │ │ │ └── TreeNode.java │ │ │ ├── msapriori │ │ │ │ ├── FrequentItem.java │ │ │ │ ├── MSAprioriCore.java │ │ │ │ └── MSAprioriExample.java │ │ │ ├── randomforest │ │ │ │ ├── CARTCore.java │ │ │ │ ├── DecisionTree.java │ │ │ │ ├── RandomForestCore.java │ │ │ │ ├── RandomForestExample.java │ │ │ │ └── TreeNode.java │ │ │ ├── tan │ │ │ │ ├── AttrMutualInfo.java │ │ │ │ ├── Node.java │ │ │ │ ├── TANCore.java │ │ │ │ └── TanExample.java │ │ │ └── viterbi │ │ │ │ ├── BaseNames.java │ │ │ │ ├── ViterbiCore.java │ │ │ │ └── ViterbiExample.java │ │ │ ├── roughsets │ │ │ ├── KnowledgeSystem.java │ │ │ ├── Record.java │ │ │ ├── RecordCollection.java │ │ │ ├── RoughSetsCore.java │ │ │ └── RoughSetsExample.java │ │ │ ├── sequential │ │ │ └── patterns │ │ │ │ ├── gsp │ │ │ │ ├── GSPCore.java │ │ │ │ ├── GSPExample.java │ │ │ │ ├── ItemSet.java │ │ │ │ └── Sequence.java │ │ │ │ └── prefixspan │ │ │ │ ├── ItemSet.java │ │ │ │ ├── PrefixSpanCore.java │ │ │ │ ├── PrefixSpanExample.java │ │ │ │ └── Sequence.java │ │ │ └── statistical │ │ │ └── learning │ │ │ ├── ann │ │ │ ├── ANN.java │ │ │ ├── ANNCore.java │ │ │ ├── ANNExample.java │ │ │ ├── ANNModel.java │ │ │ ├── ANNNode.java │ │ │ ├── ANNParameter.java │ │ │ ├── ANNPrintInterface.java │ │ │ └── ANNProblem.java │ │ │ └── em │ │ │ ├── EMCore.java │ │ │ ├── EMExample.java │ │ │ └── Point.java │ └── resources │ │ └── logback.xml └── test │ ├── java │ └── com │ │ └── jusdt │ │ └── datamining │ │ ├── demo │ │ └── MainDemo.java │ │ └── dimensionality │ │ └── reduction │ │ └── pca │ │ └── ToeplitzMatrixTest.java │ └── resources │ └── logback-test.xml └── 需要验收的算法 /.gitattributes: -------------------------------------------------------------------------------- 1 | # 
Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | *.sln merge=union 7 | *.csproj merge=union 8 | *.vbproj merge=union 9 | *.fsproj merge=union 10 | *.dbproj merge=union 11 | 12 | # Standard to msysgit 13 | *.doc diff=astextplain 14 | *.DOC diff=astextplain 15 | *.docx diff=astextplain 16 | *.DOCX diff=astextplain 17 | *.dot diff=astextplain 18 | *.DOT diff=astextplain 19 | *.pdf diff=astextplain 20 | *.PDF diff=astextplain 21 | *.rtf diff=astextplain 22 | *.RTF diff=astextplain 23 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .classpath 2 | .project 3 | .settings/ 4 | target/ 5 | logs/ 6 | -------------------------------------------------------------------------------- /data/aco/input.txt: -------------------------------------------------------------------------------- 1 | # CityName 2 | 1 3 | 2 4 | 3 5 | 4 6 | # Distance 7 | 1 2 1 8 | 1 3 1.4 9 | 1 4 1 10 | 2 3 1 11 | 2 4 1 12 | 3 4 1 -------------------------------------------------------------------------------- /data/adaboost/input.txt: -------------------------------------------------------------------------------- 1 | 1 5 1 2 | 2 3 1 3 | 3 1 -1 4 | 4 5 -1 5 | 5 6 1 6 | 6 4 -1 7 | 6 7 1 8 | 7 6 1 9 | 8 7 -1 10 | 8 2 -1 -------------------------------------------------------------------------------- /data/ann/testInput.txt: -------------------------------------------------------------------------------- 1 | 18.7,18.9,19.1,19.3,19.6 2 | 18.9,19.1,19.3,19.6,19.9 3 | 19.1,19.3,19.6,19.9,20.2 4 | 19.3,19.6,19.9,20.2,20.6 5 | 19.6,19.9,20.2,20.6,21 6 | 19.9,20.2,20.6,21,21.5 7 | 20.2,20.6,21,21.5,22 -------------------------------------------------------------------------------- /data/ann/trainInput.txt: -------------------------------------------------------------------------------- 1 | 
17.6,17.7,17.7,17.7,17.8 2 | 17.7,17.7,17.7,17.8,17.8 3 | 17.7,17.7,17.8,17.8,17.9 4 | 17.7,17.8,17.8,17.9,18 5 | 17.8,17.8,17.9,18,18.1 6 | 17.8,17.9,18,18.1,18.2 7 | 17.9,18,18.1,18.2,18.4 8 | 18,18.1,18.2,18.4,18.6 9 | 18.1,18.2,18.4,18.6,18.7 10 | 18.2,18.4,18.6,18.7,18.9 11 | 18.4,18.6,18.7,18.9,19.1 12 | 18.6,18.7,18.9,19.1,19.3 -------------------------------------------------------------------------------- /data/apriori/testInput.txt: -------------------------------------------------------------------------------- 1 | T1 1 2 5 2 | T2 2 4 3 | T3 2 3 4 | T4 1 2 4 5 | T5 1 3 6 | T6 2 3 7 | T7 1 3 8 | T8 1 2 3 5 9 | T9 1 2 3 -------------------------------------------------------------------------------- /data/bayesnetwork/attach.txt: -------------------------------------------------------------------------------- 1 | B A 2 | E A 3 | A M 4 | A J -------------------------------------------------------------------------------- /data/bayesnetwork/input.txt: -------------------------------------------------------------------------------- 1 | B E A M J P 2 | y y y y y 0.00012 3 | y y y y n 0.000051 4 | y y y n y 0.000013 5 | y y y n n 0.0000057 6 | y y n y y 0.000000005 7 | y y n y n 0.00000049 8 | y y n n y 0.000000095 9 | y y n n n 0.0000094 10 | y n y y y 0.0058 11 | y n y y n 0.0025 12 | y n y n y 0.00065 13 | y n y n n 0.00028 14 | y n n y y 0.00000029 15 | y n n y n 0.000029 16 | y n n n y 0.0000056 17 | y n n n n 0.00055 18 | n y y y y 0.0036 19 | n y y y n 0.0016 20 | n y y n y 0.0004 21 | n y y n n 0.00017 22 | n y n y y 0.000007 23 | n y n y n 0.00069 24 | n y n n y 0.00013 25 | n y n n n 0.013 26 | n n y y y 0.00061 27 | n n y y n 0.00026 28 | n n y n y 0.000068 29 | n n y n n 0.000029 30 | n n n y y 0.00048 31 | n n n y n 0.048 32 | n n n n y 0.0092 33 | n n n n n 0.91 -------------------------------------------------------------------------------- /data/birch/realData.txt: -------------------------------------------------------------------------------- 
1 | 5.1 3.5 1.4 0.2 2 | 4.9 3.0 1.4 0.2 3 | 4.7 3.2 1.3 0.2 4 | 4.6 3.1 1.5 0.2 5 | 5.0 3.6 1.4 0.2 6 | 5.4 3.9 1.7 0.4 7 | 4.6 3.4 1.4 0.3 8 | 5.0 3.4 1.5 0.2 9 | 4.4 2.9 1.4 0.2 10 | 4.9 3.1 1.5 0.1 11 | 5.4 3.7 1.5 0.2 12 | 4.8 3.4 1.6 0.2 13 | 4.8 3.0 1.4 0.1 14 | 4.3 3.0 1.1 0.1 15 | 5.8 4.0 1.2 0.2 16 | 5.7 4.4 1.5 0.4 17 | 5.4 3.9 1.3 0.4 18 | 5.1 3.5 1.4 0.3 19 | 5.7 3.8 1.7 0.3 20 | 5.1 3.8 1.5 0.3 21 | 5.4 3.4 1.7 0.2 22 | 5.1 3.7 1.5 0.4 23 | 4.6 3.6 1.0 0.2 24 | 5.1 3.3 1.7 0.5 25 | 4.8 3.4 1.9 0.2 26 | 5.0 3.0 1.6 0.2 27 | 5.0 3.4 1.6 0.4 28 | 5.2 3.5 1.5 0.2 29 | 5.2 3.4 1.4 0.2 30 | 4.7 3.2 1.6 0.2 31 | 4.8 3.1 1.6 0.2 32 | 5.4 3.4 1.5 0.4 33 | 5.2 4.1 1.5 0.1 34 | 5.5 4.2 1.4 0.2 35 | 4.9 3.1 1.5 0.1 36 | 5.0 3.2 1.2 0.2 37 | 5.5 3.5 1.3 0.2 38 | 4.9 3.1 1.5 0.1 39 | 4.4 3.0 1.3 0.2 40 | 5.1 3.4 1.5 0.2 41 | 5.0 3.5 1.3 0.3 42 | 4.5 2.3 1.3 0.3 43 | 4.4 3.2 1.3 0.2 44 | 5.0 3.5 1.6 0.6 45 | 5.1 3.8 1.9 0.4 46 | 4.8 3.0 1.4 0.3 47 | 5.1 3.8 1.6 0.2 48 | 4.6 3.2 1.4 0.2 49 | 5.3 3.7 1.5 0.2 50 | 5.0 3.3 1.4 0.2 51 | 7.0 3.2 4.7 1.4 52 | 6.4 3.2 4.5 1.5 53 | 6.9 3.1 4.9 1.5 54 | 5.5 2.3 4.0 1.3 55 | 6.5 2.8 4.6 1.5 56 | 5.7 2.8 4.5 1.3 57 | 6.3 3.3 4.7 1.6 58 | 4.9 2.4 3.3 1.0 59 | 6.6 2.9 4.6 1.3 60 | 5.2 2.7 3.9 1.4 61 | 5.0 2.0 3.5 1.0 62 | 5.9 3.0 4.2 1.5 63 | 6.0 2.2 4.0 1.0 64 | 6.1 2.9 4.7 1.4 65 | 5.6 2.9 3.6 1.3 66 | 6.7 3.1 4.4 1.4 67 | 5.6 3.0 4.5 1.5 68 | 5.8 2.7 4.1 1.0 69 | 6.2 2.2 4.5 1.5 70 | 5.6 2.5 3.9 1.1 71 | 5.9 3.2 4.8 1.8 72 | 6.1 2.8 4.0 1.3 73 | 6.3 2.5 4.9 1.5 74 | 6.1 2.8 4.7 1.2 75 | 6.4 2.9 4.3 1.3 76 | 6.6 3.0 4.4 1.4 77 | 6.8 2.8 4.8 1.4 78 | 6.7 3.0 5.0 1.7 79 | 6.0 2.9 4.5 1.5 80 | 5.7 2.6 3.5 1.0 81 | 5.5 2.4 3.8 1.1 82 | 5.5 2.4 3.7 1.0 83 | 5.8 2.7 3.9 1.2 84 | 6.0 2.7 5.1 1.6 85 | 5.4 3.0 4.5 1.5 86 | 6.0 3.4 4.5 1.6 87 | 6.7 3.1 4.7 1.5 88 | 6.3 2.3 4.4 1.3 89 | 5.6 3.0 4.1 1.3 90 | 5.5 2.5 4.0 1.3 91 | 5.5 2.6 4.4 1.2 92 | 6.1 3.0 4.6 1.4 93 | 5.8 2.6 4.0 1.2 94 | 5.0 2.3 3.3 1.0 95 | 5.6 2.7 4.2 1.3 96 | 5.7 3.0 
4.2 1.2 97 | 5.7 2.9 4.2 1.3 98 | 6.2 2.9 4.3 1.3 99 | 5.1 2.5 3.0 1.1 100 | 5.7 2.8 4.1 1.3 101 | 6.3 3.3 6.0 2.5 102 | 5.8 2.7 5.1 1.9 103 | 7.1 3.0 5.9 2.1 104 | 6.3 2.9 5.6 1.8 105 | 6.5 3.0 5.8 2.2 106 | 7.6 3.0 6.6 2.1 107 | 4.9 2.5 4.5 1.7 108 | 7.3 2.9 6.3 1.8 109 | 6.7 2.5 5.8 1.8 110 | 7.2 3.6 6.1 2.5 111 | 6.5 3.2 5.1 2.0 112 | 6.4 2.7 5.3 1.9 113 | 6.8 3.0 5.5 2.1 114 | 5.7 2.5 5.0 2.0 115 | 5.8 2.8 5.1 2.4 116 | 6.4 3.2 5.3 2.3 117 | 6.5 3.0 5.5 1.8 118 | 7.7 3.8 6.7 2.2 119 | 7.7 2.6 6.9 2.3 120 | 6.0 2.2 5.0 1.5 121 | 6.9 3.2 5.7 2.3 122 | 5.6 2.8 4.9 2.0 123 | 7.7 2.8 6.7 2.0 124 | 6.3 2.7 4.9 1.8 125 | 6.7 3.3 5.7 2.1 126 | 7.2 3.2 6.0 1.8 127 | 6.2 2.8 4.8 1.8 128 | 6.1 3.0 4.9 1.8 129 | 6.4 2.8 5.6 2.1 130 | 7.2 3.0 5.8 1.6 131 | 7.4 2.8 6.1 1.9 132 | 7.9 3.8 6.4 2.0 133 | 6.4 2.8 5.6 2.2 134 | 6.3 2.8 5.1 1.5 135 | 6.1 2.6 5.6 1.4 136 | 7.7 3.0 6.1 2.3 137 | 6.3 3.4 5.6 2.4 138 | 6.4 3.1 5.5 1.8 139 | 6.0 3.0 4.8 1.8 140 | 6.9 3.1 5.4 2.1 141 | 6.7 3.1 5.6 2.4 142 | 6.9 3.1 5.1 2.3 143 | 5.8 2.7 5.1 1.9 144 | 6.8 3.2 5.9 2.3 145 | 6.7 3.3 5.7 2.5 146 | 6.7 3.0 5.2 2.3 147 | 6.3 2.5 5.0 1.9 148 | 6.5 3.0 5.2 2.0 149 | 6.2 3.4 5.4 2.3 150 | 5.9 3.0 5.1 1.8 -------------------------------------------------------------------------------- /data/birch/testInput.txt: -------------------------------------------------------------------------------- 1 | 5.1 3.5 1.4 0.2 2 | 4.9 3.0 1.4 0.2 3 | 4.7 3.2 1.3 0.8 4 | 4.6 3.1 1.5 0.8 5 | 5.0 3.6 1.8 0.6 6 | 4.7 3.2 1.4 0.8 -------------------------------------------------------------------------------- /data/cabddcc/graphData.txt: -------------------------------------------------------------------------------- 1 | 0 1 12 2 | 1 3 9 3 | 2 3 12 4 | 3 4 10 5 | 4 4 4 6 | 5 4 1 7 | 6 6 1 8 | 7 6 3 9 | 8 6 9 10 | 9 8 3 11 | 10 8 10 12 | 11 9 2 13 | 12 9 11 14 | 13 10 9 15 | 14 11 12 -------------------------------------------------------------------------------- /data/cart/input.txt: 
-------------------------------------------------------------------------------- 1 | Rid Age Income Student CreditRating BuysComputer 2 | 1 Youth High No Fair No 3 | 2 Youth High No Excellent No 4 | 3 MiddleAged High No Fair Yes 5 | 4 Senior Medium No Fair Yes 6 | 5 Senior Low Yes Fair Yes 7 | 6 Senior Low Yes Excellent No 8 | 7 MiddleAged Low Yes Excellent Yes 9 | 8 Youth Medium No Fair No 10 | 9 Youth Low Yes Fair Yes 11 | 10 Senior Medium Yes Fair Yes 12 | 11 Youth Medium Yes Excellent Yes 13 | 12 MiddleAged Medium No Excellent Yes 14 | 13 MiddleAged High Yes Fair Yes 15 | 14 Senior Medium No Excellent No -------------------------------------------------------------------------------- /data/cba/input.txt: -------------------------------------------------------------------------------- 1 | Rid Age Income Student CreditRating BuysComputer 2 | 1 13 High No Fair CLassNo 3 | 2 11 High No Excellent CLassNo 4 | 3 25 High No Fair CLassYes 5 | 4 45 Medium No Fair CLassYes 6 | 5 50 Low Yes Fair CLassYes 7 | 6 51 Low Yes Excellent CLassNo 8 | 7 30 Low Yes Excellent CLassYes 9 | 8 13 Medium No Fair CLassNo 10 | 9 9 Low Yes Fair CLassYes 11 | 10 55 Medium Yes Fair CLassYes 12 | 11 14 Medium Yes Excellent CLassYes 13 | 12 33 Medium No Excellent CLassYes 14 | 13 33 High Yes Fair CLassYes 15 | 14 41 Medium No Excellent CLassNo -------------------------------------------------------------------------------- /data/chameleon/graphData.txt: -------------------------------------------------------------------------------- 1 | 0 2 2 2 | 1 3 1 3 | 2 3 4 4 | 3 3 14 5 | 4 5 3 6 | 5 8 3 7 | 6 8 6 8 | 7 9 8 9 | 8 10 4 10 | 9 10 7 11 | 10 10 10 12 | 11 10 14 13 | 12 11 13 14 | 13 12 8 15 | 14 12 15 16 | 15 14 7 17 | 16 14 9 18 | 17 14 15 19 | 18 15 8 -------------------------------------------------------------------------------- /data/dbscan/input.txt: -------------------------------------------------------------------------------- 1 | 2 2 2 | 3 1 3 | 3 4 4 | 3 14 5 | 5 3 6 | 8 3 7 | 8 6 8 
| 9 8 9 | 10 4 10 | 10 7 11 | 10 10 12 | 10 14 13 | 11 13 14 | 12 8 15 | 12 15 16 | 14 7 17 | 14 9 18 | 14 15 19 | 15 8 -------------------------------------------------------------------------------- /data/em/input.txt: -------------------------------------------------------------------------------- 1 | 3 3 2 | 4 10 3 | 9 6 4 | 14 8 5 | 18 11 6 | 21 7 -------------------------------------------------------------------------------- /data/fptree/testInput.txt: -------------------------------------------------------------------------------- 1 | T1 1 2 5 2 | T2 2 4 3 | T3 2 3 4 | T4 1 2 4 5 | T5 1 3 6 | T6 2 3 7 | T7 1 3 8 | T8 1 2 3 5 9 | T9 1 2 3 -------------------------------------------------------------------------------- /data/gsp/testInput.txt: -------------------------------------------------------------------------------- 1 | 1 2 1 5 2 | 1 1 2 3 | 1 1 3 4 | 1 1 4 5 | 2 1 1 6 | 2 1 3 7 | 2 1 4 8 | 2 2 3 5 9 | 3 1 1 10 | 3 1 2 11 | 3 1 3 12 | 3 1 4 13 | 3 1 5 14 | 4 1 1 15 | 4 1 3 16 | 4 1 5 17 | 5 1 4 18 | 5 1 5 -------------------------------------------------------------------------------- /data/gspan/input.txt: -------------------------------------------------------------------------------- 1 | t # 0 2 | v 0 0 3 | v 1 1 4 | v 2 0 5 | v 3 0 6 | v 4 0 7 | v 5 1 8 | e 0 1 0 9 | e 1 2 0 10 | e 1 3 0 11 | e 2 4 0 12 | e 3 5 1 -------------------------------------------------------------------------------- /data/hits/input.txt: -------------------------------------------------------------------------------- 1 | 1 2 2 | 1 3 3 | 2 3 4 | 3 1 -------------------------------------------------------------------------------- /data/id3/input.txt: -------------------------------------------------------------------------------- 1 | Day OutLook Temperature Humidity Wind PlayTennis 2 | 1 Sunny Hot High Weak No 3 | 2 Sunny Hot High Strong No 4 | 3 Overcast Hot High Weak Yes 5 | 4 Rainy Mild High Weak Yes 6 | 5 Rainy Cool Normal Weak Yes 7 | 6 Rainy Cool Normal Strong No 8 | 
7 Overcast Cool Normal Strong Yes 9 | 8 Sunny Mild High Weak No 10 | 9 Sunny Cool Normal Weak Yes 11 | 10 Rainy Mild Normal Weak Yes 12 | 11 Sunny Mild Normal Strong Yes 13 | 12 Overcast Mild High Strong Yes 14 | 13 Overcast Hot Normal Weak Yes 15 | 14 Rainy Mild High Strong No -------------------------------------------------------------------------------- /data/kdtree/input.txt: -------------------------------------------------------------------------------- 1 | 4 7 2 | 5 4 3 | 9 6 4 | 7 2 5 | 2 3 6 | 8 1 -------------------------------------------------------------------------------- /data/kmeans/input.txt: -------------------------------------------------------------------------------- 1 | 3 3 2 | 4 10 3 | 9 6 4 | 14 8 5 | 18 11 6 | 21 7 -------------------------------------------------------------------------------- /data/knn/testInput.txt: -------------------------------------------------------------------------------- 1 | 1 2 3 2 4 2 | 2 3 4 2 1 3 | 8 7 2 3 5 4 | -3 -2 2 4 0 5 | -4 -4 -4 -4 -4 6 | 1 2 3 4 4 7 | 4 4 3 2 1 8 | 3 3 3 2 4 9 | 0 0 1 1 -2 -------------------------------------------------------------------------------- /data/knn/trainInput.txt: -------------------------------------------------------------------------------- 1 | a 1 2 3 4 5 2 | b 5 4 3 2 1 3 | c 3 3 3 3 3 4 | d -3 -3 -3 -3 -3 5 | a 1 2 3 4 4 6 | b 4 4 3 2 1 7 | c 3 3 3 2 4 8 | d 0 0 1 1 -2 9 | -------------------------------------------------------------------------------- /data/maze/mapData.txt: -------------------------------------------------------------------------------- 1 | 0 0 0 0 0 2 | 2 0 0 -1 0 3 | 0 0 0 0 0 4 | 0 -1 0 0 -1 5 | 0 0 0 0 1 -------------------------------------------------------------------------------- /data/msapriori/testInput.txt: -------------------------------------------------------------------------------- 1 | T1 1 2 5 2 | T2 2 4 3 | T3 2 3 4 | T4 1 2 4 5 | T5 1 3 6 | T6 2 3 7 | T7 1 3 8 | T8 1 2 3 5 9 | T9 1 2 3 
-------------------------------------------------------------------------------- /data/msapriori/testInput2.txt: -------------------------------------------------------------------------------- 1 | Rid Age Income Student CreditRating BuysComputer 2 | 1 Youth High No Fair No 3 | 2 Youth High No Excellent No 4 | 3 MiddleAged High No Fair Yes 5 | 4 Senior Medium No Fair Yes 6 | 5 Senior Low Yes Fair Yes 7 | 6 Senior Low Yes Excellent No 8 | 7 MiddleAged Low Yes Excellent Yes 9 | 8 Youth Medium No Fair No 10 | 9 Youth Low Yes Fair Yes 11 | 10 Senior Medium Yes Fair Yes 12 | 11 Youth Medium Yes Excellent Yes 13 | 12 MiddleAged Medium No Excellent Yes 14 | 13 MiddleAged High Yes Fair Yes 15 | 14 Senior Medium No Excellent No -------------------------------------------------------------------------------- /data/naivebayes/input.txt: -------------------------------------------------------------------------------- 1 | Day OutLook Temperature Humidity Wind PlayTennis 2 | 1 Sunny Hot High Weak No 3 | 2 Sunny Hot High Strong No 4 | 3 Overcast Hot High Weak Yes 5 | 4 Rainy Mild High Weak Yes 6 | 5 Rainy Cool Normal Weak Yes 7 | 6 Rainy Cool Normal Strong No 8 | 7 Overcast Cool Normal Strong Yes 9 | 8 Sunny Mild High Weak No 10 | 9 Sunny Cool Normal Weak Yes 11 | 10 Rainy Mild Normal Weak Yes 12 | 11 Sunny Mild Normal Strong Yes 13 | 12 Overcast Mild High Strong Yes 14 | 13 Overcast Hot Normal Weak Yes 15 | 14 Rainy Mild High Strong No -------------------------------------------------------------------------------- /data/pagerank/input.txt: -------------------------------------------------------------------------------- 1 | 1 2 2 | 1 3 3 | 2 3 4 | 3 1 -------------------------------------------------------------------------------- /data/pca/Makefile: -------------------------------------------------------------------------------- 1 | INPUTS= simple basilevsy compressor_per_day_kwh compressor_1_day_detail 2 | 3 | all: 4 | @for i in $(INPUTS) ; do \ 5 | java -cp 
../target/pca-1.0.jar com.uwemeding.pca.Main $$i ; \ 6 | done 7 | 8 | 9 | clean:; rm -f *lambda* *pcomps* *pfacs* *_cc* *_cumcon* 10 | 11 | -------------------------------------------------------------------------------- /data/pca/basilevsy.data: -------------------------------------------------------------------------------- 1 | 335.6 2 | 245.3 3 | 226 4 | 318.5 5 | 450.8 6 | 508.6 7 | 445.7 8 | 445.1 9 | 472.6 10 | 376. 11 | 319.4 12 | 352.2 13 | 408.5 14 | 314.5 15 | 262.0 16 | 287.8 17 | 320.3 18 | 265.1 19 | 224.7 20 | 248 21 | 304.9 22 | 266.3 23 | 276.5 24 | 300.9 25 | 415.6 26 | 341.5 27 | 289.8 28 | 342.1 29 | 465.5 30 | 488.6 31 | 483.2 32 | 566.2 33 | 636.8 34 | 511 35 | 442.7 36 | 456.7 37 | 478.1 38 | 378.1 39 | 334.6 40 | 360.3 41 | 424.7 42 | 336.5 43 | 328.9 44 | 417.2 45 | 493.4 46 | 457.2 47 | 477.5 48 | 571.5 49 | 847.1 50 | 584.4 51 | 514.2 52 | 503.4 53 | 501.7 54 | 402.0 55 | 373 56 | 376.7 57 | 405.7 58 | 340.3 59 | 341.0 60 | 352.3 61 | 366.0 62 | 312.7 63 | 336.7 64 | 549. 
65 | 632 66 | 577 67 | 574.7 68 | 612.7 69 | 651.7 70 | 584.7 71 | 577.3 72 | 591.7 73 | 632.3 74 | 562.7 75 | 581.7 76 | 608.7 77 | 662.3 78 | 614.3 79 | 639.3 80 | 643.3 81 | 761.7 82 | 789.7 83 | 887.6 84 | 956.2 85 | -------------------------------------------------------------------------------- /data/pca/compressor_per_day_kwh.data: -------------------------------------------------------------------------------- 1 | 49.71 2 | 49.71 3 | 66.85 4 | 63 5 | 50.83 6 | 56.32 7 | 72.6 8 | 57.32 9 | 62.59 10 | 63.77 11 | 52.3 12 | 61.13 13 | 51.95 14 | 52.88 15 | 82.31 16 | 78.95 17 | 48.6 18 | 59.39 19 | 53.15 20 | 51.07 21 | 69.49 22 | 59.64 23 | 69.42 24 | 63.53 25 | 45.46 26 | 49.7 27 | 66.45 28 | 59.93 29 | 49.16 30 | 57.46 31 | 73.2 32 | 73.96 33 | 75.21 34 | 69.14 35 | 71.74 36 | 71.56 37 | 65.69 38 | 78.28 39 | 81.58 40 | 79.3 41 | 87.15 42 | 84.37 43 | 64.88 44 | 74.96 45 | 83.11 46 | 79.55 47 | 74.98 48 | 70.58 49 | 51.26 50 | 60.05 51 | 78.74 52 | 66.67 53 | 54.14 54 | 61.11 55 | 79.62 56 | 73.98 57 | 76.75 58 | 70.02 59 | 71.36 60 | 76.58 61 | 88.28 62 | 84.84 63 | 86.02 64 | 83.16 65 | 85.33 66 | 72.7 67 | 86.17 68 | 85.18 69 | 82.75 70 | 68.04 71 | 77.58 72 | 72.35 73 | 54.76 74 | 64.33 75 | 76.18 76 | 63.9 77 | 51.22 78 | 61.4 79 | 80.38 80 | 73.94 81 | 75.65 82 | 67.71 83 | 71.52 84 | 69.39 85 | 82.43 86 | 85.62 87 | 86.32 88 | 84.03 89 | 86.84 90 | 91.22 91 | 74.63 92 | 74.21 93 | 81.11 94 | 73.22 95 | 70.19 96 | 68.52 97 | 50.24 98 | 49.13 99 | 68.15 100 | 65.49 101 | 58.94 102 | 62.41 103 | 78.54 104 | 75.25 105 | 77.07 106 | 85.93 107 | 74.82 108 | 71.71 109 | 83.85 110 | 86.4 111 | 82.57 112 | 79.8 113 | 83.25 114 | 71.62 115 | 80.17 116 | 80.73 117 | 84.12 118 | 79.11 119 | 76.92 120 | 65.89 121 | 52.59 122 | 50.17 123 | 70.89 124 | 67.02 125 | 54.84 126 | 62.24 127 | 80.07 128 | 76.92 129 | 75.2 130 | 69 131 | 69.17 132 | 69.82 133 | 83.9 134 | 82.93 135 | 85.61 136 | 81.07 137 | 82.83 138 | 69.2 139 | 70.42 140 | 67.16 141 | 82.06 142 | 75.12 
143 | 75.48 144 | 67.02 145 | 51.69 146 | 63.76 147 | 76.22 148 | 65.95 149 | 49.61 150 | 60.49 151 | 77.93 152 | 67.68 153 | 72.95 154 | 65.82 155 | 50.18 156 | 61.66 157 | 51.29 158 | 50.5 159 | 83.48 160 | 73.63 161 | 60.27 162 | 62.76 163 | 52.41 164 | 52.61 165 | 67.24 166 | 64.84 167 | 72.24 168 | 64.17 169 | -------------------------------------------------------------------------------- /data/pca/simple.data: -------------------------------------------------------------------------------- 1 | 1 2 | 2 3 | 3 4 | 4 5 | 5 6 | 6 7 | -------------------------------------------------------------------------------- /data/prefixspan/input.txt: -------------------------------------------------------------------------------- 1 | bd c b ac 2 | bf ce b fg 3 | ah bf a b f 4 | be ce d 5 | a bd b c b ade -------------------------------------------------------------------------------- /data/randomforest/input.txt: -------------------------------------------------------------------------------- 1 | Rid Age Income Student CreditRating BuysComputer 2 | 1 Youth High No Fair No 3 | 2 Youth High No Excellent No 4 | 3 MiddleAged High No Fair Yes 5 | 4 Senior Medium No Fair Yes 6 | 5 Senior Low Yes Fair Yes 7 | 6 Senior Low Yes Excellent No 8 | 7 MiddleAged Low Yes Excellent Yes 9 | 8 Youth Medium No Fair No 10 | 9 Youth Low Yes Fair Yes 11 | 10 Senior Medium Yes Fair Yes 12 | 11 Youth Medium Yes Excellent Yes 13 | 12 MiddleAged Medium No Excellent Yes 14 | 13 MiddleAged High Yes Fair Yes 15 | 14 Senior Medium No Excellent No -------------------------------------------------------------------------------- /data/roughsets/input.txt: -------------------------------------------------------------------------------- 1 | Element Color Shape Size Stability 2 | x1 Red Triangle Large Stable 3 | x2 Red Triangle Large Stable 4 | x3 Yellow Circle Small UnStable 5 | x4 Yellow Circle Small UnStable 6 | x5 Blue Rectangle Large Stable 7 | x6 Red Circle Middle UnStable 8 | x7 Blue Circle Small 
UnStable 9 | x8 Blue Rectangle Middle UnStable -------------------------------------------------------------------------------- /data/tan/input.txt: -------------------------------------------------------------------------------- 1 | OutLook Temperature Humidity Wind PlayTennis 2 | Sunny Hot High Weak No 3 | Sunny Hot High Strong No 4 | Overcast Hot High Weak Yes 5 | Rainy Mild High Weak Yes 6 | Rainy Cool Normal Weak Yes 7 | Rainy Cool Normal Strong No 8 | Overcast Cool Normal Strong Yes 9 | Sunny Mild High Weak No 10 | Sunny Cool Normal Weak Yes 11 | Rainy Mild Normal Weak Yes 12 | Sunny Mild Normal Strong Yes 13 | Overcast Mild High Strong Yes 14 | Overcast Hot Normal Weak Yes 15 | Rainy Mild High Strong No -------------------------------------------------------------------------------- /data/viterbi/humidity-matrix.txt: -------------------------------------------------------------------------------- 1 | # Dry Dryish Damp Soggy 2 | Sunny 0.6 0.2 0.15 0.05 3 | Cloudy 0.25 0.25 0.25 0.25 4 | Rainy 0.05 0.10 0.35 0.50 -------------------------------------------------------------------------------- /data/viterbi/stmatrix.txt: -------------------------------------------------------------------------------- 1 | # Sunny Cloudy Rainy 2 | Sunny 0.5 0.375 0.125 3 | Cloudy 0.25 0.125 0.625 4 | Rainy 0.25 0.375 0.375 -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | com.jusdt 6 | datamining-18algorithms 7 | 1.0.0 8 | DataMining 18 Algorithms 9 | 10 | 11 | UTF-8 12 | 1.1.7 13 | 14 | 15 | 16 | 17 | 18 | info.bbd 19 | common-utils 20 | 1.0.0 21 | 22 | 23 | com.github.jnr 24 | jnr-posix 25 | 26 | 27 | 28 | 29 | 30 | ch.qos.logback 31 | logback-classic 32 | ${logback.version} 33 | 34 | 35 | ch.qos.logback 36 | logback-core 37 | ${logback.version} 38 | 39 | 40 | ch.qos.logback 41 | logback-access 42 | ${logback.version} 43 | 44 | 45 | 
org.slf4j 46 | slf4j-api 47 | 1.7.21 48 | 49 | 50 | 51 | com.google.guava 52 | guava 53 | 14.0.1 54 | 55 | 56 | 57 | junit 58 | junit 59 | 4.10 60 | test 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | org.apache.maven.plugins 70 | maven-compiler-plugin 71 | 3.6.1 72 | 73 | true 74 | 1.8 75 | 1.8 76 | UTF-8 77 | 1.8 78 | 79 | 80 | 81 | org.apache.maven.plugins 82 | maven-source-plugin 83 | 3.0.1 84 | 85 | 86 | attach-sources 87 | verify 88 | 89 | jar-no-fork 90 | 91 | 92 | 93 | 94 | 95 | org.apache.maven.plugins 96 | maven-resources-plugin 97 | 3.0.2 98 | 99 | UTF-8 100 | 101 | 102 | 103 | 108 | 109 | org.apache.maven.plugins 110 | maven-assembly-plugin 111 | 2.4 112 | 113 | 114 | 115 | com.jusdt.zcm.mapred.driver.ZcmDriver 116 | 117 | 118 | 119 | jar-with-dependencies 120 | 121 | 122 | 123 | 124 | make-assembly 125 | package 126 | 127 | single 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | src/main/resources 138 | 139 | *.* 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | ${project.artifactId}-${project.version} 149 | 150 | 151 | 152 | -------------------------------------------------------------------------------- /src/main/assembly/distribution.xml: -------------------------------------------------------------------------------- 1 | 2 | 6 | distribution 7 | 8 | tar.gz 9 | 10 | ${project.artifactId} 11 | 12 | 13 | src/main/resources 14 | 15 | logback.xml 16 | conf.properties 17 | utils.properties 18 | 19 | /conf 20 | true 21 | 22 | 23 | src/main/bin 24 | 25 | * 26 | 27 | /bin 28 | 0755 29 | 30 | 31 | 32 | 33 | /lib 34 | 35 | 36 | -------------------------------------------------------------------------------- /src/main/bin/ctl.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mainClass=com.jusdt.zcm.mapred.driver.ZcmDriver 4 | 5 | # resolve links - $0 may be a softlink 6 | PRG="$0" 7 | 8 | while [ -h "$PRG" ]; do 9 | ls=`ls -ld "$PRG"` 10 | link=`expr "$ls" : '.*-> \(.*\)$'` 11 | if expr 
"$link" : '/.*' > /dev/null; then 12 | PRG="$link" 13 | else 14 | PRG=`dirname "$PRG"`/"$link" 15 | fi 16 | done 17 | 18 | # Get standard environment variables 19 | PRGDIR=`dirname "$PRG"` 20 | 21 | PROJECT_DIR=`cd "$PRGDIR/.." >/dev/null; pwd` 22 | echo PROJECT_DIR=$PROJECT_DIR 23 | 24 | CLASSPATH="$CLASSHPATH:$PROJECT_DIR/conf" 25 | 26 | for jar in "$PROJECT_DIR/lib"/*.jar; do 27 | CLASSPATH="$CLASSPATH:$jar" 28 | done 29 | echo CLASSPATH=$CLASSPATH 30 | 31 | JVMARGS="${JVMARGS} -Dproject_dir=${PROJECT_DIR} -Djava.net.preferIPv4Stack=true" 32 | echo JVMARGS=$JVMARGS 33 | 34 | usage() { 35 | echo >&2 "usage: $PRG [args]" 36 | echo 'Valid commands: start, stop' 37 | exit 1 38 | } 39 | 40 | start() { 41 | JAVA=${JAVA-'java'} 42 | exec $JAVA $JVMARGS -classpath "$CLASSPATH" $mainClass "$@" & 43 | echo $! > main.pid 44 | } 45 | 46 | stop() { 47 | kill `cat main.pid` > /dev/null 48 | } 49 | 50 | case $1 in 51 | (start) 52 | shift 53 | start $@ 54 | ;; 55 | (stop) 56 | stop 57 | ;; 58 | (restart) 59 | stop 60 | shift 61 | start $@ 62 | ;; 63 | (*) 64 | echo >&2 "$PRG: error: unknown command '$1'" 65 | usage 66 | ;; 67 | esac 68 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/association/analysis/apriori/AprioriExample.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.association.analysis.apriori; 2 | 3 | /** 4 | * apriori关联规则挖掘算法调用类 5 | */ 6 | public class AprioriExample { 7 | 8 | public static void main(String[] args) { 9 | String filePath = "data/apriori/testInput.txt"; 10 | 11 | AprioriCore tool = new AprioriCore(filePath, 2); 12 | tool.printAttachRule(0.7); 13 | } 14 | 15 | } 16 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/association/analysis/apriori/FrequentItem.java: -------------------------------------------------------------------------------- 1 | 
package com.jusdt.datamining.association.analysis.apriori; 2 | 3 | /** 4 | * 频繁项集 5 | */ 6 | public class FrequentItem implements Comparable { 7 | 8 | // 频繁项集的集合ID 9 | private String[] idArray; 10 | // 频繁项集的支持度计数 11 | private int count; 12 | //频繁项集的长度,1项集或是2项集,亦或是3项集 13 | private int length; 14 | 15 | public FrequentItem(String[] idArray, int count) { 16 | this.idArray = idArray; 17 | this.count = count; 18 | length = idArray.length; 19 | } 20 | 21 | public String[] getIdArray() { 22 | return idArray; 23 | } 24 | 25 | public void setIdArray(String[] idArray) { 26 | this.idArray = idArray; 27 | } 28 | 29 | public int getCount() { 30 | return count; 31 | } 32 | 33 | public void setCount(int count) { 34 | this.count = count; 35 | } 36 | 37 | public int getLength() { 38 | return length; 39 | } 40 | 41 | public void setLength(int length) { 42 | this.length = length; 43 | } 44 | 45 | @Override 46 | public int compareTo(FrequentItem o) { 47 | // TODO Auto-generated method stub 48 | Integer int1 = Integer.parseInt(this.getIdArray()[0]); 49 | Integer int2 = Integer.parseInt(o.getIdArray()[0]); 50 | 51 | return int1.compareTo(int2); 52 | } 53 | 54 | } 55 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/association/analysis/fptree/FPTreeExample.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.association.analysis.fptree; 2 | 3 | /** 4 | * FPTree频繁模式树算法 5 | */ 6 | public class FPTreeExample { 7 | 8 | public static void main(String[] args) { 9 | String filePath = "data/fptree/testInput.txt"; 10 | //最小支持度阈值 11 | int minSupportCount = 2; 12 | 13 | FPTreeCore tool = new FPTreeCore(filePath, minSupportCount); 14 | tool.startBuildingTree(); 15 | } 16 | 17 | } 18 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/association/analysis/fptree/TreeNode.java: 
package com.jusdt.datamining.association.analysis.fptree;

import java.util.ArrayList;

/**
 * A node of the FP (frequent-pattern) tree.
 *
 * Nodes carry an item name and a support count, plus parent/children links.
 * The natural ordering (compareTo) sorts nodes by DESCENDING count, which is
 * the order FP-growth inserts items in.
 */
public class TreeNode implements Comparable<TreeNode>, Cloneable {

	// Item (category) name held by this node
	private String name;
	// Support count of the item along the current tree path
	private Integer count;
	// Parent node; null for the tree root
	private TreeNode parentNode;
	// Child nodes; a node may have several children
	private ArrayList<TreeNode> childNodes;

	public TreeNode(String name, int count) {
		this.name = name;
		this.count = count;
	}

	public String getName() {
		return name;
	}

	public void setName(String name) {
		this.name = name;
	}

	public Integer getCount() {
		return count;
	}

	public void setCount(Integer count) {
		this.count = count;
	}

	public TreeNode getParentNode() {
		return parentNode;
	}

	public void setParentNode(TreeNode parentNode) {
		this.parentNode = parentNode;
	}

	public ArrayList<TreeNode> getChildNodes() {
		return childNodes;
	}

	public void setChildNodes(ArrayList<TreeNode> childNodes) {
		this.childNodes = childNodes;
	}

	/**
	 * Descending order by count: the node with the HIGHER count sorts first.
	 */
	@Override
	public int compareTo(TreeNode o) {
		return o.getCount().compareTo(this.getCount());
	}

	/**
	 * Clones this node. The parent chain is cloned recursively because the
	 * node holds object references.
	 *
	 * NOTE(review): ArrayList.clone() produces a new list but does NOT clone
	 * the child TreeNode elements — children are shared with the original.
	 * The original comment claimed a deep copy; callers relying on fully
	 * independent subtrees should verify this is acceptable.
	 */
	@Override
	protected Object clone() throws CloneNotSupportedException {
		TreeNode node = (TreeNode) super.clone();
		if (this.getParentNode() != null) {
			node.setParentNode((TreeNode) this.getParentNode().clone());
		}

		if (this.getChildNodes() != null) {
			node.setChildNodes((ArrayList<TreeNode>) this.getChildNodes().clone());
		}

		return node;
	}

}
/src/main/java/com/jusdt/datamining/bagging/boosting/adaboost/AdaBoostExample.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.bagging.boosting.adaboost; 2 | 3 | /** 4 | * AdaBoost提升算法调用类 5 | */ 6 | public class AdaBoostExample { 7 | 8 | public static void main(String[] agrs) { 9 | String filePath = "data/adaboost/input.txt"; 10 | //误差率阈值 11 | double errorValue = 0.2; 12 | 13 | AdaBoostCore tool = new AdaBoostCore(filePath, errorValue); 14 | tool.adaBoostClassify(); 15 | } 16 | 17 | } 18 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/bagging/boosting/adaboost/Point.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.bagging.boosting.adaboost; 2 | 3 | /** 4 | * 坐标点类 5 | */ 6 | public class Point { 7 | 8 | // 坐标点x坐标 9 | private int x; 10 | // 坐标点y坐标 11 | private int y; 12 | // 坐标点的分类类别 13 | private int classType; 14 | //如果此节点被划错,他的误差率,不能用个数除以总数,因为不同坐标点的权重不一定相等 15 | private double probably; 16 | 17 | public Point(int x, int y, int classType) { 18 | this.x = x; 19 | this.y = y; 20 | this.classType = classType; 21 | } 22 | 23 | public Point(String x, String y, String classType) { 24 | this.x = Integer.parseInt(x); 25 | this.y = Integer.parseInt(y); 26 | this.classType = Integer.parseInt(classType); 27 | } 28 | 29 | public int getX() { 30 | return x; 31 | } 32 | 33 | public void setX(int x) { 34 | this.x = x; 35 | } 36 | 37 | public int getY() { 38 | return y; 39 | } 40 | 41 | public void setY(int y) { 42 | this.y = y; 43 | } 44 | 45 | public int getClassType() { 46 | return classType; 47 | } 48 | 49 | public void setClassType(int classType) { 50 | this.classType = classType; 51 | } 52 | 53 | public double getProbably() { 54 | return probably; 55 | } 56 | 57 | public void setProbably(double probably) { 58 | this.probably = probably; 59 | } 60 | } 61 | 
package com.jusdt.datamining.classification.cart;

import java.util.ArrayList;

/**
 * A node of the CART regression/classification tree.
 *
 * Plain data holder: attribute metadata, pruning statistics, the parent's
 * splitting value, child links, and the indices of the data records routed
 * to this node.
 */
public class AttrNode {

	// Attribute name at this node
	private String attrName;
	// Index label of this node
	private int nodeIndex;
	// Number of leaves contained under this node
	private int leafNum;
	// Error rate of this node (used during pruning)
	private double alpha;
	// The attribute value of the parent split that led here
	private String parentAttrValue;
	// Child nodes
	private AttrNode[] childAttrNode;
	// Indices of the data records at this node
	// NOTE(review): element type of this list is declared elsewhere (CARTCore);
	// generics appear stripped by the dump — kept raw to avoid guessing.
	private ArrayList dataIndex;

	public String getAttrName() {
		return attrName;
	}

	public void setAttrName(String attrName) {
		this.attrName = attrName;
	}

	public int getNodeIndex() {
		return nodeIndex;
	}

	public void setNodeIndex(int nodeIndex) {
		this.nodeIndex = nodeIndex;
	}

	public int getLeafNum() {
		return leafNum;
	}

	public void setLeafNum(int leafNum) {
		this.leafNum = leafNum;
	}

	public double getAlpha() {
		return alpha;
	}

	public void setAlpha(double alpha) {
		this.alpha = alpha;
	}

	public String getParentAttrValue() {
		return parentAttrValue;
	}

	public void setParentAttrValue(String parentAttrValue) {
		this.parentAttrValue = parentAttrValue;
	}

	public AttrNode[] getChildAttrNode() {
		return childAttrNode;
	}

	public void setChildAttrNode(AttrNode[] childAttrNode) {
		this.childAttrNode = childAttrNode;
	}

	public ArrayList getDataIndex() {
		return dataIndex;
	}

	public void setDataIndex(ArrayList dataIndex) {
		this.dataIndex = dataIndex;
	}

}
// ===== src/main/java/com/jusdt/datamining/classification/cart/CARTExample.java =====

package com.jusdt.datamining.classification.cart;

/**
 * Driver class for the CART decision-tree algorithm.
 */
public class CARTExample {

	public static void main(String[] args) {
		String filePath = "data/cart/input.txt";

		CARTCore tool = new CARTCore(filePath);

		tool.startBuildingTree();
	}

}

// ===== src/main/java/com/jusdt/datamining/classification/id3/AttrNode.java =====

package com.jusdt.datamining.classification.id3;

import java.util.ArrayList;

/**
 * Internal (non-leaf) attribute node of the ID3 decision tree.
 */
public class AttrNode {

	// Name of the attribute at this node
	private String attrName;
	// Attribute value of the parent split that led here
	private String parentAttrValue;
	// Child attribute nodes
	private AttrNode[] childAttrNode;
	// Indices of leaf data records under this node
	// NOTE(review): element type is determined by ID3Core; generics appear
	// stripped by the dump — kept raw to avoid guessing.
	private ArrayList childDataIndex;

	public String getAttrName() {
		return attrName;
	}

	public void setAttrName(String attrName) {
		this.attrName = attrName;
	}

	public String getParentAttrValue() {
		return parentAttrValue;
	}

	public void setParentAttrValue(String parentAttrValue) {
		this.parentAttrValue = parentAttrValue;
	}

	public AttrNode[] getChildAttrNode() {
		return childAttrNode;
	}

	public void setChildAttrNode(AttrNode[] childAttrNode) {
		this.childAttrNode = childAttrNode;
	}

	public ArrayList getChildDataIndex() {
		return childDataIndex;
	}

	public void setChildDataIndex(ArrayList childDataIndex) {
		this.childDataIndex = childDataIndex;
	}

}
// ===== src/main/java/com/jusdt/datamining/classification/id3/DataNode.java =====

package com.jusdt.datamining.classification.id3;

/**
 * Leaf node of the ID3 decision tree holding a single data-record index.
 */
public class DataNode {

	// Index of the record in the training data
	private int dataIndex;

	public DataNode(int dataIndex) {
		this.dataIndex = dataIndex;
	}

	// Accessor added: the field was otherwise write-only.
	public int getDataIndex() {
		return dataIndex;
	}

}

// ===== src/main/java/com/jusdt/datamining/classification/id3/ID3Example.java =====

package com.jusdt.datamining.classification.id3;

/**
 * Driver class for the ID3 decision-tree classification algorithm.
 */
public class ID3Example {

	public static void main(String[] args) {
		String filePath = "data/id3/input.txt";

		ID3Core tool = new ID3Core(filePath);
		tool.startBuildingTree(true);
	}

}

// ===== src/main/java/com/jusdt/datamining/classification/knn/KNNCore.java =====

package com.jusdt.datamining.classification.knn;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;

/**
 * K-nearest-neighbour classifier.
 *
 * Training and test records are read from whitespace-separated text files;
 * the first column of a training record is its class label, the remaining
 * columns are integer features.
 */
public class KNNCore {

	// Per-class weights for the 4 classes; defaults to equal weighting.
	// NOTE(review): assumes exactly 4 distinct class labels in the training
	// data — verify against the input file.
	public int[] classWeightArray = new int[] { 1, 1, 1, 1 };
	// Test data file path
	private String testDataPath;
	// Training data file path
	private String trainDataPath;
	// Distinct class labels seen in the training data
	private ArrayList<String> classTypes;
	// Classified test samples (the output)
	private ArrayList<Sample> resultSamples;
	// Training samples
	private ArrayList<Sample> trainSamples;
	// Raw training records
	private String[][] trainData;
	// Raw test records
	private String[][] testData;

	public KNNCore(String trainDataPath, String testDataPath) {
		this.trainDataPath = trainDataPath;
		this.testDataPath = testDataPath;
		readDataFormFile();
	}

	/**
	 * Reads the training and test data sets from their files and collects
	 * the distinct class labels from the training data.
	 */
	private void readDataFormFile() {
		ArrayList<String[]> tempArray;

		tempArray = fileDataToArray(trainDataPath);
		trainData = new String[tempArray.size()][];
		tempArray.toArray(trainData);

		classTypes = new ArrayList<>();
		for (String[] s : tempArray) {
			if (!classTypes.contains(s[0])) {
				// Record a class label not seen before
				classTypes.add(s[0]);
			}
		}

		tempArray = fileDataToArray(testDataPath);
		testData = new String[tempArray.size()][];
		tempArray.toArray(testData);
	}

	/**
	 * Loads a whitespace-separated data file into a list of token arrays.
	 *
	 * @param filePath path of the data file
	 * @return one String[] per input line
	 */
	private ArrayList<String[]> fileDataToArray(String filePath) {
		File file = new File(filePath);
		ArrayList<String[]> dataArray = new ArrayList<>();

		// try-with-resources guarantees the reader is closed on error paths
		try (BufferedReader in = new BufferedReader(new FileReader(file))) {
			String str;
			while ((str = in.readLine()) != null) {
				dataArray.add(str.split(" "));
			}
		} catch (IOException e) {
			// Fixed: was e.getStackTrace(), which silently discards the trace
			e.printStackTrace();
		}

		return dataArray;
	}

	/**
	 * Computes the SQUARED Euclidean distance between two samples' feature
	 * vectors. The square root is never taken: ranking by squared distance
	 * is equivalent for nearest-neighbour selection.
	 *
	 * @param s1 first sample
	 * @param s2 second sample
	 * @return squared Euclidean distance
	 */
	private int computeEuclideanDistance(Sample s1, Sample s2) {
		String[] f1 = s1.getFeatures();
		String[] f2 = s2.getFeatures();
		int distance = 0;

		for (int i = 0; i < f1.length; i++) {
			int subF1 = Integer.parseInt(f1[i]);
			int subF2 = Integer.parseInt(f2[i]);

			distance += (subF1 - subF2) * (subF1 - subF2);
		}

		return distance;
	}

	/**
	 * Classifies every test sample by (weighted) majority vote among its k
	 * nearest training samples, printing each result.
	 *
	 * @param k neighbourhood size
	 */
	public void knnCompute(int k) {
		String className = "";
		String[] tempF = null;
		Sample temp;
		resultSamples = new ArrayList<>();
		trainSamples = new ArrayList<>();
		// Vote tally per class label
		HashMap<String, Integer> classCount;
		// Vote weight per class label
		HashMap<String, Integer> classWeight = new HashMap<>();

		// Wrap the test records as (unclassified) result samples
		for (String[] s : testData) {
			temp = new Sample(s);
			resultSamples.add(temp);
		}

		// Wrap the training records: column 0 is the label, the rest features
		for (String[] s : trainData) {
			className = s[0];
			tempF = new String[s.length - 1];
			System.arraycopy(s, 1, tempF, 0, s.length - 1);
			temp = new Sample(className, tempF);
			trainSamples.add(temp);
		}

		// The k training samples nearest to the current test sample
		ArrayList<Sample> kNNSample = new ArrayList<>();
		for (Sample s : resultSamples) {
			classCount = new HashMap<>();
			int index = 0;
			for (String type : classTypes) {
				// Start every class at zero votes
				classCount.put(type, 0);
				classWeight.put(type, classWeightArray[index++]);
			}

			// Distance of every training sample to this test sample
			for (Sample tS : trainSamples) {
				int dis = computeEuclideanDistance(s, tS);
				tS.setDistance(dis);
			}

			// Sample.compareTo orders by ascending distance
			Collections.sort(trainSamples);
			kNNSample.clear();
			// Keep only the k nearest
			for (int i = 0; i < trainSamples.size(); i++) {
				if (i < k) {
					kNNSample.add(trainSamples.get(i));
				} else {
					break;
				}
			}

			// Weighted vote: each neighbour adds its class's weight
			for (Sample s1 : kNNSample) {
				int num = classCount.get(s1.getClassName());
				num += classWeight.get(s1.getClassName());
				classCount.put(s1.getClassName(), num);
			}

			// Pick the class with the strictly largest tally
			int maxCount = 0;
			for (Map.Entry<String, Integer> entry : classCount.entrySet()) {
				if (entry.getValue() > maxCount) {
					maxCount = entry.getValue();
					s.setClassName(entry.getKey());
				}
			}

			System.out.print("测试数据特征:");
			for (String s1 : s.getFeatures()) {
				System.out.print(s1 + " ");
			}
			System.out.println("分类:" + s.getClassName());
		}
	}
}

// ===== src/main/java/com/jusdt/datamining/classification/knn/KNNExample.java =====

package com.jusdt.datamining.classification.knn;

/**
 * Driver class for the k-nearest-neighbour algorithm.
 */
public class KNNExample {

	public static void main(String[] args) {
		String trainDataPath = "data/knn/trainInput.txt";
		// Fixed: was "data/knn/testinput.txt", which fails on case-sensitive
		// file systems — the repository file is data/knn/testInput.txt.
		String testDataPath = "data/knn/testInput.txt";

		KNNCore tool = new KNNCore(trainDataPath, testDataPath);
		tool.knnCompute(3);
	}

}

// ===== src/main/java/com/jusdt/datamining/classification/knn/Sample.java =====

package com.jusdt.datamining.classification.knn;

/**
 * A sample record: class label, feature vector, and the distance to the
 * current test sample (used for sorting neighbours).
 */
public class Sample implements Comparable<Sample> {

	// Class label of this sample (null for unclassified test samples)
	private String className;
	// Feature vector (string-encoded integers)
	private String[] features;
	// Distance to the test sample being classified; basis of the ordering
	private Integer distance;

	public Sample(String[] features) {
		this.features = features;
	}

	public Sample(String className, String[] features) {
		this.className = className;
		this.features = features;
	}

	public String getClassName() {
		return className;
	}

	public void setClassName(String className) {
		this.className = className;
	}

	public String[] getFeatures() {
		return features;
	}

	public void setFeatures(String[] features) {
		this.features = features;
	}

	public Integer getDistance() {
		return distance;
	}

	public void setDistance(int distance) {
		this.distance = distance;
	}

	/**
	 * Ascending order by distance (nearest first).
	 */
	@Override
	public int compareTo(Sample o) {
		return this.getDistance().compareTo(o.getDistance());
	}

}

// ===== src/main/java/com/jusdt/datamining/classification/naivebayes/NaiveBayesCore.java =====

package com.jusdt.datamining.classification.naivebayes;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;

/**
 * Naive Bayes classifier for a two-class ("Yes"/"No") data set.
 *
 * The training file's first row holds the attribute names; the last column
 * is the class label.
 */
public class NaiveBayesCore {

	// The two class labels of this data set
	private String YES = "Yes";
	private String NO = "No";

	// Path of the labelled training data file
	private String filePath;
	// Attribute names (header row)
	private String[] attrNames;
	// Training records, including the header row at index 0
	private String[][] data;

	// All observed values for each attribute
	private HashMap<String, ArrayList<String>> attrValue;

	public NaiveBayesCore(String filePath) {
		this.filePath = filePath;

		readDataFile();
		initAttrValue();
	}

	/**
	 * Reads the training data from the file; row 0 becomes the attribute
	 * name header.
	 */
	private void readDataFile() {
		File file = new File(filePath);
		ArrayList<String[]> dataArray = new ArrayList<>();

		try (BufferedReader in = new BufferedReader(new FileReader(file))) {
			String str;
			while ((str = in.readLine()) != null) {
				dataArray.add(str.split(" "));
			}
		} catch (IOException e) {
			// Fixed: was e.getStackTrace(), which silently discards the trace
			e.printStackTrace();
		}

		data = new String[dataArray.size()][];
		dataArray.toArray(data);
		attrNames = data[0];
	}

	/**
	 * Collects, for every attribute column (skipping column 0), the set of
	 * distinct values seen in the training data.
	 */
	private void initAttrValue() {
		attrValue = new HashMap<>();
		ArrayList<String> tempValues;

		// Walk the columns left to right, starting after the first column
		for (int j = 1; j < attrNames.length; j++) {
			tempValues = new ArrayList<>();
			for (int i = 1; i < data.length; i++) {
				if (!tempValues.contains(data[i][j])) {
					// First time this value is seen in the column
					tempValues.add(data[i][j]);
				}
			}

			attrValue.put(data[0][j], tempValues);
		}
	}

	/**
	 * P(condition | classType): the probability of the attribute condition
	 * given the class. With condition == null, returns the prior P(classType).
	 *
	 * @param condition attribute value, or null for the class prior
	 * @param classType class label (YES or NO)
	 * @return the (conditional) probability
	 */
	private double computeConditionProbably(String condition, String classType) {
		int count = 0;
		int attrIndex = 1;
		// Records labelled YES
		ArrayList<String[]> yClassData = new ArrayList<>();
		// Records labelled NO
		ArrayList<String[]> nClassData = new ArrayList<>();
		ArrayList<String[]> classData;

		for (int i = 1; i < data.length; i++) {
			// Partition the records by their class label (last column)
			if (data[i][attrNames.length - 1].equals(YES)) {
				yClassData.add(data[i]);
			} else {
				nClassData.add(data[i]);
			}
		}

		if (classType.equals(YES)) {
			classData = yClassData;
		} else {
			classData = nClassData;
		}

		// No condition: plain class prior P(classType)
		if (condition == null) {
			return 1.0 * classData.size() / (data.length - 1);
		}

		// Locate the attribute column this condition value belongs to
		attrIndex = getConditionAttrName(condition);

		for (String[] s : classData) {
			if (s[attrIndex].equals(condition)) {
				count++;
			}
		}

		return 1.0 * count / classData.size();
	}

	/**
	 * Returns the column index of the attribute that the condition value
	 * belongs to.
	 *
	 * NOTE(review): "BuysComputer" is this data set's label column name,
	 * hard-coded to exclude the label attribute from the search — verify
	 * against data/naivebayes/input.txt if the data set changes.
	 *
	 * @param condition attribute value
	 * @return column index of the owning attribute
	 */
	private int getConditionAttrName(String condition) {
		String attrName = "";
		int attrIndex = 1;
		ArrayList<String> valueTypes;

		for (Map.Entry<String, ArrayList<String>> entry : attrValue.entrySet()) {
			valueTypes = entry.getValue();
			if (valueTypes.contains(condition) && !entry.getKey().equals("BuysComputer")) {
				attrName = entry.getKey();
			}
		}

		for (int i = 0; i < attrNames.length - 1; i++) {
			if (attrNames[i].equals(attrName)) {
				attrIndex = i;
				break;
			}
		}

		return attrIndex;
	}

	/**
	 * Classifies a space-separated feature string with naive Bayes, using
	 * P(X|Ci)*P(Ci) and the class-conditional-independence assumption.
	 *
	 * @param data space-separated attribute values of the record to classify
	 * @return the predicted class label (YES or NO)
	 */
	public String naiveBayesClassificate(String data) {
		String[] dataFeatures;
		// P(X | Yes) and P(X | No), accumulated multiplicatively because
		// the attributes are assumed class-conditionally independent
		double xWhenYes = 1.0;
		double xWhenNo = 1.0;
		// Final scores P(X|Ci) * P(Ci)
		double pYes = 1;
		double pNo = 1;

		dataFeatures = data.split(" ");
		for (int i = 0; i < dataFeatures.length; i++) {
			xWhenYes *= computeConditionProbably(dataFeatures[i], YES);
			xWhenNo *= computeConditionProbably(dataFeatures[i], NO);
		}

		pYes = xWhenYes * computeConditionProbably(null, YES);
		pNo = xWhenNo * computeConditionProbably(null, NO);

		return (pYes > pNo ? YES : NO);
	}

}

// ===== src/main/java/com/jusdt/datamining/classification/naivebayes/NaiveBayesExample.java =====

package com.jusdt.datamining.classification.naivebayes;

/**
 * Driver class for the naive Bayes algorithm.
 */
public class NaiveBayesExample {

	public static void main(String[] args) {
		// Training data
		String filePath = "data/naivebayes/input.txt";
		String testData = "Youth Medium Yes Fair";
		NaiveBayesCore tool = new NaiveBayesCore(filePath);
		System.out.println(testData + " 数据的分类为:" + tool.naiveBayesClassificate(testData));
	}

}

// ===== src/main/java/com/jusdt/datamining/clustering/birch/BIRCHCore.java =====

package com.jusdt.datamining.clustering.birch;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.text.MessageFormat;
import java.util.ArrayList;
import java.util.LinkedList;

/**
 * BIRCH clustering: builds a CF (clustering-feature) tree from the input
 * records and prints it level by level.
 */
public class BIRCHCore {

	// Display names for the node kinds
	public static final String NON_LEAFNODE = "【NonLeafNode】";
	public static final String LEAFNODE = "【LeafNode】";
	public static final String CLUSTER = "【Cluster】";

	// Path of the input data file
	private String filePath;
	// Internal-node branching factor B
	// NOTE(review): these tree parameters are static, so concurrently built
	// BIRCHCore instances would share them — confirm single-instance use.
	public static int B;
	// Leaf-node branching factor L
	public static int L;
	// Cluster diameter threshold T
	public static double T;
	// All input records
	private ArrayList<String[]> totalDataRecords;

	public BIRCHCore(String filePath, int B, int L, double T) {
		this.filePath = filePath;
		// Fixed: statics were assigned through `this.`, which compiles but
		// misleadingly suggests instance state.
		BIRCHCore.B = B;
		BIRCHCore.L = L;
		BIRCHCore.T = T;
		readDataFile();
	}

	/**
	 * Reads the whitespace-separated input records from the file.
	 */
	private void readDataFile() {
		File file = new File(filePath);
		ArrayList<String[]> dataArray = new ArrayList<>();

		try (BufferedReader in = new BufferedReader(new FileReader(file))) {
			String str;
			while ((str = in.readLine()) != null) {
				dataArray.add(str.split(" "));
			}
		} catch (IOException e) {
			// Fixed: was e.getStackTrace(), which silently discards the trace
			e.printStackTrace();
		}

		totalDataRecords = new ArrayList<>(dataArray);
	}

	/**
	 * Builds the CF tree by inserting every record and returns its root.
	 *
	 * NOTE(review): throws NullPointerException if the input file is empty
	 * (`cluster` stays null) — behaviour kept as in the original.
	 *
	 * @return the root of the CF tree
	 */
	private ClusteringFeature buildCFTree() {
		NonLeafNode rootNode = null;
		LeafNode leafNode = null;
		Cluster cluster = null;

		for (String[] record : totalDataRecords) {
			cluster = new Cluster(record);

			if (rootNode == null) {
				// The CF tree still consists of a single leaf node
				if (leafNode == null) {
					leafNode = new LeafNode();
				}
				leafNode.addingCluster(cluster);
				if (leafNode.getParentNode() != null) {
					rootNode = leafNode.getParentNode();
				}
			} else {
				if (rootNode.getParentNode() != null) {
					rootNode = rootNode.getParentNode();
				}

				// Descend from the root to the closest target leaf
				LeafNode temp = rootNode.findedClosestNode(cluster);
				temp.addingCluster(cluster);
			}
		}

		// Walk back up from the last cluster to the topmost node
		LeafNode node = cluster.getParentNode();
		NonLeafNode upNode = node.getParentNode();
		if (upNode == null) {
			return node;
		} else {
			while (upNode.getParentNode() != null) {
				upNode = upNode.getParentNode();
			}

			return upNode;
		}
	}

	/**
	 * Builds the CF tree, assigns node depths, and prints the tree.
	 */
	public void startBuilding() {
		// Depth of the root level
		int level = 1;
		ClusteringFeature rootNode = buildCFTree();

		setTreeLevel(rootNode, level);
		showCFTree(rootNode);
	}

	/**
	 * Recursively assigns a depth to every node of the tree.
	 *
	 * @param clusteringFeature current node
	 * @param level current depth value
	 */
	private void setTreeLevel(ClusteringFeature clusteringFeature, int level) {
		LeafNode leafNode = null;
		NonLeafNode nonLeafNode = null;

		if (clusteringFeature instanceof LeafNode) {
			leafNode = (LeafNode) clusteringFeature;
		} else if (clusteringFeature instanceof NonLeafNode) {
			nonLeafNode = (NonLeafNode) clusteringFeature;
		}

		if (nonLeafNode != null) {
			nonLeafNode.setLevel(level);
			level++;
			// Recurse into the children
			if (nonLeafNode.getNonLeafChilds() != null) {
				for (NonLeafNode n1 : nonLeafNode.getNonLeafChilds()) {
					setTreeLevel(n1, level);
				}
			} else {
				for (LeafNode n2 : nonLeafNode.getLeafChilds()) {
					setTreeLevel(n2, level);
				}
			}
		} else {
			leafNode.setLevel(level);
			level++;
			// Assign the depth of the child clusters
			for (Cluster c : leafNode.getClusterChilds()) {
				c.setLevel(level);
			}
		}
	}

	/**
	 * Prints the CF tree breadth-first, then lists the final clusters.
	 *
	 * @param rootNode root of the CF tree
	 */
	private void showCFTree(ClusteringFeature rootNode) {
		// Number of separator dashes between siblings
		int blankNum = 5;
		// Depth currently being printed
		int currentLevel = 1;
		LinkedList<ClusteringFeature> nodeQueue = new LinkedList<>();
		ClusteringFeature cf;
		LeafNode leafNode;
		NonLeafNode nonLeafNode;
		ArrayList<Cluster> clusterList = new ArrayList<>();
		String typeName;

		nodeQueue.add(rootNode);
		while (nodeQueue.size() > 0) {
			cf = nodeQueue.poll();

			if (cf instanceof LeafNode) {
				leafNode = (LeafNode) cf;
				typeName = LEAFNODE;

				if (leafNode.getClusterChilds() != null) {
					for (Cluster c : leafNode.getClusterChilds()) {
						nodeQueue.add(c);
					}
				}
			} else if (cf instanceof NonLeafNode) {
				nonLeafNode = (NonLeafNode) cf;
				typeName = NON_LEAFNODE;

				if (nonLeafNode.getNonLeafChilds() != null) {
					for (NonLeafNode n1 : nonLeafNode.getNonLeafChilds()) {
						nodeQueue.add(n1);
					}
				} else {
					for (LeafNode n2 : nonLeafNode.getLeafChilds()) {
						nodeQueue.add(n2);
					}
				}
			} else {
				clusterList.add((Cluster) cf);
				typeName = CLUSTER;
			}

			if (currentLevel != cf.getLevel()) {
				currentLevel = cf.getLevel();
				System.out.println();
				System.out.println("|");
				System.out.println("|");
			} else if (currentLevel == cf.getLevel() && currentLevel != 1) {
				for (int i = 0; i < blankNum; i++) {
					System.out.print("-");
				}
			}

			System.out.print(typeName);
			System.out.print("N:" + cf.getN() + ", LS:");
			System.out.print("[");
			for (double d : cf.getLS()) {
				System.out.print(MessageFormat.format("{0}, ", d));
			}
			System.out.print("]");
		}

		System.out.println();
		System.out.println("*******最终分好的聚簇****");
		// Print the points of every final cluster
		for (int i = 0; i < clusterList.size(); i++) {
			System.out.println("Cluster" + (i + 1) + ":");
			for (double[] point : clusterList.get(i).getData()) {
				System.out.print("[");
				for (double d : point) {
					System.out.print(MessageFormat.format("{0}, ", d));
				}
				System.out.println("]");
			}
		}
	}

}

// ===== src/main/java/com/jusdt/datamining/clustering/birch/BIRCHExample.java =====

package com.jusdt.datamining.clustering.birch;

/**
 * Driver class for the BIRCH clustering algorithm.
 */
public class BIRCHExample {

	public static void main(String[] args) {
		String filePath = "data/birch/testInput.txt";
		// Internal-node branching factor B
		int B = 2;
		// Leaf-node branching factor L
		int L = 2;
		// Cluster diameter threshold T
		double T = 0.6;

		BIRCHCore tool = new BIRCHCore(filePath, B, L, T);
		tool.startBuilding();
	}

}
-------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/clustering/birch/Cluster.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.clustering.birch; 2 | 3 | import java.util.ArrayList; 4 | 5 | /** 6 | * 叶子节点中的小集群 7 | */ 8 | public class Cluster extends ClusteringFeature { 9 | 10 | //集群中的数据点 11 | private ArrayList data; 12 | //父亲节点 13 | private LeafNode parentNode; 14 | 15 | public Cluster(String[] record) { 16 | double[] d = new double[record.length]; 17 | data = new ArrayList<>(); 18 | for (int i = 0; i < record.length; i++) { 19 | d[i] = Double.parseDouble(record[i]); 20 | } 21 | data.add(d); 22 | //计算CF聚类特征 23 | this.setLS(data); 24 | this.setSS(data); 25 | this.setN(data); 26 | } 27 | 28 | public ArrayList getData() { 29 | return data; 30 | } 31 | 32 | public void setData(ArrayList data) { 33 | this.data = data; 34 | } 35 | 36 | @Override 37 | protected void directAddCluster(ClusteringFeature node) { 38 | //如果是聚类包括数据记录,则还需合并数据记录 39 | Cluster c = (Cluster) node; 40 | ArrayList dataRecords = c.getData(); 41 | this.data.addAll(dataRecords); 42 | 43 | super.directAddCluster(node); 44 | } 45 | 46 | public LeafNode getParentNode() { 47 | return parentNode; 48 | } 49 | 50 | public void setParentNode(LeafNode parentNode) { 51 | this.parentNode = parentNode; 52 | } 53 | 54 | @Override 55 | public void addingCluster(ClusteringFeature clusteringFeature) { 56 | // TODO Auto-generated method stub 57 | } 58 | 59 | } 60 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/clustering/birch/ClusteringFeature.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.clustering.birch; 2 | 3 | import java.util.ArrayList; 4 | 5 | /** 6 | * 聚类特征基本属性 7 | */ 8 | public abstract class ClusteringFeature { 9 | 10 | // 子类中节点的总数目 11 | 
protected int N; 12 | // 子类中N个节点的线性和 13 | protected double[] LS; 14 | // 子类中N个节点的平方和 15 | protected double[] SS; 16 | //节点深度,用于CF树的输出 17 | protected int level; 18 | 19 | public int getN() { 20 | return N; 21 | } 22 | 23 | public void setN(int n) { 24 | N = n; 25 | } 26 | 27 | public double[] getLS() { 28 | return LS; 29 | } 30 | 31 | public void setLS(double[] lS) { 32 | LS = lS; 33 | } 34 | 35 | public double[] getSS() { 36 | return SS; 37 | } 38 | 39 | public void setSS(double[] sS) { 40 | SS = sS; 41 | } 42 | 43 | protected void setN(ArrayList dataRecords) { 44 | this.N = dataRecords.size(); 45 | } 46 | 47 | public int getLevel() { 48 | return level; 49 | } 50 | 51 | public void setLevel(int level) { 52 | this.level = level; 53 | } 54 | 55 | /** 56 | * 根据节点数据计算线性和 57 | * 58 | * @param dataRecords 59 | * 节点数据记录 60 | */ 61 | protected void setLS(ArrayList dataRecords) { 62 | int num = dataRecords.get(0).length; 63 | double[] record; 64 | LS = new double[num]; 65 | for (int j = 0; j < num; j++) { 66 | LS[j] = 0; 67 | } 68 | 69 | for (int i = 0; i < dataRecords.size(); i++) { 70 | record = dataRecords.get(i); 71 | for (int j = 0; j < record.length; j++) { 72 | LS[j] += record[j]; 73 | } 74 | } 75 | } 76 | 77 | /** 78 | * 根据节点数据计算平方 79 | * 80 | * @param dataRecords 81 | * 节点数据 82 | */ 83 | protected void setSS(ArrayList dataRecords) { 84 | int num = dataRecords.get(0).length; 85 | double[] record; 86 | SS = new double[num]; 87 | for (int j = 0; j < num; j++) { 88 | SS[j] = 0; 89 | } 90 | 91 | for (int i = 0; i < dataRecords.size(); i++) { 92 | record = dataRecords.get(i); 93 | for (int j = 0; j < record.length; j++) { 94 | SS[j] += record[j] * record[j]; 95 | } 96 | } 97 | } 98 | 99 | /** 100 | * CF向量特征的叠加,无须考虑划分 101 | * 102 | * @param node 103 | */ 104 | protected void directAddCluster(ClusteringFeature node) { 105 | int N = node.getN(); 106 | double[] otherLS = node.getLS(); 107 | double[] otherSS = node.getSS(); 108 | 109 | if (LS == null) { 110 | this.N = 0; 111 
| LS = new double[otherLS.length]; 112 | SS = new double[otherLS.length]; 113 | 114 | for (int i = 0; i < LS.length; i++) { 115 | LS[i] = 0; 116 | SS[i] = 0; 117 | } 118 | } 119 | 120 | // 3个数量上进行叠加 121 | for (int i = 0; i < LS.length; i++) { 122 | LS[i] += otherLS[i]; 123 | SS[i] += otherSS[i]; 124 | } 125 | this.N += N; 126 | } 127 | 128 | /** 129 | * 计算簇与簇之间的距离即簇中心之间的距离 130 | * 131 | * @return 132 | */ 133 | protected double computerClusterDistance(ClusteringFeature cluster) { 134 | double distance = 0; 135 | double[] otherLS = cluster.LS; 136 | int num = N; 137 | 138 | int otherNum = cluster.N; 139 | 140 | for (int i = 0; i < LS.length; i++) { 141 | distance += (LS[i] / num - otherLS[i] / otherNum) * (LS[i] / num - otherLS[i] / otherNum); 142 | } 143 | distance = Math.sqrt(distance); 144 | 145 | return distance; 146 | } 147 | 148 | /** 149 | * 计算簇内对象的平均距离 150 | * 151 | * @param records 152 | * 簇内的数据记录 153 | * @return 154 | */ 155 | protected double computerInClusterDistance(ArrayList records) { 156 | double sumDistance = 0; 157 | double[] data1; 158 | double[] data2; 159 | // 数据总数 160 | int totalNum = records.size(); 161 | 162 | for (int i = 0; i < totalNum - 1; i++) { 163 | data1 = records.get(i); 164 | for (int j = i + 1; j < totalNum; j++) { 165 | data2 = records.get(j); 166 | sumDistance += computeOuDistance(data1, data2); 167 | } 168 | } 169 | 170 | // 返回的值除以总对数,总对数应减半,会重复算一次 171 | return Math.sqrt(sumDistance / (totalNum * (totalNum - 1) / 2)); 172 | } 173 | 174 | /** 175 | * 对给定的2个向量,计算欧式距离 176 | * 177 | * @param record1 178 | * 向量点1 179 | * @param record2 180 | * 向量点2 181 | */ 182 | private double computeOuDistance(double[] record1, double[] record2) { 183 | double distance = 0; 184 | 185 | for (int i = 0; i < record1.length; i++) { 186 | distance += (record1[i] - record2[i]) * (record1[i] - record2[i]); 187 | } 188 | 189 | return distance; 190 | } 191 | 192 | /** 193 | * 聚类添加节点包括,超出阈值进行分裂的操作 194 | * 195 | * @param clusteringFeature 196 | * 待添加聚簇 197 | 
/**
 * Leaf node of the CF tree: holds a list of small Clusters and performs the
 * distance-threshold (BIRCHCore.T) and balance-factor (BIRCHCore.L) checks
 * when a new cluster is inserted, splitting itself when overfull.
 */
public class LeafNode extends ClusteringFeature {

	// child clusters of this leaf
	private ArrayList<Cluster> clusterChilds;
	// parent (non-leaf) node; null while this leaf is still the root
	private NonLeafNode parentNode;

	public ArrayList<Cluster> getClusterChilds() {
		return clusterChilds;
	}

	public void setClusterChilds(ArrayList<Cluster> clusterChilds) {
		this.clusterChilds = clusterChilds;
	}

	/**
	 * Splits this leaf into two new leaves: the two child clusters whose
	 * centroids are farthest apart become the seeds, and every remaining
	 * cluster is assigned to the nearer seed.
	 *
	 * @return the two new leaf nodes
	 */
	public LeafNode[] divideLeafNode() {
		LeafNode[] leafNodeArray = new LeafNode[2];
		// the pair of clusters with the largest centroid distance; the rest
		// are assigned greedily to whichever of the two is nearer
		Cluster cluster1 = null;
		Cluster cluster2 = null;
		Cluster tempCluster = null;
		double maxValue = 0;
		double temp = 0;

		// find the two clusters farthest apart
		// NOTE(review): if every pairwise distance is 0, cluster1/cluster2 stay
		// null and the dereferences below throw NPE — TODO confirm inputs exclude this
		for (int i = 0; i < clusterChilds.size() - 1; i++) {
			tempCluster = clusterChilds.get(i);
			for (int j = i + 1; j < clusterChilds.size(); j++) {
				temp = tempCluster.computerClusterDistance(clusterChilds.get(j));

				if (temp > maxValue) {
					maxValue = temp;
					cluster1 = tempCluster;
					cluster2 = clusterChilds.get(j);
				}
			}
		}

		// seed each new leaf with one of the two farthest clusters
		leafNodeArray[0] = new LeafNode();
		leafNodeArray[0].addingCluster(cluster1);
		cluster1.setParentNode(leafNodeArray[0]);
		leafNodeArray[1] = new LeafNode();
		leafNodeArray[1].addingCluster(cluster2);
		cluster2.setParentNode(leafNodeArray[1]);
		clusterChilds.remove(cluster1);
		clusterChilds.remove(cluster2);
		// assign every remaining cluster to the nearer seed
		for (Cluster c : clusterChilds) {
			if (cluster1.computerClusterDistance(c) < cluster2.computerClusterDistance(c)) {
				// closer to seed 1: join the first new leaf
				leafNodeArray[0].addingCluster(c);
				c.setParentNode(leafNodeArray[0]);
			} else {
				leafNodeArray[1].addingCluster(c);
				c.setParentNode(leafNodeArray[1]);
			}
		}

		return leafNodeArray;
	}

	public NonLeafNode getParentNode() {
		return parentNode;
	}

	public void setParentNode(NonLeafNode parentNode) {
		this.parentNode = parentNode;
	}

	/**
	 * Inserts a new Cluster into this leaf: merges it into the nearest child
	 * when the result stays under threshold T, otherwise adds it as a new
	 * child, splitting this leaf if the balance factor L is exceeded.
	 *
	 * @param clusteringFeature the cluster to insert (must be a Cluster)
	 */
	@Override
	public void addingCluster(ClusteringFeature clusteringFeature) {
		// update this leaf's CF triple first (also propagates up the tree)
		directAddCluster(clusteringFeature);

		// the child cluster closest to the new one
		Cluster findedCluster = null;
		Cluster cluster = (Cluster) clusteringFeature;
		// smallest centroid distance seen so far
		double disance = Integer.MAX_VALUE;
		// centroid distance to the current candidate
		double errorDistance = 0;
		boolean needDivided = false;
		if (clusterChilds == null) {
			// first cluster in this leaf
			clusterChilds = new ArrayList<>();
			clusterChilds.add(cluster);
			cluster.setParentNode(this);
		} else {
			for (Cluster c : clusterChilds) {
				errorDistance = c.computerClusterDistance(cluster);
				if (disance > errorDistance) {
					// keep the nearest child cluster
					disance = errorDistance;
					findedCluster = c;
				}
			}

			ArrayList<double[]> data1 = (ArrayList<double[]>) findedCluster.getData().clone();
			ArrayList<double[]> data2 = cluster.getData();
			data1.addAll(data2);
			// if merging would push the intra-cluster average distance past the
			// threshold T, keep the new cluster as a separate child instead
			if (findedCluster.computerInClusterDistance(data1) > BIRCHCore.T) {
				// a leaf may hold at most BIRCHCore.L children (balance factor)
				if (clusterChilds.size() + 1 > BIRCHCore.L) {
					needDivided = true;
				}
				clusterChilds.add(cluster);
				cluster.setParentNode(this);
			} else {
				// close enough: fold the new cluster into the nearest child
				findedCluster.directAddCluster(cluster);
				cluster.setParentNode(this);
			}
		}

		if (needDivided) {
			if (parentNode == null) {
				// this leaf was the root: grow a new parent above it
				parentNode = new NonLeafNode();
			} else {
				// detach this overfull leaf; its two halves are re-inserted below
				parentNode.getLeafChilds().remove(this);
			}

			LeafNode[] nodeArray = divideLeafNode();
			for (LeafNode n : nodeArray) {
				parentNode.addingCluster(n);
			}
		}
	}

	@Override
	protected void directAddCluster(ClusteringFeature node) {
		// propagate the CF update to the ancestors first so every level's
		// (N, LS, SS) triple stays consistent with its subtree
		if (parentNode != null) {
			parentNode.directAddCluster(node);
		}

		super.directAddCluster(node);
	}

}
i = 0, j = 1; i < dataArray.size(); i++) { 57 | if (j <= classNum) { 58 | classPoints.add(new Point(dataArray.get(i)[0], dataArray.get(i)[1], j + "")); 59 | classNames.add(i + ""); 60 | j++; 61 | } 62 | totalPoints.add(new Point(dataArray.get(i)[0], dataArray.get(i)[1])); 63 | } 64 | } 65 | 66 | /** 67 | * K均值聚类算法实现 68 | */ 69 | public void kMeansClustering() { 70 | double tempX = 0; 71 | double tempY = 0; 72 | int count = 0; 73 | double error = Integer.MAX_VALUE; 74 | Point temp; 75 | 76 | while (error > 0.01 * classNum) { 77 | for (Point p1 : totalPoints) { 78 | // 将所有的测试坐标点就近分类 79 | for (Point p2 : classPoints) { 80 | p2.computerDistance(p1); 81 | } 82 | Collections.sort(classPoints); 83 | 84 | // 取出p1离类坐标点最近的那个点 85 | p1.setClassName(classPoints.get(0).getClassName()); 86 | } 87 | 88 | error = 0; 89 | // 按照均值重新划分聚类中心点 90 | for (Point p1 : classPoints) { 91 | count = 0; 92 | tempX = 0; 93 | tempY = 0; 94 | for (Point p : totalPoints) { 95 | if (p.getClassName().equals(p1.getClassName())) { 96 | count++; 97 | tempX += p.getX(); 98 | tempY += p.getY(); 99 | } 100 | } 101 | tempX /= count; 102 | tempY /= count; 103 | 104 | error += Math.abs((tempX - p1.getX())); 105 | error += Math.abs((tempY - p1.getY())); 106 | // 计算均值 107 | p1.setX(tempX); 108 | p1.setY(tempY); 109 | 110 | } 111 | 112 | for (int i = 0; i < classPoints.size(); i++) { 113 | temp = classPoints.get(i); 114 | System.out.println(MessageFormat.format("聚类中心点{0},x={1},y={2}", (i + 1), temp.getX(), temp.getY())); 115 | } 116 | System.out.println("----------"); 117 | } 118 | 119 | System.out.println("结果值收敛"); 120 | for (int i = 0; i < classPoints.size(); i++) { 121 | temp = classPoints.get(i); 122 | System.out.println(MessageFormat.format("聚类中心点{0},x={1},y={2}", (i + 1), temp.getX(), temp.getY())); 123 | } 124 | 125 | } 126 | 127 | } 128 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/clustering/kmeans/KMeansExample.java: 
-------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.clustering.kmeans; 2 | 3 | /** 4 | * K-means(K均值)算法调用类 5 | */ 6 | public class KMeansExample { 7 | 8 | public static void main(String[] args) { 9 | String filePath = "data/kmeans/input.txt"; 10 | // 聚类中心数量设定 11 | int classNum = 3; 12 | 13 | KMeansCore tool = new KMeansCore(filePath, classNum); 14 | tool.kMeansClustering(); 15 | } 16 | 17 | } 18 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/clustering/kmeans/Point.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.clustering.kmeans; 2 | 3 | /** 4 | * 坐标点类 5 | */ 6 | public class Point implements Comparable { 7 | 8 | // 坐标点横坐标 9 | private double x; 10 | // 坐标点纵坐标 11 | private double y; 12 | //以此点作为聚类中心的类的类名称 13 | private String className; 14 | // 坐标点之间的欧式距离 15 | private Double distance; 16 | 17 | public Point(double x, double y) { 18 | this.x = x; 19 | this.y = y; 20 | } 21 | 22 | public Point(String x, String y) { 23 | this.x = Double.parseDouble(x); 24 | this.y = Double.parseDouble(y); 25 | } 26 | 27 | public Point(String x, String y, String className) { 28 | this.x = Double.parseDouble(x); 29 | this.y = Double.parseDouble(y); 30 | this.className = className; 31 | } 32 | 33 | /** 34 | * 距离目标点p的欧几里得距离 35 | * 36 | * @param p 37 | */ 38 | public void computerDistance(Point p) { 39 | if (p == null) { 40 | return; 41 | } 42 | 43 | this.distance = (this.x - p.x) * (this.x - p.x) + (this.y - p.y) * (this.y - p.y); 44 | } 45 | 46 | public double getX() { 47 | return x; 48 | } 49 | 50 | public void setX(double x) { 51 | this.x = x; 52 | } 53 | 54 | public double getY() { 55 | return y; 56 | } 57 | 58 | public void setY(double y) { 59 | this.y = y; 60 | } 61 | 62 | public String getClassName() { 63 | return className; 64 | } 65 | 66 | public void setClassName(String 
className) { 67 | this.className = className; 68 | } 69 | 70 | public double getDistance() { 71 | return distance; 72 | } 73 | 74 | public void setDistance(double distance) { 75 | this.distance = distance; 76 | } 77 | 78 | @Override 79 | public int compareTo(Point o) { 80 | return this.distance.compareTo(o.distance); 81 | } 82 | 83 | } 84 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/dimensionality/reduction/pca/DataReader.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.dimensionality.reduction.pca; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.IOException; 5 | import java.io.Reader; 6 | import java.util.ArrayList; 7 | import java.util.List; 8 | 9 | /** 10 | * 简单的向量数据读取 11 | */ 12 | public class DataReader extends BufferedReader { 13 | 14 | public DataReader(Reader in, int sz) { 15 | super(in, sz); 16 | } 17 | 18 | public DataReader(Reader in) { 19 | super(in); 20 | } 21 | 22 | /** 23 | * Get the (vector) data contained in the file. The data is stored one value 24 | * per line. Empty lines are ignored. 
package com.jusdt.datamining.dimensionality.reduction.pca;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;

/**
 * Reader for simple vector data files: one numeric value per line,
 * surrounding whitespace trimmed, blank lines skipped.
 */
public class DataReader extends BufferedReader {

	public DataReader(Reader in, int sz) {
		super(in, sz);
	}

	public DataReader(Reader in) {
		super(in);
	}

	/**
	 * Get the (vector) data contained in the file. The data is stored one
	 * value per line; empty lines are ignored.
	 *
	 * @return the values, in file order
	 * @throws IOException if reading fails
	 */
	public double[] getData() throws IOException {
		List<Double> values = new ArrayList<>();
		for (String line = readLine(); line != null; line = readLine()) {
			String trimmed = line.trim();
			if (!trimmed.isEmpty()) {
				// NumberFormatException propagates for malformed values
				values.add(Double.valueOf(trimmed));
			}
		}

		double[] vector = new double[values.size()];
		for (int i = 0; i < vector.length; i++) {
			vector[i] = values.get(i);
		}
		return vector;
	}

}
package com.jusdt.datamining.dimensionality.reduction.pca;

/**
 * Unchecked exception thrown for invalid matrix operations
 * (dimension mismatches and similar misuse).
 */
public class MatrixException extends RuntimeException {

	private static final long serialVersionUID = -65073227556727585L;

	/**
	 * @param s detail message describing the matrix error
	 */
	public MatrixException(String s) {
		super(s);
	}

}
package com.jusdt.datamining.dimensionality.reduction.pca;

import java.io.PrintWriter;
import java.text.DecimalFormat;
import java.text.DecimalFormatSymbols;
import java.text.NumberFormat;
import java.util.Locale;

/**
 * Helper routines for printing matrices in fixed-width, right-justified
 * columns, Fortran 'Fw.d' style.
 */
public class MatrixHelper {

	/**
	 * Print the matrix to stdout in columns, Fortran-like 'Fw.d' format.
	 *
	 * @param a the matrix
	 * @param w column width
	 * @param d number of digits after the decimal point
	 */
	public static void print(Matrix a, int w, int d) {
		print(a, new PrintWriter(System.out, true), w, d);
	}

	/**
	 * Print the matrix to the given stream in columns, Fortran-like 'Fw.d'
	 * format.
	 *
	 * @param a the matrix
	 * @param output destination stream
	 * @param w column width
	 * @param d number of digits after the decimal point
	 */
	public static void print(Matrix a, PrintWriter output, int w, int d) {
		DecimalFormat fmt = new DecimalFormat();
		fmt.setDecimalFormatSymbols(new DecimalFormatSymbols(Locale.US));
		fmt.setMinimumIntegerDigits(1);
		fmt.setMaximumFractionDigits(d);
		fmt.setMinimumFractionDigits(d);
		fmt.setGroupingUsed(false);
		print(a, output, fmt, w + 2);
	}

	/**
	 * Print the matrix to stdout, right-justified in columns of the given
	 * width using the supplied format. If the output is meant to be read back
	 * in, use a NumberFormat set to the US locale.
	 *
	 * @param a the matrix
	 * @param format formatter for individual elements
	 * @param width field width for each column
	 * @see java.text.DecimalFormat#setDecimalFormatSymbols
	 */
	public static void print(Matrix a, NumberFormat format, int width) {
		print(a, new PrintWriter(System.out, true), format, width);
	}

	/**
	 * Print the matrix to the output stream, right-justified in columns.
	 * DecimalFormat does not pad on the left (unlike Fortran or C's printf),
	 * so the column padding is done here.
	 *
	 * @param a the matrix
	 * @param output destination stream
	 * @param format formatter for individual elements
	 * @param width field width for each column
	 * @see java.text.DecimalFormat#setDecimalFormatSymbols
	 */
	public static void print(Matrix a, PrintWriter output, NumberFormat format, int width) {
		output.println(); // leading blank line
		int rows = a.getNRows();
		int cols = a.getNCols();
		double[][] cells = a.getArray();
		for (int r = 0; r < rows; r++) {
			for (int c = 0; c < cols; c++) {
				String s = format.format(cells[r][c]);
				// right-justify with at least one space between columns
				int pad = Math.max(1, width - s.length());
				while (pad-- > 0) {
					output.print(' ');
				}
				output.print(s);
			}
			output.println();
		}
		output.println(); // trailing blank line
	}
}
package com.jusdt.datamining.dimensionality.reduction.pca;

/**
 * Principal component analysis of a matrix, computed via the SVD of the
 * weighted-and-centered data.
 */
public class PCACore {

	// the weighted, centered input matrix
	private final Matrix m;
	// the principal components: m * V
	private final Matrix pc;
	// the principal factors: the right singular vectors V of the SVD
	private final Matrix facpr;
	// the eigenvalues: squared singular values scaled by 1/(size-1)
	private final Matrix lambda;

	/**
	 * Runs the PCA: weights/centers x, takes its SVD, and derives the
	 * eigenvalues, principal factors and principal components.
	 *
	 * @param x the input matrix
	 */
	public PCACore(Matrix x) {

		// Weight and center the matrix
		this.m = x.wcenter();
		// compute the eigenvectors of m'*m via the SVD of m
		SVD svd = new SVD(this.m);

		// eigenvalues from the singular values
		this.lambda = calculateLambda(svd.getS());
		// principal factors = right singular vectors
		this.facpr = svd.getV();

		// principal components = centered data projected onto the factors
		this.pc = this.m.times(svd.getV());
	}

	/**
	 * Converts singular values into eigenvalues: lambda_i = s_i^2 / (size-1).
	 * Mutates and returns the column matrix obtained from s.diag().
	 *
	 * NOTE(review): 'size' is the row count of the diagonal, i.e. the number
	 * of singular values — assumed here to equal the sample count of the
	 * centered matrix; TODO confirm against the SVD/Matrix implementations.
	 *
	 * @param s the diagonal matrix of singular values from the SVD
	 * @return a column matrix of eigenvalues
	 */
	private Matrix calculateLambda(Matrix s) {

		Matrix d = s.diag();
		double[][] D = d.getArray();

		int size = d.getNRows();
		for (int i = 0; i < size; i++) {
			D[i][0] = (D[i][0] * D[i][0]) / (size - 1);
		}

		return d;
	}

	public Matrix getPrincipalComponents() {
		return pc;
	}

	public Matrix getLambda() {
		return lambda;
	}

	public Matrix getPrinicipalFactors() {
		return facpr;
	}

}
	/**
	 * Run a principal component analysis of a matrix.
	 *
	 * @param m the matrix
	 * @return the completed PCA
	 */
	public PCACore fromMatrix(Matrix m) {
		return new PCACore(m);
	}

	/**
	 * Run a principal component analysis from a simple time series vector.
	 * The data is converted into a Toeplitz-style matrix before running the
	 * PCA.
	 *
	 * @param data the time series vector
	 * @return the completed PCA
	 */
	public PCACore fromSimpleTimeSeries(double[] data) {
		Matrix m = new ToeplitzMatrix(data);
		PCACore pca = new PCACore(m);
		return pca;
	}

	/**
	 * Calculate the correlation circle for the given components: each entry
	 * is sqrt(lambda_index) * factor value. Quick and dirty — no validity
	 * checks that the PCA completed successfully.
	 *
	 * @param pca the PCA
	 * @param compare the principal factor columns to compare
	 * @return the correlation circle (rows = variables, cols = compared factors)
	 */
	public Matrix correlationCircle(PCACore pca, int[] compare) {
		double[][] F = pca.getPrinicipalFactors().getArray();
		double[][] L = pca.getLambda().getArray();

		// calculate the correlation circle
		Matrix cc = new Matrix(F.length, compare.length);
		double[][] CC = cc.getArray();

		for (int n = 0; n < compare.length; n++) {
			int index = compare[n];
			// scale the factor column by the root of its eigenvalue
			double s = Math.sqrt(L[index][0]);
			for (int m = 0; m < F.length; m++) {
				double f = F[m][index];

				CC[m][n] = s * f;
			}
		}
		return cc;
	}

	/**
	 * Calculate the correlation circle for the two largest eigenvalues
	 * (factor columns 0 and 1).
	 *
	 * @param pca the pca
	 * @return the correlation circle
	 */
	public Matrix correlationCircle(PCACore pca) {
		return correlationCircle(pca, new int[] { 0, 1 });
	}

	/**
	 * Normalize the eigenvalues (divide by their sum) so a scree plot can be
	 * drawn.
	 *
	 * NOTE(review): L is mutated AFTER nl is constructed from it, so this is
	 * only correct if Matrix(double[][]) wraps the array rather than copying
	 * it — TODO confirm against the Matrix constructor.
	 *
	 * @param pca the pca
	 * @return the normalized eigenvalues as a column matrix
	 */
	public Matrix normalizeLambda(PCACore pca) {

		double[][] L = pca.getLambda().getArrayCopy();
		Matrix nl = new Matrix(L);
		double sum = 0;
		for (int n = 0; n < L.length; n++) {
			sum += L[n][0];
		}
		for (int n = 0; n < L.length; n++) {
			L[n][0] = L[n][0] / sum;
		}
		return nl;
	}

	/**
	 * Calculate the cumulative contribution of the eigenvectors: a running
	 * sum over the normalized eigenvalues.
	 *
	 * @param pca the pca
	 * @return the cumulative contributions as a column matrix
	 */
	public Matrix cumulativeContribution(PCACore pca) {
		Matrix nl = normalizeLambda(pca);
		double[][] CC = nl.getArrayCopy();
		Matrix cc = new Matrix(CC);
		double cum = 0;
		for (int n = 0; n < CC.length; n++) {
			// each entry becomes the sum of itself and all entries before it
			cum = CC[n][0] = CC[n][0] + cum;
		}
		return cc;
	}
}
0.3056, 0.7071 }, 17 | { 0.4562, -0.2454, -0.1519, -0.6631, -0.5185, -0.0000 }, 18 | { 0.2378, -0.6631, -0.5529, 0.2454, 0.3712, -0.0000 }, 19 | { -0.2378, -0.6631, 0.5529, 0.2454, -0.3712, 0.0000 }, 20 | { -0.4562, -0.2454, 0.1519, -0.6631, 0.5185, -0.0000 }, 21 | { -0.4851, -0.0000, -0.4138, -0.0000, -0.3056, 0.7071 } }; 22 | 23 | public static void main(String[] args) { 24 | PCACoreHandler instance = new PCACoreHandler(); 25 | PCACore result = instance.fromSimpleTimeSeries(data); 26 | 27 | // compare the principal components 28 | System.out.println("compare the principal components:"); 29 | double[][] res_pcomp = result.getPrincipalComponents().getArray(); 30 | for (int i = 0; i < SIZE; i++) { 31 | for (int j = 0; j < SIZE; j++) { 32 | System.out.println(pcomps[i][j] + " , " + res_pcomp[i][j]); 33 | } 34 | } 35 | 36 | // compare the lambdas 37 | System.out.println("compare the lambdas:"); 38 | double[] res_plambda = result.getLambda().transpose().getArray()[0]; 39 | for (int i = 0; i < SIZE; i++) { 40 | System.out.println(plambda[i] + " , " + res_plambda[i]); 41 | } 42 | 43 | // compare the principle factors 44 | System.out.println("compare the principle factors:"); 45 | double[][] res_pfacs = result.getPrinicipalFactors().getArray(); 46 | for (int i = 0; i < SIZE; i++) { 47 | for (int j = 0; j < SIZE; j++) { 48 | System.out.println(pfacs[i][j] + " , " + res_pfacs[i][j]); 49 | } 50 | } 51 | } 52 | 53 | } 54 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/dimensionality/reduction/pca/ToeplitzMatrix.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.dimensionality.reduction.pca; 2 | 3 | /** 4 | * Toeplitz matrix 5 | */ 6 | public class ToeplitzMatrix extends Matrix { 7 | 8 | /** 9 | * Toeplitz matrix styles 10 | */ 11 | public static enum Type { 12 | 13 | Triangular, Symmetrical, Circulant 14 | }; 15 | 16 | /** 17 | * Create a 
	/**
	 * Create a symmetrical Toeplitz-style matrix from a vector.
	 *
	 * @param v the generating vector
	 */
	public ToeplitzMatrix(double[] v) {
		this(v, Type.Symmetrical);
	}

	/**
	 * Create a Toeplitz matrix from a vector: each diagonal holds a constant
	 * value, with arr[i][j] = v[i-j] in the lower triangle. The upper triangle
	 * depends on the style: zero (Triangular), mirrored (Symmetrical), or
	 * wrapped around (Circulant).
	 *
	 * @param v the vector
	 * @param type the matrix style
	 */
	public ToeplitzMatrix(double[] v, Type type) {
		super(v.length, v.length);
		int n = v.length;
		double[][] arr = getArray();

		for (int i = 0; i < v.length; i++) {
			for (int j = 0; j <= i; j++) {
				// offset of this cell's diagonal below the main diagonal
				int index = i - j;
				arr[i][j] = v[i - j];
				switch (type) {
				default:
				case Triangular:
					// upper triangle stays at its initial value
					break;
				case Symmetrical:
					// mirror the value across the main diagonal
					arr[j][i] = v[i - j];
					break;
				case Circulant:
					if (j != i) {
						// upper triangle wraps around: v[n - (i - j)]
						arr[j][i] = v[n - index];
					}
					break;
				}
			}
		}
	}
package com.jusdt.datamining.dimensionality.reduction.pca;

/**
 * Create a trajectory style matrix from a vector: each vector value is
 * written along one anti-diagonal (cells where row + column is constant),
 * so consecutive rows are overlapping windows of the series.
 */
public class TrajectoryMatrix extends Matrix {

	/**
	 * @param v the source vector
	 * @param ncols window length; the matrix gets v.length - ncols + 1 rows
	 */
	public TrajectoryMatrix(double[] v, int ncols) {
		super(v.length - ncols + 1, ncols);
		double[][] arr = getArray();
		int nrows = getNRows();
		int pos = 0; // position in vector

		// fill the anti-diagonals that start in column 0 (row index i);
		// the first ncols-1 of them are shorter than ncols cells
		for (int i = 0; i < nrows; i++) {
			double value = v[pos++];
			int availCols = i < ncols ? i + 1 : ncols;
			for (int j = 0, m = i; j < availCols && m >= 0; j++, m--) {
				arr[m][j] = value;
			}
		}
		// fill the remaining anti-diagonals that start in the last row
		// (column index i), walking up and to the right
		for (int i = 1; i < ncols; i++) {
			double value = v[pos++];
			for (int j = i, m = nrows - 1; j < ncols && m > 0; j++, m--) {
				arr[m][j] = value;
			}
		}
	}

}
next 70 | * 五元组边下一条边的点的临时标识 71 | */ 72 | private void dfsSearch(Stack stack, int currentPosition, int next) { 73 | if (currentPosition >= edgeSeqs.size()) { 74 | stack.pop(); 75 | // 比较到底了则返回 76 | return; 77 | } 78 | 79 | while (!stack.isEmpty()) { 80 | int x = stack.pop(); 81 | for (int i = 0; i < graph.edgeNexts.get(x).size(); i++) { 82 | // 从此id节点所连接的点中选取1个点作为下一个点 83 | int y = graph.edgeNexts.get(x).get(i); 84 | // 如果这2个点所构成的边已经被用过,则继续 85 | if (f[x][y] || f[y][x]) { 86 | continue; 87 | } 88 | 89 | // 如果y这个点未被用过 90 | if (g2s[y] < 0) { 91 | // 新建这条边五元组 92 | Edge e = new Edge(g2s[x], next, graph.nodeLabels.get(x), graph.edgeLabels.get(x).get(i), 93 | graph.nodeLabels.get(y)); 94 | 95 | // 与相应位置的边做比较,如果不是最小则失败 96 | int compareResult = e.compareWith(edgeSeqs.get(currentPosition)); 97 | if (compareResult == Edge.EDGE_SMALLER) { 98 | isMin = false; 99 | return; 100 | } else if (compareResult == Edge.EDGE_LARGER) { 101 | continue; 102 | } 103 | // 如果相等则继续比 104 | g2s[y] = next; 105 | f[x][y] = true; 106 | f[y][x] = true; 107 | stack.push(y); 108 | dfsSearch(stack, currentPosition + 1, next + 1); 109 | if (!isMin) { 110 | return; 111 | } 112 | f[x][y] = false; 113 | f[y][x] = false; 114 | g2s[y] = -1; 115 | } else { 116 | // 这个点已经被用过的时候,不需要再设置五元组id标识 117 | // 新建这条边五元组 118 | Edge e = new Edge(g2s[x], g2s[y], graph.nodeLabels.get(x), graph.edgeLabels.get(x).get(i), 119 | graph.nodeLabels.get(y)); 120 | 121 | // 与相应位置的边做比较,如果不是最小则失败 122 | int compareResult = e.compareWith(edgeSeqs.get(currentPosition)); 123 | if (compareResult == Edge.EDGE_SMALLER) { 124 | isMin = false; 125 | return; 126 | } else if (compareResult == Edge.EDGE_LARGER) { 127 | continue; 128 | } 129 | // 如果相等则继续比 130 | g2s[y] = next; 131 | f[x][y] = true; 132 | f[y][x] = true; 133 | stack.push(y); 134 | dfsSearch(stack, currentPosition + 1, next); 135 | if (!isMin) { 136 | return; 137 | } 138 | f[x][y] = false; 139 | f[y][x] = false; 140 | } 141 | } 142 | } 143 | } 144 | } 145 | 
-------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/graph/gspan/Edge.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.graph.gspan; 2 | 3 | /** 4 | * 边,用五元组表示 5 | */ 6 | public class Edge { 7 | 8 | // 五元组的大小比较结果 9 | public static final int EDGE_EQUAL = 0; 10 | public static final int EDGE_SMALLER = 1; 11 | public static final int EDGE_LARGER = 2; 12 | 13 | // 边的一端的id号标识 14 | int ix; 15 | // 边的另一端的id号标识 16 | int iy; 17 | // 边的一端的点标号 18 | int x; 19 | // 边的标号 20 | int a; 21 | // 边的另一端的点标号 22 | int y; 23 | 24 | public Edge(int ix, int iy, int x, int a, int y) { 25 | this.ix = ix; 26 | this.iy = iy; 27 | this.x = x; 28 | this.a = a; 29 | this.y = y; 30 | } 31 | 32 | /** 33 | * 当前边是与给定的边的大小比较关系 34 | * 35 | * @param e 36 | * @return 37 | */ 38 | public int compareWith(Edge e) { 39 | int result = EDGE_EQUAL; 40 | int[] array1 = new int[] { ix, iy, x, y, a }; 41 | int[] array2 = new int[] { e.ix, e.iy, e.x, e.y, e.a }; 42 | 43 | // 按照ix, iy,x,y,a的次序依次比较 44 | for (int i = 0; i < array1.length; i++) { 45 | if (array1[i] < array2[i]) { 46 | result = EDGE_SMALLER; 47 | break; 48 | } else if (array1[i] > array2[i]) { 49 | result = EDGE_LARGER; 50 | break; 51 | } else { 52 | // 如果相等,继续比较下一个 53 | continue; 54 | } 55 | } 56 | 57 | return result; 58 | } 59 | 60 | } 61 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/graph/gspan/EdgeFrequency.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.graph.gspan; 2 | 3 | /** 4 | * 边的频繁统计 5 | */ 6 | public class EdgeFrequency { 7 | 8 | // 节点标号数量 9 | private int nodeLabelNum; 10 | // 边的标号数量 11 | private int edgeLabelNum; 12 | // 用于存放边计数的3维数组 13 | public int[][][] edgeFreqCount; 14 | 15 | public EdgeFrequency(int nodeLabelNum, int edgeLabelNum) { 16 | this.nodeLabelNum = 
nodeLabelNum; 17 | this.edgeLabelNum = edgeLabelNum; 18 | 19 | edgeFreqCount = new int[nodeLabelNum][edgeLabelNum][nodeLabelNum]; 20 | //最初始化操作 21 | for (int i = 0; i < nodeLabelNum; i++) { 22 | for (int j = 0; j < edgeLabelNum; j++) { 23 | for (int k = 0; k < nodeLabelNum; k++) { 24 | edgeFreqCount[i][j][k] = 0; 25 | } 26 | } 27 | } 28 | } 29 | 30 | } 31 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/graph/gspan/GSpanExample.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.graph.gspan; 2 | 3 | /** 4 | * gSpan频繁子图挖掘算法 5 | */ 6 | public class GSpanExample { 7 | 8 | public static void main(String[] args) { 9 | //测试数据文件地址 10 | String filePath = "data/gspan/input.txt"; 11 | //最小支持度率 12 | double minSupportRate = 0.2; 13 | 14 | GSpanTool tool = new GSpanTool(filePath, minSupportRate); 15 | tool.freqGraphMining(); 16 | } 17 | 18 | } 19 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/graph/gspan/Graph.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.graph.gspan; 2 | 3 | import java.util.ArrayList; 4 | 5 | /** 6 | * 图结构类 7 | */ 8 | public class Graph { 9 | 10 | // 图节点标号组 11 | ArrayList nodeLabels; 12 | // 图的边标号组 13 | ArrayList> edgeLabels; 14 | // 边2头的节点id号,在这里可以理解为下标号 15 | ArrayList> edgeNexts; 16 | 17 | public Graph() { 18 | nodeLabels = new ArrayList<>(); 19 | edgeLabels = new ArrayList<>(); 20 | edgeNexts = new ArrayList<>(); 21 | } 22 | 23 | public ArrayList getNodeLabels() { 24 | return nodeLabels; 25 | } 26 | 27 | public void setNodeLabels(ArrayList nodeLabels) { 28 | this.nodeLabels = nodeLabels; 29 | } 30 | 31 | /** 32 | * 判断图中是否存在某条边 33 | * 34 | * @param x 35 | * 边的一端的节点标号 36 | * @param a 37 | * 边的标号 38 | * @param y 39 | * 边的另外一端节点标号 40 | * @return 41 | */ 42 | public boolean 
hasEdge(int x, int a, int y) { 43 | boolean isContained = false; 44 | int t; 45 | 46 | for (int i = 0; i < nodeLabels.size(); i++) { 47 | // 先寻找2个端点标号,t代表找到的点的另外一个端点标号 48 | if (nodeLabels.get(i) == x) { 49 | t = y; 50 | } else if (nodeLabels.get(i) == y) { 51 | t = x; 52 | } else { 53 | continue; 54 | } 55 | 56 | for (int j = 0; j < edgeNexts.get(i).size(); j++) { 57 | // 从此端点的所连接的点去比较对应的点和边 58 | if (edgeLabels.get(i).get(j) == a && nodeLabels.get(edgeNexts.get(i).get(j)) == t) { 59 | isContained = true; 60 | return isContained; 61 | } 62 | } 63 | } 64 | 65 | return isContained; 66 | } 67 | 68 | /** 69 | * 在图中移除某个边 70 | * 71 | * @param x 72 | * 边的某端的一个点标号 73 | * @param a 74 | * 边的标号 75 | * @param y 76 | * 边的另一端的一个点标号 77 | */ 78 | public void removeEdge(int x, int a, int y) { 79 | int t; 80 | 81 | for (int i = 0; i < nodeLabels.size(); i++) { 82 | // 先寻找2个端点标号,t代表找到的点的另外一个端点标号 83 | if (nodeLabels.get(i) == x) { 84 | t = y; 85 | } else if (nodeLabels.get(i) == y) { 86 | t = x; 87 | } else { 88 | continue; 89 | } 90 | 91 | for (int j = 0; j < edgeNexts.get(i).size(); j++) { 92 | // 从此端点的所连接的点去比较对应的点和边 93 | if (edgeLabels.get(i).get(j) == a && nodeLabels.get(edgeNexts.get(i).get(j)) == t) { 94 | int id; 95 | // 在连接的点中去除该点 96 | edgeLabels.get(i).remove(j); 97 | 98 | id = edgeNexts.get(i).get(j); 99 | edgeNexts.get(i).remove(j); 100 | for (int k = 0; k < edgeNexts.get(id).size(); k++) { 101 | if (edgeNexts.get(id).get(k) == i) { 102 | edgeNexts.get(id).remove(k); 103 | break; 104 | } 105 | } 106 | break; 107 | } 108 | } 109 | } 110 | 111 | } 112 | 113 | /** 114 | * 根据图数据构造一个图 115 | * 116 | * @param gd 117 | * 图数据 118 | * @return 119 | */ 120 | public Graph constructGraph(GraphData gd) { 121 | Graph graph = new Graph(); 122 | 123 | // 构造一个图需要知道3点,1.图中有哪些点2.图中的每个点周围连着哪些点3.每个点周围连着哪些边 124 | for (int i = 0; i < gd.getNodeVisibles().size(); i++) { 125 | if (gd.getNodeVisibles().get(i)) { 126 | graph.getNodeLabels().add(gd.getNodeLabels().get(i)); 127 | } 128 | 129 | // 
添加对应id下的集合 130 | // id节点后有多少相连的边的标号 131 | graph.edgeLabels.add(new ArrayList()); 132 | // id节点后有多少相连的节点的id 133 | graph.edgeNexts.add(new ArrayList()); 134 | } 135 | 136 | for (int i = 0; i < gd.getEdgeLabels().size(); i++) { 137 | if (gd.getEdgeVisibles().get(i)) { 138 | // 在此后面添加一个边标号 139 | graph.edgeLabels.get(gd.getEdgeX().get(i)).add(gd.getEdgeLabels().get(i)); 140 | graph.edgeLabels.get(gd.getEdgeY().get(i)).add(gd.getEdgeLabels().get(i)); 141 | graph.edgeNexts.get(gd.getEdgeX().get(i)).add(gd.getEdgeY().get(i)); 142 | graph.edgeNexts.get(gd.getEdgeY().get(i)).add(gd.getEdgeX().get(i)); 143 | } 144 | } 145 | 146 | return graph; 147 | } 148 | } 149 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/graph/gspan/GraphCode.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.graph.gspan; 2 | 3 | import java.util.ArrayList; 4 | 5 | /** 6 | * 图编码类 7 | */ 8 | public class GraphCode { 9 | 10 | //边的集合,边的排序代表着边的添加次序 11 | ArrayList edgeSeq; 12 | //拥有这些边的图的id 13 | ArrayList gs; 14 | 15 | public GraphCode() { 16 | this.edgeSeq = new ArrayList<>(); 17 | this.gs = new ArrayList<>(); 18 | } 19 | 20 | public ArrayList getEdgeSeq() { 21 | return edgeSeq; 22 | } 23 | 24 | public void setEdgeSeq(ArrayList edgeSeq) { 25 | this.edgeSeq = edgeSeq; 26 | } 27 | 28 | public ArrayList getGs() { 29 | return gs; 30 | } 31 | 32 | public void setGs(ArrayList gs) { 33 | this.gs = gs; 34 | } 35 | 36 | } 37 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/graph/gspan/GraphData.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.graph.gspan; 2 | 3 | import java.util.ArrayList; 4 | 5 | /** 6 | * 图的数据类 7 | */ 8 | public class GraphData { 9 | 10 | // 节点组标号 11 | private ArrayList nodeLabels; 12 | // 节点是否可用,可能被移除 13 | private 
ArrayList nodeVisibles; 14 | // 边的集合标号 15 | private ArrayList edgeLabels; 16 | // 边的一边点id 17 | private ArrayList edgeX; 18 | // 边的另一边的点id 19 | private ArrayList edgeY; 20 | // 边是否可用 21 | private ArrayList edgeVisibles; 22 | 23 | public GraphData() { 24 | nodeLabels = new ArrayList<>(); 25 | nodeVisibles = new ArrayList<>(); 26 | 27 | edgeLabels = new ArrayList<>(); 28 | edgeX = new ArrayList<>(); 29 | edgeY = new ArrayList<>(); 30 | edgeVisibles = new ArrayList<>(); 31 | } 32 | 33 | public ArrayList getNodeLabels() { 34 | return nodeLabels; 35 | } 36 | 37 | public void setNodeLabels(ArrayList nodeLabels) { 38 | this.nodeLabels = nodeLabels; 39 | } 40 | 41 | public ArrayList getNodeVisibles() { 42 | return nodeVisibles; 43 | } 44 | 45 | public void setNodeVisibles(ArrayList nodeVisibles) { 46 | this.nodeVisibles = nodeVisibles; 47 | } 48 | 49 | public ArrayList getEdgeLabels() { 50 | return edgeLabels; 51 | } 52 | 53 | public void setEdgeLabels(ArrayList edgeLabels) { 54 | this.edgeLabels = edgeLabels; 55 | } 56 | 57 | public ArrayList getEdgeX() { 58 | return edgeX; 59 | } 60 | 61 | public void setEdgeX(ArrayList edgeX) { 62 | this.edgeX = edgeX; 63 | } 64 | 65 | public ArrayList getEdgeY() { 66 | return edgeY; 67 | } 68 | 69 | public void setEdgeY(ArrayList edgeY) { 70 | this.edgeY = edgeY; 71 | } 72 | 73 | public ArrayList getEdgeVisibles() { 74 | return edgeVisibles; 75 | } 76 | 77 | public void setEdgeVisibles(ArrayList edgeVisibles) { 78 | this.edgeVisibles = edgeVisibles; 79 | } 80 | 81 | /** 82 | * 根据点边频繁度移除图中不频繁的点边 83 | * 84 | * @param freqNodeLabel 85 | * 点的频繁度统计 86 | * @param freqEdgeLabel 87 | * 边的频繁度统计 88 | * @param minSupportCount 89 | * 最小支持度计数 90 | */ 91 | public void removeInFreqNodeAndEdge(int[] freqNodeLabel, int[] freqEdgeLabel, int minSupportCount) { 92 | int label = 0; 93 | int x = 0; 94 | int y = 0; 95 | 96 | for (int i = 0; i < nodeLabels.size(); i++) { 97 | label = nodeLabels.get(i); 98 | if (freqNodeLabel[label] < minSupportCount) { 99 | // 
如果小于支持度计数,则此点不可用 100 | nodeVisibles.set(i, false); 101 | } 102 | } 103 | 104 | for (int i = 0; i < edgeLabels.size(); i++) { 105 | label = edgeLabels.get(i); 106 | 107 | if (freqEdgeLabel[label] < minSupportCount) { 108 | // 如果小于支持度计数,则此边不可用 109 | edgeVisibles.set(i, false); 110 | continue; 111 | } 112 | 113 | // 如果此边的某个端的端点已经不可用了,则此边也不可用,x,y表示id号 114 | x = edgeX.get(i); 115 | y = edgeY.get(i); 116 | if (!nodeVisibles.get(x) || !nodeVisibles.get(y)) { 117 | edgeVisibles.set(i, false); 118 | } 119 | } 120 | } 121 | 122 | /** 123 | * 根据标号排序重新对满足条件的点边重新编号 124 | * 125 | * @param nodeLabel2Rank 126 | * 点排名 127 | * @param edgeLabel2Rank 128 | * 边排名 129 | */ 130 | public void reLabelByRank(int[] nodeLabel2Rank, int[] edgeLabel2Rank) { 131 | int label = 0; 132 | int count = 0; 133 | int temp = 0; 134 | // 旧的id对新id号的映射 135 | int[] oldId2New = new int[nodeLabels.size()]; 136 | for (int i = 0; i < nodeLabels.size(); i++) { 137 | label = nodeLabels.get(i); 138 | 139 | // 如果当前点是可用的,将此标号的排名号作为此点新的标号 140 | if (nodeVisibles.get(i)) { 141 | nodeLabels.set(i, nodeLabel2Rank[label]); 142 | oldId2New[i] = count; 143 | count++; 144 | } 145 | } 146 | 147 | for (int i = 0; i < edgeLabels.size(); i++) { 148 | label = edgeLabels.get(i); 149 | 150 | // 如果当前边是可用的,将此标号的排名号作为此点新的标号 151 | if (edgeVisibles.get(i)) { 152 | edgeLabels.set(i, edgeLabel2Rank[label]); 153 | 154 | // 对此点做x,y的id号替换 155 | temp = edgeX.get(i); 156 | edgeX.set(i, oldId2New[temp]); 157 | temp = edgeY.get(i); 158 | edgeY.set(i, oldId2New[temp]); 159 | } 160 | } 161 | } 162 | } 163 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/graph/gspan/SubChildTraveler.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.graph.gspan; 2 | 3 | import java.util.ArrayList; 4 | 5 | /** 6 | * 孩子图搜寻类,在当前边的基础上寻找可能的孩子边 7 | */ 8 | public class SubChildTraveler { 9 | 10 | // 当前的五元组边 11 | ArrayList edgeSeq; 12 | 
// 当前的图 13 | Graph graph; 14 | // 结果数据,孩子边对所属的图id组 15 | ArrayList childEdge; 16 | // 图的点id对五元组id标识的映射 17 | int[] g2s; 18 | // 五元组id标识对图的点id的映射 19 | int[] s2g; 20 | // 图中边是否被用的情况 21 | boolean f[][]; 22 | // 最右路径,rm[id]表示的是此id节点在最右路径中的下一个节点id 23 | int[] rm; 24 | // 下一个五元组的id 25 | int next; 26 | 27 | public SubChildTraveler(ArrayList edgeSeq, Graph graph) { 28 | this.edgeSeq = edgeSeq; 29 | this.graph = graph; 30 | this.childEdge = new ArrayList<>(); 31 | } 32 | 33 | /** 34 | * 在图中搜索可能存在的孩子边 35 | * 36 | * @param next 37 | * 新加入边的节点将设置的id 38 | */ 39 | public void traveler() { 40 | this.next = edgeSeq.size() + 1; 41 | int size = graph.nodeLabels.size(); 42 | // 做id映射的初始化操作 43 | g2s = new int[size]; 44 | s2g = new int[size]; 45 | f = new boolean[size][size]; 46 | 47 | for (int i = 0; i < size; i++) { 48 | g2s[i] = -1; 49 | s2g[i] = -1; 50 | 51 | for (int j = 0; j < size; j++) { 52 | // 代表点id为i到id为j点此边没有被用过 53 | f[i][j] = false; 54 | } 55 | } 56 | 57 | rm = new int[edgeSeq.size() + 1]; 58 | for (int i = 0; i < edgeSeq.size() + 1; i++) { 59 | rm[i] = -1; 60 | } 61 | // 寻找最右路径 62 | for (Edge e : edgeSeq) { 63 | if (e.ix < e.iy && e.iy > rm[e.ix]) { 64 | rm[e.ix] = e.iy; 65 | } 66 | } 67 | 68 | for (int i = 0; i < size; i++) { 69 | // 寻找第一个标号相等的点 70 | if (edgeSeq.get(0).x != graph.nodeLabels.get(i)) { 71 | continue; 72 | } 73 | 74 | g2s[i] = 0; 75 | s2g[0] = i; 76 | dfsSearchEdge(0); 77 | g2s[i] = -1; 78 | s2g[0] = -1; 79 | } 80 | 81 | } 82 | 83 | /** 84 | * 在当前图中深度优先寻找正确的子图 85 | * 86 | * @param currentPosition 87 | * 当前找到的位置 88 | */ 89 | public void dfsSearchEdge(int currentPosition) { 90 | int rmPosition = 0; 91 | // 如果找到底了,则在当前的子图的最右路径中寻找可能的边 92 | if (currentPosition >= edgeSeq.size()) { 93 | rmPosition = 0; 94 | while (rmPosition >= 0) { 95 | int gId = s2g[rmPosition]; 96 | // 在此点附近寻找可能的边 97 | for (int i = 0; i < graph.edgeNexts.get(gId).size(); i++) { 98 | int gId2 = graph.edgeNexts.get(gId).get(i); 99 | // 如果这条边已经被用过 (FIX: second operand was a duplicated f[gId][gId2]; the symmetric used-edge check must test the reverse orientation, as in DFSCodeTraveler's f[x][y] || f[y][x]) 100 | if (f[gId][gId2] || f[gId2][gId]) { 101 | 
continue; 102 | } 103 | 104 | // 在最右路径中添加边分为2种情况,第一种为在最右节点上添加,第二中为在最右路径上 的点添加 105 | // 如果找到的点没有被用过,可以进行边的拓展 106 | if (g2s[gId2] < 0) { 107 | g2s[gId2] = next; 108 | Edge e = new Edge(g2s[gId], g2s[gId2], graph.nodeLabels.get(gId), 109 | graph.edgeLabels.get(gId).get(i), graph.nodeLabels.get(gId2)); 110 | // 将新建的子边加入集合 111 | childEdge.add(e); 112 | } else { 113 | boolean flag = true; 114 | // 如果这点已经存在,判断他是不是最右的点 115 | for (int j = 0; j < graph.edgeNexts.get(gId2).size(); j++) { 116 | int tempId = graph.edgeNexts.get(gId2).get(j); 117 | if (g2s[gId2] < g2s[tempId]) { 118 | flag = false; 119 | break; 120 | } 121 | } 122 | 123 | if (flag) { 124 | Edge e = new Edge(g2s[gId], g2s[gId2], graph.nodeLabels.get(gId), 125 | graph.edgeLabels.get(gId).get(i), graph.nodeLabels.get(gId2)); 126 | // 将新建的子边加入集合 127 | childEdge.add(e); 128 | } 129 | } 130 | } 131 | // 一个最右路径上点找完,继续下一个 132 | rmPosition = rm[rmPosition]; 133 | } 134 | return; 135 | } 136 | 137 | Edge e = edgeSeq.get(currentPosition); 138 | // 所连接的点标号 139 | int y = e.y; 140 | // 所连接的边标号 141 | int a = e.a; 142 | int gId1 = s2g[e.ix]; 143 | int gId2 = 0; 144 | 145 | for (int i = 0; i < graph.edgeLabels.get(gId1).size(); i++) { 146 | // 判断所连接的边对应的标号 147 | if (graph.edgeLabels.get(gId1).get(i) != a) { 148 | continue; 149 | } 150 | 151 | // 判断所连接的点的标号 152 | int tempId = graph.edgeNexts.get(gId1).get(i); 153 | if (graph.nodeLabels.get(tempId) != y) { 154 | continue; 155 | } 156 | 157 | gId2 = tempId; 158 | // 如果这两点是没有设置过的 159 | if (g2s[gId2] == -1 && s2g[e.iy] == -1) { 160 | g2s[gId2] = e.iy; 161 | s2g[e.iy] = gId2; 162 | f[gId1][gId2] = true; 163 | f[gId2][gId1] = true; 164 | dfsSearchEdge(currentPosition + 1); 165 | f[gId1][gId2] = false; 166 | f[gId2][gId1] = false; 167 | g2s[gId2] = -1; 168 | s2g[e.iy] = -1; 169 | } else { 170 | if (g2s[gId2] != e.iy) { 171 | continue; 172 | } 173 | if (s2g[e.iy] != gId2) { 174 | continue; 175 | } 176 | f[gId1][gId2] = true; 177 | f[gId2][gId1] = true; 178 | 
dfsSearchEdge(currentPosition); 179 | f[gId1][gId2] = false; 180 | f[gId2][gId1] = false; 181 | } 182 | } 183 | 184 | } 185 | 186 | /** 187 | * 获取结果数据对 188 | * 189 | * @return 190 | */ 191 | public ArrayList getResultChildEdge() { 192 | return this.childEdge; 193 | } 194 | 195 | } 196 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/integrated/cba/CBAExample.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.integrated.cba; 2 | 3 | import java.text.MessageFormat; 4 | 5 | /** 6 | * CBA算法--基于关联规则的分类算法 7 | */ 8 | public class CBAExample { 9 | 10 | public static void main(String[] args) { 11 | String filePath = "data/cba/input.txt"; 12 | String attrDesc = "Age=Senior,CreditRating=Fair"; 13 | String classification = null; 14 | 15 | //最小支持度阈值率 16 | double minSupportRate = 0.2; 17 | //最小置信度阈值 18 | double minConf = 0.7; 19 | 20 | CBACore tool = new CBACore(filePath, minSupportRate, minConf); 21 | classification = tool.CBAJudge(attrDesc); 22 | System.out.println(MessageFormat.format("{0}的关联分类结果为{1}", attrDesc, classification)); 23 | } 24 | 25 | } 26 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/integrated/cba/FrequentItem.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.integrated.cba; 2 | 3 | /** 4 | * 频繁项集 5 | */ 6 | public class FrequentItem implements Comparable { 7 | 8 | // 频繁项集的集合ID 9 | private String[] idArray; 10 | // 频繁项集的支持度计数 11 | private int count; 12 | //频繁项集的长度,1项集或是2项集,亦或是3项集 13 | private int length; 14 | 15 | public FrequentItem(String[] idArray, int count) { 16 | this.idArray = idArray; 17 | this.count = count; 18 | length = idArray.length; 19 | } 20 | 21 | public String[] getIdArray() { 22 | return idArray; 23 | } 24 | 25 | public void setIdArray(String[] idArray) { 26 | 
this.idArray = idArray; 27 | } 28 | 29 | public int getCount() { 30 | return count; 31 | } 32 | 33 | public void setCount(int count) { 34 | this.count = count; 35 | } 36 | 37 | public int getLength() { 38 | return length; 39 | } 40 | 41 | public void setLength(int length) { 42 | this.length = length; 43 | } 44 | 45 | @Override 46 | public int compareTo(FrequentItem o) { 47 | // TODO Auto-generated method stub 48 | Integer int1 = Integer.parseInt(this.getIdArray()[0]); 49 | Integer int2 = Integer.parseInt(o.getIdArray()[0]); 50 | 51 | return int1.compareTo(int2); 52 | } 53 | 54 | } 55 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/link/hits/HITSCore.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.link.hits; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileReader; 6 | import java.io.IOException; 7 | import java.util.ArrayList; 8 | 9 | /** 10 | * HITS链接分析算法工具类 11 | */ 12 | public class HITSCore { 13 | 14 | //输入数据文件地址 15 | private String filePath; 16 | //网页个数 17 | private int pageNum; 18 | //网页Authority权威值 19 | private double[] authority; 20 | //网页hub中心值 21 | private double[] hub; 22 | //链接矩阵关系 23 | private int[][] linkMatrix; 24 | //网页种类 25 | private ArrayList pageClass; 26 | 27 | public HITSCore(String filePath) { 28 | this.filePath = filePath; 29 | readDataFile(); 30 | } 31 | 32 | /** 33 | * 从文件中读取数据 34 | */ 35 | private void readDataFile() { 36 | File file = new File(filePath); 37 | ArrayList dataArray = new ArrayList(); 38 | 39 | try { 40 | BufferedReader in = new BufferedReader(new FileReader(file)); 41 | String str; 42 | String[] tempArray; 43 | while ((str = in.readLine()) != null) { 44 | tempArray = str.split(" "); 45 | dataArray.add(tempArray); 46 | } 47 | in.close(); 48 | } catch (IOException e) { 49 | e.getStackTrace(); 50 | } 51 | 52 | pageClass = new ArrayList<>(); 53 | // 
统计网页类型种数 54 | for (String[] array : dataArray) { 55 | for (String s : array) { 56 | if (!pageClass.contains(s)) { 57 | pageClass.add(s); 58 | } 59 | } 60 | } 61 | 62 | int i = 0; 63 | int j = 0; 64 | pageNum = pageClass.size(); 65 | linkMatrix = new int[pageNum][pageNum]; 66 | authority = new double[pageNum]; 67 | hub = new double[pageNum]; 68 | for (int k = 0; k < pageNum; k++) { 69 | //初始时默认权威值和中心值都为1 70 | authority[k] = 1; 71 | hub[k] = 1; 72 | } 73 | 74 | for (String[] array : dataArray) { 75 | 76 | i = Integer.parseInt(array[0]); 77 | j = Integer.parseInt(array[1]); 78 | 79 | // 设置linkMatrix[i][j]为1代表i网页包含指向j网页的链接 80 | linkMatrix[i - 1][j - 1] = 1; 81 | } 82 | } 83 | 84 | /** 85 | * 输出结果页面,也就是authority权威值最高的页面 86 | */ 87 | public void printResultPage() { 88 | //最大Hub和Authority值,用于后面的归一化计算 89 | double maxHub = 0; 90 | double maxAuthority = 0; 91 | int maxAuthorityIndex = 0; 92 | //误差值,用于收敛判断 93 | double error = Integer.MAX_VALUE; 94 | double[] newHub = new double[pageNum]; 95 | double[] newAuthority = new double[pageNum]; 96 | 97 | while (error > 0.01 * pageNum) { 98 | for (int k = 0; k < pageNum; k++) { 99 | newHub[k] = 0; 100 | newAuthority[k] = 0; 101 | } 102 | 103 | //hub和authority值的更新计算 104 | for (int i = 0; i < pageNum; i++) { 105 | for (int j = 0; j < pageNum; j++) { 106 | if (linkMatrix[i][j] == 1) { 107 | newHub[i] += authority[j]; 108 | newAuthority[j] += hub[i]; 109 | } 110 | } 111 | } 112 | 113 | maxHub = 0; 114 | maxAuthority = 0; 115 | for (int k = 0; k < pageNum; k++) { 116 | if (newHub[k] > maxHub) { 117 | maxHub = newHub[k]; 118 | } 119 | 120 | if (newAuthority[k] > maxAuthority) { 121 | maxAuthority = newAuthority[k]; 122 | maxAuthorityIndex = k; 123 | } 124 | } 125 | 126 | error = 0; 127 | //归一化处理 128 | for (int k = 0; k < pageNum; k++) { 129 | newHub[k] /= maxHub; 130 | newAuthority[k] /= maxAuthority; 131 | 132 | error += Math.abs(newHub[k] - hub[k]); 133 | System.out.println(newAuthority[k] + ":" + newHub[k]); 134 | 135 | hub[k] = 
newHub[k]; 136 | authority[k] = newAuthority[k]; 137 | } 138 | System.out.println("---------"); 139 | } 140 | 141 | System.out.println("****最终收敛的网页的权威值和中心值****"); 142 | for (int k = 0; k < pageNum; k++) { 143 | System.out.println("网页" + pageClass.get(k) + ":" + authority[k] + ":" + hub[k]); 144 | } 145 | System.out.println("权威值最高的网页为:网页" + pageClass.get(maxAuthorityIndex)); 146 | } 147 | 148 | } 149 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/link/hits/HITSExample.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.link.hits; 2 | 3 | /** 4 | * HITS链接分析算法 5 | */ 6 | public class HITSExample { 7 | 8 | public static void main(String[] args) { 9 | String filePath = "data/hits/input.txt"; 10 | 11 | HITSCore tool = new HITSCore(filePath); 12 | tool.printResultPage(); 13 | } 14 | 15 | } 16 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/link/pagerank/PageRankCore.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.link.pagerank; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileReader; 6 | import java.io.IOException; 7 | import java.text.MessageFormat; 8 | import java.util.ArrayList; 9 | 10 | /** 11 | * PageRank网页排名算法工具类 12 | */ 13 | public class PageRankCore { 14 | 15 | // 测试输入数据 16 | private String filePath; 17 | // 网页总数量 18 | private int pageNum; 19 | // 链接关系矩阵 20 | private double[][] linkMatrix; 21 | // 每个页面pageRank值初始向量 22 | private double[] pageRankVecor; 23 | 24 | // 网页数量分类 25 | ArrayList pageClass; 26 | 27 | public PageRankCore(String filePath) { 28 | this.filePath = filePath; 29 | readDataFile(); 30 | } 31 | 32 | /** 33 | * 从文件中读取数据 34 | */ 35 | private void readDataFile() { 36 | File file = new File(filePath); 37 | ArrayList dataArray = new 
ArrayList(); 38 | 39 | try { 40 | BufferedReader in = new BufferedReader(new FileReader(file)); 41 | String str; 42 | String[] tempArray; 43 | while ((str = in.readLine()) != null) { 44 | tempArray = str.split(" "); 45 | dataArray.add(tempArray); 46 | } 47 | in.close(); 48 | } catch (IOException e) { 49 | e.printStackTrace(); // FIX: was e.getStackTrace(), whose returned StackTraceElement[] was discarded — the IOException was silently swallowed with no diagnostic at all 50 | } 51 | 52 | pageClass = new ArrayList<>(); 53 | // 统计网页类型种数 54 | for (String[] array : dataArray) { 55 | for (String s : array) { 56 | if (!pageClass.contains(s)) { 57 | pageClass.add(s); 58 | } 59 | } 60 | } 61 | 62 | int i = 0; 63 | int j = 0; 64 | pageNum = pageClass.size(); 65 | linkMatrix = new double[pageNum][pageNum]; 66 | pageRankVecor = new double[pageNum]; 67 | for (int k = 0; k < pageNum; k++) { 68 | // 初始每个页面的pageRank值为1 69 | pageRankVecor[k] = 1.0; 70 | } 71 | for (String[] array : dataArray) { 72 | 73 | i = Integer.parseInt(array[0]); 74 | j = Integer.parseInt(array[1]); 75 | 76 | // 设置linkMatrix[i][j]为1代表i网页包含指向j网页的链接 77 | linkMatrix[i - 1][j - 1] = 1; 78 | } 79 | } 80 | 81 | /** 82 | * 将矩阵转置 83 | */ 84 | private void transferMatrix() { 85 | int count = 0; 86 | for (double[] array : linkMatrix) { 87 | // 计算页面链接个数 88 | count = 0; 89 | for (double d : array) { 90 | if (d == 1) { 91 | count++; 92 | } 93 | } 94 | // 按概率均分 95 | for (int i = 0; i < array.length; i++) { 96 | if (array[i] == 1) { 97 | array[i] /= count; 98 | } 99 | } 100 | } 101 | 102 | double t = 0; 103 | // 将矩阵转置换,作为概率转移矩阵 104 | for (int i = 0; i < linkMatrix.length; i++) { 105 | for (int j = i + 1; j < linkMatrix[0].length; j++) { 106 | t = linkMatrix[i][j]; 107 | linkMatrix[i][j] = linkMatrix[j][i]; 108 | linkMatrix[j][i] = t; 109 | } 110 | } 111 | } 112 | 113 | /** 114 | * 利用幂法计算pageRank值 115 | */ 116 | public void printPageRankValue() { 117 | transferMatrix(); 118 | // 阻尼系数 119 | double damp = 0.5; 120 | // 链接概率矩阵 121 | double[][] A = new double[pageNum][pageNum]; 122 | double[][] e = new double[pageNum][pageNum]; 123 | 124 | 
调用公式A=d*q+(1-d)*e/m,m为网页总个数,d就是damp 125 | double temp = (1 - damp) / pageNum; 126 | for (int i = 0; i < e.length; i++) { 127 | for (int j = 0; j < e[0].length; j++) { 128 | e[i][j] = temp; 129 | } 130 | } 131 | 132 | for (int i = 0; i < pageNum; i++) { 133 | for (int j = 0; j < pageNum; j++) { 134 | temp = damp * linkMatrix[i][j] + e[i][j]; 135 | A[i][j] = temp; 136 | 137 | } 138 | } 139 | 140 | // 误差值,作为判断收敛标准 141 | double errorValue = Integer.MAX_VALUE; 142 | double[] newPRVector = new double[pageNum]; 143 | // 当平均每个PR值误差小于0.001时就算达到收敛 144 | while (errorValue > 0.001 * pageNum) { 145 | System.out.println("**********"); 146 | for (int i = 0; i < pageNum; i++) { 147 | temp = 0; 148 | // 将A*pageRankVector,利用幂法求解,直到pageRankVector值收敛 149 | for (int j = 0; j < pageNum; j++) { 150 | // temp就是每个网页到i页面的pageRank值 151 | temp += A[i][j] * pageRankVecor[j]; 152 | } 153 | 154 | // 最后的temp就是i网页的总PageRank值 155 | newPRVector[i] = temp; 156 | System.out.println(temp); 157 | } 158 | 159 | errorValue = 0; 160 | for (int i = 0; i < pageNum; i++) { 161 | errorValue += Math.abs(pageRankVecor[i] - newPRVector[i]); 162 | // 新的向量代替旧的向量 163 | pageRankVecor[i] = newPRVector[i]; 164 | } 165 | } 166 | 167 | String name = null; 168 | temp = 0; 169 | System.out.println("--------------------"); 170 | for (int i = 0; i < pageNum; i++) { 171 | System.out.println(MessageFormat.format("网页{0}的pageRank值:{1}", pageClass.get(i), pageRankVecor[i])); 172 | if (pageRankVecor[i] > temp) { 173 | temp = pageRankVecor[i]; 174 | name = pageClass.get(i); 175 | } 176 | } 177 | System.out.println(MessageFormat.format("等级最高的网页为:{0}", name)); 178 | } 179 | 180 | } 181 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/link/pagerank/PageRankExample.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.link.pagerank; 2 | 3 | /** 4 | * PageRank计算网页重要性/排名算法 5 | */ 6 | public class 
PageRankExample { 7 | 8 | public static void main(String[] args) { 9 | String filePath = "data/pagerank/input.txt"; 10 | 11 | PageRankCore tool = new PageRankCore(filePath); 12 | tool.printPageRankValue(); 13 | } 14 | 15 | } 16 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/others/aco/ACOExample.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.others.aco; 2 | 3 | /** 4 | * 蚁群算法测试类 5 | */ 6 | public class ACOExample { 7 | 8 | public static void main(String[] args) { 9 | //测试数据 10 | String filePath = "data/aco/input.txt"; 11 | //蚂蚁数量 12 | int antNum; 13 | //蚁群算法迭代次数 14 | int loopCount; 15 | //控制参数 16 | double alpha; 17 | double beita; 18 | double p; 19 | double Q; 20 | 21 | antNum = 3; 22 | alpha = 0.5; 23 | beita = 1; 24 | p = 0.5; 25 | Q = 5; 26 | loopCount = 5; 27 | 28 | ACOCore tool = new ACOCore(filePath, antNum, alpha, beita, p, Q); 29 | tool.antStartSearching(loopCount); 30 | } 31 | 32 | } 33 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/others/aco/Ant.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.others.aco; 2 | 3 | import java.util.ArrayList; 4 | 5 | /** 6 | * 蚂蚁类,进行路径搜索的载体 7 | */ 8 | public class Ant implements Comparable { 9 | 10 | // 蚂蚁当前所在城市 11 | String currentPos; 12 | // 蚂蚁遍历完回到原点所用的总距离 13 | Double sumDistance; 14 | // 城市间的信息素浓度矩阵,随着时间的增多而减少 15 | double[][] pheromoneMatrix; 16 | // 蚂蚁已经走过的城市集合 17 | ArrayList visitedCitys; 18 | // 还未走过的城市集合 19 | ArrayList nonVisitedCitys; 20 | // 蚂蚁当前走过的路径 21 | ArrayList currentPath; 22 | 23 | public Ant(double[][] pheromoneMatrix, ArrayList nonVisitedCitys) { 24 | this.pheromoneMatrix = pheromoneMatrix; 25 | this.nonVisitedCitys = nonVisitedCitys; 26 | 27 | this.visitedCitys = new ArrayList<>(); 28 | this.currentPath = new 
ArrayList<>(); 29 | } 30 | 31 | /** 32 | * 计算路径的总成本(距离) 33 | * 34 | * @return 35 | */ 36 | public double calSumDistance() { 37 | sumDistance = 0.0; 38 | String lastCity; 39 | String currentCity; 40 | 41 | for (int i = 0; i < currentPath.size() - 1; i++) { 42 | lastCity = currentPath.get(i); 43 | currentCity = currentPath.get(i + 1); 44 | 45 | // 通过距离矩阵进行计算 46 | sumDistance += ACOCore.disMatrix[Integer.parseInt(lastCity)][Integer.parseInt(currentCity)]; 47 | } 48 | 49 | return sumDistance; 50 | } 51 | 52 | /** 53 | * 蚂蚁选择前往下一个城市 54 | * 55 | * @param city 56 | * 所选的城市 57 | */ 58 | public void goToNextCity(String city) { 59 | this.currentPath.add(city); 60 | this.currentPos = city; 61 | this.nonVisitedCitys.remove(city); 62 | this.visitedCitys.add(city); 63 | } 64 | 65 | /** 66 | * 判断蚂蚁是否已经又重新回到起点 67 | * 68 | * @return 69 | */ 70 | public boolean isBack() { 71 | boolean isBack = false; 72 | String startPos; 73 | String endPos; 74 | 75 | if (currentPath.size() == 0) { 76 | return isBack; 77 | } 78 | 79 | startPos = currentPath.get(0); 80 | endPos = currentPath.get(currentPath.size() - 1); 81 | if (currentPath.size() > 1 && startPos.equals(endPos)) { 82 | isBack = true; 83 | } 84 | 85 | return isBack; 86 | } 87 | 88 | /** 89 | * 判断蚂蚁在本次的走过的路径中是否包含从城市i到城市j 90 | * 91 | * @param cityI 92 | * 城市I 93 | * @param cityJ 94 | * 城市J 95 | * @return 96 | */ 97 | public boolean pathContained(String cityI, String cityJ) { 98 | String lastCity; 99 | String currentCity; 100 | boolean isContained = false; 101 | 102 | for (int i = 0; i < currentPath.size() - 1; i++) { 103 | lastCity = currentPath.get(i); 104 | currentCity = currentPath.get(i + 1); 105 | 106 | // 如果某一段路径的始末位置一致,则认为有经过此城市 107 | if ((lastCity.equals(cityI) && currentCity.equals(cityJ)) 108 | || (lastCity.equals(cityJ) && currentCity.equals(cityI))) { 109 | isContained = true; 110 | break; 111 | } 112 | } 113 | 114 | return isContained; 115 | } 116 | 117 | @Override 118 | public int compareTo(Ant o) { 119 | // TODO 
Auto-generated method stub 120 | return this.sumDistance.compareTo(o.sumDistance); 121 | } 122 | } 123 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/others/bayesnetwork/BayesNetWorkExample.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.others.bayesnetwork; 2 | 3 | /** 4 | * 贝叶斯网络场景测试类 5 | */ 6 | public class BayesNetWorkExample { 7 | 8 | public static void main(String[] args) { 9 | String dataFilePath = "data/bayesnetwork/input.txt"; 10 | String attachFilePath = "data/bayesnetwork/attach.txt"; 11 | // 查询串语句 12 | String queryStr; 13 | // 结果概率 14 | double result; 15 | 16 | // 查询语句的描述的事件是地震发生了,导致响铃响了,导致接到Mary的电话 17 | queryStr = "E=y,A=y,M=y"; 18 | BayesNetWorkCore tool = new BayesNetWorkCore(dataFilePath, attachFilePath); 19 | result = tool.calProByNetWork(queryStr); 20 | 21 | if (result == -1) { 22 | System.out.println("所描述的事件不满足贝叶斯网络的结构,无法求其概率"); 23 | } else { 24 | System.out.println(String.format("事件%s发生的概率为%s", queryStr, result)); 25 | } 26 | } 27 | 28 | } 29 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/others/bayesnetwork/Node.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.others.bayesnetwork; 2 | 3 | import java.util.ArrayList; 4 | 5 | /** 6 | * 贝叶斯网络节点类 7 | */ 8 | public class Node { 9 | 10 | // 节点的属性名称 11 | String name; 12 | // 节点的父亲节点,也就是上游节点,可能多个 13 | ArrayList parentNodes; 14 | // 节点的子节点,也就是下游节点,可能多个 15 | ArrayList childNodes; 16 | 17 | public Node(String name) { 18 | this.name = name; 19 | 20 | // 初始化变量 21 | this.parentNodes = new ArrayList<>(); 22 | this.childNodes = new ArrayList<>(); 23 | } 24 | 25 | /** 26 | * 将自身节点连接到目标给定的节点 27 | * 28 | * @param node 29 | * 下游节点 30 | */ 31 | public void connectNode(Node node) { 32 | // 将下游节点加入自身节点的孩子节点中 33 | 
this.childNodes.add(node); 34 | // 将自身节点加入到下游节点的父节点中 35 | node.parentNodes.add(this); 36 | } 37 | 38 | /** 39 | * 判断与目标节点是否相同,主要比较名称是否相同即可 40 | * 41 | * @param node 42 | * 目标结点 43 | * @return 44 | */ 45 | public boolean isEqual(Node node) { 46 | boolean isEqual; 47 | 48 | isEqual = false; 49 | // 节点名称相同则视为相等 50 | if (this.name.equals(node.name)) { 51 | isEqual = true; 52 | } 53 | 54 | return isEqual; 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/others/cabddcc/CABDDCCCore.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.others.cabddcc; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileReader; 6 | import java.io.IOException; 7 | import java.text.MessageFormat; 8 | import java.util.ArrayList; 9 | 10 | /** 11 | * 基于连通图的分裂聚类算法 12 | */ 13 | public class CABDDCCCore { 14 | 15 | // 测试数据点数据 16 | private String filePath; 17 | // 连通图距离阈值l 18 | private int length; 19 | // 原始坐标点 20 | public static ArrayList totalPoints; 21 | // 聚类结果坐标点集合 22 | private ArrayList> resultClusters; 23 | // 连通图 24 | private Graph graph; 25 | 26 | public CABDDCCCore(String filePath, int length) { 27 | this.filePath = filePath; 28 | this.length = length; 29 | 30 | readDataFile(); 31 | } 32 | 33 | /** 34 | * 从文件中读取数据 35 | */ 36 | public void readDataFile() { 37 | File file = new File(filePath); 38 | ArrayList dataArray = new ArrayList(); 39 | 40 | try { 41 | BufferedReader in = new BufferedReader(new FileReader(file)); 42 | String str; 43 | String[] tempArray; 44 | while ((str = in.readLine()) != null) { 45 | tempArray = str.split(" "); 46 | dataArray.add(tempArray); 47 | } 48 | in.close(); 49 | } catch (IOException e) { 50 | e.getStackTrace(); 51 | } 52 | 53 | Point p; 54 | totalPoints = new ArrayList<>(); 55 | for (String[] array : dataArray) { 56 | p = new Point(array[0], array[1], array[2]); 57 | 
totalPoints.add(p); 58 | } 59 | 60 | // 用边和点构造图 61 | graph = new Graph(null, totalPoints); 62 | } 63 | 64 | /** 65 | * 分裂连通图得到聚类 66 | */ 67 | public void splitCluster() { 68 | // 获取形成连通子图 69 | ArrayList subGraphs; 70 | ArrayList> pointList; 71 | resultClusters = new ArrayList<>(); 72 | 73 | subGraphs = graph.splitGraphByLength(length); 74 | 75 | for (Graph g : subGraphs) { 76 | // 获取每个连通子图分裂后的聚类结果 77 | pointList = g.getClusterByDivding(); 78 | resultClusters.addAll(pointList); 79 | } 80 | 81 | printResultCluster(); 82 | } 83 | 84 | /** 85 | * 输出结果聚簇 86 | */ 87 | private void printResultCluster() { 88 | int i = 1; 89 | for (ArrayList cluster : resultClusters) { 90 | System.out.print("聚簇" + i + ":"); 91 | for (Point p : cluster) { 92 | System.out.print(MessageFormat.format("({0}, {1}) ", p.x, p.y)); 93 | } 94 | System.out.println(); 95 | i++; 96 | } 97 | 98 | } 99 | 100 | } 101 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/others/cabddcc/CABDDCCExample.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.others.cabddcc; 2 | 3 | /** 4 | * 基于连通图的分裂聚类算法 5 | */ 6 | public class CABDDCCExample { 7 | 8 | public static void main(String[] agrs) { 9 | String filePath = "data/cabddcc/graphData.txt"; 10 | //连通距离阈值 11 | int length = 3; 12 | 13 | CABDDCCCore tool = new CABDDCCCore(filePath, length); 14 | tool.splitCluster(); 15 | } 16 | 17 | } 18 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/others/cabddcc/Point.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.others.cabddcc; 2 | 3 | /** 4 | * 坐标点类 5 | */ 6 | public class Point implements Comparable { 7 | 8 | //坐标点id号,id号唯一 9 | int id; 10 | //坐标横坐标 11 | Integer x; 12 | //坐标纵坐标 13 | Integer y; 14 | //坐标点是否已经被访问(处理)过,在生成连通子图的时候用到 15 | boolean isVisited; 
16 | 17 | public Point(String id, String x, String y) { 18 | this.id = Integer.parseInt(id); 19 | this.x = Integer.parseInt(x); 20 | this.y = Integer.parseInt(y); 21 | } 22 | 23 | /** 24 | * 计算当前点与制定点之间的欧式距离 25 | * 26 | * @param p 27 | * 待计算聚类的p点 28 | * @return 29 | */ 30 | public double ouDistance(Point p) { 31 | double distance = 0; 32 | 33 | distance = (this.x - p.x) * (this.x - p.x) + (this.y - p.y) * (this.y - p.y); 34 | distance = Math.sqrt(distance); 35 | 36 | return distance; 37 | } 38 | 39 | /** 40 | * 判断2个坐标点是否为用个坐标点 41 | * 42 | * @param p 43 | * 待比较坐标点 44 | * @return 45 | */ 46 | public boolean isTheSame(Point p) { 47 | boolean isSamed = false; 48 | 49 | if (this.x == p.x && this.y == p.y) { 50 | isSamed = true; 51 | } 52 | 53 | return isSamed; 54 | } 55 | 56 | @Override 57 | public int compareTo(Point p) { 58 | if (this.x.compareTo(p.x) != 0) { 59 | return this.x.compareTo(p.x); 60 | } else { 61 | //如果在x坐标相等的情况下比较y坐标 62 | return this.y.compareTo(p.y); 63 | } 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/others/chameleon/ChameleonExample.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.others.chameleon; 2 | 3 | /** 4 | * Chameleon(变色龙)两阶段聚类算法 5 | */ 6 | public class ChameleonExample { 7 | 8 | public static void main(String[] args) { 9 | String filePath = "data/chameleon/graphData.txt"; 10 | //k-近邻的k设置 11 | int k = 1; 12 | //度量函数阈值 13 | double minMetric = 0.1; 14 | 15 | ChameleonCore tool = new ChameleonCore(filePath, k, minMetric); 16 | tool.buildCluster(); 17 | } 18 | 19 | } 20 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/others/chameleon/Cluster.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.others.chameleon; 2 | 3 | import java.util.ArrayList; 4 | 5 
| /** 6 | * 聚簇类 7 | */ 8 | public class Cluster implements Cloneable { 9 | 10 | //簇唯一id标识号 11 | int id; 12 | // 聚簇内的坐标点集合 13 | ArrayList points; 14 | // 聚簇内的所有边的权重和 15 | double weightSum = 0; 16 | 17 | public Cluster(int id, ArrayList points) { 18 | this.id = id; 19 | this.points = points; 20 | } 21 | 22 | /** 23 | * 计算聚簇的内部的边权重和 24 | * 25 | * @return 26 | */ 27 | public double calEC() { 28 | int id1 = 0; 29 | int id2 = 0; 30 | weightSum = 0; 31 | 32 | for (Point p1 : points) { 33 | for (Point p2 : points) { 34 | id1 = p1.id; 35 | id2 = p2.id; 36 | 37 | // 为了避免重复计算,取id1小的对应大的 38 | if (id1 < id2 && ChameleonCore.edges[id1][id2] == 1) { 39 | weightSum += ChameleonCore.weights[id1][id2]; 40 | } 41 | } 42 | } 43 | 44 | return weightSum; 45 | } 46 | 47 | /** 48 | * 计算2个簇之间最近的n条边 49 | * 50 | * @param otherCluster 51 | * 待比较的簇 52 | * @param n 53 | * 最近的边的数目 54 | * @return 55 | */ 56 | public ArrayList calNearestEdge(Cluster otherCluster, int n) { 57 | int count = 0; 58 | double distance = 0; 59 | double minDistance = Integer.MAX_VALUE; 60 | Point point1 = null; 61 | Point point2 = null; 62 | ArrayList edgeList = new ArrayList<>(); 63 | ArrayList pointList1 = (ArrayList) points.clone(); 64 | ArrayList pointList2 = null; 65 | Cluster c2 = null; 66 | 67 | try { 68 | c2 = (Cluster) otherCluster.clone(); 69 | pointList2 = c2.points; 70 | } catch (CloneNotSupportedException e) { 71 | // TODO Auto-generated catch block 72 | e.printStackTrace(); 73 | } 74 | 75 | int[] tempEdge; 76 | // 循环计算出每次的最近距离 77 | while (count < n) { 78 | tempEdge = new int[2]; 79 | minDistance = Integer.MAX_VALUE; 80 | 81 | for (Point p1 : pointList1) { 82 | for (Point p2 : pointList2) { 83 | distance = p1.ouDistance(p2); 84 | if (distance < minDistance) { 85 | point1 = p1; 86 | point2 = p2; 87 | tempEdge[0] = p1.id; 88 | tempEdge[1] = p2.id; 89 | 90 | minDistance = distance; 91 | } 92 | } 93 | } 94 | 95 | pointList1.remove(point1); 96 | pointList2.remove(point2); 97 | edgeList.add(tempEdge); 98 | count++; 
99 | } 100 | 101 | return edgeList; 102 | } 103 | 104 | @Override 105 | protected Object clone() throws CloneNotSupportedException { 106 | // TODO Auto-generated method stub 107 | 108 | //引用需要再次复制,实现深拷贝 109 | ArrayList pointList = (ArrayList) this.points.clone(); 110 | Cluster cluster = new Cluster(id, pointList); 111 | 112 | return cluster; 113 | } 114 | 115 | } 116 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/others/chameleon/Point.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.others.chameleon; 2 | 3 | /** 4 | * 坐标点类 5 | */ 6 | public class Point { 7 | 8 | //坐标点id号,id号唯一 9 | int id; 10 | //坐标横坐标 11 | Integer x; 12 | //坐标纵坐标 13 | Integer y; 14 | //是否已经被访问过 15 | boolean isVisited; 16 | 17 | public Point(String id, String x, String y) { 18 | this.id = Integer.parseInt(id); 19 | this.x = Integer.parseInt(x); 20 | this.y = Integer.parseInt(y); 21 | } 22 | 23 | /** 24 | * 计算当前点与制定点之间的欧式距离 25 | * 26 | * @param p 27 | * 待计算聚类的p点 28 | * @return 29 | */ 30 | public double ouDistance(Point p) { 31 | double distance = 0; 32 | 33 | distance = (this.x - p.x) * (this.x - p.x) + (this.y - p.y) * (this.y - p.y); 34 | distance = Math.sqrt(distance); 35 | 36 | return distance; 37 | } 38 | 39 | /** 40 | * 判断2个坐标点是否为用个坐标点 41 | * 42 | * @param p 43 | * 待比较坐标点 44 | * @return 45 | */ 46 | public boolean isTheSame(Point p) { 47 | boolean isSamed = false; 48 | 49 | if (this.x == p.x && this.y == p.y) { 50 | isSamed = true; 51 | } 52 | 53 | return isSamed; 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/others/dbscan/DBSCANCore.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.others.dbscan; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileReader; 6 | 
import java.io.IOException; 7 | import java.text.MessageFormat; 8 | import java.util.ArrayList; 9 | 10 | /** 11 | * DBSCAN基于密度聚类算法工具类 12 | */ 13 | public class DBSCANCore { 14 | 15 | // 测试数据文件地址 16 | private String filePath; 17 | // 簇扫描半径 18 | private double eps; 19 | // 最小包含点数阈值 20 | private int minPts; 21 | // 所有的数据坐标点 22 | private ArrayList totalPoints; 23 | // 聚簇结果 24 | private ArrayList> resultClusters; 25 | //噪声数据 26 | private ArrayList noisePoint; 27 | 28 | public DBSCANCore(String filePath, double eps, int minPts) { 29 | this.filePath = filePath; 30 | this.eps = eps; 31 | this.minPts = minPts; 32 | readDataFile(); 33 | } 34 | 35 | /** 36 | * 从文件中读取数据 37 | */ 38 | public void readDataFile() { 39 | File file = new File(filePath); 40 | ArrayList dataArray = new ArrayList(); 41 | 42 | try { 43 | BufferedReader in = new BufferedReader(new FileReader(file)); 44 | String str; 45 | String[] tempArray; 46 | while ((str = in.readLine()) != null) { 47 | tempArray = str.split(" "); 48 | dataArray.add(tempArray); 49 | } 50 | in.close(); 51 | } catch (IOException e) { 52 | e.getStackTrace(); 53 | } 54 | 55 | Point p; 56 | totalPoints = new ArrayList<>(); 57 | for (String[] array : dataArray) { 58 | p = new Point(array[0], array[1]); 59 | totalPoints.add(p); 60 | } 61 | } 62 | 63 | /** 64 | * 递归的寻找聚簇 65 | * 66 | * @param pointList 67 | * 当前的点列表 68 | * @param parentCluster 69 | * 父聚簇 70 | */ 71 | private void recursiveCluster(Point point, ArrayList parentCluster) { 72 | double distance = 0; 73 | ArrayList cluster; 74 | 75 | // 如果已经访问过了,则跳过 76 | if (point.isVisited) { 77 | return; 78 | } 79 | 80 | point.isVisited = true; 81 | cluster = new ArrayList<>(); 82 | for (Point p2 : totalPoints) { 83 | // 过滤掉自身的坐标点 84 | if (point.isTheSame(p2)) { 85 | continue; 86 | } 87 | 88 | distance = point.ouDistance(p2); 89 | if (distance <= eps) { 90 | // 如果聚类小于给定的半径,则加入簇中 91 | cluster.add(p2); 92 | } 93 | } 94 | 95 | if (cluster.size() >= minPts) { 96 | // 将自己也加入到聚簇中 97 | 
cluster.add(point); 98 | // 如果附近的节点个数超过最下值,则加入到父聚簇中,同时去除重复的点 99 | addCluster(parentCluster, cluster); 100 | 101 | for (Point p : cluster) { 102 | recursiveCluster(p, parentCluster); 103 | } 104 | } 105 | } 106 | 107 | /** 108 | * 往父聚簇中添加局部簇坐标点 109 | * 110 | * @param parentCluster 111 | * 原始父聚簇坐标点 112 | * @param cluster 113 | * 待合并的聚簇 114 | */ 115 | private void addCluster(ArrayList parentCluster, ArrayList cluster) { 116 | boolean isCotained = false; 117 | ArrayList addPoints = new ArrayList<>(); 118 | 119 | for (Point p : cluster) { 120 | isCotained = false; 121 | for (Point p2 : parentCluster) { 122 | if (p.isTheSame(p2)) { 123 | isCotained = true; 124 | break; 125 | } 126 | } 127 | 128 | if (!isCotained) { 129 | addPoints.add(p); 130 | } 131 | } 132 | 133 | parentCluster.addAll(addPoints); 134 | } 135 | 136 | /** 137 | * dbScan算法基于密度的聚类 138 | */ 139 | public void dbScanCluster() { 140 | ArrayList cluster = null; 141 | resultClusters = new ArrayList<>(); 142 | noisePoint = new ArrayList<>(); 143 | 144 | for (Point p : totalPoints) { 145 | if (p.isVisited) { 146 | continue; 147 | } 148 | 149 | cluster = new ArrayList<>(); 150 | recursiveCluster(p, cluster); 151 | 152 | if (cluster.size() > 0) { 153 | resultClusters.add(cluster); 154 | } else { 155 | noisePoint.add(p); 156 | } 157 | } 158 | removeFalseNoise(); 159 | 160 | printClusters(); 161 | } 162 | 163 | /** 164 | * 移除被错误分类的噪声点数据 165 | */ 166 | private void removeFalseNoise() { 167 | ArrayList totalCluster = new ArrayList<>(); 168 | ArrayList deletePoints = new ArrayList<>(); 169 | 170 | //将聚簇合并 171 | for (ArrayList list : resultClusters) { 172 | totalCluster.addAll(list); 173 | } 174 | 175 | for (Point p : noisePoint) { 176 | for (Point p2 : totalCluster) { 177 | if (p2.isTheSame(p)) { 178 | deletePoints.add(p); 179 | } 180 | } 181 | } 182 | 183 | noisePoint.removeAll(deletePoints); 184 | } 185 | 186 | /** 187 | * 输出聚类结果 188 | */ 189 | private void printClusters() { 190 | int i = 1; 191 | for (ArrayList pList 
: resultClusters) { 192 | System.out.print("聚簇" + (i++) + ":"); 193 | for (Point p : pList) { 194 | System.out.print(MessageFormat.format("({0},{1}) ", p.x, p.y)); 195 | } 196 | System.out.println(); 197 | } 198 | 199 | System.out.println(); 200 | System.out.print("噪声数据:"); 201 | for (Point p : noisePoint) { 202 | System.out.print(MessageFormat.format("({0},{1}) ", p.x, p.y)); 203 | } 204 | System.out.println(); 205 | } 206 | } 207 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/others/dbscan/DBSCANExample.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.others.dbscan; 2 | 3 | /** 4 | * Dbscan基于密度的聚类算法测试类 5 | */ 6 | public class DBSCANExample { 7 | 8 | public static void main(String[] args) { 9 | String filePath = "data/dbscan/input.txt"; 10 | //簇扫描半径 11 | double eps = 3; 12 | //最小包含点数阈值 13 | int minPts = 3; 14 | 15 | DBSCANCore tool = new DBSCANCore(filePath, eps, minPts); 16 | tool.dbScanCluster(); 17 | } 18 | 19 | } 20 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/others/dbscan/Point.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.others.dbscan; 2 | 3 | /** 4 | * 坐标点类 5 | */ 6 | public class Point { 7 | 8 | // 坐标点横坐标 9 | int x; 10 | // 坐标点纵坐标 11 | int y; 12 | // 此节点是否已经被访问过 13 | boolean isVisited; 14 | 15 | public Point(String x, String y) { 16 | this.x = (Integer.parseInt(x)); 17 | this.y = (Integer.parseInt(y)); 18 | this.isVisited = false; 19 | } 20 | 21 | /** 22 | * 计算当前点与制定点之间的欧式距离 23 | * 24 | * @param p 25 | * 待计算聚类的p点 26 | * @return 27 | */ 28 | public double ouDistance(Point p) { 29 | double distance = 0; 30 | 31 | distance = (this.x - p.x) * (this.x - p.x) + (this.y - p.y) * (this.y - p.y); 32 | distance = Math.sqrt(distance); 33 | 34 | return distance; 35 | } 36 | 37 | 
/** 38 | * 判断2个坐标点是否为用个坐标点 39 | * 40 | * @param p 41 | * 待比较坐标点 42 | * @return 43 | */ 44 | public boolean isTheSame(Point p) { 45 | boolean isSamed = false; 46 | 47 | if (this.x == p.x && this.y == p.y) { 48 | isSamed = true; 49 | } 50 | 51 | return isSamed; 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/others/ga/GAExample.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.others.ga; 2 | 3 | /** 4 | * Genetic遗传算法测试类 5 | */ 6 | public class GAExample { 7 | 8 | public static void main(String[] args) { 9 | //变量最小值和最大值 10 | int minNum = 1; 11 | int maxNum = 7; 12 | //初始群体规模 13 | int initSetsNum = 4; 14 | 15 | GACore tool = new GACore(minNum, maxNum, initSetsNum); 16 | tool.geneticCal(); 17 | } 18 | 19 | } 20 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/others/ga/maze/GAMazeExample.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.others.ga.maze; 2 | 3 | /** 4 | * 遗传算法在走迷宫游戏的应用 5 | */ 6 | public class GAMazeExample { 7 | 8 | public static void main(String[] args) { 9 | //迷宫地图文件数据地址 10 | String filePath = "data/maze/mapData.txt"; 11 | //初始个体数量 12 | int initSetsNum = 10; 13 | 14 | GAMazeCore tool = new GAMazeCore(filePath, initSetsNum); 15 | tool.goOutMaze(); 16 | } 17 | 18 | } 19 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/others/kdtree/KDTreeExample.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.others.kdtree; 2 | 3 | import java.text.MessageFormat; 4 | 5 | /** 6 | * KD树算法测试类 7 | */ 8 | public class KDTreeExample { 9 | 10 | public static void main(String[] args) { 11 | String filePath = "data/kdtree/input.txt"; 12 | Point 
queryNode; 13 | Point searchedNode; 14 | KDTreeCore tool = new KDTreeCore(filePath); 15 | 16 | // 进行KD树的构建 17 | tool.createKDTree(); 18 | 19 | // 通过KD树进行数据点的最近点查询 20 | queryNode = new Point(2.1, 3.1); 21 | searchedNode = tool.searchNearestData(queryNode); 22 | System.out.println(MessageFormat.format("距离查询点({0}, {1})最近的坐标点为({2}, {3})", queryNode.x, queryNode.y, 23 | searchedNode.x, searchedNode.y)); 24 | 25 | //重新构造KD树,去除之前的访问记录 26 | tool.createKDTree(); 27 | queryNode = new Point(2, 4.5); 28 | searchedNode = tool.searchNearestData(queryNode); 29 | System.out.println(MessageFormat.format("距离查询点({0}, {1})最近的坐标点为({2}, {3})", queryNode.x, queryNode.y, 30 | searchedNode.x, searchedNode.y)); 31 | } 32 | 33 | } 34 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/others/kdtree/Point.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.others.kdtree; 2 | 3 | /** 4 | * 坐标点类 5 | */ 6 | public class Point { 7 | 8 | // 坐标点横坐标 9 | Double x; 10 | // 坐标点纵坐标 11 | Double y; 12 | 13 | public Point(double x, double y) { 14 | this.x = x; 15 | this.y = y; 16 | } 17 | 18 | public Point(String x, String y) { 19 | this.x = (Double.parseDouble(x)); 20 | this.y = (Double.parseDouble(y)); 21 | } 22 | 23 | /** 24 | * 计算当前点与制定点之间的欧式距离 25 | * 26 | * @param p 27 | * 待计算聚类的p点 28 | * @return 29 | */ 30 | public double ouDistance(Point p) { 31 | double distance = 0; 32 | 33 | distance = (this.x - p.x) * (this.x - p.x) + (this.y - p.y) * (this.y - p.y); 34 | distance = Math.sqrt(distance); 35 | 36 | return distance; 37 | } 38 | 39 | /** 40 | * 判断2个坐标点是否为用个坐标点 41 | * 42 | * @param p 43 | * 待比较坐标点 44 | * @return 45 | */ 46 | public boolean isTheSame(Point p) { 47 | boolean isSamed = false; 48 | 49 | if (this.x == p.x && this.y == p.y) { 50 | isSamed = true; 51 | } 52 | 53 | return isSamed; 54 | } 55 | } 56 | 
-------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/others/kdtree/Range.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.others.kdtree; 2 | 3 | /** 4 | * 空间矢量,表示所代表的空间范围 5 | */ 6 | public class Range { 7 | 8 | // 边界左边界 9 | double left; 10 | // 边界右边界 11 | double right; 12 | // 边界上边界 13 | double top; 14 | // 边界下边界 15 | double bottom; 16 | 17 | public Range() { 18 | this.left = -Integer.MAX_VALUE; 19 | this.right = Integer.MAX_VALUE; 20 | this.top = Integer.MAX_VALUE; 21 | this.bottom = -Integer.MAX_VALUE; 22 | } 23 | 24 | public Range(int left, int right, int top, int bottom) { 25 | this.left = left; 26 | this.right = right; 27 | this.top = top; 28 | this.bottom = bottom; 29 | } 30 | 31 | /** 32 | * 空间矢量进行并操作 33 | * 34 | * @param range 35 | * @return 36 | */ 37 | public Range crossOperation(Range r) { 38 | Range range = new Range(); 39 | 40 | // 取靠近右侧的左边界 41 | if (r.left > this.left) { 42 | range.left = r.left; 43 | } else { 44 | range.left = this.left; 45 | } 46 | 47 | // 取靠近左侧的右边界 48 | if (r.right < this.right) { 49 | range.right = r.right; 50 | } else { 51 | range.right = this.right; 52 | } 53 | 54 | // 取靠近下侧的上边界 55 | if (r.top < this.top) { 56 | range.top = r.top; 57 | } else { 58 | range.top = this.top; 59 | } 60 | 61 | // 取靠近上侧的下边界 62 | if (r.bottom > this.bottom) { 63 | range.bottom = r.bottom; 64 | } else { 65 | range.bottom = this.bottom; 66 | } 67 | 68 | return range; 69 | } 70 | 71 | /** 72 | * 根据坐标点分割方向确定左侧空间矢量 73 | * 74 | * @param p 75 | * 数据矢量 76 | * @param dir 77 | * 分割方向 78 | * @return 79 | */ 80 | public static Range initLeftRange(Point p, int dir) { 81 | Range range = new Range(); 82 | 83 | if (dir == KDTreeCore.DIRECTION_X) { 84 | range.right = p.x; 85 | } else { 86 | range.bottom = p.y; 87 | } 88 | 89 | return range; 90 | } 91 | 92 | /** 93 | * 根据坐标点分割方向确定右侧空间矢量 94 | * 95 | * @param p 96 | * 数据矢量 97 | * @param dir 
98 | * 分割方向 99 | * @return 100 | */ 101 | public static Range initRightRange(Point p, int dir) { 102 | Range range = new Range(); 103 | 104 | if (dir == KDTreeCore.DIRECTION_X) { 105 | range.left = p.x; 106 | } else { 107 | range.top = p.y; 108 | } 109 | 110 | return range; 111 | } 112 | } 113 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/others/kdtree/TreeNode.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.others.kdtree; 2 | 3 | /** 4 | * KD树节点 5 | */ 6 | public class TreeNode { 7 | 8 | //数据矢量 9 | Point nodeData; 10 | //分割平面的分割线 11 | int spilt; 12 | //空间矢量,该节点所表示的空间范围 13 | Range range; 14 | //父节点 15 | TreeNode parentNode; 16 | //位于分割超平面左侧的孩子节点 17 | TreeNode leftNode; 18 | //位于分割超平面右侧的孩子节点 19 | TreeNode rightNode; 20 | //节点是否被访问过,用于回溯时使用 21 | boolean isVisited; 22 | 23 | public TreeNode() { 24 | this.isVisited = false; 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/others/msapriori/FrequentItem.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.others.msapriori; 2 | 3 | /** 4 | * 频繁项集 5 | */ 6 | public class FrequentItem implements Comparable { 7 | 8 | // 频繁项集的集合ID 9 | private String[] idArray; 10 | // 频繁项集的支持度计数 11 | private int count; 12 | //频繁项集的长度,1项集或是2项集,亦或是3项集 13 | private int length; 14 | 15 | public FrequentItem(String[] idArray, int count) { 16 | this.idArray = idArray; 17 | this.count = count; 18 | length = idArray.length; 19 | } 20 | 21 | public String[] getIdArray() { 22 | return idArray; 23 | } 24 | 25 | public void setIdArray(String[] idArray) { 26 | this.idArray = idArray; 27 | } 28 | 29 | public int getCount() { 30 | return count; 31 | } 32 | 33 | public void setCount(int count) { 34 | this.count = count; 35 | } 36 | 37 | public int getLength() 
{ 38 | return length; 39 | } 40 | 41 | public void setLength(int length) { 42 | this.length = length; 43 | } 44 | 45 | @Override 46 | public int compareTo(FrequentItem o) { 47 | // TODO Auto-generated method stub 48 | Integer int1 = Integer.parseInt(this.getIdArray()[0]); 49 | Integer int2 = Integer.parseInt(o.getIdArray()[0]); 50 | 51 | return int1.compareTo(int2); 52 | } 53 | 54 | } 55 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/others/msapriori/MSAprioriExample.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.others.msapriori; 2 | 3 | /** 4 | * 基于多支持度的Apriori算法测试类 5 | */ 6 | public class MSAprioriExample { 7 | 8 | public static void main(String[] args) { 9 | //是否是事务型数据 10 | boolean isTransaction; 11 | //测试数据文件地址 12 | String filePath = "data/msapriori/testInput.txt"; 13 | //关系表型数据文件地址 14 | String tableFilePath = "data/msapriori/testInput2.txt"; 15 | //最小支持度阈值 16 | double minSup; 17 | // 最小置信度率 18 | double minConf; 19 | //最大支持度差别阈值 20 | double delta; 21 | //多项目的最小支持度数,括号中的下标代表的是商品的ID 22 | double[] mis; 23 | //msApriori算法工具类 24 | MSAprioriCore tool; 25 | 26 | //为了测试的方便,取一个偏低的置信度值0.3 27 | minConf = 0.3; 28 | minSup = 0.1; 29 | delta = 0.5; 30 | //每项的支持度率都默认为0.1,第一项不使用 31 | mis = new double[] { -1, 0.1, 0.1, 0.1, 0.1, 0.1 }; 32 | isTransaction = true; 33 | 34 | isTransaction = true; 35 | tool = new MSAprioriCore(filePath, minConf, delta, mis, isTransaction); 36 | tool.calFItems(); 37 | System.out.println(); 38 | 39 | isTransaction = false; 40 | //重新初始化数据 41 | tool = new MSAprioriCore(tableFilePath, minConf, minSup, isTransaction); 42 | tool.calFItems(); 43 | } 44 | 45 | } 46 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/others/randomforest/DecisionTree.java: -------------------------------------------------------------------------------- 1 | package 
com.jusdt.datamining.others.randomforest; 2 | 3 | import java.util.ArrayList; 4 | import java.util.HashMap; 5 | import java.util.Map; 6 | 7 | /** 8 | * 决策树 9 | */ 10 | public class DecisionTree { 11 | 12 | // 树的根节点 13 | TreeNode rootNode; 14 | // 数据的属性列名称 15 | String[] featureNames; 16 | // 这棵树所包含的数据 17 | ArrayList datas; 18 | // 决策树构造的的工具类 19 | CARTCore tool; 20 | 21 | public DecisionTree(ArrayList datas) { 22 | this.datas = datas; 23 | this.featureNames = datas.get(0); 24 | 25 | tool = new CARTCore(datas); 26 | // 通过CART工具类进行决策树的构建,并返回树的根节点 27 | rootNode = tool.startBuildingTree(); 28 | } 29 | 30 | /** 31 | * 根据给定的数据特征描述进行类别的判断 32 | * 33 | * @param features 34 | * @return 35 | */ 36 | public String decideClassType(String features) { 37 | String classType = ""; 38 | // 查询属性组 39 | String[] queryFeatures; 40 | // 在本决策树中对应的查询的属性值描述 41 | ArrayList featureStrs; 42 | 43 | featureStrs = new ArrayList<>(); 44 | queryFeatures = features.split(","); 45 | 46 | String[] array; 47 | for (String name : featureNames) { 48 | for (String featureValue : queryFeatures) { 49 | array = featureValue.split("="); 50 | // 将对应的属性值加入到列表中 51 | if (array[0].equals(name)) { 52 | featureStrs.add(array); 53 | } 54 | } 55 | } 56 | 57 | // 开始从根据节点往下递归搜索 58 | classType = recusiveSearchClassType(rootNode, featureStrs); 59 | 60 | return classType; 61 | } 62 | 63 | /** 64 | * 递归搜索树,查询属性的分类类别 65 | * 66 | * @param node 67 | * 当前搜索到的节点 68 | * @param remainFeatures 69 | * 剩余未判断的属性 70 | * @return 71 | */ 72 | private String recusiveSearchClassType(TreeNode node, ArrayList remainFeatures) { 73 | String classType = null; 74 | 75 | // 如果节点包含了数据的id索引,说明已经分类到底了 76 | if (node.getDataIndex() != null && node.getDataIndex().size() > 0) { 77 | classType = judgeClassType(node.getDataIndex()); 78 | 79 | return classType; 80 | } 81 | 82 | // 取出剩余属性中的一个匹配属性作为当前的判断属性名称 83 | String[] currentFeature = null; 84 | for (String[] featureValue : remainFeatures) { 85 | if (node.getAttrName().equals(featureValue[0])) { 86 | 
currentFeature = featureValue; 87 | break; 88 | } 89 | } 90 | 91 | for (TreeNode childNode : node.getChildAttrNode()) { 92 | // 寻找子节点中属于此属性值的分支 93 | if (childNode.getParentAttrValue().equals(currentFeature[1])) { 94 | remainFeatures.remove(currentFeature); 95 | classType = recusiveSearchClassType(childNode, remainFeatures); 96 | 97 | // 如果找到了分类结果,则直接挑出循环 98 | break; 99 | } else { 100 | //进行第二种情况的判断加上!符号的情况 101 | String value = childNode.getParentAttrValue(); 102 | 103 | if (value.charAt(0) == '!') { 104 | //去掉第一个!字符 105 | value = value.substring(1, value.length()); 106 | 107 | if (!value.equals(currentFeature[1])) { 108 | remainFeatures.remove(currentFeature); 109 | classType = recusiveSearchClassType(childNode, remainFeatures); 110 | 111 | break; 112 | } 113 | } 114 | } 115 | } 116 | 117 | return classType; 118 | } 119 | 120 | /** 121 | * 根据得到的数据行分类进行类别的决策 122 | * 123 | * @param dataIndex 124 | * 根据分类的数据索引号 125 | * @return 126 | */ 127 | public String judgeClassType(ArrayList dataIndex) { 128 | // 结果类型值 129 | String resultClassType = ""; 130 | String classType = ""; 131 | int count = 0; 132 | int temp = 0; 133 | Map type2Num = new HashMap(); 134 | 135 | for (String index : dataIndex) { 136 | temp = Integer.parseInt(index); 137 | // 取最后一列的决策类别数据 138 | classType = datas.get(temp)[featureNames.length - 1]; 139 | 140 | if (type2Num.containsKey(classType)) { 141 | // 如果类别已经存在,则使其计数加1 142 | count = type2Num.get(classType); 143 | count++; 144 | } else { 145 | count = 1; 146 | } 147 | 148 | type2Num.put(classType, count); 149 | } 150 | 151 | // 选出其中类别支持计数最多的一个类别值 152 | count = -1; 153 | for (Map.Entry entry : type2Num.entrySet()) { 154 | if ((int) entry.getValue() > count) { 155 | count = (int) entry.getValue(); 156 | resultClassType = (String) entry.getKey(); 157 | } 158 | } 159 | 160 | return resultClassType; 161 | } 162 | } 163 | -------------------------------------------------------------------------------- 
/src/main/java/com/jusdt/datamining/others/randomforest/RandomForestCore.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.others.randomforest; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileReader; 6 | import java.io.IOException; 7 | import java.util.ArrayList; 8 | import java.util.HashMap; 9 | import java.util.Map; 10 | import java.util.Random; 11 | 12 | /** 13 | * 随机森林算法工具类 14 | */ 15 | public class RandomForestCore { 16 | 17 | // 测试数据文件地址 18 | private String filePath; 19 | // 决策树的样本占总数的占比率 20 | private double sampleNumRatio; 21 | // 样本数据的采集特征数量占总特征的比例 22 | private double featureNumRatio; 23 | // 决策树的采样样本数 24 | private int sampleNum; 25 | // 样本数据的采集采样特征数 26 | private int featureNum; 27 | // 随机森林中的决策树的数目,等于总的数据数/用于构造每棵树的数据的数量 28 | private int treeNum; 29 | // 随机数产生器 30 | private Random random; 31 | // 样本数据列属性名称行 32 | private String[] featureNames; 33 | // 原始的总的数据 34 | private ArrayList totalDatas; 35 | // 决策树森林 36 | private ArrayList decisionForest; 37 | 38 | public RandomForestCore(String filePath, double sampleNumRatio, double featureNumRatio) { 39 | this.filePath = filePath; 40 | this.sampleNumRatio = sampleNumRatio; 41 | this.featureNumRatio = featureNumRatio; 42 | 43 | readDataFile(); 44 | } 45 | 46 | /** 47 | * 从文件中读取数据 48 | */ 49 | private void readDataFile() { 50 | File file = new File(filePath); 51 | ArrayList dataArray = new ArrayList(); 52 | 53 | try { 54 | BufferedReader in = new BufferedReader(new FileReader(file)); 55 | String str; 56 | String[] tempArray; 57 | while ((str = in.readLine()) != null) { 58 | tempArray = str.split(" "); 59 | dataArray.add(tempArray); 60 | } 61 | in.close(); 62 | } catch (IOException e) { 63 | e.getStackTrace(); 64 | } 65 | 66 | totalDatas = dataArray; 67 | featureNames = totalDatas.get(0); 68 | sampleNum = (int) ((totalDatas.size() - 1) * sampleNumRatio); 69 | //算属性数量的时候需要去掉id属性和决策属性,用条件属性计算 70 | featureNum = 
(int) ((featureNames.length - 2) * featureNumRatio); 71 | // 算数量的时候需要去掉首行属性名称行 72 | treeNum = (totalDatas.size() - 1) / sampleNum; 73 | } 74 | 75 | /** 76 | * 产生决策树 77 | */ 78 | private DecisionTree produceDecisionTree() { 79 | int temp = 0; 80 | DecisionTree tree; 81 | String[] tempData; 82 | //采样数据的随机行号组 83 | ArrayList sampleRandomNum; 84 | //采样属性特征的随机列号组 85 | ArrayList featureRandomNum; 86 | ArrayList datas; 87 | 88 | sampleRandomNum = new ArrayList<>(); 89 | featureRandomNum = new ArrayList<>(); 90 | datas = new ArrayList<>(); 91 | 92 | for (int i = 0; i < sampleNum;) { 93 | temp = random.nextInt(totalDatas.size()); 94 | 95 | //如果是行首属性名称行,则跳过 96 | if (temp == 0) { 97 | continue; 98 | } 99 | 100 | if (!sampleRandomNum.contains(temp)) { 101 | sampleRandomNum.add(temp); 102 | i++; 103 | } 104 | } 105 | 106 | for (int i = 0; i < featureNum;) { 107 | temp = random.nextInt(featureNames.length); 108 | 109 | //如果是第一列的数据id号或者是决策属性列,则跳过 110 | if (temp == 0 || temp == featureNames.length - 1) { 111 | continue; 112 | } 113 | 114 | if (!featureRandomNum.contains(temp)) { 115 | featureRandomNum.add(temp); 116 | i++; 117 | } 118 | } 119 | 120 | String[] singleRecord; 121 | String[] headCulumn = null; 122 | // 获取随机数据行 123 | for (int dataIndex : sampleRandomNum) { 124 | singleRecord = totalDatas.get(dataIndex); 125 | 126 | //每行的列数=所选的特征数+id号 127 | tempData = new String[featureNum + 2]; 128 | headCulumn = new String[featureNum + 2]; 129 | 130 | for (int i = 0, k = 1; i < featureRandomNum.size(); i++, k++) { 131 | temp = featureRandomNum.get(i); 132 | 133 | headCulumn[k] = featureNames[temp]; 134 | tempData[k] = singleRecord[temp]; 135 | } 136 | 137 | //加上id列的信息 138 | headCulumn[0] = featureNames[0]; 139 | //加上决策分类列的信息 140 | headCulumn[featureNum + 1] = featureNames[featureNames.length - 1]; 141 | tempData[featureNum + 1] = singleRecord[featureNames.length - 1]; 142 | 143 | //加入此行数据 144 | datas.add(tempData); 145 | } 146 | 147 | //加入行首列出现名称 148 | datas.add(0, headCulumn); 149 | 
//对筛选出的数据重新做id分配 150 | temp = 0; 151 | for (String[] array : datas) { 152 | //从第2行开始赋值 153 | if (temp > 0) { 154 | array[0] = temp + ""; 155 | } 156 | 157 | temp++; 158 | } 159 | 160 | tree = new DecisionTree(datas); 161 | 162 | return tree; 163 | } 164 | 165 | /** 166 | * 构造随机森林 167 | */ 168 | public void constructRandomTree() { 169 | DecisionTree tree; 170 | random = new Random(); 171 | decisionForest = new ArrayList<>(); 172 | 173 | System.out.println("下面是随机森林中的决策树:"); 174 | // 构造决策树加入森林中 175 | for (int i = 0; i < treeNum; i++) { 176 | System.out.println("\n决策树" + (i + 1)); 177 | tree = produceDecisionTree(); 178 | decisionForest.add(tree); 179 | } 180 | } 181 | 182 | /** 183 | * 根据给定的属性条件进行类别的决策 184 | * 185 | * @param features 186 | * 给定的已知的属性描述 187 | * @return 188 | */ 189 | public String judgeClassType(String features) { 190 | // 结果类型值 191 | String resultClassType = ""; 192 | String classType = ""; 193 | int count = 0; 194 | Map type2Num = new HashMap(); 195 | 196 | for (DecisionTree tree : decisionForest) { 197 | classType = tree.decideClassType(features); 198 | if (type2Num.containsKey(classType)) { 199 | // 如果类别已经存在,则使其计数加1 200 | count = type2Num.get(classType); 201 | count++; 202 | } else { 203 | count = 1; 204 | } 205 | 206 | type2Num.put(classType, count); 207 | } 208 | 209 | // 选出其中类别支持计数最多的一个类别值 210 | count = -1; 211 | for (Map.Entry entry : type2Num.entrySet()) { 212 | if ((int) entry.getValue() > count) { 213 | count = (int) entry.getValue(); 214 | resultClassType = (String) entry.getKey(); 215 | } 216 | } 217 | 218 | return resultClassType; 219 | } 220 | } 221 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/others/randomforest/RandomForestExample.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.others.randomforest; 2 | 3 | import java.text.MessageFormat; 4 | 5 | /** 6 | * 随机森林算法测试场景 7 | */ 8 | public class 
RandomForestExample { 9 | 10 | public static void main(String[] args) { 11 | String filePath = "data/randomforest/input.txt"; 12 | String queryStr = "Age=Youth,Income=Low,Student=No,CreditRating=Fair"; 13 | String resultClassType = ""; 14 | // 决策树的样本占总数的占比率 15 | double sampleNumRatio = 0.4; 16 | // 样本数据的采集特征数量占总特征的比例 17 | double featureNumRatio = 0.5; 18 | 19 | RandomForestCore tool = new RandomForestCore(filePath, sampleNumRatio, featureNumRatio); 20 | tool.constructRandomTree(); 21 | 22 | resultClassType = tool.judgeClassType(queryStr); 23 | 24 | System.out.println(); 25 | System.out.println(MessageFormat.format("查询属性描述{0},预测的分类结果为BuysCompute:{1}", queryStr, resultClassType)); 26 | } 27 | 28 | } 29 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/others/randomforest/TreeNode.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.others.randomforest; 2 | 3 | import java.util.ArrayList; 4 | 5 | /** 6 | * 回归分类树节点 7 | */ 8 | public class TreeNode { 9 | 10 | // 节点属性名字 11 | private String attrName; 12 | // 节点索引标号 13 | private int nodeIndex; 14 | //包含的叶子节点数 15 | private int leafNum; 16 | // 节点误差率 17 | private double alpha; 18 | // 父亲分类属性值 19 | private String parentAttrValue; 20 | // 孩子节点 21 | private TreeNode[] childAttrNode; 22 | // 数据记录索引 23 | private ArrayList dataIndex; 24 | 25 | public String getAttrName() { 26 | return attrName; 27 | } 28 | 29 | public void setAttrName(String attrName) { 30 | this.attrName = attrName; 31 | } 32 | 33 | public int getNodeIndex() { 34 | return nodeIndex; 35 | } 36 | 37 | public void setNodeIndex(int nodeIndex) { 38 | this.nodeIndex = nodeIndex; 39 | } 40 | 41 | public double getAlpha() { 42 | return alpha; 43 | } 44 | 45 | public void setAlpha(double alpha) { 46 | this.alpha = alpha; 47 | } 48 | 49 | public String getParentAttrValue() { 50 | return parentAttrValue; 51 | } 52 | 53 | public void 
package com.jusdt.datamining.others.tan;

import java.util.ArrayList;

/**
 * Bayesian network node.
 */
public class Node {

	// unique node id, used later to fix the direction of the links
	int id;
	// attribute name carried by this node
	String name;
	// nodes this node is connected with
	ArrayList<Node> connectedNodes;

	public Node(int id, String name) {
		this.id = id;
		this.name = name;

		// start with an empty adjacency list
		this.connectedNodes = new ArrayList<>();
	}

	/**
	 * Connect this node with the given node. The link is recorded on both
	 * sides: in this node's list and in the target node's list.
	 *
	 * @param node the downstream node to link to
	 */
	public void connectNode(Node node) {
		// never link a node to itself
		if (node.id == this.id) {
			return;
		}

		this.connectedNodes.add(node);
		node.connectedNodes.add(this);
	}

	/**
	 * Whether this node and the given node are the same one, judged by id.
	 *
	 * @param node the node to compare with
	 * @return true when the ids match
	 */
	public boolean isEqual(Node node) {
		return this.id == node.id;
	}
}
-------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/others/viterbi/BaseNames.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.others.viterbi; 2 | 3 | /** 4 | * 基本变量定义类 5 | */ 6 | public class BaseNames { 7 | 8 | //日期天数下标 9 | public static final int DAY1 = 0; 10 | public static final int DAY2 = 1; 11 | public static final int DAY3 = 2; 12 | 13 | //天气属性类别 14 | public static final int WEATHER_SUNNY = 0; 15 | public static final int WEATHER_CLOUDY = 1; 16 | public static final int WEATHER_RAINY = 2; 17 | 18 | //湿度属性类别 19 | public static final int HUMIDITY_DRY = 0; 20 | public static final int HUMIDITY_DRYISH = 1; 21 | public static final int HUMIDITY_DAMP = 1; 22 | public static final int HUMIDITY_SOGGY = 1; 23 | 24 | } 25 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/others/viterbi/ViterbiCore.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.others.viterbi; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileReader; 6 | import java.io.IOException; 7 | import java.util.ArrayList; 8 | import java.util.HashMap; 9 | 10 | /** 11 | * 维特比算法工具类 12 | */ 13 | public class ViterbiCore { 14 | 15 | // 状态转移概率矩阵文件地址 16 | private String stmFilePath; 17 | // 混淆矩阵文件地址 18 | private String confusionFilePath; 19 | // 初始状态概率 20 | private double[] initStatePro; 21 | // 观察到的状态序列 22 | public String[] observeStates; 23 | // 状态转移矩阵值 24 | private double[][] stMatrix; 25 | // 混淆矩阵值 26 | private double[][] confusionMatrix; 27 | // 各个条件下的潜在特征概率值 28 | private double[][] potentialValues; 29 | // 潜在特征 30 | private ArrayList potentialAttrs; 31 | // 属性值列坐标映射图 32 | private HashMap name2Index; 33 | // 列坐标属性值映射图 34 | private HashMap index2name; 35 | 36 | public ViterbiCore(String stmFilePath, 
String confusionFilePath, double[] initStatePro, String[] observeStates) { 37 | this.stmFilePath = stmFilePath; 38 | this.confusionFilePath = confusionFilePath; 39 | this.initStatePro = initStatePro; 40 | this.observeStates = observeStates; 41 | 42 | initOperation(); 43 | } 44 | 45 | /** 46 | * 初始化数据操作 47 | */ 48 | private void initOperation() { 49 | double[] temp; 50 | int index; 51 | ArrayList smtDatas; 52 | ArrayList cfDatas; 53 | 54 | smtDatas = readDataFile(stmFilePath); 55 | cfDatas = readDataFile(confusionFilePath); 56 | 57 | index = 0; 58 | this.stMatrix = new double[smtDatas.size()][]; 59 | for (String[] array : smtDatas) { 60 | temp = new double[array.length]; 61 | for (int i = 0; i < array.length; i++) { 62 | try { 63 | temp[i] = Double.parseDouble(array[i]); 64 | } catch (NumberFormatException e) { 65 | temp[i] = -1; 66 | } 67 | } 68 | 69 | // 将转换后的值赋给数组中 70 | this.stMatrix[index] = temp; 71 | index++; 72 | } 73 | 74 | index = 0; 75 | this.confusionMatrix = new double[cfDatas.size()][]; 76 | for (String[] array : cfDatas) { 77 | temp = new double[array.length]; 78 | for (int i = 0; i < array.length; i++) { 79 | try { 80 | temp[i] = Double.parseDouble(array[i]); 81 | } catch (NumberFormatException e) { 82 | temp[i] = -1; 83 | } 84 | } 85 | 86 | // 将转换后的值赋给数组中 87 | this.confusionMatrix[index] = temp; 88 | index++; 89 | } 90 | 91 | this.potentialAttrs = new ArrayList<>(); 92 | // 添加潜在特征属性 93 | for (String s : smtDatas.get(0)) { 94 | this.potentialAttrs.add(s); 95 | } 96 | // 去除首列无效列 97 | potentialAttrs.remove(0); 98 | 99 | this.name2Index = new HashMap<>(); 100 | this.index2name = new HashMap<>(); 101 | 102 | // 添加名称下标映射关系 103 | for (int i = 1; i < smtDatas.get(0).length; i++) { 104 | this.name2Index.put(smtDatas.get(0)[i], i); 105 | // 添加下标到名称的映射 106 | this.index2name.put(i, smtDatas.get(0)[i]); 107 | } 108 | 109 | for (int i = 1; i < cfDatas.get(0).length; i++) { 110 | this.name2Index.put(cfDatas.get(0)[i], i); 111 | } 112 | } 113 | 114 | /** 115 | * 
从文件中读取数据 116 | */ 117 | private ArrayList readDataFile(String filePath) { 118 | File file = new File(filePath); 119 | ArrayList dataArray = new ArrayList(); 120 | 121 | try { 122 | BufferedReader in = new BufferedReader(new FileReader(file)); 123 | String str; 124 | String[] tempArray; 125 | while ((str = in.readLine()) != null) { 126 | tempArray = str.split(" "); 127 | dataArray.add(tempArray); 128 | } 129 | in.close(); 130 | } catch (IOException e) { 131 | e.getStackTrace(); 132 | } 133 | 134 | return dataArray; 135 | } 136 | 137 | /** 138 | * 根据观察特征计算隐藏的特征概率矩阵 139 | */ 140 | private void calPotencialProMatrix() { 141 | String curObserveState; 142 | // 观察特征和潜在特征的下标 143 | int osIndex; 144 | int psIndex; 145 | double temp; 146 | double maxPro; 147 | // 混淆矩阵概率值,就是相关影响的因素概率 148 | double confusionPro; 149 | 150 | this.potentialValues = new double[observeStates.length][potentialAttrs.size() + 1]; 151 | for (int i = 0; i < this.observeStates.length; i++) { 152 | curObserveState = this.observeStates[i]; 153 | osIndex = this.name2Index.get(curObserveState); 154 | maxPro = -1; 155 | 156 | // 因为是第一个观察特征,没有前面的影响,根据初始状态计算 157 | if (i == 0) { 158 | for (String attr : this.potentialAttrs) { 159 | psIndex = this.name2Index.get(attr); 160 | confusionPro = this.confusionMatrix[psIndex][osIndex]; 161 | 162 | temp = this.initStatePro[psIndex - 1] * confusionPro; 163 | this.potentialValues[BaseNames.DAY1][psIndex] = temp; 164 | } 165 | } else { 166 | // 后面的潜在特征受前一个特征的影响,以及当前的混淆因素影响 167 | for (String toDayAttr : this.potentialAttrs) { 168 | psIndex = this.name2Index.get(toDayAttr); 169 | confusionPro = this.confusionMatrix[psIndex][osIndex]; 170 | 171 | int index; 172 | maxPro = -1; 173 | // 通过昨天的概率计算今天此特征的最大概率 174 | for (String yAttr : this.potentialAttrs) { 175 | index = this.name2Index.get(yAttr); 176 | temp = this.potentialValues[i - 1][index] * this.stMatrix[index][psIndex]; 177 | 178 | // 计算得到今天此潜在特征的最大概率 179 | if (temp > maxPro) { 180 | maxPro = temp; 181 | } 182 | } 183 | 184 
| this.potentialValues[i][psIndex] = maxPro * confusionPro; 185 | } 186 | } 187 | } 188 | } 189 | 190 | /** 191 | * 根据同时期最大概率值输出潜在特征值 192 | */ 193 | private void outputResultAttr() { 194 | double maxPro; 195 | int maxIndex; 196 | ArrayList psValues; 197 | 198 | psValues = new ArrayList<>(); 199 | for (int i = 0; i < this.potentialValues.length; i++) { 200 | maxPro = -1; 201 | maxIndex = 0; 202 | 203 | for (int j = 0; j < potentialValues[i].length; j++) { 204 | if (this.potentialValues[i][j] > maxPro) { 205 | maxPro = potentialValues[i][j]; 206 | maxIndex = j; 207 | } 208 | } 209 | 210 | // 取出最大概率下标对应的潜在特征 211 | psValues.add(this.index2name.get(maxIndex)); 212 | } 213 | 214 | System.out.println("观察特征为:"); 215 | for (String s : this.observeStates) { 216 | System.out.print(s + ", "); 217 | } 218 | System.out.println(); 219 | 220 | System.out.println("潜在特征为:"); 221 | for (String s : psValues) { 222 | System.out.print(s + ", "); 223 | } 224 | System.out.println(); 225 | } 226 | 227 | /** 228 | * 根据观察属性,得到潜在属性信息 229 | */ 230 | public void calHMMObserve() { 231 | calPotencialProMatrix(); 232 | outputResultAttr(); 233 | } 234 | } 235 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/others/viterbi/ViterbiExample.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.others.viterbi; 2 | 3 | /** 4 | * 维特比算法 5 | */ 6 | public class ViterbiExample { 7 | 8 | public static void main(String[] args) { 9 | // 状态转移概率矩阵路径 10 | String stmFilePath; 11 | // 混淆矩阵路径 12 | String cfFilePath; 13 | // 观察到的状态 14 | String[] observeStates; 15 | // 初始状态 16 | double[] initStatePro; 17 | ViterbiCore tool; 18 | 19 | stmFilePath = "data/viterbi/stmatrix.txt"; 20 | cfFilePath = "data/viterbi/humidity-matrix.txt"; 21 | 22 | initStatePro = new double[] { 0.63, 0.17, 0.20 }; 23 | observeStates = new String[] { "Dry", "Damp", "Soggy" }; 24 | 25 | tool = new 
package com.jusdt.datamining.roughsets;

import java.util.ArrayList;

/**
 * The knowledge system: the family of elementary record collections used to
 * compute rough-set upper and lower approximations of a target collection.
 */
public class KnowledgeSystem {

	// the elementary collections making up the knowledge system
	ArrayList<RecordCollection> ksCollections;

	public KnowledgeSystem(ArrayList<RecordCollection> ksCollections) {
		this.ksCollections = ksCollections;
	}

	/**
	 * Upper approximation of the given collection: the union of elementary
	 * collections covering it. Collections fully contained in the target are
	 * absorbed first; if target records remain uncovered, any remaining
	 * collection that merely overlaps the target is absorbed as well.
	 *
	 * @param rc the original collection
	 * @return the upper approximation, or null when nothing matches
	 */
	public RecordCollection getUpSimilarRC(RecordCollection rc) {
		RecordCollection resultRc = null;
		ArrayList<String> nameArray;
		// record names of the target still waiting to be covered
		ArrayList<String> targetArray = rc.getRecordNames();
		ArrayList<RecordCollection> copyRcs = new ArrayList<>(ksCollections);
		ArrayList<RecordCollection> deleteRcs = new ArrayList<>();

		// pass 1: absorb every collection fully contained in the target
		for (RecordCollection recordCollection : copyRcs) {
			nameArray = recordCollection.getRecordNames();

			if (strIsContained(targetArray, nameArray)) {
				removeOverLaped(targetArray, nameArray);
				deleteRcs.add(recordCollection);

				if (resultRc == null) {
					resultRc = recordCollection;
				} else {
					// union with the running result
					resultRc = resultRc.unionCal(recordCollection);
				}

				if (targetArray.size() == 0) {
					break;
				}
			}
		}
		// drop the collections already absorbed
		copyRcs.removeAll(deleteRcs);

		if (targetArray.size() > 0) {
			// pass 2: some target records are still uncovered, so absorb any
			// remaining collection that overlaps the target at all
			for (RecordCollection recordCollection : copyRcs) {
				nameArray = recordCollection.getRecordNames();

				if (strHasOverlap(targetArray, nameArray)) {
					removeOverLaped(targetArray, nameArray);

					if (resultRc == null) {
						resultRc = recordCollection;
					} else {
						resultRc = resultRc.unionCal(recordCollection);
					}

					if (targetArray.size() == 0) {
						break;
					}
				}
			}
		}

		return resultRc;
	}

	/**
	 * Lower approximation of the given collection: the union of elementary
	 * collections fully contained in it.
	 *
	 * @param rc the original collection
	 * @return the lower approximation, or null when nothing is contained
	 */
	public RecordCollection getDownSimilarRC(RecordCollection rc) {
		RecordCollection resultRc = null;
		ArrayList<String> nameArray;
		ArrayList<String> targetArray = rc.getRecordNames();

		for (RecordCollection recordCollection : ksCollections) {
			nameArray = recordCollection.getRecordNames();

			if (strIsContained(targetArray, nameArray)) {
				removeOverLaped(targetArray, nameArray);

				if (resultRc == null) {
					resultRc = recordCollection;
				} else {
					// union with the running result
					resultRc = resultRc.unionCal(recordCollection);
				}

				if (targetArray.size() == 0) {
					break;
				}
			}
		}

		return resultRc;
	}

	/**
	 * Whether the two name lists share at least one element.
	 *
	 * @param str1 first name list
	 * @param str2 second name list
	 * @return true when any element appears in both lists
	 */
	public boolean strHasOverlap(ArrayList<String> str1, ArrayList<String> str2) {
		for (String s1 : str1) {
			if (str2.contains(s1)) {
				return true;
			}
		}

		return false;
	}

	/**
	 * Whether every element of str2 is contained in str1.
	 * An empty str2 is considered contained, as in the original counting
	 * implementation.
	 *
	 * @param str1 the candidate superset
	 * @param str2 the candidate subset
	 * @return true when str2 is fully contained in str1
	 */
	public boolean strIsContained(ArrayList<String> str1, ArrayList<String> str2) {
		for (String s : str2) {
			if (!str1.contains(s)) {
				return false;
			}
		}

		return true;
	}

	/**
	 * Remove from str1 every element that also appears in str2.
	 *
	 * @param str1 the list that is modified in place
	 * @param str2 the elements to remove
	 */
	public void removeOverLaped(ArrayList<String> str1, ArrayList<String> str2) {
		str1.removeAll(str2);
	}
}
package com.jusdt.datamining.roughsets;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;

/**
 * A collection of data records that share some common attribute values.
 */
public class RecordCollection {

	// attribute name -> value pairs shared by this collection
	private HashMap<String, String> attrValues;
	// the records in this collection
	private ArrayList<Record> recordList;

	public RecordCollection() {
		this.attrValues = new HashMap<>();
		this.recordList = new ArrayList<>();
	}

	public RecordCollection(HashMap<String, String> attrValues, ArrayList<Record> recordList) {
		this.attrValues = attrValues;
		this.recordList = recordList;
	}

	public ArrayList<Record> getRecord() {
		return this.recordList;
	}

	/**
	 * Names of all records in this collection, in list order.
	 *
	 * @return a fresh list of record names
	 */
	public ArrayList<String> getRecordNames() {
		ArrayList<String> names = new ArrayList<>();

		for (Record record : recordList) {
			names.add(record.getName());
		}

		return names;
	}

	/**
	 * Whether this collection carries a value for the given attribute name.
	 *
	 * @param attrName the attribute name to look up
	 * @return true when the attribute is present
	 */
	public boolean isContainedAttrName(String attrName) {
		return this.attrValues.containsKey(attrName);
	}

	/**
	 * Whether the two collections hold the same records, compared by name.
	 * NOTE(review): as in the original, this only checks that every record of
	 * THIS collection appears in rc, not the reverse — confirm callers rely
	 * on collections of equal size.
	 *
	 * @param rc the collection to compare with
	 * @return true when every record of this collection is found in rc
	 */
	public boolean isCollectionSame(RecordCollection rc) {
		boolean isSame = false;

		for (Record r : recordList) {
			isSame = false;

			for (Record r2 : rc.recordList) {
				if (r.isRecordSame(r2)) {
					isSame = true;
					break;
				}
			}

			// one missing record makes the collections unequal
			if (!isSame) {
				break;
			}
		}

		return isSame;
	}

	/**
	 * Intersection of two collections: the records present in both, with the
	 * attribute maps of both collections merged (rc's values win on clash).
	 *
	 * @param rc the other collection of the intersection
	 * @return the intersection collection, or null when nothing is shared
	 */
	public RecordCollection overlapCalculate(RecordCollection rc) {
		HashMap<String, String> resultAttrValues = new HashMap<>();
		ArrayList<Record> resultRecords = new ArrayList<>();

		// keep every record of this collection that also appears in rc
		for (Record record : this.recordList) {
			for (Record record2 : rc.recordList) {
				if (record.isRecordSame(record2)) {
					resultRecords.add(record);
					break;
				}
			}
		}

		// no shared record means no intersection
		if (resultRecords.size() == 0) {
			return null;
		}

		// merge the attribute maps of both collections
		resultAttrValues.putAll(this.attrValues);
		resultAttrValues.putAll(rc.attrValues);

		return new RecordCollection(resultAttrValues, resultRecords);
	}

	/**
	 * Union of two collections.
	 * NOTE(review): as in the original, the result keeps no attribute map
	 * and duplicate records are not removed.
	 *
	 * @param rc the collection to merge with
	 * @return a new collection holding the records of both
	 */
	public RecordCollection unionCal(RecordCollection rc) {
		ArrayList<Record> records = new ArrayList<>(this.recordList);
		records.addAll(rc.recordList);

		return new RecordCollection(null, records);
	}

	/**
	 * Print the record names contained in this collection.
	 */
	public void printRc() {
		System.out.print("{");
		for (Record r : this.getRecord()) {
			System.out.print(r.getName() + ", ");
		}
		System.out.println("}");
	}
}
package com.jusdt.datamining.sequential.patterns.gsp;

import java.util.ArrayList;

/**
 * An itemset inside a sequence.
 */
public class ItemSet {

	// the numeric items held by this itemset, in insertion order
	private ArrayList<Integer> items;

	public ItemSet(String[] itemStr) {
		items = new ArrayList<>();
		for (String s : itemStr) {
			items.add(Integer.parseInt(s));
		}
	}

	public ItemSet(int[] itemNum) {
		items = new ArrayList<>();
		for (int num : itemNum) {
			items.add(num);
		}
	}

	public ItemSet(ArrayList<Integer> itemNum) {
		this.items = itemNum;
	}

	public ArrayList<Integer> getItems() {
		return items;
	}

	public void setItems(ArrayList<Integer> items) {
		this.items = items;
	}

	/**
	 * Whether the two itemsets hold exactly the same items in the same order.
	 *
	 * @param itemSet the itemset to compare with
	 * @return true when both itemsets are equal element by element
	 */
	public boolean compareIsSame(ItemSet itemSet) {
		if (this.items.size() != itemSet.items.size()) {
			return false;
		}

		for (int i = 0; i < itemSet.items.size(); i++) {
			// BUGFIX: was compared with !=, which tests Integer object
			// identity and silently reports inequality for equal values
			// outside the JVM integer cache (beyond -128..127)
			if (!this.items.get(i).equals(itemSet.items.get(i))) {
				return false;
			}
		}

		return true;
	}

	/**
	 * Return a copy of the item list.
	 *
	 * @return a new list holding the same items
	 */
	public ArrayList<Integer> copyItems() {
		return new ArrayList<>(this.items);
	}
}
package com.jusdt.datamining.sequential.patterns.gsp;

import java.util.ArrayList;

/**
 * A sequence: the ordered ItemSet groups belonging to one transaction.
 */
public class Sequence implements Comparable<Sequence>, Cloneable {

	// id of the transaction this sequence belongs to
	private int trsanctionID;
	// the ordered itemsets of this sequence
	private ArrayList<ItemSet> itemSetList;

	public Sequence(int trsanctionID) {
		this.trsanctionID = trsanctionID;
		this.itemSetList = new ArrayList<>();
	}

	public Sequence() {
		this.itemSetList = new ArrayList<>();
	}

	public int getTrsanctionID() {
		return trsanctionID;
	}

	public void setTrsanctionID(int trsanctionID) {
		this.trsanctionID = trsanctionID;
	}

	public ArrayList<ItemSet> getItemSetList() {
		return itemSetList;
	}

	public void setItemSetList(ArrayList<ItemSet> itemSetList) {
		this.itemSetList = itemSetList;
	}

	/**
	 * First element of the first itemset in this sequence.
	 */
	public Integer getFirstItemSetNum() {
		return this.getItemSetList().get(0).getItems().get(0);
	}

	/**
	 * Last itemset of this sequence.
	 */
	public ItemSet getLastItemSet() {
		return getItemSetList().get(getItemSetList().size() - 1);
	}

	/**
	 * Last element of the last itemset in this sequence.
	 */
	public Integer getLastItemSetNum() {
		ItemSet lastItemSet = getLastItemSet();

		return lastItemSet.getItems().get(lastItemSet.getItems().size() - 1);
	}

	/**
	 * Whether the last itemset holds exactly one element.
	 */
	public boolean isLastItemSetSingleNum() {
		// was "size == 1 ? true : false"; the comparison already is a boolean
		return getLastItemSet().getItems().size() == 1;
	}

	@Override
	public int compareTo(Sequence o) {
		// order sequences by the first element of their first itemset
		return this.getFirstItemSetNum().compareTo(o.getFirstItemSetNum());
	}

	@Override
	protected Object clone() throws CloneNotSupportedException {
		return super.clone();
	}

	/**
	 * Deep copy of this sequence: the itemsets and their item lists are
	 * copied, so mutating the copy never touches the original.
	 */
	public Sequence copySeqence() {
		Sequence copySeq = new Sequence();
		for (ItemSet itemSet : this.itemSetList) {
			copySeq.getItemSetList().add(new ItemSet(itemSet.copyItems()));
		}

		return copySeq;
	}

	/**
	 * Whether the two sequences are equal: the same number of itemsets and
	 * every itemset equal pairwise, in order.
	 *
	 * @param seq the sequence to compare with
	 * @return true when both sequences match completely
	 */
	public boolean compareIsSame(Sequence seq) {
		ArrayList<ItemSet> itemSetList2 = seq.getItemSetList();

		if (itemSetList2.size() != this.itemSetList.size()) {
			return false;
		}

		for (int i = 0; i < itemSetList2.size(); i++) {
			if (!this.itemSetList.get(i).compareIsSame(itemSetList2.get(i))) {
				// one mismatching itemset is enough
				return false;
			}
		}

		return true;
	}

	/**
	 * Generate all direct child subsequences: every sequence obtained by
	 * dropping exactly one element (a single-element itemset is removed as
	 * a whole).
	 *
	 * @return the list of child sequences
	 */
	public ArrayList<Sequence> createChildSeqs() {
		ArrayList<Sequence> childSeqs = new ArrayList<>();

		for (int i = 0; i < this.itemSetList.size(); i++) {
			ItemSet tempItemSet = itemSetList.get(i);

			if (tempItemSet.getItems().size() == 1) {
				// a one-element itemset is dropped entirely
				Sequence tempSeq = this.copySeqence();
				tempSeq.itemSetList.remove(i);
				childSeqs.add(tempSeq);
			} else {
				// otherwise drop one element at a time from the copied itemset
				for (int j = 0; j < tempItemSet.getItems().size(); j++) {
					Sequence tempSeq = this.copySeqence();
					tempSeq.getItemSetList().get(i).getItems().remove(j);
					childSeqs.add(tempSeq);
				}
			}
		}

		return childSeqs;
	}

}
PrefixSpanCore(filePath, minSupportRate); 14 | tool.prefixSpanCalculate(); 15 | } 16 | 17 | } 18 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/statistical/learning/ann/ANNCore.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.statistical.learning.ann; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileReader; 6 | import java.util.ArrayList; 7 | import java.util.List; 8 | 9 | /** 10 | * SVM支持向量机工具类 11 | */ 12 | public class ANNCore { 13 | 14 | // 训练集数据文件路径 15 | private String trainDataPath; 16 | // svm_problem对象,用于构造svm model模型 17 | private ANNProblem sProblem; 18 | // svm参数,里面有svm支持向量机的类型和不同 的svm的核函数类型 19 | private ANNParameter sParam; 20 | 21 | public ANNCore(String trainDataPath) { 22 | this.trainDataPath = trainDataPath; 23 | 24 | // 初始化svm相关变量 25 | sProblem = initSvmProblem(); 26 | sParam = initSvmParam(); 27 | } 28 | 29 | /** 30 | * 初始化操作,根据训练集数据构造分类模型 31 | */ 32 | private void initOperation() { 33 | 34 | } 35 | 36 | /** 37 | * svm_problem对象,训练集数据的相关信息配置 38 | * 39 | * @return 40 | */ 41 | private ANNProblem initSvmProblem() { 42 | List label = new ArrayList(); 43 | List nodeSet = new ArrayList(); 44 | getData(nodeSet, label, trainDataPath); 45 | 46 | int dataRange = nodeSet.get(0).length; 47 | ANNNode[][] datas = new ANNNode[nodeSet.size()][dataRange]; // 训练集的向量表 48 | for (int i = 0; i < datas.length; i++) { 49 | for (int j = 0; j < dataRange; j++) { 50 | datas[i][j] = nodeSet.get(i)[j]; 51 | } 52 | } 53 | double[] lables = new double[label.size()]; // a,b 对应的lable 54 | for (int i = 0; i < lables.length; i++) { 55 | lables[i] = label.get(i); 56 | } 57 | 58 | // 定义svm_problem对象 59 | ANNProblem problem = new ANNProblem(); 60 | problem.l = nodeSet.size(); // 向量个数 61 | problem.x = datas; // 训练集向量表 62 | problem.y = lables; // 对应的lable数组 63 | 64 | return problem; 65 | } 66 | 67 | /** 68 | * 
初始化svm支持向量机的参数,包括svm的类型和核函数的类型 69 | * 70 | * @return 71 | */ 72 | private ANNParameter initSvmParam() { 73 | // 定义svm_parameter对象 74 | ANNParameter param = new ANNParameter(); 75 | param.svm_type = ANNParameter.EPSILON_SVR; 76 | // 设置svm的核函数类型为线型 77 | param.kernel_type = ANNParameter.LINEAR; 78 | // 后面的参数配置只针对训练集的数据 79 | param.cache_size = 100; 80 | param.eps = 0.00001; 81 | param.C = 1.9; 82 | 83 | return param; 84 | } 85 | 86 | /** 87 | * 通过svm方式预测数据的类型 88 | * 89 | * @param testDataPath 90 | */ 91 | public void svmPredictData(String testDataPath) { 92 | // 获取测试数据 93 | List testlabel = new ArrayList(); 94 | List testnodeSet = new ArrayList(); 95 | getData(testnodeSet, testlabel, testDataPath); 96 | int dataRange = testnodeSet.get(0).length; 97 | 98 | ANNNode[][] testdatas = new ANNNode[testnodeSet.size()][dataRange]; // 训练集的向量表 99 | for (int i = 0; i < testdatas.length; i++) { 100 | for (int j = 0; j < dataRange; j++) { 101 | testdatas[i][j] = testnodeSet.get(i)[j]; 102 | } 103 | } 104 | // 测试数据的真实值,在后面将会与svm的预测值做比较 105 | double[] testlables = new double[testlabel.size()]; // a,b 对应的lable 106 | for (int i = 0; i < testlables.length; i++) { 107 | testlables[i] = testlabel.get(i); 108 | } 109 | 110 | // 如果参数没有问题,则svm.svm_check_parameter()函数返回null,否则返回error描述。 111 | // 对svm的配置参数叫验证,因为有些参数只针对部分的支持向量机的类型 112 | System.out.println(ANN.ann_check_parameter(sProblem, sParam)); 113 | System.out.println("------------检验参数-----------"); 114 | // 训练SVM分类模型 115 | ANNModel model = ANN.ann_train(sProblem, sParam); 116 | 117 | // 预测测试数据的lable 118 | double err = 0.0; 119 | for (int i = 0; i < testdatas.length; i++) { 120 | double truevalue = testlables[i]; 121 | // 测试数据真实值 122 | System.out.print(truevalue + " "); 123 | double predictValue = ANN.ann_predict(model, testdatas[i]); 124 | // 测试数据预测值 125 | System.out.println(predictValue); 126 | } 127 | } 128 | 129 | /** 130 | * 从文件中获取数据 131 | * 132 | * @param nodeSet 133 | * 向量节点 134 | * @param label 135 | * 节点值类型值 136 | * @param filename 
137 | * 数据文件地址 138 | */ 139 | private void getData(List nodeSet, List label, String filename) { 140 | try { 141 | 142 | FileReader fr = new FileReader(new File(filename)); 143 | BufferedReader br = new BufferedReader(fr); 144 | String line = null; 145 | while ((line = br.readLine()) != null) { 146 | String[] datas = line.split(","); 147 | ANNNode[] vector = new ANNNode[datas.length - 1]; 148 | for (int i = 0; i < datas.length - 1; i++) { 149 | ANNNode node = new ANNNode(); 150 | node.index = i + 1; 151 | node.value = Double.parseDouble(datas[i]); 152 | vector[i] = node; 153 | } 154 | nodeSet.add(vector); 155 | double lablevalue = Double.parseDouble(datas[datas.length - 1]); 156 | label.add(lablevalue); 157 | } 158 | } catch (Exception e) { 159 | e.printStackTrace(); 160 | } 161 | 162 | } 163 | 164 | } 165 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/statistical/learning/ann/ANNExample.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.statistical.learning.ann; 2 | 3 | public class ANNExample { 4 | 5 | public static void main(String[] args) { 6 | // 训练集数据文件路径 7 | String trainDataPath = "data/ann/trainInput.txt"; 8 | // 测试数据文件路径 9 | String testDataPath = "data/ann/testInput.txt"; 10 | 11 | ANNCore tool = new ANNCore(trainDataPath); 12 | // 对测试数据进行ANN分类 13 | tool.svmPredictData(testDataPath); 14 | } 15 | 16 | } 17 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/statistical/learning/ann/ANNModel.java: -------------------------------------------------------------------------------- 1 | // 2 | // svm_model 3 | // 4 | package com.jusdt.datamining.statistical.learning.ann; 5 | 6 | import java.io.Serializable; 7 | 8 | public class ANNModel implements Serializable { 9 | 10 | private static final long serialVersionUID = 1L; 11 | 12 | //svm支持向量机的参数 13 | ANNParameter 
param; // parameter 14 | //分类的类型数 15 | int nr_class; // number of classes, = 2 in regression/one class svm 16 | int l; // total #SV 17 | ANNNode[][] SV; // SVs (SV[l]) 18 | double[][] sv_coef; // coefficients for SVs in decision functions (sv_coef[k-1][l]) 19 | double[] rho; // constants in decision functions (rho[k*(k-1)/2]) 20 | double[] probA; // pariwise probability information 21 | double[] probB; 22 | 23 | // for classification only 24 | 25 | //每个类型的类型值 26 | int[] label; // label of each class (label[k]) 27 | int[] nSV; // number of SVs for each class (nSV[k]) 28 | // nSV[0] + nSV[1] + ... + nSV[k-1] = l 29 | 30 | }; 31 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/statistical/learning/ann/ANNNode.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.statistical.learning.ann; 2 | 3 | import java.io.Serializable; 4 | 5 | /** 6 | * 7 | * svm向量节点 8 | * @author lyq 9 | * 10 | */ 11 | public class ANNNode implements Serializable { 12 | 13 | private static final long serialVersionUID = 1L; 14 | 15 | //节点索引 16 | public int index; 17 | //节点的值 18 | public double value; 19 | 20 | } 21 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/statistical/learning/ann/ANNParameter.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.statistical.learning.ann; 2 | 3 | import java.io.Serializable; 4 | 5 | public class ANNParameter implements Cloneable, Serializable { 6 | 7 | private static final long serialVersionUID = 1L; 8 | 9 | /* svm_type 支持向量机的类型*/ 10 | public static final int C_SVC = 0; 11 | public static final int NU_SVC = 1; 12 | //一类svm 13 | public static final int ONE_CLASS = 2; 14 | public static final int EPSILON_SVR = 3; 15 | public static final int NU_SVR = 4; 16 | 17 | /* kernel_type 核函数类型*/ 18 | 
//线型核函数 19 | public static final int LINEAR = 0; 20 | //多项式核函数 21 | public static final int POLY = 1; 22 | //RBF径向基函数 23 | public static final int RBF = 2; 24 | //二层神经网络核函数 25 | public static final int SIGMOID = 3; 26 | public static final int PRECOMPUTED = 4; 27 | 28 | public int svm_type; 29 | public int kernel_type; 30 | public int degree; // for poly 31 | public double gamma; // for poly/rbf/sigmoid 32 | public double coef0; // for poly/sigmoid 33 | 34 | // these are for training only 后面这些参数只针对训练集的数据 35 | public double cache_size; // in MB 36 | public double eps; // stopping criteria 37 | public double C; // for C_SVC, EPSILON_SVR and NU_SVR 38 | public int nr_weight; // for C_SVC 39 | public int[] weight_label; // for C_SVC 40 | public double[] weight; // for C_SVC 41 | public double nu; // for NU_SVC, ONE_CLASS, and NU_SVR 42 | public double p; // for EPSILON_SVR 43 | public int shrinking; // use the shrinking heuristics 44 | public int probability; // do probability estimates 45 | 46 | @Override 47 | public Object clone() { 48 | try { 49 | return super.clone(); 50 | } catch (CloneNotSupportedException e) { 51 | return null; 52 | } 53 | } 54 | 55 | } 56 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/statistical/learning/ann/ANNPrintInterface.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.statistical.learning.ann; 2 | 3 | public interface ANNPrintInterface { 4 | 5 | public void print(String s); 6 | 7 | } 8 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/statistical/learning/ann/ANNProblem.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.statistical.learning.ann; 2 | 3 | import java.io.Serializable; 4 | 5 | /** 6 | * 包含了训练集数据的基本信息 7 | */ 8 | public class ANNProblem implements 
Serializable { 9 | 10 | private static final long serialVersionUID = 1L; 11 | 12 | //定义了向量的总个数 13 | public int l; 14 | //分类类型值数组 15 | public double[] y; 16 | //训练集向量表 17 | public ANNNode[][] x; 18 | 19 | } 20 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/statistical/learning/em/EMCore.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.statistical.learning.em; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileReader; 6 | import java.io.IOException; 7 | import java.text.MessageFormat; 8 | import java.util.ArrayList; 9 | 10 | /** 11 | * EM最大期望算法工具类 12 | */ 13 | public class EMCore { 14 | 15 | // 测试数据文件地址 16 | private String dataFilePath; 17 | // 测试坐标点数据 18 | private String[][] data; 19 | // 测试坐标点数据列表 20 | private ArrayList pointArray; 21 | // 目标C1点 22 | private Point p1; 23 | // 目标C2点 24 | private Point p2; 25 | 26 | public EMCore(String dataFilePath) { 27 | this.dataFilePath = dataFilePath; 28 | pointArray = new ArrayList<>(); 29 | } 30 | 31 | /** 32 | * 从文件中读取数据 33 | */ 34 | public void readDataFile() { 35 | File file = new File(dataFilePath); 36 | ArrayList dataArray = new ArrayList(); 37 | 38 | try { 39 | BufferedReader in = new BufferedReader(new FileReader(file)); 40 | String str; 41 | String[] tempArray; 42 | while ((str = in.readLine()) != null) { 43 | tempArray = str.split(" "); 44 | dataArray.add(tempArray); 45 | } 46 | in.close(); 47 | } catch (IOException e) { 48 | e.getStackTrace(); 49 | } 50 | 51 | data = new String[dataArray.size()][]; 52 | dataArray.toArray(data); 53 | 54 | // 开始时默认取头2个点作为2个簇中心 55 | p1 = new Point(Integer.parseInt(data[0][0]), Integer.parseInt(data[0][1])); 56 | p2 = new Point(Integer.parseInt(data[1][0]), Integer.parseInt(data[1][1])); 57 | 58 | Point p; 59 | for (String[] array : data) { 60 | // 将数据转换为对象加入列表方便计算 61 | p = new Point(Integer.parseInt(array[0]), 
Integer.parseInt(array[1])); 62 | pointArray.add(p); 63 | } 64 | } 65 | 66 | /** 67 | * 计算坐标点对于2个簇中心点的隶属度 68 | * 69 | * @param p 70 | * 待测试坐标点 71 | */ 72 | private void computeMemberShip(Point p) { 73 | // p点距离第一个簇中心点的距离 74 | double distance1 = 0; 75 | // p距离第二个中心点的距离 76 | double distance2 = 0; 77 | 78 | // 用欧式距离计算 79 | distance1 = Math.pow(p.getX() - p1.getX(), 2) + Math.pow(p.getY() - p1.getY(), 2); 80 | distance2 = Math.pow(p.getX() - p2.getX(), 2) + Math.pow(p.getY() - p2.getY(), 2); 81 | 82 | // 计算对于p1点的隶属度,与距离成反比关系,距离靠近越小,隶属度越大,所以要用大的distance2另外的距离来表示 83 | p.setMemberShip1(distance2 / (distance1 + distance2)); 84 | // 计算对于p2点的隶属度 85 | p.setMemberShip2(distance1 / (distance1 + distance2)); 86 | } 87 | 88 | /** 89 | * 执行期望最大化步骤 90 | */ 91 | public void exceptMaxStep() { 92 | // 新的优化过的簇中心点 93 | double p1X = 0; 94 | double p1Y = 0; 95 | double p2X = 0; 96 | double p2Y = 0; 97 | double temp1 = 0; 98 | double temp2 = 0; 99 | // 误差值 100 | double errorValue1 = 0; 101 | double errorValue2 = 0; 102 | // 上次更新的簇点坐标 103 | Point lastP1 = null; 104 | Point lastP2 = null; 105 | 106 | // 当开始计算的时候,或是中心点的误差值超过1的时候都需要再次迭代计算 107 | while (lastP1 == null || errorValue1 > 1.0 || errorValue2 > 1.0) { 108 | for (Point p : pointArray) { 109 | computeMemberShip(p); 110 | p1X += p.getMemberShip1() * p.getMemberShip1() * p.getX(); 111 | p1Y += p.getMemberShip1() * p.getMemberShip1() * p.getY(); 112 | temp1 += p.getMemberShip1() * p.getMemberShip1(); 113 | 114 | p2X += p.getMemberShip2() * p.getMemberShip2() * p.getX(); 115 | p2Y += p.getMemberShip2() * p.getMemberShip2() * p.getY(); 116 | temp2 += p.getMemberShip2() * p.getMemberShip2(); 117 | } 118 | 119 | lastP1 = new Point(p1.getX(), p1.getY()); 120 | lastP2 = new Point(p2.getX(), p2.getY()); 121 | 122 | // 套公式计算新的簇中心点坐标,最最大化处理 123 | p1.setX(p1X / temp1); 124 | p1.setY(p1Y / temp1); 125 | p2.setX(p2X / temp2); 126 | p2.setY(p2Y / temp2); 127 | 128 | errorValue1 = Math.abs(lastP1.getX() - p1.getX()) + Math.abs(lastP1.getY() - 
p1.getY()); 129 | errorValue2 = Math.abs(lastP2.getX() - p2.getX()) + Math.abs(lastP2.getY() - p2.getY()); 130 | } 131 | 132 | System.out.println( 133 | MessageFormat.format("簇中心节点p1({0}, {1}), p2({2}, {3})", p1.getX(), p1.getY(), p2.getX(), p2.getY())); 134 | } 135 | 136 | } 137 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/statistical/learning/em/EMExample.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.statistical.learning.em; 2 | 3 | /** 4 | * EM期望最大化算法场景调用类 5 | */ 6 | public class EMExample { 7 | 8 | public static void main(String[] args) { 9 | String filePath = "data/em/input.txt"; 10 | 11 | EMCore tool = new EMCore(filePath); 12 | tool.readDataFile(); 13 | tool.exceptMaxStep(); 14 | } 15 | 16 | } 17 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/statistical/learning/em/Point.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.statistical.learning.em; 2 | 3 | /** 4 | * 坐标点类 5 | */ 6 | public class Point { 7 | 8 | // 坐标点横坐标 9 | private double x; 10 | // 坐标点纵坐标 11 | private double y; 12 | // 坐标点对于P1的隶属度 13 | private double memberShip1; 14 | // 坐标点对于P2的隶属度 15 | private double memberShip2; 16 | 17 | public Point(double d, double e) { 18 | this.x = d; 19 | this.y = e; 20 | } 21 | 22 | public double getX() { 23 | return x; 24 | } 25 | 26 | public void setX(double x) { 27 | this.x = x; 28 | } 29 | 30 | public double getY() { 31 | return y; 32 | } 33 | 34 | public void setY(double y) { 35 | this.y = y; 36 | } 37 | 38 | public double getMemberShip1() { 39 | return memberShip1; 40 | } 41 | 42 | public void setMemberShip1(double memberShip1) { 43 | this.memberShip1 = memberShip1; 44 | } 45 | 46 | public double getMemberShip2() { 47 | return memberShip2; 48 | } 49 | 50 | public void 
setMemberShip2(double memberShip2) { 51 | this.memberShip2 = memberShip2; 52 | } 53 | 54 | } 55 | -------------------------------------------------------------------------------- /src/main/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | %d{ISO8601} [%thread] %-5level %logger{36} [Line:%-3L] - %msg%n 8 | 9 | 10 | 11 | INFO 12 | ACCEPT 13 | DENY 14 | 15 | 16 | 17 | 19 | logs/datamining.log 20 | 21 | %d{ISO8601} [%thread] %-5level %logger{36} [Line:%-3L] - %msg%n 22 | 23 | 24 | 25 | INFO 26 | 27 | 28 | logs/datamining.log.%d{yyyy-MM-dd}.gz 29 | 30 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /src/test/java/com/jusdt/datamining/demo/MainDemo.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.demo; 2 | 3 | public class MainDemo { 4 | 5 | public static void main(String[] args) { 6 | // TODO Auto-generated method stub 7 | 8 | } 9 | 10 | } 11 | -------------------------------------------------------------------------------- /src/test/java/com/jusdt/datamining/dimensionality/reduction/pca/ToeplitzMatrixTest.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.dimensionality.reduction.pca; 2 | 3 | import com.jusdt.datamining.dimensionality.reduction.pca.ToeplitzMatrix; 4 | 5 | import junit.framework.TestCase; 6 | 7 | public class ToeplitzMatrixTest extends TestCase { 8 | 9 | public ToeplitzMatrixTest(String testName) { 10 | super(testName); 11 | } 12 | 13 | @Override 14 | protected void setUp() throws Exception { 15 | super.setUp(); 16 | } 17 | 18 | @Override 19 | protected void tearDown() throws Exception { 20 | super.tearDown(); 21 | } 22 | 23 | public void testToeplitz() { 24 | double[] data = new double[] { 1, 2, 3, 4, 5, 6 }; 25 | 26 | ToeplitzMatrix m = new 
ToeplitzMatrix(data); 27 | // MatrixHelper.print(m, 1, 3); 28 | assertTrue("nrows wrong", m.getNRows() == 6); 29 | assertTrue("ncols wrong", m.getNCols() == 6); 30 | double[][] a = m.getArray(); 31 | 32 | assertEquals("0,0", 1., a[0][0]); 33 | assertEquals("0,1", 2., a[0][1]); 34 | assertEquals("0,2", 3., a[0][2]); 35 | assertEquals("0,3", 4., a[0][3]); 36 | assertEquals("0,4", 5., a[0][4]); 37 | assertEquals("0,5", 6., a[0][5]); 38 | 39 | assertEquals("1,0", 2., a[1][0]); 40 | assertEquals("1,1", 1., a[1][1]); 41 | assertEquals("1,2", 2., a[1][2]); 42 | assertEquals("1,3", 3., a[1][3]); 43 | assertEquals("1,4", 4., a[1][4]); 44 | assertEquals("1,5", 5., a[1][5]); 45 | 46 | assertEquals("2,0", 3., a[2][0]); 47 | assertEquals("2,1", 2., a[2][1]); 48 | assertEquals("2,2", 1., a[2][2]); 49 | assertEquals("2,3", 2., a[2][3]); 50 | assertEquals("2,4", 3., a[2][4]); 51 | assertEquals("2,5", 4., a[2][5]); 52 | 53 | assertEquals("3,0", 4., a[3][0]); 54 | assertEquals("3,1", 3., a[3][1]); 55 | assertEquals("3,2", 2., a[3][2]); 56 | assertEquals("3,3", 1., a[3][3]); 57 | assertEquals("3,4", 2., a[3][4]); 58 | assertEquals("3,5", 3., a[3][5]); 59 | 60 | assertEquals("4,0", 5., a[4][0]); 61 | assertEquals("4,1", 4., a[4][1]); 62 | assertEquals("4,2", 3., a[4][2]); 63 | assertEquals("4,3", 2., a[4][3]); 64 | assertEquals("4,4", 1., a[4][4]); 65 | assertEquals("4,5", 2., a[4][5]); 66 | 67 | assertEquals("5,0", 6., a[5][0]); 68 | assertEquals("5,1", 5., a[5][1]); 69 | assertEquals("5,2", 4., a[5][2]); 70 | assertEquals("5,3", 3., a[5][3]); 71 | assertEquals("5,4", 2., a[5][4]); 72 | assertEquals("5,5", 1., a[5][5]); 73 | } 74 | 75 | } 76 | -------------------------------------------------------------------------------- /src/test/resources/logback-test.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | %d{MMdd.HHmmss.SSS} [%-20t] [%-5p] [%-20c] [L:%-3L] - %m%n 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 
-------------------------------------------------------------------------------- /需要验收的算法: -------------------------------------------------------------------------------- 1 | 1、朴素贝叶斯 2 | 2、KMeans 3 | 3、KNN 4 | 4、PCA 5 | 5、ANN 6 | 6、决策树 7 | 7、层次聚类 8 | 及其他辅助算法 ok 9 | 多媒体数据处理算法 ok --------------------------------------------------------------------------------