├── .gitattributes ├── .gitignore ├── README.md ├── data ├── aco │ └── input.txt ├── adaboost │ └── input.txt ├── ann │ ├── testInput.txt │ └── trainInput.txt ├── apriori │ └── testInput.txt ├── bayesnetwork │ ├── attach.txt │ └── input.txt ├── birch │ ├── realData.txt │ └── testInput.txt ├── cabddcc │ └── graphData.txt ├── cart │ └── input.txt ├── cba │ └── input.txt ├── chameleon │ └── graphData.txt ├── dbscan │ └── input.txt ├── em │ └── input.txt ├── fptree │ └── testInput.txt ├── gsp │ └── testInput.txt ├── gspan │ ├── input.txt │ └── reallyData.txt ├── hits │ └── input.txt ├── id3 │ └── input.txt ├── kdtree │ └── input.txt ├── kmeans │ └── input.txt ├── knn │ ├── testInput.txt │ └── trainInput.txt ├── maze │ └── mapData.txt ├── msapriori │ ├── testInput.txt │ └── testInput2.txt ├── naivebayes │ └── input.txt ├── pagerank │ └── input.txt ├── pca │ ├── Makefile │ ├── basilevsy.data │ ├── compressor_1_day_detail.data │ ├── compressor_per_day_kwh.data │ └── simple.data ├── prefixspan │ └── input.txt ├── randomforest │ └── input.txt ├── roughsets │ └── input.txt ├── tan │ └── input.txt └── viterbi │ ├── humidity-matrix.txt │ └── stmatrix.txt ├── pom.xml ├── src ├── main │ ├── assembly │ │ └── distribution.xml │ ├── bin │ │ └── ctl.sh │ ├── java │ │ └── com │ │ │ └── jusdt │ │ │ └── datamining │ │ │ ├── association │ │ │ └── analysis │ │ │ │ ├── apriori │ │ │ │ ├── AprioriCore.java │ │ │ │ ├── AprioriExample.java │ │ │ │ └── FrequentItem.java │ │ │ │ └── fptree │ │ │ │ ├── FPTreeCore.java │ │ │ │ ├── FPTreeExample.java │ │ │ │ └── TreeNode.java │ │ │ ├── bagging │ │ │ └── boosting │ │ │ │ └── adaboost │ │ │ │ ├── AdaBoostCore.java │ │ │ │ ├── AdaBoostExample.java │ │ │ │ └── Point.java │ │ │ ├── classification │ │ │ ├── cart │ │ │ │ ├── AttrNode.java │ │ │ │ ├── CARTCore.java │ │ │ │ └── CARTExample.java │ │ │ ├── id3 │ │ │ │ ├── AttrNode.java │ │ │ │ ├── DataNode.java │ │ │ │ ├── ID3Core.java │ │ │ │ └── ID3Example.java │ │ │ ├── knn │ │ │ │ ├── KNNCore.java │ │ │ 
│ ├── KNNExample.java │ │ │ │ └── Sample.java │ │ │ └── naivebayes │ │ │ │ ├── NaiveBayesCore.java │ │ │ │ └── NaiveBayesExample.java │ │ │ ├── clustering │ │ │ ├── birch │ │ │ │ ├── BIRCHCore.java │ │ │ │ ├── BIRCHExample.java │ │ │ │ ├── Cluster.java │ │ │ │ ├── ClusteringFeature.java │ │ │ │ ├── LeafNode.java │ │ │ │ └── NonLeafNode.java │ │ │ └── kmeans │ │ │ │ ├── KMeansCore.java │ │ │ │ ├── KMeansExample.java │ │ │ │ └── Point.java │ │ │ ├── dimensionality │ │ │ └── reduction │ │ │ │ └── pca │ │ │ │ ├── DataReader.java │ │ │ │ ├── EVD.java │ │ │ │ ├── Main.java │ │ │ │ ├── Matrix.java │ │ │ │ ├── MatrixException.java │ │ │ │ ├── MatrixHelper.java │ │ │ │ ├── PCACore.java │ │ │ │ ├── PCACoreHandler.java │ │ │ │ ├── PCAExample.java │ │ │ │ ├── SVD.java │ │ │ │ ├── ToeplitzMatrix.java │ │ │ │ └── TrajectoryMatrix.java │ │ │ ├── graph │ │ │ └── gspan │ │ │ │ ├── DFSCodeTraveler.java │ │ │ │ ├── Edge.java │ │ │ │ ├── EdgeFrequency.java │ │ │ │ ├── GSpanExample.java │ │ │ │ ├── GSpanTool.java │ │ │ │ ├── Graph.java │ │ │ │ ├── GraphCode.java │ │ │ │ ├── GraphData.java │ │ │ │ └── SubChildTraveler.java │ │ │ ├── integrated │ │ │ └── cba │ │ │ │ ├── AprioriCore.java │ │ │ │ ├── CBACore.java │ │ │ │ ├── CBAExample.java │ │ │ │ └── FrequentItem.java │ │ │ ├── link │ │ │ ├── hits │ │ │ │ ├── HITSCore.java │ │ │ │ └── HITSExample.java │ │ │ └── pagerank │ │ │ │ ├── PageRankCore.java │ │ │ │ └── PageRankExample.java │ │ │ ├── others │ │ │ ├── aco │ │ │ │ ├── ACOCore.java │ │ │ │ ├── ACOExample.java │ │ │ │ └── Ant.java │ │ │ ├── bayesnetwork │ │ │ │ ├── BayesNetWorkCore.java │ │ │ │ ├── BayesNetWorkExample.java │ │ │ │ └── Node.java │ │ │ ├── cabddcc │ │ │ │ ├── CABDDCCCore.java │ │ │ │ ├── CABDDCCExample.java │ │ │ │ ├── Graph.java │ │ │ │ └── Point.java │ │ │ ├── chameleon │ │ │ │ ├── ChameleonCore.java │ │ │ │ ├── ChameleonExample.java │ │ │ │ ├── Cluster.java │ │ │ │ └── Point.java │ │ │ ├── dbscan │ │ │ │ ├── DBSCANCore.java │ │ │ │ ├── DBSCANExample.java │ │ │ │ └── 
Point.java │ │ │ ├── ga │ │ │ │ ├── GACore.java │ │ │ │ ├── GAExample.java │ │ │ │ └── maze │ │ │ │ │ ├── GAMazeCore.java │ │ │ │ │ └── GAMazeExample.java │ │ │ ├── kdtree │ │ │ │ ├── KDTreeCore.java │ │ │ │ ├── KDTreeExample.java │ │ │ │ ├── Point.java │ │ │ │ ├── Range.java │ │ │ │ └── TreeNode.java │ │ │ ├── msapriori │ │ │ │ ├── FrequentItem.java │ │ │ │ ├── MSAprioriCore.java │ │ │ │ └── MSAprioriExample.java │ │ │ ├── randomforest │ │ │ │ ├── CARTCore.java │ │ │ │ ├── DecisionTree.java │ │ │ │ ├── RandomForestCore.java │ │ │ │ ├── RandomForestExample.java │ │ │ │ └── TreeNode.java │ │ │ ├── tan │ │ │ │ ├── AttrMutualInfo.java │ │ │ │ ├── Node.java │ │ │ │ ├── TANCore.java │ │ │ │ └── TanExample.java │ │ │ └── viterbi │ │ │ │ ├── BaseNames.java │ │ │ │ ├── ViterbiCore.java │ │ │ │ └── ViterbiExample.java │ │ │ ├── roughsets │ │ │ ├── KnowledgeSystem.java │ │ │ ├── Record.java │ │ │ ├── RecordCollection.java │ │ │ ├── RoughSetsCore.java │ │ │ └── RoughSetsExample.java │ │ │ ├── sequential │ │ │ └── patterns │ │ │ │ ├── gsp │ │ │ │ ├── GSPCore.java │ │ │ │ ├── GSPExample.java │ │ │ │ ├── ItemSet.java │ │ │ │ └── Sequence.java │ │ │ │ └── prefixspan │ │ │ │ ├── ItemSet.java │ │ │ │ ├── PrefixSpanCore.java │ │ │ │ ├── PrefixSpanExample.java │ │ │ │ └── Sequence.java │ │ │ └── statistical │ │ │ └── learning │ │ │ ├── ann │ │ │ ├── ANN.java │ │ │ ├── ANNCore.java │ │ │ ├── ANNExample.java │ │ │ ├── ANNModel.java │ │ │ ├── ANNNode.java │ │ │ ├── ANNParameter.java │ │ │ ├── ANNPrintInterface.java │ │ │ └── ANNProblem.java │ │ │ └── em │ │ │ ├── EMCore.java │ │ │ ├── EMExample.java │ │ │ └── Point.java │ └── resources │ │ └── logback.xml └── test │ ├── java │ └── com │ │ └── jusdt │ │ └── datamining │ │ ├── demo │ │ └── MainDemo.java │ │ └── dimensionality │ │ └── reduction │ │ └── pca │ │ └── ToeplitzMatrixTest.java │ └── resources │ └── logback-test.xml └── 需要验收的算法 /.gitattributes: -------------------------------------------------------------------------------- 1 | # 
Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | *.sln merge=union 7 | *.csproj merge=union 8 | *.vbproj merge=union 9 | *.fsproj merge=union 10 | *.dbproj merge=union 11 | 12 | # Standard to msysgit 13 | *.doc diff=astextplain 14 | *.DOC diff=astextplain 15 | *.docx diff=astextplain 16 | *.DOCX diff=astextplain 17 | *.dot diff=astextplain 18 | *.DOT diff=astextplain 19 | *.pdf diff=astextplain 20 | *.PDF diff=astextplain 21 | *.rtf diff=astextplain 22 | *.RTF diff=astextplain 23 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .classpath 2 | .project 3 | .settings/ 4 | target/ 5 | logs/ 6 | -------------------------------------------------------------------------------- /data/aco/input.txt: -------------------------------------------------------------------------------- 1 | # CityName 2 | 1 3 | 2 4 | 3 5 | 4 6 | # Distance 7 | 1 2 1 8 | 1 3 1.4 9 | 1 4 1 10 | 2 3 1 11 | 2 4 1 12 | 3 4 1 -------------------------------------------------------------------------------- /data/adaboost/input.txt: -------------------------------------------------------------------------------- 1 | 1 5 1 2 | 2 3 1 3 | 3 1 -1 4 | 4 5 -1 5 | 5 6 1 6 | 6 4 -1 7 | 6 7 1 8 | 7 6 1 9 | 8 7 -1 10 | 8 2 -1 -------------------------------------------------------------------------------- /data/ann/testInput.txt: -------------------------------------------------------------------------------- 1 | 18.7,18.9,19.1,19.3,19.6 2 | 18.9,19.1,19.3,19.6,19.9 3 | 19.1,19.3,19.6,19.9,20.2 4 | 19.3,19.6,19.9,20.2,20.6 5 | 19.6,19.9,20.2,20.6,21 6 | 19.9,20.2,20.6,21,21.5 7 | 20.2,20.6,21,21.5,22 -------------------------------------------------------------------------------- /data/ann/trainInput.txt: -------------------------------------------------------------------------------- 1 | 
17.6,17.7,17.7,17.7,17.8 2 | 17.7,17.7,17.7,17.8,17.8 3 | 17.7,17.7,17.8,17.8,17.9 4 | 17.7,17.8,17.8,17.9,18 5 | 17.8,17.8,17.9,18,18.1 6 | 17.8,17.9,18,18.1,18.2 7 | 17.9,18,18.1,18.2,18.4 8 | 18,18.1,18.2,18.4,18.6 9 | 18.1,18.2,18.4,18.6,18.7 10 | 18.2,18.4,18.6,18.7,18.9 11 | 18.4,18.6,18.7,18.9,19.1 12 | 18.6,18.7,18.9,19.1,19.3 -------------------------------------------------------------------------------- /data/apriori/testInput.txt: -------------------------------------------------------------------------------- 1 | T1 1 2 5 2 | T2 2 4 3 | T3 2 3 4 | T4 1 2 4 5 | T5 1 3 6 | T6 2 3 7 | T7 1 3 8 | T8 1 2 3 5 9 | T9 1 2 3 -------------------------------------------------------------------------------- /data/bayesnetwork/attach.txt: -------------------------------------------------------------------------------- 1 | B A 2 | E A 3 | A M 4 | A J -------------------------------------------------------------------------------- /data/bayesnetwork/input.txt: -------------------------------------------------------------------------------- 1 | B E A M J P 2 | y y y y y 0.00012 3 | y y y y n 0.000051 4 | y y y n y 0.000013 5 | y y y n n 0.0000057 6 | y y n y y 0.000000005 7 | y y n y n 0.00000049 8 | y y n n y 0.000000095 9 | y y n n n 0.0000094 10 | y n y y y 0.0058 11 | y n y y n 0.0025 12 | y n y n y 0.00065 13 | y n y n n 0.00028 14 | y n n y y 0.00000029 15 | y n n y n 0.000029 16 | y n n n y 0.0000056 17 | y n n n n 0.00055 18 | n y y y y 0.0036 19 | n y y y n 0.0016 20 | n y y n y 0.0004 21 | n y y n n 0.00017 22 | n y n y y 0.000007 23 | n y n y n 0.00069 24 | n y n n y 0.00013 25 | n y n n n 0.013 26 | n n y y y 0.00061 27 | n n y y n 0.00026 28 | n n y n y 0.000068 29 | n n y n n 0.000029 30 | n n n y y 0.00048 31 | n n n y n 0.048 32 | n n n n y 0.0092 33 | n n n n n 0.91 -------------------------------------------------------------------------------- /data/birch/realData.txt: -------------------------------------------------------------------------------- 
1 | 5.1 3.5 1.4 0.2 2 | 4.9 3.0 1.4 0.2 3 | 4.7 3.2 1.3 0.2 4 | 4.6 3.1 1.5 0.2 5 | 5.0 3.6 1.4 0.2 6 | 5.4 3.9 1.7 0.4 7 | 4.6 3.4 1.4 0.3 8 | 5.0 3.4 1.5 0.2 9 | 4.4 2.9 1.4 0.2 10 | 4.9 3.1 1.5 0.1 11 | 5.4 3.7 1.5 0.2 12 | 4.8 3.4 1.6 0.2 13 | 4.8 3.0 1.4 0.1 14 | 4.3 3.0 1.1 0.1 15 | 5.8 4.0 1.2 0.2 16 | 5.7 4.4 1.5 0.4 17 | 5.4 3.9 1.3 0.4 18 | 5.1 3.5 1.4 0.3 19 | 5.7 3.8 1.7 0.3 20 | 5.1 3.8 1.5 0.3 21 | 5.4 3.4 1.7 0.2 22 | 5.1 3.7 1.5 0.4 23 | 4.6 3.6 1.0 0.2 24 | 5.1 3.3 1.7 0.5 25 | 4.8 3.4 1.9 0.2 26 | 5.0 3.0 1.6 0.2 27 | 5.0 3.4 1.6 0.4 28 | 5.2 3.5 1.5 0.2 29 | 5.2 3.4 1.4 0.2 30 | 4.7 3.2 1.6 0.2 31 | 4.8 3.1 1.6 0.2 32 | 5.4 3.4 1.5 0.4 33 | 5.2 4.1 1.5 0.1 34 | 5.5 4.2 1.4 0.2 35 | 4.9 3.1 1.5 0.1 36 | 5.0 3.2 1.2 0.2 37 | 5.5 3.5 1.3 0.2 38 | 4.9 3.1 1.5 0.1 39 | 4.4 3.0 1.3 0.2 40 | 5.1 3.4 1.5 0.2 41 | 5.0 3.5 1.3 0.3 42 | 4.5 2.3 1.3 0.3 43 | 4.4 3.2 1.3 0.2 44 | 5.0 3.5 1.6 0.6 45 | 5.1 3.8 1.9 0.4 46 | 4.8 3.0 1.4 0.3 47 | 5.1 3.8 1.6 0.2 48 | 4.6 3.2 1.4 0.2 49 | 5.3 3.7 1.5 0.2 50 | 5.0 3.3 1.4 0.2 51 | 7.0 3.2 4.7 1.4 52 | 6.4 3.2 4.5 1.5 53 | 6.9 3.1 4.9 1.5 54 | 5.5 2.3 4.0 1.3 55 | 6.5 2.8 4.6 1.5 56 | 5.7 2.8 4.5 1.3 57 | 6.3 3.3 4.7 1.6 58 | 4.9 2.4 3.3 1.0 59 | 6.6 2.9 4.6 1.3 60 | 5.2 2.7 3.9 1.4 61 | 5.0 2.0 3.5 1.0 62 | 5.9 3.0 4.2 1.5 63 | 6.0 2.2 4.0 1.0 64 | 6.1 2.9 4.7 1.4 65 | 5.6 2.9 3.6 1.3 66 | 6.7 3.1 4.4 1.4 67 | 5.6 3.0 4.5 1.5 68 | 5.8 2.7 4.1 1.0 69 | 6.2 2.2 4.5 1.5 70 | 5.6 2.5 3.9 1.1 71 | 5.9 3.2 4.8 1.8 72 | 6.1 2.8 4.0 1.3 73 | 6.3 2.5 4.9 1.5 74 | 6.1 2.8 4.7 1.2 75 | 6.4 2.9 4.3 1.3 76 | 6.6 3.0 4.4 1.4 77 | 6.8 2.8 4.8 1.4 78 | 6.7 3.0 5.0 1.7 79 | 6.0 2.9 4.5 1.5 80 | 5.7 2.6 3.5 1.0 81 | 5.5 2.4 3.8 1.1 82 | 5.5 2.4 3.7 1.0 83 | 5.8 2.7 3.9 1.2 84 | 6.0 2.7 5.1 1.6 85 | 5.4 3.0 4.5 1.5 86 | 6.0 3.4 4.5 1.6 87 | 6.7 3.1 4.7 1.5 88 | 6.3 2.3 4.4 1.3 89 | 5.6 3.0 4.1 1.3 90 | 5.5 2.5 4.0 1.3 91 | 5.5 2.6 4.4 1.2 92 | 6.1 3.0 4.6 1.4 93 | 5.8 2.6 4.0 1.2 94 | 5.0 2.3 3.3 1.0 95 | 5.6 2.7 4.2 1.3 96 | 5.7 3.0 
4.2 1.2 97 | 5.7 2.9 4.2 1.3 98 | 6.2 2.9 4.3 1.3 99 | 5.1 2.5 3.0 1.1 100 | 5.7 2.8 4.1 1.3 101 | 6.3 3.3 6.0 2.5 102 | 5.8 2.7 5.1 1.9 103 | 7.1 3.0 5.9 2.1 104 | 6.3 2.9 5.6 1.8 105 | 6.5 3.0 5.8 2.2 106 | 7.6 3.0 6.6 2.1 107 | 4.9 2.5 4.5 1.7 108 | 7.3 2.9 6.3 1.8 109 | 6.7 2.5 5.8 1.8 110 | 7.2 3.6 6.1 2.5 111 | 6.5 3.2 5.1 2.0 112 | 6.4 2.7 5.3 1.9 113 | 6.8 3.0 5.5 2.1 114 | 5.7 2.5 5.0 2.0 115 | 5.8 2.8 5.1 2.4 116 | 6.4 3.2 5.3 2.3 117 | 6.5 3.0 5.5 1.8 118 | 7.7 3.8 6.7 2.2 119 | 7.7 2.6 6.9 2.3 120 | 6.0 2.2 5.0 1.5 121 | 6.9 3.2 5.7 2.3 122 | 5.6 2.8 4.9 2.0 123 | 7.7 2.8 6.7 2.0 124 | 6.3 2.7 4.9 1.8 125 | 6.7 3.3 5.7 2.1 126 | 7.2 3.2 6.0 1.8 127 | 6.2 2.8 4.8 1.8 128 | 6.1 3.0 4.9 1.8 129 | 6.4 2.8 5.6 2.1 130 | 7.2 3.0 5.8 1.6 131 | 7.4 2.8 6.1 1.9 132 | 7.9 3.8 6.4 2.0 133 | 6.4 2.8 5.6 2.2 134 | 6.3 2.8 5.1 1.5 135 | 6.1 2.6 5.6 1.4 136 | 7.7 3.0 6.1 2.3 137 | 6.3 3.4 5.6 2.4 138 | 6.4 3.1 5.5 1.8 139 | 6.0 3.0 4.8 1.8 140 | 6.9 3.1 5.4 2.1 141 | 6.7 3.1 5.6 2.4 142 | 6.9 3.1 5.1 2.3 143 | 5.8 2.7 5.1 1.9 144 | 6.8 3.2 5.9 2.3 145 | 6.7 3.3 5.7 2.5 146 | 6.7 3.0 5.2 2.3 147 | 6.3 2.5 5.0 1.9 148 | 6.5 3.0 5.2 2.0 149 | 6.2 3.4 5.4 2.3 150 | 5.9 3.0 5.1 1.8 -------------------------------------------------------------------------------- /data/birch/testInput.txt: -------------------------------------------------------------------------------- 1 | 5.1 3.5 1.4 0.2 2 | 4.9 3.0 1.4 0.2 3 | 4.7 3.2 1.3 0.8 4 | 4.6 3.1 1.5 0.8 5 | 5.0 3.6 1.8 0.6 6 | 4.7 3.2 1.4 0.8 -------------------------------------------------------------------------------- /data/cabddcc/graphData.txt: -------------------------------------------------------------------------------- 1 | 0 1 12 2 | 1 3 9 3 | 2 3 12 4 | 3 4 10 5 | 4 4 4 6 | 5 4 1 7 | 6 6 1 8 | 7 6 3 9 | 8 6 9 10 | 9 8 3 11 | 10 8 10 12 | 11 9 2 13 | 12 9 11 14 | 13 10 9 15 | 14 11 12 -------------------------------------------------------------------------------- /data/cart/input.txt: 
-------------------------------------------------------------------------------- 1 | Rid Age Income Student CreditRating BuysComputer 2 | 1 Youth High No Fair No 3 | 2 Youth High No Excellent No 4 | 3 MiddleAged High No Fair Yes 5 | 4 Senior Medium No Fair Yes 6 | 5 Senior Low Yes Fair Yes 7 | 6 Senior Low Yes Excellent No 8 | 7 MiddleAged Low Yes Excellent Yes 9 | 8 Youth Medium No Fair No 10 | 9 Youth Low Yes Fair Yes 11 | 10 Senior Medium Yes Fair Yes 12 | 11 Youth Medium Yes Excellent Yes 13 | 12 MiddleAged Medium No Excellent Yes 14 | 13 MiddleAged High Yes Fair Yes 15 | 14 Senior Medium No Excellent No -------------------------------------------------------------------------------- /data/cba/input.txt: -------------------------------------------------------------------------------- 1 | Rid Age Income Student CreditRating BuysComputer 2 | 1 13 High No Fair CLassNo 3 | 2 11 High No Excellent CLassNo 4 | 3 25 High No Fair CLassYes 5 | 4 45 Medium No Fair CLassYes 6 | 5 50 Low Yes Fair CLassYes 7 | 6 51 Low Yes Excellent CLassNo 8 | 7 30 Low Yes Excellent CLassYes 9 | 8 13 Medium No Fair CLassNo 10 | 9 9 Low Yes Fair CLassYes 11 | 10 55 Medium Yes Fair CLassYes 12 | 11 14 Medium Yes Excellent CLassYes 13 | 12 33 Medium No Excellent CLassYes 14 | 13 33 High Yes Fair CLassYes 15 | 14 41 Medium No Excellent CLassNo -------------------------------------------------------------------------------- /data/chameleon/graphData.txt: -------------------------------------------------------------------------------- 1 | 0 2 2 2 | 1 3 1 3 | 2 3 4 4 | 3 3 14 5 | 4 5 3 6 | 5 8 3 7 | 6 8 6 8 | 7 9 8 9 | 8 10 4 10 | 9 10 7 11 | 10 10 10 12 | 11 10 14 13 | 12 11 13 14 | 13 12 8 15 | 14 12 15 16 | 15 14 7 17 | 16 14 9 18 | 17 14 15 19 | 18 15 8 -------------------------------------------------------------------------------- /data/dbscan/input.txt: -------------------------------------------------------------------------------- 1 | 2 2 2 | 3 1 3 | 3 4 4 | 3 14 5 | 5 3 6 | 8 3 7 | 8 6 8 
| 9 8 9 | 10 4 10 | 10 7 11 | 10 10 12 | 10 14 13 | 11 13 14 | 12 8 15 | 12 15 16 | 14 7 17 | 14 9 18 | 14 15 19 | 15 8 -------------------------------------------------------------------------------- /data/em/input.txt: -------------------------------------------------------------------------------- 1 | 3 3 2 | 4 10 3 | 9 6 4 | 14 8 5 | 18 11 6 | 21 7 -------------------------------------------------------------------------------- /data/fptree/testInput.txt: -------------------------------------------------------------------------------- 1 | T1 1 2 5 2 | T2 2 4 3 | T3 2 3 4 | T4 1 2 4 5 | T5 1 3 6 | T6 2 3 7 | T7 1 3 8 | T8 1 2 3 5 9 | T9 1 2 3 -------------------------------------------------------------------------------- /data/gsp/testInput.txt: -------------------------------------------------------------------------------- 1 | 1 2 1 5 2 | 1 1 2 3 | 1 1 3 4 | 1 1 4 5 | 2 1 1 6 | 2 1 3 7 | 2 1 4 8 | 2 2 3 5 9 | 3 1 1 10 | 3 1 2 11 | 3 1 3 12 | 3 1 4 13 | 3 1 5 14 | 4 1 1 15 | 4 1 3 16 | 4 1 5 17 | 5 1 4 18 | 5 1 5 -------------------------------------------------------------------------------- /data/gspan/input.txt: -------------------------------------------------------------------------------- 1 | t # 0 2 | v 0 0 3 | v 1 1 4 | v 2 0 5 | v 3 0 6 | v 4 0 7 | v 5 1 8 | e 0 1 0 9 | e 1 2 0 10 | e 1 3 0 11 | e 2 4 0 12 | e 3 5 1 -------------------------------------------------------------------------------- /data/hits/input.txt: -------------------------------------------------------------------------------- 1 | 1 2 2 | 1 3 3 | 2 3 4 | 3 1 -------------------------------------------------------------------------------- /data/id3/input.txt: -------------------------------------------------------------------------------- 1 | Day OutLook Temperature Humidity Wind PlayTennis 2 | 1 Sunny Hot High Weak No 3 | 2 Sunny Hot High Strong No 4 | 3 Overcast Hot High Weak Yes 5 | 4 Rainy Mild High Weak Yes 6 | 5 Rainy Cool Normal Weak Yes 7 | 6 Rainy Cool Normal Strong No 8 | 
7 Overcast Cool Normal Strong Yes 9 | 8 Sunny Mild High Weak No 10 | 9 Sunny Cool Normal Weak Yes 11 | 10 Rainy Mild Normal Weak Yes 12 | 11 Sunny Mild Normal Strong Yes 13 | 12 Overcast Mild High Strong Yes 14 | 13 Overcast Hot Normal Weak Yes 15 | 14 Rainy Mild High Strong No -------------------------------------------------------------------------------- /data/kdtree/input.txt: -------------------------------------------------------------------------------- 1 | 4 7 2 | 5 4 3 | 9 6 4 | 7 2 5 | 2 3 6 | 8 1 -------------------------------------------------------------------------------- /data/kmeans/input.txt: -------------------------------------------------------------------------------- 1 | 3 3 2 | 4 10 3 | 9 6 4 | 14 8 5 | 18 11 6 | 21 7 -------------------------------------------------------------------------------- /data/knn/testInput.txt: -------------------------------------------------------------------------------- 1 | 1 2 3 2 4 2 | 2 3 4 2 1 3 | 8 7 2 3 5 4 | -3 -2 2 4 0 5 | -4 -4 -4 -4 -4 6 | 1 2 3 4 4 7 | 4 4 3 2 1 8 | 3 3 3 2 4 9 | 0 0 1 1 -2 -------------------------------------------------------------------------------- /data/knn/trainInput.txt: -------------------------------------------------------------------------------- 1 | a 1 2 3 4 5 2 | b 5 4 3 2 1 3 | c 3 3 3 3 3 4 | d -3 -3 -3 -3 -3 5 | a 1 2 3 4 4 6 | b 4 4 3 2 1 7 | c 3 3 3 2 4 8 | d 0 0 1 1 -2 9 | -------------------------------------------------------------------------------- /data/maze/mapData.txt: -------------------------------------------------------------------------------- 1 | 0 0 0 0 0 2 | 2 0 0 -1 0 3 | 0 0 0 0 0 4 | 0 -1 0 0 -1 5 | 0 0 0 0 1 -------------------------------------------------------------------------------- /data/msapriori/testInput.txt: -------------------------------------------------------------------------------- 1 | T1 1 2 5 2 | T2 2 4 3 | T3 2 3 4 | T4 1 2 4 5 | T5 1 3 6 | T6 2 3 7 | T7 1 3 8 | T8 1 2 3 5 9 | T9 1 2 3 
-------------------------------------------------------------------------------- /data/msapriori/testInput2.txt: -------------------------------------------------------------------------------- 1 | Rid Age Income Student CreditRating BuysComputer 2 | 1 Youth High No Fair No 3 | 2 Youth High No Excellent No 4 | 3 MiddleAged High No Fair Yes 5 | 4 Senior Medium No Fair Yes 6 | 5 Senior Low Yes Fair Yes 7 | 6 Senior Low Yes Excellent No 8 | 7 MiddleAged Low Yes Excellent Yes 9 | 8 Youth Medium No Fair No 10 | 9 Youth Low Yes Fair Yes 11 | 10 Senior Medium Yes Fair Yes 12 | 11 Youth Medium Yes Excellent Yes 13 | 12 MiddleAged Medium No Excellent Yes 14 | 13 MiddleAged High Yes Fair Yes 15 | 14 Senior Medium No Excellent No -------------------------------------------------------------------------------- /data/naivebayes/input.txt: -------------------------------------------------------------------------------- 1 | Day OutLook Temperature Humidity Wind PlayTennis 2 | 1 Sunny Hot High Weak No 3 | 2 Sunny Hot High Strong No 4 | 3 Overcast Hot High Weak Yes 5 | 4 Rainy Mild High Weak Yes 6 | 5 Rainy Cool Normal Weak Yes 7 | 6 Rainy Cool Normal Strong No 8 | 7 Overcast Cool Normal Strong Yes 9 | 8 Sunny Mild High Weak No 10 | 9 Sunny Cool Normal Weak Yes 11 | 10 Rainy Mild Normal Weak Yes 12 | 11 Sunny Mild Normal Strong Yes 13 | 12 Overcast Mild High Strong Yes 14 | 13 Overcast Hot Normal Weak Yes 15 | 14 Rainy Mild High Strong No -------------------------------------------------------------------------------- /data/pagerank/input.txt: -------------------------------------------------------------------------------- 1 | 1 2 2 | 1 3 3 | 2 3 4 | 3 1 -------------------------------------------------------------------------------- /data/pca/Makefile: -------------------------------------------------------------------------------- 1 | INPUTS= simple basilevsy compressor_per_day_kwh compressor_1_day_detail 2 | 3 | all: 4 | @for i in $(INPUTS) ; do \ 5 | java -cp 
../target/pca-1.0.jar com.uwemeding.pca.Main $$i ; \ 6 | done 7 | 8 | 9 | clean:; rm -f *lambda* *pcomps* *pfacs* *_cc* *_cumcon* 10 | 11 | -------------------------------------------------------------------------------- /data/pca/basilevsy.data: -------------------------------------------------------------------------------- 1 | 335.6 2 | 245.3 3 | 226 4 | 318.5 5 | 450.8 6 | 508.6 7 | 445.7 8 | 445.1 9 | 472.6 10 | 376. 11 | 319.4 12 | 352.2 13 | 408.5 14 | 314.5 15 | 262.0 16 | 287.8 17 | 320.3 18 | 265.1 19 | 224.7 20 | 248 21 | 304.9 22 | 266.3 23 | 276.5 24 | 300.9 25 | 415.6 26 | 341.5 27 | 289.8 28 | 342.1 29 | 465.5 30 | 488.6 31 | 483.2 32 | 566.2 33 | 636.8 34 | 511 35 | 442.7 36 | 456.7 37 | 478.1 38 | 378.1 39 | 334.6 40 | 360.3 41 | 424.7 42 | 336.5 43 | 328.9 44 | 417.2 45 | 493.4 46 | 457.2 47 | 477.5 48 | 571.5 49 | 847.1 50 | 584.4 51 | 514.2 52 | 503.4 53 | 501.7 54 | 402.0 55 | 373 56 | 376.7 57 | 405.7 58 | 340.3 59 | 341.0 60 | 352.3 61 | 366.0 62 | 312.7 63 | 336.7 64 | 549. 
65 | 632 66 | 577 67 | 574.7 68 | 612.7 69 | 651.7 70 | 584.7 71 | 577.3 72 | 591.7 73 | 632.3 74 | 562.7 75 | 581.7 76 | 608.7 77 | 662.3 78 | 614.3 79 | 639.3 80 | 643.3 81 | 761.7 82 | 789.7 83 | 887.6 84 | 956.2 85 | -------------------------------------------------------------------------------- /data/pca/compressor_per_day_kwh.data: -------------------------------------------------------------------------------- 1 | 49.71 2 | 49.71 3 | 66.85 4 | 63 5 | 50.83 6 | 56.32 7 | 72.6 8 | 57.32 9 | 62.59 10 | 63.77 11 | 52.3 12 | 61.13 13 | 51.95 14 | 52.88 15 | 82.31 16 | 78.95 17 | 48.6 18 | 59.39 19 | 53.15 20 | 51.07 21 | 69.49 22 | 59.64 23 | 69.42 24 | 63.53 25 | 45.46 26 | 49.7 27 | 66.45 28 | 59.93 29 | 49.16 30 | 57.46 31 | 73.2 32 | 73.96 33 | 75.21 34 | 69.14 35 | 71.74 36 | 71.56 37 | 65.69 38 | 78.28 39 | 81.58 40 | 79.3 41 | 87.15 42 | 84.37 43 | 64.88 44 | 74.96 45 | 83.11 46 | 79.55 47 | 74.98 48 | 70.58 49 | 51.26 50 | 60.05 51 | 78.74 52 | 66.67 53 | 54.14 54 | 61.11 55 | 79.62 56 | 73.98 57 | 76.75 58 | 70.02 59 | 71.36 60 | 76.58 61 | 88.28 62 | 84.84 63 | 86.02 64 | 83.16 65 | 85.33 66 | 72.7 67 | 86.17 68 | 85.18 69 | 82.75 70 | 68.04 71 | 77.58 72 | 72.35 73 | 54.76 74 | 64.33 75 | 76.18 76 | 63.9 77 | 51.22 78 | 61.4 79 | 80.38 80 | 73.94 81 | 75.65 82 | 67.71 83 | 71.52 84 | 69.39 85 | 82.43 86 | 85.62 87 | 86.32 88 | 84.03 89 | 86.84 90 | 91.22 91 | 74.63 92 | 74.21 93 | 81.11 94 | 73.22 95 | 70.19 96 | 68.52 97 | 50.24 98 | 49.13 99 | 68.15 100 | 65.49 101 | 58.94 102 | 62.41 103 | 78.54 104 | 75.25 105 | 77.07 106 | 85.93 107 | 74.82 108 | 71.71 109 | 83.85 110 | 86.4 111 | 82.57 112 | 79.8 113 | 83.25 114 | 71.62 115 | 80.17 116 | 80.73 117 | 84.12 118 | 79.11 119 | 76.92 120 | 65.89 121 | 52.59 122 | 50.17 123 | 70.89 124 | 67.02 125 | 54.84 126 | 62.24 127 | 80.07 128 | 76.92 129 | 75.2 130 | 69 131 | 69.17 132 | 69.82 133 | 83.9 134 | 82.93 135 | 85.61 136 | 81.07 137 | 82.83 138 | 69.2 139 | 70.42 140 | 67.16 141 | 82.06 142 | 75.12 
143 | 75.48 144 | 67.02 145 | 51.69 146 | 63.76 147 | 76.22 148 | 65.95 149 | 49.61 150 | 60.49 151 | 77.93 152 | 67.68 153 | 72.95 154 | 65.82 155 | 50.18 156 | 61.66 157 | 51.29 158 | 50.5 159 | 83.48 160 | 73.63 161 | 60.27 162 | 62.76 163 | 52.41 164 | 52.61 165 | 67.24 166 | 64.84 167 | 72.24 168 | 64.17 169 | -------------------------------------------------------------------------------- /data/pca/simple.data: -------------------------------------------------------------------------------- 1 | 1 2 | 2 3 | 3 4 | 4 5 | 5 6 | 6 7 | -------------------------------------------------------------------------------- /data/prefixspan/input.txt: -------------------------------------------------------------------------------- 1 | bd c b ac 2 | bf ce b fg 3 | ah bf a b f 4 | be ce d 5 | a bd b c b ade -------------------------------------------------------------------------------- /data/randomforest/input.txt: -------------------------------------------------------------------------------- 1 | Rid Age Income Student CreditRating BuysComputer 2 | 1 Youth High No Fair No 3 | 2 Youth High No Excellent No 4 | 3 MiddleAged High No Fair Yes 5 | 4 Senior Medium No Fair Yes 6 | 5 Senior Low Yes Fair Yes 7 | 6 Senior Low Yes Excellent No 8 | 7 MiddleAged Low Yes Excellent Yes 9 | 8 Youth Medium No Fair No 10 | 9 Youth Low Yes Fair Yes 11 | 10 Senior Medium Yes Fair Yes 12 | 11 Youth Medium Yes Excellent Yes 13 | 12 MiddleAged Medium No Excellent Yes 14 | 13 MiddleAged High Yes Fair Yes 15 | 14 Senior Medium No Excellent No -------------------------------------------------------------------------------- /data/roughsets/input.txt: -------------------------------------------------------------------------------- 1 | Element Color Shape Size Stability 2 | x1 Red Triangle Large Stable 3 | x2 Red Triangle Large Stable 4 | x3 Yellow Circle Small UnStable 5 | x4 Yellow Circle Small UnStable 6 | x5 Blue Rectangle Large Stable 7 | x6 Red Circle Middle UnStable 8 | x7 Blue Circle Small 
UnStable 9 | x8 Blue Rectangle Middle UnStable -------------------------------------------------------------------------------- /data/tan/input.txt: -------------------------------------------------------------------------------- 1 | OutLook Temperature Humidity Wind PlayTennis 2 | Sunny Hot High Weak No 3 | Sunny Hot High Strong No 4 | Overcast Hot High Weak Yes 5 | Rainy Mild High Weak Yes 6 | Rainy Cool Normal Weak Yes 7 | Rainy Cool Normal Strong No 8 | Overcast Cool Normal Strong Yes 9 | Sunny Mild High Weak No 10 | Sunny Cool Normal Weak Yes 11 | Rainy Mild Normal Weak Yes 12 | Sunny Mild Normal Strong Yes 13 | Overcast Mild High Strong Yes 14 | Overcast Hot Normal Weak Yes 15 | Rainy Mild High Strong No -------------------------------------------------------------------------------- /data/viterbi/humidity-matrix.txt: -------------------------------------------------------------------------------- 1 | # Dry Dryish Damp Soggy 2 | Sunny 0.6 0.2 0.15 0.05 3 | Cloudy 0.25 0.25 0.25 0.25 4 | Rainy 0.05 0.10 0.35 0.50 -------------------------------------------------------------------------------- /data/viterbi/stmatrix.txt: -------------------------------------------------------------------------------- 1 | # Sunny Cloudy Rainy 2 | Sunny 0.5 0.375 0.125 3 | Cloudy 0.25 0.125 0.625 4 | Rainy 0.25 0.375 0.375 -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | com.jusdt 6 | datamining-18algorithms 7 | 1.0.0 8 | DataMining 18 Algorithms 9 | 10 | 11 | UTF-8 12 | 1.1.7 13 | 14 | 15 | 16 | 17 | 18 | info.bbd 19 | common-utils 20 | 1.0.0 21 | 22 | 23 | com.github.jnr 24 | jnr-posix 25 | 26 | 27 | 28 | 29 | 30 | ch.qos.logback 31 | logback-classic 32 | ${logback.version} 33 | 34 | 35 | ch.qos.logback 36 | logback-core 37 | ${logback.version} 38 | 39 | 40 | ch.qos.logback 41 | logback-access 42 | ${logback.version} 43 | 44 | 45 | 
org.slf4j 46 | slf4j-api 47 | 1.7.21 48 | 49 | 50 | 51 | com.google.guava 52 | guava 53 | 14.0.1 54 | 55 | 56 | 57 | junit 58 | junit 59 | 4.10 60 | test 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | org.apache.maven.plugins 70 | maven-compiler-plugin 71 | 3.6.1 72 | 73 | true 74 | 1.8 75 | 1.8 76 | UTF-8 77 | 1.8 78 | 79 | 80 | 81 | org.apache.maven.plugins 82 | maven-source-plugin 83 | 3.0.1 84 | 85 | 86 | attach-sources 87 | verify 88 | 89 | jar-no-fork 90 | 91 | 92 | 93 | 94 | 95 | org.apache.maven.plugins 96 | maven-resources-plugin 97 | 3.0.2 98 | 99 | UTF-8 100 | 101 | 102 | 103 | 108 | 109 | org.apache.maven.plugins 110 | maven-assembly-plugin 111 | 2.4 112 | 113 | 114 | 115 | com.jusdt.zcm.mapred.driver.ZcmDriver 116 | 117 | 118 | 119 | jar-with-dependencies 120 | 121 | 122 | 123 | 124 | make-assembly 125 | package 126 | 127 | single 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | src/main/resources 138 | 139 | *.* 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | ${project.artifactId}-${project.version} 149 | 150 | 151 | 152 | -------------------------------------------------------------------------------- /src/main/assembly/distribution.xml: -------------------------------------------------------------------------------- 1 | 2 | 6 | distribution 7 | 8 | tar.gz 9 | 10 | ${project.artifactId} 11 | 12 | 13 | src/main/resources 14 | 15 | logback.xml 16 | conf.properties 17 | utils.properties 18 | 19 | /conf 20 | true 21 | 22 | 23 | src/main/bin 24 | 25 | * 26 | 27 | /bin 28 | 0755 29 | 30 | 31 | 32 | 33 | /lib 34 | 35 | 36 | -------------------------------------------------------------------------------- /src/main/bin/ctl.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mainClass=com.jusdt.zcm.mapred.driver.ZcmDriver 4 | 5 | # resolve links - $0 may be a softlink 6 | PRG="$0" 7 | 8 | while [ -h "$PRG" ]; do 9 | ls=`ls -ld "$PRG"` 10 | link=`expr "$ls" : '.*-> \(.*\)$'` 11 | if expr 
"$link" : '/.*' > /dev/null; then 12 | PRG="$link" 13 | else 14 | PRG=`dirname "$PRG"`/"$link" 15 | fi 16 | done 17 | 18 | # Get standard environment variables 19 | PRGDIR=`dirname "$PRG"` 20 | 21 | PROJECT_DIR=`cd "$PRGDIR/.." >/dev/null; pwd` 22 | echo PROJECT_DIR=$PROJECT_DIR 23 | 24 | CLASSPATH="$CLASSHPATH:$PROJECT_DIR/conf" 25 | 26 | for jar in "$PROJECT_DIR/lib"/*.jar; do 27 | CLASSPATH="$CLASSPATH:$jar" 28 | done 29 | echo CLASSPATH=$CLASSPATH 30 | 31 | JVMARGS="${JVMARGS} -Dproject_dir=${PROJECT_DIR} -Djava.net.preferIPv4Stack=true" 32 | echo JVMARGS=$JVMARGS 33 | 34 | usage() { 35 | echo >&2 "usage: $PRG [args]" 36 | echo 'Valid commands: start, stop' 37 | exit 1 38 | } 39 | 40 | start() { 41 | JAVA=${JAVA-'java'} 42 | exec $JAVA $JVMARGS -classpath "$CLASSPATH" $mainClass "$@" & 43 | echo $! > main.pid 44 | } 45 | 46 | stop() { 47 | kill `cat main.pid` > /dev/null 48 | } 49 | 50 | case $1 in 51 | (start) 52 | shift 53 | start $@ 54 | ;; 55 | (stop) 56 | stop 57 | ;; 58 | (restart) 59 | stop 60 | shift 61 | start $@ 62 | ;; 63 | (*) 64 | echo >&2 "$PRG: error: unknown command '$1'" 65 | usage 66 | ;; 67 | esac 68 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/association/analysis/apriori/AprioriExample.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.association.analysis.apriori; 2 | 3 | /** 4 | * apriori关联规则挖掘算法调用类 5 | */ 6 | public class AprioriExample { 7 | 8 | public static void main(String[] args) { 9 | String filePath = "data/apriori/testInput.txt"; 10 | 11 | AprioriCore tool = new AprioriCore(filePath, 2); 12 | tool.printAttachRule(0.7); 13 | } 14 | 15 | } 16 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/association/analysis/apriori/FrequentItem.java: -------------------------------------------------------------------------------- 1 | 
package com.jusdt.datamining.association.analysis.apriori; 2 | 3 | /** 4 | * 频繁项集 5 | */ 6 | public class FrequentItem implements Comparable { 7 | 8 | // 频繁项集的集合ID 9 | private String[] idArray; 10 | // 频繁项集的支持度计数 11 | private int count; 12 | //频繁项集的长度,1项集或是2项集,亦或是3项集 13 | private int length; 14 | 15 | public FrequentItem(String[] idArray, int count) { 16 | this.idArray = idArray; 17 | this.count = count; 18 | length = idArray.length; 19 | } 20 | 21 | public String[] getIdArray() { 22 | return idArray; 23 | } 24 | 25 | public void setIdArray(String[] idArray) { 26 | this.idArray = idArray; 27 | } 28 | 29 | public int getCount() { 30 | return count; 31 | } 32 | 33 | public void setCount(int count) { 34 | this.count = count; 35 | } 36 | 37 | public int getLength() { 38 | return length; 39 | } 40 | 41 | public void setLength(int length) { 42 | this.length = length; 43 | } 44 | 45 | @Override 46 | public int compareTo(FrequentItem o) { 47 | // TODO Auto-generated method stub 48 | Integer int1 = Integer.parseInt(this.getIdArray()[0]); 49 | Integer int2 = Integer.parseInt(o.getIdArray()[0]); 50 | 51 | return int1.compareTo(int2); 52 | } 53 | 54 | } 55 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/association/analysis/fptree/FPTreeExample.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.association.analysis.fptree; 2 | 3 | /** 4 | * FPTree频繁模式树算法 5 | */ 6 | public class FPTreeExample { 7 | 8 | public static void main(String[] args) { 9 | String filePath = "data/fptree/testInput.txt"; 10 | //最小支持度阈值 11 | int minSupportCount = 2; 12 | 13 | FPTreeCore tool = new FPTreeCore(filePath, minSupportCount); 14 | tool.startBuildingTree(); 15 | } 16 | 17 | } 18 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/association/analysis/fptree/TreeNode.java: 
package com.jusdt.datamining.association.analysis.fptree;

import java.util.ArrayList;

/**
 * A node of the FP (frequent-pattern) tree.
 *
 * Nodes carry an item name and a support count, plus parent/children links.
 * The natural ordering (compareTo) sorts nodes by DESCENDING count, which is
 * the order FP-growth inserts items in.
 */
public class TreeNode implements Comparable<TreeNode>, Cloneable {

	// Item (category) name held by this node
	private String name;
	// Support count of the item along the current tree path
	private Integer count;
	// Parent node; null for the tree root
	private TreeNode parentNode;
	// Child nodes; a node may have several children
	private ArrayList<TreeNode> childNodes;

	public TreeNode(String name, int count) {
		this.name = name;
		this.count = count;
	}

	public String getName() {
		return name;
	}

	public void setName(String name) {
		this.name = name;
	}

	public Integer getCount() {
		return count;
	}

	public void setCount(Integer count) {
		this.count = count;
	}

	public TreeNode getParentNode() {
		return parentNode;
	}

	public void setParentNode(TreeNode parentNode) {
		this.parentNode = parentNode;
	}

	public ArrayList<TreeNode> getChildNodes() {
		return childNodes;
	}

	public void setChildNodes(ArrayList<TreeNode> childNodes) {
		this.childNodes = childNodes;
	}

	/**
	 * Descending order by count: the node with the HIGHER count sorts first.
	 */
	@Override
	public int compareTo(TreeNode o) {
		return o.getCount().compareTo(this.getCount());
	}

	/**
	 * Clones this node. The parent chain is cloned recursively because the
	 * node holds object references.
	 *
	 * NOTE(review): ArrayList.clone() produces a new list but does NOT clone
	 * the child TreeNode elements — children are shared with the original.
	 * The original comment claimed a deep copy; callers relying on fully
	 * independent subtrees should verify this is acceptable.
	 */
	@Override
	protected Object clone() throws CloneNotSupportedException {
		TreeNode node = (TreeNode) super.clone();
		if (this.getParentNode() != null) {
			node.setParentNode((TreeNode) this.getParentNode().clone());
		}

		if (this.getChildNodes() != null) {
			node.setChildNodes((ArrayList<TreeNode>) this.getChildNodes().clone());
		}

		return node;
	}

}
/src/main/java/com/jusdt/datamining/bagging/boosting/adaboost/AdaBoostExample.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.bagging.boosting.adaboost; 2 | 3 | /** 4 | * AdaBoost提升算法调用类 5 | */ 6 | public class AdaBoostExample { 7 | 8 | public static void main(String[] agrs) { 9 | String filePath = "data/adaboost/input.txt"; 10 | //误差率阈值 11 | double errorValue = 0.2; 12 | 13 | AdaBoostCore tool = new AdaBoostCore(filePath, errorValue); 14 | tool.adaBoostClassify(); 15 | } 16 | 17 | } 18 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/bagging/boosting/adaboost/Point.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.bagging.boosting.adaboost; 2 | 3 | /** 4 | * 坐标点类 5 | */ 6 | public class Point { 7 | 8 | // 坐标点x坐标 9 | private int x; 10 | // 坐标点y坐标 11 | private int y; 12 | // 坐标点的分类类别 13 | private int classType; 14 | //如果此节点被划错,他的误差率,不能用个数除以总数,因为不同坐标点的权重不一定相等 15 | private double probably; 16 | 17 | public Point(int x, int y, int classType) { 18 | this.x = x; 19 | this.y = y; 20 | this.classType = classType; 21 | } 22 | 23 | public Point(String x, String y, String classType) { 24 | this.x = Integer.parseInt(x); 25 | this.y = Integer.parseInt(y); 26 | this.classType = Integer.parseInt(classType); 27 | } 28 | 29 | public int getX() { 30 | return x; 31 | } 32 | 33 | public void setX(int x) { 34 | this.x = x; 35 | } 36 | 37 | public int getY() { 38 | return y; 39 | } 40 | 41 | public void setY(int y) { 42 | this.y = y; 43 | } 44 | 45 | public int getClassType() { 46 | return classType; 47 | } 48 | 49 | public void setClassType(int classType) { 50 | this.classType = classType; 51 | } 52 | 53 | public double getProbably() { 54 | return probably; 55 | } 56 | 57 | public void setProbably(double probably) { 58 | this.probably = probably; 59 | } 60 | } 61 | 
package com.jusdt.datamining.classification.cart;

import java.util.ArrayList;

/**
 * A node of the CART regression/classification tree.
 *
 * Plain data holder: attribute metadata, pruning statistics, the parent's
 * splitting value, child links, and the indices of the data records routed
 * to this node.
 */
public class AttrNode {

	// Attribute name at this node
	private String attrName;
	// Index label of this node
	private int nodeIndex;
	// Number of leaves contained under this node
	private int leafNum;
	// Error rate of this node (used during pruning)
	private double alpha;
	// The attribute value of the parent split that led here
	private String parentAttrValue;
	// Child nodes
	private AttrNode[] childAttrNode;
	// Indices of the data records at this node
	// NOTE(review): element type of this list is declared elsewhere (CARTCore);
	// generics appear stripped by the dump — kept raw to avoid guessing.
	private ArrayList dataIndex;

	public String getAttrName() {
		return attrName;
	}

	public void setAttrName(String attrName) {
		this.attrName = attrName;
	}

	public int getNodeIndex() {
		return nodeIndex;
	}

	public void setNodeIndex(int nodeIndex) {
		this.nodeIndex = nodeIndex;
	}

	public int getLeafNum() {
		return leafNum;
	}

	public void setLeafNum(int leafNum) {
		this.leafNum = leafNum;
	}

	public double getAlpha() {
		return alpha;
	}

	public void setAlpha(double alpha) {
		this.alpha = alpha;
	}

	public String getParentAttrValue() {
		return parentAttrValue;
	}

	public void setParentAttrValue(String parentAttrValue) {
		this.parentAttrValue = parentAttrValue;
	}

	public AttrNode[] getChildAttrNode() {
		return childAttrNode;
	}

	public void setChildAttrNode(AttrNode[] childAttrNode) {
		this.childAttrNode = childAttrNode;
	}

	public ArrayList getDataIndex() {
		return dataIndex;
	}

	public void setDataIndex(ArrayList dataIndex) {
		this.dataIndex = dataIndex;
	}

}
// ===== src/main/java/com/jusdt/datamining/classification/cart/CARTExample.java =====

package com.jusdt.datamining.classification.cart;

/**
 * Driver class for the CART decision-tree algorithm.
 */
public class CARTExample {

	public static void main(String[] args) {
		String filePath = "data/cart/input.txt";

		CARTCore tool = new CARTCore(filePath);

		tool.startBuildingTree();
	}

}

// ===== src/main/java/com/jusdt/datamining/classification/id3/AttrNode.java =====

package com.jusdt.datamining.classification.id3;

import java.util.ArrayList;

/**
 * Internal (non-leaf) attribute node of the ID3 decision tree.
 */
public class AttrNode {

	// Name of the attribute at this node
	private String attrName;
	// Attribute value of the parent split that led here
	private String parentAttrValue;
	// Child attribute nodes
	private AttrNode[] childAttrNode;
	// Indices of leaf data records under this node
	// NOTE(review): element type is determined by ID3Core; generics appear
	// stripped by the dump — kept raw to avoid guessing.
	private ArrayList childDataIndex;

	public String getAttrName() {
		return attrName;
	}

	public void setAttrName(String attrName) {
		this.attrName = attrName;
	}

	public String getParentAttrValue() {
		return parentAttrValue;
	}

	public void setParentAttrValue(String parentAttrValue) {
		this.parentAttrValue = parentAttrValue;
	}

	public AttrNode[] getChildAttrNode() {
		return childAttrNode;
	}

	public void setChildAttrNode(AttrNode[] childAttrNode) {
		this.childAttrNode = childAttrNode;
	}

	public ArrayList getChildDataIndex() {
		return childDataIndex;
	}

	public void setChildDataIndex(ArrayList childDataIndex) {
		this.childDataIndex = childDataIndex;
	}

}
// ===== src/main/java/com/jusdt/datamining/classification/id3/DataNode.java =====

package com.jusdt.datamining.classification.id3;

/**
 * Leaf node of the ID3 decision tree holding a single data-record index.
 */
public class DataNode {

	// Index of the record in the training data
	private int dataIndex;

	public DataNode(int dataIndex) {
		this.dataIndex = dataIndex;
	}

	// Accessor added: the field was otherwise write-only.
	public int getDataIndex() {
		return dataIndex;
	}

}

// ===== src/main/java/com/jusdt/datamining/classification/id3/ID3Example.java =====

package com.jusdt.datamining.classification.id3;

/**
 * Driver class for the ID3 decision-tree classification algorithm.
 */
public class ID3Example {

	public static void main(String[] args) {
		String filePath = "data/id3/input.txt";

		ID3Core tool = new ID3Core(filePath);
		tool.startBuildingTree(true);
	}

}

// ===== src/main/java/com/jusdt/datamining/classification/knn/KNNCore.java =====

package com.jusdt.datamining.classification.knn;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;

/**
 * K-nearest-neighbour classifier.
 *
 * Training and test records are read from whitespace-separated text files;
 * the first column of a training record is its class label, the remaining
 * columns are integer features.
 */
public class KNNCore {

	// Per-class weights for the 4 classes; defaults to equal weighting.
	// NOTE(review): assumes exactly 4 distinct class labels in the training
	// data — verify against the input file.
	public int[] classWeightArray = new int[] { 1, 1, 1, 1 };
	// Test data file path
	private String testDataPath;
	// Training data file path
	private String trainDataPath;
	// Distinct class labels seen in the training data
	private ArrayList<String> classTypes;
	// Classified test samples (the output)
	private ArrayList<Sample> resultSamples;
	// Training samples
	private ArrayList<Sample> trainSamples;
	// Raw training records
	private String[][] trainData;
	// Raw test records
	private String[][] testData;

	public KNNCore(String trainDataPath, String testDataPath) {
		this.trainDataPath = trainDataPath;
		this.testDataPath = testDataPath;
		readDataFormFile();
	}

	/**
	 * Reads the training and test data sets from their files and collects
	 * the distinct class labels from the training data.
	 */
	private void readDataFormFile() {
		ArrayList<String[]> tempArray;

		tempArray = fileDataToArray(trainDataPath);
		trainData = new String[tempArray.size()][];
		tempArray.toArray(trainData);

		classTypes = new ArrayList<>();
		for (String[] s : tempArray) {
			if (!classTypes.contains(s[0])) {
				// Record a class label not seen before
				classTypes.add(s[0]);
			}
		}

		tempArray = fileDataToArray(testDataPath);
		testData = new String[tempArray.size()][];
		tempArray.toArray(testData);
	}

	/**
	 * Loads a whitespace-separated data file into a list of token arrays.
	 *
	 * @param filePath path of the data file
	 * @return one String[] per input line
	 */
	private ArrayList<String[]> fileDataToArray(String filePath) {
		File file = new File(filePath);
		ArrayList<String[]> dataArray = new ArrayList<>();

		// try-with-resources guarantees the reader is closed on error paths
		try (BufferedReader in = new BufferedReader(new FileReader(file))) {
			String str;
			while ((str = in.readLine()) != null) {
				dataArray.add(str.split(" "));
			}
		} catch (IOException e) {
			// Fixed: was e.getStackTrace(), which silently discards the trace
			e.printStackTrace();
		}

		return dataArray;
	}

	/**
	 * Computes the SQUARED Euclidean distance between two samples' feature
	 * vectors. The square root is never taken: ranking by squared distance
	 * is equivalent for nearest-neighbour selection.
	 *
	 * @param s1 first sample
	 * @param s2 second sample
	 * @return squared Euclidean distance
	 */
	private int computeEuclideanDistance(Sample s1, Sample s2) {
		String[] f1 = s1.getFeatures();
		String[] f2 = s2.getFeatures();
		int distance = 0;

		for (int i = 0; i < f1.length; i++) {
			int subF1 = Integer.parseInt(f1[i]);
			int subF2 = Integer.parseInt(f2[i]);

			distance += (subF1 - subF2) * (subF1 - subF2);
		}

		return distance;
	}

	/**
	 * Classifies every test sample by (weighted) majority vote among its k
	 * nearest training samples, printing each result.
	 *
	 * @param k neighbourhood size
	 */
	public void knnCompute(int k) {
		String className = "";
		String[] tempF = null;
		Sample temp;
		resultSamples = new ArrayList<>();
		trainSamples = new ArrayList<>();
		// Vote tally per class label
		HashMap<String, Integer> classCount;
		// Vote weight per class label
		HashMap<String, Integer> classWeight = new HashMap<>();

		// Wrap the test records as (unclassified) result samples
		for (String[] s : testData) {
			temp = new Sample(s);
			resultSamples.add(temp);
		}

		// Wrap the training records: column 0 is the label, the rest features
		for (String[] s : trainData) {
			className = s[0];
			tempF = new String[s.length - 1];
			System.arraycopy(s, 1, tempF, 0, s.length - 1);
			temp = new Sample(className, tempF);
			trainSamples.add(temp);
		}

		// The k training samples nearest to the current test sample
		ArrayList<Sample> kNNSample = new ArrayList<>();
		for (Sample s : resultSamples) {
			classCount = new HashMap<>();
			int index = 0;
			for (String type : classTypes) {
				// Start every class at zero votes
				classCount.put(type, 0);
				classWeight.put(type, classWeightArray[index++]);
			}

			// Distance of every training sample to this test sample
			for (Sample tS : trainSamples) {
				int dis = computeEuclideanDistance(s, tS);
				tS.setDistance(dis);
			}

			// Sample.compareTo orders by ascending distance
			Collections.sort(trainSamples);
			kNNSample.clear();
			// Keep only the k nearest
			for (int i = 0; i < trainSamples.size(); i++) {
				if (i < k) {
					kNNSample.add(trainSamples.get(i));
				} else {
					break;
				}
			}

			// Weighted vote: each neighbour adds its class's weight
			for (Sample s1 : kNNSample) {
				int num = classCount.get(s1.getClassName());
				num += classWeight.get(s1.getClassName());
				classCount.put(s1.getClassName(), num);
			}

			// Pick the class with the strictly largest tally
			int maxCount = 0;
			for (Map.Entry<String, Integer> entry : classCount.entrySet()) {
				if (entry.getValue() > maxCount) {
					maxCount = entry.getValue();
					s.setClassName(entry.getKey());
				}
			}

			System.out.print("测试数据特征:");
			for (String s1 : s.getFeatures()) {
				System.out.print(s1 + " ");
			}
			System.out.println("分类:" + s.getClassName());
		}
	}
}

// ===== src/main/java/com/jusdt/datamining/classification/knn/KNNExample.java =====

package com.jusdt.datamining.classification.knn;

/**
 * Driver class for the k-nearest-neighbour algorithm.
 */
public class KNNExample {

	public static void main(String[] args) {
		String trainDataPath = "data/knn/trainInput.txt";
		// Fixed: was "data/knn/testinput.txt", which fails on case-sensitive
		// file systems — the repository file is data/knn/testInput.txt.
		String testDataPath = "data/knn/testInput.txt";

		KNNCore tool = new KNNCore(trainDataPath, testDataPath);
		tool.knnCompute(3);
	}

}

// ===== src/main/java/com/jusdt/datamining/classification/knn/Sample.java =====

package com.jusdt.datamining.classification.knn;

/**
 * A sample record: class label, feature vector, and the distance to the
 * current test sample (used for sorting neighbours).
 */
public class Sample implements Comparable<Sample> {

	// Class label of this sample (null for unclassified test samples)
	private String className;
	// Feature vector (string-encoded integers)
	private String[] features;
	// Distance to the test sample being classified; basis of the ordering
	private Integer distance;

	public Sample(String[] features) {
		this.features = features;
	}

	public Sample(String className, String[] features) {
		this.className = className;
		this.features = features;
	}

	public String getClassName() {
		return className;
	}

	public void setClassName(String className) {
		this.className = className;
	}

	public String[] getFeatures() {
		return features;
	}

	public void setFeatures(String[] features) {
		this.features = features;
	}

	public Integer getDistance() {
		return distance;
	}

	public void setDistance(int distance) {
		this.distance = distance;
	}

	/**
	 * Ascending order by distance (nearest first).
	 */
	@Override
	public int compareTo(Sample o) {
		return this.getDistance().compareTo(o.getDistance());
	}

}

// ===== src/main/java/com/jusdt/datamining/classification/naivebayes/NaiveBayesCore.java =====

package com.jusdt.datamining.classification.naivebayes;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;

/**
 * Naive Bayes classifier for a two-class ("Yes"/"No") data set.
 *
 * The training file's first row holds the attribute names; the last column
 * is the class label.
 */
public class NaiveBayesCore {

	// The two class labels of this data set
	private String YES = "Yes";
	private String NO = "No";

	// Path of the labelled training data file
	private String filePath;
	// Attribute names (header row)
	private String[] attrNames;
	// Training records, including the header row at index 0
	private String[][] data;

	// All observed values for each attribute
	private HashMap<String, ArrayList<String>> attrValue;

	public NaiveBayesCore(String filePath) {
		this.filePath = filePath;

		readDataFile();
		initAttrValue();
	}

	/**
	 * Reads the training data from the file; row 0 becomes the attribute
	 * name header.
	 */
	private void readDataFile() {
		File file = new File(filePath);
		ArrayList<String[]> dataArray = new ArrayList<>();

		try (BufferedReader in = new BufferedReader(new FileReader(file))) {
			String str;
			while ((str = in.readLine()) != null) {
				dataArray.add(str.split(" "));
			}
		} catch (IOException e) {
			// Fixed: was e.getStackTrace(), which silently discards the trace
			e.printStackTrace();
		}

		data = new String[dataArray.size()][];
		dataArray.toArray(data);
		attrNames = data[0];
	}

	/**
	 * Collects, for every attribute column (skipping column 0), the set of
	 * distinct values seen in the training data.
	 */
	private void initAttrValue() {
		attrValue = new HashMap<>();
		ArrayList<String> tempValues;

		// Walk the columns left to right, starting after the first column
		for (int j = 1; j < attrNames.length; j++) {
			tempValues = new ArrayList<>();
			for (int i = 1; i < data.length; i++) {
				if (!tempValues.contains(data[i][j])) {
					// First time this value is seen in the column
					tempValues.add(data[i][j]);
				}
			}

			attrValue.put(data[0][j], tempValues);
		}
	}

	/**
	 * P(condition | classType): the probability of the attribute condition
	 * given the class. With condition == null, returns the prior P(classType).
	 *
	 * @param condition attribute value, or null for the class prior
	 * @param classType class label (YES or NO)
	 * @return the (conditional) probability
	 */
	private double computeConditionProbably(String condition, String classType) {
		int count = 0;
		int attrIndex = 1;
		// Records labelled YES
		ArrayList<String[]> yClassData = new ArrayList<>();
		// Records labelled NO
		ArrayList<String[]> nClassData = new ArrayList<>();
		ArrayList<String[]> classData;

		for (int i = 1; i < data.length; i++) {
			// Partition the records by their class label (last column)
			if (data[i][attrNames.length - 1].equals(YES)) {
				yClassData.add(data[i]);
			} else {
				nClassData.add(data[i]);
			}
		}

		if (classType.equals(YES)) {
			classData = yClassData;
		} else {
			classData = nClassData;
		}

		// No condition: plain class prior P(classType)
		if (condition == null) {
			return 1.0 * classData.size() / (data.length - 1);
		}

		// Locate the attribute column this condition value belongs to
		attrIndex = getConditionAttrName(condition);

		for (String[] s : classData) {
			if (s[attrIndex].equals(condition)) {
				count++;
			}
		}

		return 1.0 * count / classData.size();
	}

	/**
	 * Returns the column index of the attribute that the condition value
	 * belongs to.
	 *
	 * NOTE(review): "BuysComputer" is this data set's label column name,
	 * hard-coded to exclude the label attribute from the search — verify
	 * against data/naivebayes/input.txt if the data set changes.
	 *
	 * @param condition attribute value
	 * @return column index of the owning attribute
	 */
	private int getConditionAttrName(String condition) {
		String attrName = "";
		int attrIndex = 1;
		ArrayList<String> valueTypes;

		for (Map.Entry<String, ArrayList<String>> entry : attrValue.entrySet()) {
			valueTypes = entry.getValue();
			if (valueTypes.contains(condition) && !entry.getKey().equals("BuysComputer")) {
				attrName = entry.getKey();
			}
		}

		for (int i = 0; i < attrNames.length - 1; i++) {
			if (attrNames[i].equals(attrName)) {
				attrIndex = i;
				break;
			}
		}

		return attrIndex;
	}

	/**
	 * Classifies a space-separated feature string with naive Bayes, using
	 * P(X|Ci)*P(Ci) and the class-conditional-independence assumption.
	 *
	 * @param data space-separated attribute values of the record to classify
	 * @return the predicted class label (YES or NO)
	 */
	public String naiveBayesClassificate(String data) {
		String[] dataFeatures;
		// P(X | Yes) and P(X | No), accumulated multiplicatively because
		// the attributes are assumed class-conditionally independent
		double xWhenYes = 1.0;
		double xWhenNo = 1.0;
		// Final scores P(X|Ci) * P(Ci)
		double pYes = 1;
		double pNo = 1;

		dataFeatures = data.split(" ");
		for (int i = 0; i < dataFeatures.length; i++) {
			xWhenYes *= computeConditionProbably(dataFeatures[i], YES);
			xWhenNo *= computeConditionProbably(dataFeatures[i], NO);
		}

		pYes = xWhenYes * computeConditionProbably(null, YES);
		pNo = xWhenNo * computeConditionProbably(null, NO);

		return (pYes > pNo ? YES : NO);
	}

}

// ===== src/main/java/com/jusdt/datamining/classification/naivebayes/NaiveBayesExample.java =====

package com.jusdt.datamining.classification.naivebayes;

/**
 * Driver class for the naive Bayes algorithm.
 */
public class NaiveBayesExample {

	public static void main(String[] args) {
		// Training data
		String filePath = "data/naivebayes/input.txt";
		String testData = "Youth Medium Yes Fair";
		NaiveBayesCore tool = new NaiveBayesCore(filePath);
		System.out.println(testData + " 数据的分类为:" + tool.naiveBayesClassificate(testData));
	}

}

// ===== src/main/java/com/jusdt/datamining/clustering/birch/BIRCHCore.java =====

package com.jusdt.datamining.clustering.birch;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.text.MessageFormat;
import java.util.ArrayList;
import java.util.LinkedList;

/**
 * BIRCH clustering: builds a CF (clustering-feature) tree from the input
 * records and prints it level by level.
 */
public class BIRCHCore {

	// Display names for the node kinds
	public static final String NON_LEAFNODE = "【NonLeafNode】";
	public static final String LEAFNODE = "【LeafNode】";
	public static final String CLUSTER = "【Cluster】";

	// Path of the input data file
	private String filePath;
	// Internal-node branching factor B
	// NOTE(review): these tree parameters are static, so concurrently built
	// BIRCHCore instances would share them — confirm single-instance use.
	public static int B;
	// Leaf-node branching factor L
	public static int L;
	// Cluster diameter threshold T
	public static double T;
	// All input records
	private ArrayList<String[]> totalDataRecords;

	public BIRCHCore(String filePath, int B, int L, double T) {
		this.filePath = filePath;
		// Fixed: statics were assigned through `this.`, which compiles but
		// misleadingly suggests instance state.
		BIRCHCore.B = B;
		BIRCHCore.L = L;
		BIRCHCore.T = T;
		readDataFile();
	}

	/**
	 * Reads the whitespace-separated input records from the file.
	 */
	private void readDataFile() {
		File file = new File(filePath);
		ArrayList<String[]> dataArray = new ArrayList<>();

		try (BufferedReader in = new BufferedReader(new FileReader(file))) {
			String str;
			while ((str = in.readLine()) != null) {
				dataArray.add(str.split(" "));
			}
		} catch (IOException e) {
			// Fixed: was e.getStackTrace(), which silently discards the trace
			e.printStackTrace();
		}

		totalDataRecords = new ArrayList<>(dataArray);
	}

	/**
	 * Builds the CF tree by inserting every record and returns its root.
	 *
	 * NOTE(review): throws NullPointerException if the input file is empty
	 * (`cluster` stays null) — behaviour kept as in the original.
	 *
	 * @return the root of the CF tree
	 */
	private ClusteringFeature buildCFTree() {
		NonLeafNode rootNode = null;
		LeafNode leafNode = null;
		Cluster cluster = null;

		for (String[] record : totalDataRecords) {
			cluster = new Cluster(record);

			if (rootNode == null) {
				// The CF tree still consists of a single leaf node
				if (leafNode == null) {
					leafNode = new LeafNode();
				}
				leafNode.addingCluster(cluster);
				if (leafNode.getParentNode() != null) {
					rootNode = leafNode.getParentNode();
				}
			} else {
				if (rootNode.getParentNode() != null) {
					rootNode = rootNode.getParentNode();
				}

				// Descend from the root to the closest target leaf
				LeafNode temp = rootNode.findedClosestNode(cluster);
				temp.addingCluster(cluster);
			}
		}

		// Walk back up from the last cluster to the topmost node
		LeafNode node = cluster.getParentNode();
		NonLeafNode upNode = node.getParentNode();
		if (upNode == null) {
			return node;
		} else {
			while (upNode.getParentNode() != null) {
				upNode = upNode.getParentNode();
			}

			return upNode;
		}
	}

	/**
	 * Builds the CF tree, assigns node depths, and prints the tree.
	 */
	public void startBuilding() {
		// Depth of the root level
		int level = 1;
		ClusteringFeature rootNode = buildCFTree();

		setTreeLevel(rootNode, level);
		showCFTree(rootNode);
	}

	/**
	 * Recursively assigns a depth to every node of the tree.
	 *
	 * @param clusteringFeature current node
	 * @param level current depth value
	 */
	private void setTreeLevel(ClusteringFeature clusteringFeature, int level) {
		LeafNode leafNode = null;
		NonLeafNode nonLeafNode = null;

		if (clusteringFeature instanceof LeafNode) {
			leafNode = (LeafNode) clusteringFeature;
		} else if (clusteringFeature instanceof NonLeafNode) {
			nonLeafNode = (NonLeafNode) clusteringFeature;
		}

		if (nonLeafNode != null) {
			nonLeafNode.setLevel(level);
			level++;
			// Recurse into the children
			if (nonLeafNode.getNonLeafChilds() != null) {
				for (NonLeafNode n1 : nonLeafNode.getNonLeafChilds()) {
					setTreeLevel(n1, level);
				}
			} else {
				for (LeafNode n2 : nonLeafNode.getLeafChilds()) {
					setTreeLevel(n2, level);
				}
			}
		} else {
			leafNode.setLevel(level);
			level++;
			// Assign the depth of the child clusters
			for (Cluster c : leafNode.getClusterChilds()) {
				c.setLevel(level);
			}
		}
	}

	/**
	 * Prints the CF tree breadth-first, then lists the final clusters.
	 *
	 * @param rootNode root of the CF tree
	 */
	private void showCFTree(ClusteringFeature rootNode) {
		// Number of separator dashes between siblings
		int blankNum = 5;
		// Depth currently being printed
		int currentLevel = 1;
		LinkedList<ClusteringFeature> nodeQueue = new LinkedList<>();
		ClusteringFeature cf;
		LeafNode leafNode;
		NonLeafNode nonLeafNode;
		ArrayList<Cluster> clusterList = new ArrayList<>();
		String typeName;

		nodeQueue.add(rootNode);
		while (nodeQueue.size() > 0) {
			cf = nodeQueue.poll();

			if (cf instanceof LeafNode) {
				leafNode = (LeafNode) cf;
				typeName = LEAFNODE;

				if (leafNode.getClusterChilds() != null) {
					for (Cluster c : leafNode.getClusterChilds()) {
						nodeQueue.add(c);
					}
				}
			} else if (cf instanceof NonLeafNode) {
				nonLeafNode = (NonLeafNode) cf;
				typeName = NON_LEAFNODE;

				if (nonLeafNode.getNonLeafChilds() != null) {
					for (NonLeafNode n1 : nonLeafNode.getNonLeafChilds()) {
						nodeQueue.add(n1);
					}
				} else {
					for (LeafNode n2 : nonLeafNode.getLeafChilds()) {
						nodeQueue.add(n2);
					}
				}
			} else {
				clusterList.add((Cluster) cf);
				typeName = CLUSTER;
			}

			if (currentLevel != cf.getLevel()) {
				currentLevel = cf.getLevel();
				System.out.println();
				System.out.println("|");
				System.out.println("|");
			} else if (currentLevel == cf.getLevel() && currentLevel != 1) {
				for (int i = 0; i < blankNum; i++) {
					System.out.print("-");
				}
			}

			System.out.print(typeName);
			System.out.print("N:" + cf.getN() + ", LS:");
			System.out.print("[");
			for (double d : cf.getLS()) {
				System.out.print(MessageFormat.format("{0}, ", d));
			}
			System.out.print("]");
		}

		System.out.println();
		System.out.println("*******最终分好的聚簇****");
		// Print the points of every final cluster
		for (int i = 0; i < clusterList.size(); i++) {
			System.out.println("Cluster" + (i + 1) + ":");
			for (double[] point : clusterList.get(i).getData()) {
				System.out.print("[");
				for (double d : point) {
					System.out.print(MessageFormat.format("{0}, ", d));
				}
				System.out.println("]");
			}
		}
	}

}

// ===== src/main/java/com/jusdt/datamining/clustering/birch/BIRCHExample.java =====

package com.jusdt.datamining.clustering.birch;

/**
 * Driver class for the BIRCH clustering algorithm.
 */
public class BIRCHExample {

	public static void main(String[] args) {
		String filePath = "data/birch/testInput.txt";
		// Internal-node branching factor B
		int B = 2;
		// Leaf-node branching factor L
		int L = 2;
		// Cluster diameter threshold T
		double T = 0.6;

		BIRCHCore tool = new BIRCHCore(filePath, B, L, T);
		tool.startBuilding();
	}

}
-------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/clustering/birch/Cluster.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.clustering.birch; 2 | 3 | import java.util.ArrayList; 4 | 5 | /** 6 | * 叶子节点中的小集群 7 | */ 8 | public class Cluster extends ClusteringFeature { 9 | 10 | //集群中的数据点 11 | private ArrayList data; 12 | //父亲节点 13 | private LeafNode parentNode; 14 | 15 | public Cluster(String[] record) { 16 | double[] d = new double[record.length]; 17 | data = new ArrayList<>(); 18 | for (int i = 0; i < record.length; i++) { 19 | d[i] = Double.parseDouble(record[i]); 20 | } 21 | data.add(d); 22 | //计算CF聚类特征 23 | this.setLS(data); 24 | this.setSS(data); 25 | this.setN(data); 26 | } 27 | 28 | public ArrayList getData() { 29 | return data; 30 | } 31 | 32 | public void setData(ArrayList data) { 33 | this.data = data; 34 | } 35 | 36 | @Override 37 | protected void directAddCluster(ClusteringFeature node) { 38 | //如果是聚类包括数据记录,则还需合并数据记录 39 | Cluster c = (Cluster) node; 40 | ArrayList dataRecords = c.getData(); 41 | this.data.addAll(dataRecords); 42 | 43 | super.directAddCluster(node); 44 | } 45 | 46 | public LeafNode getParentNode() { 47 | return parentNode; 48 | } 49 | 50 | public void setParentNode(LeafNode parentNode) { 51 | this.parentNode = parentNode; 52 | } 53 | 54 | @Override 55 | public void addingCluster(ClusteringFeature clusteringFeature) { 56 | // TODO Auto-generated method stub 57 | } 58 | 59 | } 60 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/clustering/birch/ClusteringFeature.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.clustering.birch; 2 | 3 | import java.util.ArrayList; 4 | 5 | /** 6 | * 聚类特征基本属性 7 | */ 8 | public abstract class ClusteringFeature { 9 | 10 | // 子类中节点的总数目 11 | 
protected int N; 12 | // 子类中N个节点的线性和 13 | protected double[] LS; 14 | // 子类中N个节点的平方和 15 | protected double[] SS; 16 | //节点深度,用于CF树的输出 17 | protected int level; 18 | 19 | public int getN() { 20 | return N; 21 | } 22 | 23 | public void setN(int n) { 24 | N = n; 25 | } 26 | 27 | public double[] getLS() { 28 | return LS; 29 | } 30 | 31 | public void setLS(double[] lS) { 32 | LS = lS; 33 | } 34 | 35 | public double[] getSS() { 36 | return SS; 37 | } 38 | 39 | public void setSS(double[] sS) { 40 | SS = sS; 41 | } 42 | 43 | protected void setN(ArrayList dataRecords) { 44 | this.N = dataRecords.size(); 45 | } 46 | 47 | public int getLevel() { 48 | return level; 49 | } 50 | 51 | public void setLevel(int level) { 52 | this.level = level; 53 | } 54 | 55 | /** 56 | * 根据节点数据计算线性和 57 | * 58 | * @param dataRecords 59 | * 节点数据记录 60 | */ 61 | protected void setLS(ArrayList dataRecords) { 62 | int num = dataRecords.get(0).length; 63 | double[] record; 64 | LS = new double[num]; 65 | for (int j = 0; j < num; j++) { 66 | LS[j] = 0; 67 | } 68 | 69 | for (int i = 0; i < dataRecords.size(); i++) { 70 | record = dataRecords.get(i); 71 | for (int j = 0; j < record.length; j++) { 72 | LS[j] += record[j]; 73 | } 74 | } 75 | } 76 | 77 | /** 78 | * 根据节点数据计算平方 79 | * 80 | * @param dataRecords 81 | * 节点数据 82 | */ 83 | protected void setSS(ArrayList dataRecords) { 84 | int num = dataRecords.get(0).length; 85 | double[] record; 86 | SS = new double[num]; 87 | for (int j = 0; j < num; j++) { 88 | SS[j] = 0; 89 | } 90 | 91 | for (int i = 0; i < dataRecords.size(); i++) { 92 | record = dataRecords.get(i); 93 | for (int j = 0; j < record.length; j++) { 94 | SS[j] += record[j] * record[j]; 95 | } 96 | } 97 | } 98 | 99 | /** 100 | * CF向量特征的叠加,无须考虑划分 101 | * 102 | * @param node 103 | */ 104 | protected void directAddCluster(ClusteringFeature node) { 105 | int N = node.getN(); 106 | double[] otherLS = node.getLS(); 107 | double[] otherSS = node.getSS(); 108 | 109 | if (LS == null) { 110 | this.N = 0; 111 
| LS = new double[otherLS.length]; 112 | SS = new double[otherLS.length]; 113 | 114 | for (int i = 0; i < LS.length; i++) { 115 | LS[i] = 0; 116 | SS[i] = 0; 117 | } 118 | } 119 | 120 | // 3个数量上进行叠加 121 | for (int i = 0; i < LS.length; i++) { 122 | LS[i] += otherLS[i]; 123 | SS[i] += otherSS[i]; 124 | } 125 | this.N += N; 126 | } 127 | 128 | /** 129 | * 计算簇与簇之间的距离即簇中心之间的距离 130 | * 131 | * @return 132 | */ 133 | protected double computerClusterDistance(ClusteringFeature cluster) { 134 | double distance = 0; 135 | double[] otherLS = cluster.LS; 136 | int num = N; 137 | 138 | int otherNum = cluster.N; 139 | 140 | for (int i = 0; i < LS.length; i++) { 141 | distance += (LS[i] / num - otherLS[i] / otherNum) * (LS[i] / num - otherLS[i] / otherNum); 142 | } 143 | distance = Math.sqrt(distance); 144 | 145 | return distance; 146 | } 147 | 148 | /** 149 | * 计算簇内对象的平均距离 150 | * 151 | * @param records 152 | * 簇内的数据记录 153 | * @return 154 | */ 155 | protected double computerInClusterDistance(ArrayList records) { 156 | double sumDistance = 0; 157 | double[] data1; 158 | double[] data2; 159 | // 数据总数 160 | int totalNum = records.size(); 161 | 162 | for (int i = 0; i < totalNum - 1; i++) { 163 | data1 = records.get(i); 164 | for (int j = i + 1; j < totalNum; j++) { 165 | data2 = records.get(j); 166 | sumDistance += computeOuDistance(data1, data2); 167 | } 168 | } 169 | 170 | // 返回的值除以总对数,总对数应减半,会重复算一次 171 | return Math.sqrt(sumDistance / (totalNum * (totalNum - 1) / 2)); 172 | } 173 | 174 | /** 175 | * 对给定的2个向量,计算欧式距离 176 | * 177 | * @param record1 178 | * 向量点1 179 | * @param record2 180 | * 向量点2 181 | */ 182 | private double computeOuDistance(double[] record1, double[] record2) { 183 | double distance = 0; 184 | 185 | for (int i = 0; i < record1.length; i++) { 186 | distance += (record1[i] - record2[i]) * (record1[i] - record2[i]); 187 | } 188 | 189 | return distance; 190 | } 191 | 192 | /** 193 | * 聚类添加节点包括,超出阈值进行分裂的操作 194 | * 195 | * @param clusteringFeature 196 | * 待添加聚簇 197 | 
/**
 * Leaf node of the CF tree: holds a list of small Clusters and performs the
 * distance-threshold (BIRCHCore.T) and balance-factor (BIRCHCore.L) checks
 * when a new cluster is inserted, splitting itself when overfull.
 */
public class LeafNode extends ClusteringFeature {

	// child clusters of this leaf
	private ArrayList<Cluster> clusterChilds;
	// parent (non-leaf) node; null while this leaf is still the root
	private NonLeafNode parentNode;

	public ArrayList<Cluster> getClusterChilds() {
		return clusterChilds;
	}

	public void setClusterChilds(ArrayList<Cluster> clusterChilds) {
		this.clusterChilds = clusterChilds;
	}

	/**
	 * Splits this leaf into two new leaves: the two child clusters whose
	 * centroids are farthest apart become the seeds, and every remaining
	 * cluster is assigned to the nearer seed.
	 *
	 * @return the two new leaf nodes
	 */
	public LeafNode[] divideLeafNode() {
		LeafNode[] leafNodeArray = new LeafNode[2];
		// the pair of clusters with the largest centroid distance; the rest
		// are assigned greedily to whichever of the two is nearer
		Cluster cluster1 = null;
		Cluster cluster2 = null;
		Cluster tempCluster = null;
		double maxValue = 0;
		double temp = 0;

		// find the two clusters farthest apart
		// NOTE(review): if every pairwise distance is 0, cluster1/cluster2 stay
		// null and the dereferences below throw NPE — TODO confirm inputs exclude this
		for (int i = 0; i < clusterChilds.size() - 1; i++) {
			tempCluster = clusterChilds.get(i);
			for (int j = i + 1; j < clusterChilds.size(); j++) {
				temp = tempCluster.computerClusterDistance(clusterChilds.get(j));

				if (temp > maxValue) {
					maxValue = temp;
					cluster1 = tempCluster;
					cluster2 = clusterChilds.get(j);
				}
			}
		}

		// seed each new leaf with one of the two farthest clusters
		leafNodeArray[0] = new LeafNode();
		leafNodeArray[0].addingCluster(cluster1);
		cluster1.setParentNode(leafNodeArray[0]);
		leafNodeArray[1] = new LeafNode();
		leafNodeArray[1].addingCluster(cluster2);
		cluster2.setParentNode(leafNodeArray[1]);
		clusterChilds.remove(cluster1);
		clusterChilds.remove(cluster2);
		// assign every remaining cluster to the nearer seed
		for (Cluster c : clusterChilds) {
			if (cluster1.computerClusterDistance(c) < cluster2.computerClusterDistance(c)) {
				// closer to seed 1: join the first new leaf
				leafNodeArray[0].addingCluster(c);
				c.setParentNode(leafNodeArray[0]);
			} else {
				leafNodeArray[1].addingCluster(c);
				c.setParentNode(leafNodeArray[1]);
			}
		}

		return leafNodeArray;
	}

	public NonLeafNode getParentNode() {
		return parentNode;
	}

	public void setParentNode(NonLeafNode parentNode) {
		this.parentNode = parentNode;
	}

	/**
	 * Inserts a new Cluster into this leaf: merges it into the nearest child
	 * when the result stays under threshold T, otherwise adds it as a new
	 * child, splitting this leaf if the balance factor L is exceeded.
	 *
	 * @param clusteringFeature the cluster to insert (must be a Cluster)
	 */
	@Override
	public void addingCluster(ClusteringFeature clusteringFeature) {
		// update this leaf's CF triple first (also propagates up the tree)
		directAddCluster(clusteringFeature);

		// the child cluster closest to the new one
		Cluster findedCluster = null;
		Cluster cluster = (Cluster) clusteringFeature;
		// smallest centroid distance seen so far
		double disance = Integer.MAX_VALUE;
		// centroid distance to the current candidate
		double errorDistance = 0;
		boolean needDivided = false;
		if (clusterChilds == null) {
			// first cluster in this leaf
			clusterChilds = new ArrayList<>();
			clusterChilds.add(cluster);
			cluster.setParentNode(this);
		} else {
			for (Cluster c : clusterChilds) {
				errorDistance = c.computerClusterDistance(cluster);
				if (disance > errorDistance) {
					// keep the nearest child cluster
					disance = errorDistance;
					findedCluster = c;
				}
			}

			ArrayList<double[]> data1 = (ArrayList<double[]>) findedCluster.getData().clone();
			ArrayList<double[]> data2 = cluster.getData();
			data1.addAll(data2);
			// if merging would push the intra-cluster average distance past the
			// threshold T, keep the new cluster as a separate child instead
			if (findedCluster.computerInClusterDistance(data1) > BIRCHCore.T) {
				// a leaf may hold at most BIRCHCore.L children (balance factor)
				if (clusterChilds.size() + 1 > BIRCHCore.L) {
					needDivided = true;
				}
				clusterChilds.add(cluster);
				cluster.setParentNode(this);
			} else {
				// close enough: fold the new cluster into the nearest child
				findedCluster.directAddCluster(cluster);
				cluster.setParentNode(this);
			}
		}

		if (needDivided) {
			if (parentNode == null) {
				// this leaf was the root: grow a new parent above it
				parentNode = new NonLeafNode();
			} else {
				// detach this overfull leaf; its two halves are re-inserted below
				parentNode.getLeafChilds().remove(this);
			}

			LeafNode[] nodeArray = divideLeafNode();
			for (LeafNode n : nodeArray) {
				parentNode.addingCluster(n);
			}
		}
	}

	@Override
	protected void directAddCluster(ClusteringFeature node) {
		// propagate the CF update to the ancestors first so every level's
		// (N, LS, SS) triple stays consistent with its subtree
		if (parentNode != null) {
			parentNode.directAddCluster(node);
		}

		super.directAddCluster(node);
	}

}
i = 0, j = 1; i < dataArray.size(); i++) { 57 | if (j <= classNum) { 58 | classPoints.add(new Point(dataArray.get(i)[0], dataArray.get(i)[1], j + "")); 59 | classNames.add(i + ""); 60 | j++; 61 | } 62 | totalPoints.add(new Point(dataArray.get(i)[0], dataArray.get(i)[1])); 63 | } 64 | } 65 | 66 | /** 67 | * K均值聚类算法实现 68 | */ 69 | public void kMeansClustering() { 70 | double tempX = 0; 71 | double tempY = 0; 72 | int count = 0; 73 | double error = Integer.MAX_VALUE; 74 | Point temp; 75 | 76 | while (error > 0.01 * classNum) { 77 | for (Point p1 : totalPoints) { 78 | // 将所有的测试坐标点就近分类 79 | for (Point p2 : classPoints) { 80 | p2.computerDistance(p1); 81 | } 82 | Collections.sort(classPoints); 83 | 84 | // 取出p1离类坐标点最近的那个点 85 | p1.setClassName(classPoints.get(0).getClassName()); 86 | } 87 | 88 | error = 0; 89 | // 按照均值重新划分聚类中心点 90 | for (Point p1 : classPoints) { 91 | count = 0; 92 | tempX = 0; 93 | tempY = 0; 94 | for (Point p : totalPoints) { 95 | if (p.getClassName().equals(p1.getClassName())) { 96 | count++; 97 | tempX += p.getX(); 98 | tempY += p.getY(); 99 | } 100 | } 101 | tempX /= count; 102 | tempY /= count; 103 | 104 | error += Math.abs((tempX - p1.getX())); 105 | error += Math.abs((tempY - p1.getY())); 106 | // 计算均值 107 | p1.setX(tempX); 108 | p1.setY(tempY); 109 | 110 | } 111 | 112 | for (int i = 0; i < classPoints.size(); i++) { 113 | temp = classPoints.get(i); 114 | System.out.println(MessageFormat.format("聚类中心点{0},x={1},y={2}", (i + 1), temp.getX(), temp.getY())); 115 | } 116 | System.out.println("----------"); 117 | } 118 | 119 | System.out.println("结果值收敛"); 120 | for (int i = 0; i < classPoints.size(); i++) { 121 | temp = classPoints.get(i); 122 | System.out.println(MessageFormat.format("聚类中心点{0},x={1},y={2}", (i + 1), temp.getX(), temp.getY())); 123 | } 124 | 125 | } 126 | 127 | } 128 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/clustering/kmeans/KMeansExample.java: 
-------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.clustering.kmeans; 2 | 3 | /** 4 | * K-means(K均值)算法调用类 5 | */ 6 | public class KMeansExample { 7 | 8 | public static void main(String[] args) { 9 | String filePath = "data/kmeans/input.txt"; 10 | // 聚类中心数量设定 11 | int classNum = 3; 12 | 13 | KMeansCore tool = new KMeansCore(filePath, classNum); 14 | tool.kMeansClustering(); 15 | } 16 | 17 | } 18 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/clustering/kmeans/Point.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.clustering.kmeans; 2 | 3 | /** 4 | * 坐标点类 5 | */ 6 | public class Point implements Comparable { 7 | 8 | // 坐标点横坐标 9 | private double x; 10 | // 坐标点纵坐标 11 | private double y; 12 | //以此点作为聚类中心的类的类名称 13 | private String className; 14 | // 坐标点之间的欧式距离 15 | private Double distance; 16 | 17 | public Point(double x, double y) { 18 | this.x = x; 19 | this.y = y; 20 | } 21 | 22 | public Point(String x, String y) { 23 | this.x = Double.parseDouble(x); 24 | this.y = Double.parseDouble(y); 25 | } 26 | 27 | public Point(String x, String y, String className) { 28 | this.x = Double.parseDouble(x); 29 | this.y = Double.parseDouble(y); 30 | this.className = className; 31 | } 32 | 33 | /** 34 | * 距离目标点p的欧几里得距离 35 | * 36 | * @param p 37 | */ 38 | public void computerDistance(Point p) { 39 | if (p == null) { 40 | return; 41 | } 42 | 43 | this.distance = (this.x - p.x) * (this.x - p.x) + (this.y - p.y) * (this.y - p.y); 44 | } 45 | 46 | public double getX() { 47 | return x; 48 | } 49 | 50 | public void setX(double x) { 51 | this.x = x; 52 | } 53 | 54 | public double getY() { 55 | return y; 56 | } 57 | 58 | public void setY(double y) { 59 | this.y = y; 60 | } 61 | 62 | public String getClassName() { 63 | return className; 64 | } 65 | 66 | public void setClassName(String 
className) { 67 | this.className = className; 68 | } 69 | 70 | public double getDistance() { 71 | return distance; 72 | } 73 | 74 | public void setDistance(double distance) { 75 | this.distance = distance; 76 | } 77 | 78 | @Override 79 | public int compareTo(Point o) { 80 | return this.distance.compareTo(o.distance); 81 | } 82 | 83 | } 84 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/dimensionality/reduction/pca/DataReader.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.dimensionality.reduction.pca; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.IOException; 5 | import java.io.Reader; 6 | import java.util.ArrayList; 7 | import java.util.List; 8 | 9 | /** 10 | * 简单的向量数据读取 11 | */ 12 | public class DataReader extends BufferedReader { 13 | 14 | public DataReader(Reader in, int sz) { 15 | super(in, sz); 16 | } 17 | 18 | public DataReader(Reader in) { 19 | super(in); 20 | } 21 | 22 | /** 23 | * Get the (vector) data contained in the file. The data is stored one value 24 | * per line. Empty lines are ignored. 
package com.jusdt.datamining.dimensionality.reduction.pca;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;

/**
 * Reader for simple vector data files: one numeric value per line,
 * surrounding whitespace trimmed, blank lines skipped.
 */
public class DataReader extends BufferedReader {

	public DataReader(Reader in, int sz) {
		super(in, sz);
	}

	public DataReader(Reader in) {
		super(in);
	}

	/**
	 * Get the (vector) data contained in the file. The data is stored one
	 * value per line; empty lines are ignored.
	 *
	 * @return the values, in file order
	 * @throws IOException if reading fails
	 */
	public double[] getData() throws IOException {
		List<Double> values = new ArrayList<>();
		for (String line = readLine(); line != null; line = readLine()) {
			String trimmed = line.trim();
			if (!trimmed.isEmpty()) {
				// NumberFormatException propagates for malformed values
				values.add(Double.valueOf(trimmed));
			}
		}

		double[] vector = new double[values.size()];
		for (int i = 0; i < vector.length; i++) {
			vector[i] = values.get(i);
		}
		return vector;
	}

}
package com.jusdt.datamining.dimensionality.reduction.pca;

/**
 * Unchecked exception thrown for invalid matrix operations
 * (dimension mismatches and similar misuse).
 */
public class MatrixException extends RuntimeException {

	private static final long serialVersionUID = -65073227556727585L;

	/**
	 * @param s detail message describing the matrix error
	 */
	public MatrixException(String s) {
		super(s);
	}

}
package com.jusdt.datamining.dimensionality.reduction.pca;

import java.io.PrintWriter;
import java.text.DecimalFormat;
import java.text.DecimalFormatSymbols;
import java.text.NumberFormat;
import java.util.Locale;

/**
 * Helper routines for printing matrices in fixed-width, right-justified
 * columns, Fortran 'Fw.d' style.
 */
public class MatrixHelper {

	/**
	 * Print the matrix to stdout in columns, Fortran-like 'Fw.d' format.
	 *
	 * @param a the matrix
	 * @param w column width
	 * @param d number of digits after the decimal point
	 */
	public static void print(Matrix a, int w, int d) {
		print(a, new PrintWriter(System.out, true), w, d);
	}

	/**
	 * Print the matrix to the given stream in columns, Fortran-like 'Fw.d'
	 * format.
	 *
	 * @param a the matrix
	 * @param output destination stream
	 * @param w column width
	 * @param d number of digits after the decimal point
	 */
	public static void print(Matrix a, PrintWriter output, int w, int d) {
		DecimalFormat fmt = new DecimalFormat();
		fmt.setDecimalFormatSymbols(new DecimalFormatSymbols(Locale.US));
		fmt.setMinimumIntegerDigits(1);
		fmt.setMaximumFractionDigits(d);
		fmt.setMinimumFractionDigits(d);
		fmt.setGroupingUsed(false);
		print(a, output, fmt, w + 2);
	}

	/**
	 * Print the matrix to stdout, right-justified in columns of the given
	 * width using the supplied format. If the output is meant to be read back
	 * in, use a NumberFormat set to the US locale.
	 *
	 * @param a the matrix
	 * @param format formatter for individual elements
	 * @param width field width for each column
	 * @see java.text.DecimalFormat#setDecimalFormatSymbols
	 */
	public static void print(Matrix a, NumberFormat format, int width) {
		print(a, new PrintWriter(System.out, true), format, width);
	}

	/**
	 * Print the matrix to the output stream, right-justified in columns.
	 * DecimalFormat does not pad on the left (unlike Fortran or C's printf),
	 * so the column padding is done here.
	 *
	 * @param a the matrix
	 * @param output destination stream
	 * @param format formatter for individual elements
	 * @param width field width for each column
	 * @see java.text.DecimalFormat#setDecimalFormatSymbols
	 */
	public static void print(Matrix a, PrintWriter output, NumberFormat format, int width) {
		output.println(); // leading blank line
		int rows = a.getNRows();
		int cols = a.getNCols();
		double[][] cells = a.getArray();
		for (int r = 0; r < rows; r++) {
			for (int c = 0; c < cols; c++) {
				String s = format.format(cells[r][c]);
				// right-justify with at least one space between columns
				int pad = Math.max(1, width - s.length());
				while (pad-- > 0) {
					output.print(' ');
				}
				output.print(s);
			}
			output.println();
		}
		output.println(); // trailing blank line
	}
}
package com.jusdt.datamining.dimensionality.reduction.pca;

/**
 * Principal component analysis of a matrix, computed via the SVD of the
 * weighted-and-centered data.
 */
public class PCACore {

	// the weighted, centered input matrix
	private final Matrix m;
	// the principal components: m * V
	private final Matrix pc;
	// the principal factors: the right singular vectors V of the SVD
	private final Matrix facpr;
	// the eigenvalues: squared singular values scaled by 1/(size-1)
	private final Matrix lambda;

	/**
	 * Runs the PCA: weights/centers x, takes its SVD, and derives the
	 * eigenvalues, principal factors and principal components.
	 *
	 * @param x the input matrix
	 */
	public PCACore(Matrix x) {

		// Weight and center the matrix
		this.m = x.wcenter();
		// compute the eigenvectors of m'*m via the SVD of m
		SVD svd = new SVD(this.m);

		// eigenvalues from the singular values
		this.lambda = calculateLambda(svd.getS());
		// principal factors = right singular vectors
		this.facpr = svd.getV();

		// principal components = centered data projected onto the factors
		this.pc = this.m.times(svd.getV());
	}

	/**
	 * Converts singular values into eigenvalues: lambda_i = s_i^2 / (size-1).
	 * Mutates and returns the column matrix obtained from s.diag().
	 *
	 * NOTE(review): 'size' is the row count of the diagonal, i.e. the number
	 * of singular values — assumed here to equal the sample count of the
	 * centered matrix; TODO confirm against the SVD/Matrix implementations.
	 *
	 * @param s the diagonal matrix of singular values from the SVD
	 * @return a column matrix of eigenvalues
	 */
	private Matrix calculateLambda(Matrix s) {

		Matrix d = s.diag();
		double[][] D = d.getArray();

		int size = d.getNRows();
		for (int i = 0; i < size; i++) {
			D[i][0] = (D[i][0] * D[i][0]) / (size - 1);
		}

		return d;
	}

	public Matrix getPrincipalComponents() {
		return pc;
	}

	public Matrix getLambda() {
		return lambda;
	}

	public Matrix getPrinicipalFactors() {
		return facpr;
	}

}
	/**
	 * Run a principal component analysis of a matrix.
	 *
	 * @param m the matrix
	 * @return the completed PCA
	 */
	public PCACore fromMatrix(Matrix m) {
		return new PCACore(m);
	}

	/**
	 * Run a principal component analysis from a simple time series vector.
	 * The data is converted into a Toeplitz-style matrix before running the
	 * PCA.
	 *
	 * @param data the time series vector
	 * @return the completed PCA
	 */
	public PCACore fromSimpleTimeSeries(double[] data) {
		Matrix m = new ToeplitzMatrix(data);
		PCACore pca = new PCACore(m);
		return pca;
	}

	/**
	 * Calculate the correlation circle for the given components: each entry
	 * is sqrt(lambda_index) * factor value. Quick and dirty — no validity
	 * checks that the PCA completed successfully.
	 *
	 * @param pca the PCA
	 * @param compare the principal factor columns to compare
	 * @return the correlation circle (rows = variables, cols = compared factors)
	 */
	public Matrix correlationCircle(PCACore pca, int[] compare) {
		double[][] F = pca.getPrinicipalFactors().getArray();
		double[][] L = pca.getLambda().getArray();

		// calculate the correlation circle
		Matrix cc = new Matrix(F.length, compare.length);
		double[][] CC = cc.getArray();

		for (int n = 0; n < compare.length; n++) {
			int index = compare[n];
			// scale the factor column by the root of its eigenvalue
			double s = Math.sqrt(L[index][0]);
			for (int m = 0; m < F.length; m++) {
				double f = F[m][index];

				CC[m][n] = s * f;
			}
		}
		return cc;
	}

	/**
	 * Calculate the correlation circle for the two largest eigenvalues
	 * (factor columns 0 and 1).
	 *
	 * @param pca the pca
	 * @return the correlation circle
	 */
	public Matrix correlationCircle(PCACore pca) {
		return correlationCircle(pca, new int[] { 0, 1 });
	}

	/**
	 * Normalize the eigenvalues (divide by their sum) so a scree plot can be
	 * drawn.
	 *
	 * NOTE(review): L is mutated AFTER nl is constructed from it, so this is
	 * only correct if Matrix(double[][]) wraps the array rather than copying
	 * it — TODO confirm against the Matrix constructor.
	 *
	 * @param pca the pca
	 * @return the normalized eigenvalues as a column matrix
	 */
	public Matrix normalizeLambda(PCACore pca) {

		double[][] L = pca.getLambda().getArrayCopy();
		Matrix nl = new Matrix(L);
		double sum = 0;
		for (int n = 0; n < L.length; n++) {
			sum += L[n][0];
		}
		for (int n = 0; n < L.length; n++) {
			L[n][0] = L[n][0] / sum;
		}
		return nl;
	}

	/**
	 * Calculate the cumulative contribution of the eigenvectors: a running
	 * sum over the normalized eigenvalues.
	 *
	 * @param pca the pca
	 * @return the cumulative contributions as a column matrix
	 */
	public Matrix cumulativeContribution(PCACore pca) {
		Matrix nl = normalizeLambda(pca);
		double[][] CC = nl.getArrayCopy();
		Matrix cc = new Matrix(CC);
		double cum = 0;
		for (int n = 0; n < CC.length; n++) {
			// each entry becomes the sum of itself and all entries before it
			cum = CC[n][0] = CC[n][0] + cum;
		}
		return cc;
	}
}
0.3056, 0.7071 }, 17 | { 0.4562, -0.2454, -0.1519, -0.6631, -0.5185, -0.0000 }, 18 | { 0.2378, -0.6631, -0.5529, 0.2454, 0.3712, -0.0000 }, 19 | { -0.2378, -0.6631, 0.5529, 0.2454, -0.3712, 0.0000 }, 20 | { -0.4562, -0.2454, 0.1519, -0.6631, 0.5185, -0.0000 }, 21 | { -0.4851, -0.0000, -0.4138, -0.0000, -0.3056, 0.7071 } }; 22 | 23 | public static void main(String[] args) { 24 | PCACoreHandler instance = new PCACoreHandler(); 25 | PCACore result = instance.fromSimpleTimeSeries(data); 26 | 27 | // compare the principal components 28 | System.out.println("compare the principal components:"); 29 | double[][] res_pcomp = result.getPrincipalComponents().getArray(); 30 | for (int i = 0; i < SIZE; i++) { 31 | for (int j = 0; j < SIZE; j++) { 32 | System.out.println(pcomps[i][j] + " , " + res_pcomp[i][j]); 33 | } 34 | } 35 | 36 | // compare the lambdas 37 | System.out.println("compare the lambdas:"); 38 | double[] res_plambda = result.getLambda().transpose().getArray()[0]; 39 | for (int i = 0; i < SIZE; i++) { 40 | System.out.println(plambda[i] + " , " + res_plambda[i]); 41 | } 42 | 43 | // compare the principle factors 44 | System.out.println("compare the principle factors:"); 45 | double[][] res_pfacs = result.getPrinicipalFactors().getArray(); 46 | for (int i = 0; i < SIZE; i++) { 47 | for (int j = 0; j < SIZE; j++) { 48 | System.out.println(pfacs[i][j] + " , " + res_pfacs[i][j]); 49 | } 50 | } 51 | } 52 | 53 | } 54 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/dimensionality/reduction/pca/ToeplitzMatrix.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.dimensionality.reduction.pca; 2 | 3 | /** 4 | * Toeplitz matrix 5 | */ 6 | public class ToeplitzMatrix extends Matrix { 7 | 8 | /** 9 | * Toeplitz matrix styles 10 | */ 11 | public static enum Type { 12 | 13 | Triangular, Symmetrical, Circulant 14 | }; 15 | 16 | /** 17 | * Create a 
	/**
	 * Create a symmetrical Toeplitz-style matrix from a vector.
	 *
	 * @param v the generating vector
	 */
	public ToeplitzMatrix(double[] v) {
		this(v, Type.Symmetrical);
	}

	/**
	 * Create a Toeplitz matrix from a vector: each diagonal holds a constant
	 * value, with arr[i][j] = v[i-j] in the lower triangle. The upper triangle
	 * depends on the style: zero (Triangular), mirrored (Symmetrical), or
	 * wrapped around (Circulant).
	 *
	 * @param v the vector
	 * @param type the matrix style
	 */
	public ToeplitzMatrix(double[] v, Type type) {
		super(v.length, v.length);
		int n = v.length;
		double[][] arr = getArray();

		for (int i = 0; i < v.length; i++) {
			for (int j = 0; j <= i; j++) {
				// offset of this cell's diagonal below the main diagonal
				int index = i - j;
				arr[i][j] = v[i - j];
				switch (type) {
				default:
				case Triangular:
					// upper triangle stays at its initial value
					break;
				case Symmetrical:
					// mirror the value across the main diagonal
					arr[j][i] = v[i - j];
					break;
				case Circulant:
					if (j != i) {
						// upper triangle wraps around: v[n - (i - j)]
						arr[j][i] = v[n - index];
					}
					break;
				}
			}
		}
	}
package com.jusdt.datamining.dimensionality.reduction.pca;

/**
 * Create a trajectory style matrix from a vector: each vector value is
 * written along one anti-diagonal (cells where row + column is constant),
 * so consecutive rows are overlapping windows of the series.
 */
public class TrajectoryMatrix extends Matrix {

	/**
	 * @param v the source vector
	 * @param ncols window length; the matrix gets v.length - ncols + 1 rows
	 */
	public TrajectoryMatrix(double[] v, int ncols) {
		super(v.length - ncols + 1, ncols);
		double[][] arr = getArray();
		int nrows = getNRows();
		int pos = 0; // position in vector

		// fill the anti-diagonals that start in column 0 (row index i);
		// the first ncols-1 of them are shorter than ncols cells
		for (int i = 0; i < nrows; i++) {
			double value = v[pos++];
			int availCols = i < ncols ? i + 1 : ncols;
			for (int j = 0, m = i; j < availCols && m >= 0; j++, m--) {
				arr[m][j] = value;
			}
		}
		// fill the remaining anti-diagonals that start in the last row
		// (column index i), walking up and to the right
		for (int i = 1; i < ncols; i++) {
			double value = v[pos++];
			for (int j = i, m = nrows - 1; j < ncols && m > 0; j++, m--) {
				arr[m][j] = value;
			}
		}
	}

}
next 70 | * 五元组边下一条边的点的临时标识 71 | */ 72 | private void dfsSearch(Stack stack, int currentPosition, int next) { 73 | if (currentPosition >= edgeSeqs.size()) { 74 | stack.pop(); 75 | // 比较到底了则返回 76 | return; 77 | } 78 | 79 | while (!stack.isEmpty()) { 80 | int x = stack.pop(); 81 | for (int i = 0; i < graph.edgeNexts.get(x).size(); i++) { 82 | // 从此id节点所连接的点中选取1个点作为下一个点 83 | int y = graph.edgeNexts.get(x).get(i); 84 | // 如果这2个点所构成的边已经被用过,则继续 85 | if (f[x][y] || f[y][x]) { 86 | continue; 87 | } 88 | 89 | // 如果y这个点未被用过 90 | if (g2s[y] < 0) { 91 | // 新建这条边五元组 92 | Edge e = new Edge(g2s[x], next, graph.nodeLabels.get(x), graph.edgeLabels.get(x).get(i), 93 | graph.nodeLabels.get(y)); 94 | 95 | // 与相应位置的边做比较,如果不是最小则失败 96 | int compareResult = e.compareWith(edgeSeqs.get(currentPosition)); 97 | if (compareResult == Edge.EDGE_SMALLER) { 98 | isMin = false; 99 | return; 100 | } else if (compareResult == Edge.EDGE_LARGER) { 101 | continue; 102 | } 103 | // 如果相等则继续比 104 | g2s[y] = next; 105 | f[x][y] = true; 106 | f[y][x] = true; 107 | stack.push(y); 108 | dfsSearch(stack, currentPosition + 1, next + 1); 109 | if (!isMin) { 110 | return; 111 | } 112 | f[x][y] = false; 113 | f[y][x] = false; 114 | g2s[y] = -1; 115 | } else { 116 | // 这个点已经被用过的时候,不需要再设置五元组id标识 117 | // 新建这条边五元组 118 | Edge e = new Edge(g2s[x], g2s[y], graph.nodeLabels.get(x), graph.edgeLabels.get(x).get(i), 119 | graph.nodeLabels.get(y)); 120 | 121 | // 与相应位置的边做比较,如果不是最小则失败 122 | int compareResult = e.compareWith(edgeSeqs.get(currentPosition)); 123 | if (compareResult == Edge.EDGE_SMALLER) { 124 | isMin = false; 125 | return; 126 | } else if (compareResult == Edge.EDGE_LARGER) { 127 | continue; 128 | } 129 | // 如果相等则继续比 130 | g2s[y] = next; 131 | f[x][y] = true; 132 | f[y][x] = true; 133 | stack.push(y); 134 | dfsSearch(stack, currentPosition + 1, next); 135 | if (!isMin) { 136 | return; 137 | } 138 | f[x][y] = false; 139 | f[y][x] = false; 140 | } 141 | } 142 | } 143 | } 144 | } 145 | 
-------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/graph/gspan/Edge.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.graph.gspan; 2 | 3 | /** 4 | * 边,用五元组表示 5 | */ 6 | public class Edge { 7 | 8 | // 五元组的大小比较结果 9 | public static final int EDGE_EQUAL = 0; 10 | public static final int EDGE_SMALLER = 1; 11 | public static final int EDGE_LARGER = 2; 12 | 13 | // 边的一端的id号标识 14 | int ix; 15 | // 边的另一端的id号标识 16 | int iy; 17 | // 边的一端的点标号 18 | int x; 19 | // 边的标号 20 | int a; 21 | // 边的另一端的点标号 22 | int y; 23 | 24 | public Edge(int ix, int iy, int x, int a, int y) { 25 | this.ix = ix; 26 | this.iy = iy; 27 | this.x = x; 28 | this.a = a; 29 | this.y = y; 30 | } 31 | 32 | /** 33 | * 当前边是与给定的边的大小比较关系 34 | * 35 | * @param e 36 | * @return 37 | */ 38 | public int compareWith(Edge e) { 39 | int result = EDGE_EQUAL; 40 | int[] array1 = new int[] { ix, iy, x, y, a }; 41 | int[] array2 = new int[] { e.ix, e.iy, e.x, e.y, e.a }; 42 | 43 | // 按照ix, iy,x,y,a的次序依次比较 44 | for (int i = 0; i < array1.length; i++) { 45 | if (array1[i] < array2[i]) { 46 | result = EDGE_SMALLER; 47 | break; 48 | } else if (array1[i] > array2[i]) { 49 | result = EDGE_LARGER; 50 | break; 51 | } else { 52 | // 如果相等,继续比较下一个 53 | continue; 54 | } 55 | } 56 | 57 | return result; 58 | } 59 | 60 | } 61 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/graph/gspan/EdgeFrequency.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.graph.gspan; 2 | 3 | /** 4 | * 边的频繁统计 5 | */ 6 | public class EdgeFrequency { 7 | 8 | // 节点标号数量 9 | private int nodeLabelNum; 10 | // 边的标号数量 11 | private int edgeLabelNum; 12 | // 用于存放边计数的3维数组 13 | public int[][][] edgeFreqCount; 14 | 15 | public EdgeFrequency(int nodeLabelNum, int edgeLabelNum) { 16 | this.nodeLabelNum = 
nodeLabelNum; 17 | this.edgeLabelNum = edgeLabelNum; 18 | 19 | edgeFreqCount = new int[nodeLabelNum][edgeLabelNum][nodeLabelNum]; 20 | //最初始化操作 21 | for (int i = 0; i < nodeLabelNum; i++) { 22 | for (int j = 0; j < edgeLabelNum; j++) { 23 | for (int k = 0; k < nodeLabelNum; k++) { 24 | edgeFreqCount[i][j][k] = 0; 25 | } 26 | } 27 | } 28 | } 29 | 30 | } 31 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/graph/gspan/GSpanExample.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.graph.gspan; 2 | 3 | /** 4 | * gSpan频繁子图挖掘算法 5 | */ 6 | public class GSpanExample { 7 | 8 | public static void main(String[] args) { 9 | //测试数据文件地址 10 | String filePath = "data/gspan/input.txt"; 11 | //最小支持度率 12 | double minSupportRate = 0.2; 13 | 14 | GSpanTool tool = new GSpanTool(filePath, minSupportRate); 15 | tool.freqGraphMining(); 16 | } 17 | 18 | } 19 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/graph/gspan/Graph.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.graph.gspan; 2 | 3 | import java.util.ArrayList; 4 | 5 | /** 6 | * 图结构类 7 | */ 8 | public class Graph { 9 | 10 | // 图节点标号组 11 | ArrayList nodeLabels; 12 | // 图的边标号组 13 | ArrayList> edgeLabels; 14 | // 边2头的节点id号,在这里可以理解为下标号 15 | ArrayList> edgeNexts; 16 | 17 | public Graph() { 18 | nodeLabels = new ArrayList<>(); 19 | edgeLabels = new ArrayList<>(); 20 | edgeNexts = new ArrayList<>(); 21 | } 22 | 23 | public ArrayList getNodeLabels() { 24 | return nodeLabels; 25 | } 26 | 27 | public void setNodeLabels(ArrayList nodeLabels) { 28 | this.nodeLabels = nodeLabels; 29 | } 30 | 31 | /** 32 | * 判断图中是否存在某条边 33 | * 34 | * @param x 35 | * 边的一端的节点标号 36 | * @param a 37 | * 边的标号 38 | * @param y 39 | * 边的另外一端节点标号 40 | * @return 41 | */ 42 | public boolean 
hasEdge(int x, int a, int y) { 43 | boolean isContained = false; 44 | int t; 45 | 46 | for (int i = 0; i < nodeLabels.size(); i++) { 47 | // 先寻找2个端点标号,t代表找到的点的另外一个端点标号 48 | if (nodeLabels.get(i) == x) { 49 | t = y; 50 | } else if (nodeLabels.get(i) == y) { 51 | t = x; 52 | } else { 53 | continue; 54 | } 55 | 56 | for (int j = 0; j < edgeNexts.get(i).size(); j++) { 57 | // 从此端点的所连接的点去比较对应的点和边 58 | if (edgeLabels.get(i).get(j) == a && nodeLabels.get(edgeNexts.get(i).get(j)) == t) { 59 | isContained = true; 60 | return isContained; 61 | } 62 | } 63 | } 64 | 65 | return isContained; 66 | } 67 | 68 | /** 69 | * 在图中移除某个边 70 | * 71 | * @param x 72 | * 边的某端的一个点标号 73 | * @param a 74 | * 边的标号 75 | * @param y 76 | * 边的另一端的一个点标号 77 | */ 78 | public void removeEdge(int x, int a, int y) { 79 | int t; 80 | 81 | for (int i = 0; i < nodeLabels.size(); i++) { 82 | // 先寻找2个端点标号,t代表找到的点的另外一个端点标号 83 | if (nodeLabels.get(i) == x) { 84 | t = y; 85 | } else if (nodeLabels.get(i) == y) { 86 | t = x; 87 | } else { 88 | continue; 89 | } 90 | 91 | for (int j = 0; j < edgeNexts.get(i).size(); j++) { 92 | // 从此端点的所连接的点去比较对应的点和边 93 | if (edgeLabels.get(i).get(j) == a && nodeLabels.get(edgeNexts.get(i).get(j)) == t) { 94 | int id; 95 | // 在连接的点中去除该点 96 | edgeLabels.get(i).remove(j); 97 | 98 | id = edgeNexts.get(i).get(j); 99 | edgeNexts.get(i).remove(j); 100 | for (int k = 0; k < edgeNexts.get(id).size(); k++) { 101 | if (edgeNexts.get(id).get(k) == i) { 102 | edgeNexts.get(id).remove(k); 103 | break; 104 | } 105 | } 106 | break; 107 | } 108 | } 109 | } 110 | 111 | } 112 | 113 | /** 114 | * 根据图数据构造一个图 115 | * 116 | * @param gd 117 | * 图数据 118 | * @return 119 | */ 120 | public Graph constructGraph(GraphData gd) { 121 | Graph graph = new Graph(); 122 | 123 | // 构造一个图需要知道3点,1.图中有哪些点2.图中的每个点周围连着哪些点3.每个点周围连着哪些边 124 | for (int i = 0; i < gd.getNodeVisibles().size(); i++) { 125 | if (gd.getNodeVisibles().get(i)) { 126 | graph.getNodeLabels().add(gd.getNodeLabels().get(i)); 127 | } 128 | 129 | // 
添加对应id下的集合 130 | // id节点后有多少相连的边的标号 131 | graph.edgeLabels.add(new ArrayList()); 132 | // id节点后有多少相连的节点的id 133 | graph.edgeNexts.add(new ArrayList()); 134 | } 135 | 136 | for (int i = 0; i < gd.getEdgeLabels().size(); i++) { 137 | if (gd.getEdgeVisibles().get(i)) { 138 | // 在此后面添加一个边标号 139 | graph.edgeLabels.get(gd.getEdgeX().get(i)).add(gd.getEdgeLabels().get(i)); 140 | graph.edgeLabels.get(gd.getEdgeY().get(i)).add(gd.getEdgeLabels().get(i)); 141 | graph.edgeNexts.get(gd.getEdgeX().get(i)).add(gd.getEdgeY().get(i)); 142 | graph.edgeNexts.get(gd.getEdgeY().get(i)).add(gd.getEdgeX().get(i)); 143 | } 144 | } 145 | 146 | return graph; 147 | } 148 | } 149 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/graph/gspan/GraphCode.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.graph.gspan; 2 | 3 | import java.util.ArrayList; 4 | 5 | /** 6 | * 图编码类 7 | */ 8 | public class GraphCode { 9 | 10 | //边的集合,边的排序代表着边的添加次序 11 | ArrayList edgeSeq; 12 | //拥有这些边的图的id 13 | ArrayList gs; 14 | 15 | public GraphCode() { 16 | this.edgeSeq = new ArrayList<>(); 17 | this.gs = new ArrayList<>(); 18 | } 19 | 20 | public ArrayList getEdgeSeq() { 21 | return edgeSeq; 22 | } 23 | 24 | public void setEdgeSeq(ArrayList edgeSeq) { 25 | this.edgeSeq = edgeSeq; 26 | } 27 | 28 | public ArrayList getGs() { 29 | return gs; 30 | } 31 | 32 | public void setGs(ArrayList gs) { 33 | this.gs = gs; 34 | } 35 | 36 | } 37 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/graph/gspan/GraphData.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.graph.gspan; 2 | 3 | import java.util.ArrayList; 4 | 5 | /** 6 | * 图的数据类 7 | */ 8 | public class GraphData { 9 | 10 | // 节点组标号 11 | private ArrayList nodeLabels; 12 | // 节点是否可用,可能被移除 13 | private 
ArrayList nodeVisibles; 14 | // 边的集合标号 15 | private ArrayList edgeLabels; 16 | // 边的一边点id 17 | private ArrayList edgeX; 18 | // 边的另一边的点id 19 | private ArrayList edgeY; 20 | // 边是否可用 21 | private ArrayList edgeVisibles; 22 | 23 | public GraphData() { 24 | nodeLabels = new ArrayList<>(); 25 | nodeVisibles = new ArrayList<>(); 26 | 27 | edgeLabels = new ArrayList<>(); 28 | edgeX = new ArrayList<>(); 29 | edgeY = new ArrayList<>(); 30 | edgeVisibles = new ArrayList<>(); 31 | } 32 | 33 | public ArrayList getNodeLabels() { 34 | return nodeLabels; 35 | } 36 | 37 | public void setNodeLabels(ArrayList nodeLabels) { 38 | this.nodeLabels = nodeLabels; 39 | } 40 | 41 | public ArrayList getNodeVisibles() { 42 | return nodeVisibles; 43 | } 44 | 45 | public void setNodeVisibles(ArrayList nodeVisibles) { 46 | this.nodeVisibles = nodeVisibles; 47 | } 48 | 49 | public ArrayList getEdgeLabels() { 50 | return edgeLabels; 51 | } 52 | 53 | public void setEdgeLabels(ArrayList edgeLabels) { 54 | this.edgeLabels = edgeLabels; 55 | } 56 | 57 | public ArrayList getEdgeX() { 58 | return edgeX; 59 | } 60 | 61 | public void setEdgeX(ArrayList edgeX) { 62 | this.edgeX = edgeX; 63 | } 64 | 65 | public ArrayList getEdgeY() { 66 | return edgeY; 67 | } 68 | 69 | public void setEdgeY(ArrayList edgeY) { 70 | this.edgeY = edgeY; 71 | } 72 | 73 | public ArrayList getEdgeVisibles() { 74 | return edgeVisibles; 75 | } 76 | 77 | public void setEdgeVisibles(ArrayList edgeVisibles) { 78 | this.edgeVisibles = edgeVisibles; 79 | } 80 | 81 | /** 82 | * 根据点边频繁度移除图中不频繁的点边 83 | * 84 | * @param freqNodeLabel 85 | * 点的频繁度统计 86 | * @param freqEdgeLabel 87 | * 边的频繁度统计 88 | * @param minSupportCount 89 | * 最小支持度计数 90 | */ 91 | public void removeInFreqNodeAndEdge(int[] freqNodeLabel, int[] freqEdgeLabel, int minSupportCount) { 92 | int label = 0; 93 | int x = 0; 94 | int y = 0; 95 | 96 | for (int i = 0; i < nodeLabels.size(); i++) { 97 | label = nodeLabels.get(i); 98 | if (freqNodeLabel[label] < minSupportCount) { 99 | // 
如果小于支持度计数,则此点不可用 100 | nodeVisibles.set(i, false); 101 | } 102 | } 103 | 104 | for (int i = 0; i < edgeLabels.size(); i++) { 105 | label = edgeLabels.get(i); 106 | 107 | if (freqEdgeLabel[label] < minSupportCount) { 108 | // 如果小于支持度计数,则此边不可用 109 | edgeVisibles.set(i, false); 110 | continue; 111 | } 112 | 113 | // 如果此边的某个端的端点已经不可用了,则此边也不可用,x,y表示id号 114 | x = edgeX.get(i); 115 | y = edgeY.get(i); 116 | if (!nodeVisibles.get(x) || !nodeVisibles.get(y)) { 117 | edgeVisibles.set(i, false); 118 | } 119 | } 120 | } 121 | 122 | /** 123 | * 根据标号排序重新对满足条件的点边重新编号 124 | * 125 | * @param nodeLabel2Rank 126 | * 点排名 127 | * @param edgeLabel2Rank 128 | * 边排名 129 | */ 130 | public void reLabelByRank(int[] nodeLabel2Rank, int[] edgeLabel2Rank) { 131 | int label = 0; 132 | int count = 0; 133 | int temp = 0; 134 | // 旧的id对新id号的映射 135 | int[] oldId2New = new int[nodeLabels.size()]; 136 | for (int i = 0; i < nodeLabels.size(); i++) { 137 | label = nodeLabels.get(i); 138 | 139 | // 如果当前点是可用的,将此标号的排名号作为此点新的标号 140 | if (nodeVisibles.get(i)) { 141 | nodeLabels.set(i, nodeLabel2Rank[label]); 142 | oldId2New[i] = count; 143 | count++; 144 | } 145 | } 146 | 147 | for (int i = 0; i < edgeLabels.size(); i++) { 148 | label = edgeLabels.get(i); 149 | 150 | // 如果当前边是可用的,将此标号的排名号作为此点新的标号 151 | if (edgeVisibles.get(i)) { 152 | edgeLabels.set(i, edgeLabel2Rank[label]); 153 | 154 | // 对此点做x,y的id号替换 155 | temp = edgeX.get(i); 156 | edgeX.set(i, oldId2New[temp]); 157 | temp = edgeY.get(i); 158 | edgeY.set(i, oldId2New[temp]); 159 | } 160 | } 161 | } 162 | } 163 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/graph/gspan/SubChildTraveler.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.graph.gspan; 2 | 3 | import java.util.ArrayList; 4 | 5 | /** 6 | * 孩子图搜寻类,在当前边的基础上寻找可能的孩子边 7 | */ 8 | public class SubChildTraveler { 9 | 10 | // 当前的五元组边 11 | ArrayList edgeSeq; 12 | 
// 当前的图 13 | Graph graph; 14 | // 结果数据,孩子边对所属的图id组 15 | ArrayList childEdge; 16 | // 图的点id对五元组id标识的映射 17 | int[] g2s; 18 | // 五元组id标识对图的点id的映射 19 | int[] s2g; 20 | // 图中边是否被用的情况 21 | boolean f[][]; 22 | // 最右路径,rm[id]表示的是此id节点在最右路径中的下一个节点id 23 | int[] rm; 24 | // 下一个五元组的id 25 | int next; 26 | 27 | public SubChildTraveler(ArrayList edgeSeq, Graph graph) { 28 | this.edgeSeq = edgeSeq; 29 | this.graph = graph; 30 | this.childEdge = new ArrayList<>(); 31 | } 32 | 33 | /** 34 | * 在图中搜索可能存在的孩子边 35 | * 36 | * @param next 37 | * 新加入边的节点将设置的id 38 | */ 39 | public void traveler() { 40 | this.next = edgeSeq.size() + 1; 41 | int size = graph.nodeLabels.size(); 42 | // 做id映射的初始化操作 43 | g2s = new int[size]; 44 | s2g = new int[size]; 45 | f = new boolean[size][size]; 46 | 47 | for (int i = 0; i < size; i++) { 48 | g2s[i] = -1; 49 | s2g[i] = -1; 50 | 51 | for (int j = 0; j < size; j++) { 52 | // 代表点id为i到id为j点此边没有被用过 53 | f[i][j] = false; 54 | } 55 | } 56 | 57 | rm = new int[edgeSeq.size() + 1]; 58 | for (int i = 0; i < edgeSeq.size() + 1; i++) { 59 | rm[i] = -1; 60 | } 61 | // 寻找最右路径 62 | for (Edge e : edgeSeq) { 63 | if (e.ix < e.iy && e.iy > rm[e.ix]) { 64 | rm[e.ix] = e.iy; 65 | } 66 | } 67 | 68 | for (int i = 0; i < size; i++) { 69 | // 寻找第一个标号相等的点 70 | if (edgeSeq.get(0).x != graph.nodeLabels.get(i)) { 71 | continue; 72 | } 73 | 74 | g2s[i] = 0; 75 | s2g[0] = i; 76 | dfsSearchEdge(0); 77 | g2s[i] = -1; 78 | s2g[0] = -1; 79 | } 80 | 81 | } 82 | 83 | /** 84 | * 在当前图中深度优先寻找正确的子图 85 | * 86 | * @param currentPosition 87 | * 当前找到的位置 88 | */ 89 | public void dfsSearchEdge(int currentPosition) { 90 | int rmPosition = 0; 91 | // 如果找到底了,则在当前的子图的最右路径中寻找可能的边 92 | if (currentPosition >= edgeSeq.size()) { 93 | rmPosition = 0; 94 | while (rmPosition >= 0) { 95 | int gId = s2g[rmPosition]; 96 | // 在此点附近寻找可能的边 97 | for (int i = 0; i < graph.edgeNexts.get(gId).size(); i++) { 98 | int gId2 = graph.edgeNexts.get(gId).get(i); 99 | // 如果这条边已经被用过 (FIX: second operand was a duplicated f[gId][gId2]; the symmetric used-edge check must test the reverse orientation, as in DFSCodeTraveler's f[x][y] || f[y][x]) 100 | if (f[gId][gId2] || f[gId2][gId]) { 101 | 
continue; 102 | } 103 | 104 | // 在最右路径中添加边分为2种情况,第一种为在最右节点上添加,第二中为在最右路径上 的点添加 105 | // 如果找到的点没有被用过,可以进行边的拓展 106 | if (g2s[gId2] < 0) { 107 | g2s[gId2] = next; 108 | Edge e = new Edge(g2s[gId], g2s[gId2], graph.nodeLabels.get(gId), 109 | graph.edgeLabels.get(gId).get(i), graph.nodeLabels.get(gId2)); 110 | // 将新建的子边加入集合 111 | childEdge.add(e); 112 | } else { 113 | boolean flag = true; 114 | // 如果这点已经存在,判断他是不是最右的点 115 | for (int j = 0; j < graph.edgeNexts.get(gId2).size(); j++) { 116 | int tempId = graph.edgeNexts.get(gId2).get(j); 117 | if (g2s[gId2] < g2s[tempId]) { 118 | flag = false; 119 | break; 120 | } 121 | } 122 | 123 | if (flag) { 124 | Edge e = new Edge(g2s[gId], g2s[gId2], graph.nodeLabels.get(gId), 125 | graph.edgeLabels.get(gId).get(i), graph.nodeLabels.get(gId2)); 126 | // 将新建的子边加入集合 127 | childEdge.add(e); 128 | } 129 | } 130 | } 131 | // 一个最右路径上点找完,继续下一个 132 | rmPosition = rm[rmPosition]; 133 | } 134 | return; 135 | } 136 | 137 | Edge e = edgeSeq.get(currentPosition); 138 | // 所连接的点标号 139 | int y = e.y; 140 | // 所连接的边标号 141 | int a = e.a; 142 | int gId1 = s2g[e.ix]; 143 | int gId2 = 0; 144 | 145 | for (int i = 0; i < graph.edgeLabels.get(gId1).size(); i++) { 146 | // 判断所连接的边对应的标号 147 | if (graph.edgeLabels.get(gId1).get(i) != a) { 148 | continue; 149 | } 150 | 151 | // 判断所连接的点的标号 152 | int tempId = graph.edgeNexts.get(gId1).get(i); 153 | if (graph.nodeLabels.get(tempId) != y) { 154 | continue; 155 | } 156 | 157 | gId2 = tempId; 158 | // 如果这两点是没有设置过的 159 | if (g2s[gId2] == -1 && s2g[e.iy] == -1) { 160 | g2s[gId2] = e.iy; 161 | s2g[e.iy] = gId2; 162 | f[gId1][gId2] = true; 163 | f[gId2][gId1] = true; 164 | dfsSearchEdge(currentPosition + 1); 165 | f[gId1][gId2] = false; 166 | f[gId2][gId1] = false; 167 | g2s[gId2] = -1; 168 | s2g[e.iy] = -1; 169 | } else { 170 | if (g2s[gId2] != e.iy) { 171 | continue; 172 | } 173 | if (s2g[e.iy] != gId2) { 174 | continue; 175 | } 176 | f[gId1][gId2] = true; 177 | f[gId2][gId1] = true; 178 | 
dfsSearchEdge(currentPosition); 179 | f[gId1][gId2] = false; 180 | f[gId2][gId1] = false; 181 | } 182 | } 183 | 184 | } 185 | 186 | /** 187 | * 获取结果数据对 188 | * 189 | * @return 190 | */ 191 | public ArrayList getResultChildEdge() { 192 | return this.childEdge; 193 | } 194 | 195 | } 196 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/integrated/cba/CBAExample.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.integrated.cba; 2 | 3 | import java.text.MessageFormat; 4 | 5 | /** 6 | * CBA算法--基于关联规则的分类算法 7 | */ 8 | public class CBAExample { 9 | 10 | public static void main(String[] args) { 11 | String filePath = "data/cba/input.txt"; 12 | String attrDesc = "Age=Senior,CreditRating=Fair"; 13 | String classification = null; 14 | 15 | //最小支持度阈值率 16 | double minSupportRate = 0.2; 17 | //最小置信度阈值 18 | double minConf = 0.7; 19 | 20 | CBACore tool = new CBACore(filePath, minSupportRate, minConf); 21 | classification = tool.CBAJudge(attrDesc); 22 | System.out.println(MessageFormat.format("{0}的关联分类结果为{1}", attrDesc, classification)); 23 | } 24 | 25 | } 26 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/integrated/cba/FrequentItem.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.integrated.cba; 2 | 3 | /** 4 | * 频繁项集 5 | */ 6 | public class FrequentItem implements Comparable { 7 | 8 | // 频繁项集的集合ID 9 | private String[] idArray; 10 | // 频繁项集的支持度计数 11 | private int count; 12 | //频繁项集的长度,1项集或是2项集,亦或是3项集 13 | private int length; 14 | 15 | public FrequentItem(String[] idArray, int count) { 16 | this.idArray = idArray; 17 | this.count = count; 18 | length = idArray.length; 19 | } 20 | 21 | public String[] getIdArray() { 22 | return idArray; 23 | } 24 | 25 | public void setIdArray(String[] idArray) { 26 | 
this.idArray = idArray; 27 | } 28 | 29 | public int getCount() { 30 | return count; 31 | } 32 | 33 | public void setCount(int count) { 34 | this.count = count; 35 | } 36 | 37 | public int getLength() { 38 | return length; 39 | } 40 | 41 | public void setLength(int length) { 42 | this.length = length; 43 | } 44 | 45 | @Override 46 | public int compareTo(FrequentItem o) { 47 | // TODO Auto-generated method stub 48 | Integer int1 = Integer.parseInt(this.getIdArray()[0]); 49 | Integer int2 = Integer.parseInt(o.getIdArray()[0]); 50 | 51 | return int1.compareTo(int2); 52 | } 53 | 54 | } 55 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/link/hits/HITSCore.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.link.hits; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileReader; 6 | import java.io.IOException; 7 | import java.util.ArrayList; 8 | 9 | /** 10 | * HITS链接分析算法工具类 11 | */ 12 | public class HITSCore { 13 | 14 | //输入数据文件地址 15 | private String filePath; 16 | //网页个数 17 | private int pageNum; 18 | //网页Authority权威值 19 | private double[] authority; 20 | //网页hub中心值 21 | private double[] hub; 22 | //链接矩阵关系 23 | private int[][] linkMatrix; 24 | //网页种类 25 | private ArrayList pageClass; 26 | 27 | public HITSCore(String filePath) { 28 | this.filePath = filePath; 29 | readDataFile(); 30 | } 31 | 32 | /** 33 | * 从文件中读取数据 34 | */ 35 | private void readDataFile() { 36 | File file = new File(filePath); 37 | ArrayList dataArray = new ArrayList(); 38 | 39 | try { 40 | BufferedReader in = new BufferedReader(new FileReader(file)); 41 | String str; 42 | String[] tempArray; 43 | while ((str = in.readLine()) != null) { 44 | tempArray = str.split(" "); 45 | dataArray.add(tempArray); 46 | } 47 | in.close(); 48 | } catch (IOException e) { 49 | e.getStackTrace(); 50 | } 51 | 52 | pageClass = new ArrayList<>(); 53 | // 
统计网页类型种数 54 | for (String[] array : dataArray) { 55 | for (String s : array) { 56 | if (!pageClass.contains(s)) { 57 | pageClass.add(s); 58 | } 59 | } 60 | } 61 | 62 | int i = 0; 63 | int j = 0; 64 | pageNum = pageClass.size(); 65 | linkMatrix = new int[pageNum][pageNum]; 66 | authority = new double[pageNum]; 67 | hub = new double[pageNum]; 68 | for (int k = 0; k < pageNum; k++) { 69 | //初始时默认权威值和中心值都为1 70 | authority[k] = 1; 71 | hub[k] = 1; 72 | } 73 | 74 | for (String[] array : dataArray) { 75 | 76 | i = Integer.parseInt(array[0]); 77 | j = Integer.parseInt(array[1]); 78 | 79 | // 设置linkMatrix[i][j]为1代表i网页包含指向j网页的链接 80 | linkMatrix[i - 1][j - 1] = 1; 81 | } 82 | } 83 | 84 | /** 85 | * 输出结果页面,也就是authority权威值最高的页面 86 | */ 87 | public void printResultPage() { 88 | //最大Hub和Authority值,用于后面的归一化计算 89 | double maxHub = 0; 90 | double maxAuthority = 0; 91 | int maxAuthorityIndex = 0; 92 | //误差值,用于收敛判断 93 | double error = Integer.MAX_VALUE; 94 | double[] newHub = new double[pageNum]; 95 | double[] newAuthority = new double[pageNum]; 96 | 97 | while (error > 0.01 * pageNum) { 98 | for (int k = 0; k < pageNum; k++) { 99 | newHub[k] = 0; 100 | newAuthority[k] = 0; 101 | } 102 | 103 | //hub和authority值的更新计算 104 | for (int i = 0; i < pageNum; i++) { 105 | for (int j = 0; j < pageNum; j++) { 106 | if (linkMatrix[i][j] == 1) { 107 | newHub[i] += authority[j]; 108 | newAuthority[j] += hub[i]; 109 | } 110 | } 111 | } 112 | 113 | maxHub = 0; 114 | maxAuthority = 0; 115 | for (int k = 0; k < pageNum; k++) { 116 | if (newHub[k] > maxHub) { 117 | maxHub = newHub[k]; 118 | } 119 | 120 | if (newAuthority[k] > maxAuthority) { 121 | maxAuthority = newAuthority[k]; 122 | maxAuthorityIndex = k; 123 | } 124 | } 125 | 126 | error = 0; 127 | //归一化处理 128 | for (int k = 0; k < pageNum; k++) { 129 | newHub[k] /= maxHub; 130 | newAuthority[k] /= maxAuthority; 131 | 132 | error += Math.abs(newHub[k] - hub[k]); 133 | System.out.println(newAuthority[k] + ":" + newHub[k]); 134 | 135 | hub[k] = 
newHub[k]; 136 | authority[k] = newAuthority[k]; 137 | } 138 | System.out.println("---------"); 139 | } 140 | 141 | System.out.println("****最终收敛的网页的权威值和中心值****"); 142 | for (int k = 0; k < pageNum; k++) { 143 | System.out.println("网页" + pageClass.get(k) + ":" + authority[k] + ":" + hub[k]); 144 | } 145 | System.out.println("权威值最高的网页为:网页" + pageClass.get(maxAuthorityIndex)); 146 | } 147 | 148 | } 149 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/link/hits/HITSExample.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.link.hits; 2 | 3 | /** 4 | * HITS链接分析算法 5 | */ 6 | public class HITSExample { 7 | 8 | public static void main(String[] args) { 9 | String filePath = "data/hits/input.txt"; 10 | 11 | HITSCore tool = new HITSCore(filePath); 12 | tool.printResultPage(); 13 | } 14 | 15 | } 16 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/link/pagerank/PageRankCore.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.link.pagerank; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileReader; 6 | import java.io.IOException; 7 | import java.text.MessageFormat; 8 | import java.util.ArrayList; 9 | 10 | /** 11 | * PageRank网页排名算法工具类 12 | */ 13 | public class PageRankCore { 14 | 15 | // 测试输入数据 16 | private String filePath; 17 | // 网页总数量 18 | private int pageNum; 19 | // 链接关系矩阵 20 | private double[][] linkMatrix; 21 | // 每个页面pageRank值初始向量 22 | private double[] pageRankVecor; 23 | 24 | // 网页数量分类 25 | ArrayList pageClass; 26 | 27 | public PageRankCore(String filePath) { 28 | this.filePath = filePath; 29 | readDataFile(); 30 | } 31 | 32 | /** 33 | * 从文件中读取数据 34 | */ 35 | private void readDataFile() { 36 | File file = new File(filePath); 37 | ArrayList dataArray = new 
ArrayList(); 38 | 39 | try { 40 | BufferedReader in = new BufferedReader(new FileReader(file)); 41 | String str; 42 | String[] tempArray; 43 | while ((str = in.readLine()) != null) { 44 | tempArray = str.split(" "); 45 | dataArray.add(tempArray); 46 | } 47 | in.close(); 48 | } catch (IOException e) { 49 | e.printStackTrace(); // FIX: was e.getStackTrace(), whose returned StackTraceElement[] was discarded — the IOException was silently swallowed with no diagnostic at all 50 | } 51 | 52 | pageClass = new ArrayList<>(); 53 | // 统计网页类型种数 54 | for (String[] array : dataArray) { 55 | for (String s : array) { 56 | if (!pageClass.contains(s)) { 57 | pageClass.add(s); 58 | } 59 | } 60 | } 61 | 62 | int i = 0; 63 | int j = 0; 64 | pageNum = pageClass.size(); 65 | linkMatrix = new double[pageNum][pageNum]; 66 | pageRankVecor = new double[pageNum]; 67 | for (int k = 0; k < pageNum; k++) { 68 | // 初始每个页面的pageRank值为1 69 | pageRankVecor[k] = 1.0; 70 | } 71 | for (String[] array : dataArray) { 72 | 73 | i = Integer.parseInt(array[0]); 74 | j = Integer.parseInt(array[1]); 75 | 76 | // 设置linkMatrix[i][j]为1代表i网页包含指向j网页的链接 77 | linkMatrix[i - 1][j - 1] = 1; 78 | } 79 | } 80 | 81 | /** 82 | * 将矩阵转置 83 | */ 84 | private void transferMatrix() { 85 | int count = 0; 86 | for (double[] array : linkMatrix) { 87 | // 计算页面链接个数 88 | count = 0; 89 | for (double d : array) { 90 | if (d == 1) { 91 | count++; 92 | } 93 | } 94 | // 按概率均分 95 | for (int i = 0; i < array.length; i++) { 96 | if (array[i] == 1) { 97 | array[i] /= count; 98 | } 99 | } 100 | } 101 | 102 | double t = 0; 103 | // 将矩阵转置换,作为概率转移矩阵 104 | for (int i = 0; i < linkMatrix.length; i++) { 105 | for (int j = i + 1; j < linkMatrix[0].length; j++) { 106 | t = linkMatrix[i][j]; 107 | linkMatrix[i][j] = linkMatrix[j][i]; 108 | linkMatrix[j][i] = t; 109 | } 110 | } 111 | } 112 | 113 | /** 114 | * 利用幂法计算pageRank值 115 | */ 116 | public void printPageRankValue() { 117 | transferMatrix(); 118 | // 阻尼系数 119 | double damp = 0.5; 120 | // 链接概率矩阵 121 | double[][] A = new double[pageNum][pageNum]; 122 | double[][] e = new double[pageNum][pageNum]; 123 | 124 | 
调用公式A=d*q+(1-d)*e/m,m为网页总个数,d就是damp 125 | double temp = (1 - damp) / pageNum; 126 | for (int i = 0; i < e.length; i++) { 127 | for (int j = 0; j < e[0].length; j++) { 128 | e[i][j] = temp; 129 | } 130 | } 131 | 132 | for (int i = 0; i < pageNum; i++) { 133 | for (int j = 0; j < pageNum; j++) { 134 | temp = damp * linkMatrix[i][j] + e[i][j]; 135 | A[i][j] = temp; 136 | 137 | } 138 | } 139 | 140 | // 误差值,作为判断收敛标准 141 | double errorValue = Integer.MAX_VALUE; 142 | double[] newPRVector = new double[pageNum]; 143 | // 当平均每个PR值误差小于0.001时就算达到收敛 144 | while (errorValue > 0.001 * pageNum) { 145 | System.out.println("**********"); 146 | for (int i = 0; i < pageNum; i++) { 147 | temp = 0; 148 | // 将A*pageRankVector,利用幂法求解,直到pageRankVector值收敛 149 | for (int j = 0; j < pageNum; j++) { 150 | // temp就是每个网页到i页面的pageRank值 151 | temp += A[i][j] * pageRankVecor[j]; 152 | } 153 | 154 | // 最后的temp就是i网页的总PageRank值 155 | newPRVector[i] = temp; 156 | System.out.println(temp); 157 | } 158 | 159 | errorValue = 0; 160 | for (int i = 0; i < pageNum; i++) { 161 | errorValue += Math.abs(pageRankVecor[i] - newPRVector[i]); 162 | // 新的向量代替旧的向量 163 | pageRankVecor[i] = newPRVector[i]; 164 | } 165 | } 166 | 167 | String name = null; 168 | temp = 0; 169 | System.out.println("--------------------"); 170 | for (int i = 0; i < pageNum; i++) { 171 | System.out.println(MessageFormat.format("网页{0}的pageRank值:{1}", pageClass.get(i), pageRankVecor[i])); 172 | if (pageRankVecor[i] > temp) { 173 | temp = pageRankVecor[i]; 174 | name = pageClass.get(i); 175 | } 176 | } 177 | System.out.println(MessageFormat.format("等级最高的网页为:{0}", name)); 178 | } 179 | 180 | } 181 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/link/pagerank/PageRankExample.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.link.pagerank; 2 | 3 | /** 4 | * PageRank计算网页重要性/排名算法 5 | */ 6 | public class 
PageRankExample { 7 | 8 | public static void main(String[] args) { 9 | String filePath = "data/pagerank/input.txt"; 10 | 11 | PageRankCore tool = new PageRankCore(filePath); 12 | tool.printPageRankValue(); 13 | } 14 | 15 | } 16 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/others/aco/ACOExample.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.others.aco; 2 | 3 | /** 4 | * 蚁群算法测试类 5 | */ 6 | public class ACOExample { 7 | 8 | public static void main(String[] args) { 9 | //测试数据 10 | String filePath = "data/aco/input.txt"; 11 | //蚂蚁数量 12 | int antNum; 13 | //蚁群算法迭代次数 14 | int loopCount; 15 | //控制参数 16 | double alpha; 17 | double beita; 18 | double p; 19 | double Q; 20 | 21 | antNum = 3; 22 | alpha = 0.5; 23 | beita = 1; 24 | p = 0.5; 25 | Q = 5; 26 | loopCount = 5; 27 | 28 | ACOCore tool = new ACOCore(filePath, antNum, alpha, beita, p, Q); 29 | tool.antStartSearching(loopCount); 30 | } 31 | 32 | } 33 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/others/aco/Ant.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.others.aco; 2 | 3 | import java.util.ArrayList; 4 | 5 | /** 6 | * 蚂蚁类,进行路径搜索的载体 7 | */ 8 | public class Ant implements Comparable { 9 | 10 | // 蚂蚁当前所在城市 11 | String currentPos; 12 | // 蚂蚁遍历完回到原点所用的总距离 13 | Double sumDistance; 14 | // 城市间的信息素浓度矩阵,随着时间的增多而减少 15 | double[][] pheromoneMatrix; 16 | // 蚂蚁已经走过的城市集合 17 | ArrayList visitedCitys; 18 | // 还未走过的城市集合 19 | ArrayList nonVisitedCitys; 20 | // 蚂蚁当前走过的路径 21 | ArrayList currentPath; 22 | 23 | public Ant(double[][] pheromoneMatrix, ArrayList nonVisitedCitys) { 24 | this.pheromoneMatrix = pheromoneMatrix; 25 | this.nonVisitedCitys = nonVisitedCitys; 26 | 27 | this.visitedCitys = new ArrayList<>(); 28 | this.currentPath = new 
ArrayList<>(); 29 | } 30 | 31 | /** 32 | * 计算路径的总成本(距离) 33 | * 34 | * @return 35 | */ 36 | public double calSumDistance() { 37 | sumDistance = 0.0; 38 | String lastCity; 39 | String currentCity; 40 | 41 | for (int i = 0; i < currentPath.size() - 1; i++) { 42 | lastCity = currentPath.get(i); 43 | currentCity = currentPath.get(i + 1); 44 | 45 | // 通过距离矩阵进行计算 46 | sumDistance += ACOCore.disMatrix[Integer.parseInt(lastCity)][Integer.parseInt(currentCity)]; 47 | } 48 | 49 | return sumDistance; 50 | } 51 | 52 | /** 53 | * 蚂蚁选择前往下一个城市 54 | * 55 | * @param city 56 | * 所选的城市 57 | */ 58 | public void goToNextCity(String city) { 59 | this.currentPath.add(city); 60 | this.currentPos = city; 61 | this.nonVisitedCitys.remove(city); 62 | this.visitedCitys.add(city); 63 | } 64 | 65 | /** 66 | * 判断蚂蚁是否已经又重新回到起点 67 | * 68 | * @return 69 | */ 70 | public boolean isBack() { 71 | boolean isBack = false; 72 | String startPos; 73 | String endPos; 74 | 75 | if (currentPath.size() == 0) { 76 | return isBack; 77 | } 78 | 79 | startPos = currentPath.get(0); 80 | endPos = currentPath.get(currentPath.size() - 1); 81 | if (currentPath.size() > 1 && startPos.equals(endPos)) { 82 | isBack = true; 83 | } 84 | 85 | return isBack; 86 | } 87 | 88 | /** 89 | * 判断蚂蚁在本次的走过的路径中是否包含从城市i到城市j 90 | * 91 | * @param cityI 92 | * 城市I 93 | * @param cityJ 94 | * 城市J 95 | * @return 96 | */ 97 | public boolean pathContained(String cityI, String cityJ) { 98 | String lastCity; 99 | String currentCity; 100 | boolean isContained = false; 101 | 102 | for (int i = 0; i < currentPath.size() - 1; i++) { 103 | lastCity = currentPath.get(i); 104 | currentCity = currentPath.get(i + 1); 105 | 106 | // 如果某一段路径的始末位置一致,则认为有经过此城市 107 | if ((lastCity.equals(cityI) && currentCity.equals(cityJ)) 108 | || (lastCity.equals(cityJ) && currentCity.equals(cityI))) { 109 | isContained = true; 110 | break; 111 | } 112 | } 113 | 114 | return isContained; 115 | } 116 | 117 | @Override 118 | public int compareTo(Ant o) { 119 | // TODO 
Auto-generated method stub 120 | return this.sumDistance.compareTo(o.sumDistance); 121 | } 122 | } 123 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/others/bayesnetwork/BayesNetWorkExample.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.others.bayesnetwork; 2 | 3 | /** 4 | * 贝叶斯网络场景测试类 5 | */ 6 | public class BayesNetWorkExample { 7 | 8 | public static void main(String[] args) { 9 | String dataFilePath = "data/bayesnetwork/input.txt"; 10 | String attachFilePath = "data/bayesnetwork/attach.txt"; 11 | // 查询串语句 12 | String queryStr; 13 | // 结果概率 14 | double result; 15 | 16 | // 查询语句的描述的事件是地震发生了,导致响铃响了,导致接到Mary的电话 17 | queryStr = "E=y,A=y,M=y"; 18 | BayesNetWorkCore tool = new BayesNetWorkCore(dataFilePath, attachFilePath); 19 | result = tool.calProByNetWork(queryStr); 20 | 21 | if (result == -1) { 22 | System.out.println("所描述的事件不满足贝叶斯网络的结构,无法求其概率"); 23 | } else { 24 | System.out.println(String.format("事件%s发生的概率为%s", queryStr, result)); 25 | } 26 | } 27 | 28 | } 29 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/others/bayesnetwork/Node.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.others.bayesnetwork; 2 | 3 | import java.util.ArrayList; 4 | 5 | /** 6 | * 贝叶斯网络节点类 7 | */ 8 | public class Node { 9 | 10 | // 节点的属性名称 11 | String name; 12 | // 节点的父亲节点,也就是上游节点,可能多个 13 | ArrayList parentNodes; 14 | // 节点的子节点,也就是下游节点,可能多个 15 | ArrayList childNodes; 16 | 17 | public Node(String name) { 18 | this.name = name; 19 | 20 | // 初始化变量 21 | this.parentNodes = new ArrayList<>(); 22 | this.childNodes = new ArrayList<>(); 23 | } 24 | 25 | /** 26 | * 将自身节点连接到目标给定的节点 27 | * 28 | * @param node 29 | * 下游节点 30 | */ 31 | public void connectNode(Node node) { 32 | // 将下游节点加入自身节点的孩子节点中 33 | 
this.childNodes.add(node); 34 | // 将自身节点加入到下游节点的父节点中 35 | node.parentNodes.add(this); 36 | } 37 | 38 | /** 39 | * 判断与目标节点是否相同,主要比较名称是否相同即可 40 | * 41 | * @param node 42 | * 目标结点 43 | * @return 44 | */ 45 | public boolean isEqual(Node node) { 46 | boolean isEqual; 47 | 48 | isEqual = false; 49 | // 节点名称相同则视为相等 50 | if (this.name.equals(node.name)) { 51 | isEqual = true; 52 | } 53 | 54 | return isEqual; 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/others/cabddcc/CABDDCCCore.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.others.cabddcc; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileReader; 6 | import java.io.IOException; 7 | import java.text.MessageFormat; 8 | import java.util.ArrayList; 9 | 10 | /** 11 | * 基于连通图的分裂聚类算法 12 | */ 13 | public class CABDDCCCore { 14 | 15 | // 测试数据点数据 16 | private String filePath; 17 | // 连通图距离阈值l 18 | private int length; 19 | // 原始坐标点 20 | public static ArrayList totalPoints; 21 | // 聚类结果坐标点集合 22 | private ArrayList> resultClusters; 23 | // 连通图 24 | private Graph graph; 25 | 26 | public CABDDCCCore(String filePath, int length) { 27 | this.filePath = filePath; 28 | this.length = length; 29 | 30 | readDataFile(); 31 | } 32 | 33 | /** 34 | * 从文件中读取数据 35 | */ 36 | public void readDataFile() { 37 | File file = new File(filePath); 38 | ArrayList dataArray = new ArrayList(); 39 | 40 | try { 41 | BufferedReader in = new BufferedReader(new FileReader(file)); 42 | String str; 43 | String[] tempArray; 44 | while ((str = in.readLine()) != null) { 45 | tempArray = str.split(" "); 46 | dataArray.add(tempArray); 47 | } 48 | in.close(); 49 | } catch (IOException e) { 50 | e.getStackTrace(); 51 | } 52 | 53 | Point p; 54 | totalPoints = new ArrayList<>(); 55 | for (String[] array : dataArray) { 56 | p = new Point(array[0], array[1], array[2]); 57 | 
totalPoints.add(p); 58 | } 59 | 60 | // 用边和点构造图 61 | graph = new Graph(null, totalPoints); 62 | } 63 | 64 | /** 65 | * 分裂连通图得到聚类 66 | */ 67 | public void splitCluster() { 68 | // 获取形成连通子图 69 | ArrayList subGraphs; 70 | ArrayList> pointList; 71 | resultClusters = new ArrayList<>(); 72 | 73 | subGraphs = graph.splitGraphByLength(length); 74 | 75 | for (Graph g : subGraphs) { 76 | // 获取每个连通子图分裂后的聚类结果 77 | pointList = g.getClusterByDivding(); 78 | resultClusters.addAll(pointList); 79 | } 80 | 81 | printResultCluster(); 82 | } 83 | 84 | /** 85 | * 输出结果聚簇 86 | */ 87 | private void printResultCluster() { 88 | int i = 1; 89 | for (ArrayList cluster : resultClusters) { 90 | System.out.print("聚簇" + i + ":"); 91 | for (Point p : cluster) { 92 | System.out.print(MessageFormat.format("({0}, {1}) ", p.x, p.y)); 93 | } 94 | System.out.println(); 95 | i++; 96 | } 97 | 98 | } 99 | 100 | } 101 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/others/cabddcc/CABDDCCExample.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.others.cabddcc; 2 | 3 | /** 4 | * 基于连通图的分裂聚类算法 5 | */ 6 | public class CABDDCCExample { 7 | 8 | public static void main(String[] agrs) { 9 | String filePath = "data/cabddcc/graphData.txt"; 10 | //连通距离阈值 11 | int length = 3; 12 | 13 | CABDDCCCore tool = new CABDDCCCore(filePath, length); 14 | tool.splitCluster(); 15 | } 16 | 17 | } 18 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/others/cabddcc/Point.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.others.cabddcc; 2 | 3 | /** 4 | * 坐标点类 5 | */ 6 | public class Point implements Comparable { 7 | 8 | //坐标点id号,id号唯一 9 | int id; 10 | //坐标横坐标 11 | Integer x; 12 | //坐标纵坐标 13 | Integer y; 14 | //坐标点是否已经被访问(处理)过,在生成连通子图的时候用到 15 | boolean isVisited; 
16 | 17 | public Point(String id, String x, String y) { 18 | this.id = Integer.parseInt(id); 19 | this.x = Integer.parseInt(x); 20 | this.y = Integer.parseInt(y); 21 | } 22 | 23 | /** 24 | * 计算当前点与制定点之间的欧式距离 25 | * 26 | * @param p 27 | * 待计算聚类的p点 28 | * @return 29 | */ 30 | public double ouDistance(Point p) { 31 | double distance = 0; 32 | 33 | distance = (this.x - p.x) * (this.x - p.x) + (this.y - p.y) * (this.y - p.y); 34 | distance = Math.sqrt(distance); 35 | 36 | return distance; 37 | } 38 | 39 | /** 40 | * 判断2个坐标点是否为用个坐标点 41 | * 42 | * @param p 43 | * 待比较坐标点 44 | * @return 45 | */ 46 | public boolean isTheSame(Point p) { 47 | boolean isSamed = false; 48 | 49 | if (this.x == p.x && this.y == p.y) { 50 | isSamed = true; 51 | } 52 | 53 | return isSamed; 54 | } 55 | 56 | @Override 57 | public int compareTo(Point p) { 58 | if (this.x.compareTo(p.x) != 0) { 59 | return this.x.compareTo(p.x); 60 | } else { 61 | //如果在x坐标相等的情况下比较y坐标 62 | return this.y.compareTo(p.y); 63 | } 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/others/chameleon/ChameleonExample.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.others.chameleon; 2 | 3 | /** 4 | * Chameleon(变色龙)两阶段聚类算法 5 | */ 6 | public class ChameleonExample { 7 | 8 | public static void main(String[] args) { 9 | String filePath = "data/chameleon/graphData.txt"; 10 | //k-近邻的k设置 11 | int k = 1; 12 | //度量函数阈值 13 | double minMetric = 0.1; 14 | 15 | ChameleonCore tool = new ChameleonCore(filePath, k, minMetric); 16 | tool.buildCluster(); 17 | } 18 | 19 | } 20 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/others/chameleon/Cluster.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.others.chameleon; 2 | 3 | import java.util.ArrayList; 4 | 5 
| /** 6 | * 聚簇类 7 | */ 8 | public class Cluster implements Cloneable { 9 | 10 | //簇唯一id标识号 11 | int id; 12 | // 聚簇内的坐标点集合 13 | ArrayList points; 14 | // 聚簇内的所有边的权重和 15 | double weightSum = 0; 16 | 17 | public Cluster(int id, ArrayList points) { 18 | this.id = id; 19 | this.points = points; 20 | } 21 | 22 | /** 23 | * 计算聚簇的内部的边权重和 24 | * 25 | * @return 26 | */ 27 | public double calEC() { 28 | int id1 = 0; 29 | int id2 = 0; 30 | weightSum = 0; 31 | 32 | for (Point p1 : points) { 33 | for (Point p2 : points) { 34 | id1 = p1.id; 35 | id2 = p2.id; 36 | 37 | // 为了避免重复计算,取id1小的对应大的 38 | if (id1 < id2 && ChameleonCore.edges[id1][id2] == 1) { 39 | weightSum += ChameleonCore.weights[id1][id2]; 40 | } 41 | } 42 | } 43 | 44 | return weightSum; 45 | } 46 | 47 | /** 48 | * 计算2个簇之间最近的n条边 49 | * 50 | * @param otherCluster 51 | * 待比较的簇 52 | * @param n 53 | * 最近的边的数目 54 | * @return 55 | */ 56 | public ArrayList calNearestEdge(Cluster otherCluster, int n) { 57 | int count = 0; 58 | double distance = 0; 59 | double minDistance = Integer.MAX_VALUE; 60 | Point point1 = null; 61 | Point point2 = null; 62 | ArrayList edgeList = new ArrayList<>(); 63 | ArrayList pointList1 = (ArrayList) points.clone(); 64 | ArrayList pointList2 = null; 65 | Cluster c2 = null; 66 | 67 | try { 68 | c2 = (Cluster) otherCluster.clone(); 69 | pointList2 = c2.points; 70 | } catch (CloneNotSupportedException e) { 71 | // TODO Auto-generated catch block 72 | e.printStackTrace(); 73 | } 74 | 75 | int[] tempEdge; 76 | // 循环计算出每次的最近距离 77 | while (count < n) { 78 | tempEdge = new int[2]; 79 | minDistance = Integer.MAX_VALUE; 80 | 81 | for (Point p1 : pointList1) { 82 | for (Point p2 : pointList2) { 83 | distance = p1.ouDistance(p2); 84 | if (distance < minDistance) { 85 | point1 = p1; 86 | point2 = p2; 87 | tempEdge[0] = p1.id; 88 | tempEdge[1] = p2.id; 89 | 90 | minDistance = distance; 91 | } 92 | } 93 | } 94 | 95 | pointList1.remove(point1); 96 | pointList2.remove(point2); 97 | edgeList.add(tempEdge); 98 | count++; 
99 | } 100 | 101 | return edgeList; 102 | } 103 | 104 | @Override 105 | protected Object clone() throws CloneNotSupportedException { 106 | // TODO Auto-generated method stub 107 | 108 | //引用需要再次复制,实现深拷贝 109 | ArrayList pointList = (ArrayList) this.points.clone(); 110 | Cluster cluster = new Cluster(id, pointList); 111 | 112 | return cluster; 113 | } 114 | 115 | } 116 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/others/chameleon/Point.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.others.chameleon; 2 | 3 | /** 4 | * 坐标点类 5 | */ 6 | public class Point { 7 | 8 | //坐标点id号,id号唯一 9 | int id; 10 | //坐标横坐标 11 | Integer x; 12 | //坐标纵坐标 13 | Integer y; 14 | //是否已经被访问过 15 | boolean isVisited; 16 | 17 | public Point(String id, String x, String y) { 18 | this.id = Integer.parseInt(id); 19 | this.x = Integer.parseInt(x); 20 | this.y = Integer.parseInt(y); 21 | } 22 | 23 | /** 24 | * 计算当前点与制定点之间的欧式距离 25 | * 26 | * @param p 27 | * 待计算聚类的p点 28 | * @return 29 | */ 30 | public double ouDistance(Point p) { 31 | double distance = 0; 32 | 33 | distance = (this.x - p.x) * (this.x - p.x) + (this.y - p.y) * (this.y - p.y); 34 | distance = Math.sqrt(distance); 35 | 36 | return distance; 37 | } 38 | 39 | /** 40 | * 判断2个坐标点是否为用个坐标点 41 | * 42 | * @param p 43 | * 待比较坐标点 44 | * @return 45 | */ 46 | public boolean isTheSame(Point p) { 47 | boolean isSamed = false; 48 | 49 | if (this.x == p.x && this.y == p.y) { 50 | isSamed = true; 51 | } 52 | 53 | return isSamed; 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/others/dbscan/DBSCANCore.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.others.dbscan; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileReader; 6 | 
import java.io.IOException; 7 | import java.text.MessageFormat; 8 | import java.util.ArrayList; 9 | 10 | /** 11 | * DBSCAN基于密度聚类算法工具类 12 | */ 13 | public class DBSCANCore { 14 | 15 | // 测试数据文件地址 16 | private String filePath; 17 | // 簇扫描半径 18 | private double eps; 19 | // 最小包含点数阈值 20 | private int minPts; 21 | // 所有的数据坐标点 22 | private ArrayList totalPoints; 23 | // 聚簇结果 24 | private ArrayList> resultClusters; 25 | //噪声数据 26 | private ArrayList noisePoint; 27 | 28 | public DBSCANCore(String filePath, double eps, int minPts) { 29 | this.filePath = filePath; 30 | this.eps = eps; 31 | this.minPts = minPts; 32 | readDataFile(); 33 | } 34 | 35 | /** 36 | * 从文件中读取数据 37 | */ 38 | public void readDataFile() { 39 | File file = new File(filePath); 40 | ArrayList dataArray = new ArrayList(); 41 | 42 | try { 43 | BufferedReader in = new BufferedReader(new FileReader(file)); 44 | String str; 45 | String[] tempArray; 46 | while ((str = in.readLine()) != null) { 47 | tempArray = str.split(" "); 48 | dataArray.add(tempArray); 49 | } 50 | in.close(); 51 | } catch (IOException e) { 52 | e.getStackTrace(); 53 | } 54 | 55 | Point p; 56 | totalPoints = new ArrayList<>(); 57 | for (String[] array : dataArray) { 58 | p = new Point(array[0], array[1]); 59 | totalPoints.add(p); 60 | } 61 | } 62 | 63 | /** 64 | * 递归的寻找聚簇 65 | * 66 | * @param pointList 67 | * 当前的点列表 68 | * @param parentCluster 69 | * 父聚簇 70 | */ 71 | private void recursiveCluster(Point point, ArrayList parentCluster) { 72 | double distance = 0; 73 | ArrayList cluster; 74 | 75 | // 如果已经访问过了,则跳过 76 | if (point.isVisited) { 77 | return; 78 | } 79 | 80 | point.isVisited = true; 81 | cluster = new ArrayList<>(); 82 | for (Point p2 : totalPoints) { 83 | // 过滤掉自身的坐标点 84 | if (point.isTheSame(p2)) { 85 | continue; 86 | } 87 | 88 | distance = point.ouDistance(p2); 89 | if (distance <= eps) { 90 | // 如果聚类小于给定的半径,则加入簇中 91 | cluster.add(p2); 92 | } 93 | } 94 | 95 | if (cluster.size() >= minPts) { 96 | // 将自己也加入到聚簇中 97 | 
cluster.add(point); 98 | // 如果附近的节点个数超过最下值,则加入到父聚簇中,同时去除重复的点 99 | addCluster(parentCluster, cluster); 100 | 101 | for (Point p : cluster) { 102 | recursiveCluster(p, parentCluster); 103 | } 104 | } 105 | } 106 | 107 | /** 108 | * 往父聚簇中添加局部簇坐标点 109 | * 110 | * @param parentCluster 111 | * 原始父聚簇坐标点 112 | * @param cluster 113 | * 待合并的聚簇 114 | */ 115 | private void addCluster(ArrayList parentCluster, ArrayList cluster) { 116 | boolean isCotained = false; 117 | ArrayList addPoints = new ArrayList<>(); 118 | 119 | for (Point p : cluster) { 120 | isCotained = false; 121 | for (Point p2 : parentCluster) { 122 | if (p.isTheSame(p2)) { 123 | isCotained = true; 124 | break; 125 | } 126 | } 127 | 128 | if (!isCotained) { 129 | addPoints.add(p); 130 | } 131 | } 132 | 133 | parentCluster.addAll(addPoints); 134 | } 135 | 136 | /** 137 | * dbScan算法基于密度的聚类 138 | */ 139 | public void dbScanCluster() { 140 | ArrayList cluster = null; 141 | resultClusters = new ArrayList<>(); 142 | noisePoint = new ArrayList<>(); 143 | 144 | for (Point p : totalPoints) { 145 | if (p.isVisited) { 146 | continue; 147 | } 148 | 149 | cluster = new ArrayList<>(); 150 | recursiveCluster(p, cluster); 151 | 152 | if (cluster.size() > 0) { 153 | resultClusters.add(cluster); 154 | } else { 155 | noisePoint.add(p); 156 | } 157 | } 158 | removeFalseNoise(); 159 | 160 | printClusters(); 161 | } 162 | 163 | /** 164 | * 移除被错误分类的噪声点数据 165 | */ 166 | private void removeFalseNoise() { 167 | ArrayList totalCluster = new ArrayList<>(); 168 | ArrayList deletePoints = new ArrayList<>(); 169 | 170 | //将聚簇合并 171 | for (ArrayList list : resultClusters) { 172 | totalCluster.addAll(list); 173 | } 174 | 175 | for (Point p : noisePoint) { 176 | for (Point p2 : totalCluster) { 177 | if (p2.isTheSame(p)) { 178 | deletePoints.add(p); 179 | } 180 | } 181 | } 182 | 183 | noisePoint.removeAll(deletePoints); 184 | } 185 | 186 | /** 187 | * 输出聚类结果 188 | */ 189 | private void printClusters() { 190 | int i = 1; 191 | for (ArrayList pList 
: resultClusters) { 192 | System.out.print("聚簇" + (i++) + ":"); 193 | for (Point p : pList) { 194 | System.out.print(MessageFormat.format("({0},{1}) ", p.x, p.y)); 195 | } 196 | System.out.println(); 197 | } 198 | 199 | System.out.println(); 200 | System.out.print("噪声数据:"); 201 | for (Point p : noisePoint) { 202 | System.out.print(MessageFormat.format("({0},{1}) ", p.x, p.y)); 203 | } 204 | System.out.println(); 205 | } 206 | } 207 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/others/dbscan/DBSCANExample.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.others.dbscan; 2 | 3 | /** 4 | * Dbscan基于密度的聚类算法测试类 5 | */ 6 | public class DBSCANExample { 7 | 8 | public static void main(String[] args) { 9 | String filePath = "data/dbscan/input.txt"; 10 | //簇扫描半径 11 | double eps = 3; 12 | //最小包含点数阈值 13 | int minPts = 3; 14 | 15 | DBSCANCore tool = new DBSCANCore(filePath, eps, minPts); 16 | tool.dbScanCluster(); 17 | } 18 | 19 | } 20 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/others/dbscan/Point.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.others.dbscan; 2 | 3 | /** 4 | * 坐标点类 5 | */ 6 | public class Point { 7 | 8 | // 坐标点横坐标 9 | int x; 10 | // 坐标点纵坐标 11 | int y; 12 | // 此节点是否已经被访问过 13 | boolean isVisited; 14 | 15 | public Point(String x, String y) { 16 | this.x = (Integer.parseInt(x)); 17 | this.y = (Integer.parseInt(y)); 18 | this.isVisited = false; 19 | } 20 | 21 | /** 22 | * 计算当前点与制定点之间的欧式距离 23 | * 24 | * @param p 25 | * 待计算聚类的p点 26 | * @return 27 | */ 28 | public double ouDistance(Point p) { 29 | double distance = 0; 30 | 31 | distance = (this.x - p.x) * (this.x - p.x) + (this.y - p.y) * (this.y - p.y); 32 | distance = Math.sqrt(distance); 33 | 34 | return distance; 35 | } 36 | 37 | 
/** 38 | * 判断2个坐标点是否为用个坐标点 39 | * 40 | * @param p 41 | * 待比较坐标点 42 | * @return 43 | */ 44 | public boolean isTheSame(Point p) { 45 | boolean isSamed = false; 46 | 47 | if (this.x == p.x && this.y == p.y) { 48 | isSamed = true; 49 | } 50 | 51 | return isSamed; 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/others/ga/GAExample.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.others.ga; 2 | 3 | /** 4 | * Genetic遗传算法测试类 5 | */ 6 | public class GAExample { 7 | 8 | public static void main(String[] args) { 9 | //变量最小值和最大值 10 | int minNum = 1; 11 | int maxNum = 7; 12 | //初始群体规模 13 | int initSetsNum = 4; 14 | 15 | GACore tool = new GACore(minNum, maxNum, initSetsNum); 16 | tool.geneticCal(); 17 | } 18 | 19 | } 20 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/others/ga/maze/GAMazeExample.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.others.ga.maze; 2 | 3 | /** 4 | * 遗传算法在走迷宫游戏的应用 5 | */ 6 | public class GAMazeExample { 7 | 8 | public static void main(String[] args) { 9 | //迷宫地图文件数据地址 10 | String filePath = "data/maze/mapData.txt"; 11 | //初始个体数量 12 | int initSetsNum = 10; 13 | 14 | GAMazeCore tool = new GAMazeCore(filePath, initSetsNum); 15 | tool.goOutMaze(); 16 | } 17 | 18 | } 19 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/others/kdtree/KDTreeExample.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.others.kdtree; 2 | 3 | import java.text.MessageFormat; 4 | 5 | /** 6 | * KD树算法测试类 7 | */ 8 | public class KDTreeExample { 9 | 10 | public static void main(String[] args) { 11 | String filePath = "data/kdtree/input.txt"; 12 | Point 
queryNode; 13 | Point searchedNode; 14 | KDTreeCore tool = new KDTreeCore(filePath); 15 | 16 | // 进行KD树的构建 17 | tool.createKDTree(); 18 | 19 | // 通过KD树进行数据点的最近点查询 20 | queryNode = new Point(2.1, 3.1); 21 | searchedNode = tool.searchNearestData(queryNode); 22 | System.out.println(MessageFormat.format("距离查询点({0}, {1})最近的坐标点为({2}, {3})", queryNode.x, queryNode.y, 23 | searchedNode.x, searchedNode.y)); 24 | 25 | //重新构造KD树,去除之前的访问记录 26 | tool.createKDTree(); 27 | queryNode = new Point(2, 4.5); 28 | searchedNode = tool.searchNearestData(queryNode); 29 | System.out.println(MessageFormat.format("距离查询点({0}, {1})最近的坐标点为({2}, {3})", queryNode.x, queryNode.y, 30 | searchedNode.x, searchedNode.y)); 31 | } 32 | 33 | } 34 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/others/kdtree/Point.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.others.kdtree; 2 | 3 | /** 4 | * 坐标点类 5 | */ 6 | public class Point { 7 | 8 | // 坐标点横坐标 9 | Double x; 10 | // 坐标点纵坐标 11 | Double y; 12 | 13 | public Point(double x, double y) { 14 | this.x = x; 15 | this.y = y; 16 | } 17 | 18 | public Point(String x, String y) { 19 | this.x = (Double.parseDouble(x)); 20 | this.y = (Double.parseDouble(y)); 21 | } 22 | 23 | /** 24 | * 计算当前点与制定点之间的欧式距离 25 | * 26 | * @param p 27 | * 待计算聚类的p点 28 | * @return 29 | */ 30 | public double ouDistance(Point p) { 31 | double distance = 0; 32 | 33 | distance = (this.x - p.x) * (this.x - p.x) + (this.y - p.y) * (this.y - p.y); 34 | distance = Math.sqrt(distance); 35 | 36 | return distance; 37 | } 38 | 39 | /** 40 | * 判断2个坐标点是否为用个坐标点 41 | * 42 | * @param p 43 | * 待比较坐标点 44 | * @return 45 | */ 46 | public boolean isTheSame(Point p) { 47 | boolean isSamed = false; 48 | 49 | if (this.x == p.x && this.y == p.y) { 50 | isSamed = true; 51 | } 52 | 53 | return isSamed; 54 | } 55 | } 56 | 
-------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/others/kdtree/Range.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.others.kdtree; 2 | 3 | /** 4 | * 空间矢量,表示所代表的空间范围 5 | */ 6 | public class Range { 7 | 8 | // 边界左边界 9 | double left; 10 | // 边界右边界 11 | double right; 12 | // 边界上边界 13 | double top; 14 | // 边界下边界 15 | double bottom; 16 | 17 | public Range() { 18 | this.left = -Integer.MAX_VALUE; 19 | this.right = Integer.MAX_VALUE; 20 | this.top = Integer.MAX_VALUE; 21 | this.bottom = -Integer.MAX_VALUE; 22 | } 23 | 24 | public Range(int left, int right, int top, int bottom) { 25 | this.left = left; 26 | this.right = right; 27 | this.top = top; 28 | this.bottom = bottom; 29 | } 30 | 31 | /** 32 | * 空间矢量进行并操作 33 | * 34 | * @param range 35 | * @return 36 | */ 37 | public Range crossOperation(Range r) { 38 | Range range = new Range(); 39 | 40 | // 取靠近右侧的左边界 41 | if (r.left > this.left) { 42 | range.left = r.left; 43 | } else { 44 | range.left = this.left; 45 | } 46 | 47 | // 取靠近左侧的右边界 48 | if (r.right < this.right) { 49 | range.right = r.right; 50 | } else { 51 | range.right = this.right; 52 | } 53 | 54 | // 取靠近下侧的上边界 55 | if (r.top < this.top) { 56 | range.top = r.top; 57 | } else { 58 | range.top = this.top; 59 | } 60 | 61 | // 取靠近上侧的下边界 62 | if (r.bottom > this.bottom) { 63 | range.bottom = r.bottom; 64 | } else { 65 | range.bottom = this.bottom; 66 | } 67 | 68 | return range; 69 | } 70 | 71 | /** 72 | * 根据坐标点分割方向确定左侧空间矢量 73 | * 74 | * @param p 75 | * 数据矢量 76 | * @param dir 77 | * 分割方向 78 | * @return 79 | */ 80 | public static Range initLeftRange(Point p, int dir) { 81 | Range range = new Range(); 82 | 83 | if (dir == KDTreeCore.DIRECTION_X) { 84 | range.right = p.x; 85 | } else { 86 | range.bottom = p.y; 87 | } 88 | 89 | return range; 90 | } 91 | 92 | /** 93 | * 根据坐标点分割方向确定右侧空间矢量 94 | * 95 | * @param p 96 | * 数据矢量 97 | * @param dir 
98 | * 分割方向 99 | * @return 100 | */ 101 | public static Range initRightRange(Point p, int dir) { 102 | Range range = new Range(); 103 | 104 | if (dir == KDTreeCore.DIRECTION_X) { 105 | range.left = p.x; 106 | } else { 107 | range.top = p.y; 108 | } 109 | 110 | return range; 111 | } 112 | } 113 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/others/kdtree/TreeNode.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.others.kdtree; 2 | 3 | /** 4 | * KD树节点 5 | */ 6 | public class TreeNode { 7 | 8 | //数据矢量 9 | Point nodeData; 10 | //分割平面的分割线 11 | int spilt; 12 | //空间矢量,该节点所表示的空间范围 13 | Range range; 14 | //父节点 15 | TreeNode parentNode; 16 | //位于分割超平面左侧的孩子节点 17 | TreeNode leftNode; 18 | //位于分割超平面右侧的孩子节点 19 | TreeNode rightNode; 20 | //节点是否被访问过,用于回溯时使用 21 | boolean isVisited; 22 | 23 | public TreeNode() { 24 | this.isVisited = false; 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/others/msapriori/FrequentItem.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.others.msapriori; 2 | 3 | /** 4 | * 频繁项集 5 | */ 6 | public class FrequentItem implements Comparable { 7 | 8 | // 频繁项集的集合ID 9 | private String[] idArray; 10 | // 频繁项集的支持度计数 11 | private int count; 12 | //频繁项集的长度,1项集或是2项集,亦或是3项集 13 | private int length; 14 | 15 | public FrequentItem(String[] idArray, int count) { 16 | this.idArray = idArray; 17 | this.count = count; 18 | length = idArray.length; 19 | } 20 | 21 | public String[] getIdArray() { 22 | return idArray; 23 | } 24 | 25 | public void setIdArray(String[] idArray) { 26 | this.idArray = idArray; 27 | } 28 | 29 | public int getCount() { 30 | return count; 31 | } 32 | 33 | public void setCount(int count) { 34 | this.count = count; 35 | } 36 | 37 | public int getLength() 
{ 38 | return length; 39 | } 40 | 41 | public void setLength(int length) { 42 | this.length = length; 43 | } 44 | 45 | @Override 46 | public int compareTo(FrequentItem o) { 47 | // TODO Auto-generated method stub 48 | Integer int1 = Integer.parseInt(this.getIdArray()[0]); 49 | Integer int2 = Integer.parseInt(o.getIdArray()[0]); 50 | 51 | return int1.compareTo(int2); 52 | } 53 | 54 | } 55 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/others/msapriori/MSAprioriExample.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.others.msapriori; 2 | 3 | /** 4 | * 基于多支持度的Apriori算法测试类 5 | */ 6 | public class MSAprioriExample { 7 | 8 | public static void main(String[] args) { 9 | //是否是事务型数据 10 | boolean isTransaction; 11 | //测试数据文件地址 12 | String filePath = "data/msapriori/testInput.txt"; 13 | //关系表型数据文件地址 14 | String tableFilePath = "data/msapriori/testInput2.txt"; 15 | //最小支持度阈值 16 | double minSup; 17 | // 最小置信度率 18 | double minConf; 19 | //最大支持度差别阈值 20 | double delta; 21 | //多项目的最小支持度数,括号中的下标代表的是商品的ID 22 | double[] mis; 23 | //msApriori算法工具类 24 | MSAprioriCore tool; 25 | 26 | //为了测试的方便,取一个偏低的置信度值0.3 27 | minConf = 0.3; 28 | minSup = 0.1; 29 | delta = 0.5; 30 | //每项的支持度率都默认为0.1,第一项不使用 31 | mis = new double[] { -1, 0.1, 0.1, 0.1, 0.1, 0.1 }; 32 | isTransaction = true; 33 | 34 | isTransaction = true; 35 | tool = new MSAprioriCore(filePath, minConf, delta, mis, isTransaction); 36 | tool.calFItems(); 37 | System.out.println(); 38 | 39 | isTransaction = false; 40 | //重新初始化数据 41 | tool = new MSAprioriCore(tableFilePath, minConf, minSup, isTransaction); 42 | tool.calFItems(); 43 | } 44 | 45 | } 46 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/others/randomforest/DecisionTree.java: -------------------------------------------------------------------------------- 1 | package 
com.jusdt.datamining.others.randomforest; 2 | 3 | import java.util.ArrayList; 4 | import java.util.HashMap; 5 | import java.util.Map; 6 | 7 | /** 8 | * 决策树 9 | */ 10 | public class DecisionTree { 11 | 12 | // 树的根节点 13 | TreeNode rootNode; 14 | // 数据的属性列名称 15 | String[] featureNames; 16 | // 这棵树所包含的数据 17 | ArrayList datas; 18 | // 决策树构造的的工具类 19 | CARTCore tool; 20 | 21 | public DecisionTree(ArrayList datas) { 22 | this.datas = datas; 23 | this.featureNames = datas.get(0); 24 | 25 | tool = new CARTCore(datas); 26 | // 通过CART工具类进行决策树的构建,并返回树的根节点 27 | rootNode = tool.startBuildingTree(); 28 | } 29 | 30 | /** 31 | * 根据给定的数据特征描述进行类别的判断 32 | * 33 | * @param features 34 | * @return 35 | */ 36 | public String decideClassType(String features) { 37 | String classType = ""; 38 | // 查询属性组 39 | String[] queryFeatures; 40 | // 在本决策树中对应的查询的属性值描述 41 | ArrayList featureStrs; 42 | 43 | featureStrs = new ArrayList<>(); 44 | queryFeatures = features.split(","); 45 | 46 | String[] array; 47 | for (String name : featureNames) { 48 | for (String featureValue : queryFeatures) { 49 | array = featureValue.split("="); 50 | // 将对应的属性值加入到列表中 51 | if (array[0].equals(name)) { 52 | featureStrs.add(array); 53 | } 54 | } 55 | } 56 | 57 | // 开始从根据节点往下递归搜索 58 | classType = recusiveSearchClassType(rootNode, featureStrs); 59 | 60 | return classType; 61 | } 62 | 63 | /** 64 | * 递归搜索树,查询属性的分类类别 65 | * 66 | * @param node 67 | * 当前搜索到的节点 68 | * @param remainFeatures 69 | * 剩余未判断的属性 70 | * @return 71 | */ 72 | private String recusiveSearchClassType(TreeNode node, ArrayList remainFeatures) { 73 | String classType = null; 74 | 75 | // 如果节点包含了数据的id索引,说明已经分类到底了 76 | if (node.getDataIndex() != null && node.getDataIndex().size() > 0) { 77 | classType = judgeClassType(node.getDataIndex()); 78 | 79 | return classType; 80 | } 81 | 82 | // 取出剩余属性中的一个匹配属性作为当前的判断属性名称 83 | String[] currentFeature = null; 84 | for (String[] featureValue : remainFeatures) { 85 | if (node.getAttrName().equals(featureValue[0])) { 86 | 
currentFeature = featureValue; 87 | break; 88 | } 89 | } 90 | 91 | for (TreeNode childNode : node.getChildAttrNode()) { 92 | // 寻找子节点中属于此属性值的分支 93 | if (childNode.getParentAttrValue().equals(currentFeature[1])) { 94 | remainFeatures.remove(currentFeature); 95 | classType = recusiveSearchClassType(childNode, remainFeatures); 96 | 97 | // 如果找到了分类结果,则直接挑出循环 98 | break; 99 | } else { 100 | //进行第二种情况的判断加上!符号的情况 101 | String value = childNode.getParentAttrValue(); 102 | 103 | if (value.charAt(0) == '!') { 104 | //去掉第一个!字符 105 | value = value.substring(1, value.length()); 106 | 107 | if (!value.equals(currentFeature[1])) { 108 | remainFeatures.remove(currentFeature); 109 | classType = recusiveSearchClassType(childNode, remainFeatures); 110 | 111 | break; 112 | } 113 | } 114 | } 115 | } 116 | 117 | return classType; 118 | } 119 | 120 | /** 121 | * 根据得到的数据行分类进行类别的决策 122 | * 123 | * @param dataIndex 124 | * 根据分类的数据索引号 125 | * @return 126 | */ 127 | public String judgeClassType(ArrayList dataIndex) { 128 | // 结果类型值 129 | String resultClassType = ""; 130 | String classType = ""; 131 | int count = 0; 132 | int temp = 0; 133 | Map type2Num = new HashMap(); 134 | 135 | for (String index : dataIndex) { 136 | temp = Integer.parseInt(index); 137 | // 取最后一列的决策类别数据 138 | classType = datas.get(temp)[featureNames.length - 1]; 139 | 140 | if (type2Num.containsKey(classType)) { 141 | // 如果类别已经存在,则使其计数加1 142 | count = type2Num.get(classType); 143 | count++; 144 | } else { 145 | count = 1; 146 | } 147 | 148 | type2Num.put(classType, count); 149 | } 150 | 151 | // 选出其中类别支持计数最多的一个类别值 152 | count = -1; 153 | for (Map.Entry entry : type2Num.entrySet()) { 154 | if ((int) entry.getValue() > count) { 155 | count = (int) entry.getValue(); 156 | resultClassType = (String) entry.getKey(); 157 | } 158 | } 159 | 160 | return resultClassType; 161 | } 162 | } 163 | -------------------------------------------------------------------------------- 
/src/main/java/com/jusdt/datamining/others/randomforest/RandomForestCore.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.others.randomforest; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileReader; 6 | import java.io.IOException; 7 | import java.util.ArrayList; 8 | import java.util.HashMap; 9 | import java.util.Map; 10 | import java.util.Random; 11 | 12 | /** 13 | * 随机森林算法工具类 14 | */ 15 | public class RandomForestCore { 16 | 17 | // 测试数据文件地址 18 | private String filePath; 19 | // 决策树的样本占总数的占比率 20 | private double sampleNumRatio; 21 | // 样本数据的采集特征数量占总特征的比例 22 | private double featureNumRatio; 23 | // 决策树的采样样本数 24 | private int sampleNum; 25 | // 样本数据的采集采样特征数 26 | private int featureNum; 27 | // 随机森林中的决策树的数目,等于总的数据数/用于构造每棵树的数据的数量 28 | private int treeNum; 29 | // 随机数产生器 30 | private Random random; 31 | // 样本数据列属性名称行 32 | private String[] featureNames; 33 | // 原始的总的数据 34 | private ArrayList totalDatas; 35 | // 决策树森林 36 | private ArrayList decisionForest; 37 | 38 | public RandomForestCore(String filePath, double sampleNumRatio, double featureNumRatio) { 39 | this.filePath = filePath; 40 | this.sampleNumRatio = sampleNumRatio; 41 | this.featureNumRatio = featureNumRatio; 42 | 43 | readDataFile(); 44 | } 45 | 46 | /** 47 | * 从文件中读取数据 48 | */ 49 | private void readDataFile() { 50 | File file = new File(filePath); 51 | ArrayList dataArray = new ArrayList(); 52 | 53 | try { 54 | BufferedReader in = new BufferedReader(new FileReader(file)); 55 | String str; 56 | String[] tempArray; 57 | while ((str = in.readLine()) != null) { 58 | tempArray = str.split(" "); 59 | dataArray.add(tempArray); 60 | } 61 | in.close(); 62 | } catch (IOException e) { 63 | e.getStackTrace(); 64 | } 65 | 66 | totalDatas = dataArray; 67 | featureNames = totalDatas.get(0); 68 | sampleNum = (int) ((totalDatas.size() - 1) * sampleNumRatio); 69 | //算属性数量的时候需要去掉id属性和决策属性,用条件属性计算 70 | featureNum = 
(int) ((featureNames.length - 2) * featureNumRatio); 71 | // 算数量的时候需要去掉首行属性名称行 72 | treeNum = (totalDatas.size() - 1) / sampleNum; 73 | } 74 | 75 | /** 76 | * 产生决策树 77 | */ 78 | private DecisionTree produceDecisionTree() { 79 | int temp = 0; 80 | DecisionTree tree; 81 | String[] tempData; 82 | //采样数据的随机行号组 83 | ArrayList sampleRandomNum; 84 | //采样属性特征的随机列号组 85 | ArrayList featureRandomNum; 86 | ArrayList datas; 87 | 88 | sampleRandomNum = new ArrayList<>(); 89 | featureRandomNum = new ArrayList<>(); 90 | datas = new ArrayList<>(); 91 | 92 | for (int i = 0; i < sampleNum;) { 93 | temp = random.nextInt(totalDatas.size()); 94 | 95 | //如果是行首属性名称行,则跳过 96 | if (temp == 0) { 97 | continue; 98 | } 99 | 100 | if (!sampleRandomNum.contains(temp)) { 101 | sampleRandomNum.add(temp); 102 | i++; 103 | } 104 | } 105 | 106 | for (int i = 0; i < featureNum;) { 107 | temp = random.nextInt(featureNames.length); 108 | 109 | //如果是第一列的数据id号或者是决策属性列,则跳过 110 | if (temp == 0 || temp == featureNames.length - 1) { 111 | continue; 112 | } 113 | 114 | if (!featureRandomNum.contains(temp)) { 115 | featureRandomNum.add(temp); 116 | i++; 117 | } 118 | } 119 | 120 | String[] singleRecord; 121 | String[] headCulumn = null; 122 | // 获取随机数据行 123 | for (int dataIndex : sampleRandomNum) { 124 | singleRecord = totalDatas.get(dataIndex); 125 | 126 | //每行的列数=所选的特征数+id号 127 | tempData = new String[featureNum + 2]; 128 | headCulumn = new String[featureNum + 2]; 129 | 130 | for (int i = 0, k = 1; i < featureRandomNum.size(); i++, k++) { 131 | temp = featureRandomNum.get(i); 132 | 133 | headCulumn[k] = featureNames[temp]; 134 | tempData[k] = singleRecord[temp]; 135 | } 136 | 137 | //加上id列的信息 138 | headCulumn[0] = featureNames[0]; 139 | //加上决策分类列的信息 140 | headCulumn[featureNum + 1] = featureNames[featureNames.length - 1]; 141 | tempData[featureNum + 1] = singleRecord[featureNames.length - 1]; 142 | 143 | //加入此行数据 144 | datas.add(tempData); 145 | } 146 | 147 | //加入行首列出现名称 148 | datas.add(0, headCulumn); 149 | 
//对筛选出的数据重新做id分配 150 | temp = 0; 151 | for (String[] array : datas) { 152 | //从第2行开始赋值 153 | if (temp > 0) { 154 | array[0] = temp + ""; 155 | } 156 | 157 | temp++; 158 | } 159 | 160 | tree = new DecisionTree(datas); 161 | 162 | return tree; 163 | } 164 | 165 | /** 166 | * 构造随机森林 167 | */ 168 | public void constructRandomTree() { 169 | DecisionTree tree; 170 | random = new Random(); 171 | decisionForest = new ArrayList<>(); 172 | 173 | System.out.println("下面是随机森林中的决策树:"); 174 | // 构造决策树加入森林中 175 | for (int i = 0; i < treeNum; i++) { 176 | System.out.println("\n决策树" + (i + 1)); 177 | tree = produceDecisionTree(); 178 | decisionForest.add(tree); 179 | } 180 | } 181 | 182 | /** 183 | * 根据给定的属性条件进行类别的决策 184 | * 185 | * @param features 186 | * 给定的已知的属性描述 187 | * @return 188 | */ 189 | public String judgeClassType(String features) { 190 | // 结果类型值 191 | String resultClassType = ""; 192 | String classType = ""; 193 | int count = 0; 194 | Map type2Num = new HashMap(); 195 | 196 | for (DecisionTree tree : decisionForest) { 197 | classType = tree.decideClassType(features); 198 | if (type2Num.containsKey(classType)) { 199 | // 如果类别已经存在,则使其计数加1 200 | count = type2Num.get(classType); 201 | count++; 202 | } else { 203 | count = 1; 204 | } 205 | 206 | type2Num.put(classType, count); 207 | } 208 | 209 | // 选出其中类别支持计数最多的一个类别值 210 | count = -1; 211 | for (Map.Entry entry : type2Num.entrySet()) { 212 | if ((int) entry.getValue() > count) { 213 | count = (int) entry.getValue(); 214 | resultClassType = (String) entry.getKey(); 215 | } 216 | } 217 | 218 | return resultClassType; 219 | } 220 | } 221 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/others/randomforest/RandomForestExample.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.others.randomforest; 2 | 3 | import java.text.MessageFormat; 4 | 5 | /** 6 | * 随机森林算法测试场景 7 | */ 8 | public class 
RandomForestExample { 9 | 10 | public static void main(String[] args) { 11 | String filePath = "data/randomforest/input.txt"; 12 | String queryStr = "Age=Youth,Income=Low,Student=No,CreditRating=Fair"; 13 | String resultClassType = ""; 14 | // 决策树的样本占总数的占比率 15 | double sampleNumRatio = 0.4; 16 | // 样本数据的采集特征数量占总特征的比例 17 | double featureNumRatio = 0.5; 18 | 19 | RandomForestCore tool = new RandomForestCore(filePath, sampleNumRatio, featureNumRatio); 20 | tool.constructRandomTree(); 21 | 22 | resultClassType = tool.judgeClassType(queryStr); 23 | 24 | System.out.println(); 25 | System.out.println(MessageFormat.format("查询属性描述{0},预测的分类结果为BuysCompute:{1}", queryStr, resultClassType)); 26 | } 27 | 28 | } 29 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/others/randomforest/TreeNode.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.others.randomforest; 2 | 3 | import java.util.ArrayList; 4 | 5 | /** 6 | * 回归分类树节点 7 | */ 8 | public class TreeNode { 9 | 10 | // 节点属性名字 11 | private String attrName; 12 | // 节点索引标号 13 | private int nodeIndex; 14 | //包含的叶子节点数 15 | private int leafNum; 16 | // 节点误差率 17 | private double alpha; 18 | // 父亲分类属性值 19 | private String parentAttrValue; 20 | // 孩子节点 21 | private TreeNode[] childAttrNode; 22 | // 数据记录索引 23 | private ArrayList dataIndex; 24 | 25 | public String getAttrName() { 26 | return attrName; 27 | } 28 | 29 | public void setAttrName(String attrName) { 30 | this.attrName = attrName; 31 | } 32 | 33 | public int getNodeIndex() { 34 | return nodeIndex; 35 | } 36 | 37 | public void setNodeIndex(int nodeIndex) { 38 | this.nodeIndex = nodeIndex; 39 | } 40 | 41 | public double getAlpha() { 42 | return alpha; 43 | } 44 | 45 | public void setAlpha(double alpha) { 46 | this.alpha = alpha; 47 | } 48 | 49 | public String getParentAttrValue() { 50 | return parentAttrValue; 51 | } 52 | 53 | public void 
package com.jusdt.datamining.others.tan;

import java.util.ArrayList;

/**
 * Bayesian network node.
 */
public class Node {

	// unique node id, used later to fix the direction of the links
	int id;
	// attribute name carried by this node
	String name;
	// nodes this node is connected with
	ArrayList<Node> connectedNodes;

	public Node(int id, String name) {
		this.id = id;
		this.name = name;

		// start with an empty adjacency list
		this.connectedNodes = new ArrayList<>();
	}

	/**
	 * Connect this node with the given node. The link is recorded on both
	 * sides: in this node's list and in the target node's list.
	 *
	 * @param node the downstream node to link to
	 */
	public void connectNode(Node node) {
		// never link a node to itself
		if (node.id == this.id) {
			return;
		}

		this.connectedNodes.add(node);
		node.connectedNodes.add(this);
	}

	/**
	 * Whether this node and the given node are the same one, judged by id.
	 *
	 * @param node the node to compare with
	 * @return true when the ids match
	 */
	public boolean isEqual(Node node) {
		return this.id == node.id;
	}
}
-------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/others/viterbi/BaseNames.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.others.viterbi; 2 | 3 | /** 4 | * 基本变量定义类 5 | */ 6 | public class BaseNames { 7 | 8 | //日期天数下标 9 | public static final int DAY1 = 0; 10 | public static final int DAY2 = 1; 11 | public static final int DAY3 = 2; 12 | 13 | //天气属性类别 14 | public static final int WEATHER_SUNNY = 0; 15 | public static final int WEATHER_CLOUDY = 1; 16 | public static final int WEATHER_RAINY = 2; 17 | 18 | //湿度属性类别 19 | public static final int HUMIDITY_DRY = 0; 20 | public static final int HUMIDITY_DRYISH = 1; 21 | public static final int HUMIDITY_DAMP = 1; 22 | public static final int HUMIDITY_SOGGY = 1; 23 | 24 | } 25 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/others/viterbi/ViterbiCore.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.others.viterbi; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileReader; 6 | import java.io.IOException; 7 | import java.util.ArrayList; 8 | import java.util.HashMap; 9 | 10 | /** 11 | * 维特比算法工具类 12 | */ 13 | public class ViterbiCore { 14 | 15 | // 状态转移概率矩阵文件地址 16 | private String stmFilePath; 17 | // 混淆矩阵文件地址 18 | private String confusionFilePath; 19 | // 初始状态概率 20 | private double[] initStatePro; 21 | // 观察到的状态序列 22 | public String[] observeStates; 23 | // 状态转移矩阵值 24 | private double[][] stMatrix; 25 | // 混淆矩阵值 26 | private double[][] confusionMatrix; 27 | // 各个条件下的潜在特征概率值 28 | private double[][] potentialValues; 29 | // 潜在特征 30 | private ArrayList potentialAttrs; 31 | // 属性值列坐标映射图 32 | private HashMap name2Index; 33 | // 列坐标属性值映射图 34 | private HashMap index2name; 35 | 36 | public ViterbiCore(String stmFilePath, 
String confusionFilePath, double[] initStatePro, String[] observeStates) { 37 | this.stmFilePath = stmFilePath; 38 | this.confusionFilePath = confusionFilePath; 39 | this.initStatePro = initStatePro; 40 | this.observeStates = observeStates; 41 | 42 | initOperation(); 43 | } 44 | 45 | /** 46 | * 初始化数据操作 47 | */ 48 | private void initOperation() { 49 | double[] temp; 50 | int index; 51 | ArrayList smtDatas; 52 | ArrayList cfDatas; 53 | 54 | smtDatas = readDataFile(stmFilePath); 55 | cfDatas = readDataFile(confusionFilePath); 56 | 57 | index = 0; 58 | this.stMatrix = new double[smtDatas.size()][]; 59 | for (String[] array : smtDatas) { 60 | temp = new double[array.length]; 61 | for (int i = 0; i < array.length; i++) { 62 | try { 63 | temp[i] = Double.parseDouble(array[i]); 64 | } catch (NumberFormatException e) { 65 | temp[i] = -1; 66 | } 67 | } 68 | 69 | // 将转换后的值赋给数组中 70 | this.stMatrix[index] = temp; 71 | index++; 72 | } 73 | 74 | index = 0; 75 | this.confusionMatrix = new double[cfDatas.size()][]; 76 | for (String[] array : cfDatas) { 77 | temp = new double[array.length]; 78 | for (int i = 0; i < array.length; i++) { 79 | try { 80 | temp[i] = Double.parseDouble(array[i]); 81 | } catch (NumberFormatException e) { 82 | temp[i] = -1; 83 | } 84 | } 85 | 86 | // 将转换后的值赋给数组中 87 | this.confusionMatrix[index] = temp; 88 | index++; 89 | } 90 | 91 | this.potentialAttrs = new ArrayList<>(); 92 | // 添加潜在特征属性 93 | for (String s : smtDatas.get(0)) { 94 | this.potentialAttrs.add(s); 95 | } 96 | // 去除首列无效列 97 | potentialAttrs.remove(0); 98 | 99 | this.name2Index = new HashMap<>(); 100 | this.index2name = new HashMap<>(); 101 | 102 | // 添加名称下标映射关系 103 | for (int i = 1; i < smtDatas.get(0).length; i++) { 104 | this.name2Index.put(smtDatas.get(0)[i], i); 105 | // 添加下标到名称的映射 106 | this.index2name.put(i, smtDatas.get(0)[i]); 107 | } 108 | 109 | for (int i = 1; i < cfDatas.get(0).length; i++) { 110 | this.name2Index.put(cfDatas.get(0)[i], i); 111 | } 112 | } 113 | 114 | /** 115 | * 
从文件中读取数据 116 | */ 117 | private ArrayList readDataFile(String filePath) { 118 | File file = new File(filePath); 119 | ArrayList dataArray = new ArrayList(); 120 | 121 | try { 122 | BufferedReader in = new BufferedReader(new FileReader(file)); 123 | String str; 124 | String[] tempArray; 125 | while ((str = in.readLine()) != null) { 126 | tempArray = str.split(" "); 127 | dataArray.add(tempArray); 128 | } 129 | in.close(); 130 | } catch (IOException e) { 131 | e.getStackTrace(); 132 | } 133 | 134 | return dataArray; 135 | } 136 | 137 | /** 138 | * 根据观察特征计算隐藏的特征概率矩阵 139 | */ 140 | private void calPotencialProMatrix() { 141 | String curObserveState; 142 | // 观察特征和潜在特征的下标 143 | int osIndex; 144 | int psIndex; 145 | double temp; 146 | double maxPro; 147 | // 混淆矩阵概率值,就是相关影响的因素概率 148 | double confusionPro; 149 | 150 | this.potentialValues = new double[observeStates.length][potentialAttrs.size() + 1]; 151 | for (int i = 0; i < this.observeStates.length; i++) { 152 | curObserveState = this.observeStates[i]; 153 | osIndex = this.name2Index.get(curObserveState); 154 | maxPro = -1; 155 | 156 | // 因为是第一个观察特征,没有前面的影响,根据初始状态计算 157 | if (i == 0) { 158 | for (String attr : this.potentialAttrs) { 159 | psIndex = this.name2Index.get(attr); 160 | confusionPro = this.confusionMatrix[psIndex][osIndex]; 161 | 162 | temp = this.initStatePro[psIndex - 1] * confusionPro; 163 | this.potentialValues[BaseNames.DAY1][psIndex] = temp; 164 | } 165 | } else { 166 | // 后面的潜在特征受前一个特征的影响,以及当前的混淆因素影响 167 | for (String toDayAttr : this.potentialAttrs) { 168 | psIndex = this.name2Index.get(toDayAttr); 169 | confusionPro = this.confusionMatrix[psIndex][osIndex]; 170 | 171 | int index; 172 | maxPro = -1; 173 | // 通过昨天的概率计算今天此特征的最大概率 174 | for (String yAttr : this.potentialAttrs) { 175 | index = this.name2Index.get(yAttr); 176 | temp = this.potentialValues[i - 1][index] * this.stMatrix[index][psIndex]; 177 | 178 | // 计算得到今天此潜在特征的最大概率 179 | if (temp > maxPro) { 180 | maxPro = temp; 181 | } 182 | } 183 | 184 
| this.potentialValues[i][psIndex] = maxPro * confusionPro; 185 | } 186 | } 187 | } 188 | } 189 | 190 | /** 191 | * 根据同时期最大概率值输出潜在特征值 192 | */ 193 | private void outputResultAttr() { 194 | double maxPro; 195 | int maxIndex; 196 | ArrayList psValues; 197 | 198 | psValues = new ArrayList<>(); 199 | for (int i = 0; i < this.potentialValues.length; i++) { 200 | maxPro = -1; 201 | maxIndex = 0; 202 | 203 | for (int j = 0; j < potentialValues[i].length; j++) { 204 | if (this.potentialValues[i][j] > maxPro) { 205 | maxPro = potentialValues[i][j]; 206 | maxIndex = j; 207 | } 208 | } 209 | 210 | // 取出最大概率下标对应的潜在特征 211 | psValues.add(this.index2name.get(maxIndex)); 212 | } 213 | 214 | System.out.println("观察特征为:"); 215 | for (String s : this.observeStates) { 216 | System.out.print(s + ", "); 217 | } 218 | System.out.println(); 219 | 220 | System.out.println("潜在特征为:"); 221 | for (String s : psValues) { 222 | System.out.print(s + ", "); 223 | } 224 | System.out.println(); 225 | } 226 | 227 | /** 228 | * 根据观察属性,得到潜在属性信息 229 | */ 230 | public void calHMMObserve() { 231 | calPotencialProMatrix(); 232 | outputResultAttr(); 233 | } 234 | } 235 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/others/viterbi/ViterbiExample.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.others.viterbi; 2 | 3 | /** 4 | * 维特比算法 5 | */ 6 | public class ViterbiExample { 7 | 8 | public static void main(String[] args) { 9 | // 状态转移概率矩阵路径 10 | String stmFilePath; 11 | // 混淆矩阵路径 12 | String cfFilePath; 13 | // 观察到的状态 14 | String[] observeStates; 15 | // 初始状态 16 | double[] initStatePro; 17 | ViterbiCore tool; 18 | 19 | stmFilePath = "data/viterbi/stmatrix.txt"; 20 | cfFilePath = "data/viterbi/humidity-matrix.txt"; 21 | 22 | initStatePro = new double[] { 0.63, 0.17, 0.20 }; 23 | observeStates = new String[] { "Dry", "Damp", "Soggy" }; 24 | 25 | tool = new 
package com.jusdt.datamining.roughsets;

import java.util.ArrayList;

/**
 * The knowledge system: the family of elementary record collections used to
 * compute rough-set upper and lower approximations of a target collection.
 */
public class KnowledgeSystem {

	// the elementary collections making up the knowledge system
	ArrayList<RecordCollection> ksCollections;

	public KnowledgeSystem(ArrayList<RecordCollection> ksCollections) {
		this.ksCollections = ksCollections;
	}

	/**
	 * Upper approximation of the given collection: the union of elementary
	 * collections covering it. Collections fully contained in the target are
	 * absorbed first; if target records remain uncovered, any remaining
	 * collection that merely overlaps the target is absorbed as well.
	 *
	 * @param rc the original collection
	 * @return the upper approximation, or null when nothing matches
	 */
	public RecordCollection getUpSimilarRC(RecordCollection rc) {
		RecordCollection resultRc = null;
		ArrayList<String> nameArray;
		// record names of the target still waiting to be covered
		ArrayList<String> targetArray = rc.getRecordNames();
		ArrayList<RecordCollection> copyRcs = new ArrayList<>(ksCollections);
		ArrayList<RecordCollection> deleteRcs = new ArrayList<>();

		// pass 1: absorb every collection fully contained in the target
		for (RecordCollection recordCollection : copyRcs) {
			nameArray = recordCollection.getRecordNames();

			if (strIsContained(targetArray, nameArray)) {
				removeOverLaped(targetArray, nameArray);
				deleteRcs.add(recordCollection);

				if (resultRc == null) {
					resultRc = recordCollection;
				} else {
					// union with the running result
					resultRc = resultRc.unionCal(recordCollection);
				}

				if (targetArray.size() == 0) {
					break;
				}
			}
		}
		// drop the collections already absorbed
		copyRcs.removeAll(deleteRcs);

		if (targetArray.size() > 0) {
			// pass 2: some target records are still uncovered, so absorb any
			// remaining collection that overlaps the target at all
			for (RecordCollection recordCollection : copyRcs) {
				nameArray = recordCollection.getRecordNames();

				if (strHasOverlap(targetArray, nameArray)) {
					removeOverLaped(targetArray, nameArray);

					if (resultRc == null) {
						resultRc = recordCollection;
					} else {
						resultRc = resultRc.unionCal(recordCollection);
					}

					if (targetArray.size() == 0) {
						break;
					}
				}
			}
		}

		return resultRc;
	}

	/**
	 * Lower approximation of the given collection: the union of elementary
	 * collections fully contained in it.
	 *
	 * @param rc the original collection
	 * @return the lower approximation, or null when nothing is contained
	 */
	public RecordCollection getDownSimilarRC(RecordCollection rc) {
		RecordCollection resultRc = null;
		ArrayList<String> nameArray;
		ArrayList<String> targetArray = rc.getRecordNames();

		for (RecordCollection recordCollection : ksCollections) {
			nameArray = recordCollection.getRecordNames();

			if (strIsContained(targetArray, nameArray)) {
				removeOverLaped(targetArray, nameArray);

				if (resultRc == null) {
					resultRc = recordCollection;
				} else {
					// union with the running result
					resultRc = resultRc.unionCal(recordCollection);
				}

				if (targetArray.size() == 0) {
					break;
				}
			}
		}

		return resultRc;
	}

	/**
	 * Whether the two name lists share at least one element.
	 *
	 * @param str1 first name list
	 * @param str2 second name list
	 * @return true when any element appears in both lists
	 */
	public boolean strHasOverlap(ArrayList<String> str1, ArrayList<String> str2) {
		for (String s1 : str1) {
			if (str2.contains(s1)) {
				return true;
			}
		}

		return false;
	}

	/**
	 * Whether every element of str2 is contained in str1.
	 * An empty str2 is considered contained, as in the original counting
	 * implementation.
	 *
	 * @param str1 the candidate superset
	 * @param str2 the candidate subset
	 * @return true when str2 is fully contained in str1
	 */
	public boolean strIsContained(ArrayList<String> str1, ArrayList<String> str2) {
		for (String s : str2) {
			if (!str1.contains(s)) {
				return false;
			}
		}

		return true;
	}

	/**
	 * Remove from str1 every element that also appears in str2.
	 *
	 * @param str1 the list that is modified in place
	 * @param str2 the elements to remove
	 */
	public void removeOverLaped(ArrayList<String> str1, ArrayList<String> str2) {
		str1.removeAll(str2);
	}
}
package com.jusdt.datamining.roughsets;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;

/**
 * A collection of data records that share some common attribute values.
 */
public class RecordCollection {

	// attribute name -> value pairs shared by this collection
	private HashMap<String, String> attrValues;
	// the records in this collection
	private ArrayList<Record> recordList;

	public RecordCollection() {
		this.attrValues = new HashMap<>();
		this.recordList = new ArrayList<>();
	}

	public RecordCollection(HashMap<String, String> attrValues, ArrayList<Record> recordList) {
		this.attrValues = attrValues;
		this.recordList = recordList;
	}

	public ArrayList<Record> getRecord() {
		return this.recordList;
	}

	/**
	 * Names of all records in this collection, in list order.
	 *
	 * @return a fresh list of record names
	 */
	public ArrayList<String> getRecordNames() {
		ArrayList<String> names = new ArrayList<>();

		for (Record record : recordList) {
			names.add(record.getName());
		}

		return names;
	}

	/**
	 * Whether this collection carries a value for the given attribute name.
	 *
	 * @param attrName the attribute name to look up
	 * @return true when the attribute is present
	 */
	public boolean isContainedAttrName(String attrName) {
		return this.attrValues.containsKey(attrName);
	}

	/**
	 * Whether the two collections hold the same records, compared by name.
	 * NOTE(review): as in the original, this only checks that every record of
	 * THIS collection appears in rc, not the reverse — confirm callers rely
	 * on collections of equal size.
	 *
	 * @param rc the collection to compare with
	 * @return true when every record of this collection is found in rc
	 */
	public boolean isCollectionSame(RecordCollection rc) {
		boolean isSame = false;

		for (Record r : recordList) {
			isSame = false;

			for (Record r2 : rc.recordList) {
				if (r.isRecordSame(r2)) {
					isSame = true;
					break;
				}
			}

			// one missing record makes the collections unequal
			if (!isSame) {
				break;
			}
		}

		return isSame;
	}

	/**
	 * Intersection of two collections: the records present in both, with the
	 * attribute maps of both collections merged (rc's values win on clash).
	 *
	 * @param rc the other collection of the intersection
	 * @return the intersection collection, or null when nothing is shared
	 */
	public RecordCollection overlapCalculate(RecordCollection rc) {
		HashMap<String, String> resultAttrValues = new HashMap<>();
		ArrayList<Record> resultRecords = new ArrayList<>();

		// keep every record of this collection that also appears in rc
		for (Record record : this.recordList) {
			for (Record record2 : rc.recordList) {
				if (record.isRecordSame(record2)) {
					resultRecords.add(record);
					break;
				}
			}
		}

		// no shared record means no intersection
		if (resultRecords.size() == 0) {
			return null;
		}

		// merge the attribute maps of both collections
		resultAttrValues.putAll(this.attrValues);
		resultAttrValues.putAll(rc.attrValues);

		return new RecordCollection(resultAttrValues, resultRecords);
	}

	/**
	 * Union of two collections.
	 * NOTE(review): as in the original, the result keeps no attribute map
	 * and duplicate records are not removed.
	 *
	 * @param rc the collection to merge with
	 * @return a new collection holding the records of both
	 */
	public RecordCollection unionCal(RecordCollection rc) {
		ArrayList<Record> records = new ArrayList<>(this.recordList);
		records.addAll(rc.recordList);

		return new RecordCollection(null, records);
	}

	/**
	 * Print the record names contained in this collection.
	 */
	public void printRc() {
		System.out.print("{");
		for (Record r : this.getRecord()) {
			System.out.print(r.getName() + ", ");
		}
		System.out.println("}");
	}
}
package com.jusdt.datamining.sequential.patterns.gsp;

import java.util.ArrayList;

/**
 * An itemset inside a sequence.
 */
public class ItemSet {

	// the numeric items held by this itemset, in insertion order
	private ArrayList<Integer> items;

	public ItemSet(String[] itemStr) {
		items = new ArrayList<>();
		for (String s : itemStr) {
			items.add(Integer.parseInt(s));
		}
	}

	public ItemSet(int[] itemNum) {
		items = new ArrayList<>();
		for (int num : itemNum) {
			items.add(num);
		}
	}

	public ItemSet(ArrayList<Integer> itemNum) {
		this.items = itemNum;
	}

	public ArrayList<Integer> getItems() {
		return items;
	}

	public void setItems(ArrayList<Integer> items) {
		this.items = items;
	}

	/**
	 * Whether the two itemsets hold exactly the same items in the same order.
	 *
	 * @param itemSet the itemset to compare with
	 * @return true when both itemsets are equal element by element
	 */
	public boolean compareIsSame(ItemSet itemSet) {
		if (this.items.size() != itemSet.items.size()) {
			return false;
		}

		for (int i = 0; i < itemSet.items.size(); i++) {
			// BUGFIX: was compared with !=, which tests Integer object
			// identity and silently reports inequality for equal values
			// outside the JVM integer cache (beyond -128..127)
			if (!this.items.get(i).equals(itemSet.items.get(i))) {
				return false;
			}
		}

		return true;
	}

	/**
	 * Return a copy of the item list.
	 *
	 * @return a new list holding the same items
	 */
	public ArrayList<Integer> copyItems() {
		return new ArrayList<>(this.items);
	}
}
package com.jusdt.datamining.sequential.patterns.gsp;

import java.util.ArrayList;

/**
 * A sequence: the ordered ItemSet groups belonging to one transaction.
 */
public class Sequence implements Comparable<Sequence>, Cloneable {

	// id of the transaction this sequence belongs to
	private int trsanctionID;
	// the ordered itemsets of this sequence
	private ArrayList<ItemSet> itemSetList;

	public Sequence(int trsanctionID) {
		this.trsanctionID = trsanctionID;
		this.itemSetList = new ArrayList<>();
	}

	public Sequence() {
		this.itemSetList = new ArrayList<>();
	}

	public int getTrsanctionID() {
		return trsanctionID;
	}

	public void setTrsanctionID(int trsanctionID) {
		this.trsanctionID = trsanctionID;
	}

	public ArrayList<ItemSet> getItemSetList() {
		return itemSetList;
	}

	public void setItemSetList(ArrayList<ItemSet> itemSetList) {
		this.itemSetList = itemSetList;
	}

	/**
	 * First element of the first itemset in this sequence.
	 */
	public Integer getFirstItemSetNum() {
		return this.getItemSetList().get(0).getItems().get(0);
	}

	/**
	 * Last itemset of this sequence.
	 */
	public ItemSet getLastItemSet() {
		return getItemSetList().get(getItemSetList().size() - 1);
	}

	/**
	 * Last element of the last itemset in this sequence.
	 */
	public Integer getLastItemSetNum() {
		ItemSet lastItemSet = getLastItemSet();

		return lastItemSet.getItems().get(lastItemSet.getItems().size() - 1);
	}

	/**
	 * Whether the last itemset holds exactly one element.
	 */
	public boolean isLastItemSetSingleNum() {
		// was "size == 1 ? true : false"; the comparison already is a boolean
		return getLastItemSet().getItems().size() == 1;
	}

	@Override
	public int compareTo(Sequence o) {
		// order sequences by the first element of their first itemset
		return this.getFirstItemSetNum().compareTo(o.getFirstItemSetNum());
	}

	@Override
	protected Object clone() throws CloneNotSupportedException {
		return super.clone();
	}

	/**
	 * Deep copy of this sequence: the itemsets and their item lists are
	 * copied, so mutating the copy never touches the original.
	 */
	public Sequence copySeqence() {
		Sequence copySeq = new Sequence();
		for (ItemSet itemSet : this.itemSetList) {
			copySeq.getItemSetList().add(new ItemSet(itemSet.copyItems()));
		}

		return copySeq;
	}

	/**
	 * Whether the two sequences are equal: the same number of itemsets and
	 * every itemset equal pairwise, in order.
	 *
	 * @param seq the sequence to compare with
	 * @return true when both sequences match completely
	 */
	public boolean compareIsSame(Sequence seq) {
		ArrayList<ItemSet> itemSetList2 = seq.getItemSetList();

		if (itemSetList2.size() != this.itemSetList.size()) {
			return false;
		}

		for (int i = 0; i < itemSetList2.size(); i++) {
			if (!this.itemSetList.get(i).compareIsSame(itemSetList2.get(i))) {
				// one mismatching itemset is enough
				return false;
			}
		}

		return true;
	}

	/**
	 * Generate all direct child subsequences: every sequence obtained by
	 * dropping exactly one element (a single-element itemset is removed as
	 * a whole).
	 *
	 * @return the list of child sequences
	 */
	public ArrayList<Sequence> createChildSeqs() {
		ArrayList<Sequence> childSeqs = new ArrayList<>();

		for (int i = 0; i < this.itemSetList.size(); i++) {
			ItemSet tempItemSet = itemSetList.get(i);

			if (tempItemSet.getItems().size() == 1) {
				// a one-element itemset is dropped entirely
				Sequence tempSeq = this.copySeqence();
				tempSeq.itemSetList.remove(i);
				childSeqs.add(tempSeq);
			} else {
				// otherwise drop one element at a time from the copied itemset
				for (int j = 0; j < tempItemSet.getItems().size(); j++) {
					Sequence tempSeq = this.copySeqence();
					tempSeq.getItemSetList().get(i).getItems().remove(j);
					childSeqs.add(tempSeq);
				}
			}
		}

		return childSeqs;
	}

}
PrefixSpanCore(filePath, minSupportRate); 14 | tool.prefixSpanCalculate(); 15 | } 16 | 17 | } 18 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/statistical/learning/ann/ANNCore.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.statistical.learning.ann; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileReader; 6 | import java.util.ArrayList; 7 | import java.util.List; 8 | 9 | /** 10 | * SVM支持向量机工具类 11 | */ 12 | public class ANNCore { 13 | 14 | // 训练集数据文件路径 15 | private String trainDataPath; 16 | // svm_problem对象,用于构造svm model模型 17 | private ANNProblem sProblem; 18 | // svm参数,里面有svm支持向量机的类型和不同 的svm的核函数类型 19 | private ANNParameter sParam; 20 | 21 | public ANNCore(String trainDataPath) { 22 | this.trainDataPath = trainDataPath; 23 | 24 | // 初始化svm相关变量 25 | sProblem = initSvmProblem(); 26 | sParam = initSvmParam(); 27 | } 28 | 29 | /** 30 | * 初始化操作,根据训练集数据构造分类模型 31 | */ 32 | private void initOperation() { 33 | 34 | } 35 | 36 | /** 37 | * svm_problem对象,训练集数据的相关信息配置 38 | * 39 | * @return 40 | */ 41 | private ANNProblem initSvmProblem() { 42 | List label = new ArrayList(); 43 | List nodeSet = new ArrayList(); 44 | getData(nodeSet, label, trainDataPath); 45 | 46 | int dataRange = nodeSet.get(0).length; 47 | ANNNode[][] datas = new ANNNode[nodeSet.size()][dataRange]; // 训练集的向量表 48 | for (int i = 0; i < datas.length; i++) { 49 | for (int j = 0; j < dataRange; j++) { 50 | datas[i][j] = nodeSet.get(i)[j]; 51 | } 52 | } 53 | double[] lables = new double[label.size()]; // a,b 对应的lable 54 | for (int i = 0; i < lables.length; i++) { 55 | lables[i] = label.get(i); 56 | } 57 | 58 | // 定义svm_problem对象 59 | ANNProblem problem = new ANNProblem(); 60 | problem.l = nodeSet.size(); // 向量个数 61 | problem.x = datas; // 训练集向量表 62 | problem.y = lables; // 对应的lable数组 63 | 64 | return problem; 65 | } 66 | 67 | /** 68 | * 
初始化svm支持向量机的参数,包括svm的类型和核函数的类型 69 | * 70 | * @return 71 | */ 72 | private ANNParameter initSvmParam() { 73 | // 定义svm_parameter对象 74 | ANNParameter param = new ANNParameter(); 75 | param.svm_type = ANNParameter.EPSILON_SVR; 76 | // 设置svm的核函数类型为线型 77 | param.kernel_type = ANNParameter.LINEAR; 78 | // 后面的参数配置只针对训练集的数据 79 | param.cache_size = 100; 80 | param.eps = 0.00001; 81 | param.C = 1.9; 82 | 83 | return param; 84 | } 85 | 86 | /** 87 | * 通过svm方式预测数据的类型 88 | * 89 | * @param testDataPath 90 | */ 91 | public void svmPredictData(String testDataPath) { 92 | // 获取测试数据 93 | List testlabel = new ArrayList(); 94 | List testnodeSet = new ArrayList(); 95 | getData(testnodeSet, testlabel, testDataPath); 96 | int dataRange = testnodeSet.get(0).length; 97 | 98 | ANNNode[][] testdatas = new ANNNode[testnodeSet.size()][dataRange]; // 训练集的向量表 99 | for (int i = 0; i < testdatas.length; i++) { 100 | for (int j = 0; j < dataRange; j++) { 101 | testdatas[i][j] = testnodeSet.get(i)[j]; 102 | } 103 | } 104 | // 测试数据的真实值,在后面将会与svm的预测值做比较 105 | double[] testlables = new double[testlabel.size()]; // a,b 对应的lable 106 | for (int i = 0; i < testlables.length; i++) { 107 | testlables[i] = testlabel.get(i); 108 | } 109 | 110 | // 如果参数没有问题,则svm.svm_check_parameter()函数返回null,否则返回error描述。 111 | // 对svm的配置参数叫验证,因为有些参数只针对部分的支持向量机的类型 112 | System.out.println(ANN.ann_check_parameter(sProblem, sParam)); 113 | System.out.println("------------检验参数-----------"); 114 | // 训练SVM分类模型 115 | ANNModel model = ANN.ann_train(sProblem, sParam); 116 | 117 | // 预测测试数据的lable 118 | double err = 0.0; 119 | for (int i = 0; i < testdatas.length; i++) { 120 | double truevalue = testlables[i]; 121 | // 测试数据真实值 122 | System.out.print(truevalue + " "); 123 | double predictValue = ANN.ann_predict(model, testdatas[i]); 124 | // 测试数据预测值 125 | System.out.println(predictValue); 126 | } 127 | } 128 | 129 | /** 130 | * 从文件中获取数据 131 | * 132 | * @param nodeSet 133 | * 向量节点 134 | * @param label 135 | * 节点值类型值 136 | * @param filename 
137 | * 数据文件地址 138 | */ 139 | private void getData(List nodeSet, List label, String filename) { 140 | try { 141 | 142 | FileReader fr = new FileReader(new File(filename)); 143 | BufferedReader br = new BufferedReader(fr); 144 | String line = null; 145 | while ((line = br.readLine()) != null) { 146 | String[] datas = line.split(","); 147 | ANNNode[] vector = new ANNNode[datas.length - 1]; 148 | for (int i = 0; i < datas.length - 1; i++) { 149 | ANNNode node = new ANNNode(); 150 | node.index = i + 1; 151 | node.value = Double.parseDouble(datas[i]); 152 | vector[i] = node; 153 | } 154 | nodeSet.add(vector); 155 | double lablevalue = Double.parseDouble(datas[datas.length - 1]); 156 | label.add(lablevalue); 157 | } 158 | } catch (Exception e) { 159 | e.printStackTrace(); 160 | } 161 | 162 | } 163 | 164 | } 165 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/statistical/learning/ann/ANNExample.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.statistical.learning.ann; 2 | 3 | public class ANNExample { 4 | 5 | public static void main(String[] args) { 6 | // 训练集数据文件路径 7 | String trainDataPath = "data/ann/trainInput.txt"; 8 | // 测试数据文件路径 9 | String testDataPath = "data/ann/testInput.txt"; 10 | 11 | ANNCore tool = new ANNCore(trainDataPath); 12 | // 对测试数据进行ANN分类 13 | tool.svmPredictData(testDataPath); 14 | } 15 | 16 | } 17 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/statistical/learning/ann/ANNModel.java: -------------------------------------------------------------------------------- 1 | // 2 | // svm_model 3 | // 4 | package com.jusdt.datamining.statistical.learning.ann; 5 | 6 | import java.io.Serializable; 7 | 8 | public class ANNModel implements Serializable { 9 | 10 | private static final long serialVersionUID = 1L; 11 | 12 | //svm支持向量机的参数 13 | ANNParameter 
param; // parameter 14 | //分类的类型数 15 | int nr_class; // number of classes, = 2 in regression/one class svm 16 | int l; // total #SV 17 | ANNNode[][] SV; // SVs (SV[l]) 18 | double[][] sv_coef; // coefficients for SVs in decision functions (sv_coef[k-1][l]) 19 | double[] rho; // constants in decision functions (rho[k*(k-1)/2]) 20 | double[] probA; // pariwise probability information 21 | double[] probB; 22 | 23 | // for classification only 24 | 25 | //每个类型的类型值 26 | int[] label; // label of each class (label[k]) 27 | int[] nSV; // number of SVs for each class (nSV[k]) 28 | // nSV[0] + nSV[1] + ... + nSV[k-1] = l 29 | 30 | }; 31 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/statistical/learning/ann/ANNNode.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.statistical.learning.ann; 2 | 3 | import java.io.Serializable; 4 | 5 | /** 6 | * 7 | * svm向量节点 8 | * @author lyq 9 | * 10 | */ 11 | public class ANNNode implements Serializable { 12 | 13 | private static final long serialVersionUID = 1L; 14 | 15 | //节点索引 16 | public int index; 17 | //节点的值 18 | public double value; 19 | 20 | } 21 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/statistical/learning/ann/ANNParameter.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.statistical.learning.ann; 2 | 3 | import java.io.Serializable; 4 | 5 | public class ANNParameter implements Cloneable, Serializable { 6 | 7 | private static final long serialVersionUID = 1L; 8 | 9 | /* svm_type 支持向量机的类型*/ 10 | public static final int C_SVC = 0; 11 | public static final int NU_SVC = 1; 12 | //一类svm 13 | public static final int ONE_CLASS = 2; 14 | public static final int EPSILON_SVR = 3; 15 | public static final int NU_SVR = 4; 16 | 17 | /* kernel_type 核函数类型*/ 18 | 
//线型核函数 19 | public static final int LINEAR = 0; 20 | //多项式核函数 21 | public static final int POLY = 1; 22 | //RBF径向基函数 23 | public static final int RBF = 2; 24 | //二层神经网络核函数 25 | public static final int SIGMOID = 3; 26 | public static final int PRECOMPUTED = 4; 27 | 28 | public int svm_type; 29 | public int kernel_type; 30 | public int degree; // for poly 31 | public double gamma; // for poly/rbf/sigmoid 32 | public double coef0; // for poly/sigmoid 33 | 34 | // these are for training only 后面这些参数只针对训练集的数据 35 | public double cache_size; // in MB 36 | public double eps; // stopping criteria 37 | public double C; // for C_SVC, EPSILON_SVR and NU_SVR 38 | public int nr_weight; // for C_SVC 39 | public int[] weight_label; // for C_SVC 40 | public double[] weight; // for C_SVC 41 | public double nu; // for NU_SVC, ONE_CLASS, and NU_SVR 42 | public double p; // for EPSILON_SVR 43 | public int shrinking; // use the shrinking heuristics 44 | public int probability; // do probability estimates 45 | 46 | @Override 47 | public Object clone() { 48 | try { 49 | return super.clone(); 50 | } catch (CloneNotSupportedException e) { 51 | return null; 52 | } 53 | } 54 | 55 | } 56 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/statistical/learning/ann/ANNPrintInterface.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.statistical.learning.ann; 2 | 3 | public interface ANNPrintInterface { 4 | 5 | public void print(String s); 6 | 7 | } 8 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/statistical/learning/ann/ANNProblem.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.statistical.learning.ann; 2 | 3 | import java.io.Serializable; 4 | 5 | /** 6 | * 包含了训练集数据的基本信息 7 | */ 8 | public class ANNProblem implements 
Serializable { 9 | 10 | private static final long serialVersionUID = 1L; 11 | 12 | //定义了向量的总个数 13 | public int l; 14 | //分类类型值数组 15 | public double[] y; 16 | //训练集向量表 17 | public ANNNode[][] x; 18 | 19 | } 20 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/statistical/learning/em/EMCore.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.statistical.learning.em; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileReader; 6 | import java.io.IOException; 7 | import java.text.MessageFormat; 8 | import java.util.ArrayList; 9 | 10 | /** 11 | * EM最大期望算法工具类 12 | */ 13 | public class EMCore { 14 | 15 | // 测试数据文件地址 16 | private String dataFilePath; 17 | // 测试坐标点数据 18 | private String[][] data; 19 | // 测试坐标点数据列表 20 | private ArrayList pointArray; 21 | // 目标C1点 22 | private Point p1; 23 | // 目标C2点 24 | private Point p2; 25 | 26 | public EMCore(String dataFilePath) { 27 | this.dataFilePath = dataFilePath; 28 | pointArray = new ArrayList<>(); 29 | } 30 | 31 | /** 32 | * 从文件中读取数据 33 | */ 34 | public void readDataFile() { 35 | File file = new File(dataFilePath); 36 | ArrayList dataArray = new ArrayList(); 37 | 38 | try { 39 | BufferedReader in = new BufferedReader(new FileReader(file)); 40 | String str; 41 | String[] tempArray; 42 | while ((str = in.readLine()) != null) { 43 | tempArray = str.split(" "); 44 | dataArray.add(tempArray); 45 | } 46 | in.close(); 47 | } catch (IOException e) { 48 | e.getStackTrace(); 49 | } 50 | 51 | data = new String[dataArray.size()][]; 52 | dataArray.toArray(data); 53 | 54 | // 开始时默认取头2个点作为2个簇中心 55 | p1 = new Point(Integer.parseInt(data[0][0]), Integer.parseInt(data[0][1])); 56 | p2 = new Point(Integer.parseInt(data[1][0]), Integer.parseInt(data[1][1])); 57 | 58 | Point p; 59 | for (String[] array : data) { 60 | // 将数据转换为对象加入列表方便计算 61 | p = new Point(Integer.parseInt(array[0]), 
Integer.parseInt(array[1])); 62 | pointArray.add(p); 63 | } 64 | } 65 | 66 | /** 67 | * 计算坐标点对于2个簇中心点的隶属度 68 | * 69 | * @param p 70 | * 待测试坐标点 71 | */ 72 | private void computeMemberShip(Point p) { 73 | // p点距离第一个簇中心点的距离 74 | double distance1 = 0; 75 | // p距离第二个中心点的距离 76 | double distance2 = 0; 77 | 78 | // 用欧式距离计算 79 | distance1 = Math.pow(p.getX() - p1.getX(), 2) + Math.pow(p.getY() - p1.getY(), 2); 80 | distance2 = Math.pow(p.getX() - p2.getX(), 2) + Math.pow(p.getY() - p2.getY(), 2); 81 | 82 | // 计算对于p1点的隶属度,与距离成反比关系,距离靠近越小,隶属度越大,所以要用大的distance2另外的距离来表示 83 | p.setMemberShip1(distance2 / (distance1 + distance2)); 84 | // 计算对于p2点的隶属度 85 | p.setMemberShip2(distance1 / (distance1 + distance2)); 86 | } 87 | 88 | /** 89 | * 执行期望最大化步骤 90 | */ 91 | public void exceptMaxStep() { 92 | // 新的优化过的簇中心点 93 | double p1X = 0; 94 | double p1Y = 0; 95 | double p2X = 0; 96 | double p2Y = 0; 97 | double temp1 = 0; 98 | double temp2 = 0; 99 | // 误差值 100 | double errorValue1 = 0; 101 | double errorValue2 = 0; 102 | // 上次更新的簇点坐标 103 | Point lastP1 = null; 104 | Point lastP2 = null; 105 | 106 | // 当开始计算的时候,或是中心点的误差值超过1的时候都需要再次迭代计算 107 | while (lastP1 == null || errorValue1 > 1.0 || errorValue2 > 1.0) { 108 | for (Point p : pointArray) { 109 | computeMemberShip(p); 110 | p1X += p.getMemberShip1() * p.getMemberShip1() * p.getX(); 111 | p1Y += p.getMemberShip1() * p.getMemberShip1() * p.getY(); 112 | temp1 += p.getMemberShip1() * p.getMemberShip1(); 113 | 114 | p2X += p.getMemberShip2() * p.getMemberShip2() * p.getX(); 115 | p2Y += p.getMemberShip2() * p.getMemberShip2() * p.getY(); 116 | temp2 += p.getMemberShip2() * p.getMemberShip2(); 117 | } 118 | 119 | lastP1 = new Point(p1.getX(), p1.getY()); 120 | lastP2 = new Point(p2.getX(), p2.getY()); 121 | 122 | // 套公式计算新的簇中心点坐标,最最大化处理 123 | p1.setX(p1X / temp1); 124 | p1.setY(p1Y / temp1); 125 | p2.setX(p2X / temp2); 126 | p2.setY(p2Y / temp2); 127 | 128 | errorValue1 = Math.abs(lastP1.getX() - p1.getX()) + Math.abs(lastP1.getY() - 
p1.getY()); 129 | errorValue2 = Math.abs(lastP2.getX() - p2.getX()) + Math.abs(lastP2.getY() - p2.getY()); 130 | } 131 | 132 | System.out.println( 133 | MessageFormat.format("簇中心节点p1({0}, {1}), p2({2}, {3})", p1.getX(), p1.getY(), p2.getX(), p2.getY())); 134 | } 135 | 136 | } 137 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/statistical/learning/em/EMExample.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.statistical.learning.em; 2 | 3 | /** 4 | * EM期望最大化算法场景调用类 5 | */ 6 | public class EMExample { 7 | 8 | public static void main(String[] args) { 9 | String filePath = "data/em/input.txt"; 10 | 11 | EMCore tool = new EMCore(filePath); 12 | tool.readDataFile(); 13 | tool.exceptMaxStep(); 14 | } 15 | 16 | } 17 | -------------------------------------------------------------------------------- /src/main/java/com/jusdt/datamining/statistical/learning/em/Point.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.statistical.learning.em; 2 | 3 | /** 4 | * 坐标点类 5 | */ 6 | public class Point { 7 | 8 | // 坐标点横坐标 9 | private double x; 10 | // 坐标点纵坐标 11 | private double y; 12 | // 坐标点对于P1的隶属度 13 | private double memberShip1; 14 | // 坐标点对于P2的隶属度 15 | private double memberShip2; 16 | 17 | public Point(double d, double e) { 18 | this.x = d; 19 | this.y = e; 20 | } 21 | 22 | public double getX() { 23 | return x; 24 | } 25 | 26 | public void setX(double x) { 27 | this.x = x; 28 | } 29 | 30 | public double getY() { 31 | return y; 32 | } 33 | 34 | public void setY(double y) { 35 | this.y = y; 36 | } 37 | 38 | public double getMemberShip1() { 39 | return memberShip1; 40 | } 41 | 42 | public void setMemberShip1(double memberShip1) { 43 | this.memberShip1 = memberShip1; 44 | } 45 | 46 | public double getMemberShip2() { 47 | return memberShip2; 48 | } 49 | 50 | public void 
setMemberShip2(double memberShip2) { 51 | this.memberShip2 = memberShip2; 52 | } 53 | 54 | } 55 | -------------------------------------------------------------------------------- /src/main/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | %d{ISO8601} [%thread] %-5level %logger{36} [Line:%-3L] - %msg%n 8 | 9 | 10 | 11 | INFO 12 | ACCEPT 13 | DENY 14 | 15 | 16 | 17 | 19 | logs/datamining.log 20 | 21 | %d{ISO8601} [%thread] %-5level %logger{36} [Line:%-3L] - %msg%n 22 | 23 | 24 | 25 | INFO 26 | 27 | 28 | logs/datamining.log.%d{yyyy-MM-dd}.gz 29 | 30 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /src/test/java/com/jusdt/datamining/demo/MainDemo.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.demo; 2 | 3 | public class MainDemo { 4 | 5 | public static void main(String[] args) { 6 | // TODO Auto-generated method stub 7 | 8 | } 9 | 10 | } 11 | -------------------------------------------------------------------------------- /src/test/java/com/jusdt/datamining/dimensionality/reduction/pca/ToeplitzMatrixTest.java: -------------------------------------------------------------------------------- 1 | package com.jusdt.datamining.dimensionality.reduction.pca; 2 | 3 | import com.jusdt.datamining.dimensionality.reduction.pca.ToeplitzMatrix; 4 | 5 | import junit.framework.TestCase; 6 | 7 | public class ToeplitzMatrixTest extends TestCase { 8 | 9 | public ToeplitzMatrixTest(String testName) { 10 | super(testName); 11 | } 12 | 13 | @Override 14 | protected void setUp() throws Exception { 15 | super.setUp(); 16 | } 17 | 18 | @Override 19 | protected void tearDown() throws Exception { 20 | super.tearDown(); 21 | } 22 | 23 | public void testToeplitz() { 24 | double[] data = new double[] { 1, 2, 3, 4, 5, 6 }; 25 | 26 | ToeplitzMatrix m = new 
ToeplitzMatrix(data); 27 | // MatrixHelper.print(m, 1, 3); 28 | assertTrue("nrows wrong", m.getNRows() == 6); 29 | assertTrue("ncols wrong", m.getNCols() == 6); 30 | double[][] a = m.getArray(); 31 | 32 | assertEquals("0,0", 1., a[0][0]); 33 | assertEquals("0,1", 2., a[0][1]); 34 | assertEquals("0,2", 3., a[0][2]); 35 | assertEquals("0,3", 4., a[0][3]); 36 | assertEquals("0,4", 5., a[0][4]); 37 | assertEquals("0,5", 6., a[0][5]); 38 | 39 | assertEquals("1,0", 2., a[1][0]); 40 | assertEquals("1,1", 1., a[1][1]); 41 | assertEquals("1,2", 2., a[1][2]); 42 | assertEquals("1,3", 3., a[1][3]); 43 | assertEquals("1,4", 4., a[1][4]); 44 | assertEquals("1,5", 5., a[1][5]); 45 | 46 | assertEquals("2,0", 3., a[2][0]); 47 | assertEquals("2,1", 2., a[2][1]); 48 | assertEquals("2,2", 1., a[2][2]); 49 | assertEquals("2,3", 2., a[2][3]); 50 | assertEquals("2,4", 3., a[2][4]); 51 | assertEquals("2,5", 4., a[2][5]); 52 | 53 | assertEquals("3,0", 4., a[3][0]); 54 | assertEquals("3,1", 3., a[3][1]); 55 | assertEquals("3,2", 2., a[3][2]); 56 | assertEquals("3,3", 1., a[3][3]); 57 | assertEquals("3,4", 2., a[3][4]); 58 | assertEquals("3,5", 3., a[3][5]); 59 | 60 | assertEquals("4,0", 5., a[4][0]); 61 | assertEquals("4,1", 4., a[4][1]); 62 | assertEquals("4,2", 3., a[4][2]); 63 | assertEquals("4,3", 2., a[4][3]); 64 | assertEquals("4,4", 1., a[4][4]); 65 | assertEquals("4,5", 2., a[4][5]); 66 | 67 | assertEquals("5,0", 6., a[5][0]); 68 | assertEquals("5,1", 5., a[5][1]); 69 | assertEquals("5,2", 4., a[5][2]); 70 | assertEquals("5,3", 3., a[5][3]); 71 | assertEquals("5,4", 2., a[5][4]); 72 | assertEquals("5,5", 1., a[5][5]); 73 | } 74 | 75 | } 76 | -------------------------------------------------------------------------------- /src/test/resources/logback-test.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | %d{MMdd.HHmmss.SSS} [%-20t] [%-5p] [%-20c] [L:%-3L] - %m%n 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 
-------------------------------------------------------------------------------- /需要验收的算法: -------------------------------------------------------------------------------- 1 | 1、朴素贝叶斯 2 | 2、KMeans 3 | 3、KNN 4 | 4、PCA 5 | 5、ANN 6 | 6、决策树 7 | 7、层次聚类 8 | 及其他辅助算法 ok 9 | 多媒体数据处理算法 ok --------------------------------------------------------------------------------