├── README.md
└── com.homework
    ├── .classpath
    ├── .classpath.bak
    ├── .gitignore
    ├── .gitignore.bak
    ├── .project
    ├── .settings
    │   ├── org.eclipse.core.resources.prefs
    │   ├── org.eclipse.jdt.core.prefs
    │   └── org.eclipse.m2e.core.prefs
    ├── datafile
    │   ├── association
    │   │   ├── cnItems.dat
    │   │   ├── fpg
    │   │   ├── fpg2
    │   │   ├── items
    │   │   └── user2items.csv
    │   ├── cluster
    │   │   ├── data.csv
    │   │   └── simple_k-means.txt
    │   ├── decisiontree
    │   │   ├── test
    │   │   │   └── in
    │   │   │       └── weather.nominal.arff
    │   │   └── train
    │   │       └── in
    │   │           └── weather.nominal.arff
    │   ├── hosts
    │   ├── hosts.txt
    │   ├── naivebayes
    │   │   ├── test
    │   │   │   └── in
    │   │   │       └── test.arff
    │   │   └── train
    │   │       ├── in
    │   │       │   └── weather.nominal.arff
    │   │       └── out
    │   │           └── trainresult.arff
    │   ├── week5
    │   │   ├── Ma
    │   │   ├── Mb
    │   │   ├── SparseMatrix
    │   │   │   ├── a.txt
    │   │   │   └── b.txt
    │   │   ├── a.txt
    │   │   ├── b.txt
    │   │   ├── small.csv
    │   │   ├── small2.csv
    │   │   └── test
    │   │       ├── Ma
    │   │       └── Mb
    │   └── week6
    │       ├── job.csv
    │       └── pv.csv
    ├── lib
    │   ├── je-analysis-1.5.1.jar
    │   ├── lucene-core-2.3.0.jar
    │   ├── lucene-core-3.1.0.jar
    │   ├── paoding-analysis.jar
    │   └── 说明
    ├── pom.xml
    ├── scripts
    │   ├── clustering
    │   │   └── canopy
    │   │       ├── canopy-mahout.txt
    │   │       └── canopy.dat
    │   ├── fp-growth
    │   │   ├── fpg-mahout.txt
    │   │   └── fpg.txt
    │   ├── hive
    │   │   ├── HiveJDBC.java
    │   │   └── sql.hive
    │   ├── week10
    │   │   ├── 1.pig
    │   │   ├── common_friend.pig
    │   │   ├── karate.csv
    │   │   ├── w10.pig
    │   │   ├── 杂文件
    │   │   │   ├── common_prj.java.bak
    │   │   │   ├── karate2.csv
    │   │   │   ├── karate2.csv.bak
    │   │   │   ├── mytest.txt
    │   │   │   ├── noway
    │   │   │   └── tes2.txt
    │   │   └── 计算33的好友推荐(不关注别人的没有推荐)
    │   │       ├── common.java
    │   │       ├── common.java.bak
    │   │       ├── common_flt.java
    │   │       ├── common_flt.java.bak
    │   │       ├── common_grp.java
    │   │       ├── common_jnd.java
    │   │       ├── common_prj.java
    │   │       ├── pig.pig
    │   │       └── user.java
    │   ├── week13
    │   │   └── week13
    │   ├── week8.rar
    │   ├── week8
    │   │   ├── homework.txt
    │   │   ├── week8.pig
    │   │   └── week8.txt
    │   └── week9
    │       └── pagerank.r
    └── src
        ├── common
        │   └── com
        │       └── homework
        │           └── hdfs
        │               ├── HdfsDAO.java
        │               └── package-info.java
        ├── hadoop
        │   └── machinelearning
        │       └── clustering
        │           └── hadoop
        │               └── machinelearning
        │                   └── clustering
        │                       ├── canopy
        │                       │   └── package-info.java
        │                       └── kmeans
        │                           ├── KmeansHadoop.java
        │                           └── package-info.java
        ├── main
        │   └── java
        │       └── com
        │           └── homework
        │               └── App.java
        ├── mommon
        │   ├── com
        │   │   └── homework
        │   │       └── mommon
        │   │           ├── ComTest.java
        │   │           └── package-info.java
        │   └── mytest
        │       ├── MenuTree.java
        │       ├── Node.java
        │       ├── Recursive.java
        │       └── package-info.java
        ├── sequence
        │   └── machinelearning
        │       ├── association
        │       │   └── sequence
        │       │       └── machinelearning
        │       │           └── association
        │       │               ├── apriori
        │       │               │   ├── ItemMap.java
        │       │               │   ├── MyApriori.java
        │       │               │   ├── Subset.java
        │       │               │   └── package-info.java
        │       │               ├── common
        │       │               │   ├── Definition.java
        │       │               │   ├── Mytest.java
        │       │               │   ├── ReadData.java
        │       │               │   ├── SortTest.java
        │       │               │   ├── Transaction.java
        │       │               │   └── package-info.java
        │       │               ├── fpgrowth
        │       │               │   ├── Myfptree2.java
        │       │               │   ├── TreeNode2.java
        │       │               │   └── package-info.java
        │       │               ├── fpgtest
        │       │               │   ├── FPTree.java
        │       │               │   ├── TreeNode.java
        │       │               │   └── package-info.java
        │       │               └── otherdemo
        │       │                   ├── Apriori.java
        │       │                   ├── Apriori_1.java
        │       │                   ├── Apriori_NathanMagnus.java
        │       │                   └── package-info.java
        │       ├── clustering
        │       │   └── sequence
        │       │       └── machinelearning
        │       │           └── clustering
        │       │               ├── canopy
        │       │               │   ├── MyCanopy.java
        │       │               │   ├── Point.java
        │       │               │   ├── UserPoint.java
        │       │               │   └── package-info.java
        │       │               └── kmeans
        │       │                   ├── MyKmeans.java
        │       │                   ├── MyKmeansForUser.java
        │       │                   └── package-info.java
        │       ├── decisiontree
        │       │   └── sequence
        │       │       └── machinelearning
        │       │           └── decisiontree
        │       │               ├── c45
        │       │               │   ├── DecisionTreeNode.java
        │       │               │   ├── DecisionTreeUtil.java
        │       │               │   ├── SequenceComparator.java
        │       │               │   ├── c4.java
        │       │               │   └── package-info.java
        │       │               ├── id3
        │       │               │   ├── DicisionTree.java
        │       │               │   ├── OtherID3.java
        │       │               │   └── package-info.java
        │       │               ├── id3test
        │       │               │   ├── DTreeUtil.java
        │       │               │   ├── ID3.java
        │       │               │   ├── SequenceComparator.java
        │       │               │   ├── TreeNode.java
        │       │               │   └── package-info.java
        │       │               ├── myc45
        │       │               │   └── package-info.java
        │       │               └── myid3
        │       │                   ├── Maxgain.java
        │       │                   ├── MyID3.java
        │       │                   ├── Point.java
        │       │                   ├── TheMath.java
        │       │                   ├── TreeNode.java
        │       │                   └── package-info.java
        │       └── naivebayes
        │           └── sequence
        │               └── machinelearning
        │                   └── naivebayes
        │                       ├── bayesdemo
        │                       │   ├── Main.java
        │                       │   ├── Test.java
        │                       │   ├── Train.java
        │                       │   └── package-info.java
        │                       └── textmining
        │                           ├── ParticipleTest.java
        │                           └── package-info.java
        ├── test
        │   └── java
        │       └── com
        │           └── homework
        │               └── AppTest.java
        ├── week2
        │   ├── business
        │   │   ├── DayIp.java
        │   │   ├── StatPV.java
        │   │   └── package-info.java
        │   └── entity
        │       ├── Kpi.java
        │       └── package-info.java
        ├── week3
        │   ├── mine
        │   │   ├── Outinfo.java
        │   │   ├── StationInfo.java
        │   │   ├── StayTime.java
        │   │   ├── StayTime2.java
        │   │   ├── StayTime2改造前备份.rar
        │   │   ├── my.net
        │   │   ├── my.pos
        │   │   └── package-info.java
        │   └── tutorial
        │       ├── BaseStationDataPreprocess.java
        │       ├── TableLine.java
        │       └── package-info.java
        ├── week5
        │   ├── matrix
        │   │   ├── Bigmmult.java
        │   │   ├── MatrixMult.java
        │   │   ├── Multiply.java
        │   │   ├── MyTest.java
        │   │   ├── Recommend.java
        │   │   ├── SparseMatrix.java
        │   │   └── package-info.java
        │   └── recommend
        │       ├── MainPodium.java
        │       ├── Step1.java
        │       ├── Step2.java
        │       ├── Step3.java
        │       ├── Step4.java
        │       └── package-info.java
        ├── week6
        │   ├── filterSalary
        │   │   ├── Main.java
        │   │   ├── Step0.java
        │   │   ├── Step1.java
        │   │   ├── Step2.java
        │   │   ├── Step3.java
        │   │   └── package-info.java
        │   ├── recommendJob
        │   │   ├── ItemLoglikelihood.java
        │   │   ├── UserCityBlock.java
        │   │   └── package-info.java
        │   └── test
        │       └── package-info.java
        ├── week7
        │   ├── classfier
        │   │   ├── Main.java
        │   │   ├── PaodingFirst.java
        │   │   ├── PaodingTest.java
        │   │   └── package-info.java
        │   ├── dic
        │   │   ├── .compiled
        │   │   │   └── most-words-mode
        │   │   │       ├── .metadata
        │   │   │       ├── vocabulary.dic.compiled
        │   │   │       ├── x-confucian-family-name.dic.compiled
        │   │   │       ├── x-for-combinatorics.dic.compiled
        │   │   │       ├── x-noise-charactor.dic.compiled
        │   │   │       ├── x-noise-word.dic.compiled
        │   │   │       └── x-unit.dic.compiled
        │   │   ├── administrative.dic
        │   │   ├── appellation.dic
        │   │   ├── company.dic
        │   │   ├── comupter-science.dic
        │   │   ├── contemporary-words.dic
        │   │   ├── division
        │   │   │   ├── africa.dic
        │   │   │   ├── america.dic
        │   │   │   ├── china.dic
        │   │   │   ├── europe.dic
        │   │   │   ├── japan.dic
        │   │   │   ├── korea.dic
        │   │   │   ├── oceania.dic
        │   │   │   ├── readme.txt
        │   │   │   └── taiwan.dic
        │   │   ├── festival.dic
        │   │   ├── language.dic
        │   │   ├── locale
        │   │   │   ├── beijing.dic
        │   │   │   ├── fuzhou.dic
        │   │   │   ├── quanzhou.dic
        │   │   │   ├── readme.txt
        │   │   │   └── xiamen.dic
        │   │   ├── name-foreign.dic
        │   │   ├── nation.dic
        │   │   ├── org-domestic.dic
        │   │   ├── org-foreign.dic
        │   │   ├── paoding-dic-names.properties
        │   │   ├── star-domestic.dic
        │   │   ├── star-foreign.dic
        │   │   ├── t-base.dic
        │   │   ├── x-confucian-family-name.dic
        │   │   ├── x-for-combinatorics.dic
        │   │   ├── x-noise-charactor.dic
        │   │   ├── x-noise-word.dic
        │   │   └── x-unit.dic
        │   └── myInputFormat
        │       ├── JamesInputFormat.java
        │       ├── JamesRecordReader.java
        │       └── package-info.java
        └── week8
            └── mrclassify
                └── package-info.java
/README.md:
--------------------------------------------------------------------------------
1 | myhomework
2 | ==========
3 |
--------------------------------------------------------------------------------
/com.homework/.classpath:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/com.homework/.classpath.bak:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/com.homework/.gitignore:
--------------------------------------------------------------------------------
1 | /target/
2 | /target/classes/META-INF/maven/com/com.homework/pom.properties
3 | .project
4 | .settings
5 | target
6 | *.log
7 | data
8 | build
9 | bin
10 | assets
11 | runtime
12 | *.class
13 | *.war
14 | *.ear
15 | input
16 | output
17 |
18 |
--------------------------------------------------------------------------------
/com.homework/.gitignore.bak:
--------------------------------------------------------------------------------
1 | /target/
2 | /target/classes/META-INF/maven/com/com.homework/pom.properties
3 |
4 | .project
5 |
6 | .settings
7 | target
8 | *.log
9 | data
10 | build
11 | bin
12 | assets
13 | runtime
14 | *.class
15 | *.war
16 | *.ear
17 | input
18 | output
19 |
20 |
--------------------------------------------------------------------------------
/com.homework/.project:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <projectDescription>
 3 |     <name>com.homework</name>
 4 |     <comment></comment>
 5 |     <projects>
 6 |     </projects>
 7 |     <buildSpec>
 8 |         <buildCommand>
 9 |             <name>org.eclipse.jdt.core.javabuilder</name>
10 |             <arguments>
11 |             </arguments>
12 |         </buildCommand>
13 |         <buildCommand>
14 |             <name>org.eclipse.m2e.core.maven2Builder</name>
15 |             <arguments>
16 |             </arguments>
17 |         </buildCommand>
18 |     </buildSpec>
19 |     <natures>
20 |         <nature>org.eclipse.jdt.core.javanature</nature>
21 |         <nature>org.eclipse.m2e.core.maven2Nature</nature>
22 |     </natures>
23 | </projectDescription>
24 |
--------------------------------------------------------------------------------
/com.homework/.settings/org.eclipse.core.resources.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | encoding//scripts/fp-growth/fpg-mahout.txt=UTF-8
3 | encoding//src/main/java=UTF-8
4 | encoding//src/test/java=UTF-8
5 | encoding/=UTF-8
6 |
--------------------------------------------------------------------------------
/com.homework/.settings/org.eclipse.jdt.core.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
3 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.7
4 | org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve
5 | org.eclipse.jdt.core.compiler.compliance=1.7
6 | org.eclipse.jdt.core.compiler.debug.lineNumber=generate
7 | org.eclipse.jdt.core.compiler.debug.localVariable=generate
8 | org.eclipse.jdt.core.compiler.debug.sourceFile=generate
9 | org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
10 | org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
11 | org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
12 | org.eclipse.jdt.core.compiler.source=1.7
13 |
--------------------------------------------------------------------------------
/com.homework/.settings/org.eclipse.m2e.core.prefs:
--------------------------------------------------------------------------------
1 | activeProfiles=
2 | eclipse.preferences.version=1
3 | resolveWorkspaceProjects=true
4 | version=1
5 |
--------------------------------------------------------------------------------
/com.homework/datafile/association/cnItems.dat:
--------------------------------------------------------------------------------
1 | 1 牛奶,鸡蛋,面包,薯片
2 | 2 鸡蛋,爆米花,薯片,啤酒
3 | 3 鸡蛋,面包,薯片
4 | 4 牛奶,鸡蛋,面包,爆米花,薯片,啤酒
5 | 5 牛奶,面包,啤酒
6 | 6 鸡蛋,面包,啤酒
7 | 7 牛奶,面包,薯片
8 | 8 牛奶,鸡蛋,面包,黄油,薯片
9 | 9 牛奶,鸡蛋,黄油,薯片
--------------------------------------------------------------------------------
/com.homework/datafile/association/fpg:
--------------------------------------------------------------------------------
1 | 牛奶,鸡蛋,面包,薯片
2 | 鸡蛋,爆米花,薯片,啤酒
3 | 鸡蛋,面包,薯片
4 | 牛奶,鸡蛋,面包,爆米花,薯片,啤酒
5 | 牛奶,面包,啤酒
6 | 鸡蛋,面包,啤酒
7 | 牛奶,面包,薯片
8 | 牛奶,鸡蛋,面包,黄油,薯片
9 | 牛奶,鸡蛋,黄油,薯片
--------------------------------------------------------------------------------
/com.homework/datafile/association/fpg2:
--------------------------------------------------------------------------------
1 | I1,I2,I5
2 | I2,I4
3 | I2,I3
4 | I1,I2,I4
5 | I1,I3
6 | I2,I3
7 | I1,I3
8 | I1,I2,I3,I5
9 | I1,I2,I3
--------------------------------------------------------------------------------
/com.homework/datafile/association/items:
--------------------------------------------------------------------------------
1 | T100 I1,I2,I5
2 | T200 I2,I4
3 | T300 I2,I3
4 | T400 I1,I2,I4
5 | T500 I1,I3
6 | T600 I2,I3
7 | T700 I1,I3
8 | T800 I1,I2,I3,I5
9 | T900 I1,I2,I3
--------------------------------------------------------------------------------
/com.homework/datafile/cluster/simple_k-means.txt:
--------------------------------------------------------------------------------
1 | 1 1
2 | 2 1
3 | 1 2
4 | 2 2
5 | 3 3
6 | 8 8
7 | 8 9
8 | 9 8
9 | 9 9
--------------------------------------------------------------------------------
/com.homework/datafile/decisiontree/test/in/weather.nominal.arff:
--------------------------------------------------------------------------------
1 |
2 | # The decision attribute values, usually yes or no
3 | @decision
4 | yes,no
5 |
6 | @attribute outlook {sunny, overcast, rainy}
7 | @attribute temperature {hot, mild, cool}
8 | @attribute humidity {high, normal}
9 | @attribute windy {TRUE, FALSE}
10 |
11 |
12 | @data
13 | sunny,hot,high,FALSE,no
14 | sunny,hot,high,TRUE,no
15 | overcast,hot,high,FALSE,yes
16 | rainy,mild,high,FALSE,yes
17 | rainy,cool,normal,FALSE,yes
18 | rainy,cool,normal,TRUE,no
19 | overcast,cool,normal,TRUE,yes
20 | sunny,mild,high,FALSE,no
21 | sunny,cool,normal,FALSE,yes
22 | rainy,mild,normal,FALSE,yes
23 | sunny,mild,normal,TRUE,yes
24 | overcast,mild,high,TRUE,yes
25 | overcast,hot,normal,FALSE,yes
26 | rainy,mild,high,TRUE,no
--------------------------------------------------------------------------------
/com.homework/datafile/decisiontree/train/in/weather.nominal.arff:
--------------------------------------------------------------------------------
1 |
2 | # The decision attribute values, usually yes or no
3 | @decision
4 | yes,no
5 |
6 | @attribute outlook {sunny, overcast, rainy}
7 | @attribute temperature {hot, mild, cool}
8 | @attribute humidity {high, normal}
9 | @attribute windy {TRUE, FALSE}
10 |
11 |
12 | @data
13 | sunny,hot,high,FALSE,no
14 | sunny,hot,high,TRUE,no
15 | overcast,hot,high,FALSE,yes
16 | rainy,mild,high,FALSE,yes
17 | rainy,cool,normal,FALSE,yes
18 | rainy,cool,normal,TRUE,no
19 | overcast,cool,normal,TRUE,yes
20 | sunny,mild,high,FALSE,no
21 | sunny,cool,normal,FALSE,yes
22 | rainy,mild,normal,FALSE,yes
23 | sunny,mild,normal,TRUE,yes
24 | overcast,mild,high,TRUE,yes
25 | overcast,hot,normal,FALSE,yes
26 | rainy,mild,high,TRUE,no
--------------------------------------------------------------------------------
/com.homework/datafile/naivebayes/test/in/test.arff:
--------------------------------------------------------------------------------
1 | @decision
2 | yes,no
3 | @attribute outlook {sunny, overcast, rainy}
4 | @attribute temperature {hot, mild, cool}
5 | @attribute humidity {high, normal}
6 | @attribute windy {TRUE, FALSE}
7 | @data
8 | sunny,hot,high,FALSE
9 | overcast,mild,high,TRUE
10 | overcast,hot,normal,FALSE
11 | rainy,mild,high,TRUE
--------------------------------------------------------------------------------
/com.homework/datafile/naivebayes/train/in/weather.nominal.arff:
--------------------------------------------------------------------------------
1 | # The decision attribute values, usually yes or no
2 | @decision
3 | yes,no
4 | @attribute outlook {sunny, overcast, rainy}
5 | @attribute temperature {hot, mild, cool}
6 | @attribute humidity {high, normal}
7 | @attribute windy {TRUE, FALSE}
8 | @data
9 | sunny,hot,high,FALSE,no
10 | sunny,hot,high,TRUE,no
11 | overcast,hot,high,FALSE,yes
12 | rainy,mild,high,FALSE,yes
13 | rainy,cool,normal,FALSE,yes
14 | rainy,cool,normal,TRUE,no
15 | overcast,cool,normal,TRUE,yes
16 | sunny,mild,high,FALSE,no
17 | sunny,cool,normal,FALSE,yes
18 | rainy,mild,normal,FALSE,yes
19 | sunny,mild,normal,TRUE,yes
20 | overcast,mild,high,TRUE,yes
21 | overcast,hot,normal,FALSE,yes
22 | rainy,mild,high,TRUE,no
--------------------------------------------------------------------------------
/com.homework/datafile/naivebayes/train/out/trainresult.arff:
--------------------------------------------------------------------------------
1 | @decision P(yes) {0.7142857142857143}
2 | @decision P(no) {0.42857142857142855}
3 | @data
4 | P(outlook=sunny|yes),0.3
5 | P(outlook=sunny|no),0.6666666666666666
6 | P(outlook=overcast|yes),0.5
7 | P(outlook=overcast|no),0.16666666666666666
8 | P(outlook=rainy|yes),0.4
9 | P(outlook=rainy|no),0.5
10 | P(temperature=hot|yes),0.3
11 | P(temperature=hot|no),0.5
12 | P(temperature=mild|yes),0.5
13 | P(temperature=mild|no),0.5
14 | P(temperature=cool|yes),0.4
15 | P(temperature=cool|no),0.3333333333333333
16 | P(humidity=high|yes),0.4
17 | P(humidity=high|no),0.8333333333333334
18 | P(humidity=normal|yes),0.7
19 | P(humidity=normal|no),0.3333333333333333
20 | P(windy=TRUE|yes),0.4
21 | P(windy=TRUE|no),0.6666666666666666
22 | P(windy=FALSE|yes),0.7
23 | P(windy=FALSE|no),0.5
24 |
--------------------------------------------------------------------------------
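
A note on the numbers above: classifying a row from test.arff reduces to multiplying each class prior by the matching conditional probabilities and keeping the larger product. A minimal single-machine Java sketch, hard-coding the values from trainresult.arff for the first test row (sunny,hot,high,FALSE); the class name is illustrative, not part of this project:

public class BayesScoreSketch {
    public static void main(String[] args) {
        // Class priors from the @decision lines above.
        double pYes = 0.7142857142857143, pNo = 0.42857142857142855;
        // Conditionals for sunny,hot,high,FALSE, copied from trainresult.arff.
        double yes = pYes * 0.3 * 0.3 * 0.4 * 0.7;                               // ~0.018
        double no  = pNo * 0.6666666666666666 * 0.5 * 0.8333333333333334 * 0.5;  // ~0.060
        System.out.println(yes > no ? "yes" : "no");  // prints "no"
    }
}
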
/com.homework/datafile/week5/Ma:
--------------------------------------------------------------------------------
1 | 1,1,1
2 | 2,1,2
3 | 2,2,3
--------------------------------------------------------------------------------
/com.homework/datafile/week5/Mb:
--------------------------------------------------------------------------------
1 | 1,1,2
2 | 1,2,4
3 | 2,1,1
4 | 2,2,2
--------------------------------------------------------------------------------
/com.homework/datafile/week5/SparseMatrix/a.txt:
--------------------------------------------------------------------------------
1 | 1,1,1
2 | 1,2,2
3 | 1,3,3
4 | 2,1,4
5 | 2,2,5
6 | 3,1,7
7 | 3,2,8
8 | 3,3,9
9 | 4,1,10
10 | 4,2,11
11 | 4,3,12
--------------------------------------------------------------------------------
/com.homework/datafile/week5/SparseMatrix/b.txt:
--------------------------------------------------------------------------------
1 | 1,1,10
2 | 1,2,15
3 | 2,2,2
4 | 3,1,11
5 | 3,2,9
--------------------------------------------------------------------------------
/com.homework/datafile/week5/a.txt:
--------------------------------------------------------------------------------
1 | 1,1,1
2 | 1,2,2
3 | 1,3,3
4 | 2,1,4
5 | 2,2,5
6 | 3,1,7
7 | 3,2,8
8 | 3,3,9
9 | 4,1,10
10 | 4,2,11
11 | 4,3,12
--------------------------------------------------------------------------------
/com.homework/datafile/week5/b.txt:
--------------------------------------------------------------------------------
1 | 1,1,10
2 | 1,2,15
3 | 2,2,2
4 | 3,1,11
5 | 3,2,9
--------------------------------------------------------------------------------
/com.homework/datafile/week5/small.csv:
--------------------------------------------------------------------------------
1 | 1,101,5.0
2 | 1,102,3.0
3 | 1,103,2.5
4 | 2,101,2.0
5 | 2,102,2.5
6 | 2,103,5.0
7 | 2,104,2.0
8 | 3,101,2.0
9 | 3,104,4.0
10 | 3,105,4.5
11 | 3,107,5.0
12 | 4,101,5.0
13 | 4,103,3.0
14 | 4,104,4.5
15 | 4,106,4.0
16 | 5,101,4.0
17 | 5,102,3.0
18 | 5,103,2.0
19 | 5,104,4.0
20 | 5,105,3.5
21 | 5,106,4.0
--------------------------------------------------------------------------------
/com.homework/datafile/week5/small2.csv:
--------------------------------------------------------------------------------
1 | 1,101,5.0
2 | 1,102,3.0
3 | 1,103,2.5
4 | 2,101,2.0
5 | 2,102,2.5
6 | 2,103,5.0
7 | 2,104,2.0
8 | 3,101,2.0
9 | 3,104,4.0
10 | 3,105,4.5
11 | 3,107,5.0
12 | 4,101,5.0
13 | 4,103,3.0
14 | 4,104,4.5
15 | 4,106,4.0
16 | 5,101,4.0
17 | 5,102,3.0
18 | 5,103,2.0
19 | 5,104,4.0
20 | 5,105,3.5
21 | 5,106,4.0
22 | 6,102,4.0
23 | 6,103,2.0
24 | 6,105,3.5
25 | 6,107,4.0
--------------------------------------------------------------------------------
/com.homework/datafile/week5/test/Ma:
--------------------------------------------------------------------------------
1 | 1,1,1
2 | 1,2,2
3 | 2,1,2
4 | 2,2,3
--------------------------------------------------------------------------------
/com.homework/datafile/week5/test/Mb:
--------------------------------------------------------------------------------
1 | 1,1,2
2 | 1,2,4
3 | 2,1,1
4 | 2,2,2
--------------------------------------------------------------------------------
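
Ma, Mb and the test pair above store matrices as row,column,value triples: test/Ma is [[1,2],[2,3]] and test/Mb is [[2,4],[1,2]]. A plain in-memory Java sketch of the product the week5 MapReduce jobs compute (names are illustrative, not project code):

import java.util.HashMap;
import java.util.Map;

public class SparseMultSketch {
    public static void main(String[] args) {
        int[][] ma = {{1,1,1},{1,2,2},{2,1,2},{2,2,3}};  // test/Ma triples
        int[][] mb = {{1,1,2},{1,2,4},{2,1,1},{2,2,2}};  // test/Mb triples
        Map<String,Integer> c = new HashMap<>();
        for (int[] a : ma)
            for (int[] b : mb)
                if (a[1] == b[0])                        // Ma column meets Mb row
                    c.merge(a[0] + "," + b[1], a[2] * b[2], Integer::sum);
        c.forEach((k, v) -> System.out.println(k + "," + v));
        // Expected triples: 1,1,4  1,2,8  2,1,7  2,2,14
    }
}
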
/com.homework/datafile/week6/job.csv:
--------------------------------------------------------------------------------
1 | 1,2013-01-24,5600
2 | 2,2011-03-02,5400
3 | 3,2011-03-14,8100
4 | 4,2012-10-05,2200
5 | 5,2011-09-03,14100
6 | 6,2011-03-05,6500
7 | 7,2012-06-06,37000
8 | 8,2013-02-18,5500
9 | 9,2010-07-05,7500
10 | 10,2010-01-23,6700
11 | 11,2011-09-19,5200
12 | 12,2010-01-19,29700
13 | 13,2013-09-28,6000
14 | 14,2013-10-23,3300
15 | 15,2010-10-09,2700
16 | 16,2010-07-14,5100
17 | 17,2010-05-13,29000
18 | 18,2010-01-16,21800
19 | 19,2013-05-23,5700
20 | 20,2011-04-24,5900
21 | 21,2011-09-07,4500
22 | 22,2011-02-20,8100
23 | 23,2012-10-15,6300
24 | 24,2010-04-16,15500
25 | 25,2011-08-22,6300
26 | 26,2011-08-10,8800
27 | 27,2010-09-01,7700
28 | 28,2013-10-16,4300
29 | 29,2010-03-04,8100
30 | 30,2010-05-01,9200
31 | 31,2011-04-16,7700
32 | 32,2013-09-04,2300
33 | 33,2010-05-26,17400
34 | 34,2011-04-14,4000
35 | 35,2010-09-29,5700
36 | 36,2010-04-11,2800
37 | 37,2010-07-26,3600
38 | 38,2011-05-04,17200
39 | 39,2013-04-03,6000
40 | 40,2011-10-21,8400
41 | 41,2010-01-11,5600
42 | 42,2012-03-17,6400
43 | 43,2010-07-10,8800
44 | 44,2010-09-22,22100
45 | 45,2012-08-31,4000
46 | 46,2011-06-11,8800
47 | 47,2010-03-08,5400
48 | 48,2010-04-29,8300
49 | 49,2011-02-05,14500
50 | 50,2011-10-24,7500
51 | 51,2011-04-17,7400
52 | 52,2011-03-19,4000
53 | 53,2010-07-02,5300
54 | 54,2010-07-21,15700
55 | 55,2013-08-09,2800
56 | 56,2013-01-14,48900
57 | 57,2011-06-14,4100
58 | 58,2010-07-30,12300
59 | 59,2010-05-13,9100
60 | 60,2013-06-19,7600
61 | 61,2010-03-13,9700
62 | 62,2013-10-15,5000
63 | 63,2012-10-02,4900
64 | 64,2010-06-08,6300
65 | 65,2010-08-02,3300
66 | 66,2010-05-03,8600
67 | 67,2013-08-23,11300
68 | 68,2010-10-03,7300
69 | 69,2010-05-23,5200
70 | 70,2010-03-28,26400
71 | 71,2010-02-05,9300
72 | 72,2010-06-18,6900
73 | 73,2013-07-08,7500
74 | 74,2010-04-04,6600
75 | 75,2011-05-27,8700
76 | 76,2011-03-17,8800
77 | 77,2013-03-03,6500
78 | 78,2012-01-29,6800
79 | 79,2010-07-19,4900
80 | 80,2010-01-13,5600
81 | 81,2013-01-22,7800
82 | 82,2010-07-05,7500
83 | 83,2010-04-17,3200
84 | 84,2010-10-13,16100
85 | 85,2010-06-26,5400
86 | 86,2011-07-04,7500
87 | 87,2010-05-29,2100
88 | 88,2012-02-04,6500
89 | 89,2013-06-15,8400
90 | 90,2010-01-04,3600
91 | 91,2010-09-07,6900
92 | 92,2012-05-19,5700
93 | 93,2010-08-13,15300
94 | 94,2011-05-11,15700
95 | 95,2013-09-23,6100
96 | 96,2011-05-27,14900
97 | 97,2010-03-30,2700
98 | 98,2010-01-15,2900
99 | 99,2013-07-21,12900
100 | 100,2010-07-22,7500
101 | 101,2013-03-10,7100
102 | 102,2010-07-04,9500
103 | 103,2010-01-02,7000
104 | 104,2012-05-02,8700
105 | 105,2013-04-28,8000
106 | 106,2011-04-25,5200
107 | 107,2010-10-23,9200
108 | 108,2010-07-21,5900
109 | 109,2010-07-14,8900
110 | 110,2010-09-10,3400
111 | 111,2012-05-05,6400
112 | 112,2010-10-16,2000
113 | 113,2013-03-31,8200
114 | 114,2013-08-01,8300
115 | 115,2010-04-23,5100
116 | 116,2011-10-16,6100
117 | 117,2010-03-01,3100
118 | 118,2010-06-23,4100
119 | 119,2011-10-17,14400
120 | 120,2013-07-10,3200
121 | 121,2010-06-19,5300
122 | 122,2013-04-25,9100
123 | 123,2010-06-22,3900
124 | 124,2013-09-14,7900
125 | 125,2010-03-08,5100
126 | 126,2010-01-06,8500
127 | 127,2010-08-16,5800
128 | 128,2010-05-27,12800
129 | 129,2010-03-01,14900
130 | 130,2010-08-16,9500
131 | 131,2010-01-24,5400
132 | 132,2010-05-10,6000
133 | 133,2011-01-31,3200
134 | 134,2010-08-12,4300
135 | 135,2012-09-01,6900
136 | 136,2010-08-29,6600
137 | 137,2010-01-20,7400
138 | 138,2012-02-23,4800
139 | 139,2012-09-26,8700
140 | 140,2010-02-23,9100
141 | 141,2011-10-05,5200
142 | 142,2010-04-18,44500
143 | 143,2010-06-28,10800
144 | 144,2010-09-18,12600
145 | 145,2013-08-02,6800
146 | 146,2013-09-28,8500
147 | 147,2011-09-20,19900
148 | 148,2012-09-02,9200
149 | 149,2010-03-19,11200
150 | 150,2012-01-14,3700
151 | 151,2013-02-21,6400
152 | 152,2012-09-28,7500
153 | 153,2010-05-02,5400
154 | 154,2010-03-19,17700
155 | 155,2010-10-13,2700
156 | 156,2010-09-19,9400
157 | 157,2011-08-26,10500
158 | 158,2011-08-29,9800
159 | 159,2011-02-22,18200
160 | 160,2010-03-14,5100
161 | 161,2010-08-23,6900
162 | 162,2010-01-28,11700
163 | 163,2013-07-02,6600
164 | 164,2011-09-22,6700
165 | 165,2010-07-06,7800
166 | 166,2010-01-25,8900
167 | 167,2013-06-02,9400
168 | 168,2013-01-13,2400
169 | 169,2011-03-02,2700
170 | 170,2013-02-24,5300
171 | 171,2010-10-09,5100
172 | 172,2010-09-07,6100
173 | 173,2013-09-13,5200
174 | 174,2013-05-09,4500
175 | 175,2013-09-12,36700
176 | 176,2012-05-04,8800
177 | 177,2010-08-17,12600
178 | 178,2011-08-16,8300
179 | 179,2010-08-11,5300
180 | 180,2010-04-28,8000
181 | 181,2010-04-24,6300
182 | 182,2010-03-01,10400
183 | 183,2010-05-20,6500
184 | 184,2010-01-03,4600
185 | 185,2013-09-21,5300
186 | 186,2010-04-22,7800
187 | 187,2010-08-08,6100
188 | 188,2010-07-14,6000
189 | 189,2011-06-19,6000
190 | 190,2010-01-10,12300
191 | 191,2011-07-27,2400
192 | 192,2012-02-14,12200
193 | 193,2010-02-28,2800
194 | 194,2011-10-14,14400
195 | 195,2012-03-12,3500
196 | 196,2010-04-11,3800
197 | 197,2013-03-13,18000
198 | 198,2010-07-20,41600
199 | 199,2013-10-02,9800
200 | 200,2013-02-05,7100
201 |
--------------------------------------------------------------------------------
/com.homework/lib/je-analysis-1.5.1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bellonor/myHadoopProject/5b7f551dda6daf86489eeed8370d868a90a5cb8e/com.homework/lib/je-analysis-1.5.1.jar
--------------------------------------------------------------------------------
/com.homework/lib/lucene-core-2.3.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bellonor/myHadoopProject/5b7f551dda6daf86489eeed8370d868a90a5cb8e/com.homework/lib/lucene-core-2.3.0.jar
--------------------------------------------------------------------------------
/com.homework/lib/lucene-core-3.1.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bellonor/myHadoopProject/5b7f551dda6daf86489eeed8370d868a90a5cb8e/com.homework/lib/lucene-core-3.1.0.jar
--------------------------------------------------------------------------------
/com.homework/lib/paoding-analysis.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bellonor/myHadoopProject/5b7f551dda6daf86489eeed8370d868a90a5cb8e/com.homework/lib/paoding-analysis.jar
--------------------------------------------------------------------------------
/com.homework/lib/说明:
--------------------------------------------------------------------------------
1 | paoding-analysis.jar only supports lucene-core-3.1.0.jar
2 | je-analysis-1.5.1.jar does not support Lucene 3.0 or later, so:
3 | with paoding you can only use lucene 3.1
4 | with je you can only use lucene 2.3
5 |
--------------------------------------------------------------------------------
/com.homework/pom.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
 3 |   <modelVersion>4.0.0</modelVersion>
 4 |
 5 |   <groupId>com</groupId>
 6 |   <artifactId>com.homework</artifactId>
 7 |   <version>0.0.1-SNAPSHOT</version>
 8 |   <packaging>jar</packaging>
 9 |
10 |   <name>com.homework</name>
11 |   <url>http://maven.apache.org</url>
12 |
13 |   <properties>
14 |     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
15 |     <mahout.version>0.8</mahout.version>
16 |   </properties>
17 |
18 |   <dependencies>
19 |     <dependency>
20 |       <groupId>org.apache.hadoop</groupId>
21 |       <artifactId>hadoop-core</artifactId>
22 |       <version>1.1.2</version>
23 |     </dependency>
24 |     <dependency>
25 |       <groupId>org.apache.mahout</groupId>
26 |       <artifactId>mahout-core</artifactId>
27 |       <version>${mahout.version}</version>
28 |     </dependency>
29 |     <dependency>
30 |       <groupId>org.apache.mahout</groupId>
31 |       <artifactId>mahout-integration</artifactId>
32 |       <version>${mahout.version}</version>
33 |       <exclusions>
34 |         <exclusion>
35 |           <groupId>org.mortbay.jetty</groupId>
36 |           <artifactId>jetty</artifactId>
37 |         </exclusion>
38 |         <exclusion>
39 |           <groupId>org.apache.cassandra</groupId>
40 |           <artifactId>cassandra-all</artifactId>
41 |         </exclusion>
42 |         <exclusion>
43 |           <groupId>me.prettyprint</groupId>
44 |           <artifactId>hector-core</artifactId>
45 |         </exclusion>
46 |       </exclusions>
47 |     </dependency>
48 |     <dependency>
49 |       <groupId>org.apache.hive</groupId>
50 |       <artifactId>hive-service</artifactId>
51 |       <version>0.11.0</version>
52 |     </dependency>
53 |
54 |
55 |     <dependency>
56 |       <groupId>junit</groupId>
57 |       <artifactId>junit</artifactId>
58 |       <version>3.8.1</version>
59 |       <scope>test</scope>
60 |     </dependency>
61 |
62 |
63 |     <dependency>
64 |       <groupId>dom4j</groupId>
65 |       <artifactId>dom4j</artifactId>
66 |       <version>1.6.1</version>
67 |     </dependency>
68 |     <dependency>
69 |       <groupId>jaxen</groupId>
70 |       <artifactId>jaxen</artifactId>
71 |       <version>1.1.6</version>
72 |     </dependency>
73 |   </dependencies>
74 | </project>
75 |
76 |
--------------------------------------------------------------------------------
/com.homework/scripts/clustering/canopy/canopy-mahout.txt:
--------------------------------------------------------------------------------
1 |
2 | Data preparation:
3 | the canopy.dat file:
4 | 8.1 8.1
5 |
6 | 7.1 7.1
7 |
8 | 6.2 6.2
9 |
10 | 7.1 7.1
11 |
12 | 2.1 2.1
13 |
14 | 1.1 1.1
15 |
16 | 0.1 0.1
17 |
18 | 3.0 3.0
19 |
20 | # 1. Convert to vectors; Mahout's InputDriver expects input fields to be space-separated by default
21 | mahout org.apache.mahout.clustering.conversion.InputDriver -i /user/hdfs/canopy/in/canopy.dat -o /user/hdfs/canopy/vecfile -v org.apache.mahout.math.RandomAccessSparseVector
22 | # 2. Run canopy clustering
23 | mahout canopy -i /user/hdfs/canopy/vecfile -o /user/hdfs/canopy/out/result -t1 8 -t2 4 -ow -cl
24 |
25 |
26 | # 3. Inspect the results
27 |
28 | mahout seqdumper -i /user/hdfs/canopy/out/result/clusters-0-final/part-r-00000 -o /home/hadoop/output/result
29 | # Associate each point with its canopy
30 | mahout clusterdump -i /user/hdfs/canopy/out/result/clusters-0-final/part-r-00000 -o /home/hadoop/output/result -p /user/hdfs/canopy/out/result/clusteredPoints
31 |
32 |
33 | C-0{n=2 c=[6.888, 6.888] r=[0.237, 0.237]}
34 | Weight : [props - optional]: Point:
35 | 1.0: [8.100, 8.100]
36 | 1.0: [7.100, 7.100]
37 | 1.0: [6.200, 6.200]
38 | 1.0: [7.100, 7.100]
39 | C-1{n=2 c=[1.083, 1.083] r=[0.983, 0.983]}
40 | Weight : [props - optional]: Point:
41 | 1.0: [2.100, 2.100]
42 | 1.0: [1.100, 1.100]
43 | 1.0: [3.000, 3.000]
44 | C-2{n=1 c=[0.100, 0.100] r=[]}
45 | Weight : [props - optional]: Point:
46 | 1.0: [0.100, 0.100]
--------------------------------------------------------------------------------
/com.homework/scripts/clustering/canopy/canopy.dat:
--------------------------------------------------------------------------------
1 | 8.1 8.1
2 | 7.1 7.1
3 | 6.2 6.2
4 | 7.1 7.1
5 | 2.1 2.1
6 | 1.1 1.1
7 | 0.1 0.1
8 | 3.0 3.0
--------------------------------------------------------------------------------
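
The rule the canopy command above applies: pick any remaining point as a center; points within the loose radius T1 join that canopy, and points within the tight radius T2 are removed as future centers. A single-machine Java sketch over the canopy.dat points with T1=8, T2=4 (illustrative only; Mahout's distributed run may pick centers in a different order):

import java.util.ArrayList;
import java.util.List;

public class CanopySketch {
    public static void main(String[] args) {
        double t1 = 8, t2 = 4;
        List<double[]> points = new ArrayList<>(List.of(
            new double[]{8.1,8.1}, new double[]{7.1,7.1}, new double[]{6.2,6.2},
            new double[]{7.1,7.1}, new double[]{2.1,2.1}, new double[]{1.1,1.1},
            new double[]{0.1,0.1}, new double[]{3.0,3.0}));
        while (!points.isEmpty()) {
            double[] center = points.remove(0);      // next canopy center
            List<double[]> canopy = new ArrayList<>();
            canopy.add(center);
            points.removeIf(p -> {
                double d = Math.hypot(p[0] - center[0], p[1] - center[1]);
                if (d < t1) canopy.add(p);           // inside T1: joins the canopy
                return d < t2;                       // inside T2: cannot seed a new one
            });
            System.out.println("canopy at (" + center[0] + "," + center[1]
                + ") with " + canopy.size() + " points");
        }
    }
}
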
/com.homework/scripts/fp-growth/fpg-mahout.txt:
--------------------------------------------------------------------------------
1 | mahout fpg -i /user/hdfs/fp-growth/in/fpg.txt -o /user/hdfs/fp-growth/out -k 50 -method mapreduce -regex '[\ ]' -s 2
2 | Week 13 homework
3 | mahout fpg -i /user/hdfs/week13/user2items.csv -o /user/hdfs/week13/out -k 50 -method mapreduce -regex '[\ ]' -s 4
4 | View the results
5 |
6 | mahout seqdumper -i /user/hdfs/fp-growth/out/frequentpatterns/part-r-00000
7 | Results:
8 | Key: I1: Value: ([I1],6), ([I2, I1],4), ([I1, I3],4), ([I2, I1, I5],2), ([I2, I1, I3],2)
9 | Key: I2: Value: ([I2],7), ([I2, I3],4), ([I2, I1],4), ([I2, I1, I5],2), ([I2, I1, I3],2), ([I2, I4],2)
10 | Key: I3: Value: ([I3],6), ([I2, I3],4), ([I1, I3],4), ([I2, I1, I3],2)
11 | Key: I4: Value: ([I2, I4],2)
12 | Key: I5: Value: ([I2, I1, I5],2)
13 | Count: 5
14 | View the fpgrowth output
15 | mahout seqdumper -i /user/hdfs/fp-growth/out/fpgrowth/part-r-00000
16 | Key: I2: Value: ([I2],7)
17 | Key: I1: Value: ([I1],6), ([I2, I1],4)
18 | Key: I3: Value: ([I3],6), ([I2, I3],4), ([I1, I3],4), ([I2, I1, I3],2)
19 | Key: I4: Value: ([I2, I4],2)
20 | Key: I5: Value: ([I2, I1, I5],2)
21 | Count: 5
22 | View the fList
23 | mahout seqdumper -i /user/hdfs/fp-growth/out/fList
24 | Key: I2: Value: 7
25 | Key: I1: Value: 6
26 | Key: I3: Value: 6
27 | Key: I4: Value: 2
28 | Key: I5: Value: 2
29 | Count: 5
--------------------------------------------------------------------------------
/com.homework/scripts/fp-growth/fpg.txt:
--------------------------------------------------------------------------------
1 | I1 I2 I5
2 | I2 I4
3 | I2 I3
4 | I1 I2 I4
5 | I1 I3
6 | I2 I3
7 | I1 I3
8 | I1 I2 I3 I5
9 | I1 I2 I3
--------------------------------------------------------------------------------
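
The fList shown in fpg-mahout.txt is just the per-item support count over these nine transactions. A small Java sketch that recomputes it (illustrative, not project code):

import java.util.HashMap;
import java.util.Map;

public class FListSketch {
    public static void main(String[] args) {
        String[] transactions = {
            "I1 I2 I5", "I2 I4", "I2 I3", "I1 I2 I4", "I1 I3",
            "I2 I3", "I1 I3", "I1 I2 I3 I5", "I1 I2 I3"};
        Map<String,Integer> support = new HashMap<>();
        for (String t : transactions)
            for (String item : t.split(" "))
                support.merge(item, 1, Integer::sum);   // one count per transaction
        support.forEach((k, v) -> System.out.println("Key: " + k + ": Value: " + v));
        // Matches the fList above: I2=7, I1=6, I3=6, I4=2, I5=2
    }
}
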
/com.homework/scripts/hive/HiveJDBC.java:
--------------------------------------------------------------------------------
1 | package com.hive.jdbc;
2 | import java.sql.Connection;
3 | import java.sql.DriverManager;
4 | import java.sql.ResultSet;
5 | import java.sql.Statement;
6 |
7 | public class HiveJDBC {
8 |
9 | public static void main(String[] args) {
10 | try {
11 | Class.forName("org.apache.hadoop.hive.jdbc.HiveDriver");
12 | // The query to run
13 | String querySQL = "SELECT * FROM t_rp";
14 | // Connect to Hive
15 | Connection con = DriverManager.getConnection("jdbc:hive://192.168.0.100:10000/default", "hive", "hive");
16 | Statement stmt = con.createStatement();
17 | // Execute the query
18 | ResultSet res = stmt.executeQuery(querySQL);
19 | while (res.next()) {
20 | System.out.println("Result: key:" + res.getString(1) + " –> value:" + res.getString(2));
21 | }
22 | stmt.close();
23 | con.close();
24 | } catch (Exception e) {
25 | e.printStackTrace();
26 | }
27 | }
28 |
29 |
30 | }
31 | /*
32 | import java.sql.SQLException;
33 | import java.sql.Connection;
34 | import java.sql.ResultSet;
35 | import java.sql.Statement;
36 | import java.sql.DriverManager;
37 |
38 | public class HiveJdbcClient {
39 | private static String driverName = "org.apache.hadoop.hive.jdbc.HiveDriver";
40 |
41 | *//**
42 | * @param args
43 | * @throws SQLException
44 | *//*
45 | public static void main(String[] args) throws SQLException {
46 | try {
47 | Class.forName(driverName);
48 | } catch (ClassNotFoundException e) {
49 | // TODO Auto-generated catch block
50 | e.printStackTrace();
51 | System.exit(1);
52 | }
53 | Connection con = DriverManager.getConnection("jdbc:hive://localhost:10000/default", "", "");
54 | Statement stmt = con.createStatement();
55 | String tableName = "testHiveDriverTable";
56 | stmt.executeQuery("drop table " + tableName);
57 | ResultSet res = stmt.executeQuery("create table " + tableName + " (key int, value string)");
58 | // show tables
59 | String sql = "show tables '" + tableName + "'";
60 | System.out.println("Running: " + sql);
61 | res = stmt.executeQuery(sql);
62 | if (res.next()) {
63 | System.out.println(res.getString(1));
64 | }
65 | // describe table
66 | sql = "describe " + tableName;
67 | System.out.println("Running: " + sql);
68 | res = stmt.executeQuery(sql);
69 | while (res.next()) {
70 | System.out.println(res.getString(1) + "\t" + res.getString(2));
71 | }
72 |
73 | // load data into table
74 | // NOTE: filepath has to be local to the hive server
75 | // NOTE: /tmp/a.txt is a ctrl-A separated file with two fields per line
76 | String filepath = "/tmp/a.txt";
77 | sql = "load data local inpath '" + filepath + "' into table " + tableName;
78 | System.out.println("Running: " + sql);
79 | res = stmt.executeQuery(sql);
80 |
81 | // select * query
82 | sql = "select * from " + tableName;
83 | System.out.println("Running: " + sql);
84 | res = stmt.executeQuery(sql);
85 | while (res.next()) {
86 | System.out.println(String.valueOf(res.getInt(1)) + "\t" + res.getString(2));
87 | }
88 |
89 | // regular hive query
90 | sql = "select count(1) from " + tableName;
91 | System.out.println("Running: " + sql);
92 | res = stmt.executeQuery(sql);
93 | while (res.next()) {
94 | System.out.println(res.getString(1));
95 | }
96 | }
97 | }*/
--------------------------------------------------------------------------------
/com.homework/scripts/week10/1.pig:
--------------------------------------------------------------------------------
1 | -- Friend recommendations for user 1
2 | -- Dataguru Hadoop Course
3 | -- Code by James
4 |
5 | -- Load Data
6 | data1 = LOAD '/user/hdfs/week10/karate.csv' AS ( source, target );
7 |
8 | data2 = LOAD '/user/hdfs/week10/karate.csv' AS ( source, target );
9 |
10 | -- Mine the common friends
11 | common_jnd = JOIN data1 BY target, data2 BY target;
12 |
13 | common_prj = FOREACH common_jnd GENERATE data1::target AS common_friend, data1::source AS user, data2::source AS candidate;
14 |
15 | common_flt = FILTER common_prj BY user != candidate;
16 | common_grp = GROUP common_flt BY (user,candidate);-- this statement is for testing only
17 | common = FOREACH ( GROUP common_flt BY (user,candidate) ) GENERATE FLATTEN(group) AS (user,candidate), COUNT(common_flt) AS cnt;
18 |
19 | -- Recommendation
20 | user = FOREACH ( GROUP common BY user )
21 | {
22 | candidate_srt = ORDER common BY cnt DESC;
23 | candidate_lim = LIMIT candidate_srt 5;
24 | GENERATE FLATTEN(candidate_lim) AS ( user, candidate, cnt );
25 | }
26 |
27 | STORE user INTO '/user/hdfs/week10/result_1';
--------------------------------------------------------------------------------
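
What the self-join in these Pig scripts computes: each target (followed user) yields one (user, candidate) pair per ordered pair of its distinct followers, so after GROUP/COUNT the pair count is the number of common friends. A toy in-memory Java equivalent of the JOIN, FILTER user != candidate, and COUNT steps (edge data and names are illustrative):

import java.util.*;

public class CommonFriendSketch {
    public static void main(String[] args) {
        int[][] edges = {{2,1},{3,1},{3,2},{4,1},{4,2},{4,3}};   // source -> target
        Map<Integer,List<Integer>> byTarget = new HashMap<>();   // JOIN ... BY target
        for (int[] e : edges)
            byTarget.computeIfAbsent(e[1], k -> new ArrayList<>()).add(e[0]);
        Map<String,Integer> cnt = new TreeMap<>();
        for (List<Integer> followers : byTarget.values())
            for (int u : followers)
                for (int c : followers)
                    if (u != c)                                  // FILTER user != candidate
                        cnt.merge(u + "," + c, 1, Integer::sum); // GROUP + COUNT
        cnt.forEach((pair, n) -> System.out.println("(" + pair + "," + n + ")"));
    }
}
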
/com.homework/scripts/week10/common_friend.pig:
--------------------------------------------------------------------------------
1 | -- Dataguru Hadoop Course
2 | -- Code by James
3 |
4 | -- Load Data
5 | data1 = LOAD '/user/huangjun/dataguru/wiki-Vote' AS ( source, target );
6 |
7 | data2 = LOAD '/user/huangjun/dataguru/wiki-Vote' AS ( source, target );
8 |
9 | -- Mine the common friends
10 | common_jnd = JOIN data1 BY target, data2 BY target;
11 |
12 | common_prj = FOREACH common_jnd GENERATE data1::target AS common_friend, data1::source AS user, data2::source AS candidate;
13 |
14 | common_flt = FILTER common_prj BY user != candidate;
15 |
16 | common = FOREACH ( GROUP common_flt BY (user,candidate) ) GENERATE FLATTEN(group) AS (user,candidate), COUNT(common_flt) AS cnt;
17 |
18 | -- Recommendation
19 | user = FOREACH ( GROUP common BY user )
20 | {
21 | candidate_srt = ORDER common BY cnt DESC;
22 | candidate_lim = LIMIT candidate_srt 5;
23 | GENERATE FLATTEN(candidate_lim) AS ( user, candidate, cnt );
24 | }
25 |
26 | STORE user INTO '/user/huangjun/dataguru/result';
--------------------------------------------------------------------------------
/com.homework/scripts/week10/karate.csv:
--------------------------------------------------------------------------------
1 | Source Target
2 | 2 1
3 | 3 1
4 | 3 2
5 | 4 1
6 | 4 2
7 | 4 3
8 | 5 1
9 | 6 1
10 | 7 1
11 | 7 5
12 | 7 6
13 | 8 1
14 | 8 2
15 | 8 3
16 | 8 4
17 | 9 1
18 | 9 3
19 | 10 3
20 | 11 1
21 | 11 5
22 | 11 6
23 | 12 1
24 | 13 1
25 | 13 4
26 | 14 1
27 | 14 2
28 | 14 3
29 | 14 4
30 | 17 6
31 | 17 7
32 | 18 1
33 | 18 2
34 | 20 1
35 | 20 2
36 | 22 1
37 | 22 2
38 | 26 24
39 | 26 25
40 | 28 3
41 | 28 24
42 | 28 25
43 | 29 3
44 | 30 24
45 | 30 27
46 | 31 2
47 | 31 9
48 | 32 1
49 | 32 25
50 | 32 26
51 | 32 29
52 | 33 3
53 | 33 9
54 | 33 15
55 | 33 16
56 | 33 19
57 | 33 21
58 | 33 23
59 | 33 24
60 | 33 30
61 | 33 31
62 | 33 32
63 | 34 9
64 | 34 10
65 | 34 14
66 | 34 15
67 | 34 16
68 | 34 19
69 | 34 20
70 | 34 21
71 | 34 23
72 | 34 24
73 | 34 27
74 | 34 28
75 | 34 29
76 | 34 30
77 | 34 31
78 | 34 32
79 | 34 33
--------------------------------------------------------------------------------
/com.homework/scripts/week10/w10.pig:
--------------------------------------------------------------------------------
1 | -- Dataguru Hadoop Course
2 | -- Code by James
3 |
4 | -- Load Data
5 | data1 = LOAD '/user/hdfs/week10/noway' AS ( source, target );
6 |
7 | data2 = LOAD '/user/hdfs/week10/noway' AS ( source, target );
8 |
9 | -- Mine the common friends
10 | common_jnd = JOIN data1 BY target, data2 BY target;
11 |
12 | common_prj = FOREACH common_jnd GENERATE data1::target AS common_friend, data1::source AS user, data2::source AS candidate;
13 |
14 | common_flt = FILTER common_prj BY user != candidate;
15 | -- common_grp = GROUP common_flt BY (user,candidate);-- this statement is for testing only
16 | common = FOREACH ( GROUP common_flt BY (user,candidate) ) GENERATE FLATTEN(group) AS (user,candidate), COUNT(common_flt) AS cnt;
17 |
18 | -- Recommendation
19 | user = FOREACH ( GROUP common BY user )
20 | {
21 | candidate_srt = ORDER common BY cnt DESC;
22 | candidate_lim = LIMIT candidate_srt 5;
23 | GENERATE FLATTEN(candidate_lim) AS ( user, candidate, cnt );
24 | }
25 |
26 | STORE user INTO '/user/hdfs/week10/noway_out/';
--------------------------------------------------------------------------------
/com.homework/scripts/week10/杂文件/karate2.csv:
--------------------------------------------------------------------------------
1 | Source Target
2 | 1 2
3 | 2 1
4 | 3 1
5 | 3 2
6 | 4 1
7 | 4 2
8 | 4 3
9 | 5 1
10 | 6 1
11 | 7 1
12 | 7 5
13 | 7 6
14 | 8 1
15 | 8 2
16 | 8 3
17 | 8 4
18 | 9 1
19 | 9 3
20 | 10 3
21 | 11 1
22 | 11 5
23 | 11 6
24 | 12 1
25 | 13 1
26 | 13 4
27 | 14 1
28 | 14 2
29 | 14 3
30 | 14 4
31 | 17 6
32 | 17 7
33 | 18 1
34 | 18 2
35 | 20 1
36 | 20 2
37 | 22 1
38 | 22 2
39 | 26 24
40 | 26 25
41 | 28 3
42 | 28 24
43 | 28 25
44 | 29 3
45 | 30 24
46 | 30 27
47 | 31 2
48 | 31 9
49 | 32 1
50 | 32 25
51 | 32 26
52 | 32 29
53 | 33 3
54 | 33 9
55 | 33 15
56 | 33 16
57 | 33 19
58 | 33 21
59 | 33 23
60 | 33 24
61 | 33 30
62 | 33 31
63 | 33 32
64 | 34 9
65 | 34 10
66 | 34 14
67 | 34 15
68 | 34 16
69 | 34 19
70 | 34 20
71 | 34 21
72 | 34 23
73 | 34 24
74 | 34 27
75 | 34 28
76 | 34 29
77 | 34 30
78 | 34 31
79 | 34 32
80 | 34 33
--------------------------------------------------------------------------------
/com.homework/scripts/week10/杂文件/karate2.csv.bak:
--------------------------------------------------------------------------------
1 | Source Target
2 | 2 1
3 | 3 1
4 | 3 2
5 | 4 1
6 | 4 2
7 | 4 3
8 | 5 1
9 | 6 1
10 | 7 1
11 | 7 5
12 | 7 6
13 | 8 1
14 | 8 2
15 | 8 3
16 | 8 4
17 | 9 1
18 | 9 3
19 | 10 3
20 | 11 1
21 | 11 5
22 | 11 6
23 | 12 1
24 | 13 1
25 | 13 4
26 | 14 1
27 | 14 2
28 | 14 3
29 | 14 4
30 | 17 6
31 | 17 7
32 | 18 1
33 | 18 2
34 | 20 1
35 | 20 2
36 | 22 1
37 | 22 2
38 | 26 24
39 | 26 25
40 | 28 3
41 | 28 24
42 | 28 25
43 | 29 3
44 | 30 24
45 | 30 27
46 | 31 2
47 | 31 9
48 | 32 1
49 | 32 25
50 | 32 26
51 | 32 29
52 | 33 3
53 | 33 9
54 | 33 15
55 | 33 16
56 | 33 19
57 | 33 21
58 | 33 23
59 | 33 24
60 | 33 30
61 | 33 31
62 | 33 32
63 | 34 9
64 | 34 10
65 | 34 14
66 | 34 15
67 | 34 16
68 | 34 19
69 | 34 20
70 | 34 21
71 | 34 23
72 | 34 24
73 | 34 27
74 | 34 28
75 | 34 29
76 | 34 30
77 | 34 31
78 | 34 32
79 | 34 33
--------------------------------------------------------------------------------
/com.homework/scripts/week10/杂文件/mytest.txt:
--------------------------------------------------------------------------------
1 | Source Target
2 | 1 3
3 | 1 4
4 | 2 3
5 | 2 4
6 | 2 1
7 | 3 1
8 | 3 2
9 | 4 1
10 | 4 2
11 | 4 3
--------------------------------------------------------------------------------
/com.homework/scripts/week10/杂文件/noway:
--------------------------------------------------------------------------------
1 | Source Target
2 | 1 3
3 | 1 4
4 | 2 3
5 | 2 4
6 | 3 1
7 | 3 2
8 | 4 1
9 | 4 2
10 |
--------------------------------------------------------------------------------
/com.homework/scripts/week10/杂文件/tes2.txt:
--------------------------------------------------------------------------------
1 | Source Target
2 | 2 3
3 | 2 4
4 | 2 1
5 | 3 1
6 | 3 2
7 | 4 1
8 | 4 2
9 | 4 3
--------------------------------------------------------------------------------
/com.homework/scripts/week10/计算33的好友推荐(不关注别人的没有推荐)/common.java.bak:
--------------------------------------------------------------------------------
1 | class common
2 | {
3 | public static void main(String[] args)
4 | {
5 | System.out.println("Hello World!");
6 | }
7 | }
8 |
--------------------------------------------------------------------------------
/com.homework/scripts/week10/计算33的好友推荐(不关注别人的没有推荐)/pig.pig:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bellonor/myHadoopProject/5b7f551dda6daf86489eeed8370d868a90a5cb8e/com.homework/scripts/week10/计算33的好友推荐(不关注别人的没有推荐)/pig.pig
--------------------------------------------------------------------------------
/com.homework/scripts/week10/计算33的好友推荐(不关注别人的没有推荐)/user.java:
--------------------------------------------------------------------------------
1 | (10,33,1)
2 | (10,29,1)
3 | (10,28,1)
4 | (10,14,1)
5 | (10,9,1)
6 | (11,7,3)
7 | (11,12,1)
8 | (11,8,1)
9 | (11,6,1)
10 | (11,5,1)
11 | (12,4,1)
12 | (12,2,1)
13 | (12,3,1)
14 | (12,32,1)
15 | (12,22,1)
16 | (13,8,2)
17 | (13,14,2)
18 | (13,22,1)
19 | (13,18,1)
20 | (13,12,1)
21 | (14,8,4)
22 | (14,4,3)
23 | (14,22,2)
24 | (14,20,2)
25 | (14,18,2)
26 | (17,11,1)
27 | (17,7,1)
28 | (18,8,2)
29 | (18,4,2)
30 | (18,3,2)
31 | (18,22,2)
32 | (18,20,2)
33 | (2,3,1)
34 | (2,4,1)
35 | (2,5,1)
36 | (2,6,1)
37 | (2,7,1)
38 | (20,3,2)
39 | (20,22,2)
40 | (20,18,2)
41 | (20,14,2)
42 | (20,8,2)
43 | (22,8,2)
44 | (22,4,2)
45 | (22,3,2)
46 | (22,20,2)
47 | (22,14,2)
48 | (26,28,2)
49 | (26,34,1)
50 | (26,30,1)
51 | (26,32,1)
52 | (26,33,1)
53 | (28,33,2)
54 | (28,26,2)
55 | (28,29,1)
56 | (28,14,1)
57 | (28,10,1)
58 | (29,8,1)
59 | (29,4,1)
60 | (29,10,1)
61 | (29,33,1)
62 | (29,14,1)
63 | (3,20,2)
64 | (3,18,2)
65 | (3,14,2)
66 | (3,4,2)
67 | (3,8,2)
68 | (30,34,2)
69 | (30,33,1)
70 | (30,28,1)
71 | (30,26,1)
72 | (31,18,1)
73 | (31,22,1)
74 | (31,33,1)
75 | (31,34,1)
76 | (31,20,1)
77 | (32,11,1)
78 | (32,13,1)
79 | (32,14,1)
80 | (32,18,1)
81 | (32,20,1)
82 | (33,34,10)
83 | (33,28,2)
84 | (33,29,1)
85 | (33,4,1)
86 | (33,8,1)
87 | (34,33,10)
88 | (34,30,2)
89 | (34,28,1)
90 | (34,26,1)
91 | (34,32,1)
92 | (4,8,3)
93 | (4,14,3)
94 | (4,18,2)
95 | (4,20,2)
96 | (4,22,2)
97 | (5,7,1)
98 | (5,9,1)
99 | (5,11,1)
100 | (5,3,1)
101 | (5,13,1)
102 | (6,8,1)
103 | (6,9,1)
104 | (6,11,1)
105 | (6,12,1)
106 | (6,13,1)
107 | (7,11,3)
108 | (7,8,1)
109 | (7,2,1)
110 | (7,3,1)
111 | (7,4,1)
112 | (8,14,4)
113 | (8,4,3)
114 | (8,18,2)
115 | (8,20,2)
116 | (8,22,2)
117 | (9,8,2)
118 | (9,4,2)
119 | (9,14,2)
120 | (9,20,1)
121 | (9,3,1)
--------------------------------------------------------------------------------
/com.homework/scripts/week13/week13:
--------------------------------------------------------------------------------
1 |
2 | Week 13 homework
3 | mahout fpg -i /user/hdfs/week13/in/user2items2.csv -o /user/hdfs/week13/out -k 50 -method mapreduce -regex '[\ ]' -s 4
4 | View the results
5 |
6 | mahout seqdumper -i /user/hdfs/week13/out/frequentpatterns/part-r-00000
7 | Results:
8 | Key: I1: Value: ([I1],6), ([I2, I1],4), ([I1, I3],4), ([I2, I1, I5],2), ([I2, I1, I3],2)
9 | Key: I2: Value: ([I2],7), ([I2, I3],4), ([I2, I1],4), ([I2, I1, I5],2), ([I2, I1, I3],2), ([I2, I4],2)
10 | Key: I3: Value: ([I3],6), ([I2, I3],4), ([I1, I3],4), ([I2, I1, I3],2)
11 | Key: I4: Value: ([I2, I4],2)
12 | Key: I5: Value: ([I2, I1, I5],2)
13 | Count: 5
14 | View the fpgrowth output
15 | mahout seqdumper -i /user/hdfs/week13/out/fpgrowth/part-r-00000
16 | Key: I2: Value: ([I2],7)
17 | Key: I1: Value: ([I1],6), ([I2, I1],4)
18 | Key: I3: Value: ([I3],6), ([I2, I3],4), ([I1, I3],4), ([I2, I1, I3],2)
19 | Key: I4: Value: ([I2, I4],2)
20 | Key: I5: Value: ([I2, I1, I5],2)
21 | Count: 5
22 | View the fList
23 | mahout seqdumper -i /user/hdfs/fp-growth/out/fList
24 | Key: I2: Value: 7
25 | Key: I1: Value: 6
26 | Key: I3: Value: 6
27 | Key: I4: Value: 2
28 | Key: I5: Value: 2
29 | Count: 5
--------------------------------------------------------------------------------
/com.homework/scripts/week8.rar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bellonor/myHadoopProject/5b7f551dda6daf86489eeed8370d868a90a5cb8e/com.homework/scripts/week8.rar
--------------------------------------------------------------------------------
/com.homework/scripts/week8/homework.txt:
--------------------------------------------------------------------------------
1 | --1. Tokenize the samples
2 | hadoop jar /home/hadoop/in/tokenize.jar /user/hdfs/week8/homework/sport /user/hdfs/week8/homework/sport-out
3 |
4 | --2. Split the samples: 80% training set, 20% test set
5 | processed= load '/user/hdfs/week8/homework/sport-out/part-r-00000' as (category:chararray,doc:chararray);
6 | test = sample processed 0.2;
7 | tfull= JOIN processed BY (category,doc) LEFT OUTER,test BY (category,doc);
8 | t8= filter tfull BY test::category is null;
9 | train= foreach t8 generate processed::category as category,processed::doc as doc;
10 | store test into '/user/hdfs/week8/homework/test';
11 | store train into '/user/hdfs/week8/homework/train';
12 | --Check the split
13 | test_count = foreach ( group test by category) generate group,COUNT(test.category);
14 | DUMP test_count;
15 | train_count = foreach ( group train by category) generate group,COUNT(train.category);
16 | DUMP train_count;
17 |
18 | --3. Train on the training set, then test
19 | --works with mahout 0.6; 0.8 does not work
20 | --a.bayes
21 | mahout trainclassifier -i /user/hdfs/week8/homework/train -o /user/hdfs/week8/homework/model-bayes -type bayes -ng 1 -source hdfs
22 | mahout testclassifier -d /user/hdfs/week8/homework/test -m /user/hdfs/week8/homework/model-bayes -type bayes -ng 1 -source hdfs -method mapreduce
23 | --b.cbayes
24 | mahout trainclassifier -i /user/hdfs/week8/homework/train -o /user/hdfs/week8/homework/model-cbayes -type cbayes -ng 1 -source hdfs
25 | mahout testclassifier -d /user/hdfs/week8/homework/test -m /user/hdfs/week8/homework/model-cbayes -type cbayes -ng 1 -source hdfs -method mapreduce
26 |
27 | --Real data run
28 | --Tokenize
29 | hadoop jar /home/hadoop/in/tokenize.jar /user/hdfs/week8/homework/user-sport /user/hdfs/week8/homework/user-sport-out
30 | --Run: on Win7, from Eclipse, with these arguments:
31 | cbayes:
32 | hdfs://192.168.0.100:9000/user/hdfs/week8/homework/user-sport-out hdfs://192.168.0.100:9000/user/hdfs/week8/homework/result_cbayes hdfs://192.168.0.100:9000/user/hdfs/week8/homework/model-cbayes cbayes
33 | bayes:
34 | hdfs://192.168.0.100:9000/user/hdfs/week8/homework/user-sport-out hdfs://192.168.0.100:9000/user/hdfs/week8/homework/result_bayes hdfs://192.168.0.100:9000/user/hdfs/week8/homework/model-bayes bayes
35 | --Take the per-user maximum: the category browsed most often, i.e. the user's preference
36 | bayes:
37 | user_count= load '/user/hdfs/week8/homework/result_bayes/part-r-00000' using PigStorage('|') AS (userid:chararray,category:chararray,times:int);
38 | result = foreach (group user_count by userid) {
39 | sorted = order user_count by times desc;
40 | top1= limit sorted 1;
41 | generate flatten(top1),SUM(user_count.times);
42 | };
43 | DUMP result;
44 | store result into '/user/hdfs/week8/homework/final_result_bayes';
45 | cbayes:
46 | user_count= load '/user/hdfs/week8/homework/result_cbayes/part-r-00000' using PigStorage('|') AS (userid:chararray,category:chararray,times:int);
47 | result = foreach (group user_count by userid) {
48 | sorted = order user_count by times desc;
49 | top1= limit sorted 1;
50 | generate flatten(top1),SUM(user_count.times);
51 | };
52 | DUMP result;
53 | store result into '/user/hdfs/week8/homework/final_result_cbayes';
54 |
--------------------------------------------------------------------------------
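
The SAMPLE + LEFT OUTER JOIN + "is null" idiom above is set subtraction: train = processed minus test (note the join on (category,doc) also collapses duplicate rows). A plain-Java sketch of the same 80/20 split; records and the seed are illustrative:

import java.util.*;

public class SplitSketch {
    public static void main(String[] args) {
        List<String> processed = List.of("sports doc1", "sports doc2",
            "finance doc3", "finance doc4", "news doc5");
        List<String> test = new ArrayList<>(), train = new ArrayList<>();
        Random rnd = new Random(42);
        for (String rec : processed)
            (rnd.nextDouble() < 0.2 ? test : train).add(rec);  // ~20% goes to test
        System.out.println("test=" + test);
        System.out.println("train=" + train);
    }
}
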
/com.homework/scripts/week8/week8.pig:
--------------------------------------------------------------------------------
1 | processed= load '/user/hdfs/week8/teacher/in/processed' as (category:chararray,doc:chararray);
2 | test = sample processed 0.2;
3 |
4 | --For testing
5 | processed= load '/user/mypig/lefta.txt' as (a1:chararray,a2:chararray,a3:chararray);
6 | test = sample processed 0.2;
7 |
8 | tfull= JOIN processed BY (a1,a2,a3) LEFT OUTER,test BY (a1,a2,a3);
9 | t8= filter tfull BY test::a1 is null;
10 | train= foreach t8 generate processed::a1 as a1,processed::a2 as a2,processed::a3 as a3;
11 | store test into '/user/mypig/test';
12 | store train into '/user/mypig/train';
13 | --Production run
14 | processed= load '/user/hdfs/week8/teacher/in/processed' as (category:chararray,doc:chararray);
15 | test = sample processed 0.2;
16 | tfull= JOIN processed BY (category,doc) LEFT OUTER,test BY (category,doc);
17 | t8= filter tfull BY test::category is null;
18 | train= foreach t8 generate processed::category as category,processed::doc as doc;
19 | store test into '/user/hdfs/week8/teacher/test';
20 | store train into '/user/hdfs/week8/teacher/train';
21 | --Counts
22 | test_count = foreach ( group test by category) generate group,COUNT(test.category);
23 | DUMP test_count;
24 | train_count = foreach ( group train by category) generate group,COUNT(train.category);
25 | DUMP train_count;
26 | --works with mahout 0.6; 0.8 does not work
27 | mahout trainclassifier -i /user/hdfs/week8/teacher/train -o /user/hdfs/week8/model-bayes -type bayes -ng 1 -source hdfs
28 |
29 | mahout testclassifier -d /user/hdfs/week8/teacher/test -m /user/hdfs/week8/model-bayes -type bayes -ng 1 -source hdfs -method mapreduce
30 |
31 |
32 |
33 |
34 |
--------------------------------------------------------------------------------
/com.homework/scripts/week8/week8.txt:
--------------------------------------------------------------------------------
1 | --1. Tokenize
2 | hadoop jar /home/hadoop/in/tokenize.jar /user/hdfs/week7/in /user/hdfs/week7/out
3 |
4 | --2. Split the samples: 80% training set, 20% test set
5 | processed= load '/user/hdfs/week8/mine/in/processed' as (category:chararray,doc:chararray);
6 | test = sample processed 0.2;
7 | tfull= JOIN processed BY (category,doc) LEFT OUTER,test BY (category,doc);
8 | t8= filter tfull BY test::category is null;
9 | train= foreach t8 generate processed::category as category,processed::doc as doc;
10 | store test into '/user/hdfs/week8/mine/test';
11 | store train into '/user/hdfs/week8/mine/train';
12 | --Check the split
13 | test_count = foreach ( group test by category) generate group,COUNT(test.category);
14 | DUMP test_count;
15 | train_count = foreach ( group train by category) generate group,COUNT(train.category);
16 | DUMP train_count;
17 |
18 | --3. Train on the training set, then test
19 | --works with mahout 0.6; 0.8 does not work
20 | --a.bayes
21 | mahout trainclassifier -i /user/hdfs/week8/mine/train -o /user/hdfs/week8/mine/model-bayes -type bayes -ng 1 -source hdfs
22 | mahout testclassifier -d /user/hdfs/week8/mine/test -m /user/hdfs/week8/mine/model-bayes -type bayes -ng 1 -source hdfs -method mapreduce
23 | --b.cbayes
24 | mahout trainclassifier -i /user/hdfs/week8/mine/train -o /user/hdfs/week8/mine/model-cbayes -type cbayes -ng 1 -source hdfs
25 | mahout testclassifier -d /user/hdfs/week8/mine/test -m /user/hdfs/week8/mine/model-cbayes -type cbayes -ng 1 -source hdfs -method mapreduce
26 |
27 | --Test with user data
28 | --Tokenize
29 | hadoop jar /home/hadoop/in/tokenize.jar /user/hdfs/week8/mine/user /user/hdfs/week8/mine/user-out
30 | --Run with these arguments:
31 | hdfs://192.168.0.100:9000/user/hdfs/week8/mine/user-out hdfs://192.168.0.100:9000/user/hdfs/week8/mine/user-output hdfs://192.168.0.100:9000/user/hdfs/week8/mine/model-cbayes cbayes
32 |
33 |
34 |
--------------------------------------------------------------------------------
/com.homework/scripts/week9/pagerank.r:
--------------------------------------------------------------------------------
1 | #pages<-read.csv("page",header=FALSE);
2 | pages<-read.csv("people.csv",header=FALSE);
3 | # Build the adjacency matrix (square):
4 | mrow<-max(pages)
5 | A<-matrix(0,nrow=mrow,ncol=mrow);
6 | #cols=length(pages[1,]);
7 | rows=length(pages[,1]);
8 | for(i in 1:rows){
9 | p1<-pages[i,1];
10 | p2<-pages[i,2];
11 | A[p2,p1]<-1;
12 | }
13 |
14 |
15 | # With damping (d = 0.85)
16 | csum<-colSums(A);
17 | csum[csum==0] <- 1;
18 | Arow=nrow(A);
19 | d<-0.85;
20 | de<-1-d/Arow;
21 | delta <- (1-d)/Arow;
22 | B <- matrix(delta,nrow(A),ncol(A));
23 | for (i in 1:Arow) B[i,] <- B[i,] + d*A[i,]/csum;
24 | # Iterate to find the dominant eigenvector
25 | x <- rep(1,Arow);
26 | for (i in 1:100) x <- B %*% x
27 | x/sum(x)
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 | # Convert to a probability (transition) matrix, ignoring damping
39 | csum<-colSums(A);
40 | csum[csum==0] <- 1;
41 | Arow=nrow(A);
42 | for(i in 1:Arow){
43 | A[i,]<-A[i,]/csum;
44 | }
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 | # Power-method eigenvector, ignoring damping
60 | x <- rep(1,Arow);
61 | for (i in 1:10) x <- A %*% x
62 | # Normalize by the sum
63 | x/sum(x);
64 |
65 |
66 |
67 |
68 |
--------------------------------------------------------------------------------
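
The damped iteration in pagerank.r computes x <- B x with B[i][j] = (1-d)/N + d*A[i][j]/csum[j]. A line-for-line Java rendition on a toy 3-node graph (illustrative; the R script reads the real edges from people.csv):

public class PageRankSketch {
    public static void main(String[] args) {
        double d = 0.85;
        double[][] a = {{0,0,1},{1,0,0},{1,1,0}};   // a[i][j]=1 if page j links to page i
        int n = a.length;
        double[] csum = new double[n];              // column sums = out-degrees
        for (int j = 0; j < n; j++) {
            for (int i = 0; i < n; i++) csum[j] += a[i][j];
            if (csum[j] == 0) csum[j] = 1;          // csum[csum==0] <- 1
        }
        double[] x = new double[n];
        java.util.Arrays.fill(x, 1.0);
        for (int it = 0; it < 100; it++) {          // x <- B %*% x
            double[] nx = new double[n];
            for (int i = 0; i < n; i++)
                for (int j = 0; j < n; j++)
                    nx[i] += ((1 - d) / n + d * a[i][j] / csum[j]) * x[j];
            x = nx;
        }
        double s = 0; for (double v : x) s += v;
        for (double v : x) System.out.println(v / s);  // x / sum(x)
    }
}
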
/com.homework/src/common/com/homework/hdfs/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 | *
3 | */
4 | /**
5 | * @author Administrator
6 | *
7 | */
8 | package com.homework.hdfs;
--------------------------------------------------------------------------------
/com.homework/src/hadoop/machinelearning/clustering/hadoop/machinelearning/clustering/canopy/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 | *
3 | */
4 | /**
5 | * @author Administrator
6 | *
7 | */
8 | package hadoop.machinelearning.clustering.canopy;
--------------------------------------------------------------------------------
/com.homework/src/hadoop/machinelearning/clustering/hadoop/machinelearning/clustering/kmeans/KmeansHadoop.java:
--------------------------------------------------------------------------------
1 | package hadoop.machinelearning.clustering.kmeans;
2 |
3 |
4 |
5 | import java.util.Iterator;
6 |
7 | import org.apache.hadoop.fs.Path;
8 | import org.apache.hadoop.mapred.JobConf;
9 | import org.apache.mahout.clustering.classify.WeightedVectorWritable;
10 | import org.apache.mahout.clustering.conversion.InputDriver;
11 | import org.apache.mahout.clustering.kmeans.KMeansDriver;
12 | import org.apache.mahout.clustering.kmeans.RandomSeedGenerator;
13 | import org.apache.mahout.common.distance.DistanceMeasure;
14 | import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
15 | import org.apache.mahout.math.Vector;
16 | import org.apache.mahout.utils.clustering.ClusterDumper;
17 |
18 | import com.homework.hdfs.HdfsDAO;
19 |
20 | /**
21 | * The input has been switched to the "Hello World" example from chapter 7 of Mahout in Action; it runs correctly and the results match the book.
22 | * @author Administrator
23 | * Source code for the book: https://github.com/tdunning/MiA
24 | */
25 | public class KmeansHadoop {
26 | private static final String HDFS = "hdfs://192.168.0.100:9000";
27 |
28 | public static void main(String[] args) throws Exception {
29 | //String localFile = "datafile/randomData.csv";
30 | String localFile = "datafile/cluster/simple_k-means.txt";
31 | String inPath = HDFS + "/user/hdfs/mix_data";
32 | String seqFile = inPath + "/seqfile";
33 | String seeds = inPath + "/seeds";
34 | String outPath = inPath + "/result/";
35 | String clusteredPoints = outPath + "/clusteredPoints";
36 |
37 | JobConf conf = config();
38 | HdfsDAO hdfs = new HdfsDAO(HDFS, conf);
39 | hdfs.rmr(inPath);
40 | hdfs.mkdirs(inPath);
41 | hdfs.copyFile(localFile, inPath);
42 | hdfs.ls(inPath);
43 |
44 | InputDriver.runJob(new Path(inPath), new Path(seqFile), "org.apache.mahout.math.RandomAccessSparseVector");
45 |
46 | //int k = 3;
47 | int k = 2;
48 | Path seqFilePath = new Path(seqFile);
49 | Path clustersSeeds = new Path(seeds);
50 | DistanceMeasure measure = new EuclideanDistanceMeasure();
51 | clustersSeeds = RandomSeedGenerator.buildRandom(conf, seqFilePath, clustersSeeds, k, measure);
52 | KMeansDriver.run(conf, seqFilePath, clustersSeeds, new Path(outPath), measure, 0.01, 10, true, 0.01, false);
53 |
54 | Path outGlobPath = new Path(outPath, "clusters-*-final");
55 | Path clusteredPointsPath = new Path(clusteredPoints);
56 | System.out.printf("Dumping out clusters from clusters: %s and clusteredPoints: %s\n", outGlobPath, clusteredPointsPath);
57 |
58 | ClusterDumper clusterDumper = new ClusterDumper(outGlobPath, clusteredPointsPath);
59 | clusterDumper.printClusters(null);
60 | }
61 |
62 | public static JobConf config() {
63 | JobConf conf = new JobConf(KmeansHadoop.class);
64 | conf.setJobName("ItemCFHadoop");
65 | conf.addResource("classpath:/hadoop/core-site.xml");
66 | conf.addResource("classpath:/hadoop/hdfs-site.xml");
67 | conf.addResource("classpath:/hadoop/mapred-site.xml");
68 | return conf;
69 | }
70 |
71 | public static void displayCluster(ClusterDumper clusterDumper) {
72 | Iterator<Integer> keys = clusterDumper.getClusterIdToPoints().keySet().iterator();
73 | while (keys.hasNext()) {
74 | Integer center = keys.next();
75 | System.out.println("Center:" + center);
76 | for (WeightedVectorWritable point : clusterDumper.getClusterIdToPoints().get(center)) {
77 | Vector v = point.getVector();
78 | System.out.println(v.get(0) + ", " + v.get(1));
79 | }
80 | }
81 | }
82 | }
83 |
--------------------------------------------------------------------------------
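The ten positional arguments of KMeansDriver.run above are easy to misread. The hedged restatement below names them per the Mahout 0.7-era signature this code appears to target; treat the labels as an assumption and verify them against the Mahout version actually on the classpath:

// Illustrative restatement of the call in KmeansHadoop.main (Mahout 0.7-style signature assumed):
double convergenceDelta = 0.01;   // stop when centroids move less than this
int maxIterations = 10;           // hard cap on k-means iterations
boolean runClustering = true;     // also assign the input points to the final clusters
double clusterClassificationThreshold = 0.01; // minimum membership probability when classifying points
boolean runSequential = false;    // run as MapReduce jobs rather than in-process
// KMeansDriver.run(conf, seqFilePath, clustersSeeds, new Path(outPath), measure,
//         convergenceDelta, maxIterations, runClustering,
//         clusterClassificationThreshold, runSequential);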
/com.homework/src/hadoop/machinelearning/clustering/hadoop/machinelearning/clustering/kmeans/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 | *
3 | */
4 | /**
5 | * @author Administrator
6 | *
7 | */
8 | package hadoop.machinelearning.clustering.kmeans;
--------------------------------------------------------------------------------
/com.homework/src/main/java/com/homework/App.java:
--------------------------------------------------------------------------------
1 | package com.homework;
2 |
3 | /**
4 | * Hello world!
5 | *
6 | */
7 | public class App
8 | {
9 | public static void main( String[] args )
10 | {
11 | System.out.println( "Hello World!" );
12 | }
13 | }
14 |
--------------------------------------------------------------------------------
/com.homework/src/mommon/com/homework/mommon/ComTest.java:
--------------------------------------------------------------------------------
1 | package com.homework.mommon;
2 |
3 | public class ComTest {
4 |
5 | public static void main(String[] args) {
6 | // TODO Auto-generated method stub
7 | String str="sss";
8 | String str2="dd";
9 | }
10 |
11 | }
12 |
--------------------------------------------------------------------------------
/com.homework/src/mommon/com/homework/mommon/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 | *
3 | */
4 | /**
5 | * @author Jamas
6 | *
7 | */
8 | package com.homework.mommon;
--------------------------------------------------------------------------------
/com.homework/src/mommon/mytest/MenuTree.java:
--------------------------------------------------------------------------------
1 | package mytest;
2 |
3 | import java.util.ArrayList;
4 | import java.util.Iterator;
5 | import java.util.List;
6 | // recursively walk a tree
7 | public class MenuTree {
8 |
9 |
10 | public static void mytree(List<Node> nlist,Node node){
11 | System.out.print(node.getId()+node.getNodeName());
12 | Node subnode=null;
13 | Long id=node.getId();
14 | Iterator<Node> iter=nlist.iterator();
15 | boolean isexit=false;
16 | while(iter.hasNext()){
17 | Node nod=iter.next();
18 | if(nod.getParentId().equals(id)){ // == on boxed Long compares references, not values
19 | isexit=true;
20 | subnode=nod;
21 | mytree(nlist,subnode);
22 |
23 | }
24 | }
25 | if(!isexit)return;
26 |
27 | }
28 |
29 |
30 | public static void main(String[] args) {
31 |
32 | long start = System.currentTimeMillis();
33 | List<Node> nodeList = new ArrayList<Node>();
34 | Node node1 = new Node(1l, "Vegetables", 0l);
35 | Node node2 = new Node(2l, "Seafood", 0l);
36 | Node node3 = new Node(3l, "Livestock", 0l);
37 | Node node4 = new Node(4l, "Gourds", 1l);
38 | Node node5 = new Node(5l, "Leafy greens", 1l);
39 | Node node6 = new Node(6l, "Loofah", 4l);
40 | Node node7 = new Node(7l, "Cucumber", 4l);
41 | Node node8 = new Node(8l, "Cabbage", 1l);
42 | Node node9 = new Node(9l, "Shrimp", 2l);
43 | Node node10 = new Node(10l, "Fish", 2l);
44 | Node node11 = new Node(11l, "Cattle", 3l);
45 | Node node0=new Node(0l,"Market categories",-1l);
46 |
47 | nodeList.add(node0);
48 | nodeList.add(node1);
49 | nodeList.add(node2);
50 | nodeList.add(node3);
51 | nodeList.add(node4);
52 | nodeList.add(node5);
53 | nodeList.add(node6);
54 | nodeList.add(node7);
55 | nodeList.add(node8);
56 | nodeList.add(node9);
57 | nodeList.add(node10);
58 | nodeList.add(node11);
59 |
60 | mytree(nodeList,node0);
61 | //NodeUtil mt = new NodeUtil();
62 | //System.out.println(mt.getChildNodes(nodeList, 1l));
63 | long end = System.currentTimeMillis();
64 | System.out.println("用时:" + (end - start) + "ms");
65 | }
66 |
67 | }
68 |
--------------------------------------------------------------------------------
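The original MenuTree compared parent ids with ==, which tests reference identity for boxed Long values and only happens to work inside the small autobox cache; the fix above switches to equals(). A quick hedged illustration of the pitfall, with made-up values:

// Illustrative: why Long comparison must use equals(), not ==.
public class LongEqualityDemo {
    public static void main(String[] args) {
        Long small1 = 127L, small2 = 127L;
        Long big1 = 128L, big2 = 128L;
        System.out.println(small1 == small2);  // true: both boxed from the -128..127 cache
        System.out.println(big1 == big2);      // false: two distinct Long objects
        System.out.println(big1.equals(big2)); // true: value comparison
    }
}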
/com.homework/src/mommon/mytest/Node.java:
--------------------------------------------------------------------------------
1 | package mytest;
2 |
3 | /**
4 | * Node model for a tree of unlimited depth
5 | */
6 | public class Node {
7 | /**
8 | * node id
9 | */
10 | private Long id;
11 |
12 | /**
13 | * node name
14 | */
15 | private String nodeName;
16 |
17 | /**
18 | * parent node id
19 | */
20 | private Long parentId;
21 |
22 | public Node() {
23 | }
24 |
25 | Node(Long id, Long parentId) {
26 | this.id = id;
27 | this.parentId = parentId;
28 | }
29 |
30 | Node(Long id, String nodeName, Long parentId) {
31 | this.id = id;
32 | this.nodeName = nodeName;
33 | this.parentId = parentId;
34 | }
35 |
36 | public Long getId() {
37 | return id;
38 | }
39 |
40 | public void setId(Long id) {
41 | this.id = id;
42 | }
43 |
44 | public Long getParentId() {
45 | return parentId;
46 | }
47 |
48 | public void setParentId(Long parentId) {
49 | this.parentId = parentId;
50 | }
51 |
52 | public String getNodeName() {
53 | return nodeName;
54 | }
55 |
56 | public void setNodeName(String nodeName) {
57 | this.nodeName = nodeName;
58 | }
59 |
60 | }
61 |
62 |
--------------------------------------------------------------------------------
/com.homework/src/mommon/mytest/Recursive.java:
--------------------------------------------------------------------------------
1 | package mytest;
2 |
3 | public class Recursive {
4 |
5 | public static void foo(int num){
6 | num=num-1;
7 | if(num==0)return;
8 |
9 | else
10 | {
11 | System.out.println(num);
12 | foo(num);
13 |
14 | }
15 |
16 | }
17 | public static void main(String[] args) {
18 | // TODO Auto-generated method stub
19 | foo(5);
20 | }
21 |
22 | }
23 |
--------------------------------------------------------------------------------
/com.homework/src/mommon/mytest/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 | *
3 | */
4 | /**
5 | * @author Administrator
6 | *
7 | */
8 | package mytest;
--------------------------------------------------------------------------------
/com.homework/src/sequence/machinelearning/association/sequence/machinelearning/association/apriori/ItemMap.java:
--------------------------------------------------------------------------------
1 | package sequence.machinelearning.association.apriori;
2 |
3 | import java.util.HashMap;
4 | import java.util.Map;
5 |
6 | public class ItemMap {
7 |
8 | public String key;
9 | public Integer value=0;
10 |
11 | public Map map;
12 |
13 | public String getKey() {
14 | return key;
15 | }
16 | public void setKey(String key) {
17 | this.key = key;
18 | }
19 | public Integer getValue() {
20 | return value;
21 | }
22 | public void setValue(Integer value) {
23 | this.value = value;
24 | }
25 | public Map getMap() {
26 | if(map==null){
27 | map=new HashMap();
28 | }
29 | return map;
30 | }
31 | public void setMap(Map map) {
32 | this.map = map;
33 | }
34 |
35 | }
36 |
--------------------------------------------------------------------------------
/com.homework/src/sequence/machinelearning/association/sequence/machinelearning/association/apriori/Subset.java:
--------------------------------------------------------------------------------
1 | package sequence.machinelearning.association.apriori;
2 |
3 | import java.util.ArrayList;
4 | import java.util.List;
5 | /**
6 |  * Computes the subsets of a set, excluding the set itself and the empty set.
7 |  * @author Administrator
8 |  * Algorithm: take the first element, then pair each subsequent element with every subset generated so far, so two nested for-loops suffice:
9 |  * 1                          // first pass, i=0
10 |  * 12,2                       // second pass, i=1
11 |  * 123,23,13,3                // third pass, i=2
12 |  * 14,124,24,1234,234,134,34  // fourth pass, i=3
13 |  * Finally remove the set itself; this step is added for the Apriori algorithm.
14 |  */
15 | // This class is for testing only; it is not used inside MyApriori.
16 | public class Subset {
17 |
18 | public static List<String> lis=new ArrayList<String>();
19 | public static void main(String[] args) {
20 |
21 | //subset();
22 | // TODO Auto-generated method stub
23 | String[] str =new String[] { "1", "2", "3", "4"};
24 | StringBuilder sb=new StringBuilder();
25 | List<String> li=new ArrayList<String>();
26 | for(int i=0;i< ... [interior of Subset.java truncated in this dump] ... List<String> li=new ArrayList<String>();
46 | for(int i=0;i< ... [remainder of Subset.java truncated in this dump]
--------------------------------------------------------------------------------
/com.homework/src/sequence/machinelearning/association/sequence/machinelearning/association/common/ReadData.java:
--------------------------------------------------------------------------------
[... opening lines of ReadData.java truncated in this dump ...]
14 | private static Map<String,String> dataMap=new TreeMap<String,String>();
15 | public static final void readF1() throws IOException {
16 |
17 | //String filePath="scripts/clustering/canopy/canopy.dat";
18 | String filePath="datafile/association/items";
19 | BufferedReader br = new BufferedReader(new InputStreamReader(
20 | new FileInputStream(filePath)));
21 | for (String line = br.readLine(); line != null; line = br.readLine()) {
22 | if(line.length()==0||"".equals(line))continue;
23 | String[] str=line.split("\t");
24 | dataMap.put(str[0], str[1].trim());
25 | //System.out.println(line);
26 | }
27 | br.close();
28 |
29 | }
30 | }
31 |
--------------------------------------------------------------------------------
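The body of Subset.java above was truncated by the dump's tag-stripping. As a hedged reconstruction of the algorithm its Javadoc describes (pair each new element with every subset generated so far, then drop the full set), here is a self-contained sketch; the class and method names are illustrative:

import java.util.ArrayList;
import java.util.List;

// Hedged sketch of the subset enumeration described in Subset.java's Javadoc.
public class SubsetSketch {

    // Returns all non-empty proper subsets of items (the full set is excluded).
    public static List<String> subsets(String[] items) {
        List<String> acc = new ArrayList<String>();
        for (int i = 0; i < items.length; i++) {
            int size = acc.size();               // subsets generated so far
            for (int j = 0; j < size; j++) {
                acc.add(acc.get(j) + items[i]);  // pair each with the new element
            }
            acc.add(items[i]);                   // the singleton itself
        }
        // drop the full set, as the Javadoc says Apriori requires
        StringBuilder all = new StringBuilder();
        for (String s : items) all.append(s);
        acc.remove(all.toString());
        return acc;
    }

    public static void main(String[] args) {
        System.out.println(subsets(new String[]{"1", "2", "3", "4"}));
    }
}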
/com.homework/src/sequence/machinelearning/association/sequence/machinelearning/association/common/SortTest.java:
--------------------------------------------------------------------------------
1 | package sequence.machinelearning.association.common;
2 |
3 | import java.util.Comparator;
4 | import java.util.List;
5 | import java.util.ArrayList;
6 | import java.util.Collections;
7 |
8 | class User {
9 | String name;
10 | String age;
11 |
12 | public User(String name,String age){
13 | this.name=name;
14 | this.age=age;
15 | }
16 | public String getAge() {
17 | return age;
18 | }
19 | public void setAge(String age) {
20 | this.age = age;
21 | }
22 | public String getName() {
23 | return name;
24 | }
25 | public void setName(String name) {
26 | this.name = name;
27 | }
28 | }
29 |
30 | class ComparatorUser implements Comparator{
31 |
32 | public int compare(Object arg0, Object arg1) {
33 | User user0=(User)arg0;
34 | User user1=(User)arg1;
35 | // compare by age first; if the ages are equal, compare by name
36 | int flag=user0.getAge().compareTo(user1.getAge());
37 | if(flag==0){
38 | return user0.getName().compareTo(user1.getName());
39 | }else{
40 | return flag;
41 | }
42 | }
43 |
44 | }
45 |
46 | public class SortTest {
47 |
48 |
49 | public static void main(String[] args){
50 | List<User> userlist=new ArrayList<User>();
51 | userlist.add(new User("dd","4"));
52 | userlist.add(new User("aa","1"));
53 | userlist.add(new User("ee","5"));
54 | userlist.add(new User("bb","2"));
55 | userlist.add(new User("ff","5"));
56 | userlist.add(new User("cc","3"));
57 | userlist.add(new User("gg","6"));
58 |
59 | ComparatorUser comparator=new ComparatorUser();
60 | Collections.sort(userlist, comparator);
61 |
62 | for (int i=0;i< ... [remainder of SortTest.java truncated in this dump]
--------------------------------------------------------------------------------
/com.homework/src/sequence/machinelearning/association/sequence/machinelearning/association/common/Transaction.java:
--------------------------------------------------------------------------------
[... opening lines of Transaction.java truncated in this dump ...]
19 | private static TreeMap<String,Integer> tmap=new TreeMap<String,Integer>();
20 |
21 | /**
22 |  * Scan the transaction set to determine the frequent 1-itemsets (find C1)
23 | */
24 | public static List<ItemMap> findFrequentOneItemSets(Map<String,String> map){
25 | TreeMap<String,Integer> treemap=new TreeMap<String,Integer>();
26 | Iterator<Entry<String,String>> iter=map.entrySet().iterator();
27 | Entry<String,String> entry;
28 | while(iter.hasNext()){
29 | entry=iter.next();
30 | String str=entry.getValue();
31 | if(str.length()<1)continue;
32 | String[] items=str.split(",");
33 | // track the largest basket size, in preparation for the iterative join
34 | if(items.length>itemnum)itemnum=items.length;
35 | for(int i=0;i< ... [lines 35-49 truncated in this dump]
50 | List<ItemMap> lif1=Transaction.findFrequentOneItemSets(ReadData.dataMap);
51 | for(int i=0;i< ... [lines 51-55 truncated in this dump]
56 | public static LinkedList<String> itemsort(String[] items){
57 | LinkedList<String> linst=new LinkedList<String>();
58 | // selection sort
59 | int len=items.length;
60 | for(int i=0;i< ... [selection-sort body and the rest of itemsort truncated in this dump]
80 | public static List<ItemMap> DeleteItem(TreeMap<String,Integer> map){
81 | List<ItemMap> listmap=new ArrayList<ItemMap>();
82 | Iterator<Entry<String,Integer>> iter=map.entrySet().iterator();
83 | Entry<String,Integer> entry;
84 | while(iter.hasNext()){
85 | entry=iter.next();
86 | if(entry.getValue()>=support){
87 | ItemMap item=new ItemMap();
88 | item.setKey(entry.getKey());
89 | item.setValue(entry.getValue());
90 | if(listmap.size()==0)listmap.add(item);
91 | else{
92 |
93 | ItemMap tail=new ItemMap();
94 | int size=listmap.size();
95 | tail=listmap.get(size-1);
96 | if(item.getValue()>tail.getValue()){
97 | listmap.remove(size-1);
98 | listmap.add(item);
99 | listmap.add(tail);
100 | }else{
101 | listmap.add(item);
102 | }
103 |
104 | }
105 |
106 | }
107 | }
108 | return listmap;
109 | }
110 | }
111 |
--------------------------------------------------------------------------------
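Transaction.java above also lost several spans to the dump. As a hedged sketch of the C1 counting its Javadoc describes (count each item across all comma-separated baskets, then keep those meeting the minimum support), with illustrative names:

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

// Hedged sketch: find frequent 1-itemsets (C1 -> L1) from comma-separated baskets.
public class FrequentOneSketch {
    public static List<String> frequentOneItemSets(List<String> baskets, int minSupport) {
        Map<String, Integer> counts = new TreeMap<String, Integer>();
        for (String basket : baskets) {
            for (String raw : basket.split(",")) {
                String item = raw.trim();
                Integer c = counts.get(item);
                counts.put(item, c == null ? 1 : c + 1); // count occurrences across all transactions
            }
        }
        List<String> frequent = new ArrayList<String>();
        for (Map.Entry<String, Integer> e : counts.entrySet()) {
            if (e.getValue() >= minSupport) frequent.add(e.getKey()); // keep items meeting support
        }
        return frequent;
    }
}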
/com.homework/src/sequence/machinelearning/association/sequence/machinelearning/association/common/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 | *
3 | */
4 | /**
5 | * @author Administrator
6 | *
7 | */
8 | package sequence.machinelearning.association.common;
--------------------------------------------------------------------------------
/com.homework/src/sequence/machinelearning/association/sequence/machinelearning/association/fpgrowth/TreeNode2.java:
--------------------------------------------------------------------------------
1 | package sequence.machinelearning.association.fpgrowth;
2 |
3 | import java.util.ArrayList;
4 | import java.util.List;
5 |
6 |
7 |
8 | public class TreeNode2 implements Comparable<TreeNode2>{
9 |
10 | private String name; // node name
11 | private Integer count; // count
12 | private TreeNode2 parent; // parent node
13 | private List<TreeNode2> children; // child nodes
14 | private TreeNode2 nextHomonym; // next node with the same name
15 |
16 | public TreeNode2() {
17 |
18 | }
19 |
20 | public String getName() {
21 | return name;
22 | }
23 |
24 | public void setName(String name) {
25 | this.name = name;
26 | }
27 |
28 | public Integer getCount() {
29 | return count;
30 | }
31 |
32 | public void setCount(Integer count) {
33 | this.count = count;
34 | }
35 | public void Sum(Integer count) {
36 | this.count =this.count+count;
37 | }
38 | public TreeNode2 getParent() {
39 | return parent;
40 | }
41 |
42 | public void setParent(TreeNode2 parent) {
43 | this.parent = parent;
44 | }
45 |
46 | public List<TreeNode2> getChildren() {
47 | return children;
48 | }
49 |
50 | public void setChildren(List<TreeNode2> children) {
51 | this.children = children;
52 | }
53 |
54 | public TreeNode2 getNextHomonym() {
55 | return nextHomonym;
56 | }
57 |
58 | public void setNextHomonym(TreeNode2 nextHomonym) {
59 | this.nextHomonym = nextHomonym;
60 | }
61 | /**
62 |  * Add a child node
63 |  * @param child
64 |  */
65 | public void addChild(TreeNode2 child) {
66 | if (this.getChildren() == null) {
67 | List<TreeNode2> list = new ArrayList<TreeNode2>();
68 | list.add(child);
69 | this.setChildren(list);
70 | } else {
71 | this.getChildren().add(child);
72 | }
73 | }
74 | /**
75 |  * Return the child with the given name if it exists, otherwise null
76 |  * @param name
77 |  * @return
78 |  */
79 | public TreeNode2 findChild(String name) {
80 | List<TreeNode2> children = this.getChildren();
81 | if (children != null) {
82 | for (TreeNode2 child : children) {
83 | if (child.getName().equals(name)) {
84 | return child;
85 | }
86 | }
87 | }
88 | return null;
89 | }
90 |
91 |
92 | @Override
93 | public int compareTo(TreeNode2 arg0) {
94 | int count0 = arg0.getCount();
95 | // reversed relative to the natural order, so Arrays.sort() sorts in descending order
97 | return count0 - this.count;
98 | }
99 | }
100 |
--------------------------------------------------------------------------------
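A quick illustration of the reversed compareTo above. This hedged demo assumes it sits in the same package as TreeNode2; the names and counts are made up:

import java.util.Arrays;

// Illustrative: the reversed compareTo sorts nodes by count, descending.
public class SortOrderDemo {
    public static void main(String[] args) {
        TreeNode2 a = new TreeNode2(); a.setName("a"); a.setCount(2);
        TreeNode2 b = new TreeNode2(); b.setName("b"); b.setCount(5);
        TreeNode2[] nodes = { a, b };
        Arrays.sort(nodes);
        for (TreeNode2 n : nodes) {
            System.out.print(n.getName() + ":" + n.getCount() + " "); // prints "b:5 a:2", highest count first
        }
    }
}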
/com.homework/src/sequence/machinelearning/association/sequence/machinelearning/association/fpgrowth/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 | *
3 | */
4 | /**
5 | * @author Administrator
6 | *
7 | */
8 | package sequence.machinelearning.association.fpgrowth;
--------------------------------------------------------------------------------
/com.homework/src/sequence/machinelearning/association/sequence/machinelearning/association/fpgtest/TreeNode.java:
--------------------------------------------------------------------------------
1 | package sequence.machinelearning.association.fpgtest;
2 |
3 | import java.util.ArrayList;
4 | import java.util.List;
5 |
6 | public class TreeNode implements Comparable<TreeNode> {
7 |
8 | private String name; // node name
9 | private int count; // count
10 | private TreeNode parent; // parent node
11 | private List<TreeNode> children; // child nodes
12 | private TreeNode nextHomonym; // next node with the same name
13 |
14 | public TreeNode() {
15 |
16 | }
17 |
18 | public TreeNode(String name) {
19 | this.name = name;
20 | }
21 |
22 | public String getName() {
23 | return name;
24 | }
25 |
26 | public void setName(String name) {
27 | this.name = name;
28 | }
29 |
30 | public int getCount() {
31 | return count;
32 | }
33 |
34 | public void setCount(int count) {
35 | this.count = count;
36 | }
37 |
38 | public TreeNode getParent() {
39 | return parent;
40 | }
41 |
42 | public void setParent(TreeNode parent) {
43 | this.parent = parent;
44 | }
45 |
46 | public List<TreeNode> getChildren() {
47 | return children;
48 | }
49 |
50 | public void addChild(TreeNode child) {
51 | if (this.getChildren() == null) {
52 | List<TreeNode> list = new ArrayList<TreeNode>();
53 | list.add(child);
54 | this.setChildren(list);
55 | } else {
56 | this.getChildren().add(child);
57 | }
58 | }
59 |
60 | public TreeNode findChild(String name) {
61 | List<TreeNode> children = this.getChildren();
62 | if (children != null) {
63 | for (TreeNode child : children) {
64 | if (child.getName().equals(name)) {
65 | return child;
66 | }
67 | }
68 | }
69 | return null;
70 | }
71 |
72 | public void setChildren(List<TreeNode> children) {
73 | this.children = children;
74 | }
75 |
76 | public void printChildrenName() {
77 | List<TreeNode> children = this.getChildren();
78 | if (children != null) {
79 | for (TreeNode child : children) {
80 | System.out.print(child.getName() + " ");
81 | }
82 | } else {
83 | System.out.print("null");
84 | }
85 | }
86 |
87 | public TreeNode getNextHomonym() {
88 | return nextHomonym;
89 | }
90 |
91 | public void setNextHomonym(TreeNode nextHomonym) {
92 | this.nextHomonym = nextHomonym;
93 | }
94 |
95 | public void countIncrement(int n) {
96 | this.count += n;
97 | }
98 |
99 | @Override
100 | public int compareTo(TreeNode arg0) {
101 | int count0 = arg0.getCount();
102 | // reversed relative to the natural order, so Arrays.sort() sorts in descending order
104 | return count0 - this.count;
105 | }
106 | }
107 |
--------------------------------------------------------------------------------
/com.homework/src/sequence/machinelearning/association/sequence/machinelearning/association/fpgtest/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 | *
3 | */
4 | /**
5 | * @author Administrator
6 | *
7 | */
8 | package sequence.machinelearning.association.fpgtest;
--------------------------------------------------------------------------------
/com.homework/src/sequence/machinelearning/association/sequence/machinelearning/association/otherdemo/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 | *
3 | */
4 | /**
5 | * @author Administrator
6 | *
7 | */
8 | package sequence.machinelearning.association.otherdemo;
--------------------------------------------------------------------------------
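The Apriori driver classes referenced in this package (MyApriori.java and the otherdemo variants) were swallowed by the dump's tag-stripping. As a hedged sketch of the level-wise Apriori loop they implement: join frequent k-itemsets into (k+1)-candidates and keep those meeting minimum support. The prune step is omitted for brevity and all names are illustrative:

import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

// Illustrative Apriori loop: L1 -> C2 -> L2 -> ... until no candidates survive.
public class AprioriSketch {
    public static List<Set<String>> frequentItemsets(List<Set<String>> transactions, int minSupport) {
        List<Set<String>> result = new ArrayList<Set<String>>();
        // L1: frequent single items
        Map<String, Integer> counts = new HashMap<String, Integer>();
        for (Set<String> t : transactions)
            for (String item : t)
                counts.put(item, counts.containsKey(item) ? counts.get(item) + 1 : 1);
        List<Set<String>> current = new ArrayList<Set<String>>();
        for (Map.Entry<String, Integer> e : counts.entrySet())
            if (e.getValue() >= minSupport) {
                Set<String> s = new HashSet<String>();
                s.add(e.getKey());
                current.add(s);
            }
        while (!current.isEmpty()) {
            result.addAll(current);
            // join step: union pairs of frequent k-itemsets into (k+1)-candidates
            Set<Set<String>> candidates = new HashSet<Set<String>>();
            int k = current.get(0).size() + 1;
            for (int i = 0; i < current.size(); i++)
                for (int j = i + 1; j < current.size(); j++) {
                    Set<String> union = new HashSet<String>(current.get(i));
                    union.addAll(current.get(j));
                    if (union.size() == k) candidates.add(union);
                }
            // count step: keep candidates meeting minimum support
            current = new ArrayList<Set<String>>();
            for (Set<String> c : candidates) {
                int support = 0;
                for (Set<String> t : transactions) if (t.containsAll(c)) support++;
                if (support >= minSupport) current.add(c);
            }
        }
        return result;
    }
}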
/com.homework/src/sequence/machinelearning/clustering/sequence/machinelearning/clustering/canopy/MyCanopy.java:
--------------------------------------------------------------------------------
1 | package sequence.machinelearning.clustering.canopy;
2 |
3 | import java.io.BufferedReader;
4 | import java.io.FileInputStream;
5 | import java.io.FileReader;
6 | import java.io.IOException;
7 | import java.io.InputStreamReader;
8 | import java.util.ArrayList;
9 | import java.util.List;
10 | import java.util.Vector;
11 |
12 | public class MyCanopy {
13 |
14 | // x and y are separated by exactly one space; this matches Mahout's input format, so a space is used as the delimiter here too.
15 | static Vector<Point> li=new Vector<Point>();
16 | //static List li=new ArrayList();
17 | static List<Vector<Point>> list=new ArrayList<Vector<Point>>();
18 | private final static Double t1=8.0;
19 | private final static Double t2=4.0;
20 | // simply use the Manhattan distance |x1 - x2| + |y1 - y2|
21 |
22 | public static final void readF1() throws IOException {
23 |
24 | //String filePath="scripts/clustering/canopy/canopy.dat";
25 | String filePath="datafile/cluster/simple_k-means.txt";
26 | BufferedReader br = new BufferedReader(new InputStreamReader(
27 | new FileInputStream(filePath)));
28 | for (String line = br.readLine(); line != null; line = br.readLine()) {
29 | if(line.length()==0||"".equals(line))continue;
30 | String[] str=line.split(" ");
31 | Point p0=new Point();
32 | p0.setX(Double.valueOf(str[0]));
33 | p0.setY(Double.valueOf(str[1]));
34 | li.add(p0);
35 | //System.out.println(line);
36 | }
37 | br.close();
38 | }
39 | // simply use the Manhattan distance |x1 - x2| + |y1 - y2|
40 | public static Double DistanceMeasure(Point p1,Point p2){
41 | return Math.abs(p2.getX()-p1.getX()) +Math.abs(p2.getY()-p1.getY());
42 | }
43 | public static void clustering(){
44 |
45 | // initialize the first canopy
46 | Point p0=new Point();
47 | p0=li.get(0);
48 | Vector v1=new Vector();
49 | v1.add(p0);
50 | list.add(v1);
51 | li.remove(0);
52 | System.out.println("中心点为:"+p0.getX()+","+p0.getY());
53 | while(0<li.size()){
[... lines 54-57 truncated in this dump ...]
58 | Vector<Point> v=list.get(i);
59 | Point p2=v.get(0);
60 | double dist =DistanceMeasure(p1,p2);
61 | // if it is within t2 it belongs to the current cluster and is already close enough; no further clustering needed, so remove it
62 | if(dist< ... [lines 62-87 truncated in this dump]
88 | Vector<Point> vec=new Vector<Point>();
89 | vec.add(p1);
90 | li.remove(0);
91 | list.add(vec);
92 |
93 | }
94 | // compare the distance against each canopy already formed; once compared, remove the point to end the loop
95 | if(li.get(0).getSign()!=-1){
96 | li.remove(0);
97 | }
98 | }
99 | String ss="ddd";
100 | }
101 |
102 |
103 |
104 |
105 |
106 |
107 | public static void main(String[] args) throws IOException {
108 | // TODO Auto-generated method stub
109 | readF1();
110 |
111 | clustering();
112 | String ss="ddd";
113 | }
114 |
115 | }
116 |
--------------------------------------------------------------------------------
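Since the core assignment loop of MyCanopy above is truncated in this dump, here is a hedged sketch of the standard T1/T2 canopy step it appears to implement: points within the tight radius T2 of a new center are consumed, points within the loose radius T1 join the canopy. All names are illustrative:

import java.util.ArrayList;
import java.util.List;

// Illustrative canopy step: T1 = loose radius, T2 = tight radius (T1 > T2).
public class CanopySketch {
    static double manhattan(double[] a, double[] b) {
        return Math.abs(a[0] - b[0]) + Math.abs(a[1] - b[1]);
    }

    public static List<List<double[]>> cluster(List<double[]> points, double t1, double t2) {
        List<List<double[]>> canopies = new ArrayList<List<double[]>>();
        while (!points.isEmpty()) {
            double[] center = points.remove(0);      // pick a remaining point as a new canopy center
            List<double[]> canopy = new ArrayList<double[]>();
            canopy.add(center);
            List<double[]> rest = new ArrayList<double[]>();
            for (double[] p : points) {
                double d = manhattan(center, p);
                if (d < t1) canopy.add(p);           // within T1: belongs to this canopy
                if (d >= t2) rest.add(p);            // outside T2: may still seed or join other canopies
            }
            points = rest;
            canopies.add(canopy);
        }
        return canopies;
    }
}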
/com.homework/src/sequence/machinelearning/clustering/sequence/machinelearning/clustering/canopy/Point.java:
--------------------------------------------------------------------------------
1 | package sequence.machinelearning.clustering.canopy;
2 |
3 | public class Point {
4 |
5 | private Double x;
6 | private Double y;
7 | private Integer sign=-1;
8 | public Double getX() {
9 | return x;
10 | }
11 | public void setX(Double x) {
12 | this.x = x;
13 | }
14 | public Double getY() {
15 | return y;
16 | }
17 | public void setY(Double y) {
18 | this.y = y;
19 | }
20 | public Integer getSign() {
21 | return sign;
22 | }
23 | public void setSign(Integer sign) {
24 | this.sign = sign;
25 | }
26 |
27 | }
28 |
--------------------------------------------------------------------------------
/com.homework/src/sequence/machinelearning/clustering/sequence/machinelearning/clustering/canopy/UserPoint.java:
--------------------------------------------------------------------------------
1 | package sequence.machinelearning.clustering.canopy;
2 |
3 | public class UserPoint {
4 |
5 | private Double x;
6 | private Double y;
7 | private Integer sign=-1;
8 | private String userid;
9 |
10 | public Double getX() {
11 | return x;
12 | }
13 | public void setX(Double x) {
14 | this.x = x;
15 | }
16 | public Double getY() {
17 | return y;
18 | }
19 | public void setY(Double y) {
20 | this.y = y;
21 | }
22 | public Integer getSign() {
23 | return sign;
24 | }
25 | public void setSign(Integer sign) {
26 | this.sign = sign;
27 | }
28 | public String getUserid() {
29 | return userid;
30 | }
31 | public void setUserid(String userid) {
32 | this.userid = userid;
33 | }
34 |
35 | }
36 |
--------------------------------------------------------------------------------
/com.homework/src/sequence/machinelearning/clustering/sequence/machinelearning/clustering/canopy/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 | *
3 | */
4 | /**
5 | * @author Administrator
6 | *
7 | */
8 | package sequence.machinelearning.clustering.canopy;
--------------------------------------------------------------------------------
/com.homework/src/sequence/machinelearning/clustering/sequence/machinelearning/clustering/kmeans/MyKmeans.java:
--------------------------------------------------------------------------------
1 | package sequence.machinelearning.clustering.kmeans;
2 |
3 | import java.io.BufferedReader;
4 | import java.io.FileInputStream;
5 | import java.io.IOException;
6 | import java.io.InputStreamReader;
7 | import java.util.ArrayList;
8 | import java.util.List;
9 | import java.util.Vector;
10 |
11 | import sequence.machinelearning.clustering.canopy.Point;
12 |
13 |
14 |
15 | public class MyKmeans {
16 |
17 | static Vector<Point> li=new Vector<Point>();
18 | //static List li=new ArrayList();
19 | static List<Vector<Point>> list=new ArrayList<Vector<Point>>(); // results of each iteration; one Vector per cluster
20 | private final static Integer K=2; // K=2: we estimate there are two clusters.
21 | private final static Double converge=0.001; // when the centroids move less than this, clustering has converged and iteration stops; 0.001 here
22 |
23 | // read the data
24 | public static final void readF1() throws IOException {
25 | String filePath="datafile/cluster/simple_k-means.txt";
26 | BufferedReader br = new BufferedReader(new InputStreamReader(
27 | new FileInputStream(filePath)));
28 | for (String line = br.readLine(); line != null; line = br.readLine()) {
29 | if(line.length()==0||"".equals(line))continue;
30 | String[] str=line.split(" ");
31 | Point p0=new Point();
32 | p0.setX(Double.valueOf(str[0]));
33 | p0.setY(Double.valueOf(str[1]));
34 | li.add(p0);
35 | //System.out.println(line);
36 | }
37 | br.close();
38 | }
39 | //math.sqrt(double n)
40 | // note: for the n-th root of m, use java.lang.StrictMath.pow(m, 1.0/n);
41 | // Euclidean distance
42 | public static Double DistanceMeasure(Point p1,Point p2){
43 |
44 | Double tmp=StrictMath.pow(p2.getX()-p1.getX(), 2)+StrictMath.pow(p2.getY()-p1.getY(), 2);
45 | return Math.sqrt(tmp);
46 | }
47 |
48 | // compute the new centroids
49 | public static Double CalCentroid(){
50 | System.out.println("------------------------------------------------");
51 | Double movedist=Double.MAX_VALUE;
52 | for(int i=0;i<list.size();i++){
53 | Vector<Point> subli=list.get(i);
54 | Point po=new Point();
55 | Double sumX=0.0;
56 | Double sumY=0.0;
57 | Double Clusterlen=Double.valueOf(subli.size());
58 | for(int j=0;j< ... [lines 58-81 truncated in this dump]
82 | for( ... ;movedist>converge;times++){
83 | System.out.println("第"+times+"次迭代");
84 | // by convention, element 0 of each Vector in list is the centroid
85 | for(int i=0;i< ... [lines 85-113 truncated in this dump]
114 | Vector<Point> vect=new Vector<Point>();
115 | Point p=new Point();
116 | p=li.get(k);
117 | vect.add(p);
118 | list.add(vect);
119 | }
120 | System.out.println("第1次迭代");
121 | // by convention, element 0 of each Vector in list is the centroid
122 | for(int i=K;i< ... [remainder of MyKmeans.java truncated in this dump]
--------------------------------------------------------------------------------
/com.homework/src/sequence/machinelearning/clustering/sequence/machinelearning/clustering/kmeans/MyKmeansForUser.java:
--------------------------------------------------------------------------------
[... opening lines of MyKmeansForUser.java truncated in this dump ...]
16 | static Vector<UserPoint> li=new Vector<UserPoint>();
17 | //static List li=new ArrayList();
18 | static List<Vector<UserPoint>> list=new ArrayList<Vector<UserPoint>>(); // results of each iteration; one Vector per cluster
19 | private final static Integer K=3; // K=3: we estimate there are three clusters.
20 | private final static Double converge=0.01; // when the centroids move less than this, clustering has converged and iteration stops; 0.01 here
21 |
22 | // read the data
23 | public static final void readF1() throws IOException {
24 | String filePath="datafile/cluster/data.csv";
25 | BufferedReader br = new BufferedReader(new InputStreamReader(
26 | new FileInputStream(filePath)));
27 | for (String line = br.readLine(); line != null; line = br.readLine()) {
28 | if(line.length()==0||"".equals(line))continue;
29 | String[] str=line.split(",");
30 | UserPoint p0=new UserPoint();
31 | p0.setUserid(str[0]);
32 | p0.setX(Double.valueOf(str[1]));
33 | p0.setY(Double.valueOf(str[2]));
34 | li.add(p0);
35 | //System.out.println(line);
36 | }
37 | br.close();
38 | }
39 | //math.sqrt(double n)
40 | // note: for the n-th root of m, use java.lang.StrictMath.pow(m, 1.0/n);
41 | // Euclidean distance
42 | public static Double DistanceMeasure(UserPoint p1,UserPoint p2){
43 |
44 | Double tmp=StrictMath.pow(p2.getX()-p1.getX(), 2)+StrictMath.pow(p2.getY()-p1.getY(), 2);
45 | return Math.sqrt(tmp);
46 | }
47 |
48 | // compute the new centroids
49 | public static Double CalCentroid(){
50 | System.out.println("------------------------------------------------");
51 | Double movedist=Double.MAX_VALUE;
52 | for(int i=0;i<list.size();i++){
53 | Vector<UserPoint> subli=list.get(i);
54 | UserPoint po=new UserPoint();
55 | Double sumX=0.0;
56 | Double sumY=0.0;
57 | Double Clusterlen=Double.valueOf(subli.size());
58 | for(int j=0;j< ... [lines 58-81 truncated in this dump]
82 | for( ... ;movedist>converge;times++){
83 | System.out.println("第"+times+"次迭代");
84 | // by convention, element 0 of each Vector in list is the centroid
85 | for(int i=0;i< ... [lines 85-113 truncated in this dump]
114 | Vector<UserPoint> vect=new Vector<UserPoint>();
115 | UserPoint p=new UserPoint();
116 | p=li.get(k);
117 | vect.add(p);
118 | list.add(vect);
119 | }
120 | System.out.println("第1次迭代");
121 | // by convention, element 0 of each Vector in list is the centroid
122 | for(int i=K;i< ... [remainder of MyKmeansForUser.java truncated in this dump]
--------------------------------------------------------------------------------
[... a decision-tree utility file under myid3 begins here; its path and opening lines were lost in the dump ...]
--------------------------------------------------------------------------------
... (List<String> li){ // [signature truncated: computes entropy over a list of class labels]
19 | Double entropy=new Double(0.0);
20 | for(int i=0;i< ... [entropy loop and the gain method's signature truncated] ... lasv){
35 | Double gain=new Double(0.0);
36 | Double enSum=new Double(0.0);
37 | Map.Entry entry; // [type parameters lost in the dump]
38 | for(int i=0;i< ... [remainder of the utility class truncated in this dump]
--------------------------------------------------------------------------------
/com.homework/src/sequence/machinelearning/decisiontree/sequence/machinelearning/decisiontree/myid3/TreeNode.java:
--------------------------------------------------------------------------------
[... opening lines of TreeNode.java truncated in this dump ...]
9 | private List<TreeNode> children; // child nodes
38 | for(int i=0;i children; // 子节点
10 | private String fatherAttribute; // 此节点是父类的哪具属性的分支
11 | //可信度
12 | private Double percent;
13 |
14 | // attribute list
15 | private ArrayList liatts;
16 |
17 |
18 | public ArrayList getLiatts() {
19 | return liatts;
20 | }
21 | public void setLiatts(ArrayList liatts) {
22 | this.liatts = liatts;
23 | }
24 | public String getName() {
25 | return name;
26 | }
27 | public void setName(String name) {
28 | this.name = name;
29 | }
30 | public TreeNode getParent() {
31 | return parent;
32 | }
33 | public void setParent(TreeNode parent) {
34 | this.parent = parent;
35 | }
36 | public List<TreeNode> getChildren() {
37 | return children;
38 | }
39 | public void setChildren(List<TreeNode> children) {
40 | this.children = children;
41 | }
42 |
43 | public String getFatherAttribute() {
44 | return fatherAttribute;
45 | }
46 | public void setFatherAttribute(String fatherAttribute) {
47 | this.fatherAttribute = fatherAttribute;
48 | }
49 | public Double getPercent() {
50 | return percent;
51 | }
52 | public void setPercent(Double percent) {
53 | this.percent = percent;
54 | }
55 | /**
56 |  * Add a child node
57 |  * @param child
58 |  */
59 | public void addChild(TreeNode child) {
60 | if (this.getChildren() == null) {
61 | List<TreeNode> list = new ArrayList<TreeNode>();
62 | list.add(child);
63 | this.setChildren(list);
64 | } else {
65 | this.getChildren().add(child);
66 | }
67 | }
68 | /**
69 |  * Return the child with the given name if it exists, otherwise null
70 |  * @param name
71 |  * @return
72 |  */
73 | public TreeNode findChild(String name) {
74 | List<TreeNode> children = this.getChildren();
75 | if (children != null) {
76 | for (TreeNode child : children) {
77 | if (child.getName().equals(name)) {
78 | return child;
79 | }
80 | }
81 | }
82 | return null;
83 | }
84 |
85 | }
86 |
--------------------------------------------------------------------------------
/com.homework/src/sequence/machinelearning/decisiontree/sequence/machinelearning/decisiontree/myid3/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 | *
3 | */
4 | /**
5 | * @author Administrator
6 | *
7 | */
8 | package sequence.machinelearning.decisiontree.myid3;
--------------------------------------------------------------------------------
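The entropy and information-gain routines above were largely lost to the dump. As a hedged sketch of the standard ID3 quantities they compute, Shannon entropy of a label list and the gain of an attribute split, with illustrative names:

import java.util.HashMap;
import java.util.List;
import java.util.Map;

// Illustrative ID3 math: H(S) = -sum p_c * log2(p_c); gain = H(S) - sum (|Sv|/|S|) * H(Sv).
public class Id3Sketch {
    public static double entropy(List<String> labels) {
        Map<String, Integer> counts = new HashMap<String, Integer>();
        for (String l : labels) counts.put(l, counts.containsKey(l) ? counts.get(l) + 1 : 1);
        double h = 0.0, n = labels.size();
        for (int c : counts.values()) {
            double p = c / n;
            h -= p * (Math.log(p) / Math.log(2)); // log base 2
        }
        return h;
    }

    // gain of splitting `labels` into the groups produced by one attribute's values
    public static double gain(List<String> labels, Map<String, List<String>> splits) {
        double g = entropy(labels), n = labels.size();
        for (List<String> sub : splits.values()) {
            g -= (sub.size() / n) * entropy(sub); // weighted entropy of each branch
        }
        return g;
    }
}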
/com.homework/src/sequence/machinelearning/naivebayes/sequence/machinelearning/naivebayes/bayesdemo/Main.java:
--------------------------------------------------------------------------------
1 | package sequence.machinelearning.naivebayes.bayesdemo;
2 |
3 | import java.io.BufferedReader;
4 | import java.io.File;
5 | import java.io.FileInputStream;
6 | import java.io.FileOutputStream;
7 | import java.io.FileWriter;
8 | import java.io.IOException;
9 | import java.io.InputStreamReader;
10 |
11 | public class Main {
12 |
13 | public static void main(String[] args) throws IOException {
14 | // TODO Auto-generated method stub
15 | Main m=new Main();
16 | m.stringBufferDemo();
17 | //m.fileWriter("D:/test.txt");
18 | m.readF1();
19 | }
20 |
21 | public void fileWriter(String fileName) throws IOException{
22 | // create a FileWriter
23 | FileWriter fw = new FileWriter(fileName);
24 | // write ten sample lines to fileName
25 | for (int i=0;i<10;i++){
26 | fw.write("line "+i+"----");
27 | fw.write("\n");
28 | }
29 | // flush the buffer
30 | fw.flush();
31 | // close the stream
32 | fw.close();
33 | }
34 |
35 |
36 |
37 | /**
38 |  * Write a file using a StringBuffer.
39 |  * This approach lets you choose the encoding explicitly, which avoids mangled Chinese text.
40 |  * @throws IOException
41 |  */
42 |
43 | public void stringBufferDemo() throws IOException
44 | {
45 | String src="datafile/naivebayes/train/out/result.arff";
46 | delfile(src);
47 | File file=new File(src);
48 | if(!file.exists()) // bug fix: the original tested file.exists(), which never creates a missing file
49 | file.createNewFile();
50 | FileOutputStream out=new FileOutputStream(file,true);
51 | for(int i=0;i<10;i++)
52 | {
53 | StringBuffer sb=new StringBuffer();
54 | sb.append("这是第"+i+"行 \n");//如果不加"/n"则不能实现换行。
55 | System.out.print(sb.toString());
56 |
57 | out.write(sb.toString().getBytes("utf-8"));
58 | }
59 | out.close();
60 | }
61 | public void delfile(String filepath){
62 | File file=new File(filepath);
63 | if(file.exists())
64 | {
65 | //file.createNewFile();
66 | file.delete();
67 | }
68 |
69 | }
70 | public void readF1() throws IOException {
71 |
72 | //String filePath="scripts/clustering/canopy/canopy.dat";
73 | String filePath="datafile/naivebayes/train/out/result";
74 | BufferedReader br = new BufferedReader(new InputStreamReader(
75 | new FileInputStream(filePath)));
76 | for (String line = br.readLine(); line != null; line = br.readLine()) {
77 | if(line.length()==0||"".equals(line))continue;
78 | String[] str=line.split(",");
79 |
80 |
81 | }
82 | br.close();
83 |
84 | }
85 |
86 |
87 | }
88 |
--------------------------------------------------------------------------------
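Main.stringBufferDemo above chooses the encoding by calling getBytes("utf-8") on every write. A more idiomatic equivalent, as a hedged sketch, is an OutputStreamWriter constructed with an explicit charset; the file name here is made up:

import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;

// Illustrative: explicit-charset writing without manual getBytes() calls.
public class Utf8WriteSketch {
    public static void main(String[] args) throws IOException {
        Writer w = new OutputStreamWriter(new FileOutputStream("out.txt", true), "UTF-8");
        try {
            for (int i = 0; i < 10; i++) {
                w.write("this is line " + i + "\n"); // "\n" makes the line break
            }
        } finally {
            w.close(); // flushes and closes the underlying stream
        }
    }
}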
/com.homework/src/sequence/machinelearning/naivebayes/sequence/machinelearning/naivebayes/bayesdemo/Test.java:
--------------------------------------------------------------------------------
1 | package sequence.machinelearning.naivebayes.bayesdemo;
2 |
3 | import java.io.BufferedReader;
4 | import java.io.File;
5 | import java.io.FileReader;
6 | import java.io.IOException;
7 | import java.math.BigDecimal;
8 | import java.util.ArrayList;
9 | import java.util.HashMap;
10 | import java.util.Map;
11 | import java.util.regex.Matcher;
12 | import java.util.regex.Pattern;
13 |
14 | public class Test {
15 |
16 | private static Map<String,String> cmap=new HashMap<String,String>(); // [type parameters lost in the dump; String values inferred from the BigDecimal(String) use below]
17 | private static Map<String,String> pmap=new HashMap<String,String>();
18 | public static final String patternString = "@decision(.*)[{](.*?)[}]";
19 | public BigDecimal getProbability(String[] line,String decision){
20 |
21 | String ckey="P("+decision+")";
22 | //获取P(yes)的概率
23 | BigDecimal result=new BigDecimal(cmap.get(ckey));
24 | for(int j=0;j< ... [remainder of Test.java truncated in this dump]
--------------------------------------------------------------------------------
[... a Hadoop MapReduce job file begins here; its path and opening lines were lost in the dump. The class is DayIp, judging from the JobConf below ...]
--------------------------------------------------------------------------------
26 | public static class IpMapper extends MapReduceBase implements Mapper<Object, Text, Text, IntWritable>{ // [declaration reconstructed from the map() signature below]
27 | private final static IntWritable one=new IntWritable(1);
28 | Text ip=new Text();
29 | @Override
30 | public void map(Object key, Text value,OutputCollector<Text, IntWritable> output, Reporter reporter)throws IOException {
31 | // TODO Auto-generated method stub
32 | Kpi kpi=new Kpi();
33 | kpi=Kpi.filterIPs(value.toString());
34 | if(kpi.isValid()){
35 | ip.set(kpi.getRemote_addr());
36 | output.collect(ip, one);
37 | }
38 | }
39 | }
40 | public static class IpReducer extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable>{
41 | private IntWritable sumresult=new IntWritable(0);
42 | //private final static IntWritable one =new IntWritable(1);
43 |
44 |
45 | @Override
46 | public void reduce(Text key, Iterator<IntWritable> values,OutputCollector<Text, IntWritable> output, Reporter reporter)throws IOException {
47 | // bug fix: the original incremented a field once per key across the whole task, which miscounts; sum this key's values instead
48 | int sum=0;
49 | while(values.hasNext()){ sum+=values.next().get(); }
50 | sumresult.set(sum);
51 | System.out.print(key+" is:"+sumresult);
52 | output.collect(key, sumresult);
53 | }
54 | }
55 | /**
56 | * @param args
57 | */
58 | public static void main(String[] args) throws Exception{
59 | // TODO Auto-generated method stub
60 | String inpath="hdfs://10.6.3.200:9000/user/hdfs/in/";
61 | String outpath="hdfs://10.6.3.200:9000/user/hdfs/ip_out/";
62 |
63 | JobConf conf=new JobConf(DayIp.class);
64 | conf.setJobName("depend ip count is:");
65 |
66 | conf.setMapOutputKeyClass(Text.class);
67 | conf.setMapOutputValueClass(IntWritable.class);
68 |
69 | conf.setOutputKeyClass(Text.class);
70 | conf.setOutputValueClass(IntWritable.class);
71 |
72 | conf.setMapperClass(IpMapper.class);
73 | conf.setReducerClass(IpReducer.class);
74 | conf.setCombinerClass(IpReducer.class);
75 |
76 | conf.setInputFormat(TextInputFormat.class);
77 | conf.setOutputFormat(TextOutputFormat.class);
78 |
79 | FileInputFormat.setInputPaths(conf, new Path(inpath));
80 | FileOutputFormat.setOutputPath(conf,new Path(outpath));
81 |
82 | JobClient.runJob(conf);
83 | System.out.println("finish");
84 | System.exit(0);
85 |
86 | }
87 |
88 | }
89 |
--------------------------------------------------------------------------------
/com.homework/src/week2/business/StatPV.java:
--------------------------------------------------------------------------------
1 | package business;
2 |
3 | import java.io.IOException;
4 | import java.util.Iterator;
5 |
6 | import org.apache.hadoop.fs.Path;
7 | import org.apache.hadoop.io.IntWritable;
8 | import org.apache.hadoop.io.Text;
9 | import org.apache.hadoop.mapred.FileInputFormat;
10 | import org.apache.hadoop.mapred.FileOutputFormat;
11 | import org.apache.hadoop.mapred.JobClient;
12 | import org.apache.hadoop.mapred.JobConf;
13 | import org.apache.hadoop.mapred.MapReduceBase;
14 | import org.apache.hadoop.mapred.Mapper;
15 | import org.apache.hadoop.mapred.OutputCollector;
16 | import org.apache.hadoop.mapred.Reducer;
17 | import org.apache.hadoop.mapred.Reporter;
18 | import org.apache.hadoop.mapred.TextInputFormat;
19 | import org.apache.hadoop.mapred.TextOutputFormat;
20 |
21 | import entity.Kpi;
22 |
23 | public class StatPV {
24 |
25 | private static class PvMapper extends MapReduceBase implements Mapper