├── README.md └── com.homework ├── .classpath ├── .classpath.bak ├── .gitignore ├── .gitignore.bak ├── .project ├── .settings ├── org.eclipse.core.resources.prefs ├── org.eclipse.jdt.core.prefs └── org.eclipse.m2e.core.prefs ├── datafile ├── association │ ├── cnItems.dat │ ├── fpg │ ├── fpg2 │ ├── items │ └── user2items.csv ├── cluster │ ├── data.csv │ └── simple_k-means.txt ├── decisiontree │ ├── test │ │ └── in │ │ │ └── weather.nominal.arff │ └── train │ │ └── in │ │ └── weather.nominal.arff ├── hosts ├── hosts.txt ├── naivebayes │ ├── test │ │ └── in │ │ │ └── test.arff │ └── train │ │ ├── in │ │ └── weather.nominal.arff │ │ └── out │ │ └── trainresult.arff ├── week5 │ ├── Ma │ ├── Mb │ ├── SparseMatrix │ │ ├── a.txt │ │ └── b.txt │ ├── a.txt │ ├── b.txt │ ├── small.csv │ ├── small2.csv │ └── test │ │ ├── Ma │ │ └── Mb └── week6 │ ├── job.csv │ └── pv.csv ├── lib ├── je-analysis-1.5.1.jar ├── lucene-core-2.3.0.jar ├── lucene-core-3.1.0.jar ├── paoding-analysis.jar └── 说明 ├── pom.xml ├── scripts ├── clustering │ └── canopy │ │ ├── canopy-mahout.txt │ │ └── canopy.dat ├── fp-growth │ ├── fpg-mahout.txt │ └── fpg.txt ├── hive │ ├── HiveJDBC.java │ └── sql.hive ├── week10 │ ├── 1.pig │ ├── common_friend.pig │ ├── karate.csv │ ├── w10.pig │ ├── 杂文件 │ │ ├── common_prj.java.bak │ │ ├── karate2.csv │ │ ├── karate2.csv.bak │ │ ├── mytest.txt │ │ ├── noway │ │ └── tes2.txt │ └── 计算33的好友推荐(不关注别人的没有推荐) │ │ ├── common.java │ │ ├── common.java.bak │ │ ├── common_flt.java │ │ ├── common_flt.java.bak │ │ ├── common_grp.java │ │ ├── common_jnd.java │ │ ├── common_prj.java │ │ ├── pig.pig │ │ └── user.java ├── week13 │ └── week13 ├── week8.rar ├── week8 │ ├── homework.txt │ ├── week8.pig │ └── week8.txt └── week9 │ └── pagerank.r └── src ├── common └── com │ └── homework │ └── hdfs │ ├── HdfsDAO.java │ └── package-info.java ├── hadoop └── machinelearning │ └── clustering │ └── hadoop │ └── machinelearning │ └── clustering │ ├── canopy │ └── package-info.java │ └── kmeans │ ├── KmeansHadoop.java │ └── package-info.java ├── main └── java │ └── com │ └── homework │ └── App.java ├── mommon ├── com │ └── homework │ │ └── mommon │ │ ├── ComTest.java │ │ └── package-info.java └── mytest │ ├── MenuTree.java │ ├── Node.java │ ├── Recursive.java │ └── package-info.java ├── sequence └── machinelearning │ ├── association │ └── sequence │ │ └── machinelearning │ │ └── association │ │ ├── apriori │ │ ├── ItemMap.java │ │ ├── MyApriori.java │ │ ├── Subset.java │ │ └── package-info.java │ │ ├── common │ │ ├── Definition.java │ │ ├── Mytest.java │ │ ├── ReadData.java │ │ ├── SortTest.java │ │ ├── Transaction.java │ │ └── package-info.java │ │ ├── fpgrowth │ │ ├── Myfptree2.java │ │ ├── TreeNode2.java │ │ └── package-info.java │ │ ├── fpgtest │ │ ├── FPTree.java │ │ ├── TreeNode.java │ │ └── package-info.java │ │ └── otherdemo │ │ ├── Apriori.java │ │ ├── Apriori_1.java │ │ ├── Apriori_NathanMagnus.java │ │ └── package-info.java │ ├── clustering │ └── sequence │ │ └── machinelearning │ │ └── clustering │ │ ├── canopy │ │ ├── MyCanopy.java │ │ ├── Point.java │ │ ├── UserPoint.java │ │ └── package-info.java │ │ └── kmeans │ │ ├── MyKmeans.java │ │ ├── MyKmeansForUser.java │ │ └── package-info.java │ ├── decisiontree │ └── sequence │ │ └── machinelearning │ │ └── decisiontree │ │ ├── c45 │ │ ├── DecisionTreeNode.java │ │ ├── DecisionTreeUtil.java │ │ ├── SequenceComparator.java │ │ ├── c4.java │ │ └── package-info.java │ │ ├── id3 │ │ ├── DicisionTree.java │ │ ├── OtherID3.java │ │ └── package-info.java │ │ ├── id3test │ │ 
├── DTreeUtil.java │ │ ├── ID3.java │ │ ├── SequenceComparator.java │ │ ├── TreeNode.java │ │ └── package-info.java │ │ ├── myc45 │ │ └── package-info.java │ │ └── myid3 │ │ ├── Maxgain.java │ │ ├── MyID3.java │ │ ├── Point.java │ │ ├── TheMath.java │ │ ├── TreeNode.java │ │ └── package-info.java │ └── naivebayes │ └── sequence │ └── machinelearning │ └── naivebayes │ ├── bayesdemo │ ├── Main.java │ ├── Test.java │ ├── Train.java │ └── package-info.java │ └── textmining │ ├── ParticipleTest.java │ └── package-info.java ├── test └── java │ └── com │ └── homework │ └── AppTest.java ├── week2 ├── business │ ├── DayIp.java │ ├── StatPV.java │ └── package-info.java └── entity │ ├── Kpi.java │ └── package-info.java ├── week3 ├── mine │ ├── Outinfo.java │ ├── StationInfo.java │ ├── StayTime.java │ ├── StayTime2.java │ ├── StayTime2改造前备份.rar │ ├── my.net │ ├── my.pos │ └── package-info.java └── tutorial │ ├── BaseStationDataPreprocess.java │ ├── TableLine.java │ └── package-info.java ├── week5 ├── matrix │ ├── Bigmmult.java │ ├── MatrixMult.java │ ├── Multiply.java │ ├── MyTest.java │ ├── Recommend.java │ ├── SparseMatrix.java │ └── package-info.java └── recommend │ ├── MainPodium.java │ ├── Step1.java │ ├── Step2.java │ ├── Step3.java │ ├── Step4.java │ └── package-info.java ├── week6 ├── filterSalary │ ├── Main.java │ ├── Step0.java │ ├── Step1.java │ ├── Step2.java │ ├── Step3.java │ └── package-info.java ├── recommendJob │ ├── ItemLoglikelihood.java │ ├── UserCityBlock.java │ └── package-info.java └── test │ └── package-info.java ├── week7 ├── classfier │ ├── Main.java │ ├── PaodingFirst.java │ ├── PaodingTest.java │ └── package-info.java ├── dic │ ├── .compiled │ │ └── most-words-mode │ │ │ ├── .metadata │ │ │ ├── vocabulary.dic.compiled │ │ │ ├── x-confucian-family-name.dic.compiled │ │ │ ├── x-for-combinatorics.dic.compiled │ │ │ ├── x-noise-charactor.dic.compiled │ │ │ ├── x-noise-word.dic.compiled │ │ │ └── x-unit.dic.compiled │ ├── administrative.dic │ ├── appellation.dic │ ├── company.dic │ ├── comupter-science.dic │ ├── contemporary-words.dic │ ├── division │ │ ├── africa.dic │ │ ├── america.dic │ │ ├── china.dic │ │ ├── europe.dic │ │ ├── japan.dic │ │ ├── korea.dic │ │ ├── oceania.dic │ │ ├── readme.txt │ │ └── taiwan.dic │ ├── festival.dic │ ├── language.dic │ ├── locale │ │ ├── beijing.dic │ │ ├── fuzhou.dic │ │ ├── quanzhou.dic │ │ ├── readme.txt │ │ └── xiamen.dic │ ├── name-foreign.dic │ ├── nation.dic │ ├── org-domestic.dic │ ├── org-foreign.dic │ ├── paoding-dic-names.properties │ ├── star-domestic.dic │ ├── star-foreign.dic │ ├── t-base.dic │ ├── x-confucian-family-name.dic │ ├── x-for-combinatorics.dic │ ├── x-noise-charactor.dic │ ├── x-noise-word.dic │ └── x-unit.dic └── myInputFormat │ ├── JamesInputFormat.java │ ├── JamesRecordReader.java │ └── package-info.java └── week8 └── mrclassify └── package-info.java /README.md: -------------------------------------------------------------------------------- 1 | myhomework 2 | ========== 3 | -------------------------------------------------------------------------------- /com.homework/.classpath: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | -------------------------------------------------------------------------------- /com.homework/.classpath.bak: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | -------------------------------------------------------------------------------- /com.homework/.gitignore: -------------------------------------------------------------------------------- 1 | /target/ 2 | /target/classes/META-INF/maven/com/com.homework/pom.properties 3 | .project 4 | .settings 5 | target 6 | *.log 7 | data 8 | build 9 | bin 10 | assets 11 | runtime 12 | *.class 13 | *.war 14 | *.ear 15 | input 16 | output 17 | 18 | -------------------------------------------------------------------------------- /com.homework/.gitignore.bak: -------------------------------------------------------------------------------- 1 | /target/ 2 | /target/classes/META-INF/maven/com/com.homework/pom.properties 3 | 4 | .project 5 | 6 | .settings 7 | target 8 | *.log 9 | data 10 | build 11 | bin 12 | assets 13 | runtime 14 | *.class 15 | *.war 16 | *.ear 17 | input 18 | output 19 | 20 | -------------------------------------------------------------------------------- /com.homework/.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | com.homework 4 | 5 | 6 | 7 | 8 | 9 | org.eclipse.jdt.core.javabuilder 10 | 11 | 12 | 13 | 14 | org.eclipse.m2e.core.maven2Builder 15 | 16 | 17 | 18 | 19 | 20 | org.eclipse.jdt.core.javanature 21 | org.eclipse.m2e.core.maven2Nature 22 | 23 | 24 | -------------------------------------------------------------------------------- /com.homework/.settings/org.eclipse.core.resources.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | encoding//scripts/fp-growth/fpg-mahout.txt=UTF-8 3 | encoding//src/main/java=UTF-8 4 | encoding//src/test/java=UTF-8 5 | encoding/=UTF-8 6 | -------------------------------------------------------------------------------- /com.homework/.settings/org.eclipse.jdt.core.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled 3 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.7 4 | org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve 5 | org.eclipse.jdt.core.compiler.compliance=1.7 6 | org.eclipse.jdt.core.compiler.debug.lineNumber=generate 7 | org.eclipse.jdt.core.compiler.debug.localVariable=generate 8 | org.eclipse.jdt.core.compiler.debug.sourceFile=generate 9 | org.eclipse.jdt.core.compiler.problem.assertIdentifier=error 10 | org.eclipse.jdt.core.compiler.problem.enumIdentifier=error 11 | org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning 12 | org.eclipse.jdt.core.compiler.source=1.7 13 | -------------------------------------------------------------------------------- /com.homework/.settings/org.eclipse.m2e.core.prefs: -------------------------------------------------------------------------------- 1 | activeProfiles= 2 | eclipse.preferences.version=1 3 | resolveWorkspaceProjects=true 4 | version=1 5 | -------------------------------------------------------------------------------- /com.homework/datafile/association/cnItems.dat: -------------------------------------------------------------------------------- 1 | 1 牛奶,鸡蛋,面包,薯片 2 | 2 鸡蛋,爆米花,薯片,啤酒 3 | 3 鸡蛋,面包,薯片 4 | 4 
牛奶,鸡蛋,面包,爆米花,薯片,啤酒 5 | 5 牛奶,面包,啤酒 6 | 6 鸡蛋,面包,啤酒 7 | 7 牛奶,面包,薯片 8 | 8 牛奶,鸡蛋,面包,黄油,薯片 9 | 9 牛奶,鸡蛋,黄油,薯片 -------------------------------------------------------------------------------- /com.homework/datafile/association/fpg: -------------------------------------------------------------------------------- 1 | 牛奶,鸡蛋,面包,薯片 2 | 鸡蛋,爆米花,薯片,啤酒 3 | 鸡蛋,面包,薯片 4 | 牛奶,鸡蛋,面包,爆米花,薯片,啤酒 5 | 牛奶,面包,啤酒 6 | 鸡蛋,面包,啤酒 7 | 牛奶,面包,薯片 8 | 牛奶,鸡蛋,面包,黄油,薯片 9 | 牛奶,鸡蛋,黄油,薯片 -------------------------------------------------------------------------------- /com.homework/datafile/association/fpg2: -------------------------------------------------------------------------------- 1 | I1,I2,I5 2 | I2,I4 3 | I2,I3 4 | I1,I2,I4 5 | I1,I3 6 | I2,I3 7 | I1,I3 8 | I1,I2,I3,I5 9 | I1,I2,I3 -------------------------------------------------------------------------------- /com.homework/datafile/association/items: -------------------------------------------------------------------------------- 1 | T100 I1,I2,I5 2 | T200 I2,I4 3 | T300 I2,I3 4 | T400 I1,I2,I4 5 | T500 I1,I3 6 | T600 I2,I3 7 | T700 I1,I3 8 | T800 I1,I2,I3,I5 9 | T900 I1,I2,I3 -------------------------------------------------------------------------------- /com.homework/datafile/cluster/simple_k-means.txt: -------------------------------------------------------------------------------- 1 | 1 1 2 | 2 1 3 | 1 2 4 | 2 2 5 | 3 3 6 | 8 8 7 | 8 9 8 | 9 8 9 | 9 9 -------------------------------------------------------------------------------- /com.homework/datafile/decisiontree/test/in/weather.nominal.arff: -------------------------------------------------------------------------------- 1 | 2 | #存放做决策的属性,一般是或否 3 | @decision 4 | yes,no 5 | 6 | @attribute outlook {sunny, overcast, rainy} 7 | @attribute temperature {hot, mild, cool} 8 | @attribute humidity {high, normal} 9 | @attribute windy {TRUE, FALSE} 10 | 11 | 12 | @data 13 | sunny,hot,high,FALSE,no 14 | sunny,hot,high,TRUE,no 15 | overcast,hot,high,FALSE,yes 16 | rainy,mild,high,FALSE,yes 17 | rainy,cool,normal,FALSE,yes 18 | rainy,cool,normal,TRUE,no 19 | overcast,cool,normal,TRUE,yes 20 | sunny,mild,high,FALSE,no 21 | sunny,cool,normal,FALSE,yes 22 | rainy,mild,normal,FALSE,yes 23 | sunny,mild,normal,TRUE,yes 24 | overcast,mild,high,TRUE,yes 25 | overcast,hot,normal,FALSE,yes 26 | rainy,mild,high,TRUE,no -------------------------------------------------------------------------------- /com.homework/datafile/decisiontree/train/in/weather.nominal.arff: -------------------------------------------------------------------------------- 1 | 2 | #存放做决策的属性,一般是或否 3 | @decision 4 | yes,no 5 | 6 | @attribute outlook {sunny, overcast, rainy} 7 | @attribute temperature {hot, mild, cool} 8 | @attribute humidity {high, normal} 9 | @attribute windy {TRUE, FALSE} 10 | 11 | 12 | @data 13 | sunny,hot,high,FALSE,no 14 | sunny,hot,high,TRUE,no 15 | overcast,hot,high,FALSE,yes 16 | rainy,mild,high,FALSE,yes 17 | rainy,cool,normal,FALSE,yes 18 | rainy,cool,normal,TRUE,no 19 | overcast,cool,normal,TRUE,yes 20 | sunny,mild,high,FALSE,no 21 | sunny,cool,normal,FALSE,yes 22 | rainy,mild,normal,FALSE,yes 23 | sunny,mild,normal,TRUE,yes 24 | overcast,mild,high,TRUE,yes 25 | overcast,hot,normal,FALSE,yes 26 | rainy,mild,high,TRUE,no -------------------------------------------------------------------------------- /com.homework/datafile/naivebayes/test/in/test.arff: -------------------------------------------------------------------------------- 1 | @decision 2 | yes,no 3 | @attribute outlook {sunny, overcast, rainy} 4 | @attribute temperature {hot, mild, 
cool} 5 | @attribute humidity {high, normal} 6 | @attribute windy {TRUE, FALSE} 7 | @data 8 | sunny,hot,high,FALSE 9 | overcast,mild,high,TRUE 10 | overcast,hot,normal,FALSE 11 | rainy,mild,high,TRUE -------------------------------------------------------------------------------- /com.homework/datafile/naivebayes/train/in/weather.nominal.arff: -------------------------------------------------------------------------------- 1 | #存放做决策的属性,一般是或否 2 | @decision 3 | yes,no 4 | @attribute outlook {sunny, overcast, rainy} 5 | @attribute temperature {hot, mild, cool} 6 | @attribute humidity {high, normal} 7 | @attribute windy {TRUE, FALSE} 8 | @data 9 | sunny,hot,high,FALSE,no 10 | sunny,hot,high,TRUE,no 11 | overcast,hot,high,FALSE,yes 12 | rainy,mild,high,FALSE,yes 13 | rainy,cool,normal,FALSE,yes 14 | rainy,cool,normal,TRUE,no 15 | overcast,cool,normal,TRUE,yes 16 | sunny,mild,high,FALSE,no 17 | sunny,cool,normal,FALSE,yes 18 | rainy,mild,normal,FALSE,yes 19 | sunny,mild,normal,TRUE,yes 20 | overcast,mild,high,TRUE,yes 21 | overcast,hot,normal,FALSE,yes 22 | rainy,mild,high,TRUE,no -------------------------------------------------------------------------------- /com.homework/datafile/naivebayes/train/out/trainresult.arff: -------------------------------------------------------------------------------- 1 | @decision P(yes) {0.7142857142857143} 2 | @decision P(no) {0.42857142857142855} 3 | @data 4 | P(outlook=sunny|yes),0.3 5 | P(outlook=sunny|no),0.6666666666666666 6 | P(outlook=overcast|yes),0.5 7 | P(outlook=overcast|no),0.16666666666666666 8 | P(outlook=rainy|yes),0.4 9 | P(outlook=rainy|no),0.5 10 | P(temperature=hot|yes),0.3 11 | P(temperature=hot|no),0.5 12 | P(temperature=mild|yes),0.5 13 | P(temperature=mild|no),0.5 14 | P(temperature=cool|yes),0.4 15 | P(temperature=cool|no),0.3333333333333333 16 | P(humidity=high|yes),0.4 17 | P(humidity=high|no),0.8333333333333334 18 | P(humidity=normal|yes),0.7 19 | P(humidity=normal|no),0.3333333333333333 20 | P(windy=TRUE|yes),0.4 21 | P(windy=TRUE|no),0.6666666666666666 22 | P(windy=FALSE|yes),0.7 23 | P(windy=FALSE|no),0.5 24 | -------------------------------------------------------------------------------- /com.homework/datafile/week5/Ma: -------------------------------------------------------------------------------- 1 | 1,1,1 2 | 2,1,2 3 | 2,2,3 -------------------------------------------------------------------------------- /com.homework/datafile/week5/Mb: -------------------------------------------------------------------------------- 1 | 1,1,2 2 | 1,2,4 3 | 2,1,1 4 | 2,2,2 -------------------------------------------------------------------------------- /com.homework/datafile/week5/SparseMatrix/a.txt: -------------------------------------------------------------------------------- 1 | 1,1,1 2 | 1,2,2 3 | 1,3,3 4 | 2,1,4 5 | 2,2,5 6 | 3,1,7 7 | 3,2,8 8 | 3,3,9 9 | 4,1,10 10 | 4,2,11 11 | 4,3,12 -------------------------------------------------------------------------------- /com.homework/datafile/week5/SparseMatrix/b.txt: -------------------------------------------------------------------------------- 1 | 1,1,10 2 | 1,2,15 3 | 2,2,2 4 | 3,1,11 5 | 3,2,9 -------------------------------------------------------------------------------- /com.homework/datafile/week5/a.txt: -------------------------------------------------------------------------------- 1 | 1,1,1 2 | 1,2,2 3 | 1,3,3 4 | 2,1,4 5 | 2,2,5 6 | 3,1,7 7 | 3,2,8 8 | 3,3,9 9 | 4,1,10 10 | 4,2,11 11 | 4,3,12 -------------------------------------------------------------------------------- 
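Note on the week5 matrix data files (Ma, Mb, a.txt, b.txt, SparseMatrix/a.txt, SparseMatrix/b.txt): each line encodes one matrix entry as row,col,value with 1-based indices, which appears to be the input format consumed by the matrix-multiplication code under src/week5/matrix. A minimal parsing sketch follows (illustrative only, not a file in this repository; the class and method names are invented for the example):

import java.util.Locale;

public class MatrixTripletSketch {
    // One non-zero matrix entry, as stored one-per-line in the week5 data files.
    static final class Entry {
        final int row;
        final int col;
        final double value;
        Entry(int row, int col, double value) { this.row = row; this.col = col; this.value = value; }
    }

    // Parse one "row,col,value" line, e.g. "1,2,2" -> row 1, column 2, value 2.0.
    static Entry parse(String line) {
        String[] p = line.trim().split(",");
        return new Entry(Integer.parseInt(p[0]), Integer.parseInt(p[1]), Double.parseDouble(p[2]));
    }

    public static void main(String[] args) {
        Entry e = parse("1,2,2");
        System.out.println(String.format(Locale.ROOT, "(%d,%d)=%.1f", e.row, e.col, e.value));
    }
}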
/com.homework/datafile/week5/b.txt: -------------------------------------------------------------------------------- 1 | 1,1,10 2 | 1,2,15 3 | 2,2,2 4 | 3,1,11 5 | 3,2,9 -------------------------------------------------------------------------------- /com.homework/datafile/week5/small.csv: -------------------------------------------------------------------------------- 1 | 1,101,5.0 2 | 1,102,3.0 3 | 1,103,2.5 4 | 2,101,2.0 5 | 2,102,2.5 6 | 2,103,5.0 7 | 2,104,2.0 8 | 3,101,2.0 9 | 3,104,4.0 10 | 3,105,4.5 11 | 3,107,5.0 12 | 4,101,5.0 13 | 4,103,3.0 14 | 4,104,4.5 15 | 4,106,4.0 16 | 5,101,4.0 17 | 5,102,3.0 18 | 5,103,2.0 19 | 5,104,4.0 20 | 5,105,3.5 21 | 5,106,4.0 -------------------------------------------------------------------------------- /com.homework/datafile/week5/small2.csv: -------------------------------------------------------------------------------- 1 | 1,101,5.0 2 | 1,102,3.0 3 | 1,103,2.5 4 | 2,101,2.0 5 | 2,102,2.5 6 | 2,103,5.0 7 | 2,104,2.0 8 | 3,101,2.0 9 | 3,104,4.0 10 | 3,105,4.5 11 | 3,107,5.0 12 | 4,101,5.0 13 | 4,103,3.0 14 | 4,104,4.5 15 | 4,106,4.0 16 | 5,101,4.0 17 | 5,102,3.0 18 | 5,103,2.0 19 | 5,104,4.0 20 | 5,105,3.5 21 | 5,106,4.0 22 | 6,102,4.0 23 | 6,103,2.0 24 | 6,105,3.5 25 | 6,107,4.0 -------------------------------------------------------------------------------- /com.homework/datafile/week5/test/Ma: -------------------------------------------------------------------------------- 1 | 1,1,1 2 | 1,2,2 3 | 2,1,2 4 | 2,2,3 -------------------------------------------------------------------------------- /com.homework/datafile/week5/test/Mb: -------------------------------------------------------------------------------- 1 | 1,1,2 2 | 1,2,4 3 | 2,1,1 4 | 2,2,2 -------------------------------------------------------------------------------- /com.homework/datafile/week6/job.csv: -------------------------------------------------------------------------------- 1 | 1,2013-01-24,5600 2 | 2,2011-03-02,5400 3 | 3,2011-03-14,8100 4 | 4,2012-10-05,2200 5 | 5,2011-09-03,14100 6 | 6,2011-03-05,6500 7 | 7,2012-06-06,37000 8 | 8,2013-02-18,5500 9 | 9,2010-07-05,7500 10 | 10,2010-01-23,6700 11 | 11,2011-09-19,5200 12 | 12,2010-01-19,29700 13 | 13,2013-09-28,6000 14 | 14,2013-10-23,3300 15 | 15,2010-10-09,2700 16 | 16,2010-07-14,5100 17 | 17,2010-05-13,29000 18 | 18,2010-01-16,21800 19 | 19,2013-05-23,5700 20 | 20,2011-04-24,5900 21 | 21,2011-09-07,4500 22 | 22,2011-02-20,8100 23 | 23,2012-10-15,6300 24 | 24,2010-04-16,15500 25 | 25,2011-08-22,6300 26 | 26,2011-08-10,8800 27 | 27,2010-09-01,7700 28 | 28,2013-10-16,4300 29 | 29,2010-03-04,8100 30 | 30,2010-05-01,9200 31 | 31,2011-04-16,7700 32 | 32,2013-09-04,2300 33 | 33,2010-05-26,17400 34 | 34,2011-04-14,4000 35 | 35,2010-09-29,5700 36 | 36,2010-04-11,2800 37 | 37,2010-07-26,3600 38 | 38,2011-05-04,17200 39 | 39,2013-04-03,6000 40 | 40,2011-10-21,8400 41 | 41,2010-01-11,5600 42 | 42,2012-03-17,6400 43 | 43,2010-07-10,8800 44 | 44,2010-09-22,22100 45 | 45,2012-08-31,4000 46 | 46,2011-06-11,8800 47 | 47,2010-03-08,5400 48 | 48,2010-04-29,8300 49 | 49,2011-02-05,14500 50 | 50,2011-10-24,7500 51 | 51,2011-04-17,7400 52 | 52,2011-03-19,4000 53 | 53,2010-07-02,5300 54 | 54,2010-07-21,15700 55 | 55,2013-08-09,2800 56 | 56,2013-01-14,48900 57 | 57,2011-06-14,4100 58 | 58,2010-07-30,12300 59 | 59,2010-05-13,9100 60 | 60,2013-06-19,7600 61 | 61,2010-03-13,9700 62 | 62,2013-10-15,5000 63 | 63,2012-10-02,4900 64 | 64,2010-06-08,6300 65 | 65,2010-08-02,3300 66 | 66,2010-05-03,8600 67 | 67,2013-08-23,11300 68 | 68,2010-10-03,7300 
69 | 69,2010-05-23,5200 70 | 70,2010-03-28,26400 71 | 71,2010-02-05,9300 72 | 72,2010-06-18,6900 73 | 73,2013-07-08,7500 74 | 74,2010-04-04,6600 75 | 75,2011-05-27,8700 76 | 76,2011-03-17,8800 77 | 77,2013-03-03,6500 78 | 78,2012-01-29,6800 79 | 79,2010-07-19,4900 80 | 80,2010-01-13,5600 81 | 81,2013-01-22,7800 82 | 82,2010-07-05,7500 83 | 83,2010-04-17,3200 84 | 84,2010-10-13,16100 85 | 85,2010-06-26,5400 86 | 86,2011-07-04,7500 87 | 87,2010-05-29,2100 88 | 88,2012-02-04,6500 89 | 89,2013-06-15,8400 90 | 90,2010-01-04,3600 91 | 91,2010-09-07,6900 92 | 92,2012-05-19,5700 93 | 93,2010-08-13,15300 94 | 94,2011-05-11,15700 95 | 95,2013-09-23,6100 96 | 96,2011-05-27,14900 97 | 97,2010-03-30,2700 98 | 98,2010-01-15,2900 99 | 99,2013-07-21,12900 100 | 100,2010-07-22,7500 101 | 101,2013-03-10,7100 102 | 102,2010-07-04,9500 103 | 103,2010-01-02,7000 104 | 104,2012-05-02,8700 105 | 105,2013-04-28,8000 106 | 106,2011-04-25,5200 107 | 107,2010-10-23,9200 108 | 108,2010-07-21,5900 109 | 109,2010-07-14,8900 110 | 110,2010-09-10,3400 111 | 111,2012-05-05,6400 112 | 112,2010-10-16,2000 113 | 113,2013-03-31,8200 114 | 114,2013-08-01,8300 115 | 115,2010-04-23,5100 116 | 116,2011-10-16,6100 117 | 117,2010-03-01,3100 118 | 118,2010-06-23,4100 119 | 119,2011-10-17,14400 120 | 120,2013-07-10,3200 121 | 121,2010-06-19,5300 122 | 122,2013-04-25,9100 123 | 123,2010-06-22,3900 124 | 124,2013-09-14,7900 125 | 125,2010-03-08,5100 126 | 126,2010-01-06,8500 127 | 127,2010-08-16,5800 128 | 128,2010-05-27,12800 129 | 129,2010-03-01,14900 130 | 130,2010-08-16,9500 131 | 131,2010-01-24,5400 132 | 132,2010-05-10,6000 133 | 133,2011-01-31,3200 134 | 134,2010-08-12,4300 135 | 135,2012-09-01,6900 136 | 136,2010-08-29,6600 137 | 137,2010-01-20,7400 138 | 138,2012-02-23,4800 139 | 139,2012-09-26,8700 140 | 140,2010-02-23,9100 141 | 141,2011-10-05,5200 142 | 142,2010-04-18,44500 143 | 143,2010-06-28,10800 144 | 144,2010-09-18,12600 145 | 145,2013-08-02,6800 146 | 146,2013-09-28,8500 147 | 147,2011-09-20,19900 148 | 148,2012-09-02,9200 149 | 149,2010-03-19,11200 150 | 150,2012-01-14,3700 151 | 151,2013-02-21,6400 152 | 152,2012-09-28,7500 153 | 153,2010-05-02,5400 154 | 154,2010-03-19,17700 155 | 155,2010-10-13,2700 156 | 156,2010-09-19,9400 157 | 157,2011-08-26,10500 158 | 158,2011-08-29,9800 159 | 159,2011-02-22,18200 160 | 160,2010-03-14,5100 161 | 161,2010-08-23,6900 162 | 162,2010-01-28,11700 163 | 163,2013-07-02,6600 164 | 164,2011-09-22,6700 165 | 165,2010-07-06,7800 166 | 166,2010-01-25,8900 167 | 167,2013-06-02,9400 168 | 168,2013-01-13,2400 169 | 169,2011-03-02,2700 170 | 170,2013-02-24,5300 171 | 171,2010-10-09,5100 172 | 172,2010-09-07,6100 173 | 173,2013-09-13,5200 174 | 174,2013-05-09,4500 175 | 175,2013-09-12,36700 176 | 176,2012-05-04,8800 177 | 177,2010-08-17,12600 178 | 178,2011-08-16,8300 179 | 179,2010-08-11,5300 180 | 180,2010-04-28,8000 181 | 181,2010-04-24,6300 182 | 182,2010-03-01,10400 183 | 183,2010-05-20,6500 184 | 184,2010-01-03,4600 185 | 185,2013-09-21,5300 186 | 186,2010-04-22,7800 187 | 187,2010-08-08,6100 188 | 188,2010-07-14,6000 189 | 189,2011-06-19,6000 190 | 190,2010-01-10,12300 191 | 191,2011-07-27,2400 192 | 192,2012-02-14,12200 193 | 193,2010-02-28,2800 194 | 194,2011-10-14,14400 195 | 195,2012-03-12,3500 196 | 196,2010-04-11,3800 197 | 197,2013-03-13,18000 198 | 198,2010-07-20,41600 199 | 199,2013-10-02,9800 200 | 200,2013-02-05,7100 201 | -------------------------------------------------------------------------------- /com.homework/lib/je-analysis-1.5.1.jar: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bellonor/myHadoopProject/5b7f551dda6daf86489eeed8370d868a90a5cb8e/com.homework/lib/je-analysis-1.5.1.jar -------------------------------------------------------------------------------- /com.homework/lib/lucene-core-2.3.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bellonor/myHadoopProject/5b7f551dda6daf86489eeed8370d868a90a5cb8e/com.homework/lib/lucene-core-2.3.0.jar -------------------------------------------------------------------------------- /com.homework/lib/lucene-core-3.1.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bellonor/myHadoopProject/5b7f551dda6daf86489eeed8370d868a90a5cb8e/com.homework/lib/lucene-core-3.1.0.jar -------------------------------------------------------------------------------- /com.homework/lib/paoding-analysis.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bellonor/myHadoopProject/5b7f551dda6daf86489eeed8370d868a90a5cb8e/com.homework/lib/paoding-analysis.jar -------------------------------------------------------------------------------- /com.homework/lib/说明: -------------------------------------------------------------------------------- 1 | paoding-analysis.jar只支持lucene-core-3.1.0.jar 2 | je-analysis-1.5.1.jar不支持lucene3.0以上的,所以 3 | 用paoding只能先lucene3.1 4 | 用je只能选lucene2.3 5 | -------------------------------------------------------------------------------- /com.homework/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | com 6 | com.homework 7 | 0.0.1-SNAPSHOT 8 | jar 9 | 10 | com.homework 11 | http://maven.apache.org 12 | 13 | 14 | UTF-8 15 | 0.8 16 | 17 | 18 | 19 | 20 | org.apache.hadoop 21 | hadoop-core 22 | 1.1.2 23 | 24 | 25 | org.apache.mahout 26 | mahout-core 27 | ${mahout.version} 28 | 29 | 30 | org.apache.mahout 31 | mahout-integration 32 | ${mahout.version} 33 | 34 | 35 | org.mortbay.jetty 36 | jetty 37 | 38 | 39 | org.apache.cassandra 40 | cassandra-all 41 | 42 | 43 | me.prettyprint 44 | hector-core 45 | 46 | 47 | 48 | 49 | org.apache.hive 50 | hive-service 51 | 0.11.0 52 | 53 | 54 | 55 | 56 | junit 57 | junit 58 | 3.8.1 59 | test 60 | 61 | 62 | 63 | 64 | dom4j 65 | dom4j 66 | 1.6.1 67 | 68 | 69 | jaxen 70 | jaxen 71 | 1.1.6 72 | 73 | 74 | 75 | 76 | -------------------------------------------------------------------------------- /com.homework/scripts/clustering/canopy/canopy-mahout.txt: -------------------------------------------------------------------------------- 1 | 2 | 数据准备: 3 | canopy.dat文件 4 | 8.1 8.1 5 | 6 | 7.1 7.1 7 | 8 | 6.2 6.2 9 | 10 | 7.1 7.1 11 | 12 | 2.1 2.1 13 | 14 | 1.1 1.1 15 | 16 | 0.1 0.1 17 | 18 | 3.0 3.0 19 | 20 | # 1.转换成向量,mahout用InputDriver数据转换时候,需要数据默认用空格分隔 21 | mahout org.apache.mahout.clustering.conversion.InputDriver -i /user/hdfs/canopy/in/canopy.dat -o /user/hdfs/canopy/vecfile -v org.apache.mahout.math.RandomAccessSparseVector 22 | # 2. 
调用命令 23 | mahout canopy -i /user/hdfs/canopy/vecfile -o /user/hdfs/canopy/out/result -t1 8 -t2 4 -ow -cl 24 | 25 | 26 | # 3.查看结果 27 | 28 | mahout seqdumper -i /user/hdfs/canopy/out/result/clusters-0-final/part-r-00000 -o /home/hadoop/output/result 29 | #关联各个点 30 | mahout clusterdump -i /user/hdfs/canopy/out/result/clusters-0-final/part-r-00000 -o /home/hadoop/output/result -p /user/hdfs/canopy/out/result/clusteredPoints 31 | 32 | 33 | C-0{n=2 c=[6.888, 6.888] r=[0.237, 0.237]} 34 | Weight : [props - optional]: Point: 35 | 1.0: [8.100, 8.100] 36 | 1.0: [7.100, 7.100] 37 | 1.0: [6.200, 6.200] 38 | 1.0: [7.100, 7.100] 39 | C-1{n=2 c=[1.083, 1.083] r=[0.983, 0.983]} 40 | Weight : [props - optional]: Point: 41 | 1.0: [2.100, 2.100] 42 | 1.0: [1.100, 1.100] 43 | 1.0: [3.000, 3.000] 44 | C-2{n=1 c=[0.100, 0.100] r=[]} 45 | Weight : [props - optional]: Point: 46 | 1.0: [0.100, 0.100] -------------------------------------------------------------------------------- /com.homework/scripts/clustering/canopy/canopy.dat: -------------------------------------------------------------------------------- 1 | 8.1 8.1 2 | 7.1 7.1 3 | 6.2 6.2 4 | 7.1 7.1 5 | 2.1 2.1 6 | 1.1 1.1 7 | 0.1 0.1 8 | 3.0 3.0 -------------------------------------------------------------------------------- /com.homework/scripts/fp-growth/fpg-mahout.txt: -------------------------------------------------------------------------------- 1 | mahout fpg -i /user/hdfs/fp-growth/in/fpg.txt -o /user/hdfs/fp-growth/out -k 50 -method mapreduce -regex '[\ ]' -s 2 2 | 13周作业 3 | mahout fpg -i /user/hdfs/week13/user2items.csv -o /user/hdfs/week13/out -k 50 -method mapreduce -regex '[\ ]' -s 4 4 | 查看结果 5 | 6 | mahout seqdumper -i /user/hdfs/fp-growth/out/frequentpatterns/part-r-00000 7 | 结果: 8 | Key: I1: Value: ([I1],6), ([I2, I1],4), ([I1, I3],4), ([I2, I1, I5],2), ([I2, I1, I3],2) 9 | Key: I2: Value: ([I2],7), ([I2, I3],4), ([I2, I1],4), ([I2, I1, I5],2), ([I2, I1, I3],2), ([I2, I4],2) 10 | Key: I3: Value: ([I3],6), ([I2, I3],4), ([I1, I3],4), ([I2, I1, I3],2) 11 | Key: I4: Value: ([I2, I4],2) 12 | Key: I5: Value: ([I2, I1, I5],2) 13 | Count: 5 14 | 查看fpgrowth 15 | mahout seqdumper -i /user/hdfs/fp-growth/out/fpgrowth/part-r-00000 16 | Key: I2: Value: ([I2],7) 17 | Key: I1: Value: ([I1],6), ([I2, I1],4) 18 | Key: I3: Value: ([I3],6), ([I2, I3],4), ([I1, I3],4), ([I2, I1, I3],2) 19 | Key: I4: Value: ([I2, I4],2) 20 | Key: I5: Value: ([I2, I1, I5],2) 21 | Count: 5 22 | 查看fList 23 | mahout seqdumper -i /user/hdfs/fp-growth/out/fList 24 | Key: I2: Value: 7 25 | Key: I1: Value: 6 26 | Key: I3: Value: 6 27 | Key: I4: Value: 2 28 | Key: I5: Value: 2 29 | Count: 5 -------------------------------------------------------------------------------- /com.homework/scripts/fp-growth/fpg.txt: -------------------------------------------------------------------------------- 1 | I1 I2 I5 2 | I2 I4 3 | I2 I3 4 | I1 I2 I4 5 | I1 I3 6 | I2 I3 7 | I1 I3 8 | I1 I2 I3 I5 9 | I1 I2 I3 -------------------------------------------------------------------------------- /com.homework/scripts/hive/HiveJDBC.java: -------------------------------------------------------------------------------- 1 | package com.hive.jdbc; 2 | import java.sql.Connection; 3 | import java.sql.DriverManager; 4 | import java.sql.ResultSet; 5 | import java.sql.Statement; 6 | 7 | public class HiveJDBC { 8 | 9 | public static void main(String[] args) { 10 | try { 11 | Class.forName("org.apache.hadoop.hive.jdbc.HiveDriver"); 12 | // 查询语句 13 | String querySQL = "SELECT * FROM t_rp"; 14 | // 链接hive 15 | Connection 
con = DriverManager.getConnection("jdbc:hive://192.168.0.100:10000/default", "hive", "hive"); 16 | Statement stmt = con.createStatement(); 17 | // 执行查询语句 18 | ResultSet res = stmt.executeQuery(querySQL); 19 | while (res.next()) { 20 | System.out.println("Result: key:" + res.getString(1) + " –> value:" + res.getString(2)); 21 | } 22 | stmt.close(); 23 | con.close(); 24 | } catch (Exception e) { 25 | e.printStackTrace(); 26 | } 27 | } 28 | 29 | 30 | } 31 | /* 32 | import java.sql.SQLException; 33 | import java.sql.Connection; 34 | import java.sql.ResultSet; 35 | import java.sql.Statement; 36 | import java.sql.DriverManager; 37 | 38 | public class HiveJdbcClient { 39 | private static String driverName = "org.apache.hadoop.hive.jdbc.HiveDriver"; 40 | 41 | *//** 42 | * @param args 43 | * @throws SQLException 44 | *//* 45 | public static void main(String[] args) throws SQLException { 46 | try { 47 | Class.forName(driverName); 48 | } catch (ClassNotFoundException e) { 49 | // TODO Auto-generated catch block 50 | e.printStackTrace(); 51 | System.exit(1); 52 | } 53 | Connection con = DriverManager.getConnection("jdbc:hive://localhost:10000/default", "", ""); 54 | Statement stmt = con.createStatement(); 55 | String tableName = "testHiveDriverTable"; 56 | stmt.executeQuery("drop table " + tableName); 57 | ResultSet res = stmt.executeQuery("create table " + tableName + " (key int, value string)"); 58 | // show tables 59 | String sql = "show tables '" + tableName + "'"; 60 | System.out.println("Running: " + sql); 61 | res = stmt.executeQuery(sql); 62 | if (res.next()) { 63 | System.out.println(res.getString(1)); 64 | } 65 | // describe table 66 | sql = "describe " + tableName; 67 | System.out.println("Running: " + sql); 68 | res = stmt.executeQuery(sql); 69 | while (res.next()) { 70 | System.out.println(res.getString(1) + "\t" + res.getString(2)); 71 | } 72 | 73 | // load data into table 74 | // NOTE: filepath has to be local to the hive server 75 | // NOTE: /tmp/a.txt is a ctrl-A separated file with two fields per line 76 | String filepath = "/tmp/a.txt"; 77 | sql = "load data local inpath '" + filepath + "' into table " + tableName; 78 | System.out.println("Running: " + sql); 79 | res = stmt.executeQuery(sql); 80 | 81 | // select * query 82 | sql = "select * from " + tableName; 83 | System.out.println("Running: " + sql); 84 | res = stmt.executeQuery(sql); 85 | while (res.next()) { 86 | System.out.println(String.valueOf(res.getInt(1)) + "\t" + res.getString(2)); 87 | } 88 | 89 | // regular hive query 90 | sql = "select count(1) from " + tableName; 91 | System.out.println("Running: " + sql); 92 | res = stmt.executeQuery(sql); 93 | while (res.next()) { 94 | System.out.println(res.getString(1)); 95 | } 96 | } 97 | }*/ -------------------------------------------------------------------------------- /com.homework/scripts/week10/1.pig: -------------------------------------------------------------------------------- 1 | #计算1的好友推荐 2 | -- Dataguru Hadoop Course 3 | -- Code by James 4 | 5 | -- Load Data 6 | data1 = LOAD '/user/hdfs/week10/karate.csv' AS ( source, target ); 7 | 8 | data2 = LOAD '/user/hdfs/week10/karate.csv' AS ( source, target ); 9 | 10 | -- Mine the common friends 11 | common_jnd = JOIN data1 BY target, data2 BY target; 12 | 13 | common_prj = FOREACH common_jnd GENERATE data1::target AS common_friend, data1::source AS user, data2::source AS candidate; 14 | 15 | common_flt = FILTER common_prj BY user != candidate; 16 | common_grp = GROUP common_flt BY (user,candidate);-- 此句测试用 17 | common = 
FOREACH ( GROUP common_flt BY (user,candidate) ) GENERATE FLATTEN(group) AS (user,candidate), COUNT(common_flt) AS cnt; 18 | 19 | -- Recommendation 20 | user = FOREACH ( GROUP common BY user ) 21 | { 22 | candidate_srt = ORDER common BY cnt DESC; 23 | candidate_lim = LIMIT candidate_srt 5; 24 | GENERATE FLATTEN(candidate_lim) AS ( user, candidate, cnt ); 25 | } 26 | 27 | STORE user INTO '/user/hdfs/week10/result_1'; -------------------------------------------------------------------------------- /com.homework/scripts/week10/common_friend.pig: -------------------------------------------------------------------------------- 1 | -- Dataguru Hadoop Course 2 | -- Code by James 3 | 4 | -- Load Data 5 | data1 = LOAD '/user/huangjun/dataguru/wiki-Vote' AS ( source, target ); 6 | 7 | data2 = LOAD '/user/huangjun/dataguru/wiki-Vote' AS ( source, target ); 8 | 9 | -- Mine the common friends 10 | common_jnd = JOIN data1 BY target, data2 BY target; 11 | 12 | common_prj = FOREACH common_jnd GENERATE data1::target AS common_friend, data1::source AS user, data2::source AS candidate; 13 | 14 | common_flt = FILTER common_prj BY user != candidate; 15 | 16 | common = FOREACH ( GROUP common_flt BY (user,candidate) ) GENERATE FLATTEN(group) AS (user,candidate), COUNT(common_flt) AS cnt; 17 | 18 | -- Recommendation 19 | user = FOREACH ( GROUP common BY user ) 20 | { 21 | candidate_srt = ORDER common BY cnt DESC; 22 | candidate_lim = LIMIT candidate_srt 5; 23 | GENERATE FLATTEN(candidate_lim) AS ( user, candidate, cnt ); 24 | } 25 | 26 | STORE user INTO '/user/huangjun/dataguru/result'; -------------------------------------------------------------------------------- /com.homework/scripts/week10/karate.csv: -------------------------------------------------------------------------------- 1 | Source Target 2 | 2 1 3 | 3 1 4 | 3 2 5 | 4 1 6 | 4 2 7 | 4 3 8 | 5 1 9 | 6 1 10 | 7 1 11 | 7 5 12 | 7 6 13 | 8 1 14 | 8 2 15 | 8 3 16 | 8 4 17 | 9 1 18 | 9 3 19 | 10 3 20 | 11 1 21 | 11 5 22 | 11 6 23 | 12 1 24 | 13 1 25 | 13 4 26 | 14 1 27 | 14 2 28 | 14 3 29 | 14 4 30 | 17 6 31 | 17 7 32 | 18 1 33 | 18 2 34 | 20 1 35 | 20 2 36 | 22 1 37 | 22 2 38 | 26 24 39 | 26 25 40 | 28 3 41 | 28 24 42 | 28 25 43 | 29 3 44 | 30 24 45 | 30 27 46 | 31 2 47 | 31 9 48 | 32 1 49 | 32 25 50 | 32 26 51 | 32 29 52 | 33 3 53 | 33 9 54 | 33 15 55 | 33 16 56 | 33 19 57 | 33 21 58 | 33 23 59 | 33 24 60 | 33 30 61 | 33 31 62 | 33 32 63 | 34 9 64 | 34 10 65 | 34 14 66 | 34 15 67 | 34 16 68 | 34 19 69 | 34 20 70 | 34 21 71 | 34 23 72 | 34 24 73 | 34 27 74 | 34 28 75 | 34 29 76 | 34 30 77 | 34 31 78 | 34 32 79 | 34 33 -------------------------------------------------------------------------------- /com.homework/scripts/week10/w10.pig: -------------------------------------------------------------------------------- 1 | -- Dataguru Hadoop Course 2 | -- Code by James 3 | 4 | -- Load Data 5 | data1 = LOAD '/user/hdfs/week10/noway' AS ( source, target ); 6 | 7 | data2 = LOAD '/user/hdfs/week10/noway' AS ( source, target ); 8 | 9 | -- Mine the common friends 10 | common_jnd = JOIN data1 BY target, data2 BY target; 11 | 12 | common_prj = FOREACH common_jnd GENERATE data1::target AS common_friend, data1::source AS user, data2::source AS candidate; 13 | 14 | common_flt = FILTER common_prj BY user != candidate; 15 | -- common_grp = GROUP common_flt BY (user,candidate);-- 此句测试用 16 | common = FOREACH ( GROUP common_flt BY (user,candidate) ) GENERATE FLATTEN(group) AS (user,candidate), COUNT(common_flt) AS cnt; 17 | 18 | -- Recommendation 19 | user = FOREACH ( GROUP 
common BY user ) 20 | { 21 | candidate_srt = ORDER common BY cnt DESC; 22 | candidate_lim = LIMIT candidate_srt 5; 23 | GENERATE FLATTEN(candidate_lim) AS ( user, candidate, cnt ); 24 | } 25 | 26 | STORE user INTO '/user/hdfs/week10/noway_out/'; -------------------------------------------------------------------------------- /com.homework/scripts/week10/杂文件/karate2.csv: -------------------------------------------------------------------------------- 1 | Source Target 2 | 1 2 3 | 2 1 4 | 3 1 5 | 3 2 6 | 4 1 7 | 4 2 8 | 4 3 9 | 5 1 10 | 6 1 11 | 7 1 12 | 7 5 13 | 7 6 14 | 8 1 15 | 8 2 16 | 8 3 17 | 8 4 18 | 9 1 19 | 9 3 20 | 10 3 21 | 11 1 22 | 11 5 23 | 11 6 24 | 12 1 25 | 13 1 26 | 13 4 27 | 14 1 28 | 14 2 29 | 14 3 30 | 14 4 31 | 17 6 32 | 17 7 33 | 18 1 34 | 18 2 35 | 20 1 36 | 20 2 37 | 22 1 38 | 22 2 39 | 26 24 40 | 26 25 41 | 28 3 42 | 28 24 43 | 28 25 44 | 29 3 45 | 30 24 46 | 30 27 47 | 31 2 48 | 31 9 49 | 32 1 50 | 32 25 51 | 32 26 52 | 32 29 53 | 33 3 54 | 33 9 55 | 33 15 56 | 33 16 57 | 33 19 58 | 33 21 59 | 33 23 60 | 33 24 61 | 33 30 62 | 33 31 63 | 33 32 64 | 34 9 65 | 34 10 66 | 34 14 67 | 34 15 68 | 34 16 69 | 34 19 70 | 34 20 71 | 34 21 72 | 34 23 73 | 34 24 74 | 34 27 75 | 34 28 76 | 34 29 77 | 34 30 78 | 34 31 79 | 34 32 80 | 34 33 -------------------------------------------------------------------------------- /com.homework/scripts/week10/杂文件/karate2.csv.bak: -------------------------------------------------------------------------------- 1 | Source Target 2 | 2 1 3 | 3 1 4 | 3 2 5 | 4 1 6 | 4 2 7 | 4 3 8 | 5 1 9 | 6 1 10 | 7 1 11 | 7 5 12 | 7 6 13 | 8 1 14 | 8 2 15 | 8 3 16 | 8 4 17 | 9 1 18 | 9 3 19 | 10 3 20 | 11 1 21 | 11 5 22 | 11 6 23 | 12 1 24 | 13 1 25 | 13 4 26 | 14 1 27 | 14 2 28 | 14 3 29 | 14 4 30 | 17 6 31 | 17 7 32 | 18 1 33 | 18 2 34 | 20 1 35 | 20 2 36 | 22 1 37 | 22 2 38 | 26 24 39 | 26 25 40 | 28 3 41 | 28 24 42 | 28 25 43 | 29 3 44 | 30 24 45 | 30 27 46 | 31 2 47 | 31 9 48 | 32 1 49 | 32 25 50 | 32 26 51 | 32 29 52 | 33 3 53 | 33 9 54 | 33 15 55 | 33 16 56 | 33 19 57 | 33 21 58 | 33 23 59 | 33 24 60 | 33 30 61 | 33 31 62 | 33 32 63 | 34 9 64 | 34 10 65 | 34 14 66 | 34 15 67 | 34 16 68 | 34 19 69 | 34 20 70 | 34 21 71 | 34 23 72 | 34 24 73 | 34 27 74 | 34 28 75 | 34 29 76 | 34 30 77 | 34 31 78 | 34 32 79 | 34 33 -------------------------------------------------------------------------------- /com.homework/scripts/week10/杂文件/mytest.txt: -------------------------------------------------------------------------------- 1 | Source Target 2 | 1 3 3 | 1 4 4 | 2 3 5 | 2 4 6 | 2 1 7 | 3 1 8 | 3 2 9 | 4 1 10 | 4 2 11 | 4 3 -------------------------------------------------------------------------------- /com.homework/scripts/week10/杂文件/noway: -------------------------------------------------------------------------------- 1 | Source Target 2 | 1 3 3 | 1 4 4 | 2 3 5 | 2 4 6 | 3 1 7 | 3 2 8 | 4 1 9 | 4 2 10 | -------------------------------------------------------------------------------- /com.homework/scripts/week10/杂文件/tes2.txt: -------------------------------------------------------------------------------- 1 | Source Target 2 | 2 3 3 | 2 4 4 | 2 1 5 | 3 1 6 | 3 2 7 | 4 1 8 | 4 2 9 | 4 3 -------------------------------------------------------------------------------- /com.homework/scripts/week10/计算33的好友推荐(不关注别人的没有推荐)/common.java.bak: -------------------------------------------------------------------------------- 1 | class 2 | { 3 | public static void main(String[] args) 4 | { 5 | System.out.println("Hello World!"); 6 | } 7 | } 8 | 
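// Editor's sketch, illustrative only and not part of this repository: an in-memory version of the
// common-friend counting that the week10 Pig scripts (1.pig, common_friend.pig, w10.pig) express with
// JOIN/FILTER/GROUP -- join the follow list with itself on the followed target, drop pairs where
// user == candidate, and count shared targets per (user, candidate). Class and variable names below
// are made up for the example; the edge list mirrors the small "noway" test file above.

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class CommonFriendSketch {
    public static void main(String[] args) {
        // Edge list "source target": source follows target (same shape as karate.csv / noway).
        String[][] edges = {
            {"1","3"}, {"1","4"}, {"2","3"}, {"2","4"},
            {"3","1"}, {"3","2"}, {"4","1"}, {"4","2"}
        };

        // target -> sources that follow it (the effect of JOIN data1 BY target, data2 BY target).
        Map<String, List<String>> followersOfTarget = new HashMap<>();
        for (String[] e : edges) {
            followersOfTarget.computeIfAbsent(e[1], k -> new ArrayList<>()).add(e[0]);
        }

        // (user,candidate) -> number of targets they both follow (GROUP ... ; COUNT).
        Map<String, Integer> commonCount = new HashMap<>();
        for (List<String> followers : followersOfTarget.values()) {
            for (String user : followers) {
                for (String candidate : followers) {
                    if (!user.equals(candidate)) {            // FILTER common_prj BY user != candidate
                        commonCount.merge(user + "," + candidate, 1, Integer::sum);
                    }
                }
            }
        }

        // Print in the "(user,candidate,cnt)" shape seen in the stored user.java results.
        commonCount.forEach((pair, cnt) -> System.out.println("(" + pair + "," + cnt + ")"));
    }
}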
-------------------------------------------------------------------------------- /com.homework/scripts/week10/计算33的好友推荐(不关注别人的没有推荐)/pig.pig: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bellonor/myHadoopProject/5b7f551dda6daf86489eeed8370d868a90a5cb8e/com.homework/scripts/week10/计算33的好友推荐(不关注别人的没有推荐)/pig.pig -------------------------------------------------------------------------------- /com.homework/scripts/week10/计算33的好友推荐(不关注别人的没有推荐)/user.java: -------------------------------------------------------------------------------- 1 | (10,33,1) 2 | (10,29,1) 3 | (10,28,1) 4 | (10,14,1) 5 | (10,9,1) 6 | (11,7,3) 7 | (11,12,1) 8 | (11,8,1) 9 | (11,6,1) 10 | (11,5,1) 11 | (12,4,1) 12 | (12,2,1) 13 | (12,3,1) 14 | (12,32,1) 15 | (12,22,1) 16 | (13,8,2) 17 | (13,14,2) 18 | (13,22,1) 19 | (13,18,1) 20 | (13,12,1) 21 | (14,8,4) 22 | (14,4,3) 23 | (14,22,2) 24 | (14,20,2) 25 | (14,18,2) 26 | (17,11,1) 27 | (17,7,1) 28 | (18,8,2) 29 | (18,4,2) 30 | (18,3,2) 31 | (18,22,2) 32 | (18,20,2) 33 | (2,3,1) 34 | (2,4,1) 35 | (2,5,1) 36 | (2,6,1) 37 | (2,7,1) 38 | (20,3,2) 39 | (20,22,2) 40 | (20,18,2) 41 | (20,14,2) 42 | (20,8,2) 43 | (22,8,2) 44 | (22,4,2) 45 | (22,3,2) 46 | (22,20,2) 47 | (22,14,2) 48 | (26,28,2) 49 | (26,34,1) 50 | (26,30,1) 51 | (26,32,1) 52 | (26,33,1) 53 | (28,33,2) 54 | (28,26,2) 55 | (28,29,1) 56 | (28,14,1) 57 | (28,10,1) 58 | (29,8,1) 59 | (29,4,1) 60 | (29,10,1) 61 | (29,33,1) 62 | (29,14,1) 63 | (3,20,2) 64 | (3,18,2) 65 | (3,14,2) 66 | (3,4,2) 67 | (3,8,2) 68 | (30,34,2) 69 | (30,33,1) 70 | (30,28,1) 71 | (30,26,1) 72 | (31,18,1) 73 | (31,22,1) 74 | (31,33,1) 75 | (31,34,1) 76 | (31,20,1) 77 | (32,11,1) 78 | (32,13,1) 79 | (32,14,1) 80 | (32,18,1) 81 | (32,20,1) 82 | (33,34,10) 83 | (33,28,2) 84 | (33,29,1) 85 | (33,4,1) 86 | (33,8,1) 87 | (34,33,10) 88 | (34,30,2) 89 | (34,28,1) 90 | (34,26,1) 91 | (34,32,1) 92 | (4,8,3) 93 | (4,14,3) 94 | (4,18,2) 95 | (4,20,2) 96 | (4,22,2) 97 | (5,7,1) 98 | (5,9,1) 99 | (5,11,1) 100 | (5,3,1) 101 | (5,13,1) 102 | (6,8,1) 103 | (6,9,1) 104 | (6,11,1) 105 | (6,12,1) 106 | (6,13,1) 107 | (7,11,3) 108 | (7,8,1) 109 | (7,2,1) 110 | (7,3,1) 111 | (7,4,1) 112 | (8,14,4) 113 | (8,4,3) 114 | (8,18,2) 115 | (8,20,2) 116 | (8,22,2) 117 | (9,8,2) 118 | (9,4,2) 119 | (9,14,2) 120 | (9,20,1) 121 | (9,3,1) -------------------------------------------------------------------------------- /com.homework/scripts/week13/week13: -------------------------------------------------------------------------------- 1 | 2 | 13周作业 3 | mahout fpg -i /user/hdfs/week13/in/user2items2.csv -o /user/hdfs/week13/out -k 50 -method mapreduce -regex '[\ ]' -s 4 4 | 查看结果 5 | 6 | mahout seqdumper -i /user/hdfs/week13/out/frequentpatterns/part-r-00000 7 | 结果: 8 | Key: I1: Value: ([I1],6), ([I2, I1],4), ([I1, I3],4), ([I2, I1, I5],2), ([I2, I1, I3],2) 9 | Key: I2: Value: ([I2],7), ([I2, I3],4), ([I2, I1],4), ([I2, I1, I5],2), ([I2, I1, I3],2), ([I2, I4],2) 10 | Key: I3: Value: ([I3],6), ([I2, I3],4), ([I1, I3],4), ([I2, I1, I3],2) 11 | Key: I4: Value: ([I2, I4],2) 12 | Key: I5: Value: ([I2, I1, I5],2) 13 | Count: 5 14 | 查看fpgrowth 15 | mahout seqdumper -i /user/hdfs/week13/out/fpgrowth/part-r-00000 16 | Key: I2: Value: ([I2],7) 17 | Key: I1: Value: ([I1],6), ([I2, I1],4) 18 | Key: I3: Value: ([I3],6), ([I2, I3],4), ([I1, I3],4), ([I2, I1, I3],2) 19 | Key: I4: Value: ([I2, I4],2) 20 | Key: I5: Value: ([I2, I1, I5],2) 21 | Count: 5 22 | 查看fList 23 | mahout seqdumper -i /user/hdfs/fp-growth/out/fList 24 | Key: I2: Value: 
7 25 | Key: I1: Value: 6 26 | Key: I3: Value: 6 27 | Key: I4: Value: 2 28 | Key: I5: Value: 2 29 | Count: 5 -------------------------------------------------------------------------------- /com.homework/scripts/week8.rar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bellonor/myHadoopProject/5b7f551dda6daf86489eeed8370d868a90a5cb8e/com.homework/scripts/week8.rar -------------------------------------------------------------------------------- /com.homework/scripts/week8/homework.txt: -------------------------------------------------------------------------------- 1 | --1.样本分词 2 | hadoop jar /home/hadoop/in/tokenize.jar /user/hdfs/week8/homework/sport /user/hdfs/week8/homework/sport-out 3 | 4 | --2.划分样本,80%训练集,20%测试集 5 | processed= load '/user/hdfs/week8/homework/sport-out/part-r-00000' as (category:chararray,doc:chararray); 6 | test = sample processed 0.2; 7 | tfull= JOIN processed BY (category,doc) LEFT OUTER,test BY (category,doc); 8 | t8= filter tfull BY test::category is null; 9 | train= foreach t8 generate processed::category as category,processed::doc as doc; 10 | store test into '/user/hdfs/week8/homework/test'; 11 | store train into '/user/hdfs/week8/homework/train'; 12 | --查看划分结果 13 | test_count = foreach ( group test by category) generate group,COUNT(test.category); 14 | DUMP test_count; 15 | train_count = foreach ( group train by category) generate group,COUNT(train.category); 16 | DUMP train_count; 17 | 18 | --3.训练学习集,及测试 19 | --mahout-0.6版,0.8不行 20 | --a.bayes 21 | mahout trainclassifier -i /user/hdfs/week8/homework/train -o /user/hdfs/week8/homework/model-bayes -type bayes -ng 1 -source hdfs 22 | mahout testclassifier -d /user/hdfs/week8/homework/test -m /user/hdfs/week8/homework/model-bayes -type bayes -ng 1 -source hdfs -method mapreduce 23 | --b.cbayes 24 | mahout trainclassifier -i /user/hdfs/week8/homework/train -o /user/hdfs/week8/homework/model-cbayes -type cbayes -ng 1 -source hdfs 25 | mahout testclassifier -d /user/hdfs/week8/homework/test -m /user/hdfs/week8/homework/model-cbayes -type cbayes -ng 1 -source hdfs -method mapreduce 26 | 27 | --实战 28 | --分词 29 | hadoop jar /home/hadoop/in/tokenize.jar /user/hdfs/week8/homework/user-sport /user/hdfs/week8/homework/user-sport-out 30 | --运行:win7,eclipse下运行 31 | cbayes: 32 | hdfs://192.168.0.100:9000/user/hdfs/week8/homework/user-sport-out hdfs://192.168.0.100:9000/user/hdfs/week8/homework/result_cbayes hdfs://192.168.0.100:9000/user/hdfs/week8/homework/model-cbayes cbayes 33 | bayes: 34 | hdfs://192.168.0.100:9000/user/hdfs/week8/homework/user-sport-out hdfs://192.168.0.100:9000/user/hdfs/week8/homework/result_bayes hdfs://192.168.0.100:9000/user/hdfs/week8/homework/model-bayes bayes 35 | --求最大值——求用户浏览最多的类别,判断用户偏好 36 | bayes: 37 | user_count= load '/user/hdfs/week8/homework/result_bayes/part-r-00000' using PigStorage('|') AS (userid:chararray,category:chararray,times:int); 38 | result = foreach (group user_count by userid) { 39 | sorted = order user_count by times desc; 40 | top1= limit sorted 1; 41 | generate flatten(top1),SUM(user_count.times); 42 | }; 43 | DUMP result; 44 | store result into '/user/hdfs/week8/homework/final_result_bayes'; 45 | cbayes: 46 | user_count= load '/user/hdfs/week8/homework/result_cbayes/part-r-00000' using PigStorage('|') AS (userid:chararray,category:chararray,times:int); 47 | result = foreach (group user_count by userid) { 48 | sorted = order user_count by times desc; 49 | top1= limit sorted 1; 50 | generate 
flatten(top1),SUM(user_count.times); 51 | }; 52 | DUMP result; 53 | store result into '/user/hdfs/week8/homework/final_result_cbayes'; 54 | -------------------------------------------------------------------------------- /com.homework/scripts/week8/week8.pig: -------------------------------------------------------------------------------- 1 | processed= load '/user/hdfs/week8/teacher/in/processed' as (category:chararray,doc:chararray); 2 | test = sample processed 0.2; 3 | 4 | --测试用 5 | processed= load '/user/mypig/lefta.txt' as (a1:chararray,a2:chararray,a3:chararray); 6 | test = sample processed 0.2; 7 | 8 | tfull= JOIN processed BY (a1,a2,a3) LEFT OUTER,test BY (a1,a2,a3); 9 | t8= filter tfull BY test::a1 is null; 10 | train= foreach t8 generate processed::a1 as a1,processed::a2 as a2,processed::a3 as a3; 11 | store test into '/user/mypig/test'; 12 | store train into '/user/mypig/train'; 13 | --正式 14 | processed= load '/user/hdfs/week8/teacher/in/processed' as (category:chararray,doc:chararray); 15 | test = sample processed 0.2; 16 | tfull= JOIN processed BY (category,doc) LEFT OUTER,test BY (category,doc); 17 | t8= filter tfull BY test::category is null; 18 | train= foreach t8 generate processed::category as category,processed::doc as doc; 19 | store test into '/user/hdfs/week8/teacher/test'; 20 | store train into '/user/hdfs/week8/teacher/train'; 21 | --统计 22 | test_count = foreach ( group test by category) generate group,COUNT(test.category); 23 | DUMP test_count; 24 | train_count = foreach ( group train by category) generate group,COUNT(train.category); 25 | DUMP train_count; 26 | --mahout-0.6,0.8不行 27 | mahout trainclassifier -i /user/hdfs/week8/teacher/train -o /user/hdfs/week8/model-bayes -type bayes -ng 1 -source hdfs 28 | 29 | mahout testclassifier -d /user/hdfs/week8/teacher/test -m /user/hdfs/week8/model-bayes -type bayes -ng 1 -source hdfs -method mapreduce 30 | 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /com.homework/scripts/week8/week8.txt: -------------------------------------------------------------------------------- 1 | --1.分词 2 | hadoop jar /home/hadoop/in/tokenize.jar /user/hdfs/week7/in /user/hdfs/week7/out 3 | 4 | --2.划分样本,80%训练集,20%测试集 5 | processed= load '/user/hdfs/week8/mine/in/processed' as (category:chararray,doc:chararray); 6 | test = sample processed 0.2; 7 | tfull= JOIN processed BY (category,doc) LEFT OUTER,test BY (category,doc); 8 | t8= filter tfull BY test::category is null; 9 | train= foreach t8 generate processed::category as category,processed::doc as doc; 10 | store test into '/user/hdfs/week8/mine/test'; 11 | store train into '/user/hdfs/week8/mine/train'; 12 | --查看划分结果 13 | test_count = foreach ( group test by category) generate group,COUNT(test.category); 14 | DUMP test_count; 15 | train_count = foreach ( group train by category) generate group,COUNT(train.category); 16 | DUMP train_count; 17 | 18 | --3.训练学习集,及测试 19 | --mahout-0.6版,0.8不行 20 | --a.bayes 21 | mahout trainclassifier -i /user/hdfs/week8/mine/train -o /user/hdfs/week8/mine/model-bayes -type bayes -ng 1 -source hdfs 22 | mahout testclassifier -d /user/hdfs/week8/mine/test -m /user/hdfs/week8/mine/model-bayes -type bayes -ng 1 -source hdfs -method mapreduce 23 | --b.cbayes 24 | mahout trainclassifier -i /user/hdfs/week8/mine/train -o /user/hdfs/week8/mine/model-cbayes -type cbayes -ng 1 -source hdfs 25 | mahout testclassifier -d /user/hdfs/week8/mine/test -m /user/hdfs/week8/mine/model-cbayes -type cbayes -ng 1 -source hdfs 
-method mapreduce 26 | 27 | --用户数据测试 28 | --分词 29 | hadoop jar /home/hadoop/in/tokenize.jar /user/hdfs/week8/mine/user /user/hdfs/week8/mine/user-out 30 | --运行: 31 | hdfs://192.168.0.100:9000/user/hdfs/week8/mine/user-out hdfs://192.168.0.100:9000/user/hdfs/week8/mine/user-output hdfs://192.168.0.100:9000/user/hdfs/week8/mine/model-cbayes cbayes 32 | 33 | 34 | -------------------------------------------------------------------------------- /com.homework/scripts/week9/pagerank.r: -------------------------------------------------------------------------------- 1 | #pages<-read.csv("page",header=FALSE); 2 | pages<-read.csv("people.csv",header=FALSE); 3 | #构造邻接矩阵(方阵): 4 | mrow<-max(pages) 5 | A<-matrix(0,nrow=mrow,ncol=mrow); 6 | #cols=length(pages[1,]); 7 | rows=length(pages[,1]); 8 | for(i in 1:rows){ 9 | p1<-pages[i,1]; 10 | p2<-pages[i,2]; 11 | A[p2,p1]<-1; 12 | } 13 | 14 | 15 | #考虑阻尼系统的情况 16 | csum<-colSums(A); 17 | csum[csum==0] <- 1; 18 | Arow=nrow(A); 19 | d<-0.85; 20 | de<-1-d/Arow; 21 | delta <- (1-d)/Arow; 22 | B <- matrix(delta,nrow(A),ncol(A)); 23 | for (i in 1:Arow) B[i,] <- B[i,] + d*A[i,]/csum; 24 | # 迭代求解特征向量值 25 | x <- rep(1,Arow); 26 | for (i in 1:100) x <- B %*% x 27 | x/sum(x) 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | #转换为概率矩阵(转移矩阵),不考虑阻尼系统 39 | csum<-colSums(A); 40 | csum[csum==0] <- 1; 41 | Arow=nrow(A); 42 | for(i in 1:Arow){ 43 | A[i,]<-A[i,]/csum; 44 | } 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | #利用幂法求解特征向量,不考虑阻尼系统的情况 60 | x <- rep(1,Arow); 61 | for (i in 1:10) x <- A %*% x 62 | #除以一个常数 63 | x/sum(x); 64 | 65 | 66 | 67 | 68 | -------------------------------------------------------------------------------- /com.homework/src/common/com/homework/hdfs/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | */ 4 | /** 5 | * @author Administrator 6 | * 7 | */ 8 | package com.homework.hdfs; -------------------------------------------------------------------------------- /com.homework/src/hadoop/machinelearning/clustering/hadoop/machinelearning/clustering/canopy/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | */ 4 | /** 5 | * @author Administrator 6 | * 7 | */ 8 | package hadoop.machinelearning.clustering.canopy; -------------------------------------------------------------------------------- /com.homework/src/hadoop/machinelearning/clustering/hadoop/machinelearning/clustering/kmeans/KmeansHadoop.java: -------------------------------------------------------------------------------- 1 | package hadoop.machinelearning.clustering.kmeans; 2 | 3 | 4 | 5 | import java.util.Iterator; 6 | 7 | import org.apache.hadoop.fs.Path; 8 | import org.apache.hadoop.mapred.JobConf; 9 | import org.apache.mahout.clustering.classify.WeightedVectorWritable; 10 | import org.apache.mahout.clustering.conversion.InputDriver; 11 | import org.apache.mahout.clustering.kmeans.KMeansDriver; 12 | import org.apache.mahout.clustering.kmeans.RandomSeedGenerator; 13 | import org.apache.mahout.common.distance.DistanceMeasure; 14 | import org.apache.mahout.common.distance.EuclideanDistanceMeasure; 15 | import org.apache.mahout.math.Vector; 16 | import org.apache.mahout.utils.clustering.ClusterDumper; 17 | 18 | import com.homework.hdfs.HdfsDAO; 19 | 20 | /** 21 | * 素材我已经改成《Mahout in Action》第七章的Hello world例子,运行正常,结果也书一样 22 | * @author Administrator 23 | *书中源码地址: https://github.com/tdunning/MiA 24 | */ 25 | public class KmeansHadoop { 26 | 
private static final String HDFS = "hdfs://192.168.0.100:9000"; 27 | 28 | public static void main(String[] args) throws Exception { 29 | //String localFile = "datafile/randomData.csv"; 30 | String localFile = "datafile/cluster/simple_k-means.txt"; 31 | String inPath = HDFS + "/user/hdfs/mix_data"; 32 | String seqFile = inPath + "/seqfile"; 33 | String seeds = inPath + "/seeds"; 34 | String outPath = inPath + "/result/"; 35 | String clusteredPoints = outPath + "/clusteredPoints"; 36 | 37 | JobConf conf = config(); 38 | HdfsDAO hdfs = new HdfsDAO(HDFS, conf); 39 | hdfs.rmr(inPath); 40 | hdfs.mkdirs(inPath); 41 | hdfs.copyFile(localFile, inPath); 42 | hdfs.ls(inPath); 43 | 44 | InputDriver.runJob(new Path(inPath), new Path(seqFile), "org.apache.mahout.math.RandomAccessSparseVector"); 45 | 46 | //int k = 3; 47 | int k = 2; 48 | Path seqFilePath = new Path(seqFile); 49 | Path clustersSeeds = new Path(seeds); 50 | DistanceMeasure measure = new EuclideanDistanceMeasure(); 51 | clustersSeeds = RandomSeedGenerator.buildRandom(conf, seqFilePath, clustersSeeds, k, measure); 52 | KMeansDriver.run(conf, seqFilePath, clustersSeeds, new Path(outPath), measure, 0.01, 10, true, 0.01, false); 53 | 54 | Path outGlobPath = new Path(outPath, "clusters-*-final"); 55 | Path clusteredPointsPath = new Path(clusteredPoints); 56 | System.out.printf("Dumping out clusters from clusters: %s and clusteredPoints: %s\n", outGlobPath, clusteredPointsPath); 57 | 58 | ClusterDumper clusterDumper = new ClusterDumper(outGlobPath, clusteredPointsPath); 59 | clusterDumper.printClusters(null); 60 | } 61 | 62 | public static JobConf config() { 63 | JobConf conf = new JobConf(KmeansHadoop.class); 64 | conf.setJobName("ItemCFHadoop"); 65 | conf.addResource("classpath:/hadoop/core-site.xml"); 66 | conf.addResource("classpath:/hadoop/hdfs-site.xml"); 67 | conf.addResource("classpath:/hadoop/mapred-site.xml"); 68 | return conf; 69 | } 70 | 71 | public static void displayCluster(ClusterDumper clusterDumper) { 72 | Iterator keys = clusterDumper.getClusterIdToPoints().keySet().iterator(); 73 | while (keys.hasNext()) { 74 | Integer center = keys.next(); 75 | System.out.println("Center:" + center); 76 | for (WeightedVectorWritable point : clusterDumper.getClusterIdToPoints().get(center)) { 77 | Vector v = point.getVector(); 78 | System.out.println(v.get(0) + "" + v.get(1)); 79 | } 80 | } 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /com.homework/src/hadoop/machinelearning/clustering/hadoop/machinelearning/clustering/kmeans/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | */ 4 | /** 5 | * @author Administrator 6 | * 7 | */ 8 | package hadoop.machinelearning.clustering.kmeans; -------------------------------------------------------------------------------- /com.homework/src/main/java/com/homework/App.java: -------------------------------------------------------------------------------- 1 | package com.homework; 2 | 3 | /** 4 | * Hello world! 5 | * 6 | */ 7 | public class App 8 | { 9 | public static void main( String[] args ) 10 | { 11 | System.out.println( "Hello World!" 
); 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /com.homework/src/mommon/com/homework/mommon/ComTest.java: -------------------------------------------------------------------------------- 1 | package com.homework.mommon; 2 | 3 | public class ComTest { 4 | 5 | public static void main(String[] args) { 6 | // TODO Auto-generated method stub 7 | String str="sss"; 8 | String str2="dd"; 9 | } 10 | 11 | } 12 | -------------------------------------------------------------------------------- /com.homework/src/mommon/com/homework/mommon/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | */ 4 | /** 5 | * @author Jamas 6 | * 7 | */ 8 | package com.homework.mommon; -------------------------------------------------------------------------------- /com.homework/src/mommon/mytest/MenuTree.java: -------------------------------------------------------------------------------- 1 | package mytest; 2 | 3 | import java.util.ArrayList; 4 | import java.util.Iterator; 5 | import java.util.List; 6 | //递归一颗树 7 | public class MenuTree { 8 | 9 | 10 | public static void mytree(List nlist,Node node){ 11 | System.out.print(node.getId()+node.getNodeName()); 12 | Node subnode=null; 13 | Long id=node.getId(); 14 | Iterator iter=nlist.iterator(); 15 | boolean isexit=false; 16 | while(iter.hasNext()){ 17 | Node nod=iter.next(); 18 | if(nod.getParentId()==id){ 19 | isexit=true; 20 | subnode=nod; 21 | mytree(nlist,subnode); 22 | 23 | } 24 | } 25 | if(!isexit)return; 26 | 27 | } 28 | 29 | 30 | public static void main(String[] args) { 31 | 32 | long start = System.currentTimeMillis(); 33 | List nodeList = new ArrayList(); 34 | Node node1 = new Node(1l, "蔬菜", 0l); 35 | Node node2 = new Node(2l, "水产", 0l); 36 | Node node3 = new Node(3l, "畜牧", 0l); 37 | Node node4 = new Node(4l, "瓜类", 1l); 38 | Node node5 = new Node(5l, "叶类", 1l); 39 | Node node6 = new Node(6l, "丝瓜", 4l); 40 | Node node7 = new Node(7l, "黄瓜", 4l); 41 | Node node8 = new Node(8l, "白菜", 1l); 42 | Node node9 = new Node(9l, "虾", 2l); 43 | Node node10 = new Node(10l, "鱼", 2l); 44 | Node node11 = new Node(11l, "牛", 3l); 45 | Node node0=new Node(0l,"市场种类",-1l); 46 | 47 | nodeList.add(node0); 48 | nodeList.add(node1); 49 | nodeList.add(node2); 50 | nodeList.add(node3); 51 | nodeList.add(node4); 52 | nodeList.add(node5); 53 | nodeList.add(node6); 54 | nodeList.add(node7); 55 | nodeList.add(node8); 56 | nodeList.add(node9); 57 | nodeList.add(node10); 58 | nodeList.add(node11); 59 | 60 | mytree(nodeList,node0); 61 | //NodeUtil mt = new NodeUtil(); 62 | //System.out.println(mt.getChildNodes(nodeList, 1l)); 63 | long end = System.currentTimeMillis(); 64 | System.out.println("用时:" + (end - start) + "ms"); 65 | } 66 | 67 | } 68 | -------------------------------------------------------------------------------- /com.homework/src/mommon/mytest/Node.java: -------------------------------------------------------------------------------- 1 | package mytest; 2 | 3 | /** 4 | * 无限级节点模型 5 | */ 6 | public class Node { 7 | /** 8 | * 节点id 9 | */ 10 | private Long id; 11 | 12 | /** 13 | * 节点名称 14 | */ 15 | private String nodeName; 16 | 17 | /** 18 | * 父节点id 19 | */ 20 | private Long parentId; 21 | 22 | public Node() { 23 | } 24 | 25 | Node(Long id, Long parentId) { 26 | this.id = id; 27 | this.parentId = parentId; 28 | } 29 | 30 | Node(Long id, String nodeName, Long parentId) { 31 | this.id = id; 32 | this.nodeName = nodeName; 33 | this.parentId = parentId; 34 | } 35 | 36 | 
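	// Editor's note (hedged): MenuTree.mytree compares nod.getParentId()==id on these boxed
	// Long ids; reference equality is only guaranteed for values in the Long autobox cache
	// (-128..127), so Long.equals() (or long primitives) would be the safer comparison.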
public Long getId() { 37 | return id; 38 | } 39 | 40 | public void setId(Long id) { 41 | this.id = id; 42 | } 43 | 44 | public Long getParentId() { 45 | return parentId; 46 | } 47 | 48 | public void setParentId(Long parentId) { 49 | this.parentId = parentId; 50 | } 51 | 52 | public String getNodeName() { 53 | return nodeName; 54 | } 55 | 56 | public void setNodeName(String nodeName) { 57 | this.nodeName = nodeName; 58 | } 59 | 60 | } 61 | 62 | -------------------------------------------------------------------------------- /com.homework/src/mommon/mytest/Recursive.java: -------------------------------------------------------------------------------- 1 | package mytest; 2 | 3 | public class Recursive { 4 | 5 | public static void foo(int num){ 6 | num=num-1; 7 | if(num==0)return; 8 | 9 | else 10 | { 11 | System.out.println(num); 12 | foo(num); 13 | 14 | } 15 | 16 | } 17 | public static void main(String[] args) { 18 | // TODO Auto-generated method stub 19 | foo(5); 20 | } 21 | 22 | } 23 | -------------------------------------------------------------------------------- /com.homework/src/mommon/mytest/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | */ 4 | /** 5 | * @author Administrator 6 | * 7 | */ 8 | package mytest; -------------------------------------------------------------------------------- /com.homework/src/sequence/machinelearning/association/sequence/machinelearning/association/apriori/ItemMap.java: -------------------------------------------------------------------------------- 1 | package sequence.machinelearning.association.apriori; 2 | 3 | import java.util.HashMap; 4 | import java.util.Map; 5 | 6 | public class ItemMap { 7 | 8 | public String key; 9 | public Integer value=0; 10 | 11 | public Map map; 12 | 13 | public String getKey() { 14 | return key; 15 | } 16 | public void setKey(String key) { 17 | this.key = key; 18 | } 19 | public Integer getValue() { 20 | return value; 21 | } 22 | public void setValue(Integer value) { 23 | this.value = value; 24 | } 25 | public Map getMap() { 26 | if(map==null){ 27 | map=new HashMap(); 28 | } 29 | return map; 30 | } 31 | public void setMap(Map map) { 32 | this.map = map; 33 | } 34 | 35 | } 36 | -------------------------------------------------------------------------------- /com.homework/src/sequence/machinelearning/association/sequence/machinelearning/association/apriori/Subset.java: -------------------------------------------------------------------------------- 1 | package sequence.machinelearning.association.apriori; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | /** 6 | * 求集合的子集,不包含自己和空集 7 | * @author Administrator 8 | * 算法:先取出第一个,取出一下个的时候与前面的所有项逐个搭配,所以两次FOR就可以了 9 | * 1 //第一次FOR,i=0 10 | * 12,2 //第二次FOR,i=1 11 | * 123,23,13,3 //第三次FOR,i=2 12 | * 14,124,24,1234,234,134,34//第四次FOR,i=3 13 | * 最后删除本身,为了Apriori算法而增加这一步 14 | */ 15 | //本类为测试使用,不在MyApriori里面 16 | public class Subset { 17 | 18 | public static List lis=new ArrayList(); 19 | public static void main(String[] args) { 20 | 21 | //subset(); 22 | // TODO Auto-generated method stub 23 | String[] str =new String[] { "1", "2", "3", "4"}; 24 | StringBuilder sb=new StringBuilder(); 25 | List li=new ArrayList(); 26 | for(int i=0;i li=new ArrayList(); 46 | for(int i=0;i dataMap=new TreeMap(); 15 | public static final void readF1() throws IOException { 16 | 17 | //String filePath="scripts/clustering/canopy/canopy.dat"; 18 | String filePath="datafile/association/items"; 19 | BufferedReader br = new 
BufferedReader(new InputStreamReader( 20 | new FileInputStream(filePath))); 21 | for (String line = br.readLine(); line != null; line = br.readLine()) { 22 | if(line.length()==0||"".equals(line))continue; 23 | String[] str=line.split("\t"); 24 | dataMap.put(str[0], str[1].trim()); 25 | //System.out.println(line); 26 | } 27 | br.close(); 28 | 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /com.homework/src/sequence/machinelearning/association/sequence/machinelearning/association/common/SortTest.java: -------------------------------------------------------------------------------- 1 | package sequence.machinelearning.association.common; 2 | 3 | import java.util.Comparator; 4 | import java.util.List; 5 | import java.util.ArrayList; 6 | import java.util.Collections; 7 | 8 | class User { 9 | String name; 10 | String age; 11 | 12 | public User(String name,String age){ 13 | this.name=name; 14 | this.age=age; 15 | } 16 | public String getAge() { 17 | return age; 18 | } 19 | public void setAge(String age) { 20 | this.age = age; 21 | } 22 | public String getName() { 23 | return name; 24 | } 25 | public void setName(String name) { 26 | this.name = name; 27 | } 28 | } 29 | 30 | class ComparatorUser implements Comparator{ 31 | 32 | public int compare(Object arg0, Object arg1) { 33 | User user0=(User)arg0; 34 | User user1=(User)arg1; 35 | //首先比较年龄,如果年龄相同,则比较名字 36 | int flag=user0.getAge().compareTo(user1.getAge()); 37 | if(flag==0){ 38 | return user0.getName().compareTo(user1.getName()); 39 | }else{ 40 | return flag; 41 | } 42 | } 43 | 44 | } 45 | 46 | public class SortTest { 47 | 48 | 49 | public static void main(String[] args){ 50 | List userlist=new ArrayList(); 51 | userlist.add(new User("dd","4")); 52 | userlist.add(new User("aa","1")); 53 | userlist.add(new User("ee","5")); 54 | userlist.add(new User("bb","2")); 55 | userlist.add(new User("ff","5")); 56 | userlist.add(new User("cc","3")); 57 | userlist.add(new User("gg","6")); 58 | 59 | ComparatorUser comparator=new ComparatorUser(); 60 | Collections.sort(userlist, comparator); 61 | 62 | for (int i=0;i tmap=new TreeMap(); 20 | 21 | /** 22 | * 扫描事务集以确定频繁1项集(找出C1) 23 | */ 24 | public static List findFrequentOneItemSets(Map map){ 25 | TreeMap treemap=new TreeMap(); 26 | Iterator> iter=map.entrySet().iterator(); 27 | Entry entry; 28 | while(iter.hasNext()){ 29 | entry=iter.next(); 30 | String str=entry.getValue(); 31 | if(str.length()<1)continue; 32 | String[] items=str.split(","); 33 | //找出购物栏最大的项,为循环连接做准备 34 | if(items.length>itemnum)itemnum=items.length; 35 | for(int i=0;i lif1=Transaction.findFrequentOneItemSets(ReadData.dataMap); 51 | for(int i=0;i itemsort(String[] items){ 57 | LinkedList linst=new LinkedList(); 58 | //选择法排序 59 | int len=items.length; 60 | for(int i=0;i DeleteItem(TreeMap map){ 81 | List listmap=new ArrayList(); 82 | Iterator> iter=map.entrySet().iterator(); 83 | Entry entry; 84 | while(iter.hasNext()){ 85 | entry=iter.next(); 86 | if(entry.getValue()>=support){ 87 | ItemMap item=new ItemMap(); 88 | item.setKey(entry.getKey()); 89 | item.setValue(entry.getValue()); 90 | if(listmap.size()==0)listmap.add(item); 91 | else{ 92 | 93 | ItemMap tail=new ItemMap(); 94 | int size=listmap.size(); 95 | tail=listmap.get(size-1); 96 | if(item.getValue()>tail.getValue()){ 97 | listmap.remove(size-1); 98 | listmap.add(item); 99 | listmap.add(tail); 100 | }else{ 101 | listmap.add(item); 102 | } 103 | 104 | } 105 | 106 | } 107 | } 108 | return listmap; 109 | } 110 | } 111 | 
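Note on the DeleteItem method above: it keeps items whose count reaches the support threshold and appears intended to keep them in descending count order, but each new item is only compared with the current tail of the list, so the result is not fully sorted. A minimal alternative sketch (assuming Java 8+ and the same ItemMap and support fields; not part of the original file):

List<ItemMap> frequent = new ArrayList<ItemMap>();
for (Map.Entry<String, Integer> e : map.entrySet()) {
    if (e.getValue() >= support) {
        ItemMap item = new ItemMap();
        item.setKey(e.getKey());
        item.setValue(e.getValue());
        frequent.add(item);                               // collect every item meeting min support
    }
}
frequent.sort((a, b) -> b.getValue() - a.getValue());    // then sort once, by count descending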
-------------------------------------------------------------------------------- /com.homework/src/sequence/machinelearning/association/sequence/machinelearning/association/common/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | */ 4 | /** 5 | * @author Administrator 6 | * 7 | */ 8 | package sequence.machinelearning.association.common; -------------------------------------------------------------------------------- /com.homework/src/sequence/machinelearning/association/sequence/machinelearning/association/fpgrowth/TreeNode2.java: -------------------------------------------------------------------------------- 1 | package sequence.machinelearning.association.fpgrowth; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | 7 | 8 | public class TreeNode2 implements Comparable{ 9 | 10 | private String name; // 节点名称 11 | private Integer count; // 计数 12 | private TreeNode2 parent; // 父节点 13 | private List children; // 子节点 14 | private TreeNode2 nextHomonym; // 下一个同名节点 15 | 16 | public TreeNode2() { 17 | 18 | } 19 | 20 | public String getName() { 21 | return name; 22 | } 23 | 24 | public void setName(String name) { 25 | this.name = name; 26 | } 27 | 28 | public Integer getCount() { 29 | return count; 30 | } 31 | 32 | public void setCount(Integer count) { 33 | this.count = count; 34 | } 35 | public void Sum(Integer count) { 36 | this.count =this.count+count; 37 | } 38 | public TreeNode2 getParent() { 39 | return parent; 40 | } 41 | 42 | public void setParent(TreeNode2 parent) { 43 | this.parent = parent; 44 | } 45 | 46 | public List getChildren() { 47 | return children; 48 | } 49 | 50 | public void setChildren(List children) { 51 | this.children = children; 52 | } 53 | 54 | public TreeNode2 getNextHomonym() { 55 | return nextHomonym; 56 | } 57 | 58 | public void setNextHomonym(TreeNode2 nextHomonym) { 59 | this.nextHomonym = nextHomonym; 60 | } 61 | /** 62 | * 添加一个节点 63 | * @param child 64 | */ 65 | public void addChild(TreeNode2 child) { 66 | if (this.getChildren() == null) { 67 | List list = new ArrayList(); 68 | list.add(child); 69 | this.setChildren(list); 70 | } else { 71 | this.getChildren().add(child); 72 | } 73 | } 74 | /** 75 | * 是否存在着该节点,存在返回该节点,不存在返回空 76 | * @param name 77 | * @return 78 | */ 79 | public TreeNode2 findChild(String name) { 80 | List children = this.getChildren(); 81 | if (children != null) { 82 | for (TreeNode2 child : children) { 83 | if (child.getName().equals(name)) { 84 | return child; 85 | } 86 | } 87 | } 88 | return null; 89 | } 90 | 91 | 92 | @Override 93 | public int compareTo(TreeNode2 arg0) { 94 | // TODO Auto-generated method stub 95 | int count0 = arg0.getCount(); 96 | // 跟默认的比较大小相反,导致调用Arrays.sort()时是按降序排列 97 | return count0 - this.count; 98 | } 99 | } 100 | -------------------------------------------------------------------------------- /com.homework/src/sequence/machinelearning/association/sequence/machinelearning/association/fpgrowth/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | */ 4 | /** 5 | * @author Administrator 6 | * 7 | */ 8 | package sequence.machinelearning.association.fpgrowth; -------------------------------------------------------------------------------- /com.homework/src/sequence/machinelearning/association/sequence/machinelearning/association/fpgtest/TreeNode.java: -------------------------------------------------------------------------------- 1 | package 
sequence.machinelearning.association.fpgtest; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | public class TreeNode implements Comparable { 7 | 8 | private String name; // 节点名称 9 | private int count; // 计数 10 | private TreeNode parent; // 父节点 11 | private List children; // 子节点 12 | private TreeNode nextHomonym; // 下一个同名节点 13 | 14 | public TreeNode() { 15 | 16 | } 17 | 18 | public TreeNode(String name) { 19 | this.name = name; 20 | } 21 | 22 | public String getName() { 23 | return name; 24 | } 25 | 26 | public void setName(String name) { 27 | this.name = name; 28 | } 29 | 30 | public int getCount() { 31 | return count; 32 | } 33 | 34 | public void setCount(int count) { 35 | this.count = count; 36 | } 37 | 38 | public TreeNode getParent() { 39 | return parent; 40 | } 41 | 42 | public void setParent(TreeNode parent) { 43 | this.parent = parent; 44 | } 45 | 46 | public List getChildren() { 47 | return children; 48 | } 49 | 50 | public void addChild(TreeNode child) { 51 | if (this.getChildren() == null) { 52 | List list = new ArrayList(); 53 | list.add(child); 54 | this.setChildren(list); 55 | } else { 56 | this.getChildren().add(child); 57 | } 58 | } 59 | 60 | public TreeNode findChild(String name) { 61 | List children = this.getChildren(); 62 | if (children != null) { 63 | for (TreeNode child : children) { 64 | if (child.getName().equals(name)) { 65 | return child; 66 | } 67 | } 68 | } 69 | return null; 70 | } 71 | 72 | public void setChildren(List children) { 73 | this.children = children; 74 | } 75 | 76 | public void printChildrenName() { 77 | List children = this.getChildren(); 78 | if (children != null) { 79 | for (TreeNode child : children) { 80 | System.out.print(child.getName() + " "); 81 | } 82 | } else { 83 | System.out.print("null"); 84 | } 85 | } 86 | 87 | public TreeNode getNextHomonym() { 88 | return nextHomonym; 89 | } 90 | 91 | public void setNextHomonym(TreeNode nextHomonym) { 92 | this.nextHomonym = nextHomonym; 93 | } 94 | 95 | public void countIncrement(int n) { 96 | this.count += n; 97 | } 98 | 99 | @Override 100 | public int compareTo(TreeNode arg0) { 101 | // TODO Auto-generated method stub 102 | int count0 = arg0.getCount(); 103 | // 跟默认的比较大小相反,导致调用Arrays.sort()时是按降序排列 104 | return count0 - this.count; 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /com.homework/src/sequence/machinelearning/association/sequence/machinelearning/association/fpgtest/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | */ 4 | /** 5 | * @author Administrator 6 | * 7 | */ 8 | package sequence.machinelearning.association.fpgtest; -------------------------------------------------------------------------------- /com.homework/src/sequence/machinelearning/association/sequence/machinelearning/association/otherdemo/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | */ 4 | /** 5 | * @author Administrator 6 | * 7 | */ 8 | package sequence.machinelearning.association.otherdemo; -------------------------------------------------------------------------------- /com.homework/src/sequence/machinelearning/clustering/sequence/machinelearning/clustering/canopy/MyCanopy.java: -------------------------------------------------------------------------------- 1 | package sequence.machinelearning.clustering.canopy; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.FileInputStream; 5 | import 
java.io.FileReader; 6 | import java.io.IOException; 7 | import java.io.InputStreamReader; 8 | import java.util.ArrayList; 9 | import java.util.List; 10 | import java.util.Vector; 11 | 12 | public class MyCanopy { 13 | 14 | //x,y之间当且仅当有一个空格,要严格控制,为了与MAHOUT中的输入格式一致,所以这里也采用空格作为分隔。 15 | static Vector li=new Vector(); 16 | //static List li=new ArrayList(); 17 | static List> list=new ArrayList>(); 18 | private final static Double t1=8.0; 19 | private final static Double t2=4.0; 20 | //简单地采用曼哈顿距离|x1 – x2| + |y1 – y2| 21 | 22 | public static final void readF1() throws IOException { 23 | 24 | //String filePath="scripts/clustering/canopy/canopy.dat"; 25 | String filePath="datafile/cluster/simple_k-means.txt"; 26 | BufferedReader br = new BufferedReader(new InputStreamReader( 27 | new FileInputStream(filePath))); 28 | for (String line = br.readLine(); line != null; line = br.readLine()) { 29 | if(line.length()==0||"".equals(line))continue; 30 | String[] str=line.split(" "); 31 | Point p0=new Point(); 32 | p0.setX(Double.valueOf(str[0])); 33 | p0.setY(Double.valueOf(str[1])); 34 | li.add(p0); 35 | //System.out.println(line); 36 | } 37 | br.close(); 38 | } 39 | //简单地采用曼哈顿距离|x1 – x2| + |y1 – y2| 40 | public static Double DistanceMeasure(Point p1,Point p2){ 41 | return Math.abs(p2.getX()-p1.getX()) +Math.abs(p2.getY()-p1.getY()); 42 | } 43 | public static void clustering(){ 44 | 45 | //初始化一个canopy 46 | Point p0=new Point(); 47 | p0=li.get(0); 48 | Vector v1=new Vector(); 49 | v1.add(p0); 50 | list.add(v1); 51 | li.remove(0); 52 | System.out.println("中心点为:"+p0.getX()+","+p0.getY()); 53 | while(0 v=list.get(i); 59 | Point p2=v.get(0); 60 | double dist =DistanceMeasure(p1,p2); 61 | //如果小于t2,属于当前的聚类,已经够接近了,不需要再聚类了,所以删除 62 | if(dist vec=new Vector(); 89 | vec.add(p1); 90 | li.remove(0); 91 | list.add(vec); 92 | 93 | } 94 | //与各个已经形成的聚类比较距离,比较结束后将其删除,以结束循环 95 | if(li.get(0).getSign()!=-1){ 96 | li.remove(0); 97 | } 98 | } 99 | String ss="ddd"; 100 | } 101 | 102 | 103 | 104 | 105 | 106 | 107 | public static void main(String[] args) throws IOException { 108 | // TODO Auto-generated method stub 109 | readF1(); 110 | 111 | clustering(); 112 | String ss="ddd"; 113 | } 114 | 115 | } 116 | -------------------------------------------------------------------------------- /com.homework/src/sequence/machinelearning/clustering/sequence/machinelearning/clustering/canopy/Point.java: -------------------------------------------------------------------------------- 1 | package sequence.machinelearning.clustering.canopy; 2 | 3 | public class Point { 4 | 5 | private Double x; 6 | private Double y; 7 | private Integer sign=-1; 8 | public Double getX() { 9 | return x; 10 | } 11 | public void setX(Double x) { 12 | this.x = x; 13 | } 14 | public Double getY() { 15 | return y; 16 | } 17 | public void setY(Double y) { 18 | this.y = y; 19 | } 20 | public Integer getSign() { 21 | return sign; 22 | } 23 | public void setSign(Integer sign) { 24 | this.sign = sign; 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /com.homework/src/sequence/machinelearning/clustering/sequence/machinelearning/clustering/canopy/UserPoint.java: -------------------------------------------------------------------------------- 1 | package sequence.machinelearning.clustering.canopy; 2 | 3 | public class UserPoint { 4 | 5 | private Double x; 6 | private Double y; 7 | private Integer sign=-1; 8 | private String userid; 9 | 10 | public Double getX() { 11 | return x; 12 | } 13 | public void setX(Double x) { 14 | 
this.x = x; 15 | } 16 | public Double getY() { 17 | return y; 18 | } 19 | public void setY(Double y) { 20 | this.y = y; 21 | } 22 | public Integer getSign() { 23 | return sign; 24 | } 25 | public void setSign(Integer sign) { 26 | this.sign = sign; 27 | } 28 | public String getUserid() { 29 | return userid; 30 | } 31 | public void setUserid(String userid) { 32 | this.userid = userid; 33 | } 34 | 35 | } 36 | -------------------------------------------------------------------------------- /com.homework/src/sequence/machinelearning/clustering/sequence/machinelearning/clustering/canopy/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | */ 4 | /** 5 | * @author Administrator 6 | * 7 | */ 8 | package sequence.machinelearning.clustering.canopy; -------------------------------------------------------------------------------- /com.homework/src/sequence/machinelearning/clustering/sequence/machinelearning/clustering/kmeans/MyKmeans.java: -------------------------------------------------------------------------------- 1 | package sequence.machinelearning.clustering.kmeans; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.FileInputStream; 5 | import java.io.IOException; 6 | import java.io.InputStreamReader; 7 | import java.util.ArrayList; 8 | import java.util.List; 9 | import java.util.Vector; 10 | 11 | import sequence.machinelearning.clustering.canopy.Point; 12 | 13 | 14 | 15 | public class MyKmeans { 16 | 17 | static Vector li=new Vector(); 18 | //static List li=new ArrayList(); 19 | static List> list=new ArrayList>(); //每次迭代保存结果,一个vector代表一个簇 20 | private final static Integer K=2; //选K=2,也就是估算有两个簇。 21 | private final static Double converge=0.001; //当距离小于某个值的时候,就认为聚类已经聚类了,不需要再迭代,这里的值选0.001 22 | 23 | //读取数据 24 | public static final void readF1() throws IOException { 25 | String filePath="datafile/cluster/simple_k-means.txt"; 26 | BufferedReader br = new BufferedReader(new InputStreamReader( 27 | new FileInputStream(filePath))); 28 | for (String line = br.readLine(); line != null; line = br.readLine()) { 29 | if(line.length()==0||"".equals(line))continue; 30 | String[] str=line.split(" "); 31 | Point p0=new Point(); 32 | p0.setX(Double.valueOf(str[0])); 33 | p0.setY(Double.valueOf(str[1])); 34 | li.add(p0); 35 | //System.out.println(line); 36 | } 37 | br.close(); 38 | } 39 | //math.sqrt(double n) 40 | //扩展下,如果要给m开n次方就用java.lang.StrictMath.pow(m,1.0/n); 41 | //采用欧氏距离 42 | public static Double DistanceMeasure(Point p1,Point p2){ 43 | 44 | Double tmp=StrictMath.pow(p2.getX()-p1.getX(), 2)+StrictMath.pow(p2.getY()-p1.getY(), 2); 45 | return Math.sqrt(tmp); 46 | } 47 | 48 | //计算新的簇心 49 | public static Double CalCentroid(){ 50 | System.out.println("------------------------------------------------"); 51 | Double movedist=Double.MAX_VALUE; 52 | for(int i=0;i subli=list.get(i); 54 | Point po=new Point(); 55 | Double sumX=0.0; 56 | Double sumY=0.0; 57 | Double Clusterlen=Double.valueOf(subli.size()); 58 | for(int j=0;jconverge;times++){ 83 | System.out.println("第"+times+"次迭代"); 84 | //默认每一个list里的Vector第0个元素是质心 85 | for(int i=0;i li=new Vector(); 17 | //static List li=new ArrayList(); 18 | static List> list=new ArrayList>(); //每次迭代保存结果,一个vector代表一个簇 19 | private final static Integer K=3; //选K=2,也就是估算有两个簇。 20 | private final static Double converge=0.01; //当距离小于某个值的时候,就认为聚类已经聚类了,不需要再迭代,这里的值选0.001 21 | 22 | //读取数据 23 | public static final void readF1() throws IOException { 24 | String filePath="datafile/cluster/data.csv"; 25 | BufferedReader 
br = new BufferedReader(new InputStreamReader( 26 | new FileInputStream(filePath))); 27 | for (String line = br.readLine(); line != null; line = br.readLine()) { 28 | if(line.length()==0||"".equals(line))continue; 29 | String[] str=line.split(","); 30 | UserPoint p0=new UserPoint(); 31 | p0.setUserid(str[0]); 32 | p0.setX(Double.valueOf(str[1])); 33 | p0.setY(Double.valueOf(str[2])); 34 | li.add(p0); 35 | //System.out.println(line); 36 | } 37 | br.close(); 38 | } 39 | //math.sqrt(double n) 40 | //扩展下,如果要给m开n次方就用java.lang.StrictMath.pow(m,1.0/n); 41 | //采用欧氏距离 42 | public static Double DistanceMeasure(UserPoint p1,UserPoint p2){ 43 | 44 | Double tmp=StrictMath.pow(p2.getX()-p1.getX(), 2)+StrictMath.pow(p2.getY()-p1.getY(), 2); 45 | return Math.sqrt(tmp); 46 | } 47 | 48 | //计算新的簇心 49 | public static Double CalCentroid(){ 50 | System.out.println("------------------------------------------------"); 51 | Double movedist=Double.MAX_VALUE; 52 | for(int i=0;i subli=list.get(i); 54 | UserPoint po=new UserPoint(); 55 | Double sumX=0.0; 56 | Double sumY=0.0; 57 | Double Clusterlen=Double.valueOf(subli.size()); 58 | for(int j=0;jconverge;times++){ 83 | System.out.println("第"+times+"次迭代"); 84 | //默认每一个list里的Vector第0个元素是质心 85 | for(int i=0;i li){ 18 | 19 | Double entropy=new Double(0.0); 20 | for(int i=0;i lasv){ 35 | Double gain=new Double(0.0); 36 | Double enSum=new Double(0.0); 37 | Map.Entryentry; 38 | for(int i=0;i children; // 子节点 10 | private String fatherAttribute; // 此节点是父类的哪具属性的分支 11 | //可信度 12 | private Double percent; 13 | 14 | //属性数组 15 | private ArrayList liatts; 16 | 17 | 18 | public ArrayList getLiatts() { 19 | return liatts; 20 | } 21 | public void setLiatts(ArrayList liatts) { 22 | this.liatts = liatts; 23 | } 24 | public String getName() { 25 | return name; 26 | } 27 | public void setName(String name) { 28 | this.name = name; 29 | } 30 | public TreeNode getParent() { 31 | return parent; 32 | } 33 | public void setParent(TreeNode parent) { 34 | this.parent = parent; 35 | } 36 | public List getChildren() { 37 | return children; 38 | } 39 | public void setChildren(List children) { 40 | this.children = children; 41 | } 42 | 43 | public String getFatherAttribute() { 44 | return fatherAttribute; 45 | } 46 | public void setFatherAttribute(String fatherAttribute) { 47 | this.fatherAttribute = fatherAttribute; 48 | } 49 | public Double getPercent() { 50 | return percent; 51 | } 52 | public void setPercent(Double percent) { 53 | this.percent = percent; 54 | } 55 | /** 56 | * 添加一个节点 57 | * @param child 58 | */ 59 | public void addChild(TreeNode child) { 60 | if (this.getChildren() == null) { 61 | List list = new ArrayList(); 62 | list.add(child); 63 | this.setChildren(list); 64 | } else { 65 | this.getChildren().add(child); 66 | } 67 | } 68 | /** 69 | * 是否存在着该节点,存在返回该节点,不存在返回空 70 | * @param name 71 | * @return 72 | */ 73 | public TreeNode findChild(String name) { 74 | List children = this.getChildren(); 75 | if (children != null) { 76 | for (TreeNode child : children) { 77 | if (child.getName().equals(name)) { 78 | return child; 79 | } 80 | } 81 | } 82 | return null; 83 | } 84 | 85 | } 86 | -------------------------------------------------------------------------------- /com.homework/src/sequence/machinelearning/decisiontree/sequence/machinelearning/decisiontree/myid3/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | */ 4 | /** 5 | * @author Administrator 6 | * 7 | */ 8 | package sequence.machinelearning.decisiontree.myid3; 
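For reference alongside the gain computation in the id3 code above, a self-contained entropy helper (a hypothetical sketch, not part of the original sources):

// Shannon entropy of a class-count distribution, in bits.
static double entropy(int[] classCounts) {
    int total = 0;
    for (int c : classCounts) total += c;
    double h = 0.0;
    for (int c : classCounts) {
        if (c == 0) continue;
        double p = (double) c / total;
        h -= p * (Math.log(p) / Math.log(2));
    }
    return h;
}
// Information gain of an attribute = entropy(parent set)
// minus the size-weighted sum of the entropies of each attribute-value subset.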
-------------------------------------------------------------------------------- /com.homework/src/sequence/machinelearning/naivebayes/sequence/machinelearning/naivebayes/bayesdemo/Main.java: -------------------------------------------------------------------------------- 1 | package sequence.machinelearning.naivebayes.bayesdemo; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileInputStream; 6 | import java.io.FileOutputStream; 7 | import java.io.FileWriter; 8 | import java.io.IOException; 9 | import java.io.InputStreamReader; 10 | 11 | public class Main { 12 | 13 | public static void main(String[] args) throws IOException { 14 | // TODO Auto-generated method stub 15 | Main m=new Main(); 16 | m.stringBufferDemo(); 17 | //m.fileWriter("D:/test.txt"); 18 | m.readF1(); 19 | } 20 | 21 | public void fileWriter(String fileName) throws IOException{ 22 | //创建一个FileWriter对象 23 | FileWriter fw = new FileWriter(fileName); 24 | //遍历clist集合写入到fileName中 25 | for (int i=0;i<10;i++){ 26 | fw.write("第"+i+"行----"); 27 | fw.write("\n"); 28 | } 29 | //刷新缓冲区 30 | fw.flush(); 31 | //关闭文件流对象 32 | fw.close(); 33 | } 34 | 35 | 36 | 37 | /** 38 | * 利用StringBuffer写文件 39 | * 该方法可以设定使用何种编码,有效解决中文问题。 40 | * @throws IOException 41 | */ 42 | 43 | public void stringBufferDemo() throws IOException 44 | { 45 | String src="datafile/naivebayes/train/out/result.arff"; 46 | delfile(src); 47 | File file=new File(src); 48 | if(file.exists()) 49 | file.createNewFile(); 50 | FileOutputStream out=new FileOutputStream(file,true); 51 | for(int i=0;i<10;i++) 52 | { 53 | StringBuffer sb=new StringBuffer(); 54 | sb.append("这是第"+i+"行 \n");//如果不加"/n"则不能实现换行。 55 | System.out.print(sb.toString()); 56 | 57 | out.write(sb.toString().getBytes("utf-8")); 58 | } 59 | out.close(); 60 | } 61 | public void delfile(String filepath){ 62 | File file=new File(filepath); 63 | if(file.exists()) 64 | { 65 | //file.createNewFile(); 66 | file.delete(); 67 | } 68 | 69 | } 70 | public void readF1() throws IOException { 71 | 72 | //String filePath="scripts/clustering/canopy/canopy.dat"; 73 | String filePath="datafile/naivebayes/train/out/result"; 74 | BufferedReader br = new BufferedReader(new InputStreamReader( 75 | new FileInputStream(filePath))); 76 | for (String line = br.readLine(); line != null; line = br.readLine()) { 77 | if(line.length()==0||"".equals(line))continue; 78 | String[] str=line.split(","); 79 | 80 | 81 | } 82 | br.close(); 83 | 84 | } 85 | 86 | 87 | } 88 | -------------------------------------------------------------------------------- /com.homework/src/sequence/machinelearning/naivebayes/sequence/machinelearning/naivebayes/bayesdemo/Test.java: -------------------------------------------------------------------------------- 1 | package sequence.machinelearning.naivebayes.bayesdemo; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileReader; 6 | import java.io.IOException; 7 | import java.math.BigDecimal; 8 | import java.util.ArrayList; 9 | import java.util.HashMap; 10 | import java.util.Map; 11 | import java.util.regex.Matcher; 12 | import java.util.regex.Pattern; 13 | 14 | public class Test { 15 | 16 | private static Map cmap=new HashMap(); 17 | private static Map pmap=new HashMap(); 18 | public static final String patternString = "@decision(.*)[{](.*?)[}]"; 19 | public BigDecimal getProbability(String[] line,String decision){ 20 | 21 | String ckey="P("+decision+")"; 22 | //获取P(yes)的概率 23 | BigDecimal result=new BigDecimal(cmap.get(ckey)); 24 | for(int j=0;j{ 27 | 
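	// Editor's note on the week2 DayIp job that follows (hedged): the mapper parses each
	// access-log line with Kpi.filterIPs and emits (remote_addr, 1) for valid records.
	// As written, the reducer adds 1 per reduce() call instead of summing the incoming
	// values, and the instance field sum is never reset between keys (the class is also
	// registered as the combiner), so the emitted numbers are a running key counter rather
	// than true per-IP hit counts; a conventional reducer would loop over values and
	// accumulate values.next().get().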
private final static IntWritable one=new IntWritable(1); 28 | Text ip=new Text(); 29 | @Override 30 | public void map(Object key, Text value,OutputCollector output, Reporter reporter)throws IOException { 31 | // TODO Auto-generated method stub 32 | Kpi kpi=new Kpi(); 33 | kpi=Kpi.filterIPs(value.toString()); 34 | if(kpi.isValid()==true){ 35 | ip.set(kpi.getRemote_addr()); 36 | output.collect(ip, one); 37 | } 38 | } 39 | } 40 | public static class IpReducer extends MapReduceBase implements Reducer{ 41 | private IntWritable sumresult=new IntWritable(0); 42 | //private final static IntWritable one =new IntWritable(1); 43 | private int sum=0; 44 | 45 | @Override 46 | public void reduce(Text key, Iterator values,OutputCollector output, Reporter reporter)throws IOException { 47 | // TODO Auto-generated method stub 48 | sum=sum+1; 49 | 50 | sumresult.set(sum); 51 | System.out.print(key+"is:"+sumresult); 52 | output.collect(key, sumresult); 53 | } 54 | } 55 | /** 56 | * @param args 57 | */ 58 | public static void main(String[] args) throws Exception{ 59 | // TODO Auto-generated method stub 60 | String inpath="hdfs://10.6.3.200:9000/user/hdfs/in/"; 61 | String outpath="hdfs://10.6.3.200:9000/user/hdfs/ip_out/"; 62 | 63 | JobConf conf=new JobConf(DayIp.class); 64 | conf.setJobName("depend ip count is:"); 65 | 66 | conf.setMapOutputKeyClass(Text.class); 67 | conf.setMapOutputValueClass(IntWritable.class); 68 | 69 | conf.setOutputKeyClass(Text.class); 70 | conf.setOutputValueClass(IntWritable.class); 71 | 72 | conf.setMapperClass(IpMapper.class); 73 | conf.setReducerClass(IpReducer.class); 74 | conf.setCombinerClass(IpReducer.class); 75 | 76 | conf.setInputFormat(TextInputFormat.class); 77 | conf.setOutputFormat(TextOutputFormat.class); 78 | 79 | FileInputFormat.setInputPaths(conf, new Path(inpath)); 80 | FileOutputFormat.setOutputPath(conf,new Path(outpath)); 81 | 82 | JobClient.runJob(conf); 83 | System.out.println("finish"); 84 | System.exit(0); 85 | 86 | } 87 | 88 | } 89 | -------------------------------------------------------------------------------- /com.homework/src/week2/business/StatPV.java: -------------------------------------------------------------------------------- 1 | package business; 2 | 3 | import java.io.IOException; 4 | import java.util.Iterator; 5 | 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.IntWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapred.FileInputFormat; 10 | import org.apache.hadoop.mapred.FileOutputFormat; 11 | import org.apache.hadoop.mapred.JobClient; 12 | import org.apache.hadoop.mapred.JobConf; 13 | import org.apache.hadoop.mapred.MapReduceBase; 14 | import org.apache.hadoop.mapred.Mapper; 15 | import org.apache.hadoop.mapred.OutputCollector; 16 | import org.apache.hadoop.mapred.Reducer; 17 | import org.apache.hadoop.mapred.Reporter; 18 | import org.apache.hadoop.mapred.TextInputFormat; 19 | import org.apache.hadoop.mapred.TextOutputFormat; 20 | 21 | import entity.Kpi; 22 | 23 | public class StatPV { 24 | 25 | private static class PvMapper extends MapReduceBase implements Mapper{ 26 | 27 | private IntWritable one=new IntWritable(1); 28 | private Text pvtxt=new Text(); 29 | @Override 30 | public void map(Object key, Text value,OutputCollector output, Reporter reporter)throws IOException { 31 | // TODO Auto-generated method stub 32 | try { 33 | Kpi kpi=Kpi.filterPVs(value.toString()); 34 | pvtxt.set("pv"); 35 | output.collect(pvtxt, one); 36 | } catch (Exception e) { 37 | // TODO Auto-generated catch block 38 
| e.printStackTrace(); 39 | } 40 | } 41 | 42 | } 43 | private static class PvReducer extends MapReduceBase implements Reducer{ 44 | 45 | private IntWritable result=new IntWritable(0); 46 | 47 | @Override 48 | public void reduce(Text key, Iterator values,OutputCollector output, Reporter reporter)throws IOException { 49 | // TODO Auto-generated method stub 50 | int sum=0; 51 | try { 52 | while(values.hasNext()){ 53 | 54 | sum=sum+ values.next().get(); 55 | } 56 | result.set(sum); 57 | output.collect(key, result); 58 | } catch (Exception e) { 59 | // TODO Auto-generated catch block 60 | e.printStackTrace(); 61 | } 62 | } 63 | } 64 | public static void main(String[] args) throws IOException { 65 | String inpath="hdfs://localhost:9000/user/hdfs/in/"; 66 | String outpath="hdfs://localhost:9000/user/hdfs/pv_out/"; 67 | JobConf conf=new JobConf(StatPV.class); 68 | conf.setJobName("StatPV"); 69 | conf.setMapperClass(PvMapper.class); 70 | conf.setCombinerClass(PvReducer.class); 71 | conf.setReducerClass(PvReducer.class); 72 | 73 | 74 | conf.setOutputKeyClass(Text.class); 75 | conf.setOutputValueClass(IntWritable.class); 76 | 77 | conf.setMapOutputKeyClass(Text.class); 78 | conf.setMapOutputValueClass(IntWritable.class); 79 | 80 | conf.setInputFormat(TextInputFormat.class); 81 | conf.setOutputFormat(TextOutputFormat.class); 82 | 83 | FileInputFormat.setInputPaths(conf, new Path(inpath)); 84 | FileOutputFormat.setOutputPath(conf, new Path(outpath)); 85 | JobClient.runJob(conf); 86 | System.out.println("finish"); 87 | System.exit(0); 88 | } 89 | 90 | } 91 | -------------------------------------------------------------------------------- /com.homework/src/week2/business/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | */ 4 | /** 5 | * @author hadoop 6 | * 7 | */ 8 | package business; -------------------------------------------------------------------------------- /com.homework/src/week2/entity/Kpi.java: -------------------------------------------------------------------------------- 1 | package entity; 2 | 3 | import java.util.HashSet; 4 | import java.util.Set; 5 | 6 | 7 | 8 | 9 | 10 | public class Kpi { 11 | 12 | private String remote_addr;// 记录客户端的ip地址 13 | private String remote_user;// 记录客户端用户名称,忽略属性"-" 14 | private String time_local;// 记录访问时间与时区 15 | private String request;// 记录请求的url与http协议 16 | private String status;// 记录请求状态;成功是200 17 | private String body_bytes_sent;// 记录发送给客户端文件主体内容大小 18 | private String http_referer;// 用来记录从那个页面链接访问过来的 19 | private String http_user_agent;// 记录客户浏览器的相关信息 20 | 21 | private boolean valid = true;// 判断数据是否合法 22 | 23 | private static Kpi parser(String line) { 24 | //System.out.println(line); 25 | Kpi kpi = new Kpi(); 26 | try { 27 | String[] arr = line.split(" "); 28 | if (arr.length > 11) { 29 | kpi.setRemote_addr(arr[0]); 30 | kpi.setRemote_user(arr[1]); 31 | kpi.setTime_local(arr[3].substring(1)); 32 | kpi.setRequest(arr[6]); 33 | kpi.setStatus(arr[8]); 34 | kpi.setBody_bytes_sent(arr[9]); 35 | kpi.setHttp_referer(arr[10]); 36 | 37 | if (arr.length > 12) { 38 | kpi.setHttp_user_agent(arr[11] + " " + arr[12]); 39 | } else { 40 | kpi.setHttp_user_agent(arr[11]); 41 | } 42 | 43 | if (Integer.parseInt(kpi.getStatus()) >= 400) {// 大于400,HTTP错误 44 | kpi.setValid(false); 45 | } 46 | } else { 47 | kpi.setValid(false); 48 | } 49 | } catch (NumberFormatException e) { 50 | // TODO Auto-generated catch block 51 | e.printStackTrace(); 52 | } 53 | 54 | 55 | return kpi; 56 | } 57 | public static Kpi 
filterIPs(String line) { 58 | 59 | Kpi kpi=new Kpi(); 60 | try { 61 | kpi = parser(line); 62 | int n1=kpi.getRequest().indexOf(".php"); 63 | int n2=kpi.getRequest().indexOf(".html"); 64 | if((n1+n2)==-2){ 65 | kpi.setValid(false); 66 | } 67 | } catch (Exception e) { 68 | // TODO Auto-generated catch block 69 | e.printStackTrace(); 70 | } 71 | return kpi; 72 | } 73 | public static Kpi filterPVs(String line) { 74 | 75 | Kpi kpi=new Kpi(); 76 | try { 77 | kpi = parser(line); 78 | int n1=kpi.getRequest().indexOf(".php"); 79 | int n2=kpi.getRequest().indexOf(".html"); 80 | if((n1+n2)==-2){ 81 | kpi.setValid(false); 82 | } 83 | int n3=kpi.getRequest().indexOf("baidu"); 84 | int n4=kpi.getRequest().indexOf("google"); 85 | if((n3+n4)!=-2){ 86 | kpi.setValid(false); 87 | } 88 | } catch (Exception e) { 89 | // TODO Auto-generated catch block 90 | e.printStackTrace(); 91 | } 92 | return kpi; 93 | } 94 | 95 | public String getRemote_addr() { 96 | return remote_addr; 97 | } 98 | 99 | public void setRemote_addr(String remote_addr) { 100 | this.remote_addr = remote_addr; 101 | } 102 | 103 | public String getRemote_user() { 104 | return remote_user; 105 | } 106 | 107 | public void setRemote_user(String remote_user) { 108 | this.remote_user = remote_user; 109 | } 110 | 111 | public String getTime_local() { 112 | return time_local; 113 | } 114 | 115 | public void setTime_local(String time_local) { 116 | this.time_local = time_local; 117 | } 118 | 119 | public String getRequest() { 120 | return request; 121 | } 122 | 123 | public void setRequest(String request) { 124 | this.request = request; 125 | } 126 | 127 | public String getStatus() { 128 | return status; 129 | } 130 | 131 | public void setStatus(String status) { 132 | this.status = status; 133 | } 134 | 135 | public String getBody_bytes_sent() { 136 | return body_bytes_sent; 137 | } 138 | 139 | public void setBody_bytes_sent(String body_bytes_sent) { 140 | this.body_bytes_sent = body_bytes_sent; 141 | } 142 | 143 | public String getHttp_referer() { 144 | return http_referer; 145 | } 146 | 147 | public void setHttp_referer(String http_referer) { 148 | this.http_referer = http_referer; 149 | } 150 | 151 | public String getHttp_user_agent() { 152 | return http_user_agent; 153 | } 154 | 155 | public void setHttp_user_agent(String http_user_agent) { 156 | this.http_user_agent = http_user_agent; 157 | } 158 | 159 | public boolean isValid() { 160 | return valid; 161 | } 162 | 163 | public void setValid(boolean valid) { 164 | this.valid = valid; 165 | } 166 | 167 | 168 | } 169 | -------------------------------------------------------------------------------- /com.homework/src/week2/entity/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | */ 4 | /** 5 | * @author hadoop 6 | * 7 | */ 8 | package entity; -------------------------------------------------------------------------------- /com.homework/src/week3/mine/Outinfo.java: -------------------------------------------------------------------------------- 1 | package mine; 2 | 3 | import org.apache.hadoop.io.IntWritable; 4 | import org.apache.hadoop.io.Text; 5 | 6 | public class Outinfo { 7 | 8 | private Text outkey; 9 | private Text outvalue; 10 | private boolean outValidate=true; 11 | 12 | public Text getOutkey() { 13 | return outkey; 14 | } 15 | public void setOutkey(Text outkey) { 16 | this.outkey = outkey; 17 | } 18 | 19 | public Text getOutvalue() { 20 | return outvalue; 21 | } 22 | public void setOutvalue(Text outvalue) { 23 | 
this.outvalue = outvalue; 24 | } 25 | public boolean isOutValidate() { 26 | return outValidate; 27 | } 28 | public void setOutValidate(boolean outValidate) { 29 | this.outValidate = outValidate; 30 | } 31 | 32 | 33 | } 34 | -------------------------------------------------------------------------------- /com.homework/src/week3/mine/StationInfo.java: -------------------------------------------------------------------------------- 1 | package mine; 2 | 3 | import java.text.ParseException; 4 | import java.text.SimpleDateFormat; 5 | import java.util.Date; 6 | 7 | import org.apache.hadoop.io.IntWritable; 8 | import org.apache.hadoop.io.Text; 9 | 10 | 11 | 12 | public class StationInfo { 13 | 14 | private String imsi; 15 | private String imei; 16 | private String updatetype; 17 | private String local; 18 | private Date time; 19 | private String url; 20 | private Integer type; 21 | private boolean validate=true; 22 | 23 | //type=0表示POS位置信息,1表示NET上网记录 24 | private SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); 25 | 26 | public StationInfo init(String line,int type) throws ParseException{ 27 | 28 | StationInfo info= new StationInfo(); 29 | if(line==null||line.trim().length()<=0){ 30 | info.setValidate(false); 31 | return info; 32 | } 33 | try { 34 | 35 | if(line.length()>0){ 36 | String[] arr=line.split("\t"); 37 | if(arr.length!=5){ 38 | info.setValidate(false); 39 | return info; 40 | } 41 | if(type==0){ 42 | info.setImsi(arr[0]); 43 | info.setImei(arr[1]); 44 | info.setUpdatetype(arr[2]); 45 | info.setLocal(arr[3]); 46 | info.setTime(this.formatter.parse(arr[4])); 47 | }else if(type==1){ 48 | info.setImsi(arr[0]); 49 | info.setImei(arr[1]); 50 | info.setLocal(arr[2]); 51 | info.setTime(this.formatter.parse(arr[3])); 52 | info.setUrl(arr[4]); 53 | } 54 | } 55 | else{ 56 | info.setValidate(false); 57 | } 58 | } catch (Exception e) { 59 | // TODO Auto-generated catch block 60 | e.printStackTrace(); 61 | info.setValidate(false); 62 | return info; 63 | } 64 | return info; 65 | 66 | } 67 | //date表示所要计算的日期 68 | public Outinfo output(String line,int type,String date,String[] timepoint) throws ParseException{ 69 | 70 | StationInfo info=new StationInfo(); 71 | Outinfo outinfo=new Outinfo(); 72 | String timeFlag; 73 | 74 | try { 75 | info=info.init(line, type); 76 | if(!info.isValidate()){ 77 | outinfo.setOutValidate(false); 78 | return outinfo; 79 | } 80 | String dateValue= formatter.format(info.getTime()); 81 | if(!dateValue.startsWith(date)){ 82 | outinfo.setOutValidate(false); 83 | } 84 | 85 | //计算所属时间段 86 | int i = 0, n = timepoint.length; 87 | int hour = Integer.valueOf( dateValue.split(" ")[1].split(":")[0] ); 88 | while ( i < n && Integer.valueOf( timepoint[i] ) <= hour ) 89 | i++; 90 | if ( i < n ) 91 | { 92 | if ( i == 0 ) 93 | timeFlag = ( "00-" + timepoint[i] ); 94 | else 95 | timeFlag = ( timepoint[i-1] + "-" + timepoint[i] ); 96 | } 97 | else //Hour大于最大的时间点 98 | timeFlag="unknow"; 99 | String outkey=info.getImsi()+"|"+timeFlag; 100 | Text keytext=new Text(); 101 | Text valuenum=new Text(); 102 | long t=(info.getTime().getTime()/1000l); 103 | 104 | valuenum.set(info.getLocal()+"|"+String.valueOf(t)); 105 | keytext.set(outkey.toString()); 106 | outinfo.setOutkey(keytext); 107 | outinfo.setOutvalue(valuenum); 108 | } catch (NumberFormatException e) { 109 | // TODO Auto-generated catch block 110 | e.printStackTrace(); 111 | outinfo.setOutValidate(false); 112 | return outinfo; 113 | } 114 | 115 | return outinfo; 116 | } 117 | 118 | 119 | 120 | public String getImsi() { 
121 | return imsi; 122 | } 123 | public void setImsi(String imsi) { 124 | this.imsi = imsi; 125 | } 126 | public String getImei() { 127 | return imei; 128 | } 129 | public void setImei(String imei) { 130 | this.imei = imei; 131 | } 132 | public String getUpdatetype() { 133 | return updatetype; 134 | } 135 | public void setUpdatetype(String updatetype) { 136 | this.updatetype = updatetype; 137 | } 138 | public String getLocal() { 139 | return local; 140 | } 141 | public void setLocal(String local) { 142 | this.local = local; 143 | } 144 | public Date getTime() { 145 | return time; 146 | } 147 | public void setTime(Date time) { 148 | this.time = time; 149 | } 150 | public String getUrl() { 151 | return url; 152 | } 153 | public void setUrl(String url) { 154 | this.url = url; 155 | } 156 | 157 | 158 | 159 | public boolean isValidate() { 160 | return validate; 161 | } 162 | 163 | 164 | 165 | public void setValidate(boolean validate) { 166 | this.validate = validate; 167 | } 168 | 169 | 170 | 171 | public Integer getType() { 172 | return type; 173 | } 174 | 175 | 176 | 177 | public void setType(Integer type) { 178 | this.type = type; 179 | } 180 | 181 | 182 | 183 | } 184 | -------------------------------------------------------------------------------- /com.homework/src/week3/mine/StayTime2改造前备份.rar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bellonor/myHadoopProject/5b7f551dda6daf86489eeed8370d868a90a5cb8e/com.homework/src/week3/mine/StayTime2改造前备份.rar -------------------------------------------------------------------------------- /com.homework/src/week3/mine/my.net: -------------------------------------------------------------------------------- 1 | 0000000001 0000000001 10000001 2014-03-19 08:50:00 www.baidu.com 2 | 0000000002 0000000002 20000001 2014-03-19 07:20:00 www.baidu.com 3 | 0000000003 0000000003 30000001 2014-03-19 08:10:00 www.google.com 4 | 5 | -------------------------------------------------------------------------------- /com.homework/src/week3/mine/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | */ 4 | /** 5 | * @author hadoop 6 | * 7 | */ 8 | package mine; -------------------------------------------------------------------------------- /com.homework/src/week3/tutorial/TableLine.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 本包为 Dataguru.cn Hadoop 实战案例课程程序 3 | * 编写者:James. 
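 * (Editor's note, hedged) TableLine below extracts the imsi, position and time fields from
 * either the position table or the net table, validates the date, and maps the record's hour
 * into one of the configured timepoint buckets (e.g. "00-09", "09-17"); see set() for details.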
4 | */ 5 | package tutorial; 6 | 7 | import java.text.ParseException; 8 | import java.text.SimpleDateFormat; 9 | import java.util.Date; 10 | 11 | import org.apache.hadoop.io.Text; 12 | 13 | /** 14 | * 定义异常类 15 | */ 16 | class LineException extends Exception 17 | { 18 | private static final long serialVersionUID = 8245008693589452584L; 19 | int flag; 20 | public LineException(String msg, int flag) 21 | { 22 | super(msg); 23 | this.flag = flag; 24 | } 25 | public int getFlag() 26 | { 27 | return flag; 28 | } 29 | } 30 | 31 | 32 | /** 33 | * 读取一行数据 34 | * 提取所要字段 35 | */ 36 | public class TableLine 37 | { 38 | private String imsi, position, time, timeFlag; 39 | private Date day; 40 | private SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); 41 | 42 | /** 43 | * 初始化并检查该行的合法性 44 | */ 45 | public void set ( String line, boolean source, String date, String [] timepoint ) throws LineException 46 | { 47 | String [] lineSplit = line.split("\t"); 48 | if( source ) 49 | { 50 | this.imsi = lineSplit[0]; 51 | this.position = lineSplit[3]; 52 | this.time = lineSplit[4]; 53 | } 54 | else 55 | { 56 | this.imsi = lineSplit[0]; 57 | this.position = lineSplit[2]; 58 | this.time = lineSplit[3]; 59 | } 60 | 61 | //检查日期合法性 62 | if ( ! this.time.startsWith(date) ) //年月日必须与date一致 63 | throw new LineException("", -1); 64 | 65 | try 66 | { 67 | this.day = this.formatter.parse(this.time); 68 | } 69 | catch ( ParseException e ) 70 | { 71 | throw new LineException("", 0); 72 | } 73 | 74 | //计算所属时间段 75 | int i = 0, n = timepoint.length; 76 | int hour = Integer.valueOf( this.time.split(" ")[1].split(":")[0] ); 77 | while ( i < n && Integer.valueOf( timepoint[i] ) <= hour ) 78 | i++; 79 | if ( i < n ) 80 | { 81 | if ( i == 0 ) 82 | this.timeFlag = ( "00-" + timepoint[i] ); 83 | else 84 | this.timeFlag = ( timepoint[i-1] + "-" + timepoint[i] ); 85 | } 86 | else //Hour大于最大的时间点 87 | throw new LineException("", -1); 88 | } 89 | 90 | /** 91 | * 输出KEY 92 | */ 93 | public Text outKey() 94 | { 95 | return new Text ( this.imsi + "|" + this.timeFlag ); 96 | } 97 | 98 | /** 99 | * 输出VALUE 100 | */ 101 | public Text outValue() 102 | { 103 | long t = ( day.getTime() / 1000L ); //用时间的偏移量作为输出时间 104 | return new Text ( this.position + "|" + String.valueOf(t) ); 105 | } 106 | } -------------------------------------------------------------------------------- /com.homework/src/week3/tutorial/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | */ 4 | /** 5 | * @author hadoop 6 | * 7 | */ 8 | package tutorial; -------------------------------------------------------------------------------- /com.homework/src/week5/matrix/Multiply.java: -------------------------------------------------------------------------------- 1 | package matrix; 2 | 3 | public class Multiply { 4 | 5 | public static void main(String[] args) { 6 | // TODO Auto-generated method stub 7 | int[][] a={{1,0,3,-1},{2,1,0,2}}; 8 | int[][] b={{4,1,0},{-1,1,3},{2,0,1},{1,3,4}}; 9 | int[][] c=new int[2][3]; 10 | //int[][] c; 11 | for(int i=0;i path = new HashMap(); 17 | //hdfs://localhost:9000/user/hdfs/in/ 18 | path.put("matrixMult", HDFS+"Mult/"); 19 | path.put("matrixMultOut", HDFS+"/Mult/Out/"); 20 | //Step1.run(path); 21 | // MyTest.run(path); 22 | SparseMatrix.run(path); 23 | //Step3.run1(path); 24 | //Step3.run2(path); 25 | // Step4.run(path); 26 | 27 | //Step4_Update.run(path); 28 | //Step4_Update2.run(path); 29 | 30 | 31 | // // hadoop fs -cat /user/hdfs/recommend/step4/part-00000 32 | 
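	// Editor's note (hedged): the nested loops of Multiply.java above were lost to extraction
	// damage; given the declared shapes (a is 2x4, b is 4x3, c is 2x3), the dense product it
	// presumably computed is:
	//   for (int i = 0; i < 2; i++)
	//     for (int j = 0; j < 3; j++)
	//       for (int k = 0; k < 4; k++)
	//         c[i][j] += a[i][k] * b[k][j];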
// JobConf conf = config(); 33 | // HdfsDAO hdfs = new HdfsDAO(HDFS, conf); 34 | // hdfs.cat("/user/hdfs/recommend/step4/part-00000"); 35 | 36 | System.exit(0); 37 | } 38 | 39 | public static JobConf config() { 40 | JobConf conf = new JobConf(Recommend.class); 41 | conf.setJobName("Recommand"); 42 | //conf.addResource("classpath:/hadoop/core-site.xml"); 43 | //conf.addResource("classpath:/hadoop/hdfs-site.xml"); 44 | //conf.addResource("classpath:/hadoop/mapred-site.xml"); 45 | conf.set("io.sort.mb", "1024"); 46 | return conf; 47 | } 48 | 49 | } 50 | -------------------------------------------------------------------------------- /com.homework/src/week5/matrix/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | */ 4 | /** 5 | * @author Administrator 6 | * 7 | */ 8 | package matrix; -------------------------------------------------------------------------------- /com.homework/src/week5/recommend/MainPodium.java: -------------------------------------------------------------------------------- 1 | package recommend; 2 | 3 | import java.util.HashMap; 4 | import java.util.Map; 5 | import java.util.regex.Pattern; 6 | 7 | 8 | 9 | import org.apache.hadoop.mapred.JobConf; 10 | 11 | public class MainPodium { 12 | public static final String HDFS = "hdfs://192.168.0.200:9000/user/hdfs/week5/"; 13 | public static final Pattern DELIMITER = Pattern.compile("[\t,]"); 14 | public static final String Step1In = HDFS+"step1In/"; 15 | public static final String Step1Out = HDFS+"step1Out/"; 16 | 17 | public static final String Step2In = Step1Out; 18 | public static final String Step2Out = HDFS+"step2Out/"; 19 | 20 | public static final String Step3In = Step1Out; 21 | public static final String Step3Out = HDFS+"step3Out/"; 22 | 23 | public static final String Step4In1 = Step2Out; 24 | public static final String Step4In2 = Step3Out; 25 | public static final String Step4Out = HDFS+"step4Out/"; 26 | 27 | 28 | public static void main(String[] args) throws Exception { 29 | 30 | Map path = new HashMap(); 31 | path.put("data", "datafile/week5/small.csv"); 32 | path.put("Step1In", Step1In); 33 | path.put("Step1Out", Step1Out); 34 | 35 | path.put("Step2In", Step2In); 36 | path.put("Step2Out", Step2Out); 37 | 38 | path.put("Step3In", Step3In); 39 | path.put("Step3Out", Step3Out); 40 | 41 | path.put("Step4In1", Step4In1); 42 | path.put("Step4In2", Step4In2); 43 | path.put("Step4Out", Step4Out); 44 | Step1.run(path); 45 | Step2.run(path); 46 | Step3.run(path); 47 | 48 | Step4.run(path); 49 | 50 | //Step4_Update.run(path); 51 | //Step4_Update2.run(path); 52 | 53 | 54 | // // hadoop fs -cat /user/hdfs/recommend/step4/part-00000 55 | // JobConf conf = config(); 56 | // HdfsDAO hdfs = new HdfsDAO(HDFS, conf); 57 | // hdfs.cat("/user/hdfs/recommend/step4/part-00000"); 58 | 59 | System.exit(0); 60 | } 61 | public static JobConf config() { 62 | JobConf conf = new JobConf(MainPodium.class); 63 | conf.setJobName("Recommand"); 64 | //conf.addResource("classpath:/hadoop/core-site.xml"); 65 | //conf.addResource("classpath:/hadoop/hdfs-site.xml"); 66 | //conf.addResource("classpath:/hadoop/mapred-site.xml"); 67 | conf.set("io.sort.mb", "1024"); 68 | return conf; 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /com.homework/src/week5/recommend/Step1.java: -------------------------------------------------------------------------------- 1 | package recommend; 2 | 3 | import java.io.IOException; 4 | import 
java.util.Iterator; 5 | import java.util.Map; 6 | 7 | import com.homework.hdfs.HdfsDAO; 8 | 9 | import org.apache.hadoop.fs.Path; 10 | import org.apache.hadoop.io.IntWritable; 11 | import org.apache.hadoop.io.Text; 12 | import org.apache.hadoop.mapred.FileInputFormat; 13 | import org.apache.hadoop.mapred.FileOutputFormat; 14 | import org.apache.hadoop.mapred.JobClient; 15 | import org.apache.hadoop.mapred.JobConf; 16 | import org.apache.hadoop.mapred.MapReduceBase; 17 | import org.apache.hadoop.mapred.Mapper; 18 | import org.apache.hadoop.mapred.OutputCollector; 19 | import org.apache.hadoop.mapred.Reducer; 20 | import org.apache.hadoop.mapred.Reporter; 21 | import org.apache.hadoop.mapred.RunningJob; 22 | import org.apache.hadoop.mapred.TextInputFormat; 23 | import org.apache.hadoop.mapred.TextOutputFormat; 24 | 25 | /*得出以下结果 26 | 1 102:3.0,103:2.5,101:5.0 27 | 2 101:2.0,102:2.5,103:5.0,104:2.0 28 | 3 107:5.0,101:2.0,104:4.0,105:4.5 29 | 4 101:5.0,103:3.0,104:4.5,106:4.0 30 | 5 101:4.0,102:3.0,103:2.0,104:4.0,105:3.5,106:4.0*/ 31 | public class Step1 { 32 | 33 | public static class Step1Map extends MapReduceBase implements Mapper{ 34 | 35 | @Override 36 | public void map(Object key, Text value,OutputCollector output, Reporter reporter)throws IOException { 37 | String[] tokens=MainPodium.DELIMITER.split(value.toString()); 38 | Text k=new Text(); 39 | Text v=new Text(); 40 | k.set(tokens[0]); 41 | v.set(tokens[1]+":"+tokens[2]); 42 | output.collect(k, v); 43 | } 44 | 45 | } 46 | public static class Step1Reduce extends MapReduceBase implements Reducer{ 47 | 48 | @Override 49 | public void reduce(Text key, Iterator values,OutputCollector output, Reporter reporter)throws IOException { 50 | Text v=new Text(); 51 | String str=""; 52 | while(values.hasNext()){ 53 | str=str+values.next()+","; 54 | } 55 | int n=str.lastIndexOf(","); 56 | 57 | v.set(str.substring(0,n)); 58 | output.collect(key, v); 59 | 60 | } 61 | } 62 | 63 | public static void run(Map path) throws IOException { 64 | JobConf conf = MainPodium.config(); 65 | 66 | String input = path.get("Step1In"); 67 | String output = path.get("Step1Out"); 68 | 69 | HdfsDAO hdfs = new HdfsDAO(MainPodium.HDFS, conf); 70 | // hdfs.rmr(output); 71 | hdfs.rmr(output); 72 | hdfs.rmr(input); 73 | hdfs.mkdirs(input); 74 | hdfs.copyFile(path.get("data"), input); 75 | 76 | conf.setMapOutputKeyClass(Text.class); 77 | conf.setMapOutputValueClass(Text.class); 78 | 79 | conf.setOutputKeyClass(Text.class); 80 | conf.setOutputValueClass(Text.class); 81 | 82 | conf.setMapperClass(Step1Map.class); 83 | conf.setCombinerClass(Step1Reduce.class); 84 | conf.setReducerClass(Step1Reduce.class); 85 | 86 | conf.setInputFormat(TextInputFormat.class); 87 | conf.setOutputFormat(TextOutputFormat.class); 88 | 89 | FileInputFormat.setInputPaths(conf, new Path(input)); 90 | FileOutputFormat.setOutputPath(conf, new Path(output)); 91 | 92 | RunningJob job = JobClient.runJob(conf); 93 | while (!job.isComplete()) { 94 | job.waitForCompletion(); 95 | } 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /com.homework/src/week5/recommend/Step2.java: -------------------------------------------------------------------------------- 1 | package recommend; 2 | 3 | import java.io.IOException; 4 | import java.util.Iterator; 5 | import java.util.Map; 6 | 7 | import com.homework.hdfs.HdfsDAO; 8 | 9 | import org.apache.hadoop.fs.Path; 10 | import org.apache.hadoop.io.IntWritable; 11 | import org.apache.hadoop.io.LongWritable; 12 | import 
org.apache.hadoop.io.NullWritable; 13 | import org.apache.hadoop.io.Text; 14 | import org.apache.hadoop.mapred.FileInputFormat; 15 | import org.apache.hadoop.mapred.FileOutputFormat; 16 | import org.apache.hadoop.mapred.JobClient; 17 | import org.apache.hadoop.mapred.JobConf; 18 | import org.apache.hadoop.mapred.MapReduceBase; 19 | import org.apache.hadoop.mapred.Mapper; 20 | import org.apache.hadoop.mapred.OutputCollector; 21 | import org.apache.hadoop.mapred.Reducer; 22 | import org.apache.hadoop.mapred.Reporter; 23 | import org.apache.hadoop.mapred.RunningJob; 24 | import org.apache.hadoop.mapred.TextInputFormat; 25 | import org.apache.hadoop.mapred.TextOutputFormat; 26 | 27 | /*得出物品同现矩阵*/ 28 | public class Step2 { 29 | 30 | public static class Step2Mapper extends MapReduceBase implements Mapper{ 31 | 32 | private final static IntWritable v = new IntWritable(1); 33 | @Override 34 | public void map(Object key, Text value,OutputCollector output, Reporter reporter) 35 | throws IOException { 36 | String[] tokens=MainPodium.DELIMITER.split(value.toString()); 37 | Text k=new Text(); 38 | 39 | for(int i=1;i{ 52 | 53 | @Override 54 | public void reduce(Text key, Iterator values,OutputCollector output, Reporter reporter) 55 | throws IOException { 56 | Integer sum=0; 57 | while(values.hasNext()){ 58 | sum=sum+values.next().get(); 59 | } 60 | IntWritable result = new IntWritable(); 61 | //result.set(key+","+sum.toString()); 62 | result.set(sum); 63 | output.collect(key,result); 64 | } 65 | 66 | } 67 | public static void run(Map path) throws IOException { 68 | JobConf conf = MainPodium.config(); 69 | 70 | String input = path.get("Step2In"); 71 | String output = path.get("Step2Out"); 72 | 73 | HdfsDAO hdfs = new HdfsDAO(MainPodium.HDFS, conf); 74 | hdfs.rmr(output); 75 | 76 | //conf.setMapOutputKeyClass(Text.class); 77 | //conf.setMapOutputValueClass(IntWritable.class); 78 | 79 | conf.setOutputKeyClass(Text.class); 80 | conf.setOutputValueClass(IntWritable.class); 81 | 82 | conf.setMapperClass(Step2Mapper.class); 83 | conf.setCombinerClass(Step2Reduce.class); 84 | conf.setReducerClass(Step2Reduce.class); 85 | 86 | conf.setInputFormat(TextInputFormat.class); 87 | conf.setOutputFormat(TextOutputFormat.class); 88 | 89 | FileInputFormat.setInputPaths(conf, new Path(input)); 90 | FileOutputFormat.setOutputPath(conf, new Path(output)); 91 | 92 | RunningJob job = JobClient.runJob(conf); 93 | while (!job.isComplete()) { 94 | job.waitForCompletion(); 95 | } 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /com.homework/src/week5/recommend/Step3.java: -------------------------------------------------------------------------------- 1 | package recommend; 2 | 3 | import java.io.IOException; 4 | import java.util.Iterator; 5 | import java.util.Map; 6 | 7 | import com.homework.hdfs.HdfsDAO; 8 | 9 | import org.apache.hadoop.fs.Path; 10 | import org.apache.hadoop.io.IntWritable; 11 | import org.apache.hadoop.io.LongWritable; 12 | import org.apache.hadoop.io.NullWritable; 13 | import org.apache.hadoop.io.Text; 14 | import org.apache.hadoop.mapred.FileInputFormat; 15 | import org.apache.hadoop.mapred.FileOutputFormat; 16 | import org.apache.hadoop.mapred.JobClient; 17 | import org.apache.hadoop.mapred.JobConf; 18 | import org.apache.hadoop.mapred.MapReduceBase; 19 | import org.apache.hadoop.mapred.Mapper; 20 | import org.apache.hadoop.mapred.OutputCollector; 21 | import org.apache.hadoop.mapred.Reducer; 22 | import org.apache.hadoop.mapred.Reporter; 23 | import 
org.apache.hadoop.mapred.RunningJob; 24 | import org.apache.hadoop.mapred.TextInputFormat; 25 | import org.apache.hadoop.mapred.TextOutputFormat; 26 | 27 | /*得出用户评分矩阵,用户名从左到右排*/ 28 | public class Step3 { 29 | public static class Step3Mapper extends MapReduceBase implements Mapper{ 30 | 31 | private final static IntWritable v = new IntWritable(1); 32 | @Override 33 | public void map(Object key, Text value,OutputCollector output, Reporter reporter) 34 | throws IOException { 35 | String[] tokens=MainPodium.DELIMITER.split(value.toString()); 36 | Text k=new Text(); 37 | for(int i=1;i path) throws IOException { 49 | JobConf conf = MainPodium.config(); 50 | 51 | String input = path.get("Step3In"); 52 | String output = path.get("Step3Out"); 53 | 54 | HdfsDAO hdfs = new HdfsDAO(MainPodium.HDFS, conf); 55 | hdfs.rmr(output); 56 | 57 | conf.setOutputKeyClass(NullWritable.class); 58 | conf.setOutputValueClass(Text.class); 59 | 60 | conf.setMapperClass(Step3Mapper.class); 61 | //conf.setCombinerClass(Step2Reduce.class); 62 | //conf.setReducerClass(Step2Reduce.class); 63 | 64 | conf.setInputFormat(TextInputFormat.class); 65 | conf.setOutputFormat(TextOutputFormat.class); 66 | 67 | FileInputFormat.setInputPaths(conf, new Path(input)); 68 | FileOutputFormat.setOutputPath(conf, new Path(output)); 69 | 70 | RunningJob job = JobClient.runJob(conf); 71 | while (!job.isComplete()) { 72 | job.waitForCompletion(); 73 | } 74 | } 75 | 76 | } 77 | -------------------------------------------------------------------------------- /com.homework/src/week5/recommend/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | */ 4 | /** 5 | * @author Administrator 6 | * 7 | */ 8 | package recommend; -------------------------------------------------------------------------------- /com.homework/src/week6/filterSalary/Main.java: -------------------------------------------------------------------------------- 1 | package filterSalary; 2 | 3 | import java.util.HashMap; 4 | import java.util.Map; 5 | import java.util.regex.Pattern; 6 | 7 | import org.apache.hadoop.mapred.JobConf; 8 | //第一步,MapReduce后产生:用户,浏览职位,职位薪水 9 | /* 1.map: 10 | * job.csv文件 11 | * key:jobid,value:job:job,salary 12 | * pv.csv文件 13 | * key:jobid,value:user,userid 14 | * 2.reduce: 15 | * key:userid, 16 | * value:jobid,salary 17 | * */ 18 | 19 | //第二步, MapReduce后产生: 用户,浏览过职位薪水相加*0.8 20 | 21 | //第三步:过滤: 推荐结果 > 用户浏览过职位平均薪水%80 . 
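//
// A small worked example of the three steps above (the rows are made-up sample values,
// not taken from the actual job.csv / pv.csv data files):
//
//   pv.csv  : 1,10   and   1,11                              (userid, jobid the user viewed)
//   job.csv : 10,2013-02-03,5000   and   11,2013-03-01,8000  (jobid, post date, salary)
//
//   Step1 joins the two files on jobid and emits:   1 <tab> 10,5000   and   1 <tab> 11,8000
//   Step2 averages the salaries the user viewed and multiplies by 0.8:
//         (5000 + 8000) / 2 * 0.8 = 5200                ->   1 <tab> 5200.0
//   Step3 joins the recommendation list (step0Out) with this threshold and keeps only the
//         recommended jobs whose salary is >= 5200.
//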
22 | public class Main { 23 | public static final String HDFS = "hdfs://192.168.0.200:9000/user/hdfs/week6/"; 24 | public static final Pattern DELIMITER = Pattern.compile("[\t,]"); 25 | 26 | public static final String Step0In = HDFS+"mahoutInput"; 27 | public static final String Step0Out = HDFS+"step0Out/"; 28 | 29 | public static final String Step1In = HDFS+"step1In/"; 30 | public static final String Step1Out = HDFS+"step1Out/"; 31 | 32 | public static final String Step2In = Step1Out; 33 | public static final String Step2Out = HDFS+"step2Out/"; 34 | 35 | public static final String Step3In1 = Step2Out; 36 | public static final String Step3In2 = Step0Out; 37 | public static final String Step3Out = HDFS+"step3Out/"; 38 | 39 | public static final String Step4In1 = Step0Out; 40 | public static final String Step4In2 = Step3Out; 41 | public static final String Step4Out = HDFS+"step4Out/"; 42 | 43 | public static void main(String[] args) throws Exception { 44 | 45 | Map path = new HashMap(); 46 | path.put("ToHdfsData1", "datafile/week6/job.csv"); 47 | path.put("ToHdfsData2", "datafile/week6/pv.csv"); 48 | 49 | path.put("Step0In", Step0In); 50 | path.put("Step0Out", Step0Out); 51 | 52 | path.put("Step1In", Step1In); 53 | path.put("Step1Out", Step1Out); 54 | 55 | path.put("Step2In", Step2In); 56 | path.put("Step2Out", Step2Out); 57 | 58 | path.put("Step3In1", Step3In1); 59 | path.put("Step3In2", Step3In2); 60 | path.put("Step3Out", Step3Out); 61 | 62 | path.put("Step4In1", Step4In1); 63 | path.put("Step4In2", Step4In2); 64 | path.put("Step4Out", Step4Out); 65 | //Step0.run(path); 66 | //Step1.run(path); 67 | //Step2.run(path); 68 | Step3.run(path); 69 | 70 | //Step4.run(path); 71 | 72 | //Step4_Update.run(path); 73 | //Step4_Update2.run(path); 74 | System.exit(0); 75 | } 76 | 77 | public static JobConf config() { 78 | JobConf conf = new JobConf(Main.class); 79 | conf.setJobName("Main"); 80 | //conf.addResource("classpath:/hadoop/core-site.xml"); 81 | //conf.addResource("classpath:/hadoop/hdfs-site.xml"); 82 | //conf.addResource("classpath:/hadoop/mapred-site.xml"); 83 | conf.set("io.sort.mb", "1024"); 84 | return conf; 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /com.homework/src/week6/filterSalary/Step0.java: -------------------------------------------------------------------------------- 1 | package filterSalary; 2 | import java.io.IOException; 3 | import java.util.HashMap; 4 | import java.util.Iterator; 5 | import java.util.Map; 6 | import com.homework.hdfs.HdfsDAO; 7 | import org.apache.hadoop.fs.Path; 8 | import org.apache.hadoop.io.IntWritable; 9 | import org.apache.hadoop.io.LongWritable; 10 | import org.apache.hadoop.io.Text; 11 | import org.apache.hadoop.mapred.JobClient; 12 | import org.apache.hadoop.mapred.JobConf; 13 | import org.apache.hadoop.mapred.RunningJob; 14 | import org.apache.hadoop.mapreduce.Job; 15 | import org.apache.hadoop.mapreduce.Mapper; 16 | import org.apache.hadoop.mapreduce.Reducer; 17 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 18 | import org.apache.hadoop.mapreduce.lib.input.FileSplit; 19 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 20 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 21 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 22 | public class Step0 { 23 | public static class Step0Mapper extends Mapper { 24 | 25 | private String flag;// 26 | 27 | @Override 28 | protected void setup(Context context) throws IOException, 
InterruptedException { 29 | FileSplit split = (FileSplit) context.getInputSplit(); 30 | flag = split.getPath().getName();// 判断读的数据集 31 | 32 | // System.out.println(flag); 33 | } 34 | 35 | @Override 36 | public void map(LongWritable key, Text values, Context context) throws IOException, InterruptedException { 37 | 38 | String str=values.toString(); 39 | String[] line=Main.DELIMITER.split(str); 40 | if(line.length==0)return; 41 | if(flag.equals("mahoutA.txt")){ 42 | String userid=line[0]; 43 | for(int i=1;i { 68 | @Override 69 | public void reduce(Text key,Iterable values, Context context) throws IOException, InterruptedException { 70 | Map map=new HashMap(); 71 | Integer i=0; 72 | for(Text value:values){ 73 | String[] arr=Main.DELIMITER.split(value.toString()); 74 | i=i+1; 75 | if(arr[0].equals("job")) 76 | map.put(arr[0], arr[1]); 77 | else 78 | { 79 | map.put(i.toString(), arr[1]); 80 | } 81 | } 82 | String salary=map.get("job"); 83 | for(Map.Entry entry:map.entrySet()){ 84 | if(entry.getKey().equals("job"))continue; 85 | Text k=new Text(); 86 | Text v=new Text(); 87 | k.set(entry.getValue()); 88 | v.set(key.toString()+","+salary); 89 | context.write(k, v); 90 | } 91 | } 92 | } 93 | public static void run(Map path) throws IOException, ClassNotFoundException, InterruptedException { 94 | JobConf conf = Main.config(); 95 | 96 | String input = path.get("Step0In"); 97 | String output = path.get("Step0Out"); 98 | 99 | HdfsDAO hdfs = new HdfsDAO(Main.HDFS, conf); 100 | hdfs.rmr(output); 101 | //hdfs.copyFile(path.get("ToHdfsData1"), input); 102 | Job job = new Job(conf); 103 | job.setJarByClass(Step0.class); 104 | 105 | job.setOutputKeyClass(Text.class); 106 | job.setOutputValueClass(Text.class); 107 | 108 | job.setMapperClass(Step0Mapper.class); 109 | job.setReducerClass(Step0Reducer.class); 110 | 111 | job.setInputFormatClass(TextInputFormat.class); 112 | job.setOutputFormatClass(TextOutputFormat.class); 113 | 114 | FileInputFormat.setInputPaths(job, new Path(input)); 115 | FileOutputFormat.setOutputPath(job, new Path(output)); 116 | 117 | job.waitForCompletion(true); 118 | } 119 | } 120 | -------------------------------------------------------------------------------- /com.homework/src/week6/filterSalary/Step1.java: -------------------------------------------------------------------------------- 1 | package filterSalary; 2 | import java.io.IOException; 3 | import java.util.HashMap; 4 | import java.util.Iterator; 5 | import java.util.Map; 6 | import com.homework.hdfs.HdfsDAO; 7 | import org.apache.hadoop.fs.Path; 8 | import org.apache.hadoop.io.IntWritable; 9 | import org.apache.hadoop.io.LongWritable; 10 | import org.apache.hadoop.io.Text; 11 | import org.apache.hadoop.mapred.JobClient; 12 | import org.apache.hadoop.mapred.JobConf; 13 | import org.apache.hadoop.mapred.RunningJob; 14 | import org.apache.hadoop.mapreduce.Job; 15 | import org.apache.hadoop.mapreduce.Mapper; 16 | import org.apache.hadoop.mapreduce.Reducer; 17 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 18 | import org.apache.hadoop.mapreduce.lib.input.FileSplit; 19 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 20 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 21 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 22 | //第一步,MapReduce后产生:用户,浏览职位,职位薪水 23 | /* 1.map: 24 | * job.csv文件 25 | * key:jobid,value:job:job,salary 26 | * pv.csv文件 27 | * key:jobid,value:user,userid 28 | * 2.reduce: 29 | * key:userid, 30 | * value:jobid,salary 31 | * */ 32 | 33 | //第二步, 
MapReduce后产生: 用户,浏览过职位薪水相加*0.8 34 | 35 | //第三步:过滤: 推荐结果 > 用户浏览过职位平均薪水%80 . 36 | public class Step1 { 37 | 38 | public static class Step1Mapper extends Mapper { 39 | 40 | private String flag;// 41 | 42 | @Override 43 | protected void setup(Context context) throws IOException, InterruptedException { 44 | FileSplit split = (FileSplit) context.getInputSplit(); 45 | flag = split.getPath().getName();// 判断读的数据集 46 | 47 | // System.out.println(flag); 48 | } 49 | 50 | @Override 51 | public void map(LongWritable key, Text values, Context context) throws IOException, InterruptedException { 52 | 53 | String str=values.toString(); 54 | String[] line=Main.DELIMITER.split(str); 55 | if(line.length==0)return; 56 | if(flag.equals("pv.csv")){ 57 | String userid=line[0]; 58 | String jobid=line[1]; 59 | Text k=new Text(); 60 | Text v=new Text(); 61 | k.set(jobid); 62 | v.set("user,"+userid); 63 | context.write(k, v); 64 | } 65 | if(flag.equals("job.csv")){ 66 | if(line.length!=3)return; 67 | String jobid=line[0]; 68 | String salary=line[2]; 69 | Text k=new Text(); 70 | Text v=new Text(); 71 | k.set(jobid); 72 | v.set("job,"+salary); 73 | context.write(k, v); 74 | } 75 | 76 | 77 | } 78 | } 79 | public static class Step1Reducer extends Reducer { 80 | @Override 81 | public void reduce(Text key,Iterable values, Context context) throws IOException, InterruptedException { 82 | Map map=new HashMap(); 83 | Integer i=0; 84 | for(Text value:values){ 85 | String[] arr=Main.DELIMITER.split(value.toString()); 86 | i=i+1; 87 | if(arr[0].equals("job")) 88 | map.put(arr[0], arr[1]); 89 | else 90 | { 91 | map.put(i.toString(), arr[1]); 92 | } 93 | } 94 | String salary=map.get("job"); 95 | 96 | for(Map.Entry entry:map.entrySet()){ 97 | if(entry.getKey().equals("job"))continue; 98 | Text k=new Text(); 99 | Text v=new Text(); 100 | k.set(entry.getValue()); 101 | v.set(key.toString()+","+salary); 102 | context.write(k, v); 103 | } 104 | 105 | } 106 | } 107 | public static void run(Map path) throws IOException, ClassNotFoundException, InterruptedException { 108 | JobConf conf = Main.config(); 109 | 110 | String input = path.get("Step1In"); 111 | String output = path.get("Step1Out"); 112 | 113 | HdfsDAO hdfs = new HdfsDAO(Main.HDFS, conf); 114 | // hdfs.rmr(output); 115 | hdfs.rmr(output); 116 | hdfs.rmr(input); 117 | hdfs.mkdirs(input); 118 | hdfs.copyFile(path.get("ToHdfsData1"), input); 119 | hdfs.copyFile(path.get("ToHdfsData2"), input); 120 | 121 | Job job = new Job(conf); 122 | job.setJarByClass(Step1.class); 123 | 124 | job.setOutputKeyClass(Text.class); 125 | job.setOutputValueClass(Text.class); 126 | 127 | job.setMapperClass(Step1Mapper.class); 128 | job.setReducerClass(Step1Reducer.class); 129 | 130 | job.setInputFormatClass(TextInputFormat.class); 131 | job.setOutputFormatClass(TextOutputFormat.class); 132 | 133 | FileInputFormat.setInputPaths(job, new Path(input)); 134 | FileOutputFormat.setOutputPath(job, new Path(output)); 135 | 136 | job.waitForCompletion(true); 137 | } 138 | } 139 | -------------------------------------------------------------------------------- /com.homework/src/week6/filterSalary/Step2.java: -------------------------------------------------------------------------------- 1 | package filterSalary; 2 | import java.io.IOException; 3 | import java.util.HashMap; 4 | import java.util.Iterator; 5 | import java.util.Map; 6 | 7 | import com.homework.hdfs.HdfsDAO; 8 | 9 | import org.apache.hadoop.fs.Path; 10 | import org.apache.hadoop.io.IntWritable; 11 | import org.apache.hadoop.io.LongWritable; 12 | 
import org.apache.hadoop.io.Text; 13 | import org.apache.hadoop.mapred.JobClient; 14 | import org.apache.hadoop.mapred.JobConf; 15 | import org.apache.hadoop.mapred.RunningJob; 16 | import org.apache.hadoop.mapreduce.Job; 17 | import org.apache.hadoop.mapreduce.Mapper; 18 | import org.apache.hadoop.mapreduce.Reducer; 19 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 20 | import org.apache.hadoop.mapreduce.lib.input.FileSplit; 21 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 22 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 23 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 24 | //第二步, MapReduce后产生: 用户,浏览过职位薪水相加*0.8 25 | /*1.map 26 | key:userid 27 | value:salary 28 | 2.reduce 29 | key:userid 30 | value:平均薪水*0.8*/ 31 | public class Step2 { 32 | 33 | public static class Step2Mapper extends Mapper { 34 | 35 | @Override 36 | public void map(LongWritable key, Text values, Context context) throws IOException, InterruptedException { 37 | 38 | String str=values.toString(); 39 | String[] line=Main.DELIMITER.split(str); 40 | if(line.length==0)return; 41 | String userid=line[0]; 42 | String salary=line[2]; 43 | Text k=new Text(); 44 | Text v=new Text(); 45 | k.set(userid); 46 | v.set(salary); 47 | context.write(k, v); 48 | } 49 | } 50 | public static class Step2Reducer extends Reducer { 51 | @Override 52 | public void reduce(Text key,Iterable values, Context context) throws IOException, InterruptedException { 53 | Integer i=0; 54 | Double sum=0.0; 55 | for(Text value:values){ 56 | i=i+1; 57 | Double val=Double.valueOf(value.toString()); 58 | sum=sum+val; 59 | } 60 | Double average=sum/i; 61 | Double va=average*0.8; 62 | 63 | Text v=new Text(); 64 | v.set(va.toString()); 65 | context.write(key, v); 66 | } 67 | } 68 | public static void run(Map path) throws IOException, ClassNotFoundException, InterruptedException { 69 | JobConf conf = Main.config(); 70 | 71 | String input = path.get("Step2In"); 72 | String output = path.get("Step2Out"); 73 | 74 | HdfsDAO hdfs = new HdfsDAO(Main.HDFS, conf); 75 | hdfs.rmr(output); 76 | Job job = new Job(conf); 77 | job.setJarByClass(Step2.class); 78 | 79 | job.setOutputKeyClass(Text.class); 80 | job.setOutputValueClass(Text.class); 81 | 82 | job.setMapperClass(Step2Mapper.class); 83 | job.setReducerClass(Step2Reducer.class); 84 | 85 | job.setInputFormatClass(TextInputFormat.class); 86 | job.setOutputFormatClass(TextOutputFormat.class); 87 | 88 | FileInputFormat.setInputPaths(job, new Path(input)); 89 | FileOutputFormat.setOutputPath(job, new Path(output)); 90 | 91 | job.waitForCompletion(true); 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /com.homework/src/week6/filterSalary/Step3.java: -------------------------------------------------------------------------------- 1 | package filterSalary; 2 | import java.io.IOException; 3 | import java.util.HashMap; 4 | import java.util.Iterator; 5 | import java.util.Map; 6 | import com.homework.hdfs.HdfsDAO; 7 | import org.apache.hadoop.fs.Path; 8 | import org.apache.hadoop.io.IntWritable; 9 | import org.apache.hadoop.io.LongWritable; 10 | import org.apache.hadoop.io.Text; 11 | import org.apache.hadoop.mapred.JobClient; 12 | import org.apache.hadoop.mapred.JobConf; 13 | import org.apache.hadoop.mapred.RunningJob; 14 | import org.apache.hadoop.mapreduce.Job; 15 | import org.apache.hadoop.mapreduce.Mapper; 16 | import org.apache.hadoop.mapreduce.Reducer; 17 | import 
org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 18 | import org.apache.hadoop.mapreduce.lib.input.FileSplit; 19 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 20 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 21 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 22 | public class Step3 { 23 | public static class Step3Mapper extends Mapper { 24 | 25 | private String flag;// 26 | 27 | @Override 28 | protected void setup(Context context) throws IOException, InterruptedException { 29 | FileSplit split = (FileSplit) context.getInputSplit(); 30 | flag = split.getPath().getParent().getName();// 判断读的数据集 31 | 32 | // System.out.println(flag); 33 | } 34 | 35 | @Override 36 | public void map(LongWritable key, Text values, Context context) throws IOException, InterruptedException { 37 | 38 | String str=values.toString(); 39 | String[] line=Main.DELIMITER.split(str); 40 | if(line.length==0)return; 41 | if(flag.equals("step0Out")){ 42 | String userid=line[0]; 43 | String jobid=line[1]; 44 | String salary=line[2]; 45 | Text k=new Text(); 46 | Text v=new Text(); 47 | k.set(userid); 48 | v.set(jobid+","+salary); 49 | context.write(k, v); 50 | } 51 | if(flag.equals("step2Out")){ 52 | if(line.length!=2)return; 53 | String userid=line[0]; 54 | String salary=line[1]; 55 | Text k=new Text(); 56 | Text v=new Text(); 57 | k.set(userid); 58 | v.set("average,"+salary); 59 | context.write(k, v); 60 | } 61 | 62 | 63 | } 64 | } 65 | public static class Step3Reducer extends Reducer { 66 | @Override 67 | public void reduce(Text key,Iterable values, Context context) throws IOException, InterruptedException { 68 | Map map=new HashMap(); 69 | Integer i=0; 70 | for(Text value:values){ 71 | String[] arr=Main.DELIMITER.split(value.toString()); 72 | i=i+1; 73 | if(arr[0].equals("average")) 74 | map.put(arr[0], arr[1]); 75 | else 76 | { 77 | map.put(arr[0], arr[1]); 78 | } 79 | } 80 | String salary=map.get("average"); 81 | Double average=Double.valueOf(salary); 82 | StringBuilder sb=new StringBuilder(); 83 | for(Map.Entry entry:map.entrySet()){ 84 | if(entry.getKey().equals("average"))continue; 85 | 86 | Double val=Double.valueOf(entry.getValue()); 87 | if(val>=average){ 88 | sb.append("(推荐职位ID:"+entry.getKey()+",薪水:"+entry.getValue()+"),"); 89 | } 90 | 91 | } 92 | String result = String.format("%.2f", average); 93 | if(sb.length()>1){ 94 | sb.append("(%80平均薪水:"+result+")"); 95 | } 96 | Text k=new Text(); 97 | Text v=new Text(); 98 | k.set("用户:"+key); 99 | v.set(sb.toString()); 100 | context.write(k, v); 101 | 102 | } 103 | } 104 | public static void run(Map path) throws IOException, ClassNotFoundException, InterruptedException { 105 | JobConf conf = Main.config(); 106 | 107 | String input1 = path.get("Step3In1"); 108 | String input2 = path.get("Step3In2"); 109 | String output = path.get("Step3Out"); 110 | 111 | HdfsDAO hdfs = new HdfsDAO(Main.HDFS, conf); 112 | // hdfs.rmr(output); 113 | hdfs.rmr(output); 114 | 115 | 116 | Job job = new Job(conf); 117 | job.setJarByClass(Step3.class); 118 | 119 | job.setOutputKeyClass(Text.class); 120 | job.setOutputValueClass(Text.class); 121 | 122 | job.setMapperClass(Step3Mapper.class); 123 | job.setReducerClass(Step3Reducer.class); 124 | 125 | job.setInputFormatClass(TextInputFormat.class); 126 | job.setOutputFormatClass(TextOutputFormat.class); 127 | 128 | FileInputFormat.setInputPaths(job, new Path(input1),new Path(input2)); 129 | FileOutputFormat.setOutputPath(job, new Path(output)); 130 | 131 | job.waitForCompletion(true); 132 | 
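// Note: Job.waitForCompletion(true) returns a success flag that is ignored here; checking it
// (for example, throwing an IOException when it is false) would let a failed Step3 surface to the caller.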
} 133 | } 134 | -------------------------------------------------------------------------------- /com.homework/src/week6/filterSalary/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | */ 4 | /** 5 | * @author Administrator 6 | * 7 | */ 8 | package filterSalary; -------------------------------------------------------------------------------- /com.homework/src/week6/recommendJob/ItemLoglikelihood.java: -------------------------------------------------------------------------------- 1 | package recommendJob; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileReader; 6 | import java.io.IOException; 7 | import java.text.ParseException; 8 | import java.text.SimpleDateFormat; 9 | import java.util.Date; 10 | import java.util.HashSet; 11 | import java.util.List; 12 | import java.util.Set; 13 | 14 | import org.apache.mahout.cf.taste.common.TasteException; 15 | import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator; 16 | import org.apache.mahout.cf.taste.impl.model.GenericBooleanPrefDataModel; 17 | import org.apache.mahout.cf.taste.impl.model.file.FileDataModel; 18 | import org.apache.mahout.cf.taste.impl.recommender.GenericBooleanPrefItemBasedRecommender; 19 | import org.apache.mahout.cf.taste.impl.similarity.LogLikelihoodSimilarity; 20 | import org.apache.mahout.cf.taste.model.DataModel; 21 | import org.apache.mahout.cf.taste.recommender.IDRescorer; 22 | import org.apache.mahout.cf.taste.recommender.RecommendedItem; 23 | import org.apache.mahout.cf.taste.recommender.Recommender; 24 | import org.apache.mahout.cf.taste.similarity.ItemSimilarity; 25 | import org.apache.mahout.cf.taste.similarity.UserSimilarity; 26 | 27 | public class ItemLoglikelihood { 28 | 29 | final static int neighborhoodNum=2; 30 | final static int recommendNum=3; 31 | public static void main(String[] args) throws TasteException, IOException { 32 | String file="datafile/week6/pv.csv"; 33 | DataModel dataModel=new GenericBooleanPrefDataModel(GenericBooleanPrefDataModel.toDataMap(new FileDataModel(new File(file)))); 34 | ItemSimilarity itemSimilarity=new LogLikelihoodSimilarity(dataModel); 35 | Recommender recommender=new GenericBooleanPrefItemBasedRecommender(dataModel,itemSimilarity); 36 | 37 | LongPrimitiveIterator iterator=dataModel.getUserIDs(); 38 | while(iterator.hasNext()){ 39 | long uid=iterator.nextLong(); 40 | Set jobids = getOutdateJobID("datafile/week6/job.csv"); 41 | IDRescorer rescorer = new JobRescorer(jobids); 42 | List list=recommender.recommend(uid, recommendNum,rescorer); 43 | //System.out.printf("uid:%s", uid); 44 | System.out.printf("%s",uid); 45 | for(RecommendedItem ritem:list){ 46 | //System.out.printf("(%s,%f)", ritem.getItemID(), ritem.getValue()); 47 | System.out.printf("%s",","+ ritem.getItemID()); 48 | 49 | } 50 | System.out.println(); 51 | } 52 | } 53 | 54 | public static Set getOutdateJobID(String file) throws IOException { 55 | BufferedReader br = new BufferedReader(new FileReader(new File(file))); 56 | Set jobids = new HashSet(); 57 | String s = null; 58 | while ((s = br.readLine()) != null) { 59 | String[] cols = s.split(","); 60 | SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd"); 61 | Date date = null; 62 | try { 63 | date = df.parse(cols[1]); 64 | if (date.getTime() < df.parse("2013-01-01").getTime()) { 65 | jobids.add(Long.parseLong(cols[0])); 66 | } 67 | } catch (ParseException e) { 68 | e.printStackTrace(); 69 | } 70 | 71 | } 72 | br.close(); 73 | return jobids; 74 | } 
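// The job ids collected above (jobs posted before 2013-01-01) are handed to the JobRescorer below:
// its rescore() returns Double.NaN and isFiltered() returns true for them, so the Mahout recommender
// leaves those outdated jobs out of the recommendation list.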
75 | 76 | } 77 | class JobRescorer implements IDRescorer { 78 | final private Set jobids; 79 | 80 | public JobRescorer(Set jobs) { 81 | this.jobids = jobs; 82 | } 83 | 84 | @Override 85 | public double rescore(long id, double originalScore) { 86 | return isFiltered(id) ? Double.NaN : originalScore; 87 | } 88 | 89 | @Override 90 | public boolean isFiltered(long id) { 91 | return jobids.contains(id); 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /com.homework/src/week6/recommendJob/UserCityBlock.java: -------------------------------------------------------------------------------- 1 | package recommendJob; 2 | 3 | 4 | import java.io.BufferedReader; 5 | import java.io.File; 6 | import java.io.FileReader; 7 | import java.io.IOException; 8 | import java.util.HashMap; 9 | import java.util.HashSet; 10 | import java.util.List; 11 | import java.util.Map; 12 | import java.util.Set; 13 | 14 | import org.apache.mahout.cf.taste.common.TasteException; 15 | import org.apache.mahout.cf.taste.impl.common.FastIDSet; 16 | import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator; 17 | import org.apache.mahout.cf.taste.model.DataModel; 18 | import org.apache.mahout.cf.taste.recommender.IDRescorer; 19 | import org.apache.mahout.cf.taste.recommender.RecommendedItem; 20 | import org.apache.mahout.cf.taste.impl.model.GenericBooleanPrefDataModel; 21 | import org.apache.mahout.cf.taste.impl.model.file.FileDataModel; 22 | import org.apache.mahout.cf.taste.impl.neighborhood.NearestNUserNeighborhood; 23 | import org.apache.mahout.cf.taste.impl.recommender.GenericBooleanPrefUserBasedRecommender; 24 | import org.apache.mahout.cf.taste.impl.similarity.CityBlockSimilarity; 25 | import org.apache.mahout.cf.taste.neighborhood.UserNeighborhood; 26 | import org.apache.mahout.cf.taste.recommender.Recommender; 27 | import org.apache.mahout.cf.taste.similarity.UserSimilarity; 28 | 29 | public class UserCityBlock { 30 | 31 | final static int neighborhoodNum=2; 32 | final static int recommendNum=3; 33 | 34 | public static void main(String[] args) throws TasteException, IOException { 35 | String file="datafile/week6/pv.csv"; 36 | DataModel dataModel=new GenericBooleanPrefDataModel(GenericBooleanPrefDataModel.toDataMap(new FileDataModel(new File(file)))); 37 | UserSimilarity userSimilarity=new CityBlockSimilarity(dataModel); 38 | UserNeighborhood userNeighborhood=new NearestNUserNeighborhood(neighborhoodNum,userSimilarity,dataModel); 39 | Recommender recommender=new GenericBooleanPrefUserBasedRecommender(dataModel, userNeighborhood, userSimilarity); 40 | LongPrimitiveIterator iterator=dataModel.getUserIDs(); 41 | Map averSalary = getAverSalary("datafile/week6/job.csv", dataModel); 42 | while(iterator.hasNext()){ 43 | long uid=iterator.nextLong(); 44 | 45 | Set jobids = getSalaryJobID(uid, "datafile/week6/job.csv", averSalary); 46 | IDRescorer rescorer = new JobRescorer(jobids); 47 | 48 | List list=recommender.recommend(uid, recommendNum,rescorer); 49 | System.out.printf("uid:%s", uid); 50 | for(RecommendedItem ritem:list){ 51 | System.out.printf("(%s,%f)", ritem.getItemID(), ritem.getValue()); 52 | } 53 | System.out.println(); 54 | } 55 | } 56 | 57 | public static Set getSalaryJobID(long uid, String file, Map averSalary) throws IOException { 58 | BufferedReader br = new BufferedReader(new FileReader(new File(file))); 59 | Set jobids = new HashSet(); 60 | String s = null; 61 | while ((s = br.readLine()) != null) { 62 | String[] cols = s.split(","); 63 | double salary = 
Double.valueOf(cols[2]); 64 | if (salary < averSalary.get(uid)) { 65 | jobids.add(Long.parseLong(cols[0])); 66 | } 67 | } 68 | br.close(); 69 | return jobids; 70 | } 71 | 72 | // 获取每个用户的基准比较工资:aver(浏览过的工资)*0.8 73 | public static Map getAverSalary(String file, DataModel dataModel) 74 | throws NumberFormatException, IOException, TasteException{ 75 | Map salaries = new HashMap(); 76 | BufferedReader br = new BufferedReader(new FileReader(new File(file))); 77 | String s = null; 78 | while ((s = br.readLine()) != null) { 79 | String[] cols = s.split(","); 80 | salaries.put(Long.parseLong(cols[0]), Double.valueOf(cols[2])); 81 | } 82 | br.close(); 83 | 84 | 85 | Map averSalaries = new HashMap(); 86 | LongPrimitiveIterator iter = dataModel.getUserIDs(); 87 | while (iter.hasNext()) { 88 | long uid = iter.nextLong(); 89 | FastIDSet items = dataModel.getItemIDsFromUser(uid); 90 | LongPrimitiveIterator itemsIter = items.iterator(); 91 | double sum = 0; 92 | int count = 0; 93 | double aver = 0.0; 94 | while (itemsIter.hasNext()) { 95 | long item = itemsIter.nextLong(); 96 | double salary = salaries.get(item); 97 | sum += salary; 98 | count ++; 99 | } 100 | if(count > 0) aver = 0.8*sum/count; 101 | averSalaries.put(uid, aver); 102 | } 103 | return averSalaries; 104 | } 105 | 106 | 107 | } 108 | -------------------------------------------------------------------------------- /com.homework/src/week6/recommendJob/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | */ 4 | /** 5 | * @author Administrator 6 | * 7 | */ 8 | package recommendJob; -------------------------------------------------------------------------------- /com.homework/src/week6/test/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | */ 4 | /** 5 | * @author Administrator 6 | * 7 | */ 8 | package test; -------------------------------------------------------------------------------- /com.homework/src/week7/classfier/Main.java: -------------------------------------------------------------------------------- 1 | package classfier; 2 | 3 | import java.util.HashMap; 4 | import java.util.Map; 5 | import java.util.regex.Pattern; 6 | 7 | import org.apache.hadoop.mapred.JobConf; 8 | //第一步,MapReduce后产生:用户,浏览职位,职位薪水 9 | /* 1.map: 10 | * job.csv文件 11 | * key:jobid,value:job:job,salary 12 | * pv.csv文件 13 | * key:jobid,value:user,userid 14 | * 2.reduce: 15 | * key:userid, 16 | * value:jobid,salary 17 | * */ 18 | 19 | //第二步, MapReduce后产生: 用户,浏览过职位薪水相加*0.8 20 | 21 | //第三步:过滤: 推荐结果 > 用户浏览过职位平均薪水%80 . 
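//
// Note: the comments above were carried over from week6; what this Main actually drives is
// PaodingFirst below. For every input document it emits (parent folder name, space-separated
// tokens produced by the Paoding Chinese analyzer), with JamesInputFormat combining the many
// small input files into larger splits. For example, a file stored under in/sports/ would come
// out roughly as
//   sports <tab> word1 word2 word3 ...
// ("sports" is only an illustrative folder/category name, not necessarily one in the real data).
//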
22 | public class Main { 23 | public static final String HDFS = "hdfs://10.3.7.201:9000/user/hdfs/week7/"; 24 | public static final Pattern DELIMITER = Pattern.compile("[\t,]"); 25 | 26 | public static final String PaodingFirstIn = HDFS+"in/"; 27 | public static final String PaodingFirstOut = HDFS+"out/"; 28 | 29 | 30 | 31 | public static void main(String[] args) throws Exception { 32 | 33 | Map path = new HashMap(); 34 | 35 | 36 | path.put("PaodingFirstIn", PaodingFirstIn); 37 | path.put("PaodingFirstOut", PaodingFirstOut); 38 | 39 | 40 | PaodingFirst.run(path); 41 | //Step3.run(path); 42 | 43 | System.exit(0); 44 | } 45 | 46 | public static JobConf config() { 47 | JobConf conf = new JobConf(Main.class); 48 | 49 | conf.setJobName("Main"); 50 | //conf.addResource("classpath:/hadoop/core-site.xml"); 51 | //conf.addResource("classpath:/hadoop/hdfs-site.xml"); 52 | //conf.addResource("classpath:/hadoop/mapred-site.xml"); 53 | conf.set("io.sort.mb", "1024"); 54 | return conf; 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /com.homework/src/week7/classfier/PaodingFirst.java: -------------------------------------------------------------------------------- 1 | package classfier; 2 | import java.io.IOException; 3 | import java.io.StringReader; 4 | import java.util.HashMap; 5 | import java.util.Iterator; 6 | import java.util.Map; 7 | 8 | import com.homework.hdfs.HdfsDAO; 9 | 10 | 11 | 12 | import myInputFormat.JamesInputFormat; 13 | import net.paoding.analysis.analyzer.PaodingAnalyzer; 14 | 15 | import org.apache.hadoop.fs.FileStatus; 16 | import org.apache.hadoop.fs.FileSystem; 17 | import org.apache.hadoop.fs.Path; 18 | import org.apache.hadoop.io.IntWritable; 19 | import org.apache.hadoop.io.LongWritable; 20 | import org.apache.hadoop.io.Text; 21 | import org.apache.hadoop.mapred.JobClient; 22 | import org.apache.hadoop.mapred.JobConf; 23 | import org.apache.hadoop.mapred.RunningJob; 24 | import org.apache.hadoop.mapreduce.Job; 25 | import org.apache.hadoop.mapreduce.Mapper; 26 | import org.apache.hadoop.mapreduce.Reducer; 27 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 28 | import org.apache.hadoop.mapreduce.lib.input.FileSplit; 29 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 30 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 31 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 32 | import org.apache.lucene.analysis.TokenStream; 33 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 34 | public class PaodingFirst { 35 | public static class PaodingFirstMapper extends Mapper { 36 | 37 | private String flag;// 38 | PaodingAnalyzer analyzer = new PaodingAnalyzer(); 39 | Text v=new Text(); 40 | Text k=new Text(); 41 | @Override 42 | protected void setup(Context context) throws IOException, InterruptedException { 43 | FileSplit split = (FileSplit) context.getInputSplit(); 44 | flag = split.getPath().getParent().getName();// 判断读的数据集 45 | 46 | // System.out.println(flag); 47 | } 48 | 49 | @Override 50 | public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { 51 | 52 | k.set(flag); 53 | PaodingAnalyzer analyzer=new PaodingAnalyzer(); 54 | StringReader sr=new StringReader(value.toString()); 55 | TokenStream ts=analyzer.tokenStream("", sr); 56 | StringBuilder sb=new StringBuilder(); 57 | try{ 58 | while(ts.incrementToken()){ 59 | CharTermAttribute ta=ts.getAttribute(CharTermAttribute.class); 60 | 
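// ta holds the text of the current token from the Paoding analyzer; tokens are joined with
// single spaces so the map output value becomes one space-separated term list per document.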
sb.append(ta.toString()); 61 | sb.append(" "); 62 | System.out.print(ta.toString()+" "); 63 | } 64 | }catch(Exception e){ 65 | 66 | } 67 | System.out.println(); 68 | v.set(sb.toString()); 69 | context.write(k, v); 70 | } 71 | } 72 | 73 | 74 | public static void run(Map path) throws IOException, ClassNotFoundException, InterruptedException { 75 | JobConf conf = Main.config(); 76 | conf.set("dfs.permissions","false"); 77 | conf.setInt("mapred.min.split.size", 1); 78 | //conf.set("mapred.job.tracker", "[192.168.0.200]:9001"); 79 | conf.setLong("mapreduce.input.fileinputformat.split.maxsize", 4000000); //max size of Split 80 | String input = path.get("PaodingFirstIn"); 81 | String output = path.get("PaodingFirstOut"); 82 | 83 | HdfsDAO hdfs = new HdfsDAO(Main.HDFS, conf); 84 | hdfs.rmr(output); 85 | //hdfs.copyFile(path.get("ToHdfsData1"), input); 86 | Job job = new Job(conf); 87 | job.setJarByClass(PaodingFirst.class); 88 | 89 | job.setOutputKeyClass(Text.class); 90 | job.setOutputValueClass(Text.class); 91 | 92 | job.setMapperClass(PaodingFirstMapper.class); 93 | 94 | 95 | job.setInputFormatClass(JamesInputFormat.class); 96 | job.setOutputFormatClass(TextOutputFormat.class); 97 | 98 | Path inpath= new Path(input); 99 | try { // input path 100 | FileSystem fs = inpath.getFileSystem(conf); 101 | FileStatus[] stats = fs.listStatus(inpath); 102 | for(int i=0; i{ 13 | @Override 14 | public RecordReader createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException { 15 | 16 | CombineFileSplit combineFileSplit = (CombineFileSplit) split; 17 | CombineFileRecordReader recordReader = new CombineFileRecordReader(combineFileSplit, context, JamesRecordReader.class); 18 | try { 19 | recordReader.initialize(combineFileSplit, context); 20 | } catch (InterruptedException e) { 21 | new RuntimeException("Error to initialize CombineSmallfileRecordReader."); 22 | } 23 | return recordReader; 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /com.homework/src/week7/myInputFormat/JamesRecordReader.java: -------------------------------------------------------------------------------- 1 | package myInputFormat; 2 | import java.io.IOException; 3 | 4 | import org.apache.hadoop.fs.Path; 5 | import org.apache.hadoop.io.BytesWritable; 6 | import org.apache.hadoop.io.LongWritable; 7 | import org.apache.hadoop.mapreduce.InputSplit; 8 | import org.apache.hadoop.mapreduce.RecordReader; 9 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 10 | import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit; 11 | import org.apache.hadoop.mapreduce.lib.input.FileSplit; 12 | import org.apache.hadoop.mapreduce.lib.input.LineRecordReader; 13 | public class JamesRecordReader extends RecordReader{ 14 | private CombineFileSplit combineFileSplit; 15 | private LineRecordReader lineRecordReader = new LineRecordReader(); 16 | private Path[] paths; 17 | private int totalLength; 18 | private int currentIndex; 19 | private float currentProgress = 0; 20 | private LongWritable currentKey; 21 | private BytesWritable currentValue = new BytesWritable();; 22 | 23 | public JamesRecordReader(CombineFileSplit combineFileSplit, TaskAttemptContext context, Integer index) throws IOException { 24 | super(); 25 | this.combineFileSplit = combineFileSplit; 26 | this.currentIndex = index; // 当前要处理的小文件Block在CombineFileSplit中的索引 27 | } 28 | 29 | @Override 30 | public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException { 31 | 
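// CombineFileRecordReader creates one JamesRecordReader per small-file block, passing the block
// index to the (CombineFileSplit, TaskAttemptContext, Integer) constructor above; initialize()
// then wraps the block at currentIndex in a FileSplit so a plain LineRecordReader can read it.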
this.combineFileSplit = (CombineFileSplit) split; 32 | // Handle one small-file block of the CombineFileSplit: because a LineRecordReader is used, a FileSplit for that block has to be constructed before the data can be read 33 | FileSplit fileSplit = new FileSplit(combineFileSplit.getPath(currentIndex), combineFileSplit.getOffset(currentIndex), combineFileSplit.getLength(currentIndex), combineFileSplit.getLocations()); 34 | lineRecordReader.initialize(fileSplit, context); 35 | 36 | this.paths = combineFileSplit.getPaths(); 37 | totalLength = paths.length; 38 | context.getConfiguration().set("map.input.file.name", combineFileSplit.getPath(currentIndex).getName()); 39 | } 40 | 41 | @Override 42 | public LongWritable getCurrentKey() throws IOException, InterruptedException { 43 | currentKey = lineRecordReader.getCurrentKey(); 44 | return currentKey; 45 | } 46 | 47 | @Override 48 | public BytesWritable getCurrentValue() throws IOException, InterruptedException { 49 | byte[] content = lineRecordReader.getCurrentValue().getBytes(); 50 | currentValue.set(content, 0, content.length); 51 | return currentValue; 52 | } 53 | 54 | @Override 55 | public boolean nextKeyValue() throws IOException, InterruptedException { 56 | if (currentIndex >= 0 && currentIndex < totalLength) { 57 | return lineRecordReader.nextKeyValue(); 58 | } else { 59 | return false; 60 | } 61 | } 62 | 63 | @Override 64 | public float getProgress() throws IOException { 65 | if (currentIndex >= 0 && currentIndex < totalLength) { 66 | currentProgress = (float) currentIndex / totalLength; 67 | return currentProgress; 68 | } 69 | return currentProgress; 70 | } 71 | 72 | @Override 73 | public void close() throws IOException { 74 | lineRecordReader.close(); 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /com.homework/src/week7/myInputFormat/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | */ 4 | /** 5 | * @author Administrator 6 | * 7 | */ 8 | package myInputFormat; -------------------------------------------------------------------------------- /com.homework/src/week8/mrclassify/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * See the week8.rar file in the scripts folder; it was written with mahout 0.6 and is not supported in the 0.8 environment used here. (git test) 3 | */ 4 | /** 5 | * @author Administrator 6 | * 7 | */ 8 | package mrclassify; --------------------------------------------------------------------------------
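For reference, the item co-occurrence matrix that week5's Step2 builds from Step1's per-user item lists can be illustrated with a small stand-alone sketch. The class below is not part of the repository: the two input lines are the sample rows documented in Step1.java, but the CooccurrenceSketch class name and the "itemA:itemB" pair-key format are only assumptions made for illustration.

import java.util.HashMap;
import java.util.Map;

public class CooccurrenceSketch {
    public static void main(String[] args) {
        // Sample rows in the shape of Step1's output: userid <tab> item:score,item:score,...
        String[] step1Output = {
            "1\t102:3.0,103:2.5,101:5.0",
            "2\t101:2.0,102:2.5,103:5.0,104:2.0"
        };
        Map<String, Integer> counts = new HashMap<String, Integer>();
        for (String line : step1Output) {
            String[] tokens = line.split("[\t,]");      // same delimiter set as MainPodium.DELIMITER
            for (int i = 1; i < tokens.length; i++) {   // tokens[0] is the user id
                String itemA = tokens[i].split(":")[0];
                for (int j = 1; j < tokens.length; j++) {
                    String itemB = tokens[j].split(":")[0];
                    String pair = itemA + ":" + itemB;  // assumed pair-key format, e.g. "101:102"
                    Integer old = counts.get(pair);
                    counts.put(pair, old == null ? 1 : old + 1);
                }
            }
        }
        // Both sample users viewed 101 and 102, so counts.get("101:102") is 2.
        System.out.println(counts);
    }
}

In the MapReduce version, the mapper would emit each pair with a count of 1 and the reducer (which Step2 also registers as a combiner) would do the summing.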