├── .gitattributes ├── .gitignore ├── Hadoop ├── README.md ├── WebKPI │ ├── KPIJob.java │ ├── KPI_OneIP_Sum.java │ ├── KPI_OnePV_Sum.java │ ├── KPI_OneRequest_Sum.java │ ├── KPI_OneSource_Sum.java │ ├── KPI_OneTime_Sum.java │ ├── KPIfilter.java │ └── hdfsGYT.java ├── bookTuijian │ ├── Step1.java │ ├── Step2.java │ ├── Step3_1.java │ ├── Step3_2.java │ ├── Step4.java │ ├── Step4_Updata.java │ ├── Step4_Updata2.java │ ├── bookRecommend.java │ ├── hdfsGYT.java │ ├── score.txt │ └── uid_to_bid.csv ├── hdfs │ └── hdfsGYT.java ├── multiple_In_Out │ ├── mulIn-output │ │ └── part-r-00000 │ ├── mulOut-output │ │ ├── china-r-00000 │ │ ├── cpeople-r-00000 │ │ └── usa-r-00000 │ ├── multipleinout.java │ ├── multipleinput_input │ ├── multipleoutput.java │ └── multipleoutput_input ├── pagerankjisuan │ ├── dataEtl.java │ ├── hdfsGYT.java │ ├── prJisuan.java │ ├── prMatrix.java │ ├── prNormal.java │ ├── prSort.java │ └── prjob.java ├── selfSort │ ├── input │ ├── output │ └── selfSort.java ├── sort_twice │ ├── Intpair.java │ ├── groupingComparator.java │ ├── input │ ├── myPartition.java │ ├── output │ ├── sort_twice.jar │ └── sort_twice.java ├── wordcount │ └── wordcount.java └── 二次排序 │ ├── blogURL.txt │ ├── part-r-00000 │ ├── sortTwice.jar │ ├── sortTwice.txt │ └── sorttwice │ ├── IntPair.java │ └── sortTwice.java ├── Hbase └── README.md ├── Hive ├── README.md └── hiveTableExample │ ├── complex_student │ ├── complex_student~ │ ├── external_student │ ├── hiveQL │ ├── hiveQL~ │ ├── partiton_student │ ├── partiton_student 2 │ ├── partiton_student 2~ │ ├── partiton_student2 │ ├── partiton_student~ │ ├── student.txt │ └── student.txt~ ├── Java ├── Dataguru算法导论 │ ├── BitTree │ │ └── tree.java │ ├── Graph │ │ ├── BFS.java │ │ ├── DFS.java │ │ ├── Dijkstra.java │ │ └── GraphTest.java │ ├── Hash │ │ └── hash.java │ ├── Link │ │ ├── DoubleLink.java │ │ ├── DoubleLinkTest.java │ │ ├── Link.java │ │ └── linkTest.java │ ├── Matrix │ │ ├── matrixCheng.java │ │ └── maxArr.java │ ├── Queue │ │ └── Queue.java │ ├── Statck │ │ ├── Statck1.java │ │ └── Statck2.java │ ├── TestCode │ │ ├── BitTreeExample.java │ │ ├── HashTableExample.java │ │ ├── fenZhiTest.java │ │ └── guibingTest.java │ └── sort │ │ ├── duiSort.java │ │ ├── guibing.java │ │ ├── insertSort.java │ │ └── quickSort.java └── 一些小项目 │ └── README.md ├── Mahout └── README.md ├── README.md ├── Spark ├── ChineseWordSplitCount │ ├── WordAnalyzer jar包链接.txt │ ├── blog href.txt │ └── wordSplitCount.py ├── PageRank │ └── Jar包链接.txt ├── README.md └── pairRDD │ ├── driver │ ├── example │ ├── example~ │ └── sample └── cluster_conf ├── README.md ├── master1 ├── core-site.xml ├── hdfs-site.xml ├── mapred-site.xml └── yarn-site.xml ├── slave1 ├── core-site.xml ├── hdfs-site.xml ├── mapred-site.xml └── yarn-site.xml └── slave2 ├── core-site.xml ├── hdfs-site.xml ├── mapred-site.xml └── yarn-site.xml /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | 7 | # Standard to msysgit 8 | *.doc diff=astextplain 9 | *.DOC diff=astextplain 10 | *.docx diff=astextplain 11 | *.DOCX diff=astextplain 12 | *.dot diff=astextplain 13 | *.DOT diff=astextplain 14 | *.pdf diff=astextplain 15 | *.PDF diff=astextplain 16 | *.rtf diff=astextplain 17 | *.RTF diff=astextplain 18 | -------------------------------------------------------------------------------- /.gitignore: 
--------------------------------------------------------------------------------
 1 | # Windows image file caches
 2 | Thumbs.db
 3 | ehthumbs.db
 4 | 
 5 | # Folder config file
 6 | Desktop.ini
 7 | 
 8 | # Recycle Bin used on file shares
 9 | $RECYCLE.BIN/
10 | 
11 | # Windows Installer files
12 | *.cab
13 | *.msi
14 | *.msm
15 | *.msp
16 | 
17 | # Windows shortcuts
18 | *.lnk
19 | 
20 | # =========================
21 | # Operating System Files
22 | # =========================
23 | 
24 | # OSX
25 | # =========================
26 | 
27 | .DS_Store
28 | .AppleDouble
29 | .LSOverride
30 | 
31 | # Thumbnails
32 | ._*
33 | 
34 | # Files that might appear in the root of a volume
35 | .DocumentRevisions-V100
36 | .fseventsd
37 | .Spotlight-V100
38 | .TemporaryItems
39 | .Trashes
40 | .VolumeIcon.icns
41 | 
42 | # Directories potentially created on remote AFP share
43 | .AppleDB
44 | .AppleDesktop
45 | Network Trash Folder
46 | Temporary Items
47 | .apdisk
48 | 
--------------------------------------------------------------------------------
/Hadoop/README.md:
--------------------------------------------------------------------------------
 1 | This directory mainly hosts my code for the two Hadoop sub-systems, HDFS and MapReduce. The code quality is not always high, but I try my best to write each piece of code well.
 2 | 
 3 | An example of wrapping HDFS operations in Java; the corresponding blog post: http://blog.csdn.net/gamer_gyt/article/details/50985606
 4 | 
--------------------------------------------------------------------------------
/Hadoop/WebKPI/KPIJob.java:
--------------------------------------------------------------------------------
 1 | package WebKPI;
 2 | 
 3 | import java.io.IOException;
 4 | import java.net.URISyntaxException;
 5 | import java.util.HashMap;
 6 | import java.util.Map;
 7 | 
 8 | public class KPIJob {
 9 |     //global constant: the HDFS address URL
10 |     public static final String HDFS = "hdfs://127.0.0.1:9000";
11 | 
12 |     public static void main(String[] args) throws ClassNotFoundException, IOException, InterruptedException, URISyntaxException {
13 |         //a map holding every path the program needs
14 |         Map<String, String> path = new HashMap<String, String>();
15 | 
16 |         // path.put("local_path", "webLogKPI/weblog/access.log");          //local directory
17 |         path.put("input_log", HDFS+"/mr/webLogKPI/log_files");              //directory on HDFS holding the log files
18 | 
19 |         path.put("output_oneip", HDFS + "/mr/webLogKPI/KPI_OneIP_Sum");     //HDFS output directory for KPI_OneIP_Sum
20 |         path.put("output_pv", HDFS + "/mr/webLogKPI/KPI_OnePV_Sum");        //HDFS output directory for KPI_OnePV_Sum
21 |         path.put("output_request",HDFS+"/mr/webLogKPI/KPI_OneRequest_Sum"); //HDFS output directory for KPI_OneRequest_Sum
22 |         path.put("output_time", HDFS+"/mr/webLogKPI/KPI_OneTime_Sum");      //HDFS output directory for KPI_OneTime_Sum
23 |         path.put("output_source", HDFS+"/mr/webLogKPI/KPI_OneResource_Sum"); //HDFS output directory for KPI_OneResource_Sum
24 | 
25 |         KPI_OneIP_Sum.main(path);      //unique IP visits
26 |         KPI_OnePV_Sum.main(path);      //page views (PV)
27 |         KPI_OneRequest_Sum.main(path); //request methods
28 |         KPI_OneTime_Sum.main(path);    //PV per hour
29 |         KPI_OneSource_Sum.main(path);  //daily user-agent (device) statistics
30 | 
31 |         System.exit(0);
32 |     }
33 | }
34 | 
--------------------------------------------------------------------------------
/Hadoop/WebKPI/KPI_OnePV_Sum.java:
--------------------------------------------------------------------------------
 1 | package WebKPI;
 2 | 
 3 | import java.io.IOException;
 4 | import java.net.URISyntaxException;
 5 | import java.text.ParseException;
 6 | import java.util.Map;
 7 | 
 8 | import org.apache.hadoop.conf.Configuration;
 9 | import org.apache.hadoop.fs.Path;
10 | import org.apache.hadoop.io.LongWritable;
11 | import org.apache.hadoop.io.Text;
12 | import org.apache.hadoop.mapreduce.InputSplit;
13 | import org.apache.hadoop.mapreduce.Job;
14 | import org.apache.hadoop.mapreduce.Mapper;
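// KPI_OnePV_Sum: counts page views (PV) per requested URL and per day from the parsed
// access-log records; MultipleOutputs writes each day's counts to its own named output
// file (e.g. 26Apr2016-r-00000).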
15 | import org.apache.hadoop.mapreduce.Reducer; 16 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 17 | import org.apache.hadoop.mapreduce.lib.input.FileSplit; 18 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 19 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 20 | import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs; 21 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 22 | 23 | public class KPI_OnePV_Sum { 24 | 25 | private static KPIfilter kpi; //声明一个KPIfilter对象 26 | 27 | //Mapper类 28 | public static class PVMap extends Mapper{ 29 | 30 | private static String filename ;//整个Map函数使用这个变量,意思为 获取当前文件的名称 31 | private static Text pvK1 = new Text(); 32 | private static LongWritable pvV1 = new LongWritable(1); 33 | 34 | //获取文件名,setup函数,每次执行一个Map类时只调用一次 35 | @Override 36 | protected void setup(Context context) throws IOException,InterruptedException { 37 | // TODO Auto-generated method stub 38 | InputSplit input = context.getInputSplit(); 39 | filename = ((FileSplit) input).getPath().getName(); //获得的是形如 26-Apr-2016.txt 40 | filename = filename.substring(0, 11).replace("-",""); //转换为: 26Apr2016 41 | System.out.println("filename:" + filename); 42 | } 43 | 44 | public void map(LongWritable key, Text value ,Context context) throws IOException, InterruptedException{ 45 | try { 46 | kpi = KPIfilter.filterPVs(value.toString()); 47 | if(kpi.isValid()) 48 | { 49 | pvK1.set(kpi.getSee_url() + "\t" + filename); //key设置为从log中解析出的访问入口 50 | context.write(pvK1, pvV1); 51 | } 52 | } catch (ParseException e) { 53 | // TODO Auto-generated catch block 54 | // e.printStackTrace(); 55 | // System.out.println("This is some error"); 56 | } 57 | } 58 | 59 | } 60 | //Reducer类 61 | public static class PVReduce extends Reducer{ 62 | 63 | private static Text pvk2 = new Text(); //key 64 | private static LongWritable pvV2 = new LongWritable(); //value 65 | 66 | //声明mos变量,将不同日期的处理结果写进不同的文件 67 | private MultipleOutputs mos; 68 | 69 | //reduce类中的setup函数 70 | @Override 71 | protected void setup(org.apache.hadoop.mapreduce.Reducer.Context context) throws IOException, InterruptedException { 72 | // TODO Auto-generated method stub 73 | mos = new MultipleOutputs(context); 74 | } 75 | 76 | public void reduce(Text key, Iterable values, Context contexts) throws IOException, InterruptedException{ 77 | String[] arr = key.toString().split("\t"); 78 | String filename = arr[1]; 79 | 80 | //统计每个指定页面的日访问量 81 | int seeNum = 0; 82 | for(LongWritable w : values) 83 | { 84 | seeNum += w.get(); 85 | } 86 | 87 | pvk2.set(arr[0]); 88 | pvV2.set(seeNum); 89 | // System.out.println(filename + "______________" + pvk2 + "===========" + pvV2); 90 | mos.write(filename, pvk2, pvV2); 91 | } 92 | 93 | //cleanup函数 关闭mos 94 | public void cleanup(Context context) throws IOException,InterruptedException { 95 | mos.close(); 96 | } 97 | } 98 | 99 | //main函数 100 | public static void main(Map path) throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException { 101 | String hdfs_input = path.get("input_log"); //loghdfs存放 102 | String hdfs_output = path.get("output_pv"); //pv输出的目录 103 | 104 | hdfsGYT hdfs = new hdfsGYT(); 105 | hdfs.rmr(hdfs_output); //如果存在输出的目录的首先删除,否则会报错 106 | 107 | Job job = new Job(new Configuration(), "PV"); 108 | job.setJarByClass(KPI_OnePV_Sum.class); 109 | 110 | job.setMapperClass(PVMap.class); 111 | job.setReducerClass(PVReduce.class); 112 | 113 | job.setMapOutputKeyClass(Text.class); 114 | job.setMapOutputValueClass(LongWritable.class); 115 | 
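        // The fourteen MultipleOutputs.addNamedOutput calls further below register one named
        // output per log-file date (17Apr2016 .. 30Apr2016).  A minimal sketch of an equivalent
        // loop, assuming that same fixed date range, would be:
        //
        //     for (int day = 17; day <= 30; day++) {
        //         MultipleOutputs.addNamedOutput(job, day + "Apr2016",
        //                 TextOutputFormat.class, Text.class, LongWritable.class);
        //     }
        //
        // The sketch stays inside a comment because addNamedOutput rejects a name that is
        // registered twice, and mos.write(filename, ...) in the reducer only accepts dates
        // registered here.  Note also that the System.out.println(kpi...) lines at the end of
        // main() read the static kpi field that is set in the mapper; that only works when the
        // map tasks run in the same JVM as this driver (local job runner), otherwise kpi is
        // still null at that point.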
116 | job.setOutputKeyClass(Text.class); 117 | job.setOutputValueClass(LongWritable.class); 118 | 119 | job.setInputFormatClass(TextInputFormat.class); 120 | job.setOutputFormatClass(TextOutputFormat.class); 121 | 122 | MultipleOutputs.addNamedOutput(job, "17Apr2016", TextOutputFormat.class, Text.class, LongWritable.class); 123 | MultipleOutputs.addNamedOutput(job, "18Apr2016", TextOutputFormat.class, Text.class, LongWritable.class); 124 | MultipleOutputs.addNamedOutput(job, "19Apr2016", TextOutputFormat.class, Text.class, LongWritable.class); 125 | MultipleOutputs.addNamedOutput(job, "20Apr2016", TextOutputFormat.class, Text.class, LongWritable.class); 126 | MultipleOutputs.addNamedOutput(job, "21Apr2016", TextOutputFormat.class, Text.class, LongWritable.class); 127 | MultipleOutputs.addNamedOutput(job, "22Apr2016", TextOutputFormat.class, Text.class, LongWritable.class); 128 | MultipleOutputs.addNamedOutput(job, "23Apr2016", TextOutputFormat.class, Text.class, LongWritable.class); 129 | MultipleOutputs.addNamedOutput(job, "24Apr2016", TextOutputFormat.class, Text.class, LongWritable.class); 130 | MultipleOutputs.addNamedOutput(job, "25Apr2016", TextOutputFormat.class, Text.class, LongWritable.class); 131 | MultipleOutputs.addNamedOutput(job, "26Apr2016", TextOutputFormat.class, Text.class, LongWritable.class); 132 | MultipleOutputs.addNamedOutput(job, "27Apr2016", TextOutputFormat.class, Text.class, LongWritable.class); 133 | MultipleOutputs.addNamedOutput(job, "28Apr2016", TextOutputFormat.class, Text.class, LongWritable.class); 134 | MultipleOutputs.addNamedOutput(job, "29Apr2016", TextOutputFormat.class, Text.class, LongWritable.class); 135 | MultipleOutputs.addNamedOutput(job, "30Apr2016", TextOutputFormat.class, Text.class, LongWritable.class); 136 | 137 | FileInputFormat.addInputPath(job, new Path(hdfs_input)); 138 | FileOutputFormat.setOutputPath(job, new Path(hdfs_output)); 139 | 140 | //提交作业 141 | job.waitForCompletion(true); 142 | 143 | // 144 | System.out.println("User_agent Error:" + kpi.getNumUser_agent()); 145 | System.out.println("Status Error:" + kpi.getStatus()); 146 | } 147 | } 148 | -------------------------------------------------------------------------------- /Hadoop/WebKPI/KPI_OneRequest_Sum.java: -------------------------------------------------------------------------------- 1 | package WebKPI; 2 | 3 | import java.io.IOException; 4 | import java.net.URISyntaxException; 5 | import java.text.ParseException; 6 | import java.util.Map; 7 | 8 | import org.apache.hadoop.conf.Configuration; 9 | import org.apache.hadoop.fs.Path; 10 | import org.apache.hadoop.io.LongWritable; 11 | import org.apache.hadoop.io.Text; 12 | import org.apache.hadoop.mapreduce.InputSplit; 13 | import org.apache.hadoop.mapreduce.Job; 14 | import org.apache.hadoop.mapreduce.Mapper; 15 | import org.apache.hadoop.mapreduce.Reducer; 16 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 17 | import org.apache.hadoop.mapreduce.lib.input.FileSplit; 18 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 19 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 20 | import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs; 21 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 22 | 23 | public class KPI_OneRequest_Sum { 24 | 25 | private static KPIfilter kpi; //声明一个kpi对象 26 | //Mapper类 27 | public static class ReMap extends Mapper 28 | { 29 | String filename; //读取的文件名 30 | static Text reK1 = new Text(); 31 | static LongWritable reV1 = new 
LongWritable(1); 32 | 33 | //setup函数,没个Map执行一次 34 | @Override 35 | protected void setup(Context context)throws IOException, InterruptedException { 36 | // TODO Auto-generated method stub 37 | InputSplit split = context.getInputSplit(); 38 | filename = ((FileSplit) split).getPath().getName(); 39 | filename = filename.substring(0, 11).replace("-", ""); //得到合法的文件名 40 | System.out.println("filename: " + filename); 41 | } 42 | //map函数 43 | public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException 44 | { 45 | try { 46 | kpi = KPIfilter.parser(value.toString()); 47 | if(kpi.isValid()) 48 | { 49 | reK1.set(kpi.getRequest()+"\t"+filename); 50 | context.write(reK1, reV1); 51 | } 52 | } catch (ParseException e) { 53 | // TODO Auto-generated catch block 54 | e.printStackTrace(); 55 | } 56 | } 57 | } 58 | //Reducer类 59 | public static class ReReduce extends Reducer 60 | { 61 | private static Text reK2 = new Text(); //key 62 | private static LongWritable reV2 = new LongWritable(); //value 63 | 64 | private MultipleOutputs mos; //声明多路输出 65 | //setup函数 66 | @Override 67 | protected void setup(Context context) throws IOException, InterruptedException { 68 | mos = new MultipleOutputs(context); 69 | } 70 | //reduce函数 71 | public void reduce(Text key, Iterable values, Context contexts) throws IOException, InterruptedException 72 | { 73 | int sum=0; 74 | String[] arr = key.toString().split("\t"); 75 | for (LongWritable w : values) { 76 | sum += w.get(); 77 | } 78 | reK2.set(arr[0]); 79 | reV2.set(sum); 80 | mos.write(arr[1], reK2, reV2); 81 | } 82 | //cleanup函数 83 | @Override 84 | protected void cleanup(Context context) throws IOException, InterruptedException { 85 | // TODO Auto-generated method stub 86 | mos.close(); 87 | } 88 | } 89 | 90 | public static void main(Map path) throws IOException, URISyntaxException, ClassNotFoundException, InterruptedException { 91 | // TODO Auto-generated method stub 92 | 93 | String hdfs_input = path.get("input_log"); //指定输入输出文件夹 94 | String hdfs_output = path.get("output_request"); 95 | 96 | hdfsGYT hdfs = new hdfsGYT(); 97 | hdfs.rmr(hdfs_output); //首先删除对应的hdfs上的文件输出目录 98 | 99 | Job job = new Job(new Configuration(), "RequestSum"); 100 | job.setJarByClass(KPI_OneRequest_Sum.class); 101 | 102 | job.setMapperClass(ReMap.class); 103 | job.setReducerClass(ReReduce.class); 104 | 105 | job.setMapOutputKeyClass(Text.class); 106 | job.setMapOutputValueClass(LongWritable.class); 107 | 108 | job.setOutputKeyClass(Text.class); 109 | job.setOutputValueClass(LongWritable.class); 110 | 111 | job.setInputFormatClass(TextInputFormat.class); 112 | job.setOutputFormatClass(TextOutputFormat.class); 113 | 114 | MultipleOutputs.addNamedOutput(job, "17Apr2016", TextOutputFormat.class, Text.class, LongWritable.class); 115 | MultipleOutputs.addNamedOutput(job, "18Apr2016", TextOutputFormat.class, Text.class, LongWritable.class); 116 | MultipleOutputs.addNamedOutput(job, "19Apr2016", TextOutputFormat.class, Text.class, LongWritable.class); 117 | MultipleOutputs.addNamedOutput(job, "20Apr2016", TextOutputFormat.class, Text.class, LongWritable.class); 118 | MultipleOutputs.addNamedOutput(job, "21Apr2016", TextOutputFormat.class, Text.class, LongWritable.class); 119 | MultipleOutputs.addNamedOutput(job, "22Apr2016", TextOutputFormat.class, Text.class, LongWritable.class); 120 | MultipleOutputs.addNamedOutput(job, "23Apr2016", TextOutputFormat.class, Text.class, LongWritable.class); 121 | MultipleOutputs.addNamedOutput(job, "24Apr2016", 
TextOutputFormat.class, Text.class, LongWritable.class); 122 | MultipleOutputs.addNamedOutput(job, "25Apr2016", TextOutputFormat.class, Text.class, LongWritable.class); 123 | MultipleOutputs.addNamedOutput(job, "26Apr2016", TextOutputFormat.class, Text.class, LongWritable.class); 124 | MultipleOutputs.addNamedOutput(job, "27Apr2016", TextOutputFormat.class, Text.class, LongWritable.class); 125 | MultipleOutputs.addNamedOutput(job, "28Apr2016", TextOutputFormat.class, Text.class, LongWritable.class); 126 | MultipleOutputs.addNamedOutput(job, "29Apr2016", TextOutputFormat.class, Text.class, LongWritable.class); 127 | MultipleOutputs.addNamedOutput(job, "30Apr2016", TextOutputFormat.class, Text.class, LongWritable.class); 128 | 129 | FileInputFormat.addInputPath(job, new Path(hdfs_input)); 130 | FileOutputFormat.setOutputPath(job, new Path(hdfs_output) ); 131 | 132 | //提交作业 133 | job.waitForCompletion(true); 134 | 135 | // 136 | System.out.println("User_agent Error:" + kpi.getNumUser_agent()); 137 | System.out.println("Status Error:" + kpi.getStatus()); 138 | } 139 | 140 | } 141 | -------------------------------------------------------------------------------- /Hadoop/WebKPI/KPI_OneSource_Sum.java: -------------------------------------------------------------------------------- 1 | package WebKPI; 2 | 3 | import java.io.IOException; 4 | import java.net.URISyntaxException; 5 | import java.text.ParseException; 6 | import java.util.Map; 7 | 8 | import org.apache.hadoop.conf.Configuration; 9 | import org.apache.hadoop.fs.Path; 10 | import org.apache.hadoop.io.LongWritable; 11 | import org.apache.hadoop.io.Text; 12 | import org.apache.hadoop.mapreduce.InputSplit; 13 | import org.apache.hadoop.mapreduce.Job; 14 | import org.apache.hadoop.mapreduce.Mapper; 15 | import org.apache.hadoop.mapreduce.Reducer; 16 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 17 | import org.apache.hadoop.mapreduce.lib.input.FileSplit; 18 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 19 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 20 | import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs; 21 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 22 | 23 | 24 | public class KPI_OneSource_Sum { 25 | 26 | private static KPIfilter kpi; //声明一个kpi对象 27 | 28 | public static class SourceMap extends Mapper{ 29 | 30 | static String filename; //存储文件名 31 | static Text sK1 = new Text(); //key 32 | static LongWritable sV1 = new LongWritable(1); //value 33 | 34 | @Override 35 | protected void setup(Context context) throws IOException,InterruptedException { 36 | // TODO Auto-generated method stub 37 | InputSplit split = context.getInputSplit(); 38 | filename = ((FileSplit) split).getPath().getName(); 39 | filename = filename.substring(0, 11).replace("-", ""); //得到合法的文件名 40 | System.out.println("filename: " + filename); 41 | } 42 | 43 | //map函数 44 | @Override 45 | protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { 46 | // TODO Auto-generated method stub 47 | try { 48 | kpi = KPIfilter.parser(value.toString()); 49 | if(kpi.isValid()) 50 | { 51 | sK1.set(kpi.getUser_agent()+"\t"+filename); 52 | context.write(sK1, sV1); 53 | } 54 | } catch (ParseException e) { 55 | // TODO Auto-generated catch block 56 | e.printStackTrace(); 57 | } 58 | } 59 | 60 | } 61 | 62 | public static class SourceReduce extends Reducer< Text, LongWritable, Text , LongWritable>{ 63 | 64 | static Text sK2 = new Text(); //key 65 | static 
LongWritable sV2 = new LongWritable(); //value 66 | 67 | private MultipleOutputs mos; //声明多路输出 68 | 69 | @Override 70 | protected void setup(Context context)throws IOException, InterruptedException { 71 | // TODO Auto-generated method stub 72 | mos =new MultipleOutputs (context); 73 | } 74 | 75 | @Override 76 | protected void reduce(Text key, Iterable values,Context context) 77 | throws IOException, InterruptedException { 78 | // TODO Auto-generated method stub 79 | int sum=0; 80 | String[] arr = key.toString().split("\t"); 81 | for (LongWritable w : values) { 82 | sum += w.get(); 83 | } 84 | sK2.set(arr[0]); 85 | sV2.set(sum); 86 | System.out.println(arr[1]); 87 | mos.write(arr[1], sK2, sV2); 88 | } 89 | @Override 90 | public void cleanup(Context context)throws IOException, InterruptedException { 91 | // TODO Auto-generated method stub 92 | mos.close(); 93 | } 94 | 95 | } 96 | 97 | public static void main(Map path) throws IOException, URISyntaxException, ClassNotFoundException, InterruptedException { 98 | // TODO Auto-generated method stub 99 | String hdfs_input = path.get("input_log"); //指定输入输出文件夹 100 | String hdfs_output = path.get("output_source"); 101 | 102 | hdfsGYT hdfs = new hdfsGYT(); 103 | hdfs.rmr(hdfs_output); //首先删除对应的hdfs上的文件输出目录 104 | 105 | Job job = new Job(new Configuration(), "Resource"); 106 | job.setJarByClass(KPI_OneSource_Sum.class); 107 | 108 | job.setMapperClass(SourceMap.class); 109 | job.setReducerClass(SourceReduce.class); 110 | 111 | job.setMapOutputKeyClass(Text.class); 112 | job.setMapOutputValueClass(LongWritable.class); 113 | 114 | job.setOutputKeyClass(Text.class); 115 | job.setOutputValueClass(LongWritable.class); 116 | 117 | job.setInputFormatClass(TextInputFormat.class); 118 | job.setOutputFormatClass(TextOutputFormat.class); 119 | 120 | MultipleOutputs.addNamedOutput(job, "17Apr2016", TextOutputFormat.class, Text.class, LongWritable.class); 121 | MultipleOutputs.addNamedOutput(job, "18Apr2016", TextOutputFormat.class, Text.class, LongWritable.class); 122 | MultipleOutputs.addNamedOutput(job, "19Apr2016", TextOutputFormat.class, Text.class, LongWritable.class); 123 | MultipleOutputs.addNamedOutput(job, "20Apr2016", TextOutputFormat.class, Text.class, LongWritable.class); 124 | MultipleOutputs.addNamedOutput(job, "21Apr2016", TextOutputFormat.class, Text.class, LongWritable.class); 125 | MultipleOutputs.addNamedOutput(job, "22Apr2016", TextOutputFormat.class, Text.class, LongWritable.class); 126 | MultipleOutputs.addNamedOutput(job, "23Apr2016", TextOutputFormat.class, Text.class, LongWritable.class); 127 | MultipleOutputs.addNamedOutput(job, "24Apr2016", TextOutputFormat.class, Text.class, LongWritable.class); 128 | MultipleOutputs.addNamedOutput(job, "25Apr2016", TextOutputFormat.class, Text.class, LongWritable.class); 129 | MultipleOutputs.addNamedOutput(job, "26Apr2016", TextOutputFormat.class, Text.class, LongWritable.class); 130 | MultipleOutputs.addNamedOutput(job, "27Apr2016", TextOutputFormat.class, Text.class, LongWritable.class); 131 | MultipleOutputs.addNamedOutput(job, "28Apr2016", TextOutputFormat.class, Text.class, LongWritable.class); 132 | MultipleOutputs.addNamedOutput(job, "29Apr2016", TextOutputFormat.class, Text.class, LongWritable.class); 133 | MultipleOutputs.addNamedOutput(job, "30Apr2016", TextOutputFormat.class, Text.class, LongWritable.class); 134 | 135 | FileInputFormat.addInputPath(job, new Path(hdfs_input)); 136 | FileOutputFormat.setOutputPath(job, new Path(hdfs_output) ); 137 | 138 | //提交作业 139 | 
job.waitForCompletion(true); 140 | 141 | // 142 | System.out.println("User_agent Error:" + kpi.getNumUser_agent()); 143 | System.out.println("Status Error:" + kpi.getStatus()); 144 | } 145 | 146 | } 147 | -------------------------------------------------------------------------------- /Hadoop/WebKPI/KPI_OneTime_Sum.java: -------------------------------------------------------------------------------- 1 | package WebKPI; 2 | 3 | import java.io.IOException; 4 | import java.net.URISyntaxException; 5 | import java.text.ParseException; 6 | import java.util.Map; 7 | 8 | import org.apache.hadoop.conf.Configuration; 9 | import org.apache.hadoop.fs.Path; 10 | import org.apache.hadoop.io.LongWritable; 11 | import org.apache.hadoop.io.Text; 12 | import org.apache.hadoop.mapreduce.InputSplit; 13 | import org.apache.hadoop.mapreduce.Job; 14 | import org.apache.hadoop.mapreduce.Mapper; 15 | import org.apache.hadoop.mapreduce.Reducer; 16 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 17 | import org.apache.hadoop.mapreduce.lib.input.FileSplit; 18 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 19 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 20 | import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs; 21 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 22 | 23 | public class KPI_OneTime_Sum { 24 | 25 | private static KPIfilter kpi; //声明一个kpi对象 26 | 27 | public static class OneTimeMap extends Mapper{ 28 | String filename; //读取的文件名 29 | static Text timeK1 = new Text(); 30 | static LongWritable timeV1 = new LongWritable(1); 31 | 32 | //setup函数,没个Map执行一次 33 | @Override 34 | protected void setup(Context context) throws IOException, InterruptedException { 35 | // TODO Auto-generated method stub 36 | InputSplit split = context.getInputSplit(); 37 | filename = ((FileSplit) split).getPath().getName(); 38 | filename = filename.substring(0, 11).replace("-", ""); //得到合法的文件名 39 | System.out.println("filename: " + filename); 40 | } 41 | 42 | public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { 43 | try { 44 | kpi = KPIfilter.filterPVs(value.toString()); 45 | if(kpi.isValid()){ 46 | timeK1.set(kpi.getTime_local_Date_hour()+"\t" + filename); 47 | context.write(timeK1,timeV1); 48 | } 49 | } catch (ParseException e) { 50 | // TODO Auto-generated catch block 51 | e.printStackTrace(); 52 | } 53 | } 54 | } 55 | 56 | public static class OneTimeReduce extends Reducer{ 57 | 58 | private static Text timeK2 = new Text(); 59 | private static LongWritable timeV2 = new LongWritable(); 60 | 61 | private MultipleOutputs mos; //声明多路输出 62 | //setup函数 63 | @Override 64 | protected void setup(Context context) throws IOException, InterruptedException { 65 | mos = new MultipleOutputs(context); 66 | } 67 | 68 | public void reduce(Text key, Iterable values, Context context) throws IOException, InterruptedException{ 69 | int sum = 0; 70 | String[] arr = key.toString().split("\t"); 71 | String filename =arr[1]; 72 | for (LongWritable longWritable : values) { 73 | sum += longWritable.get(); 74 | } 75 | timeK2.set(arr[0]); 76 | timeV2.set(sum); 77 | mos.write(filename, timeK2, timeV2); 78 | } 79 | 80 | public void cleanup(Context context) throws IOException, InterruptedException{ 81 | mos.close(); 82 | } 83 | } 84 | 85 | public static void main(Map path) throws IOException, URISyntaxException, ClassNotFoundException, InterruptedException { 86 | // TODO Auto-generated method stub 87 | String hdfs_input = 
path.get("input_log"); //loghdfs存放 88 | String hdfs_output = path.get("output_time"); //pv输出的目录 89 | 90 | hdfsGYT hdfs = new hdfsGYT(); 91 | hdfs.rmr(hdfs_output); //如果存在输出的目录的首先删除,否则会报错 92 | 93 | Job job = new Job(new Configuration(), "OneTime"); 94 | job.setJarByClass(KPI_OnePV_Sum.class); 95 | 96 | job.setMapperClass(OneTimeMap.class); 97 | job.setReducerClass(OneTimeReduce.class); 98 | 99 | job.setMapOutputKeyClass(Text.class); 100 | job.setMapOutputValueClass(LongWritable.class); 101 | 102 | job.setOutputKeyClass(Text.class); 103 | job.setOutputValueClass(LongWritable.class); 104 | 105 | job.setInputFormatClass(TextInputFormat.class); 106 | job.setOutputFormatClass(TextOutputFormat.class); 107 | 108 | MultipleOutputs.addNamedOutput(job, "17Apr2016", TextOutputFormat.class, Text.class, LongWritable.class); 109 | MultipleOutputs.addNamedOutput(job, "18Apr2016", TextOutputFormat.class, Text.class, LongWritable.class); 110 | MultipleOutputs.addNamedOutput(job, "19Apr2016", TextOutputFormat.class, Text.class, LongWritable.class); 111 | MultipleOutputs.addNamedOutput(job, "20Apr2016", TextOutputFormat.class, Text.class, LongWritable.class); 112 | MultipleOutputs.addNamedOutput(job, "21Apr2016", TextOutputFormat.class, Text.class, LongWritable.class); 113 | MultipleOutputs.addNamedOutput(job, "22Apr2016", TextOutputFormat.class, Text.class, LongWritable.class); 114 | MultipleOutputs.addNamedOutput(job, "23Apr2016", TextOutputFormat.class, Text.class, LongWritable.class); 115 | MultipleOutputs.addNamedOutput(job, "24Apr2016", TextOutputFormat.class, Text.class, LongWritable.class); 116 | MultipleOutputs.addNamedOutput(job, "25Apr2016", TextOutputFormat.class, Text.class, LongWritable.class); 117 | MultipleOutputs.addNamedOutput(job, "26Apr2016", TextOutputFormat.class, Text.class, LongWritable.class); 118 | MultipleOutputs.addNamedOutput(job, "27Apr2016", TextOutputFormat.class, Text.class, LongWritable.class); 119 | MultipleOutputs.addNamedOutput(job, "28Apr2016", TextOutputFormat.class, Text.class, LongWritable.class); 120 | MultipleOutputs.addNamedOutput(job, "29Apr2016", TextOutputFormat.class, Text.class, LongWritable.class); 121 | MultipleOutputs.addNamedOutput(job, "30Apr2016", TextOutputFormat.class, Text.class, LongWritable.class); 122 | 123 | FileInputFormat.addInputPath(job, new Path(hdfs_input)); 124 | FileOutputFormat.setOutputPath(job, new Path(hdfs_output)); 125 | 126 | //提交作业 127 | job.waitForCompletion(true); 128 | 129 | // 130 | System.out.println("User_agent Error:" + kpi.getNumUser_agent()); 131 | System.out.println("Status Error:" + kpi.getStatus()); 132 | } 133 | 134 | } 135 | -------------------------------------------------------------------------------- /Hadoop/WebKPI/hdfsGYT.java: -------------------------------------------------------------------------------- 1 | package WebKPI; 2 | 3 | import java.io.IOException; 4 | import java.net.URI; 5 | import java.net.URISyntaxException; 6 | 7 | import org.apache.hadoop.conf.Configuration; 8 | import org.apache.hadoop.fs.FSDataInputStream; 9 | import org.apache.hadoop.fs.FileStatus; 10 | import org.apache.hadoop.fs.FileSystem; 11 | import org.apache.hadoop.fs.Path; 12 | import org.apache.hadoop.io.IOUtils; 13 | 14 | public class hdfsGYT { 15 | 16 | private static final String HDFS = "hdfs://127.0.0.1:9000/"; 17 | 18 | public hdfsGYT(String hdfs, Configuration conf ){ 19 | this.hdfsPath = hdfs; 20 | this.conf = conf; 21 | } 22 | 23 | public hdfsGYT() { 24 | // TODO Auto-generated constructor stub 25 | } 26 | 27 | private 
String hdfsPath; 28 | private Configuration conf = new Configuration() ; 29 | 30 | public static void main(String[] args) throws IOException, URISyntaxException{ 31 | hdfsGYT hdfsgyt = new hdfsGYT(); 32 | String folder = HDFS + "mr/groom_system/small2.csv"; 33 | String local = "/home/thinkgamer/Java/hadoop_shizhan/src/user_thing_tuijian/small2.csv"; 34 | String local1 = "/home/thinkgamer/Java/hadoop_shizhan/src/user_thing_tuijian"; 35 | //判断某个文件夹是否存在 36 | //hdfsgyt.isExist(folder); 37 | //创建文件夹 38 | //hdfsgyt.mkdir(folder); 39 | //删除文件夹 40 | //hdfsgyt.rmr(folder); 41 | //列出所有文件夹 42 | //hdfsgyt.ls(folder); 43 | //递归列出所有文件夹 44 | //hdfsgyt.lsr(folder); 45 | //上传文件 46 | //hdfsgyt.put(local, folder); 47 | //下载文件 48 | //hdfsgyt.get(folder,local1); 49 | //删除文件 50 | //hdfsgyt.rm(folder); 51 | //显示文件 52 | //hdfsgyt.cat(folder); 53 | //重命名文件 54 | // String path1 = HDFS + "mr/output"; 55 | // String path2 = HDFS + "mr/input"; 56 | // hdfsgyt.rename(path1,path2); 57 | } 58 | 59 | //重命名文件 60 | public void rename(String path1, String path2) throws IOException, URISyntaxException { 61 | // TODO Auto-generated method stub 62 | FileSystem fs = FileSystem.get(new URI(HDFS), new Configuration()); 63 | try{ 64 | fs.rename(new Path(path1), new Path(path2 ) ); 65 | System.out.println("Rename " + path1 + " To " + path2 ); 66 | }finally{ 67 | fs.close(); 68 | } 69 | } 70 | 71 | //显示文件 72 | public static void cat(String folder) throws IOException, URISyntaxException { 73 | // 与hdfs建立联系 74 | FileSystem fs = FileSystem.get(new URI(HDFS),new Configuration()); 75 | Path path = new Path(folder); 76 | FSDataInputStream fsdis = null; 77 | System.out.println("cat: " + folder); 78 | try { 79 | fsdis =fs.open(path); 80 | IOUtils.copyBytes(fsdis, System.out, 4096, false); 81 | } finally { 82 | IOUtils.closeStream(fsdis); 83 | fs.close(); 84 | } 85 | } 86 | 87 | //删除文件 88 | public static void rm(String folder) throws IOException, URISyntaxException { 89 | //与hdfs建立联系 90 | FileSystem fs = FileSystem.get(new URI(HDFS),new Configuration()); 91 | Path path = new Path(folder); 92 | if(fs.deleteOnExit(path)){ 93 | fs.delete(path); 94 | System.out.println("delete:" + folder); 95 | }else{ 96 | System.out.println("The fiel is not exist!"); 97 | } 98 | fs.close(); 99 | } 100 | 101 | //下载文件 102 | public static void get(String remote, String local) throws IllegalArgumentException, IOException, URISyntaxException { 103 | // 建立联系 104 | FileSystem fs = FileSystem.get(new URI(HDFS), new Configuration()); 105 | fs.copyToLocalFile(new Path(remote), new Path(local)); 106 | System.out.println("Get From : " + remote + " To :" + local); 107 | fs.close(); 108 | } 109 | 110 | //上传文件 111 | public static void put(String local, String remote) throws IOException, URISyntaxException { 112 | // 建立联系 113 | FileSystem fs = FileSystem.get(new URI(HDFS), new Configuration()); 114 | fs.copyFromLocalFile(new Path(local), new Path(remote)); 115 | System.out.println("Put :" + local + " To : " + remote); 116 | fs.close(); 117 | } 118 | 119 | //递归列出所有文件夹 120 | public static void lsr(String folder) throws IOException, URISyntaxException { 121 | //与hdfs建立联系 122 | FileSystem fs = FileSystem.get(new URI(HDFS),new Configuration()); 123 | Path path = new Path(folder); 124 | //得到该目录下的所有文件 125 | FileStatus[] fileList = fs.listStatus(path); 126 | for (FileStatus f : fileList) { 127 | System.out.printf("name: %s | folder: %s | size: %d\n", f.getPath(), f.isDir() , f.getLen()); 128 | try{ 129 | FileStatus[] fileListR = fs.listStatus(f.getPath()); 130 | for(FileStatus 
fr:fileListR){ 131 | System.out.printf("name: %s | folder: %s | size: %d\n", fr.getPath(), fr.isDir() , fr.getLen()); 132 | } 133 | }finally{ 134 | continue; 135 | } 136 | } 137 | fs.close(); 138 | } 139 | 140 | //列出所有文件夹 141 | public static void ls(String folder) throws IOException, URISyntaxException { 142 | //与hdfs建立联系 143 | FileSystem fs = FileSystem.get(new URI(HDFS),new Configuration()); 144 | Path path = new Path(folder); 145 | //得到该目录下的所有文件 146 | FileStatus[] fileList = fs.listStatus(path); 147 | for (FileStatus f : fileList) { 148 | System.out.printf("name: %s | folder: %s | size: %d\n", f.getPath(), f.isDir() , f.getLen()); 149 | } 150 | fs.close(); 151 | } 152 | 153 | //删除文件夹 154 | public static void rmr(String folder) throws IOException, URISyntaxException { 155 | //与hdfs建立联系 156 | FileSystem fs = FileSystem.get(new URI(HDFS),new Configuration()); 157 | Path path = new Path(folder); 158 | fs.delete(path); 159 | System.out.println("delete:" + folder); 160 | fs.close(); 161 | } 162 | 163 | //创建文件夹 164 | public static void mkdir(String folder) throws IOException, URISyntaxException { 165 | //与hdfs建立联系 166 | FileSystem fs = FileSystem.get(new URI(HDFS),new Configuration()); 167 | Path path = new Path(folder); 168 | if (!fs.exists(path)) { 169 | fs.mkdirs(path); 170 | System.out.println("Create: " + folder); 171 | }else{ 172 | System.out.println("it is have exist:" + folder); 173 | } 174 | fs.close(); 175 | } 176 | 177 | //判断某个文件夹是否存在 178 | public static void isExist(String folder) throws IOException, URISyntaxException { 179 | //与hdfs建立联系 180 | FileSystem fs = FileSystem.get(new URI(HDFS),new Configuration()); 181 | Path path = new Path(folder); 182 | if(fs.exists(path)){ 183 | System.out.println("it is have exist:" + folder); 184 | }else{ 185 | System.out.println("it is not exist:" + folder); 186 | } 187 | fs.close(); 188 | } 189 | 190 | } -------------------------------------------------------------------------------- /Hadoop/bookTuijian/Step1.java: -------------------------------------------------------------------------------- 1 | package bookTuijian; 2 | 3 | import java.io.IOException; 4 | import java.net.URISyntaxException; 5 | import java.util.Map; 6 | 7 | import org.apache.hadoop.conf.Configuration; 8 | import org.apache.hadoop.fs.Path; 9 | import org.apache.hadoop.io.LongWritable; 10 | import org.apache.hadoop.io.Text; 11 | import org.apache.hadoop.mapreduce.InputSplit; 12 | import org.apache.hadoop.mapreduce.Job; 13 | import org.apache.hadoop.mapreduce.Mapper; 14 | import org.apache.hadoop.mapreduce.Reducer; 15 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 16 | import org.apache.hadoop.mapreduce.lib.input.FileSplit; 17 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 18 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 19 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 20 | 21 | /* 22 | * Step1:得到评分矩阵 23 | */ 24 | public class Step1 { 25 | 26 | //Map类 27 | public static class Step1_Map extends Mapper{ 28 | 29 | String filename; //存放文件名字 30 | static Text k1 = new Text(); //key 31 | static Text v1 = new Text();//value 32 | 33 | //setup函数,每次运行Map类只执行一次,获取并打印文件名 34 | @Override 35 | protected void setup(Context context) throws IOException,InterruptedException { 36 | // TODO Auto-generated method stub 37 | InputSplit inputsplit = context.getInputSplit(); 38 | filename = ((FileSplit) inputsplit).getPath().getName(); 39 | System.out.println("Filename:" + filename); 40 | } 41 | 42 | @Override 43 | protected void 
map(LongWritable key, Text value, Context context)throws IOException, InterruptedException { 44 | // TODO Auto-generated method stub 45 | // String[] arr = value.toString().split(","); 46 | String[] arr = value.toString().split("\t"); 47 | k1.set(arr[0]); 48 | v1.set(arr[2]+":"+arr[1]); 49 | // v1.set(arr[1]+":"+arr[2]); 50 | context.write(k1, v1); 51 | } 52 | 53 | } 54 | 55 | //Reduce类 56 | public static class Step1_Reduce extends Reducer{ 57 | static Text k2 = new Text(); 58 | static Text v2 = new Text(); 59 | @Override 60 | protected void reduce(Text key, Iterable values,Context context)throws IOException, InterruptedException { 61 | // TODO Auto-generated method stub 62 | String id_score=""; 63 | for (Text text : values) { 64 | id_score += "," + text.toString(); 65 | } 66 | id_score = id_score.substring(1); 67 | k2.set(key); 68 | v2.set(id_score); 69 | context.write(k2, v2); 70 | } 71 | 72 | } 73 | 74 | public static void run(Map path) throws IOException, URISyntaxException, ClassNotFoundException, InterruptedException { 75 | // TODO Auto-generated method stub 76 | String local_path = path.get("local_file"); //存放文件的本地目录 77 | String hdfs_file_path = path.get("hdfs_root_file"); //hdfs上存放文件的目录 78 | String input_path = path.get("hdfs_step1_input"); //step1的输入文件目录 79 | String output_path = path.get( "hdfs_step1_output"); //step2的输出文件目录 80 | System.out.println(local_path); 81 | System.out.println(hdfs_file_path); 82 | System.out.println(input_path); 83 | System.out.println(output_path); 84 | 85 | hdfsGYT hdfs = new hdfsGYT(); //声明一个hdfs的操作对象 86 | hdfs.rmr(input_path); //若输入的文件目录存在则删除 87 | hdfs.rmr(output_path); //若输出的文件目录存放则删除 88 | hdfs.put(local_path, input_path); //将本地文件上传至hdfs 89 | 90 | Job job = new Job(new Configuration(), "BookRecommend"); 91 | job.setJarByClass(Step1.class); 92 | 93 | //设置文件路径 94 | FileInputFormat.setInputPaths(job, new Path(input_path)); 95 | FileOutputFormat.setOutputPath(job, new Path(output_path)); 96 | 97 | //设置Map和Reduce类 98 | job.setMapperClass(Step1_Map.class); 99 | job.setReducerClass(Step1_Reduce.class); 100 | 101 | //设置map的输入输出格式 102 | job.setMapOutputKeyClass(Text.class); 103 | job.setMapOutputValueClass(Text.class); 104 | 105 | //设置reduce的输入输出格式 106 | job.setOutputKeyClass(Text.class); 107 | job.setOutputValueClass(Text.class); 108 | 109 | //设置文件 110 | job.setInputFormatClass(TextInputFormat.class); 111 | job.setOutputFormatClass(TextOutputFormat.class); 112 | 113 | //提交作业 114 | job.waitForCompletion(true); 115 | } 116 | 117 | } 118 | -------------------------------------------------------------------------------- /Hadoop/bookTuijian/Step2.java: -------------------------------------------------------------------------------- 1 | package bookTuijian; 2 | 3 | import java.io.IOException; 4 | import java.net.URISyntaxException; 5 | import java.util.Map; 6 | 7 | import org.apache.hadoop.conf.Configuration; 8 | import org.apache.hadoop.fs.Path; 9 | import org.apache.hadoop.io.LongWritable; 10 | import org.apache.hadoop.io.Text; 11 | import org.apache.hadoop.mapreduce.InputSplit; 12 | import org.apache.hadoop.mapreduce.Job; 13 | import org.apache.hadoop.mapreduce.Mapper; 14 | import org.apache.hadoop.mapreduce.Reducer; 15 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 16 | import org.apache.hadoop.mapreduce.lib.input.FileSplit; 17 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 18 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 19 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 20 | 21 | public 
class Step2 { 22 | 23 | //Map 24 | public static class Step2_Map extends Mapper{ 25 | 26 | String filename; 27 | static Text k1 = new Text(); 28 | static LongWritable v1 = new LongWritable(1); 29 | @Override 30 | protected void setup(Context context) throws IOException,InterruptedException { 31 | // TODO Auto-generated method stub 32 | InputSplit inputsplit = context.getInputSplit(); 33 | filename = ((FileSplit) inputsplit).getPath().getName(); 34 | System.out.println("Step2 FileNme:" + filename); 35 | } 36 | 37 | 38 | @Override 39 | protected void map(LongWritable key, Text value, Context context)throws IOException, InterruptedException { 40 | // TODO Auto-generated method stub 41 | String[] arrs = bookRecommend.DELIMITER.split(value.toString()); 42 | for (int i =1; i < arrs.length; i++ ) 43 | { 44 | String itemID = arrs[i].split(":")[0]; 45 | for( int j=1; j< arrs.length; j++) 46 | { 47 | String itemID2 = arrs[j].split(":")[0]; 48 | k1.set(itemID+":" + itemID2); 49 | context.write(k1,v1); 50 | } 51 | } 52 | } 53 | } 54 | 55 | public static class Step2_Reduce extends Reducer{ 56 | 57 | static LongWritable v2 = new LongWritable(); 58 | @Override 59 | protected void reduce(Text key, Iterable values,Context context)throws IOException, InterruptedException { 60 | // TODO Auto-generated method stub 61 | int num = 0; 62 | for (LongWritable text : values) { 63 | num += text.get(); 64 | } 65 | v2.set(num); 66 | context.write(key,v2); 67 | } 68 | 69 | } 70 | 71 | 72 | public static void run(Map path) throws IOException, URISyntaxException, ClassNotFoundException, InterruptedException { 73 | // TODO Auto-generated method stub 74 | String input_path = path.get("hdfs_step2_input"); 75 | String output_path = path.get("hdfs_step2_output"); 76 | 77 | hdfsGYT hdfs = new hdfsGYT(); 78 | hdfs.rmr(output_path); 79 | 80 | Job job = new Job(new Configuration(), "Step2"); 81 | job.setJarByClass(Step2.class); 82 | 83 | //设置文件路径 84 | FileInputFormat.setInputPaths(job, new Path(input_path)); 85 | FileOutputFormat.setOutputPath(job, new Path(output_path)); 86 | 87 | //设置Map和Reduce类 88 | job.setMapperClass(Step2_Map.class); 89 | job.setReducerClass(Step2_Reduce.class); 90 | 91 | //设置map的输出格式 92 | job.setMapOutputKeyClass(Text.class); 93 | job.setMapOutputValueClass(LongWritable.class); 94 | 95 | //设置reduce的输出格式 96 | job.setOutputKeyClass(Text.class); 97 | job.setOutputValueClass(LongWritable.class); 98 | 99 | //设置文件 100 | job.setInputFormatClass(TextInputFormat.class); 101 | job.setOutputFormatClass(TextOutputFormat.class); 102 | 103 | //提交作业 104 | job.waitForCompletion(true); 105 | } 106 | 107 | } 108 | -------------------------------------------------------------------------------- /Hadoop/bookTuijian/Step3_1.java: -------------------------------------------------------------------------------- 1 | package bookTuijian; 2 | 3 | import java.io.IOException; 4 | import java.net.URISyntaxException; 5 | import java.util.Map; 6 | 7 | import org.apache.hadoop.conf.Configuration; 8 | import org.apache.hadoop.fs.Path; 9 | import org.apache.hadoop.io.LongWritable; 10 | import org.apache.hadoop.io.Text; 11 | import org.apache.hadoop.mapreduce.InputSplit; 12 | import org.apache.hadoop.mapreduce.Job; 13 | import org.apache.hadoop.mapreduce.Mapper; 14 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 15 | import org.apache.hadoop.mapreduce.lib.input.FileSplit; 16 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 17 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 18 | import 
org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 19 | 20 | public class Step3_1 { 21 | 22 | public static class Step3_1_Map extends Mapper{ 23 | String filename; 24 | 25 | static Text k1 = new Text(); 26 | static Text v1 = new Text(); 27 | @Override 28 | protected void setup(Context context) throws IOException,InterruptedException { 29 | // TODO Auto-generated method stub 30 | InputSplit inputsplit = context.getInputSplit(); 31 | filename = ((FileSplit) inputsplit).getPath().getName(); 32 | System.out.println("Step2 FileNme:" + filename); 33 | } 34 | 35 | @Override 36 | protected void map(LongWritable key, Text value, Context context)throws IOException, InterruptedException { 37 | // TODO Auto-generated method stub 38 | String[] arrs = bookRecommend.DELIMITER.split(value.toString()); 39 | for(int i=1; i< arrs.length; i++) 40 | { 41 | String itemID = arrs[i].split(":")[0]; 42 | String score = arrs[i].split(":")[1]; 43 | k1.set(itemID); 44 | v1.set(arrs[0] + ":" + score); 45 | context.write(k1,v1); 46 | } 47 | } 48 | } 49 | public static void run(Map path) throws IOException, URISyntaxException, ClassNotFoundException, InterruptedException { 50 | // TODO Auto-generated method stub 51 | String input_path = path.get("hdfs_step3_1_input"); 52 | String output_path = path.get("hdfs_step3_1_output"); 53 | 54 | hdfsGYT hdfs = new hdfsGYT(); 55 | hdfs.rmr(output_path); 56 | 57 | Job job = new Job(new Configuration(), "Step3_1"); 58 | job.setJarByClass(Step2.class); 59 | 60 | //设置文件路径 61 | FileInputFormat.setInputPaths(job, new Path(input_path)); 62 | FileOutputFormat.setOutputPath(job, new Path(output_path)); 63 | 64 | //设置Map和Reduce类 65 | job.setMapperClass(Step3_1_Map.class); 66 | 67 | //设置map的输出格式 68 | job.setMapOutputKeyClass(Text.class); 69 | job.setMapOutputValueClass(Text.class); 70 | 71 | //设置文件 72 | job.setInputFormatClass(TextInputFormat.class); 73 | job.setOutputFormatClass(TextOutputFormat.class); 74 | 75 | //提交作业 76 | job.waitForCompletion(true); 77 | } 78 | 79 | } 80 | -------------------------------------------------------------------------------- /Hadoop/bookTuijian/Step3_2.java: -------------------------------------------------------------------------------- 1 | package bookTuijian; 2 | 3 | import java.io.IOException; 4 | import java.net.URISyntaxException; 5 | import java.util.Map; 6 | 7 | import org.apache.hadoop.conf.Configuration; 8 | import org.apache.hadoop.fs.Path; 9 | import org.apache.hadoop.io.LongWritable; 10 | import org.apache.hadoop.io.Text; 11 | import org.apache.hadoop.mapreduce.InputSplit; 12 | import org.apache.hadoop.mapreduce.Job; 13 | import org.apache.hadoop.mapreduce.Mapper; 14 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 15 | import org.apache.hadoop.mapreduce.lib.input.FileSplit; 16 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 17 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 18 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 19 | 20 | public class Step3_2 { 21 | 22 | public static class Step3_2_Map extends Mapper{ 23 | 24 | String filename; 25 | static Text k1 = new Text(); 26 | static LongWritable v1 = new LongWritable(); 27 | 28 | @Override 29 | protected void setup(Context context) throws IOException,InterruptedException { 30 | // TODO Auto-generated method stub 31 | InputSplit inputsplit = context.getInputSplit(); 32 | filename = ((FileSplit) inputsplit).getPath().getName(); 33 | System.out.println("Step3_2 FileName : " + filename); 34 | } 35 | 36 | @Override 37 | 
protected void map(LongWritable key, Text value, Context context)throws IOException, InterruptedException { 38 | // TODO Auto-generated method stub 39 | //101:101 5 40 | String[] arrs = bookRecommend.DELIMITER.split(value.toString()); 41 | k1.set(arrs[0]); 42 | v1.set(Integer.parseInt( arrs[1]) ); 43 | context.write(k1, v1); 44 | } 45 | } 46 | 47 | public static void run(Map path) throws IOException, URISyntaxException, ClassNotFoundException, InterruptedException { 48 | // TODO Auto-generated method stub 49 | String input_path = path.get("hdfs_step3_2_input"); 50 | String output_path = path.get("hdfs_step3_2_output"); 51 | 52 | hdfsGYT hdfs = new hdfsGYT(); 53 | hdfs.rmr(output_path); 54 | 55 | Job job = new Job(new Configuration(), "Step2"); 56 | job.setJarByClass(Step2.class); 57 | 58 | //设置文件路径 59 | FileInputFormat.setInputPaths(job, new Path(input_path)); 60 | FileOutputFormat.setOutputPath(job, new Path(output_path)); 61 | 62 | //设置Map和Reduce类 63 | job.setMapperClass(Step3_2_Map.class); 64 | 65 | //设置map的输出格式 66 | job.setMapOutputKeyClass(Text.class); 67 | job.setMapOutputValueClass(LongWritable.class); 68 | 69 | //设置文件 70 | job.setInputFormatClass(TextInputFormat.class); 71 | job.setOutputFormatClass(TextOutputFormat.class); 72 | 73 | //提交作业 74 | job.waitForCompletion(true); 75 | 76 | } 77 | 78 | } 79 | -------------------------------------------------------------------------------- /Hadoop/bookTuijian/Step4.java: -------------------------------------------------------------------------------- 1 | package bookTuijian; 2 | 3 | import java.io.IOException; 4 | import java.net.URISyntaxException; 5 | import java.util.ArrayList; 6 | import java.util.HashMap; 7 | import java.util.Iterator; 8 | import java.util.List; 9 | import java.util.Map; 10 | 11 | import org.apache.hadoop.conf.Configuration; 12 | import org.apache.hadoop.fs.Path; 13 | import org.apache.hadoop.io.LongWritable; 14 | import org.apache.hadoop.io.Text; 15 | import org.apache.hadoop.mapreduce.InputSplit; 16 | import org.apache.hadoop.mapreduce.Job; 17 | import org.apache.hadoop.mapreduce.Mapper; 18 | import org.apache.hadoop.mapreduce.Reducer; 19 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 20 | import org.apache.hadoop.mapreduce.lib.input.FileSplit; 21 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 22 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 23 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 24 | 25 | public class Step4 { 26 | 27 | public static class Step4_Map extends Mapper{ 28 | 29 | String filename; 30 | static Text k1 =new Text(); 31 | static Text value1 = new Text(); 32 | 33 | private final static Map> coocurenceMatrix = new HashMap>(); 34 | 35 | @Override 36 | protected void setup(Context context) throws IOException,InterruptedException { 37 | // TODO Auto-generated method stub 38 | InputSplit inputsplit = context.getInputSplit(); 39 | filename = ((FileSplit) inputsplit).getPath().getName(); 40 | System.out.println("Step4 Filename : " + filename); 41 | } 42 | 43 | @Override 44 | protected void map(LongWritable key, Text value, Context context)throws IOException, InterruptedException { 45 | // TODO Auto-generated method stub 46 | String[] arrs = bookRecommend.DELIMITER.split(value.toString()); 47 | // System.out.println(value.toString() + "=================="); 48 | 49 | String [] v1 = arrs[0].split(":"); 50 | String [] v2 = arrs[1].split(":"); 51 | 52 | if(v1.length>1) //数据来自同现矩阵 53 | { 54 | // 
System.out.println(value.toString()+"++++++++++++++++++++++++++=="); 55 | int itemID1 = Integer.parseInt(v1[0]); 56 | int itemID2 = Integer.parseInt(v1[1]); 57 | int num = Integer.parseInt(arrs[1]); 58 | 59 | List list = null; 60 | if(!coocurenceMatrix.containsKey(itemID1)){ 61 | list = new ArrayList(); 62 | }else{ 63 | list = coocurenceMatrix.get(itemID1 ); 64 | } 65 | list.add(new Coocurence(itemID1, itemID2, num) ); 66 | coocurenceMatrix.put(itemID1, list); 67 | } 68 | if(v2.length>1) //数据来自评分矩阵 69 | { 70 | System.out.println(value.toString()+"-------------------------------"); 71 | int itemID = Integer.parseInt(arrs[0]); 72 | String userID = v2[0]; 73 | double score = Float.parseFloat(v2[1]); 74 | k1.set(userID); 75 | for(Coocurence co : coocurenceMatrix.get(itemID)) 76 | { 77 | value1.set(co.getItemID2() + "," + score * co.getNum()); 78 | context.write(k1, value1); 79 | //itemID1, itemID2 +"," + score * num 80 | } 81 | } 82 | } 83 | } 84 | 85 | public static class Step4_Reduce extends Reducer{ 86 | 87 | private static Text value2 = new Text(); 88 | 89 | @Override 90 | protected void reduce(Text key, Iterable values,Context context) throws IOException, InterruptedException { 91 | // TODO Auto-generated method stub 92 | Map result = new HashMap(); 93 | for (Text text : values) { 94 | String[] arrs =text.toString().split(","); 95 | if (result.containsKey(arrs[0])) 96 | { 97 | result.put(arrs[0], result.get(arrs[0]) + Double.parseDouble(arrs[1])); 98 | }else 99 | { 100 | result.put(arrs[0], Double.parseDouble(arrs[1])); 101 | } 102 | } 103 | Iterator iter = result.keySet().iterator(); 104 | while(iter.hasNext()) 105 | { 106 | String itemID = (String) iter.next(); 107 | double score = result.get(itemID); 108 | value2.set(itemID + "," + score); 109 | context.write(key, value2); 110 | } 111 | } 112 | } 113 | 114 | public static void run(Map path) throws IOException, URISyntaxException, ClassNotFoundException, InterruptedException { 115 | // TODO Auto-generated method stub 116 | String input_1 = path.get("hdfs_step4_input_1"); 117 | String input_2 = path.get("hdfs_step4_input_2"); 118 | String output = path.get("hdfs_step4_output"); 119 | 120 | hdfsGYT hdfs = new hdfsGYT(); 121 | hdfs.rmr(output); 122 | 123 | Job job = new Job( new Configuration(), "Step4"); 124 | job.setJarByClass(Step4.class); 125 | 126 | //设置文件路径 127 | FileInputFormat.setInputPaths(job, new Path(input_2),new Path(input_1)); 128 | FileOutputFormat.setOutputPath(job, new Path(output)); 129 | 130 | //设置Map和Reduce类 131 | job.setMapperClass(Step4_Map.class); 132 | job.setReducerClass(Step4_Reduce.class); 133 | 134 | //设置map的输入输出格式 135 | job.setMapOutputKeyClass(Text.class); 136 | job.setMapOutputValueClass(Text.class); 137 | 138 | //设置reduce的输入输出格式 139 | job.setOutputKeyClass(Text.class); 140 | job.setOutputValueClass(Text.class); 141 | 142 | //设置文件 143 | job.setInputFormatClass(TextInputFormat.class); 144 | job.setOutputFormatClass(TextOutputFormat.class); 145 | 146 | //提交作业 147 | job.waitForCompletion(true); 148 | } 149 | } 150 | 151 | class Coocurence{ 152 | private int itemID1; 153 | private int itemID2; 154 | private int num; 155 | 156 | public Coocurence(int itemID1, int itemID2, int num){ 157 | this.itemID1 = itemID1; 158 | this.itemID2 = itemID2; 159 | this.num = num; 160 | } 161 | 162 | public int getItemID1() { 163 | return itemID1; 164 | } 165 | public void setItemID1(int itemID1) { 166 | this.itemID1 = itemID1; 167 | } 168 | public int getItemID2() { 169 | return itemID2; 170 | } 171 | public void setItemID2(int 
itemID2) { 172 | this.itemID2 = itemID2; 173 | } 174 | public int getNum() { 175 | return num; 176 | } 177 | public void setNum(int num) { 178 | this.num = num; 179 | } 180 | } -------------------------------------------------------------------------------- /Hadoop/bookTuijian/Step4_Updata.java: -------------------------------------------------------------------------------- 1 | package bookTuijian; 2 | 3 | import java.io.IOException; 4 | import java.net.URISyntaxException; 5 | import java.util.HashMap; 6 | import java.util.Iterator; 7 | import java.util.Map; 8 | 9 | import org.apache.hadoop.conf.Configuration; 10 | import org.apache.hadoop.fs.Path; 11 | import org.apache.hadoop.io.LongWritable; 12 | import org.apache.hadoop.io.Text; 13 | import org.apache.hadoop.mapreduce.InputSplit; 14 | import org.apache.hadoop.mapreduce.Job; 15 | import org.apache.hadoop.mapreduce.Mapper; 16 | import org.apache.hadoop.mapreduce.Reducer; 17 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 18 | import org.apache.hadoop.mapreduce.lib.input.FileSplit; 19 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 20 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 21 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 22 | 23 | /* 24 | * 是对Step4的优化,分为矩阵相乘和相加,这一步是相乘 25 | */ 26 | public class Step4_Updata { 27 | 28 | public static class Step4_Updata_Map extends Mapper< LongWritable, Text, Text, Text>{ 29 | 30 | String filename; 31 | @Override 32 | protected void setup(Context context) throws IOException,InterruptedException { 33 | // TODO Auto-generated method stub 34 | InputSplit input = context.getInputSplit(); 35 | filename = ((FileSplit) input).getPath().getParent().getName(); 36 | System.out.println("FileName:" +filename); 37 | } 38 | 39 | @Override 40 | protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { 41 | // TODO Auto-generated method stub 42 | String[] tokens = bookRecommend.DELIMITER.split(value.toString()); //切分 43 | 44 | if(filename.equals("Step3_2") ){ //同现矩阵 45 | String[] v1 = tokens[0].split(":"); 46 | String itemID1 = v1[0]; 47 | String itemID2 = v1[1]; 48 | String num = tokens[1]; 49 | 50 | Text key1 = new Text(itemID1); 51 | Text value1 = new Text("A:" + itemID2 +"," +num); 52 | context.write(key1,value1); 53 | // System.out.println(key1.toString() + "\t" + value1.toString()); 54 | }else{ //评分矩阵 55 | String[] v2 = tokens[1].split(":"); 56 | String itemID = tokens[0]; 57 | String userID = v2[0]; 58 | String score = v2[1]; 59 | 60 | Text key1 = new Text(itemID); 61 | Text value1 = new Text("B:" + userID + "," + score); 62 | context.write(key1,value1); 63 | // System.out.println(key1.toString() + "\t" + value1.toString()); 64 | } 65 | } 66 | } 67 | 68 | public static class Step4_Updata_Reduce extends Reducer{ 69 | 70 | @Override 71 | protected void reduce(Text key, Iterable values,Context context) throws IOException, InterruptedException { 72 | // TODO Auto-generated method stub 73 | // System.out.println(key.toString()+ ":"); 74 | 75 | Map mapA = new HashMap(); 76 | Map mapB = new HashMap(); 77 | 78 | for(Text line : values){ 79 | String val = line.toString(); 80 | // System.out.println(val); 81 | if(val.startsWith("A")){ 82 | String[] kv = bookRecommend.DELIMITER.split(val.substring(2)); 83 | mapA.put(kv[0], kv[1]); //ItemID, num 84 | // System.out.println(kv[0] + "\t" + kv[1] + "--------------1"); 85 | }else if(val.startsWith("B")){ 86 | String[] kv = 
bookRecommend.DELIMITER.split(val.substring(2)); 87 | mapB.put(kv[0], kv[1]); //userID, score 88 | // System.out.println(kv[0] + "\t" + kv[1] + "--------------2"); 89 | } 90 | } 91 | 92 | double result = 0; 93 | Iterator iterA = mapA.keySet().iterator(); 94 | while(iterA.hasNext()){ 95 | String mapkA = (String) iterA.next(); //itemID 96 | int num = Integer.parseInt((String) mapA.get(mapkA)); // num 97 | Iterator iterB = mapB.keySet().iterator(); 98 | while(iterB.hasNext()){ 99 | String mapkB = (String)iterB.next(); //UserID 100 | double score = Double.parseDouble((String) mapB.get(mapkB)); //score 101 | result = num * score; //矩阵乘法结果 102 | 103 | Text key2 = new Text(mapkB); 104 | Text value2 = new Text(mapkA + "," +result); 105 | context.write(key2,value2); //userID \t itemID,result 106 | // System.out.println(key2.toString() + "\t" + value2.toString()); 107 | } 108 | } 109 | } 110 | 111 | } 112 | 113 | public static void run(Map path) throws IOException, URISyntaxException, ClassNotFoundException, InterruptedException { 114 | // TODO Auto-generated method stub 115 | 116 | String input_1 = path.get("hdfs_step4_updata_input"); 117 | String input_2 = path.get("hdfs_step4_updata2_input"); 118 | String output = path.get("hdfs_step4_updata_output"); 119 | 120 | hdfsGYT hdfs = new hdfsGYT(); 121 | hdfs.rmr(output); 122 | 123 | Job job = new Job(new Configuration(), "Step4_updata"); 124 | job.setJarByClass(Step4_Updata.class); 125 | //设置文件输入输出路径 126 | FileInputFormat.setInputPaths(job, new Path(input_1),new Path(input_2)); 127 | FileOutputFormat.setOutputPath(job, new Path(output)); 128 | 129 | //设置map和reduce类 130 | job.setMapperClass(Step4_Updata_Map.class); 131 | job.setReducerClass(Step4_Updata_Reduce.class); 132 | 133 | //设置Map输出 134 | job.setMapOutputKeyClass(Text.class); 135 | job.setMapOutputValueClass(Text.class); 136 | 137 | //设置Reduce输出 138 | job.setOutputKeyClass(Text.class); 139 | job.setOutputValueClass(Text.class); 140 | 141 | //设置文件输入输出 142 | job.setInputFormatClass(TextInputFormat.class); 143 | job.setOutputFormatClass(TextOutputFormat.class); 144 | 145 | job.waitForCompletion(true); 146 | } 147 | 148 | } 149 | -------------------------------------------------------------------------------- /Hadoop/bookTuijian/Step4_Updata2.java: -------------------------------------------------------------------------------- 1 | package bookTuijian; 2 | 3 | import java.io.IOException; 4 | import java.net.URISyntaxException; 5 | import java.util.HashMap; 6 | import java.util.Iterator; 7 | import java.util.Map; 8 | 9 | import org.apache.hadoop.conf.Configuration; 10 | import org.apache.hadoop.fs.Path; 11 | import org.apache.hadoop.io.LongWritable; 12 | import org.apache.hadoop.io.Text; 13 | import org.apache.hadoop.mapreduce.Job; 14 | import org.apache.hadoop.mapreduce.Mapper; 15 | import org.apache.hadoop.mapreduce.Reducer; 16 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 17 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 18 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 19 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 20 | 21 | 22 | public class Step4_Updata2 { 23 | 24 | public static class Step4_Updata2_Map extends Mapper{ 25 | 26 | @Override 27 | protected void map(LongWritable key, Text value, Context context)throws IOException, InterruptedException { 28 | // TODO Auto-generated method stub 29 | String[] tokens = bookRecommend.DELIMITER.split(value.toString()); 30 | Text key1 = new Text(tokens[0]);//userID 31 | Text 
value1 = new Text(tokens[1] + "," + tokens[2]); 32 | context.write(key1, value1); //itemID,result 33 | } 34 | 35 | } 36 | 37 | public static class Step4_Updata_Reduce extends Reducer< Text, Text, Text, Text>{ 38 | 39 | @Override 40 | protected void reduce(Text key, Iterable values, Context context)throws IOException, InterruptedException { 41 | // TODO Auto-generated method stub 42 | Map map = new HashMap(); 43 | 44 | for(Text line: values){ 45 | System.out.println(line.toString()); 46 | String[] tokens = bookRecommend.DELIMITER.split(line.toString()); 47 | String itemID = tokens[0]; 48 | Double result = Double.parseDouble(tokens[1]); 49 | 50 | if(map.containsKey(itemID)){ 51 | map.put(itemID, Double.parseDouble(map.get(itemID).toString()) + result);//矩阵乘法求和计算 52 | }else{ 53 | map.put(itemID, result); 54 | } 55 | } 56 | Iterator iter = map.keySet().iterator(); 57 | while (iter.hasNext()) { 58 | String itemID = (String) iter.next(); 59 | double score = (double) map.get(itemID); 60 | Text v = new Text(itemID + "," + score); 61 | context.write(key, v); 62 | } 63 | } 64 | 65 | } 66 | 67 | public static void run(Map path) throws IOException, URISyntaxException, ClassNotFoundException, InterruptedException { 68 | // TODO Auto-generated method stub 69 | String input = path.get("hdfs_step4_updata2_input"); 70 | String output = path.get("hdfs_step4_updata2_output"); 71 | 72 | hdfsGYT hdfs = new hdfsGYT(); 73 | hdfs.rmr(output); 74 | 75 | Job job = new Job(new Configuration(), "Step4_Updata2"); 76 | job.setJarByClass(Step4_Updata2.class); 77 | 78 | FileInputFormat.addInputPath(job, new Path(input)); 79 | FileOutputFormat.setOutputPath(job, new Path(output)); 80 | 81 | //设置map和reduce类 82 | job.setMapperClass(Step4_Updata2_Map.class); 83 | job.setReducerClass(Step4_Updata_Reduce.class); 84 | 85 | //设置Map输出 86 | job.setMapOutputKeyClass(Text.class); 87 | job.setMapOutputValueClass(Text.class); 88 | 89 | //设置Reduce输出 90 | job.setOutputKeyClass(Text.class); 91 | job.setOutputValueClass(Text.class); 92 | 93 | //设置文件输入输出 94 | job.setInputFormatClass(TextInputFormat.class); 95 | job.setOutputFormatClass(TextOutputFormat.class); 96 | 97 | job.waitForCompletion(true); 98 | } 99 | 100 | } 101 | -------------------------------------------------------------------------------- /Hadoop/bookTuijian/bookRecommend.java: -------------------------------------------------------------------------------- 1 | package bookTuijian; 2 | 3 | import java.io.IOException; 4 | import java.net.URISyntaxException; 5 | import java.util.HashMap; 6 | import java.util.Map; 7 | import java.util.regex.Pattern; 8 | 9 | 10 | public class bookRecommend { 11 | 12 | /** 13 | * @param args 14 | * 驱动程序,控制所有的计算结果 15 | */ 16 | public static final String HDFS = "hdfs://127.0.0.1:9000"; 17 | public static final Pattern DELIMITER = Pattern.compile("[\t,]"); 18 | 19 | public static void main(String[] args) throws IOException, URISyntaxException, ClassNotFoundException, InterruptedException { 20 | // TODO Auto-generated method stub 21 | Map path = new HashMap(); 22 | path.put("local_file", "MyItems/bookTuijian/score.txt"); //本地文件所在的目录 23 | path.put("hdfs_root_file", HDFS+"/mr/bookRecommend/score"); //上传本地文件到HDFS上的存放路径 24 | 25 | path.put("hdfs_step1_input", path.get("hdfs_root_file")); //step1的输入文件存放目录 26 | path.put("hdfs_step1_output", HDFS+"/mr/bookRecommend/step1"); //hdfs上第一步运行的结果存放文件目录 27 | 28 | path.put("hdfs_step2_input", path.get("hdfs_step1_output")); //step2的输入文件目录 29 | path.put("hdfs_step2_output", HDFS+"/mr/bookRecommend/step2"); 
//step2的输出文件目录 30 | 31 | path.put("hdfs_step3_1_input", path.get("hdfs_step1_output")); //构建评分矩阵 32 | path.put("hdfs_step3_1_output", HDFS+"/mr/bookRecommend/Step3_1"); 33 | 34 | path.put("hdfs_step3_2_input", path.get("hdfs_step2_output")); //构建同现矩阵 35 | path.put("hdfs_step3_2_output", HDFS+"/mr/bookRecommend/Step3_2"); 36 | 37 | path.put("hdfs_step4_input_1", path.get("hdfs_step3_1_output")); //计算乘积 38 | path.put("hdfs_step4_input_2", path.get("hdfs_step3_2_output")); 39 | path.put("hdfs_step4_output", HDFS+"/mr/bookRecommend/result"); 40 | 41 | path.put("hdfs_step4_updata_input",path.get("hdfs_step3_1_output")); //Step4进行优化 42 | path.put("hdfs_step4_updata2_input",path.get("hdfs_step3_2_output")); 43 | path.put("hdfs_step4_updata_output", HDFS+"/mr/bookRecommend/Step4_Updata"); 44 | 45 | path.put("hdfs_step4_updata2_input",path.get("hdfs_step4_updata_output")); //Step4进行优化 46 | path.put("hdfs_step4_updata2_output", HDFS+"/mr/bookRecommend/Step4_Updata2"); 47 | 48 | 49 | // Step1.run(path); 50 | // Step2.run(path); 51 | // Step3_1.run(path); //构造评分矩阵 52 | // Step3_2.run(path); //构造同现矩阵 53 | // Step4.run(path); //计算乘积 54 | // Step4_Updata.run(path); 55 | Step4_Updata2.run(path); 56 | System.exit(0); 57 | } 58 | 59 | } 60 | -------------------------------------------------------------------------------- /Hadoop/bookTuijian/hdfsGYT.java: -------------------------------------------------------------------------------- 1 | package bookTuijian; 2 | 3 | import java.io.IOException; 4 | import java.net.URI; 5 | import java.net.URISyntaxException; 6 | 7 | import org.apache.hadoop.conf.Configuration; 8 | import org.apache.hadoop.fs.FSDataInputStream; 9 | import org.apache.hadoop.fs.FileStatus; 10 | import org.apache.hadoop.fs.FileSystem; 11 | import org.apache.hadoop.fs.Path; 12 | import org.apache.hadoop.io.IOUtils; 13 | 14 | public class hdfsGYT { 15 | 16 | private static final String HDFS = "hdfs://127.0.0.1:9000/"; 17 | 18 | public hdfsGYT(String hdfs, Configuration conf ){ 19 | this.hdfsPath = hdfs; 20 | this.conf = conf; 21 | } 22 | 23 | public hdfsGYT() { 24 | // TODO Auto-generated constructor stub 25 | } 26 | 27 | private String hdfsPath; 28 | private Configuration conf = new Configuration() ; 29 | 30 | public static void main(String[] args) throws IOException, URISyntaxException{ 31 | hdfsGYT hdfsgyt = new hdfsGYT(); 32 | String folder = HDFS + "mr/groom_system/small2.csv"; 33 | String local = "/home/thinkgamer/Java/hadoop_shizhan/src/user_thing_tuijian/small2.csv"; 34 | String local1 = "/home/thinkgamer/Java/hadoop_shizhan/src/user_thing_tuijian"; 35 | //判断某个文件夹是否存在 36 | //hdfsgyt.isExist(folder); 37 | //创建文件夹 38 | //hdfsgyt.mkdir(folder); 39 | //删除文件夹 40 | //hdfsgyt.rmr(folder); 41 | //列出所有文件夹 42 | //hdfsgyt.ls(folder); 43 | //递归列出所有文件夹 44 | //hdfsgyt.lsr(folder); 45 | //上传文件 46 | //hdfsgyt.put(local, folder); 47 | //下载文件 48 | //hdfsgyt.get(folder,local1); 49 | //删除文件 50 | //hdfsgyt.rm(folder); 51 | //显示文件 52 | //hdfsgyt.cat(folder); 53 | //重命名文件 54 | // String path1 = HDFS + "mr/output"; 55 | // String path2 = HDFS + "mr/input"; 56 | // hdfsgyt.rename(path1,path2); 57 | } 58 | 59 | //重命名文件 60 | public void rename(String path1, String path2) throws IOException, URISyntaxException { 61 | // TODO Auto-generated method stub 62 | FileSystem fs = FileSystem.get(new URI(HDFS), new Configuration()); 63 | try{ 64 | fs.rename(new Path(path1), new Path(path2 ) ); 65 | System.out.println("Rename " + path1 + " To " + path2 ); 66 | }finally{ 67 | fs.close(); 68 | } 69 | } 70 | 71 | //显示文件 72 | 
public static void cat(String folder) throws IOException, URISyntaxException { 73 | // 与hdfs建立联系 74 | FileSystem fs = FileSystem.get(new URI(HDFS),new Configuration()); 75 | Path path = new Path(folder); 76 | FSDataInputStream fsdis = null; 77 | System.out.println("cat: " + folder); 78 | try { 79 | fsdis =fs.open(path); 80 | IOUtils.copyBytes(fsdis, System.out, 4096, false); 81 | } finally { 82 | IOUtils.closeStream(fsdis); 83 | fs.close(); 84 | } 85 | } 86 | 87 | //删除文件 88 | public static void rm(String folder) throws IOException, URISyntaxException { 89 | //与hdfs建立联系 90 | FileSystem fs = FileSystem.get(new URI(HDFS),new Configuration()); 91 | Path path = new Path(folder); 92 | if(fs.deleteOnExit(path)){ 93 | fs.delete(path); 94 | System.out.println("delete:" + folder); 95 | }else{ 96 | System.out.println("The fiel is not exist!"); 97 | } 98 | fs.close(); 99 | } 100 | 101 | //下载文件 102 | public static void get(String remote, String local) throws IllegalArgumentException, IOException, URISyntaxException { 103 | // 建立联系 104 | FileSystem fs = FileSystem.get(new URI(HDFS), new Configuration()); 105 | fs.copyToLocalFile(new Path(remote), new Path(local)); 106 | System.out.println("Get From : " + remote + " To :" + local); 107 | fs.close(); 108 | } 109 | 110 | //上传文件 111 | public static void put(String local, String remote) throws IOException, URISyntaxException { 112 | // 建立联系 113 | FileSystem fs = FileSystem.get(new URI(HDFS), new Configuration()); 114 | fs.copyFromLocalFile(new Path(local), new Path(remote)); 115 | System.out.println("Put :" + local + " To : " + remote); 116 | fs.close(); 117 | } 118 | 119 | //递归列出所有文件夹 120 | public static void lsr(String folder) throws IOException, URISyntaxException { 121 | //与hdfs建立联系 122 | FileSystem fs = FileSystem.get(new URI(HDFS),new Configuration()); 123 | Path path = new Path(folder); 124 | //得到该目录下的所有文件 125 | FileStatus[] fileList = fs.listStatus(path); 126 | for (FileStatus f : fileList) { 127 | System.out.printf("name: %s | folder: %s | size: %d\n", f.getPath(), f.isDir() , f.getLen()); 128 | try{ 129 | FileStatus[] fileListR = fs.listStatus(f.getPath()); 130 | for(FileStatus fr:fileListR){ 131 | System.out.printf("name: %s | folder: %s | size: %d\n", fr.getPath(), fr.isDir() , fr.getLen()); 132 | } 133 | }finally{ 134 | continue; 135 | } 136 | } 137 | fs.close(); 138 | } 139 | 140 | //列出所有文件夹 141 | public static void ls(String folder) throws IOException, URISyntaxException { 142 | //与hdfs建立联系 143 | FileSystem fs = FileSystem.get(new URI(HDFS),new Configuration()); 144 | Path path = new Path(folder); 145 | //得到该目录下的所有文件 146 | FileStatus[] fileList = fs.listStatus(path); 147 | for (FileStatus f : fileList) { 148 | System.out.printf("name: %s | folder: %s | size: %d\n", f.getPath(), f.isDir() , f.getLen()); 149 | } 150 | fs.close(); 151 | } 152 | 153 | //删除文件夹 154 | public static void rmr(String folder) throws IOException, URISyntaxException { 155 | //与hdfs建立联系 156 | FileSystem fs = FileSystem.get(new URI(HDFS),new Configuration()); 157 | Path path = new Path(folder); 158 | fs.delete(path); 159 | System.out.println("delete:" + folder); 160 | fs.close(); 161 | } 162 | 163 | //创建文件夹 164 | public static void mkdir(String folder) throws IOException, URISyntaxException { 165 | //与hdfs建立联系 166 | FileSystem fs = FileSystem.get(new URI(HDFS),new Configuration()); 167 | Path path = new Path(folder); 168 | if (!fs.exists(path)) { 169 | fs.mkdirs(path); 170 | System.out.println("Create: " + folder); 171 | }else{ 172 | System.out.println("it is have exist:" + 
folder); 173 | } 174 | fs.close(); 175 | } 176 | 177 | //判断某个文件夹是否存在 178 | public static void isExist(String folder) throws IOException, URISyntaxException { 179 | //与hdfs建立联系 180 | FileSystem fs = FileSystem.get(new URI(HDFS),new Configuration()); 181 | Path path = new Path(folder); 182 | if(fs.exists(path)){ 183 | System.out.println("it is have exist:" + folder); 184 | }else{ 185 | System.out.println("it is not exist:" + folder); 186 | } 187 | fs.close(); 188 | } 189 | 190 | } -------------------------------------------------------------------------------- /Hadoop/bookTuijian/uid_to_bid.csv: -------------------------------------------------------------------------------- 1 | 1,101,5.0 2 | 1,102,3.0 3 | 1,103,2.5 4 | 2,101,2.0 5 | 2,102,2.5 6 | 2,103,5.0 7 | 2,104,2.0 8 | 3,101,2.5 9 | 3,104,4.0 10 | 3,105,4.5 11 | 3,107,5.0 12 | 4,101,5.0 13 | 4,103,3.0 14 | 4,104,4.5 15 | 4,106,4.0 16 | 5,101,4.0 17 | 5,102,3.0 18 | 5,103,2.0 19 | 5,104,4.0 20 | 5,105,3.5 21 | 5,106,4.0 -------------------------------------------------------------------------------- /Hadoop/hdfs/hdfsGYT.java: -------------------------------------------------------------------------------- 1 | package WebKPI; 2 | 3 | import java.io.IOException; 4 | import java.net.URI; 5 | import java.net.URISyntaxException; 6 | 7 | import org.apache.hadoop.conf.Configuration; 8 | import org.apache.hadoop.fs.FSDataInputStream; 9 | import org.apache.hadoop.fs.FileStatus; 10 | import org.apache.hadoop.fs.FileSystem; 11 | import org.apache.hadoop.fs.Path; 12 | import org.apache.hadoop.io.IOUtils; 13 | 14 | public class hdfsGYT { 15 | 16 | private static final String HDFS = "hdfs://127.0.0.1:9000/"; 17 | 18 | public hdfsGYT(String hdfs, Configuration conf ){ 19 | this.hdfsPath = hdfs; 20 | this.conf = conf; 21 | } 22 | 23 | public hdfsGYT() { 24 | // TODO Auto-generated constructor stub 25 | } 26 | 27 | private String hdfsPath; 28 | private Configuration conf = new Configuration() ; 29 | 30 | public static void main(String[] args) throws IOException, URISyntaxException{ 31 | hdfsGYT hdfsgyt = new hdfsGYT(); 32 | String folder = HDFS + "mr/groom_system/small2.csv"; 33 | String local = "/home/thinkgamer/Java/hadoop_shizhan/src/user_thing_tuijian/small2.csv"; 34 | String local1 = "/home/thinkgamer/Java/hadoop_shizhan/src/user_thing_tuijian"; 35 | //判断某个文件夹是否存在 36 | //hdfsgyt.isExist(folder); 37 | //创建文件夹 38 | //hdfsgyt.mkdir(folder); 39 | //删除文件夹 40 | //hdfsgyt.rmr(folder); 41 | //列出所有文件夹 42 | //hdfsgyt.ls(folder); 43 | //递归列出所有文件夹 44 | //hdfsgyt.lsr(folder); 45 | //上传文件 46 | //hdfsgyt.put(local, folder); 47 | //下载文件 48 | //hdfsgyt.get(folder,local1); 49 | //删除文件 50 | //hdfsgyt.rm(folder); 51 | //显示文件 52 | //hdfsgyt.cat(folder); 53 | //重命名文件 54 | // String path1 = HDFS + "mr/output"; 55 | // String path2 = HDFS + "mr/input"; 56 | // hdfsgyt.rename(path1,path2); 57 | } 58 | 59 | //重命名文件 60 | public void rename(String path1, String path2) throws IOException, URISyntaxException { 61 | // TODO Auto-generated method stub 62 | FileSystem fs = FileSystem.get(new URI(HDFS), new Configuration()); 63 | try{ 64 | fs.rename(new Path(path1), new Path(path2 ) ); 65 | System.out.println("Rename " + path1 + " To " + path2 ); 66 | }finally{ 67 | fs.close(); 68 | } 69 | } 70 | 71 | //显示文件 72 | public static void cat(String folder) throws IOException, URISyntaxException { 73 | // 与hdfs建立联系 74 | FileSystem fs = FileSystem.get(new URI(HDFS),new Configuration()); 75 | Path path = new Path(folder); 76 | FSDataInputStream fsdis = null; 77 | System.out.println("cat: " 
+ folder); 78 | try { 79 | fsdis =fs.open(path); 80 | IOUtils.copyBytes(fsdis, System.out, 4096, false); 81 | } finally { 82 | IOUtils.closeStream(fsdis); 83 | fs.close(); 84 | } 85 | } 86 | 87 | //删除文件 88 | public static void rm(String folder) throws IOException, URISyntaxException { 89 | //与hdfs建立联系 90 | FileSystem fs = FileSystem.get(new URI(HDFS),new Configuration()); 91 | Path path = new Path(folder); 92 | if(fs.deleteOnExit(path)){ 93 | fs.delete(path); 94 | System.out.println("delete:" + folder); 95 | }else{ 96 | System.out.println("The fiel is not exist!"); 97 | } 98 | fs.close(); 99 | } 100 | 101 | //下载文件 102 | public static void get(String remote, String local) throws IllegalArgumentException, IOException, URISyntaxException { 103 | // 建立联系 104 | FileSystem fs = FileSystem.get(new URI(HDFS), new Configuration()); 105 | fs.copyToLocalFile(new Path(remote), new Path(local)); 106 | System.out.println("Get From : " + remote + " To :" + local); 107 | fs.close(); 108 | } 109 | 110 | //上传文件 111 | public static void put(String local, String remote) throws IOException, URISyntaxException { 112 | // 建立联系 113 | FileSystem fs = FileSystem.get(new URI(HDFS), new Configuration()); 114 | fs.copyFromLocalFile(new Path(local), new Path(remote)); 115 | System.out.println("Put :" + local + " To : " + remote); 116 | fs.close(); 117 | } 118 | 119 | //递归列出所有文件夹 120 | public static void lsr(String folder) throws IOException, URISyntaxException { 121 | //与hdfs建立联系 122 | FileSystem fs = FileSystem.get(new URI(HDFS),new Configuration()); 123 | Path path = new Path(folder); 124 | //得到该目录下的所有文件 125 | FileStatus[] fileList = fs.listStatus(path); 126 | for (FileStatus f : fileList) { 127 | System.out.printf("name: %s | folder: %s | size: %d\n", f.getPath(), f.isDir() , f.getLen()); 128 | try{ 129 | FileStatus[] fileListR = fs.listStatus(f.getPath()); 130 | for(FileStatus fr:fileListR){ 131 | System.out.printf("name: %s | folder: %s | size: %d\n", fr.getPath(), fr.isDir() , fr.getLen()); 132 | } 133 | }finally{ 134 | continue; 135 | } 136 | } 137 | fs.close(); 138 | } 139 | 140 | //列出所有文件夹 141 | public static void ls(String folder) throws IOException, URISyntaxException { 142 | //与hdfs建立联系 143 | FileSystem fs = FileSystem.get(new URI(HDFS),new Configuration()); 144 | Path path = new Path(folder); 145 | //得到该目录下的所有文件 146 | FileStatus[] fileList = fs.listStatus(path); 147 | for (FileStatus f : fileList) { 148 | System.out.printf("name: %s | folder: %s | size: %d\n", f.getPath(), f.isDir() , f.getLen()); 149 | } 150 | fs.close(); 151 | } 152 | 153 | //删除文件夹 154 | public static void rmr(String folder) throws IOException, URISyntaxException { 155 | //与hdfs建立联系 156 | FileSystem fs = FileSystem.get(new URI(HDFS),new Configuration()); 157 | Path path = new Path(folder); 158 | fs.delete(path); 159 | System.out.println("delete:" + folder); 160 | fs.close(); 161 | } 162 | 163 | //创建文件夹 164 | public static void mkdir(String folder) throws IOException, URISyntaxException { 165 | //与hdfs建立联系 166 | FileSystem fs = FileSystem.get(new URI(HDFS),new Configuration()); 167 | Path path = new Path(folder); 168 | if (!fs.exists(path)) { 169 | fs.mkdirs(path); 170 | System.out.println("Create: " + folder); 171 | }else{ 172 | System.out.println("it is have exist:" + folder); 173 | } 174 | fs.close(); 175 | } 176 | 177 | //判断某个文件夹是否存在 178 | public static void isExist(String folder) throws IOException, URISyntaxException { 179 | //与hdfs建立联系 180 | FileSystem fs = FileSystem.get(new URI(HDFS),new Configuration()); 181 | Path path = new 
Path(folder); 182 | if(fs.exists(path)){ 183 | System.out.println("it is have exist:" + folder); 184 | }else{ 185 | System.out.println("it is not exist:" + folder); 186 | } 187 | fs.close(); 188 | } 189 | 190 | } -------------------------------------------------------------------------------- /Hadoop/multiple_In_Out/mulIn-output/part-r-00000: -------------------------------------------------------------------------------- 1 | 中国 3 2 | 中国 我们 3 | 中国 3 4 | 中国 我们 5 | 中国人 很多 6 | 中国人 很多 7 | 美国 32 8 | 美国 它们 9 | 美国 32 10 | 美国 它们 11 | -------------------------------------------------------------------------------- /Hadoop/multiple_In_Out/mulOut-output/china-r-00000: -------------------------------------------------------------------------------- 1 | 中国 3 2 | 中国 我们 3 | -------------------------------------------------------------------------------- /Hadoop/multiple_In_Out/mulOut-output/cpeople-r-00000: -------------------------------------------------------------------------------- 1 | 中国人 很多 2 | -------------------------------------------------------------------------------- /Hadoop/multiple_In_Out/mulOut-output/usa-r-00000: -------------------------------------------------------------------------------- 1 | 美国 32 2 | 美国 它们 3 | -------------------------------------------------------------------------------- /Hadoop/multiple_In_Out/multipleinout.java: -------------------------------------------------------------------------------- 1 | package multiple_In_Out; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.fs.Path; 6 | import org.apache.hadoop.io.LongWritable; 7 | import org.apache.hadoop.io.Text; 8 | import org.apache.hadoop.mapreduce.Job; 9 | import org.apache.hadoop.mapreduce.Mapper; 10 | import org.apache.hadoop.mapreduce.Reducer; 11 | import org.apache.hadoop.mapreduce.lib.input.MultipleInputs; 12 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 13 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 14 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 15 | 16 | public class multipleinout { 17 | 18 | static String input1 = "hdfs://127.0.0.1:9000/mr/input1"; 19 | static String input2 = "hdfs://127.0.0.1:9000/mr/input2"; 20 | static String output = "hdfs://127.0.0.1:9000/mr/output"; 21 | 22 | public static class Map extends Mapper{ 23 | private static Text k = new Text(); 24 | private static Text v = new Text(); 25 | 26 | public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException{ 27 | String[] list = value.toString().split(","); 28 | k.set(list[0]); 29 | v.set(list[1]); 30 | context.write(k, v); 31 | } 32 | } 33 | 34 | public static class Reduce extends Reducer{ 35 | 36 | public void reduce(Text key, Iterable values, Context context) throws IOException, InterruptedException{ 37 | for (Text text : values) { 38 | context.write(key, text); 39 | } 40 | } 41 | 42 | } 43 | /** 44 | * @param args 45 | * @throws IOException 46 | * @throws InterruptedException 47 | * @throws ClassNotFoundException 48 | */ 49 | public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException { 50 | // TODO Auto-generated method stub 51 | Job job = new Job(); 52 | job.setJarByClass(multipleoutput.class); 53 | 54 | job.setMapperClass(Map.class); 55 | job.setMapOutputKeyClass(Text.class); 56 | job.setMapOutputValueClass(Text.class); 57 | 58 | job.setReducerClass(Reduce.class); 59 | job.setOutputKeyClass(Text.class); 60 | job.setOutputValueClass(Text.class); 61 | 62 | 
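// Note on the job-wide input format set on the next line: each MultipleInputs.addInputPath
// call further down registers its own InputFormat and Mapper for its path, so (assuming the
// standard Hadoop MultipleInputs behaviour, which installs a delegating input format) the
// explicit TextInputFormat here effectively acts only as a default.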
job.setInputFormatClass(TextInputFormat.class); 63 | job.setOutputFormatClass(TextOutputFormat.class); 64 | 65 | MultipleInputs.addInputPath(job, new Path(input1), TextInputFormat.class, Map.class); 66 | MultipleInputs.addInputPath(job, new Path(input2), TextInputFormat.class, Map.class); 67 | 68 | FileOutputFormat.setOutputPath(job, new Path(output)); 69 | 70 | System.exit(job.waitForCompletion(true)?0:1); 71 | } 72 | 73 | } 74 | -------------------------------------------------------------------------------- /Hadoop/multiple_In_Out/multipleinput_input: -------------------------------------------------------------------------------- 1 | 中国,我们 2 | 美国,它们 3 | 中国,3 4 | 美国,32 5 | 中国人,很多 6 | -------------------------------------------------------------------------------- /Hadoop/multiple_In_Out/multipleoutput.java: -------------------------------------------------------------------------------- 1 | package multiple_In_Out; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.fs.Path; 6 | import org.apache.hadoop.io.LongWritable; 7 | import org.apache.hadoop.io.Text; 8 | import org.apache.hadoop.mapreduce.Job; 9 | import org.apache.hadoop.mapreduce.Mapper; 10 | import org.apache.hadoop.mapreduce.Reducer; 11 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 12 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 13 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 14 | import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs; 15 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 16 | 17 | public class multipleoutput { 18 | 19 | static String input = "hdfs://127.0.0.1:9000/mr/input"; 20 | static String output = "hdfs://127.0.0.1:9000/mr/output"; 21 | 22 | public static class Map extends Mapper{ 23 | private static Text k = new Text(); 24 | private static Text v = new Text(); 25 | public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException{ 26 | String[] list = value.toString().split(","); 27 | k.set(list[0]); 28 | v.set(list[1]); 29 | context.write(k, v); 30 | } 31 | } 32 | 33 | public static class Reduce extends Reducer{ 34 | private MultipleOutputs mos; 35 | public void setup(Context context){ 36 | mos = new MultipleOutputs(context); 37 | } 38 | 39 | public void reduce(Text key, Iterable values, Context context) throws IOException, InterruptedException{ 40 | String k = key.toString(); 41 | for(Text t : values){ 42 | if("中国".equals(k)){ 43 | System.out.println(t.toString()); 44 | mos.write("china",new Text("中国"), t); 45 | }else if("美国".equals(k)){ 46 | System.out.println(t.toString()); 47 | mos.write("usa",new Text("美国"),t); 48 | }else if("中国人".equals(k)){ 49 | System.out.println(t.toString()); 50 | mos.write("cpeople",new Text("中国人"),t); 51 | } 52 | } 53 | } 54 | 55 | public void cleanup(Context context) throws IOException, InterruptedException{ 56 | mos.close(); 57 | } 58 | } 59 | /** 60 | * @param args 61 | * @throws IOException 62 | * @throws InterruptedException 63 | * @throws ClassNotFoundException 64 | */ 65 | public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException { 66 | // TODO Auto-generated method stub 67 | Job job = new Job(); 68 | job.setJarByClass(multipleoutput.class); 69 | 70 | job.setMapperClass(Map.class); 71 | job.setMapOutputKeyClass(Text.class); 72 | job.setMapOutputValueClass(Text.class); 73 | 74 | job.setReducerClass(Reduce.class); 75 | job.setOutputKeyClass(Text.class); 76 | 
job.setOutputValueClass(Text.class); 77 | 78 | job.setInputFormatClass(TextInputFormat.class); 79 | job.setOutputFormatClass(TextOutputFormat.class); 80 | 81 | MultipleOutputs.addNamedOutput(job, "china", TextOutputFormat.class, Text.class, Text.class); 82 | MultipleOutputs.addNamedOutput(job, "usa", TextOutputFormat.class, Text.class, Text.class); 83 | MultipleOutputs.addNamedOutput(job, "cpeople", TextOutputFormat.class, Text.class, Text.class); 84 | 85 | FileInputFormat.addInputPath(job, new Path(input)); 86 | FileOutputFormat.setOutputPath(job, new Path(output)); 87 | 88 | System.exit(job.waitForCompletion(true)?0:1); 89 | } 90 | 91 | } 92 | -------------------------------------------------------------------------------- /Hadoop/multiple_In_Out/multipleoutput_input: -------------------------------------------------------------------------------- 1 | 中国,我们 2 | 美国,它们 3 | 中国,3 4 | 美国,32 5 | 中国人,很多 6 | -------------------------------------------------------------------------------- /Hadoop/pagerankjisuan/dataEtl.java: -------------------------------------------------------------------------------- 1 | package pagerankjisuan; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileNotFoundException; 6 | import java.io.FileReader; 7 | import java.io.FileWriter; 8 | import java.io.IOException; 9 | 10 | public class dataEtl { 11 | 12 | public static void main() throws IOException { 13 | 14 | File f1 = new File("MyItems/pagerankjisuan/people.csv"); 15 | if(f1.isFile()){ 16 | f1.delete(); 17 | } 18 | File f = new File("MyItems/pagerankjisuan/peoplerank.txt"); 19 | if(f.isFile()){ 20 | f.delete(); 21 | } 22 | //打开文件 23 | File file = new File("MyItems/pagerankjisuan/day7_author100_mess.csv"); 24 | //定义一个文件指针 25 | BufferedReader reader = new BufferedReader(new FileReader(file)); 26 | try { 27 | String line=null; 28 | //判断读取的一行是否为空 29 | while( (line=reader.readLine()) != null) 30 | { 31 | String[] userMess = line.split( "," ); 32 | //第一字段为id,第是个字段为粉丝列表 33 | String userid = userMess[0]; 34 | if(userMess.length!=0){ 35 | if(userMess.length==11) 36 | { 37 | int i=0; 38 | String[] focusName = userMess[10].split("\\|"); // | 为转义符 39 | for (i=1;i < focusName.length; i++) 40 | { 41 | write(userid,focusName[i]); 42 | // System.out.println(userid+ " " + focusName[i]); 43 | } 44 | } 45 | else 46 | { 47 | int j =0; 48 | String[] focusName = userMess[9].split("\\|"); // | 为转义符 49 | for (j=1;j < focusName.length; j++) 50 | { 51 | write(userid,focusName[j]); 52 | // System.out.println(userid+ " " + focusName[j]); 53 | } 54 | } 55 | } 56 | } 57 | } 58 | catch (FileNotFoundException e) { 59 | // TODO Auto-generated catch block 60 | e.printStackTrace(); 61 | } 62 | finally 63 | { 64 | reader.close(); 65 | 66 | //etl peoplerank.txt 67 | for(int i=1;i<=100;i++){ 68 | FileWriter writer = new FileWriter("MyItems/pagerankjisuan/peoplerank.txt",true); 69 | writer.write(i + "\t" + 1 + "\n"); 70 | writer.close(); 71 | } 72 | } 73 | System.out.println("OK.................."); 74 | } 75 | 76 | private static void write(String userid, String nameid) { 77 | // TODO Auto-generated method stub 78 | //定义写文件,按行写入 79 | try { 80 | if(!nameid.contains("\n")){ 81 | FileWriter writer = new FileWriter("MyItems/pagerankjisuan/people.csv",true); 82 | writer.write(userid + "," + nameid + "\n"); 83 | writer.close(); 84 | } 85 | } catch (IOException e) { 86 | // TODO Auto-generated catch block 87 | e.printStackTrace(); 88 | } 89 | } 90 | 91 | } 92 | 
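/*
 * A minimal sketch (the sample values below are hypothetical) of the records this ETL step
 * produces and that the PageRank jobs in this package consume:
 *
 *   day7_author100_mess.csv : "7,...,|12|35|88"      - comma-separated; field 0 is the user id,
 *                                                      the last field (index 10 or 9, depending
 *                                                      on record length) is the '|'-separated
 *                                                      fan list; element 0 of the split is skipped
 *   people.csv              : "7,12" / "7,35" / "7,88" - one edge per line, written by write()
 *   peoplerank.txt          : "7\t1"                  - tab-separated, initial rank 1 for each
 *                                                      of the 100 ids
 */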
-------------------------------------------------------------------------------- /Hadoop/pagerankjisuan/hdfsGYT.java: -------------------------------------------------------------------------------- 1 | package pagerankjisuan; 2 | 3 | import java.io.IOException; 4 | import java.net.URI; 5 | import java.net.URISyntaxException; 6 | 7 | import org.apache.hadoop.conf.Configuration; 8 | import org.apache.hadoop.fs.FSDataInputStream; 9 | import org.apache.hadoop.fs.FileStatus; 10 | import org.apache.hadoop.fs.FileSystem; 11 | import org.apache.hadoop.fs.Path; 12 | import org.apache.hadoop.io.IOUtils; 13 | 14 | public class hdfsGYT { 15 | 16 | private static final String HDFS = "hdfs://127.0.0.1:9000/"; 17 | 18 | public hdfsGYT(String hdfs, Configuration conf ){ 19 | this.hdfsPath = hdfs; 20 | this.conf = conf; 21 | } 22 | 23 | public hdfsGYT() { 24 | // TODO Auto-generated constructor stub 25 | } 26 | 27 | private String hdfsPath; 28 | private Configuration conf = new Configuration() ; 29 | 30 | public static void main(String[] args) throws IOException, URISyntaxException{ 31 | hdfsGYT hdfsgyt = new hdfsGYT(); 32 | String folder = HDFS + "mr/groom_system/small2.csv"; 33 | String local = "/home/thinkgamer/Java/hadoop_shizhan/src/user_thing_tuijian/small2.csv"; 34 | String local1 = "/home/thinkgamer/Java/hadoop_shizhan/src/user_thing_tuijian"; 35 | //判断某个文件夹是否存在 36 | //hdfsgyt.isExist(folder); 37 | //创建文件夹 38 | //hdfsgyt.mkdir(folder); 39 | //删除文件夹 40 | //hdfsgyt.rmr(folder); 41 | //列出所有文件夹 42 | //hdfsgyt.ls(folder); 43 | //递归列出所有文件夹 44 | //hdfsgyt.lsr(folder); 45 | //上传文件 46 | //hdfsgyt.put(local, folder); 47 | //下载文件 48 | //hdfsgyt.get(folder,local1); 49 | //删除文件 50 | //hdfsgyt.rm(folder); 51 | //显示文件 52 | //hdfsgyt.cat(folder); 53 | //重命名文件 54 | // String path1 = HDFS + "mr/output"; 55 | // String path2 = HDFS + "mr/input"; 56 | // hdfsgyt.rename(path1,path2); 57 | } 58 | 59 | //重命名文件 60 | public void rename(String path1, String path2) throws IOException, URISyntaxException { 61 | // TODO Auto-generated method stub 62 | FileSystem fs = FileSystem.get(new URI(HDFS), new Configuration()); 63 | try{ 64 | fs.rename(new Path(path1), new Path(path2 ) ); 65 | System.out.println("Rename " + path1 + " To " + path2 ); 66 | }finally{ 67 | fs.close(); 68 | } 69 | } 70 | 71 | //显示文件 72 | public static void cat(String folder) throws IOException, URISyntaxException { 73 | // 与hdfs建立联系 74 | FileSystem fs = FileSystem.get(new URI(HDFS),new Configuration()); 75 | Path path = new Path(folder); 76 | FSDataInputStream fsdis = null; 77 | System.out.println("cat: " + folder); 78 | try { 79 | fsdis =fs.open(path); 80 | IOUtils.copyBytes(fsdis, System.out, 4096, false); 81 | } finally { 82 | IOUtils.closeStream(fsdis); 83 | fs.close(); 84 | } 85 | } 86 | 87 | //删除文件 88 | public static void rm(String folder) throws IOException, URISyntaxException { 89 | //与hdfs建立联系 90 | FileSystem fs = FileSystem.get(new URI(HDFS),new Configuration()); 91 | Path path = new Path(folder); 92 | if(fs.deleteOnExit(path)){ 93 | fs.delete(path); 94 | System.out.println("delete:" + folder); 95 | }else{ 96 | System.out.println("The fiel is not exist!"); 97 | } 98 | fs.close(); 99 | } 100 | 101 | //下载文件 102 | public static void get(String remote, String local) throws IllegalArgumentException, IOException, URISyntaxException { 103 | // 建立联系 104 | FileSystem fs = FileSystem.get(new URI(HDFS), new Configuration()); 105 | fs.copyToLocalFile(new Path(remote), new Path(local)); 106 | System.out.println("Get From : " + remote + " To :" + local); 107 | 
fs.close(); 108 | } 109 | 110 | //上传文件 111 | public static void put(String local, String remote) throws IOException, URISyntaxException { 112 | // 建立联系 113 | FileSystem fs = FileSystem.get(new URI(HDFS), new Configuration()); 114 | fs.copyFromLocalFile(new Path(local), new Path(remote)); 115 | System.out.println("Put :" + local + " To : " + remote); 116 | fs.close(); 117 | } 118 | 119 | //递归列出所有文件夹 120 | public static void lsr(String folder) throws IOException, URISyntaxException { 121 | //与hdfs建立联系 122 | FileSystem fs = FileSystem.get(new URI(HDFS),new Configuration()); 123 | Path path = new Path(folder); 124 | //得到该目录下的所有文件 125 | FileStatus[] fileList = fs.listStatus(path); 126 | for (FileStatus f : fileList) { 127 | System.out.printf("name: %s | folder: %s | size: %d\n", f.getPath(), f.isDir() , f.getLen()); 128 | try{ 129 | FileStatus[] fileListR = fs.listStatus(f.getPath()); 130 | for(FileStatus fr:fileListR){ 131 | System.out.printf("name: %s | folder: %s | size: %d\n", fr.getPath(), fr.isDir() , fr.getLen()); 132 | } 133 | }finally{ 134 | continue; 135 | } 136 | } 137 | fs.close(); 138 | } 139 | 140 | //列出所有文件夹 141 | public static void ls(String folder) throws IOException, URISyntaxException { 142 | //与hdfs建立联系 143 | FileSystem fs = FileSystem.get(new URI(HDFS),new Configuration()); 144 | Path path = new Path(folder); 145 | //得到该目录下的所有文件 146 | FileStatus[] fileList = fs.listStatus(path); 147 | for (FileStatus f : fileList) { 148 | System.out.printf("name: %s | folder: %s | size: %d\n", f.getPath(), f.isDir() , f.getLen()); 149 | } 150 | fs.close(); 151 | } 152 | 153 | //删除文件夹 154 | public static void rmr(String folder) throws IOException, URISyntaxException { 155 | //与hdfs建立联系 156 | FileSystem fs = FileSystem.get(new URI(HDFS),new Configuration()); 157 | Path path = new Path(folder); 158 | fs.delete(path); 159 | System.out.println("delete:" + folder); 160 | fs.close(); 161 | } 162 | 163 | //创建文件夹 164 | public static void mkdir(String folder) throws IOException, URISyntaxException { 165 | //与hdfs建立联系 166 | FileSystem fs = FileSystem.get(new URI(HDFS),new Configuration()); 167 | Path path = new Path(folder); 168 | if (!fs.exists(path)) { 169 | fs.mkdirs(path); 170 | System.out.println("Create: " + folder); 171 | }else{ 172 | System.out.println("it is have exist:" + folder); 173 | } 174 | fs.close(); 175 | } 176 | 177 | //判断某个文件夹是否存在 178 | public static void isExist(String folder) throws IOException, URISyntaxException { 179 | //与hdfs建立联系 180 | FileSystem fs = FileSystem.get(new URI(HDFS),new Configuration()); 181 | Path path = new Path(folder); 182 | if(fs.exists(path)){ 183 | System.out.println("it is have exist:" + folder); 184 | }else{ 185 | System.out.println("it is not exist:" + folder); 186 | } 187 | fs.close(); 188 | } 189 | 190 | } -------------------------------------------------------------------------------- /Hadoop/pagerankjisuan/prJisuan.java: -------------------------------------------------------------------------------- 1 | package pagerankjisuan; 2 | 3 | import java.io.IOException; 4 | import java.net.URISyntaxException; 5 | import java.util.HashMap; 6 | import java.util.Iterator; 7 | import java.util.Map; 8 | 9 | import org.apache.hadoop.fs.Path; 10 | import org.apache.hadoop.io.LongWritable; 11 | import org.apache.hadoop.io.Text; 12 | import org.apache.hadoop.mapreduce.Job; 13 | import org.apache.hadoop.mapreduce.Mapper; 14 | import org.apache.hadoop.mapreduce.Reducer; 15 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 16 | import 
org.apache.hadoop.mapreduce.lib.input.FileSplit; 17 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 18 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 19 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 20 | public class prJisuan { 21 | 22 | public static class prJisuanMapper extends Mapper { 23 | 24 | private String flag; //tmp1 or result 25 | private static int nums = 100; //页面数 26 | private static Text k =new Text(); 27 | private static Text v =new Text(); 28 | 29 | protected void setup(Context context){ 30 | FileSplit split = (FileSplit) context.getInputSplit(); 31 | flag = split.getPath().getParent().getName(); //判断读的数据集 32 | } 33 | 34 | public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException{ 35 | 36 | if(flag.endsWith("tmp1")){ 37 | String[] tokens = value.toString().split("\t"); 38 | String row =tokens[0]; 39 | String[] vals = tokens[1].split(",");//转置矩阵 40 | for (int i =0;i{ 58 | 59 | public void reduce(Text key, Iterable values,Context context) throws IOException, InterruptedException{ 60 | Map mapA = new HashMap(); 61 | Map mapB = new HashMap(); 62 | float pr = 0f; 63 | for (Text val : values) { 64 | // System.out.println(val.toString()); 65 | String value = val.toString(); 66 | if(value.startsWith("A") ){ 67 | String[] tokenA = value.split(":")[1].split(","); 68 | mapA.put(Integer.parseInt(tokenA[0]), Float.parseFloat(tokenA[1]) ); 69 | } 70 | 71 | if(value.startsWith("B")){ 72 | String[] tokenB = value.split(":")[1].split(","); 73 | mapB.put(Integer.parseInt(tokenB[0]), Float.parseFloat(tokenB[1]) ); 74 | } 75 | } 76 | 77 | Iterator iterA = mapA.keySet().iterator(); 78 | while(iterA.hasNext()){ 79 | int idx = Integer.parseInt( iterA.next().toString() ); 80 | float A = mapA.get(idx); 81 | float B = mapB.get(idx); 82 | pr += A * B; 83 | // System.out.println(idx + " " + A + " " + B); 84 | } 85 | context.write(key,new Text(prjob.scaleFloat(pr))); 86 | } 87 | } 88 | 89 | public static void main(Map path) throws IOException, URISyntaxException, ClassNotFoundException, InterruptedException { 90 | 91 | String input = path.get("tmp1"); 92 | String output = path.get("tmp2"); 93 | String pr = path.get("input_pr"); 94 | 95 | hdfsGYT hdfs = new hdfsGYT(); 96 | hdfs.rmr(output); 97 | 98 | Job job = new Job(); 99 | job.setJarByClass(prJisuan.class); 100 | 101 | //set file input 102 | FileInputFormat.setInputPaths(job, new Path(input), new Path(pr)); 103 | job.setInputFormatClass(TextInputFormat.class); 104 | 105 | //set map 106 | job.setMapperClass(prJisuanMapper.class); 107 | job.setMapOutputKeyClass(Text.class); 108 | job.setMapOutputValueClass(Text.class); 109 | 110 | //set partition 111 | //set combine 112 | //set sort 113 | 114 | //set reduce 115 | job.setReducerClass(prJisuanReducer.class); 116 | job.setOutputKeyClass(Text.class); 117 | job.setOutputValueClass(Text.class); 118 | 119 | //set outputpath 120 | FileOutputFormat.setOutputPath(job, new Path(output)); 121 | job.setOutputFormatClass(TextOutputFormat.class); 122 | 123 | //upload job 124 | job.waitForCompletion(true); 125 | 126 | hdfs.rmr(pr); 127 | hdfs.rename(output, pr); 128 | } 129 | } 130 | -------------------------------------------------------------------------------- /Hadoop/pagerankjisuan/prMatrix.java: -------------------------------------------------------------------------------- 1 | package pagerankjisuan; 2 | 3 | import java.io.IOException; 4 | import java.net.URISyntaxException; 5 | import java.util.Arrays; 6 | 
import java.util.Map; 7 | 8 | import org.apache.hadoop.fs.Path; 9 | import org.apache.hadoop.io.LongWritable; 10 | import org.apache.hadoop.io.Text; 11 | import org.apache.hadoop.mapreduce.Job; 12 | import org.apache.hadoop.mapreduce.Mapper; 13 | import org.apache.hadoop.mapreduce.Reducer; 14 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 15 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 16 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 17 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 18 | 19 | public class prMatrix { 20 | 21 | private static int nums = 100; //页面数 22 | private static float d = 0.85f; //阻尼系数 23 | 24 | private static class MatrixMapper extends Mapper{ 25 | private static final Text k = new Text(); 26 | private static final Text v = new Text(); 27 | public void map(LongWritable key,Text value, Context context) throws IOException, InterruptedException{ 28 | // System.out.println(value.toString()); 29 | String[] tokens = value.toString().split(","); 30 | k.set(tokens[0]); 31 | v.set(tokens[1]); 32 | context.write(k, v); 33 | } 34 | } 35 | 36 | 37 | public static class MatrixReducer extends Reducer{ 38 | 39 | public void reduce(Text key, Iterablevalues, Context context ) throws IOException, InterruptedException{ 40 | float[] G = new float[nums]; //概率矩阵列 41 | Arrays.fill(G, (float)(1-d) / G.length ); //填充矩阵列 42 | 43 | float[] A = new float[nums] ; //近 邻矩阵列 44 | int sum=0; //链出数量 45 | for(Text text :values){ 46 | int idx = Integer.parseInt(text.toString()); 47 | // System.out.println(idx + "idx -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-="); 48 | A[idx-1 ] = 1; 49 | sum ++; 50 | } 51 | 52 | if(sum==0){ //分母不能为0 53 | sum=1; 54 | } 55 | 56 | StringBuilder sb = new StringBuilder(); 57 | for(int i=0;i path) throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException { 68 | 69 | String input = path.get("input"); 70 | String input_pr = path.get("input_pr"); 71 | String output = path.get("tmp1"); 72 | 73 | String page = path.get("page"); 74 | String pr = path.get("pr"); 75 | 76 | hdfsGYT hdfs = new hdfsGYT(); 77 | //创建需要的文件夹 78 | hdfs.rmr(input); 79 | hdfs.rmr(output); 80 | hdfs.mkdir(input); 81 | hdfs.mkdir(input_pr); 82 | //上传文件到指定的目录 内 83 | hdfs.put(page, input); 84 | hdfs.put(pr, input_pr); 85 | 86 | Job job = new Job(); 87 | job.setJarByClass(prMatrix.class); 88 | 89 | job.setInputFormatClass(TextInputFormat.class); 90 | job.setOutputFormatClass(TextOutputFormat.class); 91 | 92 | FileInputFormat.addInputPath(job, new Path(input)); 93 | FileOutputFormat.setOutputPath(job, new Path(output)); 94 | 95 | job.setMapperClass(MatrixMapper.class); 96 | job.setReducerClass(MatrixReducer.class); 97 | 98 | job.setMapOutputKeyClass(Text.class); 99 | job.setMapOutputValueClass(Text.class); 100 | 101 | job.setOutputKeyClass(Text.class); 102 | job.setOutputValueClass(Text.class); 103 | 104 | job.waitForCompletion(true); 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /Hadoop/pagerankjisuan/prNormal.java: -------------------------------------------------------------------------------- 1 | package pagerankjisuan; 2 | 3 | import java.io.IOException; 4 | import java.net.URISyntaxException; 5 | import java.util.ArrayList; 6 | import java.util.List; 7 | import java.util.Map; 8 | 9 | import org.apache.hadoop.fs.Path; 10 | import org.apache.hadoop.io.LongWritable; 11 | import org.apache.hadoop.io.Text; 12 | import org.apache.hadoop.mapreduce.Job; 13 
| import org.apache.hadoop.mapreduce.Mapper; 14 | import org.apache.hadoop.mapreduce.Reducer; 15 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 16 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 17 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 18 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 19 | 20 | public class prNormal { 21 | 22 | public static class normalMapper extends Mapper{ 23 | private static Text k = new Text("1"); 24 | public void map(LongWritable key , Text value , Context context) throws IOException, InterruptedException{ 25 | // System.out.println(value.toString()); 26 | context.write(k,value); 27 | } 28 | } 29 | 30 | public static class normalReducer extends Reducer{ 31 | 32 | public void reduce(Text key, Iterable values, Context context) throws IOException, InterruptedException{ 33 | List list = new ArrayList(); 34 | 35 | float sum = 0f; 36 | for(Text text : values){ 37 | list.add(text.toString()); 38 | 39 | String[] val = text.toString().split("\t"); 40 | float f = Float.parseFloat(val[1]); 41 | sum +=f; 42 | } 43 | 44 | for(String line : list){ 45 | String[] vals = line.split("\t"); 46 | Text k = new Text(vals[0]); 47 | 48 | float f = Float.parseFloat(vals[1]); 49 | Text v = new Text(prjob.scaleFloat ( (float) (f / sum) )); 50 | context.write(k, v); 51 | 52 | // System.out.println(k + ":" + v); 53 | } 54 | } 55 | } 56 | 57 | public static void main(Map path) throws IllegalArgumentException, IOException, ClassNotFoundException, InterruptedException, URISyntaxException { 58 | String input = path.get("input_pr"); 59 | String output = path.get("result"); 60 | 61 | hdfsGYT hdfs = new hdfsGYT(); 62 | hdfs.rmr(output); 63 | 64 | Job job = new Job(); 65 | job.setJarByClass(prNormal.class); 66 | 67 | //set file input 68 | FileInputFormat.addInputPath(job, new Path(input)); 69 | job.setInputFormatClass(TextInputFormat.class); 70 | 71 | //set map 72 | job.setMapperClass(normalMapper.class); 73 | job.setMapOutputKeyClass(Text.class); 74 | job.setMapOutputValueClass(Text.class); 75 | 76 | //set partition 77 | //set combine 78 | //set sort 79 | 80 | //set reduce 81 | job.setReducerClass(normalReducer.class); 82 | job.setOutputKeyClass(Text.class); 83 | job.setOutputValueClass(Text.class); 84 | 85 | //set outputpath 86 | FileOutputFormat.setOutputPath(job, new Path(output)); 87 | job.setOutputFormatClass(TextOutputFormat.class); 88 | 89 | //upload job 90 | job.waitForCompletion(true); 91 | } 92 | } 93 | -------------------------------------------------------------------------------- /Hadoop/pagerankjisuan/prSort.java: -------------------------------------------------------------------------------- 1 | package pagerankjisuan; 2 | 3 | import java.io.IOException; 4 | import java.net.URISyntaxException; 5 | import java.util.Map; 6 | 7 | import org.apache.hadoop.fs.Path; 8 | import org.apache.hadoop.io.FloatWritable; 9 | import org.apache.hadoop.io.IntWritable; 10 | import org.apache.hadoop.io.IntWritable.Comparator; 11 | import org.apache.hadoop.io.Text; 12 | import org.apache.hadoop.io.WritableComparable; 13 | import org.apache.hadoop.mapreduce.Job; 14 | import org.apache.hadoop.mapreduce.Mapper; 15 | import org.apache.hadoop.mapreduce.Reducer; 16 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 17 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 18 | 19 | public class prSort { 20 | /** 21 | * @param args 22 | * @throws IOException 23 | * @throws IllegalArgumentException 24 | * @throws 
InterruptedException 25 | * @throws ClassNotFoundException 26 | */ 27 | public static class myComparator extends Comparator { 28 | @SuppressWarnings("rawtypes") 29 | public int compare( WritableComparable a,WritableComparable b){ 30 | return -super.compare(a, b); 31 | } 32 | public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) { 33 | return -super.compare(b1, s1, l1, b2, s2, l2); 34 | } 35 | } 36 | 37 | public static class sortMap extends Mapper{ 38 | public void map(Object key,Text value,Context context) throws NumberFormatException, IOException, InterruptedException{ 39 | String[] split = value.toString().split("\t"); 40 | context.write(new FloatWritable(Float.parseFloat(split[1])),new IntWritable(Integer.parseInt(split[0])) ); 41 | } 42 | } 43 | public static class Reduce extends Reducer{ 44 | public void reduce(FloatWritable key,Iterablevalues,Context context) throws IOException, InterruptedException{ 45 | for (IntWritable text : values) { 46 | context.write( text,key); 47 | } 48 | } 49 | } 50 | 51 | public static void main(Map path) throws IOException, URISyntaxException, ClassNotFoundException, InterruptedException { 52 | // TODO Auto-generated method stub 53 | 54 | String input = path.get("result"); 55 | String output = path.get("sort"); 56 | hdfsGYT hdfs = new hdfsGYT(); 57 | hdfs.rmr(output); 58 | 59 | Job job = new Job(); 60 | job.setJarByClass(prSort.class); 61 | // 1 62 | FileInputFormat.setInputPaths(job, new Path(input) ); 63 | // 2 64 | job.setMapperClass(sortMap.class); 65 | job.setMapOutputKeyClass(FloatWritable.class); 66 | job.setMapOutputValueClass(IntWritable.class); 67 | // 3 68 | // 4 自定义排序 69 | job.setSortComparatorClass( myComparator.class); 70 | // 5 71 | job.setNumReduceTasks(1); 72 | // 6 73 | job.setReducerClass(Reduce.class); 74 | job.setOutputKeyClass(IntWritable.class); 75 | job.setOutputValueClass(FloatWritable.class); 76 | // 7 77 | FileOutputFormat.setOutputPath(job, new Path(output)); 78 | // 8 79 | System.exit(job.waitForCompletion(true)? 
0 :1 ); 80 | } 81 | } 82 | 83 | -------------------------------------------------------------------------------- /Hadoop/pagerankjisuan/prjob.java: -------------------------------------------------------------------------------- 1 | package pagerankjisuan; 2 | 3 | import java.text.DecimalFormat; 4 | import java.util.HashMap; 5 | import java.util.Map; 6 | 7 | /* 8 | * 调度函数 9 | */ 10 | public class prjob { 11 | 12 | public static final String HDFS = "hdfs://127.0.0.1:9000"; 13 | 14 | public static void main(String[] args) { 15 | Map path= new HashMap(); 16 | 17 | path.put("page" ,"/home/thinkgamer/MyCode/hadoop/MyItems/pagerankjisuan/people.csv"); 18 | path.put("pr" ,"/home/thinkgamer/MyCode/hadoop/MyItems/pagerankjisuan/peoplerank.txt"); 19 | 20 | path.put("input", HDFS + "/mr/blog_analysic_system/people"); // HDFS的目录 21 | path.put("input_pr", HDFS + "/mr/blog_analysic_system/pr"); // pr存储目录 22 | path.put("tmp1", HDFS + "/mr/blog_analysic_system/tmp1"); // 临时目录,存放邻接矩阵 23 | path.put("tmp2", HDFS + "/mr/blog_analysic_system/tmp2"); // 临时目录,计算到得PR,覆盖input_pr 24 | 25 | path.put("result", HDFS + "/mr/blog_analysic_system/result"); // 计算结果的PR 26 | 27 | path.put("sort", HDFS + "/mr/blog_analysic_system/sort"); //最终排序输出的结果 28 | 29 | try { 30 | dataEtl.main(); 31 | prMatrix.main(path); 32 | int iter = 3; // 迭代次数 33 | for (int i = 0; i < iter; i++) { 34 | prJisuan.main(path); 35 | } 36 | prNormal.main(path); 37 | prSort.main(path); 38 | 39 | } catch (Exception e) { 40 | e.printStackTrace(); 41 | } 42 | System.exit(0); 43 | } 44 | 45 | public static String scaleFloat(float f) {// 保留6位小数 46 | DecimalFormat df = new DecimalFormat("##0.000000"); 47 | return df.format(f); 48 | } 49 | } -------------------------------------------------------------------------------- /Hadoop/selfSort/input: -------------------------------------------------------------------------------- 1 | 2013 1 2 | 2013 5 3 | 2014 5 4 | 2014 8 5 | 2015 9 6 | 2015 4 7 | -------------------------------------------------------------------------------- /Hadoop/selfSort/output: -------------------------------------------------------------------------------- 1 | 2015 4 2 | 2015 9 3 | 2014 5 4 | 2014 8 5 | 2013 1 6 | 2013 5 7 | -------------------------------------------------------------------------------- /Hadoop/selfSort/selfSort.java: -------------------------------------------------------------------------------- 1 | package selfSort; 2 | 3 | /* 4 | * 第一列降序,第一列相同时第二列升序 5 | */ 6 | 7 | import java.io.DataInput; 8 | import java.io.DataOutput; 9 | import java.io.IOException; 10 | 11 | import org.apache.hadoop.fs.Path; 12 | import org.apache.hadoop.io.LongWritable; 13 | import org.apache.hadoop.io.Text; 14 | import org.apache.hadoop.io.WritableComparable; 15 | import org.apache.hadoop.mapreduce.Job; 16 | import org.apache.hadoop.mapreduce.Mapper; 17 | import org.apache.hadoop.mapreduce.Reducer; 18 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 19 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 20 | 21 | public class selfSort { 22 | 23 | public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException { 24 | // TODO Auto-generated method stub 25 | 26 | Job job = new Job(); 27 | job.setJarByClass(selfSort.class); 28 | // 1 29 | FileInputFormat.setInputPaths(job, new Path(args[0])); 30 | // 2 31 | job.setMapperClass(Map.class); 32 | job.setMapOutputKeyClass(MyK2.class); 33 | job.setMapOutputValueClass(LongWritable.class); 34 | // 3 35 | // 4 36 | // 5 37 | 
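// (Steps 3 and 4 above are intentionally left empty: no custom partitioner or sort comparator
// is set, because the composite key MyK2 defined below already orders records through its
// compareTo() - first column descending, second column ascending.)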
job.setNumReduceTasks(1); 38 | // 6 39 | job.setReducerClass(Reduce.class); 40 | job.setOutputKeyClass(LongWritable.class); 41 | job.setOutputValueClass(LongWritable.class); 42 | // 7 43 | FileOutputFormat.setOutputPath(job, new Path(args[1])); 44 | // 8 45 | System.exit(job.waitForCompletion(true)? 0 :1 ); 46 | } 47 | public static class Map extends Mapper{ 48 | public void map(Object key, Text value, Context context) throws IOException, InterruptedException{ 49 | String line = value.toString(); 50 | String[] split = line.split("\t"); 51 | MyK2 my = new MyK2(Long.parseLong(split[0]), Long.parseLong(split[1])); 52 | context.write(my, new LongWritable(1)); 53 | } 54 | } 55 | public static class Reduce extends Reducer{ 56 | public void reduce(MyK2 key, Iterable values, Context context) throws IOException, InterruptedException{ 57 | context.write(new LongWritable(key.myk2), new LongWritable(key.myv2)); 58 | } 59 | } 60 | 61 | public static class MyK2 implements WritableComparable{ 62 | 63 | public long myk2; 64 | public long myv2; 65 | 66 | MyK2(){} 67 | 68 | MyK2(long myk2, long myv2){ 69 | this.myk2 = myk2; 70 | this.myv2 = myv2; 71 | } 72 | 73 | @Override 74 | public void readFields(DataInput in) throws IOException { 75 | // TODO Auto-generated method stub 76 | this.myk2 = in.readLong(); 77 | this.myv2 = in.readLong(); 78 | } 79 | 80 | @Override 81 | public void write(DataOutput out) throws IOException { 82 | // TODO Auto-generated method stub 83 | out.writeLong(myk2); 84 | out.writeLong(myv2); 85 | } 86 | 87 | @Override 88 | public int compareTo(MyK2 myk2) { 89 | // TODO Auto-generated method stub 90 | //myk2之差>0 返回-1 <0 返回1 代表 myk2列降序 91 | //myk2之差<0 返回-1 >0 返回1 代表 myk2列升序 92 | long temp = this.myk2 - myk2.myk2; 93 | if(temp>0) 94 | return -1; 95 | else if(temp<0) 96 | return 1; 97 | //控制myv2升序 98 | return (int)(this.myv2 - myk2.myv2); 99 | } 100 | } 101 | } -------------------------------------------------------------------------------- /Hadoop/sort_twice/Intpair.java: -------------------------------------------------------------------------------- 1 | package sort_twice; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | import java.io.IOException; 6 | 7 | import org.apache.hadoop.io.WritableComparable; 8 | 9 | public class Intpair implements WritableComparable{ 10 | int first; 11 | int second; 12 | 13 | public void set(int first,int second){ 14 | this.first = first; 15 | this.second = second; 16 | } 17 | 18 | public int getFirst() { 19 | // TODO Auto-generated method stub 20 | return first; 21 | } 22 | 23 | public int getSecond() { 24 | // TODO Auto-generated method stub 25 | return second; 26 | } 27 | 28 | //序列化,从流中读进二进制转换成InPair 29 | @Override 30 | public void readFields(DataInput in) throws IOException { 31 | // TODO Auto-generated method stub 32 | first = in.readInt(); 33 | second = in.readInt(); 34 | } 35 | 36 | //范序列化,将Intpair转换成二进制输出 37 | @Override 38 | public void write(DataOutput out) throws IOException { 39 | // TODO Auto-generated method stub 40 | out.writeInt(first); 41 | out.writeInt(second); 42 | } 43 | 44 | //先按照first比较再按照second比较 45 | @Override 46 | public int compareTo(Intpair o) { 47 | // TODO Auto-generated method stub 48 | if(first != o.first){ 49 | return first < o.first?-1:1; 50 | }else if(second !=o.second){ 51 | return second < o.second?-1:1; 52 | }else{ 53 | return 0; 54 | } 55 | } 56 | 57 | @Override 58 | //The hashCode() method is used by the HashPartitioner (the default partitioner in MapReduce) 59 | public int hashCode() 60 | { 61 | 
return first+"".hashCode() + second+"".hashCode(); 62 | } 63 | 64 | @Override 65 | public boolean equals(Object right) 66 | { 67 | if (right instanceof Intpair) { 68 | Intpair r = (Intpair) right; 69 | return r.first == first && r.second == second; 70 | } else { 71 | return false; 72 | } 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /Hadoop/sort_twice/groupingComparator.java: -------------------------------------------------------------------------------- 1 | package sort_twice; 2 | 3 | import org.apache.hadoop.io.RawComparator; 4 | import org.apache.hadoop.io.WritableComparator; 5 | 6 | public class groupingComparator implements RawComparator { 7 | @Override 8 | public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) { 9 | return WritableComparator.compareBytes(b1, s1, Integer.SIZE/8, b2, s2, Integer.SIZE/8); 10 | } 11 | @Override 12 | public int compare(Intpair o1, Intpair o2) { 13 | // TODO Auto-generated method stub 14 | int first1 = o1.getFirst(); 15 | int first2 = o2.getFirst(); 16 | return first1 - first2; 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /Hadoop/sort_twice/input: -------------------------------------------------------------------------------- 1 | 20 21 2 | 50 51 3 | 50 52 4 | 50 53 5 | 50 54 6 | 60 51 7 | 60 53 8 | 60 52 9 | 60 56 10 | 60 57 11 | 70 58 12 | 60 61 13 | 70 54 14 | 70 55 15 | 70 56 16 | 70 57 17 | 70 58 18 | 1 2 19 | 3 4 20 | 5 6 21 | 7 82 22 | 203 21 23 | 50 512 24 | 50 522 25 | 50 53 26 | 530 54 27 | 40 511 28 | 20 53 29 | 20 522 30 | 60 56 31 | 60 57 32 | 740 58 33 | 63 61 34 | 730 54 35 | 71 55 36 | 71 56 37 | 73 57 38 | 74 58 39 | 12 211 40 | 31 42 41 | 50 62 42 | 7 8 43 | -------------------------------------------------------------------------------- /Hadoop/sort_twice/myPartition.java: -------------------------------------------------------------------------------- 1 | package sort_twice; 2 | 3 | import org.apache.hadoop.io.IntWritable; 4 | import org.apache.hadoop.mapreduce.Partitioner; 5 | 6 | public class myPartition extends Partitioner { 7 | 8 | @Override 9 | public int getPartition(Intpair key, IntWritable value, int numOfReducer) { 10 | // TODO Auto-generated method stub 11 | 12 | return Math.abs(key.getFirst() * 127) % numOfReducer ; 13 | } 14 | 15 | } 16 | -------------------------------------------------------------------------------- /Hadoop/sort_twice/output: -------------------------------------------------------------------------------- 1 | ------------^^我们是同一个分组的^^----------- 2 | 1 2 3 | ------------^^我们是同一个分组的^^----------- 4 | 3 4 5 | ------------^^我们是同一个分组的^^----------- 6 | 5 6 7 | ------------^^我们是同一个分组的^^----------- 8 | 7 8 9 | 7 82 10 | ------------^^我们是同一个分组的^^----------- 11 | 12 211 12 | ------------^^我们是同一个分组的^^----------- 13 | 20 21 14 | 20 53 15 | 20 522 16 | ------------^^我们是同一个分组的^^----------- 17 | 31 42 18 | ------------^^我们是同一个分组的^^----------- 19 | 40 511 20 | ------------^^我们是同一个分组的^^----------- 21 | 50 51 22 | 50 52 23 | 50 53 24 | 50 53 25 | 50 54 26 | 50 62 27 | 50 512 28 | 50 522 29 | ------------^^我们是同一个分组的^^----------- 30 | 60 51 31 | 60 52 32 | 60 53 33 | 60 56 34 | 60 56 35 | 60 57 36 | 60 57 37 | 60 61 38 | ------------^^我们是同一个分组的^^----------- 39 | 63 61 40 | ------------^^我们是同一个分组的^^----------- 41 | 70 54 42 | 70 55 43 | 70 56 44 | 70 57 45 | 70 58 46 | 70 58 47 | ------------^^我们是同一个分组的^^----------- 48 | 71 55 49 | 71 56 50 | ------------^^我们是同一个分组的^^----------- 51 | 73 57 52 | 
------------^^我们是同一个分组的^^----------- 53 | 74 58 54 | ------------^^我们是同一个分组的^^----------- 55 | 203 21 56 | ------------^^我们是同一个分组的^^----------- 57 | 530 54 58 | ------------^^我们是同一个分组的^^----------- 59 | 730 54 60 | ------------^^我们是同一个分组的^^----------- 61 | 740 58 62 | -------------------------------------------------------------------------------- /Hadoop/sort_twice/sort_twice.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Thinkgamer/Hadoop-Spark-Learning/46f24ae930fc6d426ad14989cb6dbad1e7966d8e/Hadoop/sort_twice/sort_twice.jar -------------------------------------------------------------------------------- /Hadoop/sort_twice/sort_twice.java: -------------------------------------------------------------------------------- 1 | package sort_twice; 2 | 3 | import java.io.IOException; 4 | import java.util.Date; 5 | import java.util.StringTokenizer; 6 | 7 | import org.apache.hadoop.fs.Path; 8 | import org.apache.hadoop.io.IntWritable; 9 | import org.apache.hadoop.io.Text; 10 | import org.apache.hadoop.mapreduce.Job; 11 | import org.apache.hadoop.mapreduce.Mapper; 12 | import org.apache.hadoop.mapreduce.Reducer; 13 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 14 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 15 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 16 | 17 | public class sort_twice { 18 | 19 | /** 20 | * @param args 21 | * @throws IOException 22 | * @throws InterruptedException 23 | * @throws ClassNotFoundException 24 | */ 25 | public static class Map extends Mapper{ 26 | private final Intpair intkey = new Intpair(); 27 | private final IntWritable intvalue = new IntWritable(); 28 | 29 | public void map(Object key, Text value,Context context) throws IOException, InterruptedException{ 30 | StringTokenizer token = new StringTokenizer(value.toString()); 31 | int left = 0; 32 | int right = 0; 33 | while (token.hasMoreElements()){ 34 | left = Integer.parseInt( token.nextToken()); 35 | if(token.hasMoreTokens()) 36 | right = Integer.parseInt(token.nextToken()); 37 | intkey.set(left,right); 38 | intvalue.set(right); 39 | context.write(intkey, intvalue); 40 | } 41 | } 42 | 43 | } 44 | 45 | public static class Reduce extends Reducer{ 46 | private final Text left = new Text(); 47 | private final Text SEPAPATOR= new Text("------------^^我们是同一个分组的^^-----------"); 48 | public void reduce(Intpair key,Iterable values,Context context) throws IOException, InterruptedException{ 49 | left.set(Integer.toString(key.getFirst())); 50 | context.write(SEPAPATOR, null); 51 | for(IntWritable val:values){ 52 | context.write(left, val); 53 | } 54 | } 55 | } 56 | 57 | public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException { 58 | // TODO Auto-generated method stub 59 | Job job = new Job(); 60 | job.setJarByClass(sort_twice.class); 61 | 62 | //1 指定输入文件路径 63 | FileInputFormat.addInputPath(job, new Path(args[0])); 64 | job.setInputFormatClass(TextInputFormat.class); 65 | 66 | //2 设置Map相关 67 | job.setMapperClass(Map.class); 68 | job.setMapOutputKeyClass(Intpair.class); 69 | job.setMapOutputValueClass(IntWritable.class); 70 | 71 | //3 设置分区和reducer数目 72 | job.setPartitionerClass(myPartition.class); 73 | 74 | //4 重写分组函数 75 | job.setGroupingComparatorClass(groupingComparator.class); 76 | 77 | //5 归约处理 78 | //6 指定reducer类 79 | job.setReducerClass(Reduce.class); 80 | job.setOutputKeyClass(Text.class); 81 | job.setOutputValueClass(IntWritable.class); 82 | 
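		// The number of reduce tasks is never set in this job; with Hadoop's default of a
		// single reducer, myPartition maps every key to partition 0, so the custom
		// partitioner only takes effect if job.setNumReduceTasks(n) is added with n > 1.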
83 | //7设置输出路径 84 | FileOutputFormat.setOutputPath(job, new Path(args[1])); 85 | 86 | //8 提交任务 87 | int result = job.waitForCompletion(true)? 0 : 1; //任务开始 88 | 89 | 90 | //输出任务相关的信息 91 | Date start = new Date(); 92 | Date end = new Date(); 93 | float time = (float)(end.getTime()-start.getTime()); 94 | 95 | System.out.println("Job ID:"+job.getJobID()); 96 | System.out.println("Job Name:"+job.getJobName()); 97 | System.out.println("Job StartTime:"+start); 98 | System.out.println("Job EndTime:" + end); 99 | System.out.println("Job 经历的时间:" + time); 100 | System.out.println("Job 是否成功:"+job.isSuccessful()); 101 | System.out.println(result); 102 | } 103 | } -------------------------------------------------------------------------------- /Hadoop/wordcount/wordcount.java: -------------------------------------------------------------------------------- 1 | package wordcount; 2 | 3 | import java.io.IOException; 4 | import java.io.StringReader; 5 | 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.IntWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.Mapper; 11 | import org.apache.hadoop.mapreduce.Reducer; 12 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 13 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 14 | import org.wltea.analyzer.core.IKSegmenter; 15 | import org.wltea.analyzer.core.Lexeme; 16 | 17 | public class wordcount { 18 | 19 | public static class Map extends Mapper{ 20 | private static final Text word = new Text(); 21 | public void map(Object key,Text value,Context context) throws IOException, InterruptedException{ 22 | String line = value.toString(); 23 | StringReader sr=new StringReader(line); 24 | IKSegmenter ik=new IKSegmenter(sr, true); 25 | Lexeme lex=null; 26 | while((lex=ik.next())!=null){ 27 | word.set(lex.getLexemeText()); 28 | System.out.println(lex.getLexemeText() + "\tddddddddddddddddd\t" + "1"); 29 | context.write(new Text(word),new IntWritable(1)); 30 | } 31 | } 32 | } 33 | 34 | public static class Reduce extends Reducer{ 35 | private static final IntWritable result = new IntWritable(); 36 | public void reduce(Text key,Iterablevalues,Context context) throws IOException, InterruptedException{ 37 | int num =0; 38 | for(IntWritable value:values){ 39 | num += value.get(); 40 | } 41 | result.set(num); 42 | System.out.println(key.toString() + "\t................." 
+ num); 43 | context.write(key, result); 44 | } 45 | } 46 | public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException { 47 | // TODO Auto-generated method stub 48 | Job job = new Job(); 49 | job.setJarByClass(wordcount.class); 50 | 51 | job.setNumReduceTasks(1); //设置reduce进程为1个,即output生成一个文件 52 | 53 | job.setMapperClass(Map.class); 54 | job.setReducerClass(Reduce.class); 55 | 56 | job.setOutputKeyClass(Text.class); //为job的输出数据设置key类 57 | job.setOutputValueClass(IntWritable.class); //为job的输出设置value类 58 | 59 | FileInputFormat.addInputPath(job, new Path(args[0])); //设置输入文件的目录 60 | FileOutputFormat.setOutputPath(job,new Path(args[1])); //设置输出文件的目录 61 | 62 | System.exit(job.waitForCompletion(true)?0:1); //提交任务 63 | } 64 | } -------------------------------------------------------------------------------- /Hadoop/二次排序/blogURL.txt: -------------------------------------------------------------------------------- 1 | http://blog.csdn.net/gamer_gyt/article/details/47315405 -------------------------------------------------------------------------------- /Hadoop/二次排序/part-r-00000: -------------------------------------------------------------------------------- 1 | ================================ 2 | 1 2 3 | ================================ 4 | 3 4 5 | ================================ 6 | 5 6 7 | ================================ 8 | 7 8 9 | 7 82 10 | ================================ 11 | 12 211 12 | ================================ 13 | 20 21 14 | 20 53 15 | 20 522 16 | ================================ 17 | 31 42 18 | ================================ 19 | 40 511 20 | ================================ 21 | 50 51 22 | 50 52 23 | 50 53 24 | 50 53 25 | 50 54 26 | 50 62 27 | 50 512 28 | 50 522 29 | ================================ 30 | 60 51 31 | 60 52 32 | 60 53 33 | 60 56 34 | 60 56 35 | 60 57 36 | 60 57 37 | 60 61 38 | ================================ 39 | 63 61 40 | ================================ 41 | 70 54 42 | 70 55 43 | 70 56 44 | 70 57 45 | 70 58 46 | 70 58 47 | ================================ 48 | 71 55 49 | 71 56 50 | ================================ 51 | 73 57 52 | ================================ 53 | 74 58 54 | ================================ 55 | 203 21 56 | ================================ 57 | 530 54 58 | ================================ 59 | 730 54 60 | ================================ 61 | 740 58 62 | -------------------------------------------------------------------------------- /Hadoop/二次排序/sortTwice.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Thinkgamer/Hadoop-Spark-Learning/46f24ae930fc6d426ad14989cb6dbad1e7966d8e/Hadoop/二次排序/sortTwice.jar -------------------------------------------------------------------------------- /Hadoop/二次排序/sortTwice.txt: -------------------------------------------------------------------------------- 1 | 20 21 2 | 50 51 3 | 50 52 4 | 50 53 5 | 50 54 6 | 60 51 7 | 60 53 8 | 60 52 9 | 60 56 10 | 60 57 11 | 70 58 12 | 60 61 13 | 70 54 14 | 70 55 15 | 70 56 16 | 70 57 17 | 70 58 18 | 1 2 19 | 3 4 20 | 5 6 21 | 7 82 22 | 203 21 23 | 50 512 24 | 50 522 25 | 50 53 26 | 530 54 27 | 40 511 28 | 20 53 29 | 20 522 30 | 60 56 31 | 60 57 32 | 740 58 33 | 63 61 34 | 730 54 35 | 71 55 36 | 71 56 37 | 73 57 38 | 74 58 39 | 12 211 40 | 31 42 41 | 50 62 42 | 7 8 43 | -------------------------------------------------------------------------------- /Hadoop/二次排序/sorttwice/IntPair.java: 
-------------------------------------------------------------------------------- 1 | package sortTwice; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | import java.io.IOException; 6 | 7 | import org.apache.hadoop.io.WritableComparable; 8 | 9 | //自己定义的InPair类,实现WritableComparator 10 | public class IntPair implements WritableComparable{ 11 | int left; 12 | int right; 13 | 14 | public void set(int left, int right) { 15 | // TODO Auto-generated method stub 16 | this.left = left; 17 | this.right = right; 18 | } 19 | public int getLeft() { 20 | return left; 21 | } 22 | 23 | public int getRight() { 24 | return right; 25 | } 26 | 27 | //反序列化,从流中读进二进制转换成IntPair 28 | @Override 29 | public void readFields(DataInput in) throws IOException { 30 | // TODO Auto-generated method stub 31 | this.left = in.readInt(); 32 | this.right = in.readInt(); 33 | } 34 | //序列化,将IntPair转换成二进制输出 35 | @Override 36 | public void write(DataOutput out) throws IOException { 37 | // TODO Auto-generated method stub 38 | out.writeInt(left); 39 | out.writeInt(right); 40 | } 41 | 42 | /* 43 | * 为什么要重写equal方法? 44 | * 因为Object的equal方法默认是两个对象的引用的比较,意思就是指向同一内存,地址则相等,否则不相等; 45 | * 如果你现在需要利用对象里面的值来判断是否相等,则重载equal方法。 46 | */ 47 | @Override 48 | public boolean equals(Object obj) { 49 | // TODO Auto-generated method stub 50 | if(obj == null) 51 | return false; 52 | if(this == obj) 53 | return true; 54 | if (obj instanceof IntPair){ 55 | IntPair r = (IntPair) obj; 56 | return r.left == left && r.right==right; 57 | } 58 | else{ 59 | return false; 60 | } 61 | 62 | } 63 | 64 | /* 65 | * 重写equal 的同时为什么必须重写hashcode? 66 | * hashCode是编译器为不同对象产生的不同整数,根据equal方法的定义:如果两个对象是相等(equal)的,那么两个对象调用 hashCode必须产生相同的整数结果, 67 | * 即:equal为true,hashCode必须为true,equal为false,hashCode也必须 为false,所以必须重写hashCode来保证与equal同步。 68 | */ 69 | @Override 70 | public int hashCode() { 71 | // TODO Auto-generated method stub 72 | return left*157 +right; 73 | } 74 | 75 | //实现key的比较 76 | @Override 77 | public int compareTo(IntPair o) { 78 | // TODO Auto-generated method stub 79 | if(left != o.left) 80 | return left{ 28 | private final IntPair intkey = new IntPair(); 29 | private final IntWritable intvalue = new IntWritable(); 30 | public void map(LongWritable key,Text value,Context context) throws IOException, InterruptedException{ 31 | StringTokenizer itr = new StringTokenizer(value.toString()); 32 | int left =0; 33 | int right =0; 34 | while(itr.hasMoreTokens()){ 35 | left = Integer.parseInt(itr.nextToken()); 36 | if(itr.hasMoreTokens()) 37 | right = Integer.parseInt(itr.nextToken()); 38 | intkey.set(left, right); 39 | intvalue.set(right); 40 | context.write(intkey, intvalue); 41 | 42 | } 43 | } 44 | } 45 | 46 | //Reducce类 47 | public static class ST_Reduce extends Reducer{ 48 | private final Text left = new Text(); 49 | private static final Text SEPAPATOR= new Text("================================"); 50 | public void reduce(IntPair key,Iterablevalues,Context context) throws IOException, InterruptedException{ 51 | context.write(SEPAPATOR, null); 52 | left.set(Integer.toString(key.getLeft())); 53 | for(IntWritable val:values){ 54 | context.write(left, val); 55 | } 56 | } 57 | } 58 | 59 | //分区函数类,根据first确定Partition 60 | public static class MyPartitioner extends Partitioner{ 61 | @Override 62 | public int getPartition(IntPair key, IntWritable value, int numOfReduce) { 63 | // TODO Auto-generated method stub 64 | return Math.abs(key.getLeft()*127) % numOfReduce; 65 | } 66 | } 67 | /** 68 | * 在分组比较的时候,只比较原来的key,而不是组合key。 69 | */ 70 | public static class 
MyGroupParator implements RawComparator{ 71 | 72 | @Override 73 | public int compare(IntPair o1 , IntPair o2) { 74 | // TODO Auto-generated method stub 75 | int l = o1.getLeft(); 76 | int r = o2.getRight(); 77 | return l == r ? 0:(l, 42 | stu_score MAP, 43 | stu_friend STRUCT) 44 | comment 'this is complex_student message table' 45 | row format delimited fields terminated by '\t' 46 | COLLECTION ITEMS TERMINATED BY ',' 47 | MAP KEYS TERMINATED BY ':'; 48 | #修改表名字 49 | alter table complex rename to complex_student; 50 | #加载数据 51 | load data local inpath "/home/thinkgamer/MyCode/hive/complex_student" into table complex_student; 52 | 53 | #截断表 :从表或者表分区删除所有行,不指定分区,将截断表中的所有分区,也可以一次指定多个分区,截断多个分区。 54 | truncate table complex_student; 55 | 56 | #查询示例 57 | select stu_mess[0],stu_score["chinese"],stu_friend.a from complex_student; 58 | 结果:thinkgamer 50 cyan 59 | 60 | 61 | 5:创建分区表partition_student 62 | create table partition_student( 63 | id int, 64 | name string, 65 | age int) 66 | comment 'this is student message table' 67 | Partitioned by (grade string,class string) 68 | row format delimited fields terminated by "\t"; 69 | #加载数据 70 | load data local inpath "/home/thinkgamer/MyCode/hive/partiton_student" into table partition_student partition (grade="2013", class="34010301"); 71 | load data local inpath "/home/thinkgamer/MyCode/hive/partiton_student2" into table partition_student partition (grade="2013", class="34010302"); 72 | 73 | 6:桶 74 | 创建临时表 75 | create table student_tmp( 76 | id int, 77 | name string, 78 | age int) 79 | comment 'this is student message table' 80 | row format delimited fields terminated by '\t'; 81 | 82 | 加载数据: 83 | load data local inpath '/home/thinkgamer/MyCode/hive/student.txt' into table student_tmp; 84 | 85 | 创建指定桶的个数的表student_bucket 86 | create table student_bucket(id int, 87 | name string, 88 | age int) 89 | clustered by(id) sorted by(age) into 2 buckets 90 | row format delimited fields terminated by '\t'; 91 | 92 | 设置环境变量: 93 | set hive.enforce.bucketing = true; 94 | 95 | 从student_tmp 装入数据 96 | from student_tmp 97 | insert overwrite table student_bucket 98 | select *; 99 | 100 | -------------------------------------------------------------------------------- /Hive/hiveTableExample/hiveQL~: -------------------------------------------------------------------------------- 1 | 1:创建内部表: 2 | create table student( 3 | id int, 4 | name string, 5 | age int) 6 | comment 'this is student message table' 7 | row format delimited fields terminated by '\t'; 8 | 9 | #从本地加载数据 10 | load data local inpath '/home/thinkgamer/MyCode/hive/student.txt' into table student; 11 | #从HDFS加载数据 12 | load data inpath '/home/thinkgamer/MyCode/hive/student.txt' into table student; 13 | 14 | 2:创建外部表 15 | create external table external_student( 16 | id int, 17 | name string, 18 | age int) 19 | comment 'this is student message table' 20 | row format delimited fields terminated by '\t' 21 | location "/user/hive/external"; 22 | 23 | #加载数据 24 | 直接将源文件放在外部表的目下即可 25 | hdfs dfs -put /home/thinkgamer/MyCode/hive/external_student /user/hive/external 26 | 这种加载方式常常用于当hdfs上有一些历史数据,而我们需要在这些数据上做一些hive的操作时使用。这种方式避免了数据拷贝开销 27 | 28 | 29 | 3:创建copy_student表,并从student表中导入数据 30 | create table copy_student( 31 | id int, 32 | name string, 33 | age int) 34 | comment 'this is student message table' 35 | row format delimited fields terminated by '\t'; 36 | 37 | 导入数据 38 | from student stu insert overwrite table copy_student select *; 39 | 40 | 4:创建复杂类型的表 41 | Create table complex_student(stu_mess ARRAY, 42 | stu_score MAP, 43 | 
stu_friend STRUCT) 44 | comment 'this is complex_student message table' 45 | row format delimited fields terminated by '\t' 46 | COLLECTION ITEMS TERMINATED BY ',' 47 | MAP KEYS TERMINATED BY ':'; 48 | #修改表名字 49 | alter table complex rename to complex_student; 50 | #加载数据 51 | load data local inpath "/home/thinkgamer/MyCode/hive/complex_student" into table complex_student; 52 | 53 | #截断表 :从表或者表分区删除所有行,不指定分区,将截断表中的所有分区,也可以一次指定多个分区,截断多个分区。 54 | truncate table complex_student; 55 | 56 | #查询示例 57 | select stu_mess[0],stu_score["chinese"],stu_friend.a from complex_student; 58 | 结果:thinkgamer 50 cyan 59 | 60 | 61 | 5:创建分区表partition_student 62 | create table partition_student( 63 | id int, 64 | name string, 65 | age int) 66 | comment 'this is student message table' 67 | Partitioned by (grade string,class string) 68 | row format delimited fields terminated by "\t"; 69 | #加载数据 70 | load data local inpath "/home/thinkgamer/MyCode/hive/partiton_student" into table partition_student partition (grade="2013", class="34010301"); 71 | load data local inpath "/home/thinkgamer/MyCode/hive/partiton_student2" into table partition_student partition (grade="2013", class="34010302"); 72 | 73 | 6:桶 74 | 创建临时表 75 | create table student_tmp( 76 | id int, 77 | name string, 78 | age int) 79 | comment 'this is student message table' 80 | row format delimited fields terminated by '\t'; 81 | 82 | 加载数据: 83 | load data local inpath '/home/thinkgamer/MyCode/hive/student.txt' into table student_tmp; 84 | 85 | 创建指定桶的个数的表student_bucket 86 | create table student_bucket(id int, 87 | name string, 88 | age int) 89 | clustered by(id) sorted by(age) into 2 buckets 90 | row format delimited fields terminated by '\t'; 91 | 92 | 设置环境变量: 93 | set hive.enforce.bucketing = true; 94 | 95 | 96 | -------------------------------------------------------------------------------- /Hive/hiveTableExample/partiton_student: -------------------------------------------------------------------------------- 1 | 1 WEEW 23 2 | 2 QVCD 32 3 | 3 sdfw 43 4 | 4 rfwe 12 5 | -------------------------------------------------------------------------------- /Hive/hiveTableExample/partiton_student 2: -------------------------------------------------------------------------------- 1 | 5 hack 43 2 | 6 spring 54 3 | 7 cyan 23 4 | 8 thinkgamer 43 5 | -------------------------------------------------------------------------------- /Hive/hiveTableExample/partiton_student 2~: -------------------------------------------------------------------------------- 1 | 5 hack 43 2 | 6 spring 54 3 | 7 cyan 23 4 | 8 thinkgamer 43 5 | -------------------------------------------------------------------------------- /Hive/hiveTableExample/partiton_student2: -------------------------------------------------------------------------------- 1 | 5 hack 43 2 | 6 spring 54 3 | 7 cyan 23 4 | 8 thinkgamer 43 5 | -------------------------------------------------------------------------------- /Hive/hiveTableExample/partiton_student~: -------------------------------------------------------------------------------- 1 | 1 WEEW 23 2 | 2 QVCD 32 3 | 3 sdfw 43 4 | 4 rfwe 12 5 | -------------------------------------------------------------------------------- /Hive/hiveTableExample/student.txt: -------------------------------------------------------------------------------- 1 | 1 WEEW 23 2 | 2 QVCD 32 3 | 3 sdfw 43 4 | 4 rfwe 12 5 | -------------------------------------------------------------------------------- /Hive/hiveTableExample/student.txt~: 
-------------------------------------------------------------------------------- 1 | 1 WEEW 23 2 | 2 QVCD 32 3 | -------------------------------------------------------------------------------- /Java/Dataguru算法导论/Graph/BFS.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Thinkgamer/Hadoop-Spark-Learning/46f24ae930fc6d426ad14989cb6dbad1e7966d8e/Java/Dataguru算法导论/Graph/BFS.java -------------------------------------------------------------------------------- /Java/Dataguru算法导论/Graph/DFS.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Thinkgamer/Hadoop-Spark-Learning/46f24ae930fc6d426ad14989cb6dbad1e7966d8e/Java/Dataguru算法导论/Graph/DFS.java -------------------------------------------------------------------------------- /Java/Dataguru算法导论/Graph/Dijkstra.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Thinkgamer/Hadoop-Spark-Learning/46f24ae930fc6d426ad14989cb6dbad1e7966d8e/Java/Dataguru算法导论/Graph/Dijkstra.java -------------------------------------------------------------------------------- /Java/Dataguru算法导论/Graph/GraphTest.java: -------------------------------------------------------------------------------- 1 | package Graph; 2 | 3 | import java.util.Scanner; 4 | 5 | /* 6 | * 定义图的结构 7 | */ 8 | class Graph { 9 | static final int MaxNum=20; //最大节点数目 10 | static final int MaxValue=65535; 11 | char[] Vertex = new char[MaxNum]; //定义数组,保存顶点信息 12 | 13 | int GType; //图的类型0:无向图 1:有向图 14 | int VertxNum; //顶点的数量 15 | int EdgeNum; //边的数量 16 | 17 | int[][] EdgeWeight = new int[MaxNum][MaxNum]; //定义矩阵保存顶点信息 18 | int[] isTrav = new int[MaxNum]; //遍历标志 19 | 20 | } 21 | 22 | public class GraphTest { 23 | 24 | /** 25 | * @param args 26 | * Author:thinkgamer 27 | */ 28 | static Scanner scan = new Scanner(System.in); 29 | 30 | //创建邻接矩阵图 31 | static void createGraph(Graph g){ 32 | int i , j , k; 33 | int weight; //权 34 | char EstartV, EndV; //边的起始顶点 35 | 36 | System.out.println("输入途中各顶点的信息"); 37 | for(i=0; i < g.VertxNum; i ++) 38 | { 39 | System.out.println("第" + (i+1) + "个顶点"); 40 | g.Vertex[i] = (scan.next().toCharArray() )[0]; 41 | } 42 | System.out.println("输入构成个遍的顶点和权值"); 43 | for(k=0;k" + g.Vertex[n]); //输出节点数据 91 | //添加处理节点的操作 92 | for(i = 0; i< g.VertxNum; i++) 93 | { 94 | //if(g.EdgeWeight[n][i] != g.MaxValue && g.isTrav[n] == 0) 纠错为 下边一行,感谢网友http://blog.csdn.net/ZyManTou 提示 95 | if(g.EdgeWeight[n][i] != g.MaxValue && g.isTrav[i] == 0) 96 | { 97 | DeepTraOne(g, i); //递归进行遍历 98 | } 99 | } 100 | } 101 | 102 | //深度优先遍历 103 | static void DeepTraGraph(Graph g){ 104 | int i; 105 | for(i = 0; i< g.VertxNum; i++) 106 | { 107 | g.isTrav[i]= 0; 108 | } 109 | System.out.println("深度优先遍历:"); 110 | for(i = 0; i< g.VertxNum ; i++) 111 | { 112 | if(g.isTrav[i] == 0) 113 | DeepTraOne(g,i); 114 | } 115 | System.out.println(); 116 | } 117 | 118 | public static void main(String[] args) { 119 | // TODO Auto-generated method stub 120 | Graph g = new Graph(); 121 | System.out.println("输出生成图的类型:"); 122 | g.GType = scan.nextInt(); //图的种类 123 | 124 | System.out.println("输入图的顶点数量:"); 125 | g.VertxNum = scan.nextInt(); 126 | 127 | System.out.println("输入图的边数量:"); 128 | g.EdgeNum = scan.nextInt(); 129 | 130 | clearGraph(g); //清空图 131 | createGraph(g); //生成邻接表结构的图 132 | System.out.println("该图的邻接矩阵数据如下:"); 133 | OutGraph(g); //输出图 134 | DeepTraGraph(g); //深度优先遍历图 135 | } 136 | 137 | } 138 | 
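/*
 * A minimal, non-interactive sketch of the same adjacency-matrix DFS idea used by
 * GraphTest above: the graph is hard-coded instead of read from the console, so it can
 * be run directly. Class, field and edge choices here are illustrative only.
 */
class DfsSketch {
	static final int NO_EDGE = 65535;              // same "no edge" sentinel as Graph.MaxValue
	static char[] vertex = {'A', 'B', 'C', 'D'};
	static int[][] weight = new int[4][4];
	static int[] visited = new int[4];

	public static void main(String[] args) {
		for (int i = 0; i < 4; i++)                // start with no edges at all
			for (int j = 0; j < 4; j++)
				weight[i][j] = NO_EDGE;
		addEdge(0, 1);                             // undirected edges A-B, A-C, B-D
		addEdge(0, 2);
		addEdge(1, 3);

		System.out.println("DFS order:");
		for (int i = 0; i < 4; i++)                // restart from every still-unvisited vertex
			if (visited[i] == 0)
				dfs(i);
		System.out.println();                      // prints: ->A->B->D->C
	}

	static void addEdge(int u, int v) {
		weight[u][v] = 1;
		weight[v][u] = 1;
	}

	static void dfs(int n) {
		visited[n] = 1;
		System.out.print("->" + vertex[n]);        // visit the node, then recurse into unvisited neighbours
		for (int i = 0; i < 4; i++)
			if (weight[n][i] != NO_EDGE && visited[i] == 0)
				dfs(i);
	}
}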
-------------------------------------------------------------------------------- /Java/Dataguru算法导论/Hash/hash.java: -------------------------------------------------------------------------------- 1 | package Hash; 2 | /* 3 | * Hash函数介绍 4 | * 除下面介绍的集中hash函数外还有取余散列法(m一般选择较大的素数,例如701) h(k) = k mod m 5 | * 乘法散列法(m选择2的计算机的位数(64位或者32位),A为sqrt(5)-1 = 0.618)h(k) = m(kA mod 1 ) 6 | */ 7 | 8 | public class hash { 9 | 10 | //1:RS 11 | public static long RSHash(String str) 12 | { 13 | int b = 378551; 14 | int a = 63689; 15 | long hash = 0; 16 | for(int i = 0; i < str.length(); i++) 17 | { 18 | hash = hash * a + str.charAt(i); 19 | a = a * b; 20 | } 21 | return hash; 22 | } 23 | 24 | //2:JS Justin Sobel写的一个位操作的哈希函数。 25 | public static long JSHash(String str) 26 | { 27 | long hash = 1315423911; 28 | for(int i = 0; i < str.length(); i++) 29 | { 30 | hash ^= ((hash << 5) + str.charAt(i) + (hash >> 2)); 31 | } 32 | return hash; 33 | } 34 | 35 | //3:PJW 该散列算法是基于贝尔实验室的彼得J温伯格的的研究。在Compilers一书中(原则,技术和工具),建议采用这个算法的散列函数的哈希方法。 36 | public static long PJWHash(String str) 37 | { 38 | long BitsInUnsignedInt = (long)(4 * 8); 39 | long ThreeQuarters = (long)((BitsInUnsignedInt * 3) / 4); 40 | long OneEighth = (long)(BitsInUnsignedInt / 8); 41 | long HighBits = (long)(0xFFFFFFFF) << (BitsInUnsignedInt - OneEighth); 42 | long hash = 0; 43 | long test = 0; 44 | for(int i = 0; i < str.length(); i++) 45 | { 46 | hash = (hash << OneEighth) + str.charAt(i); 47 | if((test = hash & HighBits) != 0) 48 | { 49 | hash = (( hash ^ (test >> ThreeQuarters)) & (~HighBits)); 50 | } 51 | } 52 | return hash; 53 | } 54 | 55 | //4:ELF 和PJW很相似,在Unix系统中使用的较多。 56 | public static long ELFHash(String str) 57 | { 58 | long hash = 0; 59 | long x = 0; 60 | for(int i = 0; i < str.length(); i++) 61 | { 62 | hash = (hash << 4) + str.charAt(i); 63 | if((x = hash & 0xF0000000L) != 0) 64 | { 65 | hash ^= (x >> 24); 66 | } 67 | hash &= ~x; 68 | } 69 | return hash; 70 | } 71 | 72 | //5:BKDR 73 | /* 74 | * 这个算法来自Brian Kernighan 和 Dennis Ritchie的 The C Programming Language。 75 | * 这是一个很简单的哈希算法,使用了一系列奇怪的数字,形式如31,3131,31...31,看上去和DJB算法很相似 76 | */ 77 | public static long BKDRHash(String str) 78 | { 79 | long seed = 131; // 31 131 1313 13131 131313 etc.. 
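		// Each character is folded in as hash = hash * seed + ch,
		// e.g. BKDRHash("ab") = 'a' * 131 + 'b' = 97 * 131 + 98 = 12805.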
80 | long hash = 0; 81 | for(int i = 0; i < str.length(); i++) 82 | { 83 | hash = (hash * seed) + str.charAt(i); 84 | } 85 | return hash; 86 | } 87 | 88 | //6:SDBM 这个算法在开源的SDBM中使用,似乎对很多不同类型的数据都能得到不错的分布。 89 | public static long SDBMHash(String str) 90 | { 91 | long hash = 0; 92 | for(int i = 0; i < str.length(); i++) 93 | { 94 | hash = str.charAt(i) + (hash << 6) + (hash << 16) - hash; 95 | } 96 | return hash; 97 | } 98 | 99 | //7:DJB 这个算法是Daniel J.Bernstein 教授发明的,是目前公布的最有效的哈希函数 100 | public static long DJBHash(String str) 101 | { 102 | long hash = 5381; 103 | for(int i = 0; i < str.length(); i++) 104 | { 105 | hash = ((hash << 5) + hash) + str.charAt(i); 106 | } 107 | return hash; 108 | } 109 | 110 | //8:DEK 由伟大的Knuth在《编程的艺术 第三卷》的第六章排序和搜索中给出。 111 | public static long DEKHash(String str) 112 | { 113 | long hash = str.length(); 114 | for(int i = 0; i < str.length(); i++) 115 | { 116 | hash = ((hash << 5) ^ (hash >> 27)) ^ str.charAt(i); 117 | } 118 | return hash; 119 | } 120 | 121 | //9:AP 这是本文作者Arash Partow贡献的一个哈希函数,继承了上面以旋转以为和加操作。代数描述:AP 122 | public static long APHash(String str) 123 | { 124 | long hash = 0xAAAAAAAA; 125 | for(int i = 0; i < str.length(); i++) 126 | { 127 | if ((i & 1) == 0) 128 | { 129 | hash ^= ((hash << 7) ^ str.charAt(i) * (hash >> 3)); 130 | } 131 | else 132 | { 133 | hash ^= (~((hash << 11) + str.charAt(i) ^ (hash >> 5))); 134 | } 135 | } 136 | return hash; 137 | } 138 | 139 | //主函数 140 | public static void main(String[] args) { 141 | String str = "thinkgamer"; 142 | System.out.println("thinkgamer 的 RSHash:" + RSHash(str)); 143 | System.out.println("thinkgamer 的 JSHash:" + JSHash(str)); 144 | System.out.println("thinkgamer 的 PJWHash:" + PJWHash(str)); 145 | System.out.println("thinkgamer 的 ELFHash:" + ELFHash(str)); 146 | System.out.println("thinkgamer 的 BKDRHash:" + BKDRHash(str)); 147 | System.out.println("thinkgamer 的 SDBMHash:" + SDBMHash(str)); 148 | System.out.println("thinkgamer 的 DJBHash:" + DJBHash(str)); 149 | System.out.println("thinkgamer 的 DEKHash:" + DEKHash(str)); 150 | System.out.println("thinkgamer 的 APHash:" + APHash(str)); 151 | } 152 | } 153 | -------------------------------------------------------------------------------- /Java/Dataguru算法导论/Link/DoubleLink.java: -------------------------------------------------------------------------------- 1 | package Link; 2 | 3 | import java.util.Scanner; 4 | 5 | class Data{ //定义链表的一个节点 6 | String key; //节点的关键字,唯一 7 | String name; 8 | int age; 9 | } 10 | 11 | public class DoubleLink { 12 | 13 | 14 | int flag; //输入选择值 15 | Scanner scan = new Scanner(System.in); 16 | Data data = new Data(); 17 | DoubleLink nextNode; //后继节点 18 | DoubleLink priorNode; //前驱节点 19 | 20 | //链表添加节点 21 | DoubleLink addNode(DoubleLink head, String priorKey, String nextKey, Data nodeData){ 22 | 23 | DoubleLink node=null, htemp=null; 24 | if((node = new DoubleLink()) == null) 25 | System.out.println("内存空间分配失败"); 26 | if(head== null) //如果head为空 27 | { 28 | System.out.println("当前链表为空,是否将当前节点当作头节点?\n0:否\t1:是"); 29 | 30 | node.data=nodeData; 31 | node.nextNode=null; 32 | node.priorNode=null; 33 | flag = scan.nextInt(); 34 | switch(flag) 35 | { 36 | case 0: 37 | break; 38 | case 1: 39 | head=node; 40 | break; 41 | default: 42 | System.out.println("你输入的数据不合法");; 43 | } 44 | } //如果head不为空 45 | else{ 46 | if(linkFindNode(head, priorKey,nextKey,nodeData)) 47 | System.out.println("插入成功"); 48 | else 49 | System.out.println("插入失败(原因可能是你输入的前驱和后继即诶但均不存在)"); 50 | } 51 | 52 | return head; 53 | } 54 | 55 | //查找并插入节点 56 | boolean linkFindNode(DoubleLink 
head, String priorKey, String nextKey,Data nodeData) { 57 | // TODO Auto-generated method stub 58 | DoubleLink htemp=null,node=null; 59 | 60 | if( (node = new DoubleLink()) == null ) 61 | { 62 | System.out.println("内存分配失败"); 63 | return false; 64 | } 65 | //将传进来的值赋值给node 66 | node.data = nodeData; 67 | node.nextNode = null; 68 | node.priorNode=null; 69 | //两大类情况 70 | htemp = head; 71 | while(htemp != null) 72 | { 73 | if(htemp.data.key.equals(priorKey)) //前驱节点存在 74 | { 75 | if(htemp.nextNode == null) //该节点的后继节点为空,说明该节点为头节点 76 | { 77 | System.out.println("你输入的后继节点不存在,前驱节点为头节点,是否插入在其后面?\n 1:是 \t 0 :否 "); 78 | flag = scan.nextInt(); 79 | if(flag == 0) 80 | break; 81 | else if(flag==1) 82 | { 83 | htemp.nextNode = node; //将查找到的节点的后继节点指向node 84 | node.nextNode = null; 85 | node.priorNode = htemp; 86 | 87 | return true; 88 | } 89 | else 90 | System.out.println("你输入的数字不合法!!!"); 91 | } 92 | else //后继节点不为空 93 | { 94 | if(htemp.nextNode.data.key.equals(nextKey)) //存在的后继节点与nextKey相同。相同执行if 95 | { 96 | node.nextNode = htemp.nextNode; 97 | htemp.nextNode.priorNode = node; 98 | 99 | htemp.nextNode = node; 100 | node.priorNode = htemp; 101 | return true; 102 | 103 | } 104 | else //不同执行else 105 | { 106 | htemp = htemp.nextNode; //若当前节点没找到,遍历下一个节点 107 | } 108 | } 109 | } 110 | else //前驱节点不存在,后驱节点存在 111 | { 112 | if(htemp.data.key.equals(nextKey)) //如果当前节点与nextKey相同 113 | { 114 | if(htemp.nextNode==null) //如果后继节点为空,即当前节点为尾节点 115 | { 116 | System.out.println("你输入的前驱节点不存在,后继节点为头节点,是否插入在其前面?\n 1:是 \t 0 :否 "); 117 | flag = scan.nextInt(); 118 | if(flag == 0) 119 | break; 120 | else if(flag==1) 121 | { 122 | htemp.priorNode = node; 123 | node.nextNode = htemp; 124 | 125 | node.priorNode=null; 126 | return true; 127 | } 128 | else 129 | System.out.println("你输入的数字不合法!!!"); 130 | } 131 | else //如果当前节点的后继节点不为空,则执行下一个节点 132 | { 133 | htemp = htemp.nextNode; //若当前节点没找到,遍历下一个节点 134 | } 135 | } 136 | else 137 | htemp = htemp.nextNode; //若当前节点没找到,遍历下一个节点 138 | } 139 | } 140 | return false; 141 | } 142 | 143 | //输出节点 144 | public void OutputLinkNode(DoubleLink head) 145 | { 146 | if(head == null) 147 | System.out.println("当前链表为空"); 148 | else{ 149 | System.out.println("输入的链表数据如下:"); 150 | DoubleLink htemp; 151 | htemp = head; 152 | while(htemp!=null) 153 | { 154 | System.out.println(htemp.data.key + "\t" + htemp.data.name + "\t" + htemp.data.age); 155 | htemp= htemp.nextNode; 156 | } 157 | } 158 | System.out.println(); 159 | } 160 | 161 | //输出链表的深度 162 | int LinkDepth(DoubleLink head) 163 | { 164 | int sum = 0; 165 | DoubleLink htemp = head; 166 | while(htemp!=null) 167 | { 168 | sum ++; 169 | htemp = htemp.nextNode; 170 | } 171 | return sum; 172 | } 173 | 174 | //查找节点 175 | DoubleLink FindLink(DoubleLink head, String findKey) 176 | { 177 | DoubleLink htemp=head; 178 | while(htemp!=null) 179 | { 180 | if(htemp.data.key.equals(findKey)) 181 | return htemp; 182 | htemp = htemp.nextNode; 183 | } 184 | return null; 185 | } 186 | 187 | //删除节点 188 | DoubleLink DeleteNode(DoubleLink head, String deleteKey) 189 | { 190 | DoubleLink htemp = head; 191 | while(htemp!=null) 192 | { 193 | if(htemp.data.key.equals(deleteKey)) 194 | { 195 | if(htemp.priorNode==null) //如果是头节点 196 | { 197 | return htemp.nextNode; 198 | } 199 | else if (htemp.nextNode==null) //如果是尾节点 200 | { 201 | htemp.priorNode.nextNode=null; 202 | htemp.priorNode=null; 203 | return head; 204 | } 205 | else //如果是中间 206 | { 207 | htemp.priorNode.nextNode=htemp.nextNode; 208 | htemp.nextNode.priorNode = htemp.priorNode; 209 | return head; 210 | } 211 | } 212 | else 213 | 
htemp = htemp.nextNode; 214 | } 215 | System.out.println("你要删除的节点不存在!"); 216 | return head; 217 | } 218 | 219 | } 220 | -------------------------------------------------------------------------------- /Java/Dataguru算法导论/Link/DoubleLinkTest.java: -------------------------------------------------------------------------------- 1 | package Link; 2 | 3 | import java.util.Scanner; 4 | 5 | public class DoubleLinkTest { 6 | 7 | public static void main(String[] args) { 8 | 9 | DoubleLink node=null, head=null; 10 | DoubleLink dlink = new DoubleLink(); //声明一个双向链表对象 11 | Scanner scan = new Scanner(System.in); 12 | 13 | System.out.println("双向链表测试开始...."); 14 | do{ 15 | System.out.println("请输入插入节点的关键字,姓名和年龄,格式为:关键字 姓名 年龄"); 16 | Data data = new Data(); 17 | data.key = scan.next(); 18 | data.name = scan.next(); 19 | data.age = scan.nextInt(); 20 | 21 | if(data.key.contains("0")) //循环插入节点,直到插入的为0时结束 22 | break; 23 | else 24 | { 25 | System.out.println("请输入插入节点的前驱节点和后继节点,格式为 前驱节点 后继节点"); 26 | String priorKey = scan.next(); 27 | String nextKey = scan.next(); 28 | 29 | head = dlink.addNode(head, priorKey, nextKey, data); //添加节点 30 | dlink.OutputLinkNode(head); //输出链表 31 | } 32 | }while(true); 33 | 34 | //输出链表的深度 35 | System.out.println("该链表的深度为:" + dlink.LinkDepth(head)); 36 | 37 | //查找链表中的某个节点 38 | System.out.println("请输入要查找的节点的关键字..."); 39 | String findKey = scan.next(); 40 | node = dlink.FindLink(head, findKey); 41 | if(node==null) 42 | System.out.println("你所查找的节点不存在!"); 43 | else 44 | System.out.println("该节点的值为:" + node.data.key + "\t" + node.data.name + "\t" + node.data.age); 45 | 46 | //删除节点值 47 | System.out.println("请输入要删除的节点的关键字..."); 48 | String deleteKey = scan.next(); 49 | node = dlink.DeleteNode(head, deleteKey); 50 | if(node == null) 51 | System.out.println("删除节点后的链表为空,其深度为:" + 0); 52 | else 53 | { 54 | System.out.println("删除后的链表为:"); 55 | dlink.OutputLinkNode(head); 56 | System.out.println("删除节点后链表的深度为:" + dlink.LinkDepth(head)); 57 | } 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /Java/Dataguru算法导论/Link/Link.java: -------------------------------------------------------------------------------- 1 | package Link; 2 | 3 | class DATA{ //定义链表的一个节点 4 | String key; //节点的关键字 5 | String name; 6 | int age; 7 | } 8 | 9 | public class Link { //定义链表结构 10 | 11 | DATA nodeData = new DATA(); //声明一个节点 12 | Link nextNode; //指向下一个节点的指针 13 | 14 | 15 | //添加节点 16 | Link linkAddEnd(Link head, DATA nodeData) 17 | { 18 | Link node, hTemp; 19 | if( (node = new Link()) ==null) //如果内存空间分配失败,则返回为空 20 | { 21 | System.out.println("内存空间分配失败!"); 22 | return null; 23 | } 24 | else 25 | { 26 | node.nodeData = nodeData; 27 | node.nextNode = null; 28 | if(head == null) //如果头节点为空,则把当前节点赋给head,并返回 29 | { 30 | head = node; 31 | return head; 32 | } 33 | hTemp = head; //如果头节点不为空 34 | while(hTemp.nextNode!=null) //查找链表的末尾 35 | { 36 | hTemp = hTemp.nextNode; 37 | } 38 | hTemp.nextNode = node; 39 | return head; 40 | } 41 | } 42 | 43 | //插入头节点 44 | Link linkAddFirst(Link head, DATA nodeData) 45 | { 46 | Link node; 47 | if((node=new Link()) == null ) //如果内存空间分配失败,则返回为空 48 | { 49 | System.out.println("内存分配失败"); 50 | return null; 51 | } 52 | else 53 | { 54 | node.nodeData = nodeData; 55 | node.nextNode = head; 56 | head = node; 57 | return head; 58 | } 59 | } 60 | 61 | //查找节点 62 | Link linkFindNode(Link head, String key) 63 | { 64 | Link hTemp; 65 | hTemp = head; 66 | while(hTemp!=null) //若节点有效,则进行查找 67 | { 68 | if(hTemp.nodeData.key.compareTo(key) == 0) //若节点的关键字与传入的关键字相同 69 | { 70 | 
return hTemp; 71 | } 72 | hTemp = hTemp.nextNode; //处理下一个节点 73 | } 74 | return null; 75 | } 76 | 77 | //插入节点 78 | Link linkInsertNode(Link head, String findKey,DATA nodeData) 79 | { 80 | Link node,hTemp; 81 | if((node = new Link() ) == null ) //分配内存失败,则返回 82 | { 83 | System.out.println("分配内存失败..."); 84 | return null; 85 | } 86 | node.nodeData = nodeData; //保存当前集节点信息 87 | hTemp = linkFindNode(head, findKey); //查找要插入的节点 88 | if(hTemp != null) 89 | { 90 | node.nextNode = hTemp.nextNode; 91 | hTemp.nextNode = node; 92 | } 93 | else 94 | { 95 | System.out.println("未找到正确的插入位置........."); 96 | } 97 | return head; //返回头引用 98 | } 99 | 100 | //删除节点 101 | int linkDeleteNode(Link head, String key) 102 | { 103 | Link node,hTemp; 104 | hTemp = head; 105 | node = head; 106 | while(hTemp != null ) 107 | { 108 | if(hTemp.nodeData.key.compareTo(key) == 0) //若找到关键字,则删除 109 | { 110 | node.nextNode = hTemp.nextNode; 111 | hTemp = null; 112 | return 1; 113 | } 114 | else //跳到下一个节点 115 | { 116 | node = hTemp; 117 | hTemp = hTemp.nextNode; 118 | } 119 | } 120 | return 0; 121 | } 122 | 123 | //计算链表长度 124 | int linkLength(Link head) 125 | { 126 | Link hTemp; 127 | hTemp = head; 128 | int num = 0; 129 | while(hTemp!=null) 130 | { 131 | num ++ ; 132 | hTemp = hTemp.nextNode; 133 | } 134 | return num; 135 | } 136 | 137 | //显示所有节点 138 | void linkShow(Link head) 139 | { 140 | Link hTemp; 141 | DATA nodeData; 142 | hTemp = head; 143 | System.out.printf("当前链表共有 %d 个节点,链表所有的数据如下:\n" , linkLength(head)); 144 | while(hTemp!=null) 145 | { 146 | nodeData = hTemp.nodeData; //获取当前的节点数据 147 | System.out.printf("节点(%s %s %d)\n",nodeData.key,nodeData.name,nodeData.age); 148 | hTemp = hTemp.nextNode; 149 | } 150 | } 151 | 152 | } -------------------------------------------------------------------------------- /Java/Dataguru算法导论/Link/linkTest.java: -------------------------------------------------------------------------------- 1 | package Link; 2 | 3 | import java.util.Scanner; 4 | 5 | public class linkTest { 6 | 7 | public static void main(String[] args) { 8 | Link node = null , head=null; 9 | Link link = new Link(); 10 | String key, findKey; 11 | Scanner input = new Scanner(System.in); 12 | 13 | System.out.printf("链表测试开始,先输出链表中的数据,格式为:关键字 姓名 年龄\n"); 14 | do 15 | { //循环插入节点,知道输入的key 为0 结束 16 | DATA nodeData = new DATA(); 17 | nodeData.key = input.next(); 18 | if(nodeData.key.equals("0")) 19 | { 20 | break; 21 | } 22 | else 23 | { 24 | nodeData.name = input.next(); 25 | nodeData.age = input.nextInt(); 26 | head = link.linkAddEnd(head, nodeData); //在链表尾部添加节点 27 | } 28 | }while(true); 29 | link.linkShow(head); //显示所有节点 30 | 31 | System.out.printf("\n演示插入节点,输入插入位置的关键字:"); 32 | findKey = input.next(); //输入插入的关键字 33 | System.out.println("输入插入节点的数据(关键字 姓名 年龄)"); 34 | DATA nodeData = new DATA(); //输入节点的元素值 35 | nodeData.key = input.next(); 36 | nodeData.name = input.next(); 37 | nodeData.age = input.nextInt(); 38 | head = link.linkInsertNode(head, findKey, nodeData); //调用插入函数 39 | link.linkShow(head); //显示所有节点 40 | 41 | System.out.println("演示删除节点,输入要删除的关键字:"); 42 | key = input.next(); 43 | link.linkDeleteNode(head, key); //调用删除节点的函数 44 | link.linkShow(head); //显示所有节点 45 | 46 | System.out.println("演示在链表中差找,输入要查找的关键字:"); 47 | key = input.next(); 48 | node = link.linkFindNode(head, key); //调用查找函数,返回节点引用 49 | if(node!=null) 50 | { 51 | nodeData = node.nodeData; //获取节点的数据 52 | System.out.printf("关键字 %s 对应的节点数据为 (%s %s %s)\n", key,nodeData.key,nodeData.name,nodeData.age); 53 | } 54 | else 55 | { 56 | System.out.printf("在链表中为查找的为%s 的关键字 \n" , 
key); 57 | } 58 | 59 | 60 | } 61 | 62 | } 63 | -------------------------------------------------------------------------------- /Java/Dataguru算法导论/Matrix/matrixCheng.java: -------------------------------------------------------------------------------- 1 | package Matrix; 2 | 3 | /* 4 | * 方阵相乘 5 | * strassen,矩阵分块思想 6 | */ 7 | public class matrixCheng { 8 | //用于计算的两个数组 9 | static int[][] a = { 10 | {1,2,3}, 11 | {2,3,4}, 12 | {3,4,5} 13 | }; 14 | static int[][] b = { 15 | {3,4,5}, 16 | {4,5,6}, 17 | {5,6,7} 18 | }; 19 | static int[][] c = { {0,0,0}, {0,0,0}, {0,0,0} }; //用来存放a 与 b相乘的值 20 | 21 | public static void main(String[] args) { 22 | 23 | //正常计算规则计算 24 | normalCheng(); 25 | } 26 | 27 | private static void normalCheng() { 28 | // TODO Auto-generated method stub 29 | for(int line_a = 0 ; line_a< a.length; line_a ++ ) 30 | { 31 | for(int line_b =0 ; line_b< b.length; line_b++) 32 | { 33 | c[line_a][line_b] = 0; 34 | for (int k = 0 ;k< b[0].length; k ++) 35 | { 36 | c[line_a][line_b]= c[line_a][line_b] + a[line_a][k] * b[k][line_b]; 37 | } 38 | } 39 | } 40 | printMatrix(); 41 | 42 | } 43 | 44 | //打印出得到的乘积 45 | private static void printMatrix() { 46 | // TODO Auto-generated method stub 47 | //打印a 48 | System.out.println("a 矩阵:"); 49 | for(int i =0; i< a.length; i++) 50 | { 51 | for(int j =0;j < a[0].length; j ++) 52 | System.out.print( a[i][j] + "\t"); 53 | System.out.println(); 54 | } 55 | //打印b 56 | System.out.println("b 矩阵:"); 57 | for(int i =0; i< b.length; i++) 58 | { 59 | for(int j =0;j < b[0].length; j ++) 60 | System.out.print( b[i][j] + "\t"); 61 | System.out.println(); 62 | } 63 | //打印乘积矩阵 64 | System.out.println("乘积矩阵:"); 65 | for(int i =0 ; i< c.length; i ++) 66 | { 67 | for (int j =0;j max) 37 | { 38 | max = sum; 39 | startIndex = i; 40 | endIndex = j; 41 | } 42 | } 43 | } 44 | System.out.println("Max sum is :" + max); //输出最大子数组和 45 | printMaxArr(startIndex, endIndex); //输出最大子数组 46 | } 47 | 48 | //算法复杂度 n 49 | private static void findMaxArr3() 50 | { 51 | // TODO Auto-generated method stub 52 | int max = arr[0]; 53 | int sum = 0; 54 | int startIndex = 0; //记录最大子串的起始位置 55 | int endIndex = 0 ; // 记录最大子串的结束位置 56 | for ( int i =0 ; i< maxIndex; i ++) 57 | { 58 | if ( sum >= 0) 59 | { 60 | sum += arr[i]; 61 | } 62 | else 63 | { 64 | sum = arr[i]; 65 | startIndex = i; 66 | } 67 | if(sum > max) 68 | { 69 | max = sum; 70 | endIndex = i; 71 | } 72 | } 73 | System.out.println("Max sum is :" + max); 74 | printMaxArr(startIndex, endIndex); 75 | 76 | } 77 | 78 | //输出最大子数组 79 | private static void printMaxArr(int startIndex, int endIndex) { 80 | // TODO Auto-generated method stub 81 | for(int i =startIndex ; i<= endIndex; i ++) 82 | System.out.print( arr[i] + "\t"); 83 | } 84 | 85 | 86 | } 87 | -------------------------------------------------------------------------------- /Java/Dataguru算法导论/Queue/Queue.java: -------------------------------------------------------------------------------- 1 | package Queue; 2 | 3 | /* 4 | * 使用java构建队列,并模拟实现队列的入队和出对方法 5 | */ 6 | 7 | public class Queue { //队列类 8 | 9 | private int maxSize; //定义队列的长度 10 | private int[] arrQueue; //队列 11 | private int rear; //定义队列的尾指针 12 | private int front; //定义队列的头指针 13 | private int empty; //元素的个数 14 | 15 | public Queue(int s) //初始化构造函数 16 | { 17 | maxSize = s; 18 | arrQueue = new int[s]; 19 | rear = -1; 20 | front=0; 21 | empty = 0; 22 | } 23 | 24 | //实现插入方法 25 | public void insert(int m) 26 | { 27 | if(rear == maxSize-1) //处理循环 28 | rear = -1; 29 | arrQueue[++rear] = m; //对尾指针加一,把值放在队列结尾 30 | empty++; //队列元素个数加1 31 | 
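		// Note: insert() does not check isFull() and remove() does not check isEmpty();
		// inserting into a full queue silently overwrites the oldest slot and remove() on an
		// empty queue returns stale data, so callers are expected to test those flags first.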
System.out.println("队列入队元素 为:" + m); 32 | } 33 | 34 | //实现出栈的方法,即取得队列的头元素 35 | public int remove() 36 | { 37 | int temp = arrQueue[front++]; //将栈顶元素赋值给temp,栈顶指针加1 38 | if(front == maxSize) //处理循环 39 | front = 0; 40 | empty--; //元素个数-1 41 | return temp; 42 | } 43 | 44 | //判断队列是否为空 45 | public boolean isEmpty() 46 | { 47 | return (empty==0); 48 | } 49 | 50 | //判断对列是否为满 51 | public boolean isFull() 52 | { 53 | return (empty == maxSize); 54 | } 55 | 56 | //返回队列长度 57 | public int qLong() 58 | { 59 | return empty; 60 | } 61 | 62 | public static void main(String[] args) { 63 | Queue q = new Queue(5); //初始化队列为5个元素 64 | 65 | q.insert(1); 66 | q.insert(2); 67 | q.insert(3); 68 | q.insert(4); 69 | q.insert(5); 70 | 71 | int t1 = q.remove(); 72 | System.out.println("队列元素出队:" + t1); 73 | int t2 = q.remove(); 74 | System.out.println("队列元素出队:" + t2); 75 | 76 | System.out.println("队列是否为空:" + q.isEmpty()); 77 | System.out.println("队列是否为满:" + q.isFull()); 78 | System.out.println("队列的长度:" + q.qLong()); 79 | } 80 | 81 | } 82 | -------------------------------------------------------------------------------- /Java/Dataguru算法导论/Statck/Statck1.java: -------------------------------------------------------------------------------- 1 | package Statck; 2 | /* 3 | * 使用java构建栈,并模拟实现栈的入栈和出栈方法 4 | * 使用数组实现 5 | */ 6 | 7 | public class Statck1 { 8 | 9 | private int maxSize; //栈的最多元素数 10 | private int top; //栈顶指针 11 | private int len; //栈的深度 12 | private int[] arrStack; // 模拟栈 13 | 14 | //栈的初始化 15 | public Statck1(int s){ 16 | maxSize = s; 17 | len =0; 18 | top= -1; 19 | arrStack = new int[s]; 20 | } 21 | 22 | //获取栈的长度 23 | public int getLen(){ 24 | return len; 25 | } 26 | 27 | //获取当前栈还能插入多少个f元素 28 | public int getLeaveLen(){ 29 | return (maxSize-len); 30 | } 31 | //判断栈是否满 32 | public boolean isFull(){ 33 | return (len==maxSize); 34 | } 35 | 36 | //判断栈是否为空 37 | public boolean isEmpty(){ 38 | return (len ==0); 39 | } 40 | 41 | //元素入栈 42 | public void inStack(int s) 43 | { 44 | arrStack[++top] = s; //栈顶指针加1,入栈 45 | System.out.println("元素入栈:" + s); 46 | len ++ ;//栈深度+1 47 | } 48 | 49 | //元素出栈 50 | public int outStack() 51 | { 52 | int temp = arrStack[top--];//赋值之后减1 53 | System.out.println("元素出栈:" + temp); 54 | len--; //栈深度-1 55 | return temp; 56 | } 57 | 58 | public static void main(String[] args) { 59 | Statck1 s = new Statck1(5); 60 | 61 | s.inStack(1); 62 | s.inStack(2); 63 | s.inStack(3); 64 | s.inStack(4); 65 | s.inStack(5); 66 | 67 | s.outStack(); 68 | s.outStack(); 69 | System.out.println("栈的长度:" + s.getLen()); 70 | System.out.println("还能入栈元素个数:" + s.getLeaveLen()); 71 | System.out.println("栈的是否为空:" + s.isEmpty()); 72 | System.out.println("栈的是否为满:" + s.isFull()); 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /Java/Dataguru算法导论/Statck/Statck2.java: -------------------------------------------------------------------------------- 1 | package Statck; 2 | 3 | import java.util.ArrayList; 4 | import java.util.EmptyStackException; 5 | import java.util.List; 6 | 7 | /* 8 | * 使用java构建栈,并模拟实现栈的入栈和出栈方法 9 | * 使用链表实现 10 | */ 11 | 12 | public class Statck2 { 13 | 14 | private List statck = new ArrayList(); 15 | 16 | public Statck2(){ 17 | //栈的初始化 18 | } 19 | 20 | //清空栈 21 | public void clear(){ 22 | statck.clear(); 23 | System.out.println("清空栈.........."); 24 | } 25 | //判断栈是否为空 26 | public boolean isEmpty(){ 27 | return statck.isEmpty(); 28 | } 29 | //获取栈顶元素 30 | public E getTop(){ 31 | if(isEmpty()) 32 | return null; 33 | return statck.get(0); 34 | } 35 | 36 | //弹出栈操作 37 | public 
E pop(){ 38 | if (isEmpty()) 39 | throw new EmptyStackException(); 40 | System.out.println(statck.size() + "\t 出栈"); 41 | return statck.remove(statck.size() - 1); 42 | } 43 | 44 | //压入栈操作 45 | public void push(E e){ 46 | statck.add(e); 47 | System.out.println(e + "\t 入栈"); 48 | } 49 | 50 | //获取当前栈的深度 51 | public int getStatckSize(){ 52 | if(isEmpty()) 53 | throw new EmptyStackException(); 54 | return statck.size(); 55 | } 56 | 57 | public static void main(String[] args) { 58 | Statck2 s = new Statck2(); 59 | s.clear(); //清空栈 60 | System.out.println("当前栈是否为空:" + s.isEmpty()); 61 | s.push(1); 62 | s.push(2); 63 | s.push(3); 64 | 65 | s.pop(); 66 | System.out.println("当前栈的深度为:" + s.getStatckSize()); 67 | System.out.println("当前栈顶元素为:" + s.getTop()); 68 | } 69 | 70 | } 71 | -------------------------------------------------------------------------------- /Java/Dataguru算法导论/TestCode/BitTreeExample.java: -------------------------------------------------------------------------------- 1 | package TestCode; 2 | 3 | /* 4 | * 题目描述:每一次移除二叉树的所有叶子节点,有已知的移除叶子节点序列得到原本的二叉树 5 | * eg: AB 6 | * C 7 | * CAB 8 | */ 9 | public class BitTreeExample { 10 | 11 | } 12 | -------------------------------------------------------------------------------- /Java/Dataguru算法导论/TestCode/HashTableExample.java: -------------------------------------------------------------------------------- 1 | package TestCode; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.IOException; 5 | import java.io.InputStreamReader; 6 | import java.util.Hashtable; 7 | import java.util.StringTokenizer; 8 | 9 | /* 10 | * 题目描述:输入读应的几种字符串,其中一个是english,一个是外语开始是输入字典,后来是根据外语来查询字典,没有时输出"en" 11 | */ 12 | 13 | public class HashTableExample { 14 | public static void main(String[] args) throws IOException { 15 | BufferedReader stdin = new BufferedReader(new InputStreamReader(System.in)); 16 | Hashtable table = new Hashtable(); 17 | String s = ""; 18 | String[] arr = new String[2]; 19 | while(true) 20 | { 21 | s = stdin.readLine(); 22 | if(s.equals("")) 23 | break; 24 | arr=s.split(" "); 25 | table.put(arr[1],arr[0]); 26 | } 27 | while(true) 28 | { 29 | s = stdin.readLine(); 30 | if(table.get(s) != null ) 31 | System.out.println(table.get(s)); 32 | else 33 | System.out.println("eh"); 34 | } 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /Java/Dataguru算法导论/TestCode/fenZhiTest.java: -------------------------------------------------------------------------------- 1 | package TestCode; 2 | 3 | /* 4 | * 题目描述:给定平面上的N个点,计算任意两点的最近距离(N范围是<10000) 5 | */ 6 | 7 | public class fenZhiTest { 8 | 9 | 10 | public static void main(String[] args) { 11 | 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /Java/Dataguru算法导论/TestCode/guibingTest.java: -------------------------------------------------------------------------------- 1 | package TestCode; 2 | 3 | import java.util.Arrays; 4 | /* 5 | * 问题描述 6 | * 有N个整数,A[1],A[2],A[3],....,A[N]。需要找打这样的(i,j)的数对的数量 7 | * 满足 1 <= i < j <=N, A[i] > A[j]。数据范围:1<= N <= 65537,0 <=A[i] <=10^9 8 | */ 9 | public class guibingTest { 10 | 11 | static int[] arr = { 12 | 3,4,1,5,2,6 //示例数组 13 | }; 14 | static int num = 0; //记录满足条件的对数 15 | 16 | public static void main(String[] args) { 17 | MergeSort(arr, 0, 5); 18 | System.out.println("满足条件的逆序数 共: " + num + "对"); 19 | 20 | } 21 | 22 | //归并排序寻找满足条件的对数 23 | private static void MergeSort(int[] arr, int low, int high) { 24 | // TODO Auto-generated method stub 25 | int mid = (low + high) /2; 26 | 
if(low= K2i+1 Ki>=K2i+2 为大顶堆 8 | * 此为大顶堆的代码实例,小顶堆类似 9 | */ 10 | public class duiSort { 11 | 12 | static int[] arr = { 13 | 16,7,3,20,17,8 //定义待排序数组 14 | }; 15 | public static void main(String[] args) { 16 | 17 | buildHeap();//建立大顶堆并排序 18 | System.out.println("排序好的为:" + Arrays.toString(arr)); 19 | } 20 | 21 | private static void buildHeap() { 22 | // TODO Auto-generated method stub 23 | int len = arr.length; 24 | for(int i =len/2 -1 ;i>=0;i--) //建立大顶堆 25 | { 26 | sortHeap(i,len); 27 | } 28 | System.out.println("建立好的大顶堆如下:" + Arrays.toString(arr)); 29 | for(int j = len-1; j >0; j --) //对大顶堆进行排序 30 | { 31 | swap(0,j); 32 | sortHeap(0,j); 33 | } 34 | } 35 | 36 | private static void sortHeap(int i, int len) { 37 | // TODO Auto-generated method stub 38 | int left = 2*i+1; //定义左节点 39 | int right = 2*i +2; //定义右节点 40 | int large = 0; //存放三个节点中最大节点的下标 41 | if(len >left && arr[left] > arr[i]) //如果左孩子大于根节点 将左孩子下标赋值给large 42 | large = left; 43 | else //否之,将根节点下标赋值给large 44 | large = i; 45 | 46 | if(len > right && arr[right] > arr[large]) 47 | large = right; //若右孩子节点大于根节点,把右孩子节点下标赋值给large 48 | 49 | if(large != i) //若最大节点的下标不等于根节点的下标时,交换其值 50 | { 51 | swap(large,i); 52 | sortHeap(large,len); 53 | } 54 | } 55 | //交换对应下标值 56 | private static void swap(int m, int n) { 57 | // TODO Auto-generated method stub 58 | int temp ; 59 | temp = arr[m]; 60 | arr[m] = arr[n]; 61 | arr[n] = temp; 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /Java/Dataguru算法导论/sort/guibing.java: -------------------------------------------------------------------------------- 1 | package sort; 2 | 3 | import java.util.Arrays; 4 | 5 | /* 6 | *归并排序 7 | *时间复杂度 n*lg n 8 | */ 9 | 10 | public class guibing { 11 | 12 | public static void main(String[] args) { 13 | //定义数组 14 | int[] arr = {6,2,4,1,9,65,23,12}; 15 | 16 | //调用归并排序算法 17 | MergeSort(arr,0,7); 18 | 19 | System.out.println(Arrays.toString(arr)); 20 | } 21 | 22 | //归并排序算法 23 | private static void MergeSort(int[] arr, int low, int high) { 24 | // TODO Auto-generated method stub 25 | int mid = (low + high) /2; 26 | if(low0 && arr[j-1] > key){ 23 | arr[j] = arr[j-1]; 24 | j = j-1; 25 | } 26 | arr[j] = key; 27 | } 28 | //输出数组 29 | System.out.println("排序后的数组为:"); 30 | for(int i=0;ihigh) 10 | return; 11 | int temp; //保存基准值 12 | int left=0,right=0,empty=0; 13 | temp=a[low]; //将每次进来的最左边的值作为基准值 14 | left = low; //每次移动的指针初始值最左边的位置复制给left 15 | right = high; //每次移动的指针初始值最右边的位置复制给right 16 | while(left!=right){ //判断循环结束的条件 17 | while(a[right]>=temp && left 3 | 4 | 本部分分为多个项目,会涉及目前比较火的大数据的相关概念,比如说spark,hadoop,mahout,hbase,hive,openstack,storm等,目前主要学习hadoop和mahout,后续有时间和精力的话,会涉及更多,也欢迎大家即使补充相关代码,大家一起学习,一起进步
5 | 6 | 1:Hadoop 目录

7 | 8 | 2:Spark 目录

9 | 10 | 3:Mahout 目录

11 | 12 | 4:Hive 目录

13 | 14 | 5:Hbase 目录

15 | 16 | 6:Java 目录

17 | 18 | 7:cluster_conf 目录
19 | > 集群的配置文件备份 20 | 21 | Email:Thinkagmer_gyt@gmail.com
22 | QQ:1923361654
23 | WeChat:gyt13342445911
24 | 微博:Thinkgamer 25 | -------------------------------------------------------------------------------- /Spark/ChineseWordSplitCount/WordAnalyzer jar包链接.txt: -------------------------------------------------------------------------------- 1 | http://pan.baidu.com/s/1mihghmg -------------------------------------------------------------------------------- /Spark/ChineseWordSplitCount/blog href.txt: -------------------------------------------------------------------------------- 1 | http://blog.csdn.net/gamer_gyt/article/details/52194773 -------------------------------------------------------------------------------- /Spark/ChineseWordSplitCount/wordSplitCount.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Thinkgamer/Hadoop-Spark-Learning/46f24ae930fc6d426ad14989cb6dbad1e7966d8e/Spark/ChineseWordSplitCount/wordSplitCount.py -------------------------------------------------------------------------------- /Spark/PageRank/Jar包链接.txt: -------------------------------------------------------------------------------- 1 | http://pan.baidu.com/s/1miASxny -------------------------------------------------------------------------------- /Spark/README.md: -------------------------------------------------------------------------------- 1 | 本目录下主要是我对Spark操作代码托管地方,代码质量不一定高,但是尽我所能去写好每次的code,欢迎补充 2 | -------------------------------------------------------------------------------- /Spark/pairRDD/driver: -------------------------------------------------------------------------------- 1 | package week2 2 | 3 | import org.apache.spark.{SparkContext, SparkConf} 4 | import org.apache.spark.SparkContext._ 5 | 6 | object WordCount1 { 7 | def main(args: Array[String]) { 8 | if (args.length == 0) { 9 | System.err.println("Usage: WordCount1 ") 10 | System.exit(1) 11 | } 12 | 13 | val conf = new SparkConf().setAppName("WordCount1") 14 | val sc = new SparkContext(conf) 15 | 16 | .....//此处写你编写的Spark代码 17 | 18 | sc.stop() 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /Spark/pairRDD/example: -------------------------------------------------------------------------------- 1 | scala: 2 | #创建pair RDD 3 | var lines = sc.parallelize(List("i love you")) 4 | val pairs = lines.map(x=>(x,1)) 5 | pairs.foreach(println) 6 | 7 | ============================================================= 8 | #针对一个pair RDD的转化操作 9 | #rdd.reduceByKey(func):合并具有相同key的value值 10 | val rdd = sc.parallelize(List((1,2),(3,4),(3,6))) 11 | val result = rdd.reduceByKey((x,y)=>x+y) 12 | result.foreach(println) 13 | 14 | #rdd.groupByKey(func):对具有相同键的进行分组 15 | val rdd = sc.parallelize(List((1,2),(3,4),(3,6))) 16 | val result = rdd.groupByKey() 17 | result.foreach(println) 18 | 19 | #rdd.mapValues(func):对pairRDD中的每个值应用func 键不改变 20 | val rdd = sc.parallelize(List((1,2),(3,4),(3,6))) 21 | val result = rdd.mapValues(x=>x+1) 22 | result.foreach(println) 23 | 24 | #rdd.flatMapValues(func):类似于mapValues,返回的是迭代器函数 25 | val rdd = sc.parallelize(List((1,2),(3,4),(3,6))) 26 | val result = rdd.flatMapValues(x=>(x to 5)) 27 | result.foreach(println) 28 | 29 | #rdd.keys:返回一个仅包含键的RDD 30 | val rdd = sc.parallelize(List((1,2),(3,4),(3,6))) 31 | val result = rdd.keys 32 | result.foreach(println) 33 | 34 | #rdd.values:返回一个仅包含value的RDD 35 | val rdd = sc.parallelize(List((1,2),(3,4),(3,6))) 36 | val result = rdd.values 37 | result.foreach(println) 38 | 39 | #rdd.sortByKey():返回一个根据键排序的RDD 40 | val rdd = sc.parallelize(List((1,2),(3,4),(3,6))) 41 | val result =
rdd.sortByKey().collect() 42 | result 43 | 44 | =================================================================== 45 | #针对两个pair RDD的转化操作 46 | #rdd.subtractByKey( other ):删除掉RDD中与other RDD中键相同的元素 47 | val rdd = sc.parallelize(List((1,2),(3,4),(3,6))) 48 | val other = sc.parallelize(List((3,9))) 49 | val result = rdd.subtractByKey(other) 50 | result.foreach(println) 51 | 52 | #rdd.join( other ):对两个RDD进行内连接 53 | val result = rdd.join(other) 54 | result.foreach(println) 55 | 56 | #rdd.rightOuterJoin(other),对两个RDD进行连接操作,确保第一个RDD的键必须存在(右外连接) 57 | val result = rdd.rightOuterJoin(other) 58 | result.foreach(println) 59 | 60 | #rdd.leftOuterJoin(other):对两个RDD进行连接操作,确保第一个RDD的键必须存在(左外连接) 61 | val result = rdd.leftOuterJoin(other) 62 | result.foreach(println) 63 | 64 | #rdd.cogroup(other),将有两个rdd中拥有相同键的数据分组 65 | val result = rdd.cogroup(other) 66 | result.foreach(println) 67 | 68 | 69 | 70 | #聚合操作 71 | #使用reduceByKey()和mapValues()计算每个键对应的平均值 72 | val rdd = sc.parallelize(List(Tuple2("panda",0),Tuple2("pink",3),Tuple2("pirate",3),Tuple2("panda",1),Tuple2("pink",4))) 73 | val result = rdd.mapValues(x=>(x,1)).reduceByKey((x,y)=>(x._1+y._1,x._2+y._2)) 74 | result.foreach(println) 75 | 76 | #实现经典的分布式单词计数问题(使用flatMap() 来生成以单词为键,以数字1为值的pair RDD) 77 | val rdd = sc.parallelize(List("i am thinkgamer, i love cyan")) 78 | val words = rdd.flatMap(line => line.split(" ")) 79 | val result = words.map(x=>(x,1)).reduceByKey((x,y) => x+y) 80 | result.foreach(println) 81 | 82 | #实现经典的分布式单词计数问题(使用countByValue更快的实现单词计数) 83 | val rdd = sc.parallelize(List("i am thinkgamer, i love cyan")) 84 | val result = rdd.flatMap(x=>x.split(" ")).countByValue() 85 | result.foreach(println) 86 | 87 | #combineByKey()是最为常用的基于键进行聚合的函数,大多数基于键聚合的函数都是用它实现的,和aggregate()一样,combineByKey()可以让用户返回与输入数据类型不同的返回值 88 | 89 | val data = Seq(("a",3),("b",4),("c",5)) 90 | sc.parallelize(data).reduceByKey((x,y)=>x+y) //默认并行度 91 | sc.parallelize(data).reduceByKey((x,y)=>x+y,10) //自定义并行度 92 | 93 | #获取RDD的分区方式 94 | scala> val pairs = sc.parallelize(List((1,1),(2,2),(3,3))) 95 | pairs: org.apache.spark.rdd.RDD[(Int, Int)] = ParallelCollectionRDD[9] at parallelize at :27 96 | 97 | scala> pairs.partitioner 98 | res4: Option[org.apache.spark.Partitioner] = None 99 | 100 | scala> val partitioned = pairs.partitionBy(new org.apache.spark.HashPartitioner(2)) 101 | partitioned: org.apache.spark.rdd.RDD[(Int, Int)] = ShuffledRDD[10] at partitionBy at :29 102 | 103 | scala> partitioned.partitioner 104 | res5: Option[org.apache.spark.Partitioner] = Some(org.apache.spark.HashPartitioner@2) 105 | 106 | -------------------------------------------------------------------------------- /Spark/pairRDD/example~: -------------------------------------------------------------------------------- 1 | scala: 2 | #创建pair RDD 3 | var lines = sc.parallelize(List("i love you")) 4 | val pairs = lines.map(x=>(x,1)) 5 | pairs.foreach(println) 6 | 7 | ============================================================= 8 | #针对一个pair RDD的转化操作 9 | #rdd.reduceByKey(func):合并具有相同key的value值 10 | val rdd = sc.parallelize(List((1,2),(3,4),(3,6))) 11 | val rdd.reduceByKey((x,y)=>x+y) 12 | result.foreach(println) 13 | 14 | #rdd.groupByKey(func):对具有相同键的进行分组 15 | val rdd = sc.parallelize(List((1,2),(3,4),(3,6))) 16 | val result = rdd.groupByKey() 17 | result.foreach(println) 18 | 19 | #rdd.mapValues(func):对pairRDD中的每个值应用func 键不改变 20 | val rdd = sc.parallelize(List((1,2),(3,4),(3,6))) 21 | val result = rdd.mapValues(x=>x+1) 22 | result.foreach(println) 23 | 24 | #rdd.flatMapValues(func):类似于mapValues,返回的是迭代器函数 
25 | val rdd = sc.parallelize(List((1,2),(3,4),(3,6))) 26 | val result = rdd.flatMapValues(x=>(x to 5)) 27 | result.foreach(println) 28 | 29 | #rdd.keys:返回一个仅包含键的RDD 30 | val rdd = sc.parallelize(List((1,2),(3,4),(3,6))) 31 | val result = rdd.keys 32 | result.foreach(println) 33 | 34 | #rdd.values:返回一个仅包含value的RDD 35 | val rdd = sc.parallelize(List((1,2),(3,4),(3,6))) 36 | val result = rdd.values 37 | result.foreach(println) 38 | 39 | #rdd.sortByKey():返回一个根据键排序的RDD 40 | val rdd = sc.parallelize(List((1,2),(3,4),(3,6))) 41 | val result = rdd.sortByKey().collect() 42 | result 43 | 44 | =================================================================== 45 | #针对两个pair RDD的转化操作 46 | #rdd.subtractByKey( other ):删除掉RDD中与other RDD中键相同的元素 47 | val rdd = sc.parallelize(List((1,2),(3,4),(3,6))) 48 | val other = sc.parallelize(List((3,9))) 49 | val result = rdd.subtractByKey(other) 50 | result.foreach(println) 51 | 52 | #rdd.join( other ):对两个RDD进行内连接 53 | val result = rdd.join(other) 54 | result.foreach(println) 55 | 56 | #rdd.rightOuterJoin(other),对两个RDD进行连接操作,确保第一个RDD的键必须存在(右外连接) 57 | val result = rdd.rightOuterJoin(other) 58 | result.foreach(println) 59 | 60 | #rdd.leftOuterJoin(other):对两个RDD进行连接操作,确保第一个RDD的键必须存在(左外连接) 61 | val result = rdd.leftOuterJoin(other) 62 | result.foreach(println) 63 | 64 | #rdd.cogroup(other),将有两个rdd中拥有相同键的数据分组 65 | val result = rdd.cogroup(other) 66 | result.foreach(println) 67 | 68 | 69 | 70 | #聚合操作 71 | #使用reduceByKey()和mapValues()计算每个键对应的平均值 72 | val rdd = sc.parallelize(List(Tuple2("panda",0),Tuple2("pink",3),Tuple2("pirate",3),Tuple2("panda",1),Tuple2("pink",4))) 73 | val result = rdd.mapValues(x=>(x,1)).reduceByKey((x,y)=>(x._1+y._1,x._2+y._2)) 74 | result.foreach(println) 75 | 76 | #实现经典的分布式单词计数问题(使用flatMap() 来生成以单词为键,以数字1为值的pair RDD) 77 | val rdd = sc.parallelize(List("i am thinkgamer, i love cyan")) 78 | val words = rdd.flatMap(line => line.split(" ")) 79 | val result = words.map(x=>(x,1)).reduceByKey((x,y) => x+y) 80 | result.foreach(println) 81 | 82 | #实现经典的分布式单词计数问题(使用countByValue更快的实现单词计数) 83 | val rdd = sc.parallelize(List("i am thinkgamer, i love cyan")) 84 | val result = rdd.flatMap(x=>x.split(" ")).countByValue() 85 | result.foreach(println) 86 | 87 | #combineByKey()是最为常用的基于键进行聚合的函数,大多数基于键聚合的函数都是用它实现的,和aggregate()一样,combineByKey()可以让用户返回与输入数据类型不同的返回值 88 | 89 | val data = Seq(("a",3),("b",4),("c",5)) 90 | sc.parallelize(data).reduceByKey((x,y)=>x+y) //默认并行度 91 | sc.parallelize(data).reduceByKey((x,y)=>x+y,10) //自定义并行度 92 | 93 | #获取RDD的分区方式 94 | val pairs = sc.parallelize(List((1,1),(2,2),(3,3))) 95 | scala> pairs.partitioner 96 | res4: Option[org.apache.spark.Partitioner] = None 97 | scala> val partitioned = pairs.partitionBy(new org.apache.spark.HashPartitioner(2)) 98 | partitioned: org.apache.spark.rdd.RDD[(Int, Int)] = ShuffledRDD[10] at partitionBy at :29 99 | 100 | scala> partitioned.partitioner 101 | res5: Option[org.apache.spark.Partitioner] = Some(org.apache.spark.HashPartitioner@2) 102 | 103 | -------------------------------------------------------------------------------- /Spark/pairRDD/sample: -------------------------------------------------------------------------------- 1 | //parallelize演示 2 | val num=sc.parallelize(1 to 10) 3 | val doublenum = num.map(_*2) 4 | val threenum = doublenum.filter(_ % 3 == 0) 5 | threenum.collect 6 | threenum.toDebugString 7 | 8 | val num1=sc.parallelize(1 to 10,6) 9 | val doublenum1 = num1.map(_*2) 10 | val threenum1 = doublenum1.filter(_ % 3 == 0) 11 | threenum1.collect 12 | threenum1.toDebugString //查看依赖 13 | 
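// Note (assuming the spark-shell session above): cache() below is lazy -- it only marks threenum
// for persistence, and the data is actually materialized the next time an action (e.g. the
// collect() a few lines down) forces computation; threenum.getStorageLevel would then report the
// storage level in effect.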
14 | threenum.cache() 15 | val fournum = threenum.map(x=>x*x) 16 | fournum.collect //可以在web监控界面查看 17 | fournum.toDebugString 18 | threenum.unpersist() //可以删除cache,立即执行,不像cache函数,需要触发 19 | 20 | num.reduce (_ + _) 21 | num.take(5) 22 | num.first 23 | num.count 24 | num.take(5).foreach(println) 25 | 26 | //K-V演示 27 | val kv1=sc.parallelize(List(("A",1),("B",2),("C",3),("A",4),("B",5))) 28 | kv1.sortByKey().collect //注意sortByKey的小括号不能省 29 | kv1.groupByKey().collect 30 | kv1.reduceByKey(_+_).collect 31 | 32 | val kv2=sc.parallelize(List(("A",4),("A",4),("C",3),("A",4),("B",5))) 33 | kv2.distinct.collect 34 | kv1.union(kv2).collect 35 | 36 | val kv3=sc.parallelize(List(("A",10),("B",20),("D",30))) 37 | kv1.join(kv3).collect 38 | kv1.cogroup(kv3).collect 39 | 40 | val kv4=sc.parallelize(List(List(1,2),List(3,4))) 41 | kv4.flatMap(x=>x.map(_+1)).collect 42 | 43 | //文件读取演示 44 | val rdd1 = sc.textFile("hdfs://hadoop1:8000/dataguru/week2/directory/") 45 | rdd1.toDebugString 46 | val words=rdd1.flatMap(_.split(" ")) 47 | val wordscount=words.map(x=>(x,1)).reduceByKey(_+_) 48 | wordscount.collect 49 | wordscount.toDebugString 50 | 51 | val rdd2 = sc.textFile("hdfs://hadoop1:8000/dataguru/week2/directory/*.txt") 52 | rdd2.flatMap(_.split(" ")).map(x=>(x,1)).reduceByKey(_+_).collect 53 | 54 | //gzip压缩的文件 55 | val rdd3 = sc.textFile("hdfs://hadoop1:8000/dataguru/week2/test.txt.gz") 56 | rdd3.flatMap(_.split(" ")).map(x=>(x,1)).reduceByKey(_+_).collect 57 | 58 | //日志处理演示 59 | //http://download.labs.sogou.com/dl/q.html 完整版(2GB):gz格式 60 | //访问时间\t用户ID\t[查询词]\t该URL在返回结果中的排名\t用户点击的顺序号\t用户点击的URL 61 | //SogouQ1.txt、SogouQ2.txt、SogouQ3.txt分别是用head -n 或者tail -n 从SogouQ数据日志文件中截取n行 62 | 63 | //搜索结果排名第1,但是点击次序排在第2的数据有多少? 64 | val rdd1 = sc.textFile("hdfs://hadoop1:8000/dataguru/data/SogouQ1.txt") 65 | val rdd2=rdd1.map(_.split("\t")).filter(_.length==6) 66 | rdd2.count() 67 | val rdd3=rdd2.filter(_(3).toInt==1).filter(_(4).toInt==2) 68 | rdd3.count() 69 | rdd3.toDebugString 70 | 71 | //session查询次数排行榜 72 | val rdd4=rdd2.map(x=>(x(1),1)).reduceByKey(_+_).map(x=>(x._2,x._1)).sortByKey(false).map(x=>(x._2,x._1)) 73 | rdd4.toDebugString 74 | rdd4.saveAsTextFile("hdfs://hadoop1:8000/dataguru/week2/output1") 75 | 76 | 77 | //cache()演示 78 | //检查block命令:bin/hdfs fsck /dataguru/data/SogouQ3.txt -files -blocks -locations 79 | val rdd5 = sc.textFile("hdfs://hadoop1:8000/dataguru/data/SogouQ3.txt") 80 | rdd5.cache() 81 | rdd5.count() 82 | rdd5.count() //比较时间 83 | 84 | 85 | //join演示 86 | val format = new java.text.SimpleDateFormat("yyyy-MM-dd") 87 | case class Register (d: java.util.Date, uuid: String, cust_id: String, lat: Float,lng: Float) 88 | case class Click (d: java.util.Date, uuid: String, landing_page: Int) 89 | val reg = sc.textFile("hdfs://hadoop1:8000/dataguru/week2/join/reg.tsv").map(_.split("\t")).map(r => (r(1), Register(format.parse(r(0)), r(1), r(2), r(3).toFloat, r(4).toFloat))) 90 | val clk = sc.textFile("hdfs://hadoop1:8000/dataguru/week2/join/clk.tsv").map(_.split("\t")).map(c => (c(1), Click(format.parse(c(0)), c(1), c(2).trim.toInt))) 91 | reg.join(clk).take(2) 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | -------------------------------------------------------------------------------- /cluster_conf/README.md: -------------------------------------------------------------------------------- 1 | # 集群说明: 2 | 三个节点:分别为master1和slave1,slave2(由于电脑资源有限) 3 | 4 | master1 文件夹中为主节点的配置文件 5 | 6 | slave1 文件夹中为从节点的配置文件(如果有多个从节点,可复制) 7 | 8 | slave2 文件夹中为从节点2的配置文件 9 | 10 | # hosts文件修改说明 11 | 12 | 在主从节点的/etc/hosts中加入以下两行或者多行 13 
| 14 | ``` 15 | master1IP master1 16 | slave1IP slave1 17 | slave2IP slave2 18 | ``` 19 | 20 | # java options问题 21 | 22 | 关于deepin执行java -version显示 23 | ``` 24 | Picked up _JAVA_OPTIONS: -Dawt.useSystemAAFontSettings=gasp 25 | ``` 26 | 是正常的,对于强迫症的我,将其除去的办法是: 27 | 28 | 在 /etc/profile中加入 29 | ``` 30 | unset _JAVA_OPTIONS 31 | 32 | ``` 33 | 34 | --- 35 | 36 | # 遇到的问题及解决办法* 37 | ## 1: sign_and_send_pubkey: signing failed: agent refused operation 38 | 这是因为ssh 产生的秘钥没有加入到系统中,执行 ssh-add即可 39 | 40 | ## 2:Error: JAVA_HOME is not set and could not be found. 41 | 原因:我安装java环境的时候采用的deb安装的,所以系统已经有了$JAVA_HOME,但是在hadoop/etc/hadoop/hadoop-env.sh中不识别 42 | >export JAVA_HOME=${JAVA_HOME} 43 | 44 | 这里将${JAVA_HOME}换成你自己的java环境路径即可,可以通过 45 | >echo $JAVA_HOME 46 | 47 | 来查看 48 | 49 | ## 3:hadoop datanode 服务启动不成功 50 | 原因:datanode的clusterID 和 namenode的 clusterID 不匹配 51 | 解决办法: 52 | 根据 hdfs-site.xml 中的配置: 53 | 1、 打开 dfs.namenode.name.dir 配置对应目录下的 current 目录下的 VERSION 文件,拷贝clusterID; 54 | 2、 打开 dfs.datanode.data.dir 配置对应目录下的 current 目录下的 VERSION 文件,用拷贝的 clusterID 覆盖原有的clusterID; 55 | 3、 保存后重新启动 hadoop,datanode 进程就能正常启动了。 56 | 57 | ## 4:hive配置后启动错误 58 | 错误:Failed to get schema version 59 | 原因:在hive-site.xml配置javax.jdo.option.ConnectionURL value时,我把其中mysql所在的服务器的IP写成了用户名,这里改为localhost或者IP即可 60 | 61 | ## 5:从节点19888端口无法访问 62 | 63 | 执行:mr-jobhistory-daemon.sh start historyserver 64 | 65 | --- 66 | 67 | # 运行MR在远程集群的两种办法 68 | ## 1:提交jar包 69 | ``` 70 | hadoop jar xxx.jar classname inputpath outputpath 71 | ``` 72 | 73 | ## 2:代码中进行配置 74 | ``` 75 | Configuration conf = new Configuration(); 76 | conf.set("mapreduce.app-submission.cross-platform", "true"); 77 | conf.set("yarn.resourcemanager.address", "http://master1:8032"); 78 | conf.set("mapreduce.framework.name", "yarn"); 79 | Job job = Job.getInstance(conf, "wordcount"); 80 | ``` 81 | -------------------------------------------------------------------------------- /cluster_conf/master1/core-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 16 | 17 | 18 | 19 | 20 | 21 | fs.default.name 22 | hdfs://master1:9000 23 | 24 | 25 | hadoop.tmp.dir 26 | file:/home/node1/bigdata/hadoop-2.7.3/tmp 27 | 28 | 29 | -------------------------------------------------------------------------------- /cluster_conf/master1/hdfs-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 16 | 17 | 18 | 19 | 20 | 21 | dfs.replication 22 | 2 23 | 24 | 25 | dfs.namenode.name.dir 26 | file:/home/node1/bigdata/hadoop-2.7.3/dfs/name 27 | 28 | 29 | dfs.datanode.data.dir 30 | file:/home/node1/bigdata/hadoop-2.7.3/dfs/data 31 | 32 | 33 | -------------------------------------------------------------------------------- /cluster_conf/master1/mapred-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 16 | 17 | 18 | 19 | 20 | 21 | mapreduce.framework.name 22 | yarn 23 | 24 | 25 | mapreduce.jobhistory.address 26 | master1:10020 27 | 28 | 29 | mapreduce.jobhistory.webapp.address 30 | master1:19888 31 | 32 | 33 | -------------------------------------------------------------------------------- /cluster_conf/master1/yarn-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 15 | 16 | 17 | 18 | 19 | yarn.nodemanager.aux-services 20 | mapreduce_shuffle 21 | 22 | 23 | yarn.nodemanager.aux-services.mapreduce.shuffle.class 24 | org.apache.hadoop.mapred.ShuffleHandler 25 | 26 | 27 | yarn.resourcemanager.address 28 | master1:8032 29 | 
30 | 31 | yarn.resourcemanager.scheduler.address 32 | master1:8030 33 | 34 | 35 | yarn.resourcemanager.resource-tracker.address 36 | master1:8031 37 | 38 | 39 | yarn.resourcemanager.admin.address 40 | master1:8033 41 | 42 | 43 | yarn.resourcemanager.webapp.address 44 | master1:8088 45 | 46 | 47 | -------------------------------------------------------------------------------- /cluster_conf/slave1/core-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 16 | 17 | 18 | 19 | 20 | 21 | fs.default.name 22 | hdfs://master1:9000 23 | 24 | 25 | hadoop.tmp.dir 26 | /home/node1/bigdata/hadoop-2.7.3/tmp 27 | 28 | 29 | -------------------------------------------------------------------------------- /cluster_conf/slave1/hdfs-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 16 | 17 | 18 | 19 | 20 | 21 | dfs.replication 22 | 2 23 | 24 | 25 | dfs.namenode.name.dir 26 | file:/home/node1/bigdata/hadoop-2.7.3/dfs/name 27 | 28 | 29 | dfs.datanode.data.dir 30 | file:/home/node1/bigdata/hadoop-2.7.3/dfs/data 31 | 32 | 33 | -------------------------------------------------------------------------------- /cluster_conf/slave1/mapred-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 16 | 17 | 18 | 19 | 20 | 21 | mapreduce.framework.name 22 | yarn 23 | 24 | 25 | mapreduce.jobhistory.address 26 | master1:10020 27 | 28 | 29 | mapreduce.jobhistory.webapp.address 30 | master1:19888 31 | 32 | 33 | -------------------------------------------------------------------------------- /cluster_conf/slave1/yarn-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 15 | 16 | 17 | 18 | 19 | yarn.nodemanager.aux-services 20 | mapreduce_shuffle 21 | 22 | 23 | yarn.nodemanager.aux-services.mapreduce.shuffle.class 24 | org.apache.hadoop.mapred.ShuffleHandler 25 | 26 | 27 | yarn.resourcemanager.address 28 | master1:8032 29 | 30 | 31 | yarn.resourcemanager.scheduler.address 32 | master1:8030 33 | 34 | 35 | yarn.resourcemanager.resource-tracker.address 36 | master1:8031 37 | 38 | 39 | yarn.resourcemanager.admin.address 40 | master1:8033 41 | 42 | 43 | yarn.resourcemanager.webapp.address 44 | master1:8088 45 | 46 | 47 | -------------------------------------------------------------------------------- /cluster_conf/slave2/core-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 16 | 17 | 18 | 19 | 20 | 21 | fs.default.name 22 | hdfs://master1:9000 23 | 24 | 25 | hadoop.tmp.dir 26 | /home/node1/bigdata/hadoop-2.7.3/tmp 27 | 28 | 29 | -------------------------------------------------------------------------------- /cluster_conf/slave2/hdfs-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 16 | 17 | 18 | 19 | 20 | 21 | dfs.replication 22 | 2 23 | 24 | 25 | dfs.namenode.name.dir 26 | file:/home/node1/bigdata/hadoop-2.7.3/dfs/name 27 | 28 | 29 | dfs.datanode.data.dir 30 | file:/home/node1/bigdata/hadoop-2.7.3/dfs/data 31 | 32 | 33 | -------------------------------------------------------------------------------- /cluster_conf/slave2/mapred-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 16 | 17 | 18 | 19 | 20 | 21 | mapreduce.framework.name 22 | yarn 23 | 24 | 25 | mapreduce.jobhistory.address 26 | master1:10020 27 | 28 | 29 | mapreduce.jobhistory.webapp.address 30 | master1:19888 31 | 32 | 
33 | -------------------------------------------------------------------------------- /cluster_conf/slave2/yarn-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 15 | 16 | 17 | 18 | 19 | yarn.nodemanager.aux-services 20 | mapreduce_shuffle 21 | 22 | 23 | yarn.nodemanager.aux-services.mapreduce.shuffle.class 24 | org.apache.hadoop.mapred.ShuffleHandler 25 | 26 | 27 | yarn.resourcemanager.address 28 | master1:8032 29 | 30 | 31 | yarn.resourcemanager.scheduler.address 32 | master1:8030 33 | 34 | 35 | yarn.resourcemanager.resource-tracker.address 36 | master1:8031 37 | 38 | 39 | yarn.resourcemanager.admin.address 40 | master1:8033 41 | 42 | 43 | yarn.resourcemanager.webapp.address 44 | master1:8088 45 | 46 | 47 | --------------------------------------------------------------------------------