├── README.md ├── final ├── .gitignore ├── .idea │ ├── .gitignore │ ├── encodings.xml │ ├── misc.xml │ └── vcs.xml ├── README.md ├── pom.xml ├── report │ └── report.tex └── src │ └── main │ └── java │ ├── CountHour.java │ ├── CountMapred.java │ ├── CountTimes.java │ ├── CountUA.java │ ├── GenericPair.java │ ├── Log.java │ ├── Sorter.java │ ├── UniqueIP.java │ └── UniqueMapred.java ├── lab2 ├── .gitignore ├── .idea │ ├── .gitignore │ ├── encodings.xml │ ├── misc.xml │ ├── remote-targets.xml │ └── vcs.xml ├── pom.xml └── src │ └── main │ ├── java │ ├── InvertedIndex.java │ ├── Sorter.java │ └── TFIDF.java │ └── resources │ └── META-INF │ └── MANIFEST.MF ├── lab3 ├── .gitignore ├── .idea │ ├── .gitignore │ ├── encodings.xml │ ├── misc.xml │ ├── remote-targets.xml │ └── vcs.xml ├── pom.xml └── src │ └── main │ └── java │ ├── GenericPair.java │ ├── MaxShip.java │ ├── MaxShip2.java │ └── SumNation.java └── lab4 ├── .gitignore ├── .idea ├── .gitignore ├── encodings.xml ├── misc.xml ├── remote-targets.xml ├── uiDesigner.xml └── vcs.xml ├── README.md ├── pom.xml └── src └── main └── java ├── Data.java ├── KNN.java ├── MyMapper.java ├── Pair.java └── WKNN.java /README.md: -------------------------------------------------------------------------------- 1 | # NJU-Bigdata 2 | 3 | Lab of NJU Bigdata Course 4 | -------------------------------------------------------------------------------- /final/.gitignore: -------------------------------------------------------------------------------- 1 | target/ 2 | !.mvn/wrapper/maven-wrapper.jar 3 | !**/src/main/**/target/ 4 | !**/src/test/**/target/ 5 | 6 | ### IntelliJ IDEA ### 7 | .idea/modules.xml 8 | .idea/jarRepositories.xml 9 | .idea/compiler.xml 10 | .idea/libraries/ 11 | *.iws 12 | *.iml 13 | *.ipr 14 | 15 | ### Eclipse ### 16 | .apt_generated 17 | .classpath 18 | .factorypath 19 | .project 20 | .settings 21 | .springBeans 22 | .sts4-cache 23 | 24 | ### NetBeans ### 25 | /nbproject/private/ 26 | /nbbuild/ 27 | /dist/ 28 | /nbdist/ 29 | /.nb-gradle/ 30 | build/ 31 | !**/src/main/**/build/ 32 | !**/src/test/**/build/ 33 | 34 | ### VS Code ### 35 | .vscode/ 36 | 37 | ### Mac OS ### 38 | .DS_Store -------------------------------------------------------------------------------- /final/.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # 默认忽略的文件 2 | /shelf/ 3 | /workspace.xml 4 | -------------------------------------------------------------------------------- /final/.idea/encodings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /final/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /final/.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /final/README.md: -------------------------------------------------------------------------------- 1 | # Final Project 2 | 3 | ## 编译方法 4 | 5 | 进入这个 README 所在的目录, 6 | 7 | ```bash 8 | mvn package 9 | ``` 10 | 11 | ## 使用方法 12 | 13 | ```bash 14 | hadoop jar LogAnalyse-1.0-SNAPSHOT-jar-with-dependencies.jar 15 | ``` 16 | 17 | - 任务二:`CountTimes` 18 | - 任务三:`UniqueIP` 19 | - 任务四:`CountHour` 20 | - 任务五:`CountUA` 21 
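A fuller invocation, as a sketch (not part of the original README): each task's main class reads the input directory from its first argument and the output directory from its second (see `CountMapred.bindJob` / `UniqueMapred.bindJob`), and — assuming the assembly jar sets no `Main-Class`, so the task class is passed as the first argument to `hadoop jar`, as in the lab4 README — a run looks like the following. The HDFS paths are placeholders.

```bash
# Hypothetical paths -- replace with your own HDFS input/output locations.
hadoop jar LogAnalyse-1.0-SNAPSHOT-jar-with-dependencies.jar CountTimes /data/weblog /user/xxx/task2-out
hadoop jar LogAnalyse-1.0-SNAPSHOT-jar-with-dependencies.jar UniqueIP   /data/weblog /user/xxx/task3-out
```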
| -------------------------------------------------------------------------------- /final/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | nju 8 | LogAnalyse 9 | 1.0-SNAPSHOT 10 | 11 | 12 | 8 13 | 8 14 | UTF-8 15 | 2.7.4 16 | 17 | 18 | 19 | 20 | 21 | org.apache.hadoop 22 | hadoop-common 23 | ${hadoop.version} 24 | provided 25 | 26 | 27 | org.apache.hadoop 28 | hadoop-hdfs 29 | ${hadoop.version} 30 | provided 31 | 32 | 33 | org.apache.hadoop 34 | hadoop-mapreduce-client-core 35 | ${hadoop.version} 36 | provided 37 | 38 | 39 | org.apache.hadoop 40 | hadoop-mapreduce-client-jobclient 41 | ${hadoop.version} 42 | provided 43 | 44 | 45 | io.krakens 46 | java-grok 47 | 0.1.9 48 | 49 | 50 | 51 | 52 | 53 | 54 | maven-assembly-plugin 55 | 56 | 57 | jar-with-dependencies 58 | 59 | 60 | 61 | 62 | make-assembly 63 | package 64 | 65 | single 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | -------------------------------------------------------------------------------- /final/report/report.tex: -------------------------------------------------------------------------------- 1 | \documentclass{article} 2 | \usepackage{ctex} 3 | \usepackage{geometry} 4 | \geometry{left=3.18cm,right=3.18cm,top=2.54cm,bottom=2.54cm} 5 | \usepackage{graphicx} 6 | \pagestyle{plain} 7 | \usepackage{setspace} 8 | \usepackage{listings} 9 | \usepackage{xcolor} 10 | \usepackage{fontspec} 11 | \usepackage{tikz} 12 | \usepackage{booktabs} 13 | \usepackage{multirow} 14 | \usepackage{hyperref} 15 | \usepackage{ulem} 16 | \setmonofont{Fira Code} 17 | \setmainfont{Times New Roman} 18 | % \setCJKsansfont{Noto Sans CJK SC} 19 | \hypersetup{ 20 | colorlinks=true, 21 | linkcolor=black, 22 | filecolor=magenta, 23 | urlcolor=cyan 24 | } 25 | \lstset{ 26 | numbers=left, 27 | numberstyle=\tiny, 28 | keywordstyle=\color{blue!70}, 29 | commentstyle=\color{red!50!green!50!blue!50}, 30 | frame=trbl, 31 | rulesepcolor=\color{red!20!green!20!blue!20}, 32 | basicstyle=\ttfamily\scriptsize 33 | } 34 | \date{} 35 | \begin{document} 36 | 37 | \begin{center} 38 | \quad \\ 39 | \quad \\ 40 | \heiti \fontsize{30}{17} 大数据处理综合实验 41 | \vskip 0.5cm 42 | \heiti \fontsize{45}{17} 课\quad 程\quad 设\quad 计 43 | \vskip 3.5cm 44 | \heiti \zihao{2} 网站访问日志分析 45 | \end{center} 46 | \vskip 3.5cm 47 | 48 | \begin{quotation} 49 | \songti \fontsize{15}{15} 50 | \doublespacing 51 | \par\setlength\parindent{9em} 52 | \quad 53 | 54 | 成员:\underline{\makebox[15em]{\qquad xxx \qquad }} 55 | 56 | 学号:\underline{\makebox[15em]{\qquad xxxxxxxxx \qquad }} 57 | 58 | 邮箱:\underline{\makebox[15em]{\qquad xxxxxxxxx@xxxxx.xxx.xxx.xx \qquad }} 59 | 60 | 成员:\underline{\makebox[15em]{\qquad xxx \qquad }} 61 | 62 | 学号:\underline{\makebox[15em]{\qquad xxxxxxxxx \qquad }} 63 | 64 | 邮箱:\underline{\makebox[15em]{\qquad xxxxxxxxx@xxxxx.xxx.xxx.xx \qquad }} 65 | 66 | 导师: \underline{\makebox[15em]{\qquad xx \qquad }} 67 | 68 | 研究领域: \underline{\makebox[13em]{\qquad 云计算与大数据 \qquad }} 69 | \vskip 2cm 70 | \centering 71 | \today 72 | \end{quotation} 73 | 74 | \newpage 75 | 76 | \tableofcontents 77 | 78 | \newpage 79 | \section{小组分工} 80 | 81 | 本次实验的整体代码架构和所用第三方库由两人共同讨论并确定。在确定整体框架后,由xxx负责编写代码,由xxx负责测试代码和编写报告。 82 | 83 | \section{课程设计题目} 84 | 85 | 我们选择的课程设计题目是第一个,即网站访问日志分析。 86 | 87 | \section{摘要} 88 | 我们采用 MapReduce 框架,完成了网络访问日志分析的任务 89 | 90 | 在此课程设计中,我们设计了两套自定义 MapReduce 基类,分别对应解决两类任务所需要的 91 | 不同策略。通过对对应基类进行继承和重载,我们解决了 4 个要求各异的任务。同时,我们利用 92 | 课程中学到的知识,设计了 Combiner 和 Partitioner, 分别用来优化网络传输带宽和自定义中间结果分区,整体解决方案效率较高。 93 | 
最终,我们对不同任务输出的结果进行分析,得到了关于此网站状态的有益的信息。 94 | 95 | \section{研究问题背景} 96 | 97 | 网页日志中包含了一个网页的各种活动信息,对网页日志进行分析,能得出许多深刻而有益的结论。 98 | 随着互联网的飞速发展,各种网站产生的数据飞速增长,复杂度和规模都在不断上升。日志分析是 99 | 典型的可并行化的问题,它包含对每条日志的同样的分析动作,分析产生的综合信息需要后续汇总 100 | 处理,同时,汇总的顺序并不会影响结论,因此非常适合采用 MapReduce 框架设计并行程序 101 | 分析处理。我们的课程设计设计了一套 MapReduce 程序,分析了某网站的网页日志数据,汇总 102 | 出几类信息,并在此基础上得出了关于此网站的一些结论。 103 | 104 | \section{技术难点和解决方案} 105 | \subsection{主要技术难点} 106 | 在完成实验时,我们遇到了如下一些问题。 107 | \begin{enumerate} 108 | \item 日志文件的可靠解析方式 109 | \item 对任务进行合理分解,并设计 MapReduce 并行化算法 110 | \item 合理设计代码结构以复用代码并提供未来的可扩展性 111 | \end{enumerate} 112 | 113 | 其中的技术难点主要在日志文件的解析和合理设计算法上。日志文件的解析涉及到比较复杂的字符串解析,如解析时间戳、解析URL等等, 114 | 这部分若自己实现,则显得繁琐,所以我们使用了开源的第三方库帮助我们解析日志。对于合理设计算法,难度则主要在设计可复用的代码 115 | 框架并提供可扩展性上,问题本身的可并行性则比较明显。 116 | 117 | 下面一节会首先介绍设计思路,然后详细介绍实现细节。 118 | \subsection{设计思路和详细设计说明} 119 | \subsubsection{日志解析(任务一)} 120 | 通过观察日志本身和查询资料,我们发现,此日志的格式为\textbf{Combined Log Format}\cite{clf}. 能够解析这种格式的第三方库非常多,我们采用 \verb|grok|\cite{grok}库帮助我们 121 | 直接解析日志文件格式。 122 | 123 | 我们自定义 \verb|Log|类以存储日志解析的结果,其内包含多个 \verb|final|字段和对应的 \verb|get|方法。 124 | 我们将其构造函数声明为 \verb|private|, 仅向外提供一个 \verb|parseLog|的静态方法,此方法接受日志字符串 125 | 并返回根据日志构造出的 \verb|Log|对象。这么做之后用户只能通过 \verb|parseLog|构造 \verb|Log|对象,能 126 | 防止构造函数被滥用。 127 | 128 | \verb|parseLog|方法基本上只是对 \verb|grok|库解析日志功能的封装,我们首先使用 \verb|grok|将日志字符串 129 | 解析为键值对,然后利用这些键值对初始化 \verb|Log|对象并返回。 130 | 131 | 对于解析出的日志文件,我们仅将其存储在内存中,并不会写入磁盘,后续的任务也不会从磁盘读取任务一 132 | 的中间结果。这是因为,我们写入的解析结果,在磁盘上仍以结构化文本形式存在,后续任务将其读取出来时, 133 | 并不能直接用来初始化对象,仍需要新的一轮字符串解析流程,而且这部分就得由我们自己手工编写了。这样 134 | 我们既没有提升什么效率,又增加了手工编写的工作量以及出错的几率,所以不如直接读取原始日志数据,然后 135 | 采用第三方库将其解析为内存中的数据结构。这样做不仅不会带来性能瓶颈,还能节约工作量,提升可靠性。 136 | 137 | \subsubsection{任务分解和并行算法设计(任务2--5)} 138 | 139 | 此课程设计中,任务一已在上一部分解决,剩余五个任务中,前四个可用 MapReduce 解决。这是因为,他们 140 | 都是对每条日志,提取出我们关心的信息输出作为中间结果,然后对中间结果综合形成最终结果。这非常适合 141 | 采用并行程序解决。我们将``从每条日志中提取中间结果''和``综合中间结果''两部分并行化,能带来非常大的 142 | 性能提升。 143 | 144 | 我们通过观察几个任务,能大致将任务分为两类。一类是 \verb|Count| 类任务,即我们要做的都是 145 | 统计某种东西出现的个数;另一类是 \verb|Unique| 类任务,即我们不仅统计某种现象的出现,还要对 146 | 现象的来源进行统计,即去重。我们针对这两类任务,分别设计了两套基类,通过对基类的继承和实现,我们 147 | 设计出针对每个任务的算法。 148 | 149 | 首先,我们介绍两套基类的实现。 150 | 151 | \paragraph{Count 类任务} 对此类任务,我们在 \verb|CountMapred.java| 文件中定义 \verb|CountMapred| 类来抽象解决方案。 152 | 153 | \verb|CountMapred| 类中包含两个静态抽象类,为 \verb|CountMapper| 和 \verb|CountReducer|,以及一个静态的 154 | \verb|bindJob| 方法。最后一个方法封装了此类的使用方法。 155 | 156 | 两个内部类分别重载了 \verb|map| 和 \verb|reduce| 方法。 157 | 158 | \verb|map| 方法的输入键值对为 \verb|<行偏移量, 一行的内容>|。 159 | 输入文件中一行就是一个完整的日志,所以\verb|map|方法可以直接采用这种输入键值对。 160 | \verb|map| 方法首先调用 \verb|Log| 类的 \verb|parseLog| 静态方法解析日志,然后 161 | 将日志解析结果传给自身的抽象方法 \verb|toKey| 获得需要输出的键。得到这个键以后,\verb|map| 就 162 | 判空并输出键值对\verb||。 \verb|map| 方法的输出键值对为 \verb|Text, 1|。其含义为 163 | 我们关心的资源出现了一次。 164 | 165 | \verb|map| 方法所调用的 \verb|toKey| 抽象方法是用来实现自定义策略的地方,使用 \verb|CountMapper| 的 166 | 程序员需要重载此方法,自定义如何从日志中获得感兴趣的信息。此方法可以提供灵活性和可扩展性。 167 | 168 | \verb|reduce| 则比较简单,它将资源出现的次数加起来,然后输出资源出现的总次数。\verb|reduce| 的输入键值对 169 | 类型是\verb||,其意义则是``我们关心的资源和其出现次数列表'',因为中间可能会经过 Combiner, 170 | 所以次数不一定为 1 。\verb|reduce| 的输出则是统计结果,格式为 \verb||,前者是资源名,后者则是总数。 171 | 172 | 我们在 \verb|bindJob| 中做的事,其实和平常直接写 MapReduce 程序时在 \verb|main| 函数 173 | 中做的事相近,此处只是将其封装起来,作为固定的执行策略。我们在这函数中,将 \verb|CountReducer| 174 | 同时设置为 \verb|Reducer| 和 \verb|Combiner|,因为 \verb|Combiner| 做的事情确实和 \verb|Reducer| 没有区别。 175 | 我们通过这种方式节约了网络传输带宽。 176 | 177 | 此类任务并不需要自定义 \verb|Partitioner|。 178 | 179 | \paragraph{Unique 类任务} 此类任务的策略定义在 \verb|UniqueMapred.java| 文件中,为 \verb|UniqueMapred| 类。 180 | 181 | 此类任务涉及到输入日志中的两个变量,而且涉及到去重,因此实现会更为复杂一些。 182 | 183 
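As an illustrative aside, the core ``count distinct \verb|key2| values per \verb|key1|'' bookkeeping over records that arrive already sorted by \verb|(key1, key2)| can be written as the standalone sketch below (the class name is hypothetical and the sketch is not part of the project sources); the \verb|UniqueReducer| described below performs the same \verb|lastK1|/\verb|lastK2|/\verb|count| updates, only spread across successive \verb|reduce| calls:

\begin{lstlisting}[language=Java]
import java.util.Arrays;
import java.util.List;

// Hypothetical standalone sketch, not part of the project sources.
public class UniqueCountSketch {
    public static void main(String[] args) {
        // Records sorted by (key1 = request, key2 = client IP).
        List<String[]> sorted = Arrays.asList(
                new String[]{"/index.html", "1.2.3.4"},
                new String[]{"/index.html", "1.2.3.4"}, // duplicate IP: ignored
                new String[]{"/index.html", "5.6.7.8"},
                new String[]{"/style.css", "1.2.3.4"});
        String lastK1 = null, lastK2 = null;
        int count = 0;
        for (String[] kv : sorted) {
            if (!kv[0].equals(lastK1)) {      // key1 changed: emit previous group
                if (lastK1 != null) System.out.println(lastK1 + "\t" + count);
                lastK1 = kv[0];
                lastK2 = null;
                count = 0;
            }
            if (!kv[1].equals(lastK2)) {      // first time this key2 appears under key1
                lastK2 = kv[1];
                count++;
            }
        }
        if (lastK1 != null) System.out.println(lastK1 + "\t" + count); // flush last group
    }
}
\end{lstlisting}

Running the sketch prints \verb|/index.html  2| and \verb|/style.css  1|: two distinct client IPs requested \verb|/index.html| and one requested \verb|/style.css|, which is exactly the per-resource independent-IP count that Task 3 produces.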
| 在之前的实验中,我们自己定义了 \verb|GenericPair| 类,它可以容纳一对支持 hadoop 接口 184 | 的量,自己也实现了 hadoop 要求的接口,因此可以直接在 MapReduce 的过程中使用。我们 185 | 继承这个类,定义 \verb|TextPair| 类,容纳两个 \verb|Text| 类型的变量。 186 | 187 | 我们自定义的 \verb|UniqueMapper| 向外提供两个可重载的抽象方法 \verb|toKey1| 和 \verb|toKey2|,用来 188 | 从日志中获取两个感兴趣的信息,提供灵活性和可扩展性。在 \verb|map| 方法中,我们依旧是首先解析日志,然后调用这两个 189 | 抽象方法获取一对信息,随后输出这一对信息。\verb|map| 的输入格式是 \verb|<偏移量,一行>|, 输出是 190 | \verb||。即输出的值留空,键是一对key。 191 | 192 | 我们也定义了 \verb|Combiner| 做本地的 \verb|reduce|,以节省网络带宽。这里我们要做的事很简单, 193 | 因为我们要去重,即对于 \verb|key1| 相同的,合并所有 \verb|key2| 相同的,所以我们 194 | 只需要丢弃 \verb|Combiner| 的 \verb|values|, 输出 \verb||即可。 195 | 196 | 我们自定义 \verb|Partitioner|,以保证相同第一个键的一定到达相同的 \verb|Reducer|。 197 | 自定义方式和课上学过的一样,仅将 \verb|TextPair| 的第一个发给 \verb|hashPartitioner| 即可。 198 | 199 | 最后是自定义的 \verb|UniqueReducer|。为了去重,我们在类中保存 3 个变量:\verb|lastK1|、\verb|lastK2| 200 | 和\verb|count|。前两者是记录上一次见到的两个键的内容,最后一个则是在第一个键相同的情况下, 201 | 有多少个不同的第二个键。 202 | 203 | \verb|setup| 函数将 \verb|lastK1| 和 \verb|lastK2| 初始化为空,\verb|count| 初始化为 0 。 204 | 205 | \verb|reduce| 的输入是 \verb||,此处仅有键是 206 | 有用的。对信息的统计和去重是在 \verb|reduce| 的多次运行中完成的。 207 | 经过我们自定义的 \verb|Partitioner| 和 hadoop 的排序,到达 \verb|Reducer|的 208 | \verb|Pair| 的顺序必定为具有相同\verb|key1| 的聚在一起,在这聚起来的一堆中,具有 209 | 相同 \verb|key2| 的也聚在一起。因此我们首先将 \verb|key1| 和上一次的比较,如果相同, 210 | 我们就再比较 \verb|key2|和上一次的, 若相同,就不做什么事情,否则更新 \verb|lastK2|, 211 | 并增加 \verb|count|。如果 \verb|key1| 也改变,而且上一次的不是空,说明我们已经统计完 212 | \verb|key1| 的信息了,就输出,并更新 \verb|lastK1|,最终将其余变量 213 | 更新为 \verb|null| 和 0。这里对算法的介绍未完全介绍判空的情形,实际比这个复杂,详见代码。 214 | 215 | \verb|reduce| 的输出是\verb||,其意义是对于 \verb|key1| 代表的资源, 216 | 互不相同的 \verb|key2| 出现了 \verb|count| 次。 217 | 218 | \verb|reduce| 的 \verb|cleanup| 方法用来输出可能未输出的最后一组信息。 219 | 220 | \verb|UniqueMapper| 同样封装了 \verb|bindJob| 方法,用于封装其使用方式。 221 | 222 | 有了这两套基类后,几个任务的实现就简化为继承需要的类,重载方法,在 \verb|main| 函数中初始化并调用 223 | \verb|bindJob| 三步走了。 224 | 225 | \textbf{任务二} 在 \verb|CountTimes.java|中实现。其继承 \verb|CountMapper|,重载 \verb|toKey| 为从 226 | \verb|Log| 中获取其 \verb|request| 信息。 227 | 228 | \textbf{任务三} 在 \verb|UniqueIP.java| 中实现。其继承 \verb|UniqueMapper|,重载 \verb|toKey1| 为从 229 | \verb|Log| 中获取其 \verb|request| 信息,\verb|toKey2| 为获取 \verb|clientIP| 信息。 230 | 231 | \textbf{任务四} 在 \verb|CountHour.java| 中实现。其继承 \verb|CountMapper|,重载 \verb|toKey| 为从 232 | \verb|Log| 中获取精确到小时的时间戳。 233 | 234 | \textbf{任务五} 在 \verb|CountTimes.java| 中实现。其继承 \verb|CountMapper|,重载 \verb|toKey| 为从 235 | \verb|Log| 中获取其 \verb|agent| 信息。 236 | 237 | \section{输入输出格式} 238 | 我们的每个任务的输入都是原始数据,其格式为\textbf{Combined Log Format}\cite{clf}。 239 | 240 | 任务的输出格式严格按照课程指导ppt实现,为\verb|key[\TAB]value|形式。 241 | 242 | \section{程序运行试验结果说明和分析} 243 | 244 | 按照要求跑出来的任务输出并没有按照值排序,因此我们写了一个很简单的 \verb|Sorter| 将输出的各项以值降序排列,以便看出规律。输出文件可在 hdfs 中找到。 245 | 246 | 此网站在两日内有 13770 次访问。用户通常在上午(8时 -- 12时)和傍晚至夜间(19时 -- 00时)访问量较大。在 2013 年 9 月 18 日的下午 15 时至 247 | 17 时,此网站访问量急剧上升,至原来的约1.5倍到两倍,可能是因为其举办了活动或因为一些新闻获得了额外的关注。通过 UA 的统计信息可以看出,此网站的用户使用 248 | 桌面端访问较多,使用 Windows, Macintosh,Linux 的用户数量差距不大,结合后面资源访问信息可看出网站的用户很可能是 IT 从业者。 249 | UA中还有各类爬虫,说明此网站对爬虫持开放态度,或技术不足。此网站的 UA 信息中单项最多的是\verb|DNSPod-Monitor/1.0|, 250 | 说明其域名解析使用的是DNSPod, 并开启了D监控。 251 | 252 | 此网站的资源获取信息中,滤去JavaScript, 图片等杂音,可以从资源中的各类计算机术语看出,这应当是某个IT相关网站,有多个板块,且有博客功能。 253 | 用户大部分时间用在浏览各种文章和博客,当日大概是有一个用户注册。总的来说是比较活跃的社区,不存在关站风险。 254 | 255 | \section{总结} 256 | 本次实验我们采用 MapReduce 并行程序设计对一个网站的日志进行了分析,得到相关信息,并得出了有益的结论。本次课程 257 | 设计的特点是采用 MapReduce 并行程序设计,大大加速了分析日志的速度,并具有较好的性能和可扩展性。 258 | 259 | 我们的并行程序能够统计\textbf{资源请求次数}、\textbf{独立IP}、 260 | \textbf{每小时访问网站次数}、\textbf{访问网站的浏览器类型}信息并输出为文件。我们另外 261 | 
还实现了一个简单的排序器提高输出文件的可读性。输出的文件仍需要进一步写脚本统计等等,并不是 262 | 非常清晰,可写另外的 MapReduce 过程,将多种统计信息联系起来,能得出更深刻的结论。 263 | 264 | 在性能上,我们采用自定义的 \verb|Combiner| 进行本地聚合,降低所需的网络传输带宽,同时采用 265 | 自定义 \verb|Partitioner| 消除了 \verb|reducer| 数据的相关性。但仍存在必须从原始文件 266 | 读入并解析的问题,可以做的改进是在任务一解析好并输出为二进制,在后续任务时,自定义 \verb|InputFormat| 267 | 直接读二进制初始化结构体,这样能达到更好的性能。 268 | 269 | 在可扩展性上,我们针对任务的特点,设计了两组基类并封装了两套处理逻辑,实现任务时,仅需要重载 270 | 对应方法即可,可扩展性较好。基类中仍有不少部分是写死的,在本次设计中是合理的,但若后续还有别的 271 | 任务需要处理,则需要提供更多可重载的抽象接口以提高扩展性。 272 | 273 | \newpage 274 | \begin{thebibliography}{99} 275 | \bibitem{clf} \href{https://httpd.apache.org/docs/2.4/logs.html#combined}{https://httpd.apache.org/docs/2.4/logs.html\#combined} 276 | \bibitem{grok} \href{https://github.com/thekrakken/java-grok}{https://github.com/thekrakken/java-grok} 277 | \end{thebibliography} 278 | \end{document} 279 | -------------------------------------------------------------------------------- /final/src/main/java/CountHour.java: -------------------------------------------------------------------------------- 1 | import org.apache.hadoop.conf.Configuration; 2 | import org.apache.hadoop.mapreduce.Job; 3 | 4 | import java.io.IOException; 5 | import java.text.SimpleDateFormat; 6 | 7 | public class CountHour { 8 | public static class CountMapper extends CountMapred.CountMapper { 9 | @Override 10 | public String toKey(Log log) { 11 | SimpleDateFormat format = new SimpleDateFormat("yyyyMMddHH"); 12 | return format.format(log.getTimestamp()); 13 | } 14 | } 15 | 16 | public static void main(String[] args) 17 | throws IOException, InterruptedException, ClassNotFoundException { 18 | Configuration conf = new Configuration(); 19 | Job job = Job.getInstance(conf, "CountHour"); 20 | job.setJarByClass(CountHour.class); 21 | CountMapred.bindJob(job, CountMapper.class, args[0], args[1]); 22 | System.exit(job.waitForCompletion(true) ? 
0 : 1); 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /final/src/main/java/CountMapred.java: -------------------------------------------------------------------------------- 1 | import org.apache.hadoop.fs.Path; 2 | import org.apache.hadoop.io.IntWritable; 3 | import org.apache.hadoop.io.Text; 4 | import org.apache.hadoop.mapreduce.Job; 5 | import org.apache.hadoop.mapreduce.Mapper; 6 | import org.apache.hadoop.mapreduce.Reducer; 7 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 8 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 9 | 10 | import java.io.IOException; 11 | import java.text.ParseException; 12 | 13 | public class CountMapred { 14 | public static abstract class CountMapper 15 | extends Mapper { 16 | // 实现这个方法以实现自定义策略 17 | abstract public String toKey(Log log); 18 | 19 | @Override 20 | public void map(Object key, Text value, Context context) 21 | throws IOException, InterruptedException { 22 | try { 23 | Log log = Log.parseLog(value.toString()); // 解析日志 24 | String key1 = toKey(log); // 调用自定义策略得到感兴趣的信息 25 | if (key1 != null) { // 判空 26 | context.write(new Text(key1), new IntWritable(1)); 27 | } 28 | } catch (ParseException e) { 29 | throw new RuntimeException(e); 30 | } 31 | } 32 | } 33 | 34 | public static class CountReducer 35 | extends Reducer { 36 | @Override 37 | public void reduce(Text key, Iterable values, Context context) 38 | throws IOException, InterruptedException { 39 | int total = 0; 40 | for (IntWritable i: values) { // 求和 41 | total += i.get(); 42 | } 43 | context.write(key, new IntWritable(total)); 44 | } 45 | } 46 | 47 | // 封装 main 函数 48 | public static void bindJob(Job job, Class mapper, 49 | String input, String output) throws IOException { 50 | job.setMapperClass(mapper); 51 | job.setCombinerClass(CountReducer.class); // reducer的实现同时也能作为combiner 52 | job.setReducerClass(CountReducer.class); 53 | job.setMapOutputKeyClass(Text.class); 54 | job.setMapOutputValueClass(IntWritable.class); 55 | job.setOutputKeyClass(Text.class); 56 | job.setOutputValueClass(IntWritable.class); 57 | FileInputFormat.addInputPath(job, new Path(input)); 58 | FileOutputFormat.setOutputPath(job, new Path(output)); 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /final/src/main/java/CountTimes.java: -------------------------------------------------------------------------------- 1 | import org.apache.hadoop.conf.Configuration; 2 | import org.apache.hadoop.mapreduce.Job; 3 | 4 | import java.io.IOException; 5 | 6 | public class CountTimes { 7 | public static class CountMapper extends CountMapred.CountMapper { 8 | @Override 9 | public String toKey(Log log) { 10 | return log.getRequest(); 11 | } 12 | } 13 | 14 | public static void main(String[] args) 15 | throws IOException, InterruptedException, ClassNotFoundException { 16 | Configuration conf = new Configuration(); 17 | Job job = Job.getInstance(conf, "CountTimes"); 18 | job.setJarByClass(CountTimes.class); 19 | CountMapred.bindJob(job, CountMapper.class, args[0], args[1]); 20 | System.exit(job.waitForCompletion(true) ? 
0 : 1); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /final/src/main/java/CountUA.java: -------------------------------------------------------------------------------- 1 | import org.apache.hadoop.conf.Configuration; 2 | import org.apache.hadoop.mapreduce.Job; 3 | 4 | import java.io.IOException; 5 | 6 | public class CountUA { 7 | public static class CountMapper extends CountMapred.CountMapper { 8 | @Override 9 | public String toKey(Log log) { 10 | return log.getAgent(); 11 | } 12 | } 13 | 14 | public static void main(String[] args) 15 | throws IOException, InterruptedException, ClassNotFoundException { 16 | Configuration conf = new Configuration(); 17 | Job job = Job.getInstance(conf, "CountUA"); 18 | job.setJarByClass(CountUA.class); 19 | CountMapred.bindJob(job, CountMapper.class, args[0], args[1]); 20 | System.exit(job.waitForCompletion(true) ? 0 : 1); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /final/src/main/java/GenericPair.java: -------------------------------------------------------------------------------- 1 | import org.apache.commons.math3.util.Pair; 2 | import org.apache.hadoop.io.WritableComparable; 3 | 4 | import java.io.DataInput; 5 | import java.io.DataOutput; 6 | import java.io.IOException; 7 | 8 | public abstract class GenericPair 9 | , V extends WritableComparable> 10 | extends Pair implements WritableComparable> { 11 | 12 | public GenericPair(K k, V v) { 13 | super(k, v); 14 | } 15 | 16 | @Override 17 | public int compareTo(GenericPair genericPair) { 18 | int r1 = getKey().compareTo(genericPair.getKey()); 19 | if (r1 != 0) { 20 | return r1; 21 | } 22 | return getValue().compareTo(genericPair.getValue()); 23 | } 24 | 25 | @Override 26 | public void write(DataOutput dataOutput) throws IOException { 27 | getKey().write(dataOutput); 28 | getValue().write(dataOutput); 29 | } 30 | 31 | @Override 32 | public void readFields(DataInput dataInput) throws IOException { 33 | getKey().readFields(dataInput); 34 | getValue().readFields(dataInput); 35 | } 36 | } -------------------------------------------------------------------------------- /final/src/main/java/Log.java: -------------------------------------------------------------------------------- 1 | import io.krakens.grok.api.Grok; 2 | import io.krakens.grok.api.GrokCompiler; 3 | import io.krakens.grok.api.Match; 4 | 5 | import java.text.ParseException; 6 | import java.text.SimpleDateFormat; 7 | import java.util.Date; 8 | import java.util.Locale; 9 | import java.util.Map; 10 | import java.util.Objects; 11 | 12 | public class Log { 13 | private final String request; 14 | private final String agent; 15 | private final String auth; 16 | private final String ident; 17 | private final String verb; 18 | private final String referrer; 19 | private final int response; 20 | private final int bytes; 21 | private final String clientIP; 22 | private final String httpVersion; 23 | private final String rawRequest; 24 | private final Date timestamp; 25 | 26 | private Log(String request, String agent, String auth, String ident, 27 | String verb, String referrer, int response, int bytes, 28 | String clientIP, String httpVersion, String rawRequest, Date timestamp) { 29 | this.request = request; 30 | this.agent = agent; 31 | this.auth = auth; 32 | this.ident = ident; 33 | this.verb = verb; 34 | this.referrer = referrer; 35 | this.response = response; 36 | this.bytes = bytes; 37 | this.clientIP = clientIP; 38 | this.httpVersion = 
httpVersion; 39 | this.rawRequest = rawRequest; 40 | this.timestamp = timestamp; 41 | } 42 | 43 | static public Log parseLog(String log) throws ParseException { 44 | GrokCompiler grokCompiler = GrokCompiler.newInstance(); 45 | grokCompiler.registerDefaultPatterns(); 46 | Grok grok = grokCompiler.compile("%{COMBINEDAPACHELOG}"); 47 | Match grokMatch = grok.match(log); 48 | Map capture = grokMatch.capture(); 49 | SimpleDateFormat format = new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss Z", Locale.ENGLISH); 50 | return new Log((String) capture.get("request"), (String) capture.get("agent"), 51 | (String) capture.get("auth"), (String) capture.get("ident"), 52 | (String) capture.get("verb"), (String) capture.get("referrer"), 53 | Integer.parseInt((String) capture.get("response")), 54 | Integer.parseInt((String) capture.get("bytes")), 55 | (String) capture.get("clientip"), (String) capture.get("httpversion"), 56 | (String) capture.get("rawrequest"), format.parse((String) capture.get("timestamp"))); 57 | } 58 | 59 | public Date getTimestamp() { 60 | return timestamp; 61 | } 62 | 63 | public int getBytes() { 64 | return bytes; 65 | } 66 | 67 | public int getResponse() { 68 | return response; 69 | } 70 | 71 | public String getAgent() { 72 | return agent; 73 | } 74 | 75 | public String getAuth() { 76 | return auth; 77 | } 78 | 79 | public String getClientIP() { 80 | return clientIP; 81 | } 82 | 83 | public String getHttpVersion() { 84 | return httpVersion; 85 | } 86 | 87 | public String getIdent() { 88 | return ident; 89 | } 90 | 91 | public String getRawRequest() { 92 | return rawRequest; 93 | } 94 | 95 | public String getReferrer() { 96 | return referrer; 97 | } 98 | 99 | public String getRequest() { 100 | return request; 101 | } 102 | 103 | public String getVerb() { 104 | return verb; 105 | } 106 | 107 | @Override 108 | public boolean equals(Object o) { 109 | if (this == o) return true; 110 | if (o == null || getClass() != o.getClass()) return false; 111 | Log log = (Log) o; 112 | return response == log.response && bytes == log.bytes && Objects.equals(request, log.request) && Objects.equals(agent, log.agent) && Objects.equals(auth, log.auth) && Objects.equals(ident, log.ident) && Objects.equals(verb, log.verb) && Objects.equals(referrer, log.referrer) && Objects.equals(clientIP, log.clientIP) && Objects.equals(httpVersion, log.httpVersion) && Objects.equals(rawRequest, log.rawRequest) && Objects.equals(timestamp, log.timestamp); 113 | } 114 | 115 | @Override 116 | public int hashCode() { 117 | return Objects.hash(request, agent, auth, ident, verb, referrer, response, bytes, clientIP, httpVersion, rawRequest, timestamp); 118 | } 119 | 120 | @Override 121 | public String toString() { 122 | return "Log{" + 123 | "request='" + request + '\'' + 124 | ", agent='" + agent + '\'' + 125 | ", auth='" + auth + '\'' + 126 | ", ident='" + ident + '\'' + 127 | ", verb='" + verb + '\'' + 128 | ", referrer='" + referrer + '\'' + 129 | ", response=" + response + 130 | ", bytes=" + bytes + 131 | ", clientIP='" + clientIP + '\'' + 132 | ", httpVersion='" + httpVersion + '\'' + 133 | ", rawRequest='" + rawRequest + '\'' + 134 | ", timestamp=" + timestamp + 135 | '}'; 136 | } 137 | } 138 | -------------------------------------------------------------------------------- /final/src/main/java/Sorter.java: -------------------------------------------------------------------------------- 1 | import org.apache.hadoop.conf.Configuration; 2 | import org.apache.hadoop.fs.Path; 3 | import org.apache.hadoop.io.FloatWritable; 4 | 
import org.apache.hadoop.io.IntWritable; 5 | import org.apache.hadoop.io.Text; 6 | import org.apache.hadoop.mapreduce.Job; 7 | import org.apache.hadoop.mapreduce.Mapper; 8 | import org.apache.hadoop.mapreduce.Reducer; 9 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 10 | import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat; 11 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 12 | 13 | import java.io.IOException; 14 | 15 | public class Sorter { 16 | public static class SorterMapper extends Mapper { 17 | @Override 18 | public void map(Text key, Text value, Context context) throws IOException, InterruptedException { 19 | context.write(new IntWritable(-Integer.parseInt(value.toString())), key); 20 | } 21 | } 22 | 23 | public static class SorterReducer extends Reducer { 24 | @Override 25 | public void reduce(IntWritable key, Iterable values, Context context) throws IOException, InterruptedException { 26 | for (Text v : values) { 27 | context.write(new IntWritable(-Integer.parseInt(key.toString())), v); 28 | } 29 | } 30 | } 31 | 32 | public static void main(String[] args) throws Exception{ 33 | Configuration conf = new Configuration(); 34 | Job job = Job.getInstance(conf, "Sorter"); 35 | job.setJarByClass(Sorter.class); 36 | job.setInputFormatClass(KeyValueTextInputFormat.class); 37 | job.setMapperClass(Sorter.SorterMapper.class); 38 | job.setReducerClass(Sorter.SorterReducer.class); 39 | job.setOutputKeyClass(IntWritable.class); 40 | job.setOutputValueClass(Text.class); 41 | job.setMapOutputKeyClass(IntWritable.class); 42 | job.setMapOutputValueClass(Text.class); 43 | FileInputFormat.addInputPath(job, new Path(args[0])); 44 | FileOutputFormat.setOutputPath(job, new Path(args[1])); 45 | System.exit(job.waitForCompletion(true) ? 0 : 1); 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /final/src/main/java/UniqueIP.java: -------------------------------------------------------------------------------- 1 | import org.apache.hadoop.conf.Configuration; 2 | import org.apache.hadoop.mapreduce.Job; 3 | 4 | import java.io.IOException; 5 | 6 | public class UniqueIP { 7 | public static class UniqueMapper extends UniqueMapred.UniqueMapper { 8 | 9 | @Override 10 | public String toKey1(Log log) { 11 | return log.getRequest(); 12 | } 13 | 14 | @Override 15 | public String toKey2(Log log) { 16 | return log.getClientIP(); 17 | } 18 | } 19 | 20 | public static void main(String[] args) 21 | throws IOException, InterruptedException, ClassNotFoundException { 22 | Configuration conf = new Configuration(); 23 | Job job = Job.getInstance(conf, "UniqueIP"); 24 | job.setJarByClass(UniqueIP.class); 25 | UniqueMapred.bindJob(job, UniqueMapper.class, args[0], args[1]); 26 | System.exit(job.waitForCompletion(true) ? 
0 : 1); 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /final/src/main/java/UniqueMapred.java: -------------------------------------------------------------------------------- 1 | import org.apache.hadoop.fs.Path; 2 | import org.apache.hadoop.io.IntWritable; 3 | import org.apache.hadoop.io.NullWritable; 4 | import org.apache.hadoop.io.Text; 5 | import org.apache.hadoop.mapred.lib.HashPartitioner; 6 | import org.apache.hadoop.mapreduce.Job; 7 | import org.apache.hadoop.mapreduce.Mapper; 8 | import org.apache.hadoop.mapreduce.Partitioner; 9 | import org.apache.hadoop.mapreduce.Reducer; 10 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 11 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 12 | 13 | import java.io.IOException; 14 | import java.text.ParseException; 15 | import java.util.Objects; 16 | 17 | public class UniqueMapred { 18 | // 的pair 19 | public static class TextPair extends GenericPair { 20 | public TextPair(Text text, Text text2) { 21 | super(text, text2); 22 | } 23 | 24 | public TextPair(String s1, String s2) { 25 | this(new Text(s1), new Text(s2)); 26 | } 27 | 28 | public TextPair() { 29 | this(new Text(), new Text()); 30 | } 31 | } 32 | public static abstract class UniqueMapper 33 | extends Mapper { 34 | 35 | // 实现这 2 个方法以实现自定义策略 36 | abstract public String toKey1(Log log); 37 | abstract public String toKey2(Log log); 38 | 39 | @Override 40 | public void map(Object key, Text value, Context context) 41 | throws IOException, InterruptedException { 42 | try { 43 | Log log = Log.parseLog(value.toString()); // 解析日志 44 | String key1 = toKey1(log), key2 = toKey2(log); // 获取感兴趣的信息 45 | if (key1 != null && key2 != null) { // 判空 46 | context.write(new TextPair(key1, key2), NullWritable.get()); 47 | } 48 | } catch (ParseException e) { 49 | throw new RuntimeException(e); 50 | } 51 | } 52 | } 53 | 54 | public static class UniqueCombiner 55 | extends Reducer { 56 | // 将重复的键合并 57 | @Override 58 | protected void reduce(TextPair key, Iterable values, Context context) 59 | throws IOException, InterruptedException { 60 | context.write(key, NullWritable.get()); 61 | } 62 | } 63 | 64 | public static class UniquePartitioner extends Partitioner { 65 | private final HashPartitioner hashPartitioner = new HashPartitioner<>(); 66 | 67 | // 确保具有相同key1的被分到同一个 reducer 68 | @Override 69 | public int getPartition(TextPair textPair, NullWritable nullWritable, int i) { 70 | return hashPartitioner.getPartition(textPair.getFirst(), nullWritable, i); 71 | } 72 | } 73 | 74 | public static class UniqueReducer 75 | extends Reducer { 76 | private String lastK1, lastK2; 77 | private int count; 78 | 79 | //初始化 80 | @Override 81 | protected void setup(Context context) { 82 | lastK1 = null; 83 | lastK2 = null; 84 | count = 0; 85 | } 86 | 87 | @Override 88 | public void reduce(TextPair key, Iterable values, Context context) 89 | throws IOException, InterruptedException { 90 | //遍历去重 91 | String k1 = key.getFirst().toString(), k2 = key.getSecond().toString(); 92 | if (!Objects.equals(k1, lastK1)) { 93 | if (lastK1 != null) { 94 | context.write(new Text(lastK1), new IntWritable(count)); 95 | } 96 | lastK1 = k1; 97 | lastK2 = null; 98 | count = 0; 99 | } 100 | if (!Objects.equals(k2, lastK2)) { 101 | lastK2 = k2; 102 | count += 1; 103 | } 104 | } 105 | 106 | // 输出(可能的)最后一组信息 107 | @Override 108 | protected void cleanup(Context context) throws IOException, InterruptedException { 109 | if (lastK1 != null) { 110 | context.write(new 
Text(lastK1), new IntWritable(count)); 111 | } 112 | } 113 | } 114 | 115 | public static void bindJob(Job job, Class mapper, 116 | String input, String output) throws IOException { 117 | job.setMapperClass(mapper); 118 | job.setCombinerClass(UniqueCombiner.class); 119 | job.setPartitionerClass(UniquePartitioner.class); 120 | job.setReducerClass(UniqueReducer.class); 121 | job.setMapOutputKeyClass(TextPair.class); 122 | job.setMapOutputValueClass(NullWritable.class); 123 | job.setOutputKeyClass(Text.class); 124 | job.setOutputValueClass(IntWritable.class); 125 | FileInputFormat.addInputPath(job, new Path(input)); 126 | FileOutputFormat.setOutputPath(job, new Path(output)); 127 | } 128 | } 129 | -------------------------------------------------------------------------------- /lab2/.gitignore: -------------------------------------------------------------------------------- 1 | target/ 2 | !.mvn/wrapper/maven-wrapper.jar 3 | !**/src/main/**/target/ 4 | !**/src/test/**/target/ 5 | 6 | ### IntelliJ IDEA ### 7 | .idea/modules.xml 8 | .idea/jarRepositories.xml 9 | .idea/compiler.xml 10 | .idea/libraries/ 11 | *.iws 12 | *.iml 13 | *.ipr 14 | 15 | ### Eclipse ### 16 | .apt_generated 17 | .classpath 18 | .factorypath 19 | .project 20 | .settings 21 | .springBeans 22 | .sts4-cache 23 | 24 | ### NetBeans ### 25 | /nbproject/private/ 26 | /nbbuild/ 27 | /dist/ 28 | /nbdist/ 29 | /.nb-gradle/ 30 | build/ 31 | !**/src/main/**/build/ 32 | !**/src/test/**/build/ 33 | 34 | ### VS Code ### 35 | .vscode/ 36 | 37 | ### Mac OS ### 38 | .DS_Store -------------------------------------------------------------------------------- /lab2/.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # 默认忽略的文件 2 | /shelf/ 3 | /workspace.xml 4 | -------------------------------------------------------------------------------- /lab2/.idea/encodings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /lab2/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /lab2/.idea/remote-targets.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 9 | 10 | 11 | 13 | 14 | 15 | 16 | 19 | 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /lab2/.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /lab2/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | nju 8 | InvertedIndex 9 | 1.0-SNAPSHOT 10 | 11 | 12 | 8 13 | 8 14 | UTF-8 15 | 2.7.4 16 | 17 | 18 | 19 | 20 | 21 | org.apache.hadoop 22 | hadoop-common 23 | ${hadoop.version} 24 | 25 | 26 | org.apache.hadoop 27 | hadoop-hdfs 28 | ${hadoop.version} 29 | 30 | 31 | org.apache.hadoop 32 | hadoop-mapreduce-client-core 33 | ${hadoop.version} 34 | 35 | 36 | org.apache.hadoop 37 | hadoop-mapreduce-client-jobclient 38 | ${hadoop.version} 39 | 40 | 41 | 42 | -------------------------------------------------------------------------------- /lab2/src/main/java/InvertedIndex.java: 
-------------------------------------------------------------------------------- 1 | import org.apache.hadoop.conf.Configuration; 2 | import org.apache.hadoop.fs.Path; 3 | import org.apache.hadoop.io.Text; 4 | import org.apache.hadoop.mapreduce.Job; 5 | import org.apache.hadoop.mapreduce.Mapper; 6 | import org.apache.hadoop.mapreduce.Reducer; 7 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 8 | import org.apache.hadoop.mapreduce.lib.input.FileSplit; 9 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 10 | 11 | import java.io.IOException; 12 | import java.util.HashMap; 13 | import java.util.Map; 14 | import java.util.StringTokenizer; 15 | 16 | public class InvertedIndex { 17 | public static class MyMapper extends Mapper { 18 | @Override 19 | public void map(Object key, Text value, Context context) 20 | throws IOException, InterruptedException { 21 | FileSplit fileSplit = (FileSplit) context.getInputSplit(); 22 | String fileName = fileSplit.getPath().getName(); 23 | Text fileNameText = new Text(fileName); 24 | StringTokenizer itr = new StringTokenizer(value.toString()); 25 | while (itr.hasMoreTokens()) { 26 | context.write(new Text(itr.nextToken()), fileNameText); 27 | } 28 | } 29 | } 30 | 31 | public static class MyReducer extends Reducer { 32 | @Override 33 | public void reduce(Text key, Iterable values, Context context) 34 | throws IOException, InterruptedException { 35 | HashMap timesMap = new HashMap<>(); 36 | int times = 0; 37 | boolean first = true; 38 | for (Text t: values) { 39 | String fileName = t.toString(); 40 | timesMap.put(fileName, timesMap.getOrDefault(fileName, 0) + 1); 41 | times += 1; 42 | } 43 | StringBuilder sb = new StringBuilder(); 44 | sb.append(String.format("%.2f", ((double) times) / timesMap.size())); 45 | for (Map.Entry entry: timesMap.entrySet()) { 46 | if (first) { 47 | sb.append(", "); 48 | first = false; 49 | } else { 50 | sb.append("; "); 51 | } 52 | sb.append(entry.getKey()).append(":").append(entry.getValue()); 53 | } 54 | context.write(new Text("[" + key.toString() + "]"), new Text(sb.toString())); 55 | } 56 | } 57 | 58 | public static void main(String[] args) throws Exception { 59 | Configuration conf = new Configuration(); 60 | Job job = Job.getInstance(conf, "InvertedIndex"); 61 | job.setJarByClass(InvertedIndex.class); 62 | job.setMapperClass(InvertedIndex.MyMapper.class); 63 | job.setReducerClass(InvertedIndex.MyReducer.class); 64 | job.setOutputKeyClass(Text.class); 65 | job.setOutputValueClass(Text.class); 66 | FileInputFormat.addInputPath(job, new Path(args[0])); 67 | FileOutputFormat.setOutputPath(job, new Path(args[1])); 68 | System.exit(job.waitForCompletion(true) ? 
0 : 1); 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /lab2/src/main/java/Sorter.java: -------------------------------------------------------------------------------- 1 | import org.apache.hadoop.conf.Configuration; 2 | import org.apache.hadoop.fs.Path; 3 | import org.apache.hadoop.io.FloatWritable; 4 | import org.apache.hadoop.io.Text; 5 | import org.apache.hadoop.mapreduce.Job; 6 | import org.apache.hadoop.mapreduce.Mapper; 7 | import org.apache.hadoop.mapreduce.Reducer; 8 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 9 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 10 | 11 | import java.io.IOException; 12 | 13 | public class Sorter { 14 | public static class SorterMapper extends Mapper { 15 | @Override 16 | public void map(Object key, Text value, Context context) throws IOException, InterruptedException { 17 | // text should be output of InvertedIndex. for example [我们] 1.0 @#$%$#@#$ 18 | String[] buffer = value.toString().split("[\\s,]+"); 19 | context.write(new FloatWritable(Float.parseFloat(buffer[1])), new Text(buffer[0])); 20 | } 21 | } 22 | 23 | public static class SorterReducer extends Reducer { 24 | @Override 25 | public void reduce(FloatWritable key, Iterable values, Context context) throws IOException, InterruptedException { 26 | for (Text v : values) { 27 | context.write(v, key); 28 | } 29 | } 30 | } 31 | 32 | public static void main(String[] args) throws Exception{ 33 | Configuration conf = new Configuration(); 34 | Job job = Job.getInstance(conf, "Sorter"); 35 | job.setJarByClass(Sorter.class); 36 | job.setMapperClass(Sorter.SorterMapper.class); 37 | job.setReducerClass(Sorter.SorterReducer.class); 38 | job.setOutputKeyClass(Text.class); 39 | job.setOutputValueClass(Text.class); 40 | job.setMapOutputKeyClass(FloatWritable.class); 41 | job.setMapOutputValueClass(Text.class); 42 | FileInputFormat.addInputPath(job, new Path(args[0])); 43 | FileOutputFormat.setOutputPath(job, new Path(args[1])); 44 | System.exit(job.waitForCompletion(true) ? 
0 : 1); 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /lab2/src/main/java/TFIDF.java: -------------------------------------------------------------------------------- 1 | import org.apache.hadoop.conf.Configuration; 2 | import org.apache.hadoop.fs.FileStatus; 3 | import org.apache.hadoop.fs.FileSystem; 4 | import org.apache.hadoop.fs.Path; 5 | import org.apache.hadoop.io.Text; 6 | import org.apache.hadoop.mapreduce.Job; 7 | import org.apache.hadoop.mapreduce.Mapper; 8 | import org.apache.hadoop.mapreduce.Reducer; 9 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 10 | import org.apache.hadoop.mapreduce.lib.input.FileSplit; 11 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 12 | 13 | import java.io.IOException; 14 | import java.util.HashMap; 15 | import java.util.Map; 16 | import java.util.StringTokenizer; 17 | 18 | public class TFIDF { 19 | 20 | /** 21 | * keyout: word_bookname 22 | * valueout: wordfreq 23 | */ 24 | public static class TMapper extends Mapper { 25 | @Override 26 | public void map(Object object, Text text, Context context) throws IOException, InterruptedException { 27 | FileSplit fileSplit = (FileSplit) context.getInputSplit(); 28 | HashMap n = new HashMap<>(); 29 | String bookName = fileSplit.getPath().getName(); 30 | StringTokenizer strtok = new StringTokenizer(text.toString()); 31 | while (strtok.hasMoreTokens()) { 32 | String s = strtok.nextToken(); 33 | n.put(s, n.getOrDefault(s, 0) + 1); 34 | } 35 | for (Map.Entry entry : n.entrySet()) { 36 | String key = entry.getKey(); 37 | Integer value = entry.getValue(); 38 | context.write(new Text(key), new Text(bookName + "_" + value.toString())); 39 | } 40 | } 41 | } 42 | 43 | public static class TReducer extends Reducer { 44 | private int total; 45 | 46 | @Override 47 | protected void setup(Context context) { 48 | total = Integer.parseInt(context.getConfiguration().get("total")); 49 | } 50 | 51 | @Override 52 | public void reduce(Text word, Iterable value, Context context) throws IOException, InterruptedException { 53 | String w = word.toString(); 54 | HashMap tfMap = new HashMap<>(); 55 | for (Text i : value) { 56 | String[] buf = i.toString().split("_"); 57 | String bookName = buf[0]; 58 | tfMap.put(bookName, tfMap.getOrDefault(bookName, 0) + Integer.parseInt(buf[1])); 59 | } 60 | int docnum = tfMap.size(); 61 | double idf = Math.log10((double) total / (docnum + 1)); 62 | for (Map.Entry entry : tfMap.entrySet()) { 63 | String bookName = entry.getKey(); 64 | Integer tf = entry.getValue(); 65 | context.write(new Text(String.format("%s, %s, %s-%s", bookName, w, tf.toString(), idf)), new Text()); 66 | } 67 | } 68 | } 69 | 70 | public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException { 71 | Configuration conf = new Configuration(); 72 | FileSystem hdfs = FileSystem.get(conf); 73 | FileStatus[] stats = hdfs.listStatus(new Path(args[0])); 74 | int DocSum = stats.length; 75 | hdfs.close(); 76 | // 全局变量传递 77 | conf.set("total", String.valueOf(DocSum)); 78 | 79 | Job job = Job.getInstance(conf, "TFIDF"); 80 | job.setJarByClass(TFIDF.class); 81 | job.setMapperClass(TFIDF.TMapper.class); 82 | job.setReducerClass(TFIDF.TReducer.class); 83 | job.setOutputKeyClass(Text.class); 84 | job.setOutputValueClass(Text.class); 85 | FileInputFormat.addInputPath(job, new Path(args[0])); 86 | FileOutputFormat.setOutputPath(job, new Path(args[1])); 87 | System.exit(job.waitForCompletion(true) ? 
0 : 1); 88 | 89 | } 90 | 91 | } 92 | -------------------------------------------------------------------------------- /lab2/src/main/resources/META-INF/MANIFEST.MF: -------------------------------------------------------------------------------- 1 | Manifest-Version: 1.0 2 | Main-Class: InvertedIndex 3 | 4 | -------------------------------------------------------------------------------- /lab3/.gitignore: -------------------------------------------------------------------------------- 1 | target/ 2 | !.mvn/wrapper/maven-wrapper.jar 3 | !**/src/main/**/target/ 4 | !**/src/test/**/target/ 5 | 6 | ### IntelliJ IDEA ### 7 | .idea/modules.xml 8 | .idea/jarRepositories.xml 9 | .idea/compiler.xml 10 | .idea/libraries/ 11 | *.iws 12 | *.iml 13 | *.ipr 14 | 15 | ### Eclipse ### 16 | .apt_generated 17 | .classpath 18 | .factorypath 19 | .project 20 | .settings 21 | .springBeans 22 | .sts4-cache 23 | 24 | ### NetBeans ### 25 | /nbproject/private/ 26 | /nbbuild/ 27 | /dist/ 28 | /nbdist/ 29 | /.nb-gradle/ 30 | build/ 31 | !**/src/main/**/build/ 32 | !**/src/test/**/build/ 33 | 34 | ### VS Code ### 35 | .vscode/ 36 | 37 | ### Mac OS ### 38 | .DS_Store -------------------------------------------------------------------------------- /lab3/.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # 默认忽略的文件 2 | /shelf/ 3 | /workspace.xml 4 | -------------------------------------------------------------------------------- /lab3/.idea/encodings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /lab3/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /lab3/.idea/remote-targets.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 9 | 10 | 11 | 13 | 14 | 15 | 16 | 19 | 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /lab3/.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /lab3/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | nju 8 | GroupHive 9 | 1.0-SNAPSHOT 10 | 11 | 12 | 8 13 | 8 14 | UTF-8 15 | 2.7.4 16 | 17 | 18 | 19 | 20 | 21 | org.apache.hadoop 22 | hadoop-common 23 | ${hadoop.version} 24 | 25 | 26 | org.apache.hadoop 27 | hadoop-hdfs 28 | ${hadoop.version} 29 | 30 | 31 | org.apache.hadoop 32 | hadoop-mapreduce-client-core 33 | ${hadoop.version} 34 | 35 | 36 | org.apache.hadoop 37 | hadoop-mapreduce-client-jobclient 38 | ${hadoop.version} 39 | 40 | 41 | 42 | -------------------------------------------------------------------------------- /lab3/src/main/java/GenericPair.java: -------------------------------------------------------------------------------- 1 | import org.apache.commons.math3.util.Pair; 2 | import org.apache.hadoop.io.WritableComparable; 3 | 4 | import java.io.DataInput; 5 | import java.io.DataOutput; 6 | import java.io.IOException; 7 | 8 | public abstract class GenericPair 9 | , V extends WritableComparable> 10 | extends Pair implements WritableComparable> { 11 | 12 | public GenericPair(K k, V v) { 13 | 
super(k, v); 14 | } 15 | 16 | @Override 17 | public int compareTo(GenericPair pairWritable) { 18 | int r1 = getKey().compareTo(pairWritable.getKey()); 19 | if (r1 != 0) { 20 | return r1; 21 | } 22 | return getValue().compareTo(pairWritable.getValue()); 23 | } 24 | 25 | @Override 26 | public void write(DataOutput dataOutput) throws IOException { 27 | getKey().write(dataOutput); 28 | getValue().write(dataOutput); 29 | } 30 | 31 | @Override 32 | public void readFields(DataInput dataInput) throws IOException { 33 | getKey().readFields(dataInput); 34 | getValue().readFields(dataInput); 35 | } 36 | } -------------------------------------------------------------------------------- /lab3/src/main/java/MaxShip.java: -------------------------------------------------------------------------------- 1 | import org.apache.commons.math3.util.Pair; 2 | import org.apache.hadoop.conf.Configuration; 3 | import org.apache.hadoop.fs.Path; 4 | import org.apache.hadoop.io.NullWritable; 5 | import org.apache.hadoop.io.Text; 6 | import org.apache.hadoop.mapreduce.Job; 7 | import org.apache.hadoop.mapreduce.Mapper; 8 | import org.apache.hadoop.mapreduce.Reducer; 9 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 10 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 11 | 12 | import java.io.IOException; 13 | import java.util.ArrayList; 14 | 15 | public class MaxShip { 16 | public static class MyMapper extends Mapper { 17 | @Override 18 | public void map(Object key, Text value, Context context) 19 | throws IOException, InterruptedException { 20 | String[] fields = value.toString().split("\\|"); 21 | context.write(new Text(fields[5]), new Text(fields[0] + "#" + fields[7])); 22 | } 23 | } 24 | 25 | public static class MyReducer extends Reducer { 26 | @Override 27 | public void reduce(Text key, Iterable values, Context context) 28 | throws IOException, InterruptedException { 29 | int maxShip = Integer.MIN_VALUE; 30 | ArrayList> maxShips = new ArrayList<>(); 31 | for (Text t: values) { 32 | String[] fields = t.toString().split("#"); 33 | int currShip = Integer.parseInt(fields[1]); 34 | if (currShip > maxShip) { 35 | maxShip = currShip; 36 | maxShips.clear(); 37 | maxShips.add(new Pair<>(fields[0], currShip)); 38 | } else if (currShip == maxShip) { 39 | maxShips.add(new Pair<>(fields[0], currShip)); 40 | } 41 | } 42 | for (Pair p: maxShips) { 43 | context.write(new Text(String.format("%s\t%s\t%d", p.getFirst(), key, p.getSecond())), 44 | NullWritable.get()); 45 | } 46 | } 47 | } 48 | 49 | public static void main(String[] args) throws Exception { 50 | Configuration conf = new Configuration(); 51 | Job job = Job.getInstance(conf, "MaxShip"); 52 | job.setJarByClass(MaxShip.class); 53 | job.setMapperClass(MaxShip.MyMapper.class); 54 | job.setReducerClass(MaxShip.MyReducer.class); 55 | job.setMapOutputKeyClass(Text.class); 56 | job.setMapOutputValueClass(Text.class); 57 | job.setOutputKeyClass(Text.class); 58 | job.setOutputValueClass(NullWritable.class); 59 | FileInputFormat.addInputPath(job, new Path(args[0])); 60 | FileOutputFormat.setOutputPath(job, new Path(args[1])); 61 | System.exit(job.waitForCompletion(true) ? 
0 : 1); 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /lab3/src/main/java/MaxShip2.java: -------------------------------------------------------------------------------- 1 | import org.apache.hadoop.conf.Configuration; 2 | import org.apache.hadoop.fs.Path; 3 | import org.apache.hadoop.io.IntWritable; 4 | import org.apache.hadoop.io.NullWritable; 5 | import org.apache.hadoop.io.Text; 6 | import org.apache.hadoop.mapreduce.Job; 7 | import org.apache.hadoop.mapreduce.Mapper; 8 | import org.apache.hadoop.mapreduce.Partitioner; 9 | import org.apache.hadoop.mapreduce.Reducer; 10 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 11 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 12 | import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner; 13 | 14 | import java.io.IOException; 15 | 16 | public class MaxShip2 { 17 | 18 | public static class MyPair extends GenericPair { 19 | public MyPair(Text text, IntWritable intWritable) { 20 | super(text, intWritable); 21 | } 22 | 23 | public MyPair() { 24 | this(new Text(), new IntWritable()); 25 | } 26 | } 27 | 28 | public static class MyMapper extends Mapper { 29 | 30 | @Override 31 | public void map(Object key, Text value, Context context) 32 | throws IOException, InterruptedException { 33 | String[] fields = value.toString().split("\\|"); 34 | context.write(new MyPair(new Text(fields[5]), 35 | new IntWritable(-Integer.parseInt(fields[7]))), 36 | new Text(fields[0])); 37 | } 38 | } 39 | 40 | public static class MyPartitioner extends Partitioner { 41 | private final HashPartitioner hashPartitioner = new HashPartitioner<>(); 42 | 43 | @Override 44 | public int getPartition(MyPair myPair, Text text, int i) { 45 | return hashPartitioner.getPartition(myPair.getKey(), text, i); 46 | } 47 | } 48 | 49 | public static class MyReducer extends Reducer { 50 | private String curr; 51 | 52 | @Override 53 | public void setup(Context ctx) { 54 | curr = ""; 55 | } 56 | 57 | @Override 58 | public void reduce(MyPair key, Iterable values, Context context) 59 | throws IOException, InterruptedException { 60 | String k = key.getKey().toString(); 61 | int v = key.getValue().get(); 62 | if (!curr.equals(k)) { 63 | curr = k; 64 | for (Text t: values) { 65 | context.write(new Text(String.format("%s\t%s\t%d", t.toString(), k, -v)), 66 | NullWritable.get()); 67 | } 68 | } 69 | } 70 | } 71 | 72 | public static void main(String[] args) throws Exception { 73 | Configuration conf = new Configuration(); 74 | Job job = Job.getInstance(conf, "MaxShip2"); 75 | job.setJarByClass(MaxShip2.class); 76 | job.setMapperClass(MaxShip2.MyMapper.class); 77 | job.setPartitionerClass(MaxShip2.MyPartitioner.class); 78 | job.setReducerClass(MaxShip2.MyReducer.class); 79 | job.setMapOutputKeyClass(MaxShip2.MyPair.class); 80 | job.setMapOutputValueClass(Text.class); 81 | job.setOutputKeyClass(Text.class); 82 | job.setOutputValueClass(NullWritable.class); 83 | FileInputFormat.addInputPath(job, new Path(args[0])); 84 | FileOutputFormat.setOutputPath(job, new Path(args[1])); 85 | System.exit(job.waitForCompletion(true) ? 
0 : 1); 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /lab3/src/main/java/SumNation.java: -------------------------------------------------------------------------------- 1 | import org.apache.hadoop.conf.Configuration; 2 | import org.apache.hadoop.fs.Path; 3 | import org.apache.hadoop.io.DoubleWritable; 4 | import org.apache.hadoop.io.Text; 5 | import org.apache.hadoop.mapreduce.Job; 6 | import org.apache.hadoop.mapreduce.Mapper; 7 | import org.apache.hadoop.mapreduce.Reducer; 8 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 9 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 10 | 11 | import java.io.IOException; 12 | 13 | public class SumNation { 14 | public static class MyMapper extends Mapper { 15 | @Override 16 | public void map(Object key, Text value, Context context) 17 | throws IOException, InterruptedException { 18 | String[] fields = value.toString().split("\\|"); 19 | context.write(new Text(fields[3]), new DoubleWritable(Double.parseDouble(fields[5]))); 20 | } 21 | } 22 | 23 | public static class MyReducer extends Reducer { 24 | @Override 25 | public void reduce(Text key, Iterable values, Context context) 26 | throws IOException, InterruptedException { 27 | double total = 0.0; 28 | for (DoubleWritable d: values) { 29 | total += d.get(); 30 | } 31 | context.write(key, new DoubleWritable(total)); 32 | } 33 | } 34 | 35 | public static void main(String[] args) throws Exception { 36 | Configuration conf = new Configuration(); 37 | Job job = Job.getInstance(conf, "SumNation"); 38 | job.setJarByClass(SumNation.class); 39 | job.setMapperClass(SumNation.MyMapper.class); 40 | job.setReducerClass(SumNation.MyReducer.class); 41 | job.setOutputKeyClass(Text.class); 42 | job.setOutputValueClass(DoubleWritable.class); 43 | FileInputFormat.addInputPath(job, new Path(args[0])); 44 | FileOutputFormat.setOutputPath(job, new Path(args[1])); 45 | System.exit(job.waitForCompletion(true) ? 
0 : 1); 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /lab4/.gitignore: -------------------------------------------------------------------------------- 1 | target/ 2 | !.mvn/wrapper/maven-wrapper.jar 3 | !**/src/main/**/target/ 4 | !**/src/test/**/target/ 5 | 6 | ### IntelliJ IDEA ### 7 | .idea/modules.xml 8 | .idea/jarRepositories.xml 9 | .idea/compiler.xml 10 | .idea/libraries/ 11 | *.iws 12 | *.iml 13 | *.ipr 14 | 15 | ### Eclipse ### 16 | .apt_generated 17 | .classpath 18 | .factorypath 19 | .project 20 | .settings 21 | .springBeans 22 | .sts4-cache 23 | 24 | ### NetBeans ### 25 | /nbproject/private/ 26 | /nbbuild/ 27 | /dist/ 28 | /nbdist/ 29 | /.nb-gradle/ 30 | build/ 31 | !**/src/main/**/build/ 32 | !**/src/test/**/build/ 33 | 34 | ### VS Code ### 35 | .vscode/ 36 | 37 | ### Mac OS ### 38 | .DS_Store -------------------------------------------------------------------------------- /lab4/.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # 默认忽略的文件 2 | /shelf/ 3 | /workspace.xml 4 | -------------------------------------------------------------------------------- /lab4/.idea/encodings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /lab4/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /lab4/.idea/remote-targets.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 9 | 10 | 11 | 13 | 14 | 15 | 16 | 19 | 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /lab4/.idea/uiDesigner.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | -------------------------------------------------------------------------------- /lab4/.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /lab4/README.md: -------------------------------------------------------------------------------- 1 | # Lab4 2 | 3 | ## JAR包执行方式说明 4 | 5 | 任务一: 6 | 7 | ```bash 8 | hadoop jar KNN-1.0-SNAPSHOT.jar KNN 9 | ``` 10 | 11 | OJ平台使用示例 12 | 13 | ```bash 14 | yarn jar KNN-1.0-SNAPSHOT.jar KNN 3 /data/exp4/iris_train.csv /data/exp4/iris_test.csv /user/2023stu_13/lab4-1out 15 | ``` 16 | 17 | 任务二: 18 | 19 | ```bash 20 | hadoop jar KNN-1.0-SNAPSHOT.jar WKNN 21 | ``` 22 | 23 | OJ平台使用示例 24 | 25 | ```bash 26 | yarn jar KNN-1.0-SNAPSHOT.jar WKNN 3 /data/exp4/iris_train.csv /data/exp4/iris_test.csv 
/lab4/pom.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
 3 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
 4 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
 5 |     <modelVersion>4.0.0</modelVersion>
 6 | 
 7 |     <groupId>nju</groupId>
 8 |     <artifactId>KNN</artifactId>
 9 |     <version>1.0-SNAPSHOT</version>
10 | 
11 |     <properties>
12 |         <maven.compiler.source>8</maven.compiler.source>
13 |         <maven.compiler.target>8</maven.compiler.target>
14 |         <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
15 |         <hadoop.version>2.7.4</hadoop.version>
16 |     </properties>
17 | 
18 |     <dependencies>
19 | 
20 |         <dependency>
21 |             <groupId>org.apache.hadoop</groupId>
22 |             <artifactId>hadoop-common</artifactId>
23 |             <version>${hadoop.version}</version>
24 |         </dependency>
25 |         <dependency>
26 |             <groupId>org.apache.hadoop</groupId>
27 |             <artifactId>hadoop-hdfs</artifactId>
28 |             <version>${hadoop.version}</version>
29 |         </dependency>
30 |         <dependency>
31 |             <groupId>org.apache.hadoop</groupId>
32 |             <artifactId>hadoop-mapreduce-client-core</artifactId>
33 |             <version>${hadoop.version}</version>
34 |         </dependency>
35 |         <dependency>
36 |             <groupId>org.apache.hadoop</groupId>
37 |             <artifactId>hadoop-mapreduce-client-jobclient</artifactId>
38 |             <version>${hadoop.version}</version>
39 |         </dependency>
40 |     </dependencies>
41 | 
42 | </project>
--------------------------------------------------------------------------------
/lab4/src/main/java/Data.java:
--------------------------------------------------------------------------------
 1 | public class Data {
 2 |     private final double[] data;
 3 |     private final String type;
 4 | 
 5 |     public Data(String[] d) {
 6 |         data = new double[4];
 7 |         for (int i = 0; i < 4; ++i) {
 8 |             data[i] = Double.parseDouble(d[i]);
 9 |         }
10 |         if (d.length > 4) {
11 |             type = d[4];  // training rows carry a class label in the fifth column
12 |         } else {
13 |             type = null;  // test rows have no label
14 |         }
15 |     }
16 | 
17 |     public String getType() {
18 |         return type;
19 |     }
20 | 
21 |     public double distance(Data d) {  // squared Euclidean distance (no square root taken)
22 |         double dis = 0;
23 |         for (int i = 0; i < 4; ++i) {
24 |             dis += Math.pow(data[i] - d.data[i], 2);
25 |         }
26 |         return dis;
27 |     }
28 | }
29 | 
--------------------------------------------------------------------------------
/lab4/src/main/java/KNN.java:
--------------------------------------------------------------------------------
 1 | import org.apache.hadoop.conf.Configuration;
 2 | import org.apache.hadoop.fs.Path;
 3 | import org.apache.hadoop.io.IntWritable;
 4 | import org.apache.hadoop.io.Text;
 5 | import org.apache.hadoop.mapreduce.Job;
 6 | import org.apache.hadoop.mapreduce.Reducer;
 7 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
 8 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
 9 | 
10 | import java.io.IOException;
11 | 
12 | public class KNN {
13 |     public static Job setupJob(Configuration conf, Class<?> Class, String name,
14 |                                Path input, Path output) throws IOException {
15 |         Job job = Job.getInstance(conf, name);
16 |         job.setJarByClass(Class);
17 |         job.setMapperClass(MyMapper.class);
18 |         job.setReducerClass(Reducer.class);  // base Reducer acts as an identity reducer: map output is written through unchanged
19 |         job.setMapOutputKeyClass(IntWritable.class);
20 |         job.setMapOutputValueClass(Text.class);
21 |         job.setOutputKeyClass(IntWritable.class);
22 |         job.setOutputValueClass(Text.class);
23 |         FileInputFormat.addInputPath(job, input);
24 |         FileOutputFormat.setOutputPath(job, output);
25 |         return job;
26 |     }
27 |     public static void main(String[] args)
28 |             throws IOException, InterruptedException, ClassNotFoundException {
29 |         Configuration conf = new Configuration();
30 |         conf.set("k", args[0]);              // number of neighbours
31 |         conf.set("train", args[1]);          // training set path, read in MyMapper.setup
32 |         conf.setBoolean("weighted", false);  // plain majority vote
33 |         Job job = setupJob(conf, KNN.class, "KNN", new Path(args[2]), new Path(args[3]));  // args[2] = test set, args[3] = output
34 |         System.exit(job.waitForCompletion(true) ? 0 : 1);
35 |     }
36 | }
37 | 
--------------------------------------------------------------------------------
/lab4/src/main/java/MyMapper.java:
--------------------------------------------------------------------------------
 1 | import org.apache.hadoop.conf.Configuration;
 2 | import org.apache.hadoop.fs.FSDataInputStream;
 3 | import org.apache.hadoop.fs.FileSystem;
 4 | import org.apache.hadoop.fs.Path;
 5 | import org.apache.hadoop.io.IntWritable;
 6 | import org.apache.hadoop.io.Text;
 7 | import org.apache.hadoop.mapreduce.Mapper;
 8 | 
 9 | import java.io.BufferedReader;
10 | import java.io.IOException;
11 | import java.io.InputStreamReader;
12 | import java.util.ArrayList;
13 | import java.util.PriorityQueue;
14 | 
15 | public class MyMapper extends Mapper<Object, Text, IntWritable, Text> {
16 |     private ArrayList<Data> trains;
17 |     private int curr_id, k;
18 |     private boolean weighted;
19 | 
20 |     @Override
21 |     public void setup(Context context)
22 |             throws IOException {
23 |         Configuration conf = context.getConfiguration();
24 |         FSDataInputStream train = FileSystem.get(conf).open(new Path(conf.get("train")));  // load the training set from HDFS once per map task
25 |         BufferedReader reader = new BufferedReader(new InputStreamReader(train));
26 |         String s;
27 |         trains = new ArrayList<>();
28 |         curr_id = 0;
29 |         k = Integer.parseInt(conf.get("k"));
30 |         weighted = conf.getBoolean("weighted", false);
31 |         while ((s = reader.readLine()) != null) {
32 |             trains.add(new Data(s.split(",")));
33 |         }
34 |         reader.close();
35 |     }
36 | 
37 |     @Override
38 |     public void map(Object key, Text value, Context context)
39 |             throws IOException, InterruptedException {
40 |         Data curr = new Data(value.toString().split(","));
41 |         PriorityQueue<Pair> pq = new PriorityQueue<>();
42 |         for (Data data: trains) {
43 |             pq.add(new Pair(curr.distance(data), data));
44 |             while (pq.size() > k) {
45 |                 pq.poll();  // Pair's reversed ordering keeps the farthest candidate at the head, so only the k nearest survive
46 |             }
47 |         }
48 |         context.write(new IntWritable(curr_id), new Text(Pair.mostFrequent(pq, weighted)));  // per-mapper sample id + voted label
49 |         curr_id += 1;
50 |     }
51 | }
--------------------------------------------------------------------------------
/lab4/src/main/java/Pair.java:
--------------------------------------------------------------------------------
 1 | import java.util.Comparator;
 2 | import java.util.HashMap;
 3 | import java.util.Map;
 4 | import java.util.NoSuchElementException;
 5 | 
 6 | public class Pair implements Comparable<Pair> {
 7 |     private final double first;
 8 |     private final Data second;
 9 | 
10 |     public Pair(double f, Data s) {
11 |         first = f;
12 |         second = s;
13 |     }
14 | 
15 |     public Data getSecond() {
16 |         return second;
17 |     }
18 | 
19 |     @Override
20 |     public int compareTo(Pair d) {
21 |         return -Double.compare(first, d.first);  // reversed: larger distance sorts first (max-heap behaviour in a PriorityQueue)
22 |     }
23 | 
24 |     public double getWeight() {
25 |         return Math.exp(-(first * first) / 2);  // 'first' already holds the squared distance from Data.distance
26 |     }
27 | 
28 |     public static String mostFrequent(Iterable<Pair> pairs, boolean weighted)
29 |             throws NoSuchElementException {
30 |         HashMap<String, Double> map = new HashMap<>();
31 |         for (Pair p: pairs) {
32 |             String type = p.getSecond().getType();
33 |             map.put(type, map.getOrDefault(type, 0.0) + (weighted ? p.getWeight() : 1.0));
34 |         }
35 |         return map.entrySet().stream().max(Comparator.comparingDouble(Map.Entry::getValue)).
36 |                 orElseThrow(NoSuchElementException::new).getKey();
37 |     }
38 | }
39 | 
--------------------------------------------------------------------------------
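The reversed compareTo is what makes the k-nearest selection in MyMapper work: Java's PriorityQueue is a min-heap, so with the ordering flipped, poll() always discards the candidate with the largest distance and the queue ends up holding the k closest training points. A minimal standalone sketch of that behaviour (not part of the lab code; it assumes it is compiled together with the Data and Pair classes above, and the feature values are invented rather than taken from the iris files):

```java
import java.util.PriorityQueue;

// Illustrative only: a hypothetical demo class, not present in the repository.
public class NearestDemo {
    public static void main(String[] args) {
        // Invented query point and three invented training points.
        Data query = new Data(new String[]{"5.0", "3.4", "1.5", "0.2"});
        Data[] train = {
                new Data(new String[]{"5.1", "3.5", "1.4", "0.2", "Iris-setosa"}),
                new Data(new String[]{"6.7", "3.1", "4.7", "1.5", "Iris-versicolor"}),
                new Data(new String[]{"4.9", "3.0", "1.4", "0.2", "Iris-setosa"})
        };

        int k = 2;
        PriorityQueue<Pair> pq = new PriorityQueue<>();
        for (Data d : train) {
            pq.add(new Pair(query.distance(d), d));
            while (pq.size() > k) {
                pq.poll();  // reversed ordering: this removes the farthest of the candidates
            }
        }
        // The two setosa points are closest, so the unweighted vote prints "Iris-setosa".
        System.out.println(Pair.mostFrequent(pq, false));
    }
}
```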
/lab4/src/main/java/WKNN.java:
--------------------------------------------------------------------------------
 1 | import org.apache.hadoop.conf.Configuration;
 2 | import org.apache.hadoop.fs.Path;
 3 | import org.apache.hadoop.mapreduce.Job;
 4 | 
 5 | import java.io.IOException;
 6 | 
 7 | public class WKNN extends KNN {
 8 |     public static void main(String[] args)
 9 |             throws IOException, InterruptedException, ClassNotFoundException {
10 |         Configuration conf = new Configuration();
11 |         conf.set("k", args[0]);
12 |         conf.set("train", args[1]);
13 |         conf.setBoolean("weighted", true);  // only difference from KNN: distance-weighted voting
14 |         Job job = setupJob(conf, WKNN.class, "WKNN", new Path(args[2]), new Path(args[3]));
15 |         System.exit(job.waitForCompletion(true) ? 0 : 1);
16 |     }
17 | }
18 | 
--------------------------------------------------------------------------------
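With the weighted flag set, Pair.mostFrequent adds exp(-first*first/2) per neighbour instead of 1, where first is the value returned by Data.distance (already a squared Euclidean distance), so far-away neighbours contribute almost nothing to the vote. A small sketch (not part of the lab code) of how quickly that weight falls off, using invented distance values and a dummy Data instance only to satisfy Pair's constructor:

```java
// Illustrative only: a hypothetical demo class, not present in the repository.
public class WeightDemo {
    public static void main(String[] args) {
        // Dummy point, needed only because Pair stores a Data reference.
        Data dummy = new Data(new String[]{"0", "0", "0", "0", "x"});
        // Invented squared-distance values.
        double[] squaredDistances = {0.03, 0.18, 1.0, 14.91};
        for (double d : squaredDistances) {
            // getWeight() evaluates Math.exp(-(d * d) / 2) for the stored value d.
            System.out.printf("squared distance = %.2f  ->  vote weight = %.6f%n",
                    d, new Pair(d, dummy).getWeight());
        }
    }
}
```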