├── .gitignore ├── LICENSE ├── README.md ├── pom.xml └── src └── main ├── java └── io │ └── miaoji │ └── flink │ └── algorithms │ └── chap02 │ ├── README.md │ └── SecondarySortingJob.java ├── resources ├── chap02 │ ├── mock.html │ ├── secondary_sorting_input.csv │ └── top10_input.txt ├── chap03 │ ├── mock.html │ └── top10_input.txt ├── chap04 │ ├── mock_for_transaction.html │ ├── mock_for_user.html │ ├── transaction_input.txt │ └── user_input.txt └── chap05 │ ├── input.txt │ └── mock.html └── scala └── io └── miaoji └── flink └── algorithms └── scala ├── chap02 └── SecondarySortingJob.scala ├── chap03 ├── README.md └── Top10.scala ├── chap04 └── LeftOuterJoin.scala └── chap05 ├── OrderInversion.scala └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled class file 2 | *.class 3 | 4 | # Log file 5 | *.log 6 | 7 | # BlueJ files 8 | *.ctxt 9 | 10 | # Mobile Tools for Java (J2ME) 11 | .mtj.tmp/ 12 | 13 | # Package Files # 14 | *.jar 15 | *.war 16 | *.nar 17 | *.ear 18 | *.zip 19 | *.tar.gz 20 | *.rar 21 | 22 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 23 | hs_err_pid* 24 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 
14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # flink-data-algorithms-book 2 | 本代码库基于[data-algorithms-book](https://github.com/mahmoudparsian/data-algorithms-book)的章节,通过flink框架实现其中的算法。 3 | 4 | ## 更新进度 5 | 6 | | | Java | Scala | 7 | | -------------------------- | ------------------------------------------------------------ | ------------------------------------------------------------ | 8 | | chap02 SecondarySortingJob | [✅](src/main/java/io/miaoji/flink/algorithms/chap02/SecondarySortingJob.java) | [✅](src/main/scala/io/miaoji/flink/algorithms/scala/chap02/SecondarySortingJob.scala) | 9 | | chap03 Top 10 | | [✅](src/main/scala/io/miaoji/flink/algorithms/scala/chap03/Top10.scala) | 10 | | chap04 Left Outer Join | | [✅](src/main/scala/io/miaoji/flink/algorithms/scala/chap04/LeftOuterJoin.scala) | 11 | | chap05 Order Inversion | | [✅](src/main/scala/io/miaoji/flink/algorithms/scala/chap05/OrderInversion.scala) | 12 | | chap06 - chap31 | ... | ... 
| 13 | 14 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | io.miaoji 8 | flink-data-algorithms-book 9 | 1.0-SNAPSHOT 10 | 11 | 2.11 12 | 1.6.2 13 | 14 | 15 | 16 | 17 | org.apache.flink 18 | flink-clients_${scala.binary.version} 19 | ${flink.version} 20 | 21 | 22 | 23 | org.apache.flink 24 | flink-scala_${scala.binary.version} 25 | ${flink.version} 26 | 27 | 28 | org.apache.flink 29 | flink-streaming-scala_${scala.binary.version} 30 | ${flink.version} 31 | 32 | 33 | 34 | 35 | -------------------------------------------------------------------------------- /src/main/java/io/miaoji/flink/algorithms/chap02/README.md: -------------------------------------------------------------------------------- 1 | # 原文 2 | [secondary_sorting](https://github.com/mahmoudparsian/data-algorithms-book/tree/master/src/main/java/org/dataalgorithms/chap02) 3 | 4 | # 输入示例 5 | [secondary_sorting_input.csv](./../../../../resources/chap02/secondary_sorting_input.csv) -------------------------------------------------------------------------------- /src/main/java/io/miaoji/flink/algorithms/chap02/SecondarySortingJob.java: --------------------------------------------------------------------------------
package io.miaoji.flink.algorithms.chap02;

import org.apache.flink.api.common.functions.GroupReduceFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.operators.Order;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.operators.DataSource;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.core.fs.FileSystem;
import org.apache.flink.util.Collector;

import java.net.URL;
import java.nio.file.Path;
import java.nio.file.Paths;

/**
 * Secondary sort: for every year-month key, emit the temperatures recorded in
 * that month in ascending order, joined with commas.
 *
 * <p>Input lines have the form {@code year,month,day,temperature}
 * (for example {@code 2000,12,04,10}). Each output record is a
 * {@code (yearMonth, "t1,t2,...")} tuple.
 */
public class SecondarySortingJob {

    /** Input fixture, resolved relative to the classpath root. */
    static final String INPUT_FILENAME = "chap02/secondary_sorting_input.csv";
    /** Output file, written next to the input under the classpath root. */
    static final String OUTPUT_FILENAME = "chap02/secondary_sorting_output.csv";

    /**
     * Runs the job in a local Flink environment.
     *
     * @param args unused
     * @throws Exception if the classpath root cannot be resolved or the Flink job fails
     */
    public static void main(String[] args) throws Exception {
        // Local execution environment.
        ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment();

        // Resolve input/output under the classpath root (target/classes when run from Maven/IDE).
        // getResource("") is null when the classpath has no directory root (e.g. running from a jar).
        URL classpathRoot = SecondarySortingJob.class.getClassLoader().getResource("");
        if (classpathRoot == null) {
            throw new IllegalStateException(
                    "Cannot locate the classpath root directory; run the job from a directory-based classpath");
        }
        // Going through URI/Path decodes URL escapes (e.g. %20) that URL.getPath() would leave in place.
        Path dir = Paths.get(classpathRoot.toURI());
        String inputFilePath = dir.resolve(INPUT_FILENAME).toUri().toString();
        String outputFilePath = dir.resolve(OUTPUT_FILENAME).toUri().toString();

        // Source: one CSV line per record.
        DataSource<String> dataSource = env.readTextFile(inputFilePath);

        DataSet<Tuple2<String, String>> result = dataSource
                // "2000,12,04,10" -> ("2000-12", 10, "04")
                .map(new MapFunction<String, Tuple3<String, Integer, String>>() {
                    @Override
                    public Tuple3<String, Integer, String> map(String s) throws Exception {
                        String[] fields = s.split(",");
                        String yearMonth = fields[0] + "-" + fields[1];
                        Integer temperature = Integer.parseInt(fields[3].trim());
                        String day = fields[2];
                        return new Tuple3<String, Integer, String>(yearMonth, temperature, day);
                    }
                })
                // Group by year-month and let Flink sort each group by temperature.
                // sortGroup is the only way to get a guaranteed order inside a group;
                // a sortPartition followed by groupBy().reduce() does not preserve it.
                .groupBy(0)
                .sortGroup(1, Order.ASCENDING)
                .reduceGroup(new GroupReduceFunction<Tuple3<String, Integer, String>, Tuple2<String, String>>() {
                    @Override
                    public void reduce(Iterable<Tuple3<String, Integer, String>> records,
                                       Collector<Tuple2<String, String>> out) throws Exception {
                        String yearMonth = null;
                        StringBuilder temperatures = new StringBuilder();
                        for (Tuple3<String, Integer, String> record : records) {
                            yearMonth = record.f0;
                            if (temperatures.length() > 0) {
                                temperatures.append(',');
                            }
                            temperatures.append(record.f1);
                        }
                        out.collect(new Tuple2<String, String>(yearMonth, temperatures.toString()));
                    }
                });

        // Single-threaded sink so that the result ends up in exactly one file.
        result.writeAsText(outputFilePath, FileSystem.WriteMode.OVERWRITE).setParallelism(1);

        env.execute("SecondarySorting");
    }

}
-------------------------------------------------------------------------------- /src/main/resources/chap02/mock.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /src/main/resources/chap02/secondary_sorting_input.csv: 
-------------------------------------------------------------------------------- 1 | 2000,12,04,10 2 | 2000,11,01,20 3 | 2000,12,02,-20 4 | 2000,11,07,30 -------------------------------------------------------------------------------- /src/main/resources/chap02/top10_input.txt: -------------------------------------------------------------------------------- 1 | GOOG,1980-02-18,461.78 2 | ILMN,1998-07-01,177.57 3 | IBM,1972-11-06,529.76 4 | GOOG,2007-05-14,268.83 5 | ILMN,2017-05-06,730.35 6 | GOOG,1978-08-04,206.63 7 | GOOG,1989-08-26,323.74 8 | ILMN,1983-02-12,728.37 9 | ILMN,1983-05-16,760.43 10 | GOOG,2003-04-27,552.22 11 | IBM,2017-10-07,254.39 12 | GOOG,2009-03-24,319.83 13 | GOOG,1985-12-02,680.77 14 | ILMN,1999-11-26,410.83 15 | GOOG,2017-02-26,603.11 16 | IBM,1988-06-02,302.76 17 | ILMN,1995-08-13,910.66 18 | GOOG,2001-11-15,805.84 19 | IBM,1990-04-03,968.45 20 | GOOG,2006-03-13,283.94 21 | GOOG,1970-01-06,673.12 22 | IBM,1989-02-26,534.24 23 | ILMN,2003-01-13,601.84 24 | GOOG,1975-08-03,262.16 25 | GOOG,2013-08-26,930.35 26 | IBM,1997-07-25,287.23 27 | IBM,1975-09-19,204.38 28 | GOOG,2011-09-01,993.13 29 | GOOG,1970-08-02,806.29 30 | ILMN,1985-03-14,973.32 -------------------------------------------------------------------------------- /src/main/resources/chap03/mock.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /src/main/resources/chap03/top10_input.txt: -------------------------------------------------------------------------------- 1 | cat1,64 2 | cat2,116 3 | cat3,160 4 | cat4,123 5 | cat5,154 6 | cat6,179 7 | cat7,93 8 | cat8,160 9 | cat9,106 10 | cat10,36 11 | cat11,167 12 | cat12,130 13 | cat13,105 14 | cat14,46 15 | cat15,40 16 | cat16,178 17 | cat17,168 18 | cat18,160 19 | cat19,171 20 | cat20,163 21 | cat21,143 22 | cat22,74 23 | cat23,68 24 | cat24,110 25 | cat25,133 26 | cat26,137 27 | cat27,53 28 | cat28,87 29 | 
cat29,183 30 | cat30,198 31 | cat31,153 32 | cat32,82 33 | cat33,148 34 | cat34,142 35 | cat35,199 36 | cat36,14 37 | cat37,85 38 | cat38,182 39 | cat39,114 40 | cat40,100 41 | cat41,26 42 | cat42,108 43 | cat43,141 44 | cat44,110 45 | cat45,42 46 | cat46,170 47 | cat47,121 48 | cat48,101 49 | cat49,103 50 | cat50,157 51 | cat51,171 52 | cat52,192 53 | cat53,135 54 | cat54,76 55 | cat55,62 56 | cat56,169 57 | cat57,143 58 | cat58,137 59 | cat59,57 60 | cat60,69 61 | cat61,94 62 | cat62,81 63 | cat63,109 64 | cat64,67 65 | cat65,100 66 | cat66,2 67 | cat67,172 68 | cat68,135 69 | cat69,127 70 | cat70,99 71 | cat71,26 72 | cat72,32 73 | cat73,130 74 | cat74,120 75 | cat75,17 76 | cat76,163 77 | cat77,2 78 | cat78,187 79 | cat79,60 80 | cat80,69 81 | cat81,43 82 | cat82,46 83 | cat83,188 84 | cat84,164 85 | cat85,91 86 | cat86,183 87 | cat87,71 88 | cat88,173 89 | cat89,25 90 | cat90,140 91 | cat91,45 92 | cat92,174 93 | cat93,117 94 | cat94,70 95 | cat95,199 96 | cat96,72 97 | cat97,62 98 | cat98,126 99 | cat99,130 100 | cat100,27 -------------------------------------------------------------------------------- /src/main/resources/chap04/mock_for_transaction.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /src/main/resources/chap04/mock_for_user.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /src/main/resources/chap04/transaction_input.txt: -------------------------------------------------------------------------------- 1 | t1,p31,u5,2,205 2 | t2,p36,u41,2,257 3 | t3,p22,u89,1,29 4 | t4,p4,u21,1,131 5 | t5,p24,u93,2,68 6 | t6,p8,u76,2,63 7 | t7,p47,u48,2,148 8 | t8,p10,u47,2,18 9 | t9,p43,u8,1,222 10 | t10,p38,u33,1,141 11 | t11,p19,u5,2,86 12 | t12,p34,u85,1,49 13 | t13,p16,u5,1,217 14 | t14,p47,u63,2,97 15 
| t15,p32,u82,2,113 16 | t16,p16,u38,2,91 17 | t17,p42,u63,2,265 18 | t18,p15,u42,1,60 19 | t19,p30,u73,1,179 20 | t20,p25,u83,2,247 21 | t21,p43,u41,2,17 22 | t22,p40,u9,1,100 23 | t23,p24,u45,2,193 24 | t24,p31,u78,1,199 25 | t25,p5,u2,2,255 26 | t26,p5,u30,1,158 27 | t27,p12,u74,2,105 28 | t28,p28,u69,1,157 29 | t29,p30,u24,1,35 30 | t30,p22,u74,2,214 31 | t31,p48,u62,1,266 32 | t32,p24,u62,1,219 33 | t33,p24,u72,2,286 34 | t34,p5,u81,2,179 35 | t35,p13,u3,2,249 36 | t36,p38,u92,1,82 37 | t37,p21,u36,1,80 38 | t38,p15,u23,1,86 39 | t39,p43,u99,1,287 40 | t40,p49,u67,1,98 41 | t41,p6,u88,1,292 42 | t42,p46,u48,1,63 43 | t43,p33,u39,2,33 44 | t44,p17,u73,1,11 45 | t45,p2,u59,2,72 46 | t46,p42,u95,1,184 47 | t47,p36,u63,2,276 48 | t48,p12,u91,1,188 49 | t49,p49,u17,1,206 50 | t50,p6,u57,1,71 51 | t51,p37,u92,1,225 52 | t52,p10,u82,2,97 53 | t53,p40,u46,1,181 54 | t54,p23,u81,2,180 55 | t55,p39,u12,1,50 56 | t56,p46,u73,1,222 57 | t57,p43,u13,2,99 58 | t58,p13,u50,1,41 59 | t59,p40,u70,2,12 60 | t60,p47,u26,2,103 61 | t61,p20,u29,1,196 62 | t62,p10,u67,1,121 63 | t63,p1,u25,1,92 64 | t64,p42,u35,2,118 65 | t65,p28,u53,1,274 66 | t66,p30,u81,1,93 67 | t67,p17,u26,2,218 68 | t68,p1,u79,2,39 69 | t69,p33,u22,2,19 70 | t70,p47,u1,1,31 71 | t71,p25,u42,2,186 72 | t72,p32,u68,1,115 73 | t73,p29,u34,1,197 74 | t74,p9,u43,1,32 75 | t75,p11,u15,1,171 76 | t76,p36,u78,1,198 77 | t77,p18,u11,2,122 78 | t78,p23,u6,2,245 79 | t79,p19,u32,1,228 80 | t80,p41,u12,1,206 81 | t81,p38,u9,2,175 82 | t82,p30,u77,1,289 83 | t83,p41,u57,1,106 84 | t84,p24,u47,1,212 85 | t85,p12,u67,1,199 86 | t86,p20,u20,2,228 87 | t87,p23,u16,1,240 88 | t88,p11,u34,2,84 89 | t89,p38,u21,1,264 90 | t90,p35,u85,1,76 91 | t91,p30,u100,1,215 92 | t92,p7,u35,2,296 93 | t93,p18,u27,1,278 94 | t94,p15,u60,2,40 95 | t95,p6,u68,1,257 96 | t96,p49,u19,2,290 97 | t97,p27,u58,1,34 98 | t98,p37,u49,2,162 99 | t99,p9,u51,2,83 100 | t100,p47,u67,1,150 101 | t101,p15,u41,1,150 102 | t102,p48,u9,2,61 103 | 
t103,p42,u49,1,285 104 | t104,p4,u8,1,83 105 | t105,p10,u91,2,162 106 | t106,p8,u27,1,67 107 | t107,p44,u88,1,41 108 | t108,p49,u65,1,43 109 | t109,p30,u49,1,120 110 | t110,p36,u55,1,264 111 | t111,p31,u100,2,135 112 | t112,p5,u58,1,22 113 | t113,p3,u35,1,47 114 | t114,p30,u86,1,76 115 | t115,p6,u47,2,44 116 | t116,p41,u27,2,290 117 | t117,p15,u93,1,155 118 | t118,p34,u88,1,158 119 | t119,p10,u57,2,209 120 | t120,p45,u62,1,258 121 | t121,p43,u72,2,215 122 | t122,p38,u68,1,185 123 | t123,p19,u37,2,13 124 | t124,p40,u69,1,234 125 | t125,p42,u79,1,131 126 | t126,p21,u58,1,71 127 | t127,p19,u35,2,20 128 | t128,p35,u5,2,31 129 | t129,p48,u5,2,281 130 | t130,p17,u75,2,45 131 | t131,p41,u89,2,26 132 | t132,p48,u10,1,156 133 | t133,p38,u32,2,154 134 | t134,p25,u62,2,179 135 | t135,p41,u54,2,55 136 | t136,p29,u81,2,122 137 | t137,p13,u78,2,25 138 | t138,p43,u35,1,223 139 | t139,p44,u74,1,186 140 | t140,p27,u37,2,146 141 | t141,p43,u56,2,53 142 | t142,p14,u28,1,51 143 | t143,p38,u2,1,11 144 | t144,p36,u76,2,216 145 | t145,p12,u12,1,155 146 | t146,p2,u90,2,209 147 | t147,p19,u96,2,209 148 | t148,p48,u24,2,175 149 | t149,p14,u89,1,93 150 | t150,p37,u22,2,203 151 | t151,p6,u80,1,219 152 | t152,p11,u72,1,72 153 | t153,p10,u62,2,42 154 | t154,p23,u39,1,237 155 | t155,p11,u49,1,18 156 | t156,p6,u60,2,16 157 | t157,p34,u45,1,288 158 | t158,p29,u53,1,282 159 | t159,p7,u61,1,49 160 | t160,p26,u19,2,133 161 | t161,p47,u57,2,229 162 | t162,p47,u23,1,227 163 | t163,p12,u20,2,298 164 | t164,p35,u45,2,275 165 | t165,p1,u22,2,32 166 | t166,p30,u29,1,183 167 | t167,p26,u2,1,48 168 | t168,p3,u79,1,235 169 | t169,p14,u56,1,210 170 | t170,p31,u55,1,116 171 | t171,p32,u54,2,26 172 | t172,p37,u37,2,226 173 | t173,p31,u51,2,19 174 | t174,p15,u31,2,249 175 | t175,p32,u27,2,96 176 | t176,p1,u95,1,205 177 | t177,p21,u42,2,284 178 | t178,p16,u45,1,168 179 | t179,p29,u77,2,84 180 | t180,p23,u75,2,84 181 | t181,p30,u25,1,47 182 | t182,p32,u33,2,25 183 | t183,p3,u9,2,102 184 | t184,p10,u87,1,213 185 | 
t185,p49,u2,1,155 186 | t186,p20,u6,1,56 187 | t187,p10,u61,1,94 188 | t188,p11,u31,2,106 189 | t189,p47,u91,2,49 190 | t190,p4,u42,1,141 191 | t191,p27,u45,2,269 192 | t192,p49,u51,2,24 193 | t193,p4,u67,2,118 194 | t194,p46,u72,2,246 195 | t195,p35,u33,2,271 196 | t196,p5,u77,1,113 197 | t197,p32,u50,2,214 198 | t198,p41,u57,1,72 199 | t199,p14,u69,2,132 200 | t200,p30,u88,1,289 -------------------------------------------------------------------------------- /src/main/resources/chap04/user_input.txt: -------------------------------------------------------------------------------- 1 | u1,DE 2 | u2,NE 3 | u3,MI 4 | u4,MA 5 | u5,CA 6 | u6,MA 7 | u7,MA 8 | u8,DE 9 | u9,HI 10 | u10,MI 11 | u11,NE 12 | u12,WA 13 | u13,MP 14 | u14,NE 15 | u15,MI 16 | u16,CA 17 | u17,CA 18 | u18,MA 19 | u19,CA 20 | u20,WI 21 | u21,HI 22 | u22,HI 23 | u23,HI 24 | u24,DE 25 | u25,MA 26 | u26,NE 27 | u27,DE 28 | u28,MP 29 | u29,MA 30 | u30,HI 31 | u31,CA 32 | u32,HI 33 | u33,CA 34 | u34,MS 35 | u35,WI 36 | u36,KS 37 | u37,MA 38 | u38,MA 39 | u39,DE 40 | u40,DE 41 | u41,WI 42 | u42,WI 43 | u43,WI 44 | u44,MA 45 | u45,MS 46 | u46,NE 47 | u47,HI 48 | u48,KS 49 | u49,MA 50 | u50,MP 51 | u51,MI 52 | u52,NE 53 | u53,HI 54 | u54,WA 55 | u55,CA 56 | u56,MS 57 | u57,CO 58 | u58,MA 59 | u59,KS 60 | u60,HI 61 | u61,MP 62 | u62,MI 63 | u63,MS 64 | u64,WI 65 | u65,WA 66 | u66,DE 67 | u67,DE 68 | u68,WI 69 | u69,HI 70 | u70,MI 71 | u71,HI 72 | u72,WI 73 | u73,CO 74 | u74,NE 75 | u75,DE 76 | u76,MI 77 | u77,NE 78 | u78,WA 79 | u79,CO 80 | u80,HI 81 | u81,NE 82 | u82,MI 83 | u83,MS 84 | u84,WI 85 | u85,WI 86 | u86,WI 87 | u87,MA 88 | u88,HI 89 | u89,CO 90 | u90,CA 91 | u91,MI 92 | u92,HI 93 | u93,WI 94 | u94,WI 95 | u95,MA 96 | u96,NE 97 | u97,CO 98 | u98,KS 99 | u99,MS 100 | u100,WA -------------------------------------------------------------------------------- /src/main/resources/chap05/input.txt: -------------------------------------------------------------------------------- 1 | java is a great 
language 2 | java is a programming language 3 | java is green fun language 4 | java is great 5 | programming with java is fun -------------------------------------------------------------------------------- /src/main/resources/chap05/mock.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /src/main/scala/io/miaoji/flink/algorithms/scala/chap02/SecondarySortingJob.scala: --------------------------------------------------------------------------------
package io.miaoji.flink.algorithms.scala.chap02

import org.apache.flink.api.common.operators.Order
import org.apache.flink.api.scala._
import org.apache.flink.core.fs.FileSystem.WriteMode

/**
  * Secondary sort: for every name, emit its (date,amount) pairs ordered by date.
  *
  * Input lines have the form `name,date,amount` (e.g. `GOOG,1980-02-18,461.78`).
  * Each output record is `(name, "(date1,amount1)(date2,amount2)...")`.
  */
object SecondarySortingJob {

  // The three-column name/date/amount fixture lives under chap02.
  val INPUT_FILE_NAME = "chap02/top10_input.txt"
  val OUTPUT_FILE_NAME = "chap02/secondary_sorting_output.txt"

  def main(args: Array[String]): Unit = {
    val env = ExecutionEnvironment.getExecutionEnvironment

    // Classpath root (target/classes); null when there is no directory root, e.g. inside a jar.
    val dir = this.getClass.getClassLoader.getResource("")
    require(dir != null, "Cannot locate the classpath root directory")
    val inputFilePath = new java.net.URL(dir, INPUT_FILE_NAME).toString
    val outputFilePath = new java.net.URL(dir, OUTPUT_FILE_NAME).toString

    val result = env.readTextFile(inputFilePath)
      // STEP-1: split on commas
      .map[Array[String]] { x: String => x.split(",") }
      // STEP-2: keep only well-formed records (exactly three fields)
      .filter(_.length == 3)
      // STEP-3: (name, date, amount)
      .map { x: Array[String] => (x(0), x(1), x(2)) }
      // STEP-4: group by name and sort every group by date.
      // ISO dates (yyyy-MM-dd) sort correctly as plain strings.
      .groupBy(0)
      .sortGroup(1, Order.ASCENDING)
      // STEP-5: concatenate the "(date,amount)" pairs of a group in sorted order
      .reduceGroup { records: Iterator[(String, String, String)] =>
        var name: String = null
        val pairs = new StringBuilder
        records.foreach { case (symbol, date, amount) =>
          name = symbol
          pairs.append(s"($date,$amount)")
        }
        (name, pairs.toString)
      }

    // Single-threaded sink so that the result ends up in exactly one file.
    result.writeAsText(outputFilePath, WriteMode.OVERWRITE).setParallelism(1)

    env.execute("Secondary Sorting")
  }
}
-------------------------------------------------------------------------------- /src/main/scala/io/miaoji/flink/algorithms/scala/chap03/README.md: -------------------------------------------------------------------------------- 1 | # 原文 2 | [top10](https://github.com/mahmoudparsian/data-algorithms-book/tree/master/src/main/java/org/dataalgorithms/chap03) 3 | 4 | # 输入示例 5 | [top10_input.txt](./../../../../resources/chap03/top10_input.txt) -------------------------------------------------------------------------------- /src/main/scala/io/miaoji/flink/algorithms/scala/chap03/Top10.scala: --------------------------------------------------------------------------------
package io.miaoji.flink.algorithms.scala.chap03

import org.apache.flink.api.common.operators.Order
import org.apache.flink.api.scala._
import org.apache.flink.core.fs.FileSystem.WriteMode

/**
  * Top 10: emit the ten heaviest cats.
  *
  * Input lines have the form `catId,weight` (e.g. `cat1,64`).
  * Output records are `(catId, weight)` tuples in descending weight order.
  */
object Top10 {

  val INPUT_FILE_NAME = "chap03/top10_input.txt"
  val OUTPUT_FILE_NAME = "chap03/top10_output.txt"
  /** Number of records to keep. */
  val TOP_N = 10

  def main(args: Array[String]): Unit = {
    val env = ExecutionEnvironment.getExecutionEnvironment

    // Classpath root (target/classes); null when there is no directory root, e.g. inside a jar.
    val dir = this.getClass.getClassLoader.getResource("")
    require(dir != null, "Cannot locate the classpath root directory")
    val inputFilePath = new java.net.URL(dir, INPUT_FILE_NAME).toString
    val outputFilePath = new java.net.URL(dir, OUTPUT_FILE_NAME).toString

    env.readTextFile(inputFilePath)
      // STEP-1: split on commas
      .map[Array[String]] { x: String => x.split(",") }
      // STEP-2: keep only well-formed records (exactly two fields)
      .filter(_.length == 2)
      // STEP-3: (cat id, cat weight)
      .map { x: Array[String] => (x(0), x(1).trim.toInt) }
      // STEP-4: global sort by weight (field 1, zero-based), descending.
      // Running sortPartition with parallelism 1 puts all records in one sorted partition.
      .sortPartition(1, Order.DESCENDING).setParallelism(1)
      // STEP-5: keep the first TOP_N records of that sorted partition
      .mapPartition { cats: Iterator[(String, Int)] => cats.take(TOP_N) }.setParallelism(1)
      // STEP-6: write to a single file
      .writeAsText(outputFilePath, WriteMode.OVERWRITE).setParallelism(1)

    env.execute("top 10")
  }
}
-------------------------------------------------------------------------------- /src/main/scala/io/miaoji/flink/algorithms/scala/chap04/LeftOuterJoin.scala: --------------------------------------------------------------------------------
package io.miaoji.flink.algorithms.scala.chap04

import org.apache.flink.api.scala._
import org.apache.flink.core.fs.FileSystem.WriteMode

/**
  * Left outer join of transactions with users: for every product, list the
  * distinct locations it was sold in and how many there are.
  *
  * Users are `userId,location`; transactions are
  * `transactionId,productId,userId,quantity,amount`.
  * Output rows are `productId,[loc1,loc2,...],count`.
  */
object LeftOuterJoin {

  val USER_INPUT_FILE_NAME = "chap04/user_input.txt"
  val TRANSACTION_INPUT_FILE_NAME = "chap04/transaction_input.txt"
  val OUTPUT_FILE_NAME = "chap04/left_out_join_output.txt"
  val dir = LeftOuterJoin.getClass.getClassLoader.getResource("")

  /** Location reported for transactions whose user has no entry in the user file. */
  private val UNKNOWN_LOCATION = "undefined"

  def main(args: Array[String]): Unit = {
    // dir is null when the classpath has no directory root, e.g. when running from a jar.
    require(dir != null, "Cannot locate the classpath root directory")
    val env = ExecutionEnvironment.getExecutionEnvironment
    // STEP-1: read the users
    val userData = env.readCsvFile[(String, String)](
      new java.net.URL(dir, USER_INPUT_FILE_NAME).toString)
    // STEP-2: read the transactions
    val transactionData = env.readCsvFile[(String, String, String, Long, Long)](
      new java.net.URL(dir, TRANSACTION_INPUT_FILE_NAME).toString)
    // STEP-3: left outer join on transaction.userId == user.userId
    val result = transactionData.leftOuterJoin(userData)
      .where(2).equalTo(0)
      .apply((t: (String, String, String, Long, Long), u: (String, String)) =>
        // In a left outer join the right side is null when no user matches.
        (t._2, if (u == null) UNKNOWN_LOCATION else u._2)
      )
      // STEP-4: merge
      // (product, location) pairs, de-duplicated
      .distinct(0, 1)
      .groupBy(0).reduce { (t1: (String, String), t2: (String, String)) =>
        (t1._1, t1._2 + "," + t2._2)
      }
      // (product, "loc1,loc2") -> (product, "[loc1,loc2]", 2)
      .map((t: (String, String)) =>
        (t._1, "[" + t._2 + "]", t._2.split(",").length)
      )
    // STEP-5: write the result to a single file
    result.writeAsCsv(new java.net.URL(dir, OUTPUT_FILE_NAME).toString, writeMode = WriteMode.OVERWRITE).setParallelism(1)
    env.execute("Left Outer Join")
  }

}
-------------------------------------------------------------------------------- /src/main/scala/io/miaoji/flink/algorithms/scala/chap05/OrderInversion.scala: -------------------------------------------------------------------------------- 1 | package io.miaoji.flink.algorithms.scala.chap05 2 | 3 | 4 | import io.miaoji.flink.algorithms.scala.chap04.LeftOuterJoin 5 | import org.apache.flink.api.common.operators.Order 6 | import org.apache.flink.api.scala._ 7 | import org.apache.flink.core.fs.FileSystem.WriteMode 8 | import org.apache.flink.util.Collector 9 | 10 | import scala.collection.mutable.ArrayBuffer 11 | 12 | object OrderInversion { 13 | 14 | val INPUT_FILE_NAME = "chap05/input.txt" 15 | val OUTPUT_FILE_NAME = "chap05/output.txt" 16 | val dir = LeftOuterJoin.getClass.getClassLoader.getResource("") 17 | 18 | def main(args: Array[String]): Unit = { 19 | val env = ExecutionEnvironment.getExecutionEnvironment 20 | // STEP-1: 读取数据 21 | val inputData = env.readTextFile(dir+"/"+INPUT_FILE_NAME) 22 | // STEP-2: 把句子拆成词与相邻词的元祖 23 | // java is a great language 24 | val result = inputData.flatMap { (str: String, collector: Collector[(String, String, Int)]) => 25 | val strArray = str.split(" ") 26 | for(i <- 0 until strArray.length){ 27 | val token = strArray(i) 28 | val start = if(i - 2 < 0) 0 else i-2 29 | val end = if(i + 2 >= strArray.length) strArray.length-1 else i+2 30 | collector.collect((token, "*", end-start)) 31 | // println(s"$i $token $start $end ${end-start}") 32 | // 这里用to来包括end这个下表 33 | for(j <- start to end){ 34 | if(j != i) { 35 | collector.collect((strArray(i), strArray(j), 1)) 36 | } 37 | } 38 | } 39 | } 40 | // java * 4 41 | // java is 1 42 | // java is 1 43 | // java a 1 44 | // java great
1 45 | // java language 1 46 | // STEP-3: 相同词与相邻词进行相加 47 | .groupBy(0,1).reduce((t1: (String, String, Int), t2: (String, String, Int)) => 48 | (t1._1,t1._2,t1._3 + t2._3) 49 | ) 50 | // 必须进行sortGroup,才能保证(java,*)是在第一个 51 | // STEP-4: 按书中的形式组合 52 | .groupBy(0).sortGroup(2, Order.DESCENDING).reduceGroup { (tuples: Iterator[(String, String, Int)], collector: Collector[(Array[(String, String)], Array[Float])]) => 53 | var key = ArrayBuffer[(String, String)]() 54 | var value = ArrayBuffer[Float]() 55 | for(tuple <- tuples){ 56 | key.append((tuple._1,tuple._2)) 57 | value.append(tuple._3) 58 | } 59 | collector.collect((key.toArray, value.toArray)) 60 | } 61 | // (java,*),(java,is),(java,a),(java,great),(java,language) 4,1,1,1,1 62 | .map { (tuple: (Array[(String, String)], Array[Float])) => 63 | val key = tuple._1 64 | val value = tuple._2 65 | val target = ArrayBuffer[Float]() 66 | var index = 0 67 | value.foreach(x => target.append(x/value(index).toFloat)) 68 | (tuple._1, target.toArray) 69 | } 70 | // (java,*),(java,is),(java,a),(java,great),(java,language) 1,0.25,0.25,0.25,0.25 71 | .flatMap ((tuple: (Array[(String, String)], Array[Float]), collector: Collector[(String, String, Double)]) => 72 | for (i <- 0 until tuple._1.length) { 73 | collector.collect(tuple._1(i)._1, tuple._1(i)._2, tuple._2(i)) 74 | } 75 | ) 76 | .filter(_._2 != "*") 77 | result.writeAsCsv(dir+"/"+OUTPUT_FILE_NAME,writeMode = WriteMode.OVERWRITE).setParallelism(1) 78 | env.execute("Order Inversion") 79 | } 80 | 81 | } 82 | -------------------------------------------------------------------------------- /src/main/scala/io/miaoji/flink/algorithms/scala/chap05/README.md: -------------------------------------------------------------------------------- 1 | # 原文 2 | [Order Inversion](https://github.com/mahmoudparsian/data-algorithms-book/tree/master/src/main/java/org/dataalgorithms/chap05) 3 | 4 | # 输入示例 5 | [input.txt](./../../../../resources/chap05/input.txt) 
--------------------------------------------------------------------------------