├── .gitignore
├── LICENSE
├── README.md
├── pom.xml
└── src
    └── main
        ├── java
        │   └── io
        │       └── miaoji
        │           └── flink
        │               └── algorithms
        │                   └── chap02
        │                       ├── README.md
        │                       └── SecondarySortingJob.java
        ├── resources
        │   ├── chap02
        │   │   ├── mock.html
        │   │   ├── secondary_sorting_input.csv
        │   │   └── top10_input.txt
        │   ├── chap03
        │   │   ├── mock.html
        │   │   └── top10_input.txt
        │   ├── chap04
        │   │   ├── mock_for_transaction.html
        │   │   ├── mock_for_user.html
        │   │   ├── transaction_input.txt
        │   │   └── user_input.txt
        │   └── chap05
        │       ├── input.txt
        │       └── mock.html
        └── scala
            └── io
                └── miaoji
                    └── flink
                        └── algorithms
                            └── scala
                                ├── chap02
                                │   └── SecondarySortingJob.scala
                                ├── chap03
                                │   ├── README.md
                                │   └── Top10.scala
                                ├── chap04
                                │   └── LeftOuterJoin.scala
                                └── chap05
                                    ├── OrderInversion.scala
                                    └── README.md
/.gitignore:
--------------------------------------------------------------------------------
1 | # Compiled class file
2 | *.class
3 |
4 | # Log file
5 | *.log
6 |
7 | # BlueJ files
8 | *.ctxt
9 |
10 | # Mobile Tools for Java (J2ME)
11 | .mtj.tmp/
12 |
13 | # Package Files #
14 | *.jar
15 | *.war
16 | *.nar
17 | *.ear
18 | *.zip
19 | *.tar.gz
20 | *.rar
21 |
22 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
23 | hs_err_pid*
24 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # flink-data-algorithms-book
2 | 本代码库基于[data-algorithms-book](https://github.com/mahmoudparsian/data-algorithms-book)的章节,通过flink框架实现其中的算法。
3 |
4 | ## 更新进度
5 |
6 | | | Java | Scala |
7 | | -------------------------- | ------------------------------------------------------------ | ------------------------------------------------------------ |
8 | | chap02 SecondarySortingJob | [✅](src/main/java/io/miaoji/flink/algorithms/chap02/SecondarySortingJob.java) | [✅](src/main/scala/io/miaoji/flink/algorithms/scala/chap02/SecondarySortingJob.scala) |
9 | | chap03 Top 10 | | [✅](src/main/scala/io/miaoji/flink/algorithms/scala/chap03/Top10.scala) |
10 | | chap04 Left Outer Join | | [✅](src/main/scala/io/miaoji/flink/algorithms/scala/chap04/LeftOuterJoin.scala) |
11 | | chap05 Order Inversion | | [✅](src/main/scala/io/miaoji/flink/algorithms/scala/chap05/OrderInversion.scala) |
12 | | chap06 - chap31 | ... | ... |
13 |
14 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
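1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
3 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
4 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
5 |     <modelVersion>4.0.0</modelVersion>
6 |
7 |     <groupId>io.miaoji</groupId>
8 |     <artifactId>flink-data-algorithms-book</artifactId>
9 |     <version>1.0-SNAPSHOT</version>
10 |     <properties>
11 |         <scala.binary.version>2.11</scala.binary.version>
12 |         <flink.version>1.6.2</flink.version>
13 |     </properties>
14 |
15 |     <dependencies>
16 |         <dependency>
17 |             <groupId>org.apache.flink</groupId>
18 |             <artifactId>flink-clients_${scala.binary.version}</artifactId>
19 |             <version>${flink.version}</version>
20 |         </dependency>
21 |
22 |         <dependency>
23 |             <groupId>org.apache.flink</groupId>
24 |             <artifactId>flink-scala_${scala.binary.version}</artifactId>
25 |             <version>${flink.version}</version>
26 |         </dependency>
27 |         <dependency>
28 |             <groupId>org.apache.flink</groupId>
29 |             <artifactId>flink-streaming-scala_${scala.binary.version}</artifactId>
30 |             <version>${flink.version}</version>
31 |         </dependency>
32 |     </dependencies>
33 |
34 |
35 | </project>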
--------------------------------------------------------------------------------
/src/main/java/io/miaoji/flink/algorithms/chap02/README.md:
--------------------------------------------------------------------------------
1 | # 原文
2 | [secondary_sorting](https://github.com/mahmoudparsian/data-algorithms-book/tree/master/src/main/java/org/dataalgorithms/chap02)
3 |
4 | # 输入示例
5 | [secondary_sorting_input.csv](../../../../../../resources/chap02/secondary_sorting_input.csv)
--------------------------------------------------------------------------------
/src/main/java/io/miaoji/flink/algorithms/chap02/SecondarySortingJob.java:
--------------------------------------------------------------------------------
1 | package io.miaoji.flink.algorithms.chap02;
2 |
3 | import org.apache.flink.api.common.functions.MapFunction;
4 | import org.apache.flink.api.common.functions.ReduceFunction;
5 | import org.apache.flink.api.common.operators.Order;
6 | import org.apache.flink.api.java.DataSet;
7 | import org.apache.flink.api.java.ExecutionEnvironment;
8 | import org.apache.flink.api.java.operators.DataSource;
9 | import org.apache.flink.api.java.tuple.Tuple2;
10 | import org.apache.flink.api.java.tuple.Tuple3;
11 | import org.apache.flink.core.fs.FileSystem;
12 |
13 | public class SecondarySortingJob {
14 |
15 | static String INPUT_FILENAME = "chap02/secondary_sorting_input.csv";
16 | static String OUTPUT_FILENAME = "chap02/secondary_sorting_output.csv";
17 |
18 | public static void main(String[] args) throws Exception {
19 | // 本地环境
20 | ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment();
21 |
22 | // 准备参数(输出路径为 target/classes 下)
23 | String dir = SecondarySortingJob.class.getClassLoader().getResource("").getPath();
24 | String inputFilePath = dir+"/"+INPUT_FILENAME;
25 | String outputFilePath = dir+"/"+OUTPUT_FILENAME;
26 |
27 | // 准备数据
28 | DataSource dataSource = env.readTextFile(inputFilePath);
29 |
30 | // 开始计算
31 | DataSet result = dataSource
32 | // 2000,12,04,10
33 | .map(new MapFunction>() {
34 | public Tuple3 map(String s) throws Exception {
35 | String[] splited = s.split(",");
36 | String yearMonth = splited[0] + "-" + splited[1];
37 | Integer temperature = Integer.parseInt(splited[3]);
38 | String day = splited[2];
39 | return new Tuple3(yearMonth, temperature, day);
40 | }
41 | })
42 | .partitionByRange(1)
43 | .sortPartition(1, Order.ASCENDING)
44 | .map(new MapFunction, Tuple2>() {
45 | public Tuple2 map(Tuple3 tuple3) throws Exception {
46 | return new Tuple2(tuple3.f0, tuple3.f1.toString());
47 | }
48 | })
49 | .groupBy(0)
50 | .reduce(new ReduceFunction>() {
51 | public Tuple2 reduce(Tuple2 t1, Tuple2 t2) throws Exception {
53 | return new Tuple2(t1.f0, t1.f1 + "," + t2.f1);
54 | }
55 | })
56 | ;
57 |
58 | // 单线程输出(确保是在一个文件中)
59 | result.writeAsText(outputFilePath, FileSystem.WriteMode.OVERWRITE).setParallelism(1);
60 |
61 | env.execute("SecondarySorting");
62 |
63 | }
64 |
65 | }
66 |
--------------------------------------------------------------------------------
/src/main/resources/chap02/mock.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
--------------------------------------------------------------------------------
/src/main/resources/chap02/secondary_sorting_input.csv:
--------------------------------------------------------------------------------
1 | 2000,12,04,10
2 | 2000,11,01,20
3 | 2000,12,02,-20
4 | 2000,11,07,30
--------------------------------------------------------------------------------
/src/main/resources/chap02/top10_input.txt:
--------------------------------------------------------------------------------
1 | GOOG,1980-02-18,461.78
2 | ILMN,1998-07-01,177.57
3 | IBM,1972-11-06,529.76
4 | GOOG,2007-05-14,268.83
5 | ILMN,2017-05-06,730.35
6 | GOOG,1978-08-04,206.63
7 | GOOG,1989-08-26,323.74
8 | ILMN,1983-02-12,728.37
9 | ILMN,1983-05-16,760.43
10 | GOOG,2003-04-27,552.22
11 | IBM,2017-10-07,254.39
12 | GOOG,2009-03-24,319.83
13 | GOOG,1985-12-02,680.77
14 | ILMN,1999-11-26,410.83
15 | GOOG,2017-02-26,603.11
16 | IBM,1988-06-02,302.76
17 | ILMN,1995-08-13,910.66
18 | GOOG,2001-11-15,805.84
19 | IBM,1990-04-03,968.45
20 | GOOG,2006-03-13,283.94
21 | GOOG,1970-01-06,673.12
22 | IBM,1989-02-26,534.24
23 | ILMN,2003-01-13,601.84
24 | GOOG,1975-08-03,262.16
25 | GOOG,2013-08-26,930.35
26 | IBM,1997-07-25,287.23
27 | IBM,1975-09-19,204.38
28 | GOOG,2011-09-01,993.13
29 | GOOG,1970-08-02,806.29
30 | ILMN,1985-03-14,973.32
--------------------------------------------------------------------------------
/src/main/resources/chap03/mock.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
--------------------------------------------------------------------------------
/src/main/resources/chap03/top10_input.txt:
--------------------------------------------------------------------------------
1 | cat1,64
2 | cat2,116
3 | cat3,160
4 | cat4,123
5 | cat5,154
6 | cat6,179
7 | cat7,93
8 | cat8,160
9 | cat9,106
10 | cat10,36
11 | cat11,167
12 | cat12,130
13 | cat13,105
14 | cat14,46
15 | cat15,40
16 | cat16,178
17 | cat17,168
18 | cat18,160
19 | cat19,171
20 | cat20,163
21 | cat21,143
22 | cat22,74
23 | cat23,68
24 | cat24,110
25 | cat25,133
26 | cat26,137
27 | cat27,53
28 | cat28,87
29 | cat29,183
30 | cat30,198
31 | cat31,153
32 | cat32,82
33 | cat33,148
34 | cat34,142
35 | cat35,199
36 | cat36,14
37 | cat37,85
38 | cat38,182
39 | cat39,114
40 | cat40,100
41 | cat41,26
42 | cat42,108
43 | cat43,141
44 | cat44,110
45 | cat45,42
46 | cat46,170
47 | cat47,121
48 | cat48,101
49 | cat49,103
50 | cat50,157
51 | cat51,171
52 | cat52,192
53 | cat53,135
54 | cat54,76
55 | cat55,62
56 | cat56,169
57 | cat57,143
58 | cat58,137
59 | cat59,57
60 | cat60,69
61 | cat61,94
62 | cat62,81
63 | cat63,109
64 | cat64,67
65 | cat65,100
66 | cat66,2
67 | cat67,172
68 | cat68,135
69 | cat69,127
70 | cat70,99
71 | cat71,26
72 | cat72,32
73 | cat73,130
74 | cat74,120
75 | cat75,17
76 | cat76,163
77 | cat77,2
78 | cat78,187
79 | cat79,60
80 | cat80,69
81 | cat81,43
82 | cat82,46
83 | cat83,188
84 | cat84,164
85 | cat85,91
86 | cat86,183
87 | cat87,71
88 | cat88,173
89 | cat89,25
90 | cat90,140
91 | cat91,45
92 | cat92,174
93 | cat93,117
94 | cat94,70
95 | cat95,199
96 | cat96,72
97 | cat97,62
98 | cat98,126
99 | cat99,130
100 | cat100,27
--------------------------------------------------------------------------------
/src/main/resources/chap04/mock_for_transaction.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
--------------------------------------------------------------------------------
/src/main/resources/chap04/mock_for_user.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
--------------------------------------------------------------------------------
/src/main/resources/chap04/transaction_input.txt:
--------------------------------------------------------------------------------
1 | t1,p31,u5,2,205
2 | t2,p36,u41,2,257
3 | t3,p22,u89,1,29
4 | t4,p4,u21,1,131
5 | t5,p24,u93,2,68
6 | t6,p8,u76,2,63
7 | t7,p47,u48,2,148
8 | t8,p10,u47,2,18
9 | t9,p43,u8,1,222
10 | t10,p38,u33,1,141
11 | t11,p19,u5,2,86
12 | t12,p34,u85,1,49
13 | t13,p16,u5,1,217
14 | t14,p47,u63,2,97
15 | t15,p32,u82,2,113
16 | t16,p16,u38,2,91
17 | t17,p42,u63,2,265
18 | t18,p15,u42,1,60
19 | t19,p30,u73,1,179
20 | t20,p25,u83,2,247
21 | t21,p43,u41,2,17
22 | t22,p40,u9,1,100
23 | t23,p24,u45,2,193
24 | t24,p31,u78,1,199
25 | t25,p5,u2,2,255
26 | t26,p5,u30,1,158
27 | t27,p12,u74,2,105
28 | t28,p28,u69,1,157
29 | t29,p30,u24,1,35
30 | t30,p22,u74,2,214
31 | t31,p48,u62,1,266
32 | t32,p24,u62,1,219
33 | t33,p24,u72,2,286
34 | t34,p5,u81,2,179
35 | t35,p13,u3,2,249
36 | t36,p38,u92,1,82
37 | t37,p21,u36,1,80
38 | t38,p15,u23,1,86
39 | t39,p43,u99,1,287
40 | t40,p49,u67,1,98
41 | t41,p6,u88,1,292
42 | t42,p46,u48,1,63
43 | t43,p33,u39,2,33
44 | t44,p17,u73,1,11
45 | t45,p2,u59,2,72
46 | t46,p42,u95,1,184
47 | t47,p36,u63,2,276
48 | t48,p12,u91,1,188
49 | t49,p49,u17,1,206
50 | t50,p6,u57,1,71
51 | t51,p37,u92,1,225
52 | t52,p10,u82,2,97
53 | t53,p40,u46,1,181
54 | t54,p23,u81,2,180
55 | t55,p39,u12,1,50
56 | t56,p46,u73,1,222
57 | t57,p43,u13,2,99
58 | t58,p13,u50,1,41
59 | t59,p40,u70,2,12
60 | t60,p47,u26,2,103
61 | t61,p20,u29,1,196
62 | t62,p10,u67,1,121
63 | t63,p1,u25,1,92
64 | t64,p42,u35,2,118
65 | t65,p28,u53,1,274
66 | t66,p30,u81,1,93
67 | t67,p17,u26,2,218
68 | t68,p1,u79,2,39
69 | t69,p33,u22,2,19
70 | t70,p47,u1,1,31
71 | t71,p25,u42,2,186
72 | t72,p32,u68,1,115
73 | t73,p29,u34,1,197
74 | t74,p9,u43,1,32
75 | t75,p11,u15,1,171
76 | t76,p36,u78,1,198
77 | t77,p18,u11,2,122
78 | t78,p23,u6,2,245
79 | t79,p19,u32,1,228
80 | t80,p41,u12,1,206
81 | t81,p38,u9,2,175
82 | t82,p30,u77,1,289
83 | t83,p41,u57,1,106
84 | t84,p24,u47,1,212
85 | t85,p12,u67,1,199
86 | t86,p20,u20,2,228
87 | t87,p23,u16,1,240
88 | t88,p11,u34,2,84
89 | t89,p38,u21,1,264
90 | t90,p35,u85,1,76
91 | t91,p30,u100,1,215
92 | t92,p7,u35,2,296
93 | t93,p18,u27,1,278
94 | t94,p15,u60,2,40
95 | t95,p6,u68,1,257
96 | t96,p49,u19,2,290
97 | t97,p27,u58,1,34
98 | t98,p37,u49,2,162
99 | t99,p9,u51,2,83
100 | t100,p47,u67,1,150
101 | t101,p15,u41,1,150
102 | t102,p48,u9,2,61
103 | t103,p42,u49,1,285
104 | t104,p4,u8,1,83
105 | t105,p10,u91,2,162
106 | t106,p8,u27,1,67
107 | t107,p44,u88,1,41
108 | t108,p49,u65,1,43
109 | t109,p30,u49,1,120
110 | t110,p36,u55,1,264
111 | t111,p31,u100,2,135
112 | t112,p5,u58,1,22
113 | t113,p3,u35,1,47
114 | t114,p30,u86,1,76
115 | t115,p6,u47,2,44
116 | t116,p41,u27,2,290
117 | t117,p15,u93,1,155
118 | t118,p34,u88,1,158
119 | t119,p10,u57,2,209
120 | t120,p45,u62,1,258
121 | t121,p43,u72,2,215
122 | t122,p38,u68,1,185
123 | t123,p19,u37,2,13
124 | t124,p40,u69,1,234
125 | t125,p42,u79,1,131
126 | t126,p21,u58,1,71
127 | t127,p19,u35,2,20
128 | t128,p35,u5,2,31
129 | t129,p48,u5,2,281
130 | t130,p17,u75,2,45
131 | t131,p41,u89,2,26
132 | t132,p48,u10,1,156
133 | t133,p38,u32,2,154
134 | t134,p25,u62,2,179
135 | t135,p41,u54,2,55
136 | t136,p29,u81,2,122
137 | t137,p13,u78,2,25
138 | t138,p43,u35,1,223
139 | t139,p44,u74,1,186
140 | t140,p27,u37,2,146
141 | t141,p43,u56,2,53
142 | t142,p14,u28,1,51
143 | t143,p38,u2,1,11
144 | t144,p36,u76,2,216
145 | t145,p12,u12,1,155
146 | t146,p2,u90,2,209
147 | t147,p19,u96,2,209
148 | t148,p48,u24,2,175
149 | t149,p14,u89,1,93
150 | t150,p37,u22,2,203
151 | t151,p6,u80,1,219
152 | t152,p11,u72,1,72
153 | t153,p10,u62,2,42
154 | t154,p23,u39,1,237
155 | t155,p11,u49,1,18
156 | t156,p6,u60,2,16
157 | t157,p34,u45,1,288
158 | t158,p29,u53,1,282
159 | t159,p7,u61,1,49
160 | t160,p26,u19,2,133
161 | t161,p47,u57,2,229
162 | t162,p47,u23,1,227
163 | t163,p12,u20,2,298
164 | t164,p35,u45,2,275
165 | t165,p1,u22,2,32
166 | t166,p30,u29,1,183
167 | t167,p26,u2,1,48
168 | t168,p3,u79,1,235
169 | t169,p14,u56,1,210
170 | t170,p31,u55,1,116
171 | t171,p32,u54,2,26
172 | t172,p37,u37,2,226
173 | t173,p31,u51,2,19
174 | t174,p15,u31,2,249
175 | t175,p32,u27,2,96
176 | t176,p1,u95,1,205
177 | t177,p21,u42,2,284
178 | t178,p16,u45,1,168
179 | t179,p29,u77,2,84
180 | t180,p23,u75,2,84
181 | t181,p30,u25,1,47
182 | t182,p32,u33,2,25
183 | t183,p3,u9,2,102
184 | t184,p10,u87,1,213
185 | t185,p49,u2,1,155
186 | t186,p20,u6,1,56
187 | t187,p10,u61,1,94
188 | t188,p11,u31,2,106
189 | t189,p47,u91,2,49
190 | t190,p4,u42,1,141
191 | t191,p27,u45,2,269
192 | t192,p49,u51,2,24
193 | t193,p4,u67,2,118
194 | t194,p46,u72,2,246
195 | t195,p35,u33,2,271
196 | t196,p5,u77,1,113
197 | t197,p32,u50,2,214
198 | t198,p41,u57,1,72
199 | t199,p14,u69,2,132
200 | t200,p30,u88,1,289
--------------------------------------------------------------------------------
/src/main/resources/chap04/user_input.txt:
--------------------------------------------------------------------------------
1 | u1,DE
2 | u2,NE
3 | u3,MI
4 | u4,MA
5 | u5,CA
6 | u6,MA
7 | u7,MA
8 | u8,DE
9 | u9,HI
10 | u10,MI
11 | u11,NE
12 | u12,WA
13 | u13,MP
14 | u14,NE
15 | u15,MI
16 | u16,CA
17 | u17,CA
18 | u18,MA
19 | u19,CA
20 | u20,WI
21 | u21,HI
22 | u22,HI
23 | u23,HI
24 | u24,DE
25 | u25,MA
26 | u26,NE
27 | u27,DE
28 | u28,MP
29 | u29,MA
30 | u30,HI
31 | u31,CA
32 | u32,HI
33 | u33,CA
34 | u34,MS
35 | u35,WI
36 | u36,KS
37 | u37,MA
38 | u38,MA
39 | u39,DE
40 | u40,DE
41 | u41,WI
42 | u42,WI
43 | u43,WI
44 | u44,MA
45 | u45,MS
46 | u46,NE
47 | u47,HI
48 | u48,KS
49 | u49,MA
50 | u50,MP
51 | u51,MI
52 | u52,NE
53 | u53,HI
54 | u54,WA
55 | u55,CA
56 | u56,MS
57 | u57,CO
58 | u58,MA
59 | u59,KS
60 | u60,HI
61 | u61,MP
62 | u62,MI
63 | u63,MS
64 | u64,WI
65 | u65,WA
66 | u66,DE
67 | u67,DE
68 | u68,WI
69 | u69,HI
70 | u70,MI
71 | u71,HI
72 | u72,WI
73 | u73,CO
74 | u74,NE
75 | u75,DE
76 | u76,MI
77 | u77,NE
78 | u78,WA
79 | u79,CO
80 | u80,HI
81 | u81,NE
82 | u82,MI
83 | u83,MS
84 | u84,WI
85 | u85,WI
86 | u86,WI
87 | u87,MA
88 | u88,HI
89 | u89,CO
90 | u90,CA
91 | u91,MI
92 | u92,HI
93 | u93,WI
94 | u94,WI
95 | u95,MA
96 | u96,NE
97 | u97,CO
98 | u98,KS
99 | u99,MS
100 | u100,WA
--------------------------------------------------------------------------------
/src/main/resources/chap05/input.txt:
--------------------------------------------------------------------------------
1 | java is a great language
2 | java is a programming language
3 | java is green fun language
4 | java is great
5 | programming with java is fun
--------------------------------------------------------------------------------
/src/main/resources/chap05/mock.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
--------------------------------------------------------------------------------
/src/main/scala/io/miaoji/flink/algorithms/scala/chap02/SecondarySortingJob.scala:
--------------------------------------------------------------------------------
1 | package io.miaoji.flink.algorithms.scala.chap02
2 |
3 | import org.apache.flink.api.common.operators.Order
4 | import org.apache.flink.api.scala._
5 | import org.apache.flink.core.fs.FileSystem.WriteMode
6 | import org.apache.flink.util.Collector
7 |
8 | /**
9 |  * Secondary sort (chap02): for every stock symbol, lists its (date,price) records ordered by date.
10 |  *
11 |  * Ordering is done by Flink itself through groupBy + sortGroup, so it holds for any parallelism.
12 |  */
13 | object SecondarySortingJob {
14 |
15 |   // The three-column (name,date,price) sample lives under chap02; chap03 holds the two-column Top10 data.
16 |   val INPUT_FILE_NAME = "chap02/top10_input.txt"
17 |   val OUTPUT_FILE_NAME = "chap02/top10_output.txt"
18 |
19 |   def main(args: Array[String]): Unit = {
20 |     val env = ExecutionEnvironment.getExecutionEnvironment
21 |
22 |     // Resolve both files against the classpath root (target/classes in a Maven build).
23 |     val dir = this.getClass.getClassLoader.getResource("")
24 |     val inputFilePath = dir + "/" + INPUT_FILE_NAME
25 |     val outputFilePath = dir + "/" + OUTPUT_FILE_NAME
26 |
27 |     val result = env.readTextFile(inputFilePath)
28 |       // STEP-1: split every line on commas
29 |       .map[Array[String]] { x: String => x.split(",") }
30 |       // STEP-2: keep only well-formed rows; the previous test (!= 3) was inverted and kept
31 |       //         exactly the malformed ones
32 |       .filter(_.length == 3)
33 |       // STEP-3: (name, date, price)
34 |       .map { x: Array[String] => (x(0), x(1), x(2)) }
35 |       // STEP-4: group by name and sort every group by date; ISO yyyy-MM-dd dates order
36 |       //         correctly as plain strings
37 |       .groupBy(0)
38 |       .sortGroup(1, Order.ASCENDING)
39 |       // STEP-5: fold each sorted group into (name, "(date,price)(date,price)...")
40 |       .reduceGroup { rows: Iterator[(String, String, String)] =>
41 |         var name: String = null
42 |         val pairs = new StringBuilder
43 |         rows.foreach { row =>
44 |           name = row._1
45 |           pairs.append(s"(${row._2},${row._3})")
46 |         }
47 |         (name, pairs.toString)
48 |       }
49 |
50 |     // Write the computed result (not the raw input) through a single sink so it ends up in one file.
51 |     result.writeAsText(outputFilePath, WriteMode.OVERWRITE).setParallelism(1)
52 |
53 |     env.execute("secondary sorting")
54 |   }
55 | }
56 |
--------------------------------------------------------------------------------
/src/main/scala/io/miaoji/flink/algorithms/scala/chap03/README.md:
--------------------------------------------------------------------------------
1 | # 原文
2 | [top10](https://github.com/mahmoudparsian/data-algorithms-book/tree/master/src/main/java/org/dataalgorithms/chap03)
3 |
4 | # 输入示例
5 | [top10_input.txt](../../../../../../../resources/chap03/top10_input.txt)
--------------------------------------------------------------------------------
/src/main/scala/io/miaoji/flink/algorithms/scala/chap03/Top10.scala:
--------------------------------------------------------------------------------
1 | package io.miaoji.flink.algorithms.scala.chap03
2 |
3 | import org.apache.flink.api.common.operators.Order
4 | import org.apache.flink.api.scala._
5 | import org.apache.flink.core.fs.FileSystem.WriteMode
6 | ;
7 |
8 | /**
9 |  * Top 10 (chap03): reads "catId,weight" rows and writes the ten heaviest cats, heaviest first.
10 |  */
11 | object Top10 {
12 |
13 |   val INPUT_FILE_NAME = "chap03/top10_input.txt"
14 |   val OUTPUT_FILE_NAME = "chap03/top10_output.txt"
15 |
16 |   def main(args: Array[String]): Unit = {
17 |     val env = ExecutionEnvironment.getExecutionEnvironment
18 |
19 |     // Resolve both files against the classpath root (target/classes in a Maven build).
20 |     val dir = this.getClass.getClassLoader.getResource("")
21 |     val inputFilePath = dir + "/" + INPUT_FILE_NAME
22 |     val outputFilePath = dir + "/" + OUTPUT_FILE_NAME
23 |
24 |     env.readTextFile(inputFilePath)
25 |       // STEP-1: split every line on commas
26 |       .map[Array[String]] { x: String => x.split(",") }
27 |       // STEP-2: drop rows that do not have exactly two columns
28 |       .filter(_.length == 2)
29 |       // STEP-3: (cat id, cat weight); the dummy constant column used to force one partition is gone
30 |       .map { x: Array[String] => (x(0), x(1).toInt) }
31 |       // STEP-4: sort by weight (field 1, zero-based) in a single task, which yields a total order
32 |       .sortPartition(1, Order.DESCENDING).setParallelism(1)
33 |       // STEP-5: keep only the first ten records of that ordering
34 |       .first(10)
35 |       // STEP-6: write the result through a single sink so it ends up in one file
36 |       .writeAsText(outputFilePath, WriteMode.OVERWRITE).setParallelism(1)
37 |
38 |     env.execute("top 10")
39 |   }
40 | }
41 |
--------------------------------------------------------------------------------
/src/main/scala/io/miaoji/flink/algorithms/scala/chap04/LeftOuterJoin.scala:
--------------------------------------------------------------------------------
1 | package io.miaoji.flink.algorithms.scala.chap04
2 |
3 | import org.apache.flink.api.scala._
4 | import org.apache.flink.core.fs.FileSystem.WriteMode
5 |
/**
  * Author  : wing
  * Created : 2018/12/25, 4:57 PM
  * Package : io.miaoji.flink.algorithms.scala.chap04
  *
  * Left outer join of transactions with users. For every distinct value of
  * the transaction's second column it emits that value, the bracketed list of
  * distinct user values joined to it, and how many such values there are.
  */
object LeftOuterJoin {

  val USER_INPUT_FILE_NAME = "chap04/user_input.txt"
  val TRANSACTION_INPUT_FILE_NAME = "chap04/transaction_input.txt"
  val OUTPUT_FILE_NAME = "chap04/left_out_join_output.txt"
  val dir = LeftOuterJoin.getClass.getClassLoader.getResource("")

  /** Value emitted in place of the user column when a transaction has no matching user. */
  val UNKNOWN_USER_VALUE = "undefined"

  /**
    * Builds and runs the batch job.
    *
    * @param args command-line arguments (unused)
    */
  def main(args: Array[String]): Unit = {
    val env = ExecutionEnvironment.getExecutionEnvironment
    // STEP-1: read the user data (two string columns)
    val userData = env.readCsvFile[(String, String)](dir + "/" + USER_INPUT_FILE_NAME)
    // STEP-2: read the transaction data (five columns)
    val transactionData = env.readCsvFile[(String, String, String, Long, Long)](dir + "/" + TRANSACTION_INPUT_FILE_NAME)
    // STEP-3: left outer join, transaction column 2 == user column 0
    val result = transactionData.leftOuterJoin(userData)
      .where(2).equalTo(0)
      .apply((t: (String, String, String, Long, Long), u: (String, String)) =>
        // In a left outer join the right side is null when nothing matches,
        // so guard before dereferencing it.
        (t._2, if (u == null) UNKNOWN_USER_VALUE else u._2)
      )
      // STEP-4: merge
      // e.g. (p1, TX)
      .distinct(0, 1)
      .groupBy(0).reduce { (t1: (String, String), t2: (String, String)) =>
        (t1._1, t1._2 + "," + t2._2)
      }
      // e.g. (p1, "TX,TX") -> (p1, "[TX,TX]", 2)
      .map((t: (String, String)) =>
        (t._1, "[" + t._2 + "]", t._2.split(",").length)
      )
    // STEP-5: write the result as a single CSV file and run the job
    result.writeAsCsv(dir + "/" + OUTPUT_FILE_NAME, writeMode = WriteMode.OVERWRITE).setParallelism(1)
    env.execute("Left Outer Join")
  }

}
48 |
--------------------------------------------------------------------------------
/src/main/scala/io/miaoji/flink/algorithms/scala/chap05/OrderInversion.scala:
--------------------------------------------------------------------------------
1 | package io.miaoji.flink.algorithms.scala.chap05
2 |
3 |
4 | import io.miaoji.flink.algorithms.scala.chap04.LeftOuterJoin
5 | import org.apache.flink.api.common.operators.Order
6 | import org.apache.flink.api.scala._
7 | import org.apache.flink.core.fs.FileSystem.WriteMode
8 | import org.apache.flink.util.Collector
9 |
10 | import scala.collection.mutable.ArrayBuffer
11 |
/**
  * Computes, for every word, the relative frequency of each word that occurs
  * within NEIGHBOR_WINDOW positions of it.
  *
  * For every occurrence of a word the job emits a marker record
  * (word, "*", numberOfNeighbours) plus one (word, neighbour, 1) record per
  * neighbour. After summing, the "*" record carries the total number of
  * neighbour occurrences of the word and is used as the denominator.
  */
object OrderInversion {

  val INPUT_FILE_NAME = "chap05/input.txt"
  val OUTPUT_FILE_NAME = "chap05/output.txt"
  // Resolve resources through this object's own class loader instead of
  // reaching into another chapter's object.
  val dir = getClass.getClassLoader.getResource("")

  /** How many positions to the left and to the right count as neighbours. */
  val NEIGHBOR_WINDOW = 2

  /**
    * Builds and runs the batch job.
    *
    * @param args command-line arguments (unused)
    */
  def main(args: Array[String]): Unit = {
    val env = ExecutionEnvironment.getExecutionEnvironment
    // STEP-1: read the input sentences
    val inputData = env.readTextFile(dir + "/" + INPUT_FILE_NAME)

    val result = inputData
      // STEP-2: split each sentence into (word, neighbour, 1) tuples plus a
      //         (word, "*", neighbourCount) marker per occurrence.
      //         "java is a great language", word "java" (index 0) yields
      //           (java, *, 2), (java, is, 1), (java, a, 1)
      .flatMap { (line: String, out: Collector[(String, String, Int)]) =>
        // Drop empty tokens so repeated spaces or blank lines do not show up
        // as a word of their own.
        val tokens = line.trim.split("\\s+").filter(_.nonEmpty)
        for (i <- tokens.indices) {
          val start = math.max(0, i - NEIGHBOR_WINDOW)
          val end = math.min(tokens.length - 1, i + NEIGHBOR_WINDOW)
          // The window [start, end] contains the word itself, hence
          // (end - start) neighbours.
          out.collect((tokens(i), "*", end - start))
          // `to` is inclusive, so index `end` is visited as well
          for (j <- start to end if j != i) {
            out.collect((tokens(i), tokens(j), 1))
          }
        }
      }
      // STEP-3: sum the counts of identical (word, neighbour) pairs
      .groupBy(0, 1).reduce((t1: (String, String, Int), t2: (String, String, Int)) =>
        (t1._1, t1._2, t1._3 + t2._3)
      )
      // STEP-4: per word, divide each neighbour count by the word's total.
      //         The group is sorted by count (largest first) so neighbours are
      //         emitted in that order; the total itself is looked up through
      //         the "*" marker rather than by position.
      //           (java, *, 2), (java, is, 1), (java, a, 1)
      //             -> (java, is, 0.5), (java, a, 0.5)
      .groupBy(0).sortGroup(2, Order.DESCENDING)
      .reduceGroup { (tuples: Iterator[(String, String, Int)], out: Collector[(String, String, Double)]) =>
        val entries = tuples.toArray
        val total = entries.collectFirst { case (_, "*", count) => count }.getOrElse(0)
        for ((word, neighbor, count) <- entries if neighbor != "*") {
          // Divide in Double so the result is not limited to Float precision.
          out.collect((word, neighbor, count.toDouble / total))
        }
      }
    // STEP-5: write the result as a single CSV file and run the job
    result.writeAsCsv(dir + "/" + OUTPUT_FILE_NAME, writeMode = WriteMode.OVERWRITE).setParallelism(1)
    env.execute("Order Inversion")
  }

}
82 |
--------------------------------------------------------------------------------
/src/main/scala/io/miaoji/flink/algorithms/scala/chap05/README.md:
--------------------------------------------------------------------------------
1 | # 原文
2 | [Order Inversion](https://github.com/mahmoudparsian/data-algorithms-book/tree/master/src/main/java/org/dataalgorithms/chap05)
3 |
4 | # 输入示例
5 | [input.txt](../../../../../../../resources/chap05/input.txt)
--------------------------------------------------------------------------------