├── LICENSE ├── Project1_商品秒杀 ├── Memcached实现 │ ├── BuyProductionMem.java │ └── RushToPurchaseMem.java ├── Redis实现 │ ├── RedisMS.java │ ├── RedisThread.java │ └── RedisUtil.java └── introduction.pdf ├── Project2_日志分析logAnalysis ├── MR_WLA.java ├── TimeUtil.java ├── baidu.log └── 网站日志分析.pdf ├── Project3_mysql迁移至Hive(JDBC) ├── hive │ └── HiveDemo.java ├── utils │ └── JDBCUtil.java └── 说明.pdf ├── Project4_定时将mysql中的增量打进HDFS ├── m2h │ └── M2H.java └── 说明.pdf ├── Project5_订单交易额实时统计、离线审计 ├── TSA模拟生成订单,1秒1条 │ ├── JedisUtil.java │ ├── SheetGeneratorServer.java │ ├── logback.xml │ ├── pom.xml │ ├── price900 │ └── producer.properties ├── utils │ ├── ENotationUtil.java │ ├── HiveUtil.java │ ├── JDBCUtil.java │ ├── StringUtil.java │ └── TimeUtil.java ├── 实时统计 │ ├── MapReduce版 │ │ ├── pom.xml │ │ └── tradestatistics │ │ │ └── StormKafkaProcess.java │ └── spark streaming版 │ │ ├── StreamingTrade.scala │ │ └── pom.xml ├── 架构图.png ├── 离线审计 │ └── audit │ │ └── TradeAudit.java ├── 问题描述.docx └── 问题说明.pdf ├── Project6_SparkBasic5个小问题 ├── data │ ├── 1.txt │ ├── data.txt │ ├── index.txt │ ├── product.txt │ └── sheet.txt ├── practice │ ├── DicConn.scala │ ├── GroupSum.scala │ ├── InvertedIndex.scala │ ├── TopWordCount.scala │ └── WordCount.scala ├── sparkutil │ └── Util.scala ├── 问题说明.docx └── 问题说明.pdf ├── Project7_HBase ├── 1磁盘小量数据导入HBase │ ├── Data2HBase.java │ └── hbasedata.txt ├── 2磁盘大量数据导入HBase │ └── Data2HBase1.java ├── 3Mysql迁移至HBase │ └── Mysql2HBase.java ├── 4Flter过滤器 │ └── FilterBasic.java ├── HBaseUtil.java ├── JdbcUtil.java ├── 问题说明.docx └── 问题说明.pdf ├── Project8_推荐系统入门 └── recommend │ ├── AlgorithmUtil.py │ ├── Recommender.py │ ├── TestRecommender.py │ └── __init__.py ├── Project9_分布式知乎爬虫 ├── pom.xml └── src │ └── main │ ├── java │ ├── crawler │ │ └── Spider.java │ ├── domain │ │ ├── Page.java │ │ └── User.java │ ├── download │ │ ├── DownLoad.java │ │ └── DownLoadImpl.java │ ├── process │ │ ├── Process.java │ │ └── ProcessImpl.java │ ├── store │ │ ├── Store.java │ │ └── 
StoreImpl.java │ └── utils │ │ ├── BloomFilter.java │ │ ├── JDBCUtil.java │ │ ├── JedisUtil.java │ │ ├── MD5Filter.java │ │ ├── PageUtil.java │ │ ├── TestFilter.java │ │ ├── ThreadUtil.java │ │ ├── UrlUtil.java │ │ └── UserUtil.java │ └── resources │ └── log4j.properties ├── README.md └── utils └── JedisUtil.java /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | Preamble 9 | 10 | The GNU General Public License is a free, copyleft license for 11 | software and other kinds of works. 12 | 13 | The licenses for most software and other practical works are designed 14 | to take away your freedom to share and change the works. By contrast, 15 | the GNU General Public License is intended to guarantee your freedom to 16 | share and change all versions of a program--to make sure it remains free 17 | software for all its users. We, the Free Software Foundation, use the 18 | GNU General Public License for most of our software; it applies also to 19 | any other work released this way by its authors. You can apply it to 20 | your programs, too. 21 | 22 | When we speak of free software, we are referring to freedom, not 23 | price. Our General Public Licenses are designed to make sure that you 24 | have the freedom to distribute copies of free software (and charge for 25 | them if you wish), that you receive source code or can get it if you 26 | want it, that you can change the software or use pieces of it in new 27 | free programs, and that you know you can do these things. 28 | 29 | To protect your rights, we need to prevent others from denying you 30 | these rights or asking you to surrender the rights. 
Therefore, you have 31 | certain responsibilities if you distribute copies of the software, or if 32 | you modify it: responsibilities to respect the freedom of others. 33 | 34 | For example, if you distribute copies of such a program, whether 35 | gratis or for a fee, you must pass on to the recipients the same 36 | freedoms that you received. You must make sure that they, too, receive 37 | or can get the source code. And you must show them these terms so they 38 | know their rights. 39 | 40 | Developers that use the GNU GPL protect your rights with two steps: 41 | (1) assert copyright on the software, and (2) offer you this License 42 | giving you legal permission to copy, distribute and/or modify it. 43 | 44 | For the developers' and authors' protection, the GPL clearly explains 45 | that there is no warranty for this free software. For both users' and 46 | authors' sake, the GPL requires that modified versions be marked as 47 | changed, so that their problems will not be attributed erroneously to 48 | authors of previous versions. 49 | 50 | Some devices are designed to deny users access to install or run 51 | modified versions of the software inside them, although the manufacturer 52 | can do so. This is fundamentally incompatible with the aim of 53 | protecting users' freedom to change the software. The systematic 54 | pattern of such abuse occurs in the area of products for individuals to 55 | use, which is precisely where it is most unacceptable. Therefore, we 56 | have designed this version of the GPL to prohibit the practice for those 57 | products. If such problems arise substantially in other domains, we 58 | stand ready to extend this provision to those domains in future versions 59 | of the GPL, as needed to protect the freedom of users. 60 | 61 | Finally, every program is threatened constantly by software patents. 
62 | States should not allow patents to restrict development and use of 63 | software on general-purpose computers, but in those that do, we wish to 64 | avoid the special danger that patents applied to a free program could 65 | make it effectively proprietary. To prevent this, the GPL assures that 66 | patents cannot be used to render the program non-free. 67 | 68 | The precise terms and conditions for copying, distribution and 69 | modification follow. 70 | 71 | TERMS AND CONDITIONS 72 | 73 | 0. Definitions. 74 | 75 | "This License" refers to version 3 of the GNU General Public License. 76 | 77 | "Copyright" also means copyright-like laws that apply to other kinds of 78 | works, such as semiconductor masks. 79 | 80 | "The Program" refers to any copyrightable work licensed under this 81 | License. Each licensee is addressed as "you". "Licensees" and 82 | "recipients" may be individuals or organizations. 83 | 84 | To "modify" a work means to copy from or adapt all or part of the work 85 | in a fashion requiring copyright permission, other than the making of an 86 | exact copy. The resulting work is called a "modified version" of the 87 | earlier work or a work "based on" the earlier work. 88 | 89 | A "covered work" means either the unmodified Program or a work based 90 | on the Program. 91 | 92 | To "propagate" a work means to do anything with it that, without 93 | permission, would make you directly or secondarily liable for 94 | infringement under applicable copyright law, except executing it on a 95 | computer or modifying a private copy. Propagation includes copying, 96 | distribution (with or without modification), making available to the 97 | public, and in some countries other activities as well. 98 | 99 | To "convey" a work means any kind of propagation that enables other 100 | parties to make or receive copies. Mere interaction with a user through 101 | a computer network, with no transfer of a copy, is not conveying. 
102 | 103 | An interactive user interface displays "Appropriate Legal Notices" 104 | to the extent that it includes a convenient and prominently visible 105 | feature that (1) displays an appropriate copyright notice, and (2) 106 | tells the user that there is no warranty for the work (except to the 107 | extent that warranties are provided), that licensees may convey the 108 | work under this License, and how to view a copy of this License. If 109 | the interface presents a list of user commands or options, such as a 110 | menu, a prominent item in the list meets this criterion. 111 | 112 | 1. Source Code. 113 | 114 | The "source code" for a work means the preferred form of the work 115 | for making modifications to it. "Object code" means any non-source 116 | form of a work. 117 | 118 | A "Standard Interface" means an interface that either is an official 119 | standard defined by a recognized standards body, or, in the case of 120 | interfaces specified for a particular programming language, one that 121 | is widely used among developers working in that language. 122 | 123 | The "System Libraries" of an executable work include anything, other 124 | than the work as a whole, that (a) is included in the normal form of 125 | packaging a Major Component, but which is not part of that Major 126 | Component, and (b) serves only to enable use of the work with that 127 | Major Component, or to implement a Standard Interface for which an 128 | implementation is available to the public in source code form. A 129 | "Major Component", in this context, means a major essential component 130 | (kernel, window system, and so on) of the specific operating system 131 | (if any) on which the executable work runs, or a compiler used to 132 | produce the work, or an object code interpreter used to run it. 
133 | 134 | The "Corresponding Source" for a work in object code form means all 135 | the source code needed to generate, install, and (for an executable 136 | work) run the object code and to modify the work, including scripts to 137 | control those activities. However, it does not include the work's 138 | System Libraries, or general-purpose tools or generally available free 139 | programs which are used unmodified in performing those activities but 140 | which are not part of the work. For example, Corresponding Source 141 | includes interface definition files associated with source files for 142 | the work, and the source code for shared libraries and dynamically 143 | linked subprograms that the work is specifically designed to require, 144 | such as by intimate data communication or control flow between those 145 | subprograms and other parts of the work. 146 | 147 | The Corresponding Source need not include anything that users 148 | can regenerate automatically from other parts of the Corresponding 149 | Source. 150 | 151 | The Corresponding Source for a work in source code form is that 152 | same work. 153 | 154 | 2. Basic Permissions. 155 | 156 | All rights granted under this License are granted for the term of 157 | copyright on the Program, and are irrevocable provided the stated 158 | conditions are met. This License explicitly affirms your unlimited 159 | permission to run the unmodified Program. The output from running a 160 | covered work is covered by this License only if the output, given its 161 | content, constitutes a covered work. This License acknowledges your 162 | rights of fair use or other equivalent, as provided by copyright law. 163 | 164 | You may make, run and propagate covered works that you do not 165 | convey, without conditions so long as your license otherwise remains 166 | in force. 
You may convey covered works to others for the sole purpose 167 | of having them make modifications exclusively for you, or provide you 168 | with facilities for running those works, provided that you comply with 169 | the terms of this License in conveying all material for which you do 170 | not control copyright. Those thus making or running the covered works 171 | for you must do so exclusively on your behalf, under your direction 172 | and control, on terms that prohibit them from making any copies of 173 | your copyrighted material outside their relationship with you. 174 | 175 | Conveying under any other circumstances is permitted solely under 176 | the conditions stated below. Sublicensing is not allowed; section 10 177 | makes it unnecessary. 178 | 179 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 180 | 181 | No covered work shall be deemed part of an effective technological 182 | measure under any applicable law fulfilling obligations under article 183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 184 | similar laws prohibiting or restricting circumvention of such 185 | measures. 186 | 187 | When you convey a covered work, you waive any legal power to forbid 188 | circumvention of technological measures to the extent such circumvention 189 | is effected by exercising rights under this License with respect to 190 | the covered work, and you disclaim any intention to limit operation or 191 | modification of the work as a means of enforcing, against the work's 192 | users, your or third parties' legal rights to forbid circumvention of 193 | technological measures. 194 | 195 | 4. Conveying Verbatim Copies. 
196 | 197 | You may convey verbatim copies of the Program's source code as you 198 | receive it, in any medium, provided that you conspicuously and 199 | appropriately publish on each copy an appropriate copyright notice; 200 | keep intact all notices stating that this License and any 201 | non-permissive terms added in accord with section 7 apply to the code; 202 | keep intact all notices of the absence of any warranty; and give all 203 | recipients a copy of this License along with the Program. 204 | 205 | You may charge any price or no price for each copy that you convey, 206 | and you may offer support or warranty protection for a fee. 207 | 208 | 5. Conveying Modified Source Versions. 209 | 210 | You may convey a work based on the Program, or the modifications to 211 | produce it from the Program, in the form of source code under the 212 | terms of section 4, provided that you also meet all of these conditions: 213 | 214 | a) The work must carry prominent notices stating that you modified 215 | it, and giving a relevant date. 216 | 217 | b) The work must carry prominent notices stating that it is 218 | released under this License and any conditions added under section 219 | 7. This requirement modifies the requirement in section 4 to 220 | "keep intact all notices". 221 | 222 | c) You must license the entire work, as a whole, under this 223 | License to anyone who comes into possession of a copy. This 224 | License will therefore apply, along with any applicable section 7 225 | additional terms, to the whole of the work, and all its parts, 226 | regardless of how they are packaged. This License gives no 227 | permission to license the work in any other way, but it does not 228 | invalidate such permission if you have separately received it. 
229 | 230 | d) If the work has interactive user interfaces, each must display 231 | Appropriate Legal Notices; however, if the Program has interactive 232 | interfaces that do not display Appropriate Legal Notices, your 233 | work need not make them do so. 234 | 235 | A compilation of a covered work with other separate and independent 236 | works, which are not by their nature extensions of the covered work, 237 | and which are not combined with it such as to form a larger program, 238 | in or on a volume of a storage or distribution medium, is called an 239 | "aggregate" if the compilation and its resulting copyright are not 240 | used to limit the access or legal rights of the compilation's users 241 | beyond what the individual works permit. Inclusion of a covered work 242 | in an aggregate does not cause this License to apply to the other 243 | parts of the aggregate. 244 | 245 | 6. Conveying Non-Source Forms. 246 | 247 | You may convey a covered work in object code form under the terms 248 | of sections 4 and 5, provided that you also convey the 249 | machine-readable Corresponding Source under the terms of this License, 250 | in one of these ways: 251 | 252 | a) Convey the object code in, or embodied in, a physical product 253 | (including a physical distribution medium), accompanied by the 254 | Corresponding Source fixed on a durable physical medium 255 | customarily used for software interchange. 
256 | 257 | b) Convey the object code in, or embodied in, a physical product 258 | (including a physical distribution medium), accompanied by a 259 | written offer, valid for at least three years and valid for as 260 | long as you offer spare parts or customer support for that product 261 | model, to give anyone who possesses the object code either (1) a 262 | copy of the Corresponding Source for all the software in the 263 | product that is covered by this License, on a durable physical 264 | medium customarily used for software interchange, for a price no 265 | more than your reasonable cost of physically performing this 266 | conveying of source, or (2) access to copy the 267 | Corresponding Source from a network server at no charge. 268 | 269 | c) Convey individual copies of the object code with a copy of the 270 | written offer to provide the Corresponding Source. This 271 | alternative is allowed only occasionally and noncommercially, and 272 | only if you received the object code with such an offer, in accord 273 | with subsection 6b. 274 | 275 | d) Convey the object code by offering access from a designated 276 | place (gratis or for a charge), and offer equivalent access to the 277 | Corresponding Source in the same way through the same place at no 278 | further charge. You need not require recipients to copy the 279 | Corresponding Source along with the object code. If the place to 280 | copy the object code is a network server, the Corresponding Source 281 | may be on a different server (operated by you or a third party) 282 | that supports equivalent copying facilities, provided you maintain 283 | clear directions next to the object code saying where to find the 284 | Corresponding Source. Regardless of what server hosts the 285 | Corresponding Source, you remain obligated to ensure that it is 286 | available for as long as needed to satisfy these requirements. 
287 | 288 | e) Convey the object code using peer-to-peer transmission, provided 289 | you inform other peers where the object code and Corresponding 290 | Source of the work are being offered to the general public at no 291 | charge under subsection 6d. 292 | 293 | A separable portion of the object code, whose source code is excluded 294 | from the Corresponding Source as a System Library, need not be 295 | included in conveying the object code work. 296 | 297 | A "User Product" is either (1) a "consumer product", which means any 298 | tangible personal property which is normally used for personal, family, 299 | or household purposes, or (2) anything designed or sold for incorporation 300 | into a dwelling. In determining whether a product is a consumer product, 301 | doubtful cases shall be resolved in favor of coverage. For a particular 302 | product received by a particular user, "normally used" refers to a 303 | typical or common use of that class of product, regardless of the status 304 | of the particular user or of the way in which the particular user 305 | actually uses, or expects or is expected to use, the product. A product 306 | is a consumer product regardless of whether the product has substantial 307 | commercial, industrial or non-consumer uses, unless such uses represent 308 | the only significant mode of use of the product. 309 | 310 | "Installation Information" for a User Product means any methods, 311 | procedures, authorization keys, or other information required to install 312 | and execute modified versions of a covered work in that User Product from 313 | a modified version of its Corresponding Source. The information must 314 | suffice to ensure that the continued functioning of the modified object 315 | code is in no case prevented or interfered with solely because 316 | modification has been made. 
317 | 318 | If you convey an object code work under this section in, or with, or 319 | specifically for use in, a User Product, and the conveying occurs as 320 | part of a transaction in which the right of possession and use of the 321 | User Product is transferred to the recipient in perpetuity or for a 322 | fixed term (regardless of how the transaction is characterized), the 323 | Corresponding Source conveyed under this section must be accompanied 324 | by the Installation Information. But this requirement does not apply 325 | if neither you nor any third party retains the ability to install 326 | modified object code on the User Product (for example, the work has 327 | been installed in ROM). 328 | 329 | The requirement to provide Installation Information does not include a 330 | requirement to continue to provide support service, warranty, or updates 331 | for a work that has been modified or installed by the recipient, or for 332 | the User Product in which it has been modified or installed. Access to a 333 | network may be denied when the modification itself materially and 334 | adversely affects the operation of the network or violates the rules and 335 | protocols for communication across the network. 336 | 337 | Corresponding Source conveyed, and Installation Information provided, 338 | in accord with this section must be in a format that is publicly 339 | documented (and with an implementation available to the public in 340 | source code form), and must require no special password or key for 341 | unpacking, reading or copying. 342 | 343 | 7. Additional Terms. 344 | 345 | "Additional permissions" are terms that supplement the terms of this 346 | License by making exceptions from one or more of its conditions. 347 | Additional permissions that are applicable to the entire Program shall 348 | be treated as though they were included in this License, to the extent 349 | that they are valid under applicable law. 
If additional permissions 350 | apply only to part of the Program, that part may be used separately 351 | under those permissions, but the entire Program remains governed by 352 | this License without regard to the additional permissions. 353 | 354 | When you convey a copy of a covered work, you may at your option 355 | remove any additional permissions from that copy, or from any part of 356 | it. (Additional permissions may be written to require their own 357 | removal in certain cases when you modify the work.) You may place 358 | additional permissions on material, added by you to a covered work, 359 | for which you have or can give appropriate copyright permission. 360 | 361 | Notwithstanding any other provision of this License, for material you 362 | add to a covered work, you may (if authorized by the copyright holders of 363 | that material) supplement the terms of this License with terms: 364 | 365 | a) Disclaiming warranty or limiting liability differently from the 366 | terms of sections 15 and 16 of this License; or 367 | 368 | b) Requiring preservation of specified reasonable legal notices or 369 | author attributions in that material or in the Appropriate Legal 370 | Notices displayed by works containing it; or 371 | 372 | c) Prohibiting misrepresentation of the origin of that material, or 373 | requiring that modified versions of such material be marked in 374 | reasonable ways as different from the original version; or 375 | 376 | d) Limiting the use for publicity purposes of names of licensors or 377 | authors of the material; or 378 | 379 | e) Declining to grant rights under trademark law for use of some 380 | trade names, trademarks, or service marks; or 381 | 382 | f) Requiring indemnification of licensors and authors of that 383 | material by anyone who conveys the material (or modified versions of 384 | it) with contractual assumptions of liability to the recipient, for 385 | any liability that these contractual assumptions directly impose on 
386 | those licensors and authors. 387 | 388 | All other non-permissive additional terms are considered "further 389 | restrictions" within the meaning of section 10. If the Program as you 390 | received it, or any part of it, contains a notice stating that it is 391 | governed by this License along with a term that is a further 392 | restriction, you may remove that term. If a license document contains 393 | a further restriction but permits relicensing or conveying under this 394 | License, you may add to a covered work material governed by the terms 395 | of that license document, provided that the further restriction does 396 | not survive such relicensing or conveying. 397 | 398 | If you add terms to a covered work in accord with this section, you 399 | must place, in the relevant source files, a statement of the 400 | additional terms that apply to those files, or a notice indicating 401 | where to find the applicable terms. 402 | 403 | Additional terms, permissive or non-permissive, may be stated in the 404 | form of a separately written license, or stated as exceptions; 405 | the above requirements apply either way. 406 | 407 | 8. Termination. 408 | 409 | You may not propagate or modify a covered work except as expressly 410 | provided under this License. Any attempt otherwise to propagate or 411 | modify it is void, and will automatically terminate your rights under 412 | this License (including any patent licenses granted under the third 413 | paragraph of section 11). 414 | 415 | However, if you cease all violation of this License, then your 416 | license from a particular copyright holder is reinstated (a) 417 | provisionally, unless and until the copyright holder explicitly and 418 | finally terminates your license, and (b) permanently, if the copyright 419 | holder fails to notify you of the violation by some reasonable means 420 | prior to 60 days after the cessation. 
421 | 422 | Moreover, your license from a particular copyright holder is 423 | reinstated permanently if the copyright holder notifies you of the 424 | violation by some reasonable means, this is the first time you have 425 | received notice of violation of this License (for any work) from that 426 | copyright holder, and you cure the violation prior to 30 days after 427 | your receipt of the notice. 428 | 429 | Termination of your rights under this section does not terminate the 430 | licenses of parties who have received copies or rights from you under 431 | this License. If your rights have been terminated and not permanently 432 | reinstated, you do not qualify to receive new licenses for the same 433 | material under section 10. 434 | 435 | 9. Acceptance Not Required for Having Copies. 436 | 437 | You are not required to accept this License in order to receive or 438 | run a copy of the Program. Ancillary propagation of a covered work 439 | occurring solely as a consequence of using peer-to-peer transmission 440 | to receive a copy likewise does not require acceptance. However, 441 | nothing other than this License grants you permission to propagate or 442 | modify any covered work. These actions infringe copyright if you do 443 | not accept this License. Therefore, by modifying or propagating a 444 | covered work, you indicate your acceptance of this License to do so. 445 | 446 | 10. Automatic Licensing of Downstream Recipients. 447 | 448 | Each time you convey a covered work, the recipient automatically 449 | receives a license from the original licensors, to run, modify and 450 | propagate that work, subject to this License. You are not responsible 451 | for enforcing compliance by third parties with this License. 452 | 453 | An "entity transaction" is a transaction transferring control of an 454 | organization, or substantially all assets of one, or subdividing an 455 | organization, or merging organizations. 
If propagation of a covered 456 | work results from an entity transaction, each party to that 457 | transaction who receives a copy of the work also receives whatever 458 | licenses to the work the party's predecessor in interest had or could 459 | give under the previous paragraph, plus a right to possession of the 460 | Corresponding Source of the work from the predecessor in interest, if 461 | the predecessor has it or can get it with reasonable efforts. 462 | 463 | You may not impose any further restrictions on the exercise of the 464 | rights granted or affirmed under this License. For example, you may 465 | not impose a license fee, royalty, or other charge for exercise of 466 | rights granted under this License, and you may not initiate litigation 467 | (including a cross-claim or counterclaim in a lawsuit) alleging that 468 | any patent claim is infringed by making, using, selling, offering for 469 | sale, or importing the Program or any portion of it. 470 | 471 | 11. Patents. 472 | 473 | A "contributor" is a copyright holder who authorizes use under this 474 | License of the Program or a work on which the Program is based. The 475 | work thus licensed is called the contributor's "contributor version". 476 | 477 | A contributor's "essential patent claims" are all patent claims 478 | owned or controlled by the contributor, whether already acquired or 479 | hereafter acquired, that would be infringed by some manner, permitted 480 | by this License, of making, using, or selling its contributor version, 481 | but do not include claims that would be infringed only as a 482 | consequence of further modification of the contributor version. For 483 | purposes of this definition, "control" includes the right to grant 484 | patent sublicenses in a manner consistent with the requirements of 485 | this License. 
486 | 487 | Each contributor grants you a non-exclusive, worldwide, royalty-free 488 | patent license under the contributor's essential patent claims, to 489 | make, use, sell, offer for sale, import and otherwise run, modify and 490 | propagate the contents of its contributor version. 491 | 492 | In the following three paragraphs, a "patent license" is any express 493 | agreement or commitment, however denominated, not to enforce a patent 494 | (such as an express permission to practice a patent or covenant not to 495 | sue for patent infringement). To "grant" such a patent license to a 496 | party means to make such an agreement or commitment not to enforce a 497 | patent against the party. 498 | 499 | If you convey a covered work, knowingly relying on a patent license, 500 | and the Corresponding Source of the work is not available for anyone 501 | to copy, free of charge and under the terms of this License, through a 502 | publicly available network server or other readily accessible means, 503 | then you must either (1) cause the Corresponding Source to be so 504 | available, or (2) arrange to deprive yourself of the benefit of the 505 | patent license for this particular work, or (3) arrange, in a manner 506 | consistent with the requirements of this License, to extend the patent 507 | license to downstream recipients. "Knowingly relying" means you have 508 | actual knowledge that, but for the patent license, your conveying the 509 | covered work in a country, or your recipient's use of the covered work 510 | in a country, would infringe one or more identifiable patents in that 511 | country that you have reason to believe are valid. 
512 | 513 | If, pursuant to or in connection with a single transaction or 514 | arrangement, you convey, or propagate by procuring conveyance of, a 515 | covered work, and grant a patent license to some of the parties 516 | receiving the covered work authorizing them to use, propagate, modify 517 | or convey a specific copy of the covered work, then the patent license 518 | you grant is automatically extended to all recipients of the covered 519 | work and works based on it. 520 | 521 | A patent license is "discriminatory" if it does not include within 522 | the scope of its coverage, prohibits the exercise of, or is 523 | conditioned on the non-exercise of one or more of the rights that are 524 | specifically granted under this License. You may not convey a covered 525 | work if you are a party to an arrangement with a third party that is 526 | in the business of distributing software, under which you make payment 527 | to the third party based on the extent of your activity of conveying 528 | the work, and under which the third party grants, to any of the 529 | parties who would receive the covered work from you, a discriminatory 530 | patent license (a) in connection with copies of the covered work 531 | conveyed by you (or copies made from those copies), or (b) primarily 532 | for and in connection with specific products or compilations that 533 | contain the covered work, unless you entered into that arrangement, 534 | or that patent license was granted, prior to 28 March 2007. 535 | 536 | Nothing in this License shall be construed as excluding or limiting 537 | any implied license or other defenses to infringement that may 538 | otherwise be available to you under applicable patent law. 539 | 540 | 12. No Surrender of Others' Freedom. 541 | 542 | If conditions are imposed on you (whether by court order, agreement or 543 | otherwise) that contradict the conditions of this License, they do not 544 | excuse you from the conditions of this License. 
If you cannot convey a 545 | covered work so as to satisfy simultaneously your obligations under this 546 | License and any other pertinent obligations, then as a consequence you may 547 | not convey it at all. For example, if you agree to terms that obligate you 548 | to collect a royalty for further conveying from those to whom you convey 549 | the Program, the only way you could satisfy both those terms and this 550 | License would be to refrain entirely from conveying the Program. 551 | 552 | 13. Use with the GNU Affero General Public License. 553 | 554 | Notwithstanding any other provision of this License, you have 555 | permission to link or combine any covered work with a work licensed 556 | under version 3 of the GNU Affero General Public License into a single 557 | combined work, and to convey the resulting work. The terms of this 558 | License will continue to apply to the part which is the covered work, 559 | but the special requirements of the GNU Affero General Public License, 560 | section 13, concerning interaction through a network will apply to the 561 | combination as such. 562 | 563 | 14. Revised Versions of this License. 564 | 565 | The Free Software Foundation may publish revised and/or new versions of 566 | the GNU General Public License from time to time. Such new versions will 567 | be similar in spirit to the present version, but may differ in detail to 568 | address new problems or concerns. 569 | 570 | Each version is given a distinguishing version number. If the 571 | Program specifies that a certain numbered version of the GNU General 572 | Public License "or any later version" applies to it, you have the 573 | option of following the terms and conditions either of that numbered 574 | version or of any later version published by the Free Software 575 | Foundation. If the Program does not specify a version number of the 576 | GNU General Public License, you may choose any version ever published 577 | by the Free Software Foundation. 
578 | 579 | If the Program specifies that a proxy can decide which future 580 | versions of the GNU General Public License can be used, that proxy's 581 | public statement of acceptance of a version permanently authorizes you 582 | to choose that version for the Program. 583 | 584 | Later license versions may give you additional or different 585 | permissions. However, no additional obligations are imposed on any 586 | author or copyright holder as a result of your choosing to follow a 587 | later version. 588 | 589 | 15. Disclaimer of Warranty. 590 | 591 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 592 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY 594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, 595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 596 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM 597 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF 598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 599 | 600 | 16. Limitation of Liability. 601 | 602 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS 604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY 605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF 607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD 608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), 609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF 610 | SUCH DAMAGES. 611 | 612 | 17. Interpretation of Sections 15 and 16. 
613 | 614 | If the disclaimer of warranty and limitation of liability provided 615 | above cannot be given local legal effect according to their terms, 616 | reviewing courts shall apply local law that most closely approximates 617 | an absolute waiver of all civil liability in connection with the 618 | Program, unless a warranty or assumption of liability accompanies a 619 | copy of the Program in return for a fee. 620 | 621 | END OF TERMS AND CONDITIONS 622 | 623 | How to Apply These Terms to Your New Programs 624 | 625 | If you develop a new program, and you want it to be of the greatest 626 | possible use to the public, the best way to achieve this is to make it 627 | free software which everyone can redistribute and change under these terms. 628 | 629 | To do so, attach the following notices to the program. It is safest 630 | to attach them to the start of each source file to most effectively 631 | state the exclusion of warranty; and each file should have at least 632 | the "copyright" line and a pointer to where the full notice is found. 633 | 634 | {one line to give the program's name and a brief idea of what it does.} 635 | Copyright (C) {year} {name of author} 636 | 637 | This program is free software: you can redistribute it and/or modify 638 | it under the terms of the GNU General Public License as published by 639 | the Free Software Foundation, either version 3 of the License, or 640 | (at your option) any later version. 641 | 642 | This program is distributed in the hope that it will be useful, 643 | but WITHOUT ANY WARRANTY; without even the implied warranty of 644 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 645 | GNU General Public License for more details. 646 | 647 | You should have received a copy of the GNU General Public License 648 | along with this program. If not, see . 649 | 650 | Also add information on how to contact you by electronic and paper mail. 
651 | 652 | If the program does terminal interaction, make it output a short 653 | notice like this when it starts in an interactive mode: 654 | 655 | {project} Copyright (C) {year} {fullname} 656 | This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 657 | This is free software, and you are welcome to redistribute it 658 | under certain conditions; type `show c' for details. 659 | 660 | The hypothetical commands `show w' and `show c' should show the appropriate 661 | parts of the General Public License. Of course, your program's commands 662 | might be different; for a GUI interface, you would use an "about box". 663 | 664 | You should also get your employer (if you work as a programmer) or school, 665 | if any, to sign a "copyright disclaimer" for the program, if necessary. 666 | For more information on this, and how to apply and follow the GNU GPL, see 667 | . 668 | 669 | The GNU General Public License does not permit incorporating your program 670 | into proprietary programs. If your program is a subroutine library, you 671 | may consider it more useful to permit linking proprietary applications with 672 | the library. If this is what you want to do, use the GNU Lesser General 673 | Public License instead of this License. But first, please read 674 | . 
675 | -------------------------------------------------------------------------------- /Project1_商品秒杀/Memcached实现/BuyProductionMem.java: -------------------------------------------------------------------------------- 1 | package mxlee.ms; 2 | 3 | import java.io.BufferedWriter; 4 | import java.io.FileWriter; 5 | import java.io.IOException; 6 | import java.net.InetSocketAddress; 7 | import java.net.URL; 8 | import java.util.Random; 9 | 10 | import net.spy.memcached.CASResponse; 11 | import net.spy.memcached.CASValue; 12 | import net.spy.memcached.MemcachedClient; 13 | 14 | /** 15 | * 模拟抢购任务,每个任务都会连接服务器,修改商品的值,这里使用了CAS协议,保证了多线程下的数据原子性 16 | * 17 | * @classnaName BuyProductionMem.java 18 | * @author mxlee 19 | * @date 2016年11月16日 20 | */ 21 | public class BuyProductionMem implements Runnable { 22 | private String MEMCACHED_SERVER_IP = "192.168.1.104";// 服务器端ip 23 | private int MEMCACEHD_SERVER_PORT = 11211; // 服务器端端口 24 | public static long start = 0; 25 | public static long times = 0; 26 | public static int count = 0; 27 | 28 | @Override 29 | public void run() { 30 | MemcachedClient memcachedClient = null; 31 | try { // 新建一个memcached客户端 32 | memcachedClient = new MemcachedClient(new InetSocketAddress(MEMCACHED_SERVER_IP, MEMCACEHD_SERVER_PORT)); 33 | } catch (IOException e) { 34 | e.printStackTrace(); 35 | } 36 | Random randomInt = new Random(); 37 | int nextInt = randomInt.nextInt(5) + 1; 38 | String productionName = "prod" + nextInt;// 随机得到一个商品名(key) 39 | 40 | CASValue casValue = memcachedClient.gets(productionName);// 得到key对应的CASValue 41 | long cas = casValue.getCas();// 得到商品对应的数量的版本号 42 | Integer value = (Integer) casValue.getValue();// 得到商品对应的数量的值 43 | 44 | if (value > 0) { 45 | // 通过cas修改value,如果版本号没变则返回OK修改成功,如果版本号变了则返回其他值 46 | CASResponse response = memcachedClient.cas(productionName, cas, value - 1); 47 | if (response.toString().equals("OK")) { 48 | System.out.println( 49 | Thread.currentThread().getName() + "成功抢到一个商品:" + productionName + "\t剩余:" + (value - 
1)); 50 | count++; 51 | switch (count) { 52 | case 1: 53 | start = System.currentTimeMillis(); 54 | break; 55 | case 50: 56 | times = System.currentTimeMillis() - start; 57 | System.out.println("============================" + times); 58 | break; 59 | default: 60 | break; 61 | } 62 | print(Thread.currentThread().getName() + "成功抢到一个商品:" + productionName + "\t剩余:" + (value - 1)); 63 | } else { 64 | System.out.println(Thread.currentThread().getName() + "手速慢了,没抢到"); 65 | } 66 | System.out.println("时间点" + System.currentTimeMillis()); 67 | } else { 68 | System.out.println("商品" + productionName + ",已经被抢光了"); 69 | } 70 | memcachedClient.shutdown(); 71 | } 72 | 73 | /** 74 | * 打印成功抢购信息 75 | * 76 | * @param str 77 | */ 78 | private synchronized void print(String str) { 79 | // 获取程序所在根目录 80 | Class clazz = RedisMS.class; 81 | URL url = clazz.getResource("/"); 82 | String path = url.toString();// 结果为file:/D:/Workspaces/javaBasic/nioDemo/target/classes/ 83 | path = path.substring(6); 84 | 85 | // 缓冲写出流 86 | BufferedWriter bw = null; 87 | try { 88 | bw = new BufferedWriter(new FileWriter(path + "/resultMem.txt", true)); 89 | bw.write(str); 90 | bw.newLine(); 91 | bw.flush(); 92 | } catch (IOException e) { 93 | e.printStackTrace(); 94 | } finally { 95 | try { 96 | bw.close(); 97 | } catch (IOException e) { 98 | e.printStackTrace(); 99 | } 100 | } 101 | } 102 | 103 | } 104 | -------------------------------------------------------------------------------- /Project1_商品秒杀/Memcached实现/RushToPurchaseMem.java: -------------------------------------------------------------------------------- 1 | package mxlee.ms; 2 | 3 | import java.io.IOException; 4 | import java.net.InetSocketAddress; 5 | 6 | import net.spy.memcached.MemcachedClient; 7 | 8 | /** 9 | * 模拟商品抢购,使用memcached缓存,同时开启多个线程访问memcached服务器 10 | * 11 | * @classnaName RushToPurchaseMem.java 12 | * @author mxlee 13 | * @date 2016年11月16日 14 | */ 15 | public class RushToPurchaseMem { 16 | 17 | public static void main(String[] args) { 18 
| addProductions();// 加入商品 19 | System.out.println("开始抢购,时间点:" + System.currentTimeMillis()); 20 | 21 | for (int i = 0; i < 500; i++) {// 同时开启多个线程访问memcached服务器 new 22 | new Thread(new BuyProductionMem()).start(); 23 | } 24 | 25 | } 26 | 27 | // -----此方法向memcached中加入商品数据 28 | public static void addProductions() { 29 | String MEMCACHED_SERVER_IP = "192.168.1.104";// 服务器端ip 30 | int MEMCACEHD_SERVER_PORT = 11211; // 服务器端端口 31 | MemcachedClient memcachedClient = null; 32 | try { 33 | memcachedClient = new MemcachedClient(new InetSocketAddress(MEMCACHED_SERVER_IP, MEMCACEHD_SERVER_PORT)); 34 | } catch (IOException e) { 35 | System.out.println("链接服务器失败"); 36 | e.printStackTrace(); 37 | } 38 | // 存入数据 39 | memcachedClient.set("prod1", 30, 10);// 60表示缓存时间为60秒,60秒后自动销毁此条key-value 40 | memcachedClient.set("prod2", 30, 10); 41 | memcachedClient.set("prod3", 30, 10); 42 | memcachedClient.set("prod4", 30, 10); 43 | memcachedClient.set("prod5", 30, 10); 44 | System.out.println(memcachedClient.get("prod1")); 45 | System.out.println(memcachedClient.get("prod2")); 46 | System.out.println(memcachedClient.get("prod3")); 47 | System.out.println(memcachedClient.get("prod4")); 48 | System.out.println(memcachedClient.get("prod5")); 49 | memcachedClient.shutdown(); 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /Project1_商品秒杀/Redis实现/RedisMS.java: -------------------------------------------------------------------------------- 1 | package mxlee.ms; 2 | 3 | import java.net.URL; 4 | import java.util.Random; 5 | import java.util.concurrent.ExecutorService; 6 | import java.util.concurrent.Executors; 7 | 8 | import redis.clients.jedis.Jedis; 9 | import redis.clients.jedis.JedisPool; 10 | 11 | /** 12 | * Redis秒杀主程序 13 | * 14 | * @author mxlee 15 | * 16 | */ 17 | public class RedisMS { 18 | 19 | public static void main(String[] args) { 20 | 21 | JedisPool jedisPool = RedisUtil.getJedis(); 22 | 23 | // 预购清单 24 | String[] arr = { "iphone", 
"pc", "surface", "mi", "huawei" }; 25 | 26 | // Redis数据库赋值 27 | assignment(arr, 10, jedisPool); 28 | 29 | // 抢购 30 | panicBuying(arr, 500, jedisPool); 31 | 32 | } 33 | 34 | /** 35 | * 为Redis数据库中的商品赋值 36 | * 37 | * @param arr 38 | * String 抢购商品数组 39 | * @param num 40 | * int 商品库存 41 | */ 42 | private static void assignment(String[] arr, int num, JedisPool jedisPool) { 43 | 44 | // 获得连接 45 | Jedis jedis = jedisPool.getResource(); 46 | boolean flag = false; 47 | 48 | for (int i = 0; i < arr.length; i++) { 49 | jedis.set(arr[i], num + ""); 50 | } 51 | 52 | } 53 | 54 | /** 55 | * 抢购开始 56 | * 57 | * @param arr 58 | * String 抢购商品数组 59 | * @param threadNum 60 | * int 线程数量 61 | */ 62 | private static void panicBuying(String[] arr, int threadNum, JedisPool jedisPool) { 63 | // 线程池 64 | ExecutorService fixedThreadPool = Executors.newFixedThreadPool(threadNum); 65 | 66 | Random random = new Random(); 67 | 68 | for (int i = 0; i < threadNum; i++) { 69 | // 为线程随机传递需要抢购的商品 70 | int index = random.nextInt(5); 71 | RedisThread redisThread = new RedisThread(arr[index], jedisPool); 72 | fixedThreadPool.submit(redisThread); 73 | } 74 | 75 | // 关闭线程池 76 | fixedThreadPool.shutdown(); 77 | } 78 | 79 | } 80 | -------------------------------------------------------------------------------- /Project1_商品秒杀/Redis实现/RedisThread.java: -------------------------------------------------------------------------------- 1 | package mxlee.ms; 2 | 3 | import java.io.BufferedWriter; 4 | import java.io.FileWriter; 5 | import java.io.IOException; 6 | import java.net.URL; 7 | import java.sql.Timestamp; 8 | import java.text.ParseException; 9 | import java.text.SimpleDateFormat; 10 | import java.util.Date; 11 | import java.util.List; 12 | 13 | import redis.clients.jedis.Jedis; 14 | import redis.clients.jedis.JedisPool; 15 | import redis.clients.jedis.Transaction; 16 | 17 | public class RedisThread extends Thread { 18 | private static long start = 0; // 开始抢购时间 19 | private static long time = 0; // 多长时间抢购一空 20 
| private static int count = 0; // 抢到商品数量 21 | private JedisPool jedisPool; 22 | private String pro; // 需要购买的商品 23 | 24 | public RedisThread(String pro, JedisPool jedisPool) { 25 | this.pro = pro; 26 | this.jedisPool = jedisPool; 27 | } 28 | 29 | /** 30 | * 打印成功抢购信息 31 | * 32 | * @param str 33 | */ 34 | private synchronized void print(String str) { 35 | // 获取程序所在根目录 36 | Class clazz = RedisMS.class; 37 | URL url = clazz.getResource("/"); 38 | String path = url.toString();// 结果为file:/D:/Workspaces/javaBasic/nioDemo/target/classes/ 39 | path = path.substring(6); 40 | 41 | // 缓冲写出流 42 | BufferedWriter bw = null; 43 | try { 44 | bw = new BufferedWriter(new FileWriter(path + "/resultRedis.txt", true)); 45 | bw.write(str); 46 | bw.newLine(); 47 | bw.flush(); 48 | } catch (IOException e) { 49 | e.printStackTrace(); 50 | } finally { 51 | try { 52 | bw.close(); 53 | } catch (IOException e) { 54 | e.printStackTrace(); 55 | } 56 | } 57 | } 58 | 59 | /** 60 | * 线程开始定时器 61 | * 62 | * @return 63 | */ 64 | private long clock() { 65 | 66 | String clock = "2016-11-18 20:47:00"; 67 | 68 | SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); 69 | 70 | Date date = null; 71 | try { 72 | date = simpleDateFormat.parse(clock); 73 | } catch (ParseException e) { 74 | e.printStackTrace(); 75 | } 76 | 77 | long time = date.getTime(); 78 | 79 | return time; 80 | } 81 | 82 | @Override 83 | public void run() { 84 | 85 | // 取当前时间 86 | long currentTimeMillis = System.currentTimeMillis(); 87 | 88 | long millis = clock() - currentTimeMillis; 89 | 90 | if (millis > 0) { 91 | try { 92 | Thread.sleep(millis); 93 | } catch (InterruptedException e) { 94 | e.printStackTrace(); 95 | } 96 | } 97 | 98 | while (true) { 99 | 100 | // 获得连接 101 | Jedis jedis = jedisPool.getResource(); 102 | try { 103 | Thread.sleep(100); 104 | } catch (Exception e) { 105 | } 106 | 107 | try { 108 | // 获得此刻商品apple的数量 109 | int proNum = Integer.parseInt(jedis.get(pro)); 110 | List result = null; 111 | 
// 如果还有库存 112 | if (proNum > 0) { 113 | // 监听商品pro 114 | jedis.watch(pro); 115 | int proNum1 = Integer.parseInt(jedis.get(pro)); 116 | 117 | if (proNum1 < proNum) { 118 | jedis.unwatch(); 119 | } else { 120 | // jedis方法开始事务 121 | Transaction transaction = jedis.multi(); 122 | 123 | // 购买商品,然后更改库存 124 | transaction.set(pro, String.valueOf(proNum - 1)); 125 | 126 | // 提交事务 127 | result = transaction.exec(); 128 | } 129 | // 监听的商品被别的线程操作,则本线程无法购买商品,需要排队,自己不修改商品的数量 130 | if (result == null || result.isEmpty()) { 131 | System.out.println(Thread.currentThread().getName() + "\t正在排队抢购\t" + pro + "...");// 可能是watch-key被外部修改,或者是数据操作被驳回 132 | } else { 133 | count++; 134 | 135 | switch (count) { 136 | case 1: 137 | start = System.currentTimeMillis(); 138 | break; 139 | 140 | case 50: 141 | time = System.currentTimeMillis() - start; 142 | System.out.println("===================" + time); 143 | break; 144 | default: 145 | break; 146 | } 147 | String str = Thread.currentThread().getName() + "\t抢购成功,商品名为:\t" + pro + "\t抢购时间:" 148 | + new Timestamp(new Date().getTime()); 149 | System.out.println(str); 150 | // 把抢购成功的顾客信息打印出去 151 | print(str); 152 | 153 | } // end if else 154 | 155 | } else {// 库存为0时 156 | System.out.println(pro + "已售罄,库存为0"); 157 | break; 158 | } 159 | } catch (Exception e) { 160 | e.printStackTrace(); 161 | RedisUtil.returnResource(jedis); 162 | } finally { 163 | RedisUtil.returnResource(jedis); 164 | } 165 | 166 | } 167 | } 168 | } 169 | -------------------------------------------------------------------------------- /Project1_商品秒杀/Redis实现/RedisUtil.java: -------------------------------------------------------------------------------- 1 | package mxlee.ms; 2 | 3 | import org.slf4j.Logger; 4 | import org.slf4j.LoggerFactory; 5 | 6 | import redis.clients.jedis.Jedis; 7 | import redis.clients.jedis.JedisPool; 8 | import redis.clients.jedis.JedisPoolConfig; 9 | 10 | /** 11 | * Redis工具类 12 | * 13 | * @author mxlee 14 | * 15 | */ 16 | public class RedisUtil { 17 | 
protected static Logger logger = LoggerFactory.getLogger(JedisUtil.class); 18 | public static final String HOST = "127.0.0.1"; 19 | public static final int PORT = 6379; 20 | 21 | private RedisUtil() { 22 | } 23 | 24 | private static JedisPool jedisPool = null; 25 | 26 | /** 27 | * 初始化JedisPool 28 | * 29 | * @return 30 | */ 31 | private static void initialPool() { 32 | 33 | if (jedisPool == null) { 34 | JedisPoolConfig jedisPoolConfig = new JedisPoolConfig(); 35 | // 指定连接池中最大的空闲连接数 36 | jedisPoolConfig.setMaxIdle(100); 37 | // 连接池创建的最大连接数 38 | jedisPoolConfig.setMaxTotal(500); 39 | // 设置创建连接的超时时间 40 | jedisPoolConfig.setMaxWaitMillis(1000 * 50); 41 | // 表示从连接池中获取连接时,先测试连接是否可用 42 | jedisPoolConfig.setTestOnBorrow(true); 43 | jedisPool = new JedisPool(jedisPoolConfig, HOST, PORT); 44 | } 45 | 46 | } 47 | 48 | /** 49 | * 在多线程环境同步初始化 50 | */ 51 | private static synchronized void poolInit() { 52 | if (jedisPool == null) { 53 | initialPool(); 54 | } 55 | } 56 | 57 | /** 58 | * 同步获取Jedis实例 59 | * 60 | * @return Jedis 61 | */ 62 | public synchronized static Jedis getJedis() { 63 | if (jedisPool == null) { 64 | poolInit(); 65 | } 66 | Jedis jedis = null; 67 | try { 68 | if (jedisPool != null) { 69 | jedis = jedisPool.getResource(); 70 | } 71 | } catch (Exception e) { 72 | logger.error("获取jedis出错: " + e); 73 | } finally { 74 | returnResource(jedis); 75 | } 76 | return jedis; 77 | } 78 | 79 | /** 80 | * 释放jedis资源 81 | * 82 | * @param jedis 83 | */ 84 | public static void returnResource(Jedis jedis) { 85 | if (jedis != null && jedisPool != null) { 86 | // Jedis3.0之后,returnResource遭弃用,官方重写了close方法 87 | // jedisPool.returnResource(jedis); 88 | jedis.close(); 89 | } 90 | } 91 | 92 | /** 93 | * 释放jedis资源 94 | * 95 | * @param jedis 96 | */ 97 | public static void returnBrokenJedis(Jedis jedis) { 98 | if (jedis != null && jedisPool != null) { 99 | jedisPool.returnBrokenResource(jedis); 100 | } 101 | jedis = null; 102 | } 103 | 104 | } 105 | 
-------------------------------------------------------------------------------- /Project1_商品秒杀/introduction.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/monsonlee/BigData/6004a79234c3f0cfa1efc1c197e93651303abd85/Project1_商品秒杀/introduction.pdf -------------------------------------------------------------------------------- /Project2_日志分析logAnalysis/MR_WLA.java: -------------------------------------------------------------------------------- 1 | package mx1202.wla1; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.conf.Configured; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.LongWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.Mapper; 11 | import org.apache.hadoop.mapreduce.Reducer; 12 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 13 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 14 | import org.apache.hadoop.util.Tool; 15 | import org.apache.hadoop.util.ToolRunner; 16 | 17 | /** 18 | * MR_WLA用于网站日志分析 19 | * 20 | * @className MR_WLA 21 | * @author mxlee 22 | * @email imxlee@foxmail.com 23 | * @date 2016年12月2日 24 | */ 25 | public class MR_WLA extends Configured implements Tool { 26 | 27 | public static void main(String[] args) throws Exception { 28 | ToolRunner.run(new MR_WLA(), args); 29 | } 30 | 31 | public int run(String[] args) throws Exception { 32 | String jobName = "wla_baidu"; 33 | 34 | String inputPath = args[0]; 35 | String outputPath = args[1]; 36 | Path path = new Path(outputPath); 37 | // 删除输出目录 38 | path.getFileSystem(getConf()).delete(path, true); 39 | 40 | // 1、把所有代码组织到类似于Topology的类中 41 | Job job = Job.getInstance(getConf(), jobName); 42 | 43 | // 2、一定要打包运行,必须写下面一行代码 44 | job.setJarByClass(MR_WLA.class); 45 | 46 | // 3、指定输入的hdfs 47 | FileInputFormat.setInputPaths(job, inputPath); 48 | 49 | // 4、指定map类 50 | 
job.setMapperClass(WLA_Mapper.class); 51 | 52 | // 5、指定map输出的的类型 53 | job.setMapOutputKeyClass(Text.class); 54 | job.setMapOutputValueClass(Text.class); 55 | 56 | // 6、指定reduce类 57 | job.setReducerClass(WLA_Reducer.class); 58 | 59 | // 7、指定reduce输出的的类型 60 | job.setOutputKeyClass(Text.class); 61 | job.setOutputValueClass(Text.class); 62 | 63 | // 8、指定输出的hdfs 64 | FileOutputFormat.setOutputPath(job, new Path(outputPath)); 65 | 66 | return job.waitForCompletion(true) ? 0 : 1; 67 | } 68 | 69 | /** 70 | * WLA_Mapper用于网站日志分组 71 | * 72 | * @className WLA_Mapper 73 | * @author mxlee 74 | * @email imxlee@foxmail.com 75 | * @date 2016年12月2日 76 | */ 77 | public static class WLA_Mapper extends Mapper { 78 | 79 | @Override 80 | protected void map(LongWritable key, Text value, Mapper.Context context) 81 | throws IOException, InterruptedException { 82 | // 格式[2016-11-29 00:02:07 INFO ] 83 | // (cn.baidu.core.inteceptor.LogInteceptor:55) - [0 183.136.190.51 84 | // null http://www.baidu.cn/payment] 85 | String log = value.toString();// 网站访问日志 86 | String str = "(cn.baidu.core.inteceptor.LogInteceptor:55)"; 87 | String baseUrl = "http://www.baidu.cn/"; 88 | int len = str.length(); 89 | int urlLen = baseUrl.length(); 90 | if (log.indexOf(str) != -1) { 91 | String[] log1 = log.split(str); 92 | // 分析第一段[2016-11-29 00:29:58 INFO 93 | String visitTime = log1[0].substring(1, 20);// 获取访问时间 94 | // 分析第二段112.90.82.196 null 95 | // http://www.baidu.cn/course/jobOffline] 96 | String[] split2 = log1[1].split("\t"); 97 | String ip = split2[1];// 获取ip 98 | String url = split2[3];// 获取网址 99 | String subUrl = "http://www.baidu.cn"; 100 | if (url.length() - 1 > urlLen) { 101 | subUrl = url.substring(urlLen, url.length() - 1); 102 | } 103 | String result = visitTime + "," + subUrl; 104 | context.write(new Text(ip), new Text(result)); 105 | } 106 | } 107 | 108 | } 109 | 110 | /** 111 | * WLA_Reducer用于处理分组后的数据 112 | * 113 | * @className WLA_Reducer 114 | * @author mxlee 115 | * @email 
imxlee@foxmail.com 116 | * @date 2016年12月2日 117 | */ 118 | public static class WLA_Reducer extends Reducer { 119 | 120 | @Override 121 | protected void reduce(Text key, Iterable values, Reducer.Context context) 122 | throws IOException, InterruptedException { 123 | 124 | long firstTime = Long.MAX_VALUE;// 首次访问时间 125 | String startTime = null; 126 | String endTime = null; 127 | long lastTime = Long.MIN_VALUE; 128 | String firstPage = null;// 首次访问页面 129 | String lastPage = null; 130 | int count = 0;// 访问页面次数 131 | 132 | for (Text value : values) { 133 | count++; 134 | String[] split = value.toString().split(","); 135 | 136 | if (TimeUtil.transDate(split[0]) < firstTime) { 137 | firstTime = TimeUtil.transDate(split[0]);// yyyy-MM-dd 138 | // HH:mm:ss 139 | startTime = split[0].substring(11, 19); 140 | firstPage = split[1]; 141 | } 142 | 143 | if (TimeUtil.transDate(split[0]) > lastTime) { 144 | lastTime = TimeUtil.transDate(split[0]); 145 | endTime = split[0].substring(11, 19); 146 | lastPage = split[1]; 147 | } 148 | 149 | } // end for 150 | 151 | long time = 0; 152 | if ((lastTime - firstTime) % (1000 * 60) > 0) { 153 | time = (lastTime - firstTime) / (1000 * 60) + 1; 154 | } else { 155 | time = (lastTime - firstTime) / (1000 * 60); 156 | } 157 | String result = startTime + "\t" + firstPage + "\t" + endTime + "\t" + lastPage + "\t" + count + "\t" + time 158 | + "分钟"; 159 | context.write(key, new Text(result)); 160 | 161 | }// end reduce 162 | 163 | }// end class 164 | 165 | } 166 | -------------------------------------------------------------------------------- /Project2_日志分析logAnalysis/TimeUtil.java: -------------------------------------------------------------------------------- 1 | package mx1202.wla1; 2 | 3 | import java.text.ParseException; 4 | import java.text.SimpleDateFormat; 5 | import java.util.Date; 6 | 7 | public class TimeUtil { 8 | 9 | /** 10 | * 根据给定字符串,转成时间戳,格式yyyy-MM-dd HH:mm:ss 11 | * 12 | * @param str 13 | * @return 14 | * @throws ParseException 
15 | */ 16 | public static long transDate(String str) { 17 | SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); 18 | Date parse = null; 19 | try { 20 | parse = simpleDateFormat.parse(str); 21 | } catch (ParseException e) { 22 | return -1; 23 | } 24 | long time = parse.getTime(); 25 | return time; 26 | } 27 | 28 | } 29 | -------------------------------------------------------------------------------- /Project2_日志分析logAnalysis/网站日志分析.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/monsonlee/BigData/6004a79234c3f0cfa1efc1c197e93651303abd85/Project2_日志分析logAnalysis/网站日志分析.pdf -------------------------------------------------------------------------------- /Project3_mysql迁移至Hive(JDBC)/hive/HiveDemo.java: -------------------------------------------------------------------------------- 1 | package bdr1205.hive; 2 | 3 | import java.io.File; 4 | import java.io.FileInputStream; 5 | import java.io.FileWriter; 6 | import java.net.URI; 7 | import java.sql.Connection; 8 | import java.sql.DatabaseMetaData; 9 | import java.sql.DriverManager; 10 | import java.sql.ResultSet; 11 | import java.sql.ResultSetMetaData; 12 | import java.sql.SQLException; 13 | import java.sql.Statement; 14 | import java.util.ArrayList; 15 | import java.util.List; 16 | 17 | import org.apache.commons.lang.StringUtils; 18 | import org.apache.hadoop.conf.Configuration; 19 | import org.apache.hadoop.fs.FSDataOutputStream; 20 | import org.apache.hadoop.fs.FileSystem; 21 | import org.apache.hadoop.fs.Path; 22 | import org.apache.hadoop.io.IOUtils; 23 | 24 | import bdr.utils.JDBCUtil; 25 | 26 | /** 27 | * HiveDemo用于把MySQL中大量的表和数据迁移到Hive中 28 | * 29 | * @className HiveDemo 30 | * @author mxlee 31 | * @email imxlee@foxmail.com 32 | * @date 2016年12月5日 33 | */ 34 | public class HiveDemo { 35 | 36 | static String url = "jdbc:mysql://192.168.1.100:3306/mx"; 37 | static String user = "root"; 38 | static String password = 
"admin"; 39 | 40 | static { 41 | try { 42 | Class.forName("com.mysql.jdbc.Driver"); 43 | } catch (ClassNotFoundException e) { 44 | e.printStackTrace(); 45 | } 46 | } 47 | 48 | @SuppressWarnings("resource") 49 | public static void main(String[] args) throws Exception { 50 | 51 | // 3.1、加载驱动 52 | Class.forName("org.apache.hadoop.hive.jdbc.HiveDriver"); 53 | // 3.2、获取连接 54 | Connection con = DriverManager.getConnection("jdbc:hive://192.168.1.101:10000/mxlee", "", ""); 55 | Statement stmt = con.createStatement(); 56 | // 1、找出数据库中的表 57 | ArrayList list = queryDBTable(); 58 | // 2、查看数据库表的结构,返回建表语句 59 | for (String table : list) { 60 | String sql = generateCreateTable(table); 61 | // 3、在hive中创建对应的表 62 | stmt.execute(sql); 63 | } 64 | 65 | // 4.从MySQL查出数据写入到Hive中 66 | for (String tableName : list) { 67 | List> queryAll = queryData(tableName); 68 | // 调用jdbc写入到Hive中 69 | // 4.1、将数据写到本地,4.1也可省略,直接用4.2打到HDFS中 70 | String path = "D:/1.txt"; 71 | FileWriter fileWriter = new FileWriter(new File(path), true); 72 | URI uri = new URI("hdfs://192.168.1.101:9000"); 73 | FileSystem fileSystem = FileSystem.get(uri, new Configuration()); 74 | // 4.2 、 将本地文件传至HDFS中 75 | FSDataOutputStream outputStream = fileSystem.create(new Path("/home/group4/mxlee/1205hive/logs/1.txt")); 76 | 77 | for (List list2 : queryAll) { 78 | String[] split = list2.toString().split(","); 79 | String id = split[0].substring(1); 80 | String province = split[1].substring(0, split[1].length() - 1); 81 | fileWriter.write(id + "\t" + province + "\r\n"); 82 | } 83 | 84 | fileWriter.close(); 85 | FileInputStream in = new FileInputStream(new File(path)); 86 | IOUtils.copyBytes(in, outputStream, new Configuration(), true); 87 | System.out.println("上传成功"); 88 | } 89 | // 4.3、将 HDFS中的文件导入HIVE中、 90 | String sql = "LOAD DATA INPATH '/home/group4/mxlee/1205hive/logs/1.txt' OVERWRITE INTO TABLE mxlee.m2h"; 91 | stmt.executeQuery(sql); 92 | con.close(); 93 | } 94 | 95 | /** 96 | * 根据表名查找信息 97 | * 98 | * @param tableName 99 | * 
@return 100 | */ 101 | private static List> queryData(String tableName) { 102 | String sql = "select * from " + tableName; 103 | return JDBCUtil.queryAll(sql, null); 104 | } 105 | 106 | /** 107 | * 生成创建表的sql语句 108 | * 109 | * @param table 110 | * @return 111 | * @throws SQLException 112 | */ 113 | private static String generateCreateTable(String table) throws SQLException { 114 | ArrayList arrayList = queryTableInfo(table); 115 | StringBuilder stringBuilder = new StringBuilder("create table mxlee." + table + "("); 116 | for (String tableName : arrayList) { 117 | stringBuilder.append(StringUtils.replaceChars(tableName, "\t", " ")).append(","); 118 | } 119 | String substring = stringBuilder.substring(0, stringBuilder.length() - 1); 120 | substring += ") row format delimited fields terminated by '\\t'"; 121 | return substring; 122 | } 123 | 124 | /** 125 | * 查看数据库表的结构 126 | * 127 | * @param table 128 | * @return 129 | * @throws SQLException 130 | */ 131 | private static ArrayList queryTableInfo(String table) throws SQLException { 132 | Connection connection = DriverManager.getConnection(url, user, password); 133 | String sql = "select * from " + table;// 根据传进来的表名查询 134 | 135 | Statement statement = connection.createStatement(); 136 | ResultSet resultSet = statement.executeQuery(sql); 137 | 138 | ResultSetMetaData metaData = resultSet.getMetaData();// 获得查询结果的元数据(此处也指表的元数据) 139 | int columnCount = metaData.getColumnCount();// 表中的列数 140 | 141 | ArrayList arrayList = new ArrayList();// 存储列名 142 | for (int i = 1; i <= columnCount; i++) { 143 | // 类型 144 | String typeName = "String"; 145 | // 列名 146 | String name = metaData.getColumnName(i); 147 | arrayList.add(name + " " + typeName); 148 | } 149 | // 关闭连接 150 | connection.close(); 151 | 152 | return arrayList; 153 | 154 | } 155 | 156 | /** 157 | * 遍历数据库,获得数据库中的所有表名 158 | * 159 | * @return ArrayList 160 | * @throws SQLException 161 | */ 162 | private static ArrayList queryDBTable() throws SQLException { 163 | Connection 
connection = DriverManager.getConnection(url, user, password); 164 | DatabaseMetaData metaData = connection.getMetaData(); 165 | ResultSet tables = metaData.getTables(null, null, null, new String[] { "TABLE" }); 166 | int columnCount = tables.getMetaData().getColumnCount(); 167 | ArrayList arrayList = new ArrayList(); 168 | while (tables.next()) { 169 | arrayList.add(tables.getObject(3).toString()); 170 | } 171 | connection.close(); 172 | return arrayList; 173 | } 174 | 175 | } 176 | -------------------------------------------------------------------------------- /Project3_mysql迁移至Hive(JDBC)/utils/JDBCUtil.java: -------------------------------------------------------------------------------- 1 | package bdr.utils; 2 | 3 | import java.sql.Connection; 4 | import java.sql.DriverManager; 5 | import java.sql.PreparedStatement; 6 | import java.sql.ResultSet; 7 | import java.sql.SQLException; 8 | import java.util.ArrayList; 9 | import java.util.Arrays; 10 | import java.util.List; 11 | 12 | public class JDBCUtil { 13 | 14 | private static final String url = "jdbc:mysql://192.168.1.100:3306/mx"; 15 | private static final String user = "root"; 16 | private static final String password = "admin"; 17 | 18 | static{ 19 | try { 20 | DriverManager.registerDriver(new com.mysql.jdbc.Driver()); 21 | } catch (SQLException e) { 22 | e.printStackTrace(); 23 | } 24 | } 25 | 26 | public static void main(String[] args) throws SQLException { 27 | for(int i=0; i<200001; i++){ 28 | String sql = "insert into test1(name) values(?)"; 29 | Connection connection = null; 30 | try { 31 | connection = DriverManager.getConnection(url, user, password); 32 | PreparedStatement ps = connection.prepareStatement(sql); 33 | ps.setObject(1, "zhangsan"); 34 | ps.addBatch(); 35 | ps.execute(); 36 | 37 | if(i%10000==0){ 38 | ps.executeUpdate(); 39 | connection.commit(); 40 | ps.clearBatch(); 41 | } 42 | } catch (SQLException e) { 43 | e.printStackTrace(); 44 | } finally{ 45 | if(connection!=null){ 46 | try { 47 
| connection.close(); 48 | } catch (SQLException e) { 49 | e.printStackTrace(); 50 | } 51 | } 52 | } 53 | } 54 | 55 | } 56 | 57 | public static void update(String sql, Object... params){ 58 | if(params!=null && params.length>0){ 59 | update(sql, Arrays.asList(params)); 60 | } 61 | } 62 | /** 63 | * 执行insert、update、delete语句 64 | * @param sql 65 | * @param params 66 | */ 67 | public static void update(String sql, List params){ 68 | Connection connection = null; 69 | try { 70 | connection = DriverManager.getConnection(url, user, password); 71 | PreparedStatement ps = connection.prepareStatement(sql); 72 | if(params!=null && params.size()>0){ 73 | for(int i=0; i queryRow(String sql, List params){ 98 | List> result = queryAll(sql, params); 99 | return result.get(0); 100 | } 101 | /** 102 | * 计数 103 | * @param sql 104 | * @param params 105 | * @return 106 | */ 107 | public static long count(String sql, List params){ 108 | List> result = queryAll(sql, params); 109 | return Long.parseLong(result.get(0).get(0).toString()); 110 | } 111 | 112 | /** 113 | * 查询 114 | * @param sql 115 | * @param params 116 | * @return 117 | */ 118 | public static List> queryAll(String sql, List params){ 119 | List> result = new ArrayList>(); 120 | Connection connection = null; 121 | try { 122 | connection = DriverManager.getConnection(url, user, password); 123 | PreparedStatement ps = connection.prepareStatement(sql); 124 | if(params!=null && params.size()>0){ 125 | for(int i=0; i line = new ArrayList(); 133 | for(int i=0; i> query2(String sql, List params){ 154 | List> result = new ArrayList>(); 155 | Connection connection = null; 156 | try { 157 | connection = DriverManager.getConnection(url, user, password); 158 | PreparedStatement ps = connection.prepareStatement(sql); 159 | if(params!=null && params.size()>0){ 160 | for(int i=0; i line = new ArrayList(); 168 | for(int i=0; i goodsPriceList = null;// 用于存放商品码与单价信息 32 | static Random random = new Random(); 33 | static Jedis jedis = null; 34 | 
static Producer producer = null; 35 | static KeyedMessage message = null; 36 | 37 | public static void main(String[] args) throws IOException { 38 | readDataFromFile();// 从订单文件,将商品信息读入ArrayList 39 | 40 | // 生成订单 41 | while (true) { 42 | gendata(); 43 | try { 44 | Thread.sleep(1000); 45 | } catch (InterruptedException e) { 46 | // e.printStackTrace(); 47 | } 48 | } 49 | } 50 | 51 | /** 52 | * 产生数据的格式是:时间戳\t单号\t商品条码\t单价\t数量 53 | */ 54 | private static void gendata() { 55 | // 1.时间戳 56 | long timestamp = System.currentTimeMillis(); 57 | 58 | // 2.获取订单,把单号保存到redis中,使用jedis类操作 59 | jedis = JedisUtil.getJedis(); 60 | String orderNum = jedis.get("orderNum"); 61 | if (orderNum == null) { 62 | jedis.set("orderNum", "1"); 63 | orderNum = jedis.get("orderNum"); 64 | } else { 65 | jedis.incrBy("orderNum", 1); 66 | } 67 | String order = StringUtils.leftPad(orderNum, 9, "0");// 假定订单号为9位 68 | JedisUtil.returnBrokenJedis(jedis); 69 | jedis = null; 70 | 71 | // 3. 从goodsPriceList中取条码和单价 72 | int randomIndex = random.nextInt(goodsPriceList.size()); 73 | String code_price = goodsPriceList.get(randomIndex); 74 | 75 | // 4.获得随机数量 76 | int amount = random.nextInt(100); 77 | 78 | // 5.拼接单据格式 时间戳\t单号\t商品条码\t单价\t数量 79 | String tradeInfo = timestamp + "\t" + order + "\t" + code_price + "\t" + amount; 80 | 81 | // 6.写入到slf4j和kafka中 82 | logger.info(tradeInfo); 83 | 84 | try { 85 | producer = getProducer(); 86 | } catch (IOException e) { 87 | // e.printStackTrace(); 88 | } 89 | message = new KeyedMessage("trademx", tradeInfo); 90 | producer.send(message); 91 | producer.close(); 92 | } 93 | 94 | /** 95 | * 从订单文件,将商品信息读入ArrayList 96 | * 97 | * @throws IOException 98 | */ 99 | private static void readDataFromFile() throws IOException { 100 | // 读取订单仓库 101 | InputStream resourceAsStream = SheetGeneratorServer.class.getResourceAsStream("price900"); 102 | List readLines = IOUtils.readLines(resourceAsStream); 103 | goodsPriceList = new ArrayList(); 104 | 105 | for (int i = 1; i < readLines.size(); 
i++) { 106 | String[] splited = StringUtils.split(readLines.get(i)); 107 | String code = splited[1];// 条形码 108 | String priceString = splited[5];// 单价 109 | 110 | // 存入list 条形码 单价 111 | try { 112 | double price = Double.parseDouble(priceString); 113 | goodsPriceList.add(code + "\t" + priceString); 114 | } catch (Exception e) { 115 | // e.printStackTrace(); 116 | } 117 | } 118 | } 119 | 120 | /** 121 | * 获取Kafka生产者 122 | * 123 | * @return 124 | * @throws IOException 125 | */ 126 | private static Producer getProducer() throws IOException { 127 | Properties originalProps = new Properties(); 128 | originalProps.load(SheetGeneratorServer.class.getResourceAsStream("producer.properties")); 129 | originalProps.put("serializer.class", "kafka.serializer.StringEncoder"); 130 | Producer producer = new Producer(new ProducerConfig(originalProps)); 131 | return producer; 132 | } 133 | } 134 | -------------------------------------------------------------------------------- /Project5_订单交易额实时统计、离线审计/TSA模拟生成订单,1秒1条/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 23 | 24 | 25 | UTF-8 26 | 27 | 28 | %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{50} - %msg%n 29 | 30 | true 31 | false 32 | 33 | 34 | 35 | 36 | UTF-8 37 | 38 | ${LOG_HOME}/${appName}.log 39 | 43 | 44 | 48 | ${LOG_HOME}/${appName}-%d{yyyy-MM-dd}-%i.log 49 | 54 | 365 55 | 58 | 59 | 128MB 60 | 61 | 62 | 65 | 66 | %d{yyyy-MM-dd HH:mm:ss.SSS} [ %thread ] - [ %-5level ] [ %logger{50} : %line ] - %msg%n 67 | 68 | 69 | 70 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 88 | 89 | 90 | 91 | 92 | 93 | -------------------------------------------------------------------------------- /Project5_订单交易额实时统计、离线审计/TSA模拟生成订单,1秒1条/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | mx-group 5 | TSA 6 | 0.0.1-SNAPSHOT 7 | 8 | 9 | UTF-8 10 | 1.7 11 | 12 | 13 | 14 | sgs-mx 15 | 16 | 17 | 
org.apache.maven.plugins 18 | maven-compiler-plugin 19 | 2.3.2 20 | 21 | ${jdk.version} 22 | ${jdk.version} 23 | ${project.build.sourceEncoding} 24 | 25 | 26 | ${java.home}/lib/rt.jar;${java.home}/lib/jce.jar 27 | 28 | 29 | 30 | 31 | org.apache.maven.plugins 32 | maven-shade-plugin 33 | 2.4.3 34 | 35 | 36 | package 37 | 38 | shade 39 | 40 | 41 | 42 | jar-with-dependencies 43 | 44 | 45 | 47 | tsa.sheetgenerate.SheetGeneratorServer 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | org.slf4j 61 | slf4j-api 62 | 1.7.10 63 | 64 | 65 | ch.qos.logback 66 | logback-classic 67 | 1.1.2 68 | 69 | 70 | ch.qos.logback 71 | logback-core 72 | 1.1.2 73 | 74 | 75 | 76 | 77 | org.apache.kafka 78 | kafka_2.11 79 | 0.8.2.2 80 | 81 | 82 | org.apache.zookeeper 83 | zookeeper 84 | 85 | 86 | log4j 87 | log4j 88 | 89 | 90 | 91 | 92 | org.scala-lang 93 | scala-library 94 | 2.11.8 95 | 96 | 97 | org.scala-lang 98 | scala-reflect 99 | 2.11.8 100 | 101 | 102 | 103 | 104 | org.apache.zookeeper 105 | zookeeper 106 | 3.4.6 107 | 108 | 109 | org.slf4j 110 | slf4j-log4j12 111 | 112 | 113 | 114 | 115 | org.apache.curator 116 | curator-framework 117 | 3.0.0 118 | 119 | 120 | org.apache.curator 121 | curator-recipes 122 | 3.2.1 123 | 124 | 125 | joda-time 126 | joda-time 127 | 2.3 128 | 129 | 130 | com.google.guava 131 | guava 132 | 17.0 133 | 134 | 135 | commons-io 136 | commons-io 137 | 2.4 138 | 139 | 140 | commons-lang 141 | commons-lang 142 | 2.6 143 | 144 | 145 | com.alibaba 146 | fastjson 147 | 1.2.20 148 | 149 | 150 | org.springframework.data 151 | spring-data-redis 152 | 1.7.4.RELEASE 153 | 154 | 155 | redis.clients 156 | jedis 157 | 2.7.3 158 | 159 | 160 | net.spy 161 | spymemcached 162 | 2.12.1 163 | 164 | 165 | 166 | 167 | -------------------------------------------------------------------------------- /Project5_订单交易额实时统计、离线审计/TSA模拟生成订单,1秒1条/producer.properties: -------------------------------------------------------------------------------- 1 | ############################# 
Producer Basics ############################# 2 | 3 | # list of brokers used for bootstrapping knowledge about the rest of the cluster 4 | # format: host1:port1,host2:port2 ... 5 | # metadata.broker.list=192.168.1.101:9092,192.168.1.102:9092,192.168.1.103:9092 6 | metadata.broker.list=192.168.230.128:9092,192.168.230.129:9092,192.168.230.131:9092 7 | 8 | # name of the partitioner class for partitioning events; default partition spreads data randomly 9 | #partitioner.class= 10 | 11 | # specifies whether the messages are sent asynchronously (async) or synchronously (sync) 12 | producer.type=sync 13 | 14 | # specify the compression codec for all data generated: none, gzip, snappy, lz4. 15 | # the old config values work as well: 0, 1, 2, 3 for none, gzip, snappy, lz4, respectively 16 | compression.codec=none 17 | 18 | # message encoder 19 | serializer.class=kafka.serializer.DefaultEncoder 20 | 21 | # allow topic level compression 22 | #compressed.topics= 23 | 24 | ############################# Async Producer ############################# 25 | # maximum time, in milliseconds, for buffering data on the producer queue 26 | #queue.buffering.max.ms= 27 | 28 | # the maximum size of the blocking queue for buffering on the producer 29 | #queue.buffering.max.messages= 30 | 31 | # Timeout for event enqueue: 32 | # 0: events will be enqueued immediately or dropped if the queue is full 33 | # -ve: enqueue will block indefinitely if the queue is full 34 | # +ve: enqueue will block up to this many milliseconds if the queue is full 35 | #queue.enqueue.timeout.ms= 36 | 37 | # the number of messages batched at the producer 38 | #batch.num.messages= 39 | -------------------------------------------------------------------------------- /Project5_订单交易额实时统计、离线审计/utils/ENotationUtil.java: -------------------------------------------------------------------------------- 1 | package tsa.utils; 2 | 3 | import java.math.BigDecimal; 4 | import java.text.DecimalFormat; 5 | 6 | /** 7 | * 
/**
 * Utility for exact decimal arithmetic on doubles (avoids the usual binary
 * floating-point rounding errors, e.g. 0.1 + 0.2 != 0.3) and for formatting
 * very large amounts.
 *
 * <p>Fix: every method previously used {@code new BigDecimal(double)}, which
 * embeds the double's binary representation error into the BigDecimal — the
 * exact error this class exists to avoid (it made round(2.675, 2) return
 * 2.67 instead of 2.68). {@link BigDecimal#valueOf(double)} goes through the
 * double's canonical decimal string and is the documented way to get the
 * "expected" value.
 *
 * @className ENotationUtil
 * @author mxlee
 * @email imxlee@foxmail.com
 * @date 2016年11月26日
 */
public class ENotationUtil {

	// Utility class: not instantiable.
	private ENotationUtil() {
	}

	/**
	 * Exact addition of two doubles.
	 *
	 * @param d1 first operand
	 * @param d2 second operand
	 * @return d1 + d2 without binary rounding error (e.g. add(0.1, 0.2) == 0.3)
	 */
	public static double add(double d1, double d2) {
		return BigDecimal.valueOf(d1).add(BigDecimal.valueOf(d2)).doubleValue();
	}

	/**
	 * Adds two BigDecimals.
	 *
	 * @return b1 + b2
	 */
	public static BigDecimal addBigDec(BigDecimal b1, BigDecimal b2) {
		return b1.add(b2);
	}

	/**
	 * Exact subtraction of two doubles.
	 *
	 * @param d1 minuend
	 * @param d2 subtrahend
	 * @return d1 - d2 without binary rounding error
	 */
	public static double sub(double d1, double d2) {
		return BigDecimal.valueOf(d1).subtract(BigDecimal.valueOf(d2)).doubleValue();
	}

	/**
	 * Exact multiplication of two doubles.
	 *
	 * @param d1 first factor
	 * @param d2 second factor
	 * @return d1 * d2 without binary rounding error
	 */
	public static double mul(double d1, double d2) {
		return BigDecimal.valueOf(d1).multiply(BigDecimal.valueOf(d2)).doubleValue();
	}

	/**
	 * Division rounded half-up to a fixed number of decimal places.
	 *
	 * @param d1  dividend
	 * @param d2  divisor (must be non-zero)
	 * @param len number of decimal places kept
	 * @return d1 / d2 rounded half-up to len places
	 */
	public static double div(double d1, double d2, int len) {
		return BigDecimal.valueOf(d1).divide(BigDecimal.valueOf(d2), len, RoundingMode.HALF_UP).doubleValue();
	}

	/**
	 * Converts an amount into units of 1e8 (亿), rounded half-up.
	 *
	 * @param d1  amount
	 * @param len number of decimal places kept
	 * @return d1 / 100000000 rounded half-up to len places
	 */
	public static double transf(double d1, int len) {
		BigDecimal divisor = new BigDecimal("100000000");
		return BigDecimal.valueOf(d1).divide(divisor, len, RoundingMode.HALF_UP).doubleValue();
	}

	/**
	 * Rounds a double half-up to a fixed number of decimal places.
	 *
	 * @param d   value to round
	 * @param len number of decimal places kept
	 * @return d rounded half-up to len places (round(2.675, 2) == 2.68)
	 */
	public static double round(double d, int len) {
		// setScale expresses the intent directly — no need for the old
		// "divide by BigDecimal(1)" trick.
		return BigDecimal.valueOf(d).setScale(len, RoundingMode.HALF_UP).doubleValue();
	}

	/**
	 * Formats a very large amount as an approximate value in units of 亿
	 * (the pattern divides nothing — it only appends the 亿 suffix, so pass
	 * the value pre-scaled, e.g. via {@link #transf(double, int)}).
	 *
	 * @param d value to format
	 * @return d formatted with the "0.000亿" pattern
	 */
	public static String subBigDecimal(double d) {
		DecimalFormat df = new DecimalFormat();
		df.applyPattern("0.000亿");
		return df.format(d);
	}

}
resultSet.getString("amount"); 30 | // 单价*数量 31 | BigDecimal value = new BigDecimal( 32 | Double.parseDouble(price) * Double.parseDouble(amount)); 33 | balance = balance.add(value);// 60s之间的数值之和(第1分钟可能不到60s) 34 | } 35 | con.close(); 36 | return balance; 37 | } 38 | 39 | } 40 | -------------------------------------------------------------------------------- /Project5_订单交易额实时统计、离线审计/utils/JDBCUtil.java: -------------------------------------------------------------------------------- 1 | package tsa.utils; 2 | 3 | import java.sql.Connection; 4 | import java.sql.DriverManager; 5 | import java.sql.PreparedStatement; 6 | import java.sql.ResultSet; 7 | import java.sql.SQLException; 8 | import java.sql.Statement; 9 | import java.util.ArrayList; 10 | import java.util.Arrays; 11 | import java.util.List; 12 | 13 | public class JDBCUtil { 14 | 15 | private static final String url = "jdbc:mysql://192.168.230.129:3306/tsa?useUnicode=true&characterEncoding=utf-8"; 16 | private static final String user = "mxlee"; 17 | private static final String password = "mxlee"; 18 | 19 | static { 20 | try { 21 | DriverManager.registerDriver(new com.mysql.jdbc.Driver()); 22 | } catch (SQLException e) { 23 | e.printStackTrace(); 24 | } 25 | } 26 | 27 | /** 28 | * 执行insert、update、delete语句 29 | * 30 | * @param sql 31 | * @param params 32 | */ 33 | public static void update(String sql, List params) { 34 | Connection connection = null; 35 | try { 36 | connection = DriverManager.getConnection(url, user, password); 37 | PreparedStatement ps = connection.prepareStatement(sql); 38 | if (params != null && params.size() > 0) { 39 | for (int i = 0; i < params.size(); i++) { 40 | ps.setObject(i + 1, params.get(i)); 41 | } 42 | } 43 | ps.execute(); 44 | } catch (SQLException e) { 45 | e.printStackTrace(); 46 | } finally { 47 | if (connection != null) { 48 | try { 49 | connection.close(); 50 | } catch (SQLException e) { 51 | e.printStackTrace(); 52 | } 53 | } 54 | } 55 | } 56 | 57 | /** 58 | * 查询一行 59 | * 60 
| * @param sql 61 | * @param params 62 | * @return 63 | */ 64 | public static List queryRow(String sql, List params) { 65 | List> result = queryAll(sql, params); 66 | return result.get(0); 67 | } 68 | 69 | /** 70 | * 查询商品名 71 | * 72 | * @param sql 73 | * @param params 74 | * @return 75 | */ 76 | public static String queryOne(String sql) { 77 | 78 | Connection connection = null; 79 | String goodsName = null; 80 | try { 81 | 82 | connection = DriverManager.getConnection(url, user, password); 83 | Statement statement = connection.createStatement(); 84 | 85 | ResultSet result = statement.executeQuery(sql); 86 | 87 | while (result.next()) { 88 | goodsName = result.getString("名称"); 89 | } 90 | 91 | } catch (Exception e) { 92 | 93 | }finally{ 94 | try { 95 | connection.close(); 96 | } catch (SQLException e) { 97 | e.printStackTrace(); 98 | } 99 | } 100 | 101 | return goodsName; 102 | } 103 | 104 | /** 105 | * 计数 106 | * 107 | * @param sql 108 | * @param params 109 | * @return 110 | */ 111 | public static long count(String sql, List params) { 112 | List> result = queryAll(sql, params); 113 | return Long.parseLong(result.get(0).get(0).toString()); 114 | } 115 | 116 | /** 117 | * 查询 118 | * 119 | * @param sql 120 | * @param params 121 | * @return 122 | */ 123 | public static List> queryAll3(String sql, Object... 
params) { 124 | return queryAll(sql, Arrays.asList(params)); 125 | } 126 | 127 | /** 128 | * 查询 129 | * 130 | * @param sql 131 | * 使用占位符的语句 132 | * @param params 133 | * @return 134 | */ 135 | public static List> queryAll(String sql, List params) { 136 | List> result = new ArrayList>(); 137 | Connection connection = null; 138 | try { 139 | connection = DriverManager.getConnection(url, user, password); 140 | PreparedStatement ps = connection.prepareStatement(sql); 141 | if (params != null && params.size() > 0) { 142 | for (int i = 0; i < params.size(); i++) { 143 | ps.setObject(i + 1, params.get(i)); 144 | } 145 | } 146 | ResultSet resultset = ps.executeQuery(); 147 | int columnCount = resultset.getMetaData().getColumnCount(); 148 | while (resultset.next()) { 149 | List line = new ArrayList(); 150 | for (int i = 0; i < columnCount; i++) { 151 | Object value = resultset.getObject(i + 1); 152 | line.add(i, value); 153 | } 154 | result.add(line); 155 | } 156 | } catch (SQLException e) { 157 | e.printStackTrace(); 158 | } finally { 159 | if (connection != null) { 160 | try { 161 | connection.close(); 162 | } catch (SQLException e) { 163 | e.printStackTrace(); 164 | } 165 | } 166 | } 167 | return result; 168 | } 169 | 170 | public static List> query2(String sql, List params) { 171 | List> result = new ArrayList>(); 172 | Connection connection = null; 173 | try { 174 | connection = DriverManager.getConnection(url, user, password); 175 | PreparedStatement ps = connection.prepareStatement(sql); 176 | if (params != null && params.size() > 0) { 177 | for (int i = 0; i < params.size(); i++) { 178 | ps.setObject(i + 1, params.get(i)); 179 | } 180 | } 181 | ResultSet resultset = ps.executeQuery(); 182 | int columnCount = resultset.getMetaData().getColumnCount(); 183 | while (resultset.next()) { 184 | List line = new ArrayList(); 185 | for (int i = 0; i < columnCount; i++) { 186 | Object value = resultset.getObject(i + 1); 187 | if (value != null) { 188 | // 
一定要按照位置插入记录。因为有可能前面的列为null 189 | line.add(i, value.toString()); 190 | } 191 | } 192 | result.add(line); 193 | } 194 | } catch (SQLException e) { 195 | e.printStackTrace(); 196 | } finally { 197 | if (connection != null) { 198 | try { 199 | connection.close(); 200 | } catch (SQLException e) { 201 | e.printStackTrace(); 202 | } 203 | } 204 | } 205 | return result; 206 | } 207 | } 208 | -------------------------------------------------------------------------------- /Project5_订单交易额实时统计、离线审计/utils/StringUtil.java: -------------------------------------------------------------------------------- 1 | package tsa.utils; 2 | 3 | import java.util.Iterator; 4 | import java.util.Map; 5 | import java.util.Set; 6 | import java.util.regex.Matcher; 7 | import java.util.regex.Pattern; 8 | 9 | import redis.clients.jedis.Jedis; 10 | 11 | /** 12 | * StringUtil工具类用于处理String字符串 13 | * 14 | * @className StringUtil 15 | * @author mxlee 16 | * @email imxlee@foxmail.com 17 | * @date 2016年11月25日 18 | */ 19 | public class StringUtil { 20 | 21 | static String regEx = "[\u4e00-\u9fa5]"; 22 | static Pattern pat = Pattern.compile(regEx); 23 | 24 | /** 25 | * 取出单价和商品数量相乘,得到总价返回 26 | * 27 | * @param msg 28 | * @return 29 | */ 30 | public static double multiBalance(String msg) { 31 | 32 | try { 33 | // 以:切割字符串 时间戳\t单号\t商品条码\t单价\t数量 34 | // 1481111901694 000000149 6921168509256 1 85 35 | String[] split = msg.split("\t"); 36 | // 取出单价和商品数量相乘,得到总价返回 37 | return Double.parseDouble(split[3].toString()) * Double.parseDouble(split[4].toString()); 38 | } catch (Exception e) { 39 | return 0; 40 | } 41 | 42 | } 43 | 44 | /** 45 | * 判断是否包含汉字 46 | * 47 | * @param msg 48 | * @return 49 | */ 50 | public static boolean hasChinese(String msg) { 51 | Matcher matcher = pat.matcher(msg); 52 | boolean flg = false; 53 | if (matcher.find()) { 54 | flg = true; 55 | } 56 | return flg; 57 | } 58 | 59 | /** 60 | * 截取字符串,判断是要查几分钟的 61 | * 62 | * @param str 63 | * @return 64 | */ 65 | public static String isMinite(String str) { 
/**
 * Time helpers for the trade-statistics jobs: conversion between
 * "yyyy-MM-dd HH:mm:ss" strings and epoch-millisecond timestamps.
 *
 * <p>SimpleDateFormat is not thread-safe, so each call builds its own
 * instance. All conversions use the JVM's default time zone, matching the
 * original behavior.
 */
public class TimeUtil {

	/** The single date-time layout used throughout this project. */
	private static final String PATTERN = "yyyy-MM-dd HH:mm:ss";

	/**
	 * Returns the current time as epoch milliseconds, truncated to whole
	 * seconds (the project's pattern carries no millisecond field).
	 *
	 * <p>Fix: the previous implementation round-tripped through the
	 * deprecated, locale-dependent Date.toLocaleString(), which made this
	 * method return -1 on any locale whose default date format does not
	 * happen to match "yyyy-MM-dd HH:mm:ss".
	 *
	 * @return current epoch millis rounded down to the second
	 */
	public static long currentTime() {
		return (System.currentTimeMillis() / 1000L) * 1000L;
	}

	/**
	 * Parses a "yyyy-MM-dd HH:mm:ss" string into epoch milliseconds.
	 *
	 * @param str date-time string in the pattern above
	 * @return epoch millis, or -1 when the input is null or unparseable
	 */
	public static long transDate(String str) {
		if (str == null) {
			return -1; // parse(null) would throw NPE instead of signalling failure
		}
		SimpleDateFormat simpleDateFormat = new SimpleDateFormat(PATTERN);
		try {
			return simpleDateFormat.parse(str).getTime();
		} catch (ParseException e) {
			return -1;
		}
	}

	/**
	 * Formats an epoch-millisecond timestamp (given as a decimal string)
	 * back into "yyyy-MM-dd HH:mm:ss" — the inverse of {@link #transDate}.
	 *
	 * <p>Fix: replaces the deprecated, locale-dependent
	 * Timestamp.toLocaleString() so the output always round-trips through
	 * transDate regardless of the JVM's locale.
	 *
	 * @param time epoch milliseconds in decimal string form
	 * @return the formatted date-time string
	 */
	public static String stamp2Date(String time) {
		return new SimpleDateFormat(PATTERN).format(new Date(Long.parseLong(time)));
	}

}
storm-kafka 83 | 1.0.2 84 | 85 | 86 | 87 | org.apache.kafka 88 | kafka_2.11 89 | 0.8.2.2 90 | 91 | 92 | org.apache.zookeeper 93 | zookeeper 94 | 95 | 96 | log4j 97 | log4j 98 | 99 | 100 | 101 | 102 | org.scala-lang 103 | scala-library 104 | 2.11.8 105 | 106 | 107 | org.scala-lang 108 | scala-reflect 109 | 2.11.8 110 | 111 | 112 | 113 | 114 | org.apache.hive 115 | hive-jdbc 116 | 0.14.0 117 | 118 | 119 | mysql 120 | mysql-connector-java 121 | 5.1.40 122 | 123 | 124 | 125 | 126 | org.apache.hadoop 127 | hadoop-yarn-common 128 | 2.6.0 129 | 130 | 131 | org.apache.hadoop 132 | hadoop-yarn-client 133 | 2.6.0 134 | 135 | 136 | org.apache.hadoop 137 | hadoop-yarn-server-common 138 | 2.6.0 139 | 140 | 141 | org.apache.hadoop 142 | hadoop-yarn-server-resourcemanager 143 | 2.6.0 144 | 145 | 146 | org.apache.hadoop 147 | hadoop-yarn-server-nodemanager 148 | 2.6.0 149 | 150 | 151 | org.apache.hadoop 152 | hadoop-yarn-server-applicationhistoryservice 153 | 2.6.0 154 | 155 | 156 | org.apache.hadoop 157 | hadoop-mapreduce-client-core 158 | 2.6.0 159 | 160 | 161 | org.apache.hadoop 162 | hadoop-mapreduce-client-hs 163 | 2.6.0 164 | 165 | 166 | org.apache.hadoop 167 | hadoop-mapreduce-client-shuffle 168 | 2.6.0 169 | 170 | 171 | org.apache.hadoop 172 | hadoop-mapreduce-examples 173 | 2.6.0 174 | 175 | 176 | 177 | 178 | org.apache.hadoop 179 | hadoop-common 180 | 2.6.0 181 | 182 | 183 | org.apache.hadoop 184 | hadoop-client 185 | 2.6.0 186 | 187 | 188 | org.apache.hadoop 189 | hadoop-hdfs 190 | 2.6.0 191 | 192 | 193 | 194 | 195 | log4j 196 | log4j 197 | 1.2.17 198 | 199 | 200 | org.apache.zookeeper 201 | zookeeper 202 | 3.4.6 203 | 204 | 205 | org.slf4j 206 | slf4j-log4j12 207 | 208 | 209 | 210 | 211 | org.apache.curator 212 | curator-framework 213 | 2.7.1 214 | 215 | 216 | joda-time 217 | joda-time 218 | 2.3 219 | 220 | 221 | com.google.guava 222 | guava 223 | 17.0 224 | 225 | 226 | commons-io 227 | commons-io 228 | 2.4 229 | 230 | 231 | commons-lang 232 | commons-lang 233 | 2.6 
234 | 235 | 236 | commons-dbutils 237 | commons-dbutils 238 | 1.6 239 | 240 | 241 | com.alibaba 242 | fastjson 243 | 1.2.20 244 | 245 | 246 | 247 | org.springframework.data 248 | spring-data-redis 249 | 1.7.4.RELEASE 250 | 251 | 252 | 253 | org.springframework 254 | spring-beans 255 | 4.3.4.RELEASE 256 | 257 | 258 | redis.clients 259 | jedis 260 | 2.7.3 261 | 262 | 263 | 264 | net.spy 265 | spymemcached 266 | 2.12.1 267 | 268 | 269 | 270 | io.netty 271 | netty-all 272 | 4.1.6.Final 273 | 274 | 275 | 276 | org.softee 277 | pojo-mbean 278 | 1.1 279 | 280 | 281 | c3p0 282 | c3p0 283 | 0.9.1.2 284 | 285 | 286 | mysql 287 | mysql-connector-java 288 | 5.1.40 289 | 290 | 291 | junit 292 | junit 293 | 4.12 294 | provided 295 | 296 | 297 | 298 | -------------------------------------------------------------------------------- /Project5_订单交易额实时统计、离线审计/实时统计/MapReduce版/tradestatistics/StormKafkaProcess.java: -------------------------------------------------------------------------------- 1 | package tsa.tradestatistics; 2 | 3 | import java.math.BigDecimal; 4 | import java.util.ArrayList; 5 | import java.util.List; 6 | import java.util.Map; 7 | 8 | import org.apache.storm.Config; 9 | import org.apache.storm.LocalCluster; 10 | import org.apache.storm.StormSubmitter; 11 | import org.apache.storm.generated.AlreadyAliveException; 12 | import org.apache.storm.generated.AuthorizationException; 13 | import org.apache.storm.generated.InvalidTopologyException; 14 | import org.apache.storm.kafka.KafkaSpout; 15 | import org.apache.storm.kafka.SpoutConfig; 16 | import org.apache.storm.kafka.ZkHosts; 17 | import org.apache.storm.task.OutputCollector; 18 | import org.apache.storm.task.TopologyContext; 19 | import org.apache.storm.topology.OutputFieldsDeclarer; 20 | import org.apache.storm.topology.TopologyBuilder; 21 | import org.apache.storm.topology.base.BaseRichBolt; 22 | import org.apache.storm.tuple.Fields; 23 | import org.apache.storm.tuple.Tuple; 24 | import 
org.apache.storm.tuple.Values;

import tsa.utils.ENotationUtil;
import tsa.utils.JDBCUtil;
import tsa.utils.TimeUtil;

/**
 * StormKafkaProcess wires a Kafka spout into a two-bolt topology:
 * AccBolt accumulates the turnover of the current minute and, whenever the
 * minute rolls over, emits the finished (minute, total) pair; ToDbBolt
 * persists each finished minute into MySQL.
 *
 * @className StormKafkaProcess
 * @author mxlee
 * @email imxlee@foxmail.com
 * @date 2016年11月25日
 */
public class StormKafkaProcess {

    public static void main(String[] args)
            throws InterruptedException, InvalidTopologyException, AuthorizationException, AlreadyAliveException {

        String topologyName = "TSAS"; // topology name
        // Zookeeper ensemble; the client picks a live member automatically
        ZkHosts zkHosts = new ZkHosts("192.168.230.128:2181,192.168.230.129:2181,192.168.230.131:2181");
        String topic = "trademx";
        String zkRoot = "/storm"; // storm's root path on Zookeeper
        String id = "tsaPro";

        // spout configuration (fixed typo: was "spontConfig")
        SpoutConfig spoutConfig = new SpoutConfig(zkHosts, topic, zkRoot, id);

        TopologyBuilder builder = new TopologyBuilder();
        builder.setSpout("kafka", new KafkaSpout(spoutConfig), 2);
        builder.setBolt("AccBolt", new AccBolt()).shuffleGrouping("kafka");
        builder.setBolt("ToDbBolt", new ToDbBolt()).shuffleGrouping("AccBolt");

        Config config = new Config();
        config.setDebug(false);

        if (args.length == 0) { // local mode, for testing
            LocalCluster localCluster = new LocalCluster();
            localCluster.submitTopology(topologyName, config, builder.createTopology());
            Thread.sleep(1000 * 3600);
            localCluster.killTopology(topologyName);
            localCluster.shutdown();
        } else { // submit to the cluster
            StormSubmitter.submitTopology(topologyName, config, builder.createTopology());
        }
    }

    /**
     * AccBolt accumulates the sales total of the minute currently being
     * received; when the incoming tuple belongs to a new minute it emits the
     * finished minute downstream and resets the accumulator.
     *
     * @className AccBolt
     * @author mxlee
     * @email imxlee@foxmail.com
     * @date 2016年11月25日
     */
    public static class AccBolt extends BaseRichBolt {

        private OutputCollector collector;
        private BigDecimal balance = new BigDecimal(0); // running total for the current minute
        private String markTime = null; // minute key currently being accumulated

        public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) {
            this.collector = collector;
        }

        public void execute(Tuple input) {
            // message payload is a tab-separated record
            byte[] binary = input.getBinary(0);
            String msg = new String(binary);

            String[] split = msg.split("\t");
            // timestamp like 1481111901765 -> "2016-12-7 19:58:21"
            String date = TimeUtil.stamp2Date(split[0]);
            // truncate the seconds -> minute key "2016-12-7 19:58"
            String subdate = date.substring(0, date.length() - 3);

            if (markTime != null && !subdate.equals(markTime)) {
                // Minute rolled over: ship the finished minute downstream.
                // Emit the total as a plain string so ToDbBolt can read it
                // with getStringByField; the original emitted the BigDecimal
                // itself, which made getStringByField("balance") fail with a
                // ClassCastException.
                collector.emit(new Values(markTime, balance.toPlainString()));
                // reset the accumulator for the new minute
                balance = new BigDecimal(0);
            }

            markTime = subdate;
            // price * quantity, computed entirely in BigDecimal; the original
            // multiplied doubles first and then wrapped the result, which
            // re-introduced binary floating point rounding error.
            BigDecimal value = new BigDecimal(split[3]).multiply(new BigDecimal(split[4]));
            balance = balance.add(value); // sum within the minute (first minute may cover < 60s)
            this.collector.ack(input);
        }

        public void declareOutputFields(OutputFieldsDeclarer declarer) {
            declarer.declare(new Fields("time", "balance"));
        }

    }

    /**
     * ToDbBolt receives each finished minute's total and inserts one row
     * into the MySQL trade table.
     *
     * @className ToDbBolt
     * @author mxlee
     * @email imxlee@foxmail.com
     * @date 2016年12月11日
     */
    public static class ToDbBolt extends BaseRichBolt {

        private OutputCollector collector;

        public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) {
            this.collector = collector;
        }

        public void execute(Tuple input) {
            // unpack (minute, total) and persist it
            String markTime = input.getStringByField("time");
            BigDecimal balance = new BigDecimal(input.getStringByField("balance"));
            String sql = "insert into trade(id,time,balance) values(null,?,?)";
            List<Object> params = new ArrayList<Object>();
            params.add(markTime);
            params.add(balance);
            JDBCUtil.update(sql, params);
            this.collector.ack(input);
        }

        public void declareOutputFields(OutputFieldsDeclarer declarer) {
            // terminal bolt: emits nothing
        }

    }

}
Double, y: Double) => x + y, Seconds(60), Seconds(1)).map(_._2).reduce(_ + _).print() 40 | wc.reduceByKeyAndWindow((x: Double, y: Double) => x + y, Seconds(60 * 5), Seconds(1)).map(_._2).reduce(_ + _).print() 41 | wc.reduceByKeyAndWindow((x: Double, y: Double) => x + y, Seconds(60 * 15), Seconds(1)).map(_._2).reduce(_ + _).print() 42 | 43 | // Top10 1小时内的销售总额前10的商品 44 | val top10DStream: DStream[(String, Double)] = wc.reduceByKeyAndWindow((x: Double, y: Double) => x + y, Seconds(60 * 60), Seconds(1)) 45 | top10DStream.foreachRDD { 46 | _.sortBy(_._2, false).take(10).foreach(println) 47 | } 48 | 49 | //5、启动流计算 50 | ssc.start() 51 | 52 | //6、等待程序结束 53 | ssc.awaitTermination() 54 | } 55 | 56 | /** 57 | * 状态更新 58 | * 59 | * @param value 60 | * @param status 61 | * @return 62 | */ 63 | def updateFunc(value: Seq[Double], status: Option[Double]) = { 64 | //获取当前状态 65 | val thisStatus: Double = value.sum 66 | //获取上一状态 67 | val lastStatus: Double = status.getOrElse(0) 68 | Some(thisStatus + lastStatus) 69 | } 70 | 71 | } 72 | -------------------------------------------------------------------------------- /Project5_订单交易额实时统计、离线审计/实时统计/spark streaming版/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | mxgroup 5 | integration 6 | 1.0-SNAPSHOT 7 | 8 | 9 | UTF-8 10 | 2.0.2 11 | 2.11 12 | 2.6.0 13 | 14 | 15 | 16 | 17 | org.scala-lang 18 | scala-library 19 | 2.11.8 20 | 21 | 22 | junit 23 | junit 24 | 4.4 25 | test 26 | 27 | 28 | 29 | org.apache.spark 30 | spark-core_${scala.version} 31 | ${spark.version} 32 | 33 | 34 | org.apache.spark 35 | spark-sql_${scala.version} 36 | ${spark.version} 37 | 38 | 39 | org.apache.spark 40 | spark-hive_${scala.version} 41 | ${spark.version} 42 | 43 | 44 | org.apache.spark 45 | spark-streaming_${scala.version} 46 | ${spark.version} 47 | 48 | 49 | org.apache.hadoop 50 | hadoop-client 51 | 2.6.0 52 | 53 | 54 | org.apache.spark 55 | spark-streaming-kafka_${scala.version} 56 | 1.6.3 57 | 58 | 
59 | org.apache.spark 60 | spark-mllib_${scala.version} 61 | ${spark.version} 62 | 63 | 64 | mysql 65 | mysql-connector-java 66 | 5.1.39 67 | 68 | 69 | org.apache.hbase 70 | hbase 71 | 1.1.5 72 | pom 73 | 74 | 75 | org.apache.hbase 76 | hbase-client 77 | 1.1.5 78 | 79 | 80 | org.apache.hbase 81 | hbase-common 82 | 1.1.5 83 | 84 | 85 | org.apache.hbase 86 | hbase-server 87 | 1.1.5 88 | 89 | 90 | 91 | 92 | 93 | 94 | org.codehaus.mojo 95 | build-helper-maven-plugin 96 | 1.8 97 | 98 | 99 | add-source 100 | generate-sources 101 | 102 | add-source 103 | 104 | 105 | 106 | src/main/java 107 | src/main/scala 108 | 109 | 110 | 111 | 112 | 113 | 114 | org.apache.maven.plugins 115 | maven-compiler-plugin 116 | 3.5.1 117 | 118 | 1.8 119 | 1.8 120 | utf8 121 | 122 | 123 | 124 | org.scala-tools 125 | maven-scala-plugin 126 | 127 | 128 | 129 | compile 130 | testCompile 131 | 132 | 133 | 134 | 135 | ${scala.version} 136 | 137 | -target:jvm-1.7 138 | 139 | 140 | 141 | 142 | maven-assembly-plugin 143 | 144 | 145 | jar-with-dependencies 146 | 147 | 148 | 149 | sparkstreaming.GenerateChar 150 | 151 | 152 | 153 | 154 | 155 | make-assembly 156 | package 157 | 158 | single 159 | 160 | 161 | 162 | 163 | 164 | 165 | -------------------------------------------------------------------------------- /Project5_订单交易额实时统计、离线审计/架构图.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/monsonlee/BigData/6004a79234c3f0cfa1efc1c197e93651303abd85/Project5_订单交易额实时统计、离线审计/架构图.png -------------------------------------------------------------------------------- /Project5_订单交易额实时统计、离线审计/离线审计/audit/TradeAudit.java: -------------------------------------------------------------------------------- 1 | package tsa.audit; 2 | 3 | import java.math.BigDecimal; 4 | import java.sql.SQLException; 5 | import java.util.ArrayList; 6 | import java.util.List; 7 | 8 | import clojure.main; 9 | import tsa.utils.HiveUtil; 10 | import tsa.utils.JDBCUtil; 11 | 12 
| /** 13 | * TradeAudit用于交易审计 14 | * 15 | * @className TradeAudit 16 | * @author mxlee 17 | * @email imxlee@foxmail.com 18 | * @date 2016年12月7日 19 | */ 20 | public class TradeAudit { 21 | 22 | public static void main(String[] args) throws SQLException { 23 | audit("2016-12-11 17:17"); 24 | } 25 | 26 | /** 27 | * 审计mysql中与hive中的交易 28 | * 29 | * @param minute 30 | * 2016-12-7 19:58 31 | * @return 32 | * @throws SQLException 33 | */ 34 | public static void audit(String minute) throws SQLException { 35 | // 1.查询MySQL 36 | String sql = "select balance from trade where time=?"; 37 | List params = new ArrayList(); 38 | params.add(minute); 39 | List listBalance = JDBCUtil.queryRow(sql, params); 40 | BigDecimal mysqlBalance = new BigDecimal(listBalance.get(0).toString()); 41 | System.out.println("实时计算:¥" + mysqlBalance); 42 | // 2.查询Hive的时候,范围是[20161207094900, 20161207095000) 43 | BigDecimal hiveBalance = HiveUtil.querySale(minute); 44 | 45 | // 3.判断两个结果是否相等 46 | System.out.println("落地数据:¥" + hiveBalance); 47 | System.out.println("审计结果是否相等:" + mysqlBalance.equals(hiveBalance)); 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /Project5_订单交易额实时统计、离线审计/问题描述.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/monsonlee/BigData/6004a79234c3f0cfa1efc1c197e93651303abd85/Project5_订单交易额实时统计、离线审计/问题描述.docx -------------------------------------------------------------------------------- /Project5_订单交易额实时统计、离线审计/问题说明.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/monsonlee/BigData/6004a79234c3f0cfa1efc1c197e93651303abd85/Project5_订单交易额实时统计、离线审计/问题说明.pdf -------------------------------------------------------------------------------- /Project6_SparkBasic5个小问题/data/1.txt: -------------------------------------------------------------------------------- 1 | zookeeper kafka storm 2 | hdfs zookeeper 
flume sqoop 3 | hdfs kafka flume storm 4 | hdfs hive zookeeper sqoop 5 | hdfs kafka flume zookeeper -------------------------------------------------------------------------------- /Project6_SparkBasic5个小问题/data/data.txt: -------------------------------------------------------------------------------- 1 | 2010-05-04 12:50,10,10,10 2 | 2010-05-05 13:50,20,20,20 3 | 2010-05-06 14:50,30,30,30 4 | 2010-05-05 13:50,20,20,20 5 | 2010-05-06 14:50,30,30,30 6 | 2010-05-04 12:50,10,10,10 7 | 2010-05-04 11:50,10,10,10 -------------------------------------------------------------------------------- /Project6_SparkBasic5个小问题/data/index.txt: -------------------------------------------------------------------------------- 1 | cx1|a,b,c,d,e,f 2 | cx2|c,d,e,f 3 | cx3|a,b,c,f 4 | cx4|a,b,c,d,e,f 5 | cx5|a,b,e,f 6 | cx6|a,b,c,d 7 | cx7|a,b,c,f 8 | cx8|d,e,f 9 | cx9|b,c,d,e,f -------------------------------------------------------------------------------- /Project6_SparkBasic5个小问题/data/product.txt: -------------------------------------------------------------------------------- 1 | 1 皮鞋 2 | 2 衣服 -------------------------------------------------------------------------------- /Project6_SparkBasic5个小问题/data/sheet.txt: -------------------------------------------------------------------------------- 1 | 00001 1 2 | 00002 2 -------------------------------------------------------------------------------- /Project6_SparkBasic5个小问题/practice/DicConn.scala: -------------------------------------------------------------------------------- 1 | package practice 2 | 3 | import org.apache.spark.{SparkConf, SparkContext} 4 | import org.apache.spark.rdd.RDD 5 | 6 | /** 7 | * 需求: 8 | * 文件sheet.txt内容如下 9 | * 00001 1 10 | * 00002 2 11 | * 文件product.txt内容如下 12 | * 1 皮鞋 13 | * 2 衣服 14 | * 希望输出结果 15 | * 00001 皮鞋 16 | * 00002 衣服 17 | * Created by Administrator on 2016/12/15. 
object DicConn {
  def main(args: Array[String]): Unit = {
    val sc: SparkContext = sparkutil.Util.sc
    // 1. load both data sets
    val sheetLines: RDD[String] = sc.textFile("D:/sheet.txt")
    val productLines: RDD[String] = sc.textFile("D:/product.txt")
    // 2. sheet.txt is "sheetId \t productId": key each sheet by its productId
    val sheetByProduct: RDD[(String, String)] = sheetLines.map { line =>
      val fields = line.split("\t")
      (fields(1), fields(0))
    }
    // 3. product.txt is "productId \t productName": key each name by productId
    val products: RDD[(String, String)] = productLines.map { line =>
      val fields = line.split("\t")
      (fields(0), fields(1))
    }
    // 4-6. join on productId, keep (sheetId, productName), sort by sheetId, print
    sheetByProduct.join(products).map(_._2).sortByKey().foreach(println(_))
  }
}
object GroupSum {
  def main(args: Array[String]): Unit = {
    val sc: SparkContext = sparkutil.Util.sc
    val rdd: RDD[String] = sc.textFile("D:/data.txt")
    // 1. split off the timestamp key; the remaining columns stay as an array
    val rdd1: RDD[(String, Array[String])] = rdd.map(x => {
      val split = x.split(",", 2) // split into key + rest only
      (split(0), split(1).split(","))
    })
    // 2. group and sum element-wise.
    // The combiner must return the same shape it consumes: the original
    // returned a single joined string inside a 1-element array, which crashed
    // (NumberFormatException / IndexOutOfBounds) as soon as a key occurred
    // three or more times and the reducer was fed its own output. An
    // element-wise zip-sum keeps the reduction associative and also works for
    // any column count, not just 3.
    rdd1.reduceByKey((x, y) => {
      x.zip(y).map { case (a, b) => (a.toInt + b.toInt).toString }
    }).map(x => {
      x._1 + "," + x._2.mkString(",")
    }).foreach(println(_))
  }
}
object TopWordCount {

  def main(args: Array[String]): Unit = {
    val sc: SparkContext = sparkutil.Util.sc
    // split every line into words, count each word, keep the 3 most frequent
    val words = sc.textFile("D:/1.txt").flatMap(_.split(" "))
    val counts = words.map(word => (word, 1)).reduceByKey(_ + _)
    val top3 = counts.sortBy(_._2, false).take(3)
    top3.foreach(println(_))
  }

}
object WordCount {
  def main(args: Array[String]): Unit = {
    // classic word count: split, map to (word, 1), sum per word, print
    val sc: SparkContext = sparkutil.Util.sc
    sc.textFile("D:/1.txt").flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _).foreach(println(_))
  }
}

// -------- sparkutil/Util.scala --------
package sparkutil

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Spark helper object: single shared SparkConf/SparkContext plus file loaders.
 * Created by Administrator on 2016/12/15.
 */
object Util {
  // Build the configuration exactly once; the system properties only need to
  // be set a single time per JVM.
  lazy val conf: SparkConf = {
    System.setProperty("hadoop.home.dir", "D:/Program Files/hadoop_winutils")
    System.setProperty("spark.master", "local")
    val c: SparkConf = new SparkConf()
    c.setAppName(Util.getClass.getSimpleName)
    c
  }

  // A single shared SparkContext. The original `def sc` constructed a brand
  // new SparkContext on EVERY access, so any caller touching Util.sc twice
  // (e.g. calling loadFile after obtaining Util.sc) failed with Spark's
  // "only one SparkContext per JVM" error. `lazy val` keeps the call-site
  // syntax identical while guaranteeing a single instance.
  lazy val sc: SparkContext = new SparkContext(conf)

  /**
   * Load a file or directory (local or HDFS) as an RDD of lines.
   *
   * @param path
   * @return
   */
  def loadFile(path: String): RDD[String] = sc.textFile(path)

  /**
   * Load files as (fileName, fileContent) pairs.
   *
   * @param path
   * @return
   */
  def loadWholeFile(path: String): RDD[(String, String)] = sc.wholeTextFiles(path)
}
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/monsonlee/BigData/6004a79234c3f0cfa1efc1c197e93651303abd85/Project6_SparkBasic5个小问题/问题说明.pdf -------------------------------------------------------------------------------- /Project7_HBase/1磁盘小量数据导入HBase/Data2HBase.java: -------------------------------------------------------------------------------- 1 | import breeze.linalg.split; 2 | import org.apache.hadoop.conf.Configuration; 3 | import org.apache.hadoop.hbase.HBaseConfiguration; 4 | import org.apache.hadoop.hbase.HColumnDescriptor; 5 | import org.apache.hadoop.hbase.HTableDescriptor; 6 | import org.apache.hadoop.hbase.TableName; 7 | import org.apache.hadoop.hbase.client.*; 8 | import org.apache.hadoop.hbase.util.Bytes; 9 | 10 | import java.io.BufferedReader; 11 | import java.io.File; 12 | import java.io.FileReader; 13 | import java.io.IOException; 14 | 15 | /** 16 | * HBase解析数据,创建表 17 | * members表有5个字段,分别是id、name、address、sDate、eDate。 18 | * 测试数据如下, 19 | * 2000|cc|上海|2013-04-11|2014-11-18 20 | * 2001|Franky|北京|2007-04-11|2010-03-30 21 | * 2002|陈慧|上海|2009-04-11|2016-04-30 22 | * 2003|Linda7|深圳|2003-04-11|2004-04-25 23 | * 2004|Liz|上海|2013-10-11|2015-06-12 24 | * 2005|bibalily|广州|2002-04-11|2004-04-25 25 | * 2006|加斐|深圳|2012-04-11|2016-05-03 26 | * 2007|蒋艳铮|上海|2005-04-11|2007-04-02 27 | * 2008|张渠|北京|2000-04-11|2004-04-25 28 | * 2009|骆嫣|上海|2006-04-11|2007-04-25 29 | * 要求: 30 | * 1.创建表members、列族f1,然后插入数据 31 | * 思路: 32 | * 1.创建HBase表 33 | * 2.从磁盘加载数据文件 34 | * 3.向HBase写入数据 35 | */ 36 | public class Data2HBase { 37 | public static void main(String[] args) throws IOException { 38 | //1、获得连接 39 | Connection connection = getConnection(); 40 | //2、创建表 41 | String tableNameString = "d2h_table"; 42 | String columnFamily = "d2h_family"; 43 | createTable(connection, tableNameString, columnFamily); 44 | //3、读取数据源 45 | BufferedReader bufferedReader = new BufferedReader(new FileReader(new 
File("D:\\Workspaces\\javaBasic\\datasource/hbasedata.txt"))); 46 | 47 | String line = null; 48 | while ((line = bufferedReader.readLine()) != null) { 49 | String[] split = line.split("\\|"); 50 | //4、打入Hbase表 51 | put2Table(connection, tableNameString, columnFamily, split); 52 | } 53 | //5、关闭连接 54 | bufferedReader.close(); 55 | connection.close(); 56 | System.out.println("解析成功并完成导入"); 57 | } 58 | 59 | /** 60 | * 获得连接 61 | * 62 | * @return 63 | * @throws IOException 64 | */ 65 | private static Connection getConnection() throws IOException { 66 | Configuration conf = HBaseConfiguration.create(); 67 | conf.set("hbase.zookeeper.quorum", "slave1,slave2,slave3"); 68 | conf.set("hbase.zookeeper.property.clientPort", "2181"); 69 | ConnectionFactory.createConnection(conf); 70 | return ConnectionFactory.createConnection(conf); 71 | } 72 | 73 | /** 74 | * 创建表 75 | * 76 | * @param admin 77 | * @param tableNameString 78 | * @param columnFamily 79 | * @throws IOException 80 | */ 81 | private static void createTable(Connection connection, String tableNameString, String columnFamily) throws IOException { 82 | Admin admin = connection.getAdmin(); 83 | TableName tableName = TableName.valueOf(tableNameString); //d2h (data to HBase) 84 | HTableDescriptor table = new HTableDescriptor(tableName); 85 | HColumnDescriptor family = new HColumnDescriptor(columnFamily); 86 | table.addFamily(family); 87 | //判断表是否已经存在 88 | if (admin.tableExists(tableName)) { 89 | admin.disableTable(tableName); 90 | admin.deleteTable(tableName); 91 | } 92 | admin.createTable(table); 93 | } 94 | 95 | /** 96 | * 按行插入Hbase表 97 | * 98 | * @param connection 99 | * @param tableNameString 100 | * @param columnFamily 101 | * @param split 102 | * @throws IOException 103 | */ 104 | private static void put2Table(Connection connection, String tableNameString, String columnFamily, String[] split) throws IOException { 105 | String rowKey = split[0]; 106 | String name = split[1]; 107 | String address = split[2]; 108 | String 
sDate = split[3]; 109 | String eDate = split[4]; 110 | Table table = connection.getTable(TableName.valueOf(tableNameString)); 111 | Put put = new Put(Bytes.toBytes(rowKey)); 112 | put.add(Bytes.toBytes(columnFamily), Bytes.toBytes("name"), Bytes.toBytes(name)); 113 | put.add(Bytes.toBytes(columnFamily), Bytes.toBytes("address"), Bytes.toBytes(address)); 114 | put.add(Bytes.toBytes(columnFamily), Bytes.toBytes("sDate"), Bytes.toBytes(sDate)); 115 | put.add(Bytes.toBytes(columnFamily), Bytes.toBytes("eDate"), Bytes.toBytes(eDate)); 116 | table.put(put); 117 | //关闭表 118 | table.close(); 119 | } 120 | } 121 | -------------------------------------------------------------------------------- /Project7_HBase/1磁盘小量数据导入HBase/hbasedata.txt: -------------------------------------------------------------------------------- 1 | 2000|cc|上海|2013-04-11|2014-11-18 2 | 2001|Franky|北京|2007-04-11|2010-03-30 3 | 2002|陈慧|上海|2009-04-11|2016-04-30 4 | 2003|Linda7|深圳|2003-04-11|2004-04-25 5 | 2004|Liz|上海|2013-10-11|2015-06-12 6 | 2005|bibalily|广州|2002-04-11|2004-04-25 7 | 2006|加斐|深圳|2012-04-11|2016-05-03 8 | 2007|蒋艳铮|上海|2005-04-11|2007-04-02 9 | 2008|张渠|北京|2000-04-11|2004-04-25 10 | 2009|骆嫣|上海|2006-04-11|2007-04-25 -------------------------------------------------------------------------------- /Project7_HBase/2磁盘大量数据导入HBase/Data2HBase1.java: -------------------------------------------------------------------------------- 1 | import org.apache.hadoop.hbase.TableName; 2 | import org.apache.hadoop.hbase.client.BufferedMutator; 3 | import org.apache.hadoop.hbase.client.Connection; 4 | import org.apache.hadoop.hbase.client.Put; 5 | import org.apache.hadoop.hbase.client.Table; 6 | 7 | import java.io.IOException; 8 | import java.util.ArrayList; 9 | 10 | import static com.sun.xml.internal.fastinfoset.alphabet.BuiltInRestrictedAlphabets.table; 11 | 12 | /** 13 | * 大量数据导入HBase, 14 | * 三种方式比较及优化 15 | * 第一种,单条导入,10W条记录耗时2-3分钟,单线程 16 | * 第二种,批量导入,10w条记录耗时3s,100w条记录耗时25s,单线程(1亿条记录利用多线程) 17 | * 
第三种,利用BufferedMutator批量导入,10w条记录耗时2s,100w条记录耗时22s,单线程(1亿条记录利用多线程) 18 | */ 19 | public class Data2HBase1 { 20 | public static void main(String[] args) throws Exception { 21 | //获得连接 22 | Connection connection = HBaseUtil.getConnection(); 23 | // singleRowImport(connection);//单条导入 24 | // batchRowImport(connection);//批量导入 25 | // bmImport(connection); //利用BufferedMutator批量导入 26 | //关闭连接 27 | connection.close(); 28 | } 29 | 30 | /** 31 | * 单条数据导入 32 | * 33 | * @param connection 34 | * @return 35 | * @throws IOException 36 | */ 37 | private static void singleRowImport(Connection connection) throws IOException { 38 | Table table = connection.getTable(TableName.valueOf("t3")); 39 | byte[] columnFamily = "f1".getBytes(); 40 | 41 | long startTime = System.currentTimeMillis(); 42 | for (int i = 0; i < 99999; i++) { 43 | table.put(HBaseUtil.createPut(i + "", columnFamily, "c1", i + "")); 44 | } 45 | table.close(); 46 | System.out.println("共耗时:" + (System.currentTimeMillis() - startTime) + "ms"); 47 | } 48 | 49 | /** 50 | * 批量导入 51 | * 52 | * @param connection 53 | * @throws IOException 54 | */ 55 | private static void batchRowImport(Connection connection) throws IOException { 56 | Table table = connection.getTable(TableName.valueOf("t3")); 57 | byte[] columnFamily = "f1".getBytes(); 58 | 59 | long startTime = System.currentTimeMillis(); 60 | ArrayList puts = new ArrayList(); 61 | for (int i = 0; i < 99999; i++) { 62 | puts.add(HBaseUtil.createPut(i + "", columnFamily, "c1", i + "")); 63 | //每10000条导入一次 64 | if (i % 10000 == 0) { 65 | table.put(puts); 66 | puts.clear(); 67 | } 68 | } 69 | table.put(puts); 70 | table.close(); 71 | System.out.println("共耗时:" + (System.currentTimeMillis() - startTime) + "ms"); 72 | } 73 | 74 | /** 75 | * 利用BufferedMutator批量导入 76 | * 77 | * @param connection 78 | * @throws IOException 79 | */ 80 | private static void bmImport(Connection connection) throws IOException { 81 | BufferedMutator bufferedMutator = 
connection.getBufferedMutator(TableName.valueOf("t3")); 82 | byte[] columnFamily = "f1".getBytes(); 83 | 84 | long startTime = System.currentTimeMillis(); 85 | ArrayList puts = new ArrayList(); 86 | for (int i = 0; i < 999999; i++) { 87 | puts.add(HBaseUtil.createPut(i + "", columnFamily, "c1", i + "")); 88 | //每10000条导入一次 89 | if (i % 10000 == 0) { 90 | bufferedMutator.mutate(puts); 91 | puts.clear(); 92 | } 93 | } 94 | //批量调用 95 | bufferedMutator.mutate(puts); 96 | bufferedMutator.close(); 97 | System.out.println("共耗时:" + (System.currentTimeMillis() - startTime) + "ms"); 98 | } 99 | } 100 | -------------------------------------------------------------------------------- /Project7_HBase/3Mysql迁移至HBase/Mysql2HBase.java: -------------------------------------------------------------------------------- 1 | import org.apache.hadoop.hbase.TableName; 2 | import org.apache.hadoop.hbase.client.*; 3 | 4 | import java.io.IOException; 5 | import java.sql.*; 6 | import java.sql.Connection; 7 | import java.util.ArrayList; 8 | import java.util.List; 9 | 10 | import static org.apache.avro.TypeEnum.c; 11 | import static org.apache.hadoop.yarn.webapp.hamlet.HamletSpec.Scope.row; 12 | 13 | /** 14 | * 将Mysql某一数据库中的所有表及视图迁移至HBase 15 | *

/*
 * Migrates MySQL tables into HBase. One-to-one (one MySQL table -> one HBase
 * table) works directly; join-heavy many-to-one cases can either map every
 * MySQL table to a column family of one HBase table, or be flattened by
 * creating a MySQL view first and migrating that view one-to-one.
 */
public class Mysql2HBase {
    public static void main(String[] args) throws Exception {
        // 1. MySQL connection; DatabaseMetaData supplies all table/view names.
        Connection connection = JdbcUtil.getConnection();
        ArrayList<String> tableNames = JdbcUtil.getTableNames(connection);

        // 2. Create the corresponding HBase tables.
        org.apache.hadoop.hbase.client.Connection hbaseConn = HBaseUtil.getConnection();
        String familyName = "mx_fn"; // column family name
        String nameSpace = "mx_ns:"; // HBase namespace prefix
        for (String tableName : tableNames) {
            HBaseUtil.createTable(hbaseConn, nameSpace + tableName, familyName);
        }

        // 3. Copy the rows, one MySQL table at a time.
        for (String tableName : tableNames) {
            // FIX: the Statement/ResultSet were never closed (one pair leaked per
            // table); try-with-resources closes them even when an exception occurs.
            try (Statement statement = connection.createStatement();
                 ResultSet resultSet = statement.executeQuery("select * from " + tableName)) {
                // Column names of this table.
                List<String> columnNames = getColumnName(resultSet);
                // All rows, stringified.
                List<ArrayList<String>> allRows = getAllRow(resultSet, columnNames);
                // One Put per non-null cell.
                List<Put> puts = createPutList(familyName, columnNames, allRows);
                insertData(hbaseConn, nameSpace + tableName, puts);
            }
        }
        // 4. Release both connections.
        connection.close();
        hbaseConn.close();
    }

    /**
     * Writes the prepared puts into the named HBase table.
     *
     * @param hbaseConn        open HBase connection
     * @param tableNameString  fully qualified target table name
     * @param puts             cells to insert
     */
    private static void insertData(org.apache.hadoop.hbase.client.Connection hbaseConn,
                                   String tableNameString, List<Put> puts) throws IOException {
        Table table = hbaseConn.getTable(TableName.valueOf(tableNameString));
        try {
            table.put(puts);
        } finally {
            table.close(); // FIX: close the table even if put() throws
        }
    }

    /**
     * Builds one Put per non-null cell of every row.
     * NOTE(review): assumes column 0 is the primary key and becomes the HBase
     * row key (the loop starts at i = 1) — confirm against the source schema.
     *
     * @param familyName  column family all cells go into
     * @param columnNames MySQL column names, index-aligned with each row
     * @param allRows     stringified rows from {@link #getAllRow}
     * @return puts ready for {@link #insertData}
     */
    private static List<Put> createPutList(String familyName, List<String> columnNames,
                                           List<ArrayList<String>> allRows) {
        ArrayList<Put> puts = new ArrayList<>();
        for (ArrayList<String> row : allRows) {
            int size = row.size();
            String rowKeyString = row.get(0);
            for (int i = 1; i < size; i++) {
                String columnName = columnNames.get(i);
                String columnValue = row.get(i);
                if (columnValue != null) { // SQL NULL cells are simply not stored
                    puts.add(HBaseUtil.createPut(rowKeyString, familyName.getBytes(),
                            columnName, columnValue));
                }
            }
        }
        return puts;
    }

    /**
     * Reads every row of the result set; each cell is stringified and SQL NULL
     * stays null.
     *
     * @throws SQLException on any JDBC read failure
     */
    private static List<ArrayList<String>> getAllRow(ResultSet resultSet, List<String> columnNames)
            throws SQLException {
        ArrayList<ArrayList<String>> allRows = new ArrayList<>();
        while (resultSet.next()) {
            ArrayList<String> row = new ArrayList<>(columnNames.size());
            for (String columnName : columnNames) {
                Object value = resultSet.getObject(columnName);
                row.add(value == null ? null : value.toString());
            }
            allRows.add(row);
        }
        return allRows;
    }

    /**
     * Returns the column names of the result set (JDBC metadata is 1-based).
     */
    private static List<String> getColumnName(ResultSet resultSet) throws SQLException {
        ResultSetMetaData metaData = resultSet.getMetaData();
        int columnCount = metaData.getColumnCount();
        List<String> columnNames = new ArrayList<>(columnCount);
        for (int i = 1; i <= columnCount; i++) {
            columnNames.add(metaData.getColumnName(i));
        }
        return columnNames;
    }
}
// --------------------------------------------------------------------------
// Project7_HBase/4Flter过滤器/FilterBasic.java
// --------------------------------------------------------------------------
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.filter.*;

import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Calendar;

import static
com.sun.xml.internal.fastinfoset.alphabet.BuiltInRestrictedAlphabets.table; 11 | 12 | /** 13 | * 问题:Filter过滤器基本操作 14 | * HBase中有张members表,内容如下: 15 | * 2000|cc|上海|2013-04-11|2014-11-18 16 | * 2001|Franky|北京|2007-04-11|2010-03-30 17 | * 2002|陈慧|上海|2009-04-11|2016-04-30 18 | * 2003|Linda7|深圳|2003-04-11|2004-04-25 19 | * 2004|Liz|上海|2013-10-11|2015-06-12 20 | * 2005|bibalily|广州|2002-04-11|2004-04-25 21 | * 2006|加斐|深圳|2012-04-11|2016-05-03 22 | * 2007|蒋艳铮|上海|2005-04-11|2007-04-02 23 | * 2008|张渠|北京|2000-04-11|2004-04-25 24 | * 2009|骆嫣|上海|2006-04-11|2007-04-25 25 | *

26 | * 提供以下查询功能: 27 | * query(id, name, area, startRegDate, endRegDate, lastDaysLogin) 28 | * 其中,编号精准匹配、姓名模糊匹配、地区精准匹配、注册时间范围匹配、最近多少天登录。查询条件都不是必填信息。 29 | */ 30 | public class FilterBasic { 31 | public static void main(String[] args) throws Exception { 32 | query("2004", null, null, null, null, null); 33 | } 34 | 35 | /** 36 | * 设置过滤条件 37 | * FilterList.Operator.MUST_PASS_ALL,必须全部满足 38 | * FilterList.Operator.MUST_PASS_ONE,满足任一 39 | * 默认全部 40 | * 41 | * @param id 42 | * @param name 43 | * @param area 44 | * @param startRegDate 45 | * @param endRegDate 46 | * @param lastDaysLogin 47 | * @throws Exception 48 | */ 49 | private static void query(String id, String name, String area, String startRegDate, String endRegDate, Integer lastDaysLogin) throws Exception { 50 | //1、获取HBase连接 51 | Connection connection = HBaseUtil.getConnection(); 52 | //2、获取表 53 | Table table = connection.getTable(TableName.valueOf("members")); 54 | //3、设置过滤器 55 | Scan scan = new Scan(); //客户端缓存 56 | FilterList filterList = new FilterList(); 57 | //判断id,编号精准匹配 58 | if (StringUtils.isNotBlank(id)) { 59 | filterList.addFilter(new RowFilter(CompareFilter.CompareOp.EQUAL, new BinaryComparator(id.getBytes()))); 60 | } 61 | //判断姓名,姓名模糊匹配 62 | if (StringUtils.isNotBlank(name)) { 63 | filterList.addFilter(new SingleColumnValueFilter("f1".getBytes(), "name".getBytes(), CompareFilter.CompareOp.EQUAL, new SubstringComparator(name))); 64 | } 65 | //判断地区,地区精准匹配 66 | if (StringUtils.isNotBlank(area)) { 67 | filterList.addFilter(new SingleColumnValueFilter("f1".getBytes(), "area".getBytes(), CompareFilter.CompareOp.EQUAL, new BinaryComparator(area.getBytes()))); 68 | } 69 | //开始注册时间,注册时间范围匹配 70 | if (StringUtils.isNotBlank(startRegDate)) { 71 | filterList.addFilter(new SingleColumnValueFilter("f1".getBytes(), "startRegDate".getBytes(), CompareFilter.CompareOp.GREATER_OR_EQUAL, new BinaryComparator(startRegDate.getBytes()))); 72 | } 73 | //结束注册时间,注册时间范围匹配 74 | if (StringUtils.isNotBlank(endRegDate)) { 75 | 
filterList.addFilter(new SingleColumnValueFilter("f1".getBytes(), "endDate".getBytes(), CompareFilter.CompareOp.LESS, new BinaryComparator(endRegDate.getBytes()))); 76 | } 77 | //最近多少天登录 78 | if (lastDaysLogin != null) { 79 | Calendar instance = Calendar.getInstance(); 80 | instance.add(Calendar.DAY_OF_MONTH, 0 - lastDaysLogin); 81 | SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyy-MM-dd"); 82 | String format = simpleDateFormat.format(instance.getTime()); 83 | filterList.addFilter(new SingleColumnValueFilter("f1".getBytes(), "lastdate".getBytes(), CompareFilter.CompareOp.EQUAL, new BinaryComparator(format.getBytes()))); 84 | } 85 | scan.setFilter(filterList); 86 | ResultScanner scanner = table.getScanner(scan); 87 | //4、解析ResultScanner 88 | for (Result result : scanner) { 89 | HBaseUtil.print(result); //解析并打印result信息 90 | } 91 | //5、关闭 92 | table.close(); 93 | connection.close(); 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /Project7_HBase/HBaseUtil.java: -------------------------------------------------------------------------------- 1 | import org.apache.hadoop.conf.Configuration; 2 | import org.apache.hadoop.hbase.HBaseConfiguration; 3 | import org.apache.hadoop.hbase.HColumnDescriptor; 4 | import org.apache.hadoop.hbase.HTableDescriptor; 5 | import org.apache.hadoop.hbase.TableName; 6 | import org.apache.hadoop.hbase.client.*; 7 | 8 | import java.io.IOException; 9 | import java.util.Map; 10 | import java.util.NavigableMap; 11 | import java.util.Set; 12 | 13 | import static org.apache.hadoop.yarn.webapp.hamlet.HamletSpec.Scope.col; 14 | 15 | /** 16 | * HBase工具类 17 | */ 18 | public class HBaseUtil { 19 | /** 20 | * 获得连接 21 | * 22 | * @return 23 | * @throws IOException 24 | */ 25 | public static Connection getConnection() throws IOException { 26 | Configuration conf = HBaseConfiguration.create(); 27 | conf.set("hbase.zookeeper.quorum", "crxy107"); 28 | 
conf.set("hbase.zookeeper.property.clientPort", "2181"); 29 | return ConnectionFactory.createConnection(conf); 30 | } 31 | 32 | /** 33 | * 创建表 34 | * 35 | * @param admin 36 | * @param tableNameString 37 | * @param columnFamily 38 | * @throws IOException 39 | */ 40 | public static void createTable(Connection connection, String tableNameString, String columnFamily) throws IOException { 41 | Admin admin = connection.getAdmin(); 42 | TableName tableName = TableName.valueOf(tableNameString); //d2h (data to HBase) 43 | HTableDescriptor table = new HTableDescriptor(tableName); 44 | HColumnDescriptor family = new HColumnDescriptor(columnFamily); 45 | table.addFamily(family); 46 | //判断表是否已经存在 47 | if (admin.tableExists(tableName)) { 48 | admin.disableTable(tableName); 49 | admin.deleteTable(tableName); 50 | } 51 | admin.createTable(table); 52 | } 53 | 54 | /** 55 | * 获取插入HBase的操作put 56 | * 57 | * @param rowKeyString 58 | * @param familyName 59 | * @param columnName 60 | * @param columnValue 61 | * @return 62 | */ 63 | public static Put createPut(String rowKeyString, byte[] familyName, String columnName, String columnValue) { 64 | byte[] rowKey = rowKeyString.getBytes(); 65 | Put put = new Put(rowKey); 66 | put.addColumn(familyName, columnName.getBytes(), columnValue.getBytes()); 67 | return put; 68 | } 69 | 70 | /** 71 | * 打印HBase查询结果 72 | * 73 | * @param result 74 | */ 75 | public static void print(Result result) { 76 | //result是个四元组<行键,列族,列(标记符),值> 77 | byte[] row = result.getRow(); //行键 78 | NavigableMap>> map = result.getMap(); 79 | for (Map.Entry>> familyEntry : map.entrySet()) { 80 | byte[] familyBytes = familyEntry.getKey(); //列族 81 | for (Map.Entry> entry : familyEntry.getValue().entrySet()) { 82 | byte[] column = entry.getKey(); //列 83 | for (Map.Entry longEntry : entry.getValue().entrySet()) { 84 | Long time = longEntry.getKey(); //时间戳 85 | byte[] value = longEntry.getValue(); //值 86 | 
System.out.println(String.format("行键rowKey=%s,列族columnFamily=%s,列column=%s,时间戳timestamp=%d,值value=%s", new String(row), new String(familyBytes), new String(column), time, new String(value))); 87 | } 88 | } 89 | } 90 | 91 | } 92 | } 93 | -------------------------------------------------------------------------------- /Project7_HBase/JdbcUtil.java: -------------------------------------------------------------------------------- 1 | import java.sql.*; 2 | import java.util.ArrayList; 3 | 4 | /** 5 | * 连接mysql 6 | */ 7 | public class JdbcUtil { 8 | 9 | /** 10 | * 获得mysql连接 11 | * 12 | * @return 13 | * @throws Exception 偷懒了 14 | */ 15 | public static Connection getConnection() throws Exception { 16 | //注册驱动 17 | Class.forName("com.mysql.jdbc.Driver"); 18 | String url = "jdbc:mysql://localhost:3306/goods"; 19 | String user = "root"; 20 | String password = "123"; 21 | return DriverManager.getConnection(url, user, password); 22 | } 23 | 24 | /** 25 | * 获取Mysql中所有表及视图的名称 26 | * 27 | * @param connection 28 | * @return 29 | */ 30 | public static ArrayList getTableNames(Connection connection) throws SQLException { 31 | //获得数据库的元数据 32 | DatabaseMetaData metaData = connection.getMetaData(); 33 | //获得表与视图 34 | ResultSet tables = metaData.getTables(null, null, null, new String[]{"TABLE", "VIEW"}); 35 | ArrayList tableNames = new ArrayList<>(); 36 | //遍历ResultSet 37 | while (tables.next()) { 38 | tableNames.add(tables.getString(3)); //表名及视图名在第3个位置 39 | } 40 | return tableNames; 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /Project7_HBase/问题说明.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/monsonlee/BigData/6004a79234c3f0cfa1efc1c197e93651303abd85/Project7_HBase/问题说明.docx -------------------------------------------------------------------------------- /Project7_HBase/问题说明.pdf: -------------------------------------------------------------------------------- 
# https://raw.githubusercontent.com/monsonlee/BigData/6004a79234c3f0cfa1efc1c197e93651303abd85/Project7_HBase/问题说明.pdf
# --------------------------------------------------------------------------
# Project8_推荐系统入门/recommend/AlgorithmUtil.py
# --------------------------------------------------------------------------
# coding=utf-8

# Distance / similarity measures used by the recommender:
#   manhattan      - Manhattan distance
#   euclidean      - Euclidean distance
#   minkowski      - Minkowski distance
#   pearson        - Pearson correlation coefficient
#   cosSimilarity  - cosine similarity
# Choosing a measure:
#   1. data with "grade inflation"      -> pearson
#   2. dense data                       -> manhattan / euclidean / minkowski
#   3. sparse data                      -> cosine similarity
# Data format (rating dict): users = {"u1": {'book1': 3.5, ...}, ...}


def manhattan(rating1, rating2):
    """Manhattan distance over the items both users rated.

    Each rating is a dict {item: score} for one user; items rated by only
    one of the two users are ignored.
    """
    distance = 0
    for item in rating1:
        if item in rating2:  # only items rated by both users count
            score1 = rating1[item]
            score2 = rating2[item]
            distance += abs(score1 - score2)
    return distance


def euclidean(rating1, rating2):
    """Euclidean distance over the items both users rated."""
    distance = 0
    for item in rating1:
        if item in rating2:  # only items rated by both users count
            score1 = rating1[item]
            score2 = rating2[item]
            distance += pow((score1 - score2), 2)
    distance = pow(distance, 0.5)
    return distance


def minkowski(rating1, rating2, r=10):
    """Minkowski distance over the items both users rated.

    r is the Minkowski order: r=1 gives the Manhattan distance and r=2 the
    Euclidean distance.  (FIX: the old docstring wrongly said r=10 was
    Manhattan; the default r=10 is kept only for backward compatibility.)
    """
    distance = 0
    for item in rating1:
        if item in rating2:  # only items rated by both users count
            distance += pow(abs(rating1[item] - rating2[item]), r)
    distance = pow(distance, 1.0 / r)
    return distance


def pearson(rating1, rating2):
    """Approximate Pearson correlation over the items both users rated.

    Returns 0 when the users share no items or either variance is zero
    (FIX: n == 0 previously raised ZeroDivisionError).
    """
    sum_xy = 0.0  # sum of x*y
    sum_x = 0.0   # sum of x
    sum_y = 0.0   # sum of y
    sum_x2 = 0.0  # sum of x^2
    sum_y2 = 0.0  # sum of y^2
    n = 0         # number of co-rated items
    for item in rating1:
        if item in rating2:
            n += 1
            x = rating1[item]
            y = rating2[item]
            sum_xy += x * y
            sum_x += x
            sum_y += y
            sum_x2 += pow(x, 2)
            sum_y2 += pow(y, 2)
    if n == 0:
        return 0  # no overlap: correlation is undefined, report 0
    n = n * 1.0
    numerator = sum_xy - sum_x * sum_y / n
    denominator = pow((sum_x2 - (sum_x ** 2) / n), 0.5) * pow((sum_y2 - (sum_y ** 2) / n), 0.5)
    if denominator == 0:
        pearsonCoefficient = 0
    else:
        pearsonCoefficient = numerator / denominator
    return pearsonCoefficient


def cosSimilarity(list1, list2):
    """Cosine similarity of two equal-length numeric lists.

    Returns 0.0 for a zero vector (FIX: previously raised ZeroDivisionError).
    """
    sum_xy = 0.0  # sum of x*y
    sum_x2 = 0.0  # sum of x^2
    sum_y2 = 0.0  # sum of y^2
    for index, item in enumerate(list1):
        sum_xy += list1[index] * list2[index]
        sum_x2 += list1[index] ** 2
        sum_y2 += list2[index] ** 2
    denominator = pow(sum_x2, 0.5) * pow(sum_y2, 0.5)
    if denominator == 0:
        return 0.0  # at least one zero vector: similarity is undefined, report 0
    return sum_xy * 1.0 / denominator


def itemAll(users):
    """Return the sorted list of every item rated by any user in `users`."""
    itemSet = set()
    for user in users:
        for item in users[user]:
            itemSet.add(item)
    itemList = list(itemSet)
    itemList.sort()
    return itemList


def vect(username, users):
    """Return user `username` as a dense vector over itemAll(users).

    Unrated items contribute 0, so all user vectors share one item order.
    """
    items = itemAll(users)          # global item order
    userVec = []                    # the user's dense rating vector
    rating = users[username]        # {'item1': 3, 'item2': 4, ...}
    for item in items:
        if item in rating:
            userVec.append(rating[item])
        else:
            userVec.append(0)
    return userVec
# coding=utf-8
import AlgorithmUtil as al

# k-nearest-neighbour recommender over a rating dict of the form
# users = {"u1": {'book1': 3.5, 'book2': 4, ...}, ...}


class Recommender:
    def __init__(self, data, k=3, metric='pearson', n=5):
        """k-NearestNeighbor recommender.

        data   -- training data: {user: {item: score}}
        k      -- number of nearest neighbours to use
        metric -- distance/similarity measure name
        n      -- number of recommendations to emit
        """
        self.k = k
        self.n = n
        # Resolve the measure function by name; anything unknown falls back
        # to cosine similarity (same behaviour as before).
        if metric == 'pearson':
            self.fn = al.pearson
        elif metric == 'manhattan':
            self.fn = al.manhattan
        elif metric == 'euclidean':
            self.fn = al.euclidean
        elif metric == 'minkowski':
            self.fn = al.minkowski
        else:
            self.fn = al.cosSimilarity
        self.data = data

    def recommend(self, username):
        """Print the top-n items rated by the k nearest neighbours of
        `username` that `username` has not rated, weighted by neighbour
        distance."""
        result = {}  # candidate item -> weighted score
        # k nearest neighbours as (distance, user) pairs.
        kNearestNeighbors = self.compareDis(username)[:self.k]
        # Items the target user already rated.
        userRatings = self.data[username]
        # Total distance, used to weight each neighbour's contribution.
        totalDistance = 0.0
        for kValue in kNearestNeighbors:
            totalDistance += kValue[0]
        for kValue in kNearestNeighbors:
            # FIX: guard against totalDistance == 0 (identical users /
            # zero correlations) which previously raised ZeroDivisionError;
            # fall back to uniform weights.
            if totalDistance != 0:
                weight = kValue[0] / totalDistance
            else:
                weight = 1.0 / len(kNearestNeighbors)
            name = kValue[1]
            neighborRatings = self.data[name]
            # Accumulate items the target user has not rated yet.
            for item in neighborRatings:
                if not item in userRatings:
                    if not item in result:
                        result[item] = neighborRatings[item] * weight
                    else:
                        result[item] += neighborRatings[item] * weight
        recommendations = list(result.items())
        recommendations.sort(key=lambda tuple: tuple[1], reverse=True)
        # FIX: Python-2 print statement -> print() call.
        print(recommendations[:self.n])

    def compareDis(self, userName):
        """Return [(distance, user), ...] for every other user, ascending.

        NOTE(review): ascending order is correct for distance metrics but
        backwards for similarity measures (pearson/cosine), where larger
        means *closer* — for those, the k "nearest" picked by recommend()
        are actually the least similar users; confirm intent.
        """
        result = []
        for user in self.data:
            if user != userName:
                distance = self.fn(self.data[user], self.data[userName])
                result.append((distance, user))
        result.sort()
        return result
# --------------------------------------------------------------------------
# Project8_推荐系统入门/recommend/TestRecommender.py
# --------------------------------------------------------------------------
# coding=utf-8

import AlgorithmUtil as dis
from recommend.Recommender import *

# u1 = user 1, book1 = item, 3.5 = score
users = {"u1": {'book1': 3.5, 'book2': 4, 'book3': 3, 'book4': 1},
         "u2": {'book2': 5, 'book5': 4, 'book1': 3.5, 'book4': 1},
         "u3": {'book3': 2.5, 'book4': 4, 'book5': 3, 'book6': 1},
         "u4": {'book6': 4, 'book4': 4, 'book3': 3, 'book1': 1.5}}

# FIX: 'pearson' was passed positionally and therefore bound to k (=3),
# silently leaving the default metric; it must be the metric keyword.
r = Recommender(users, metric='pearson')
r.recommend('u2')
-------------------------------------------------------------------------------- /Project9_分布式知乎爬虫/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | mxgroup 5 | Spider 6 | 0.0.1-SNAPSHOT 7 | 8 | 9 | 1.8 10 | UTF-8 11 | 12 | 13 | 14 | ZhihuSpider 15 | 16 | 17 | org.apache.maven.plugins 18 | maven-compiler-plugin 19 | 2.3.2 20 | 21 | ${jdk.version} 22 | ${jdk.version} 23 | ${project.build.sourceEncoding} 24 | 25 | 26 | ${java.home}/lib/rt.jar;${java.home}/lib/jce.jar 27 | 28 | 29 | 30 | 31 | org.apache.maven.plugins 32 | maven-shade-plugin 33 | 2.4.3 34 | 35 | 36 | package 37 | 38 | shade 39 | 40 | 41 | 42 | 44 | crawler.Spider 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | jdk.tools 57 | jdk.tools 58 | 1.8 59 | system 60 | ${JAVA_HOME}/lib/tools.jar 61 | 62 | 63 | 64 | 65 | junit 66 | junit 67 | 4.12 68 | test 69 | 70 | 71 | 72 | 73 | org.apache.httpcomponents 74 | httpclient 75 | 4.4 76 | 77 | 78 | 79 | 80 | net.sourceforge.htmlcleaner 81 | htmlcleaner 82 | 2.10 83 | 84 | 85 | 86 | 87 | org.json 88 | json 89 | 20140107 90 | 91 | 92 | 93 | com.alibaba 94 | fastjson 95 | 1.2.7 96 | 97 | 98 | 99 | org.slf4j 100 | slf4j-api 101 | 1.7.10 102 | 103 | 104 | org.slf4j 105 | slf4j-log4j12 106 | 1.7.10 107 | 108 | 109 | 110 | us.codecraft 111 | webmagic-core 112 | 0.6.0 113 | 114 | 115 | us.codecraft 116 | webmagic-extension 117 | 0.6.0 118 | 119 | 120 | org.slf4j 121 | slf4j-log4j12 122 | 123 | 124 | 125 | 126 | 127 | mysql 128 | mysql-connector-java 129 | 5.1.29 130 | 131 | 132 | 133 | redis.clients 134 | jedis 135 | 2.8.1 136 | 137 | 138 | 139 | org.quartz-scheduler 140 | quartz 141 | 1.8.4 142 | 143 | 144 | 145 | org.apache.curator 146 | curator-framework 147 | 2.7.1 148 | 149 | 150 | 151 | -------------------------------------------------------------------------------- /Project9_分布式知乎爬虫/src/main/java/crawler/Spider.java: -------------------------------------------------------------------------------- 
1 | package crawler; 2 | 3 | import java.net.InetAddress; 4 | import java.util.Scanner; 5 | import java.util.concurrent.ExecutorService; 6 | import java.util.concurrent.Executors; 7 | 8 | import org.apache.curator.RetryPolicy; 9 | import org.apache.curator.framework.CuratorFramework; 10 | import org.apache.curator.framework.CuratorFrameworkFactory; 11 | import org.apache.curator.retry.ExponentialBackoffRetry; 12 | import org.apache.zookeeper.CreateMode; 13 | import org.apache.zookeeper.ZooDefs.Ids; 14 | import org.slf4j.Logger; 15 | import org.slf4j.LoggerFactory; 16 | 17 | import domain.Page; 18 | import domain.User; 19 | import download.DownLoad; 20 | import download.DownLoadImpl; 21 | import process.Process; 22 | import process.ProcessImpl; 23 | import redis.clients.jedis.Jedis; 24 | import store.Store; 25 | import store.StoreImpl; 26 | import utils.JedisUtil; 27 | import utils.ThreadUtil; 28 | import utils.UserUtil; 29 | 30 | public class Spider { 31 | static Logger logger = LoggerFactory.getLogger(Spider.class.getSimpleName()); 32 | // 创建一个初始化线程数量为5的线程池 33 | private static ExecutorService threadPool = Executors.newFixedThreadPool(5); 34 | private DownLoad downLoadIn; 35 | private Process processIn; 36 | private Store storeIn; 37 | 38 | public void setStoreIn(Store storeIn) { 39 | this.storeIn = storeIn; 40 | } 41 | 42 | public void setDownLoadInter(DownLoad downLoadIn) { 43 | this.downLoadIn = downLoadIn; 44 | } 45 | 46 | public void setProcessInter(Process processIn) { 47 | this.processIn = processIn; 48 | } 49 | 50 | public static void main(String[] args) { 51 | start(); 52 | } 53 | 54 | /*public Spider() { 55 | // 重试机制:1000:表示重试的间隔,3表示是重试的次数 56 | RetryPolicy retryPolicy = new ExponentialBackoffRetry(1000, 3); 57 | // 指定zk的链接地址 58 | String zookeeperConnectionString = "192.168.230.128:2181,192.168.230.129:2181,192.168.230.131:2181"; 59 | int sessionTimeoutMs = 5000;// 链接失效时间,默认是40s,注意:这个值只能是4s--40s之间的值 60 | int connectionTimeoutMs = 3000;// 链接超时时间 61 | // 
获取zk链接 62 | CuratorFramework client = CuratorFrameworkFactory.newClient(zookeeperConnectionString, sessionTimeoutMs, 63 | connectionTimeoutMs, retryPolicy); 64 | // 开启链接 65 | client.start(); 66 | try { 67 | // 获取本机ip信息 68 | InetAddress localHost = InetAddress.getLocalHost(); 69 | String ip = localHost.getHostAddress(); 70 | client.create()// 创建节点 71 | .creatingParentsIfNeeded()// 如果需要,则创建父节点 72 | .withMode(CreateMode.EPHEMERAL)// 指定节点类型 73 | .withACL(Ids.OPEN_ACL_UNSAFE)// 指定节点的权限 74 | .forPath("/spider/" + ip); 75 | } catch (Exception e) { 76 | e.printStackTrace(); 77 | } 78 | }*/ 79 | 80 | /** 81 | * 启动爬虫 82 | */ 83 | public static void start() { 84 | final Spider spider = new Spider(); 85 | spider.setDownLoadInter(new DownLoadImpl()); 86 | spider.setProcessInter(new ProcessImpl()); 87 | spider.setStoreIn(new StoreImpl()); 88 | System.out.println("请输入一个要爬取的知乎话题:"); 89 | // 获取话题 90 | @SuppressWarnings("resource") 91 | Scanner scanner = new Scanner(System.in); 92 | String topic = scanner.nextLine();// 话题 93 | String url = "https://www.zhihu.com/search?type=topic&q=" + topic; 94 | logger.info("爬虫开始运行..."); 95 | // 下载话题精华问题页 96 | final Page page = spider.download(url); 97 | // 解析话题精华问题页 98 | spider.process(page); 99 | 100 | while (true) { 101 | // 读取Redis中的url 102 | Jedis jedis = JedisUtil.getJedis(); 103 | final String userUrl = jedis.rpop(JedisUtil.urlkey); 104 | JedisUtil.returnResource(jedis); 105 | if (userUrl != null) { 106 | threadPool.execute(new Runnable() { 107 | public void run() { 108 | if (userUrl.endsWith("following") || userUrl.endsWith("follower")) { 109 | UserUtil.processFollow(userUrl); 110 | } else { 111 | User user = UserUtil.processUser(userUrl); 112 | if (user != null) { 113 | spider.store(user);// 存储 114 | }else{ 115 | logger.info("很奇怪,user为null"); 116 | } 117 | } 118 | }// end run 119 | }); 120 | 121 | } else { 122 | logger.info("没有url了,休息一会..."); 123 | ThreadUtil.sleep(5); 124 | } // end if else 125 | 126 | } // end while 127 | 128 | } 129 | 
130 | /** 131 | * 下载 132 | * 133 | * @param url 134 | * @return 135 | */ 136 | public Page download(String url) { 137 | return downLoadIn.download(url); 138 | } 139 | 140 | /** 141 | * 解析爬取的原始内容 142 | * 143 | * @param page 144 | * @param user 145 | */ 146 | public void process(Page page) { 147 | processIn.process(page); 148 | } 149 | 150 | /** 151 | * 保存解析的用户信息 152 | * 153 | * @param user 154 | */ 155 | public void store(User user) { 156 | storeIn.store(user); 157 | } 158 | } 159 | -------------------------------------------------------------------------------- /Project9_分布式知乎爬虫/src/main/java/domain/Page.java: -------------------------------------------------------------------------------- 1 | package domain; 2 | 3 | public class Page { 4 | /** 5 | * 下载原始内容 6 | */ 7 | private String content; 8 | /** 9 | * URL 10 | */ 11 | private String url; 12 | 13 | 14 | public String getUrl() { 15 | return url; 16 | } 17 | 18 | public void setUrl(String url) { 19 | this.url = url; 20 | } 21 | 22 | public String getContent() { 23 | return content; 24 | } 25 | 26 | public void setContent(String content) { 27 | this.content = content; 28 | } 29 | 30 | } 31 | -------------------------------------------------------------------------------- /Project9_分布式知乎爬虫/src/main/java/domain/User.java: -------------------------------------------------------------------------------- 1 | package domain; 2 | 3 | /** 4 | * User用于存储用户信息 5 | * 6 | * @className User 7 | * @author mxlee 8 | * @email imxlee@foxmail.com 9 | */ 10 | public class User { 11 | 12 | String userID;// 用户ID 13 | 14 | String name;// 用户姓名 15 | 16 | String gender;// 性别 17 | 18 | String location;// 居住地; 19 | 20 | String business;// 行业 21 | 22 | String employment;// 公司 23 | 24 | String position;// 职位; 25 | 26 | String school;// 大学 27 | 28 | String major;// 专业 29 | 30 | int answersNum;// 回答数量 31 | 32 | int starsNum;// 被赞同数 33 | 34 | int thxNum;// 被感谢数 35 | 36 | int saveNum;// 被收藏数 37 | 38 | int follow;// 关注了 39 | 40 | int follower;// 关注者 
41 | 42 | public String getUserID() { 43 | return userID; 44 | } 45 | 46 | public void setUserID(String userID) { 47 | this.userID = userID; 48 | } 49 | 50 | public String getName() { 51 | return name; 52 | } 53 | 54 | public void setName(String name) { 55 | this.name = name; 56 | } 57 | 58 | public String getGender() { 59 | return gender; 60 | } 61 | 62 | public void setGender(String gender) { 63 | this.gender = gender; 64 | } 65 | 66 | public String getLocation() { 67 | return location; 68 | } 69 | 70 | public void setLocation(String location) { 71 | this.location = location; 72 | } 73 | 74 | public String getBusiness() { 75 | return business; 76 | } 77 | 78 | public void setBusiness(String business) { 79 | this.business = business; 80 | } 81 | 82 | public String getEmployment() { 83 | return employment; 84 | } 85 | 86 | public void setEmployment(String employment) { 87 | this.employment = employment; 88 | } 89 | 90 | public String getPosition() { 91 | return position; 92 | } 93 | 94 | public void setPosition(String position) { 95 | this.position = position; 96 | } 97 | 98 | public String getSchool() { 99 | return school; 100 | } 101 | 102 | public void setSchool(String school) { 103 | this.school = school; 104 | } 105 | 106 | public String getMajor() { 107 | return major; 108 | } 109 | 110 | public void setMajor(String major) { 111 | this.major = major; 112 | } 113 | 114 | public int getAnswersNum() { 115 | return answersNum; 116 | } 117 | 118 | public void setAnswersNum(int answersNum) { 119 | this.answersNum = answersNum; 120 | } 121 | 122 | public int getStarsNum() { 123 | return starsNum; 124 | } 125 | 126 | public void setStarsNum(int starsNum) { 127 | this.starsNum = starsNum; 128 | } 129 | 130 | public int getThxNum() { 131 | return thxNum; 132 | } 133 | 134 | public void setThxNum(int thxNum) { 135 | this.thxNum = thxNum; 136 | } 137 | 138 | public int getSaveNum() { 139 | return saveNum; 140 | } 141 | 142 | public void setSaveNum(int saveNum) { 143 | 
this.saveNum = saveNum; 144 | } 145 | 146 | public int getFollow() { 147 | return follow; 148 | } 149 | 150 | public void setFollow(int follow) { 151 | this.follow = follow; 152 | } 153 | 154 | public int getFollower() { 155 | return follower; 156 | } 157 | 158 | public void setFollower(int follower) { 159 | this.follower = follower; 160 | } 161 | 162 | @Override 163 | public String toString() { 164 | return "用户ID:" + userID + "用户名:" + name + "\n性别:" + gender + "\n居住地:" + location + "\n行业:" + business + "\n公司:" 165 | + employment + "\n职位:" + position + "\n大学:" + school + "\n专业:" + major + "\n回答数:" + answersNum 166 | + "\n关注数:" + follow + "\n被关注数:" + follower + "\n被点赞数:" + starsNum + "\n被感谢数:" + thxNum + "\n被收藏数:" 167 | + saveNum; 168 | } 169 | 170 | } 171 | -------------------------------------------------------------------------------- /Project9_分布式知乎爬虫/src/main/java/download/DownLoad.java: -------------------------------------------------------------------------------- 1 | package download; 2 | 3 | import domain.Page; 4 | 5 | public interface DownLoad { 6 | public Page download(String url); 7 | } 8 | -------------------------------------------------------------------------------- /Project9_分布式知乎爬虫/src/main/java/download/DownLoadImpl.java: -------------------------------------------------------------------------------- 1 | package download; 2 | 3 | import domain.Page; 4 | import utils.PageUtil; 5 | 6 | public class DownLoadImpl implements DownLoad { 7 | 8 | /** 9 | * 下载原始内容 10 | */ 11 | public Page download(String url) { 12 | Page page = new Page(); 13 | page.setContent(PageUtil.getContent(url)); 14 | page.setUrl(url); 15 | return page; 16 | } 17 | 18 | } 19 | -------------------------------------------------------------------------------- /Project9_分布式知乎爬虫/src/main/java/process/Process.java: -------------------------------------------------------------------------------- 1 | package process; 2 | 3 | import domain.Page; 4 | 5 | public interface Process { 6 | 
public void process(Page page); 7 | } 8 | -------------------------------------------------------------------------------- /Project9_分布式知乎爬虫/src/main/java/process/ProcessImpl.java: -------------------------------------------------------------------------------- 1 | package process; 2 | 3 | import java.util.regex.Matcher; 4 | import java.util.regex.Pattern; 5 | 6 | import org.htmlcleaner.HtmlCleaner; 7 | import org.htmlcleaner.TagNode; 8 | import org.htmlcleaner.XPatherException; 9 | import org.slf4j.Logger; 10 | import org.slf4j.LoggerFactory; 11 | 12 | import domain.Page; 13 | import download.DownLoadImpl; 14 | import utils.UrlUtil; 15 | 16 | public class ProcessImpl implements Process { 17 | 18 | private Logger logger = LoggerFactory.getLogger(ProcessImpl.class.getSimpleName()); 19 | 20 | /** 21 | * 解析原始页面内容 22 | */ 23 | public void process(Page page) { 24 | HtmlCleaner htmlCleaner = new HtmlCleaner(); 25 | // 获得根节点 26 | String content = page.getContent(); 27 | // 抓取话题精华问题页面URL 28 | String url = page.getUrl(); 29 | TagNode tagNode = htmlCleaner.clean(content); 30 | Object[] uObj;// 最高票用户数组 31 | try { 32 | // 获取用户 33 | uObj = tagNode.evaluateXPath("//*[@id='zh-topic-top-page-list']/*/div/div/div[1]/div[3]/span/span[1]/a"); 34 | if (uObj != null & uObj.length > 0) { 35 | logger.info("此页有" + uObj.length + "个用户"); 36 | TagNode uNode;// 最高票用户节点 37 | for (int i = 0; i < uObj.length; i++) { 38 | uNode = (TagNode) uObj[i]; 39 | // 解析node,获取最高票用户链接URL 40 | String userHref = "https://www.zhihu.com" + uNode.getAttributeByName("href"); 41 | // 用户url去重 42 | UrlUtil.juageUrl(userHref); 43 | } 44 | } else { 45 | String topic = url.substring(url.lastIndexOf("=") + 1); 46 | Pattern pattern = Pattern 47 | .compile("" 48 | + topic + ""); 49 | Matcher matcher = pattern.matcher(content); 50 | if (matcher.find()) { 51 | String topicURL = "https://www.zhihu.com" + matcher.group(1) + "/top-answers"; 52 | DownLoadImpl downLoadPage = new DownLoadImpl(); 53 | Page userPage = null; 54 | // 
下载精华问题第1页 55 | userPage = downLoadPage.download(topicURL + "?page=" + 1); 56 | process(userPage); 57 | } else { 58 | logger.info("没有找到相关话题"); 59 | } 60 | } 61 | } catch (XPatherException e) { 62 | logger.error("解析失败" + e.getMessage()); 63 | } 64 | } 65 | 66 | } 67 | -------------------------------------------------------------------------------- /Project9_分布式知乎爬虫/src/main/java/store/Store.java: -------------------------------------------------------------------------------- 1 | package store; 2 | 3 | import domain.User; 4 | 5 | public interface Store { 6 | public void store(User user); 7 | } 8 | -------------------------------------------------------------------------------- /Project9_分布式知乎爬虫/src/main/java/store/StoreImpl.java: -------------------------------------------------------------------------------- 1 | package store; 2 | 3 | import domain.User; 4 | import utils.JDBCUtil; 5 | 6 | public class StoreImpl implements Store { 7 | 8 | public void store(User user) { 9 | String sql = "insert into user(userid,name,gender,location,business,employment,position,school,major,answersNum,starsNum,thxNum,saveNum,follow,follower) values(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)"; 10 | JDBCUtil.update(sql, user); 11 | } 12 | 13 | } 14 | -------------------------------------------------------------------------------- /Project9_分布式知乎爬虫/src/main/java/utils/BloomFilter.java: -------------------------------------------------------------------------------- 1 | package utils; 2 | 3 | import java.util.BitSet; 4 | 5 | public class BloomFilter { 6 | private static final int BIT_SIZE = 2 << 28;// 二进制向量的位数,相当于能存储1000万条url左右,误报率为千万分之一 7 | private static final int[] seeds = new int[] { 3, 5, 7, 11, 13, 31, 37, 61 };// 用于生成信息指纹的8个随机数,最好选取质数 8 | 9 | private BitSet bits = new BitSet(BIT_SIZE); 10 | private Hash[] func = new Hash[seeds.length];// 用于存储8个随机哈希值对象 11 | 12 | public BloomFilter() { 13 | for (int i = 0; i < seeds.length; i++) { 14 | func[i] = new Hash(BIT_SIZE, seeds[i]); 15 | } 16 | } 17 | 18 
| /** 19 | * 像过滤器中添加字符串 20 | */ 21 | public void addValue(String value) { 22 | // 将字符串value哈希为8个或多个整数,然后在这些整数的bit上变为1 23 | if (value != null) { 24 | for (Hash f : func) 25 | bits.set(f.hash(value), true); 26 | } 27 | } 28 | 29 | /** 30 | * 判断字符串是否包含在布隆过滤器中 31 | */ 32 | public boolean contains(String value) { 33 | if (value == null) 34 | return false; 35 | 36 | boolean ret = true; 37 | 38 | // 将要比较的字符串重新以上述方法计算hash值,再与布隆过滤器比对 39 | for (Hash f : func) 40 | ret = ret && bits.get(f.hash(value)); 41 | return ret; 42 | } 43 | 44 | /** 45 | * 随机哈希值对象 46 | */ 47 | 48 | public static class Hash { 49 | private int size;// 二进制向量数组大小 50 | private int seed;// 随机数种子 51 | 52 | public Hash(int cap, int seed) { 53 | this.size = cap; 54 | this.seed = seed; 55 | } 56 | 57 | /** 58 | * 计算哈希值(也可以选用别的恰当的哈希函数) 59 | */ 60 | public int hash(String value) { 61 | int result = 0; 62 | int len = value.length(); 63 | for (int i = 0; i < len; i++) { 64 | result = seed * result + value.charAt(i); 65 | } 66 | 67 | return (size - 1) & result; 68 | } 69 | } 70 | 71 | } 72 | -------------------------------------------------------------------------------- /Project9_分布式知乎爬虫/src/main/java/utils/JDBCUtil.java: -------------------------------------------------------------------------------- 1 | package utils; 2 | 3 | import java.sql.Connection; 4 | import java.sql.DriverManager; 5 | import java.sql.PreparedStatement; 6 | import java.sql.SQLException; 7 | 8 | import domain.User; 9 | 10 | public class JDBCUtil { 11 | 12 | private static final String url = "jdbc:mysql://127.0.0.1:3306/zhihu?useUnicode=true&characterEncoding=utf-8"; 13 | private static final String username = "root"; 14 | private static final String password = "123"; 15 | 16 | static { 17 | try { 18 | DriverManager.registerDriver(new com.mysql.jdbc.Driver()); 19 | } catch (SQLException e) { 20 | e.printStackTrace(); 21 | } 22 | } 23 | 24 | /** 25 | * 执行insert、update、delete语句 26 | * 27 | * @param sql 28 | * @param params 29 | */ 30 | public 
static void update(String sql, User user) { 31 | Connection connection = null; 32 | try { 33 | connection = DriverManager.getConnection(url, username, password); 34 | PreparedStatement ps = connection.prepareStatement(sql); 35 | ps.setString(1, user.getUserID()); 36 | ps.setString(2, user.getName()); 37 | ps.setString(3, user.getGender()); 38 | ps.setString(4, user.getLocation()); 39 | ps.setString(5, user.getBusiness()); 40 | ps.setString(6, user.getEmployment()); 41 | ps.setString(7, user.getPosition()); 42 | ps.setString(8, user.getSchool()); 43 | ps.setString(9, user.getMajor()); 44 | ps.setInt(10, user.getAnswersNum()); 45 | ps.setInt(11, user.getStarsNum()); 46 | ps.setInt(12, user.getThxNum()); 47 | ps.setInt(13, user.getSaveNum()); 48 | ps.setInt(14, user.getFollow()); 49 | ps.setInt(15, user.getFollower()); 50 | ps.execute(); 51 | } catch (SQLException e) { 52 | e.printStackTrace(); 53 | } finally { 54 | if (connection != null) { 55 | try { 56 | connection.close(); 57 | } catch (SQLException e) { 58 | e.printStackTrace(); 59 | } 60 | } 61 | } 62 | }// end update 63 | } 64 | -------------------------------------------------------------------------------- /Project9_分布式知乎爬虫/src/main/java/utils/JedisUtil.java: -------------------------------------------------------------------------------- 1 | package utils; 2 | 3 | import org.slf4j.Logger; 4 | import org.slf4j.LoggerFactory; 5 | 6 | import redis.clients.jedis.Jedis; 7 | import redis.clients.jedis.JedisPool; 8 | import redis.clients.jedis.JedisPoolConfig; 9 | 10 | /** 11 | * Redis工具类 12 | * 13 | * @author mxlee 14 | * 15 | */ 16 | public class JedisUtil { 17 | protected static Logger logger = LoggerFactory.getLogger(JedisUtil.class); 18 | public static final String HOST = "127.0.0.1"; 19 | public static final int PORT = 6379; 20 | public static final String urlkey = "url"; 21 | 22 | private JedisUtil() { 23 | } 24 | 25 | private static JedisPool jedisPool = null; 26 | 27 | /** 28 | * 初始化JedisPool 29 | * 30 | * 
@return 31 | */ 32 | private static void initialPool() { 33 | 34 | if (jedisPool == null) { 35 | JedisPoolConfig jedisPoolConfig = new JedisPoolConfig(); 36 | // 指定连接池中最大的空闲连接数 37 | jedisPoolConfig.setMaxIdle(200); 38 | // 连接池创建的最大连接数 39 | jedisPoolConfig.setMaxTotal(2000); 40 | // 设置创建连接的超时时间 41 | jedisPoolConfig.setMaxWaitMillis(1000 * 10); 42 | // 表示从连接池中获取连接时,先测试连接是否可用 43 | jedisPoolConfig.setTestOnBorrow(true); 44 | jedisPoolConfig.setTestOnReturn(true); 45 | jedisPool = new JedisPool(jedisPoolConfig, HOST, PORT, 10000); 46 | } 47 | 48 | } 49 | 50 | /** 51 | * 在多线程环境同步初始化 52 | */ 53 | private static synchronized void poolInit() { 54 | if (jedisPool == null) { 55 | initialPool(); 56 | } 57 | } 58 | 59 | /** 60 | * 同步获取Jedis实例 61 | * 62 | * @return Jedis 63 | */ 64 | public synchronized static Jedis getJedis() { 65 | if (jedisPool == null) { 66 | poolInit(); 67 | } 68 | Jedis jedis = null; 69 | try { 70 | if (jedisPool != null) { 71 | jedis = jedisPool.getResource(); 72 | } 73 | } catch (Exception e) { 74 | logger.error("获取jedis出错: " + e); 75 | } 76 | return jedis; 77 | } 78 | 79 | /** 80 | * 释放jedis资源 81 | * 82 | * @param jedis 83 | */ 84 | public static void returnResource(Jedis jedis) { 85 | if (jedis != null && jedisPool != null) { 86 | // Jedis3.0之后,returnResource遭弃用,官方重写了close方法 87 | // jedisPool.returnResource(jedis); 88 | jedis.close(); 89 | } 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /Project9_分布式知乎爬虫/src/main/java/utils/MD5Filter.java: -------------------------------------------------------------------------------- 1 | package utils; 2 | 3 | import java.security.MessageDigest; 4 | 5 | public class MD5Filter { 6 | public static String md5(String string) { 7 | char hexDigits[] = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 8 | 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z' }; 9 | try { 10 | byte[] bytes = 
string.getBytes(); 11 | MessageDigest messageDigest = MessageDigest.getInstance("MD5"); 12 | messageDigest.update(bytes); 13 | byte[] updateBytes = messageDigest.digest(); 14 | int len = updateBytes.length; 15 | char myChar[] = new char[len * 2]; 16 | int k = 0; 17 | for (int i = 0; i < len; i++) { 18 | byte byte0 = updateBytes[i]; 19 | myChar[k++] = hexDigits[byte0 >>> 4 & 0x0f]; 20 | myChar[k++] = hexDigits[byte0 & 0x0f]; 21 | } 22 | return new String(myChar); 23 | } catch (Exception e) { 24 | return null; 25 | } 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /Project9_分布式知乎爬虫/src/main/java/utils/PageUtil.java: -------------------------------------------------------------------------------- 1 | package utils; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.http.HttpEntity; 6 | import org.apache.http.client.HttpRequestRetryHandler; 7 | import org.apache.http.client.config.RequestConfig; 8 | import org.apache.http.client.methods.CloseableHttpResponse; 9 | import org.apache.http.client.methods.HttpGet; 10 | import org.apache.http.client.methods.HttpUriRequest; 11 | import org.apache.http.impl.client.CloseableHttpClient; 12 | import org.apache.http.impl.client.HttpClientBuilder; 13 | import org.apache.http.impl.client.HttpClients; 14 | import org.apache.http.impl.conn.PoolingHttpClientConnectionManager; 15 | import org.apache.http.protocol.HttpContext; 16 | import org.apache.http.util.EntityUtils; 17 | import org.slf4j.Logger; 18 | import org.slf4j.LoggerFactory; 19 | 20 | public class PageUtil { 21 | private static Logger logger = LoggerFactory.getLogger(PageUtil.class.getSimpleName()); 22 | 23 | /** 24 | * 下载原始内容 25 | */ 26 | public static String getContent(String url) { 27 | String content = null; 28 | CloseableHttpClient client = null; 29 | CloseableHttpResponse response = null; 30 | try { 31 | long startTime = System.currentTimeMillis(); 32 | HttpClientBuilder builder = HttpClients.custom();// 
HttpClient构建器 33 | PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();// 连接池 34 | cm.setMaxTotal(200);// 最大连接数 35 | cm.setDefaultMaxPerRoute(20);// 最大路由连接数 36 | builder.setConnectionManager(cm); 37 | // 设置超时 38 | final int retryTime = 3; 39 | RequestConfig defaultRequestConfig = RequestConfig.custom().setSocketTimeout(5000).setConnectTimeout(5000) 40 | .setConnectionRequestTimeout(5000).build(); 41 | builder.setDefaultRequestConfig(defaultRequestConfig); 42 | // 设置重试次数 43 | builder.setRetryHandler(new HttpRequestRetryHandler() { 44 | public boolean retryRequest(IOException exception, int executionCount, HttpContext context) { 45 | if (executionCount >= retryTime) { 46 | return false; 47 | } 48 | return true; 49 | } 50 | }); 51 | // 获取一个HttpClient对象,模拟浏览器 52 | client = builder.build(); 53 | HttpUriRequest request = new HttpGet(url); 54 | request.setHeader("User-Agent", 55 | "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36"); 56 | response = client.execute(request);// 执行请求 57 | // 获取一个Entity实体对象 58 | HttpEntity entity = response.getEntity(); 59 | content = EntityUtils.toString(entity);// 原始内容 60 | long endTime = System.currentTimeMillis(); 61 | logger.info("页面下载成功,url:{},消耗时间:{}", url, endTime - startTime); 62 | } catch (Exception e) { 63 | logger.error("页面下载失败,url:{}", url); 64 | } finally { 65 | try { 66 | if (response != null) { 67 | response.close(); 68 | } 69 | if (client != null) { 70 | client.close(); 71 | } 72 | } catch (IOException e) { 73 | logger.error("资源释放失败" + e.getMessage()); 74 | } 75 | } 76 | return content; 77 | } 78 | 79 | } 80 | -------------------------------------------------------------------------------- /Project9_分布式知乎爬虫/src/main/java/utils/TestFilter.java: -------------------------------------------------------------------------------- 1 | package utils; 2 | public class TestFilter { 3 | public static void main(String[] args) { 4 | BloomFilter b = 
new BloomFilter(); 5 | b.addValue("www.baidu.com"); 6 | b.addValue("www.sohu.com"); 7 | 8 | System.out.println(b.contains("www.baid.com")); 9 | System.out.println(b.contains("www.sohu.com")); 10 | // String md5 = MD5Filter.md5("www.github.com"); 11 | // String md5_1 = MD5Filter.md5("www.github.com"); 12 | // System.out.println(md5); 13 | // System.out.println(md5_1); 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /Project9_分布式知乎爬虫/src/main/java/utils/ThreadUtil.java: -------------------------------------------------------------------------------- 1 | package utils; 2 | 3 | import org.slf4j.Logger; 4 | import org.slf4j.LoggerFactory; 5 | 6 | public class ThreadUtil { 7 | static Logger logger = LoggerFactory.getLogger(ThreadUtil.class.getSimpleName()); 8 | 9 | public static void sleep(int seconds) { 10 | try { 11 | Thread.sleep(seconds * 1000); 12 | } catch (InterruptedException e) { 13 | logger.error(e.getMessage()); 14 | } 15 | } 16 | 17 | } 18 | -------------------------------------------------------------------------------- /Project9_分布式知乎爬虫/src/main/java/utils/UrlUtil.java: -------------------------------------------------------------------------------- 1 | package utils; 2 | 3 | import redis.clients.jedis.Jedis; 4 | 5 | public class UrlUtil { 6 | public static void juageUrl(String userHref) { 7 | // 用户url去重 8 | String md5 = MD5Filter.md5(userHref); 9 | Jedis jedis = JedisUtil.getJedis(); 10 | if (jedis.get(md5) == null) { 11 | jedis.append(md5, "md5url"); 12 | jedis.lpush(JedisUtil.urlkey, userHref); 13 | JedisUtil.returnResource(jedis); 14 | } else { 15 | JedisUtil.returnResource(jedis); 16 | } 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /Project9_分布式知乎爬虫/src/main/java/utils/UserUtil.java: -------------------------------------------------------------------------------- 1 | package utils; 2 | 3 | import java.util.regex.Matcher; 4 | import 
java.util.regex.Pattern; 5 | 6 | import org.htmlcleaner.HtmlCleaner; 7 | import org.htmlcleaner.TagNode; 8 | import org.htmlcleaner.XPatherException; 9 | import org.slf4j.Logger; 10 | import org.slf4j.LoggerFactory; 11 | 12 | import com.alibaba.fastjson.JSON; 13 | import com.alibaba.fastjson.JSONObject; 14 | 15 | import domain.User; 16 | import redis.clients.jedis.Jedis; 17 | 18 | public class UserUtil { 19 | private static Logger logger = LoggerFactory.getLogger(UserUtil.class.getSimpleName()); 20 | 21 | /** 22 | * 处理用户主页,析取用户信息 23 | */ 24 | public static User processUser(String userUrl) { 25 | User user = new User(); 26 | String content = PageUtil.getContent(userUrl); 27 | if (content == null) { 28 | return user; 29 | } 30 | Pattern pattern; 31 | Matcher matcher; 32 | JSONObject jUser = null; // 用户信息json对象 33 | // 关注的人与关注者链接 34 | String userID = null; 35 | pattern = Pattern.compile("following\"> 0) { 154 | TagNode node = (TagNode) pageNumObj[pageNumObj.length - 2]; 155 | int pagenum = Integer.parseInt(node.getText().toString()); 156 | for (int i = 2; i <= pagenum; i++) { 157 | String url = followUrl + "?page=" + i; 158 | content = PageUtil.getContent(url); 159 | extractUserUrl(content); 160 | } 161 | } 162 | } catch (XPatherException e) { 163 | logger.error(e.getMessage()); 164 | } 165 | } 166 | 167 | /** 168 | * 提取用户Url 169 | * 170 | * @param content 171 | */ 172 | private static void extractUserUrl(String content) { 173 | Pattern pattern = Pattern.compile(""); 174 | Matcher m = pattern.matcher(content); 175 | while (m.find()) { 176 | String userUrl = "https://www.zhihu.com" + m.group(1); 177 | // 用户url去重 178 | UrlUtil.juageUrl(userUrl); 179 | } 180 | } 181 | 182 | } 183 | -------------------------------------------------------------------------------- /Project9_分布式知乎爬虫/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | #产品上线后一般使用info,项目开发过程中可以使用debug 2 | #log4j.rootLogger=info,stdout 3 | 
log4j.rootLogger=info,E 4 | #指定日志输出到什么地方 5 | log4j.appender.stdout = org.apache.log4j.ConsoleAppender 6 | log4j.appender.stdout.Target = System.out 7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 8 | log4j.appender.stdout.layout.ConversionPattern=%m%n 9 | #log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss,SSS} [%t] [%c] [%p] - %m%n 10 | 11 | log4j.appender.E = org.apache.log4j.DailyRollingFileAppender 12 | log4j.appender.E.File = logs/logs.log 13 | log4j.appender.E.Append = true 14 | log4j.appender.E.Threshold = DEBUG 15 | log4j.appender.E.layout = org.apache.log4j.PatternLayout 16 | log4j.appender.E.layout.ConversionPattern = %-d{yyyy-MM-dd HH:mm:ss} [ %t:%r ] - [ %p ] %m%n -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # BigData 2 | BigData Project 大数据项目由浅入深 3 | --------------------------------------------------------------------------------