├── .gitignore ├── LICENSE ├── cover.jpg ├── docs ├── 1.md ├── 10.md ├── 11.md ├── 12.md ├── 13.md ├── 14.md ├── 15.md ├── 16.md ├── 17.md ├── 18.md ├── 19.md ├── 2.md ├── 20.md ├── 21.md ├── 22.md ├── 3.md ├── 4.md ├── 5.md ├── 6.md ├── 7.md ├── 8.md ├── 9.md └── img │ ├── 000d96e53607268ac90aab877bb7dbfd.jpg │ ├── 007f623d1ec885d996b1b72689ba7cb1.jpg │ ├── 018af0886bf373be0fc585cac38b2d6c.jpg │ ├── 0539212d2d3e4c28b27805e3c8783cab.jpg │ ├── 067197a5eeb69cc2f3d828a92ebcf52e.jpg │ ├── 0679a6e10e2c4166bb23f4effc3d5887.jpg │ ├── 06dae2d3c7cb1a64cb31ed2482e632b7.jpg │ ├── 0702875bab1a20dbb9d95fab3813c019.jpg │ ├── 070e654d25157c32f4038fb2daa42351.jpg │ ├── 08c6744e242573f80b23af5dbbf21a94.jpg │ ├── 09f553a8fd91309c3c1c2634e1b5ca15.jpg │ ├── 0a52ececadd638e5127d7a562d9b00b6.jpg │ ├── 0ac8a503cc147ea1ccb9c24bf83a5992.jpg │ ├── 0d2c607e00ca608222b80fa6b61e780a.jpg │ ├── 0e33aec96020afa0297be6d91db0d5d8.jpg │ ├── 0eb5759f21246505752043bb890ab6bf.jpg │ ├── 0ebcb4677d2131e71e039be8ea955cff.jpg │ ├── 0ef51f1b4020192962616eb9559975a4.jpg │ ├── 0ff87df50cf4610da54dd94b51c6d809.jpg │ ├── 112822dfffe3d0e59d7881d265b78cad.jpg │ ├── 1428271961e4c95f6508f59083d5a645.jpg │ ├── 1478d9b0743fdc3b0c6ad079b88034ec.jpg │ ├── 16dd8d60ea9b042c3ce0652c9f0571e8.jpg │ ├── 16fd7a4c078cf22fee09b636dc10d55c.jpg │ ├── 189ce8661099fd6f1118f978d53cf85b.jpg │ ├── 197517339d2ce744dd0a46c607e84534.jpg │ ├── 1a8a8647a66b744ccd5c9137adb66255.jpg │ ├── 1ac835166928f502b55a31636602602a.jpg │ ├── 1bb7f4b9072cd83f62d4e344eaba88e8.jpg │ ├── 1c57212c22a6a7777decfa1971418148.jpg │ ├── 1cef776388e6c2cba3cf00cab2199e3d.jpg │ ├── 1f778c47baa79f4277cc4c2cb0ff0a2d.jpg │ ├── 203cd7c17881ea567f75816f98ec50fa.jpg │ ├── 210c2cca588f51d57a4eee64f09d4b38.jpg │ ├── 22e9d09a51121f8c77345a724091c622.jpg │ ├── 2649ef98f720c129d663f5d82add4129.jpg │ ├── 26e6828e76fd4c01d8aea7770dd032c7.jpg │ ├── 290e0b58c66f2b75c67fd1a15e3fe958.jpg │ ├── 29928a4b917a9348751e7d0410712045.jpg │ ├── 2a4a130bcfb223ced98c0de613bd076a.jpg │ ├── 2a63caca4dc8603d4a602018e75a1fcd.jpg │ ├── 2b9e0aad3a11fee86e4261a1e94749e0.jpg │ ├── 2c646f410ca3ccbf4db40d322dfba989.jpg │ ├── 2d776487e1a2ee4683c3c6f51fca7e48.jpg │ ├── 2f97b51dd2c305123e29377f808272b2.jpg │ ├── 2fbe25eafed24324bfbde9c4d3dca1f4.jpg │ ├── 3152173a8fd696819c7a2c2b8c6ef005.jpg │ ├── 315be0f70cd0effa6c8682f2a949a46c.jpg │ ├── 348c0d7bc8db0d630042e5faffd2d647.jpg │ ├── 361ef23b0743d01bb30ead2dccc9edca.jpg │ ├── 38cff4d0c27588f71d4ed00223dcc4a2.jpg │ ├── 38f75cffa6acca49fcf1ba20d202b2d0.jpg │ ├── 3b99ee07cd783026d41b65651ee5d293.jpg │ ├── 3c0d894f48daba4e004eddd747cb556c.jpg │ ├── 3c62f7e72a479ae0b82768c51bdc2830.jpg │ ├── 3de6020b5a20976d8e1244b98a1ae30b.jpg │ ├── 3f26c9365c0603f014f3bba403ed27fb.jpg │ ├── 3f74c667189eff836df907a6b6ff2584.jpg │ ├── 3f959ef0c8e56c21a00dceb01eb6e6c1.jpg │ ├── 40877c463ec6621caf8f742f1e5c7c05.jpg │ ├── 40bca4c232843e818fac0254a2ae2689.jpg │ ├── 43393ec4f575c391ddfca83324ec67a8.jpg │ ├── 44d0db1ee959675768959ef02c868b32.jpg │ ├── 46803121f8c51dcb1422593800354126.jpg │ ├── 4a1a112aa8490f7c8410b710845e8c7a.jpg │ ├── 4ae661a05a9586c4ce7b5eabf4bab417.jpg │ ├── 4b454255e179a3626e205ce324184acf.jpg │ ├── 4bcd8bf4febeeb8972519ed2adbce8d5.jpg │ ├── 4e5f23199f8f3cd55221e56cb05bd116.jpg │ ├── 4fb175e4e5682ef75a156dfba37beeea.jpg │ ├── 501025688da0cf9e2b3937cd7da9580d.jpg │ ├── 585d98b9749f0661bc9077e01f28eb15.jpg │ ├── 59ebd939c24bf4d59d82b0daf4874daf.jpg │ ├── 5a13655c0030372e1b06cd77ff1e53e0.jpg │ ├── 5a64add1cdcd06c1755071eba4040184.jpg │ ├── 5afb60bc18fcf81ab2ff1282bb035030.jpg │ ├── 
5b5795767ca8cb65be11e7cc020d6b7f.jpg │ ├── 5d730ed5b317fc0cf48f64f9e3312d92.jpg │ ├── 5d7cfe29b931b3da8aa6fe174ccaac16.jpg │ ├── 6170e4e4344d720bef3ff354a507f6fa.jpg │ ├── 61bccf1d55cc6636fce9585573c9981a.jpg │ ├── 6202e78af1fef25458e1696f4a8ae057.jpg │ ├── 62e9e7ead57d4e2a4be61668d6aff334.jpg │ ├── 679879d3fb7800c91c8321c24ce0f8d9.jpg │ ├── 67ba05c0d55d08b80558d07c418fa22e.jpg │ ├── 6869432b79fecd2af9dcc85625f6f356.jpg │ ├── 68b23d0b045f72286a7dd26502a39dd0.jpg │ ├── 68d093ccd7d87aecf646581f431220be.jpg │ ├── 69ff1f1b7a8e2162d5395fa62c35e8b6.jpg │ ├── 6a466b1857f68538cfa76355f8ea6396.jpg │ ├── 6b9d05253edbae95f27fa6c58648c872.jpg │ ├── 6eb508bad184c89094f5045a5bf2e31c.jpg │ ├── 6f2adb68d3f0a7f1f3af2ef044441071.jpg │ ├── 6ffbd19e479aef3243e53b706d026af4.jpg │ ├── 710e36ed942d63d074523c804347981b.jpg │ ├── 7166a4887b7f211527c9e45a072e23d2.jpg │ ├── 7171a798c643e82807785cc163a04796.jpg │ ├── 72748fa31cb48a5062a2fc7949bd0b45.jpg │ ├── 739c6ec939fd446ba1cde4cf4620512a.jpg │ ├── 7537a0a4978369cde56fd3dee294d760.jpg │ ├── 76b7cb4ea7971a8dc0db7a36feb66a35.jpg │ ├── 77c47cf9cfec8ec740c5a18dc4386670.jpg │ ├── 791424a3e5f6e2f4372471d96e5b4676.jpg │ ├── 7a33368daf8723e9a736c50a54b4d084.jpg │ ├── 7a9506c9bd23ed8b08861cd51eaf5cc3.jpg │ ├── 7a9b61eccdf8d02c95b6cd81a63e02ac.jpg │ ├── 7ab51337f3701838a285ea3a7346a777.jpg │ ├── 7b85580dc5d18fc8fec74ce74849c182.jpg │ ├── 7bb886fc0ea7d5d1144002edd99e0c7f.jpg │ ├── 7c0c7d4ea0f6a4cc6d3b9942f440d2ff.jpg │ ├── 7e397a7a0557431be9b98b2af35968d6.jpg │ ├── 7f8b8ddc9f821d1c5a27849bc02e355f.jpg │ ├── 80a25ad6329d3836f4e625a1c93e7898.jpg │ ├── 80f5d2344fd2f483b82b81d0a33e9333.jpg │ ├── 815f9ee92336e4ffc376f80bcb777ea1.jpg │ ├── 822f02d6117220d35ba69a1e20befe65.jpg │ ├── 82a22af158d760e46ae93ba1663a6487.jpg │ ├── 82e0633e9121ff663a913eb95a3dd723.jpg │ ├── 82f787ceda05c98a84ab98cdc998025c.jpg │ ├── 84566f6949f9a2f8734318c284f441f7.jpg │ ├── 85a26958e55acab88aef1ab37443b30b.jpg │ ├── 86176a13e0a00622dbc982348d7ca623.jpg │ ├── 875e532ac3b299876d209507d595df14.jpg │ ├── 88d05071bd3700af0ba08bab16c423be.jpg │ ├── 8973b73843e90120de5f556d5084eb49.jpg │ ├── 8c3fdcf6adcc472c7cd7a4598f96caac.jpg │ ├── 8d1654d45d287b49d6a7cbcae26c598f.jpg │ ├── 8dc8e70e19ec4318b12b16f1c5bdb879.jpg │ ├── 8f58cf98a539286a53e41582f194fbed.jpg │ ├── 901db29887d45801cb568cdd53d72a99.jpg │ ├── 905a46295f1f2a591a5d0b563d44277b.jpg │ ├── 90a1240e7489f989b9a4e5739b1efbd5.jpg │ ├── 91d663abfef497e13ec41f9300a5c354.jpg │ ├── 91deb6bcd6225e40290234462f33288a.jpg │ ├── 92df4afaf5010b135936512a39fb87d8.jpg │ ├── 946e2a245a8fae021860977280b52b44.jpg │ ├── 94b77459ef6ab620703ddb014430c700.jpg │ ├── 95594348fc6d49d2819be3d412a27e55.jpg │ ├── 962ddfc5aa5c0edc0ea500f82be01ac0.jpg │ ├── 96f213d1391dab5d9cd7f4ff68e739aa.jpg │ ├── 97fa452fc3843513b0746e76d216be78.jpg │ ├── 9af4ce81465021e68c774194432663c0.jpg │ ├── 9b41f0fbb97ef7ddd6383753e6ad1c26.jpg │ ├── 9b8d35ed3fc944be3e432c47b447f92f.jpg │ ├── 9d27515800718ff1cc0ac326899c7f77.jpg │ ├── 9d7dabd9ffa8795e12f2bcdf181e0b62.jpg │ ├── a389bc9d64e6d8eb9bc985f12054716b.jpg │ ├── a5fda7453d5707d5e8985434c789ba48.jpg │ ├── a70ad9ce54b1e6e921ef6745fcef45da.jpg │ ├── a769c068095381d9207afe431343c95c.jpg │ ├── aa2fbf6676b8fd4f67229d35f1c7c537.jpg │ ├── ad37847dfd8d9f3d99f646966f32cf30.jpg │ ├── aec897e37f71d43694de4db49ed3be3e.jpg │ ├── aed7e56b0a3e63a84e53c79df4f79b0e.jpg │ ├── aef64ee73dc1b1a03a152855f685113e.jpg │ ├── afa87c5126806e604709f243ab72848b.jpg │ ├── b288f19072faa2f8f373d5a8910c080b.jpg │ ├── b3039f057e9453e4183ed33aecf5815f.jpg │ ├── 
b3268e19b1a48f645d17d659940fb084.jpg │ ├── b4a297ef2185e28694b366bde4069858.jpg │ ├── b6bd384dd0f03237f1b1b36428d27842.jpg │ ├── b7721ad6f461509452813013157c7a5e.jpg │ ├── b7d416463ca0be2cb7caf7d373dc26cc.jpg │ ├── b7d7ca35788d7bfb804b5b230a76af8c.jpg │ ├── b8bf446d4a625497f28f2347b7ca0c92.jpg │ ├── b8c9ccb17235ad37b2b0fee18853efe6.jpg │ ├── b9eb842264e6a48a42ecf5f142e32414.jpg │ ├── baa636adac3ad30302c0a36fc2f58751.jpg │ ├── bab25b7785bf747bc1caa1442874df74.jpg │ ├── be4d423d387dbcd6a770d4bda5718082.jpg │ ├── c03bdd903e4bd06d018711d1dece0c35.jpg │ ├── c089ca6ef2f36b0394d7bcf41db78030.jpg │ ├── c1bfb9f293835166e1378720b9f206b8.jpg │ ├── c24065c33e1cca422d1ae92f57cd77c1.jpg │ ├── c421a389906a45c77337a6a68fa78a0b.jpg │ ├── c45aab6ee1f6f00de1ac3f428e62b01c.jpg │ ├── c4660874124a448ac14209f4a59e367a.jpg │ ├── c51fb942d508d4161e72d0075a5284e7.jpg │ ├── c647aced84d4783e96a244a8af78ddd2.jpg │ ├── c789e9bbaa3506dc90047b5cd487a42a.jpg │ ├── c8a2ccec457f128649ad30a2ba066a48.jpg │ ├── c9c3087ea25e6c3f848030b33b06de8f.jpg │ ├── cab981b993e03ab12309dd619da9e31d.jpg │ ├── cb0b50e4410efd78416163f37eaf1262.jpg │ ├── cb63c877ea3af266bb0f5ad6ba5e0b1d.jpg │ ├── cdcdbf84e640274f429780824ccf99ae.jpg │ ├── cedd3825782041ef84d7741e62528a42.jpg │ ├── d003fed20e7f2d040ccc24412cb854d1.jpg │ ├── d09c46ec94d638e4ddcecfbba1c11ea8.jpg │ ├── d142da9aae51c6d3c3c736fc82252862.jpg │ ├── d2f9799d371fde446e6dc8292ba07393.jpg │ ├── d3a773e713ad3244265d91b77ef7fb7e.jpg │ ├── d3b112475692c0421480c01cd029cf09.jpg │ ├── d4b213f9046b3ed8b898fac4d4aeec34.jpg │ ├── d4b34834b440d5d60f25912180e7e130.jpg │ ├── d4c847aca412080f018bab9df543ff7b.jpg │ ├── d69988406d72ad9e624d24db6b4d2838.jpg │ ├── d6c0dcf5a8894d7495e320405295cc8f.jpg │ ├── d7945700ecde92ac83058e07433755da.jpg │ ├── d7cd0e2a15aa54e4700d3dc03e6ac28d.jpg │ ├── d9195c20e19c173ec6d22c2e60a2cddb.jpg │ ├── dd0fad3141f468ebc29678d3ff86055d.jpg │ ├── e1164e5922bbcc2db8e6b23c145b8f75.jpg │ ├── e97f8315ce721d1417bc7bb3b4a9d332.jpg │ ├── eacebbc96f1d97c47d903d7981ce1167.jpg │ ├── ec9e0b7231caed693477682311612304.jpg │ ├── ed008064e9d0e55dc93f673b9aca6b65.jpg │ ├── edb67528127916e7e274addf9ad96029.jpg │ ├── eea23835a8abd9d903f56256c18cf8aa.jpg │ ├── f18ecec7a6c176301d7370e41a0a60dd.jpg │ ├── f3f89822d498eea24c520e0ab3cb6b0d.jpg │ ├── f4e95f92187a42f257864cd22193c8ad.jpg │ ├── f5832d90e75d18f501ede7acb0b6ce74.jpg │ ├── f80c359151c40c9277e2d70f38856eab.jpg │ ├── f9711a0b52dcab7b1173e08ac154cdb4.jpg │ ├── fad9e18cebad821450ed0f34abdb3988.jpg │ ├── fcc4c8c5db1d6aa3ff080466e10ccb74.jpg │ ├── fdfe96b0b4fdfbfd862a698dc64ce34a.jpg │ └── fef76f108c095f250d8e9efb4cfcb710.jpg └── styles └── ebook.css /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | .DS_Store 103 | 104 | # gitbook 105 | _book 106 | 107 | # node.js 108 | node_modules 109 | 110 | # windows 111 | Thumbs.db 112 | 113 | # word 114 | ~$*.docx 115 | ~$*.doc 116 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International Public License (CC BY-NC-SA 4.0) 2 | 3 | Copyright © 2020 ApacheCN(apachecn@163.com) 4 | 5 | By exercising the Licensed Rights (defined below), You accept and agree to be bound by the terms and conditions of this Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International Public License ("Public License"). To the extent this Public License may be interpreted as a contract, You are granted the Licensed Rights in consideration of Your acceptance of these terms and conditions, and the Licensor grants You such rights in consideration of benefits the Licensor receives from making the Licensed Material available under these terms and conditions. 6 | 7 | Section 1 – Definitions. 8 | 9 | a. Adapted Material means material subject to Copyright and Similar Rights that is derived from or based upon the Licensed Material and in which the Licensed Material is translated, altered, arranged, transformed, or otherwise modified in a manner requiring permission under the Copyright and Similar Rights held by the Licensor. For purposes of this Public License, where the Licensed Material is a musical work, performance, or sound recording, Adapted Material is always produced where the Licensed Material is synched in timed relation with a moving image. 10 | b. Adapter's License means the license You apply to Your Copyright and Similar Rights in Your contributions to Adapted Material in accordance with the terms and conditions of this Public License. 11 | c. BY-NC-SA Compatible License means a license listed at creativecommons.org/compatiblelicenses, approved by Creative Commons as essentially the equivalent of this Public License. 12 | d. Copyright and Similar Rights means copyright and/or similar rights closely related to copyright including, without limitation, performance, broadcast, sound recording, and Sui Generis Database Rights, without regard to how the rights are labeled or categorized. For purposes of this Public License, the rights specified in Section 2(b)(1)-(2) are not Copyright and Similar Rights. 13 | e. 
Effective Technological Measures means those measures that, in the absence of proper authority, may not be circumvented under laws fulfilling obligations under Article 11 of the WIPO Copyright Treaty adopted on December 20, 1996, and/or similar international agreements. 14 | f. Exceptions and Limitations means fair use, fair dealing, and/or any other exception or limitation to Copyright and Similar Rights that applies to Your use of the Licensed Material. 15 | g. License Elements means the license attributes listed in the name of a Creative Commons Public License. The License Elements of this Public License are Attribution, NonCommercial, and ShareAlike. 16 | h. Licensed Material means the artistic or literary work, database, or other material to which the Licensor applied this Public License. 17 | i. Licensed Rights means the rights granted to You subject to the terms and conditions of this Public License, which are limited to all Copyright and Similar Rights that apply to Your use of the Licensed Material and that the Licensor has authority to license. 18 | j. Licensor means the individual(s) or entity(ies) granting rights under this Public License. 19 | k. NonCommercial means not primarily intended for or directed towards commercial advantage or monetary compensation. For purposes of this Public License, the exchange of the Licensed Material for other material subject to Copyright and Similar Rights by digital file-sharing or similar means is NonCommercial provided there is no payment of monetary compensation in connection with the exchange. 20 | l. Share means to provide material to the public by any means or process that requires permission under the Licensed Rights, such as reproduction, public display, public performance, distribution, dissemination, communication, or importation, and to make material available to the public including in ways that members of the public may access the material from a place and at a time individually chosen by them. 21 | m. Sui Generis Database Rights means rights other than copyright resulting from Directive 96/9/EC of the European Parliament and of the Council of 11 March 1996 on the legal protection of databases, as amended and/or succeeded, as well as other essentially equivalent rights anywhere in the world. 22 | n. You means the individual or entity exercising the Licensed Rights under this Public License. Your has a corresponding meaning. 23 | 24 | Section 2 – Scope. 25 | 26 | a. License grant. 27 | 1. Subject to the terms and conditions of this Public License, the Licensor hereby grants You a worldwide, royalty-free, non-sublicensable, non-exclusive, irrevocable license to exercise the Licensed Rights in the Licensed Material to: 28 | A. reproduce and Share the Licensed Material, in whole or in part, for NonCommercial purposes only; and 29 | B. produce, reproduce, and Share Adapted Material for NonCommercial purposes only. 30 | 2. Exceptions and Limitations. For the avoidance of doubt, where Exceptions and Limitations apply to Your use, this Public License does not apply, and You do not need to comply with its terms and conditions. 31 | 3. Term. The term of this Public License is specified in Section 6(a). 32 | 4. Media and formats; technical modifications allowed. The Licensor authorizes You to exercise the Licensed Rights in all media and formats whether now known or hereafter created, and to make technical modifications necessary to do so. 
The Licensor waives and/or agrees not to assert any right or authority to forbid You from making technical modifications necessary to exercise the Licensed Rights, including technical modifications necessary to circumvent Effective Technological Measures. For purposes of this Public License, simply making modifications authorized by this Section 2(a)(4) never produces Adapted Material. 33 | 5. Downstream recipients. 34 | A. Offer from the Licensor – Licensed Material. Every recipient of the Licensed Material automatically receives an offer from the Licensor to exercise the Licensed Rights under the terms and conditions of this Public License. 35 | B. Additional offer from the Licensor – Adapted Material. Every recipient of Adapted Material from You automatically receives an offer from the Licensor to exercise the Licensed Rights in the Adapted Material under the conditions of the Adapter’s License You apply. 36 | C. No downstream restrictions. You may not offer or impose any additional or different terms or conditions on, or apply any Effective Technological Measures to, the Licensed Material if doing so restricts exercise of the Licensed Rights by any recipient of the Licensed Material. 37 | 6. No endorsement. Nothing in this Public License constitutes or may be construed as permission to assert or imply that You are, or that Your use of the Licensed Material is, connected with, or sponsored, endorsed, or granted official status by, the Licensor or others designated to receive attribution as provided in Section 3(a)(1)(A)(i). 38 | b. Other rights. 39 | 1. Moral rights, such as the right of integrity, are not licensed under this Public License, nor are publicity, privacy, and/or other similar personality rights; however, to the extent possible, the Licensor waives and/or agrees not to assert any such rights held by the Licensor to the limited extent necessary to allow You to exercise the Licensed Rights, but not otherwise. 40 | 2. Patent and trademark rights are not licensed under this Public License. 41 | 3. To the extent possible, the Licensor waives any right to collect royalties from You for the exercise of the Licensed Rights, whether directly or through a collecting society under any voluntary or waivable statutory or compulsory licensing scheme. In all other cases the Licensor expressly reserves any right to collect such royalties, including when the Licensed Material is used other than for NonCommercial purposes. 42 | 43 | Section 3 – License Conditions. 44 | 45 | Your exercise of the Licensed Rights is expressly made subject to the following conditions. 46 | 47 | a. Attribution. 48 | 1. If You Share the Licensed Material (including in modified form), You must: 49 | A. retain the following if it is supplied by the Licensor with the Licensed Material: 50 | i. identification of the creator(s) of the Licensed Material and any others designated to receive attribution, in any reasonable manner requested by the Licensor (including by pseudonym if designated); 51 | ii. a copyright notice; 52 | iii. a notice that refers to this Public License; 53 | iv. a notice that refers to the disclaimer of warranties; 54 | v. a URI or hyperlink to the Licensed Material to the extent reasonably practicable; 55 | B. indicate if You modified the Licensed Material and retain an indication of any previous modifications; and 56 | C. indicate the Licensed Material is licensed under this Public License, and include the text of, or the URI or hyperlink to, this Public License. 57 | 2. 
You may satisfy the conditions in Section 3(a)(1) in any reasonable manner based on the medium, means, and context in which You Share the Licensed Material. For example, it may be reasonable to satisfy the conditions by providing a URI or hyperlink to a resource that includes the required information. 58 | 3. If requested by the Licensor, You must remove any of the information required by Section 3(a)(1)(A) to the extent reasonably practicable. 59 | b. ShareAlike. 60 | In addition to the conditions in Section 3(a), if You Share Adapted Material You produce, the following conditions also apply. 61 | 1. The Adapter’s License You apply must be a Creative Commons license with the same License Elements, this version or later, or a BY-NC-SA Compatible License. 62 | 2. You must include the text of, or the URI or hyperlink to, the Adapter's License You apply. You may satisfy this condition in any reasonable manner based on the medium, means, and context in which You Share Adapted Material. 63 | 3. You may not offer or impose any additional or different terms or conditions on, or apply any Effective Technological Measures to, Adapted Material that restrict exercise of the rights granted under the Adapter's License You apply. 64 | 65 | Section 4 – Sui Generis Database Rights. 66 | 67 | Where the Licensed Rights include Sui Generis Database Rights that apply to Your use of the Licensed Material: 68 | 69 | a. for the avoidance of doubt, Section 2(a)(1) grants You the right to extract, reuse, reproduce, and Share all or a substantial portion of the contents of the database for NonCommercial purposes only; 70 | b. if You include all or a substantial portion of the database contents in a database in which You have Sui Generis Database Rights, then the database in which You have Sui Generis Database Rights (but not its individual contents) is Adapted Material, including for purposes of Section 3(b); and 71 | c. You must comply with the conditions in Section 3(a) if You Share all or a substantial portion of the contents of the database. 72 | 73 | For the avoidance of doubt, this Section 4 supplements and does not replace Your obligations under this Public License where the Licensed Rights include other Copyright and Similar Rights. 74 | 75 | Section 5 – Disclaimer of Warranties and Limitation of Liability. 76 | 77 | a. Unless otherwise separately undertaken by the Licensor, to the extent possible, the Licensor offers the Licensed Material as-is and as-available, and makes no representations or warranties of any kind concerning the Licensed Material, whether express, implied, statutory, or other. This includes, without limitation, warranties of title, merchantability, fitness for a particular purpose, non-infringement, absence of latent or other defects, accuracy, or the presence or absence of errors, whether or not known or discoverable. Where disclaimers of warranties are not allowed in full or in part, this disclaimer may not apply to You. 78 | b. To the extent possible, in no event will the Licensor be liable to You on any legal theory (including, without limitation, negligence) or otherwise for any direct, special, indirect, incidental, consequential, punitive, exemplary, or other losses, costs, expenses, or damages arising out of this Public License or use of the Licensed Material, even if the Licensor has been advised of the possibility of such losses, costs, expenses, or damages. Where a limitation of liability is not allowed in full or in part, this limitation may not apply to You. 79 | c. 
The disclaimer of warranties and limitation of liability provided above shall be interpreted in a manner that, to the extent possible, most closely approximates an absolute disclaimer and waiver of all liability.

Section 6 – Term and Termination.

a. This Public License applies for the term of the Copyright and Similar Rights licensed here. However, if You fail to comply with this Public License, then Your rights under this Public License terminate automatically.
b. Where Your right to use the Licensed Material has terminated under Section 6(a), it reinstates:
1. automatically as of the date the violation is cured, provided it is cured within 30 days of Your discovery of the violation; or
2. upon express reinstatement by the Licensor.
For the avoidance of doubt, this Section 6(b) does not affect any right the Licensor may have to seek remedies for Your violations of this Public License.
c. For the avoidance of doubt, the Licensor may also offer the Licensed Material under separate terms or conditions or stop distributing the Licensed Material at any time; however, doing so will not terminate this Public License.
d. Sections 1, 5, 6, 7, and 8 survive termination of this Public License.

Section 7 – Other Terms and Conditions.

a. The Licensor shall not be bound by any additional or different terms or conditions communicated by You unless expressly agreed.
b. Any arrangements, understandings, or agreements regarding the Licensed Material not stated herein are separate from and independent of the terms and conditions of this Public License.

Section 8 – Interpretation.

a. For the avoidance of doubt, this Public License does not, and shall not be interpreted to, reduce, limit, restrict, or impose conditions on any use of the Licensed Material that could lawfully be made without permission under this Public License.
b. To the extent possible, if any provision of this Public License is deemed unenforceable, it shall be automatically reformed to the minimum extent necessary to make it enforceable. If the provision cannot be reformed, it shall be severed from this Public License without affecting the enforceability of the remaining terms and conditions.
c. No term or condition of this Public License will be waived and no failure to comply consented to unless expressly agreed to by the Licensor.
d. Nothing in this Public License constitutes or may be interpreted as a limitation upon, or waiver of, any privileges and immunities that apply to the Licensor or You, including from the legal processes of any jurisdiction or authority.
--------------------------------------------------------------------------------

/cover.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/apachecn/learning-pyspark-zh/13ee6f83f71412586d131acede7417d4f90c18fd/cover.jpg

--------------------------------------------------------------------------------

/docs/1.md:
--------------------------------------------------------------------------------
# 1\. Preface

## 1.1\. About

### 1.1.1\. About this note
This is the shared repository for the [PySpark learning handbook](https://github.com/runawayhorse001/LearningApacheSpark). The first version was posted on the Github of [ChenFeng](https://mingchen0919.github.io/learning-apache-spark/index.html) ([[Feng2017]](reference.html#feng2017)). This shared repository mainly contains the self-learning and self-teaching notes from Wenqiang during his [IMA Data Science Fellowship](https://www.ima.umn.edu/2016-2017/SW1.23-3.10.17#). Readers can refer to the repository [apachecn/learning-pyspark-zh](https://github.com/apachecn/learning-pyspark-zh) for more details about the `dataset` and the `.ipynb` files.

In this repository, I try to use detailed demo code and examples to show how to use each main function. If you find that your work is not cited in this material, please feel free to let me know.

Although I am by no means an expert in data mining programming or big data, I have decided to share my insights on PySpark programming in the form of simple tutorials with detailed examples. I hope these tutorials will be a valuable tool for your studies.

The tutorials assume that the reader has preliminary knowledge of programming and Linux. The English documentation was automatically generated with [sphinx](http://sphinx.pocoo.org).

### 1.1.2\. About the author

* **Wenqiang Feng**

  * Data Scientist and Mathematics PhD
  * University of Tennessee, Knoxville
  * Email: [von198@gmail.com](mailto:von198%40gmail.com)

* **Biography**

  Wenqiang Feng is a Data Scientist in the Applied Analytics Group at DST. Dr. Feng's responsibilities include providing DST clients with state-of-the-art skills and technologies, including big data analytic solutions, advanced analytic and data enhancement techniques, and modeling.

  Dr. Feng has deep analytic expertise in data mining, analytic systems, machine learning algorithms, business intelligence, and applying big data tools to strategically solve industry problems in a cross-functional business. Before joining DST, Dr. Feng was a Data Science Fellow at the Institute for Mathematics and its Applications (IMA) at the University of Minnesota. There, he helped startup companies make marketing decisions based on deep predictive analytics.

  Dr. Feng graduated from the University of Tennessee, Knoxville, with a PhD in Computational Mathematics and a Master's degree in Statistics. He also holds a Master's degree in Computational Mathematics from Missouri University of Science and Technology (MST) and a Master's degree in Applied Mathematics from the University of Science and Technology of China (USTC).

* **Declaration**

  Feng's work was supported by IMA during his time there. However, any opinions, findings, conclusions or recommendations expressed in this material are those of the author and do not necessarily reflect the views of the IMA, UTK or DST.

## 1.2\. Motivation for this tutorial

I was motivated by the [IMA Data Science Fellowship](https://www.ima.umn.edu/2016-2017/SW1.23-3.10.17#) project to learn PySpark. After that, I was deeply impressed by PySpark. I feel that:

> 1. It is no exaggeration to say that Spark is the most powerful big data tool.
> 2. However, I still found learning Spark a difficult process. I had to search around and figure out which answer was the right one, and it was hard to find detailed examples from which I could easily learn the complete procedure in one file.
> 3. Good resources are expensive for a graduate student.

## 1.3\. Copyright notice and license information

The code in this document is released under the [MIT license](https://github.com/runawayhorse001/LearningApacheSpark/blob/master/LICENSE), and the text under the [CC BY-NC-SA 4.0 license](https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode).

**When you plan to use, copy, modify, merge, publish, distribute or sublicense this material, please check the terms of these licenses for more details and give the author the corresponding attribution.**

## 1.4\. Acknowledgements

Here I would like to thank Ming Chen, Jian Sun and Zhongbo Li at the University of Tennessee, Knoxville, for the valuable discussions, and to thank the generous anonymous authors who provide detailed solutions and source code on the internet. Without their help, this repository could not have been built. Wenqiang would also like to thank the [Institute for Mathematics and its Applications (IMA)](https://www.ima.umn.edu/) at the [University of Minnesota, Twin Cities](https://twin-cities.umn.edu/) for the support during his IMA Data Scientist fellowship.

Special thanks go to [Dr. Haiping Lu](http://staffwww.dcs.shef.ac.uk/people/H.Lu/), Lecturer in Machine Learning in the Department of Computer Science at the University of Sheffield, for recommending and heavily using my tutorial in his teaching and for providing valuable suggestions.

## 1.5\. Feedback and suggestions

Your comments and suggestions are highly appreciated. I am more than happy to receive corrections, suggestions or feedback by email ([von198@gmail.com](mailto:von198%40gmail.com)) for improvement.
--------------------------------------------------------------------------------

/docs/11.md:
--------------------------------------------------------------------------------
# 11\. Clustering

Chinese proverb

Sharpening the knife longer can make it easier to chop the firewood – old Chinese proverb

![https://runawayhorse001.github.io/LearningApacheSpark/_images/clustering_logo.png](img/eacebbc96f1d97c47d903d7981ce1167.jpg)

The above figure was generated by the code from: [Python Data Science Handbook](https://jakevdp.github.io/PythonDataScienceHandbook/06.00-figure-code.html#Expectation-Maximization).

## 11.1\. K-Means Model

### 11.1.1\. Introduction

k-means clustering is a method of vector quantization, originally from signal processing, that is popular for cluster analysis in data mining. The approach k-means follows to solve the problem is called **Expectation-Maximization**. It can be described as follows:

1. Assign some cluster centers

2. Repeat until converged

> * E-step: assign points to the nearest center
> * M-step: set the cluster center to the mean
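To make the two-step loop above concrete before moving to the PySpark demo, here is a minimal plain-NumPy sketch of the E-step/M-step iteration. This example is an added illustration rather than part of the original notebook; the toy blobs, `k=3`, the seed and the tolerance are arbitrary assumptions:

```
import numpy as np

def kmeans_em(X, k=3, max_iter=100, tol=1e-6, seed=1):
    """Minimal k-means via the E-step / M-step loop described above."""
    rng = np.random.default_rng(seed)
    # initialization: pick k distinct points as the starting cluster centers
    centers = X[rng.choice(len(X), size=k, replace=False)]
    for _ in range(max_iter):
        # E-step: assign every point to its nearest center
        dist = np.linalg.norm(X[:, None, :] - centers[None, :, :], axis=2)
        labels = dist.argmin(axis=1)
        # M-step: move each center to the mean of the points assigned to it
        new_centers = np.array([X[labels == j].mean(axis=0) for j in range(k)])
        if np.linalg.norm(new_centers - centers) < tol:  # converged
            break
        centers = new_centers
    return centers, labels

# toy data: three well-separated 2-D Gaussian blobs
rng = np.random.default_rng(0)
X = np.vstack([rng.normal(loc=c, scale=0.3, size=(50, 2))
               for c in ([0, 0], [3, 3], [0, 3])])
centers, labels = kmeans_em(X, k=3)
print(centers)

```

The PySpark `KMeans` estimator used below follows the same assign-then-recompute idea, but distributes the two steps across the cluster.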
Given a set of observations ![(x_1, x_2, \cdots, x_m)](img/290e0b58c66f2b75c67fd1a15e3fe958.jpg), the objective function is

![J = \sum_{i=1}^{m}\sum_{k=1}^{K}w_{ik} ||x_i-c_k||^2](img/0d2c607e00ca608222b80fa6b61e780a.jpg)

where ![w_{ik}=1](img/a769c068095381d9207afe431343c95c.jpg) if ![x_i](img/82e0633e9121ff663a913eb95a3dd723.jpg) is in cluster ![k](img/739c6ec939fd446ba1cde4cf4620512a.jpg); otherwise ![w_{ik}=0](img/7a9506c9bd23ed8b08861cd51eaf5cc3.jpg), and ![c_k](img/cab981b993e03ab12309dd619da9e31d.jpg) is the centroid of ![x_i](img/82e0633e9121ff663a913eb95a3dd723.jpg)'s cluster.

Mathematically, k-means is a minimization problem with two parts: first, we minimize ![J](img/d6c0dcf5a8894d7495e320405295cc8f.jpg) w.r.t. ![w_{ik}](img/c647aced84d4783e96a244a8af78ddd2.jpg) with ![c_k](img/cab981b993e03ab12309dd619da9e31d.jpg) fixed; then we minimize ![J](img/d6c0dcf5a8894d7495e320405295cc8f.jpg) w.r.t. ![c_k](img/cab981b993e03ab12309dd619da9e31d.jpg) with ![w_{ik}](img/c647aced84d4783e96a244a8af78ddd2.jpg) fixed, i.e.

**E-step**:

![\frac{\partial J}{\partial w_{ik}} = \sum_{i=1}^{m}\sum_{k=1}^{K} ||x_i-c_k||^2 \Rightarrow w_{ik} =\left\{ \begin{array}{ll} 1, & \text{ if } k = argmin_{j} ||x_i-c_j||^2 \\ 0, & \text{ otherwise } \end{array} \right.](img/c421a389906a45c77337a6a68fa78a0b.jpg)

**M-step**:

![\frac{\partial J}{\partial c_k} = 2\sum_{i=1}^{m} w_{ik}(x_i-c_k) =0 \Rightarrow c_k = \frac{\sum_{i=1}^{m}w_{ik}x_i}{\sum_{i=1}^{m}w_{ik}}](img/62e9e7ead57d4e2a4be61668d6aff334.jpg)

### 11.1.2\. Demo

1. Set up spark context and SparkSession

```
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("Python Spark K-means example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

```

1. Load dataset

```
df = spark.read.format('com.databricks.spark.csv').\
                       options(header='true', \
                       inferschema='true').\
                       load("../data/iris.csv",header=True)

```

check the data set

```
df.show(5,True)
df.printSchema()

```

Then you will get

```
+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|species|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| setosa|
|         4.9|        3.0|         1.4|        0.2| setosa|
|         4.7|        3.2|         1.3|        0.2| setosa|
|         4.6|        3.1|         1.5|        0.2| setosa|
|         5.0|        3.6|         1.4|        0.2| setosa|
+------------+-----------+------------+-----------+-------+
only showing top 5 rows

root
 |-- sepal_length: double (nullable = true)
 |-- sepal_width: double (nullable = true)
 |-- petal_length: double (nullable = true)
 |-- petal_width: double (nullable = true)
 |-- species: string (nullable = true)

```

You can also get the statistical results from the data frame (unfortunately, it only works for numerical columns).
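As a small aside that is not part of the original demo: for the categorical `species` column, a quick frequency summary can be obtained with a `groupBy` count; the numerical `describe()` summary from the original demo then follows right below.

```
# frequency table for the categorical column (added aside)
df.groupBy('species').count().show()

```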
104 | 105 | ``` 106 | df.describe().show() 107 | 108 | ``` 109 | 110 | Then you will get 111 | 112 | ``` 113 | +-------+------------------+-------------------+------------------+------------------+---------+ 114 | |summary| sepal_length| sepal_width| petal_length| petal_width| species| 115 | +-------+------------------+-------------------+------------------+------------------+---------+ 116 | | count| 150| 150| 150| 150| 150| 117 | | mean| 5.843333333333335| 3.0540000000000007|3.7586666666666693|1.1986666666666672| null| 118 | | stddev|0.8280661279778637|0.43359431136217375| 1.764420419952262|0.7631607417008414| null| 119 | | min| 4.3| 2.0| 1.0| 0.1| setosa| 120 | | max| 7.9| 4.4| 6.9| 2.5|virginica| 121 | +-------+------------------+-------------------+------------------+------------------+---------+ 122 | 123 | ``` 124 | 125 | 1. Convert the data to dense vector (**features**) 126 | 127 | ``` 128 | # convert the data to dense vector 129 | def transData(data): 130 | return data.rdd.map(lambda r: [Vectors.dense(r[:-1])]).toDF(['features']) 131 | 132 | ``` 133 | 134 | Note 135 | 136 | You are strongly encouraged to try my `get_dummy` function for dealing with the categorical data in complex dataset. 137 | 138 | Supervised learning version: 139 | 140 | > ``` 141 | > def get_dummy(df,indexCol,categoricalCols,continuousCols,labelCol): 142 | > 143 | > from pyspark.ml import Pipeline 144 | > from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler 145 | > from pyspark.sql.functions import col 146 | > 147 | > indexers = [ StringIndexer(inputCol=c, outputCol="{0}_indexed".format(c)) 148 | > for c in categoricalCols ] 149 | > 150 | > # default setting: dropLast=True 151 | > encoders = [ OneHotEncoder(inputCol=indexer.getOutputCol(), 152 | > outputCol="{0}_encoded".format(indexer.getOutputCol())) 153 | > for indexer in indexers ] 154 | > 155 | > assembler = VectorAssembler(inputCols=[encoder.getOutputCol() for encoder in encoders] 156 | > + continuousCols, outputCol="features") 157 | > 158 | > pipeline = Pipeline(stages=indexers + encoders + [assembler]) 159 | > 160 | > model=pipeline.fit(df) 161 | > data = model.transform(df) 162 | > 163 | > data = data.withColumn('label',col(labelCol)) 164 | > 165 | > return data.select(indexCol,'features','label') 166 | > 167 | > ``` 168 | 169 | Unsupervised learning version: 170 | 171 | > ``` 172 | > def get_dummy(df,indexCol,categoricalCols,continuousCols): 173 | > ''' 174 | > Get dummy variables and concat with continuous variables for unsupervised learning. 
175 | > :param df: the dataframe 176 | > :param categoricalCols: the name list of the categorical data 177 | > :param continuousCols: the name list of the numerical data 178 | > :return k: feature matrix 179 | > 180 | > :author: Wenqiang Feng 181 | > :email: von198@gmail.com 182 | > ''' 183 | > 184 | > indexers = [ StringIndexer(inputCol=c, outputCol="{0}_indexed".format(c)) 185 | > for c in categoricalCols ] 186 | > 187 | > # default setting: dropLast=True 188 | > encoders = [ OneHotEncoder(inputCol=indexer.getOutputCol(), 189 | > outputCol="{0}_encoded".format(indexer.getOutputCol())) 190 | > for indexer in indexers ] 191 | > 192 | > assembler = VectorAssembler(inputCols=[encoder.getOutputCol() for encoder in encoders] 193 | > + continuousCols, outputCol="features") 194 | > 195 | > pipeline = Pipeline(stages=indexers + encoders + [assembler]) 196 | > 197 | > model=pipeline.fit(df) 198 | > data = model.transform(df) 199 | > 200 | > return data.select(indexCol,'features') 201 | > 202 | > ``` 203 | 204 | 1. Transform the dataset to DataFrame 205 | 206 | ``` 207 | transformed= transData(df) 208 | transformed.show(5, False) 209 | 210 | ``` 211 | 212 | ``` 213 | +-----------------+ 214 | |features | 215 | +-----------------+ 216 | |[5.1,3.5,1.4,0.2]| 217 | |[4.9,3.0,1.4,0.2]| 218 | |[4.7,3.2,1.3,0.2]| 219 | |[4.6,3.1,1.5,0.2]| 220 | |[5.0,3.6,1.4,0.2]| 221 | +-----------------+ 222 | only showing top 5 rows 223 | 224 | ``` 225 | 226 | 1. Deal With Categorical Variables 227 | 228 | ``` 229 | from pyspark.ml import Pipeline 230 | from pyspark.ml.regression import LinearRegression 231 | from pyspark.ml.feature import VectorIndexer 232 | from pyspark.ml.evaluation import RegressionEvaluator 233 | 234 | # Automatically identify categorical features, and index them. 235 | # We specify maxCategories so features with > 4 distinct values are treated as continuous. 236 | 237 | featureIndexer = VectorIndexer(inputCol="features", \ 238 | outputCol="indexedFeatures",\ 239 | maxCategories=4).fit(transformed) 240 | 241 | data = featureIndexer.transform(transformed) 242 | 243 | ``` 244 | 245 | Now you check your dataset with 246 | 247 | ``` 248 | data.show(5,True) 249 | 250 | ``` 251 | 252 | you will get 253 | 254 | ``` 255 | +-----------------+-----------------+ 256 | | features| indexedFeatures| 257 | +-----------------+-----------------+ 258 | |[5.1,3.5,1.4,0.2]|[5.1,3.5,1.4,0.2]| 259 | |[4.9,3.0,1.4,0.2]|[4.9,3.0,1.4,0.2]| 260 | |[4.7,3.2,1.3,0.2]|[4.7,3.2,1.3,0.2]| 261 | |[4.6,3.1,1.5,0.2]|[4.6,3.1,1.5,0.2]| 262 | |[5.0,3.6,1.4,0.2]|[5.0,3.6,1.4,0.2]| 263 | +-----------------+-----------------+ 264 | only showing top 5 rows 265 | 266 | ``` 267 | 268 | Note 269 | 270 | Since clustering algorithms including k-means use distance-based measurements to determine the similarity between data points, It’s strongly recommended to standardize the data to have a mean of zero and a standard deviation of one. 271 | 272 | 1. 
Elbow method to determine the optimal number of clusters for k-means clustering

```
import numpy as np
from pyspark.ml.clustering import KMeans

cost = np.zeros(20)
for k in range(2,20):
    kmeans = KMeans()\
        .setK(k)\
        .setSeed(1) \
        .setFeaturesCol("indexedFeatures")\
        .setPredictionCol("cluster")

    model = kmeans.fit(data)
    cost[k] = model.computeCost(data) # requires Spark 2.0 or later

```

```
import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import seaborn as sbs
from matplotlib.ticker import MaxNLocator

fig, ax = plt.subplots(1,1, figsize =(8,6))
ax.plot(range(2,20),cost[2:20])
ax.set_xlabel('k')
ax.set_ylabel('cost')
ax.xaxis.set_major_locator(MaxNLocator(integer=True))
plt.show()

```

![https://runawayhorse001.github.io/LearningApacheSpark/_images/elbow.png](img/92df4afaf5010b135936512a39fb87d8.jpg)

In my opinion, it is sometimes hard to choose the optimal number of clusters with the `elbow method`. As shown in the following figure, you could choose 3, 5 or even 8\. I will choose `3` in this demo.

![https://runawayhorse001.github.io/LearningApacheSpark/_images/elbow_rfm.png](img/d4b213f9046b3ed8b898fac4d4aeec34.jpg)

* Silhouette analysis

```
#PySpark libraries
import time                 # needed for the timing below
import numpy as np          # needed for np.arange / np.asanyarray / np.argmax
import pandas as pd         # needed for the results tables

from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.sql.functions import col, percent_rank, lit
from pyspark.sql.window import Window
from pyspark.sql import DataFrame, Row
from pyspark.sql.types import StructType
from functools import reduce # For Python 3.x

from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

def optimal_k(df_in,index_col,k_min, k_max,num_runs):
    '''
    Determine the optimal number of clusters by using Silhouette Score Analysis.
    :param df_in: the input dataframe
    :param index_col: the name of the index column
    :param k_min: the minimum number of the clusters
    :param k_max: the maximum number of the clusters
    :param num_runs: the number of runs for each fixed number of clusters

    :return k: optimal number of the clusters
    :return silh_lst: Silhouette score
    :return r_table: the running results table

    :author: Wenqiang Feng
    :email: von198@gmail.com
    '''

    start = time.time()
    silh_lst = []
    k_lst = np.arange(k_min, k_max+1)

    r_table = df_in.select(index_col).toPandas()
    r_table = r_table.set_index(index_col)
    centers = pd.DataFrame()

    for k in k_lst:
        silh_val = []
        for run in np.arange(1, num_runs+1):

            # Trains a k-means model.
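            # Each run below draws a fresh random seed, so the k-means
            # initialization -- and hence the silhouette score -- can vary
            # from run to run; the scores are averaged over `num_runs`
            # runs further down to smooth out this variability.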
357 | kmeans = KMeans()\ 358 | .setK(k)\ 359 | .setSeed(int(np.random.randint(100, size=1))) 360 | model = kmeans.fit(df_in) 361 | 362 | # Make predictions 363 | predictions = model.transform(df_in) 364 | r_table['cluster_{k}_{run}'.format(k=k, run=run)]= predictions.select('prediction').toPandas() 365 | 366 | # Evaluate clustering by computing Silhouette score 367 | evaluator = ClusteringEvaluator() 368 | silhouette = evaluator.evaluate(predictions) 369 | silh_val.append(silhouette) 370 | 371 | silh_array=np.asanyarray(silh_val) 372 | silh_lst.append(silh_array.mean()) 373 | 374 | elapsed = time.time() - start 375 | 376 | silhouette = pd.DataFrame(list(zip(k_lst,silh_lst)),columns = ['k', 'silhouette']) 377 | 378 | print('+------------------------------------------------------------+') 379 | print("| The finding optimal k phase took %8.0f s. |" %(elapsed)) 380 | print('+------------------------------------------------------------+') 381 | 382 | return k_lst[np.argmax(silh_lst, axis=0)], silhouette , r_table 383 | 384 | ``` 385 | 386 | ``` 387 | k, silh_lst, r_table = optimal_k(scaledData,index_col,k_min, k_max,num_runs) 388 | 389 | +------------------------------------------------------------+ 390 | | The finding optimal k phase took 1783 s. | 391 | +------------------------------------------------------------+ 392 | 393 | ``` 394 | 395 | ``` 396 | spark.createDataFrame(silh_lst).show() 397 | 398 | +---+------------------+ 399 | | k| silhouette| 400 | +---+------------------+ 401 | | 3|0.8045154385557953| 402 | | 4|0.6993528775512052| 403 | | 5|0.6689286654221447| 404 | | 6|0.6356184024841809| 405 | | 7|0.7174102265711756| 406 | | 8|0.6720861758298997| 407 | | 9| 0.601771359881241| 408 | | 10|0.6292447334578428| 409 | +---+------------------+ 410 | 411 | ``` 412 | 413 | From the silhouette list, we can choose `3` as the optimal number of the clusters. 414 | 415 | Warning 416 | 417 | `ClusteringEvaluator` in `pyspark.ml.evaluation` requires Spark 2.4 or later!! 418 | 419 | 1. Pipeline Architecture 420 | 421 | ``` 422 | from pyspark.ml.clustering import KMeans, KMeansModel 423 | 424 | kmeans = KMeans() \ 425 | .setK(3) \ 426 | .setFeaturesCol("indexedFeatures")\ 427 | .setPredictionCol("cluster") 428 | 429 | # Chain indexer and tree in a Pipeline 430 | pipeline = Pipeline(stages=[featureIndexer, kmeans]) 431 | 432 | model = pipeline.fit(transformed) 433 | 434 | cluster = model.transform(transformed) 435 | 436 | ``` 437 | 438 | 1. 
k-means clusters 439 | 440 | ``` 441 | cluster = model.transform(transformed) 442 | 443 | ``` 444 | 445 | ``` 446 | +-----------------+-----------------+-------+ 447 | | features| indexedFeatures|cluster| 448 | +-----------------+-----------------+-------+ 449 | |[5.1,3.5,1.4,0.2]|[5.1,3.5,1.4,0.2]| 1| 450 | |[4.9,3.0,1.4,0.2]|[4.9,3.0,1.4,0.2]| 1| 451 | |[4.7,3.2,1.3,0.2]|[4.7,3.2,1.3,0.2]| 1| 452 | |[4.6,3.1,1.5,0.2]|[4.6,3.1,1.5,0.2]| 1| 453 | |[5.0,3.6,1.4,0.2]|[5.0,3.6,1.4,0.2]| 1| 454 | |[5.4,3.9,1.7,0.4]|[5.4,3.9,1.7,0.4]| 1| 455 | |[4.6,3.4,1.4,0.3]|[4.6,3.4,1.4,0.3]| 1| 456 | |[5.0,3.4,1.5,0.2]|[5.0,3.4,1.5,0.2]| 1| 457 | |[4.4,2.9,1.4,0.2]|[4.4,2.9,1.4,0.2]| 1| 458 | |[4.9,3.1,1.5,0.1]|[4.9,3.1,1.5,0.1]| 1| 459 | |[5.4,3.7,1.5,0.2]|[5.4,3.7,1.5,0.2]| 1| 460 | |[4.8,3.4,1.6,0.2]|[4.8,3.4,1.6,0.2]| 1| 461 | |[4.8,3.0,1.4,0.1]|[4.8,3.0,1.4,0.1]| 1| 462 | |[4.3,3.0,1.1,0.1]|[4.3,3.0,1.1,0.1]| 1| 463 | |[5.8,4.0,1.2,0.2]|[5.8,4.0,1.2,0.2]| 1| 464 | |[5.7,4.4,1.5,0.4]|[5.7,4.4,1.5,0.4]| 1| 465 | |[5.4,3.9,1.3,0.4]|[5.4,3.9,1.3,0.4]| 1| 466 | |[5.1,3.5,1.4,0.3]|[5.1,3.5,1.4,0.3]| 1| 467 | |[5.7,3.8,1.7,0.3]|[5.7,3.8,1.7,0.3]| 1| 468 | |[5.1,3.8,1.5,0.3]|[5.1,3.8,1.5,0.3]| 1| 469 | +-----------------+-----------------+-------+ 470 | only showing top 20 rows 471 | 472 | ``` -------------------------------------------------------------------------------- /docs/14.md: -------------------------------------------------------------------------------- 1 | # 14\. Social Network Analysis 2 | 3 | Chinese proverb 4 | 5 | **A Touch of Cloth,linked in countless ways.** – old Chinese proverb 6 | 7 | ![https://runawayhorse001.github.io/LearningApacheSpark/_images/net_work.png](img/cb0b50e4410efd78416163f37eaf1262.jpg) 8 | 9 | ## 14.1\. Introduction 10 | 11 | 12 | 13 | ## 14.2\. Co-occurrence Network 14 | 15 | [Co-occurrence networks](https://en.wikipedia.org/wiki/Co-occurrence_networks) are generally used to provide a graphic visualization of potential relationships between people, organizations, concepts or other entities represented within written material. The generation and visualization of co-occurrence networks has become practical with the advent of electronically stored text amenable to text mining. 16 | 17 | ### 14.2.1\. Methodology 18 | 19 | * Build Corpus C 20 | * Build Document-Term matrix D based on Corpus C 21 | * Compute Term-Document matrix ![D^T](img/315be0f70cd0effa6c8682f2a949a46c.jpg) 22 | * Adjacency Matrix ![A =D^T\cdot D](img/e97f8315ce721d1417bc7bb3b4a9d332.jpg) 23 | 24 | There are four main components in this algorithm in the algorithm: Corpus C, Document-Term matrix D, Term-Document matrix ![D^T](img/315be0f70cd0effa6c8682f2a949a46c.jpg) and Adjacency Matrix A. In this demo part, I will show how to build those four main components. 25 | 26 | Given that we have three groups of friends, they are 27 | 28 | > ``` 29 | > +-------------------------------------+ 30 | > |words | 31 | > +-------------------------------------+ 32 | > |[[george] [jimmy] [john] [peter]] | 33 | > |[[vincent] [george] [stefan] [james]]| 34 | > |[[emma] [james] [olivia] [george]] | 35 | > +-------------------------------------+ 36 | > 37 | > ``` 38 | 39 | 1. 
Corpus C 40 | 41 | Then we can build the following corpus based on the unique elements in the given group data: 42 | 43 | > ``` 44 | > [u'george', u'james', u'jimmy', u'peter', u'stefan', u'vincent', u'olivia', u'john', u'emma'] 45 | > 46 | > ``` 47 | 48 | The corresponding elements frequency: 49 | 50 | > ![https://runawayhorse001.github.io/LearningApacheSpark/_images/demo_freq.png](img/cdcdbf84e640274f429780824ccf99ae.jpg) 51 | 52 | 1. Document-Term matrix D based on Corpus C (CountVectorizer) 53 | 54 | > ``` 55 | > from pyspark.ml.feature import CountVectorizer 56 | > count_vectorizer_wo = CountVectorizer(inputCol='term', outputCol='features') 57 | > # with total unique vocabulary 58 | > countVectorizer_mod_wo = count_vectorizer_wo.fit(df) 59 | > countVectorizer_twitter_wo = countVectorizer_mod_wo.transform(df) 60 | > # with truncated unique vocabulary (99%) 61 | > count_vectorizer = CountVectorizer(vocabSize=48,inputCol='term',outputCol='features') 62 | > countVectorizer_mod = count_vectorizer.fit(df) 63 | > countVectorizer_twitter = countVectorizer_mod.transform(df) 64 | > 65 | > ``` 66 | > 67 | > ``` 68 | > +-------------------------------+ 69 | > |features | 70 | > +-------------------------------+ 71 | > |(9,[0,2,3,7],[1.0,1.0,1.0,1.0])| 72 | > |(9,[0,1,4,5],[1.0,1.0,1.0,1.0])| 73 | > |(9,[0,1,6,8],[1.0,1.0,1.0,1.0])| 74 | > +-------------------------------+ 75 | > 76 | > ``` 77 | 78 | * Term-Document matrix ![D^T](img/315be0f70cd0effa6c8682f2a949a46c.jpg) 79 | 80 | > RDD: 81 | > 82 | > ``` 83 | > [array([ 1., 1., 1.]), array([ 0., 1., 1.]), array([ 1., 0., 0.]), 84 | > array([ 1., 0., 0.]), array([ 0., 1., 0.]), array([ 0., 1., 0.]), 85 | > array([ 0., 0., 1.]), array([ 1., 0., 0.]), array([ 0., 0., 1.])] 86 | > 87 | > ``` 88 | > 89 | > Matrix: 90 | > 91 | > ``` 92 | > array([[ 1., 1., 1.], 93 | > [ 0., 1., 1.], 94 | > [ 1., 0., 0.], 95 | > [ 1., 0., 0.], 96 | > [ 0., 1., 0.], 97 | > [ 0., 1., 0.], 98 | > [ 0., 0., 1.], 99 | > [ 1., 0., 0.], 100 | > [ 0., 0., 1.]]) 101 | > 102 | > ``` 103 | 104 | 1. Adjacency Matrix ![A =D^T\cdot D](img/e97f8315ce721d1417bc7bb3b4a9d332.jpg) 105 | 106 | > RDD: 107 | > 108 | > ``` 109 | > [array([ 1., 1., 1.]), array([ 0., 1., 1.]), array([ 1., 0., 0.]), 110 | > array([ 1., 0., 0.]), array([ 0., 1., 0.]), array([ 0., 1., 0.]), 111 | > array([ 0., 0., 1.]), array([ 1., 0., 0.]), array([ 0., 0., 1.])] 112 | > 113 | > ``` 114 | > 115 | > Matrix: 116 | > 117 | > ``` 118 | > array([[ 3., 2., 1., 1., 1., 1., 1., 1., 1.], 119 | > [ 2., 2., 0., 0., 1., 1., 1., 0., 1.], 120 | > [ 1., 0., 1., 1., 0., 0., 0., 1., 0.], 121 | > [ 1., 0., 1., 1., 0., 0., 0., 1., 0.], 122 | > [ 1., 1., 0., 0., 1., 1., 0., 0., 0.], 123 | > [ 1., 1., 0., 0., 1., 1., 0., 0., 0.], 124 | > [ 1., 1., 0., 0., 0., 0., 1., 0., 1.], 125 | > [ 1., 0., 1., 1., 0., 0., 0., 1., 0.], 126 | > [ 1., 1., 0., 0., 0., 0., 1., 0., 1.]]) 127 | > 128 | > ``` 129 | 130 | ### 14.2.2\. Coding Puzzle from my interview 131 | 132 | * Problem 133 | 134 | The attached utf-8 encoded text file contains the tags associated with an online biomedical scientific article formatted as follows (size: 100000). Each Scientific article is represented by a line in the file delimited by carriage return. 
135 | 136 | > ``` 137 | > +--------------------+ 138 | > | words| 139 | > +--------------------+ 140 | > |[ACTH Syndrome, E...| 141 | > |[Antibody Formati...| 142 | > |[Adaptation, Phys...| 143 | > |[Aerosol Propella...| 144 | > +--------------------+ 145 | > only showing top 4 rows 146 | > 147 | > ``` 148 | 149 | Write a program that, using this file as input, produces a list of pairs of tags which appear TOGETHER in any order and position in at least fifty different Scientific articles. For example, in the above sample, [Female] and [Humans] appear together twice, but every other pair appears only once. Your program should output the pair list to stdout in the same form as the input (eg tag 1, tag 2n). 150 | 151 | * My solution 152 | 153 | > The corresponding words frequency: 154 | > 155 | > > ![https://runawayhorse001.github.io/LearningApacheSpark/_images/freq_word_ze.png](img/f5832d90e75d18f501ede7acb0b6ce74.jpg) 156 | > > 157 | > > Word frequency 158 | > 159 | > Output: 160 | > 161 | > ``` 162 | > +----------+------+-------+ 163 | > | term.x|term.y| freq| 164 | > +----------+------+-------+ 165 | > | Female|Humans|16741.0| 166 | > | Male|Humans|13883.0| 167 | > | Adult|Humans|10391.0| 168 | > | Male|Female| 9806.0| 169 | > |MiddleAged|Humans| 8181.0| 170 | > | Adult|Female| 7411.0| 171 | > | Adult| Male| 7240.0| 172 | > |MiddleAged| Male| 6328.0| 173 | > |MiddleAged|Female| 6002.0| 174 | > |MiddleAged| Adult| 5944.0| 175 | > +----------+------+-------+ 176 | > only showing top 10 rows 177 | > 178 | > ``` 179 | 180 | The corresponding Co-occurrence network: 181 | 182 | > ![https://runawayhorse001.github.io/LearningApacheSpark/_images/netfreq.png](img/8c3fdcf6adcc472c7cd7a4598f96caac.jpg) 183 | > 184 | > Co-occurrence network 185 | 186 | Then you will get Figure [Co-occurrence network](#fig-netfreq) 187 | 188 | ## 14.3\. Appendix: matrix multiplication in PySpark 189 | 190 | 1. load test matrix 191 | 192 | ``` 193 | df = spark.read.csv("matrix1.txt",sep=",",inferSchema=True) 194 | df.show() 195 | 196 | ``` 197 | 198 | ``` 199 | +---+---+---+---+ 200 | |_c0|_c1|_c2|_c3| 201 | +---+---+---+---+ 202 | |1.2|3.4|2.3|1.1| 203 | |2.3|1.1|1.5|2.2| 204 | |3.3|1.8|4.5|3.3| 205 | |5.3|2.2|4.5|4.4| 206 | |9.3|8.1|0.3|5.5| 207 | |4.5|4.3|2.1|6.6| 208 | +---+---+---+---+ 209 | 210 | ``` 211 | 212 | 1. 
main function for matrix multiplication in PySpark 213 | 214 | ``` 215 | from pyspark.sql import functions as F 216 | from functools import reduce 217 | # reference: https://stackoverflow.com/questions/44348527/matrix-multiplication-at-a-in-pyspark 218 | # do the sum of the multiplication that we want, and get 219 | # one data frame for each column 220 | colDFs = [] 221 | for c2 in df.columns: 222 | colDFs.append( df.select( [ F.sum(df[c1]*df[c2]).alias("op_{0}".format(i)) for i,c1 in enumerate(df.columns) ] ) ) 223 | # now union those separate data frames to build the "matrix" 224 | mtxDF = reduce(lambda a,b: a.select(a.columns).union(b.select(a.columns)), colDFs ) 225 | mtxDF.show() 226 | 227 | ``` 228 | 229 | ``` 230 | +------------------+------------------+------------------+------------------+ 231 | | op_0| op_1| op_2| op_3| 232 | +------------------+------------------+------------------+------------------+ 233 | | 152.45|118.88999999999999| 57.15|121.44000000000001| 234 | |118.88999999999999|104.94999999999999| 38.93| 94.71| 235 | | 57.15| 38.93|52.540000000000006| 55.99| 236 | |121.44000000000001| 94.71| 55.99|110.10999999999999| 237 | +------------------+------------------+------------------+------------------+ 238 | 239 | ``` 240 | 241 | 1. Validation with python version 242 | 243 | ``` 244 | import numpy as np 245 | a = np.genfromtxt("matrix1.txt",delimiter=",") 246 | np.dot(a.T, a) 247 | 248 | ``` 249 | 250 | ``` 251 | array([[152.45, 118.89, 57.15, 121.44], 252 | [118.89, 104.95, 38.93, 94.71], 253 | [ 57.15, 38.93, 52.54, 55.99], 254 | [121.44, 94.71, 55.99, 110.11]]) 255 | 256 | ``` 257 | 258 | ## 14.4\. Correlation Network 259 | 260 | TODO .. -------------------------------------------------------------------------------- /docs/15.md: -------------------------------------------------------------------------------- 1 | # 15\. ALS: Stock Portfolio Recommendations 2 | 3 | Chinese proverb 4 | 5 | **Don’t put all your eggs in one basket.** 6 | 7 | ![https://runawayhorse001.github.io/LearningApacheSpark/_images/stock_portfolio.png](img/b4a297ef2185e28694b366bde4069858.jpg) 8 | 9 | Code for the above figure: 10 | 11 | ``` 12 | import numpy as np 13 | import matplotlib.pyplot as plt 14 | 15 | fig, ax = plt.subplots(figsize=(10, 8), subplot_kw=dict(aspect="equal")) 16 | 17 | recipe = ["375 k U.S. Large Cap Blend", 18 | "300 k U.S. Large Cap Value", 19 | "75 k U.S. Short-Term Bonds", 20 | "50 k U.S. Small Cap Blend", 21 | "55 k U.S. Small Cap Value", 22 | "95 k U.S. Real Estate", 23 | "250 k Intermediate-Term Bonds"] 24 | 25 | data = [float(x.split()[0]) for x in recipe] 26 | ingredients = [' '.join(x.split()[2:]) for x in recipe] 27 | 28 | print(data) 29 | print(ingredients) 30 | def func(pct, allvals): 31 | absolute = int(pct/100.*np.sum(allvals)) 32 | return "{:.1f}%\n({:d} k)".format(pct, absolute) 33 | 34 | explode = np.empty(len(data))#(0.1, 0.1, 0.1, 0.1, 0.1, 0.1) # explode 1st slice 35 | explode.fill(0.1) 36 | 37 | wedges, texts, autotexts = ax.pie(data, explode=explode, autopct=lambda pct: func(pct, data), 38 | textprops=dict(color="w")) 39 | ax.legend(wedges, ingredients, 40 | #title="Stock portfolio", 41 | loc="center left", 42 | bbox_to_anchor=(1, 0, 0.5, 1)) 43 | 44 | plt.setp(autotexts, size=8, weight="bold") 45 | 46 | #ax.set_title("Stock portfolio") 47 | 48 | plt.show() 49 | 50 | ``` 51 | 52 | ## 15.1\. 
Recommender systems 53 | 54 | Recommender systems or recommendation systems (sometimes replacing “system” with a synonym such as platform or engine) are a subclass of information filtering systems that seek to predict the “rating” or “preference” that a user would give to an item. 55 | 56 | The main idea is to build a users-by-items matrix `R` of rating values and try to factorize it, in order to recommend to each user the items that similar users have rated highly. A popular approach to this matrix factorization is Alternating Least Squares (ALS). 57 | 58 | ## 15.2\. Alternating Least Squares 59 | 60 | Apache Spark ML implements ALS for collaborative filtering, a very popular algorithm for making recommendations. 61 | 62 | The ALS recommender is a matrix factorization algorithm that uses Alternating Least Squares with Weighted-Lambda-Regularization (ALS-WR). It factors the user-to-item matrix `A` into the user-to-feature matrix `U` and the item-to-feature matrix `M`, and it runs the ALS algorithm in a parallel fashion. The ALS algorithm uncovers the latent factors that explain the observed user-to-item ratings and tries to find the factor weights that minimize the least-squares error between predicted and actual ratings. 63 | 64 | [https://www.elenacuoco.com/2016/12/22/alternating-least-squares-als-spark-ml/](https://www.elenacuoco.com/2016/12/22/alternating-least-squares-als-spark-ml/) 65 | 66 | ## 15.3\. Demo 67 | 68 | * The Jupyter notebook can be downloaded from [ALS Recommender systems](_static/ALS.ipynb). 69 | * The data can be downloaded from [Online Retail](_static/OnlineRetail.csv). 70 | 71 | ### 15.3.1\. Load and clean data 72 | 73 | 1. Set up spark context and SparkSession 74 | 75 | ``` 76 | from pyspark.sql import SparkSession 77 | 78 | spark = SparkSession \ 79 | .builder \ 80 | .appName("Python Spark RFM example") \ 81 | .config("spark.some.config.option", "some-value") \ 82 | .getOrCreate() 83 | 84 | ``` 85 | 86 | 1. 
Load dataset 87 | 88 | ``` 89 | df_raw = spark.read.format('com.databricks.spark.csv').\ 90 | options(header='true', \ 91 | inferschema='true').\ 92 | load("Online Retail.csv",header=True); 93 | 94 | ``` 95 | 96 | check the data set 97 | 98 | ``` 99 | df_raw.show(5) 100 | df_raw.printSchema() 101 | 102 | ``` 103 | 104 | Then you will get 105 | 106 | ``` 107 | +---------+---------+--------------------+--------+------------+---------+----------+--------------+ 108 | |InvoiceNo|StockCode| Description|Quantity| InvoiceDate|UnitPrice|CustomerID| Country| 109 | +---------+---------+--------------------+--------+------------+---------+----------+--------------+ 110 | | 536365| 85123A|WHITE HANGING HEA...| 6|12/1/10 8:26| 2.55| 17850|United Kingdom| 111 | | 536365| 71053| WHITE METAL LANTERN| 6|12/1/10 8:26| 3.39| 17850|United Kingdom| 112 | | 536365| 84406B|CREAM CUPID HEART...| 8|12/1/10 8:26| 2.75| 17850|United Kingdom| 113 | | 536365| 84029G|KNITTED UNION FLA...| 6|12/1/10 8:26| 3.39| 17850|United Kingdom| 114 | | 536365| 84029E|RED WOOLLY HOTTIE...| 6|12/1/10 8:26| 3.39| 17850|United Kingdom| 115 | +---------+---------+--------------------+--------+------------+---------+----------+--------------+ 116 | only showing top 5 rows 117 | 118 | root 119 | |-- InvoiceNo: string (nullable = true) 120 | |-- StockCode: string (nullable = true) 121 | |-- Description: string (nullable = true) 122 | |-- Quantity: integer (nullable = true) 123 | |-- InvoiceDate: string (nullable = true) 124 | |-- UnitPrice: double (nullable = true) 125 | |-- CustomerID: integer (nullable = true) 126 | |-- Country: string (nullable = true) 127 | 128 | ``` 129 | 130 | 1. Data clean and data manipulation 131 | 132 | * check and remove the `null` values 133 | 134 | ``` 135 | from pyspark.sql.functions import count 136 | 137 | def my_count(df_in): 138 | df_in.agg( *[ count(c).alias(c) for c in df_in.columns ] ).show() 139 | 140 | ``` 141 | 142 | ``` 143 | import pyspark.sql.functions as F 144 | from pyspark.sql.functions import round 145 | df_raw = df_raw.withColumn('Asset',round( F.col('Quantity') * F.col('UnitPrice'), 2 )) 146 | df = df_raw.withColumnRenamed('StockCode', 'Cusip')\ 147 | .select('CustomerID','Cusip','Quantity','UnitPrice','Asset') 148 | 149 | ``` 150 | 151 | ``` 152 | my_count(df) 153 | 154 | ``` 155 | 156 | ``` 157 | +----------+------+--------+---------+------+ 158 | |CustomerID| Cusip|Quantity|UnitPrice| Asset| 159 | +----------+------+--------+---------+------+ 160 | | 406829|541909| 541909| 541909|541909| 161 | +----------+------+--------+---------+------+ 162 | 163 | ``` 164 | 165 | Since the count results are not the same, we have some null value in the `CustomerID` column. We can drop these records from the dataset. 
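As a side note, the same check can be done more directly by counting the missing entries in each column. The short helper below is a minimal sketch using only standard `pyspark.sql.functions`; it is not part of the original notebook, and the actual cleaning continues right after it.

```
import pyspark.sql.functions as F

def count_nulls(df_in):
    # complementary view of my_count: count(c) skips nulls, so here we
    # count the rows where each column IS null
    df_in.select([F.sum(F.col(c).isNull().cast('int')).alias(c)
                  for c in df_in.columns]).show()

count_nulls(df)   # CustomerID should show 135080 (= 541909 - 406829) missing values
```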
166 | 167 | ``` 168 | df = df.filter(F.col('Asset')>=0) 169 | df = df.dropna(how='any') 170 | my_count(df) 171 | 172 | ``` 173 | 174 | ``` 175 | +----------+------+--------+---------+------+ 176 | |CustomerID| Cusip|Quantity|UnitPrice| Asset| 177 | +----------+------+--------+---------+------+ 178 | |    397924|397924|  397924|   397924|397924| 179 | +----------+------+--------+---------+------+ 180 | 181 | ``` 182 | 183 | ``` 184 | df.show(3) 185 | 186 | +----------+------+--------+---------+-----+ 187 | |CustomerID| Cusip|Quantity|UnitPrice|Asset| 188 | +----------+------+--------+---------+-----+ 189 | |     17850|85123A|       6|     2.55| 15.3| 190 | |     17850| 71053|       6|     3.39|20.34| 191 | |     17850|84406B|       8|     2.75| 22.0| 192 | +----------+------+--------+---------+-----+ 193 | only showing top 3 rows 194 | 195 | ``` 196 | 197 | * Convert the `Cusip` codes to a consistent format 198 | 199 | ``` 200 | from pyspark.sql.functions import udf 201 | from pyspark.sql.types import StringType, DoubleType 202 | 203 | def toUpper(s): 204 | return s.upper() 205 | 206 | upper_udf = udf(lambda x: toUpper(x), StringType()) 207 | 208 | ``` 209 | 210 | * Find the top `n` stocks 211 | 212 | ``` 213 | pop = df.groupBy('Cusip')\ 214 | .agg(F.count('CustomerID').alias('Customers'),F.round(F.sum('Asset'),2).alias('TotalAsset'))\ 215 | .sort([F.col('Customers'),F.col('TotalAsset')],ascending=[0,0]) 216 | 217 | pop.show(5) 218 | 219 | ``` 220 | 221 | ``` 222 | +------+---------+----------+ 223 | | Cusip|Customers|TotalAsset| 224 | +------+---------+----------+ 225 | |85123A|     2035|  100603.5| 226 | | 22423|     1724| 142592.95| 227 | |85099B|     1618|  85220.78| 228 | | 84879|     1408|  56580.34| 229 | | 47566|     1397|  68844.33| 230 | +------+---------+----------+ 231 | only showing top 5 rows 232 | 233 | ``` 234 | 235 | ### 15.3.2\. 
Build feature matrix 236 | 237 | * Fetch the top `n` Cusip list (`pd` below is pandas, i.e. `import pandas as pd`) 238 | 239 | ``` 240 | top = 10 241 | cusip_lst = pd.DataFrame(pop.select('Cusip').head(top)).astype('str').iloc[:, 0].tolist() 242 | cusip_lst.insert(0,'CustomerID') 243 | 244 | ``` 245 | 246 | * Create the portfolio table for each customer 247 | 248 | ``` 249 | pivot_tab = df.groupBy('CustomerID').pivot('Cusip').sum('Asset') 250 | pivot_tab = pivot_tab.fillna(0) 251 | 252 | ``` 253 | 254 | * Fetch the top `n` stocks’ portfolio table for each customer 255 | 256 | ``` 257 | selected_tab = pivot_tab.select(cusip_lst) 258 | selected_tab.show(4) 259 | 260 | ``` 261 | 262 | ``` 263 | +----------+------+-----+------+-----+-----+-----+-----+-----+----+-----+ 264 | |CustomerID|85123A|22423|85099B|84879|47566|20725|22720|20727|POST|23203| 265 | +----------+------+-----+------+-----+-----+-----+-----+-----+----+-----+ 266 | |     16503|   0.0|  0.0|   0.0|  0.0|  0.0|  0.0|  0.0| 33.0| 0.0|  0.0| 267 | |     15727| 123.9| 25.5|   0.0|  0.0|  0.0| 33.0| 99.0|  0.0| 0.0|  0.0| 268 | |     14570|   0.0|  0.0|   0.0|  0.0|  0.0|  0.0|  0.0|  0.0| 0.0|  0.0| 269 | |     14450|   0.0|  0.0|  8.32|  0.0|  0.0|  0.0| 49.5|  0.0| 0.0|  0.0| 270 | +----------+------+-----+------+-----+-----+-----+-----+-----+----+-----+ 271 | only showing top 4 rows 272 | 273 | ``` 274 | 275 | * Build the `rating` matrix 276 | 277 | ``` 278 | def elemwiseDiv(df_in): # divide each asset value by the customer's row total, giving row-normalized shares 279 | num = len(df_in.columns) 280 | temp = df_in.rdd.map(lambda x: [x[0]] + [x[i]/float(sum(x[1:])) 281 | if sum(x[1:])>0 else x[i] 282 | for i in range(1,num)]) 283 | return spark.createDataFrame(temp,df_in.columns) 284 | 285 | ratings = elemwiseDiv(selected_tab) 286 | 287 | ``` 288 | 289 | ``` 290 | ratings.show(4) 291 | 292 | +----------+------+-----+------+-----+-----+-----+-----+-----+----+-----+ 293 | |CustomerID|85123A|22423|85099B|84879|47566|20725|22720|20727|POST|23203| 294 | +----------+------+-----+------+-----+-----+-----+-----+-----+----+-----+ 295 | |     16503|   0.0|  0.0|   0.0|  0.0|  0.0|  0.0|  0.0|  1.0| 0.0|  0.0| 296 | |     15727|  0.44| 0.09|   0.0|  0.0|  0.0| 0.12| 0.35|  0.0| 0.0|  0.0| 297 | |     14570|   0.0|  0.0|   0.0|  0.0|  0.0|  0.0|  0.0|  0.0| 0.0|  0.0| 298 | |     14450|   0.0|  0.0|  0.14|  0.0|  0.0|  0.0| 0.86|  0.0| 0.0|  0.0| 299 | +----------+------+-----+------+-----+-----+-----+-----+-----+----+-----+ 300 | 301 | ``` 302 | 303 | * Convert the `rating` matrix to a long table 304 | 305 | ``` 306 | from pyspark.sql.functions import array, col, explode, struct, lit 307 | 308 | def to_long(df, by): 309 | """ 310 | reference: https://stackoverflow.com/questions/37864222/transpose-column-to-row-with-spark 311 | """ 312 | 313 | # Filter dtypes and split into column names and type description 314 | cols, dtypes = zip(*((c, t) for (c, t) in df.dtypes if c not in by)) 315 | # Spark SQL supports only homogeneous columns 316 | assert len(set(dtypes)) == 1, "All columns have to be of the same type" 317 | 318 | # Create and explode an array of (column_name, column_value) structs 319 | kvs = explode(array([ 320 | struct(lit(c).alias("Cusip"), col(c).alias("rating")) for c in cols 321 | ])).alias("kvs") 322 | return df.select(by + [kvs]).select(by + ["kvs.Cusip", "kvs.rating"]) 323 | ``` 324 | 325 | ``` 326 | df_all = to_long(ratings,['CustomerID']) 327 | df_all.show(5) 328 | 329 | ``` 330 | 331 | ``` 332 | +----------+------+------+ 333 | |CustomerID| Cusip|rating| 334 | +----------+------+------+ 335 | |     16503|85123A|   0.0| 336 | |     16503| 22423|   0.0| 337 | |     16503|85099B|   0.0| 338 | |     16503| 84879|   0.0| 339 | |     16503| 47566|   0.0| 340 | +----------+------+------+ 341 | only showing top 5 rows 342 | 343 | ``` 344 | 345 | * Convert 
the string `Cusip` to numerical index 346 | 347 | ``` 348 | from pyspark.ml.feature import StringIndexer 349 | # Index labels, adding metadata to the label column 350 | labelIndexer = StringIndexer(inputCol='Cusip', 351 | outputCol='indexedCusip').fit(df_all) 352 | df_all = labelIndexer.transform(df_all) 353 | 354 | df_all.show(5, True) 355 | df_all.printSchema() 356 | 357 | ``` 358 | 359 | ``` 360 | +----------+------+------+------------+ 361 | |CustomerID| Cusip|rating|indexedCusip| 362 | +----------+------+------+------------+ 363 | | 16503|85123A| 0.0| 6.0| 364 | | 16503| 22423| 0.0| 9.0| 365 | | 16503|85099B| 0.0| 5.0| 366 | | 16503| 84879| 0.0| 1.0| 367 | | 16503| 47566| 0.0| 0.0| 368 | +----------+------+------+------------+ 369 | only showing top 5 rows 370 | 371 | root 372 | |-- CustomerID: long (nullable = true) 373 | |-- Cusip: string (nullable = false) 374 | |-- rating: double (nullable = true) 375 | |-- indexedCusip: double (nullable = true) 376 | 377 | ``` 378 | 379 | ### 15.3.3\. Train model 380 | 381 | * build `train` and `test` dataset 382 | 383 | ``` 384 | train, test = df_all.randomSplit([0.8,0.2]) 385 | 386 | train.show(5) 387 | test.show(5) 388 | 389 | ``` 390 | 391 | ``` 392 | +----------+-----+------------+-------------------+ 393 | |CustomerID|Cusip|indexedCusip| rating| 394 | +----------+-----+------------+-------------------+ 395 | | 12940|20725| 2.0| 0.0| 396 | | 12940|20727| 4.0| 0.0| 397 | | 12940|22423| 9.0|0.49990198000392083| 398 | | 12940|22720| 3.0| 0.0| 399 | | 12940|23203| 7.0| 0.0| 400 | +----------+-----+------------+-------------------+ 401 | only showing top 5 rows 402 | 403 | +----------+-----+------------+------------------+ 404 | |CustomerID|Cusip|indexedCusip| rating| 405 | +----------+-----+------------+------------------+ 406 | | 12940|84879| 1.0|0.1325230346990786| 407 | | 13285|20725| 2.0|0.2054154995331466| 408 | | 13285|20727| 4.0|0.2054154995331466| 409 | | 13285|47566| 0.0| 0.0| 410 | | 13623|23203| 7.0| 0.0| 411 | +----------+-----+------------+------------------+ 412 | only showing top 5 rows 413 | 414 | ``` 415 | 416 | * train model 417 | 418 | ``` 419 | import itertools 420 | from math import sqrt 421 | from operator import add 422 | import sys 423 | from pyspark.ml.recommendation import ALS 424 | 425 | from pyspark.ml.evaluation import RegressionEvaluator 426 | 427 | evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", 428 | predictionCol="prediction") 429 | def computeRmse(model, data): 430 | """ 431 | Compute RMSE (Root mean Squared Error). 
432 | """ 433 | predictions = model.transform(data) 434 | rmse = evaluator.evaluate(predictions) 435 | print("Root-mean-square error = " + str(rmse)) 436 | return rmse 437 | 438 | #train models and evaluate them on the validation set 439 | 440 | ranks = [4,5] 441 | lambdas = [0.05] 442 | numIters = [30] 443 | bestModel = None 444 | bestValidationRmse = float("inf") 445 | bestRank = 0 446 | bestLambda = -1.0 447 | bestNumIter = -1 448 | 449 | val = test.na.drop() 450 | for rank, lmbda, numIter in itertools.product(ranks, lambdas, numIters): 451 | als = ALS(rank=rank, maxIter=numIter, regParam=lmbda, numUserBlocks=10, numItemBlocks=10, implicitPrefs=False, 452 | alpha=1.0, 453 | userCol="CustomerID", itemCol="indexedCusip", seed=1, ratingCol="rating", nonnegative=True) 454 | model=als.fit(train) 455 | 456 | validationRmse = computeRmse(model, val) 457 | print("RMSE (validation) = %f for the model trained with " % validationRmse + \ 458 | "rank = %d, lambda = %.2f, and numIter = %d." % (rank, lmbda, numIter)) 459 | if validationRmse < bestValidationRmse: 460 | bestModel = model 461 | bestValidationRmse = validationRmse 462 | bestRank = rank 463 | bestLambda = lmbda 464 | bestNumIter = numIter 465 | 466 | model = bestModel 467 | 468 | ``` 469 | 470 | ### 15.3.4\. Make prediction 471 | 472 | * make prediction 473 | 474 | ``` 475 | topredict=test[test['rating']==0] 476 | 477 | predictions=model.transform(topredict) 478 | predictions.filter(predictions.prediction>0)\ 479 | .sort([F.col('CustomerID'),F.col('Cusip')],ascending=[0,0]).show(5) 480 | 481 | ``` 482 | 483 | ``` 484 | +----------+------+------------+------+------------+ 485 | |CustomerID| Cusip|indexedCusip|rating|  prediction| 486 | +----------+------+------------+------+------------+ 487 | |     18283| 47566|         0.0|   0.0|  0.01625076| 488 | |     18282|85123A|         6.0|   0.0| 0.057172246| 489 | |     18282| 84879|         1.0|   0.0| 0.059531752| 490 | |     18282| 23203|         7.0|   0.0| 0.010502596| 491 | |     18282| 22720|         3.0|   0.0| 0.053893942| 492 | +----------+------+------------+------+------------+ 493 | only showing top 5 rows 494 | 495 | ``` -------------------------------------------------------------------------------- /docs/16.md: -------------------------------------------------------------------------------- 1 | # 16\. Monte Carlo Simulation 2 | 3 | ![https://runawayhorse001.github.io/LearningApacheSpark/_images/baba_sim_price.png](img/68b23d0b045f72286a7dd26502a39dd0.jpg) 4 | 5 | Monte Carlo simulations are just a way of estimating a fixed parameter by repeatedly generating random numbers. More details can be found at [A Zero Math Introduction to Markov Chain Monte Carlo Methods](https://towardsdatascience.com/a-zero-math-introduction-to-markov-chain-monte-carlo-methods-dcba889e0c50). 6 | 7 | Monte Carlo simulation is a technique used to understand the impact of risk and uncertainty in financial, project management, cost, and other forecasting models. A Monte Carlo simulator helps one visualize most or all of the potential outcomes to have a better idea regarding the risk of a decision. More details can be found at [The house always wins](https://towardsdatascience.com/the-house-always-wins-monte-carlo-simulation-eb82787da2a3). 8 | 9 | ## 16.1\. Simulating Casino Win 10 | 11 | We assume that the player, John, has a 49% chance of winning each game and that the wager is $5 per game. 
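Before running the simulation it is worth checking the expected value of a single bet, which already explains why the casino comes out ahead. The quick check below is plain Python arithmetic using the same parameters as the snippet that follows (start money $100, $5 wager, 100 bets); it is not part of the original text.

```
# a 49% win probability on a $5 wager gives the house a ~$0.10 edge per bet
p_win, wager, bets, start_m = 0.49, 5, 100, 100
ev_per_bet = p_win * wager - (1 - p_win) * wager
print(ev_per_bet)                    # ~ -0.10
print(start_m + bets * ev_per_bet)   # ~ 90: expected balance after 100 bets
```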
12 | 13 | ``` 14 | import numpy as np 15 | import pandas as pd 16 | import matplotlib.pyplot as plt 17 | from functools import reduce # reduce accumulates the running balance in the betting loop below 18 | start_m =100 19 | wager = 5 20 | bets = 100 21 | trials = 1000 22 | 23 | trans = np.vectorize(lambda t: -wager if t <=0.51 else wager) # lose the wager with probability 0.51, win it with probability 0.49 24 | 25 | fig = plt.figure(figsize=(10, 6)) 26 | ax = fig.add_subplot(1,1,1) 27 | 28 | end_m = [] 29 | 30 | for i in range(trials): 31 | money = reduce(lambda c, x: c + [c[-1] + x], trans(np.random.random(bets)), [start_m]) 32 | end_m.append(money[-1]) 33 | plt.plot(money) 34 | 35 | plt.ylabel('Player Money in $') 36 | plt.xlabel('Number of bets') 37 | plt.title(("John starts the game with $ %.2f and ends with $ %.2f")%(start_m,sum(end_m)/len(end_m))) 38 | plt.show() 39 | 40 | ``` 41 | 42 | ![https://runawayhorse001.github.io/LearningApacheSpark/_images/casino_5_100.png](img/f80c359151c40c9277e2d70f38856eab.jpg) ![https://runawayhorse001.github.io/LearningApacheSpark/_images/casino_100_1000.png](img/06dae2d3c7cb1a64cb31ed2482e632b7.jpg) 43 | 44 | ## 16.2\. Simulating a Random Walk 45 | 46 | ### 16.2.1\. Fetch the historical stock price 47 | 48 | 1. Fetch the data (this step defines the `stock` DataFrame and the `ticker` symbol used below). If you need the code for this piece, you can contact me. 49 | 50 | ``` 51 | stock.tail(4) 52 | 53 | +----------+----------+----------+----------+----------+----------+--------+ 54 | |      Date|      Open|      High|       Low|     Close| Adj Close|  Volume| 55 | +----------+----------+----------+----------+----------+----------+--------+ 56 | |2018-12-07|155.399994|158.050003|151.729996|153.059998|153.059998|17447900| 57 | |2018-12-10|150.389999|152.809998|147.479996|151.429993|151.429993|15525500| 58 | |2018-12-11|155.259995|156.240005|150.899994|151.830002|151.830002|13651900| 59 | |2018-12-12|155.240005|156.169998|151.429993|     151.5|     151.5|16597900| 60 | +----------+----------+----------+----------+----------+----------+--------+ 61 | 62 | ``` 63 | 64 | 1. Convert the `str` type date to date type 65 | 66 | ``` 67 | stock['Date'] = pd.to_datetime(stock['Date']) 68 | 69 | ``` 70 | 71 | 1. Data visualization 72 | 73 | ``` 74 | # Plot everything by leveraging the very powerful matplotlib package 75 | width = 10 76 | height = 6 77 | data = stock 78 | fig = plt.figure(figsize=(width, height)) 79 | ax = fig.add_subplot(1,1,1) 80 | ax.plot(data.Date, data.Close, label='Close') 81 | ax.plot(data.Date, data.High, label='High') 82 | # ax.plot(data.Date, data.Low, label='Low') 83 | ax.set_xlabel('Date') 84 | ax.set_ylabel('price ($)') 85 | ax.legend() 86 | ax.set_title('Stock price: ' + ticker, y=1.01) 87 | #plt.xticks(rotation=70) 88 | plt.show() 89 | # Plot everything by leveraging the very powerful matplotlib package 90 | fig = plt.figure(figsize=(width, height)) 91 | ax = fig.add_subplot(1,1,1) 92 | ax.plot(data.Date, data.Volume, label='Volume') 93 | #ax.plot(data.Date, data.High, label='High') 94 | # ax.plot(data.Date, data.Low, label='Low') 95 | ax.set_xlabel('Date') 96 | ax.set_ylabel('Volume') 97 | ax.legend() 98 | ax.set_title('Stock volume: ' + ticker, y=1.01) 99 | #plt.xticks(rotation=70) 100 | plt.show() 101 | 102 | ``` 103 | 104 | ![https://runawayhorse001.github.io/LearningApacheSpark/_images/baba_history.png](img/112822dfffe3d0e59d7881d265b78cad.jpg) 105 | 106 | Historical Stock Price 107 | 108 | ![https://runawayhorse001.github.io/LearningApacheSpark/_images/baba_history_vol.png](img/679879d3fb7800c91c8321c24ce0f8d9.jpg) 109 | 110 | Historical Stock Volume 111 | 112 | ### 16.2.2\. 
Calculate the Compound Annual Growth Rate 113 | 114 | The formula for Compound Annual Growth Rate (CAGR) is very useful for investment analysis. It may also be referred to as the annualized rate of return or annual percent yield or effective annual rate, depending on the algebraic form of the equation. Many investments such as stocks have returns that can vary wildly. The CAGR formula allows you to calculate a “smoothed” rate of return that you can use to compare to other investments. The formula is defined as (more details can be found at [CAGR Calculator and Formula](https://www.vertex42.com/Calculators/cagr-calculator.html)) 115 | 116 | ![\text{CAGR}=\left(\frac{\text{End Value}}{\text{Start Value}}\right)^{\frac{365}{\text{Days}}} -1](img/26e6828e76fd4c01d8aea7770dd032c7.jpg) 117 | 118 | ``` 119 | days = (stock.Date.iloc[-1] - stock.Date.iloc[0]).days 120 | cagr = ((((stock['Adj Close'].iloc[-1]) / stock['Adj Close'].iloc[0])) ** (365.0/days)) - 1 121 | print ('CAGR =',str(round(cagr,4)*100)+"%") 122 | mu = cagr 123 | 124 | ``` 125 | 126 | ### 16.2.3\. Calculate the annual volatility 127 | 128 | A stock’s volatility is the variation in its price over a period of time. For example, one stock may have a tendency to swing wildly higher and lower, while another stock may move in a much steadier, less turbulent way. Both stocks may end up at the same price at the end of the day, but their path to that point can vary wildly. First, we create a series of percentage returns and calculate the daily volatility (standard deviation) of those returns. To present this volatility in annualized terms, we simply multiply the daily standard deviation by the square root of 252. This assumes there are 252 trading days in a given year. More details can be found at [How to Calculate Annualized Volatility](https://www.fool.com/knowledge-center/how-to-calculate-annualized-volatility.aspx). 129 | 130 | ``` 131 | stock['Returns'] = stock['Adj Close'].pct_change() 132 | vol = stock['Returns'].std()*np.sqrt(252) 133 | 134 | ``` 135 | 136 | ### 16.2.4\. Create matrix of daily returns 137 | 138 | 1. Create the matrix of daily returns using a random normal distribution, i.e. generate an RDD matrix comprised of i.i.d. samples from the standard normal distribution N(0.0, 1.0). 139 | 140 | ``` 141 | S = stock['Adj Close'].iloc[-1] #starting stock price (i.e. last available real stock price) 142 | T = 5 #Number of trading days 143 | mu = cagr #Return 144 | vol = vol #Volatility 145 | trials = 10000 146 | from pyspark.mllib.random import RandomRDDs # sc below is the SparkContext, e.g. sc = spark.sparkContext 147 | mat = RandomRDDs.normalVectorRDD(sc, trials, T, seed=1) 148 | 149 | ``` 150 | 151 | 1. Shift and scale the generated standard-normal samples with `.map(lambda v: a + (b - a) * v)`, following the same pattern the Spark documentation uses to transform U(0.0, 1.0) into U(a, b). 152 | 153 | ``` 154 | a = mu/T 155 | b = vol/np.sqrt(T) 156 | v = mat.map(lambda x: a + (b - a)* x) 157 | 158 | ``` 159 | 160 | 1. 
Convert Rdd mstrix to dataframe 161 | 162 | ``` 163 | df = v.map(lambda x: [round(i,6)+1 for i in x]).toDF() 164 | df.show(5) 165 | 166 | ``` 167 | 168 | ``` 169 | +--------+--------+--------+--------+--------+ 170 | | _1| _2| _3| _4| _5| 171 | +--------+--------+--------+--------+--------+ 172 | |0.935234|1.162894| 1.07972|1.238257|1.066136| 173 | |0.878456|1.045922|0.990071|1.045552|0.854516| 174 | |1.186472|0.944777|0.742247|0.940023|1.220934| 175 | |0.872928|1.030882|1.248644|1.114262|1.063762| 176 | | 1.09742|1.188537|1.137283|1.162548|1.024612| 177 | +--------+--------+--------+--------+--------+ 178 | only showing top 5 rows 179 | 180 | ``` 181 | 182 | ``` 183 | from pyspark.sql.functions import lit 184 | S = stock['Adj Close'].iloc[-1] 185 | price = df.withColumn('init_price' ,lit(S)) 186 | 187 | ``` 188 | 189 | ``` 190 | price.show(5) 191 | 192 | +--------+--------+--------+--------+--------+----------+ 193 | | _1| _2| _3| _4| _5|init_price| 194 | +--------+--------+--------+--------+--------+----------+ 195 | |0.935234|1.162894| 1.07972|1.238257|1.066136| 151.5| 196 | |0.878456|1.045922|0.990071|1.045552|0.854516| 151.5| 197 | |1.186472|0.944777|0.742247|0.940023|1.220934| 151.5| 198 | |0.872928|1.030882|1.248644|1.114262|1.063762| 151.5| 199 | | 1.09742|1.188537|1.137283|1.162548|1.024612| 151.5| 200 | +--------+--------+--------+--------+--------+----------+ 201 | only showing top 5 rows 202 | 203 | ``` 204 | 205 | ``` 206 | price = price.withColumn('day_0', col('init_price')) 207 | price.show(5) 208 | 209 | ``` 210 | 211 | ``` 212 | +--------+--------+--------+--------+--------+----------+-----+ 213 | | _1| _2| _3| _4| _5|init_price|day_0| 214 | +--------+--------+--------+--------+--------+----------+-----+ 215 | |0.935234|1.162894| 1.07972|1.238257|1.066136| 151.5|151.5| 216 | |0.878456|1.045922|0.990071|1.045552|0.854516| 151.5|151.5| 217 | |1.186472|0.944777|0.742247|0.940023|1.220934| 151.5|151.5| 218 | |0.872928|1.030882|1.248644|1.114262|1.063762| 151.5|151.5| 219 | | 1.09742|1.188537|1.137283|1.162548|1.024612| 151.5|151.5| 220 | +--------+--------+--------+--------+--------+----------+-----+ 221 | only showing top 5 rows 222 | 223 | ``` 224 | 225 | ### 16.2.5\. Monte Carlo Simulation 226 | 227 | ``` 228 | from pyspark.sql.functions import round 229 | for name in price.columns[:-2]: 230 | price = price.withColumn('day'+name, round(col(name)*col('init_price'),2)) 231 | price = price.withColumn('init_price',col('day'+name)) 232 | 233 | ``` 234 | 235 | ``` 236 | price.show(5) 237 | 238 | +--------+--------+--------+--------+--------+----------+-----+------+------+------+------+------+ 239 | | _1| _2| _3| _4| _5|init_price|day_0| day_1| day_2| day_3| day_4| day_5| 240 | +--------+--------+--------+--------+--------+----------+-----+------+------+------+------+------+ 241 | |0.935234|1.162894| 1.07972|1.238257|1.066136| 234.87|151.5|141.69|164.77|177.91| 220.3|234.87| 242 | |0.878456|1.045922|0.990071|1.045552|0.854516| 123.14|151.5|133.09| 139.2|137.82| 144.1|123.14| 243 | |1.186472|0.944777|0.742247|0.940023|1.220934| 144.67|151.5|179.75|169.82|126.05|118.49|144.67| 244 | |0.872928|1.030882|1.248644|1.114262|1.063762| 201.77|151.5|132.25|136.33|170.23|189.68|201.77| 245 | | 1.09742|1.188537|1.137283|1.162548|1.024612| 267.7|151.5|166.26|197.61|224.74|261.27| 267.7| 246 | +--------+--------+--------+--------+--------+----------+-----+------+------+------+------+------+ 247 | only showing top 5 rows 248 | 249 | ``` 250 | 251 | ### 16.2.6\. 
Summary 252 | 253 | ``` 254 | selected_col = [name for name in price.columns if 'day_' in name] 255 | 256 | simulated = price.select(selected_col) 257 | simulated.describe().show() 258 | 259 | ``` 260 | 261 | ``` 262 | +-------+----------+------------------+------------------+------------------+------------------+------------------+ 263 | |summary|2018-12-12| 2018-12-13| 2018-12-14| 2018-12-17| 2018-12-18| 2018-12-19| 264 | +-------+----------+------------------+------------------+------------------+------------------+------------------+ 265 | | count| 10000.0| 10000.0| 10000.0| 10000.0| 10000.0| 10000.0| 266 | | mean| 151.5|155.11643700000002| 158.489058|162.23713200000003| 166.049375| 170.006525| 267 | | std| 0.0|18.313783237787845|26.460919262517276| 33.37780495150803|39.369101074463416|45.148120695490846| 268 | | min| 151.5| 88.2| 74.54| 65.87| 68.21| 58.25| 269 | | 25%| 151.5| 142.485| 140.15| 138.72| 138.365| 137.33| 270 | | 50%| 151.5| 154.97| 157.175| 159.82| 162.59|165.04500000000002| 271 | | 75%| 151.5| 167.445|175.48499999999999| 182.8625| 189.725| 196.975| 272 | | max| 151.5| 227.48| 275.94| 319.17| 353.59| 403.68| 273 | +-------+----------+------------------+------------------+------------------+------------------+------------------+ 274 | 275 | ``` 276 | 277 | ``` 278 | data_plt = simulated.toPandas() 279 | days = pd.date_range(stock['Date'].iloc[-1], periods= T+1,freq='B').date 280 | 281 | width = 10 282 | height = 6 283 | fig = plt.figure(figsize=(width, height)) 284 | ax = fig.add_subplot(1,1,1) 285 | 286 | days = pd.date_range(stock['Date'].iloc[-1], periods= T+1,freq='B').date 287 | 288 | for i in range(trials): 289 | plt.plot(days, data_plt.iloc[i]) 290 | ax.set_xlabel('Date') 291 | ax.set_ylabel('price ($)') 292 | ax.set_title('Simulated Stock price: ' + ticker, y=1.01) 293 | plt.show() 294 | 295 | ``` 296 | 297 | ![https://runawayhorse001.github.io/LearningApacheSpark/_images/baba_sim_price_demo.png](img/d7945700ecde92ac83058e07433755da.jpg) 298 | 299 | ### 16.2.7\. One-year Stock price simulation 300 | 301 | ![https://runawayhorse001.github.io/LearningApacheSpark/_images/baba_sim_price.png](img/68b23d0b045f72286a7dd26502a39dd0.jpg) 302 | 303 | Simulated Stock Price 304 | 305 | ![https://runawayhorse001.github.io/LearningApacheSpark/_images/baba_sim_dis1.png](img/2b9e0aad3a11fee86e4261a1e94749e0.jpg) 306 | 307 | Simulated Stock Price distribution -------------------------------------------------------------------------------- /docs/17.md: -------------------------------------------------------------------------------- 1 | # 17\. Markov Chain Monte Carlo 2 | 3 | Chinese proverb 4 | 5 | **A book is known in time of need.** 6 | 7 | ![https://runawayhorse001.github.io/LearningApacheSpark/_images/mcmc_py.png](img/b7d416463ca0be2cb7caf7d373dc26cc.jpg) 8 | 9 | Monte Carlo simulations are just a way of estimating a fixed parameter by repeatedly generating random numbers. More details can be found at [A Zero Math Introduction to Markov Chain Monte Carlo Methods](https://towardsdatascience.com/a-zero-math-introduction-to-markov-chain-monte-carlo-methods-dcba889e0c50). 10 | 11 | Markov Chain Monte Carlo (MCMC) methods are used to approximate the posterior distribution of a parameter of interest by random sampling in a probabilistic space. More details can be found at [A Zero Math Introduction to Markov Chain Monte Carlo Methods](https://towardsdatascience.com/a-zero-math-introduction-to-markov-chain-monte-carlo-methods-dcba889e0c50). 
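To make the sampling idea concrete before the formal treatment below, here is a minimal random-walk Metropolis sketch in plain Python/NumPy. It is not part of the original chapter, and the target density, proposal width, and number of draws are illustrative choices only.

```
import numpy as np

def metropolis(log_target, theta0=0.0, prop_sd=2.0, S=10000, seed=1):
    # random-walk Metropolis: propose theta* ~ Normal(theta, prop_sd) and
    # accept it with probability min(1, r), working on the log scale
    rng = np.random.default_rng(seed)
    theta, draws = theta0, []
    for _ in range(S):
        theta_star = rng.normal(theta, prop_sd)
        log_r = log_target(theta_star) - log_target(theta)
        if np.log(rng.uniform()) < log_r:
            theta = theta_star
        draws.append(theta)
    return np.array(draws)

# toy target: a standard normal log-density (up to an additive constant)
samples = metropolis(lambda t: -0.5 * t ** 2)
print(samples.mean(), samples.std())   # should be roughly 0 and 1
```

The same accept/reject structure, written on the log scale for numerical stability, is what the R demo later in this chapter implements for the conjugate Normal-Normal model.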
12 | 13 | The following theory and demo are from Dr. Rebecca C. Steorts’s [Intro to Markov Chain Monte Carlo](http://www2.stat.duke.edu/~rcs46/lecturesModernBayes/601-module6-markov/markov-chain-monte-carlo.pdf). More details can be found at Dr. Rebecca C. Steorts’s STA 360/601: [Bayesian Methods and Modern Statistics](http://www2.stat.duke.edu/~rcs46/bayes.html) class at Duke. 14 | 15 | ## 17.1\. Metropolis algorithm 16 | 17 | The Metropolis algorithm takes three main steps: 18 | 19 | 1. Sample ![\theta^* \sim J(\theta | \theta ^{(s)})](img/a70ad9ce54b1e6e921ef6745fcef45da.jpg) 20 | 21 | 2. Compute the acceptance ratio ![(r)](img/0679a6e10e2c4166bb23f4effc3d5887.jpg) 22 | 23 | > ![r = \frac{p(\theta^*|y)}{p(\theta^{(s)}|y)} = \frac{p(y|\theta^*)p(\theta^*)}{p(y|\theta^{(s)})p(\theta^{(s)})}](img/4e5f23199f8f3cd55221e56cb05bd116.jpg) 24 | 25 | 3. Let 26 | 27 | > (1)![\theta^{(s+1)} 28 | > = 29 | > \left\{ 30 | > \begin{array}{ll} 31 | > \theta^* &\text{ with prob min}{(r,1)} \\ 32 | > \theta^{(s)} &\text{ otherwise } 33 | > \end{array} 34 | > \right.](img/6b9d05253edbae95f27fa6c58648c872.jpg) 35 | 36 | Note 37 | 38 | Actually, the [(1)](#equation-eq-step3) in Step 3 can be replaced by sampling ![u \sim \text{Uniform}(0,1)](img/29928a4b917a9348751e7d0410712045.jpg) and setting ![\theta^{(s+1)}=\theta^*](img/3c0d894f48daba4e004eddd747cb556c.jpg) if ![u<r](img/3f959ef0c8e56c21a00dceb01eb6e6c1.jpg) and setting ![\theta^{(s+1)}=\theta^{(s)}](img/97fa452fc3843513b0746e76d216be78.jpg) otherwise. 39 | 40 | ## 17.2\. A Toy Example of Metropolis 41 | 42 | The following example is going to test out the Metropolis algorithm for the conjugate Normal-Normal model with a known variance situation. 43 | 44 | ### 17.2.1\. Conjugate Normal-Normal model 45 | 46 | > ![\begin{array}{ll} 47 | > X_1, \cdots, X_n & \theta \stackrel{iid}{\sim}\text{Normal}(\theta,\sigma^2)\\ 48 | > & \theta \sim\text{Normal}(\mu,\tau^2) 49 | > \end{array}](img/46803121f8c51dcb1422593800354126.jpg) 50 | 51 | Recall that the posterior of ![\theta](img/ed008064e9d0e55dc93f673b9aca6b65.jpg) is ![\text{Normal}(\mu_n,\tau^2_n)](img/5a64add1cdcd06c1755071eba4040184.jpg), where 52 | 53 | > ![\mu_n = \bar{x}\frac{n/\sigma^2}{n/\sigma^2+1/\tau^2} + \mu\frac{1/\tau^2}{n/\sigma^2+1/\tau^2}](img/9af4ce81465021e68c774194432663c0.jpg) 54 | 55 | and 56 | 57 | > ![\tau_n^2 = \frac{1}{n/\sigma^2+1/\tau^2}](img/5afb60bc18fcf81ab2ff1282bb035030.jpg) 58 | 59 | ### 17.2.2\. Example setup 60 | 61 | The rest of the parameters are ![\sigma^2=1](img/7b85580dc5d18fc8fec74ce74849c182.jpg), ![\tau^2=10](img/210c2cca588f51d57a4eee64f09d4b38.jpg), ![\mu=5](img/fcc4c8c5db1d6aa3ff080466e10ccb74.jpg), ![n=5](img/2f97b51dd2c305123e29377f808272b2.jpg) and 62 | 63 | > ![y = [9.37, 10.18, 9.16, 11.60, 10.33]](img/d9195c20e19c173ec6d22c2e60a2cddb.jpg) 64 | 65 | For this setup, we get that ![\mu_n=10.02745](img/eea23835a8abd9d903f56256c18cf8aa.jpg) and ![\tau_n^2=0.1960784](img/82f787ceda05c98a84ab98cdc998025c.jpg). 66 | 67 | ### 17.2.3\. Essential mathematical derivation 68 | 69 | In the [Metropolis algorithm](#metroalg), we need to compute the acceptance ratio ![r](img/40bca4c232843e818fac0254a2ae2689.jpg), i.e. 
70 | 71 | > ![r &= \frac{p(\theta^*|x)}{p(\theta^{(s)}|x)} \\ 72 | > &= \frac{p(x|\theta^*)p(\theta^*)}{p(x|\theta^{(s)})p(\theta^{(s)})}\\ 73 | > &= \left(\frac{\prod_i\text{dnorm}(x_i,\theta^*,\sigma)}{\prod_i\text{dnorm}(x_i,\theta^{(s)},\sigma)}\right) 74 | > \left(\frac{\text{dnorm}(\theta^*,\mu,\tau)}{\text{dnorm}(\theta^{(s)},\mu,\tau)}\right)](img/5d730ed5b317fc0cf48f64f9e3312d92.jpg) 75 | 76 | In many cases, computing the ratio ![r](img/40bca4c232843e818fac0254a2ae2689.jpg) directly can be numerically unstable, however, this can be modified by taking ![log r](img/6202e78af1fef25458e1696f4a8ae057.jpg). i.e. 77 | 78 | > ![logr &= \sum_i \left(log[\text{dnorm}(x_i,\theta^*,\sigma)] - log[\text{dnorm}(x_i, \theta^{(s)}, \sigma)]\right)\\ 79 | > &+ \sum_i \left(log[\text{dnorm}(\theta^*,\mu,\tau)] - log[\text{dnorm}(\theta^{(s)}, \mu,\tau)]\right)](img/7ab51337f3701838a285ea3a7346a777.jpg) 80 | 81 | Then the criteria of the acceptance becomes: if ![log u< log r](img/3de6020b5a20976d8e1244b98a1ae30b.jpg), where ![u](img/76b7cb4ea7971a8dc0db7a36feb66a35.jpg) is sample form the ![\text{Uniform}(0,1)](img/80f5d2344fd2f483b82b81d0a33e9333.jpg). 82 | 83 | ## 17.3\. Demos 84 | 85 | Now, We generate ![S](img/40877c463ec6621caf8f742f1e5c7c05.jpg) iterations of the Metropolis algorithm starting at ![\theta^{(0)}=0](img/43393ec4f575c391ddfca83324ec67a8.jpg) and using a normal proposal distribution, where 86 | 87 | > ![\theta^{(s+1)} \sim \text{Normal}(\theta^{(s)},2).](img/7a9b61eccdf8d02c95b6cd81a63e02ac.jpg) 88 | 89 | ### 17.3.1\. R results 90 | 91 | ``` 92 | # setting values 93 | set.seed(1) 94 | s2<-1 95 | t2<-10 96 | mu<-5; n<-5 97 | 98 | # rounding the rnorm to 2 decimal places 99 | y<-round(rnorm(n,10,1),2) 100 | # mean of the normal posterior 101 | mu.n<-( mean(y)*n/s2 + mu/t2 )/( n/s2+1/t2) 102 | # variance of the normal posterior 103 | t2.n<-1/(n/s2+1/t2) 104 | # defining the data 105 | y<-c(9.37, 10.18, 9.16, 11.60, 10.33) 106 | 107 | ####metropolis part#### 108 | ##S = total num of simulations 109 | theta<-0 ; delta<-2 ; S<-10000 ; THETA<-NULL ; set.seed(1) 110 | for(s in 1:S){ 111 | ## simulating our proposal 112 | #the new value of theta 113 | #print(theta) 114 | theta.star<-rnorm(1,theta,sqrt(delta)) 115 | ##taking the log of the ratio r 116 | log.r<-( sum(dnorm(y,theta.star,sqrt(s2),log=TRUE))+ 117 | dnorm(theta.star,mu,sqrt(t2),log=TRUE))- 118 | ( sum(dnorm(y,theta,sqrt(s2),log=TRUE))+ 119 | dnorm(theta,mu,sqrt(t2),log=TRUE)) 120 | #print(log.r) 121 | if(log(runif(1))