├── .gitignore ├── LICENSE ├── cover.jpg ├── docs ├── 1.md ├── 10.md ├── 11.md ├── 12.md ├── 13.md ├── 14.md ├── 15.md ├── 16.md ├── 17.md ├── 18.md ├── 19.md ├── 2.md ├── 20.md ├── 21.md ├── 22.md ├── 3.md ├── 4.md ├── 5.md ├── 6.md ├── 7.md ├── 8.md ├── 9.md └── img │ ├── 000d96e53607268ac90aab877bb7dbfd.jpg │ ├── 007f623d1ec885d996b1b72689ba7cb1.jpg │ ├── 018af0886bf373be0fc585cac38b2d6c.jpg │ ├── 0539212d2d3e4c28b27805e3c8783cab.jpg │ ├── 067197a5eeb69cc2f3d828a92ebcf52e.jpg │ ├── 0679a6e10e2c4166bb23f4effc3d5887.jpg │ ├── 06dae2d3c7cb1a64cb31ed2482e632b7.jpg │ ├── 0702875bab1a20dbb9d95fab3813c019.jpg │ ├── 070e654d25157c32f4038fb2daa42351.jpg │ ├── 08c6744e242573f80b23af5dbbf21a94.jpg │ ├── 09f553a8fd91309c3c1c2634e1b5ca15.jpg │ ├── 0a52ececadd638e5127d7a562d9b00b6.jpg │ ├── 0ac8a503cc147ea1ccb9c24bf83a5992.jpg │ ├── 0d2c607e00ca608222b80fa6b61e780a.jpg │ ├── 0e33aec96020afa0297be6d91db0d5d8.jpg │ ├── 0eb5759f21246505752043bb890ab6bf.jpg │ ├── 0ebcb4677d2131e71e039be8ea955cff.jpg │ ├── 0ef51f1b4020192962616eb9559975a4.jpg │ ├── 0ff87df50cf4610da54dd94b51c6d809.jpg │ ├── 112822dfffe3d0e59d7881d265b78cad.jpg │ ├── 1428271961e4c95f6508f59083d5a645.jpg │ ├── 1478d9b0743fdc3b0c6ad079b88034ec.jpg │ ├── 16dd8d60ea9b042c3ce0652c9f0571e8.jpg │ ├── 16fd7a4c078cf22fee09b636dc10d55c.jpg │ ├── 189ce8661099fd6f1118f978d53cf85b.jpg │ ├── 197517339d2ce744dd0a46c607e84534.jpg │ ├── 1a8a8647a66b744ccd5c9137adb66255.jpg │ ├── 1ac835166928f502b55a31636602602a.jpg │ ├── 1bb7f4b9072cd83f62d4e344eaba88e8.jpg │ ├── 1c57212c22a6a7777decfa1971418148.jpg │ ├── 1cef776388e6c2cba3cf00cab2199e3d.jpg │ ├── 1f778c47baa79f4277cc4c2cb0ff0a2d.jpg │ ├── 203cd7c17881ea567f75816f98ec50fa.jpg │ ├── 210c2cca588f51d57a4eee64f09d4b38.jpg │ ├── 22e9d09a51121f8c77345a724091c622.jpg │ ├── 2649ef98f720c129d663f5d82add4129.jpg │ ├── 26e6828e76fd4c01d8aea7770dd032c7.jpg │ ├── 290e0b58c66f2b75c67fd1a15e3fe958.jpg │ ├── 29928a4b917a9348751e7d0410712045.jpg │ ├── 2a4a130bcfb223ced98c0de613bd076a.jpg │ ├── 2a63caca4dc8603d4a602018e75a1fcd.jpg │ ├── 2b9e0aad3a11fee86e4261a1e94749e0.jpg │ ├── 2c646f410ca3ccbf4db40d322dfba989.jpg │ ├── 2d776487e1a2ee4683c3c6f51fca7e48.jpg │ ├── 2f97b51dd2c305123e29377f808272b2.jpg │ ├── 2fbe25eafed24324bfbde9c4d3dca1f4.jpg │ ├── 3152173a8fd696819c7a2c2b8c6ef005.jpg │ ├── 315be0f70cd0effa6c8682f2a949a46c.jpg │ ├── 348c0d7bc8db0d630042e5faffd2d647.jpg │ ├── 361ef23b0743d01bb30ead2dccc9edca.jpg │ ├── 38cff4d0c27588f71d4ed00223dcc4a2.jpg │ ├── 38f75cffa6acca49fcf1ba20d202b2d0.jpg │ ├── 3b99ee07cd783026d41b65651ee5d293.jpg │ ├── 3c0d894f48daba4e004eddd747cb556c.jpg │ ├── 3c62f7e72a479ae0b82768c51bdc2830.jpg │ ├── 3de6020b5a20976d8e1244b98a1ae30b.jpg │ ├── 3f26c9365c0603f014f3bba403ed27fb.jpg │ ├── 3f74c667189eff836df907a6b6ff2584.jpg │ ├── 3f959ef0c8e56c21a00dceb01eb6e6c1.jpg │ ├── 40877c463ec6621caf8f742f1e5c7c05.jpg │ ├── 40bca4c232843e818fac0254a2ae2689.jpg │ ├── 43393ec4f575c391ddfca83324ec67a8.jpg │ ├── 44d0db1ee959675768959ef02c868b32.jpg │ ├── 46803121f8c51dcb1422593800354126.jpg │ ├── 4a1a112aa8490f7c8410b710845e8c7a.jpg │ ├── 4ae661a05a9586c4ce7b5eabf4bab417.jpg │ ├── 4b454255e179a3626e205ce324184acf.jpg │ ├── 4bcd8bf4febeeb8972519ed2adbce8d5.jpg │ ├── 4e5f23199f8f3cd55221e56cb05bd116.jpg │ ├── 4fb175e4e5682ef75a156dfba37beeea.jpg │ ├── 501025688da0cf9e2b3937cd7da9580d.jpg │ ├── 585d98b9749f0661bc9077e01f28eb15.jpg │ ├── 59ebd939c24bf4d59d82b0daf4874daf.jpg │ ├── 5a13655c0030372e1b06cd77ff1e53e0.jpg │ ├── 5a64add1cdcd06c1755071eba4040184.jpg │ ├── 5afb60bc18fcf81ab2ff1282bb035030.jpg │ ├── 
5b5795767ca8cb65be11e7cc020d6b7f.jpg │ ├── 5d730ed5b317fc0cf48f64f9e3312d92.jpg │ ├── 5d7cfe29b931b3da8aa6fe174ccaac16.jpg │ ├── 6170e4e4344d720bef3ff354a507f6fa.jpg │ ├── 61bccf1d55cc6636fce9585573c9981a.jpg │ ├── 6202e78af1fef25458e1696f4a8ae057.jpg │ ├── 62e9e7ead57d4e2a4be61668d6aff334.jpg │ ├── 679879d3fb7800c91c8321c24ce0f8d9.jpg │ ├── 67ba05c0d55d08b80558d07c418fa22e.jpg │ ├── 6869432b79fecd2af9dcc85625f6f356.jpg │ ├── 68b23d0b045f72286a7dd26502a39dd0.jpg │ ├── 68d093ccd7d87aecf646581f431220be.jpg │ ├── 69ff1f1b7a8e2162d5395fa62c35e8b6.jpg │ ├── 6a466b1857f68538cfa76355f8ea6396.jpg │ ├── 6b9d05253edbae95f27fa6c58648c872.jpg │ ├── 6eb508bad184c89094f5045a5bf2e31c.jpg │ ├── 6f2adb68d3f0a7f1f3af2ef044441071.jpg │ ├── 6ffbd19e479aef3243e53b706d026af4.jpg │ ├── 710e36ed942d63d074523c804347981b.jpg │ ├── 7166a4887b7f211527c9e45a072e23d2.jpg │ ├── 7171a798c643e82807785cc163a04796.jpg │ ├── 72748fa31cb48a5062a2fc7949bd0b45.jpg │ ├── 739c6ec939fd446ba1cde4cf4620512a.jpg │ ├── 7537a0a4978369cde56fd3dee294d760.jpg │ ├── 76b7cb4ea7971a8dc0db7a36feb66a35.jpg │ ├── 77c47cf9cfec8ec740c5a18dc4386670.jpg │ ├── 791424a3e5f6e2f4372471d96e5b4676.jpg │ ├── 7a33368daf8723e9a736c50a54b4d084.jpg │ ├── 7a9506c9bd23ed8b08861cd51eaf5cc3.jpg │ ├── 7a9b61eccdf8d02c95b6cd81a63e02ac.jpg │ ├── 7ab51337f3701838a285ea3a7346a777.jpg │ ├── 7b85580dc5d18fc8fec74ce74849c182.jpg │ ├── 7bb886fc0ea7d5d1144002edd99e0c7f.jpg │ ├── 7c0c7d4ea0f6a4cc6d3b9942f440d2ff.jpg │ ├── 7e397a7a0557431be9b98b2af35968d6.jpg │ ├── 7f8b8ddc9f821d1c5a27849bc02e355f.jpg │ ├── 80a25ad6329d3836f4e625a1c93e7898.jpg │ ├── 80f5d2344fd2f483b82b81d0a33e9333.jpg │ ├── 815f9ee92336e4ffc376f80bcb777ea1.jpg │ ├── 822f02d6117220d35ba69a1e20befe65.jpg │ ├── 82a22af158d760e46ae93ba1663a6487.jpg │ ├── 82e0633e9121ff663a913eb95a3dd723.jpg │ ├── 82f787ceda05c98a84ab98cdc998025c.jpg │ ├── 84566f6949f9a2f8734318c284f441f7.jpg │ ├── 85a26958e55acab88aef1ab37443b30b.jpg │ ├── 86176a13e0a00622dbc982348d7ca623.jpg │ ├── 875e532ac3b299876d209507d595df14.jpg │ ├── 88d05071bd3700af0ba08bab16c423be.jpg │ ├── 8973b73843e90120de5f556d5084eb49.jpg │ ├── 8c3fdcf6adcc472c7cd7a4598f96caac.jpg │ ├── 8d1654d45d287b49d6a7cbcae26c598f.jpg │ ├── 8dc8e70e19ec4318b12b16f1c5bdb879.jpg │ ├── 8f58cf98a539286a53e41582f194fbed.jpg │ ├── 901db29887d45801cb568cdd53d72a99.jpg │ ├── 905a46295f1f2a591a5d0b563d44277b.jpg │ ├── 90a1240e7489f989b9a4e5739b1efbd5.jpg │ ├── 91d663abfef497e13ec41f9300a5c354.jpg │ ├── 91deb6bcd6225e40290234462f33288a.jpg │ ├── 92df4afaf5010b135936512a39fb87d8.jpg │ ├── 946e2a245a8fae021860977280b52b44.jpg │ ├── 94b77459ef6ab620703ddb014430c700.jpg │ ├── 95594348fc6d49d2819be3d412a27e55.jpg │ ├── 962ddfc5aa5c0edc0ea500f82be01ac0.jpg │ ├── 96f213d1391dab5d9cd7f4ff68e739aa.jpg │ ├── 97fa452fc3843513b0746e76d216be78.jpg │ ├── 9af4ce81465021e68c774194432663c0.jpg │ ├── 9b41f0fbb97ef7ddd6383753e6ad1c26.jpg │ ├── 9b8d35ed3fc944be3e432c47b447f92f.jpg │ ├── 9d27515800718ff1cc0ac326899c7f77.jpg │ ├── 9d7dabd9ffa8795e12f2bcdf181e0b62.jpg │ ├── a389bc9d64e6d8eb9bc985f12054716b.jpg │ ├── a5fda7453d5707d5e8985434c789ba48.jpg │ ├── a70ad9ce54b1e6e921ef6745fcef45da.jpg │ ├── a769c068095381d9207afe431343c95c.jpg │ ├── aa2fbf6676b8fd4f67229d35f1c7c537.jpg │ ├── ad37847dfd8d9f3d99f646966f32cf30.jpg │ ├── aec897e37f71d43694de4db49ed3be3e.jpg │ ├── aed7e56b0a3e63a84e53c79df4f79b0e.jpg │ ├── aef64ee73dc1b1a03a152855f685113e.jpg │ ├── afa87c5126806e604709f243ab72848b.jpg │ ├── b288f19072faa2f8f373d5a8910c080b.jpg │ ├── b3039f057e9453e4183ed33aecf5815f.jpg │ ├── 
b3268e19b1a48f645d17d659940fb084.jpg │ ├── b4a297ef2185e28694b366bde4069858.jpg │ ├── b6bd384dd0f03237f1b1b36428d27842.jpg │ ├── b7721ad6f461509452813013157c7a5e.jpg │ ├── b7d416463ca0be2cb7caf7d373dc26cc.jpg │ ├── b7d7ca35788d7bfb804b5b230a76af8c.jpg │ ├── b8bf446d4a625497f28f2347b7ca0c92.jpg │ ├── b8c9ccb17235ad37b2b0fee18853efe6.jpg │ ├── b9eb842264e6a48a42ecf5f142e32414.jpg │ ├── baa636adac3ad30302c0a36fc2f58751.jpg │ ├── bab25b7785bf747bc1caa1442874df74.jpg │ ├── be4d423d387dbcd6a770d4bda5718082.jpg │ ├── c03bdd903e4bd06d018711d1dece0c35.jpg │ ├── c089ca6ef2f36b0394d7bcf41db78030.jpg │ ├── c1bfb9f293835166e1378720b9f206b8.jpg │ ├── c24065c33e1cca422d1ae92f57cd77c1.jpg │ ├── c421a389906a45c77337a6a68fa78a0b.jpg │ ├── c45aab6ee1f6f00de1ac3f428e62b01c.jpg │ ├── c4660874124a448ac14209f4a59e367a.jpg │ ├── c51fb942d508d4161e72d0075a5284e7.jpg │ ├── c647aced84d4783e96a244a8af78ddd2.jpg │ ├── c789e9bbaa3506dc90047b5cd487a42a.jpg │ ├── c8a2ccec457f128649ad30a2ba066a48.jpg │ ├── c9c3087ea25e6c3f848030b33b06de8f.jpg │ ├── cab981b993e03ab12309dd619da9e31d.jpg │ ├── cb0b50e4410efd78416163f37eaf1262.jpg │ ├── cb63c877ea3af266bb0f5ad6ba5e0b1d.jpg │ ├── cdcdbf84e640274f429780824ccf99ae.jpg │ ├── cedd3825782041ef84d7741e62528a42.jpg │ ├── d003fed20e7f2d040ccc24412cb854d1.jpg │ ├── d09c46ec94d638e4ddcecfbba1c11ea8.jpg │ ├── d142da9aae51c6d3c3c736fc82252862.jpg │ ├── d2f9799d371fde446e6dc8292ba07393.jpg │ ├── d3a773e713ad3244265d91b77ef7fb7e.jpg │ ├── d3b112475692c0421480c01cd029cf09.jpg │ ├── d4b213f9046b3ed8b898fac4d4aeec34.jpg │ ├── d4b34834b440d5d60f25912180e7e130.jpg │ ├── d4c847aca412080f018bab9df543ff7b.jpg │ ├── d69988406d72ad9e624d24db6b4d2838.jpg │ ├── d6c0dcf5a8894d7495e320405295cc8f.jpg │ ├── d7945700ecde92ac83058e07433755da.jpg │ ├── d7cd0e2a15aa54e4700d3dc03e6ac28d.jpg │ ├── d9195c20e19c173ec6d22c2e60a2cddb.jpg │ ├── dd0fad3141f468ebc29678d3ff86055d.jpg │ ├── e1164e5922bbcc2db8e6b23c145b8f75.jpg │ ├── e97f8315ce721d1417bc7bb3b4a9d332.jpg │ ├── eacebbc96f1d97c47d903d7981ce1167.jpg │ ├── ec9e0b7231caed693477682311612304.jpg │ ├── ed008064e9d0e55dc93f673b9aca6b65.jpg │ ├── edb67528127916e7e274addf9ad96029.jpg │ ├── eea23835a8abd9d903f56256c18cf8aa.jpg │ ├── f18ecec7a6c176301d7370e41a0a60dd.jpg │ ├── f3f89822d498eea24c520e0ab3cb6b0d.jpg │ ├── f4e95f92187a42f257864cd22193c8ad.jpg │ ├── f5832d90e75d18f501ede7acb0b6ce74.jpg │ ├── f80c359151c40c9277e2d70f38856eab.jpg │ ├── f9711a0b52dcab7b1173e08ac154cdb4.jpg │ ├── fad9e18cebad821450ed0f34abdb3988.jpg │ ├── fcc4c8c5db1d6aa3ff080466e10ccb74.jpg │ ├── fdfe96b0b4fdfbfd862a698dc64ce34a.jpg │ └── fef76f108c095f250d8e9efb4cfcb710.jpg └── styles └── ebook.css /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | .DS_Store 103 | 104 | # gitbook 105 | _book 106 | 107 | # node.js 108 | node_modules 109 | 110 | # windows 111 | Thumbs.db 112 | 113 | # word 114 | ~$*.docx 115 | ~$*.doc 116 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International Public License (CC BY-NC-SA 4.0) 2 | 3 | Copyright © 2020 ApacheCN(apachecn@163.com) 4 | 5 | By exercising the Licensed Rights (defined below), You accept and agree to be bound by the terms and conditions of this Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International Public License ("Public License"). To the extent this Public License may be interpreted as a contract, You are granted the Licensed Rights in consideration of Your acceptance of these terms and conditions, and the Licensor grants You such rights in consideration of benefits the Licensor receives from making the Licensed Material available under these terms and conditions. 6 | 7 | Section 1 – Definitions. 8 | 9 | a. Adapted Material means material subject to Copyright and Similar Rights that is derived from or based upon the Licensed Material and in which the Licensed Material is translated, altered, arranged, transformed, or otherwise modified in a manner requiring permission under the Copyright and Similar Rights held by the Licensor. For purposes of this Public License, where the Licensed Material is a musical work, performance, or sound recording, Adapted Material is always produced where the Licensed Material is synched in timed relation with a moving image. 10 | b. Adapter's License means the license You apply to Your Copyright and Similar Rights in Your contributions to Adapted Material in accordance with the terms and conditions of this Public License. 11 | c. BY-NC-SA Compatible License means a license listed at creativecommons.org/compatiblelicenses, approved by Creative Commons as essentially the equivalent of this Public License. 12 | d. Copyright and Similar Rights means copyright and/or similar rights closely related to copyright including, without limitation, performance, broadcast, sound recording, and Sui Generis Database Rights, without regard to how the rights are labeled or categorized. For purposes of this Public License, the rights specified in Section 2(b)(1)-(2) are not Copyright and Similar Rights. 13 | e. 
Effective Technological Measures means those measures that, in the absence of proper authority, may not be circumvented under laws fulfilling obligations under Article 11 of the WIPO Copyright Treaty adopted on December 20, 1996, and/or similar international agreements. 14 | f. Exceptions and Limitations means fair use, fair dealing, and/or any other exception or limitation to Copyright and Similar Rights that applies to Your use of the Licensed Material. 15 | g. License Elements means the license attributes listed in the name of a Creative Commons Public License. The License Elements of this Public License are Attribution, NonCommercial, and ShareAlike. 16 | h. Licensed Material means the artistic or literary work, database, or other material to which the Licensor applied this Public License. 17 | i. Licensed Rights means the rights granted to You subject to the terms and conditions of this Public License, which are limited to all Copyright and Similar Rights that apply to Your use of the Licensed Material and that the Licensor has authority to license. 18 | j. Licensor means the individual(s) or entity(ies) granting rights under this Public License. 19 | k. NonCommercial means not primarily intended for or directed towards commercial advantage or monetary compensation. For purposes of this Public License, the exchange of the Licensed Material for other material subject to Copyright and Similar Rights by digital file-sharing or similar means is NonCommercial provided there is no payment of monetary compensation in connection with the exchange. 20 | l. Share means to provide material to the public by any means or process that requires permission under the Licensed Rights, such as reproduction, public display, public performance, distribution, dissemination, communication, or importation, and to make material available to the public including in ways that members of the public may access the material from a place and at a time individually chosen by them. 21 | m. Sui Generis Database Rights means rights other than copyright resulting from Directive 96/9/EC of the European Parliament and of the Council of 11 March 1996 on the legal protection of databases, as amended and/or succeeded, as well as other essentially equivalent rights anywhere in the world. 22 | n. You means the individual or entity exercising the Licensed Rights under this Public License. Your has a corresponding meaning. 23 | 24 | Section 2 – Scope. 25 | 26 | a. License grant. 27 | 1. Subject to the terms and conditions of this Public License, the Licensor hereby grants You a worldwide, royalty-free, non-sublicensable, non-exclusive, irrevocable license to exercise the Licensed Rights in the Licensed Material to: 28 | A. reproduce and Share the Licensed Material, in whole or in part, for NonCommercial purposes only; and 29 | B. produce, reproduce, and Share Adapted Material for NonCommercial purposes only. 30 | 2. Exceptions and Limitations. For the avoidance of doubt, where Exceptions and Limitations apply to Your use, this Public License does not apply, and You do not need to comply with its terms and conditions. 31 | 3. Term. The term of this Public License is specified in Section 6(a). 32 | 4. Media and formats; technical modifications allowed. The Licensor authorizes You to exercise the Licensed Rights in all media and formats whether now known or hereafter created, and to make technical modifications necessary to do so. 
The Licensor waives and/or agrees not to assert any right or authority to forbid You from making technical modifications necessary to exercise the Licensed Rights, including technical modifications necessary to circumvent Effective Technological Measures. For purposes of this Public License, simply making modifications authorized by this Section 2(a)(4) never produces Adapted Material. 33 | 5. Downstream recipients. 34 | A. Offer from the Licensor – Licensed Material. Every recipient of the Licensed Material automatically receives an offer from the Licensor to exercise the Licensed Rights under the terms and conditions of this Public License. 35 | B. Additional offer from the Licensor – Adapted Material. Every recipient of Adapted Material from You automatically receives an offer from the Licensor to exercise the Licensed Rights in the Adapted Material under the conditions of the Adapter’s License You apply. 36 | C. No downstream restrictions. You may not offer or impose any additional or different terms or conditions on, or apply any Effective Technological Measures to, the Licensed Material if doing so restricts exercise of the Licensed Rights by any recipient of the Licensed Material. 37 | 6. No endorsement. Nothing in this Public License constitutes or may be construed as permission to assert or imply that You are, or that Your use of the Licensed Material is, connected with, or sponsored, endorsed, or granted official status by, the Licensor or others designated to receive attribution as provided in Section 3(a)(1)(A)(i). 38 | b. Other rights. 39 | 1. Moral rights, such as the right of integrity, are not licensed under this Public License, nor are publicity, privacy, and/or other similar personality rights; however, to the extent possible, the Licensor waives and/or agrees not to assert any such rights held by the Licensor to the limited extent necessary to allow You to exercise the Licensed Rights, but not otherwise. 40 | 2. Patent and trademark rights are not licensed under this Public License. 41 | 3. To the extent possible, the Licensor waives any right to collect royalties from You for the exercise of the Licensed Rights, whether directly or through a collecting society under any voluntary or waivable statutory or compulsory licensing scheme. In all other cases the Licensor expressly reserves any right to collect such royalties, including when the Licensed Material is used other than for NonCommercial purposes. 42 | 43 | Section 3 – License Conditions. 44 | 45 | Your exercise of the Licensed Rights is expressly made subject to the following conditions. 46 | 47 | a. Attribution. 48 | 1. If You Share the Licensed Material (including in modified form), You must: 49 | A. retain the following if it is supplied by the Licensor with the Licensed Material: 50 | i. identification of the creator(s) of the Licensed Material and any others designated to receive attribution, in any reasonable manner requested by the Licensor (including by pseudonym if designated); 51 | ii. a copyright notice; 52 | iii. a notice that refers to this Public License; 53 | iv. a notice that refers to the disclaimer of warranties; 54 | v. a URI or hyperlink to the Licensed Material to the extent reasonably practicable; 55 | B. indicate if You modified the Licensed Material and retain an indication of any previous modifications; and 56 | C. indicate the Licensed Material is licensed under this Public License, and include the text of, or the URI or hyperlink to, this Public License. 57 | 2. 
You may satisfy the conditions in Section 3(a)(1) in any reasonable manner based on the medium, means, and context in which You Share the Licensed Material. For example, it may be reasonable to satisfy the conditions by providing a URI or hyperlink to a resource that includes the required information. 58 | 3. If requested by the Licensor, You must remove any of the information required by Section 3(a)(1)(A) to the extent reasonably practicable. 59 | b. ShareAlike. 60 | In addition to the conditions in Section 3(a), if You Share Adapted Material You produce, the following conditions also apply. 61 | 1. The Adapter’s License You apply must be a Creative Commons license with the same License Elements, this version or later, or a BY-NC-SA Compatible License. 62 | 2. You must include the text of, or the URI or hyperlink to, the Adapter's License You apply. You may satisfy this condition in any reasonable manner based on the medium, means, and context in which You Share Adapted Material. 63 | 3. You may not offer or impose any additional or different terms or conditions on, or apply any Effective Technological Measures to, Adapted Material that restrict exercise of the rights granted under the Adapter's License You apply. 64 | 65 | Section 4 – Sui Generis Database Rights. 66 | 67 | Where the Licensed Rights include Sui Generis Database Rights that apply to Your use of the Licensed Material: 68 | 69 | a. for the avoidance of doubt, Section 2(a)(1) grants You the right to extract, reuse, reproduce, and Share all or a substantial portion of the contents of the database for NonCommercial purposes only; 70 | b. if You include all or a substantial portion of the database contents in a database in which You have Sui Generis Database Rights, then the database in which You have Sui Generis Database Rights (but not its individual contents) is Adapted Material, including for purposes of Section 3(b); and 71 | c. You must comply with the conditions in Section 3(a) if You Share all or a substantial portion of the contents of the database. 72 | 73 | For the avoidance of doubt, this Section 4 supplements and does not replace Your obligations under this Public License where the Licensed Rights include other Copyright and Similar Rights. 74 | 75 | Section 5 – Disclaimer of Warranties and Limitation of Liability. 76 | 77 | a. Unless otherwise separately undertaken by the Licensor, to the extent possible, the Licensor offers the Licensed Material as-is and as-available, and makes no representations or warranties of any kind concerning the Licensed Material, whether express, implied, statutory, or other. This includes, without limitation, warranties of title, merchantability, fitness for a particular purpose, non-infringement, absence of latent or other defects, accuracy, or the presence or absence of errors, whether or not known or discoverable. Where disclaimers of warranties are not allowed in full or in part, this disclaimer may not apply to You. 78 | b. To the extent possible, in no event will the Licensor be liable to You on any legal theory (including, without limitation, negligence) or otherwise for any direct, special, indirect, incidental, consequential, punitive, exemplary, or other losses, costs, expenses, or damages arising out of this Public License or use of the Licensed Material, even if the Licensor has been advised of the possibility of such losses, costs, expenses, or damages. Where a limitation of liability is not allowed in full or in part, this limitation may not apply to You. 79 | c. 
The disclaimer of warranties and limitation of liability provided above shall be interpreted in a manner that, to the extent possible, most closely approximates an absolute disclaimer and waiver of all liability.

Section 6 – Term and Termination.

a. This Public License applies for the term of the Copyright and Similar Rights licensed here. However, if You fail to comply with this Public License, then Your rights under this Public License terminate automatically.
b. Where Your right to use the Licensed Material has terminated under Section 6(a), it reinstates:
1. automatically as of the date the violation is cured, provided it is cured within 30 days of Your discovery of the violation; or
2. upon express reinstatement by the Licensor.
For the avoidance of doubt, this Section 6(b) does not affect any right the Licensor may have to seek remedies for Your violations of this Public License.
c. For the avoidance of doubt, the Licensor may also offer the Licensed Material under separate terms or conditions or stop distributing the Licensed Material at any time; however, doing so will not terminate this Public License.
d. Sections 1, 5, 6, 7, and 8 survive termination of this Public License.

Section 7 – Other Terms and Conditions.

a. The Licensor shall not be bound by any additional or different terms or conditions communicated by You unless expressly agreed.
b. Any arrangements, understandings, or agreements regarding the Licensed Material not stated herein are separate from and independent of the terms and conditions of this Public License.

Section 8 – Interpretation.

a. For the avoidance of doubt, this Public License does not, and shall not be interpreted to, reduce, limit, restrict, or impose conditions on any use of the Licensed Material that could lawfully be made without permission under this Public License.
b. To the extent possible, if any provision of this Public License is deemed unenforceable, it shall be automatically reformed to the minimum extent necessary to make it enforceable. If the provision cannot be reformed, it shall be severed from this Public License without affecting the enforceability of the remaining terms and conditions.
c. No term or condition of this Public License will be waived and no failure to comply consented to unless expressly agreed to by the Licensor.
d. Nothing in this Public License constitutes or may be interpreted as a limitation upon, or waiver of, any privileges and immunities that apply to the Licensor or You, including from the legal processes of any jurisdiction or authority.
--------------------------------------------------------------------------------

/cover.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/apachecn/learning-pyspark-zh/13ee6f83f71412586d131acede7417d4f90c18fd/cover.jpg

--------------------------------------------------------------------------------

/docs/1.md:
--------------------------------------------------------------------------------
# 1\. Preface

## 1.1\. About

### 1.1.1\. About this note
This is the shared repository for the [PySpark learning handbook](https://github.com/runawayhorse001/LearningApacheSpark). The first version was posted on the Github of [ChenFeng](https://mingchen0919.github.io/learning-apache-spark/index.html) ([[Feng2017]](reference.html#feng2017)). This shared repository mainly contains the self-learning and self-teaching notes from Wenqiang during his [IMA Data Science Fellowship](https://www.ima.umn.edu/2016-2017/SW1.23-3.10.17#). Readers can refer to the repository [apachecn/learning-pyspark-zh](https://github.com/apachecn/learning-pyspark-zh) for more details about the `dataset` and the `.ipynb` files.

In this repository, I try to use detailed demo code and examples to show how to use each main function. If you find that your work is not cited in this material, please feel free to let me know.

Although I am by no means an expert in data mining programming or big data, I have decided to share my insights on PySpark programming in the form of simple tutorials with detailed examples. I hope these tutorials will be a valuable tool for your studies.

The tutorials assume that the reader has preliminary knowledge of programming and Linux. The English documentation was automatically generated with [sphinx](http://sphinx.pocoo.org).

### 1.1.2\. About the author

* **Wenqiang Feng**

  * Data Scientist and Mathematics PhD
  * University of Tennessee, Knoxville
  * Email: [von198@gmail.com](mailto:von198%40gmail.com)

* **Biography**

  Wenqiang Feng is a Data Scientist in the Applied Analytics Group at DST. Dr. Feng's responsibilities include providing DST clients with state-of-the-art skills and technologies, including big data analytic solutions, advanced analytic and data enhancement techniques, and modeling.

  Dr. Feng has deep analytic expertise in data mining, analytic systems, machine learning algorithms, business intelligence, and applying big data tools to strategically solve industry problems in a cross-functional business. Before joining DST, Dr. Feng was a Data Science Fellow at the Institute for Mathematics and its Applications (IMA) at the University of Minnesota. There, he helped startup companies make marketing decisions based on deep predictive analytics.

  Dr. Feng graduated from the University of Tennessee, Knoxville, with a PhD in Computational Mathematics and a Master's degree in Statistics. He also holds a Master's degree in Computational Mathematics from Missouri University of Science and Technology (MST) and a Master's degree in Applied Mathematics from the University of Science and Technology of China (USTC).

* **Declaration**

  Feng's work was supported by IMA during his time there. However, any opinions, findings, conclusions or recommendations expressed in this material are those of the author and do not necessarily reflect the views of the IMA, UTK or DST.

## 1.2\. Motivation for this tutorial

I was motivated by the [IMA Data Science Fellowship](https://www.ima.umn.edu/2016-2017/SW1.23-3.10.17#) project to learn PySpark. After that, I was deeply impressed by PySpark. I feel that:

> 1. It is no exaggeration to say that Spark is the most powerful big data tool.
> 2. However, I still found learning Spark a difficult process. I had to search around and figure out which answer was the right one, and it was hard to find detailed examples from which I could easily learn the complete procedure in one file.
> 3. Good resources are expensive for a graduate student.

## 1.3\. Copyright notice and license information

The code in this document is released under the [MIT license](https://github.com/runawayhorse001/LearningApacheSpark/blob/master/LICENSE), and the text under the [CC BY-NC-SA 4.0 license](https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode).

**When you plan to use, copy, modify, merge, publish, distribute or sublicense this material, please check the terms of these licenses for more details and give the author the corresponding attribution.**

## 1.4\. Acknowledgements

Here I would like to thank Ming Chen, Jian Sun and Zhongbo Li at the University of Tennessee, Knoxville, for the valuable discussions, and to thank the generous anonymous authors who provide detailed solutions and source code on the internet. Without their help, this repository could not have been built. Wenqiang would also like to thank the [Institute for Mathematics and its Applications (IMA)](https://www.ima.umn.edu/) at the [University of Minnesota, Twin Cities](https://twin-cities.umn.edu/) for the support during his IMA Data Scientist fellowship.

Special thanks go to [Dr. Haiping Lu](http://staffwww.dcs.shef.ac.uk/people/H.Lu/), Lecturer in Machine Learning in the Department of Computer Science at the University of Sheffield, for recommending and heavily using my tutorial in his teaching and for providing valuable suggestions.

## 1.5\. Feedback and suggestions

Your comments and suggestions are highly appreciated. I am more than happy to receive corrections, suggestions or feedback by email ([von198@gmail.com](mailto:von198%40gmail.com)) for improvement.
--------------------------------------------------------------------------------

/docs/11.md:
--------------------------------------------------------------------------------
# 11\. Clustering

Chinese proverb

Sharpening the knife longer can make it easier to chop the firewood – old Chinese proverb

![https://runawayhorse001.github.io/LearningApacheSpark/_images/clustering_logo.png](img/eacebbc96f1d97c47d903d7981ce1167.jpg)

The above figure was generated by the code from: [Python Data Science Handbook](https://jakevdp.github.io/PythonDataScienceHandbook/06.00-figure-code.html#Expectation-Maximization).

## 11.1\. K-Means Model

### 11.1.1\. Introduction

k-means clustering is a method of vector quantization, originally from signal processing, that is popular for cluster analysis in data mining. The approach k-means follows to solve the problem is called **Expectation-Maximization**. It can be described as follows:

1. Assign some cluster centers

2. Repeat until converged

> * E-step: assign points to the nearest center
> * M-step: set the cluster center to the mean
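To make the two-step loop above concrete before moving to the PySpark demo, here is a minimal plain-NumPy sketch of the E-step/M-step iteration. This example is an added illustration rather than part of the original notebook; the toy blobs, `k=3`, the seed and the tolerance are arbitrary assumptions:

```
import numpy as np

def kmeans_em(X, k=3, max_iter=100, tol=1e-6, seed=1):
    """Minimal k-means via the E-step / M-step loop described above."""
    rng = np.random.default_rng(seed)
    # initialization: pick k distinct points as the starting cluster centers
    centers = X[rng.choice(len(X), size=k, replace=False)]
    for _ in range(max_iter):
        # E-step: assign every point to its nearest center
        dist = np.linalg.norm(X[:, None, :] - centers[None, :, :], axis=2)
        labels = dist.argmin(axis=1)
        # M-step: move each center to the mean of the points assigned to it
        new_centers = np.array([X[labels == j].mean(axis=0) for j in range(k)])
        if np.linalg.norm(new_centers - centers) < tol:  # converged
            break
        centers = new_centers
    return centers, labels

# toy data: three well-separated 2-D Gaussian blobs
rng = np.random.default_rng(0)
X = np.vstack([rng.normal(loc=c, scale=0.3, size=(50, 2))
               for c in ([0, 0], [3, 3], [0, 3])])
centers, labels = kmeans_em(X, k=3)
print(centers)

```

The PySpark `KMeans` estimator used below follows the same assign-then-recompute idea, but distributes the two steps across the cluster.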
Given a set of observations ![(x_1, x_2, \cdots, x_m)](img/290e0b58c66f2b75c67fd1a15e3fe958.jpg), the objective function is

![J = \sum_{i=1}^{m}\sum_{k=1}^{K}w_{ik} ||x_i-c_k||^2](img/0d2c607e00ca608222b80fa6b61e780a.jpg)

where ![w_{ik}=1](img/a769c068095381d9207afe431343c95c.jpg) if ![x_i](img/82e0633e9121ff663a913eb95a3dd723.jpg) is in cluster ![k](img/739c6ec939fd446ba1cde4cf4620512a.jpg); otherwise ![w_{ik}=0](img/7a9506c9bd23ed8b08861cd51eaf5cc3.jpg), and ![c_k](img/cab981b993e03ab12309dd619da9e31d.jpg) is the centroid of ![x_i](img/82e0633e9121ff663a913eb95a3dd723.jpg)'s cluster.

Mathematically, k-means is a minimization problem with two parts: first, we minimize ![J](img/d6c0dcf5a8894d7495e320405295cc8f.jpg) w.r.t. ![w_{ik}](img/c647aced84d4783e96a244a8af78ddd2.jpg) with ![c_k](img/cab981b993e03ab12309dd619da9e31d.jpg) fixed; then we minimize ![J](img/d6c0dcf5a8894d7495e320405295cc8f.jpg) w.r.t. ![c_k](img/cab981b993e03ab12309dd619da9e31d.jpg) with ![w_{ik}](img/c647aced84d4783e96a244a8af78ddd2.jpg) fixed, i.e.

**E-step**:

![\frac{\partial J}{\partial w_{ik}} = \sum_{i=1}^{m}\sum_{k=1}^{K} ||x_i-c_k||^2 \Rightarrow w_{ik} =\left\{ \begin{array}{ll} 1, & \text{ if } k = argmin_{j} ||x_i-c_j||^2 \\ 0, & \text{ otherwise } \end{array} \right.](img/c421a389906a45c77337a6a68fa78a0b.jpg)

**M-step**:

![\frac{\partial J}{\partial c_k} = 2\sum_{i=1}^{m} w_{ik}(x_i-c_k) =0 \Rightarrow c_k = \frac{\sum_{i=1}^{m}w_{ik}x_i}{\sum_{i=1}^{m}w_{ik}}](img/62e9e7ead57d4e2a4be61668d6aff334.jpg)

### 11.1.2\. Demo

1. Set up spark context and SparkSession

```
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("Python Spark K-means example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

```

1. Load dataset

```
df = spark.read.format('com.databricks.spark.csv').\
                       options(header='true', \
                       inferschema='true').\
                       load("../data/iris.csv",header=True)

```

check the data set

```
df.show(5,True)
df.printSchema()

```

Then you will get

```
+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|species|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| setosa|
|         4.9|        3.0|         1.4|        0.2| setosa|
|         4.7|        3.2|         1.3|        0.2| setosa|
|         4.6|        3.1|         1.5|        0.2| setosa|
|         5.0|        3.6|         1.4|        0.2| setosa|
+------------+-----------+------------+-----------+-------+
only showing top 5 rows

root
 |-- sepal_length: double (nullable = true)
 |-- sepal_width: double (nullable = true)
 |-- petal_length: double (nullable = true)
 |-- petal_width: double (nullable = true)
 |-- species: string (nullable = true)

```

You can also get the statistical results from the data frame (unfortunately, it only works for numerical columns).
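As a small aside that is not part of the original demo: for the categorical `species` column, a quick frequency summary can be obtained with a `groupBy` count; the numerical `describe()` summary from the original demo then follows right below.

```
# frequency table for the categorical column (added aside)
df.groupBy('species').count().show()

```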
104 | 105 | ``` 106 | df.describe().show() 107 | 108 | ``` 109 | 110 | Then you will get 111 | 112 | ``` 113 | +-------+------------------+-------------------+------------------+------------------+---------+ 114 | |summary| sepal_length| sepal_width| petal_length| petal_width| species| 115 | +-------+------------------+-------------------+------------------+------------------+---------+ 116 | | count| 150| 150| 150| 150| 150| 117 | | mean| 5.843333333333335| 3.0540000000000007|3.7586666666666693|1.1986666666666672| null| 118 | | stddev|0.8280661279778637|0.43359431136217375| 1.764420419952262|0.7631607417008414| null| 119 | | min| 4.3| 2.0| 1.0| 0.1| setosa| 120 | | max| 7.9| 4.4| 6.9| 2.5|virginica| 121 | +-------+------------------+-------------------+------------------+------------------+---------+ 122 | 123 | ``` 124 | 125 | 1. Convert the data to dense vector (**features**) 126 | 127 | ``` 128 | # convert the data to dense vector 129 | def transData(data): 130 | return data.rdd.map(lambda r: [Vectors.dense(r[:-1])]).toDF(['features']) 131 | 132 | ``` 133 | 134 | Note 135 | 136 | You are strongly encouraged to try my `get_dummy` function for dealing with the categorical data in complex dataset. 137 | 138 | Supervised learning version: 139 | 140 | > ``` 141 | > def get_dummy(df,indexCol,categoricalCols,continuousCols,labelCol): 142 | > 143 | > from pyspark.ml import Pipeline 144 | > from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler 145 | > from pyspark.sql.functions import col 146 | > 147 | > indexers = [ StringIndexer(inputCol=c, outputCol="{0}_indexed".format(c)) 148 | > for c in categoricalCols ] 149 | > 150 | > # default setting: dropLast=True 151 | > encoders = [ OneHotEncoder(inputCol=indexer.getOutputCol(), 152 | > outputCol="{0}_encoded".format(indexer.getOutputCol())) 153 | > for indexer in indexers ] 154 | > 155 | > assembler = VectorAssembler(inputCols=[encoder.getOutputCol() for encoder in encoders] 156 | > + continuousCols, outputCol="features") 157 | > 158 | > pipeline = Pipeline(stages=indexers + encoders + [assembler]) 159 | > 160 | > model=pipeline.fit(df) 161 | > data = model.transform(df) 162 | > 163 | > data = data.withColumn('label',col(labelCol)) 164 | > 165 | > return data.select(indexCol,'features','label') 166 | > 167 | > ``` 168 | 169 | Unsupervised learning version: 170 | 171 | > ``` 172 | > def get_dummy(df,indexCol,categoricalCols,continuousCols): 173 | > ''' 174 | > Get dummy variables and concat with continuous variables for unsupervised learning. 
175 | > :param df: the dataframe 176 | > :param categoricalCols: the name list of the categorical data 177 | > :param continuousCols: the name list of the numerical data 178 | > :return k: feature matrix 179 | > 180 | > :author: Wenqiang Feng 181 | > :email: von198@gmail.com 182 | > ''' 183 | > 184 | > indexers = [ StringIndexer(inputCol=c, outputCol="{0}_indexed".format(c)) 185 | > for c in categoricalCols ] 186 | > 187 | > # default setting: dropLast=True 188 | > encoders = [ OneHotEncoder(inputCol=indexer.getOutputCol(), 189 | > outputCol="{0}_encoded".format(indexer.getOutputCol())) 190 | > for indexer in indexers ] 191 | > 192 | > assembler = VectorAssembler(inputCols=[encoder.getOutputCol() for encoder in encoders] 193 | > + continuousCols, outputCol="features") 194 | > 195 | > pipeline = Pipeline(stages=indexers + encoders + [assembler]) 196 | > 197 | > model=pipeline.fit(df) 198 | > data = model.transform(df) 199 | > 200 | > return data.select(indexCol,'features') 201 | > 202 | > ``` 203 | 204 | 1. Transform the dataset to DataFrame 205 | 206 | ``` 207 | transformed= transData(df) 208 | transformed.show(5, False) 209 | 210 | ``` 211 | 212 | ``` 213 | +-----------------+ 214 | |features | 215 | +-----------------+ 216 | |[5.1,3.5,1.4,0.2]| 217 | |[4.9,3.0,1.4,0.2]| 218 | |[4.7,3.2,1.3,0.2]| 219 | |[4.6,3.1,1.5,0.2]| 220 | |[5.0,3.6,1.4,0.2]| 221 | +-----------------+ 222 | only showing top 5 rows 223 | 224 | ``` 225 | 226 | 1. Deal With Categorical Variables 227 | 228 | ``` 229 | from pyspark.ml import Pipeline 230 | from pyspark.ml.regression import LinearRegression 231 | from pyspark.ml.feature import VectorIndexer 232 | from pyspark.ml.evaluation import RegressionEvaluator 233 | 234 | # Automatically identify categorical features, and index them. 235 | # We specify maxCategories so features with > 4 distinct values are treated as continuous. 236 | 237 | featureIndexer = VectorIndexer(inputCol="features", \ 238 | outputCol="indexedFeatures",\ 239 | maxCategories=4).fit(transformed) 240 | 241 | data = featureIndexer.transform(transformed) 242 | 243 | ``` 244 | 245 | Now you check your dataset with 246 | 247 | ``` 248 | data.show(5,True) 249 | 250 | ``` 251 | 252 | you will get 253 | 254 | ``` 255 | +-----------------+-----------------+ 256 | | features| indexedFeatures| 257 | +-----------------+-----------------+ 258 | |[5.1,3.5,1.4,0.2]|[5.1,3.5,1.4,0.2]| 259 | |[4.9,3.0,1.4,0.2]|[4.9,3.0,1.4,0.2]| 260 | |[4.7,3.2,1.3,0.2]|[4.7,3.2,1.3,0.2]| 261 | |[4.6,3.1,1.5,0.2]|[4.6,3.1,1.5,0.2]| 262 | |[5.0,3.6,1.4,0.2]|[5.0,3.6,1.4,0.2]| 263 | +-----------------+-----------------+ 264 | only showing top 5 rows 265 | 266 | ``` 267 | 268 | Note 269 | 270 | Since clustering algorithms including k-means use distance-based measurements to determine the similarity between data points, It’s strongly recommended to standardize the data to have a mean of zero and a standard deviation of one. 271 | 272 | 1. 
Elbow method to determine the optimal number of clusters for k-means clustering

```
import numpy as np
from pyspark.ml.clustering import KMeans

cost = np.zeros(20)
for k in range(2,20):
    kmeans = KMeans()\
        .setK(k)\
        .setSeed(1) \
        .setFeaturesCol("indexedFeatures")\
        .setPredictionCol("cluster")

    model = kmeans.fit(data)
    cost[k] = model.computeCost(data) # requires Spark 2.0 or later

```

```
import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import seaborn as sbs
from matplotlib.ticker import MaxNLocator

fig, ax = plt.subplots(1,1, figsize =(8,6))
ax.plot(range(2,20),cost[2:20])
ax.set_xlabel('k')
ax.set_ylabel('cost')
ax.xaxis.set_major_locator(MaxNLocator(integer=True))
plt.show()

```

![https://runawayhorse001.github.io/LearningApacheSpark/_images/elbow.png](img/92df4afaf5010b135936512a39fb87d8.jpg)

In my opinion, it is sometimes hard to choose the optimal number of clusters with the `elbow method`. As shown in the following figure, you could choose 3, 5 or even 8\. I will choose `3` in this demo.

![https://runawayhorse001.github.io/LearningApacheSpark/_images/elbow_rfm.png](img/d4b213f9046b3ed8b898fac4d4aeec34.jpg)

* Silhouette analysis

```
#PySpark libraries
import time                 # needed for the timing below
import numpy as np          # needed for np.arange / np.asanyarray / np.argmax
import pandas as pd         # needed for the results tables

from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.sql.functions import col, percent_rank, lit
from pyspark.sql.window import Window
from pyspark.sql import DataFrame, Row
from pyspark.sql.types import StructType
from functools import reduce # For Python 3.x

from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

def optimal_k(df_in,index_col,k_min, k_max,num_runs):
    '''
    Determine the optimal number of clusters by using Silhouette Score Analysis.
    :param df_in: the input dataframe
    :param index_col: the name of the index column
    :param k_min: the minimum number of the clusters
    :param k_max: the maximum number of the clusters
    :param num_runs: the number of runs for each fixed number of clusters

    :return k: optimal number of the clusters
    :return silh_lst: Silhouette score
    :return r_table: the running results table

    :author: Wenqiang Feng
    :email: von198@gmail.com
    '''

    start = time.time()
    silh_lst = []
    k_lst = np.arange(k_min, k_max+1)

    r_table = df_in.select(index_col).toPandas()
    r_table = r_table.set_index(index_col)
    centers = pd.DataFrame()

    for k in k_lst:
        silh_val = []
        for run in np.arange(1, num_runs+1):

            # Trains a k-means model.
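            # Each run below draws a fresh random seed, so the k-means
            # initialization -- and hence the silhouette score -- can vary
            # from run to run; the scores are averaged over `num_runs`
            # runs further down to smooth out this variability.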
357 | kmeans = KMeans()\ 358 | .setK(k)\ 359 | .setSeed(int(np.random.randint(100, size=1))) 360 | model = kmeans.fit(df_in) 361 | 362 | # Make predictions 363 | predictions = model.transform(df_in) 364 | r_table['cluster_{k}_{run}'.format(k=k, run=run)]= predictions.select('prediction').toPandas() 365 | 366 | # Evaluate clustering by computing Silhouette score 367 | evaluator = ClusteringEvaluator() 368 | silhouette = evaluator.evaluate(predictions) 369 | silh_val.append(silhouette) 370 | 371 | silh_array=np.asanyarray(silh_val) 372 | silh_lst.append(silh_array.mean()) 373 | 374 | elapsed = time.time() - start 375 | 376 | silhouette = pd.DataFrame(list(zip(k_lst,silh_lst)),columns = ['k', 'silhouette']) 377 | 378 | print('+------------------------------------------------------------+') 379 | print("| The finding optimal k phase took %8.0f s. |" %(elapsed)) 380 | print('+------------------------------------------------------------+') 381 | 382 | return k_lst[np.argmax(silh_lst, axis=0)], silhouette , r_table 383 | 384 | ``` 385 | 386 | ``` 387 | k, silh_lst, r_table = optimal_k(scaledData,index_col,k_min, k_max,num_runs) 388 | 389 | +------------------------------------------------------------+ 390 | | The finding optimal k phase took 1783 s. | 391 | +------------------------------------------------------------+ 392 | 393 | ``` 394 | 395 | ``` 396 | spark.createDataFrame(silh_lst).show() 397 | 398 | +---+------------------+ 399 | | k| silhouette| 400 | +---+------------------+ 401 | | 3|0.8045154385557953| 402 | | 4|0.6993528775512052| 403 | | 5|0.6689286654221447| 404 | | 6|0.6356184024841809| 405 | | 7|0.7174102265711756| 406 | | 8|0.6720861758298997| 407 | | 9| 0.601771359881241| 408 | | 10|0.6292447334578428| 409 | +---+------------------+ 410 | 411 | ``` 412 | 413 | From the silhouette list, we can choose `3` as the optimal number of the clusters. 414 | 415 | Warning 416 | 417 | `ClusteringEvaluator` in `pyspark.ml.evaluation` requires Spark 2.4 or later!! 418 | 419 | 1. Pipeline Architecture 420 | 421 | ``` 422 | from pyspark.ml.clustering import KMeans, KMeansModel 423 | 424 | kmeans = KMeans() \ 425 | .setK(3) \ 426 | .setFeaturesCol("indexedFeatures")\ 427 | .setPredictionCol("cluster") 428 | 429 | # Chain indexer and tree in a Pipeline 430 | pipeline = Pipeline(stages=[featureIndexer, kmeans]) 431 | 432 | model = pipeline.fit(transformed) 433 | 434 | cluster = model.transform(transformed) 435 | 436 | ``` 437 | 438 | 1. 
k-means clusters 439 | 440 | ``` 441 | cluster = model.transform(transformed) 442 | 443 | ``` 444 | 445 | ``` 446 | +-----------------+-----------------+-------+ 447 | | features| indexedFeatures|cluster| 448 | +-----------------+-----------------+-------+ 449 | |[5.1,3.5,1.4,0.2]|[5.1,3.5,1.4,0.2]| 1| 450 | |[4.9,3.0,1.4,0.2]|[4.9,3.0,1.4,0.2]| 1| 451 | |[4.7,3.2,1.3,0.2]|[4.7,3.2,1.3,0.2]| 1| 452 | |[4.6,3.1,1.5,0.2]|[4.6,3.1,1.5,0.2]| 1| 453 | |[5.0,3.6,1.4,0.2]|[5.0,3.6,1.4,0.2]| 1| 454 | |[5.4,3.9,1.7,0.4]|[5.4,3.9,1.7,0.4]| 1| 455 | |[4.6,3.4,1.4,0.3]|[4.6,3.4,1.4,0.3]| 1| 456 | |[5.0,3.4,1.5,0.2]|[5.0,3.4,1.5,0.2]| 1| 457 | |[4.4,2.9,1.4,0.2]|[4.4,2.9,1.4,0.2]| 1| 458 | |[4.9,3.1,1.5,0.1]|[4.9,3.1,1.5,0.1]| 1| 459 | |[5.4,3.7,1.5,0.2]|[5.4,3.7,1.5,0.2]| 1| 460 | |[4.8,3.4,1.6,0.2]|[4.8,3.4,1.6,0.2]| 1| 461 | |[4.8,3.0,1.4,0.1]|[4.8,3.0,1.4,0.1]| 1| 462 | |[4.3,3.0,1.1,0.1]|[4.3,3.0,1.1,0.1]| 1| 463 | |[5.8,4.0,1.2,0.2]|[5.8,4.0,1.2,0.2]| 1| 464 | |[5.7,4.4,1.5,0.4]|[5.7,4.4,1.5,0.4]| 1| 465 | |[5.4,3.9,1.3,0.4]|[5.4,3.9,1.3,0.4]| 1| 466 | |[5.1,3.5,1.4,0.3]|[5.1,3.5,1.4,0.3]| 1| 467 | |[5.7,3.8,1.7,0.3]|[5.7,3.8,1.7,0.3]| 1| 468 | |[5.1,3.8,1.5,0.3]|[5.1,3.8,1.5,0.3]| 1| 469 | +-----------------+-----------------+-------+ 470 | only showing top 20 rows 471 | 472 | ``` -------------------------------------------------------------------------------- /docs/14.md: -------------------------------------------------------------------------------- 1 | # 14\. Social Network Analysis 2 | 3 | Chinese proverb 4 | 5 | **A Touch of Cloth,linked in countless ways.** – old Chinese proverb 6 | 7 | ![https://runawayhorse001.github.io/LearningApacheSpark/_images/net_work.png](img/cb0b50e4410efd78416163f37eaf1262.jpg) 8 | 9 | ## 14.1\. Introduction 10 | 11 | 12 | 13 | ## 14.2\. Co-occurrence Network 14 | 15 | [Co-occurrence networks](https://en.wikipedia.org/wiki/Co-occurrence_networks) are generally used to provide a graphic visualization of potential relationships between people, organizations, concepts or other entities represented within written material. The generation and visualization of co-occurrence networks has become practical with the advent of electronically stored text amenable to text mining. 16 | 17 | ### 14.2.1\. Methodology 18 | 19 | * Build Corpus C 20 | * Build Document-Term matrix D based on Corpus C 21 | * Compute Term-Document matrix ![D^T](img/315be0f70cd0effa6c8682f2a949a46c.jpg) 22 | * Adjacency Matrix ![A =D^T\cdot D](img/e97f8315ce721d1417bc7bb3b4a9d332.jpg) 23 | 24 | There are four main components in this algorithm in the algorithm: Corpus C, Document-Term matrix D, Term-Document matrix ![D^T](img/315be0f70cd0effa6c8682f2a949a46c.jpg) and Adjacency Matrix A. In this demo part, I will show how to build those four main components. 25 | 26 | Given that we have three groups of friends, they are 27 | 28 | > ``` 29 | > +-------------------------------------+ 30 | > |words | 31 | > +-------------------------------------+ 32 | > |[[george] [jimmy] [john] [peter]] | 33 | > |[[vincent] [george] [stefan] [james]]| 34 | > |[[emma] [james] [olivia] [george]] | 35 | > +-------------------------------------+ 36 | > 37 | > ``` 38 | 39 | 1. 
Corpus C 40 | 41 | Then we can build the following corpus based on the unique elements in the given group data: 42 | 43 | > ``` 44 | > [u'george', u'james', u'jimmy', u'peter', u'stefan', u'vincent', u'olivia', u'john', u'emma'] 45 | > 46 | > ``` 47 | 48 | The corresponding elements frequency: 49 | 50 | > ![https://runawayhorse001.github.io/LearningApacheSpark/_images/demo_freq.png](img/cdcdbf84e640274f429780824ccf99ae.jpg) 51 | 52 | 1. Document-Term matrix D based on Corpus C (CountVectorizer) 53 | 54 | > ``` 55 | > from pyspark.ml.feature import CountVectorizer 56 | > count_vectorizer_wo = CountVectorizer(inputCol='term', outputCol='features') 57 | > # with total unique vocabulary 58 | > countVectorizer_mod_wo = count_vectorizer_wo.fit(df) 59 | > countVectorizer_twitter_wo = countVectorizer_mod_wo.transform(df) 60 | > # with truncated unique vocabulary (99%) 61 | > count_vectorizer = CountVectorizer(vocabSize=48,inputCol='term',outputCol='features') 62 | > countVectorizer_mod = count_vectorizer.fit(df) 63 | > countVectorizer_twitter = countVectorizer_mod.transform(df) 64 | > 65 | > ``` 66 | > 67 | > ``` 68 | > +-------------------------------+ 69 | > |features | 70 | > +-------------------------------+ 71 | > |(9,[0,2,3,7],[1.0,1.0,1.0,1.0])| 72 | > |(9,[0,1,4,5],[1.0,1.0,1.0,1.0])| 73 | > |(9,[0,1,6,8],[1.0,1.0,1.0,1.0])| 74 | > +-------------------------------+ 75 | > 76 | > ``` 77 | 78 | * Term-Document matrix ![D^T](img/315be0f70cd0effa6c8682f2a949a46c.jpg) 79 | 80 | > RDD: 81 | > 82 | > ``` 83 | > [array([ 1., 1., 1.]), array([ 0., 1., 1.]), array([ 1., 0., 0.]), 84 | > array([ 1., 0., 0.]), array([ 0., 1., 0.]), array([ 0., 1., 0.]), 85 | > array([ 0., 0., 1.]), array([ 1., 0., 0.]), array([ 0., 0., 1.])] 86 | > 87 | > ``` 88 | > 89 | > Matrix: 90 | > 91 | > ``` 92 | > array([[ 1., 1., 1.], 93 | > [ 0., 1., 1.], 94 | > [ 1., 0., 0.], 95 | > [ 1., 0., 0.], 96 | > [ 0., 1., 0.], 97 | > [ 0., 1., 0.], 98 | > [ 0., 0., 1.], 99 | > [ 1., 0., 0.], 100 | > [ 0., 0., 1.]]) 101 | > 102 | > ``` 103 | 104 | 1. Adjacency Matrix ![A =D^T\cdot D](img/e97f8315ce721d1417bc7bb3b4a9d332.jpg) 105 | 106 | > RDD: 107 | > 108 | > ``` 109 | > [array([ 1., 1., 1.]), array([ 0., 1., 1.]), array([ 1., 0., 0.]), 110 | > array([ 1., 0., 0.]), array([ 0., 1., 0.]), array([ 0., 1., 0.]), 111 | > array([ 0., 0., 1.]), array([ 1., 0., 0.]), array([ 0., 0., 1.])] 112 | > 113 | > ``` 114 | > 115 | > Matrix: 116 | > 117 | > ``` 118 | > array([[ 3., 2., 1., 1., 1., 1., 1., 1., 1.], 119 | > [ 2., 2., 0., 0., 1., 1., 1., 0., 1.], 120 | > [ 1., 0., 1., 1., 0., 0., 0., 1., 0.], 121 | > [ 1., 0., 1., 1., 0., 0., 0., 1., 0.], 122 | > [ 1., 1., 0., 0., 1., 1., 0., 0., 0.], 123 | > [ 1., 1., 0., 0., 1., 1., 0., 0., 0.], 124 | > [ 1., 1., 0., 0., 0., 0., 1., 0., 1.], 125 | > [ 1., 0., 1., 1., 0., 0., 0., 1., 0.], 126 | > [ 1., 1., 0., 0., 0., 0., 1., 0., 1.]]) 127 | > 128 | > ``` 129 | 130 | ### 14.2.2\. Coding Puzzle from my interview 131 | 132 | * Problem 133 | 134 | The attached utf-8 encoded text file contains the tags associated with an online biomedical scientific article formatted as follows (size: 100000). Each Scientific article is represented by a line in the file delimited by carriage return. 
135 | 136 | > ``` 137 | > +--------------------+ 138 | > | words| 139 | > +--------------------+ 140 | > |[ACTH Syndrome, E...| 141 | > |[Antibody Formati...| 142 | > |[Adaptation, Phys...| 143 | > |[Aerosol Propella...| 144 | > +--------------------+ 145 | > only showing top 4 rows 146 | > 147 | > ``` 148 | 149 | Write a program that, using this file as input, produces a list of pairs of tags which appear TOGETHER in any order and position in at least fifty different Scientific articles. For example, in the above sample, [Female] and [Humans] appear together twice, but every other pair appears only once. Your program should output the pair list to stdout in the same form as the input (eg tag 1, tag 2n). 150 | 151 | * My solution 152 | 153 | > The corresponding words frequency: 154 | > 155 | > > ![https://runawayhorse001.github.io/LearningApacheSpark/_images/freq_word_ze.png](img/f5832d90e75d18f501ede7acb0b6ce74.jpg) 156 | > > 157 | > > Word frequency 158 | > 159 | > Output: 160 | > 161 | > ``` 162 | > +----------+------+-------+ 163 | > | term.x|term.y| freq| 164 | > +----------+------+-------+ 165 | > | Female|Humans|16741.0| 166 | > | Male|Humans|13883.0| 167 | > | Adult|Humans|10391.0| 168 | > | Male|Female| 9806.0| 169 | > |MiddleAged|Humans| 8181.0| 170 | > | Adult|Female| 7411.0| 171 | > | Adult| Male| 7240.0| 172 | > |MiddleAged| Male| 6328.0| 173 | > |MiddleAged|Female| 6002.0| 174 | > |MiddleAged| Adult| 5944.0| 175 | > +----------+------+-------+ 176 | > only showing top 10 rows 177 | > 178 | > ``` 179 | 180 | The corresponding Co-occurrence network: 181 | 182 | > ![https://runawayhorse001.github.io/LearningApacheSpark/_images/netfreq.png](img/8c3fdcf6adcc472c7cd7a4598f96caac.jpg) 183 | > 184 | > Co-occurrence network 185 | 186 | Then you will get Figure [Co-occurrence network](#fig-netfreq) 187 | 188 | ## 14.3\. Appendix: matrix multiplication in PySpark 189 | 190 | 1. load test matrix 191 | 192 | ``` 193 | df = spark.read.csv("matrix1.txt",sep=",",inferSchema=True) 194 | df.show() 195 | 196 | ``` 197 | 198 | ``` 199 | +---+---+---+---+ 200 | |_c0|_c1|_c2|_c3| 201 | +---+---+---+---+ 202 | |1.2|3.4|2.3|1.1| 203 | |2.3|1.1|1.5|2.2| 204 | |3.3|1.8|4.5|3.3| 205 | |5.3|2.2|4.5|4.4| 206 | |9.3|8.1|0.3|5.5| 207 | |4.5|4.3|2.1|6.6| 208 | +---+---+---+---+ 209 | 210 | ``` 211 | 212 | 1. 
main function for matrix multiplication in PySpark 213 | 214 | ``` 215 | from pyspark.sql import functions as F 216 | from functools import reduce 217 | # reference: https://stackoverflow.com/questions/44348527/matrix-multiplication-at-a-in-pyspark 218 | # do the sum of the multiplication that we want, and get 219 | # one data frame for each column 220 | colDFs = [] 221 | for c2 in df.columns: 222 | colDFs.append( df.select( [ F.sum(df[c1]*df[c2]).alias("op_{0}".format(i)) for i,c1 in enumerate(df.columns) ] ) ) 223 | # now union those separate data frames to build the "matrix" 224 | mtxDF = reduce(lambda a,b: a.select(a.columns).union(b.select(a.columns)), colDFs ) 225 | mtxDF.show() 226 | 227 | ``` 228 | 229 | ``` 230 | +------------------+------------------+------------------+------------------+ 231 | | op_0| op_1| op_2| op_3| 232 | +------------------+------------------+------------------+------------------+ 233 | | 152.45|118.88999999999999| 57.15|121.44000000000001| 234 | |118.88999999999999|104.94999999999999| 38.93| 94.71| 235 | | 57.15| 38.93|52.540000000000006| 55.99| 236 | |121.44000000000001| 94.71| 55.99|110.10999999999999| 237 | +------------------+------------------+------------------+------------------+ 238 | 239 | ``` 240 | 241 | 1. Validation with python version 242 | 243 | ``` 244 | import numpy as np 245 | a = np.genfromtxt("matrix1.txt",delimiter=",") 246 | np.dot(a.T, a) 247 | 248 | ``` 249 | 250 | ``` 251 | array([[152.45, 118.89, 57.15, 121.44], 252 | [118.89, 104.95, 38.93, 94.71], 253 | [ 57.15, 38.93, 52.54, 55.99], 254 | [121.44, 94.71, 55.99, 110.11]]) 255 | 256 | ``` 257 | 258 | ## 14.4\. Correlation Network 259 | 260 | TODO .. -------------------------------------------------------------------------------- /docs/15.md: -------------------------------------------------------------------------------- 1 | # 15\. ALS: Stock Portfolio Recommendations 2 | 3 | Chinese proverb 4 | 5 | **Don’t put all your eggs in one basket.** 6 | 7 | ![https://runawayhorse001.github.io/LearningApacheSpark/_images/stock_portfolio.png](img/b4a297ef2185e28694b366bde4069858.jpg) 8 | 9 | Code for the above figure: 10 | 11 | ``` 12 | import numpy as np 13 | import matplotlib.pyplot as plt 14 | 15 | fig, ax = plt.subplots(figsize=(10, 8), subplot_kw=dict(aspect="equal")) 16 | 17 | recipe = ["375 k U.S. Large Cap Blend", 18 | "300 k U.S. Large Cap Value", 19 | "75 k U.S. Short-Term Bonds", 20 | "50 k U.S. Small Cap Blend", 21 | "55 k U.S. Small Cap Value", 22 | "95 k U.S. Real Estate", 23 | "250 k Intermediate-Term Bonds"] 24 | 25 | data = [float(x.split()[0]) for x in recipe] 26 | ingredients = [' '.join(x.split()[2:]) for x in recipe] 27 | 28 | print(data) 29 | print(ingredients) 30 | def func(pct, allvals): 31 | absolute = int(pct/100.*np.sum(allvals)) 32 | return "{:.1f}%\n({:d} k)".format(pct, absolute) 33 | 34 | explode = np.empty(len(data))#(0.1, 0.1, 0.1, 0.1, 0.1, 0.1) # explode 1st slice 35 | explode.fill(0.1) 36 | 37 | wedges, texts, autotexts = ax.pie(data, explode=explode, autopct=lambda pct: func(pct, data), 38 | textprops=dict(color="w")) 39 | ax.legend(wedges, ingredients, 40 | #title="Stock portfolio", 41 | loc="center left", 42 | bbox_to_anchor=(1, 0, 0.5, 1)) 43 | 44 | plt.setp(autotexts, size=8, weight="bold") 45 | 46 | #ax.set_title("Stock portfolio") 47 | 48 | plt.show() 49 | 50 | ``` 51 | 52 | ## 15.1\. 
Recommender systems 53 | 54 | Recommender systems or recommendation systems (sometimes replacing “system” with a synonym such as platform or engine) are a subclass of information filtering systems that seek to predict the “rating” or “preference” that a user would give to an item. 55 | 56 | The main idea is to build a users-by-items matrix `R` of rating values and try to factorize it, in order to recommend to each user the items that similar users have rated highly. A popular approach to this matrix factorization is Alternating Least Squares (ALS). 57 | 58 | ## 15.2\. Alternating Least Squares 59 | 60 | Apache Spark ML implements ALS for collaborative filtering, a very popular algorithm for making recommendations. 61 | 62 | The ALS recommender is a matrix factorization algorithm that uses Alternating Least Squares with Weighted-Lambda-Regularization (ALS-WR). It factors the user-to-item matrix `A` into the user-to-feature matrix `U` and the item-to-feature matrix `M`, and it runs the ALS algorithm in a parallel fashion. The ALS algorithm uncovers the latent factors that explain the observed user-to-item ratings and tries to find the factor weights that minimize the least-squares error between predicted and actual ratings. 63 | 64 | [https://www.elenacuoco.com/2016/12/22/alternating-least-squares-als-spark-ml/](https://www.elenacuoco.com/2016/12/22/alternating-least-squares-als-spark-ml/) 65 | 66 | ## 15.3\. Demo 67 | 68 | * The Jupyter notebook can be downloaded from [ALS Recommender systems](_static/ALS.ipynb). 69 | * The data can be downloaded from [Online Retail](_static/OnlineRetail.csv). 70 | 71 | ### 15.3.1\. Load and clean data 72 | 73 | 1. Set up spark context and SparkSession 74 | 75 | ``` 76 | from pyspark.sql import SparkSession 77 | 78 | spark = SparkSession \ 79 | .builder \ 80 | .appName("Python Spark RFM example") \ 81 | .config("spark.some.config.option", "some-value") \ 82 | .getOrCreate() 83 | 84 | ``` 85 | 86 | 1. 
Load dataset 87 | 88 | ``` 89 | df_raw = spark.read.format('com.databricks.spark.csv').\ 90 | options(header='true', \ 91 | inferschema='true').\ 92 | load("Online Retail.csv",header=True); 93 | 94 | ``` 95 | 96 | check the data set 97 | 98 | ``` 99 | df_raw.show(5) 100 | df_raw.printSchema() 101 | 102 | ``` 103 | 104 | Then you will get 105 | 106 | ``` 107 | +---------+---------+--------------------+--------+------------+---------+----------+--------------+ 108 | |InvoiceNo|StockCode| Description|Quantity| InvoiceDate|UnitPrice|CustomerID| Country| 109 | +---------+---------+--------------------+--------+------------+---------+----------+--------------+ 110 | | 536365| 85123A|WHITE HANGING HEA...| 6|12/1/10 8:26| 2.55| 17850|United Kingdom| 111 | | 536365| 71053| WHITE METAL LANTERN| 6|12/1/10 8:26| 3.39| 17850|United Kingdom| 112 | | 536365| 84406B|CREAM CUPID HEART...| 8|12/1/10 8:26| 2.75| 17850|United Kingdom| 113 | | 536365| 84029G|KNITTED UNION FLA...| 6|12/1/10 8:26| 3.39| 17850|United Kingdom| 114 | | 536365| 84029E|RED WOOLLY HOTTIE...| 6|12/1/10 8:26| 3.39| 17850|United Kingdom| 115 | +---------+---------+--------------------+--------+------------+---------+----------+--------------+ 116 | only showing top 5 rows 117 | 118 | root 119 | |-- InvoiceNo: string (nullable = true) 120 | |-- StockCode: string (nullable = true) 121 | |-- Description: string (nullable = true) 122 | |-- Quantity: integer (nullable = true) 123 | |-- InvoiceDate: string (nullable = true) 124 | |-- UnitPrice: double (nullable = true) 125 | |-- CustomerID: integer (nullable = true) 126 | |-- Country: string (nullable = true) 127 | 128 | ``` 129 | 130 | 1. Data clean and data manipulation 131 | 132 | * check and remove the `null` values 133 | 134 | ``` 135 | from pyspark.sql.functions import count 136 | 137 | def my_count(df_in): 138 | df_in.agg( *[ count(c).alias(c) for c in df_in.columns ] ).show() 139 | 140 | ``` 141 | 142 | ``` 143 | import pyspark.sql.functions as F 144 | from pyspark.sql.functions import round 145 | df_raw = df_raw.withColumn('Asset',round( F.col('Quantity') * F.col('UnitPrice'), 2 )) 146 | df = df_raw.withColumnRenamed('StockCode', 'Cusip')\ 147 | .select('CustomerID','Cusip','Quantity','UnitPrice','Asset') 148 | 149 | ``` 150 | 151 | ``` 152 | my_count(df) 153 | 154 | ``` 155 | 156 | ``` 157 | +----------+------+--------+---------+------+ 158 | |CustomerID| Cusip|Quantity|UnitPrice| Asset| 159 | +----------+------+--------+---------+------+ 160 | | 406829|541909| 541909| 541909|541909| 161 | +----------+------+--------+---------+------+ 162 | 163 | ``` 164 | 165 | Since the count results are not the same, we have some null value in the `CustomerID` column. We can drop these records from the dataset. 
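As a side note, the same check can be done more directly by counting the missing entries in each column. The short helper below is a minimal sketch using only standard `pyspark.sql.functions`; it is not part of the original notebook, and the actual cleaning continues right after it.

```
import pyspark.sql.functions as F

def count_nulls(df_in):
    # complementary view of my_count: count(c) skips nulls, so here we
    # count the rows where each column IS null
    df_in.select([F.sum(F.col(c).isNull().cast('int')).alias(c)
                  for c in df_in.columns]).show()

count_nulls(df)   # CustomerID should show 135080 (= 541909 - 406829) missing values
```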
166 | 167 | ``` 168 | df = df.filter(F.col('Asset')>=0) 169 | df = df.dropna(how='any') 170 | my_count(df) 171 | 172 | ``` 173 | 174 | ``` 175 | +----------+------+--------+---------+------+ 176 | |CustomerID| Cusip|Quantity|UnitPrice| Asset| 177 | +----------+------+--------+---------+------+ 178 | |    397924|397924|  397924|   397924|397924| 179 | +----------+------+--------+---------+------+ 180 | 181 | ``` 182 | 183 | ``` 184 | df.show(3) 185 | 186 | +----------+------+--------+---------+-----+ 187 | |CustomerID| Cusip|Quantity|UnitPrice|Asset| 188 | +----------+------+--------+---------+-----+ 189 | |     17850|85123A|       6|     2.55| 15.3| 190 | |     17850| 71053|       6|     3.39|20.34| 191 | |     17850|84406B|       8|     2.75| 22.0| 192 | +----------+------+--------+---------+-----+ 193 | only showing top 3 rows 194 | 195 | ``` 196 | 197 | * Convert the `Cusip` codes to a consistent format 198 | 199 | ``` 200 | from pyspark.sql.functions import udf 201 | from pyspark.sql.types import StringType, DoubleType 202 | 203 | def toUpper(s): 204 | return s.upper() 205 | 206 | upper_udf = udf(lambda x: toUpper(x), StringType()) 207 | 208 | ``` 209 | 210 | * Find the top `n` stocks 211 | 212 | ``` 213 | pop = df.groupBy('Cusip')\ 214 | .agg(F.count('CustomerID').alias('Customers'),F.round(F.sum('Asset'),2).alias('TotalAsset'))\ 215 | .sort([F.col('Customers'),F.col('TotalAsset')],ascending=[0,0]) 216 | 217 | pop.show(5) 218 | 219 | ``` 220 | 221 | ``` 222 | +------+---------+----------+ 223 | | Cusip|Customers|TotalAsset| 224 | +------+---------+----------+ 225 | |85123A|     2035|  100603.5| 226 | | 22423|     1724| 142592.95| 227 | |85099B|     1618|  85220.78| 228 | | 84879|     1408|  56580.34| 229 | | 47566|     1397|  68844.33| 230 | +------+---------+----------+ 231 | only showing top 5 rows 232 | 233 | ``` 234 | 235 | ### 15.3.2\. 
Build feature matrix 236 | 237 | * Fetch the top `n` Cusip list (`pd` below is pandas, i.e. `import pandas as pd`) 238 | 239 | ``` 240 | top = 10 241 | cusip_lst = pd.DataFrame(pop.select('Cusip').head(top)).astype('str').iloc[:, 0].tolist() 242 | cusip_lst.insert(0,'CustomerID') 243 | 244 | ``` 245 | 246 | * Create the portfolio table for each customer 247 | 248 | ``` 249 | pivot_tab = df.groupBy('CustomerID').pivot('Cusip').sum('Asset') 250 | pivot_tab = pivot_tab.fillna(0) 251 | 252 | ``` 253 | 254 | * Fetch the top `n` stocks’ portfolio table for each customer 255 | 256 | ``` 257 | selected_tab = pivot_tab.select(cusip_lst) 258 | selected_tab.show(4) 259 | 260 | ``` 261 | 262 | ``` 263 | +----------+------+-----+------+-----+-----+-----+-----+-----+----+-----+ 264 | |CustomerID|85123A|22423|85099B|84879|47566|20725|22720|20727|POST|23203| 265 | +----------+------+-----+------+-----+-----+-----+-----+-----+----+-----+ 266 | |     16503|   0.0|  0.0|   0.0|  0.0|  0.0|  0.0|  0.0| 33.0| 0.0|  0.0| 267 | |     15727| 123.9| 25.5|   0.0|  0.0|  0.0| 33.0| 99.0|  0.0| 0.0|  0.0| 268 | |     14570|   0.0|  0.0|   0.0|  0.0|  0.0|  0.0|  0.0|  0.0| 0.0|  0.0| 269 | |     14450|   0.0|  0.0|  8.32|  0.0|  0.0|  0.0| 49.5|  0.0| 0.0|  0.0| 270 | +----------+------+-----+------+-----+-----+-----+-----+-----+----+-----+ 271 | only showing top 4 rows 272 | 273 | ``` 274 | 275 | * Build the `rating` matrix 276 | 277 | ``` 278 | def elemwiseDiv(df_in): # divide each asset value by the customer's row total, giving row-normalized shares 279 | num = len(df_in.columns) 280 | temp = df_in.rdd.map(lambda x: [x[0]] + [x[i]/float(sum(x[1:])) 281 | if sum(x[1:])>0 else x[i] 282 | for i in range(1,num)]) 283 | return spark.createDataFrame(temp,df_in.columns) 284 | 285 | ratings = elemwiseDiv(selected_tab) 286 | 287 | ``` 288 | 289 | ``` 290 | ratings.show(4) 291 | 292 | +----------+------+-----+------+-----+-----+-----+-----+-----+----+-----+ 293 | |CustomerID|85123A|22423|85099B|84879|47566|20725|22720|20727|POST|23203| 294 | +----------+------+-----+------+-----+-----+-----+-----+-----+----+-----+ 295 | |     16503|   0.0|  0.0|   0.0|  0.0|  0.0|  0.0|  0.0|  1.0| 0.0|  0.0| 296 | |     15727|  0.44| 0.09|   0.0|  0.0|  0.0| 0.12| 0.35|  0.0| 0.0|  0.0| 297 | |     14570|   0.0|  0.0|   0.0|  0.0|  0.0|  0.0|  0.0|  0.0| 0.0|  0.0| 298 | |     14450|   0.0|  0.0|  0.14|  0.0|  0.0|  0.0| 0.86|  0.0| 0.0|  0.0| 299 | +----------+------+-----+------+-----+-----+-----+-----+-----+----+-----+ 300 | 301 | ``` 302 | 303 | * Convert the `rating` matrix to a long table 304 | 305 | ``` 306 | from pyspark.sql.functions import array, col, explode, struct, lit 307 | 308 | def to_long(df, by): 309 | """ 310 | reference: https://stackoverflow.com/questions/37864222/transpose-column-to-row-with-spark 311 | """ 312 | 313 | # Filter dtypes and split into column names and type description 314 | cols, dtypes = zip(*((c, t) for (c, t) in df.dtypes if c not in by)) 315 | # Spark SQL supports only homogeneous columns 316 | assert len(set(dtypes)) == 1, "All columns have to be of the same type" 317 | 318 | # Create and explode an array of (column_name, column_value) structs 319 | kvs = explode(array([ 320 | struct(lit(c).alias("Cusip"), col(c).alias("rating")) for c in cols 321 | ])).alias("kvs") 322 | return df.select(by + [kvs]).select(by + ["kvs.Cusip", "kvs.rating"]) 323 | ``` 324 | 325 | ``` 326 | df_all = to_long(ratings,['CustomerID']) 327 | df_all.show(5) 328 | 329 | ``` 330 | 331 | ``` 332 | +----------+------+------+ 333 | |CustomerID| Cusip|rating| 334 | +----------+------+------+ 335 | |     16503|85123A|   0.0| 336 | |     16503| 22423|   0.0| 337 | |     16503|85099B|   0.0| 338 | |     16503| 84879|   0.0| 339 | |     16503| 47566|   0.0| 340 | +----------+------+------+ 341 | only showing top 5 rows 342 | 343 | ``` 344 | 345 | * Convert 
the string `Cusip` to numerical index 346 | 347 | ``` 348 | from pyspark.ml.feature import StringIndexer 349 | # Index labels, adding metadata to the label column 350 | labelIndexer = StringIndexer(inputCol='Cusip', 351 | outputCol='indexedCusip').fit(df_all) 352 | df_all = labelIndexer.transform(df_all) 353 | 354 | df_all.show(5, True) 355 | df_all.printSchema() 356 | 357 | ``` 358 | 359 | ``` 360 | +----------+------+------+------------+ 361 | |CustomerID| Cusip|rating|indexedCusip| 362 | +----------+------+------+------------+ 363 | | 16503|85123A| 0.0| 6.0| 364 | | 16503| 22423| 0.0| 9.0| 365 | | 16503|85099B| 0.0| 5.0| 366 | | 16503| 84879| 0.0| 1.0| 367 | | 16503| 47566| 0.0| 0.0| 368 | +----------+------+------+------------+ 369 | only showing top 5 rows 370 | 371 | root 372 | |-- CustomerID: long (nullable = true) 373 | |-- Cusip: string (nullable = false) 374 | |-- rating: double (nullable = true) 375 | |-- indexedCusip: double (nullable = true) 376 | 377 | ``` 378 | 379 | ### 15.3.3\. Train model 380 | 381 | * build `train` and `test` dataset 382 | 383 | ``` 384 | train, test = df_all.randomSplit([0.8,0.2]) 385 | 386 | train.show(5) 387 | test.show(5) 388 | 389 | ``` 390 | 391 | ``` 392 | +----------+-----+------------+-------------------+ 393 | |CustomerID|Cusip|indexedCusip| rating| 394 | +----------+-----+------------+-------------------+ 395 | | 12940|20725| 2.0| 0.0| 396 | | 12940|20727| 4.0| 0.0| 397 | | 12940|22423| 9.0|0.49990198000392083| 398 | | 12940|22720| 3.0| 0.0| 399 | | 12940|23203| 7.0| 0.0| 400 | +----------+-----+------------+-------------------+ 401 | only showing top 5 rows 402 | 403 | +----------+-----+------------+------------------+ 404 | |CustomerID|Cusip|indexedCusip| rating| 405 | +----------+-----+------------+------------------+ 406 | | 12940|84879| 1.0|0.1325230346990786| 407 | | 13285|20725| 2.0|0.2054154995331466| 408 | | 13285|20727| 4.0|0.2054154995331466| 409 | | 13285|47566| 0.0| 0.0| 410 | | 13623|23203| 7.0| 0.0| 411 | +----------+-----+------------+------------------+ 412 | only showing top 5 rows 413 | 414 | ``` 415 | 416 | * train model 417 | 418 | ``` 419 | import itertools 420 | from math import sqrt 421 | from operator import add 422 | import sys 423 | from pyspark.ml.recommendation import ALS 424 | 425 | from pyspark.ml.evaluation import RegressionEvaluator 426 | 427 | evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", 428 | predictionCol="prediction") 429 | def computeRmse(model, data): 430 | """ 431 | Compute RMSE (Root mean Squared Error). 
432 | """ 433 | predictions = model.transform(data) 434 | rmse = evaluator.evaluate(predictions) 435 | print("Root-mean-square error = " + str(rmse)) 436 | return rmse 437 | 438 | #train models and evaluate them on the validation set 439 | 440 | ranks = [4,5] 441 | lambdas = [0.05] 442 | numIters = [30] 443 | bestModel = None 444 | bestValidationRmse = float("inf") 445 | bestRank = 0 446 | bestLambda = -1.0 447 | bestNumIter = -1 448 | 449 | val = test.na.drop() 450 | for rank, lmbda, numIter in itertools.product(ranks, lambdas, numIters): 451 | als = ALS(rank=rank, maxIter=numIter, regParam=lmbda, numUserBlocks=10, numItemBlocks=10, implicitPrefs=False, 452 | alpha=1.0, 453 | userCol="CustomerID", itemCol="indexedCusip", seed=1, ratingCol="rating", nonnegative=True) 454 | model=als.fit(train) 455 | 456 | validationRmse = computeRmse(model, val) 457 | print("RMSE (validation) = %f for the model trained with " % validationRmse + \ 458 | "rank = %d, lambda = %.2f, and numIter = %d." % (rank, lmbda, numIter)) 459 | if validationRmse < bestValidationRmse: 460 | bestModel = model 461 | bestValidationRmse = validationRmse 462 | bestRank = rank 463 | bestLambda = lmbda 464 | bestNumIter = numIter 465 | 466 | model = bestModel 467 | 468 | ``` 469 | 470 | ### 15.3.4\. Make prediction 471 | 472 | * make prediction 473 | 474 | ``` 475 | topredict=test[test['rating']==0] 476 | 477 | predictions=model.transform(topredict) 478 | predictions.filter(predictions.prediction>0)\ 479 | .sort([F.col('CustomerID'),F.col('Cusip')],ascending=[0,0]).show(5) 480 | 481 | ``` 482 | 483 | ``` 484 | +----------+------+------------+------+------------+ 485 | |CustomerID| Cusip|indexedCusip|rating|  prediction| 486 | +----------+------+------------+------+------------+ 487 | |     18283| 47566|         0.0|   0.0|  0.01625076| 488 | |     18282|85123A|         6.0|   0.0| 0.057172246| 489 | |     18282| 84879|         1.0|   0.0| 0.059531752| 490 | |     18282| 23203|         7.0|   0.0| 0.010502596| 491 | |     18282| 22720|         3.0|   0.0| 0.053893942| 492 | +----------+------+------------+------+------------+ 493 | only showing top 5 rows 494 | 495 | ``` -------------------------------------------------------------------------------- /docs/16.md: -------------------------------------------------------------------------------- 1 | # 16\. Monte Carlo Simulation 2 | 3 | ![https://runawayhorse001.github.io/LearningApacheSpark/_images/baba_sim_price.png](img/68b23d0b045f72286a7dd26502a39dd0.jpg) 4 | 5 | Monte Carlo simulations are just a way of estimating a fixed parameter by repeatedly generating random numbers. More details can be found at [A Zero Math Introduction to Markov Chain Monte Carlo Methods](https://towardsdatascience.com/a-zero-math-introduction-to-markov-chain-monte-carlo-methods-dcba889e0c50). 6 | 7 | Monte Carlo simulation is a technique used to understand the impact of risk and uncertainty in financial, project management, cost, and other forecasting models. A Monte Carlo simulator helps one visualize most or all of the potential outcomes to have a better idea regarding the risk of a decision. More details can be found at [The house always wins](https://towardsdatascience.com/the-house-always-wins-monte-carlo-simulation-eb82787da2a3). 8 | 9 | ## 16.1\. Simulating Casino Win 10 | 11 | We assume that the player, John, has a 49% chance of winning each game and that the wager is $5 per game. 
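Before running the simulation it is worth checking the expected value of a single bet, which already explains why the casino comes out ahead. The quick check below is plain Python arithmetic using the same parameters as the snippet that follows (start money $100, $5 wager, 100 bets); it is not part of the original text.

```
# a 49% win probability on a $5 wager gives the house a ~$0.10 edge per bet
p_win, wager, bets, start_m = 0.49, 5, 100, 100
ev_per_bet = p_win * wager - (1 - p_win) * wager
print(ev_per_bet)                    # ~ -0.10
print(start_m + bets * ev_per_bet)   # ~ 90: expected balance after 100 bets
```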
12 | 13 | ``` 14 | import numpy as np 15 | import pandas as pd 16 | import matplotlib.pyplot as plt 17 | from functools import reduce # reduce accumulates the running balance in the betting loop below 18 | start_m =100 19 | wager = 5 20 | bets = 100 21 | trials = 1000 22 | 23 | trans = np.vectorize(lambda t: -wager if t <=0.51 else wager) # lose the wager with probability 0.51, win it with probability 0.49 24 | 25 | fig = plt.figure(figsize=(10, 6)) 26 | ax = fig.add_subplot(1,1,1) 27 | 28 | end_m = [] 29 | 30 | for i in range(trials): 31 | money = reduce(lambda c, x: c + [c[-1] + x], trans(np.random.random(bets)), [start_m]) 32 | end_m.append(money[-1]) 33 | plt.plot(money) 34 | 35 | plt.ylabel('Player Money in $') 36 | plt.xlabel('Number of bets') 37 | plt.title(("John starts the game with $ %.2f and ends with $ %.2f")%(start_m,sum(end_m)/len(end_m))) 38 | plt.show() 39 | 40 | ``` 41 | 42 | ![https://runawayhorse001.github.io/LearningApacheSpark/_images/casino_5_100.png](img/f80c359151c40c9277e2d70f38856eab.jpg) ![https://runawayhorse001.github.io/LearningApacheSpark/_images/casino_100_1000.png](img/06dae2d3c7cb1a64cb31ed2482e632b7.jpg) 43 | 44 | ## 16.2\. Simulating a Random Walk 45 | 46 | ### 16.2.1\. Fetch the historical stock price 47 | 48 | 1. Fetch the data (this step defines the `stock` DataFrame and the `ticker` symbol used below). If you need the code for this piece, you can contact me. 49 | 50 | ``` 51 | stock.tail(4) 52 | 53 | +----------+----------+----------+----------+----------+----------+--------+ 54 | |      Date|      Open|      High|       Low|     Close| Adj Close|  Volume| 55 | +----------+----------+----------+----------+----------+----------+--------+ 56 | |2018-12-07|155.399994|158.050003|151.729996|153.059998|153.059998|17447900| 57 | |2018-12-10|150.389999|152.809998|147.479996|151.429993|151.429993|15525500| 58 | |2018-12-11|155.259995|156.240005|150.899994|151.830002|151.830002|13651900| 59 | |2018-12-12|155.240005|156.169998|151.429993|     151.5|     151.5|16597900| 60 | +----------+----------+----------+----------+----------+----------+--------+ 61 | 62 | ``` 63 | 64 | 1. Convert the `str` type date to date type 65 | 66 | ``` 67 | stock['Date'] = pd.to_datetime(stock['Date']) 68 | 69 | ``` 70 | 71 | 1. Data visualization 72 | 73 | ``` 74 | # Plot everything by leveraging the very powerful matplotlib package 75 | width = 10 76 | height = 6 77 | data = stock 78 | fig = plt.figure(figsize=(width, height)) 79 | ax = fig.add_subplot(1,1,1) 80 | ax.plot(data.Date, data.Close, label='Close') 81 | ax.plot(data.Date, data.High, label='High') 82 | # ax.plot(data.Date, data.Low, label='Low') 83 | ax.set_xlabel('Date') 84 | ax.set_ylabel('price ($)') 85 | ax.legend() 86 | ax.set_title('Stock price: ' + ticker, y=1.01) 87 | #plt.xticks(rotation=70) 88 | plt.show() 89 | # Plot everything by leveraging the very powerful matplotlib package 90 | fig = plt.figure(figsize=(width, height)) 91 | ax = fig.add_subplot(1,1,1) 92 | ax.plot(data.Date, data.Volume, label='Volume') 93 | #ax.plot(data.Date, data.High, label='High') 94 | # ax.plot(data.Date, data.Low, label='Low') 95 | ax.set_xlabel('Date') 96 | ax.set_ylabel('Volume') 97 | ax.legend() 98 | ax.set_title('Stock volume: ' + ticker, y=1.01) 99 | #plt.xticks(rotation=70) 100 | plt.show() 101 | 102 | ``` 103 | 104 | ![https://runawayhorse001.github.io/LearningApacheSpark/_images/baba_history.png](img/112822dfffe3d0e59d7881d265b78cad.jpg) 105 | 106 | Historical Stock Price 107 | 108 | ![https://runawayhorse001.github.io/LearningApacheSpark/_images/baba_history_vol.png](img/679879d3fb7800c91c8321c24ce0f8d9.jpg) 109 | 110 | Historical Stock Volume 111 | 112 | ### 16.2.2\. 
Calculate the Compound Annual Growth Rate 113 | 114 | The formula for Compound Annual Growth Rate (CAGR) is very useful for investment analysis. It may also be referred to as the annualized rate of return or annual percent yield or effective annual rate, depending on the algebraic form of the equation. Many investments such as stocks have returns that can vary wildly. The CAGR formula allows you to calculate a “smoothed” rate of return that you can use to compare to other investments. The formula is defined as (more details can be found at [CAGR Calculator and Formula](https://www.vertex42.com/Calculators/cagr-calculator.html)) 115 | 116 | ![\text{CAGR}=\left(\frac{\text{End Value}}{\text{Start Value}}\right)^{\frac{365}{\text{Days}}} -1](img/26e6828e76fd4c01d8aea7770dd032c7.jpg) 117 | 118 | ``` 119 | days = (stock.Date.iloc[-1] - stock.Date.iloc[0]).days 120 | cagr = ((((stock['Adj Close'].iloc[-1]) / stock['Adj Close'].iloc[0])) ** (365.0/days)) - 1 121 | print ('CAGR =',str(round(cagr,4)*100)+"%") 122 | mu = cagr 123 | 124 | ``` 125 | 126 | ### 16.2.3\. Calculate the annual volatility 127 | 128 | A stock’s volatility is the variation in its price over a period of time. For example, one stock may have a tendency to swing wildly higher and lower, while another stock may move in a much steadier, less turbulent way. Both stocks may end up at the same price at the end of the day, but their path to that point can vary wildly. First, we create a series of percentage returns and calculate the daily volatility (standard deviation) of those returns. To present this volatility in annualized terms, we simply multiply the daily standard deviation by the square root of 252. This assumes there are 252 trading days in a given year. More details can be found at [How to Calculate Annualized Volatility](https://www.fool.com/knowledge-center/how-to-calculate-annualized-volatility.aspx). 129 | 130 | ``` 131 | stock['Returns'] = stock['Adj Close'].pct_change() 132 | vol = stock['Returns'].std()*np.sqrt(252) 133 | 134 | ``` 135 | 136 | ### 16.2.4\. Create matrix of daily returns 137 | 138 | 1. Create the matrix of daily returns using a random normal distribution, i.e. generate an RDD matrix comprised of i.i.d. samples from the standard normal distribution N(0.0, 1.0). 139 | 140 | ``` 141 | S = stock['Adj Close'].iloc[-1] #starting stock price (i.e. last available real stock price) 142 | T = 5 #Number of trading days 143 | mu = cagr #Return 144 | vol = vol #Volatility 145 | trials = 10000 146 | from pyspark.mllib.random import RandomRDDs # sc below is the SparkContext, e.g. sc = spark.sparkContext 147 | mat = RandomRDDs.normalVectorRDD(sc, trials, T, seed=1) 148 | 149 | ``` 150 | 151 | 1. Shift and scale the generated standard-normal samples with `.map(lambda v: a + (b - a) * v)`, following the same pattern the Spark documentation uses to transform U(0.0, 1.0) into U(a, b). 152 | 153 | ``` 154 | a = mu/T 155 | b = vol/np.sqrt(T) 156 | v = mat.map(lambda x: a + (b - a)* x) 157 | 158 | ``` 159 | 160 | 1. 
Convert Rdd mstrix to dataframe 161 | 162 | ``` 163 | df = v.map(lambda x: [round(i,6)+1 for i in x]).toDF() 164 | df.show(5) 165 | 166 | ``` 167 | 168 | ``` 169 | +--------+--------+--------+--------+--------+ 170 | | _1| _2| _3| _4| _5| 171 | +--------+--------+--------+--------+--------+ 172 | |0.935234|1.162894| 1.07972|1.238257|1.066136| 173 | |0.878456|1.045922|0.990071|1.045552|0.854516| 174 | |1.186472|0.944777|0.742247|0.940023|1.220934| 175 | |0.872928|1.030882|1.248644|1.114262|1.063762| 176 | | 1.09742|1.188537|1.137283|1.162548|1.024612| 177 | +--------+--------+--------+--------+--------+ 178 | only showing top 5 rows 179 | 180 | ``` 181 | 182 | ``` 183 | from pyspark.sql.functions import lit 184 | S = stock['Adj Close'].iloc[-1] 185 | price = df.withColumn('init_price' ,lit(S)) 186 | 187 | ``` 188 | 189 | ``` 190 | price.show(5) 191 | 192 | +--------+--------+--------+--------+--------+----------+ 193 | | _1| _2| _3| _4| _5|init_price| 194 | +--------+--------+--------+--------+--------+----------+ 195 | |0.935234|1.162894| 1.07972|1.238257|1.066136| 151.5| 196 | |0.878456|1.045922|0.990071|1.045552|0.854516| 151.5| 197 | |1.186472|0.944777|0.742247|0.940023|1.220934| 151.5| 198 | |0.872928|1.030882|1.248644|1.114262|1.063762| 151.5| 199 | | 1.09742|1.188537|1.137283|1.162548|1.024612| 151.5| 200 | +--------+--------+--------+--------+--------+----------+ 201 | only showing top 5 rows 202 | 203 | ``` 204 | 205 | ``` 206 | price = price.withColumn('day_0', col('init_price')) 207 | price.show(5) 208 | 209 | ``` 210 | 211 | ``` 212 | +--------+--------+--------+--------+--------+----------+-----+ 213 | | _1| _2| _3| _4| _5|init_price|day_0| 214 | +--------+--------+--------+--------+--------+----------+-----+ 215 | |0.935234|1.162894| 1.07972|1.238257|1.066136| 151.5|151.5| 216 | |0.878456|1.045922|0.990071|1.045552|0.854516| 151.5|151.5| 217 | |1.186472|0.944777|0.742247|0.940023|1.220934| 151.5|151.5| 218 | |0.872928|1.030882|1.248644|1.114262|1.063762| 151.5|151.5| 219 | | 1.09742|1.188537|1.137283|1.162548|1.024612| 151.5|151.5| 220 | +--------+--------+--------+--------+--------+----------+-----+ 221 | only showing top 5 rows 222 | 223 | ``` 224 | 225 | ### 16.2.5\. Monte Carlo Simulation 226 | 227 | ``` 228 | from pyspark.sql.functions import round 229 | for name in price.columns[:-2]: 230 | price = price.withColumn('day'+name, round(col(name)*col('init_price'),2)) 231 | price = price.withColumn('init_price',col('day'+name)) 232 | 233 | ``` 234 | 235 | ``` 236 | price.show(5) 237 | 238 | +--------+--------+--------+--------+--------+----------+-----+------+------+------+------+------+ 239 | | _1| _2| _3| _4| _5|init_price|day_0| day_1| day_2| day_3| day_4| day_5| 240 | +--------+--------+--------+--------+--------+----------+-----+------+------+------+------+------+ 241 | |0.935234|1.162894| 1.07972|1.238257|1.066136| 234.87|151.5|141.69|164.77|177.91| 220.3|234.87| 242 | |0.878456|1.045922|0.990071|1.045552|0.854516| 123.14|151.5|133.09| 139.2|137.82| 144.1|123.14| 243 | |1.186472|0.944777|0.742247|0.940023|1.220934| 144.67|151.5|179.75|169.82|126.05|118.49|144.67| 244 | |0.872928|1.030882|1.248644|1.114262|1.063762| 201.77|151.5|132.25|136.33|170.23|189.68|201.77| 245 | | 1.09742|1.188537|1.137283|1.162548|1.024612| 267.7|151.5|166.26|197.61|224.74|261.27| 267.7| 246 | +--------+--------+--------+--------+--------+----------+-----+------+------+------+------+------+ 247 | only showing top 5 rows 248 | 249 | ``` 250 | 251 | ### 16.2.6\. 
Summary 252 | 253 | ``` 254 | selected_col = [name for name in price.columns if 'day_' in name] 255 | 256 | simulated = price.select(selected_col) 257 | simulated.describe().show() 258 | 259 | ``` 260 | 261 | ``` 262 | +-------+----------+------------------+------------------+------------------+------------------+------------------+ 263 | |summary|2018-12-12| 2018-12-13| 2018-12-14| 2018-12-17| 2018-12-18| 2018-12-19| 264 | +-------+----------+------------------+------------------+------------------+------------------+------------------+ 265 | | count| 10000.0| 10000.0| 10000.0| 10000.0| 10000.0| 10000.0| 266 | | mean| 151.5|155.11643700000002| 158.489058|162.23713200000003| 166.049375| 170.006525| 267 | | std| 0.0|18.313783237787845|26.460919262517276| 33.37780495150803|39.369101074463416|45.148120695490846| 268 | | min| 151.5| 88.2| 74.54| 65.87| 68.21| 58.25| 269 | | 25%| 151.5| 142.485| 140.15| 138.72| 138.365| 137.33| 270 | | 50%| 151.5| 154.97| 157.175| 159.82| 162.59|165.04500000000002| 271 | | 75%| 151.5| 167.445|175.48499999999999| 182.8625| 189.725| 196.975| 272 | | max| 151.5| 227.48| 275.94| 319.17| 353.59| 403.68| 273 | +-------+----------+------------------+------------------+------------------+------------------+------------------+ 274 | 275 | ``` 276 | 277 | ``` 278 | data_plt = simulated.toPandas() 279 | days = pd.date_range(stock['Date'].iloc[-1], periods= T+1,freq='B').date 280 | 281 | width = 10 282 | height = 6 283 | fig = plt.figure(figsize=(width, height)) 284 | ax = fig.add_subplot(1,1,1) 285 | 286 | days = pd.date_range(stock['Date'].iloc[-1], periods= T+1,freq='B').date 287 | 288 | for i in range(trials): 289 | plt.plot(days, data_plt.iloc[i]) 290 | ax.set_xlabel('Date') 291 | ax.set_ylabel('price ($)') 292 | ax.set_title('Simulated Stock price: ' + ticker, y=1.01) 293 | plt.show() 294 | 295 | ``` 296 | 297 | ![https://runawayhorse001.github.io/LearningApacheSpark/_images/baba_sim_price_demo.png](img/d7945700ecde92ac83058e07433755da.jpg) 298 | 299 | ### 16.2.7\. One-year Stock price simulation 300 | 301 | ![https://runawayhorse001.github.io/LearningApacheSpark/_images/baba_sim_price.png](img/68b23d0b045f72286a7dd26502a39dd0.jpg) 302 | 303 | Simulated Stock Price 304 | 305 | ![https://runawayhorse001.github.io/LearningApacheSpark/_images/baba_sim_dis1.png](img/2b9e0aad3a11fee86e4261a1e94749e0.jpg) 306 | 307 | Simulated Stock Price distribution -------------------------------------------------------------------------------- /docs/17.md: -------------------------------------------------------------------------------- 1 | # 17\. Markov Chain Monte Carlo 2 | 3 | Chinese proverb 4 | 5 | **A book is known in time of need.** 6 | 7 | ![https://runawayhorse001.github.io/LearningApacheSpark/_images/mcmc_py.png](img/b7d416463ca0be2cb7caf7d373dc26cc.jpg) 8 | 9 | Monte Carlo simulations are just a way of estimating a fixed parameter by repeatedly generating random numbers. More details can be found at [A Zero Math Introduction to Markov Chain Monte Carlo Methods](https://towardsdatascience.com/a-zero-math-introduction-to-markov-chain-monte-carlo-methods-dcba889e0c50). 10 | 11 | Markov Chain Monte Carlo (MCMC) methods are used to approximate the posterior distribution of a parameter of interest by random sampling in a probabilistic space. More details can be found at [A Zero Math Introduction to Markov Chain Monte Carlo Methods](https://towardsdatascience.com/a-zero-math-introduction-to-markov-chain-monte-carlo-methods-dcba889e0c50). 
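To make the sampling idea concrete before the formal treatment below, here is a minimal random-walk Metropolis sketch in plain Python/NumPy. It is not part of the original chapter, and the target density, proposal width, and number of draws are illustrative choices only.

```
import numpy as np

def metropolis(log_target, theta0=0.0, prop_sd=2.0, S=10000, seed=1):
    # random-walk Metropolis: propose theta* ~ Normal(theta, prop_sd) and
    # accept it with probability min(1, r), working on the log scale
    rng = np.random.default_rng(seed)
    theta, draws = theta0, []
    for _ in range(S):
        theta_star = rng.normal(theta, prop_sd)
        log_r = log_target(theta_star) - log_target(theta)
        if np.log(rng.uniform()) < log_r:
            theta = theta_star
        draws.append(theta)
    return np.array(draws)

# toy target: a standard normal log-density (up to an additive constant)
samples = metropolis(lambda t: -0.5 * t ** 2)
print(samples.mean(), samples.std())   # should be roughly 0 and 1
```

The same accept/reject structure, written on the log scale for numerical stability, is what the R demo later in this chapter implements for the conjugate Normal-Normal model.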
12 | 13 | The following theory and demo are from Dr. Rebecca C. Steorts’s [Intro to Markov Chain Monte Carlo](http://www2.stat.duke.edu/~rcs46/lecturesModernBayes/601-module6-markov/markov-chain-monte-carlo.pdf). More details can be found at Dr. Rebecca C. Steorts’s STA 360/601: [Bayesian Methods and Modern Statistics](http://www2.stat.duke.edu/~rcs46/bayes.html) class at Duke. 14 | 15 | ## 17.1\. Metropolis algorithm 16 | 17 | The Metropolis algorithm takes three main steps: 18 | 19 | 1. Sample ![\theta^* \sim J(\theta | \theta ^{(s)})](img/a70ad9ce54b1e6e921ef6745fcef45da.jpg) 20 | 21 | 2. Compute the acceptance ratio ![(r)](img/0679a6e10e2c4166bb23f4effc3d5887.jpg) 22 | 23 | > ![r = \frac{p(\theta^*|y)}{p(\theta^{(s)}|y)} = \frac{p(y|\theta^*)p(\theta^*)}{p(y|\theta^{(s)})p(\theta^{(s)})}](img/4e5f23199f8f3cd55221e56cb05bd116.jpg) 24 | 25 | 3. Let 26 | 27 | > (1)![\theta^{(s+1)} 28 | > = 29 | > \left\{ 30 | > \begin{array}{ll} 31 | > \theta^* &\text{ with prob min}{(r,1)} \\ 32 | > \theta^{(s)} &\text{ otherwise } 33 | > \end{array} 34 | > \right.](img/6b9d05253edbae95f27fa6c58648c872.jpg) 35 | 36 | Note 37 | 38 | Actually, the [(1)](#equation-eq-step3) in Step 3 can be replaced by sampling ![u \sim \text{Uniform}(0,1)](img/29928a4b917a9348751e7d0410712045.jpg) and setting ![\theta^{(s+1)}=\theta^*](img/3c0d894f48daba4e004eddd747cb556c.jpg) if ![u<r](img/3f959ef0c8e56c21a00dceb01eb6e6c1.jpg) and setting ![\theta^{(s+1)}=\theta^{(s)}](img/97fa452fc3843513b0746e76d216be78.jpg) otherwise. 39 | 40 | ## 17.2\. A Toy Example of Metropolis 41 | 42 | The following example is going to test out the Metropolis algorithm for the conjugate Normal-Normal model with a known variance situation. 43 | 44 | ### 17.2.1\. Conjugate Normal-Normal model 45 | 46 | > ![\begin{array}{ll} 47 | > X_1, \cdots, X_n & \theta \stackrel{iid}{\sim}\text{Normal}(\theta,\sigma^2)\\ 48 | > & \theta \sim\text{Normal}(\mu,\tau^2) 49 | > \end{array}](img/46803121f8c51dcb1422593800354126.jpg) 50 | 51 | Recall that the posterior of ![\theta](img/ed008064e9d0e55dc93f673b9aca6b65.jpg) is ![\text{Normal}(\mu_n,\tau^2_n)](img/5a64add1cdcd06c1755071eba4040184.jpg), where 52 | 53 | > ![\mu_n = \bar{x}\frac{n/\sigma^2}{n/\sigma^2+1/\tau^2} + \mu\frac{1/\tau^2}{n/\sigma^2+1/\tau^2}](img/9af4ce81465021e68c774194432663c0.jpg) 54 | 55 | and 56 | 57 | > ![\tau_n^2 = \frac{1}{n/\sigma^2+1/\tau^2}](img/5afb60bc18fcf81ab2ff1282bb035030.jpg) 58 | 59 | ### 17.2.2\. Example setup 60 | 61 | The rest of the parameters are ![\sigma^2=1](img/7b85580dc5d18fc8fec74ce74849c182.jpg), ![\tau^2=10](img/210c2cca588f51d57a4eee64f09d4b38.jpg), ![\mu=5](img/fcc4c8c5db1d6aa3ff080466e10ccb74.jpg), ![n=5](img/2f97b51dd2c305123e29377f808272b2.jpg) and 62 | 63 | > ![y = [9.37, 10.18, 9.16, 11.60, 10.33]](img/d9195c20e19c173ec6d22c2e60a2cddb.jpg) 64 | 65 | For this setup, we get that ![\mu_n=10.02745](img/eea23835a8abd9d903f56256c18cf8aa.jpg) and ![\tau_n^2=0.1960784](img/82f787ceda05c98a84ab98cdc998025c.jpg). 66 | 67 | ### 17.2.3\. Essential mathematical derivation 68 | 69 | In the [Metropolis algorithm](#metroalg), we need to compute the acceptance ratio ![r](img/40bca4c232843e818fac0254a2ae2689.jpg), i.e. 
70 | 71 | > ![r &= \frac{p(\theta^*|x)}{p(\theta^{(s)}|x)} \\ 72 | > &= \frac{p(x|\theta^*)p(\theta^*)}{p(x|\theta^{(s)})p(\theta^{(s)})}\\ 73 | > &= \left(\frac{\prod_i\text{dnorm}(x_i,\theta^*,\sigma)}{\prod_i\text{dnorm}(x_i,\theta^{(s)},\sigma)}\right) 74 | > \left(\frac{\text{dnorm}(\theta^*,\mu,\tau)}{\text{dnorm}(\theta^{(s)},\mu,\tau)}\right)](img/5d730ed5b317fc0cf48f64f9e3312d92.jpg) 75 | 76 | In many cases, computing the ratio ![r](img/40bca4c232843e818fac0254a2ae2689.jpg) directly can be numerically unstable, however, this can be modified by taking ![log r](img/6202e78af1fef25458e1696f4a8ae057.jpg). i.e. 77 | 78 | > ![logr &= \sum_i \left(log[\text{dnorm}(x_i,\theta^*,\sigma)] - log[\text{dnorm}(x_i, \theta^{(s)}, \sigma)]\right)\\ 79 | > &+ \sum_i \left(log[\text{dnorm}(\theta^*,\mu,\tau)] - log[\text{dnorm}(\theta^{(s)}, \mu,\tau)]\right)](img/7ab51337f3701838a285ea3a7346a777.jpg) 80 | 81 | Then the criteria of the acceptance becomes: if ![log u< log r](img/3de6020b5a20976d8e1244b98a1ae30b.jpg), where ![u](img/76b7cb4ea7971a8dc0db7a36feb66a35.jpg) is sample form the ![\text{Uniform}(0,1)](img/80f5d2344fd2f483b82b81d0a33e9333.jpg). 82 | 83 | ## 17.3\. Demos 84 | 85 | Now, We generate ![S](img/40877c463ec6621caf8f742f1e5c7c05.jpg) iterations of the Metropolis algorithm starting at ![\theta^{(0)}=0](img/43393ec4f575c391ddfca83324ec67a8.jpg) and using a normal proposal distribution, where 86 | 87 | > ![\theta^{(s+1)} \sim \text{Normal}(\theta^{(s)},2).](img/7a9b61eccdf8d02c95b6cd81a63e02ac.jpg) 88 | 89 | ### 17.3.1\. R results 90 | 91 | ``` 92 | # setting values 93 | set.seed(1) 94 | s2<-1 95 | t2<-10 96 | mu<-5; n<-5 97 | 98 | # rounding the rnorm to 2 decimal places 99 | y<-round(rnorm(n,10,1),2) 100 | # mean of the normal posterior 101 | mu.n<-( mean(y)*n/s2 + mu/t2 )/( n/s2+1/t2) 102 | # variance of the normal posterior 103 | t2.n<-1/(n/s2+1/t2) 104 | # defining the data 105 | y<-c(9.37, 10.18, 9.16, 11.60, 10.33) 106 | 107 | ####metropolis part#### 108 | ##S = total num of simulations 109 | theta<-0 ; delta<-2 ; S<-10000 ; THETA<-NULL ; set.seed(1) 110 | for(s in 1:S){ 111 | ## simulating our proposal 112 | #the new value of theta 113 | #print(theta) 114 | theta.star<-rnorm(1,theta,sqrt(delta)) 115 | ##taking the log of the ratio r 116 | log.r<-( sum(dnorm(y,theta.star,sqrt(s2),log=TRUE))+ 117 | dnorm(theta.star,mu,sqrt(t2),log=TRUE))- 118 | ( sum(dnorm(y,theta,sqrt(s2),log=TRUE))+ 119 | dnorm(theta,mu,sqrt(t2),log=TRUE)) 120 | #print(log.r) 121 | if(log(runif(1))