├── .gitignore
├── LICENSE
├── README.md
├── apache-spark-tutorials
├── Apache Spark + Collaborative Filtering.ipynb
├── Apache Spark - Pipelines.ipynb
├── MongoDB.ipynb
├── notebook+s3.ipynb
├── sample_movielens_ratings.txt
└── spark+deeplearning.ipynb
├── automated-feature-engineering
├── automated-feature-engineering.ipynb
└── automated_feature.ipynb
├── docker-project-boilerplate
├── .dockerignore
├── Dockerfile
├── README.md
├── docker-compose.yml
└── requirements.txt
├── python + API
└── Python + REST API.ipynb
├── sagemaker-training-template
├── Dockerfile
├── main.py
└── requirements.txt
├── sagemaker-tutorials
├── Digits.ipynb
├── s3+pandas.ipynb
└── s3+spark.ipynb
├── spark-sql
├── Time between specified events.ipynb
├── session length.ipynb
├── session_duration.csv
└── time_between_events_test.csv
└── tesseract_on_ami.txt
/.gitignore:
--------------------------------------------------------------------------------
1 | # Created by .ignore support plugin (hsz.mobi)
2 | ### Python template
3 | # Byte-compiled / optimized / DLL files
4 | __pycache__/
5 | *.py[cod]
6 | *$py.class
7 |
8 | # C extensions
9 | *.so
10 |
11 | # Distribution / packaging
12 | .Python
13 | build/
14 | develop-eggs/
15 | dist/
16 | downloads/
17 | eggs/
18 | .eggs/
19 | lib/
20 | lib64/
21 | parts/
22 | sdist/
23 | var/
24 | wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | .hypothesis/
50 | .pytest_cache/
51 |
52 | # Translations
53 | *.mo
54 | *.pot
55 |
56 | # Django stuff:
57 | *.log
58 | local_settings.py
59 | db.sqlite3
60 |
61 | # Flask stuff:
62 | instance/
63 | .webassets-cache
64 |
65 | # Scrapy stuff:
66 | .scrapy
67 |
68 | # Sphinx documentation
69 | docs/_build/
70 |
71 | # PyBuilder
72 | target/
73 |
74 | # Jupyter Notebook
75 | .ipynb_checkpoints
76 |
77 | # pyenv
78 | .python-version
79 |
80 | # celery beat schedule file
81 | celerybeat-schedule
82 |
83 | # SageMath parsed files
84 | *.sage.py
85 |
86 | # Environments
87 | .env
88 | .venv
89 | env/
90 | venv/
91 | ENV/
92 | env.bak/
93 | venv.bak/
94 |
95 | # Spyder project settings
96 | .spyderproject
97 | .spyproject
98 |
99 | # Rope project settings
100 | .ropeproject
101 |
102 | # mkdocs documentation
103 | /site
104 |
105 | # mypy
106 | .mypy_cache/
107 |
108 | data/db
109 | apache-spark-tutorials/data
110 | flower_photos.tgz
111 | model-full.h5
112 | apache-spark-tutorials/flower_photos
113 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # DataScience
2 | ### Repository for sharing knowledge and reusable concepts in the data science and machine learning fields.
--------------------------------------------------------------------------------
/apache-spark-tutorials/Apache Spark + Collaborative Filtering.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "Install Apache Spark:"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "$ pip install pyspark"
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {},
20 | "source": [
21 | "Initialize spark session:"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": 2,
27 | "metadata": {
28 | "collapsed": true
29 | },
30 | "outputs": [],
31 | "source": [
32 | "from pyspark.context import SparkContext\n",
33 | "from pyspark.sql.session import SparkSession\n",
34 | "\n",
35 | "sc = SparkContext('local')\n",
36 | "spark = SparkSession(sc)"
37 | ]
38 | },
39 | {
40 | "cell_type": "markdown",
41 | "metadata": {},
42 | "source": [
43 | "File \"sample_movielens_ratings.txt\" contains rows with content:\n",
44 | "userId::movieId::rating::timestamp\n",
45 | "\n",
46 |     "For example, the row 29::9::1::1424380312 means:\n",
47 | "userId=29\n",
48 | "movieId=9\n",
49 | "rating=1\n",
50 | "timestamp=1424380312"
51 | ]
52 | },
53 | {
54 | "cell_type": "markdown",
55 | "metadata": {},
56 | "source": [
57 | "Read and parse dataset:"
58 | ]
59 | },
60 | {
61 | "cell_type": "code",
62 | "execution_count": 6,
63 | "metadata": {},
64 | "outputs": [],
65 | "source": [
66 | "from pyspark.ml.evaluation import RegressionEvaluator\n",
67 | "from pyspark.ml.recommendation import ALS\n",
68 | "from pyspark.sql import Row\n",
69 | "\n",
70 | "lines = spark.read.text(\"sample_movielens_ratings.txt\").rdd\n",
71 | "parts = lines.map(lambda row: row.value.split(\"::\"))\n",
72 | "ratingsRDD = parts.map(lambda p: Row(userId=int(p[0]), movieId=int(p[1]),\n",
73 | " rating=float(p[2]), timestamp=float(p[3])))\n",
74 | "ratings = spark.createDataFrame(ratingsRDD)"
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": 7,
80 | "metadata": {
81 | "collapsed": true
82 | },
83 | "outputs": [],
84 | "source": [
85 | "#Split dataset to training and test:\n",
86 | "(training, test) = ratings.randomSplit([0.8, 0.2])"
87 | ]
88 | },
89 | {
90 | "cell_type": "markdown",
91 | "metadata": {},
92 | "source": [
93 | "Important features while using ALS:\n",
94 | "- userCol - column with user id identifier\n",
95 | "- itemCol - column with identifier of an object\n",
96 |     "- ratingCol - column of rating, this could be an explicit rating or an implicit one (for example a kind of behaviour), in this second case implicitPrefs=True should be used for better results\n",
97 | "- coldStartStrategy - strategy for cold start problem, there are 2 solutions in Apache: drop - drop nan values, and nan - return nan values, other strategies are in development"
98 | ]
99 | },
100 | {
101 | "cell_type": "code",
102 | "execution_count": 8,
103 | "metadata": {
104 | "collapsed": true
105 | },
106 | "outputs": [],
107 | "source": [
108 | "# Build the recommendation model using ALS on the training data\n",
109 | "# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics\n",
110 | "als = ALS(maxIter=5, regParam=0.01, userCol=\"userId\", itemCol=\"movieId\", ratingCol=\"rating\",\n",
111 | " coldStartStrategy=\"drop\")\n",
112 | "model = als.fit(training)"
113 | ]
114 | },
115 | {
116 | "cell_type": "code",
117 | "execution_count": 9,
118 | "metadata": {},
119 | "outputs": [
120 | {
121 | "name": "stdout",
122 | "output_type": "stream",
123 | "text": [
124 | "Root-mean-square error = 1.7991273222740998\n"
125 | ]
126 | }
127 | ],
128 | "source": [
129 | "# Evaluate the model by computing the RMSE on the test data\n",
130 | "predictions = model.transform(test)\n",
131 | "evaluator = RegressionEvaluator(metricName=\"rmse\", labelCol=\"rating\",\n",
132 | " predictionCol=\"prediction\")\n",
133 | "rmse = evaluator.evaluate(predictions)\n",
134 | "print(\"Root-mean-square error = \" + str(rmse))"
135 | ]
136 | },
137 | {
138 | "cell_type": "code",
139 | "execution_count": 17,
140 | "metadata": {},
141 | "outputs": [
142 | {
143 | "data": {
144 | "text/html": [
145 | "
\n",
146 | "\n",
159 | "
\n",
160 | " \n",
161 | " \n",
162 | " \n",
163 | " userId \n",
164 | " recommendations \n",
165 | " \n",
166 | " \n",
167 | " \n",
168 | " \n",
169 | " 0 \n",
170 | " 28 \n",
171 | " [(85, 5.939967155456543), (92, 5.1181178092956... \n",
172 | " \n",
173 | " \n",
174 | " 1 \n",
175 | " 26 \n",
176 | " [(62, 5.905642986297607), (32, 5.5628299713134... \n",
177 | " \n",
178 | " \n",
179 | " 2 \n",
180 | " 27 \n",
181 | " [(85, 5.086740016937256), (34, 4.4568171501159... \n",
182 | " \n",
183 | " \n",
184 | "
\n",
185 | "
"
186 | ],
187 | "text/plain": [
188 | " userId recommendations\n",
189 | "0 28 [(85, 5.939967155456543), (92, 5.1181178092956...\n",
190 | "1 26 [(62, 5.905642986297607), (32, 5.5628299713134...\n",
191 | "2 27 [(85, 5.086740016937256), (34, 4.4568171501159..."
192 | ]
193 | },
194 | "execution_count": 17,
195 | "metadata": {},
196 | "output_type": "execute_result"
197 | }
198 | ],
199 | "source": [
200 | "# Generate top 10 movie recommendations for each user\n",
201 | "userRecs = model.recommendForAllUsers(10)\n",
202 | "userRecs.toPandas().head(3)"
203 | ]
204 | },
205 | {
206 | "cell_type": "code",
207 | "execution_count": 18,
208 | "metadata": {},
209 | "outputs": [
210 | {
211 | "data": {
212 | "text/html": [
213 | "\n",
214 | "\n",
227 | "
\n",
228 | " \n",
229 | " \n",
230 | " \n",
231 | " movieId \n",
232 | " recommendations \n",
233 | " \n",
234 | " \n",
235 | " \n",
236 | " \n",
237 | " 0 \n",
238 | " 31 \n",
239 | " [(20, 4.2785234451293945), (12, 3.455365180969... \n",
240 | " \n",
241 | " \n",
242 | " 1 \n",
243 | " 85 \n",
244 | " [(28, 5.939967155456543), (27, 5.0867400169372... \n",
245 | " \n",
246 | " \n",
247 | " 2 \n",
248 | " 65 \n",
249 | " [(11, 3.328084945678711), (20, 1.9836831092834... \n",
250 | " \n",
251 | " \n",
252 | "
\n",
253 | "
"
254 | ],
255 | "text/plain": [
256 | " movieId recommendations\n",
257 | "0 31 [(20, 4.2785234451293945), (12, 3.455365180969...\n",
258 | "1 85 [(28, 5.939967155456543), (27, 5.0867400169372...\n",
259 | "2 65 [(11, 3.328084945678711), (20, 1.9836831092834..."
260 | ]
261 | },
262 | "execution_count": 18,
263 | "metadata": {},
264 | "output_type": "execute_result"
265 | }
266 | ],
267 | "source": [
268 | "# Generate top 10 user recommendations for each movie\n",
269 | "movieRecs = model.recommendForAllItems(10)\n",
270 | "movieRecs.toPandas().head(3)"
271 | ]
272 | },
273 | {
274 | "cell_type": "code",
275 | "execution_count": 33,
276 | "metadata": {},
277 | "outputs": [
278 | {
279 | "data": {
280 | "text/plain": [
281 | "[Row(userId=28, movieId=[85, 92, 81, 12, 2, 89, 82, 47, 76, 11]),\n",
282 | " Row(userId=26, movieId=[62, 32, 7, 94, 23, 24, 22, 64, 60, 75]),\n",
283 | " Row(userId=27, movieId=[85, 34, 47, 80, 51, 33, 75, 19, 27, 83]),\n",
284 | " Row(userId=12, movieId=[46, 17, 64, 27, 30, 22, 32, 68, 16, 36]),\n",
285 | " Row(userId=22, movieId=[22, 94, 74, 30, 75, 51, 46, 7, 32, 62]),\n",
286 | " Row(userId=1, movieId=[55, 17, 83, 64, 68, 10, 46, 66, 8, 85]),\n",
287 | " Row(userId=13, movieId=[39, 93, 70, 83, 29, 72, 74, 8, 77, 53]),\n",
288 | " Row(userId=6, movieId=[34, 83, 64, 41, 47, 43, 74, 63, 67, 85]),\n",
289 | " Row(userId=16, movieId=[19, 51, 90, 54, 75, 71, 29, 27, 58, 47]),\n",
290 | " Row(userId=3, movieId=[51, 75, 22, 27, 80, 85, 77, 88, 39, 83]),\n",
291 | " Row(userId=20, movieId=[27, 52, 22, 30, 31, 17, 77, 96, 88, 53]),\n",
292 | " Row(userId=5, movieId=[17, 55, 27, 90, 30, 10, 46, 49, 68, 32]),\n",
293 | " Row(userId=19, movieId=[46, 90, 94, 98, 71, 74, 51, 54, 30, 19]),\n",
294 | " Row(userId=15, movieId=[46, 1, 53, 4, 3, 74, 31, 61, 98, 77]),\n",
295 | " Row(userId=17, movieId=[90, 46, 55, 17, 94, 30, 68, 32, 10, 64]),\n",
296 | " Row(userId=9, movieId=[49, 18, 7, 32, 87, 79, 47, 67, 43, 27]),\n",
297 | " Row(userId=4, movieId=[41, 52, 70, 72, 93, 83, 64, 87, 63, 40]),\n",
298 | " Row(userId=8, movieId=[51, 52, 29, 85, 53, 22, 75, 62, 58, 95]),\n",
299 | " Row(userId=23, movieId=[55, 49, 27, 96, 32, 17, 90, 30, 52, 23]),\n",
300 | " Row(userId=7, movieId=[85, 47, 34, 52, 62, 87, 29, 41, 76, 39]),\n",
301 | " Row(userId=10, movieId=[64, 85, 46, 62, 12, 81, 47, 11, 40, 23]),\n",
302 | " Row(userId=25, movieId=[47, 33, 46, 71, 90, 91, 16, 1, 82, 12]),\n",
303 | " Row(userId=24, movieId=[52, 90, 30, 55, 72, 63, 17, 70, 27, 9]),\n",
304 | " Row(userId=29, movieId=[46, 90, 23, 32, 10, 94, 17, 54, 68, 49]),\n",
305 | " Row(userId=21, movieId=[29, 47, 52, 87, 34, 18, 2, 93, 90, 63]),\n",
306 | " Row(userId=11, movieId=[53, 46, 77, 18, 93, 48, 23, 29, 74, 27]),\n",
307 | " Row(userId=14, movieId=[52, 29, 87, 18, 62, 49, 96, 90, 47, 70]),\n",
308 | " Row(userId=2, movieId=[72, 93, 8, 83, 41, 39, 70, 40, 74, 89]),\n",
309 | " Row(userId=0, movieId=[39, 2, 77, 85, 89, 52, 8, 92, 88, 83]),\n",
310 | " Row(userId=18, movieId=[88, 83, 85, 39, 89, 24, 8, 11, 77, 61])]"
311 | ]
312 | },
313 | "execution_count": 33,
314 | "metadata": {},
315 | "output_type": "execute_result"
316 | }
317 | ],
318 | "source": [
319 | "recommendations_for_users = userRecs.select(\"userId\", \"recommendations.movieId\")\n",
320 | "recommendations_for_users.collect()"
321 | ]
322 | },
323 | {
324 | "cell_type": "code",
325 | "execution_count": 38,
326 | "metadata": {},
327 | "outputs": [
328 | {
329 | "data": {
330 | "text/plain": [
331 | "['{\"userId\":28,\"movieId\":[85,92,81,12,2,89,82,47,76,11]}',\n",
332 | " '{\"userId\":26,\"movieId\":[62,32,7,94,23,24,22,64,60,75]}',\n",
333 | " '{\"userId\":27,\"movieId\":[85,34,47,80,51,33,75,19,27,83]}',\n",
334 | " '{\"userId\":12,\"movieId\":[46,17,64,27,30,22,32,68,16,36]}',\n",
335 | " '{\"userId\":22,\"movieId\":[22,94,74,30,75,51,46,7,32,62]}',\n",
336 | " '{\"userId\":1,\"movieId\":[55,17,83,64,68,10,46,66,8,85]}',\n",
337 | " '{\"userId\":13,\"movieId\":[39,93,70,83,29,72,74,8,77,53]}',\n",
338 | " '{\"userId\":6,\"movieId\":[34,83,64,41,47,43,74,63,67,85]}',\n",
339 | " '{\"userId\":16,\"movieId\":[19,51,90,54,75,71,29,27,58,47]}',\n",
340 | " '{\"userId\":3,\"movieId\":[51,75,22,27,80,85,77,88,39,83]}',\n",
341 | " '{\"userId\":20,\"movieId\":[27,52,22,30,31,17,77,96,88,53]}',\n",
342 | " '{\"userId\":5,\"movieId\":[17,55,27,90,30,10,46,49,68,32]}',\n",
343 | " '{\"userId\":19,\"movieId\":[46,90,94,98,71,74,51,54,30,19]}',\n",
344 | " '{\"userId\":15,\"movieId\":[46,1,53,4,3,74,31,61,98,77]}',\n",
345 | " '{\"userId\":17,\"movieId\":[90,46,55,17,94,30,68,32,10,64]}',\n",
346 | " '{\"userId\":9,\"movieId\":[49,18,7,32,87,79,47,67,43,27]}',\n",
347 | " '{\"userId\":4,\"movieId\":[41,52,70,72,93,83,64,87,63,40]}',\n",
348 | " '{\"userId\":8,\"movieId\":[51,52,29,85,53,22,75,62,58,95]}',\n",
349 | " '{\"userId\":23,\"movieId\":[55,49,27,96,32,17,90,30,52,23]}',\n",
350 | " '{\"userId\":7,\"movieId\":[85,47,34,52,62,87,29,41,76,39]}',\n",
351 | " '{\"userId\":10,\"movieId\":[64,85,46,62,12,81,47,11,40,23]}',\n",
352 | " '{\"userId\":25,\"movieId\":[47,33,46,71,90,91,16,1,82,12]}',\n",
353 | " '{\"userId\":24,\"movieId\":[52,90,30,55,72,63,17,70,27,9]}',\n",
354 | " '{\"userId\":29,\"movieId\":[46,90,23,32,10,94,17,54,68,49]}',\n",
355 | " '{\"userId\":21,\"movieId\":[29,47,52,87,34,18,2,93,90,63]}',\n",
356 | " '{\"userId\":11,\"movieId\":[53,46,77,18,93,48,23,29,74,27]}',\n",
357 | " '{\"userId\":14,\"movieId\":[52,29,87,18,62,49,96,90,47,70]}',\n",
358 | " '{\"userId\":2,\"movieId\":[72,93,8,83,41,39,70,40,74,89]}',\n",
359 | " '{\"userId\":0,\"movieId\":[39,2,77,85,89,52,8,92,88,83]}',\n",
360 | " '{\"userId\":18,\"movieId\":[88,83,85,39,89,24,8,11,77,61]}']"
361 | ]
362 | },
363 | "execution_count": 38,
364 | "metadata": {},
365 | "output_type": "execute_result"
366 | }
367 | ],
368 | "source": [
369 | "json_rdd = recommendations_for_users.toJSON()\n",
370 | "json_rdd.collect()"
371 | ]
372 | }
373 | ],
374 | "metadata": {
375 | "kernelspec": {
376 | "display_name": "Python 3",
377 | "language": "python",
378 | "name": "python3"
379 | },
380 | "language_info": {
381 | "codemirror_mode": {
382 | "name": "ipython",
383 | "version": 3
384 | },
385 | "file_extension": ".py",
386 | "mimetype": "text/x-python",
387 | "name": "python",
388 | "nbconvert_exporter": "python",
389 | "pygments_lexer": "ipython3",
390 | "version": "3.6.3"
391 | }
392 | },
393 | "nbformat": 4,
394 | "nbformat_minor": 2
395 | }
396 |
--------------------------------------------------------------------------------
/apache-spark-tutorials/MongoDB.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# MongoDB example: database preparation"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "Install MongoDB on Mac OS:"
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {},
20 | "source": [
21 | "$ brew install mongodb"
22 | ]
23 | },
24 | {
25 | "cell_type": "markdown",
26 | "metadata": {},
27 | "source": [
28 | "Create dir for data:"
29 | ]
30 | },
31 | {
32 | "cell_type": "markdown",
33 | "metadata": {},
34 | "source": [
35 | "$ mkdir -p data/db"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "Run mongoDB:"
43 | ]
44 | },
45 | {
46 | "cell_type": "markdown",
47 | "metadata": {},
48 | "source": [
49 | "$ mongod --dbpath data/db"
50 | ]
51 | },
52 | {
53 | "cell_type": "markdown",
54 | "metadata": {},
55 | "source": [
56 |     "Start (in a new console window) a mongo shell on the same host machine as the mongod. Use the --host command line option to specify the localhost address (in this case 127.0.0.1) and the port that the mongod listens on:"
57 | ]
58 | },
59 | {
60 | "cell_type": "markdown",
61 | "metadata": {},
62 | "source": [
63 | "$ mongo --host 127.0.0.1:27017"
64 | ]
65 | },
66 | {
67 | "cell_type": "markdown",
68 | "metadata": {},
69 | "source": [
70 | "Later, to stop MongoDB, press Control+C in the terminal where the mongod instance is running."
71 | ]
72 | },
73 | {
74 | "cell_type": "markdown",
75 | "metadata": {},
76 | "source": [
77 | "Create and use new database: "
78 | ]
79 | },
80 | {
81 | "cell_type": "markdown",
82 | "metadata": {},
83 | "source": [
84 | "> use test"
85 | ]
86 | },
87 | {
88 | "cell_type": "markdown",
89 | "metadata": {},
90 | "source": [
91 | "Insert test data to new database:"
92 | ]
93 | },
94 | {
95 | "cell_type": "markdown",
96 | "metadata": {},
97 | "source": [
98 | "> db.movies.insert([{\n",
99 | "\t\"_id\": \"5692a15524de1e0ce2dfcfa3\",\n",
100 |     "\t\"title\": \"Toy Story 3\",\n",
101 |     "\t\"year\": 2010,\n",
102 | "\t\"rated\": \"G\",\n",
103 | "\t\"released\": {\n",
104 | "\t\t\"$date\": \"2010-06-18T04:00:00.000Z\"\n",
105 | "\t},\n",
106 | "\t\"runtime\": 206,\n",
107 | "\t\"countries\": [\n",
108 | "\t\t\"USA\"\n",
109 | "\t],\n",
110 | "\t\"genres\": [\n",
111 | "\t\t\"Animation\",\n",
112 | "\t\t\"Adventure\",\n",
113 | "\t\t\"Comedy\"\n",
114 | "\t],\n",
115 | "\t\"director\": \"Lee Unkrich\",\n",
116 | "\t\"writers\": [\n",
117 | "\t\t\"John Lasseter\",\n",
118 | "\t\t\"Andrew Stanton\",\n",
119 | "\t\t\"Lee Unkrich\",\n",
120 | "\t\t\"Michael Arndt\"\n",
121 | "\t],\n",
122 | "\t\"actors\": [\n",
123 | "\t\t\"Tom Hanks\",\n",
124 | "\t\t\"Tim Allen\",\n",
125 | "\t\t\"Joan Cusack\",\n",
126 | "\t\t\"Ned Beatty\"\n",
127 | "\t],\n",
128 | "\t\"plot\": \"The toys are mistakenly delivered to a day-care center instead of the attic right before Andy leaves for college, and it's up to Woody to convince the other toys that they weren't abandoned and to return home.\",\n",
129 | "\t\"poster\": \"http://ia.media-imdb.com/images/M/MV5BMTgxOTY4Mjc0MF5BMl5BanBnXkFtZTcwNTA4MDQyMw@@._V1_SX300.jpg\",\n",
130 | "\t\"imdb\": {\n",
131 | "\t\t\"id\": \"tt0435761\",\n",
132 | "\t\t\"rating\": 8.4,\n",
133 | "\t\t\"votes\": 500084\n",
134 | "\t},\n",
135 | "\t\"tomato\": {\n",
136 | "\t\t\"meter\": 99,\n",
137 | "\t\t\"image\": \"certified\",\n",
138 | "\t\t\"rating\": 8.9,\n",
139 | "\t\t\"reviews\": 287,\n",
140 | "\t\t\"fresh\": 283,\n",
141 | "\t\t\"consensus\": \"Deftly blending comedy, adventure, and honest emotion, Toy Story 3 is a rare second sequel that really works.\",\n",
142 | "\t\t\"userMeter\": 89,\n",
143 | "\t\t\"userRating\": 4.3,\n",
144 | "\t\t\"userReviews\": 602138\n",
145 | "\t},\n",
146 | "\t\"metacritic\": 92,\n",
147 | "\t\"awards\": {\n",
148 | "\t\t\"wins\": 56,\n",
149 | "\t\t\"nominations\": 86,\n",
150 | "\t\t\"text\": \"Won 2 Oscars. Another 56 wins \\u0026 86 nominations.\"\n",
151 | "\t},\n",
152 | "\t\"type\": \"movie\",\n",
153 | "\t\"reviews\": [\n",
154 | "\t\t{\n",
155 | "\t\t\t\"date\": {\n",
156 | "\t\t\t\t\"$date\": \"2017-02-13T04:00:00.000Z\"\n",
157 | "\t\t\t},\n",
158 | "\t\t\t\"name\": \"parvesh\",\n",
159 | "\t\t\t\"rating\": 8.9,\n",
160 | "\t\t\t\"comment\": \"My first review for Toy Story 3, hoping it will execute while trying for the very first time.\"\n",
161 | "\t\t},\n",
162 | "\t\t{\n",
163 | "\t\t\t\"date\": {\n",
164 | "\t\t\t\t\"$date\": \"2017-02-13T04:00:00.000Z\"\n",
165 | "\t\t\t},\n",
166 | "\t\t\t\"name\": \"Prabhash\",\n",
167 | "\t\t\t\"rating\": 8.9,\n",
168 | "\t\t\t\"comment\": \"My first review for Toy Story 3, hoping it will execute while trying for the very first time.\"\n",
169 | "\t\t},\n",
170 | "\t\t{\n",
171 | "\t\t\t\"date\": {\n",
172 | "\t\t\t\t\"$date\": \"2017-02-11T04:00:00.000Z\"\n",
173 | "\t\t\t},\n",
174 | "\t\t\t\"name\": \"praveen\",\n",
175 | "\t\t\t\"rating\": 6.7,\n",
176 | "\t\t\t\"comment\": \"My first review for Toy Story 3, hoping it will execute while trying for the very first time.\"\n",
177 | "\t\t}\n",
178 | "\t]\n",
179 | "}, {\n",
180 | "\t\"_id\": \"589cbda9c0b9fec62febf274\",\n",
181 | "\t\"title\": \"Deadpool\",\n",
182 | "\t\"year\": 2016,\n",
183 | "\t\"rated\": \"R\",\n",
184 | "\t\"released\": {\n",
185 | "\t\t\"$date\": \"2016-06-18T04:00:00.000Z\"\n",
186 | "\t},\n",
187 | "\t\"runtime\": 108,\n",
188 | "\t\"countries\": [\n",
189 | "\t\t\"USA\"\n",
190 | "\t],\n",
191 | "\t\"genres\": [\n",
192 | "\t\t\"Comics character\",\n",
193 | "\t\t\"Adventure\",\n",
194 | "\t\t\"Action\"\n",
195 | "\t],\n",
196 | "\t\"director\": \"Tim Miller\",\n",
197 | "\t\"writers\": [\n",
198 | "\t\t\"Rhett Reese\",\n",
199 | "\t\t\"Paul Wernick\"\n",
200 | "\t],\n",
201 | "\t\"actors\": [\n",
202 | "\t\t\"Ryan Reynolds\",\n",
203 | "\t\t\"Morena Baccarin\",\n",
204 | "\t\t\"Ed Skrein\",\n",
205 | "\t\t\"T.J. Miller\",\n",
206 | "\t\t\"Gina Carano\",\n",
207 | "\t\t\"Leslie Uggams\",\n",
208 | "\t\t\"Stefan Kapičić\",\n",
209 | "\t\t\"Brianna Hildebrand\"\n",
210 | "\t],\n",
211 | "\t\"plot\": \"Deadpool is a 2016 American superhero film directed by Tim Miller and written by Rhett Reese and Paul Wernick, based on the Marvel Comics character of the same name.\",\n",
212 | "\t\"poster\": \"http://ia.media-imdb.com/images/M/MV5BMTgxOTY4Mjc0MF5BMl5BanBnXkFtZTcwNTA4MDQyMw@@._V1_SX300.jpg\",\n",
213 | "\t\"imdb\": {\n",
214 | "\t\t\"id\": \"tt1431045\",\n",
215 | "\t\t\"rating\": 8.1,\n",
216 | "\t\t\"votes\": 585141\n",
217 | "\t},\n",
218 | "\t\"tomato\": {\n",
219 | "\t\t\"meter\": 99,\n",
220 | "\t\t\"image\": \"certified\",\n",
221 | "\t\t\"rating\": 6.9,\n",
222 | "\t\t\"reviews\": 287,\n",
223 | "\t\t\"fresh\": 241,\n",
224 | "\t\t\"consensus\": \"Fast, funny, and gleefully profane, the fourth-wall-busting Deadpool.\",\n",
225 | "\t\t\"userMeter\": 90,\n",
226 | "\t\t\"userRating\": 4.3,\n",
227 | "\t\t\"userReviews\": 181719\n",
228 | "\t},\n",
229 | "\t\"metacritic\": 92,\n",
230 | "\t\"awards\": {\n",
231 | "\t\t\"wins\": 5,\n",
232 | "\t\t\"nominations\": 12,\n",
233 | "\t\t\"text\": \"Two Golden Globe Award nominations for Best Motion Picture – Musical or Comedy and Best Actor – Motion Picture Musical or Comedy.\"\n",
234 | "\t},\n",
235 | "\t\"type\": \"movie\"\n",
236 | "}, {\n",
237 | "\t\"_id\": \"589cc22cc0b9fec62febf275\",\n",
238 | "\t\"title\": \"BATMAN V SUPERMAN: DAWN OF JUSTICE\",\n",
239 | "\t\"year\": 2016,\n",
240 | "\t\"rated\": \"PG-13\",\n",
241 | "\t\"released\": {\n",
242 | "\t\t\"$date\": \"2016-03-19T04:00:00.000Z\"\n",
243 | "\t},\n",
244 | "\t\"runtime\": 151,\n",
245 | "\t\"countries\": [\n",
246 | "\t\t\"USA\"\n",
247 | "\t],\n",
248 | "\t\"genres\": [\n",
249 | "\t\t\"Action\",\n",
250 | "\t\t\"Adventure\",\n",
251 | "\t\t\"Sci-Fi\"\n",
252 | "\t],\n",
253 | "\t\"director\": \"Lee Unkrich\",\n",
254 | "\t\"writers\": [\n",
255 | "\t\t\"Chris Terrio\",\n",
256 | "\t\t\"David S. Goyer\"\n",
257 | "\t],\n",
258 | "\t\"actors\": [\n",
259 | "\t\t\"Amy Adams\",\n",
260 | "\t\t\"Henry Cavill\",\n",
261 | "\t\t\"Ben Affleck\"\n",
262 | "\t],\n",
263 | "\t\"plot\": \"The general public is concerned over having Superman on their planet and letting the Dark Knight - Batman - pursue the streets of Gotham. While this is happening, a power-phobic Batman tries to attack Superman.,Meanwhile Superman tries to settle on a decision, and Lex Luthor, the criminal mastermind and millionaire, tries to use his own advantages to fight the Man of Steel.\",\n",
264 | "\t\"poster\": \"http://ia.media-imdb.com/images/M/MV5BMTgxOTY4Mjc0MF5BMl5BanBnXkFtZTcwNTA4MDQyMw@@._V1_SX300.jpg\",\n",
265 | "\t\"imdb\": {\n",
266 | "\t\t\"id\": \"tt2975590\",\n",
267 | "\t\t\"rating\": 6.7,\n",
268 | "\t\t\"votes\": 3206\n",
269 | "\t},\n",
270 | "\t\"tomato\": {\n",
271 | "\t\t\"meter\": 27,\n",
272 | "\t\t\"image\": \"certified\",\n",
273 | "\t\t\"rating\": 4.9,\n",
274 | "\t\t\"reviews\": 353,\n",
275 | "\t\t\"fresh\": 97,\n",
276 | "\t\t\"consensus\": \"Batman v Superman: Dawn of Justice smothers a potentially powerful story -- and some of Americas most iconic superheroes -- in a grim whirlwind of effects-driven action.\",\n",
277 | "\t\t\"userMeter\": 64,\n",
278 | "\t\t\"userRating\": 3.6,\n",
279 | "\t\t\"userReviews\": 225954\n",
280 | "\t},\n",
281 | "\t\"metacritic\": 44,\n",
282 | "\t\"awards\": {\n",
283 | "\t\t\"wins\": 6,\n",
284 | "\t\t\"nominations\": 26,\n",
285 | "\t\t\"text\": \"Actor of the Year, Most Original Poster, Best Body of Work\"\n",
286 | "\t},\n",
287 | "\t\"type\": \"movie\"\n",
288 | "}, {\n",
289 | "\t\"_id\": \"589cc417c0b9fec62febf276\",\n",
290 | "\t\"title\": \"doctor strange\",\n",
291 | "\t\"year\": 2016,\n",
292 | "\t\"rated\": \"PG-13\",\n",
293 | "\t\"released\": {\n",
294 | "\t\t\"$date\": \"2016-11-04T04:00:00.000Z\"\n",
295 | "\t},\n",
296 | "\t\"runtime\": 115,\n",
297 | "\t\"countries\": [\n",
298 | "\t\t\"USA\"\n",
299 | "\t],\n",
300 | "\t\"genres\": [\n",
301 | "\t\t\"Sci-Fi\",\n",
302 | "\t\t\"Fantasy\",\n",
303 | "\t\t\"Adventure\",\n",
304 | "\t\t\"Action\"\n",
305 | "\t],\n",
306 | "\t\"director\": \"Scott Derrickson\",\n",
307 | "\t\"writers\": [\n",
308 | "\t\t\"Jon Spaihts\",\n",
309 | "\t\t\"Scott Derrickson\"\n",
310 | "\t],\n",
311 | "\t\"actors\": [\n",
312 | "\t\t\"Benedict Cumberbatch\",\n",
313 | "\t\t\"Chiwetel Ejiofor\",\n",
314 | "\t\t\"Rachel McAdams\"\n",
315 | "\t],\n",
316 | "\t\"plot\": \"Marvels Doctor Strange follows the story of the talented neurosurgeon Doctor Stephen Strange who, after a tragic car accident, must put ego aside and learn the secrets of a hidden world of mysticism and alternate dimensions. Based in New York Citys Greenwich Village, Doctor Strange must act as an intermediary between the real world and what lies beyond, utilising a vast array of metaphysical abilities and artifacts to protect the Marvel Cinematic Universe.\",\n",
317 | "\t\"poster\": \"http://ia.media-imdb.com/images/M/MV5BMTgxOTY4Mjc0MF5BMl5BanBnXkFtZTcwNTA4MDQyMw@@._V1_SX300.jpg\",\n",
318 | "\t\"imdb\": {\n",
319 | "\t\t\"id\": \"tt1211837\",\n",
320 | "\t\t\"rating\": 7.8,\n",
321 | "\t\t\"votes\": 191181\n",
322 | "\t},\n",
323 | "\t\"tomato\": {\n",
324 | "\t\t\"meter\": 27,\n",
325 | "\t\t\"image\": \"certified\",\n",
326 | "\t\t\"rating\": 4.9,\n",
327 | "\t\t\"reviews\": 353,\n",
328 | "\t\t\"fresh\": 97,\n",
329 | "\t\t\"consensus\": \"Batman v Superman: Dawn of Justice smothers a potentially powerful story -- and some of Americas most iconic superheroes -- in a grim whirlwind of effects-driven action.\",\n",
330 | "\t\t\"userMeter\": 64,\n",
331 | "\t\t\"userRating\": 3.6,\n",
332 | "\t\t\"userReviews\": 225954\n",
333 | "\t},\n",
334 | "\t\"metacritic\": 44,\n",
335 | "\t\"awards\": {\n",
336 | "\t\t\"wins\": 6,\n",
337 | "\t\t\"nominations\": 38,\n",
338 | "\t\t\"text\": \"Oscar, Best Visual Effects\"\n",
339 | "\t},\n",
340 | "\t\"type\": \"movie\"\n",
341 | "}, {\n",
342 | "\t\"_id\": \"589cc696c0b9fec62febf277\",\n",
343 | "\t\"title\": \"kung fu panda 3\",\n",
344 | "\t\"year\": 2016,\n",
345 | "\t\"rated\": \"PG\",\n",
346 | "\t\"released\": {\n",
347 | "\t\t\"$date\": \"2016-01-29T04:00:00.000Z\"\n",
348 | "\t},\n",
349 | "\t\"runtime\": 95,\n",
350 | "\t\"countries\": [\n",
351 | "\t\t\"USA\"\n",
352 | "\t],\n",
353 | "\t\"genres\": [\n",
354 | "\t\t\" Animation\",\n",
355 | "\t\t\"Action\",\n",
356 | "\t\t\"Adventure\",\n",
357 | "\t\t\"Comedy\",\n",
358 | "\t\t\"Family\"\n",
359 | "\t],\n",
360 | "\t\"director\": \" Alessandro Carloni\",\n",
361 | "\t\"writers\": [\n",
362 | "\t\t\" Jonathan Aibel\",\n",
363 | "\t\t\"Glenn Berger\"\n",
364 | "\t],\n",
365 | "\t\"actors\": [\n",
366 | "\t\t\" Jack Black\",\n",
367 | "\t\t\"Dustin Hoffman\",\n",
368 | "\t\t\"Bryan Cranston\"\n",
369 | "\t],\n",
370 | "\t\"plot\": \"When Pos long-lost panda father suddenly reappears, the reunited duo travels to a secret panda paradise to meet scores of hilarious new panda characters. But when the supernatural villain Kai begins to sweep across China defeating all the kung fu masters, Po must do the impossible-learn to train a village full of his fun-loving, clumsy brethren to become the ultimate band of Kung Fu Pandas.\",\n",
371 | "\t\"poster\": \"http://ia.media-imdb.com/images/M/MV5BMTgxOTY4Mjc0MF5BMl5BanBnXkFtZTcwNTA4MDQyMw@@._V1_SX300.jpg\",\n",
372 | "\t\"imdb\": {\n",
373 | "\t\t\"id\": \"tt2267968\",\n",
374 | "\t\t\"rating\": 7.2,\n",
375 | "\t\t\"votes\": 83809\n",
376 | "\t},\n",
377 | "\t\"tomato\": {\n",
378 | "\t\t\"meter\": 87,\n",
379 | "\t\t\"image\": \"certified\",\n",
380 | "\t\t\"rating\": 6.8,\n",
381 | "\t\t\"reviews\": 153,\n",
382 | "\t\t\"fresh\": 133,\n",
383 | "\t\t\"consensus\": \"Kung Fu Panda 3 boasts the requisite visual splendor, but like its rotund protagonist, this sequels narrative is also surprisingly nimble, adding up to animated fun for the whole family.\",\n",
384 | "\t\t\"userMeter\": 79,\n",
385 | "\t\t\"userRating\": 3.9,\n",
386 | "\t\t\"userReviews\": 98794\n",
387 | "\t},\n",
388 | "\t\"metacritic\": 44,\n",
389 | "\t\"awards\": {\n",
390 | "\t\t\"wins\": 0,\n",
391 | "\t\t\"nominations\": 6,\n",
392 | "\t\t\"text\": \"Best Animated Feature, Most Wanted Pet\"\n",
393 | "\t},\n",
394 | "\t\"type\": \"movie\"\n",
395 | "}, {\n",
396 | "\t\"_id\": \"589cc846c0b9fec62febf278\",\n",
397 | "\t\"title\": \"zootopia\",\n",
398 | "\t\"year\": 2016,\n",
399 | "\t\"rated\": \"PG\",\n",
400 | "\t\"released\": {\n",
401 | "\t\t\"$date\": \"2016-04-04T04:00:00.000Z\"\n",
402 | "\t},\n",
403 | "\t\"runtime\": 108,\n",
404 | "\t\"countries\": [\n",
405 | "\t\t\"USA\"\n",
406 | "\t],\n",
407 | "\t\"genres\": [\n",
408 | "\t\t\"Animation\",\n",
409 | "\t\t\"Adventure\",\n",
410 | "\t\t\"Comedy\",\n",
411 | "\t\t\"Crime\",\n",
412 | "\t\t\"Family\",\n",
413 | "\t\t\"Mystery\"\n",
414 | "\t],\n",
415 | "\t\"director\": \"Byron Howard\",\n",
416 | "\t\"writers\": [\n",
417 | "\t\t\"Byron Howard\",\n",
418 | "\t\t\"Rich Moore\"\n",
419 | "\t],\n",
420 | "\t\"actors\": [\n",
421 | "\t\t\"Ginnifer Goodwin\",\n",
422 | "\t\t\"Jason Bateman\",\n",
423 | "\t\t\"Idris Elba\"\n",
424 | "\t],\n",
425 | "\t\"plot\": \"From the largest elephant to the smallest shrew, the city of Zootopia is a mammal metropolis where various animals live and thrive. When Judy Hopps becomes the first rabbit to join the police force, she quickly learns how tough it is to enforce the law. Determined to prove herself, Judy jumps at the opportunity to solve a mysterious case. Unfortunately, that means working with Nick Wilde, a wily fox who makes her job even harder.\",\n",
426 | "\t\"poster\": \"http://ia.media-imdb.com/images/M/MV5BMTgxOTY4Mjc0MF5BMl5BanBnXkFtZTcwNTA4MDQyMw@@._V1_SX300.jpg\",\n",
427 | "\t\"imdb\": {\n",
428 | "\t\t\"id\": \"tt2948356\",\n",
429 | "\t\t\"rating\": 8.1,\n",
430 | "\t\t\"votes\": 262258\n",
431 | "\t},\n",
432 | "\t\"tomato\": {\n",
433 | "\t\t\"meter\": 98,\n",
434 | "\t\t\"image\": \"certified\",\n",
435 | "\t\t\"rating\": 8.1,\n",
436 | "\t\t\"reviews\": 241,\n",
437 | "\t\t\"fresh\": 236,\n",
438 | "\t\t\"consensus\": \"Kung Fu Panda 3 boasts the requisite visual splendor, but like its rotund protagonist, this sequels narrative is also surprisingly nimble, adding up to animated fun for the whole family.\",\n",
439 | "\t\t\"userMeter\": 92,\n",
440 | "\t\t\"userRating\": 4.4,\n",
441 | "\t\t\"userReviews\": 95658\n",
442 | "\t},\n",
443 | "\t\"metacritic\": 44,\n",
444 | "\t\"awards\": {\n",
445 | "\t\t\"wins\": 26,\n",
446 | "\t\t\"nominations\": 52,\n",
447 | "\t\t\"text\": \"Best Animated Feature Film of the Year, Best Motion Picture - Animated\"\n",
448 | "\t},\n",
449 | "\t\"type\": \"movie\"\n",
450 | "}, {\n",
451 | "\t\"_id\": \"589d733a296ba85b1bc3bee6\",\n",
452 | "\t\"title\": \"John Carter\",\n",
453 | "\t\"year\": 2012,\n",
454 | "\t\"rated\": \"PG-13\",\n",
455 | "\t\"released\": {\n",
456 | "\t\t\"$date\": \"2012-03-09T04:00:00.000Z\"\n",
457 | "\t},\n",
458 | "\t\"runtime\": 132,\n",
459 | "\t\"countries\": [\n",
460 | "\t\t\"USA\"\n",
461 | "\t],\n",
462 | "\t\"genres\": [\n",
463 | "\t\t\"Action\",\n",
464 | "\t\t\"Adventure\",\n",
465 | "\t\t\"Sci-Fi\"\n",
466 | "\t],\n",
467 | "\t\"director\": \"Andrew Stanton\",\n",
468 | "\t\"writers\": [\n",
469 | "\t\t\"John Lasseter\",\n",
470 | "\t\t\"Andrew Stanton\",\n",
471 | "\t\t\"Lee Unkrich\",\n",
472 | "\t\t\"Michael Arndt\"\n",
473 | "\t],\n",
474 | "\t\"actors\": [\n",
475 | "\t\t\"Andrew Stanton\",\n",
476 | "\t\t\"Mark Andrews\"\n",
477 | "\t],\n",
478 | "\t\"plot\": \"John Carter, a Civil War veteran, who in 1868 was trying to live a normal life, is asked by the Army to join, but he refuses so he is locked up. He escapes, and is pursued. Eventually they run into some Indians, and theres a gunfight. Carter seeks refuge in a cave. While there, he encounters someone who is holding some kind of medallion. When Carter touches it, he finds himself in a place where he can leap incredible heights, among other things. He later encounters beings he has never seen before. He meets a woman who helps him to discover that he is on Mars, and he learns theres some kind of unrest going on.\",\n",
479 | "\t\"poster\": \"http://ia.media-imdb.com/images/M/MV5BMTgxOTY4Mjc0MF5BMl5BanBnXkFtZTcwNTA4MDQyMw@@._V1_SX300.jpg\",\n",
480 | "\t\"imdb\": {\n",
481 | "\t\t\"id\": \"tt0401729\",\n",
482 | "\t\t\"rating\": 6.6,\n",
483 | "\t\t\"votes\": 217518\n",
484 | "\t},\n",
485 | "\t\"tomato\": {\n",
486 | "\t\t\"meter\": 51,\n",
487 | "\t\t\"image\": \"certified\",\n",
488 | "\t\t\"rating\": 5.7,\n",
489 | "\t\t\"reviews\": 219,\n",
490 | "\t\t\"fresh\": 111,\n",
491 | "\t\t\"consensus\": \"While John Carter looks terrific and delivers its share of pulpy thrills, it also suffers from uneven pacing and occasionally incomprehensible plotting and characterization.\",\n",
492 | "\t\t\"userMeter\": 60,\n",
493 | "\t\t\"userRating\": 3.5,\n",
494 | "\t\t\"userReviews\": 113966\n",
495 | "\t},\n",
496 | "\t\"metacritic\": 92,\n",
497 | "\t\"awards\": {\n",
498 | "\t\t\"wins\": 2,\n",
499 | "\t\t\"nominations\": 7,\n",
500 | "\t\t\"text\": \"Top Box Office Films, Best Original Score for a Fantasy/Science Fiction/Horror Film\"\n",
501 | "\t},\n",
502 | "\t\"type\": \"movie\"\n",
503 | "}])"
504 | ]
505 | },
506 | {
507 | "cell_type": "markdown",
508 | "metadata": {},
509 | "source": [
510 | "Check data:"
511 | ]
512 | },
513 | {
514 | "cell_type": "markdown",
515 | "metadata": {},
516 | "source": [
517 | "> db.movies.find()"
518 | ]
519 | },
520 | {
521 | "cell_type": "markdown",
522 | "metadata": {},
523 | "source": [
524 | "# Connection in Python code"
525 | ]
526 | },
527 | {
528 | "cell_type": "markdown",
529 | "metadata": {
530 | "collapsed": true
531 | },
532 | "source": [
533 | "Download jar package from https://mvnrepository.com/artifact/org.mongodb.spark/mongo-spark-connector_2.11/2.3.0\n",
534 | "Move it to spark jars folder.\n",
535 | "(On my computer the location is /Users/agnieszkabiernacka/anaconda3/pkgs/pyspark-2.2.0-py36_0/lib/python3.6/site-packages/pyspark/jars)\n",
536 | "Then you can run jupyter notebook by anaconda."
537 | ]
538 | },
539 | {
540 | "cell_type": "markdown",
541 | "metadata": {},
542 | "source": [
543 | "Prepare SparkSession. Important parameters:\n",
544 | "- master -- use \"local\" for testing; in production, a dedicated cluster configuration should be used\n",
545 | "- appName -- the application name visible to Spark\n",
546 | "- spark.jars.packages -- extra packages to use; they must be added to the pyspark/jars folder beforehand (for this example we need mongo-spark-connector)\n",
547 | "- spark.mongodb.input.uri / spark.mongodb.output.uri -- uri/database.collection"
548 | ]
549 | },
550 | {
551 | "cell_type": "code",
552 | "execution_count": 7,
553 | "metadata": {
554 | "collapsed": true
555 | },
556 | "outputs": [],
557 | "source": [
558 | "from pyspark.sql import SparkSession\n",
559 | "\n",
560 | "spark = SparkSession \\\n",
561 | " .builder \\\n",
562 | " .master(\"local\") \\\n",
563 | " .appName(\"myApp\") \\\n",
564 | " .config('spark.jars.packages', 'org.mongodb.spark:mongo-spark-connector_2.11:2.3.0') \\\n",
565 | " .config(\"spark.mongodb.input.uri\", \"mongodb://127.0.0.1/test.movies\") \\\n",
566 | " .config(\"spark.mongodb.output.uri\", \"mongodb://127.0.0.1/test.movies\") \\\n",
567 | " .getOrCreate()"
568 | ]
569 | },
570 | {
571 | "cell_type": "code",
572 | "execution_count": 8,
573 | "metadata": {},
574 | "outputs": [],
575 | "source": [
576 | "#Load dataframe (collection mentioned in SparkSession configuration)\n",
577 | "df = spark.read.format(\"com.mongodb.spark.sql\").load()"
578 | ]
579 | },
580 | {
581 | "cell_type": "code",
582 | "execution_count": 9,
583 | "metadata": {},
584 | "outputs": [
585 | {
586 | "name": "stdout",
587 | "output_type": "stream",
588 | "text": [
589 | "root\n",
590 | " |-- _id: string (nullable = true)\n",
591 | " |-- actors: array (nullable = true)\n",
592 | " | |-- element: string (containsNull = true)\n",
593 | " |-- awards: struct (nullable = true)\n",
594 | " | |-- wins: double (nullable = true)\n",
595 | " | |-- nominations: double (nullable = true)\n",
596 | " | |-- text: string (nullable = true)\n",
597 | " |-- countries: array (nullable = true)\n",
598 | " | |-- element: string (containsNull = true)\n",
599 | " |-- director: string (nullable = true)\n",
600 | " |-- genres: array (nullable = true)\n",
601 | " | |-- element: string (containsNull = true)\n",
602 | " |-- imdb: struct (nullable = true)\n",
603 | " | |-- id: string (nullable = true)\n",
604 | " | |-- rating: double (nullable = true)\n",
605 | " | |-- votes: double (nullable = true)\n",
606 | " |-- metacritic: double (nullable = true)\n",
607 | " |-- plot: string (nullable = true)\n",
608 | " |-- poster: string (nullable = true)\n",
609 | " |-- rated: string (nullable = true)\n",
610 | " |-- released: struct (nullable = true)\n",
611 | " | |-- $date: string (nullable = true)\n",
612 | " |-- reviews: array (nullable = true)\n",
613 | " | |-- element: struct (containsNull = true)\n",
614 | " | | |-- date: struct (nullable = true)\n",
615 | " | | | |-- $date: string (nullable = true)\n",
616 | " | | |-- name: string (nullable = true)\n",
617 | " | | |-- rating: double (nullable = true)\n",
618 | " | | |-- comment: string (nullable = true)\n",
619 | " |-- runtime: double (nullable = true)\n",
620 | " |-- title: string (nullable = true)\n",
621 | " |-- tomato: struct (nullable = true)\n",
622 | " | |-- meter: double (nullable = true)\n",
623 | " | |-- image: string (nullable = true)\n",
624 | " | |-- rating: double (nullable = true)\n",
625 | " | |-- reviews: double (nullable = true)\n",
626 | " | |-- fresh: double (nullable = true)\n",
627 | " | |-- consensus: string (nullable = true)\n",
628 | " | |-- userMeter: double (nullable = true)\n",
629 | " | |-- userRating: double (nullable = true)\n",
630 | " | |-- userReviews: double (nullable = true)\n",
631 | " |-- type: string (nullable = true)\n",
632 | " |-- writers: array (nullable = true)\n",
633 | " | |-- element: string (containsNull = true)\n",
634 | " |-- year: double (nullable = true)\n",
635 | "\n"
636 | ]
637 | }
638 | ],
639 | "source": [
640 | "#Print schema of database\n",
641 | "df.printSchema()"
642 | ]
643 | },
644 | {
645 | "cell_type": "code",
646 | "execution_count": 6,
647 | "metadata": {},
648 | "outputs": [
649 | {
650 | "data": {
651 | "text/plain": [
652 | "[Row(_id='5692a15524de1e0ce2dfcfa3', actors=['Tom Hanks', 'Tim Allen', 'Joan Cusack', 'Ned Beatty'], awards=Row(wins=56.0, nominations=86.0, text='Won 2 Oscars. Another 56 wins & 86 nominations.'), countries=['USA'], director='Lee Unkrich', genres=['Animation', 'Adventure', 'Comedy'], imdb=Row(id='tt0435761', rating=8.4, votes=500084.0), metacritic=92.0, plot=\"The toys are mistakenly delivered to a day-care center instead of the attic right before Andy leaves for college, and it's up to Woody to convince the other toys that they weren't abandoned and to return home.\", poster='http://ia.media-imdb.com/images/M/MV5BMTgxOTY4Mjc0MF5BMl5BanBnXkFtZTcwNTA4MDQyMw@@._V1_SX300.jpg', rated='G', released=Row($date='2010-06-18T04:00:00.000Z'), reviews=[Row(date=Row($date='2017-02-13T04:00:00.000Z'), name='parvesh', rating=8.9, comment='My first review for Toy Story 3, hoping it will execute while trying for the very first time.'), Row(date=Row($date='2017-02-13T04:00:00.000Z'), name='Prabhash', rating=8.9, comment='My first review for Toy Story 3, hoping it will execute while trying for the very first time.'), Row(date=Row($date='2017-02-11T04:00:00.000Z'), name='praveen', rating=6.7, comment='My first review for Toy Story 3, hoping it will execute while trying for the very first time.')], runtime=206.0, title='Toy Story 4', tomato=Row(meter=99.0, image='certified', rating=8.9, reviews=287.0, fresh=283.0, consensus='Deftly blending comedy, adventure, and honest emotion, Toy Story 3 is a rare second sequel that really works.', userMeter=89.0, userRating=4.3, userReviews=602138.0), type='movie', writers=['John Lasseter', 'Andrew Stanton', 'Lee Unkrich', 'Michael Arndt'], year=2011.0)]"
653 | ]
654 | },
655 | "execution_count": 6,
656 | "metadata": {},
657 | "output_type": "execute_result"
658 | }
659 | ],
660 | "source": [
661 | "df.head(1)"
662 | ]
663 | },
664 | {
665 | "cell_type": "code",
666 | "execution_count": 22,
667 | "metadata": {},
668 | "outputs": [
669 | {
670 | "name": "stdout",
671 | "output_type": "stream",
672 | "text": [
673 | "+--------------------+--------------------+--------------------+---------+-----------+--------------------+--------------------+----------+--------------------+--------------------+-----+--------------------+--------------------+-------+--------------------+--------------------+-----+--------------------+------+\n",
674 | "| _id| actors| awards|countries| director| genres| imdb|metacritic| plot| poster|rated| released| reviews|runtime| title| tomato| type| writers| year|\n",
675 | "+--------------------+--------------------+--------------------+---------+-----------+--------------------+--------------------+----------+--------------------+--------------------+-----+--------------------+--------------------+-------+--------------------+--------------------+-----+--------------------+------+\n",
676 | "|5692a15524de1e0ce...|[Tom Hanks, Tim A...|[56.0,86.0,Won 2 ...| [USA]|Lee Unkrich|[Animation, Adven...|[tt0435761,8.4,50...| 92.0|The toys are mist...|http://ia.media-i...| G|[2010-06-18T04:00...|[[[2017-02-13T04:...| 206.0| Toy Story 4|[99.0,certified,8...|movie|[John Lasseter, A...|2011.0|\n",
677 | "|589cc22cc0b9fec62...|[Amy Adams, Henry...|[6.0,26.0,Actor o...| [USA]|Lee Unkrich|[Action, Adventur...|[tt2975590,6.7,32...| 44.0|The general publi...|http://ia.media-i...|PG-13|[2016-03-19T04:00...| null| 151.0|BATMAN V SUPERMAN...|[27.0,certified,4...|movie|[Chris Terrio, Da...|2016.0|\n",
678 | "+--------------------+--------------------+--------------------+---------+-----------+--------------------+--------------------+----------+--------------------+--------------------+-----+--------------------+--------------------+-------+--------------------+--------------------+-----+--------------------+------+\n",
679 | "\n"
680 | ]
681 | }
682 | ],
683 | "source": [
684 | "df.createOrReplaceTempView(\"movies\")\n",
685 | "spark.sql(\"SELECT * FROM movies WHERE director LIKE '%Lee%'\").show()"
686 | ]
687 | },
688 | {
689 | "cell_type": "code",
690 | "execution_count": 24,
691 | "metadata": {},
692 | "outputs": [
693 | {
694 | "name": "stdout",
695 | "output_type": "stream",
696 | "text": [
697 | "+--------------------+\n",
698 | "| actors|\n",
699 | "+--------------------+\n",
700 | "|[Tom Hanks, Tim A...|\n",
701 | "|[Ryan Reynolds, M...|\n",
702 | "|[Ginnifer Goodwin...|\n",
703 | "+--------------------+\n",
704 | "\n"
705 | ]
706 | }
707 | ],
708 | "source": [
709 | "spark.sql(\"SELECT actors FROM movies WHERE imdb.rating > 8\").show()"
710 | ]
711 | },
712 | {
713 | "cell_type": "code",
714 | "execution_count": null,
715 | "metadata": {
716 | "collapsed": true
717 | },
718 | "outputs": [],
719 | "source": []
720 | }
721 | ],
722 | "metadata": {
723 | "kernelspec": {
724 | "display_name": "Python 3",
725 | "language": "python",
726 | "name": "python3"
727 | },
728 | "language_info": {
729 | "codemirror_mode": {
730 | "name": "ipython",
731 | "version": 3
732 | },
733 | "file_extension": ".py",
734 | "mimetype": "text/x-python",
735 | "name": "python",
736 | "nbconvert_exporter": "python",
737 | "pygments_lexer": "ipython3",
738 | "version": "3.6.3"
739 | }
740 | },
741 | "nbformat": 4,
742 | "nbformat_minor": 2
743 | }
744 |
--------------------------------------------------------------------------------
/apache-spark-tutorials/notebook+s3.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# How to connect Jupyter notebook (local instance) with Amazon S3"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "## AWS Credentials "
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {},
20 | "source": [
21 | "Create your access key:\n",
22 | "- go to aws console\n",
23 | "- IAM -> Users -> User -> Security credentials -> Download file (Store the keys in a secure location)"
24 | ]
25 | },
26 | {
27 | "cell_type": "markdown",
28 | "metadata": {},
29 | "source": [
30 | "Install the AWS Command Line Interface on macOS:\n",
31 | "\n",
32 | "$ pip install awscli --upgrade --user\n"
33 | ]
34 | },
35 | {
36 | "cell_type": "markdown",
37 | "metadata": {},
38 | "source": [
39 | "$ aws configure\n",
40 | "\n",
41 | "AWS Access Key ID [None]: [Access-key-from-csv-file]\n",
42 | "\n",
43 | "AWS Secret Access Key [None]: [Secret-Access-key-from-csv-file]\n",
44 | "\n",
45 | "Default region name [None]: [Enter]\n",
46 | "\n",
47 | "Default output format [None]: [Enter]"
48 | ]
49 | },
50 | {
51 | "cell_type": "markdown",
52 | "metadata": {},
53 | "source": [
54 | "Your credentials should be saved in the ~/.aws/credentials file"
55 | ]
56 | },
57 | {
58 | "cell_type": "markdown",
59 | "metadata": {},
60 | "source": [
61 | "## Connect to S3 in Jupyter Notebook "
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": 1,
67 | "metadata": {
68 | "collapsed": true
69 | },
70 | "outputs": [],
71 | "source": [
72 | "import os\n",
73 | "os.environ['PYSPARK_SUBMIT_ARGS'] = \"--packages=org.apache.hadoop:hadoop-aws:2.7.3 pyspark-shell\""
74 | ]
75 | },
76 | {
77 | "cell_type": "code",
78 | "execution_count": 2,
79 | "metadata": {
80 | "collapsed": true
81 | },
82 | "outputs": [],
83 | "source": [
84 | "import configparser\n",
85 | "import os\n",
86 | "config = configparser.ConfigParser()\n",
87 | "config.read(os.path.expanduser(\"~/.aws/credentials\"))\n",
88 | "access_id = config.get('default', \"aws_access_key_id\") \n",
89 | "access_key = config.get('default', \"aws_secret_access_key\")"
90 | ]
91 | },
92 | {
93 | "cell_type": "code",
94 | "execution_count": 3,
95 | "metadata": {
96 | "collapsed": true
97 | },
98 | "outputs": [],
99 | "source": [
100 | "from pyspark.context import SparkContext\n",
101 | "from pyspark.sql.session import SparkSession\n",
102 | "\n",
103 | "sc = SparkContext('local')\n",
104 | "spark = SparkSession(sc)\n",
105 | "\n",
106 | "hadoop_conf=sc._jsc.hadoopConfiguration()\n",
107 | "hadoop_conf.set(\"fs.s3n.impl\", \"org.apache.hadoop.fs.s3native.NativeS3FileSystem\")\n",
108 | "hadoop_conf.set(\"fs.s3n.awsAccessKeyId\", access_id)\n",
109 | "hadoop_conf.set(\"fs.s3n.awsSecretAccessKey\", access_key)"
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": 4,
115 | "metadata": {},
116 | "outputs": [
117 | {
118 | "data": {
119 | "text/html": [
120 | "\n",
121 | "\n",
134 | "
\n",
135 | " \n",
136 | " \n",
137 | " \n",
138 | " meta_id \n",
139 | " meta_timestamp \n",
140 | " data_device_device_type \n",
141 | " \n",
142 | " \n",
143 | " \n",
144 | " \n",
145 | " 0 \n",
146 | " 333A \n",
147 | " 100000000 \n",
148 | " mobile \n",
149 | " \n",
150 | " \n",
151 | " 1 \n",
152 | " 333A \n",
153 | " 100000020 \n",
154 | " None \n",
155 | " \n",
156 | " \n",
157 | " 2 \n",
158 | " 333A \n",
159 | " 100000040 \n",
160 | " None \n",
161 | " \n",
162 | " \n",
163 | " 3 \n",
164 | " 333A \n",
165 | " 100000060 \n",
166 | " mobile \n",
167 | " \n",
168 | " \n",
169 | " 4 \n",
170 | " 333A \n",
171 | " 100000080 \n",
172 | " None \n",
173 | " \n",
174 | " \n",
175 | "
\n",
176 | "
"
177 | ],
178 | "text/plain": [
179 | " meta_id meta_timestamp data_device_device_type\n",
180 | "0 333A 100000000 mobile\n",
181 | "1 333A 100000020 None\n",
182 | "2 333A 100000040 None\n",
183 | "3 333A 100000060 mobile\n",
184 | "4 333A 100000080 None"
185 | ]
186 | },
187 | "execution_count": 4,
188 | "metadata": {},
189 | "output_type": "execute_result"
190 | }
191 | ],
192 | "source": [
193 | "df=spark.read.orc(\"s3n://miquido-ds-test-backet/events_processing/events_processing_df_002_orc\")\n",
194 | "df.limit(5).toPandas()[['meta_id', 'meta_timestamp', 'data_device_device_type']].head()"
195 | ]
196 | },
197 | {
198 | "cell_type": "code",
199 | "execution_count": null,
200 | "metadata": {
201 | "collapsed": true
202 | },
203 | "outputs": [],
204 | "source": []
205 | }
206 | ],
207 | "metadata": {
208 | "kernelspec": {
209 | "display_name": "Python 3",
210 | "language": "python",
211 | "name": "python3"
212 | },
213 | "language_info": {
214 | "codemirror_mode": {
215 | "name": "ipython",
216 | "version": 3
217 | },
218 | "file_extension": ".py",
219 | "mimetype": "text/x-python",
220 | "name": "python",
221 | "nbconvert_exporter": "python",
222 | "pygments_lexer": "ipython3",
223 | "version": "3.6.3"
224 | }
225 | },
226 | "nbformat": 4,
227 | "nbformat_minor": 2
228 | }
229 |
--------------------------------------------------------------------------------
/apache-spark-tutorials/sample_movielens_ratings.txt:
--------------------------------------------------------------------------------
1 | 0::2::3::1424380312
2 | 0::3::1::1424380312
3 | 0::5::2::1424380312
4 | 0::9::4::1424380312
5 | 0::11::1::1424380312
6 | 0::12::2::1424380312
7 | 0::15::1::1424380312
8 | 0::17::1::1424380312
9 | 0::19::1::1424380312
10 | 0::21::1::1424380312
11 | 0::23::1::1424380312
12 | 0::26::3::1424380312
13 | 0::27::1::1424380312
14 | 0::28::1::1424380312
15 | 0::29::1::1424380312
16 | 0::30::1::1424380312
17 | 0::31::1::1424380312
18 | 0::34::1::1424380312
19 | 0::37::1::1424380312
20 | 0::41::2::1424380312
21 | 0::44::1::1424380312
22 | 0::45::2::1424380312
23 | 0::46::1::1424380312
24 | 0::47::1::1424380312
25 | 0::48::1::1424380312
26 | 0::50::1::1424380312
27 | 0::51::1::1424380312
28 | 0::54::1::1424380312
29 | 0::55::1::1424380312
30 | 0::59::2::1424380312
31 | 0::61::2::1424380312
32 | 0::64::1::1424380312
33 | 0::67::1::1424380312
34 | 0::68::1::1424380312
35 | 0::69::1::1424380312
36 | 0::71::1::1424380312
37 | 0::72::1::1424380312
38 | 0::77::2::1424380312
39 | 0::79::1::1424380312
40 | 0::83::1::1424380312
41 | 0::87::1::1424380312
42 | 0::89::2::1424380312
43 | 0::91::3::1424380312
44 | 0::92::4::1424380312
45 | 0::94::1::1424380312
46 | 0::95::2::1424380312
47 | 0::96::1::1424380312
48 | 0::98::1::1424380312
49 | 0::99::1::1424380312
50 | 1::2::2::1424380312
51 | 1::3::1::1424380312
52 | 1::4::2::1424380312
53 | 1::6::1::1424380312
54 | 1::9::3::1424380312
55 | 1::12::1::1424380312
56 | 1::13::1::1424380312
57 | 1::14::1::1424380312
58 | 1::16::1::1424380312
59 | 1::19::1::1424380312
60 | 1::21::3::1424380312
61 | 1::27::1::1424380312
62 | 1::28::3::1424380312
63 | 1::33::1::1424380312
64 | 1::36::2::1424380312
65 | 1::37::1::1424380312
66 | 1::40::1::1424380312
67 | 1::41::2::1424380312
68 | 1::43::1::1424380312
69 | 1::44::1::1424380312
70 | 1::47::1::1424380312
71 | 1::50::1::1424380312
72 | 1::54::1::1424380312
73 | 1::56::2::1424380312
74 | 1::57::1::1424380312
75 | 1::58::1::1424380312
76 | 1::60::1::1424380312
77 | 1::62::4::1424380312
78 | 1::63::1::1424380312
79 | 1::67::1::1424380312
80 | 1::68::4::1424380312
81 | 1::70::2::1424380312
82 | 1::72::1::1424380312
83 | 1::73::1::1424380312
84 | 1::74::2::1424380312
85 | 1::76::1::1424380312
86 | 1::77::3::1424380312
87 | 1::78::1::1424380312
88 | 1::81::1::1424380312
89 | 1::82::1::1424380312
90 | 1::85::3::1424380312
91 | 1::86::2::1424380312
92 | 1::88::2::1424380312
93 | 1::91::1::1424380312
94 | 1::92::2::1424380312
95 | 1::93::1::1424380312
96 | 1::94::2::1424380312
97 | 1::96::1::1424380312
98 | 1::97::1::1424380312
99 | 2::4::3::1424380312
100 | 2::6::1::1424380312
101 | 2::8::5::1424380312
102 | 2::9::1::1424380312
103 | 2::10::1::1424380312
104 | 2::12::3::1424380312
105 | 2::13::1::1424380312
106 | 2::15::2::1424380312
107 | 2::18::2::1424380312
108 | 2::19::4::1424380312
109 | 2::22::1::1424380312
110 | 2::26::1::1424380312
111 | 2::28::1::1424380312
112 | 2::34::4::1424380312
113 | 2::35::1::1424380312
114 | 2::37::5::1424380312
115 | 2::38::1::1424380312
116 | 2::39::5::1424380312
117 | 2::40::4::1424380312
118 | 2::47::1::1424380312
119 | 2::50::1::1424380312
120 | 2::52::2::1424380312
121 | 2::54::1::1424380312
122 | 2::55::1::1424380312
123 | 2::57::2::1424380312
124 | 2::58::2::1424380312
125 | 2::59::1::1424380312
126 | 2::61::1::1424380312
127 | 2::62::1::1424380312
128 | 2::64::1::1424380312
129 | 2::65::1::1424380312
130 | 2::66::3::1424380312
131 | 2::68::1::1424380312
132 | 2::71::3::1424380312
133 | 2::76::1::1424380312
134 | 2::77::1::1424380312
135 | 2::78::1::1424380312
136 | 2::80::1::1424380312
137 | 2::83::5::1424380312
138 | 2::85::1::1424380312
139 | 2::87::2::1424380312
140 | 2::88::1::1424380312
141 | 2::89::4::1424380312
142 | 2::90::1::1424380312
143 | 2::92::4::1424380312
144 | 2::93::5::1424380312
145 | 3::0::1::1424380312
146 | 3::1::1::1424380312
147 | 3::2::1::1424380312
148 | 3::7::3::1424380312
149 | 3::8::3::1424380312
150 | 3::9::1::1424380312
151 | 3::14::1::1424380312
152 | 3::15::1::1424380312
153 | 3::16::1::1424380312
154 | 3::18::4::1424380312
155 | 3::19::1::1424380312
156 | 3::24::3::1424380312
157 | 3::26::1::1424380312
158 | 3::29::3::1424380312
159 | 3::33::1::1424380312
160 | 3::34::3::1424380312
161 | 3::35::1::1424380312
162 | 3::36::3::1424380312
163 | 3::37::1::1424380312
164 | 3::38::2::1424380312
165 | 3::43::1::1424380312
166 | 3::44::1::1424380312
167 | 3::46::1::1424380312
168 | 3::47::1::1424380312
169 | 3::51::5::1424380312
170 | 3::52::3::1424380312
171 | 3::56::1::1424380312
172 | 3::58::1::1424380312
173 | 3::60::3::1424380312
174 | 3::62::1::1424380312
175 | 3::65::2::1424380312
176 | 3::66::1::1424380312
177 | 3::67::1::1424380312
178 | 3::68::2::1424380312
179 | 3::70::1::1424380312
180 | 3::72::2::1424380312
181 | 3::76::3::1424380312
182 | 3::79::3::1424380312
183 | 3::80::4::1424380312
184 | 3::81::1::1424380312
185 | 3::83::1::1424380312
186 | 3::84::1::1424380312
187 | 3::86::1::1424380312
188 | 3::87::2::1424380312
189 | 3::88::4::1424380312
190 | 3::89::1::1424380312
191 | 3::91::1::1424380312
192 | 3::94::3::1424380312
193 | 4::1::1::1424380312
194 | 4::6::1::1424380312
195 | 4::8::1::1424380312
196 | 4::9::1::1424380312
197 | 4::10::1::1424380312
198 | 4::11::1::1424380312
199 | 4::12::1::1424380312
200 | 4::13::1::1424380312
201 | 4::14::2::1424380312
202 | 4::15::1::1424380312
203 | 4::17::1::1424380312
204 | 4::20::1::1424380312
205 | 4::22::1::1424380312
206 | 4::23::1::1424380312
207 | 4::24::1::1424380312
208 | 4::29::4::1424380312
209 | 4::30::1::1424380312
210 | 4::31::1::1424380312
211 | 4::34::1::1424380312
212 | 4::35::1::1424380312
213 | 4::36::1::1424380312
214 | 4::39::2::1424380312
215 | 4::40::3::1424380312
216 | 4::41::4::1424380312
217 | 4::43::2::1424380312
218 | 4::44::1::1424380312
219 | 4::45::1::1424380312
220 | 4::46::1::1424380312
221 | 4::47::1::1424380312
222 | 4::49::2::1424380312
223 | 4::50::1::1424380312
224 | 4::51::1::1424380312
225 | 4::52::4::1424380312
226 | 4::54::1::1424380312
227 | 4::55::1::1424380312
228 | 4::60::3::1424380312
229 | 4::61::1::1424380312
230 | 4::62::4::1424380312
231 | 4::63::3::1424380312
232 | 4::65::1::1424380312
233 | 4::67::2::1424380312
234 | 4::69::1::1424380312
235 | 4::70::4::1424380312
236 | 4::71::1::1424380312
237 | 4::73::1::1424380312
238 | 4::78::1::1424380312
239 | 4::84::1::1424380312
240 | 4::85::1::1424380312
241 | 4::87::3::1424380312
242 | 4::88::3::1424380312
243 | 4::89::2::1424380312
244 | 4::96::1::1424380312
245 | 4::97::1::1424380312
246 | 4::98::1::1424380312
247 | 4::99::1::1424380312
248 | 5::0::1::1424380312
249 | 5::1::1::1424380312
250 | 5::4::1::1424380312
251 | 5::5::1::1424380312
252 | 5::8::1::1424380312
253 | 5::9::3::1424380312
254 | 5::10::2::1424380312
255 | 5::13::3::1424380312
256 | 5::15::1::1424380312
257 | 5::19::1::1424380312
258 | 5::20::3::1424380312
259 | 5::21::2::1424380312
260 | 5::23::3::1424380312
261 | 5::27::1::1424380312
262 | 5::28::1::1424380312
263 | 5::29::1::1424380312
264 | 5::31::1::1424380312
265 | 5::36::3::1424380312
266 | 5::38::2::1424380312
267 | 5::39::1::1424380312
268 | 5::42::1::1424380312
269 | 5::48::3::1424380312
270 | 5::49::4::1424380312
271 | 5::50::3::1424380312
272 | 5::51::1::1424380312
273 | 5::52::1::1424380312
274 | 5::54::1::1424380312
275 | 5::55::5::1424380312
276 | 5::56::3::1424380312
277 | 5::58::1::1424380312
278 | 5::60::1::1424380312
279 | 5::61::1::1424380312
280 | 5::64::3::1424380312
281 | 5::65::2::1424380312
282 | 5::68::4::1424380312
283 | 5::70::1::1424380312
284 | 5::71::1::1424380312
285 | 5::72::1::1424380312
286 | 5::74::1::1424380312
287 | 5::79::1::1424380312
288 | 5::81::2::1424380312
289 | 5::84::1::1424380312
290 | 5::85::1::1424380312
291 | 5::86::1::1424380312
292 | 5::88::1::1424380312
293 | 5::90::4::1424380312
294 | 5::91::2::1424380312
295 | 5::95::2::1424380312
296 | 5::99::1::1424380312
297 | 6::0::1::1424380312
298 | 6::1::1::1424380312
299 | 6::2::3::1424380312
300 | 6::5::1::1424380312
301 | 6::6::1::1424380312
302 | 6::9::1::1424380312
303 | 6::10::1::1424380312
304 | 6::15::2::1424380312
305 | 6::16::2::1424380312
306 | 6::17::1::1424380312
307 | 6::18::1::1424380312
308 | 6::20::1::1424380312
309 | 6::21::1::1424380312
310 | 6::22::1::1424380312
311 | 6::24::1::1424380312
312 | 6::25::5::1424380312
313 | 6::26::1::1424380312
314 | 6::28::1::1424380312
315 | 6::30::1::1424380312
316 | 6::33::1::1424380312
317 | 6::38::1::1424380312
318 | 6::39::1::1424380312
319 | 6::43::4::1424380312
320 | 6::44::1::1424380312
321 | 6::45::1::1424380312
322 | 6::48::1::1424380312
323 | 6::49::1::1424380312
324 | 6::50::1::1424380312
325 | 6::53::1::1424380312
326 | 6::54::1::1424380312
327 | 6::55::1::1424380312
328 | 6::56::1::1424380312
329 | 6::58::4::1424380312
330 | 6::59::1::1424380312
331 | 6::60::1::1424380312
332 | 6::61::3::1424380312
333 | 6::63::3::1424380312
334 | 6::66::1::1424380312
335 | 6::67::3::1424380312
336 | 6::68::1::1424380312
337 | 6::69::1::1424380312
338 | 6::71::2::1424380312
339 | 6::73::1::1424380312
340 | 6::75::1::1424380312
341 | 6::77::1::1424380312
342 | 6::79::1::1424380312
343 | 6::81::1::1424380312
344 | 6::84::1::1424380312
345 | 6::85::3::1424380312
346 | 6::86::1::1424380312
347 | 6::87::1::1424380312
348 | 6::88::1::1424380312
349 | 6::89::1::1424380312
350 | 6::91::2::1424380312
351 | 6::94::1::1424380312
352 | 6::95::2::1424380312
353 | 6::96::1::1424380312
354 | 7::1::1::1424380312
355 | 7::2::2::1424380312
356 | 7::3::1::1424380312
357 | 7::4::1::1424380312
358 | 7::7::1::1424380312
359 | 7::10::1::1424380312
360 | 7::11::2::1424380312
361 | 7::14::2::1424380312
362 | 7::15::1::1424380312
363 | 7::16::1::1424380312
364 | 7::18::1::1424380312
365 | 7::21::1::1424380312
366 | 7::22::1::1424380312
367 | 7::23::1::1424380312
368 | 7::25::5::1424380312
369 | 7::26::1::1424380312
370 | 7::29::4::1424380312
371 | 7::30::1::1424380312
372 | 7::31::3::1424380312
373 | 7::32::1::1424380312
374 | 7::33::1::1424380312
375 | 7::35::1::1424380312
376 | 7::37::2::1424380312
377 | 7::39::3::1424380312
378 | 7::40::2::1424380312
379 | 7::42::2::1424380312
380 | 7::44::1::1424380312
381 | 7::45::2::1424380312
382 | 7::47::4::1424380312
383 | 7::48::1::1424380312
384 | 7::49::1::1424380312
385 | 7::53::1::1424380312
386 | 7::54::1::1424380312
387 | 7::55::1::1424380312
388 | 7::56::1::1424380312
389 | 7::59::1::1424380312
390 | 7::61::2::1424380312
391 | 7::62::3::1424380312
392 | 7::63::2::1424380312
393 | 7::66::1::1424380312
394 | 7::67::3::1424380312
395 | 7::74::1::1424380312
396 | 7::75::1::1424380312
397 | 7::76::3::1424380312
398 | 7::77::1::1424380312
399 | 7::81::1::1424380312
400 | 7::82::1::1424380312
401 | 7::84::2::1424380312
402 | 7::85::4::1424380312
403 | 7::86::1::1424380312
404 | 7::92::2::1424380312
405 | 7::96::1::1424380312
406 | 7::97::1::1424380312
407 | 7::98::1::1424380312
408 | 8::0::1::1424380312
409 | 8::2::4::1424380312
410 | 8::3::2::1424380312
411 | 8::4::2::1424380312
412 | 8::5::1::1424380312
413 | 8::7::1::1424380312
414 | 8::9::1::1424380312
415 | 8::11::1::1424380312
416 | 8::15::1::1424380312
417 | 8::18::1::1424380312
418 | 8::19::1::1424380312
419 | 8::21::1::1424380312
420 | 8::29::5::1424380312
421 | 8::31::3::1424380312
422 | 8::33::1::1424380312
423 | 8::35::1::1424380312
424 | 8::36::1::1424380312
425 | 8::40::2::1424380312
426 | 8::44::1::1424380312
427 | 8::45::1::1424380312
428 | 8::50::1::1424380312
429 | 8::51::1::1424380312
430 | 8::52::5::1424380312
431 | 8::53::5::1424380312
432 | 8::54::1::1424380312
433 | 8::55::1::1424380312
434 | 8::56::1::1424380312
435 | 8::58::4::1424380312
436 | 8::60::3::1424380312
437 | 8::62::4::1424380312
438 | 8::64::1::1424380312
439 | 8::67::3::1424380312
440 | 8::69::1::1424380312
441 | 8::71::1::1424380312
442 | 8::72::3::1424380312
443 | 8::77::3::1424380312
444 | 8::78::1::1424380312
445 | 8::79::1::1424380312
446 | 8::83::1::1424380312
447 | 8::85::5::1424380312
448 | 8::86::1::1424380312
449 | 8::88::1::1424380312
450 | 8::90::1::1424380312
451 | 8::92::2::1424380312
452 | 8::95::4::1424380312
453 | 8::96::3::1424380312
454 | 8::97::1::1424380312
455 | 8::98::1::1424380312
456 | 8::99::1::1424380312
457 | 9::2::3::1424380312
458 | 9::3::1::1424380312
459 | 9::4::1::1424380312
460 | 9::5::1::1424380312
461 | 9::6::1::1424380312
462 | 9::7::5::1424380312
463 | 9::9::1::1424380312
464 | 9::12::1::1424380312
465 | 9::14::3::1424380312
466 | 9::15::1::1424380312
467 | 9::19::1::1424380312
468 | 9::21::1::1424380312
469 | 9::22::1::1424380312
470 | 9::24::1::1424380312
471 | 9::25::1::1424380312
472 | 9::26::1::1424380312
473 | 9::30::3::1424380312
474 | 9::32::4::1424380312
475 | 9::35::2::1424380312
476 | 9::36::2::1424380312
477 | 9::37::2::1424380312
478 | 9::38::1::1424380312
479 | 9::39::1::1424380312
480 | 9::43::3::1424380312
481 | 9::49::5::1424380312
482 | 9::50::3::1424380312
483 | 9::53::1::1424380312
484 | 9::54::1::1424380312
485 | 9::58::1::1424380312
486 | 9::59::1::1424380312
487 | 9::60::1::1424380312
488 | 9::61::1::1424380312
489 | 9::63::3::1424380312
490 | 9::64::3::1424380312
491 | 9::68::1::1424380312
492 | 9::69::1::1424380312
493 | 9::70::3::1424380312
494 | 9::71::1::1424380312
495 | 9::73::2::1424380312
496 | 9::75::1::1424380312
497 | 9::77::2::1424380312
498 | 9::81::2::1424380312
499 | 9::82::1::1424380312
500 | 9::83::1::1424380312
501 | 9::84::1::1424380312
502 | 9::86::1::1424380312
503 | 9::87::4::1424380312
504 | 9::88::1::1424380312
505 | 9::90::3::1424380312
506 | 9::94::2::1424380312
507 | 9::95::3::1424380312
508 | 9::97::2::1424380312
509 | 9::98::1::1424380312
510 | 10::0::3::1424380312
511 | 10::2::4::1424380312
512 | 10::4::3::1424380312
513 | 10::7::1::1424380312
514 | 10::8::1::1424380312
515 | 10::10::1::1424380312
516 | 10::13::2::1424380312
517 | 10::14::1::1424380312
518 | 10::16::2::1424380312
519 | 10::17::1::1424380312
520 | 10::18::1::1424380312
521 | 10::21::1::1424380312
522 | 10::22::1::1424380312
523 | 10::24::1::1424380312
524 | 10::25::3::1424380312
525 | 10::28::1::1424380312
526 | 10::35::1::1424380312
527 | 10::36::1::1424380312
528 | 10::37::1::1424380312
529 | 10::38::1::1424380312
530 | 10::39::1::1424380312
531 | 10::40::4::1424380312
532 | 10::41::2::1424380312
533 | 10::42::3::1424380312
534 | 10::43::1::1424380312
535 | 10::49::3::1424380312
536 | 10::50::1::1424380312
537 | 10::51::1::1424380312
538 | 10::52::1::1424380312
539 | 10::55::2::1424380312
540 | 10::56::1::1424380312
541 | 10::58::1::1424380312
542 | 10::63::1::1424380312
543 | 10::66::1::1424380312
544 | 10::67::2::1424380312
545 | 10::68::1::1424380312
546 | 10::75::1::1424380312
547 | 10::77::1::1424380312
548 | 10::79::1::1424380312
549 | 10::86::1::1424380312
550 | 10::89::3::1424380312
551 | 10::90::1::1424380312
552 | 10::97::1::1424380312
553 | 10::98::1::1424380312
554 | 11::0::1::1424380312
555 | 11::6::2::1424380312
556 | 11::9::1::1424380312
557 | 11::10::1::1424380312
558 | 11::11::1::1424380312
559 | 11::12::1::1424380312
560 | 11::13::4::1424380312
561 | 11::16::1::1424380312
562 | 11::18::5::1424380312
563 | 11::19::4::1424380312
564 | 11::20::1::1424380312
565 | 11::21::1::1424380312
566 | 11::22::1::1424380312
567 | 11::23::5::1424380312
568 | 11::25::1::1424380312
569 | 11::27::5::1424380312
570 | 11::30::5::1424380312
571 | 11::32::5::1424380312
572 | 11::35::3::1424380312
573 | 11::36::2::1424380312
574 | 11::37::2::1424380312
575 | 11::38::4::1424380312
576 | 11::39::1::1424380312
577 | 11::40::1::1424380312
578 | 11::41::1::1424380312
579 | 11::43::2::1424380312
580 | 11::45::1::1424380312
581 | 11::47::1::1424380312
582 | 11::48::5::1424380312
583 | 11::50::4::1424380312
584 | 11::51::3::1424380312
585 | 11::59::1::1424380312
586 | 11::61::1::1424380312
587 | 11::62::1::1424380312
588 | 11::64::1::1424380312
589 | 11::66::4::1424380312
590 | 11::67::1::1424380312
591 | 11::69::5::1424380312
592 | 11::70::1::1424380312
593 | 11::71::3::1424380312
594 | 11::72::3::1424380312
595 | 11::75::3::1424380312
596 | 11::76::1::1424380312
597 | 11::77::1::1424380312
598 | 11::78::1::1424380312
599 | 11::79::5::1424380312
600 | 11::80::3::1424380312
601 | 11::81::4::1424380312
602 | 11::82::1::1424380312
603 | 11::86::1::1424380312
604 | 11::88::1::1424380312
605 | 11::89::1::1424380312
606 | 11::90::4::1424380312
607 | 11::94::2::1424380312
608 | 11::97::3::1424380312
609 | 11::99::1::1424380312
610 | 12::2::1::1424380312
611 | 12::4::1::1424380312
612 | 12::6::1::1424380312
613 | 12::7::3::1424380312
614 | 12::8::1::1424380312
615 | 12::14::1::1424380312
616 | 12::15::2::1424380312
617 | 12::16::4::1424380312
618 | 12::17::5::1424380312
619 | 12::18::2::1424380312
620 | 12::21::1::1424380312
621 | 12::22::2::1424380312
622 | 12::23::3::1424380312
623 | 12::24::1::1424380312
624 | 12::25::1::1424380312
625 | 12::27::5::1424380312
626 | 12::30::2::1424380312
627 | 12::31::4::1424380312
628 | 12::35::5::1424380312
629 | 12::38::1::1424380312
630 | 12::41::1::1424380312
631 | 12::44::2::1424380312
632 | 12::45::1::1424380312
633 | 12::50::4::1424380312
634 | 12::51::1::1424380312
635 | 12::52::1::1424380312
636 | 12::53::1::1424380312
637 | 12::54::1::1424380312
638 | 12::56::2::1424380312
639 | 12::57::1::1424380312
640 | 12::60::1::1424380312
641 | 12::63::1::1424380312
642 | 12::64::5::1424380312
643 | 12::66::3::1424380312
644 | 12::67::1::1424380312
645 | 12::70::1::1424380312
646 | 12::72::1::1424380312
647 | 12::74::1::1424380312
648 | 12::75::1::1424380312
649 | 12::77::1::1424380312
650 | 12::78::1::1424380312
651 | 12::79::3::1424380312
652 | 12::82::2::1424380312
653 | 12::83::1::1424380312
654 | 12::84::1::1424380312
655 | 12::85::1::1424380312
656 | 12::86::1::1424380312
657 | 12::87::1::1424380312
658 | 12::88::1::1424380312
659 | 12::91::3::1424380312
660 | 12::92::1::1424380312
661 | 12::94::4::1424380312
662 | 12::95::2::1424380312
663 | 12::96::1::1424380312
664 | 12::98::2::1424380312
665 | 13::0::1::1424380312
666 | 13::3::1::1424380312
667 | 13::4::2::1424380312
668 | 13::5::1::1424380312
669 | 13::6::1::1424380312
670 | 13::12::1::1424380312
671 | 13::14::2::1424380312
672 | 13::15::1::1424380312
673 | 13::17::1::1424380312
674 | 13::18::3::1424380312
675 | 13::20::1::1424380312
676 | 13::21::1::1424380312
677 | 13::22::1::1424380312
678 | 13::26::1::1424380312
679 | 13::27::1::1424380312
680 | 13::29::3::1424380312
681 | 13::31::1::1424380312
682 | 13::33::1::1424380312
683 | 13::40::2::1424380312
684 | 13::43::2::1424380312
685 | 13::44::1::1424380312
686 | 13::45::1::1424380312
687 | 13::49::1::1424380312
688 | 13::51::1::1424380312
689 | 13::52::2::1424380312
690 | 13::53::3::1424380312
691 | 13::54::1::1424380312
692 | 13::62::1::1424380312
693 | 13::63::2::1424380312
694 | 13::64::1::1424380312
695 | 13::68::1::1424380312
696 | 13::71::1::1424380312
697 | 13::72::3::1424380312
698 | 13::73::1::1424380312
699 | 13::74::3::1424380312
700 | 13::77::2::1424380312
701 | 13::78::1::1424380312
702 | 13::79::2::1424380312
703 | 13::83::3::1424380312
704 | 13::85::1::1424380312
705 | 13::86::1::1424380312
706 | 13::87::2::1424380312
707 | 13::88::2::1424380312
708 | 13::90::1::1424380312
709 | 13::93::4::1424380312
710 | 13::94::1::1424380312
711 | 13::98::1::1424380312
712 | 13::99::1::1424380312
713 | 14::1::1::1424380312
714 | 14::3::3::1424380312
715 | 14::4::1::1424380312
716 | 14::5::1::1424380312
717 | 14::6::1::1424380312
718 | 14::7::1::1424380312
719 | 14::9::1::1424380312
720 | 14::10::1::1424380312
721 | 14::11::1::1424380312
722 | 14::12::1::1424380312
723 | 14::13::1::1424380312
724 | 14::14::3::1424380312
725 | 14::15::1::1424380312
726 | 14::16::1::1424380312
727 | 14::17::1::1424380312
728 | 14::20::1::1424380312
729 | 14::21::1::1424380312
730 | 14::24::1::1424380312
731 | 14::25::2::1424380312
732 | 14::27::1::1424380312
733 | 14::28::1::1424380312
734 | 14::29::5::1424380312
735 | 14::31::3::1424380312
736 | 14::34::1::1424380312
737 | 14::36::1::1424380312
738 | 14::37::2::1424380312
739 | 14::39::2::1424380312
740 | 14::40::1::1424380312
741 | 14::44::1::1424380312
742 | 14::45::1::1424380312
743 | 14::47::3::1424380312
744 | 14::48::1::1424380312
745 | 14::49::1::1424380312
746 | 14::51::1::1424380312
747 | 14::52::5::1424380312
748 | 14::53::3::1424380312
749 | 14::54::1::1424380312
750 | 14::55::1::1424380312
751 | 14::56::1::1424380312
752 | 14::62::4::1424380312
753 | 14::63::5::1424380312
754 | 14::67::3::1424380312
755 | 14::68::1::1424380312
756 | 14::69::3::1424380312
757 | 14::71::1::1424380312
758 | 14::72::4::1424380312
759 | 14::73::1::1424380312
760 | 14::76::5::1424380312
761 | 14::79::1::1424380312
762 | 14::82::1::1424380312
763 | 14::83::1::1424380312
764 | 14::88::1::1424380312
765 | 14::93::3::1424380312
766 | 14::94::1::1424380312
767 | 14::95::2::1424380312
768 | 14::96::4::1424380312
769 | 14::98::1::1424380312
770 | 15::0::1::1424380312
771 | 15::1::4::1424380312
772 | 15::2::1::1424380312
773 | 15::5::2::1424380312
774 | 15::6::1::1424380312
775 | 15::7::1::1424380312
776 | 15::13::1::1424380312
777 | 15::14::1::1424380312
778 | 15::15::1::1424380312
779 | 15::17::2::1424380312
780 | 15::19::2::1424380312
781 | 15::22::2::1424380312
782 | 15::23::2::1424380312
783 | 15::25::1::1424380312
784 | 15::26::3::1424380312
785 | 15::27::1::1424380312
786 | 15::28::2::1424380312
787 | 15::29::1::1424380312
788 | 15::32::1::1424380312
789 | 15::33::2::1424380312
790 | 15::34::1::1424380312
791 | 15::35::2::1424380312
792 | 15::36::1::1424380312
793 | 15::37::1::1424380312
794 | 15::39::1::1424380312
795 | 15::42::1::1424380312
796 | 15::46::5::1424380312
797 | 15::48::2::1424380312
798 | 15::50::2::1424380312
799 | 15::51::1::1424380312
800 | 15::52::1::1424380312
801 | 15::58::1::1424380312
802 | 15::62::1::1424380312
803 | 15::64::3::1424380312
804 | 15::65::2::1424380312
805 | 15::72::1::1424380312
806 | 15::73::1::1424380312
807 | 15::74::1::1424380312
808 | 15::79::1::1424380312
809 | 15::80::1::1424380312
810 | 15::81::1::1424380312
811 | 15::82::2::1424380312
812 | 15::85::1::1424380312
813 | 15::87::1::1424380312
814 | 15::91::2::1424380312
815 | 15::96::1::1424380312
816 | 15::97::1::1424380312
817 | 15::98::3::1424380312
818 | 16::2::1::1424380312
819 | 16::5::3::1424380312
820 | 16::6::2::1424380312
821 | 16::7::1::1424380312
822 | 16::9::1::1424380312
823 | 16::12::1::1424380312
824 | 16::14::1::1424380312
825 | 16::15::1::1424380312
826 | 16::19::1::1424380312
827 | 16::21::2::1424380312
828 | 16::29::4::1424380312
829 | 16::30::2::1424380312
830 | 16::32::1::1424380312
831 | 16::34::1::1424380312
832 | 16::36::1::1424380312
833 | 16::38::1::1424380312
834 | 16::46::1::1424380312
835 | 16::47::3::1424380312
836 | 16::48::1::1424380312
837 | 16::49::1::1424380312
838 | 16::50::1::1424380312
839 | 16::51::5::1424380312
840 | 16::54::5::1424380312
841 | 16::55::1::1424380312
842 | 16::56::2::1424380312
843 | 16::57::1::1424380312
844 | 16::60::1::1424380312
845 | 16::63::2::1424380312
846 | 16::65::1::1424380312
847 | 16::67::1::1424380312
848 | 16::72::1::1424380312
849 | 16::74::1::1424380312
850 | 16::80::1::1424380312
851 | 16::81::1::1424380312
852 | 16::82::1::1424380312
853 | 16::85::5::1424380312
854 | 16::86::1::1424380312
855 | 16::90::5::1424380312
856 | 16::91::1::1424380312
857 | 16::93::1::1424380312
858 | 16::94::3::1424380312
859 | 16::95::2::1424380312
860 | 16::96::3::1424380312
861 | 16::98::3::1424380312
862 | 16::99::1::1424380312
863 | 17::2::1::1424380312
864 | 17::3::1::1424380312
865 | 17::6::1::1424380312
866 | 17::10::4::1424380312
867 | 17::11::1::1424380312
868 | 17::13::2::1424380312
869 | 17::17::5::1424380312
870 | 17::19::1::1424380312
871 | 17::20::5::1424380312
872 | 17::22::4::1424380312
873 | 17::28::1::1424380312
874 | 17::29::1::1424380312
875 | 17::33::1::1424380312
876 | 17::34::1::1424380312
877 | 17::35::2::1424380312
878 | 17::37::1::1424380312
879 | 17::38::1::1424380312
880 | 17::45::1::1424380312
881 | 17::46::5::1424380312
882 | 17::47::1::1424380312
883 | 17::49::3::1424380312
884 | 17::51::1::1424380312
885 | 17::55::5::1424380312
886 | 17::56::3::1424380312
887 | 17::57::1::1424380312
888 | 17::58::1::1424380312
889 | 17::59::1::1424380312
890 | 17::60::1::1424380312
891 | 17::63::1::1424380312
892 | 17::66::1::1424380312
893 | 17::68::4::1424380312
894 | 17::69::1::1424380312
895 | 17::70::1::1424380312
896 | 17::72::1::1424380312
897 | 17::73::3::1424380312
898 | 17::78::1::1424380312
899 | 17::79::1::1424380312
900 | 17::82::2::1424380312
901 | 17::84::1::1424380312
902 | 17::90::5::1424380312
903 | 17::91::3::1424380312
904 | 17::92::1::1424380312
905 | 17::93::1::1424380312
906 | 17::94::4::1424380312
907 | 17::95::2::1424380312
908 | 17::97::1::1424380312
909 | 18::1::1::1424380312
910 | 18::4::3::1424380312
911 | 18::5::2::1424380312
912 | 18::6::1::1424380312
913 | 18::7::1::1424380312
914 | 18::10::1::1424380312
915 | 18::11::4::1424380312
916 | 18::12::2::1424380312
917 | 18::13::1::1424380312
918 | 18::15::1::1424380312
919 | 18::18::1::1424380312
920 | 18::20::1::1424380312
921 | 18::21::2::1424380312
922 | 18::22::1::1424380312
923 | 18::23::2::1424380312
924 | 18::25::1::1424380312
925 | 18::26::1::1424380312
926 | 18::27::1::1424380312
927 | 18::28::5::1424380312
928 | 18::29::1::1424380312
929 | 18::31::1::1424380312
930 | 18::32::1::1424380312
931 | 18::36::1::1424380312
932 | 18::38::5::1424380312
933 | 18::39::5::1424380312
934 | 18::40::1::1424380312
935 | 18::42::1::1424380312
936 | 18::43::1::1424380312
937 | 18::44::4::1424380312
938 | 18::46::1::1424380312
939 | 18::47::1::1424380312
940 | 18::48::1::1424380312
941 | 18::51::2::1424380312
942 | 18::55::1::1424380312
943 | 18::56::1::1424380312
944 | 18::57::1::1424380312
945 | 18::62::1::1424380312
946 | 18::63::1::1424380312
947 | 18::66::3::1424380312
948 | 18::67::1::1424380312
949 | 18::70::1::1424380312
950 | 18::75::1::1424380312
951 | 18::76::3::1424380312
952 | 18::77::1::1424380312
953 | 18::80::3::1424380312
954 | 18::81::3::1424380312
955 | 18::82::1::1424380312
956 | 18::83::5::1424380312
957 | 18::84::1::1424380312
958 | 18::97::1::1424380312
959 | 18::98::1::1424380312
960 | 18::99::2::1424380312
961 | 19::0::1::1424380312
962 | 19::1::1::1424380312
963 | 19::2::1::1424380312
964 | 19::4::1::1424380312
965 | 19::6::2::1424380312
966 | 19::11::1::1424380312
967 | 19::12::1::1424380312
968 | 19::14::1::1424380312
969 | 19::23::1::1424380312
970 | 19::26::1::1424380312
971 | 19::31::1::1424380312
972 | 19::32::4::1424380312
973 | 19::33::1::1424380312
974 | 19::34::1::1424380312
975 | 19::37::1::1424380312
976 | 19::38::1::1424380312
977 | 19::41::1::1424380312
978 | 19::43::1::1424380312
979 | 19::45::1::1424380312
980 | 19::48::1::1424380312
981 | 19::49::1::1424380312
982 | 19::50::2::1424380312
983 | 19::53::2::1424380312
984 | 19::54::3::1424380312
985 | 19::55::1::1424380312
986 | 19::56::2::1424380312
987 | 19::58::1::1424380312
988 | 19::61::1::1424380312
989 | 19::62::1::1424380312
990 | 19::63::1::1424380312
991 | 19::64::1::1424380312
992 | 19::65::1::1424380312
993 | 19::69::2::1424380312
994 | 19::72::1::1424380312
995 | 19::74::3::1424380312
996 | 19::76::1::1424380312
997 | 19::78::1::1424380312
998 | 19::79::1::1424380312
999 | 19::81::1::1424380312
1000 | 19::82::1::1424380312
1001 | 19::84::1::1424380312
1002 | 19::86::1::1424380312
1003 | 19::87::2::1424380312
1004 | 19::90::4::1424380312
1005 | 19::93::1::1424380312
1006 | 19::94::4::1424380312
1007 | 19::95::2::1424380312
1008 | 19::96::1::1424380312
1009 | 19::98::4::1424380312
1010 | 20::0::1::1424380312
1011 | 20::1::1::1424380312
1012 | 20::2::2::1424380312
1013 | 20::4::2::1424380312
1014 | 20::6::1::1424380312
1015 | 20::8::1::1424380312
1016 | 20::12::1::1424380312
1017 | 20::21::2::1424380312
1018 | 20::22::5::1424380312
1019 | 20::24::2::1424380312
1020 | 20::25::1::1424380312
1021 | 20::26::1::1424380312
1022 | 20::29::2::1424380312
1023 | 20::30::2::1424380312
1024 | 20::32::2::1424380312
1025 | 20::39::1::1424380312
1026 | 20::40::1::1424380312
1027 | 20::41::2::1424380312
1028 | 20::45::2::1424380312
1029 | 20::48::1::1424380312
1030 | 20::50::1::1424380312
1031 | 20::51::3::1424380312
1032 | 20::53::3::1424380312
1033 | 20::55::1::1424380312
1034 | 20::57::2::1424380312
1035 | 20::60::1::1424380312
1036 | 20::61::1::1424380312
1037 | 20::64::1::1424380312
1038 | 20::66::1::1424380312
1039 | 20::70::2::1424380312
1040 | 20::72::1::1424380312
1041 | 20::73::2::1424380312
1042 | 20::75::4::1424380312
1043 | 20::76::1::1424380312
1044 | 20::77::4::1424380312
1045 | 20::78::1::1424380312
1046 | 20::79::1::1424380312
1047 | 20::84::2::1424380312
1048 | 20::85::2::1424380312
1049 | 20::88::3::1424380312
1050 | 20::89::1::1424380312
1051 | 20::90::3::1424380312
1052 | 20::91::1::1424380312
1053 | 20::92::2::1424380312
1054 | 20::93::1::1424380312
1055 | 20::94::4::1424380312
1056 | 20::97::1::1424380312
1057 | 21::0::1::1424380312
1058 | 21::2::4::1424380312
1059 | 21::3::1::1424380312
1060 | 21::7::2::1424380312
1061 | 21::11::1::1424380312
1062 | 21::12::1::1424380312
1063 | 21::13::1::1424380312
1064 | 21::14::3::1424380312
1065 | 21::17::1::1424380312
1066 | 21::19::1::1424380312
1067 | 21::20::1::1424380312
1068 | 21::21::1::1424380312
1069 | 21::22::1::1424380312
1070 | 21::23::1::1424380312
1071 | 21::24::1::1424380312
1072 | 21::27::1::1424380312
1073 | 21::29::5::1424380312
1074 | 21::30::2::1424380312
1075 | 21::38::1::1424380312
1076 | 21::40::2::1424380312
1077 | 21::43::3::1424380312
1078 | 21::44::1::1424380312
1079 | 21::45::1::1424380312
1080 | 21::46::1::1424380312
1081 | 21::48::1::1424380312
1082 | 21::51::1::1424380312
1083 | 21::53::5::1424380312
1084 | 21::54::1::1424380312
1085 | 21::55::1::1424380312
1086 | 21::56::1::1424380312
1087 | 21::58::3::1424380312
1088 | 21::59::3::1424380312
1089 | 21::64::1::1424380312
1090 | 21::66::1::1424380312
1091 | 21::68::1::1424380312
1092 | 21::71::1::1424380312
1093 | 21::73::1::1424380312
1094 | 21::74::4::1424380312
1095 | 21::80::1::1424380312
1096 | 21::81::1::1424380312
1097 | 21::83::1::1424380312
1098 | 21::84::1::1424380312
1099 | 21::85::3::1424380312
1100 | 21::87::4::1424380312
1101 | 21::89::2::1424380312
1102 | 21::92::2::1424380312
1103 | 21::96::3::1424380312
1104 | 21::99::1::1424380312
1105 | 22::0::1::1424380312
1106 | 22::3::2::1424380312
1107 | 22::5::2::1424380312
1108 | 22::6::2::1424380312
1109 | 22::9::1::1424380312
1110 | 22::10::1::1424380312
1111 | 22::11::1::1424380312
1112 | 22::13::1::1424380312
1113 | 22::14::1::1424380312
1114 | 22::16::1::1424380312
1115 | 22::18::3::1424380312
1116 | 22::19::1::1424380312
1117 | 22::22::5::1424380312
1118 | 22::25::1::1424380312
1119 | 22::26::1::1424380312
1120 | 22::29::3::1424380312
1121 | 22::30::5::1424380312
1122 | 22::32::4::1424380312
1123 | 22::33::1::1424380312
1124 | 22::35::1::1424380312
1125 | 22::36::3::1424380312
1126 | 22::37::1::1424380312
1127 | 22::40::1::1424380312
1128 | 22::41::3::1424380312
1129 | 22::44::1::1424380312
1130 | 22::45::2::1424380312
1131 | 22::48::1::1424380312
1132 | 22::51::5::1424380312
1133 | 22::55::1::1424380312
1134 | 22::56::2::1424380312
1135 | 22::60::3::1424380312
1136 | 22::61::1::1424380312
1137 | 22::62::4::1424380312
1138 | 22::63::1::1424380312
1139 | 22::65::1::1424380312
1140 | 22::66::1::1424380312
1141 | 22::68::4::1424380312
1142 | 22::69::4::1424380312
1143 | 22::70::3::1424380312
1144 | 22::71::1::1424380312
1145 | 22::74::5::1424380312
1146 | 22::75::5::1424380312
1147 | 22::78::1::1424380312
1148 | 22::80::3::1424380312
1149 | 22::81::1::1424380312
1150 | 22::82::1::1424380312
1151 | 22::84::1::1424380312
1152 | 22::86::1::1424380312
1153 | 22::87::3::1424380312
1154 | 22::88::5::1424380312
1155 | 22::90::2::1424380312
1156 | 22::92::3::1424380312
1157 | 22::95::2::1424380312
1158 | 22::96::2::1424380312
1159 | 22::98::4::1424380312
1160 | 22::99::1::1424380312
1161 | 23::0::1::1424380312
1162 | 23::2::1::1424380312
1163 | 23::4::1::1424380312
1164 | 23::6::2::1424380312
1165 | 23::10::4::1424380312
1166 | 23::12::1::1424380312
1167 | 23::13::4::1424380312
1168 | 23::14::1::1424380312
1169 | 23::15::1::1424380312
1170 | 23::18::4::1424380312
1171 | 23::22::2::1424380312
1172 | 23::23::4::1424380312
1173 | 23::24::1::1424380312
1174 | 23::25::1::1424380312
1175 | 23::26::1::1424380312
1176 | 23::27::5::1424380312
1177 | 23::28::1::1424380312
1178 | 23::29::1::1424380312
1179 | 23::30::4::1424380312
1180 | 23::32::5::1424380312
1181 | 23::33::2::1424380312
1182 | 23::36::3::1424380312
1183 | 23::37::1::1424380312
1184 | 23::38::1::1424380312
1185 | 23::39::1::1424380312
1186 | 23::43::1::1424380312
1187 | 23::48::5::1424380312
1188 | 23::49::5::1424380312
1189 | 23::50::4::1424380312
1190 | 23::53::1::1424380312
1191 | 23::55::5::1424380312
1192 | 23::57::1::1424380312
1193 | 23::59::1::1424380312
1194 | 23::60::1::1424380312
1195 | 23::61::1::1424380312
1196 | 23::64::4::1424380312
1197 | 23::65::5::1424380312
1198 | 23::66::2::1424380312
1199 | 23::67::1::1424380312
1200 | 23::68::3::1424380312
1201 | 23::69::1::1424380312
1202 | 23::72::1::1424380312
1203 | 23::73::3::1424380312
1204 | 23::77::1::1424380312
1205 | 23::82::2::1424380312
1206 | 23::83::1::1424380312
1207 | 23::84::1::1424380312
1208 | 23::85::1::1424380312
1209 | 23::87::3::1424380312
1210 | 23::88::1::1424380312
1211 | 23::95::2::1424380312
1212 | 23::97::1::1424380312
1213 | 24::4::1::1424380312
1214 | 24::6::3::1424380312
1215 | 24::7::1::1424380312
1216 | 24::10::2::1424380312
1217 | 24::12::1::1424380312
1218 | 24::15::1::1424380312
1219 | 24::19::1::1424380312
1220 | 24::24::1::1424380312
1221 | 24::27::3::1424380312
1222 | 24::30::5::1424380312
1223 | 24::31::1::1424380312
1224 | 24::32::3::1424380312
1225 | 24::33::1::1424380312
1226 | 24::37::1::1424380312
1227 | 24::39::1::1424380312
1228 | 24::40::1::1424380312
1229 | 24::42::1::1424380312
1230 | 24::43::3::1424380312
1231 | 24::45::2::1424380312
1232 | 24::46::1::1424380312
1233 | 24::47::1::1424380312
1234 | 24::48::1::1424380312
1235 | 24::49::1::1424380312
1236 | 24::50::1::1424380312
1237 | 24::52::5::1424380312
1238 | 24::57::1::1424380312
1239 | 24::59::4::1424380312
1240 | 24::63::4::1424380312
1241 | 24::65::1::1424380312
1242 | 24::66::1::1424380312
1243 | 24::67::1::1424380312
1244 | 24::68::3::1424380312
1245 | 24::69::5::1424380312
1246 | 24::71::1::1424380312
1247 | 24::72::4::1424380312
1248 | 24::77::4::1424380312
1249 | 24::78::1::1424380312
1250 | 24::80::1::1424380312
1251 | 24::82::1::1424380312
1252 | 24::84::1::1424380312
1253 | 24::86::1::1424380312
1254 | 24::87::1::1424380312
1255 | 24::88::2::1424380312
1256 | 24::89::1::1424380312
1257 | 24::90::5::1424380312
1258 | 24::91::1::1424380312
1259 | 24::92::1::1424380312
1260 | 24::94::2::1424380312
1261 | 24::95::1::1424380312
1262 | 24::96::5::1424380312
1263 | 24::98::1::1424380312
1264 | 24::99::1::1424380312
1265 | 25::1::3::1424380312
1266 | 25::2::1::1424380312
1267 | 25::7::1::1424380312
1268 | 25::9::1::1424380312
1269 | 25::12::3::1424380312
1270 | 25::16::3::1424380312
1271 | 25::17::1::1424380312
1272 | 25::18::1::1424380312
1273 | 25::20::1::1424380312
1274 | 25::22::1::1424380312
1275 | 25::23::1::1424380312
1276 | 25::26::2::1424380312
1277 | 25::29::1::1424380312
1278 | 25::30::1::1424380312
1279 | 25::31::2::1424380312
1280 | 25::33::4::1424380312
1281 | 25::34::3::1424380312
1282 | 25::35::2::1424380312
1283 | 25::36::1::1424380312
1284 | 25::37::1::1424380312
1285 | 25::40::1::1424380312
1286 | 25::41::1::1424380312
1287 | 25::43::1::1424380312
1288 | 25::47::4::1424380312
1289 | 25::50::1::1424380312
1290 | 25::51::1::1424380312
1291 | 25::53::1::1424380312
1292 | 25::56::1::1424380312
1293 | 25::58::2::1424380312
1294 | 25::64::2::1424380312
1295 | 25::67::2::1424380312
1296 | 25::68::1::1424380312
1297 | 25::70::1::1424380312
1298 | 25::71::4::1424380312
1299 | 25::73::1::1424380312
1300 | 25::74::1::1424380312
1301 | 25::76::1::1424380312
1302 | 25::79::1::1424380312
1303 | 25::82::1::1424380312
1304 | 25::84::2::1424380312
1305 | 25::85::1::1424380312
1306 | 25::91::3::1424380312
1307 | 25::92::1::1424380312
1308 | 25::94::1::1424380312
1309 | 25::95::1::1424380312
1310 | 25::97::2::1424380312
1311 | 26::0::1::1424380312
1312 | 26::1::1::1424380312
1313 | 26::2::1::1424380312
1314 | 26::3::1::1424380312
1315 | 26::4::4::1424380312
1316 | 26::5::2::1424380312
1317 | 26::6::3::1424380312
1318 | 26::7::5::1424380312
1319 | 26::13::3::1424380312
1320 | 26::14::1::1424380312
1321 | 26::16::1::1424380312
1322 | 26::18::3::1424380312
1323 | 26::20::1::1424380312
1324 | 26::21::3::1424380312
1325 | 26::22::5::1424380312
1326 | 26::23::5::1424380312
1327 | 26::24::5::1424380312
1328 | 26::27::1::1424380312
1329 | 26::31::1::1424380312
1330 | 26::35::1::1424380312
1331 | 26::36::4::1424380312
1332 | 26::40::1::1424380312
1333 | 26::44::1::1424380312
1334 | 26::45::2::1424380312
1335 | 26::47::1::1424380312
1336 | 26::48::1::1424380312
1337 | 26::49::3::1424380312
1338 | 26::50::2::1424380312
1339 | 26::52::1::1424380312
1340 | 26::54::4::1424380312
1341 | 26::55::1::1424380312
1342 | 26::57::3::1424380312
1343 | 26::58::1::1424380312
1344 | 26::61::1::1424380312
1345 | 26::62::2::1424380312
1346 | 26::66::1::1424380312
1347 | 26::68::4::1424380312
1348 | 26::71::1::1424380312
1349 | 26::73::4::1424380312
1350 | 26::76::1::1424380312
1351 | 26::81::3::1424380312
1352 | 26::85::1::1424380312
1353 | 26::86::3::1424380312
1354 | 26::88::5::1424380312
1355 | 26::91::1::1424380312
1356 | 26::94::5::1424380312
1357 | 26::95::1::1424380312
1358 | 26::96::1::1424380312
1359 | 26::97::1::1424380312
1360 | 27::0::1::1424380312
1361 | 27::9::1::1424380312
1362 | 27::10::1::1424380312
1363 | 27::18::4::1424380312
1364 | 27::19::3::1424380312
1365 | 27::20::1::1424380312
1366 | 27::22::2::1424380312
1367 | 27::24::2::1424380312
1368 | 27::25::1::1424380312
1369 | 27::27::3::1424380312
1370 | 27::28::1::1424380312
1371 | 27::29::1::1424380312
1372 | 27::31::1::1424380312
1373 | 27::33::3::1424380312
1374 | 27::40::1::1424380312
1375 | 27::42::1::1424380312
1376 | 27::43::1::1424380312
1377 | 27::44::3::1424380312
1378 | 27::45::1::1424380312
1379 | 27::51::3::1424380312
1380 | 27::52::1::1424380312
1381 | 27::55::3::1424380312
1382 | 27::57::1::1424380312
1383 | 27::59::1::1424380312
1384 | 27::60::1::1424380312
1385 | 27::61::1::1424380312
1386 | 27::64::1::1424380312
1387 | 27::66::3::1424380312
1388 | 27::68::1::1424380312
1389 | 27::70::1::1424380312
1390 | 27::71::2::1424380312
1391 | 27::72::1::1424380312
1392 | 27::75::3::1424380312
1393 | 27::78::1::1424380312
1394 | 27::80::3::1424380312
1395 | 27::82::1::1424380312
1396 | 27::83::3::1424380312
1397 | 27::86::1::1424380312
1398 | 27::87::2::1424380312
1399 | 27::90::1::1424380312
1400 | 27::91::1::1424380312
1401 | 27::92::1::1424380312
1402 | 27::93::1::1424380312
1403 | 27::94::2::1424380312
1404 | 27::95::1::1424380312
1405 | 27::98::1::1424380312
1406 | 28::0::3::1424380312
1407 | 28::1::1::1424380312
1408 | 28::2::4::1424380312
1409 | 28::3::1::1424380312
1410 | 28::6::1::1424380312
1411 | 28::7::1::1424380312
1412 | 28::12::5::1424380312
1413 | 28::13::2::1424380312
1414 | 28::14::1::1424380312
1415 | 28::15::1::1424380312
1416 | 28::17::1::1424380312
1417 | 28::19::3::1424380312
1418 | 28::20::1::1424380312
1419 | 28::23::3::1424380312
1420 | 28::24::3::1424380312
1421 | 28::27::1::1424380312
1422 | 28::29::1::1424380312
1423 | 28::33::1::1424380312
1424 | 28::34::1::1424380312
1425 | 28::36::1::1424380312
1426 | 28::38::2::1424380312
1427 | 28::39::2::1424380312
1428 | 28::44::1::1424380312
1429 | 28::45::1::1424380312
1430 | 28::49::4::1424380312
1431 | 28::50::1::1424380312
1432 | 28::52::1::1424380312
1433 | 28::54::1::1424380312
1434 | 28::56::1::1424380312
1435 | 28::57::3::1424380312
1436 | 28::58::1::1424380312
1437 | 28::59::1::1424380312
1438 | 28::60::1::1424380312
1439 | 28::62::3::1424380312
1440 | 28::63::1::1424380312
1441 | 28::65::1::1424380312
1442 | 28::75::1::1424380312
1443 | 28::78::1::1424380312
1444 | 28::81::5::1424380312
1445 | 28::82::4::1424380312
1446 | 28::83::1::1424380312
1447 | 28::85::1::1424380312
1448 | 28::88::2::1424380312
1449 | 28::89::4::1424380312
1450 | 28::90::1::1424380312
1451 | 28::92::5::1424380312
1452 | 28::94::1::1424380312
1453 | 28::95::2::1424380312
1454 | 28::98::1::1424380312
1455 | 28::99::1::1424380312
1456 | 29::3::1::1424380312
1457 | 29::4::1::1424380312
1458 | 29::5::1::1424380312
1459 | 29::7::2::1424380312
1460 | 29::9::1::1424380312
1461 | 29::10::3::1424380312
1462 | 29::11::1::1424380312
1463 | 29::13::3::1424380312
1464 | 29::14::1::1424380312
1465 | 29::15::1::1424380312
1466 | 29::17::3::1424380312
1467 | 29::19::3::1424380312
1468 | 29::22::3::1424380312
1469 | 29::23::4::1424380312
1470 | 29::25::1::1424380312
1471 | 29::29::1::1424380312
1472 | 29::31::1::1424380312
1473 | 29::32::4::1424380312
1474 | 29::33::2::1424380312
1475 | 29::36::2::1424380312
1476 | 29::38::3::1424380312
1477 | 29::39::1::1424380312
1478 | 29::42::1::1424380312
1479 | 29::46::5::1424380312
1480 | 29::49::3::1424380312
1481 | 29::51::2::1424380312
1482 | 29::59::1::1424380312
1483 | 29::61::1::1424380312
1484 | 29::62::1::1424380312
1485 | 29::67::1::1424380312
1486 | 29::68::3::1424380312
1487 | 29::69::1::1424380312
1488 | 29::70::1::1424380312
1489 | 29::74::1::1424380312
1490 | 29::75::1::1424380312
1491 | 29::79::2::1424380312
1492 | 29::80::1::1424380312
1493 | 29::81::2::1424380312
1494 | 29::83::1::1424380312
1495 | 29::85::1::1424380312
1496 | 29::86::1::1424380312
1497 | 29::90::4::1424380312
1498 | 29::93::1::1424380312
1499 | 29::94::4::1424380312
1500 | 29::97::1::1424380312
1501 | 29::99::1::1424380312
1502 |
--------------------------------------------------------------------------------
/apache-spark-tutorials/spark+deeplearning.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import os\n",
12 | "os.environ['PYSPARK_SUBMIT_ARGS'] = \"--packages=databricks:spark-deep-learning:1.5.0-spark2.4-s_2.11 pyspark-shell\""
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": 2,
18 | "metadata": {
19 | "collapsed": true
20 | },
21 | "outputs": [],
22 | "source": [
23 | "from pyspark.context import SparkContext\n",
24 | "from pyspark.sql.session import SparkSession\n",
25 | "\n",
26 | "sc = SparkContext('local')\n",
27 | "spark = SparkSession(sc)"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": 3,
33 | "metadata": {},
34 | "outputs": [
35 | {
36 | "data": {
37 | "text/html": [
38 | "\n",
39 | "\n",
52 | "
\n",
53 | " \n",
54 | " \n",
55 | " \n",
56 | " image \n",
57 | " \n",
58 | " \n",
59 | " \n",
60 | " \n",
61 | " 0 \n",
62 | " (file:/Users/agnieszkabiernacka/Desktop/git/Da... \n",
63 | " \n",
64 | " \n",
65 | " 1 \n",
66 | " (file:/Users/agnieszkabiernacka/Desktop/git/Da... \n",
67 | " \n",
68 | " \n",
69 | " 2 \n",
70 | " (file:/Users/agnieszkabiernacka/Desktop/git/Da... \n",
71 | " \n",
72 | " \n",
73 | "
\n",
74 | "
"
75 | ],
76 | "text/plain": [
77 | " image\n",
78 | "0 (file:/Users/agnieszkabiernacka/Desktop/git/Da...\n",
79 | "1 (file:/Users/agnieszkabiernacka/Desktop/git/Da...\n",
80 | "2 (file:/Users/agnieszkabiernacka/Desktop/git/Da..."
81 | ]
82 | },
83 | "execution_count": 3,
84 | "metadata": {},
85 | "output_type": "execute_result"
86 | }
87 | ],
88 | "source": [
89 | "from pyspark.ml.image import ImageSchema\n",
90 | "image_df = ImageSchema.readImages(\"images\")\n",
91 | "image_df.toPandas()"
92 | ]
93 | },
94 | {
95 | "cell_type": "code",
96 | "execution_count": 4,
97 | "metadata": {
98 | "collapsed": true
99 | },
100 | "outputs": [],
101 | "source": [
102 | "image_df = ImageSchema.readImages(\"images\")"
103 | ]
104 | },
105 | {
106 | "cell_type": "markdown",
107 | "metadata": {
108 | "collapsed": true
109 | },
110 | "source": [
111 | "## Transfer learning"
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": 5,
117 | "metadata": {
118 | "collapsed": true
119 | },
120 | "outputs": [],
121 | "source": [
122 | "from pyspark.sql.functions import *\n",
123 | "tulips_df = ImageSchema.readImages(\"flower_photos/tulips\").withColumn(\"label\", lit(1)).limit(10)\n",
124 | "roses_df = ImageSchema.readImages(\"flower_photos/roses\").withColumn(\"label\", lit(0)).limit(10)\n"
125 | ]
126 | },
127 | {
128 | "cell_type": "code",
129 | "execution_count": 6,
130 | "metadata": {},
131 | "outputs": [],
132 | "source": [
133 | "\n",
134 | "tulips_train, tulips_test, _ = tulips_df.randomSplit([0.6, 0.3, 0.1]) # use larger training sets (e.g. [0.6, 0.4] for non-community edition clusters)\n",
135 | "roses_train, roses_test, _ = roses_df.randomSplit([0.6, 0.3, 0.1]) # use larger training sets (e.g. [0.6, 0.4] for non-community edition clusters)\n",
136 | "train_df = tulips_train.unionAll(roses_train)\n",
137 | "test_df = tulips_test.unionAll(roses_test)\n",
138 | "\n",
139 | "# Under the hood, each of the partitions is fully loaded in memory, which may be expensive.\n",
140 | "# This ensure that each of the paritions has a small size.\n",
141 | "train_df = train_df.repartition(100)\n",
142 | "test_df = test_df.repartition(100)"
143 | ]
144 | },
145 | {
146 | "cell_type": "code",
147 | "execution_count": 7,
148 | "metadata": {},
149 | "outputs": [
150 | {
151 | "name": "stderr",
152 | "output_type": "stream",
153 | "text": [
154 | "/Users/agnieszkabiernacka/.local/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:516: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
155 | " _np_qint8 = np.dtype([(\"qint8\", np.int8, 1)])\n",
156 | "/Users/agnieszkabiernacka/.local/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:517: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
157 | " _np_quint8 = np.dtype([(\"quint8\", np.uint8, 1)])\n",
158 | "/Users/agnieszkabiernacka/.local/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:518: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
159 | " _np_qint16 = np.dtype([(\"qint16\", np.int16, 1)])\n",
160 | "/Users/agnieszkabiernacka/.local/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:519: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
161 | " _np_quint16 = np.dtype([(\"quint16\", np.uint16, 1)])\n",
162 | "/Users/agnieszkabiernacka/.local/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:520: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
163 | " _np_qint32 = np.dtype([(\"qint32\", np.int32, 1)])\n",
164 | "/Users/agnieszkabiernacka/.local/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:525: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
165 | " np_resource = np.dtype([(\"resource\", np.ubyte, 1)])\n",
166 | "/Users/agnieszkabiernacka/.local/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:541: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
167 | " _np_qint8 = np.dtype([(\"qint8\", np.int8, 1)])\n",
168 | "/Users/agnieszkabiernacka/.local/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:542: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
169 | " _np_quint8 = np.dtype([(\"quint8\", np.uint8, 1)])\n",
170 | "/Users/agnieszkabiernacka/.local/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:543: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
171 | " _np_qint16 = np.dtype([(\"qint16\", np.int16, 1)])\n",
172 | "/Users/agnieszkabiernacka/.local/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:544: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
173 | " _np_quint16 = np.dtype([(\"quint16\", np.uint16, 1)])\n",
174 | "/Users/agnieszkabiernacka/.local/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:545: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
175 | " _np_qint32 = np.dtype([(\"qint32\", np.int32, 1)])\n",
176 | "/Users/agnieszkabiernacka/.local/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:550: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
177 | " np_resource = np.dtype([(\"resource\", np.ubyte, 1)])\n",
178 | "Using TensorFlow backend.\n"
179 | ]
180 | }
181 | ],
182 | "source": [
183 | "from pyspark.ml.classification import LogisticRegression\n",
184 | "from pyspark.ml import Pipeline\n",
185 | "from sparkdl import DeepImageFeaturizer \n",
186 | "\n",
187 | "featurizer = DeepImageFeaturizer(inputCol=\"image\", outputCol=\"features\", modelName=\"InceptionV3\")\n",
188 | "lr = LogisticRegression(maxIter=5, regParam=0.05, elasticNetParam=0.3, labelCol=\"label\")\n",
189 | "p = Pipeline(stages=[featurizer, lr])\n",
190 | "\n",
191 | "p_model = p.fit(train_df)"
192 | ]
193 | },
194 | {
195 | "cell_type": "code",
196 | "execution_count": 8,
197 | "metadata": {},
198 | "outputs": [
199 | {
200 | "name": "stdout",
201 | "output_type": "stream",
202 | "text": [
203 | "Test set accuracy = 0.7\n"
204 | ]
205 | }
206 | ],
207 | "source": [
208 | "from pyspark.ml.evaluation import MulticlassClassificationEvaluator\n",
209 | "\n",
210 | "tested_df = p_model.transform(test_df)\n",
211 | "evaluator = MulticlassClassificationEvaluator(metricName=\"accuracy\")\n",
212 | "print(\"Test set accuracy = \" + str(evaluator.evaluate(tested_df.select(\"prediction\", \"label\"))))"
213 | ]
214 | },
215 | {
216 | "cell_type": "code",
217 | "execution_count": 9,
218 | "metadata": {},
219 | "outputs": [
220 | {
221 | "data": {
222 | "text/html": [
223 | "\n",
224 | "\n",
237 | "
\n",
238 | " \n",
239 | " \n",
240 | " \n",
241 | " image \n",
242 | " label \n",
243 | " features \n",
244 | " rawPrediction \n",
245 | " probability \n",
246 | " prediction \n",
247 | " \n",
248 | " \n",
249 | " \n",
250 | " \n",
251 | " 0 \n",
252 | " (file:/Users/agnieszkabiernacka/Desktop/git/Da... \n",
253 | " 1 \n",
254 | " [0.2271287590265274, 0.0, 0.0, 0.0, 0.0, 0.0, ... \n",
255 | " [-2.1270269158902897, 2.1270269158902897] \n",
256 | " [0.10649756717954208, 0.8935024328204579] \n",
257 | " 1.0 \n",
258 | " \n",
259 | " \n",
260 | " 1 \n",
261 | " (file:/Users/agnieszkabiernacka/Desktop/git/Da... \n",
262 | " 1 \n",
263 | " [0.2719690799713135, 0.5799758434295654, 0.0, ... \n",
264 | " [-2.091161151399062, 2.091161151399062] \n",
265 | " [0.10995888340422419, 0.8900411165957758] \n",
266 | " 1.0 \n",
267 | " \n",
268 | " \n",
269 | " 2 \n",
270 | " (file:/Users/agnieszkabiernacka/Desktop/git/Da... \n",
271 | " 1 \n",
272 | " [0.42978203296661377, 0.0, 0.19137954711914062... \n",
273 | " [-1.7315559533843232, 1.7315559533843232] \n",
274 | " [0.1503886644381542, 0.8496113355618459] \n",
275 | " 1.0 \n",
276 | " \n",
277 | " \n",
278 | " 3 \n",
279 | " (file:/Users/agnieszkabiernacka/Desktop/git/Da... \n",
280 | " 1 \n",
281 | " [0.0, 0.0, 0.0, 0.518147349357605, 0.0, 0.0, 0... \n",
282 | " [-27.620443125823375, 27.620443125823375] \n",
283 | " [1.0106341348329834e-12, 0.9999999999989895] \n",
284 | " 1.0 \n",
285 | " \n",
286 | " \n",
287 | " 4 \n",
288 | " (file:/Users/agnieszkabiernacka/Desktop/git/Da... \n",
289 | " 1 \n",
290 | " [0.0, 0.2156921923160553, 0.6671097278594971, ... \n",
291 | " [-17.192780406046005, 17.192780406046005] \n",
292 | " [3.4140535298693253e-08, 0.9999999658594647] \n",
293 | " 1.0 \n",
294 | " \n",
295 | " \n",
296 | " 5 \n",
297 | " (file:/Users/agnieszkabiernacka/Desktop/git/Da... \n",
298 | " 0 \n",
299 | " [0.7097122073173523, 0.07345157861709595, 0.26... \n",
300 | " [-469.9846518379927, 469.9846518379927] \n",
301 | " [7.731417527395182e-205, 1.0] \n",
302 | " 1.0 \n",
303 | " \n",
304 | " \n",
305 | " 6 \n",
306 | " (file:/Users/agnieszkabiernacka/Desktop/git/Da... \n",
307 | " 0 \n",
308 | " [0.0, 0.3295484185218811, 1.0895251035690308, ... \n",
309 | " [0.01773589645516419, -0.01773589645516419] \n",
310 | " [0.5044338578874581, 0.49556614211254185] \n",
311 | " 0.0 \n",
312 | " \n",
313 | " \n",
314 | " 7 \n",
315 | " (file:/Users/agnieszkabiernacka/Desktop/git/Da... \n",
316 | " 0 \n",
317 | " [0.7112337946891785, 0.0, 0.3489874005317688, ... \n",
318 | " [-7.70352164348588, 7.70352164348588] \n",
319 | " [0.0004510317702174508, 0.9995489682297825] \n",
320 | " 1.0 \n",
321 | " \n",
322 | " \n",
323 | " 8 \n",
324 | " (file:/Users/agnieszkabiernacka/Desktop/git/Da... \n",
325 | " 0 \n",
326 | " [0.0, 0.0, 0.0, 0.6153160333633423, 0.0, 0.0, ... \n",
327 | " [1.0048001739756212, -1.0048001739756212] \n",
328 | " [0.732001302709658, 0.2679986972903419] \n",
329 | " 0.0 \n",
330 | " \n",
331 | " \n",
332 | " 9 \n",
333 | " (file:/Users/agnieszkabiernacka/Desktop/git/Da... \n",
334 | " 0 \n",
335 | " [0.0, 0.27889925241470337, 0.07056798785924911... \n",
336 | " [-202.93339892805423, 202.93339892805423] \n",
337 | " [7.364523488877299e-89, 1.0] \n",
338 | " 1.0 \n",
339 | " \n",
340 | " \n",
341 | "
\n",
342 | "
"
343 | ],
344 | "text/plain": [
345 | " image label \\\n",
346 | "0 (file:/Users/agnieszkabiernacka/Desktop/git/Da... 1 \n",
347 | "1 (file:/Users/agnieszkabiernacka/Desktop/git/Da... 1 \n",
348 | "2 (file:/Users/agnieszkabiernacka/Desktop/git/Da... 1 \n",
349 | "3 (file:/Users/agnieszkabiernacka/Desktop/git/Da... 1 \n",
350 | "4 (file:/Users/agnieszkabiernacka/Desktop/git/Da... 1 \n",
351 | "5 (file:/Users/agnieszkabiernacka/Desktop/git/Da... 0 \n",
352 | "6 (file:/Users/agnieszkabiernacka/Desktop/git/Da... 0 \n",
353 | "7 (file:/Users/agnieszkabiernacka/Desktop/git/Da... 0 \n",
354 | "8 (file:/Users/agnieszkabiernacka/Desktop/git/Da... 0 \n",
355 | "9 (file:/Users/agnieszkabiernacka/Desktop/git/Da... 0 \n",
356 | "\n",
357 | " features \\\n",
358 | "0 [0.2271287590265274, 0.0, 0.0, 0.0, 0.0, 0.0, ... \n",
359 | "1 [0.2719690799713135, 0.5799758434295654, 0.0, ... \n",
360 | "2 [0.42978203296661377, 0.0, 0.19137954711914062... \n",
361 | "3 [0.0, 0.0, 0.0, 0.518147349357605, 0.0, 0.0, 0... \n",
362 | "4 [0.0, 0.2156921923160553, 0.6671097278594971, ... \n",
363 | "5 [0.7097122073173523, 0.07345157861709595, 0.26... \n",
364 | "6 [0.0, 0.3295484185218811, 1.0895251035690308, ... \n",
365 | "7 [0.7112337946891785, 0.0, 0.3489874005317688, ... \n",
366 | "8 [0.0, 0.0, 0.0, 0.6153160333633423, 0.0, 0.0, ... \n",
367 | "9 [0.0, 0.27889925241470337, 0.07056798785924911... \n",
368 | "\n",
369 | " rawPrediction \\\n",
370 | "0 [-2.1270269158902897, 2.1270269158902897] \n",
371 | "1 [-2.091161151399062, 2.091161151399062] \n",
372 | "2 [-1.7315559533843232, 1.7315559533843232] \n",
373 | "3 [-27.620443125823375, 27.620443125823375] \n",
374 | "4 [-17.192780406046005, 17.192780406046005] \n",
375 | "5 [-469.9846518379927, 469.9846518379927] \n",
376 | "6 [0.01773589645516419, -0.01773589645516419] \n",
377 | "7 [-7.70352164348588, 7.70352164348588] \n",
378 | "8 [1.0048001739756212, -1.0048001739756212] \n",
379 | "9 [-202.93339892805423, 202.93339892805423] \n",
380 | "\n",
381 | " probability prediction \n",
382 | "0 [0.10649756717954208, 0.8935024328204579] 1.0 \n",
383 | "1 [0.10995888340422419, 0.8900411165957758] 1.0 \n",
384 | "2 [0.1503886644381542, 0.8496113355618459] 1.0 \n",
385 | "3 [1.0106341348329834e-12, 0.9999999999989895] 1.0 \n",
386 | "4 [3.4140535298693253e-08, 0.9999999658594647] 1.0 \n",
387 | "5 [7.731417527395182e-205, 1.0] 1.0 \n",
388 | "6 [0.5044338578874581, 0.49556614211254185] 0.0 \n",
389 | "7 [0.0004510317702174508, 0.9995489682297825] 1.0 \n",
390 | "8 [0.732001302709658, 0.2679986972903419] 0.0 \n",
391 | "9 [7.364523488877299e-89, 1.0] 1.0 "
392 | ]
393 | },
394 | "execution_count": 9,
395 | "metadata": {},
396 | "output_type": "execute_result"
397 | }
398 | ],
399 | "source": [
400 | "tested_df.toPandas()"
401 | ]
402 | },
403 | {
404 | "cell_type": "markdown",
405 | "metadata": {},
406 | "source": [
407 | "## Learning"
408 | ]
409 | },
410 | {
411 | "cell_type": "code",
412 | "execution_count": 10,
413 | "metadata": {},
414 | "outputs": [],
415 | "source": [
416 | "import PIL.Image\n",
417 | "import numpy as np\n",
418 | "from keras.applications.imagenet_utils import preprocess_input\n",
419 | "from sparkdl.estimators.keras_image_file_estimator import KerasImageFileEstimator\n",
420 | "\n",
421 | "def load_image_from_uri(local_uri):\n",
422 | " img = (PIL.Image.open(local_uri).convert('RGB').resize((299, 299), PIL.Image.ANTIALIAS))\n",
423 | " img_arr = np.array(img).astype(np.float32)\n",
424 | " img_tnsr = preprocess_input(img_arr[np.newaxis, :])\n",
425 | " return img_tnsr"
426 | ]
427 | },
428 | {
429 | "cell_type": "code",
430 | "execution_count": 11,
431 | "metadata": {},
432 | "outputs": [],
433 | "source": [
434 | "from keras.layers import Activation, Dense, Flatten\n",
435 | "from keras.models import Sequential\n",
436 | "\n",
437 | "model = Sequential()\n",
438 | "model.add(Flatten(input_shape=(299, 299, 3)))\n",
439 | "model.add(Dense(2))\n",
440 | "model.add(Activation(\"softmax\"))\n",
441 | "model.save('model-full.h5')"
442 | ]
443 | },
444 | {
445 | "cell_type": "code",
446 | "execution_count": 12,
447 | "metadata": {
448 | "collapsed": true
449 | },
450 | "outputs": [],
451 | "source": [
452 | "estimator = KerasImageFileEstimator( inputCol=\"uri\",\n",
453 | " outputCol=\"prediction\",\n",
454 | " labelCol=\"one_hot_label\",\n",
455 | " imageLoader=load_image_from_uri,\n",
456 | " kerasOptimizer='adam',\n",
457 | " kerasLoss='categorical_crossentropy',\n",
458 | " modelFile='model-full.h5' # local file path for model\n",
459 | " ) "
460 | ]
461 | },
462 | {
463 | "cell_type": "code",
464 | "execution_count": 14,
465 | "metadata": {},
466 | "outputs": [],
467 | "source": [
468 | "from pyspark.ml.evaluation import BinaryClassificationEvaluator\n",
469 | "from pyspark.ml.tuning import CrossValidator, ParamGridBuilder\n",
470 | "\n",
471 | "paramGrid = (\n",
472 | " ParamGridBuilder()\n",
473 | " .addGrid(estimator.kerasFitParams, [{\"batch_size\": 32, \"verbose\": 0},\n",
474 | " {\"batch_size\": 64, \"verbose\": 0}])\n",
475 | " .build()\n",
476 | ")\n",
477 | "bc = BinaryClassificationEvaluator(rawPredictionCol=\"prediction\", labelCol=\"label\" )\n",
478 | "cv = CrossValidator(estimator=estimator, estimatorParamMaps=paramGrid, evaluator=bc, numFolds=2)\n"
479 | ]
480 | },
481 | {
482 | "cell_type": "code",
483 | "execution_count": 15,
484 | "metadata": {},
485 | "outputs": [],
486 | "source": [
487 | "\n",
488 | "train_df = spark.createDataFrame([\n",
489 | " (\"flower_photos/tulips/10791227_7168491604.jpg\", 1),\n",
490 | " (\"flower_photos/tulips/11746080_963537acdc.jpg\", 1),\n",
491 | " (\"flower_photos/tulips/11746276_de3dec8201.jpg\", 2),\n",
492 | " (\"flower_photos/tulips/11746367_d23a35b085_n.jpg\", 2),\n",
493 | " (\"flower_photos/roses/12240303_80d87f77a3_n.jpg\", 0),\n",
494 | " (\"flower_photos/roses/22679076_bdb4c24401_m.jpg\", 0),\n",
495 | " (\"flower_photos/roses/24781114_bc83aa811e_n.jpg\", 0)\n",
496 | "], [\"uri\", \"label\"])"
497 | ]
498 | },
499 | {
500 | "cell_type": "code",
501 | "execution_count": 16,
502 | "metadata": {},
503 | "outputs": [],
504 | "source": [
505 | "test_df = spark.createDataFrame([\n",
506 | " (\"flower_photos/tulips/10791227_7168491604.jpg\", 1),\n",
507 | " (\"flower_photos/roses/24781114_bc83aa811e_n.jpg\", 0)\n",
508 | "], [\"uri\", \"label\"])"
509 | ]
510 | },
511 | {
512 | "cell_type": "code",
513 | "execution_count": 17,
514 | "metadata": {},
515 | "outputs": [],
516 | "source": [
517 | "from pyspark.ml.feature import OneHotEncoderEstimator\n",
518 | "\n",
519 | "oh_encoder = OneHotEncoderEstimator(inputCols=[\"label\"],\n",
520 | " outputCols=[\"one_hot_label\"])\n",
521 | "oh_model = oh_encoder.fit(train_df)"
522 | ]
523 | },
524 | {
525 | "cell_type": "code",
526 | "execution_count": 18,
527 | "metadata": {
528 | "collapsed": true
529 | },
530 | "outputs": [],
531 | "source": [
532 | "\n",
533 | "train_df = oh_model.transform(train_df)\n",
534 | "test_df = oh_model.transform(test_df)"
535 | ]
536 | },
537 | {
538 | "cell_type": "code",
539 | "execution_count": 19,
540 | "metadata": {},
541 | "outputs": [
542 | {
543 | "data": {
544 | "text/html": [
545 | "\n",
546 | "\n",
559 | "
\n",
560 | " \n",
561 | " \n",
562 | " \n",
563 | " uri \n",
564 | " label \n",
565 | " one_hot_label \n",
566 | " \n",
567 | " \n",
568 | " \n",
569 | " \n",
570 | " 0 \n",
571 | " flower_photos/tulips/10791227_7168491604.jpg \n",
572 | " 1 \n",
573 | " (0.0, 1.0) \n",
574 | " \n",
575 | " \n",
576 | "
\n",
577 | "
"
578 | ],
579 | "text/plain": [
580 | " uri label one_hot_label\n",
581 | "0 flower_photos/tulips/10791227_7168491604.jpg 1 (0.0, 1.0)"
582 | ]
583 | },
584 | "execution_count": 19,
585 | "metadata": {},
586 | "output_type": "execute_result"
587 | }
588 | ],
589 | "source": [
590 | "train_df.limit(1).toPandas()"
591 | ]
592 | },
593 | {
594 | "cell_type": "code",
595 | "execution_count": 20,
596 | "metadata": {},
597 | "outputs": [
598 | {
599 | "name": "stdout",
600 | "output_type": "stream",
601 | "text": [
602 | "WARNING:tensorflow:From /private/var/folders/vm/tl51s8cd6x160xdb14hj7n1c0000gn/T/spark-9365f0fd-065b-4b45-a90f-741065177fe2/userFiles-0b99936d-2e9f-402b-b808-94dc22ea5530/databricks_spark-deep-learning-1.5.0-spark2.4-s_2.11.jar/sparkdl/transformers/keras_utils.py:37: The name tf.Session is deprecated. Please use tf.compat.v1.Session instead.\n",
603 | "\n",
604 | "WARNING:tensorflow:From /private/var/folders/vm/tl51s8cd6x160xdb14hj7n1c0000gn/T/spark-9365f0fd-065b-4b45-a90f-741065177fe2/userFiles-0b99936d-2e9f-402b-b808-94dc22ea5530/databricks_spark-deep-learning-1.5.0-spark2.4-s_2.11.jar/sparkdl/graph/utils.py:220: convert_variables_to_constants (from tensorflow.python.framework.graph_util_impl) is deprecated and will be removed in a future version.\n",
605 | "Instructions for updating:\n",
606 | "Use `tf.compat.v1.graph_util.convert_variables_to_constants`\n",
607 | "WARNING:tensorflow:From /Users/agnieszkabiernacka/.local/lib/python3.6/site-packages/tensorflow/python/framework/graph_util_impl.py:270: extract_sub_graph (from tensorflow.python.framework.graph_util_impl) is deprecated and will be removed in a future version.\n",
608 | "Instructions for updating:\n",
609 | "Use `tf.compat.v1.graph_util.extract_sub_graph`\n",
610 | "INFO:tensorflow:Froze 2 variables.\n",
611 | "INFO:tensorflow:Converted 2 variables to const ops.\n"
612 | ]
613 | },
614 | {
615 | "name": "stderr",
616 | "output_type": "stream",
617 | "text": [
618 | "/Users/agnieszkabiernacka/anaconda3/lib/python3.6/site-packages/keras/engine/saving.py:341: UserWarning: No training configuration found in save file: the model was *not* compiled. Compile it manually.\n",
619 | " warnings.warn('No training configuration found in save file: '\n"
620 | ]
621 | },
622 | {
623 | "name": "stdout",
624 | "output_type": "stream",
625 | "text": [
626 | "WARNING:tensorflow:From /private/var/folders/vm/tl51s8cd6x160xdb14hj7n1c0000gn/T/spark-9365f0fd-065b-4b45-a90f-741065177fe2/userFiles-0b99936d-2e9f-402b-b808-94dc22ea5530/databricks_spark-deep-learning-1.5.0-spark2.4-s_2.11.jar/sparkdl/transformers/tf_image.py:180: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.\n",
627 | "\n",
628 | "INFO:tensorflow:Froze 0 variables.\n",
629 | "INFO:tensorflow:Converted 0 variables to const ops.\n",
630 | "WARNING:tensorflow:From /private/var/folders/vm/tl51s8cd6x160xdb14hj7n1c0000gn/T/spark-9365f0fd-065b-4b45-a90f-741065177fe2/userFiles-0b99936d-2e9f-402b-b808-94dc22ea5530/databricks_tensorframes-0.6.0-s_2.11.jar/tensorframes/core.py:101: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.\n",
631 | "\n",
632 | "WARNING:tensorflow:From /private/var/folders/vm/tl51s8cd6x160xdb14hj7n1c0000gn/T/spark-9365f0fd-065b-4b45-a90f-741065177fe2/userFiles-0b99936d-2e9f-402b-b808-94dc22ea5530/databricks_tensorframes-0.6.0-s_2.11.jar/tensorframes/core.py:44: The name tf.global_variables is deprecated. Please use tf.compat.v1.global_variables instead.\n",
633 | "\n",
634 | "WARNING:tensorflow:From /private/var/folders/vm/tl51s8cd6x160xdb14hj7n1c0000gn/T/spark-9365f0fd-065b-4b45-a90f-741065177fe2/userFiles-0b99936d-2e9f-402b-b808-94dc22ea5530/databricks_tensorframes-0.6.0-s_2.11.jar/tensorframes/core.py:62: The name tf.train.write_graph is deprecated. Please use tf.io.write_graph instead.\n",
635 | "\n",
636 | "INFO:tensorflow:Froze 2 variables.\n",
637 | "INFO:tensorflow:Converted 2 variables to const ops.\n",
638 | "INFO:tensorflow:Froze 0 variables.\n",
639 | "INFO:tensorflow:Converted 0 variables to const ops.\n",
640 | "INFO:tensorflow:Froze 2 variables.\n",
641 | "INFO:tensorflow:Converted 2 variables to const ops.\n",
642 | "INFO:tensorflow:Froze 0 variables.\n",
643 | "INFO:tensorflow:Converted 0 variables to const ops.\n",
644 | "INFO:tensorflow:Froze 2 variables.\n",
645 | "INFO:tensorflow:Converted 2 variables to const ops.\n",
646 | "INFO:tensorflow:Froze 0 variables.\n",
647 | "INFO:tensorflow:Converted 0 variables to const ops.\n"
648 | ]
649 | }
650 | ],
651 | "source": [
652 | "cvModel = cv.fit(train_df)"
653 | ]
654 | },
655 | {
656 | "cell_type": "code",
657 | "execution_count": 21,
658 | "metadata": {},
659 | "outputs": [
660 | {
661 | "name": "stdout",
662 | "output_type": "stream",
663 | "text": [
664 | "INFO:tensorflow:Froze 2 variables.\n",
665 | "INFO:tensorflow:Converted 2 variables to const ops.\n"
666 | ]
667 | },
668 | {
669 | "name": "stderr",
670 | "output_type": "stream",
671 | "text": [
672 | "/Users/agnieszkabiernacka/anaconda3/lib/python3.6/site-packages/keras/engine/saving.py:341: UserWarning: No training configuration found in save file: the model was *not* compiled. Compile it manually.\n",
673 | " warnings.warn('No training configuration found in save file: '\n"
674 | ]
675 | },
676 | {
677 | "name": "stdout",
678 | "output_type": "stream",
679 | "text": [
680 | "INFO:tensorflow:Froze 0 variables.\n",
681 | "INFO:tensorflow:Converted 0 variables to const ops.\n"
682 | ]
683 | },
684 | {
685 | "data": {
686 | "text/html": [
687 | "\n",
688 | "\n",
701 | "
\n",
702 | " \n",
703 | " \n",
704 | " \n",
705 | " uri \n",
706 | " label \n",
707 | " one_hot_label \n",
708 | " prediction \n",
709 | " \n",
710 | " \n",
711 | " \n",
712 | " \n",
713 | " 0 \n",
714 | " flower_photos/tulips/10791227_7168491604.jpg \n",
715 | " 1 \n",
716 | " (0.0, 1.0) \n",
717 | " [1.0, 0.0] \n",
718 | " \n",
719 | " \n",
720 | "
\n",
721 | "
"
722 | ],
723 | "text/plain": [
724 | " uri label one_hot_label \\\n",
725 | "0 flower_photos/tulips/10791227_7168491604.jpg 1 (0.0, 1.0) \n",
726 | "\n",
727 | " prediction \n",
728 | "0 [1.0, 0.0] "
729 | ]
730 | },
731 | "execution_count": 21,
732 | "metadata": {},
733 | "output_type": "execute_result"
734 | }
735 | ],
736 | "source": [
737 | "cvModel.transform(test_df).limit(1).toPandas()"
738 | ]
739 | },
740 | {
741 | "cell_type": "code",
742 | "execution_count": 22,
743 | "metadata": {},
744 | "outputs": [
745 | {
746 | "name": "stdout",
747 | "output_type": "stream",
748 | "text": [
749 | "INFO:tensorflow:Froze 2 variables.\n",
750 | "INFO:tensorflow:Converted 2 variables to const ops.\n",
751 | "INFO:tensorflow:Froze 0 variables.\n",
752 | "INFO:tensorflow:Converted 0 variables to const ops.\n"
753 | ]
754 | },
755 | {
756 | "name": "stderr",
757 | "output_type": "stream",
758 | "text": [
759 | "/Users/agnieszkabiernacka/anaconda3/lib/python3.6/site-packages/keras/engine/saving.py:341: UserWarning: No training configuration found in save file: the model was *not* compiled. Compile it manually.\n",
760 | " warnings.warn('No training configuration found in save file: '\n"
761 | ]
762 | },
763 | {
764 | "data": {
765 | "text/plain": [
766 | "0.0"
767 | ]
768 | },
769 | "execution_count": 22,
770 | "metadata": {},
771 | "output_type": "execute_result"
772 | }
773 | ],
774 | "source": [
775 | "bc.evaluate(cvModel.transform(test_df))"
776 | ]
777 | },
778 | {
779 | "cell_type": "code",
780 | "execution_count": null,
781 | "metadata": {
782 | "collapsed": true
783 | },
784 | "outputs": [],
785 | "source": []
786 | }
787 | ],
788 | "metadata": {
789 | "kernelspec": {
790 | "display_name": "Python 3",
791 | "language": "python",
792 | "name": "python3"
793 | },
794 | "language_info": {
795 | "codemirror_mode": {
796 | "name": "ipython",
797 | "version": 3
798 | },
799 | "file_extension": ".py",
800 | "mimetype": "text/x-python",
801 | "name": "python",
802 | "nbconvert_exporter": "python",
803 | "pygments_lexer": "ipython3",
804 | "version": "3.6.3"
805 | }
806 | },
807 | "nbformat": 4,
808 | "nbformat_minor": 2
809 | }
810 |
--------------------------------------------------------------------------------
/docker-project-boilerplate/.dockerignore:
--------------------------------------------------------------------------------
1 | *
2 | !/src
3 | !/notebooks
4 | !requirements.txt
--------------------------------------------------------------------------------
/docker-project-boilerplate/Dockerfile:
--------------------------------------------------------------------------------
1 | # Copyright 2018 Miquido. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ============================================================================
15 | #
16 | # Ubuntu-based, CPU-only environment for datascience / machine-learning usage
17 | # based on https://github.com/tensorflow/tensorflow/tree/master/tensorflow/tools/dockerfiles/dockerfiles
18 | #
19 |
20 | ARG UBUNTU_VERSION=16.04
21 | FROM ubuntu:${UBUNTU_VERSION}
22 |
23 | RUN apt-get update && apt-get install -y --no-install-recommends \
24 | build-essential \
25 | curl \
26 | libcurl3-dev \
27 | libfreetype6-dev \
28 | libhdf5-serial-dev \
29 | libpng12-dev \
30 | libzmq3-dev \
31 | pkg-config \
32 | python-dev \
33 | rsync \
34 | software-properties-common \
35 | unzip \
36 | zip \
37 | zlib1g-dev \
38 | openjdk-8-jdk \
39 | openjdk-8-jre-headless \
40 | && \
41 | apt-get clean && \
42 | rm -rf /var/lib/apt/lists/*
43 |
44 | RUN add-apt-repository ppa:jonathonf/python-3.6
45 | RUN apt-get update
46 | RUN apt-get install -y build-essential python3.6 python3.6-dev python3-pip python3.6-venv \
47 | python3-pip
48 |
49 |
50 | # setup python3.6 as default choice for command python3
51 | RUN rm /usr/bin/python3 && \
52 | ln -s /usr/bin/python3.6 /usr/bin/python3
53 |
54 | RUN pip3 install --upgrade \
55 | pip \
56 | setuptools \
57 | scikit-learn
58 |
59 |
60 | RUN apt-get update && apt-get install -y \
61 | build-essential \
62 | curl \
63 | openjdk-8-jdk \
64 | swig
65 |
66 | # Install bazel
67 | RUN echo "deb [arch=amd64] http://storage.googleapis.com/bazel-apt stable jdk1.8" | tee /etc/apt/sources.list.d/bazel.list && \
68 | curl https://bazel.build/bazel-release.pub.gpg | apt-key add - && \
69 | apt-get update && \
70 | apt-get install -y bazel
71 |
72 |
73 | COPY ./requirements.txt requirements.txt
74 | RUN pip install -r requirements.txt
75 |
76 | RUN mkdir /notebooks && chmod a+rwx /notebooks
77 | RUN mkdir /.local && chmod a+rwx /.local
78 |
79 |
80 | WORKDIR /root
81 | #expose ports for jupter and tensorboard
82 | EXPOSE 8888 6006
83 |
84 | CMD ["bash"]
--------------------------------------------------------------------------------
/docker-project-boilerplate/README.md:
--------------------------------------------------------------------------------
1 | # Boilerplate for datascience projects using docker.
2 |
3 | This project basic setup is based on:
4 | - [All-in-one Docker image for Deep Learning](https://github.com/floydhub/dl-docker)
5 | - [Tensorflow docker files](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/tools/dockerfiles/dockerfiles)
6 |
7 | ----
8 | #### Building image:
9 | 0. Install Docker & start docker daemon
10 | 1. Build image: ```docker build -f Dockerfile ./ -t 'ds-default:cpu' ```
11 |
12 |
13 | #### Run container from build image
14 | ```docker run -it -p 8888:8888 -v $(pwd):/root/ ds-default:cpu ```
15 |
16 | | Parameter | Explanation |
17 | |----------------|-------------|
18 | |`-it` | This creates an interactive terminal you can use to interact with your container |
19 | |`-p 8888:8888 ` | This exposes the ports inside the container so they can be accessed from the host. The default iPython Notebook runs on port 8888 and Tensorboard on 6006 |
20 | |`-v $(pwd):/root/` | This shares the whole project root folder `$(pwd)` on your host machine to `/root` inside your container. Any data written to this folder by the container will be persistent.
21 | |`ds-default:cpu` | This is the image that you want to run. The format is `image:tag`. In our case, we use the image `ds-default` with the tag `cpu` |
22 |
23 |
24 |
25 | ---
26 |
27 |
28 | #### Running jupyter notebook backend inside container - develop locally in browser:
29 | 1. inside container ```jupyter notebook --notebook-dir=notebooks --ip 0.0.0.0 --no-browser --allow-root```
30 | 2. enter [http://localhost:8888/](http://localhost:8888/) or logged in console link with token
31 |
32 |
33 | ----
34 |
35 | #### Useful commands:
36 |
37 | | command | effect |
38 | |-------------|--------|
39 | |```docker ps``` | list all running containers, add flag ```-a``` to list all containers|
40 | |```docker rm $(docker ps -a -q)``` | remove all containers|
41 | | ```docker rmi $(docker images -q)```| remove all images|
42 | |``` docker exec -it <container_name> bash ```| enter running container|
43 |
44 |
45 |
--------------------------------------------------------------------------------
/docker-project-boilerplate/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: '2'
2 |
3 | services:
4 | app:
5 | build: .
6 | ports:
7 | - "8888:8888"
8 | - "6006:6006"
9 | volumes:
10 | - .:/root
11 |
--------------------------------------------------------------------------------
/docker-project-boilerplate/requirements.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/miquido/DataScience/4db881a7bd0092f0679c22b8f9b3c11e1f8a445b/docker-project-boilerplate/requirements.txt
--------------------------------------------------------------------------------
/python + API/Python + REST API.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Using \"requests\" package:"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "See also: http://docs.python-requests.org/en/master/user/quickstart/"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 1,
20 | "metadata": {
21 | "collapsed": true
22 | },
23 | "outputs": [],
24 | "source": [
25 | "import requests\n",
26 | "url = 'https://httpbin.org/anything'\n",
27 | "data = '''{\n",
28 | " \"query\": {\n",
29 | " \"bool\": {\n",
30 | " \"must\": [\n",
31 | " {\n",
32 | " \"text\": {\n",
33 | " \"record.document\": \"SOME_JOURNAL\"\n",
34 | " }\n",
35 | " },\n",
36 | " {\n",
37 | " \"text\": {\n",
38 | " \"record.articleTitle\": \"farmers\"\n",
39 | " }\n",
40 | " }\n",
41 | " ],\n",
42 | " \"must_not\": [],\n",
43 | " \"should\": []\n",
44 | " }\n",
45 | " },\n",
46 | " \"from\": 0,\n",
47 | " \"size\": 50,\n",
48 | " \"sort\": [],\n",
49 | " \"facets\": {}\n",
50 | "}'''\n",
51 | "response = requests.post(url, data=data)"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": 3,
57 | "metadata": {},
58 | "outputs": [
59 | {
60 | "data": {
61 | "text/plain": [
62 | "{'args': {},\n",
63 | " 'data': '{\\n \"query\": {\\n \"bool\": {\\n \"must\": [\\n {\\n \"text\": {\\n \"record.document\": \"SOME_JOURNAL\"\\n }\\n },\\n {\\n \"text\": {\\n \"record.articleTitle\": \"farmers\"\\n }\\n }\\n ],\\n \"must_not\": [],\\n \"should\": []\\n }\\n },\\n \"from\": 0,\\n \"size\": 50,\\n \"sort\": [],\\n \"facets\": {}\\n}',\n",
64 | " 'files': {},\n",
65 | " 'form': {},\n",
66 | " 'headers': {'Accept': '*/*',\n",
67 | " 'Accept-Encoding': 'gzip, deflate',\n",
68 | " 'Connection': 'close',\n",
69 | " 'Content-Length': '359',\n",
70 | " 'Host': 'httpbin.org',\n",
71 | " 'User-Agent': 'python-requests/2.18.4'},\n",
72 | " 'json': {'facets': {},\n",
73 | " 'from': 0,\n",
74 | " 'query': {'bool': {'must': [{'text': {'record.document': 'SOME_JOURNAL'}},\n",
75 | " {'text': {'record.articleTitle': 'farmers'}}],\n",
76 | " 'must_not': [],\n",
77 | " 'should': []}},\n",
78 | " 'size': 50,\n",
79 | " 'sort': []},\n",
80 | " 'method': 'POST',\n",
81 | " 'origin': '93.180.179.112',\n",
82 | " 'url': 'https://httpbin.org/anything'}"
83 | ]
84 | },
85 | "execution_count": 3,
86 | "metadata": {},
87 | "output_type": "execute_result"
88 | }
89 | ],
90 | "source": [
91 | "response.json()"
92 | ]
93 | },
94 | {
95 | "cell_type": "code",
96 | "execution_count": 9,
97 | "metadata": {},
98 | "outputs": [
99 | {
100 | "data": {
101 | "text/plain": [
102 | "50"
103 | ]
104 | },
105 | "execution_count": 9,
106 | "metadata": {},
107 | "output_type": "execute_result"
108 | }
109 | ],
110 | "source": [
111 | "#Data could be loaded to object\n",
112 | "import json\n",
113 | "x = json.loads(response.json()['data'])\n",
114 | "x['size']"
115 | ]
116 | },
117 | {
118 | "cell_type": "code",
119 | "execution_count": 12,
120 | "metadata": {},
121 | "outputs": [
122 | {
123 | "data": {
124 | "text/plain": [
125 | "{'args': {},\n",
126 | " 'data': '',\n",
127 | " 'files': {},\n",
128 | " 'form': {},\n",
129 | " 'headers': {'Accept': '*/*',\n",
130 | " 'Accept-Encoding': 'gzip, deflate',\n",
131 | " 'Connection': 'close',\n",
132 | " 'Host': 'httpbin.org',\n",
133 | " 'User-Agent': 'my-app/0.0.1'},\n",
134 | " 'json': None,\n",
135 | " 'method': 'GET',\n",
136 | " 'origin': '93.180.179.112',\n",
137 | " 'url': 'https://httpbin.org/anything/endpoint'}"
138 | ]
139 | },
140 | "execution_count": 12,
141 | "metadata": {},
142 | "output_type": "execute_result"
143 | }
144 | ],
145 | "source": [
146 | "url = 'https://httpbin.org/anything/endpoint'\n",
147 | "headers = {'user-agent': 'my-app/0.0.1'}\n",
148 | "\n",
149 | "response = requests.get(url, headers=headers)\n",
150 | "response.json()"
151 | ]
152 | },
153 | {
154 | "cell_type": "code",
155 | "execution_count": null,
156 | "metadata": {
157 | "collapsed": true
158 | },
159 | "outputs": [],
160 | "source": []
161 | }
162 | ],
163 | "metadata": {
164 | "kernelspec": {
165 | "display_name": "Python 3",
166 | "language": "python",
167 | "name": "python3"
168 | },
169 | "language_info": {
170 | "codemirror_mode": {
171 | "name": "ipython",
172 | "version": 3
173 | },
174 | "file_extension": ".py",
175 | "mimetype": "text/x-python",
176 | "name": "python",
177 | "nbconvert_exporter": "python",
178 | "pygments_lexer": "ipython3",
179 | "version": "3.6.3"
180 | }
181 | },
182 | "nbformat": 4,
183 | "nbformat_minor": 2
184 | }
185 |
--------------------------------------------------------------------------------
/sagemaker-training-template/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM microsoft/mmlspark
2 |
3 | RUN pip install --upgrade pip
4 |
5 | COPY requirements.txt /requirements.txt
6 | RUN pip install -r /requirements.txt
7 |
8 | ENV PYTHONUNBUFFERED=TRUE
9 | ENV PYTHONDONTWRITEBYTECODE=TRUE
10 | ENV PATH="/opt/program:${PATH}"
11 |
12 | ADD . /opt/program
13 | WORKDIR /opt/program
14 |
15 | USER root
16 |
17 | ENTRYPOINT ["python", "main.py"]
18 |
--------------------------------------------------------------------------------
/sagemaker-training-template/main.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | import json
4 | import pickle
5 | import sys
6 | import traceback
7 | import joblib
8 |
9 | import pandas as pd
10 |
11 | from sklearn.linear_model import LogisticRegression
12 | from sklearn.metrics import f1_score
13 |
14 | output_path = '/opt/ml/output'
15 | model_path = '/opt/ml/model'
16 | hyperparameters_path = '/opt/ml/input/config/hyperparameters.json'
17 | training_path = '/opt/ml/input/data/train'
18 |
19 |
def train():
    """Run a SageMaker training job.

    Reads hyperparameters and CSV training data from the standard SageMaker
    container paths (module-level constants), fits a LogisticRegression on
    all columns but the last (the label column), prints the training F1
    score, and pickles the fitted model to the model directory.

    On any failure the traceback is printed to stderr (which SageMaker
    forwards to CloudWatch) and the process exits with status 1 so the
    training job is marked as failed.
    """
    print('Train job started')
    try:
        # Read in any hyperparameters that the user passed with the training job
        with open(hyperparameters_path, 'r') as tc:
            trainingParams = json.load(tc)

        input_files = [os.path.join(training_path, file) for file in os.listdir(training_path)]
        if not input_files:
            # Fail loudly instead of letting pd.concat raise a cryptic error.
            raise ValueError('No input files found in ' + training_path)
        raw_data = [pd.read_csv(file, header=0) for file in input_files]
        train_data = pd.concat(raw_data)

        # Labels are in the last column. Use .iloc: the .ix indexer used
        # previously was deprecated in pandas 0.20 and removed in 1.0.
        train_X = train_data.iloc[:, :-1]
        # Select the label column as a 1-D Series so sklearn does not warn
        # about being passed a column vector.
        train_y = train_data.iloc[:, -1]

        # Here we only support a single hyperparameter. Note that hyperparameters
        # are always passed in as strings, so we need to do any necessary conversions.
        penalty = trainingParams.get('penalty', 'l2')

        clf = LogisticRegression(penalty=penalty)
        clf = clf.fit(train_X, train_y)
        # NOTE(review): f1_score defaults to binary averaging — assumes a
        # two-class label column; confirm for multiclass data.
        f1 = f1_score(train_y, clf.predict(train_X))
        print("F1 = " + str(f1) + ";")

        # Save the fitted model where SageMaker expects to find it.
        with open(os.path.join(model_path, 'model.pkl'), 'wb') as out:
            pickle.dump(clf, out)
        print('Train job finished')
    except Exception as exc:
        trace = traceback.format_exc()
        # This should store error logs in CloudWatch
        print('Exception: ' + str(exc) + '\n' + trace, file=sys.stderr)
        sys.exit(1)
53 |
54 |
if __name__ == '__main__':
    # SageMaker invokes the training container with "train" as the first
    # argument. Guard the argv access: invoking the script with no argument
    # previously raised IndexError instead of exiting cleanly. Any other
    # invocation exits with status 1 so the job is marked as failed.
    if len(sys.argv) > 1 and sys.argv[1] == 'train':
        train()
        sys.exit(0)
    sys.exit(1)
61 |
--------------------------------------------------------------------------------
/sagemaker-training-template/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy==1.14.2
2 | pandas==0.23.1
3 | scikit-learn==0.19.1
4 | scipy==1.0.1
5 | pyspark-dist-explore==0.1.4
6 | joblib==0.12.4
--------------------------------------------------------------------------------
/sagemaker-tutorials/Digits.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Digits prediction\n"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "from sklearn import datasets\n",
17 | "digits = datasets.load_digits()"
18 | ]
19 | },
20 | {
21 | "cell_type": "code",
22 | "execution_count": 2,
23 | "metadata": {},
24 | "outputs": [],
25 | "source": [
26 | "from sklearn.model_selection import train_test_split\n",
27 | "X_train, X_test, y_train, y_test = train_test_split(digits.data[:-1], digits.target[:-1], test_size=0.33, random_state=42)"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": 3,
33 | "metadata": {},
34 | "outputs": [
35 | {
36 | "data": {
37 | "text/plain": [
38 | "SVC(C=100.0, cache_size=200, class_weight=None, coef0=0.0,\n",
39 | " decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',\n",
40 | " max_iter=-1, probability=False, random_state=None, shrinking=True,\n",
41 | " tol=0.001, verbose=False)"
42 | ]
43 | },
44 | "execution_count": 3,
45 | "metadata": {},
46 | "output_type": "execute_result"
47 | }
48 | ],
49 | "source": [
50 | "from sklearn import svm\n",
51 | "clf = svm.SVC(gamma=0.001, C=100.)\n",
52 | "clf.fit(X_train, y_train)"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": 4,
58 | "metadata": {},
59 | "outputs": [],
60 | "source": [
61 | "y_pred = clf.predict(X_test)"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": 5,
67 | "metadata": {},
68 | "outputs": [
69 | {
70 | "data": {
71 | "text/plain": [
72 | "0.9881956155143339"
73 | ]
74 | },
75 | "execution_count": 5,
76 | "metadata": {},
77 | "output_type": "execute_result"
78 | }
79 | ],
80 | "source": [
81 | "from sklearn.metrics import accuracy_score\n",
82 | "accuracy_score(y_test, y_pred)"
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": 6,
88 | "metadata": {},
89 | "outputs": [
90 | {
91 | "name": "stdout",
92 | "output_type": "stream",
93 | "text": [
94 | "Automatically created module for IPython interactive environment\n",
95 | "Accuracy (train) for L1 logistic: 82.7% \n",
96 | "Accuracy (train) for L2 logistic (Multinomial): 82.7% \n",
97 | "Accuracy (train) for L2 logistic (OvR): 80.0% \n",
98 | "Accuracy (train) for Linear SVC: 82.0% \n",
99 | "Accuracy (train) for GPC: 82.7% \n"
100 | ]
101 | },
102 | {
103 | "data": {
104 | "text/plain": [
105 | ""
106 | ]
107 | },
108 | "metadata": {},
109 | "output_type": "display_data"
110 | }
111 | ],
112 | "source": [
113 | "print(__doc__)\n",
114 | "\n",
115 | "# Author: Alexandre Gramfort \n",
116 | "# License: BSD 3 clause\n",
117 | " \n",
118 | "import matplotlib.pyplot as plt\n",
119 | "import numpy as np\n",
120 | "\n",
121 | "from sklearn.metrics import accuracy_score\n",
122 | "from sklearn.linear_model import LogisticRegression\n",
123 | "from sklearn.svm import SVC\n",
124 | "from sklearn.gaussian_process import GaussianProcessClassifier\n",
125 | "from sklearn.gaussian_process.kernels import RBF\n",
126 | "from sklearn import datasets\n",
127 | "\n",
128 | "iris = datasets.load_iris()\n",
129 | "X = iris.data[:, 0:2] # we only take the first two features for visualization\n",
130 | "y = iris.target\n",
131 | "\n",
132 | "n_features = X.shape[1]\n",
133 | "\n",
134 | "C = 10\n",
135 | "kernel = 1.0 * RBF([1.0, 1.0]) # for GPC\n",
136 | "\n",
137 | "# Create different classifiers.\n",
138 | "classifiers = {\n",
139 | " 'L1 logistic': LogisticRegression(C=C, penalty='l1',\n",
140 | " solver='saga',\n",
141 | " multi_class='multinomial',\n",
142 | " max_iter=10000),\n",
143 | " 'L2 logistic (Multinomial)': LogisticRegression(C=C, penalty='l2',\n",
144 | " solver='saga',\n",
145 | " multi_class='multinomial',\n",
146 | " max_iter=10000),\n",
147 | " 'L2 logistic (OvR)': LogisticRegression(C=C, penalty='l2',\n",
148 | " solver='saga',\n",
149 | " multi_class='ovr',\n",
150 | " max_iter=10000),\n",
151 | " 'Linear SVC': SVC(kernel='linear', C=C, probability=True,\n",
152 | " random_state=0),\n",
153 | " 'GPC': GaussianProcessClassifier(kernel)\n",
154 | "}\n",
155 | "\n",
156 | "n_classifiers = len(classifiers)\n",
157 | "\n",
158 | "plt.figure(figsize=(3 * 2, n_classifiers * 2))\n",
159 | "plt.subplots_adjust(bottom=.2, top=.95)\n",
160 | "\n",
161 | "xx = np.linspace(3, 9, 100)\n",
162 | "yy = np.linspace(1, 5, 100).T\n",
163 | "xx, yy = np.meshgrid(xx, yy)\n",
164 | "Xfull = np.c_[xx.ravel(), yy.ravel()]\n",
165 | "\n",
166 | "for index, (name, classifier) in enumerate(classifiers.items()):\n",
167 | " classifier.fit(X, y)\n",
168 | "\n",
169 | " y_pred = classifier.predict(X)\n",
170 | " accuracy = accuracy_score(y, y_pred)\n",
171 | " print(\"Accuracy (train) for %s: %0.1f%% \" % (name, accuracy * 100))\n",
172 | "\n",
173 | " # View probabilities:\n",
174 | " probas = classifier.predict_proba(Xfull)\n",
175 | " n_classes = np.unique(y_pred).size\n",
176 | " for k in range(n_classes):\n",
177 | " plt.subplot(n_classifiers, n_classes, index * n_classes + k + 1)\n",
178 | " plt.title(\"Class %d\" % k)\n",
179 | " if k == 0:\n",
180 | " plt.ylabel(name)\n",
181 | " imshow_handle = plt.imshow(probas[:, k].reshape((100, 100)),\n",
182 | " extent=(3, 9, 1, 5), origin='lower')\n",
183 | " plt.xticks(())\n",
184 | " plt.yticks(())\n",
185 | " idx = (y_pred == k)\n",
186 | " if idx.any():\n",
187 | " plt.scatter(X[idx, 0], X[idx, 1], marker='o', c='w', edgecolor='k')\n",
188 | "\n",
189 | "ax = plt.axes([0.15, 0.04, 0.7, 0.05])\n",
190 | "plt.title(\"Probability\")\n",
191 | "plt.colorbar(imshow_handle, cax=ax, orientation='horizontal')\n",
192 | "\n",
193 | "plt.show()"
194 | ]
195 | },
196 | {
197 | "cell_type": "code",
198 | "execution_count": null,
199 | "metadata": {},
200 | "outputs": [],
201 | "source": []
202 | }
203 | ],
204 | "metadata": {
205 | "kernelspec": {
206 | "display_name": "conda_python3",
207 | "language": "python",
208 | "name": "conda_python3"
209 | },
210 | "language_info": {
211 | "codemirror_mode": {
212 | "name": "ipython",
213 | "version": 3
214 | },
215 | "file_extension": ".py",
216 | "mimetype": "text/x-python",
217 | "name": "python",
218 | "nbconvert_exporter": "python",
219 | "pygments_lexer": "ipython3",
220 | "version": "3.6.5"
221 | }
222 | },
223 | "nbformat": 4,
224 | "nbformat_minor": 2
225 | }
226 |
--------------------------------------------------------------------------------
/sagemaker-tutorials/s3+pandas.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pandas as pd"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 2,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "bucket = 'sagemaker-ds-bucket'\n",
19 | "data_key = 'data/cars2.csv'\n",
20 | "data_location = 's3://{}/{}'.format(bucket, data_key)"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": 3,
26 | "metadata": {},
27 | "outputs": [],
28 | "source": [
29 | "df = pd.read_csv(data_location)"
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": 4,
35 | "metadata": {},
36 | "outputs": [
37 | {
38 | "data": {
39 | "text/html": [
40 | "\n",
41 | "\n",
54 | "
\n",
55 | " \n",
56 | " \n",
57 | " \n",
58 | " mpg \n",
59 | " cylinders \n",
60 | " displacement \n",
61 | " horsepower \n",
62 | " weight \n",
63 | " acceleration \n",
64 | " model \n",
65 | " origin \n",
66 | " car \n",
67 | " \n",
68 | " \n",
69 | " \n",
70 | " \n",
71 | " 0 \n",
72 | " 33.0 \n",
73 | " 4 \n",
74 | " 91 \n",
75 | " 53 \n",
76 | " 1795 \n",
77 | " 17.4 \n",
78 | " 76 \n",
79 | " 3 \n",
80 | " honda civic \n",
81 | " \n",
82 | " \n",
83 | " 1 \n",
84 | " 20.0 \n",
85 | " 6 \n",
86 | " 225 \n",
87 | " 100 \n",
88 | " 3651 \n",
89 | " 17.7 \n",
90 | " 76 \n",
91 | " 1 \n",
92 | " dodge aspen se \n",
93 | " \n",
94 | " \n",
95 | " 2 \n",
96 | " 18.0 \n",
97 | " 6 \n",
98 | " 250 \n",
99 | " 78 \n",
100 | " 3574 \n",
101 | " 21.0 \n",
102 | " 76 \n",
103 | " 1 \n",
104 | " ford granada ghia \n",
105 | " \n",
106 | " \n",
107 | " 3 \n",
108 | " 18.5 \n",
109 | " 6 \n",
110 | " 250 \n",
111 | " 110 \n",
112 | " 3645 \n",
113 | " 16.2 \n",
114 | " 76 \n",
115 | " 1 \n",
116 | " pontiac ventura sj \n",
117 | " \n",
118 | " \n",
119 | " 4 \n",
120 | " 17.5 \n",
121 | " 6 \n",
122 | " 258 \n",
123 | " 95 \n",
124 | " 3193 \n",
125 | " 17.8 \n",
126 | " 76 \n",
127 | " 1 \n",
128 | " amc pacer d/l \n",
129 | " \n",
130 | " \n",
131 | "
\n",
132 | "
"
133 | ],
134 | "text/plain": [
135 | " mpg cylinders displacement horsepower weight acceleration model \\\n",
136 | "0 33.0 4 91 53 1795 17.4 76 \n",
137 | "1 20.0 6 225 100 3651 17.7 76 \n",
138 | "2 18.0 6 250 78 3574 21.0 76 \n",
139 | "3 18.5 6 250 110 3645 16.2 76 \n",
140 | "4 17.5 6 258 95 3193 17.8 76 \n",
141 | "\n",
142 | " origin car \n",
143 | "0 3 honda civic \n",
144 | "1 1 dodge aspen se \n",
145 | "2 1 ford granada ghia \n",
146 | "3 1 pontiac ventura sj \n",
147 | "4 1 amc pacer d/l "
148 | ]
149 | },
150 | "execution_count": 4,
151 | "metadata": {},
152 | "output_type": "execute_result"
153 | }
154 | ],
155 | "source": [
156 | "df.head()"
157 | ]
158 | },
159 | {
160 | "cell_type": "code",
161 | "execution_count": null,
162 | "metadata": {},
163 | "outputs": [],
164 | "source": []
165 | }
166 | ],
167 | "metadata": {
168 | "kernelspec": {
169 | "display_name": "conda_python3",
170 | "language": "python",
171 | "name": "conda_python3"
172 | },
173 | "language_info": {
174 | "codemirror_mode": {
175 | "name": "ipython",
176 | "version": 3
177 | },
178 | "file_extension": ".py",
179 | "mimetype": "text/x-python",
180 | "name": "python",
181 | "nbconvert_exporter": "python",
182 | "pygments_lexer": "ipython3",
183 | "version": "3.6.5"
184 | }
185 | },
186 | "nbformat": 4,
187 | "nbformat_minor": 2
188 | }
189 |
--------------------------------------------------------------------------------
/sagemaker-tutorials/s3+spark.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import os\n",
10 | "os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.hadoop:hadoop-aws:2.7.2 pyspark-shell'"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 2,
16 | "metadata": {},
17 | "outputs": [],
18 | "source": [
19 | "from pyspark.context import SparkContext\n",
20 | "from pyspark.sql.session import SparkSession\n",
21 | "\n",
22 | "sc = SparkContext('local')\n",
23 | "spark = SparkSession(sc)"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": 3,
29 | "metadata": {},
30 | "outputs": [],
31 | "source": [
32 | "bucket = 'sagemaker-ds-bucket'\n",
33 | "data_key = 'data/cars2.csv'\n",
34 | "data_location = 's3a://{}/{}'.format(bucket, data_key)"
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": 4,
40 | "metadata": {},
41 | "outputs": [],
42 | "source": [
43 | "df = spark.read.csv(data_location, header=True)"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": 5,
49 | "metadata": {},
50 | "outputs": [
51 | {
52 | "data": {
53 | "text/html": [
54 | "\n",
55 | "\n",
68 | "
\n",
69 | " \n",
70 | " \n",
71 | " \n",
72 | " mpg \n",
73 | " cylinders \n",
74 | " displacement \n",
75 | " horsepower \n",
76 | " weight \n",
77 | " acceleration \n",
78 | " model \n",
79 | " origin \n",
80 | " car \n",
81 | " \n",
82 | " \n",
83 | " \n",
84 | " \n",
85 | " 0 \n",
86 | " 33.0 \n",
87 | " 4 \n",
88 | " 91 \n",
89 | " 53 \n",
90 | " 1795 \n",
91 | " 17.4 \n",
92 | " 76 \n",
93 | " 3 \n",
94 | " honda civic \n",
95 | " \n",
96 | " \n",
97 | " 1 \n",
98 | " 20.0 \n",
99 | " 6 \n",
100 | " 225 \n",
101 | " 100 \n",
102 | " 3651 \n",
103 | " 17.7 \n",
104 | " 76 \n",
105 | " 1 \n",
106 | " dodge aspen se \n",
107 | " \n",
108 | " \n",
109 | " 2 \n",
110 | " 18.0 \n",
111 | " 6 \n",
112 | " 250 \n",
113 | " 78 \n",
114 | " 3574 \n",
115 | " 21.0 \n",
116 | " 76 \n",
117 | " 1 \n",
118 | " ford granada ghia \n",
119 | " \n",
120 | " \n",
121 | " 3 \n",
122 | " 18.5 \n",
123 | " 6 \n",
124 | " 250 \n",
125 | " 110 \n",
126 | " 3645 \n",
127 | " 16.2 \n",
128 | " 76 \n",
129 | " 1 \n",
130 | " pontiac ventura sj \n",
131 | " \n",
132 | " \n",
133 | " 4 \n",
134 | " 17.5 \n",
135 | " 6 \n",
136 | " 258 \n",
137 | " 95 \n",
138 | " 3193 \n",
139 | " 17.8 \n",
140 | " 76 \n",
141 | " 1 \n",
142 | " amc pacer d/l \n",
143 | " \n",
144 | " \n",
145 | "
\n",
146 | "
"
147 | ],
148 | "text/plain": [
149 | " mpg cylinders displacement horsepower weight acceleration model origin \\\n",
150 | "0 33.0 4 91 53 1795 17.4 76 3 \n",
151 | "1 20.0 6 225 100 3651 17.7 76 1 \n",
152 | "2 18.0 6 250 78 3574 21.0 76 1 \n",
153 | "3 18.5 6 250 110 3645 16.2 76 1 \n",
154 | "4 17.5 6 258 95 3193 17.8 76 1 \n",
155 | "\n",
156 | " car \n",
157 | "0 honda civic \n",
158 | "1 dodge aspen se \n",
159 | "2 ford granada ghia \n",
160 | "3 pontiac ventura sj \n",
161 | "4 amc pacer d/l "
162 | ]
163 | },
164 | "execution_count": 5,
165 | "metadata": {},
166 | "output_type": "execute_result"
167 | }
168 | ],
169 | "source": [
170 | "df.toPandas().head()"
171 | ]
172 | },
173 | {
174 | "cell_type": "code",
175 | "execution_count": null,
176 | "metadata": {},
177 | "outputs": [],
178 | "source": []
179 | }
180 | ],
181 | "metadata": {
182 | "kernelspec": {
183 | "display_name": "conda_python3",
184 | "language": "python",
185 | "name": "conda_python3"
186 | },
187 | "language_info": {
188 | "codemirror_mode": {
189 | "name": "ipython",
190 | "version": 3
191 | },
192 | "file_extension": ".py",
193 | "mimetype": "text/x-python",
194 | "name": "python",
195 | "nbconvert_exporter": "python",
196 | "pygments_lexer": "ipython3",
197 | "version": "3.6.5"
198 | }
199 | },
200 | "nbformat": 4,
201 | "nbformat_minor": 2
202 | }
203 |
--------------------------------------------------------------------------------
/spark-sql/Time between specified events.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "from pyspark.context import SparkContext\n",
12 | "from pyspark.sql.session import SparkSession\n",
13 | "\n",
14 | "sc = SparkContext('local')\n",
15 | "spark = SparkSession(sc)"
16 | ]
17 | },
18 | {
19 | "cell_type": "code",
20 | "execution_count": 18,
21 | "metadata": {
22 | "collapsed": true
23 | },
24 | "outputs": [],
25 | "source": [
26 | "df = spark.read.csv(\"time_between_events_test.csv\", header=True)"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 19,
32 | "metadata": {},
33 | "outputs": [
34 | {
35 | "data": {
36 | "text/html": [
37 | "\n",
38 | "\n",
51 | "
\n",
52 | " \n",
53 | " \n",
54 | " \n",
55 | " userId \n",
56 | " sessionId \n",
57 | " playlist_id \n",
58 | " event_name \n",
59 | " timestamp \n",
60 | " \n",
61 | " \n",
62 | " \n",
63 | " \n",
64 | " 0 \n",
65 | " 1 \n",
66 | " 1 \n",
67 | " A \n",
68 | " start \n",
69 | " 1 \n",
70 | " \n",
71 | " \n",
72 | " 1 \n",
73 | " 1 \n",
74 | " 1 \n",
75 | " None \n",
76 | " another \n",
77 | " 2 \n",
78 | " \n",
79 | " \n",
80 | " 2 \n",
81 | " 1 \n",
82 | " 1 \n",
83 | " A \n",
84 | " stop \n",
85 | " 3 \n",
86 | " \n",
87 | " \n",
88 | " 3 \n",
89 | " 1 \n",
90 | " 1 \n",
91 | " B \n",
92 | " start \n",
93 | " 4 \n",
94 | " \n",
95 | " \n",
96 | " 4 \n",
97 | " 1 \n",
98 | " 1 \n",
99 | " None \n",
100 | " another \n",
101 | " 5 \n",
102 | " \n",
103 | " \n",
104 | " 5 \n",
105 | " 1 \n",
106 | " 1 \n",
107 | " C \n",
108 | " start \n",
109 | " 6 \n",
110 | " \n",
111 | " \n",
112 | " 6 \n",
113 | " 1 \n",
114 | " 1 \n",
115 | " C \n",
116 | " stop \n",
117 | " 7 \n",
118 | " \n",
119 | " \n",
120 | " 7 \n",
121 | " 1 \n",
122 | " 1 \n",
123 | " C \n",
124 | " start \n",
125 | " 8 \n",
126 | " \n",
127 | " \n",
128 | " 8 \n",
129 | " 1 \n",
130 | " 1 \n",
131 | " C \n",
132 | " stop \n",
133 | " 9 \n",
134 | " \n",
135 | " \n",
136 | "
\n",
137 | "
"
138 | ],
139 | "text/plain": [
140 | " userId sessionId playlist_id event_name timestamp\n",
141 | "0 1 1 A start 1\n",
142 | "1 1 1 None another 2\n",
143 | "2 1 1 A stop 3\n",
144 | "3 1 1 B start 4\n",
145 | "4 1 1 None another 5\n",
146 | "5 1 1 C start 6\n",
147 | "6 1 1 C stop 7\n",
148 | "7 1 1 C start 8\n",
149 | "8 1 1 C stop 9"
150 | ]
151 | },
152 | "execution_count": 19,
153 | "metadata": {},
154 | "output_type": "execute_result"
155 | }
156 | ],
157 | "source": [
158 | "df.toPandas()"
159 | ]
160 | },
161 | {
162 | "cell_type": "code",
163 | "execution_count": 20,
164 | "metadata": {
165 | "collapsed": true
166 | },
167 | "outputs": [],
168 | "source": [
169 | "\n",
170 | "from pyspark.sql import Window\n",
171 | "import time\n",
172 | "from pyspark.sql.types import StringType\n",
173 | "import pyspark.sql.functions as F\n",
174 | "from pyspark.sql.types import TimestampType, LongType"
175 | ]
176 | },
177 | {
178 | "cell_type": "code",
179 | "execution_count": 21,
180 | "metadata": {
181 | "collapsed": true
182 | },
183 | "outputs": [],
184 | "source": [
185 | "#Window.partitionBy(partitioning_column).orderBy(ordering_column).rangeBetween(Window.unboundedPreceding,last_row)"
186 | ]
187 | },
188 | {
189 | "cell_type": "code",
190 | "execution_count": 22,
191 | "metadata": {
192 | "collapsed": true
193 | },
194 | "outputs": [],
195 | "source": [
196 | "df = df.filter(df['event_name'].isin(['start', 'stop']))"
197 | ]
198 | },
199 | {
200 | "cell_type": "code",
201 | "execution_count": 23,
202 | "metadata": {},
203 | "outputs": [
204 | {
205 | "data": {
206 | "text/html": [
207 | "\n",
208 | "\n",
221 | "
\n",
222 | " \n",
223 | " \n",
224 | " \n",
225 | " userId \n",
226 | " sessionId \n",
227 | " playlist_id \n",
228 | " event_name \n",
229 | " timestamp \n",
230 | " \n",
231 | " \n",
232 | " \n",
233 | " \n",
234 | " 0 \n",
235 | " 1 \n",
236 | " 1 \n",
237 | " A \n",
238 | " start \n",
239 | " 1 \n",
240 | " \n",
241 | " \n",
242 | " 1 \n",
243 | " 1 \n",
244 | " 1 \n",
245 | " A \n",
246 | " stop \n",
247 | " 3 \n",
248 | " \n",
249 | " \n",
250 | " 2 \n",
251 | " 1 \n",
252 | " 1 \n",
253 | " B \n",
254 | " start \n",
255 | " 4 \n",
256 | " \n",
257 | " \n",
258 | " 3 \n",
259 | " 1 \n",
260 | " 1 \n",
261 | " C \n",
262 | " start \n",
263 | " 6 \n",
264 | " \n",
265 | " \n",
266 | " 4 \n",
267 | " 1 \n",
268 | " 1 \n",
269 | " C \n",
270 | " stop \n",
271 | " 7 \n",
272 | " \n",
273 | " \n",
274 | " 5 \n",
275 | " 1 \n",
276 | " 1 \n",
277 | " C \n",
278 | " start \n",
279 | " 8 \n",
280 | " \n",
281 | " \n",
282 | " 6 \n",
283 | " 1 \n",
284 | " 1 \n",
285 | " C \n",
286 | " stop \n",
287 | " 9 \n",
288 | " \n",
289 | " \n",
290 | "
\n",
291 | "
"
292 | ],
293 | "text/plain": [
294 | " userId sessionId playlist_id event_name timestamp\n",
295 | "0 1 1 A start 1\n",
296 | "1 1 1 A stop 3\n",
297 | "2 1 1 B start 4\n",
298 | "3 1 1 C start 6\n",
299 | "4 1 1 C stop 7\n",
300 | "5 1 1 C start 8\n",
301 | "6 1 1 C stop 9"
302 | ]
303 | },
304 | "execution_count": 23,
305 | "metadata": {},
306 | "output_type": "execute_result"
307 | }
308 | ],
309 | "source": [
310 | "df.toPandas()"
311 | ]
312 | },
313 | {
314 | "cell_type": "code",
315 | "execution_count": 24,
316 | "metadata": {},
317 | "outputs": [
318 | {
319 | "data": {
320 | "text/html": [
321 | "\n",
322 | "\n",
335 | "
\n",
336 | " \n",
337 | " \n",
338 | " \n",
339 | " userId \n",
340 | " sessionId \n",
341 | " playlist_id \n",
342 | " event_name \n",
343 | " timestamp \n",
344 | " event_duration_end \n",
345 | " \n",
346 | " \n",
347 | " \n",
348 | " \n",
349 | " 0 \n",
350 | " 1 \n",
351 | " 1 \n",
352 | " A \n",
353 | " start \n",
354 | " 1 \n",
355 | " 3 \n",
356 | " \n",
357 | " \n",
358 | " 1 \n",
359 | " 1 \n",
360 | " 1 \n",
361 | " A \n",
362 | " stop \n",
363 | " 3 \n",
364 | " 4 \n",
365 | " \n",
366 | " \n",
367 | " 2 \n",
368 | " 1 \n",
369 | " 1 \n",
370 | " B \n",
371 | " start \n",
372 | " 4 \n",
373 | " 6 \n",
374 | " \n",
375 | " \n",
376 | " 3 \n",
377 | " 1 \n",
378 | " 1 \n",
379 | " C \n",
380 | " start \n",
381 | " 6 \n",
382 | " 7 \n",
383 | " \n",
384 | " \n",
385 | " 4 \n",
386 | " 1 \n",
387 | " 1 \n",
388 | " C \n",
389 | " stop \n",
390 | " 7 \n",
391 | " 8 \n",
392 | " \n",
393 | " \n",
394 | " 5 \n",
395 | " 1 \n",
396 | " 1 \n",
397 | " C \n",
398 | " start \n",
399 | " 8 \n",
400 | " 9 \n",
401 | " \n",
402 | " \n",
403 | " 6 \n",
404 | " 1 \n",
405 | " 1 \n",
406 | " C \n",
407 | " stop \n",
408 | " 9 \n",
409 | " None \n",
410 | " \n",
411 | " \n",
412 | "
\n",
413 | "
"
414 | ],
415 | "text/plain": [
416 | " userId sessionId playlist_id event_name timestamp event_duration_end\n",
417 | "0 1 1 A start 1 3\n",
418 | "1 1 1 A stop 3 4\n",
419 | "2 1 1 B start 4 6\n",
420 | "3 1 1 C start 6 7\n",
421 | "4 1 1 C stop 7 8\n",
422 | "5 1 1 C start 8 9\n",
423 | "6 1 1 C stop 9 None"
424 | ]
425 | },
426 | "execution_count": 24,
427 | "metadata": {},
428 | "output_type": "execute_result"
429 | }
430 | ],
431 | "source": [
432 | "win = Window.partitionBy(['userId', 'sessionId']).orderBy('timestamp')\n",
433 | "df = df.withColumn(\"event_duration_end\", F.lead('timestamp').over(win))\n",
434 | "df.toPandas()"
435 | ]
436 | },
437 | {
438 | "cell_type": "code",
439 | "execution_count": 25,
440 | "metadata": {},
441 | "outputs": [
442 | {
443 | "data": {
444 | "text/html": [
445 | "\n",
446 | "\n",
459 | "
\n",
460 | " \n",
461 | " \n",
462 | " \n",
463 | " userId \n",
464 | " sessionId \n",
465 | " playlist_id \n",
466 | " event_name \n",
467 | " timestamp \n",
468 | " event_duration_end \n",
469 | " \n",
470 | " \n",
471 | " \n",
472 | " \n",
473 | " 0 \n",
474 | " 1 \n",
475 | " 1 \n",
476 | " A \n",
477 | " start \n",
478 | " 1 \n",
479 | " 3 \n",
480 | " \n",
481 | " \n",
482 | " 1 \n",
483 | " 1 \n",
484 | " 1 \n",
485 | " B \n",
486 | " start \n",
487 | " 4 \n",
488 | " 6 \n",
489 | " \n",
490 | " \n",
491 | " 2 \n",
492 | " 1 \n",
493 | " 1 \n",
494 | " C \n",
495 | " start \n",
496 | " 6 \n",
497 | " 7 \n",
498 | " \n",
499 | " \n",
500 | " 3 \n",
501 | " 1 \n",
502 | " 1 \n",
503 | " C \n",
504 | " start \n",
505 | " 8 \n",
506 | " 9 \n",
507 | " \n",
508 | " \n",
509 | "
\n",
510 | "
"
511 | ],
512 | "text/plain": [
513 | " userId sessionId playlist_id event_name timestamp event_duration_end\n",
514 | "0 1 1 A start 1 3\n",
515 | "1 1 1 B start 4 6\n",
516 | "2 1 1 C start 6 7\n",
517 | "3 1 1 C start 8 9"
518 | ]
519 | },
520 | "execution_count": 25,
521 | "metadata": {},
522 | "output_type": "execute_result"
523 | }
524 | ],
525 | "source": [
526 | "df = df.filter((F.col(\"event_name\") == 'start') & (F.col('event_duration_end').isNotNull()))\n",
527 | "df.toPandas()"
528 | ]
529 | },
530 | {
531 | "cell_type": "code",
532 | "execution_count": 27,
533 | "metadata": {},
534 | "outputs": [
535 | {
536 | "data": {
537 | "text/html": [
538 | "\n",
539 | "\n",
552 | "
\n",
553 | " \n",
554 | " \n",
555 | " \n",
556 | " userId \n",
557 | " sessionId \n",
558 | " playlist_id \n",
559 | " event_name \n",
560 | " timestamp \n",
561 | " event_duration_end \n",
562 | " playlist_duration \n",
563 | " \n",
564 | " \n",
565 | " \n",
566 | " \n",
567 | " 0 \n",
568 | " 1 \n",
569 | " 1 \n",
570 | " A \n",
571 | " start \n",
572 | " 1 \n",
573 | " 3 \n",
574 | " 2.0 \n",
575 | " \n",
576 | " \n",
577 | " 1 \n",
578 | " 1 \n",
579 | " 1 \n",
580 | " B \n",
581 | " start \n",
582 | " 4 \n",
583 | " 6 \n",
584 | " 2.0 \n",
585 | " \n",
586 | " \n",
587 | " 2 \n",
588 | " 1 \n",
589 | " 1 \n",
590 | " C \n",
591 | " start \n",
592 | " 6 \n",
593 | " 7 \n",
594 | " 1.0 \n",
595 | " \n",
596 | " \n",
597 | " 3 \n",
598 | " 1 \n",
599 | " 1 \n",
600 | " C \n",
601 | " start \n",
602 | " 8 \n",
603 | " 9 \n",
604 | " 1.0 \n",
605 | " \n",
606 | " \n",
607 | "
\n",
608 | "
"
609 | ],
610 | "text/plain": [
611 | " userId sessionId playlist_id event_name timestamp event_duration_end \\\n",
612 | "0 1 1 A start 1 3 \n",
613 | "1 1 1 B start 4 6 \n",
614 | "2 1 1 C start 6 7 \n",
615 | "3 1 1 C start 8 9 \n",
616 | "\n",
617 | " playlist_duration \n",
618 | "0 2.0 \n",
619 | "1 2.0 \n",
620 | "2 1.0 \n",
621 | "3 1.0 "
622 | ]
623 | },
624 | "execution_count": 27,
625 | "metadata": {},
626 | "output_type": "execute_result"
627 | }
628 | ],
629 | "source": [
630 | "df = df.withColumn('playlist_duration', F.col('event_duration_end') - F.col('timestamp'))\n",
631 | "df.toPandas()"
632 | ]
633 | },
634 | {
635 | "cell_type": "code",
636 | "execution_count": 29,
637 | "metadata": {},
638 | "outputs": [
639 | {
640 | "data": {
641 | "text/html": [
642 | "\n",
643 | "\n",
656 | "
\n",
657 | " \n",
658 | " \n",
659 | " \n",
660 | " userId \n",
661 | " sessionId \n",
662 | " playlist_id \n",
663 | " event_name \n",
664 | " playlist_duration \n",
665 | " \n",
666 | " \n",
667 | " \n",
668 | " \n",
669 | " 0 \n",
670 | " 1 \n",
671 | " 1 \n",
672 | " A \n",
673 | " start \n",
674 | " 2.0 \n",
675 | " \n",
676 | " \n",
677 | " 1 \n",
678 | " 1 \n",
679 | " 1 \n",
680 | " B \n",
681 | " start \n",
682 | " 2.0 \n",
683 | " \n",
684 | " \n",
685 | " 2 \n",
686 | " 1 \n",
687 | " 1 \n",
688 | " C \n",
689 | " start \n",
690 | " 2.0 \n",
691 | " \n",
692 | " \n",
693 | "
\n",
694 | "
"
695 | ],
696 | "text/plain": [
697 | " userId sessionId playlist_id event_name playlist_duration\n",
698 | "0 1 1 A start 2.0\n",
699 | "1 1 1 B start 2.0\n",
700 | "2 1 1 C start 2.0"
701 | ]
702 | },
703 | "execution_count": 29,
704 | "metadata": {},
705 | "output_type": "execute_result"
706 | }
707 | ],
708 | "source": [
709 | "df.groupBy(['userId', 'sessionId', 'playlist_id', 'event_name']).agg(F.sum('playlist_duration').alias('playlist_duration')).toPandas()"
710 | ]
711 | },
712 | {
713 | "cell_type": "code",
714 | "execution_count": null,
715 | "metadata": {
716 | "collapsed": true
717 | },
718 | "outputs": [],
719 | "source": []
720 | }
721 | ],
722 | "metadata": {
723 | "kernelspec": {
724 | "display_name": "Python 3",
725 | "language": "python",
726 | "name": "python3"
727 | },
728 | "language_info": {
729 | "codemirror_mode": {
730 | "name": "ipython",
731 | "version": 3
732 | },
733 | "file_extension": ".py",
734 | "mimetype": "text/x-python",
735 | "name": "python",
736 | "nbconvert_exporter": "python",
737 | "pygments_lexer": "ipython3",
738 | "version": "3.6.3"
739 | }
740 | },
741 | "nbformat": 4,
742 | "nbformat_minor": 2
743 | }
744 |
--------------------------------------------------------------------------------
/spark-sql/session length.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "from pyspark.context import SparkContext\n",
12 | "from pyspark.sql.session import SparkSession\n",
13 | "\n",
14 | "sc = SparkContext('local')\n",
15 | "spark = SparkSession(sc)"
16 | ]
17 | },
18 | {
19 | "cell_type": "code",
20 | "execution_count": 12,
21 | "metadata": {
22 | "collapsed": true
23 | },
24 | "outputs": [],
25 | "source": [
26 | "df = spark.read.csv(\"session_duration.csv\", header=True)"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 13,
32 | "metadata": {},
33 | "outputs": [
34 | {
35 | "data": {
36 | "text/html": [
37 | "\n",
38 | "\n",
51 | "
\n",
52 | " \n",
53 | " \n",
54 | " \n",
55 | " userId \n",
56 | " sessionId \n",
57 | " timestamp \n",
58 | " \n",
59 | " \n",
60 | " \n",
61 | " \n",
62 | " 0 \n",
63 | " A \n",
64 | " 1 \n",
65 | " 1523849238491 \n",
66 | " \n",
67 | " \n",
68 | " 1 \n",
69 | " A \n",
70 | " 1 \n",
71 | " 1523849238492 \n",
72 | " \n",
73 | " \n",
74 | " 2 \n",
75 | " A \n",
76 | " 1 \n",
77 | " 1523849238493 \n",
78 | " \n",
79 | " \n",
80 | " 3 \n",
81 | " A \n",
82 | " 1 \n",
83 | " 1523849238496 \n",
84 | " \n",
85 | " \n",
86 | " 4 \n",
87 | " A \n",
88 | " 2 \n",
89 | " 1523849238492 \n",
90 | " \n",
91 | " \n",
92 | " 5 \n",
93 | " A \n",
94 | " 2 \n",
95 | " 1523849238495 \n",
96 | " \n",
97 | " \n",
98 | "
\n",
99 | "
"
100 | ],
101 | "text/plain": [
102 | " userId sessionId timestamp\n",
103 | "0 A 1 1523849238491\n",
104 | "1 A 1 1523849238492\n",
105 | "2 A 1 1523849238493\n",
106 | "3 A 1 1523849238496\n",
107 | "4 A 2 1523849238492\n",
108 | "5 A 2 1523849238495"
109 | ]
110 | },
111 | "execution_count": 13,
112 | "metadata": {},
113 | "output_type": "execute_result"
114 | }
115 | ],
116 | "source": [
117 | "df.toPandas()"
118 | ]
119 | },
120 | {
121 | "cell_type": "code",
122 | "execution_count": 15,
123 | "metadata": {},
124 | "outputs": [
125 | {
126 | "data": {
127 | "text/html": [
128 | "\n",
129 | "\n",
142 | "
\n",
143 | " \n",
144 | " \n",
145 | " \n",
146 | " userId \n",
147 | " sessionId \n",
148 | " feature_session_duration \n",
149 | " \n",
150 | " \n",
151 | " \n",
152 | " \n",
153 | " 0 \n",
154 | " A \n",
155 | " 1 \n",
156 | " 5.0 \n",
157 | " \n",
158 | " \n",
159 | " 1 \n",
160 | " A \n",
161 | " 2 \n",
162 | " 3.0 \n",
163 | " \n",
164 | " \n",
165 | "
\n",
166 | "
"
167 | ],
168 | "text/plain": [
169 | " userId sessionId feature_session_duration\n",
170 | "0 A 1 5.0\n",
171 | "1 A 2 3.0"
172 | ]
173 | },
174 | "execution_count": 15,
175 | "metadata": {},
176 | "output_type": "execute_result"
177 | }
178 | ],
179 | "source": [
180 | "df_with_session_length = df.groupBy(['userId', 'sessionId'])\\\n",
181 | " .agg(F.min('timestamp').alias('session_start'), F.max('timestamp').alias('session_end'))\\\n",
182 | " .withColumn('feature_session_duration', F.col('session_end') - F.col('session_start'))\\\n",
183 | " .select('userId', 'sessionId', 'feature_session_duration')\n",
184 | "df_with_session_length.toPandas()"
185 | ]
186 | },
187 | {
188 | "cell_type": "code",
189 | "execution_count": 48,
190 | "metadata": {},
191 | "outputs": [],
192 | "source": [
193 | "\n",
194 | "dfc2 = spark.read.csv(\"session_duration.csv\", header=True)"
195 | ]
196 | },
197 | {
198 | "cell_type": "code",
199 | "execution_count": 49,
200 | "metadata": {},
201 | "outputs": [
202 | {
203 | "data": {
204 | "text/html": [
205 | "\n",
206 | "\n",
219 | "
\n",
220 | " \n",
221 | " \n",
222 | " \n",
223 | " sessionId \n",
224 | " sessionId: \n",
225 | " timestamp \n",
226 | " \n",
227 | " \n",
228 | " \n",
229 | " \n",
230 | " 0 \n",
231 | " A \n",
232 | " 1 \n",
233 | " 1523849238491 \n",
234 | " \n",
235 | " \n",
236 | " 1 \n",
237 | " None \n",
238 | " 1 \n",
239 | " 1523849238492 \n",
240 | " \n",
241 | " \n",
242 | " 2 \n",
243 | " A \n",
244 | " None \n",
245 | " 1523849238493 \n",
246 | " \n",
247 | " \n",
248 | " 3 \n",
249 | " A \n",
250 | " 1 \n",
251 | " 1523849238496 \n",
252 | " \n",
253 | " \n",
254 | " 4 \n",
255 | " A \n",
256 | " 2 \n",
257 | " 1523849238492 \n",
258 | " \n",
259 | " \n",
260 | " 5 \n",
261 | " A \n",
262 | " 2 \n",
263 | " 1523849238495 \n",
264 | " \n",
265 | " \n",
266 | "
\n",
267 | "
"
268 | ],
269 | "text/plain": [
270 | " sessionId sessionId: timestamp\n",
271 | "0 A 1 1523849238491\n",
272 | "1 None 1 1523849238492\n",
273 | "2 A None 1523849238493\n",
274 | "3 A 1 1523849238496\n",
275 | "4 A 2 1523849238492\n",
276 | "5 A 2 1523849238495"
277 | ]
278 | },
279 | "execution_count": 49,
280 | "metadata": {},
281 | "output_type": "execute_result"
282 | }
283 | ],
284 | "source": [
285 | "dfc2.toPandas()"
286 | ]
287 | },
288 | {
289 | "cell_type": "code",
290 | "execution_count": 50,
291 | "metadata": {},
292 | "outputs": [],
293 | "source": [
294 | "columns_to_merge = {}\n",
295 | "for col_name in dfc2.columns:\n",
296 | " new_col_name = col_name.replace(\".\",\"_\")\n",
297 | " dfc2 = dfc2.withColumnRenamed(col_name, new_col_name)\n",
298 | " if \":\" in new_col_name:\n",
299 | " error_column_name = new_col_name.replace(\":\",\"_exception\")\n",
300 | " columns_to_merge[new_col_name.replace(\":\",\"\")] = error_column_name\n",
301 | " dfc2 = dfc2.withColumnRenamed(new_col_name, error_column_name)\n",
302 | " \n"
303 | ]
304 | },
305 | {
306 | "cell_type": "code",
307 | "execution_count": 51,
308 | "metadata": {},
309 | "outputs": [
310 | {
311 | "data": {
312 | "text/html": [
313 | "\n",
314 | "\n",
327 | "
\n",
328 | " \n",
329 | " \n",
330 | " \n",
331 | " sessionId \n",
332 | " sessionId_exception \n",
333 | " timestamp \n",
334 | " \n",
335 | " \n",
336 | " \n",
337 | " \n",
338 | " 0 \n",
339 | " A \n",
340 | " 1 \n",
341 | " 1523849238491 \n",
342 | " \n",
343 | " \n",
344 | " 1 \n",
345 | " None \n",
346 | " 1 \n",
347 | " 1523849238492 \n",
348 | " \n",
349 | " \n",
350 | " 2 \n",
351 | " A \n",
352 | " None \n",
353 | " 1523849238493 \n",
354 | " \n",
355 | " \n",
356 | " 3 \n",
357 | " A \n",
358 | " 1 \n",
359 | " 1523849238496 \n",
360 | " \n",
361 | " \n",
362 | " 4 \n",
363 | " A \n",
364 | " 2 \n",
365 | " 1523849238492 \n",
366 | " \n",
367 | " \n",
368 | " 5 \n",
369 | " A \n",
370 | " 2 \n",
371 | " 1523849238495 \n",
372 | " \n",
373 | " \n",
374 | "
\n",
375 | "
"
376 | ],
377 | "text/plain": [
378 | " sessionId sessionId_exception timestamp\n",
379 | "0 A 1 1523849238491\n",
380 | "1 None 1 1523849238492\n",
381 | "2 A None 1523849238493\n",
382 | "3 A 1 1523849238496\n",
383 | "4 A 2 1523849238492\n",
384 | "5 A 2 1523849238495"
385 | ]
386 | },
387 | "execution_count": 51,
388 | "metadata": {},
389 | "output_type": "execute_result"
390 | }
391 | ],
392 | "source": [
393 | "dfc2.toPandas()"
394 | ]
395 | },
396 | {
397 | "cell_type": "code",
398 | "execution_count": 52,
399 | "metadata": {},
400 | "outputs": [
401 | {
402 | "name": "stdout",
403 | "output_type": "stream",
404 | "text": [
405 | "sessionId\n",
406 | "sessionId_exception\n",
407 | "i\n"
408 | ]
409 | }
410 | ],
411 | "source": [
412 | "for valid_column_name, error_column_name in columns_to_merge.items():\n",
413 | " print(valid_column_name)\n",
414 | " print(error_column_name)\n",
415 | " if (valid_column_name in dfc2.columns) & (error_column_name in dfc2.columns):\n",
416 | " print(\"i\")\n",
417 | " dfc2 = dfc2.withColumn(valid_column_name, F.coalesce(F.col(error_column_name), F.col(valid_column_name)))\n",
418 | " dfc2 = dfc2.drop(error_column_name)"
419 | ]
420 | },
421 | {
422 | "cell_type": "code",
423 | "execution_count": 53,
424 | "metadata": {},
425 | "outputs": [
426 | {
427 | "data": {
428 | "text/html": [
429 | "\n",
430 | "\n",
443 | "
\n",
444 | " \n",
445 | " \n",
446 | " \n",
447 | " sessionId \n",
448 | " timestamp \n",
449 | " \n",
450 | " \n",
451 | " \n",
452 | " \n",
453 | " 0 \n",
454 | " 1 \n",
455 | " 1523849238491 \n",
456 | " \n",
457 | " \n",
458 | " 1 \n",
459 | " 1 \n",
460 | " 1523849238492 \n",
461 | " \n",
462 | " \n",
463 | " 2 \n",
464 | " A \n",
465 | " 1523849238493 \n",
466 | " \n",
467 | " \n",
468 | " 3 \n",
469 | " 1 \n",
470 | " 1523849238496 \n",
471 | " \n",
472 | " \n",
473 | " 4 \n",
474 | " 2 \n",
475 | " 1523849238492 \n",
476 | " \n",
477 | " \n",
478 | " 5 \n",
479 | " 2 \n",
480 | " 1523849238495 \n",
481 | " \n",
482 | " \n",
483 | "
\n",
484 | "
"
485 | ],
486 | "text/plain": [
487 | " sessionId timestamp\n",
488 | "0 1 1523849238491\n",
489 | "1 1 1523849238492\n",
490 | "2 A 1523849238493\n",
491 | "3 1 1523849238496\n",
492 | "4 2 1523849238492\n",
493 | "5 2 1523849238495"
494 | ]
495 | },
496 | "execution_count": 53,
497 | "metadata": {},
498 | "output_type": "execute_result"
499 | }
500 | ],
501 | "source": [
502 | "dfc2.toPandas()"
503 | ]
504 | },
505 | {
506 | "cell_type": "code",
507 | "execution_count": null,
508 | "metadata": {
509 | "collapsed": true
510 | },
511 | "outputs": [],
512 | "source": []
513 | },
514 | {
515 | "cell_type": "code",
516 | "execution_count": null,
517 | "metadata": {
518 | "collapsed": true
519 | },
520 | "outputs": [],
521 | "source": []
522 | }
523 | ],
524 | "metadata": {
525 | "kernelspec": {
526 | "display_name": "Python 3",
527 | "language": "python",
528 | "name": "python3"
529 | },
530 | "language_info": {
531 | "codemirror_mode": {
532 | "name": "ipython",
533 | "version": 3
534 | },
535 | "file_extension": ".py",
536 | "mimetype": "text/x-python",
537 | "name": "python",
538 | "nbconvert_exporter": "python",
539 | "pygments_lexer": "ipython3",
540 | "version": "3.6.3"
541 | }
542 | },
543 | "nbformat": 4,
544 | "nbformat_minor": 2
545 | }
546 |
--------------------------------------------------------------------------------
/spark-sql/session_duration.csv:
--------------------------------------------------------------------------------
1 | sessionId,sessionId:,timestamp
2 | A,1,1523849238491
3 | ,1,1523849238492
4 | A,,1523849238493
5 | A,1,1523849238496
6 | A,2,1523849238492
7 | A,2,1523849238495
8 |
--------------------------------------------------------------------------------
/spark-sql/time_between_events_test.csv:
--------------------------------------------------------------------------------
1 | userId,sessionId,playlist_id,event_name,timestamp
2 | 1,1,A,start,1
3 | 1,1,,another,2
4 | 1,1,A,stop,3
5 | 1,1,B,start,4
6 | 1,1,,another,5
7 | 1,1,C,start,6
8 | 1,1,C,stop,7
9 |
--------------------------------------------------------------------------------
/tesseract_on_ami.txt:
--------------------------------------------------------------------------------
1 | install python
2 | https://tecadmin.net/install-python-3-7-amazon-linux/
3 |
4 | yum install gcc-c++
5 |
6 | sudo yum install python-pip
7 | sudo yum install python3-pip
8 | -- install conda by https://linuxize.com/post/how-to-install-anaconda-on-centos-7/
9 |
10 | conda install -c conda-forge poppler
11 |
12 | source activate
13 | pip install pdf2image
14 | pip install pytesseract
15 |
16 |
17 | sudo yum install autoconf aclocal automake
18 | sudo yum install libtool
19 | sudo yum install libjpeg-devel libpng-devel libtiff-devel zlib-devel
20 | http://www.leptonica.org/source/leptonica-1.78.0.tar.gz
21 | tar -zxvf leptonica-1.78.0.tar.gz
22 | cd leptonica-1.78.0
23 | ./configure
24 | make
25 | sudo make install
26 | cd ..
27 | wget https://github.com/tesseract-ocr/tesseract/archive/3.04.00.tar.gz
28 | tar -zxvf 3.04.00.tar.gz
29 | cd tesseract-3.04.00/
30 | ./autogen.sh
31 | ./configure
32 | make
33 | sudo make install
34 | sudo ldconfig
35 |
36 | cd /usr/local/share/tessdata
37 | sudo wget https://github.com/tesseract-ocr/tessdata/raw/master/eng.traineddata
38 | sudo wget https://github.com/tesseract-ocr/tessdata/raw/master/pl.traineddata
39 | export TESSDATA_PREFIX=/usr/local/share/
40 | vi ~/.bash_profile
41 | # Copy this line to the end: export TESSDATA_PREFIX=/usr/local/share/
42 | # Verify:
43 | tesseract --list-langs
44 |
--------------------------------------------------------------------------------