├── data
├── elements.csv
├── excel.xlsx
├── map_func.py
├── parquet.parquet
├── multiple_separators.csv
├── text_lines.txt
├── crime0-10.dprep.settings
├── crime0-10.csv
├── adls-dpreptestfiles.crt
├── median_income.csv
├── secrets.dprep
├── crime0-10.dprep
└── median_income_transformed.csv
├── README.md
├── column-type-transform.ipynb
├── package-json-representation.ipynb
├── quantile-transformation.ipynb
├── read-pandas-dataframe.ipynb
├── secrets.ipynb
├── external-references.ipynb
├── impute-missing-values.ipynb
├── 0. Import librairie.ipynb
├── smart-read-file-separators.ipynb
├── caching.ipynb
├── join.ipynb
└── split-column-by-example.ipynb
/data/elements.csv:
--------------------------------------------------------------------------------
1 | ID,Symbol,Boiling Point
2 | 1,H,-252.87
3 | 53,I,184.3
4 | 2,He,-268.93
--------------------------------------------------------------------------------
/data/excel.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/retkowsky/Data-Preparation-avec-Azure-ML-service/master/data/excel.xlsx
--------------------------------------------------------------------------------
/data/map_func.py:
--------------------------------------------------------------------------------
def transform(df, index):
    """Replace missing values in the ``MAM_MTH00numvalid_1011`` column with 0.

    Args:
        df: Table-like object indexed by column name; presumably a pandas
            DataFrame (confirm against the caller). It must contain a
            ``MAM_MTH00numvalid_1011`` column and is updated in place.
        index: Not used in the body; kept so the signature stays unchanged
            for existing callers.

    Returns:
        The same ``df`` object that was passed in, with the missing values
        of ``MAM_MTH00numvalid_1011`` replaced by 0.
    """
    column = 'MAM_MTH00numvalid_1011'
    # Assign the filled column back instead of calling
    # ``df[column].fillna(0, inplace=True)``: the chained in-place form
    # operates on a temporary selection and leaves ``df`` untouched when
    # pandas Copy-on-Write semantics are active.
    df[column] = df[column].fillna(0)
    return df
4 |
--------------------------------------------------------------------------------
/data/parquet.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/retkowsky/Data-Preparation-avec-Azure-ML-service/master/data/parquet.parquet
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Data-Preparation-avec-Azure-ML-service
2 |
3 | Serge Retkowsky | serge.retkowsky@microsoft.com | https://www.linkedin.com/in/serger/
4 |
--------------------------------------------------------------------------------
/data/multiple_separators.csv:
--------------------------------------------------------------------------------
1 | ID |CaseNumber| |Completed|
2 | 10140490 |HY329907| |Y|
3 | 10139776 |HY329265| |Y|
4 | 10140270 |HY329253| |N|
5 | 10139885 |HY329308| |Y|
6 | 10140379 |HY329556| |N|
7 | 10140868 |HY330421| |N|
8 | 10139762 |HY329232| |N|
9 | 10139722 |HY329228| |Y|
10 | 10139774 |HY329209| |N|
11 | 10139697 |HY329177| |N|
--------------------------------------------------------------------------------
/data/text_lines.txt:
--------------------------------------------------------------------------------
1 | Date||Minimum temperature||Maximum temperature
2 | 2015-07-1||-4.1||10.0
3 | 2015-07-2||-0.8||10.8
4 | 2015-07-3||-7.0||10.5
5 | 2015-07-4||-5.5||9.3
6 | 2015-07-5||-4.7||7.3
7 | 2015-07-6||-2.4||11.2
8 | 2015-07-7||-4.7||11.5
9 | 2015-07-8||-3.0||12.6
10 | 2015-07-9||-1.3||13.8
11 | 2015-07-10||-0.5||9.9
12 | 2015-07-11||3.6||12.5
13 | 2015-07-12||3.1||9.2
14 | 2015-07-13||3.6||13.6
15 | 2015-07-14||4.1||10.0
16 | 2015-07-15||1.1||7.9
17 |
--------------------------------------------------------------------------------
/data/crime0-10.dprep.settings:
--------------------------------------------------------------------------------
1 | {"project":{"activitiesPaneSize":200,"isActivitiesPaneCollapsed":true,"activeActivityId":"75637565-60ad-4baa-87d3-396a7930cfe7","isInActivityView":true},"75637565-60ad-4baa-87d3-396a7930cfe7.main.visualFilters":[],"75637565-60ad-4baa-87d3-396a7930cfe7.main.currentSliceIndex":0,"75637565-60ad-4baa-87d3-396a7930cfe7.main.typeFilter":[],"75637565-60ad-4baa-87d3-396a7930cfe7.main.columnSearchFilter":{"term":"","matchCase":false,"matchWholeWord":false,"useRegex":false},"75637565-60ad-4baa-87d3-396a7930cfe7.main.columnsSelections":[]}
--------------------------------------------------------------------------------
/data/crime0-10.csv:
--------------------------------------------------------------------------------
1 | ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,Beat,District,Ward,Community Area,FBI Code,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location
2 | 10140490,HY329907,07/05/2015 11:50:00 PM,050XX N NEWLAND AVE,0820,THEFT,$500 AND UNDER,STREET,false,false,1613,016,41,10,06,1129230,1933315,2015,07/12/2015 12:42:46 PM,41.973309466,-87.800174996,"(41.973309466, -87.800174996)"
3 | 10139776,HY329265,07/05/2015 11:30:00 PM,011XX W MORSE AVE,0460,BATTERY,SIMPLE,STREET,false,true,2431,024,49,1,08B,1167370,1946271,2015,07/12/2015 12:42:46 PM,42.008124017,-87.65955018,"(42.008124017, -87.65955018)"
4 | 10140270,HY329253,07/05/2015 11:20:00 PM,121XX S FRONT AVE,0486,BATTERY,DOMESTIC BATTERY SIMPLE,STREET,false,true,0532,,9,53,08B,,,2015,07/12/2015 12:42:46 PM,,,
5 | 10139885,HY329308,07/05/2015 11:19:00 PM,051XX W DIVISION ST,0610,BURGLARY,FORCIBLE ENTRY,SMALL RETAIL STORE,false,false,1531,015,37,25,05,1141721,1907465,2015,07/12/2015 12:42:46 PM,41.902152027,-87.754883404,"(41.902152027, -87.754883404)"
6 | 10140379,HY329556,07/05/2015 11:00:00 PM,012XX W LAKE ST,0930,MOTOR VEHICLE THEFT,THEFT/RECOVERY: AUTOMOBILE,STREET,false,false,1215,012,27,28,07,1168413,1901632,2015,07/12/2015 12:42:46 PM,41.885610142,-87.657008701,"(41.885610142, -87.657008701)"
7 | 10140868,HY330421,07/05/2015 10:54:00 PM,118XX S PEORIA ST,1320,CRIMINAL DAMAGE,TO VEHICLE,VEHICLE NON-COMMERCIAL,false,false,0524,005,34,53,14,1172409,1826485,2015,07/12/2015 12:42:46 PM,41.6793109,-87.644545209,"(41.6793109, -87.644545209)"
8 | 10139762,HY329232,07/05/2015 10:42:00 PM,026XX W 37TH PL,1020,ARSON,BY FIRE,VACANT LOT/LAND,false,false,0911,009,12,58,09,1159436,1879658,2015,07/12/2015 12:42:46 PM,41.825500607,-87.690578042,"(41.825500607, -87.690578042)"
9 | 10139722,HY329228,07/05/2015 10:30:00 PM,016XX S CENTRAL PARK AVE,1811,NARCOTICS,POSS: CANNABIS 30GMS OR LESS,ALLEY,true,false,1021,010,24,29,18,1152687,1891389,2015,07/12/2015 12:42:46 PM,41.857827814,-87.715028789,"(41.857827814, -87.715028789)"
10 | 10139774,HY329209,07/05/2015 10:15:00 PM,048XX N ASHLAND AVE,1310,CRIMINAL DAMAGE,TO PROPERTY,APARTMENT,false,false,2032,020,46,3,14,1164821,1932394,2015,07/12/2015 12:42:46 PM,41.970099796,-87.669324377,"(41.970099796, -87.669324377)"
11 | 10139697,HY329177,07/05/2015 10:10:00 PM,058XX S ARTESIAN AVE,1320,CRIMINAL DAMAGE,TO VEHICLE,ALLEY,false,false,0824,008,16,63,14,1160997,1865851,2015,07/12/2015 12:42:46 PM,41.787580282,-87.685233078,"(41.787580282, -87.685233078)"
--------------------------------------------------------------------------------
/data/adls-dpreptestfiles.crt:
--------------------------------------------------------------------------------
1 | -----BEGIN PRIVATE KEY-----
2 | MIIEvwIBADANBgkqhkiG9w0BAQEFAASCBKkwggSlAgEAAoIBAQDmkkyF0BwipZow
3 | Wd1AMkRkySx0y079JPxpsYhv4i1xXKdoa9bpFqwoXmJpeQM1JWnU4UeZzFeM86qK
4 | AhQvL4KV4kibcP2ENvu2NKFEdotO3uxPJ+6GlcYwMYzy+tUj008KnnRZfTrR78sJ
5 | tIl3C6lnVL0ICihksG59P1sskRq3PvOjXLAdEZalwDjZ4ZPoNDZdj6nUjB2l8zqu
6 | pKAt5mR+bJ9Sox4yrDuNhMmFt5QsRDRe3wUqdV+C9OCWHmjlmsjrYw7p9YmjBDvC
7 | 5U7mF0Mk/XeYFzj0pkXKQVqBL6xqig+q5ob0szYfg19iDeFhS3iIsRcJGEnRVW/A
8 | NpsBZyKrAgMBAAECggEBANlvP8C1F8NInhZYuIAwpzTQTh86Fxw8g9h8dijkh2wv
9 | LyQXBk07d1B+aZoDZ5X32UzKwcX04N9obfvFqBkzWZdVFJmZvUmwvEEActBoZkkT
10 | io+/HX5HweVy5PPCvbsSK6jc8uXtZcnSs4tMeJIOKkvqqnTpd1w00Y1FcQqfMC16
11 | 4p7o8wbt6OFoFAYqcxeVYVwDzCTLZD3+iJaqmntkBkoDndJy52yXQmMq5z1wbQVp
12 | BL6+L9nTvmouy64jiHVSKOx8nnWThYfHsXoPv+rYywjeuK/v3hyaTAwogs36ooEn
13 | SnuTBRvJcumN9Q0XIVlxKMVBcGyyAP+0yNKGz5NQgdECgYEA/I/Uq1E3epPJgEWR
14 | Bub+LpCgwtrw/lgKncb/Q/AiE9qoXobUe4KNU8aGaNMb7uVNLckY7cOluLS6SQb3
15 | Mzwk2Jl0G3vk8rW46tZWvSYB8+zAR2Rz7seUOT9SE5OmvwpnHrnp3nRr1vvVd2bp
16 | Q/ypwMLrwWQN51Kr+oTS74bUbrkCgYEA6bXVIUyao7z2Q3qAr6h+6JEWDbkJA7hJ
17 | BjHIOXvxd1tMoJJX+X9+IE/2XoJaUkGCb0vrM/hi1cyQFmS4Or/J6IWSZu8oBpDr
18 | EBmIK3PF1nrzNvWD28wM46c6ScehyWSm/u4bJWSm9liTX3dv5Kpa6ym7yLKc3c0B
19 | ECpSJM+5SoMCgYEAq585Tukzn/IJPUcIk/4nv5C8DW0l0lAVdr2g/JOTNJajTwik
20 | HwHJ86G1+Elsc9wRpAlBDWCjnm4BIFrBZGl8SEuOoJaCL4PZEotwCbxoG09IIbtb
21 | JGkuifBDX9Y3ux3gkPqYt3e5SC99EVQ3MuHgoIJUHehVolmFUAkuJWIjvNECgYEA
22 | 5pU0VspRuELzZdgzpxvDOooLDDcHodfslGQBfFXBA1Xc4IACtHMJaa/7D3vkyUtA
23 | +bYZtQjX2sEdWDq/WZdoCjXfIBfNkczhXt0R8G0lQFvGIu9QzUchYGrZo3mHMkBQ
24 | Uy1xMw9/e4YgwQwCJcW+Nk7Sq00uX9enuN9IdHFOCykCgYAqAGMK6CH1tlpjvHrf
25 | k+ZhigYxTXBlsVVvK1BIGGaiwzDpn65zeQp4aLOjSZkI1LuRi3tfTiZ321jRd64J
26 | 4lGk5Jurqv5grDmxROX/U50wEYbI9ncu/thU7syUdxDiqxHPI2RMG50mRcm3a55p
27 | ZCNSqkMlcXyA0U1z8C1ILNUsbA==
28 | -----END PRIVATE KEY-----
29 | -----BEGIN CERTIFICATE-----
30 | MIICoTCCAYkCAgPoMA0GCSqGSIb3DQEBBQUAMBQxEjAQBgNVBAMMCUNMSS1Mb2dp
31 | bjAiGA8yMDE4MDcxMzIzMjA0N1oYDzIwMTkwNzEzMjMyMDQ5WjAUMRIwEAYDVQQD
32 | DAlDTEktTG9naW4wggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQDmkkyF
33 | 0BwipZowWd1AMkRkySx0y079JPxpsYhv4i1xXKdoa9bpFqwoXmJpeQM1JWnU4UeZ
34 | zFeM86qKAhQvL4KV4kibcP2ENvu2NKFEdotO3uxPJ+6GlcYwMYzy+tUj008KnnRZ
35 | fTrR78sJtIl3C6lnVL0ICihksG59P1sskRq3PvOjXLAdEZalwDjZ4ZPoNDZdj6nU
36 | jB2l8zqupKAt5mR+bJ9Sox4yrDuNhMmFt5QsRDRe3wUqdV+C9OCWHmjlmsjrYw7p
37 | 9YmjBDvC5U7mF0Mk/XeYFzj0pkXKQVqBL6xqig+q5ob0szYfg19iDeFhS3iIsRcJ
38 | GEnRVW/ANpsBZyKrAgMBAAEwDQYJKoZIhvcNAQEFBQADggEBAI4VlaFb9NsXMLdT
39 | Cw5/pk0Xo2Qi6483RGTy8vzrw88IE7f3juB/JWG+rayjtW5bBRx2fae4/ZIdZ4zg
40 | N2FDKn2PQPAc9m9pcKyUKUvWOC8ixSkrUmeQew0l1AXU0hsPSlJ7/7ZK4efoyB47
41 | hj71fsyKdyKbisZDcUFBq/S8PazdPF0YOD1W/4A2tW0cSMg+jmFWynuUTdWt3SU8
42 | CwBGqdiSKT5faJuYwIWnRXDEQS3ObRn1OFEfFdd4d2sxjxydWKRgnINnGlBdiFAT
43 | KzCozVr+75cO2ErH6x5C0hLQGG5BxXbaijyxyvaRNokTMVVv6OaDEnjzCGfJ72Yf
44 | 2wgitNc=
45 | -----END CERTIFICATE-----
46 |
--------------------------------------------------------------------------------
/data/median_income.csv:
--------------------------------------------------------------------------------
1 | median_income
2 | 4.4896
3 | 2.1029
4 | 2.3889
5 | 3.707
6 | 6.4788
7 | 4.4074
8 | 5.2907
9 | 1.5156
10 | 8.4411
11 | 4.4085
12 | 2.1439
13 | 2.8971
14 | 6.1008
15 | 3.5258
16 | 2.7694
17 | 2.2356
18 | 1.9509
19 | 4.0905
20 | 3.6726
21 | 3.1696
22 | 2.5389
23 | 3.0319
24 | 4.6779
25 | 2.9076
26 | 2.8616
27 | 1.4722
28 | 5.6413
29 | 2.1167
30 | 4.7308
31 | 4.8173
32 | 2.3438
33 | 1.7333
34 | 1.4429
35 | 2.3253
36 | 2.4022
37 | 3.4048
38 | 6.6073
39 | 4.1080000000000005
40 | 4.2829
41 | 1.5727
42 | 2.5211
43 | 4.2679
44 | 4.7328
45 | 4.7069
46 | 2.465
47 | 5.0267
48 | 2.8043
49 | 2.4053
50 | 1.2176
51 | 2.39
52 | 3.6364
53 | 6.0162
54 | 2.8088
55 | 3.3984
56 | 4.5
57 | 3.9079
58 | 4.9618
59 | 2.9344
60 | 2.4283
61 | 3.7388
62 | 1.6021
63 | 2.3352
64 | 4.0982
65 | 1.9531
66 | 3.2386
67 | 5.1169
68 | 4.692
69 | 4.0
70 | 6.4238
71 | 3.7375
72 | 2.8233
73 | 2.8009
74 | 3.767
75 | 3.6761
76 | 5.0282
77 | 3.5296
78 | 5.215
79 | 4.0125
80 | 9.4667
81 | 5.9062
82 | 3.9864
83 | 2.0734
84 | 2.875
85 | 3.3611
86 | 2.8214
87 | 0.9946
88 | 4.5446
89 | 4.6908
90 | 9.3198
91 | 1.2826
92 | 2.4943
93 | 10.1882
94 | 4.6731
95 | 4.375
96 | 2.8173
97 | 2.0903
98 | 2.725
99 | 2.8547
100 | 2.25
101 | 1.9444
102 | 1.7167
103 | 1.9342
104 | 4.9524
105 | 3.65
106 | 3.0856
107 | 3.2396
108 | 2.9324
109 | 3.495
110 | 1.9818
111 | 4.6964
112 | 3.925
113 | 3.625
114 | 2.9688
115 | 4.0417
116 | 9.7956
117 | 3.8732
118 | 2.6998
119 | 2.006
120 | 4.25
121 | 3.1839999999999997
122 | 5.9658
123 | 2.628
124 | 2.5057
125 | 5.155
126 | 4.6
127 | 4.6681
128 | 5.5942
129 | 5.1104
130 | 3.0759
131 | 3.5757
132 | 3.6845
133 | 6.4667
134 | 5.273
135 | 3.0635
136 | 11.2866
137 | 4.0444
138 | 5.2541
139 | 5.5791
140 | 4.5375
141 | 9.8144
142 | 6.7257
143 | 4.1442
144 | 4.0313
145 | 2.2791
146 | 4.1679
147 | 3.2852
148 | 3.2768
149 | 5.021
150 | 4.875
151 | 4.419
152 | 3.3272
153 | 4.2386
154 | 1.245
155 | 5.152
156 | 4.8125
157 | 2.1638
158 | 7.1621
159 | 1.5372
160 | 10.0481
161 | 3.3869
162 | 5.4591
163 | 4.4318
164 | 6.5044
165 | 4.2865
166 | 3.0461
167 | 11.3283
168 | 2.7026
169 | 3.016
170 | 3.0943
171 | 3.225
172 | 6.187
173 | 3.8158
174 | 3.0147
175 | 15.0
176 | 3.1364
177 | 2.9
178 | 5.5941
179 | 3.4028
180 | 6.0062
181 | 8.3792
182 | 3.8036
183 | 2.0926
184 | 6.7703
185 | 4.2569
186 | 4.744
187 | 9.7037
188 | 5.1292
189 | 2.3148
190 | 3.3021
191 | 1.95
192 | 3.025
193 | 2.6523
194 | 1.2188
195 | 5.827999999999999
196 | 3.1587
197 | 2.45
198 | 2.3851
199 | 2.1221
200 | 3.5313
201 | 3.4821
202 | 7.8252
203 | 5.1878
204 | 3.7459
205 | 6.0097
206 | 2.3194
207 | 4.2061
208 | 2.267
209 | 2.2109
210 | 2.7589
211 | 2.6553
212 | 6.3325
213 | 5.7233
214 | 4.337
215 | 3.9667
216 | 5.8623
217 | 1.6806
218 | 3.5851
219 | 2.9716
220 | 3.9
221 | 2.7431
222 | 3.3621
223 | 1.9464
224 | 7.3518
225 | 4.775
226 | 3.5968
227 | 6.221
228 | 10.0968
229 | 1.9483
230 | 2.0469
231 | 3.725
232 | 3.675
233 | 1.8529
234 | 1.7159
235 | 1.7386
236 | 3.6687
237 | 3.4671
238 | 4.8233
239 | 4.3036
240 | 1.6488
241 | 2.9453
242 | 5.0096
243 | 3.175
244 | 4.2031
245 | 3.1667
246 | 5.7204
247 | 3.375
248 | 6.5483
249 | 4.2206
250 | 2.6631
251 | 3.5363
252 |
--------------------------------------------------------------------------------
/column-type-transform.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "metadata": {},
5 | "cell_type": "markdown",
6 | "source": "# Column Type Transformations\nCopyright (c) Microsoft Corporation. All rights reserved.<br>\nLicensed under the MIT License."
7 | },
8 | {
9 | "metadata": {},
10 | "cell_type": "markdown",
11 | "source": "DataPrep has the ability to transform column types."
12 | },
13 | {
14 | "metadata": {
15 | "trusted": true
16 | },
17 | "cell_type": "code",
18 | "source": "import azureml.dataprep as dprep\ndataflow = dprep.read_csv(path=r'data\\elements.csv')\ndataflow.head(3)",
19 | "execution_count": 1,
20 | "outputs": [
21 | {
22 | "output_type": "execute_result",
23 | "execution_count": 1,
24 | "data": {
25 | "text/html": "
\n\n
\n \n \n | \n ID | \n Symbol | \n Boiling Point | \n
\n \n \n \n | 0 | \n 1 | \n H | \n -252.87 | \n
\n \n | 1 | \n 53 | \n I | \n 184.3 | \n
\n \n | 2 | \n 2 | \n He | \n -268.93 | \n
\n \n
\n
",
26 | "text/plain": " ID Symbol Boiling Point\n0 1 H -252.87\n1 53 I 184.3\n2 2 He -268.93"
27 | },
28 | "metadata": {}
29 | }
30 | ]
31 | },
32 | {
33 | "metadata": {},
34 | "cell_type": "markdown",
35 | "source": "#### `to_long(columns)`"
36 | },
37 | {
38 | "metadata": {
39 | "trusted": true
40 | },
41 | "cell_type": "code",
42 | "source": "# Convert the boiling point to a 64 bit integer.\nintegers_only_dataflow = dataflow.to_long(['Boiling Point'])\nintegers_only_dataflow.head(3)",
43 | "execution_count": 2,
44 | "outputs": [
45 | {
46 | "output_type": "execute_result",
47 | "execution_count": 2,
48 | "data": {
49 | "text/html": "\n\n
\n \n \n | \n ID | \n Symbol | \n Boiling Point | \n
\n \n \n \n | 0 | \n 1 | \n H | \n -252 | \n
\n \n | 1 | \n 53 | \n I | \n 184 | \n
\n \n | 2 | \n 2 | \n He | \n -268 | \n
\n \n
\n
",
50 | "text/plain": " ID Symbol Boiling Point\n0 1 H -252\n1 53 I 184\n2 2 He -268"
51 | },
52 | "metadata": {}
53 | }
54 | ]
55 | },
56 | {
57 | "metadata": {
58 | "trusted": true
59 | },
60 | "cell_type": "code",
61 | "source": "",
62 | "execution_count": null,
63 | "outputs": []
64 | }
65 | ],
66 | "metadata": {
67 | "kernelspec": {
68 | "name": "python36",
69 | "display_name": "Python 3.6",
70 | "language": "python"
71 | },
72 | "language_info": {
73 | "mimetype": "text/x-python",
74 | "nbconvert_exporter": "python",
75 | "name": "python",
76 | "pygments_lexer": "ipython3",
77 | "version": "3.6.6",
78 | "file_extension": ".py",
79 | "codemirror_mode": {
80 | "version": 3,
81 | "name": "ipython"
82 | }
83 | }
84 | },
85 | "nbformat": 4,
86 | "nbformat_minor": 2
87 | }
--------------------------------------------------------------------------------
/package-json-representation.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "metadata": {},
5 | "cell_type": "markdown",
6 | "source": "A DataPrep Package can be saved to and loaded from a JSON string."
7 | },
8 | {
9 | "metadata": {},
10 | "cell_type": "markdown",
11 | "source": "# Work with JSON representation of Package\nCopyright (c) Microsoft Corporation. All rights reserved.<br>\nLicensed under the MIT License."
12 | },
13 | {
14 | "metadata": {
15 | "trusted": true
16 | },
17 | "cell_type": "code",
18 | "source": "# create a Dataflow and pack it into a Package\nimport azureml.dataprep as dprep\ndf = dprep.read_csv(path='https://dpreptestfiles.blob.core.windows.net/testfiles/read_csv_duplicate_headers.csv')\npkg = dprep.Package(df)",
19 | "execution_count": 1,
20 | "outputs": []
21 | },
22 | {
23 | "metadata": {
24 | "trusted": true
25 | },
26 | "cell_type": "code",
27 | "source": "# save Package to JSON string\njson_str = pkg.to_json()\njson_str",
28 | "execution_count": 2,
29 | "outputs": [
30 | {
31 | "output_type": "execute_result",
32 | "execution_count": 2,
33 | "data": {
34 | "text/plain": "'{\\n \"schemaVersion\": 63,\\n \"id\": \"1e865029-9dec-4664-a9ca-effecbace8c9\",\\n \"activities\": [\\n {\\n \"id\": \"9a2f0365-c518-4d54-b68c-e8ca31ef5b22\",\\n \"name\": \"dataflow\",\\n \"blocks\": [\\n {\\n \"id\": \"c60a8fb8-d189-477a-af3d-d5e163cb9eae\",\\n \"type\": \"Microsoft.DPrep.GetFilesBlock\",\\n \"arguments\": {\\n \"isArchive\": false,\\n \"path\": {\\n \"target\": 1,\\n \"resourceDetails\": [\\n {\\n \"path\": \"https://dpreptestfiles.blob.core.windows.net/testfiles/read_csv_duplicate_headers.csv\",\\n \"sas\": null,\\n \"storageAccountName\": null,\\n \"storageAccountKey\": null\\n }\\n ]\\n }\\n },\\n \"localData\": {},\\n \"isEnabled\": true,\\n \"name\": null,\\n \"annotation\": null\\n },\\n {\\n \"id\": \"6af3740f-32ff-4337-8201-b772de830251\",\\n \"type\": \"Microsoft.DPrep.ParseDelimitedBlock\",\\n \"arguments\": {\\n \"columnHeadersMode\": 3,\\n \"fileEncoding\": 0,\\n \"handleQuotedLineBreaks\": false,\\n \"preview\": false,\\n \"separator\": \",\",\\n \"skipRows\": 0,\\n \"skipRowsMode\": 0\\n },\\n \"localData\": {},\\n \"isEnabled\": true,\\n \"name\": null,\\n \"annotation\": null\\n },\\n {\\n \"id\": \"f46f529c-0a0d-4a88-b398-734a71f42d48\",\\n \"type\": \"Microsoft.DPrep.DropColumnsBlock\",\\n \"arguments\": {\\n \"columns\": {\\n \"type\": 0,\\n \"details\": {\\n \"selectedColumns\": [\\n \"Path\"\\n ]\\n }\\n }\\n },\\n \"localData\": {},\\n \"isEnabled\": true,\\n \"name\": null,\\n \"annotation\": null\\n }\\n ],\\n \"inspectors\": []\\n }\\n ],\\n \"runConfigurations\": []\\n}'"
35 | },
36 | "metadata": {}
37 | }
38 | ]
39 | },
40 | {
41 | "metadata": {
42 | "trusted": true
43 | },
44 | "cell_type": "code",
45 | "source": "# load Package from JSON string\npkg_loaded = dprep.Package.from_json(json_str)\ndf_loaded = pkg_loaded.dataflows[0]",
46 | "execution_count": 3,
47 | "outputs": []
48 | },
49 | {
50 | "metadata": {
51 | "trusted": true
52 | },
53 | "cell_type": "code",
54 | "source": "",
55 | "execution_count": null,
56 | "outputs": []
57 | }
58 | ],
59 | "metadata": {
60 | "kernelspec": {
61 | "name": "python36",
62 | "display_name": "Python 3.6",
63 | "language": "python"
64 | },
65 | "language_info": {
66 | "mimetype": "text/x-python",
67 | "nbconvert_exporter": "python",
68 | "name": "python",
69 | "pygments_lexer": "ipython3",
70 | "version": "3.6.6",
71 | "file_extension": ".py",
72 | "codemirror_mode": {
73 | "version": 3,
74 | "name": "ipython"
75 | }
76 | }
77 | },
78 | "nbformat": 4,
79 | "nbformat_minor": 2
80 | }
--------------------------------------------------------------------------------
/data/secrets.dprep:
--------------------------------------------------------------------------------
1 | {
2 | "schemaVersion": 61,
3 | "id": "0ca59762-2256-45e6-b406-e58a4bb280b9",
4 | "activities": [
5 | {
6 | "id": "b308e5b8-9b2a-47f8-9d32-0f542b4a34a4",
7 | "name": "read_csv_duplicate_headers",
8 | "blocks": [
9 | {
10 | "id": "8d9ec228-6a4b-4abf-afb7-65f58dda1581",
11 | "type": "Microsoft.DPrep.GetFilesBlock",
12 | "arguments": {
13 | "path": {
14 | "target": 1,
15 | "resourceDetails": [
16 | {
17 | "path": "https://dpreptestfiles.blob.core.windows.net/testfiles/read_csv_duplicate_headers.csv",
18 | "sas": {
19 | "id": "https://dpreptestfiles.blob.core.windows.net/testfiles/read_csv_duplicate_headers.csv",
20 | "secretType": "AzureMLSecret"
21 | },
22 | "storageAccountName": null,
23 | "storageAccountKey": null
24 | }
25 | ]
26 | }
27 | },
28 | "isEnabled": true,
29 | "name": null,
30 | "annotation": null
31 | },
32 | {
33 | "id": "4ad0460f-ec65-47c0-a0a4-44345404a462",
34 | "type": "Microsoft.DPrep.ParseDelimitedBlock",
35 | "arguments": {
36 | "columnHeadersMode": 3,
37 | "fileEncoding": 0,
38 | "handleQuotedLineBreaks": false,
39 | "preview": false,
40 | "separator": ",",
41 | "skipRows": 0,
42 | "skipRowsMode": 0
43 | },
44 | "isEnabled": true,
45 | "name": null,
46 | "annotation": null
47 | },
48 | {
49 | "id": "1a3e11ba-5854-48da-aa47-53af61beb782",
50 | "type": "Microsoft.DPrep.DropColumnsBlock",
51 | "arguments": {
52 | "columns": {
53 | "type": 0,
54 | "details": {
55 | "selectedColumns": [
56 | "Path"
57 | ]
58 | }
59 | }
60 | },
61 | "isEnabled": true,
62 | "name": null,
63 | "annotation": null
64 | }
65 | ],
66 | "inspectors": []
67 | },
68 | {
69 | "id": "2d1fd227-0e7c-41de-9606-ca7eced82e07",
70 | "name": "population",
71 | "blocks": [
72 | {
73 | "id": "27060820-095e-48d1-bdbd-511f7e369105",
74 | "type": "Microsoft.DPrep.GetFilesBlock",
75 | "arguments": {
76 | "path": {
77 | "target": 1,
78 | "resourceDetails": [
79 | {
80 | "path": "https://dpreptestfiles.blob.core.windows.net/testfiles/population.csv",
81 | "sas": {
82 | "id": "https://dpreptestfiles.blob.core.windows.net/testfiles/population.csv",
83 | "secretType": "AzureMLSecret"
84 | },
85 | "storageAccountName": null,
86 | "storageAccountKey": null
87 | }
88 | ]
89 | }
90 | },
91 | "isEnabled": true,
92 | "name": null,
93 | "annotation": null
94 | },
95 | {
96 | "id": "e7b2a399-9300-4fe5-8959-0d4ae9fc9172",
97 | "type": "Microsoft.DPrep.ParseDelimitedBlock",
98 | "arguments": {
99 | "columnHeadersMode": 3,
100 | "fileEncoding": 0,
101 | "handleQuotedLineBreaks": false,
102 | "preview": false,
103 | "separator": ",",
104 | "skipRows": 0,
105 | "skipRowsMode": 0
106 | },
107 | "isEnabled": true,
108 | "name": null,
109 | "annotation": null
110 | },
111 | {
112 | "id": "5572e00a-dd5e-41fe-b301-3e66d0f4c5e2",
113 | "type": "Microsoft.DPrep.DropColumnsBlock",
114 | "arguments": {
115 | "columns": {
116 | "type": 0,
117 | "details": {
118 | "selectedColumns": [
119 | "Path"
120 | ]
121 | }
122 | }
123 | },
124 | "isEnabled": true,
125 | "name": null,
126 | "annotation": null
127 | }
128 | ],
129 | "inspectors": []
130 | },
131 | {
132 | "id": "ec2c9cf9-beb9-4ebd-b4d2-8ba076c6a3db",
133 | "name": "top_films",
134 | "blocks": [
135 | {
136 | "id": "6ac0814d-9e5b-4db5-8cc1-f11dc3db531d",
137 | "type": "Microsoft.DPrep.GetFilesBlock",
138 | "arguments": {
139 | "path": {
140 | "target": 1,
141 | "resourceDetails": [
142 | {
143 | "path": "https://dpreptestfiles.blob.core.windows.net/testfiles/TopFilms.csv",
144 | "sas": {
145 | "id": "https://dpreptestfiles.blob.core.windows.net/testfiles/TopFilms.csv",
146 | "secretType": "AzureMLSecret"
147 | },
148 | "storageAccountName": null,
149 | "storageAccountKey": null
150 | }
151 | ]
152 | }
153 | },
154 | "isEnabled": true,
155 | "name": null,
156 | "annotation": null
157 | },
158 | {
159 | "id": "0cd162d2-8395-4369-aa78-e431456c9201",
160 | "type": "Microsoft.DPrep.ParseDelimitedBlock",
161 | "arguments": {
162 | "columnHeadersMode": 3,
163 | "fileEncoding": 0,
164 | "handleQuotedLineBreaks": false,
165 | "preview": false,
166 | "separator": ",",
167 | "skipRows": 0,
168 | "skipRowsMode": 0
169 | },
170 | "isEnabled": true,
171 | "name": null,
172 | "annotation": null
173 | },
174 | {
175 | "id": "ceb32a6b-ba57-4c90-a4d0-5913c211961e",
176 | "type": "Microsoft.DPrep.DropColumnsBlock",
177 | "arguments": {
178 | "columns": {
179 | "type": 0,
180 | "details": {
181 | "selectedColumns": [
182 | "Path"
183 | ]
184 | }
185 | }
186 | },
187 | "isEnabled": true,
188 | "name": null,
189 | "annotation": null
190 | }
191 | ],
192 | "inspectors": []
193 | }
194 | ],
195 | "runConfigurations": []
196 | }
--------------------------------------------------------------------------------
/data/crime0-10.dprep:
--------------------------------------------------------------------------------
1 | {
2 | "schemaVersion": 59,
3 | "id": "1ba93a7c-e711-464f-9a70-1c491e28a66f",
4 | "activities": [
5 | {
6 | "id": "75637565-60ad-4baa-87d3-396a7930cfe7",
7 | "name": "crime0-10",
8 | "blocks": [
9 | {
10 | "id": "ba5a8061-129e-4618-953a-ce3e89c8f2cb",
11 | "type": "Microsoft.DPrep.GetFilesBlock",
12 | "arguments": {
13 | "path": {
14 | "target": 0,
15 | "resourceDetails": [
16 | {
17 | "path": "./crime0-10.csv"
18 | }
19 | ]
20 | }
21 | },
22 | "isEnabled": true,
23 | "name": null,
24 | "annotation": null
25 | },
26 | {
27 | "id": "1b345643-6b60-4ca1-99f9-2a64ae932a23",
28 | "type": "Microsoft.DPrep.ParseDelimitedBlock",
29 | "arguments": {
30 | "columnHeadersMode": 1,
31 | "fileEncoding": 0,
32 | "handleQuotedLineBreaks": false,
33 | "preview": false,
34 | "separator": ",",
35 | "skipRowsMode": 0
36 | },
37 | "isEnabled": true,
38 | "name": null,
39 | "annotation": null
40 | },
41 | {
42 | "id": "12cf73a2-1487-4915-bfa7-c86be7de08c0",
43 | "type": "Microsoft.DPrep.SetColumnTypesBlock",
44 | "arguments": {
45 | "columnConversion": [
46 | {
47 | "column": {
48 | "type": 2,
49 | "details": {
50 | "selectedColumn": "ID"
51 | }
52 | },
53 | "typeProperty": 3
54 | },
55 | {
56 | "column": {
57 | "type": 2,
58 | "details": {
59 | "selectedColumn": "IUCR"
60 | }
61 | },
62 | "typeProperty": 3
63 | },
64 | {
65 | "column": {
66 | "type": 2,
67 | "details": {
68 | "selectedColumn": "Domestic"
69 | }
70 | },
71 | "typeProperty": 1
72 | },
73 | {
74 | "column": {
75 | "type": 2,
76 | "details": {
77 | "selectedColumn": "Beat"
78 | }
79 | },
80 | "typeProperty": 3
81 | },
82 | {
83 | "column": {
84 | "type": 2,
85 | "details": {
86 | "selectedColumn": "District"
87 | }
88 | },
89 | "typeProperty": 3
90 | },
91 | {
92 | "column": {
93 | "type": 2,
94 | "details": {
95 | "selectedColumn": "Ward"
96 | }
97 | },
98 | "typeProperty": 3
99 | },
100 | {
101 | "column": {
102 | "type": 2,
103 | "details": {
104 | "selectedColumn": "Community Area"
105 | }
106 | },
107 | "typeProperty": 3
108 | },
109 | {
110 | "column": {
111 | "type": 2,
112 | "details": {
113 | "selectedColumn": "Year"
114 | }
115 | },
116 | "typeProperty": 3
117 | },
118 | {
119 | "column": {
120 | "type": 2,
121 | "details": {
122 | "selectedColumn": "Longitude"
123 | }
124 | },
125 | "typeProperty": 3
126 | },
127 | {
128 | "column": {
129 | "type": 2,
130 | "details": {
131 | "selectedColumn": "Arrest"
132 | }
133 | },
134 | "typeProperty": 1
135 | },
136 | {
137 | "column": {
138 | "type": 2,
139 | "details": {
140 | "selectedColumn": "X Coordinate"
141 | }
142 | },
143 | "typeProperty": 3
144 | },
145 | {
146 | "column": {
147 | "type": 2,
148 | "details": {
149 | "selectedColumn": "Updated On"
150 | }
151 | },
152 | "typeArguments": {
153 | "dateTimeFormats": [
154 | "%m/%d/%Y %I:%M:%S %p"
155 | ]
156 | },
157 | "typeProperty": 4
158 | },
159 | {
160 | "column": {
161 | "type": 2,
162 | "details": {
163 | "selectedColumn": "Date"
164 | }
165 | },
166 | "typeArguments": {
167 | "dateTimeFormats": [
168 | "%m/%d/%Y %I:%M:%S %p"
169 | ]
170 | },
171 | "typeProperty": 4
172 | },
173 | {
174 | "column": {
175 | "type": 2,
176 | "details": {
177 | "selectedColumn": "Y Coordinate"
178 | }
179 | },
180 | "typeProperty": 3
181 | },
182 | {
183 | "column": {
184 | "type": 2,
185 | "details": {
186 | "selectedColumn": "Latitude"
187 | }
188 | },
189 | "typeProperty": 3
190 | }
191 | ]
192 | },
193 | "isEnabled": true,
194 | "name": null,
195 | "annotation": null
196 | },
197 | {
198 | "id": "5f370fdf-2fde-4f18-8069-93ef5800bf0c",
199 | "type": "Microsoft.DPrep.SampleBlock",
200 | "arguments": {
201 | "activeSample": "0afde520-3a41-4fef-8d20-eaa07d588924",
202 | "samples": [
203 | {
204 | "allowAutoGen": true,
205 | "isDisabled": false,
206 | "sampleId": "0afde520-3a41-4fef-8d20-eaa07d588924",
207 | "sampleName": "Top 10000",
208 | "sampleRevision": "d8663336-152a-462f-bb57-686dc7a0843c",
209 | "sampleRunner": {
210 | "id": null,
211 | "type": 0
212 | },
213 | "sampleStrategy": 0,
214 | "topArguments": {
215 | "sampleCount": 10000
216 | }
217 | }
218 | ]
219 | },
220 | "isEnabled": true,
221 | "name": null,
222 | "annotation": null
223 | },
224 | {
225 | "id": "dfd62543-9285-412b-a930-0aeaaffde699",
226 | "type": "Microsoft.DPrep.HandlePathColumnBlock",
227 | "arguments": {
228 | "pathColumnOperation": 0
229 | },
230 | "isEnabled": true,
231 | "name": null,
232 | "annotation": null
233 | }
234 | ],
235 | "inspectors": []
236 | }
237 | ],
238 | "runConfigurations": []
239 | }
--------------------------------------------------------------------------------
/quantile-transformation.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "metadata": {},
5 | "cell_type": "markdown",
6 | "source": "# Quantile Transformation\nCopyright (c) Microsoft Corporation. All rights reserved.\nLicensed under the MIT License.\n\nDataPrep has the ability to perform quantile transformation on a numeric column. This transformation can transform the data into a normal or uniform distribution. Values bigger than the learnt boundaries will simply be clipped to the learnt boundaries when applying quantile transformation.\n\nLet's load a sample of the median income of California households in different suburbs from the 1990 census data. From the data profile, we can see that the minimum and maximum values are 0.9946 and 15, respectively."
7 | },
8 | {
9 | "metadata": {
10 | "trusted": true
11 | },
12 | "cell_type": "code",
13 | "source": "!pip install azureml",
14 | "execution_count": 1,
15 | "outputs": [
16 | {
17 | "output_type": "stream",
18 | "text": "Requirement already satisfied: azureml in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (0.2.7)\nRequirement already satisfied: python-dateutil in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml) (2.7.3)\nRequirement already satisfied: requests in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml) (2.19.1)\nRequirement already satisfied: pandas in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml) (0.22.0)\nRequirement already satisfied: six>=1.5 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from python-dateutil->azureml) (1.11.0)\nRequirement already satisfied: chardet<3.1.0,>=3.0.2 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (3.0.4)\nRequirement already satisfied: certifi>=2017.4.17 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (2018.8.24)\nRequirement already satisfied: urllib3<1.24,>=1.21.1 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (1.23)\nRequirement already satisfied: idna<2.8,>=2.5 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (2.7)\nRequirement already satisfied: pytz>=2011k in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from pandas->azureml) (2018.5)\nRequirement already satisfied: numpy>=1.9.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from pandas->azureml) (1.14.5)\n",
19 | "name": "stdout"
20 | }
21 | ]
22 | },
23 | {
24 | "metadata": {
25 | "trusted": true
26 | },
27 | "cell_type": "code",
28 | "source": "import azureml.dataprep as dprep\n\ndf = dprep.read_csv(path='./data/median_income.csv').set_column_types(type_conversions={\n 'median_income': dprep.TypeConverter(dprep.FieldType.DECIMAL)\n})\ndf.get_profile()",
29 | "execution_count": 2,
30 | "outputs": [
31 | {
32 | "output_type": "execute_result",
33 | "execution_count": 2,
34 | "data": {
35 | "text/html": "\n \n \n | \n Type | \n Min | \n Max | \n Count | \n Missing Count | \n Error Count | \n Lower Quartile | \n Median | \n Upper Quartile | \n Standard Deviation | \n Mean | \n
\n \n \n \n | median_income | \n FieldType.DECIMAL | \n 0.9946 | \n 15.0 | \n 250.0 | \n 0.0 | \n 0.0 | \n 2.6907 | \n 3.6307 | \n 4.77335 | \n 2.026679 | \n 4.007843 | \n
\n \n
",
36 | "text/plain": "ColumnProfile\n name: median_income\n type: FieldType.DECIMAL\n\n min: 0.9946\n max: 15.0\n count: 250.0\n missing_count: 0.0\n error_count: 0.0\n\n lower_quartile: 2.6907\n median: 3.6307\n upper_quartile: 4.773350000000001\n std: 2.026679472255346\n mean: 4.007842799999996"
37 | },
38 | "metadata": {}
39 | }
40 | ]
41 | },
42 | {
43 | "metadata": {},
44 | "cell_type": "markdown",
45 | "source": "Let's now apply quantile transformation to `median_income` and see how that affects the data. We will apply quantile transformation twice: once mapping the data to a Uniform(0, 1) distribution, and once mapping it to a Normal(0, 1) distribution.\n\nFrom the data profile, we can see that the min and max of the uniform median income lie between 0 and 1, and that the mean and standard deviation of the normal median income are close to 0 and 1 respectively.\n\n*Note: for the normal distribution, we clip the values at the ends, as the 0th percentile and the 100th percentile are -Inf and Inf respectively.*"
46 | },
47 | {
48 | "metadata": {
49 | "trusted": true
50 | },
51 | "cell_type": "code",
52 | "source": "df = df.quantile_transform(source_column='median_income', new_column='median_income_uniform', quantiles_count=5)\ndf = df.quantile_transform(source_column='median_income', new_column='median_income_normal', \n quantiles_count=5, output_distribution=\"Normal\")\ndf.get_profile()",
53 | "execution_count": 3,
54 | "outputs": [
55 | {
56 | "output_type": "execute_result",
57 | "execution_count": 3,
58 | "data": {
59 | "text/html": "\n \n \n | \n Type | \n Min | \n Max | \n Count | \n Missing Count | \n Error Count | \n Lower Quartile | \n Median | \n Upper Quartile | \n Standard Deviation | \n Mean | \n
\n \n \n \n | median_income | \n FieldType.DECIMAL | \n 0.994600 | \n 15.000000 | \n 250.0 | \n 0.0 | \n 0.0 | \n 2.690700 | \n 3.630700 | \n 4.773350 | \n 2.026679 | \n 4.007843 | \n
\n \n | median_income_normal | \n FieldType.DECIMAL | \n -7.941345 | \n 7.941444 | \n 250.0 | \n 0.0 | \n 0.0 | \n -0.671590 | \n -0.000337 | \n 0.667810 | \n 1.021506 | \n -0.060922 | \n
\n \n | median_income_uniform | \n FieldType.DECIMAL | \n 0.000000 | \n 1.000000 | \n 250.0 | \n 0.0 | \n 0.0 | \n 0.250934 | \n 0.499866 | \n 0.747861 | \n 0.252830 | \n 0.484762 | \n
\n \n
",
60 | "text/plain": "ColumnProfile\n name: median_income\n type: FieldType.DECIMAL\n\n min: 0.9946\n max: 15.0\n count: 250.0\n missing_count: 0.0\n error_count: 0.0\n\n lower_quartile: 2.6907\n median: 3.6307\n upper_quartile: 4.773350000000001\n std: 2.026679472255346\n mean: 4.007842799999996\n\nColumnProfile\n name: median_income_normal\n type: FieldType.DECIMAL\n\n min: -7.941345326170997\n max: 7.94144448741598\n count: 250.0\n missing_count: 0.0\n error_count: 0.0\n\n lower_quartile: -0.6715898847385642\n median: -0.00033696356609359737\n upper_quartile: 0.6678101623094225\n std: 1.021505801777812\n mean: -0.06092218967843191\n\nColumnProfile\n name: median_income_uniform\n type: FieldType.DECIMAL\n\n min: 0.0\n max: 1.0\n count: 250.0\n missing_count: 0.0\n error_count: 0.0\n\n lower_quartile: 0.25093366375866033\n median: 0.4998655717951272\n upper_quartile: 0.7478610044020887\n std: 0.25283034846216024\n mean: 0.4847624122367444"
61 | },
62 | "metadata": {}
63 | }
64 | ]
65 | },
66 | {
67 | "metadata": {},
68 | "cell_type": "markdown",
69 | "source": "Let's now save the dataflow which we will later load in the operationalization notebook."
70 | },
71 | {
72 | "metadata": {
73 | "trusted": true
74 | },
75 | "cell_type": "code",
76 | "source": "from tempfile import mkdtemp\nfrom os import path\n\ntmp_dir = mkdtemp()\npackage_path = path.join(tmp_dir, 'quantile_transform.dprep')\npackage = dprep.Package(arg=df)\npackage.save(package_path)\nprint('Package saved to: \"{}\"'.format(package_path))",
77 | "execution_count": 3,
78 | "outputs": [
79 | {
80 | "output_type": "stream",
81 | "text": "Package saved to: \"/tmp/tmp29cvg68a/quantile_transform.dprep\"\n",
82 | "name": "stdout"
83 | }
84 | ]
85 | },
86 | {
87 | "metadata": {
88 | "trusted": true
89 | },
90 | "cell_type": "code",
91 | "source": "",
92 | "execution_count": null,
93 | "outputs": []
94 | }
95 | ],
96 | "metadata": {
97 | "kernelspec": {
98 | "name": "python36",
99 | "display_name": "Python 3.6",
100 | "language": "python"
101 | },
102 | "language_info": {
103 | "mimetype": "text/x-python",
104 | "nbconvert_exporter": "python",
105 | "name": "python",
106 | "pygments_lexer": "ipython3",
107 | "version": "3.6.6",
108 | "file_extension": ".py",
109 | "codemirror_mode": {
110 | "version": 3,
111 | "name": "ipython"
112 | }
113 | }
114 | },
115 | "nbformat": 4,
116 | "nbformat_minor": 2
117 | }
--------------------------------------------------------------------------------
/read-pandas-dataframe.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "metadata": {
5 | "trusted": true
6 | },
7 | "cell_type": "code",
8 | "source": "!pip install azureml",
9 | "execution_count": 1,
10 | "outputs": [
11 | {
12 | "output_type": "stream",
13 | "text": "Requirement already satisfied: azureml in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (0.2.7)\nRequirement already satisfied: pandas in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml) (0.22.0)\nRequirement already satisfied: requests in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml) (2.19.1)\nRequirement already satisfied: python-dateutil in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml) (2.7.3)\nRequirement already satisfied: pytz>=2011k in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from pandas->azureml) (2018.5)\nRequirement already satisfied: numpy>=1.9.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from pandas->azureml) (1.14.5)\nRequirement already satisfied: certifi>=2017.4.17 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (2018.8.24)\nRequirement already satisfied: idna<2.8,>=2.5 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (2.7)\nRequirement already satisfied: urllib3<1.24,>=1.21.1 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (1.23)\nRequirement already satisfied: chardet<3.1.0,>=3.0.2 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (3.0.4)\nRequirement already satisfied: six>=1.5 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from python-dateutil->azureml) (1.11.0)\n",
14 | "name": "stdout"
15 | }
16 | ]
17 | },
18 | {
19 | "metadata": {
20 | "trusted": true
21 | },
22 | "cell_type": "code",
23 | "source": "import azureml.dataprep as dprep",
24 | "execution_count": 2,
25 | "outputs": []
26 | },
27 | {
28 | "metadata": {
29 | "trusted": true
30 | },
31 | "cell_type": "code",
32 | "source": "dflow = dprep.read_excel(path='./data/excel.xlsx')\ndflow = dflow.drop_columns(columns=['Column1'])\ndf = dflow.to_pandas_dataframe()\ndf.head(10)",
33 | "execution_count": 3,
34 | "outputs": [
35 | {
36 | "output_type": "execute_result",
37 | "execution_count": 3,
38 | "data": {
39 | "text/html": "\n\n
\n \n \n | \n Column2 | \n Column3 | \n Column4 | \n Column5 | \n Column6 | \n Column7 | \n Column8 | \n
\n \n \n \n | 0 | \n Iron, IVB | \n 6e+07 | \n Found | \n 1920 | \n http://www.lpi.usra.edu/meteor/metbull.php?cod... | \n -19.5833 | \n 17.9167 | \n
\n \n | 1 | \n Iron, IIIAB | \n 5.82e+07 | \n Found | \n 1818 | \n http://www.lpi.usra.edu/meteor/metbull.php?cod... | \n 76.1333 | \n -64.9333 | \n
\n \n | 2 | \n Iron, IAB-MG | \n 5e+07 | \n Found | \n 1576 | \n http://www.lpi.usra.edu/meteor/metbull.php?cod... | \n -27.4667 | \n -60.5833 | \n
\n \n | 3 | \n Iron, IAB-MG | \n 3e+07 | \n Found | \n 1891 | \n http://www.lpi.usra.edu/meteor/metbull.php?cod... | \n 35.05 | \n -111.033 | \n
\n \n | 4 | \n Iron, IIIE | \n 2.8e+07 | \n Found | \n 1898 | \n http://www.lpi.usra.edu/meteor/metbull.php?cod... | \n 47 | \n 88 | \n
\n \n | 5 | \n Iron, IVA | \n 2.6e+07 | \n Found | \n 1836 | \n http://www.lpi.usra.edu/meteor/metbull.php?cod... | \n -25.5 | \n 18 | \n
\n \n | 6 | \n Iron, IIIAB | \n 2.43e+07 | \n Found | \n 1852 | \n http://www.lpi.usra.edu/meteor/metbull.php?cod... | \n 27 | \n -105.1 | \n
\n \n | 7 | \n Iron, IAB-ung | \n 2.4e+07 | \n Found | \n 1911 | \n http://www.lpi.usra.edu/meteor/metbull.php?cod... | \n -30.7833 | \n 127.55 | \n
\n \n | 8 | \n Iron, IIAB | \n 2.3e+07 | \n Fell | \n 1947 | \n http://www.lpi.usra.edu/meteor/metbull.php?cod... | \n 46.16 | \n 134.653 | \n
\n \n | 9 | \n Iron, ungrouped | \n 2.2e+07 | \n Found | \n 1863 | \n http://www.lpi.usra.edu/meteor/metbull.php?cod... | \n 26.2 | \n -107.833 | \n
\n \n
\n
",
40 | "text/plain": " Column2 Column3 Column4 Column5 \\\n0 Iron, IVB 6e+07 Found 1920 \n1 Iron, IIIAB 5.82e+07 Found 1818 \n2 Iron, IAB-MG 5e+07 Found 1576 \n3 Iron, IAB-MG 3e+07 Found 1891 \n4 Iron, IIIE 2.8e+07 Found 1898 \n5 Iron, IVA 2.6e+07 Found 1836 \n6 Iron, IIIAB 2.43e+07 Found 1852 \n7 Iron, IAB-ung 2.4e+07 Found 1911 \n8 Iron, IIAB 2.3e+07 Fell 1947 \n9 Iron, ungrouped 2.2e+07 Found 1863 \n\n Column6 Column7 Column8 \n0 http://www.lpi.usra.edu/meteor/metbull.php?cod... -19.5833 17.9167 \n1 http://www.lpi.usra.edu/meteor/metbull.php?cod... 76.1333 -64.9333 \n2 http://www.lpi.usra.edu/meteor/metbull.php?cod... -27.4667 -60.5833 \n3 http://www.lpi.usra.edu/meteor/metbull.php?cod... 35.05 -111.033 \n4 http://www.lpi.usra.edu/meteor/metbull.php?cod... 47 88 \n5 http://www.lpi.usra.edu/meteor/metbull.php?cod... -25.5 18 \n6 http://www.lpi.usra.edu/meteor/metbull.php?cod... 27 -105.1 \n7 http://www.lpi.usra.edu/meteor/metbull.php?cod... -30.7833 127.55 \n8 http://www.lpi.usra.edu/meteor/metbull.php?cod... 46.16 134.653 \n9 http://www.lpi.usra.edu/meteor/metbull.php?cod... 26.2 -107.833 "
41 | },
42 | "metadata": {}
43 | }
44 | ]
45 | },
46 | {
47 | "metadata": {},
48 | "cell_type": "markdown",
49 | "source": "## read_pandas_dataframe\n\nThere are situations where you may already have some data in the form of a pandas DataFrame.\nThe steps taken to get to this DataFrame may be non-trivial or not easy to convert to dprep operations. The 'read_pandas_dataframe' reader can take a DataFrame and use it as the datasource for a Dataflow.\nYou must also pass in a path to a directory (that exists) where DataPrep can store the contents of the DataFrame. The files written to this directory will be named 'part-00000' and so on; they are written out in DataPrep's internal row-based file format."
50 | },
51 | {
52 | "metadata": {
53 | "trusted": true
54 | },
55 | "cell_type": "code",
56 | "source": "import shutil\ncache_dir = 'df_dflow'\nshutil.rmtree(cache_dir, ignore_errors=True)\ndf_dflow = dprep.read_pandas_dataframe(df, cache_dir)",
57 | "execution_count": 5,
58 | "outputs": [
59 | {
60 | "output_type": "error",
61 | "ename": "AttributeError",
62 | "evalue": "module 'azureml.dataprep' has no attribute 'read_pandas_dataframe'",
63 | "traceback": [
64 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
65 | "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
66 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mcache_dir\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'df_dflow'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mshutil\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrmtree\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcache_dir\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mignore_errors\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mdf_dflow\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdprep\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_pandas_dataframe\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcache_dir\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
67 | "\u001b[0;31mAttributeError\u001b[0m: module 'azureml.dataprep' has no attribute 'read_pandas_dataframe'"
68 | ]
69 | }
70 | ]
71 | },
72 | {
73 | "metadata": {
74 | "trusted": true
75 | },
76 | "cell_type": "code",
77 | "source": "df_dflow.head(10)",
78 | "execution_count": null,
79 | "outputs": []
80 | },
81 | {
82 | "metadata": {
83 | "trusted": true
84 | },
85 | "cell_type": "code",
86 | "source": "",
87 | "execution_count": null,
88 | "outputs": []
89 | }
90 | ],
91 | "metadata": {
92 | "execute_as_test": false,
93 | "kernelspec": {
94 | "name": "python36",
95 | "display_name": "Python 3.6",
96 | "language": "python"
97 | },
98 | "language_info": {
99 | "mimetype": "text/x-python",
100 | "nbconvert_exporter": "python",
101 | "name": "python",
102 | "pygments_lexer": "ipython3",
103 | "version": "3.6.6",
104 | "file_extension": ".py",
105 | "codemirror_mode": {
106 | "version": 3,
107 | "name": "ipython"
108 | }
109 | }
110 | },
111 | "nbformat": 4,
112 | "nbformat_minor": 2
113 | }
--------------------------------------------------------------------------------
/data/median_income_transformed.csv:
--------------------------------------------------------------------------------
1 | median_income,median_income_uniform,median_income_normal
2 | 4.4896,0.688927015969381,0.4928112398942898
3 | 2.1029,0.16242159563866576,-0.9845540061415601
4 | 2.3889,0.20433495515563627,-0.8262365643809355
5 | 3.707,0.5167832475474021,0.04208177981426486
6 | 6.4788,0.7918154943685715,0.8127367410844715
7 | 4.4074,0.6708459812590735,0.44225037403262457
8 | 5.2907,0.7627885954411082,0.71530142355699
9 | 1.5156,0.07635265842077493,-1.430040774223022
10 | 8.4411,0.8397571522806675,0.9934602851577232
11 | 4.4085,0.6710879415775812,0.44291928632820865
12 | 2.1439,0.16843015417081886,-0.960387338010023
13 | 2.8971,0.3028380993334766,-0.5162551753422808
14 | 6.1008,0.7825804402531089,0.780937730437047
15 | 3.5258,0.47180713824983866,-0.07072794893657437
16 | 2.7694,0.2685175231133089,-0.6173027607303029
17 | 2.2356,0.18186880825370766,-0.9082661605741446
18 | 1.9509,0.14014596400726886,-1.0796637810446446
19 | 4.0905,0.6011394131362455,0.25629744014575906
20 | 3.6726,0.5092164884958867,0.023104366055624576
21 | 3.1696,0.3760750376263169,-0.31580558952782856
22 | 2.5389,0.2263174863708306,-0.7510293827903433
23 | 3.0319,0.3390668673403568,-0.4150111583096138
24 | 4.6779,0.7303462232193919,0.61386044030116
25 | 2.9076,0.3056600731025585,-0.5081899386067169
26 | 2.8616,0.29329714039991395,-0.5437779548183906
27 | 1.4722,0.0699923793891787,-1.475847787309027
28 | 5.6413,0.7713542302900003,0.7433140959507171
29 | 2.1167,0.1644439885104636,-0.9763562034967139
30 | 4.7308,0.7419823149003563,0.6494688564486208
31 | 4.8173,0.7512227895726955,0.6783427156995535
32 | 2.3438,0.19772554077026783,-0.8497733992585795
33 | 1.7333,0.10825663872442697,-1.235852818864947
34 | 1.4429,0.06569845829181076,-1.5086161030849279
35 | 2.3253,0.19501436192039387,-0.8595652745311925
36 | 2.4022,0.20628407292338352,-0.8193826233304142
37 | 3.4048,0.4392872500537518,-0.15277653846062797
38 | 6.6073,0.7949549241406269,0.8237349930522744
39 | 4.1080000000000005,0.6049887818397783,0.2662814785234653
40 | 4.2829,0.6434604724825127,0.3677239840415221
41 | 1.5727,0.0847206753033589,-1.3740010688161062
42 | 2.5211,0.22370889266662755,-0.7597270013437459
43 | 4.2679,0.6401610135937705,0.35888920867930674
44 | 4.7328,0.7424222427521885,0.6508311116191982
45 | 4.7069,0.7367251770709602,0.6332817852325006
46 | 2.465,0.21548742599214485,-0.7875245686101101
47 | 5.0267,0.7563387163763406,0.6945735910123612
48 | 2.8043,0.277897226402924,-0.5890996132954226
49 | 2.4053,0.20673837856849753,-0.8177906162820535
50 | 1.2176,0.03268069640658888,-1.8427782558151542
51 | 2.39,0.2044961603845477,-0.8256682268514074
52 | 3.6364,0.501253794377722,0.0031428016114447123
53 | 6.0162,0.780513547189172,0.7739287859106961
54 | 2.8088,0.2791066437325306,-0.5854974399696314
55 | 3.3984,0.4375671898516448,-0.15714017005326586
56 | 4.5,0.6912146407989088,0.4992962189785684
57 | 3.9079,0.5609740002639567,0.1534391166508579
58 | 4.9618,0.7547531211062519,0.6895237101278
59 | 2.9344,0.3128628251988819,-0.4877518079508566
60 | 2.4283,0.21010903335482733,-0.8060429810797759
61 | 3.7388,0.5237781003915357,0.05963819257139614
62 | 1.6021,0.089029251421537,-1.346757015256811
63 | 2.3352,0.1964652089805967,-0.8543151087700283
64 | 4.0982,0.6028331353658,0.26068721102120757
65 | 1.9531,0.1404683744650917,-1.078217399304828
66 | 3.2386,0.39461943668028376,-0.26729910818310126
67 | 5.1169,0.7585424250568029,0.7016216442421952
68 | 4.692,0.7334477145748096,0.6232738623769376
69 | 4.0,0.5812326778408341,0.20504797284322906
70 | 6.4238,0.7904717695634116,0.8080592738489488
71 | 3.7375,0.5234921472878448,0.05892015392758171
72 | 2.8233,0.28300365512792947,-0.5739416160725967
73 | 2.8009,0.2769834444205546,-0.5918263314054893
74 | 3.767,0.5299810831023711,0.07522231004048434
75 | 3.6761,0.5099863622365932,0.025034712726592745
76 | 5.0282,0.7563753634164814,0.6946905154674345
77 | 3.5296,0.47282842399483976,-0.06816178418735802
78 | 5.215,0.7609391414820063,0.70932677431432
79 | 4.0125,0.583982226914786,0.21209163422132077
80 | 9.4667,0.8648139551928855,1.1022060793492992
81 | 5.9062,0.7778260975788522,0.7648719390554228
82 | 3.9864,0.5782411684483745,0.19739599549652825
83 | 2.0734,0.15809836449967754,-1.0023041289181829
84 | 2.875,0.29689851644807563,-0.5333417486766315
85 | 3.3611,0.4275424639862395,-0.1826343528680402
86 | 2.8214,0.2824930122554289,-0.5754514381784691
87 | 0.9946,9.999999977795539e-08,-5.199337582605575
88 | 4.5446,0.7010250318947692,0.5273508961812362
89 | 4.6908,0.7331837578637103,0.6224705781821348
90 | 9.3198,0.861224988395104,1.085839464021226
91 | 1.2826,0.04220645993317308,-1.725635992615424
92 | 2.4943,0.2197813470895128,-0.7729318836756727
93 | 10.1882,0.8824411815005742,1.1872788772739642
94 | 4.6731,0.7292903963749944,0.6106682834354926
95 | 4.375,0.6637191500593902,0.42263484449515154
96 | 2.8173,0.281391098688454,-0.578713956288219
97 | 2.0903,0.16057506301658947,-0.9920971718209968
98 | 2.725,0.2565846054611911,-0.6539109002088199
99 | 2.8547,0.29144270049451715,-0.5491749392049887
100 | 2.25,0.18397913125036633,-0.9003044332369591
101 | 1.9444,0.13919338765461042,-1.0839504359991745
102 | 1.7167,0.10582390526994544,-1.2490472005138797
103 | 1.9342,0.13769857553197723,-1.0907176288996137
104 | 4.9524,0.7545234663213701,0.6887937530976251
105 | 3.65,0.5042453037701816,0.010641599310222728
106 | 3.0856,0.35349924747366146,-0.37589024233948426
107 | 3.2396,0.394888196086863,-0.26660099147199706
108 | 2.9324,0.3123253063857234,-0.48926992156445664
109 | 3.495,0.46352934852719846,-0.09154607537767347
110 | 1.9818,0.14467436543759887,-1.0595514059995472
111 | 4.6964,0.7344155558488406,0.6262226873574688
112 | 3.925,0.5647353833971228,0.16298628506764115
113 | 3.625,0.4984680713824984,-0.003839985024398658
114 | 2.9688,0.3221081487852074,-0.4618117868349911
115 | 4.0417,0.5904051735515374,0.22858735593517016
116 | 9.7956,0.8728494295277418,1.1399643807904891
117 | 3.8732,0.5533412520346663,0.13410759250347515
118 | 2.6998,0.24989741485432906,-0.6748126069650576
119 | 2.006,0.1482208804736502,-1.0440943039822579
120 | 4.25,0.6362236593198715,0.34838284885436693
121 | 3.1839999999999997,0.3799451730810577,-0.3056247863400478
122 | 5.9658,0.7792822066404437,0.7697712668826051
123 | 2.628,0.23937510991265604,-0.7083141049607123
124 | 2.5057,0.2214520194618676,-0.7672985355850374
125 | 5.155,0.7594732598763774,0.7046091860898294
126 | 4.6,0.7132110333905237,0.5627899373325242
127 | 4.6681,0.7281905767454135,0.6073497232884044
128 | 5.5942,0.7702035132295815,0.7395172425174374
129 | 5.1104,0.7583836212161931,0.7011125840899696
130 | 3.0759,0.35089228122984295,-0.38291260834816176
131 | 3.5757,0.4852182326381423,-0.037060878177008386
132 | 3.6845,0.5118340592142888,0.02966793907608783
133 | 6.4667,0.7915198749114363,0.8117061751243242
134 | 5.273,0.7623561603674476,0.7139021637992619
135 | 3.0635,0.3475596645882605,-0.3919172875113563
136 | 11.2866,0.9092765874276221,1.3363135019854007
137 | 4.0444,0.5909990761515111,0.23011572277987793
138 | 5.2541,0.7618944076616745,0.7124095804237914
139 | 5.5791,0.7698345996921648,0.7383022482134728
140 | 4.5375,0.6994632880207644,0.5228574972480653
141 | 9.8144,0.8733087390975055,1.1421720249180776
142 | 6.7257,0.7978475971757347,0.8339577268169219
143 | 4.1442,0.6129514759579427,0.28701994562086086
144 | 4.0313,0.5881175487220095,0.22270526607462063
145 | 2.2791,0.18824374230611404,-0.88438672867467
146 | 4.1679,0.6181646210021556,0.3006639542199363
147 | 3.2852,0.40714362502687595,-0.23489883984002982
148 | 3.2768,0.4048860460116104,-0.24072005796234255
149 | 5.021,0.7561994576238059,0.6941293646505097
150 | 4.875,0.7526324790501087,0.682797132481279
151 | 4.419,0.6733975627997006,0.44931439580522814
152 | 3.3272,0.41843152010320356,-0.20590766210173994
153 | 4.2386,0.6337160705644274,0.341711709817628
154 | 1.245,0.03669617210856439,-1.7903831065617093
155 | 5.152,0.7593999657960959,0.704373718687922
156 | 4.8125,0.7511055190442452,0.6779727645490474
157 | 2.1638,0.1713465033120347,-0.9488576690059357
158 | 7.1621,0.8085094427206763,0.872416661000832
159 | 1.5372,0.0795181429157629,-1.408320169672634
160 | 10.0481,0.8790183479514304,1.1700935990858978
161 | 3.3869,0.43447645667598356,-0.16498865287495923
162 | 5.4591,0.7669028364809068,0.7286850708477646
163 | 4.4318,0.6762131010514274,0.4571353017049396
164 | 6.5044,0.7924409371869732,0.8149199625504349
165 | 4.2865,0.644252342615811,0.3698485817763834
166 | 3.0461,0.3428832509137819,-0.40460687823111086
167 | 11.3283,0.9102953751435343,1.3425761736583905
168 | 2.7026,0.25056439475381626,-0.6727147380915197
169 | 3.016,0.33479359277574705,-0.4267146406076349
170 | 3.0943,0.35583745431090086,-0.3696075717076287
171 | 3.225,0.39096430875080623,-0.2768065942137824
172 | 6.187,0.7846864234931958,0.7881189136255171
173 | 3.8158,0.5407153226870792,0.10223599868489697
174 | 3.0147,0.33444420554719406,-0.42767409705501475
175 | 15.0,0.9999999000000003,5.19933758270342
176 | 3.1364,0.36715222532788644,-0.33940526790689773
177 | 2.9,0.30361750161255635,-0.5140242974144523
178 | 5.5941,0.7702010700935721,0.739509192607556
179 | 3.4028,0.4387497312405934,-0.15413985695864643
180 | 6.0062,0.7802692335882339,0.7731028191405914
181 | 8.3792,0.8382448510908602,0.9872699911993058
182 | 3.8036,0.5380317627909023,0.09547634997560216
183 | 2.0926,0.1609121284952224,-0.9907160390730209
184 | 6.7703,0.798937235835919,0.837831173103705
185 | 4.2569,0.6377414104086929,0.3524281695696263
186 | 4.744,0.7448858387224493,0.6584822133838069
187 | 9.7037,0.87060418753512,1.1292518063856445
188 | 5.1292,0.7588429307859569,0.7025854406375655
189 | 2.3148,0.19347558473533027,-0.8651596484776254
190 | 3.3021,0.41168565899806486,-0.22321096549396113
191 | 1.95,0.1400140688199777,-1.0802561341135346
192 | 3.025,0.33721242743496016,-0.42008295383958816
193 | 2.6523,0.2429362799695175,-0.6968885244090585
194 | 1.2188,0.032856556656310446,-1.8403755990446928
195 | 5.827999999999999,0.7759155652195158,0.7584713308686281
196 | 3.1587,0.3731455600946033,-0.3235336598161438
197 | 2.45,0.21328917287062546,-0.7950604390156254
198 | 2.3851,0.20377806436485135,-0.8282019693528496
199 | 2.1221,0.16523535963421065,-0.9731661745087966
200 | 3.5313,0.47328531498602444,-0.06701390942662634
201 | 3.4821,0.4600623521823264,-0.10027663775339715
202 | 7.8252,0.8247098775988859,0.9334643965884332
203 | 5.1878,0.7602746084874545,0.7071861842977014
204 | 3.7459,0.5253398442655404,0.06356034060479637
205 | 6.0097,0.7803547433485621,0.773391847490294
206 | 2.3194,0.19414971569259623,-0.862705456837268
207 | 4.2061,0.6265672429721525,0.32277517627250846
208 | 2.267,0.18647048478808834,-0.8909780060851004
209 | 2.2109,0.1782490181136057,-0.9220585498888423
210 | 2.7589,0.265695549344227,-0.6258838950234518
211 | 2.6553,0.2433759305938214,-0.6954842771160817
212 | 6.3325,0.7882411863868461,0.8003334507140982
213 | 5.7233,0.7733576018176932,0.7499500376839813
214 | 4.337,0.6553605208745764,0.3998337668630456
215 | 3.9667,0.5739078791078263,0.18633222024962756
216 | 5.8623,0.7767535608707337,0.7612749184789059
217 | 1.6806,0.1005334427574887,-1.2785178769957448
218 | 3.5851,0.48774457105998714,-0.03072463804138443
219 | 2.9716,0.32286067512362926,-0.45971423541894674
220 | 3.9,0.5592362852492191,0.14903320783722215
221 | 2.7431,0.26144915072027514,-0.6388841474573944
222 | 3.3621,0.4278112233928187,-0.18194938611936773
223 | 1.9464,0.13948648807081301,-1.0826293461973695
224 | 7.3518,0.8131440717304732,0.8895420092252918
225 | 4.775,0.750189343040727,0.6750857069958222
226 | 3.5968,0.4908890561169641,-0.022839735128387998
227 | 6.221,0.7855170897363856,0.7909625929859541
228 | 10.0968,0.8802081551879993,1.1760279942069634
229 | 1.9483,0.1397649334662055,-1.0813760586453744
230 | 2.0469,0.15421478398499322,-1.0185228067876306
231 | 3.725,0.5207425982138929,0.05201743189676979
232 | 3.675,0.5097444019180853,0.02442802263182327
233 | 1.8529,0.1257840436133419,-1.1465489456932518
234 | 1.7159,0.10570666510346441,-1.2496885811666654
235 | 1.7386,0.10903335482736382,-1.2316851754273808
236 | 3.6687,0.5083586291848137,0.020953509422343072
237 | 3.4671,0.45603096108363783,-0.11043812057602183
238 | 4.8233,0.7513693777332584,0.6788052852389578
239 | 4.3036,0.6480137257489771,0.3799634475571365
240 | 1.6488,0.0958731461398675,-1.3054305179573955
241 | 2.9453,0.3157923027305955,-0.47949769802476955
242 | 5.0096,0.7559209401187363,0.6932413225605799
243 | 3.175,0.3775263384218447,-0.31198400815546146
244 | 4.2031,0.625907351194404,0.32103311272888746
245 | 3.1667,0.3752956353472371,-0.3178598218476214
246 | 5.7204,0.7732867508734211,0.7497147894780406
247 | 3.375,0.43127821973769076,-0.17312084617136517
248 | 6.5483,0.7935134738950917,0.818672917584347
249 | 4.2206,0.6297567198979367,0.33120908249828585
250 | 2.6631,0.2445190222170115,-0.6918396326662699
251 | 3.5363,0.4746291120189207,-0.06363831319524592
252 |
--------------------------------------------------------------------------------
/secrets.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "metadata": {},
5 | "cell_type": "markdown",
6 | "source": "Currently, secrets are only persisted for the lifetime of the engine process and they are not part of the dprep file. If you start a new session (and hence a new engine process), load a package, and try to run a dataflow within that package, you will need to call `use_secrets` to register the required secrets to use during execution; otherwise the execution will fail because the required secrets are not available.\n\nIn this notebook, we will:\n1. Load a previously saved package\n2. Call `get_missing_secrets` to determine the missing secrets\n3. Call `use_secrets` and pass in the missing secrets to register them with the engine for this session\n4. Call `head` to see a preview of the data"
7 | },
8 | {
9 | "metadata": {},
10 | "cell_type": "markdown",
11 | "source": "# Providing Secrets\nCopyright (c) Microsoft Corporation. All rights reserved.
\nLicensed under the MIT License."
12 | },
13 | {
14 | "metadata": {
15 | "trusted": true
16 | },
17 | "cell_type": "code",
18 | "source": "!pip install azureml",
19 | "execution_count": 1,
20 | "outputs": [
21 | {
22 | "output_type": "stream",
23 | "text": "Requirement already satisfied: azureml in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (0.2.7)\nRequirement already satisfied: python-dateutil in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml) (2.7.3)\nRequirement already satisfied: pandas in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml) (0.22.0)\nRequirement already satisfied: requests in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml) (2.19.1)\nRequirement already satisfied: six>=1.5 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from python-dateutil->azureml) (1.11.0)\nRequirement already satisfied: pytz>=2011k in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from pandas->azureml) (2018.5)\nRequirement already satisfied: numpy>=1.9.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from pandas->azureml) (1.14.5)\nRequirement already satisfied: urllib3<1.24,>=1.21.1 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (1.23)\nRequirement already satisfied: certifi>=2017.4.17 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (2018.8.24)\nRequirement already satisfied: idna<2.8,>=2.5 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (2.7)\nRequirement already satisfied: chardet<3.1.0,>=3.0.2 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (3.0.4)\n",
24 | "name": "stdout"
25 | }
26 | ]
27 | },
28 | {
29 | "metadata": {
30 | "trusted": true
31 | },
32 | "cell_type": "code",
33 | "source": "import azureml.dataprep as dprep\nimport os",
34 | "execution_count": 2,
35 | "outputs": []
36 | },
37 | {
38 | "metadata": {},
39 | "cell_type": "markdown",
40 | "source": "Let's load the previously saved package."
41 | },
42 | {
43 | "metadata": {
44 | "trusted": true
45 | },
46 | "cell_type": "code",
47 | "source": "package = dprep.Package.open(file_path='./data/secrets.dprep')\ndataflow = package.dataflows[0]",
48 | "execution_count": 3,
49 | "outputs": []
50 | },
51 | {
52 | "metadata": {},
53 | "cell_type": "markdown",
54 | "source": "Let's call `get_missing_secrets` to see which required secrets are missing from the engine."
55 | },
56 | {
57 | "metadata": {
58 | "trusted": true
59 | },
60 | "cell_type": "code",
61 | "source": "dataflow.get_missing_secrets()",
62 | "execution_count": 4,
63 | "outputs": [
64 | {
65 | "output_type": "execute_result",
66 | "execution_count": 4,
67 | "data": {
68 | "text/plain": "['https://dpreptestfiles.blob.core.windows.net/testfiles/read_csv_duplicate_headers.csv']"
69 | },
70 | "metadata": {}
71 | }
72 | ]
73 | },
74 | {
75 | "metadata": {},
76 | "cell_type": "markdown",
77 | "source": "Let's now read the secret from an environment variable, put it in our secrets dictionary, and call `use_secrets` with that dictionary. This will register the secrets with the engine so you don't need to provide them again in this session.\n\n_Note: It is a bad practice to have secrets in files that will be checked into source control._"
78 | },
79 | {
80 | "metadata": {
81 | "trusted": true
82 | },
83 | "cell_type": "code",
84 | "source": "sas = os.environ['SCENARIOS_SECRETS']\nsecrets = {\n 'https://dpreptestfiles.blob.core.windows.net/testfiles/read_csv_duplicate_headers.csv': sas\n}\ndataflow.use_secrets(secrets=secrets)",
85 | "execution_count": 5,
86 | "outputs": [
87 | {
88 | "output_type": "error",
89 | "ename": "KeyError",
90 | "evalue": "'SCENARIOS_SECRETS'",
91 | "traceback": [
92 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
93 | "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
94 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0msas\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0menviron\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'SCENARIOS_SECRETS'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m secrets = {\n\u001b[1;32m 3\u001b[0m \u001b[0;34m'https://dpreptestfiles.blob.core.windows.net/testfiles/read_csv_duplicate_headers.csv'\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0msas\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m }\n\u001b[1;32m 5\u001b[0m \u001b[0mdataflow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0muse_secrets\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msecrets\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0msecrets\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
95 | "\u001b[0;32m~/anaconda3_501/lib/python3.6/os.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 667\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mKeyError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 668\u001b[0m \u001b[0;31m# raise KeyError with the original key value\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 669\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mKeyError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 670\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdecodevalue\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 671\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
96 | "\u001b[0;31mKeyError\u001b[0m: 'SCENARIOS_SECRETS'"
97 | ]
98 | }
99 | ]
100 | },
101 | {
102 | "metadata": {},
103 | "cell_type": "markdown",
104 | "source": "We can now call `head` without passing in `secrets` and the engine will happily execute and show us a preview of the data."
105 | },
106 | {
107 | "metadata": {
108 | "trusted": true
109 | },
110 | "cell_type": "code",
111 | "source": "dataflow.head(5)",
112 | "execution_count": 6,
113 | "outputs": [
114 | {
115 | "output_type": "error",
116 | "ename": "MissingSecretsError",
117 | "evalue": "Required secrets are missing. Please call use_secrets to register the missing secrets.\nMissing secrets:\nhttps://dpreptestfiles.blob.core.windows.net/testfiles/read_csv_duplicate_headers.csv",
118 | "traceback": [
119 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
120 | "\u001b[0;31mMissingSecretsError\u001b[0m Traceback (most recent call last)",
121 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdataflow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhead\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m5\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
122 | "\u001b[0;32m~/anaconda3_501/lib/python3.6/site-packages/azureml/dataprep/api/dataflow.py\u001b[0m in \u001b[0;36mhead\u001b[0;34m(self, count)\u001b[0m\n\u001b[1;32m 94\u001b[0m \u001b[0;34m:\u001b[0m\u001b[0;32mreturn\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mA\u001b[0m \u001b[0mPandas\u001b[0m \u001b[0mDataframe\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 95\u001b[0m \"\"\"\n\u001b[0;32m---> 96\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtake\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcount\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_pandas_dataframe\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mextended_types\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 97\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 98\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mrun_local\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
123 | "\u001b[0;32m~/anaconda3_501/lib/python3.6/site-packages/azureml/dataprep/api/dataflow.py\u001b[0m in \u001b[0;36mto_pandas_dataframe\u001b[0;34m(self, extended_types)\u001b[0m\n\u001b[1;32m 145\u001b[0m })\n\u001b[1;32m 146\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 147\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_raise_if_missing_secrets\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 148\u001b[0m self._engine_api.execute_anonymous_blocks(\n\u001b[1;32m 149\u001b[0m ExecuteAnonymousBlocksMessageArguments(blocks=steps_to_block_datas(dataflow_to_execute._steps),\n",
124 | "\u001b[0;32m~/anaconda3_501/lib/python3.6/site-packages/azureml/dataprep/api/dataflow.py\u001b[0m in \u001b[0;36m_raise_if_missing_secrets\u001b[0;34m(self, secrets)\u001b[0m\n\u001b[1;32m 1054\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmissing_secret_ids\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmissing_secret_ids\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1055\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1056\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mMissingSecretsError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmissing_secrets\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1057\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1058\u001b[0m \u001b[0;31m# Steps are immutable so we don't need to create a full deepcopy of them when cloning Dataflows.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
125 | "\u001b[0;31mMissingSecretsError\u001b[0m: Required secrets are missing. Please call use_secrets to register the missing secrets.\nMissing secrets:\nhttps://dpreptestfiles.blob.core.windows.net/testfiles/read_csv_duplicate_headers.csv"
126 | ]
127 | }
128 | ]
129 | },
130 | {
131 | "metadata": {
132 | "trusted": true
133 | },
134 | "cell_type": "code",
135 | "source": "",
136 | "execution_count": null,
137 | "outputs": []
138 | }
139 | ],
140 | "metadata": {
141 | "execute_as_test": false,
142 | "kernelspec": {
143 | "name": "python36",
144 | "display_name": "Python 3.6",
145 | "language": "python"
146 | },
147 | "language_info": {
148 | "mimetype": "text/x-python",
149 | "nbconvert_exporter": "python",
150 | "name": "python",
151 | "pygments_lexer": "ipython3",
152 | "version": "3.6.6",
153 | "file_extension": ".py",
154 | "codemirror_mode": {
155 | "version": 3,
156 | "name": "ipython"
157 | }
158 | }
159 | },
160 | "nbformat": 4,
161 | "nbformat_minor": 2
162 | }
--------------------------------------------------------------------------------
/external-references.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "metadata": {},
5 | "cell_type": "markdown",
6 | "source": "# External References\nCopyright (c) Microsoft Corporation. All rights reserved.<br>\nLicensed under the MIT License."
7 | },
8 | {
9 | "metadata": {},
10 | "cell_type": "markdown",
11 | "source": "In addition to opening existing Dataflows in code and modifying them, it is also possible to create and persist Dataflows that reference another Dataflow that has been persisted to a DataPrep package. In this case, executing this Dataflow will load the referenced DataPrep package dynamically, execute the referenced Dataflow, and then execute the steps in the referencing Dataflow."
12 | },
13 | {
14 | "metadata": {},
15 | "cell_type": "markdown",
16 | "source": "To demonstrate, we will create a Dataflow that loads and transforms some data. After that, we will persist this Dataflow to a DataPrep package."
17 | },
18 | {
19 | "metadata": {
20 | "trusted": true
21 | },
22 | "cell_type": "code",
23 | "source": "import azureml.dataprep as dprep\nimport tempfile\nimport os\n\ndf = dprep.smart_read_file('./data/fixed_width_file.txt')\ndf = df.drop_errors(['Column7', 'Column8', 'Column9'], dprep.ColumnRelationship.ANY)\ndf = df.set_name('FWF')\npkg = dprep.Package(df)\npkg_path = os.path.join(tempfile.gettempdir(), 'package.dprep')\npkg = pkg.save(pkg_path)",
24 | "execution_count": 1,
25 | "outputs": [
26 | {
27 | "output_type": "stream",
28 | "text": "/home/nbuser/anaconda3_501/lib/python3.6/site-packages/ipykernel/__main__.py:5: DeprecationWarning: Function smart_read_file is deprecated. Use auto_read_file instead.\n",
29 | "name": "stderr"
30 | }
31 | ]
32 | },
33 | {
34 | "metadata": {},
35 | "cell_type": "markdown",
36 | "source": "Now that we have a package file, we can create a new Dataflow that references it."
37 | },
38 | {
39 | "metadata": {
40 | "trusted": true
41 | },
42 | "cell_type": "code",
43 | "source": "new_df = dprep.Dataflow.reference(dprep.ExternalReference(pkg_path, 'FWF'))\nnew_df.head(10)",
44 | "execution_count": 2,
45 | "outputs": [
46 | {
47 | "output_type": "execute_result",
48 | "execution_count": 2,
49 | "data": {
50 | "text/html": "\n\n
\n \n \n | \n Column1 | \n Column2 | \n Column3 | \n Column4 | \n Column5 | \n Column6 | \n Column7 | \n Column8 | \n Column9 | \n
\n \n \n \n | 0 | \n 10010.0 | \n 99999.0 | \n JAN MAYEN | \n azureml.dataprep.native.DataPrepError(\"'Micros... | \n JN | \n ENJA | \n 70933.0 | \n -8667.0 | \n 90.0 | \n
\n \n | 1 | \n 10014.0 | \n 99999.0 | \n SOERSTOKKEN | \n azureml.dataprep.native.DataPrepError(\"'Micros... | \n NO | \n ENSO | \n 59783.0 | \n 5350.0 | \n 500.0 | \n
\n \n | 2 | \n 10015.0 | \n 99999.0 | \n BRINGELAND | \n azureml.dataprep.native.DataPrepError(\"'Micros... | \n NO | \n ENBL | \n 61383.0 | \n 5867.0 | \n 3270.0 | \n
\n \n | 3 | \n 10016.0 | \n 99999.0 | \n RORVIK/RYUM | \n azureml.dataprep.native.DataPrepError(\"'Micros... | \n NO | \n | \n 64850.0 | \n 11233.0 | \n 140.0 | \n
\n \n | 4 | \n 10017.0 | \n 99999.0 | \n FRIGG | \n azureml.dataprep.native.DataPrepError(\"'Micros... | \n NO | \n ENFR | \n 59933.0 | \n 2417.0 | \n 480.0 | \n
\n \n | 5 | \n 10020.0 | \n 99999.0 | \n VERLEGENHUKEN | \n azureml.dataprep.native.DataPrepError(\"'Micros... | \n SV | \n | \n 80050.0 | \n 16250.0 | \n 80.0 | \n
\n \n | 6 | \n 10030.0 | \n 99999.0 | \n HORNSUND | \n azureml.dataprep.native.DataPrepError(\"'Micros... | \n SV | \n | \n 77000.0 | \n 15500.0 | \n 120.0 | \n
\n \n | 7 | \n 10040.0 | \n 99999.0 | \n NY-ALESUND II | \n azureml.dataprep.native.DataPrepError(\"'Micros... | \n SV | \n ENAS | \n 78917.0 | \n 11933.0 | \n 80.0 | \n
\n \n | 8 | \n 10050.0 | \n 99999.0 | \n ISFJORD RADIO | \n azureml.dataprep.native.DataPrepError(\"'Micros... | \n NO | \n ENIS | \n 78067.0 | \n 13633.0 | \n 50.0 | \n
\n \n | 9 | \n 10060.0 | \n 99999.0 | \n EDGEOYA | \n azureml.dataprep.native.DataPrepError(\"'Micros... | \n NO | \n | \n 78250.0 | \n 22783.0 | \n 140.0 | \n
\n \n
\n
",
51 | "text/plain": " Column1 Column2 Column3 \\\n0 10010.0 99999.0 JAN MAYEN \n1 10014.0 99999.0 SOERSTOKKEN \n2 10015.0 99999.0 BRINGELAND \n3 10016.0 99999.0 RORVIK/RYUM \n4 10017.0 99999.0 FRIGG \n5 10020.0 99999.0 VERLEGENHUKEN \n6 10030.0 99999.0 HORNSUND \n7 10040.0 99999.0 NY-ALESUND II \n8 10050.0 99999.0 ISFJORD RADIO \n9 10060.0 99999.0 EDGEOYA \n\n Column4 Column5 Column6 Column7 \\\n0 azureml.dataprep.native.DataPrepError(\"'Micros... JN ENJA 70933.0 \n1 azureml.dataprep.native.DataPrepError(\"'Micros... NO ENSO 59783.0 \n2 azureml.dataprep.native.DataPrepError(\"'Micros... NO ENBL 61383.0 \n3 azureml.dataprep.native.DataPrepError(\"'Micros... NO 64850.0 \n4 azureml.dataprep.native.DataPrepError(\"'Micros... NO ENFR 59933.0 \n5 azureml.dataprep.native.DataPrepError(\"'Micros... SV 80050.0 \n6 azureml.dataprep.native.DataPrepError(\"'Micros... SV 77000.0 \n7 azureml.dataprep.native.DataPrepError(\"'Micros... SV ENAS 78917.0 \n8 azureml.dataprep.native.DataPrepError(\"'Micros... NO ENIS 78067.0 \n9 azureml.dataprep.native.DataPrepError(\"'Micros... NO 78250.0 \n\n Column8 Column9 \n0 -8667.0 90.0 \n1 5350.0 500.0 \n2 5867.0 3270.0 \n3 11233.0 140.0 \n4 2417.0 480.0 \n5 16250.0 80.0 \n6 15500.0 120.0 \n7 11933.0 80.0 \n8 13633.0 50.0 \n9 22783.0 140.0 "
52 | },
53 | "metadata": {}
54 | }
55 | ]
56 | },
57 | {
58 | "metadata": {},
59 | "cell_type": "markdown",
60 | "source": "When executed, the new Dataflow returns the same results as the one we saved in our package. Since this reference is resolved on execution, updating the package file results in the changes being visible when re-executing the referencing Dataflow."
61 | },
62 | {
63 | "metadata": {
64 | "trusted": true
65 | },
66 | "cell_type": "code",
67 | "source": "df = df.take(5)\npkg = dprep.Package(df)\npkg.save(pkg_path)\n\nnew_df.head(10)",
68 | "execution_count": 3,
69 | "outputs": [
70 | {
71 | "output_type": "execute_result",
72 | "execution_count": 3,
73 | "data": {
74 | "text/html": "\n\n
\n \n \n | \n Column1 | \n Column2 | \n Column3 | \n Column4 | \n Column5 | \n Column6 | \n Column7 | \n Column8 | \n Column9 | \n
\n \n \n \n | 0 | \n 10010.0 | \n 99999.0 | \n JAN MAYEN | \n azureml.dataprep.native.DataPrepError(\"'Micros... | \n JN | \n ENJA | \n 70933.0 | \n -8667.0 | \n 90.0 | \n
\n \n | 1 | \n 10014.0 | \n 99999.0 | \n SOERSTOKKEN | \n azureml.dataprep.native.DataPrepError(\"'Micros... | \n NO | \n ENSO | \n 59783.0 | \n 5350.0 | \n 500.0 | \n
\n \n | 2 | \n 10015.0 | \n 99999.0 | \n BRINGELAND | \n azureml.dataprep.native.DataPrepError(\"'Micros... | \n NO | \n ENBL | \n 61383.0 | \n 5867.0 | \n 3270.0 | \n
\n \n | 3 | \n 10016.0 | \n 99999.0 | \n RORVIK/RYUM | \n azureml.dataprep.native.DataPrepError(\"'Micros... | \n NO | \n | \n 64850.0 | \n 11233.0 | \n 140.0 | \n
\n \n | 4 | \n 10017.0 | \n 99999.0 | \n FRIGG | \n azureml.dataprep.native.DataPrepError(\"'Micros... | \n NO | \n ENFR | \n 59933.0 | \n 2417.0 | \n 480.0 | \n
\n \n
\n
",
75 | "text/plain": " Column1 Column2 Column3 \\\n0 10010.0 99999.0 JAN MAYEN \n1 10014.0 99999.0 SOERSTOKKEN \n2 10015.0 99999.0 BRINGELAND \n3 10016.0 99999.0 RORVIK/RYUM \n4 10017.0 99999.0 FRIGG \n\n Column4 Column5 Column6 Column7 \\\n0 azureml.dataprep.native.DataPrepError(\"'Micros... JN ENJA 70933.0 \n1 azureml.dataprep.native.DataPrepError(\"'Micros... NO ENSO 59783.0 \n2 azureml.dataprep.native.DataPrepError(\"'Micros... NO ENBL 61383.0 \n3 azureml.dataprep.native.DataPrepError(\"'Micros... NO 64850.0 \n4 azureml.dataprep.native.DataPrepError(\"'Micros... NO ENFR 59933.0 \n\n Column8 Column9 \n0 -8667.0 90.0 \n1 5350.0 500.0 \n2 5867.0 3270.0 \n3 11233.0 140.0 \n4 2417.0 480.0 "
76 | },
77 | "metadata": {}
78 | }
79 | ]
80 | },
81 | {
82 | "metadata": {},
83 | "cell_type": "markdown",
84 | "source": "As we can see, even though we did not modify `new_df`, it now returns only 5 records, as the package was updated with the Dataflow that resulted from calling `df.take(5)`."
85 | },
86 | {
87 | "metadata": {
88 | "trusted": true
89 | },
90 | "cell_type": "code",
91 | "source": "",
92 | "execution_count": null,
93 | "outputs": []
94 | },
95 | {
96 | "metadata": {
97 | "trusted": true
98 | },
99 | "cell_type": "code",
100 | "source": "",
101 | "execution_count": null,
102 | "outputs": []
103 | }
104 | ],
105 | "metadata": {
106 | "kernelspec": {
107 | "name": "python36",
108 | "display_name": "Python 3.6",
109 | "language": "python"
110 | },
111 | "language_info": {
112 | "mimetype": "text/x-python",
113 | "nbconvert_exporter": "python",
114 | "name": "python",
115 | "pygments_lexer": "ipython3",
116 | "version": "3.6.6",
117 | "file_extension": ".py",
118 | "codemirror_mode": {
119 | "version": 3,
120 | "name": "ipython"
121 | }
122 | }
123 | },
124 | "nbformat": 4,
125 | "nbformat_minor": 2
126 | }
--------------------------------------------------------------------------------
/impute-missing-values.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "metadata": {},
5 | "cell_type": "markdown",
6 | "source": "# Impute missing values\nCopyright (c) Microsoft Corporation. All rights reserved.<br>\nLicensed under the MIT License."
7 | },
8 | {
9 | "metadata": {},
10 | "cell_type": "markdown",
11 | "source": "DataPrep has the ability to impute missing values in specified columns. In this case, we will attempt to impute the missing _Latitude_ and _Longitude_ values in the input data."
12 | },
13 | {
14 | "metadata": {
15 | "trusted": true
16 | },
17 | "cell_type": "code",
18 | "source": "!pip install azureml",
19 | "execution_count": 2,
20 | "outputs": [
21 | {
22 | "output_type": "stream",
23 | "text": "Collecting azureml\n Downloading https://files.pythonhosted.org/packages/ab/e8/76cd2cb6784b9039affd2c659eed1b3f46baf2e6b87a10b072a20b5b0113/azureml-0.2.7-py2.py3-none-any.whl\nRequirement already satisfied: pandas in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml) (0.22.0)\nRequirement already satisfied: requests in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml) (2.20.1)\nRequirement already satisfied: python-dateutil in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml) (2.7.5)\nRequirement already satisfied: pytz>=2011k in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from pandas->azureml) (2018.7)\nRequirement already satisfied: numpy>=1.9.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from pandas->azureml) (1.14.6)\nRequirement already satisfied: urllib3<1.25,>=1.21.1 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (1.23)\nRequirement already satisfied: chardet<3.1.0,>=3.0.2 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (3.0.4)\nRequirement already satisfied: idna<2.8,>=2.5 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (2.7)\nRequirement already satisfied: certifi>=2017.4.17 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (2018.10.15)\nRequirement already satisfied: six>=1.5 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from python-dateutil->azureml) (1.11.0)\nInstalling collected packages: azureml\nSuccessfully installed azureml-0.2.7\n",
24 | "name": "stdout"
25 | }
26 | ]
27 | },
28 | {
29 | "metadata": {
30 | "trusted": true
31 | },
32 | "cell_type": "code",
33 | "source": "import azureml.dataprep as dprep",
34 | "execution_count": 3,
35 | "outputs": []
36 | },
37 | {
38 | "metadata": {
39 | "trusted": true
40 | },
41 | "cell_type": "code",
42 | "source": "# loading input data\ndf = dprep.read_csv(r'data\\crime0-10.csv')\ndf = df.keep_columns(['ID', 'Arrest', 'Latitude', 'Longitude'])\ndf = df.to_number(['Latitude', 'Longitude'])\ndf.head(10)",
43 | "execution_count": 4,
44 | "outputs": [
45 | {
46 | "output_type": "execute_result",
47 | "execution_count": 4,
48 | "data": {
49 | "text/html": "\n\n
\n \n \n | \n ID | \n Arrest | \n Latitude | \n Longitude | \n
\n \n \n \n | 0 | \n 10140490 | \n false | \n 41.973309 | \n -87.800175 | \n
\n \n | 1 | \n 10139776 | \n false | \n 42.008124 | \n -87.659550 | \n
\n \n | 2 | \n 10140270 | \n false | \n NaN | \n NaN | \n
\n \n | 3 | \n 10139885 | \n false | \n 41.902152 | \n -87.754883 | \n
\n \n | 4 | \n 10140379 | \n false | \n 41.885610 | \n -87.657009 | \n
\n \n | 5 | \n 10140868 | \n false | \n 41.679311 | \n -87.644545 | \n
\n \n | 6 | \n 10139762 | \n false | \n 41.825501 | \n -87.690578 | \n
\n \n | 7 | \n 10139722 | \n true | \n 41.857828 | \n -87.715029 | \n
\n \n | 8 | \n 10139774 | \n false | \n 41.970100 | \n -87.669324 | \n
\n \n | 9 | \n 10139697 | \n false | \n 41.787580 | \n -87.685233 | \n
\n \n
\n
",
50 | "text/plain": " ID Arrest Latitude Longitude\n0 10140490 false 41.973309 -87.800175\n1 10139776 false 42.008124 -87.659550\n2 10140270 false NaN NaN\n3 10139885 false 41.902152 -87.754883\n4 10140379 false 41.885610 -87.657009\n5 10140868 false 41.679311 -87.644545\n6 10139762 false 41.825501 -87.690578\n7 10139722 true 41.857828 -87.715029\n8 10139774 false 41.970100 -87.669324\n9 10139697 false 41.787580 -87.685233"
51 | },
52 | "metadata": {}
53 | }
54 | ]
55 | },
56 | {
57 | "metadata": {},
58 | "cell_type": "markdown",
59 | "source": "The third record in the input data is missing its _Latitude_ and _Longitude_ values. To impute those missing values, we can use `ImputeMissingValuesBuilder` to learn a fixed program which imputes the columns with either a calculated `MIN`, `MAX` or `MEAN` value or a `CUSTOM` value. When `group_by_columns` is specified, missing values will be imputed by group with `MIN`, `MAX` and `MEAN` calculated per group."
60 | },
61 | {
62 | "metadata": {},
63 | "cell_type": "markdown",
64 | "source": "First, let us quickly check the `MEAN` value of the _Latitude_ column."
65 | },
66 | {
67 | "metadata": {
68 | "trusted": true
69 | },
70 | "cell_type": "code",
71 | "source": "df_mean = df.summarize(group_by_columns=['Arrest'],\n summary_columns=[dprep.SummaryColumnsValue(column_id='Latitude',\n summary_column_name='Latitude_MEAN',\n summary_function=dprep.SummaryFunction.MEAN)])\ndf_mean = df_mean.filter(dprep.col('Arrest') == 'false')\ndf_mean.head(1)",
72 | "execution_count": 5,
73 | "outputs": [
74 | {
75 | "output_type": "execute_result",
76 | "execution_count": 5,
77 | "data": {
78 | "text/html": "\n\n
\n \n \n | \n Arrest | \n Latitude_MEAN | \n
\n \n \n \n | 0 | \n false | \n 41.878961 | \n
\n \n
\n
",
79 | "text/plain": " Arrest Latitude_MEAN\n0 false 41.878961"
80 | },
81 | "metadata": {}
82 | }
83 | ]
84 | },
85 | {
86 | "metadata": {},
87 | "cell_type": "markdown",
88 | "source": "The `MEAN` value of _Latitude_ looks good, so we will impute _Latitude_ with it. As for _Longitude_, we will impute it using `42` based on external knowledge."
89 | },
90 | {
91 | "metadata": {
92 | "trusted": true
93 | },
94 | "cell_type": "code",
95 | "source": "# impute with MEAN\nimpute_mean = dprep.ImputeColumnArguments(column_id='Latitude',\n impute_function=dprep.ReplaceValueFunction.MEAN)\n# impute with custom value 42\nimpute_custom = dprep.ImputeColumnArguments(column_id='Longitude',\n custom_impute_value=42)\n# get instance of ImputeMissingValuesBuilder\nimpute_builder = df.builders.impute_missing_values(impute_columns=[impute_mean, impute_custom],\n group_by_columns=['Arrest'])\n# call learn() to learn a fixed program to impute missing values\nimpute_builder.learn()\n# call to_dataflow() to get a dataflow with impute step added\ndf_imputed = impute_builder.to_dataflow()",
96 | "execution_count": 6,
97 | "outputs": []
98 | },
99 | {
100 | "metadata": {
101 | "trusted": true
102 | },
103 | "cell_type": "code",
104 | "source": "# check impute result\ndf_imputed.head(10)",
105 | "execution_count": 7,
106 | "outputs": [
107 | {
108 | "output_type": "execute_result",
109 | "execution_count": 7,
110 | "data": {
111 | "text/html": "\n\n
\n \n \n | \n ID | \n Arrest | \n Latitude | \n Longitude | \n
\n \n \n \n | 0 | \n 10140490 | \n false | \n 41.973309 | \n -87.800175 | \n
\n \n | 1 | \n 10139776 | \n false | \n 42.008124 | \n -87.659550 | \n
\n \n | 2 | \n 10140270 | \n false | \n 41.878961 | \n 42.000000 | \n
\n \n | 3 | \n 10139885 | \n false | \n 41.902152 | \n -87.754883 | \n
\n \n | 4 | \n 10140379 | \n false | \n 41.885610 | \n -87.657009 | \n
\n \n | 5 | \n 10140868 | \n false | \n 41.679311 | \n -87.644545 | \n
\n \n | 6 | \n 10139762 | \n false | \n 41.825501 | \n -87.690578 | \n
\n \n | 7 | \n 10139722 | \n true | \n 41.857828 | \n -87.715029 | \n
\n \n | 8 | \n 10139774 | \n false | \n 41.970100 | \n -87.669324 | \n
\n \n | 9 | \n 10139697 | \n false | \n 41.787580 | \n -87.685233 | \n
\n \n
\n
",
112 | "text/plain": " ID Arrest Latitude Longitude\n0 10140490 false 41.973309 -87.800175\n1 10139776 false 42.008124 -87.659550\n2 10140270 false 41.878961 42.000000\n3 10139885 false 41.902152 -87.754883\n4 10140379 false 41.885610 -87.657009\n5 10140868 false 41.679311 -87.644545\n6 10139762 false 41.825501 -87.690578\n7 10139722 true 41.857828 -87.715029\n8 10139774 false 41.970100 -87.669324\n9 10139697 false 41.787580 -87.685233"
113 | },
114 | "metadata": {}
115 | }
116 | ]
117 | },
118 | {
119 | "metadata": {},
120 | "cell_type": "markdown",
121 | "source": "As the result above shows, the missing _Latitude_ has been imputed with the `MEAN` value of the `Arrest=='false'` group, and the missing _Longitude_ has been imputed with `42`."
122 | },
123 | {
124 | "metadata": {
125 | "trusted": true
126 | },
127 | "cell_type": "code",
128 | "source": "",
129 | "execution_count": null,
130 | "outputs": []
131 | }
132 | ],
133 | "metadata": {
134 | "kernelspec": {
135 | "name": "python36",
136 | "display_name": "Python 3.6",
137 | "language": "python"
138 | },
139 | "language_info": {
140 | "mimetype": "text/x-python",
141 | "nbconvert_exporter": "python",
142 | "name": "python",
143 | "pygments_lexer": "ipython3",
144 | "version": "3.6.6",
145 | "file_extension": ".py",
146 | "codemirror_mode": {
147 | "version": 3,
148 | "name": "ipython"
149 | }
150 | }
151 | },
152 | "nbformat": 4,
153 | "nbformat_minor": 2
154 | }
--------------------------------------------------------------------------------
/0. Import librairie.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "metadata": {
5 | "trusted": true
6 | },
7 | "cell_type": "code",
8 | "source": "!pip install azureml-sdk",
9 | "execution_count": 1,
10 | "outputs": [
11 | {
12 | "output_type": "stream",
13 | "text": "Requirement already satisfied: azureml-sdk in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (1.0.2)\nRequirement already satisfied: azureml-pipeline==1.0.2.* in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-sdk) (1.0.2)\nRequirement already satisfied: azureml-train==1.0.2.* in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-sdk) (1.0.2)\nRequirement already satisfied: azureml-core==1.0.2.* in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-sdk) (1.0.2)\nRequirement already satisfied: azureml-pipeline-core==1.0.2.* in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-pipeline==1.0.2.*->azureml-sdk) (1.0.2)\nRequirement already satisfied: azureml-pipeline-steps==1.0.2.* in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-pipeline==1.0.2.*->azureml-sdk) (1.0.2)\nRequirement already satisfied: azureml-train-core==1.0.2.* in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-train==1.0.2.*->azureml-sdk) (1.0.2)\nRequirement already satisfied: pathspec in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (0.5.9)\nRequirement already satisfied: azure-storage-blob>=1.1.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (1.4.0)\nRequirement already satisfied: SecretStorage<3.0.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (2.3.1)\nRequirement already satisfied: azure-cli-profile>=2.0.26 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (2.1.2)\nRequirement already satisfied: azure-common>=1.1.12 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (1.1.16)\nRequirement already satisfied: msrestazure>=0.4.33 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from 
azureml-core==1.0.2.*->azureml-sdk) (0.5.1)\nRequirement already satisfied: docker in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (3.6.0)\nRequirement already satisfied: azure-mgmt-storage>=1.5.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (3.1.0)\nRequirement already satisfied: backports.tempfile in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (1.0)\nRequirement already satisfied: azure-graphrbac>=0.40.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (0.53.0)\nRequirement already satisfied: azure-mgmt-keyvault>=0.40.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (1.1.0)\nRequirement already satisfied: azure-cli-core>=2.0.38 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (2.0.52)\nRequirement already satisfied: azure-mgmt-containerregistry>=2.0.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (2.4.0)\nRequirement already satisfied: requests>=2.19.1 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (2.20.1)\nRequirement already satisfied: azure-mgmt-resource>=1.2.1 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (2.0.0)\nRequirement already satisfied: azure-mgmt-authorization>=0.40.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (0.51.1)\nRequirement already satisfied: pytz in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (2018.7)\nRequirement already satisfied: azure-storage-nspkg>=3.0.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) 
(3.1.0)\nRequirement already satisfied: ruamel.yaml<=0.15.51,>=0.15.35 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (0.15.51)\nRequirement already satisfied: ndg-httpsclient in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (0.5.1)\nRequirement already satisfied: azure-storage-common>=1.1.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (1.4.0)\nRequirement already satisfied: contextlib2 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (0.5.5)\nRequirement already satisfied: cryptography!=1.9,!=2.0.*,!=2.1.*,!=2.2.* in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (2.3.1)\nRequirement already satisfied: python-dateutil>=2.7.3 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (2.7.5)\nRequirement already satisfied: msrest>=0.5.1 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (0.6.2)\nRequirement already satisfied: PyJWT in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (1.7.1)\nRequirement already satisfied: urllib3<1.24,>=1.23 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (1.23)\nRequirement already satisfied: six>=1.11.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (1.11.0)\nRequirement already satisfied: jsonpickle in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (1.0)\nRequirement already satisfied: certifi in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-pipeline-steps==1.0.2.*->azureml-pipeline==1.0.2.*->azureml-sdk) (2018.10.15)\nRequirement already satisfied: 
azureml-telemetry==1.0.2.* in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-train-core==1.0.2.*->azureml-train==1.0.2.*->azureml-sdk) (1.0.2)\nRequirement already satisfied: azureml-train-restclients-hyperdrive==1.0.2.* in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-train-core==1.0.2.*->azureml-train==1.0.2.*->azureml-sdk) (1.0.2)\nRequirement already satisfied: azure-cli-command-modules-nspkg>=2.0.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azure-cli-profile>=2.0.26->azureml-core==1.0.2.*->azureml-sdk) (2.0.2)\nRequirement already satisfied: adal<2.0.0,>=0.6.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from msrestazure>=0.4.33->azureml-core==1.0.2.*->azureml-sdk) (1.2.0)\nRequirement already satisfied: docker-pycreds>=0.3.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from docker->azureml-core==1.0.2.*->azureml-sdk) (0.4.0)\nRequirement already satisfied: websocket-client>=0.32.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from docker->azureml-core==1.0.2.*->azureml-sdk) (0.54.0)\nRequirement already satisfied: backports.weakref in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from backports.tempfile->azureml-core==1.0.2.*->azureml-sdk) (1.0rc1)\nRequirement already satisfied: azure-mgmt-nspkg>=2.0.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azure-mgmt-keyvault>=0.40.0->azureml-core==1.0.2.*->azureml-sdk) (3.0.2)\nRequirement already satisfied: azure-cli-telemetry in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azure-cli-core>=2.0.38->azureml-core==1.0.2.*->azureml-sdk) (1.0.0)\nRequirement already satisfied: pyopenssl>=17.1.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azure-cli-core>=2.0.38->azureml-core==1.0.2.*->azureml-sdk) (18.0.0)\nRequirement already satisfied: pyyaml~=3.13 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from 
azure-cli-core>=2.0.38->azureml-core==1.0.2.*->azureml-sdk) (3.13)\nRequirement already satisfied: pip in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azure-cli-core>=2.0.38->azureml-core==1.0.2.*->azureml-sdk) (18.1)\nRequirement already satisfied: paramiko>=2.0.8 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azure-cli-core>=2.0.38->azureml-core==1.0.2.*->azureml-sdk) (2.4.2)\nRequirement already satisfied: jmespath in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azure-cli-core>=2.0.38->azureml-core==1.0.2.*->azureml-sdk) (0.9.3)\nRequirement already satisfied: argcomplete>=1.8.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azure-cli-core>=2.0.38->azureml-core==1.0.2.*->azureml-sdk) (1.9.4)\nRequirement already satisfied: pygments in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azure-cli-core>=2.0.38->azureml-core==1.0.2.*->azureml-sdk) (2.2.0)\nRequirement already satisfied: knack==0.5.1 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azure-cli-core>=2.0.38->azureml-core==1.0.2.*->azureml-sdk) (0.5.1)\nRequirement already satisfied: tabulate<=0.8.2,>=0.7.7 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azure-cli-core>=2.0.38->azureml-core==1.0.2.*->azureml-sdk) (0.8.2)\n",
14 | "name": "stdout"
15 | },
16 | {
17 | "output_type": "stream",
18 | "text": "Requirement already satisfied: antlr4-python3-runtime; python_version >= \"3.0\" in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azure-cli-core>=2.0.38->azureml-core==1.0.2.*->azureml-sdk) (4.7.1)\nRequirement already satisfied: colorama>=0.3.9 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azure-cli-core>=2.0.38->azureml-core==1.0.2.*->azureml-sdk) (0.3.9)\nRequirement already satisfied: azure-cli-nspkg>=2.0.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azure-cli-core>=2.0.38->azureml-core==1.0.2.*->azureml-sdk) (3.0.3)\nRequirement already satisfied: humanfriendly>=4.7 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azure-cli-core>=2.0.38->azureml-core==1.0.2.*->azureml-sdk) (4.17)\nRequirement already satisfied: wheel==0.30.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azure-cli-core>=2.0.38->azureml-core==1.0.2.*->azureml-sdk) (0.30.0)\nRequirement already satisfied: idna<2.8,>=2.5 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests>=2.19.1->azureml-core==1.0.2.*->azureml-sdk) (2.7)\nRequirement already satisfied: chardet<3.1.0,>=3.0.2 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests>=2.19.1->azureml-core==1.0.2.*->azureml-sdk) (3.0.4)\nRequirement already satisfied: azure-nspkg>=2.0.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azure-storage-nspkg>=3.0.0->azureml-core==1.0.2.*->azureml-sdk) (3.0.2)\nRequirement already satisfied: pyasn1>=0.1.1 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from ndg-httpsclient->azureml-core==1.0.2.*->azureml-sdk) (0.4.4)\nRequirement already satisfied: asn1crypto>=0.21.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from cryptography!=1.9,!=2.0.*,!=2.1.*,!=2.2.*->azureml-core==1.0.2.*->azureml-sdk) (0.24.0)\nRequirement already satisfied: cffi!=1.11.3,>=1.7 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from 
cryptography!=1.9,!=2.0.*,!=2.1.*,!=2.2.*->azureml-core==1.0.2.*->azureml-sdk) (1.11.5)\nRequirement already satisfied: requests-oauthlib>=0.5.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from msrest>=0.5.1->azureml-core==1.0.2.*->azureml-sdk) (1.0.0)\nRequirement already satisfied: isodate>=0.6.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from msrest>=0.5.1->azureml-core==1.0.2.*->azureml-sdk) (0.6.0)\nRequirement already satisfied: applicationinsights in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-telemetry==1.0.2.*->azureml-train-core==1.0.2.*->azureml-train==1.0.2.*->azureml-sdk) (0.11.7)\nRequirement already satisfied: portalocker==1.2.1 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azure-cli-telemetry->azure-cli-core>=2.0.38->azureml-core==1.0.2.*->azureml-sdk) (1.2.1)\nRequirement already satisfied: pynacl>=1.0.1 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from paramiko>=2.0.8->azure-cli-core>=2.0.38->azureml-core==1.0.2.*->azureml-sdk) (1.3.0)\nRequirement already satisfied: bcrypt>=3.1.3 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from paramiko>=2.0.8->azure-cli-core>=2.0.38->azureml-core==1.0.2.*->azureml-sdk) (3.1.4)\nRequirement already satisfied: pycparser in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from cffi!=1.11.3,>=1.7->cryptography!=1.9,!=2.0.*,!=2.1.*,!=2.2.*->azureml-core==1.0.2.*->azureml-sdk) (2.19)\nRequirement already satisfied: oauthlib>=0.6.2 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests-oauthlib>=0.5.0->msrest>=0.5.1->azureml-core==1.0.2.*->azureml-sdk) (2.1.0)\n",
19 | "name": "stdout"
20 | }
21 | ]
22 | },
23 | {
24 | "metadata": {
25 | "trusted": true
26 | },
27 | "cell_type": "code",
28 | "source": "!pip install --upgrade azureml-dataprep",
29 | "execution_count": 2,
30 | "outputs": [
31 | {
32 | "output_type": "stream",
33 | "text": "Requirement already up-to-date: azureml-dataprep in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (1.0.4)\nRequirement already satisfied, skipping upgrade: pandas>=0.19.2 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-dataprep) (0.22.0)\nRequirement already satisfied, skipping upgrade: numpy>=1.11.3 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-dataprep) (1.14.6)\nRequirement already satisfied, skipping upgrade: dotnetcore2==2.1.7 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-dataprep) (2.1.7)\nRequirement already satisfied, skipping upgrade: azureml-dataprep-native<12.0.0,>=11.2.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-dataprep) (11.2.0)\nRequirement already satisfied, skipping upgrade: python-dateutil>=2 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from pandas>=0.19.2->azureml-dataprep) (2.7.5)\nRequirement already satisfied, skipping upgrade: pytz>=2011k in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from pandas>=0.19.2->azureml-dataprep) (2018.7)\nRequirement already satisfied, skipping upgrade: distro>=1.2.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from dotnetcore2==2.1.7->azureml-dataprep) (1.3.0)\nRequirement already satisfied, skipping upgrade: six>=1.5 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from python-dateutil>=2->pandas>=0.19.2->azureml-dataprep) (1.11.0)\n",
34 | "name": "stdout"
35 | }
36 | ]
37 | },
38 | {
39 | "metadata": {
40 | "trusted": true
41 | },
42 | "cell_type": "code",
43 | "source": "import sys",
44 | "execution_count": 3,
45 | "outputs": []
46 | },
47 | {
48 | "metadata": {
49 | "trusted": true
50 | },
51 | "cell_type": "code",
52 | "source": "sys.version",
53 | "execution_count": 4,
54 | "outputs": [
55 | {
56 | "output_type": "execute_result",
57 | "execution_count": 4,
58 | "data": {
59 | "text/plain": "'3.6.6 |Anaconda, Inc.| (default, Jun 28 2018, 17:14:51) \\n[GCC 7.2.0]'"
60 | },
61 | "metadata": {}
62 | }
63 | ]
64 | },
65 | {
66 | "metadata": {
67 | "trusted": true
68 | },
69 | "cell_type": "code",
70 | "source": "import azureml.core\nprint(\"Version Azure ML service :\", azureml.core.VERSION)",
71 | "execution_count": 7,
72 | "outputs": [
73 | {
74 | "output_type": "stream",
75 | "text": "Version Azure ML service : 1.0.2\n",
76 | "name": "stdout"
77 | }
78 | ]
79 | },
80 | {
81 | "metadata": {
82 | "trusted": true
83 | },
84 | "cell_type": "code",
85 | "source": "",
86 | "execution_count": null,
87 | "outputs": []
88 | }
89 | ],
90 | "metadata": {
91 | "kernelspec": {
92 | "name": "python36",
93 | "display_name": "Python 3.6",
94 | "language": "python"
95 | },
96 | "language_info": {
97 | "mimetype": "text/x-python",
98 | "nbconvert_exporter": "python",
99 | "name": "python",
100 | "pygments_lexer": "ipython3",
101 | "version": "3.6.6",
102 | "file_extension": ".py",
103 | "codemirror_mode": {
104 | "version": 3,
105 | "name": "ipython"
106 | }
107 | }
108 | },
109 | "nbformat": 4,
110 | "nbformat_minor": 2
111 | }
--------------------------------------------------------------------------------
/smart-read-file-separators.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "metadata": {},
5 | "cell_type": "markdown",
6 | "source": "# Smart Read File\nCopyright (c) Microsoft Corporation. All rights reserved.
\nLicensed under the MIT License."
7 | },
8 | {
9 | "metadata": {
10 | "trusted": true
11 | },
12 | "cell_type": "code",
13 | "source": "!pip install azureml",
14 | "execution_count": 1,
15 | "outputs": [
16 | {
17 | "output_type": "stream",
18 | "text": "Requirement already satisfied: azureml in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (0.2.7)\nRequirement already satisfied: python-dateutil in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml) (2.7.3)\nRequirement already satisfied: requests in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml) (2.19.1)\nRequirement already satisfied: pandas in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml) (0.22.0)\nRequirement already satisfied: six>=1.5 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from python-dateutil->azureml) (1.11.0)\nRequirement already satisfied: urllib3<1.24,>=1.21.1 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (1.23)\nRequirement already satisfied: idna<2.8,>=2.5 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (2.7)\nRequirement already satisfied: certifi>=2017.4.17 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (2018.8.24)\nRequirement already satisfied: chardet<3.1.0,>=3.0.2 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (3.0.4)\nRequirement already satisfied: pytz>=2011k in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from pandas->azureml) (2018.5)\nRequirement already satisfied: numpy>=1.9.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from pandas->azureml) (1.14.5)\n",
19 | "name": "stdout"
20 | }
21 | ]
22 | },
23 | {
24 | "metadata": {
25 | "trusted": true
26 | },
27 | "cell_type": "code",
28 | "source": "import azureml.dataprep as dprep",
29 | "execution_count": 2,
30 | "outputs": []
31 | },
32 | {
33 | "metadata": {},
34 | "cell_type": "markdown",
35 | "source": "DataPrep has the ability to load different kinds of text files. The `smart_read_file` entry point can take any text based file (including excel, json and parquet) and auto-detect how to parse the file. It will also attempt to auto-detect the types of each column and apply type transformations to the columns it detects.\n\nThe result will be a Dataflow object that has all the steps added that are required to read the given file(s) and convert their columns to the predicted types. No parameters are required beyond the file path or `FileDataSource` object."
36 | },
37 | {
38 | "metadata": {
39 | "trusted": true
40 | },
41 | "cell_type": "code",
42 | "source": "smart_dataflow = dprep.smart_read_file('./data/multiple_separators.csv')\nsmart_dataflow.head(10)",
43 | "execution_count": 3,
44 | "outputs": [
45 | {
46 | "output_type": "execute_result",
47 | "execution_count": 3,
48 | "data": {
49 | "text/html": "\n\n
\n \n \n | \n ID | \n CaseNumber | \n Column3 | \n Completed | \n Column5 | \n
\n \n \n \n | 0 | \n 10140490.0 | \n HY329907 | \n | \n Y | \n | \n
\n \n | 1 | \n 10139776.0 | \n HY329265 | \n | \n Y | \n | \n
\n \n | 2 | \n 10140270.0 | \n HY329253 | \n | \n N | \n | \n
\n \n | 3 | \n 10139885.0 | \n HY329308 | \n | \n Y | \n | \n
\n \n | 4 | \n 10140379.0 | \n HY329556 | \n | \n N | \n | \n
\n \n | 5 | \n 10140868.0 | \n HY330421 | \n | \n N | \n | \n
\n \n | 6 | \n 10139762.0 | \n HY329232 | \n | \n N | \n | \n
\n \n | 7 | \n 10139722.0 | \n HY329228 | \n | \n Y | \n | \n
\n \n | 8 | \n 10139774.0 | \n HY329209 | \n | \n N | \n | \n
\n \n | 9 | \n 10139697.0 | \n HY329177 | \n | \n N | \n | \n
\n \n
\n
",
50 | "text/plain": " ID CaseNumber Column3 Completed Column5\n0 10140490.0 HY329907 Y \n1 10139776.0 HY329265 Y \n2 10140270.0 HY329253 N \n3 10139885.0 HY329308 Y \n4 10140379.0 HY329556 N \n5 10140868.0 HY330421 N \n6 10139762.0 HY329232 N \n7 10139722.0 HY329228 Y \n8 10139774.0 HY329209 N \n9 10139697.0 HY329177 N "
51 | },
52 | "metadata": {}
53 | }
54 | ]
55 | },
56 | {
57 | "metadata": {},
58 | "cell_type": "markdown",
59 | "source": "Looking at the data, we can see that there are two empty columns either side of the 'Completed' column.\nIf we compare the dataframe to a few rows from the original file:\n```\nID |CaseNumber| |Completed|\n10140490 |HY329907| |Y|\n10139776 |HY329265| |Y|\n```\nWe can see that the `|`'s have disappeared in the dataframe. This is because `|` is a very common separator character in csv files, so `smart_read_file` guessed it was the column separator. For this data we actually want the `|`'s to remain and instead use space as the column separator.\n\nTo achieve this we can use `detect_file_format` which will take a file path or datasource object and give back a `FileFormatBuilder` which has learnt some information about the supplied data.\nThis is what `smart_read_file` is using behind the scenes to 'learn' the contents of the given file and determine how to parse it. With the `FileFormatBuilder` we can take advantage of the intelligent learning aspect of `smart_read_file` but have the chance to modify some of the learnt information."
60 | },
61 | {
62 | "metadata": {
63 | "trusted": true
64 | },
65 | "cell_type": "code",
66 | "source": "ffb = dprep.detect_file_format('./data/multiple_separators.csv')\nffb_2 = dprep.detect_file_format('./data/excel.xlsx')\nffb_3 = dprep.detect_file_format('./data/fixed_width_file.txt')\nffb_4 = dprep.detect_file_format('./data/json.json')\n\nprint(ffb.file_format)\nprint(ffb_2.file_format)\nprint(ffb_3.file_format)\nprint(type(ffb_4.file_format))",
67 | "execution_count": 4,
68 | "outputs": [
69 | {
70 | "output_type": "stream",
71 | "text": "ParseDelimitedProperties\n separator: '|'\n headers_mode: PromoteHeadersMode.CONSTANTGROUPED\n encoding: FileEncoding.UTF8\n quoting: False\n skip_rows: 0\n skip_mode: SkipMode.NONE\n comment: None\n\nReadExcelProperties\n sheet_name: None\n use_headers: False\n skip_rows: 0\n\nParseFixedWidthProperties\n offsets: '[7, 13, 43, 46, 52, 58, 65, 73]'\n headers_mode: PromoteHeadersMode.NONE\n encoding: FileEncoding.UTF8\n skip_rows: 0\n skip_mode: SkipMode.NONE\n\n\n",
72 | "name": "stdout"
73 | }
74 | ]
75 | },
76 | {
77 | "metadata": {},
78 | "cell_type": "markdown",
79 | "source": "After calling `detect_file_format` we get a `FileFormatBuilder` that has had `learn` called on it. This means the `file_format` attribute will be populated with a `Properties` object, which contains all the information that was learnt about the file. As we can see above, different file types have corresponding file_formats detected. \nContinuing with our delimited example, we can change any of these values and then call `ffb.to_dataflow()` to create a `Dataflow` that has the steps required to parse the datasource."
80 | },
81 | {
82 | "metadata": {
83 | "scrolled": true,
84 | "trusted": true
85 | },
86 | "cell_type": "code",
87 | "source": "ffb.file_format.separator = ' '\ndataflow = ffb.to_dataflow()\ndf = dataflow.to_pandas_dataframe()\ndf",
88 | "execution_count": 5,
89 | "outputs": [
90 | {
91 | "output_type": "execute_result",
92 | "execution_count": 5,
93 | "data": {
94 | "text/html": "\n\n
\n \n \n | \n ID | \n |CaseNumber| | \n |Completed| | \n
\n \n \n \n | 0 | \n 10140490 | \n |HY329907| | \n |Y| | \n
\n \n | 1 | \n 10139776 | \n |HY329265| | \n |Y| | \n
\n \n | 2 | \n 10140270 | \n |HY329253| | \n |N| | \n
\n \n | 3 | \n 10139885 | \n |HY329308| | \n |Y| | \n
\n \n | 4 | \n 10140379 | \n |HY329556| | \n |N| | \n
\n \n | 5 | \n 10140868 | \n |HY330421| | \n |N| | \n
\n \n | 6 | \n 10139762 | \n |HY329232| | \n |N| | \n
\n \n | 7 | \n 10139722 | \n |HY329228| | \n |Y| | \n
\n \n | 8 | \n 10139774 | \n |HY329209| | \n |N| | \n
\n \n | 9 | \n 10139697 | \n |HY329177| | \n |N| | \n
\n \n
\n
",
95 | "text/plain": " ID |CaseNumber| |Completed|\n0 10140490 |HY329907| |Y|\n1 10139776 |HY329265| |Y|\n2 10140270 |HY329253| |N|\n3 10139885 |HY329308| |Y|\n4 10140379 |HY329556| |N|\n5 10140868 |HY330421| |N|\n6 10139762 |HY329232| |N|\n7 10139722 |HY329228| |Y|\n8 10139774 |HY329209| |N|\n9 10139697 |HY329177| |N|"
96 | },
97 | "metadata": {}
98 | }
99 | ]
100 | },
101 | {
102 | "metadata": {},
103 | "cell_type": "markdown",
104 | "source": "The result is our desired dataframe with `|`'s included.\n\nIf we refer back to the original data output by `smart_read_file` the 'ID' column was also detected as numeric and converted to a number data type, instead of remaining a string like in the data above.\nWe can perform type inference on our new dataflow using the `dataflow.builders` property. This property exposes different builders that can `learn` from a dataflow and `apply` the learning to produce a new dataflow, very similar to the pattern we used above for the `FileFormatBuilder`."
105 | },
106 | {
107 | "metadata": {
108 | "trusted": true
109 | },
110 | "cell_type": "code",
111 | "source": "ctb = dataflow.builders.set_column_types()\nctb.learn()\nctb.inference_info",
112 | "execution_count": 6,
113 | "outputs": [
114 | {
115 | "output_type": "execute_result",
116 | "execution_count": 6,
117 | "data": {
118 | "text/plain": "{'|CaseNumber|': [FieldType.STRING],\n '|Completed|': [FieldType.STRING],\n 'ID': [FieldType.DECIMAL]}"
119 | },
120 | "metadata": {}
121 | }
122 | ]
123 | },
124 | {
125 | "metadata": {},
126 | "cell_type": "markdown",
127 | "source": "After learning, `ctb.inference_info` has been populated with information about the inferred types for each column. It is possible for there to be multiple candidate types per column; in this example there is only one type for each column.\n\nThe candidates look correct, we only want to convert `ID` to be a number column (also known as `DECIMAL`), so applying this `ColumnTypesBuilder` should result in a Dataflow with our columns converted to their respective types."
128 | },
129 | {
130 | "metadata": {
131 | "trusted": true
132 | },
133 | "cell_type": "code",
134 | "source": "converted_dataflow = ctb.to_dataflow()\nconverted_df = converted_dataflow.to_pandas_dataframe()\nconverted_df",
135 | "execution_count": 7,
136 | "outputs": [
137 | {
138 | "output_type": "execute_result",
139 | "execution_count": 7,
140 | "data": {
141 | "text/html": "\n\n
\n \n \n | \n ID | \n |CaseNumber| | \n |Completed| | \n
\n \n \n \n | 0 | \n 10140490.0 | \n |HY329907| | \n |Y| | \n
\n \n | 1 | \n 10139776.0 | \n |HY329265| | \n |Y| | \n
\n \n | 2 | \n 10140270.0 | \n |HY329253| | \n |N| | \n
\n \n | 3 | \n 10139885.0 | \n |HY329308| | \n |Y| | \n
\n \n | 4 | \n 10140379.0 | \n |HY329556| | \n |N| | \n
\n \n | 5 | \n 10140868.0 | \n |HY330421| | \n |N| | \n
\n \n | 6 | \n 10139762.0 | \n |HY329232| | \n |N| | \n
\n \n | 7 | \n 10139722.0 | \n |HY329228| | \n |Y| | \n
\n \n | 8 | \n 10139774.0 | \n |HY329209| | \n |N| | \n
\n \n | 9 | \n 10139697.0 | \n |HY329177| | \n |N| | \n
\n \n
\n
",
142 | "text/plain": " ID |CaseNumber| |Completed|\n0 10140490.0 |HY329907| |Y|\n1 10139776.0 |HY329265| |Y|\n2 10140270.0 |HY329253| |N|\n3 10139885.0 |HY329308| |Y|\n4 10140379.0 |HY329556| |N|\n5 10140868.0 |HY330421| |N|\n6 10139762.0 |HY329232| |N|\n7 10139722.0 |HY329228| |Y|\n8 10139774.0 |HY329209| |N|\n9 10139697.0 |HY329177| |N|"
143 | },
144 | "metadata": {}
145 | }
146 | ]
147 | },
148 | {
149 | "metadata": {
150 | "trusted": true
151 | },
152 | "cell_type": "code",
153 | "source": "",
154 | "execution_count": null,
155 | "outputs": []
156 | }
157 | ],
158 | "metadata": {
159 | "kernelspec": {
160 | "name": "python36",
161 | "display_name": "Python 3.6",
162 | "language": "python"
163 | },
164 | "language_info": {
165 | "mimetype": "text/x-python",
166 | "nbconvert_exporter": "python",
167 | "name": "python",
168 | "pygments_lexer": "ipython3",
169 | "version": "3.6.6",
170 | "file_extension": ".py",
171 | "codemirror_mode": {
172 | "version": 3,
173 | "name": "ipython"
174 | }
175 | }
176 | },
177 | "nbformat": 4,
178 | "nbformat_minor": 2
179 | }
--------------------------------------------------------------------------------
/caching.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "metadata": {},
5 | "cell_type": "markdown",
6 | "source": "# Caching\nCopyright (c) Microsoft Corporation. All rights reserved.
\nLicensed under the MIT License."
7 | },
8 | {
9 | "metadata": {},
10 | "cell_type": "markdown",
11 | "source": "A Dataflow can be cached into a file on disk during a local run by calling `df_cached = df.cache(directory_path)` on the Dataflow object `df`. Doing so, we will run all steps in `df` and save the cached data to the specified `directory_path`. The returned Dataflow `df_cached` has a Caching Step added at the end. Any run on Dataflow `df_cached` will reuse the cached data. And steps in `df_cached` before Caching Step will not be run again.\n\nCaching avoids running transforms multiple times, which can make local runs more efficient. Here are common places to use Caching:\n- after reading data from remote\n- after expensive transforms, such as Sort\n- after transforms that change the shape of data, such as Sampling, Filter and Summarize\n\nCaching Step will be ignored during scale-out run invoked by `to_spark_dataframe()`."
12 | },
13 | {
14 | "metadata": {
15 | "trusted": true
16 | },
17 | "cell_type": "code",
18 | "source": "# read data and apply transforms\nimport azureml.dataprep as dprep\ndf = dprep.read_csv(path='https://dpreptestfiles.blob.core.windows.net/testfiles/read_csv_duplicate_headers.csv')\ndf = df.take_sample(probability=0.2, seed=7)\ndf = df.skip(1)\ndf = df.sort_asc(columns='schnam10')\ndf = df.keep_columns(['stnam', 'fipst', 'leaid', 'leanm10', 'ncessch', 'schnam10'])\ndf.head(5)",
19 | "execution_count": 6,
20 | "outputs": [
21 | {
22 | "output_type": "execute_result",
23 | "execution_count": 6,
24 | "data": {
25 | "text/html": "\n\n
\n \n \n | \n stnam | \n fipst | \n leaid | \n leanm10 | \n ncessch | \n schnam10 | \n
\n \n \n \n | 0 | \n ALABAMA | \n 1 | \n 102100 | \n Limestone County | \n 10210000797 | \n Ardmore High Sch | \n
\n \n | 1 | \n ALABAMA | \n 1 | \n 101920 | \n Jefferson County | \n 10192000691 | \n Brighton Middle Sch | \n
\n \n | 2 | \n ALABAMA | \n 1 | \n 101920 | \n Jefferson County | \n 10192000720 | \n Bryan Elem Sch | \n
\n \n | 3 | \n ALABAMA | \n 1 | \n 102010 | \n Lauderdale County | \n 10201000766 | \n Cloverdale Jr High Sch | \n
\n \n | 4 | \n ALABAMA | \n 1 | \n 101920 | \n Jefferson County | \n 10192000701 | \n Erwin Elem Sch | \n
\n \n
\n
",
26 | "text/plain": " stnam fipst leaid leanm10 ncessch \\\n0 ALABAMA 1 102100 Limestone County 10210000797 \n1 ALABAMA 1 101920 Jefferson County 10192000691 \n2 ALABAMA 1 101920 Jefferson County 10192000720 \n3 ALABAMA 1 102010 Lauderdale County 10201000766 \n4 ALABAMA 1 101920 Jefferson County 10192000701 \n\n schnam10 \n0 Ardmore High Sch \n1 Brighton Middle Sch \n2 Bryan Elem Sch \n3 Cloverdale Jr High Sch \n4 Erwin Elem Sch "
27 | },
28 | "metadata": {}
29 | }
30 | ]
31 | },
32 | {
33 | "metadata": {
34 | "trusted": true
35 | },
36 | "cell_type": "code",
37 | "source": "# choose a directory to store cache data\nimport os\nfrom pathlib import Path\ncache_dir = str(Path(os.getcwd(), 'dataflow-cache'))\ncache_dir",
38 | "execution_count": 7,
39 | "outputs": [
40 | {
41 | "output_type": "execute_result",
42 | "execution_count": 7,
43 | "data": {
44 | "text/plain": "'/home/nbuser/library/dataflow-cache'"
45 | },
46 | "metadata": {}
47 | }
48 | ]
49 | },
50 | {
51 | "metadata": {
52 | "trusted": true
53 | },
54 | "cell_type": "code",
55 | "source": "# choose a directory to store cache data\ncache_dir = str('dataflow-cache')\ncache_dir",
56 | "execution_count": 8,
57 | "outputs": [
58 | {
59 | "output_type": "execute_result",
60 | "execution_count": 8,
61 | "data": {
62 | "text/plain": "'dataflow-cache'"
63 | },
64 | "metadata": {}
65 | }
66 | ]
67 | },
68 | {
69 | "metadata": {
70 | "trusted": true
71 | },
72 | "cell_type": "code",
73 | "source": "# cache the dataflow\ndf_cached = df.cache(directory_path=cache_dir)",
74 | "execution_count": 9,
75 | "outputs": [
76 | {
77 | "output_type": "error",
78 | "ename": "ExecutionError",
79 | "evalue": "Cannot write cache. Please check if the specified cache folder exists.",
80 | "traceback": [
81 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
82 | "\u001b[0;31mExecutionError\u001b[0m Traceback (most recent call last)",
83 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# cache the dataflow\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mdf_cached\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcache\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdirectory_path\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcache_dir\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
84 | "\u001b[0;32m~/anaconda3_501/lib/python3.6/site-packages/azureml/dataprep/api/dataflow.py\u001b[0m in \u001b[0;36mcache\u001b[0;34m(self, directory_path)\u001b[0m\n\u001b[1;32m 982\u001b[0m \u001b[0;34m'cachePath'\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mLocalDataSource\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdirectory_path\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0munderlying_value\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 983\u001b[0m })\n\u001b[0;32m--> 984\u001b[0;31m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhead\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 985\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 986\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
85 | "\u001b[0;32m~/anaconda3_501/lib/python3.6/site-packages/azureml/dataprep/api/dataflow.py\u001b[0m in \u001b[0;36mhead\u001b[0;34m(self, count)\u001b[0m\n\u001b[1;32m 337\u001b[0m \u001b[0;34m:\u001b[0m\u001b[0;32mreturn\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mA\u001b[0m \u001b[0mPandas\u001b[0m \u001b[0mDataframe\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 338\u001b[0m \"\"\"\n\u001b[0;32m--> 339\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtake\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcount\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_pandas_dataframe\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mextended_types\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 340\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 341\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mrun_local\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
86 | "\u001b[0;32m~/anaconda3_501/lib/python3.6/site-packages/azureml/dataprep/api/dataflow.py\u001b[0m in \u001b[0;36mto_pandas_dataframe\u001b[0;34m(self, extended_types, nulls_as_nan)\u001b[0m\n\u001b[1;32m 391\u001b[0m self._engine_api.execute_anonymous_blocks(\n\u001b[1;32m 392\u001b[0m ExecuteAnonymousBlocksMessageArguments(blocks=steps_to_block_datas(dataflow_to_execute._steps),\n\u001b[0;32m--> 393\u001b[0;31m project_context=self._parent_package_path))\n\u001b[0m\u001b[1;32m 394\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 395\u001b[0m \u001b[0mintermediate_files\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mp\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mp\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mintermediate_path\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mglob\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'part-*'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
87 | "\u001b[0;32m~/anaconda3_501/lib/python3.6/site-packages/azureml/dataprep/api/_aml_helper.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(op_code, message)\u001b[0m\n\u001b[1;32m 35\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mchanged\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 36\u001b[0m \u001b[0mengine_api_func\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate_environment_variable\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mchanged\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 37\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0msend_message_func\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mop_code\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmessage\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 38\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 39\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mwrapper\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
88 | "\u001b[0;32m~/anaconda3_501/lib/python3.6/site-packages/azureml/dataprep/api/engineapi/api.py\u001b[0m in \u001b[0;36mexecute_anonymous_blocks\u001b[0;34m(self, message_args)\u001b[0m\n\u001b[1;32m 54\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mupdate_aml_env_vars\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mget_engine_api\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 55\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mexecute_anonymous_blocks\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmessage_args\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mtypedefinitions\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mExecuteAnonymousBlocksMessageArguments\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 56\u001b[0;31m \u001b[0mresponse\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend_message\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Engine.ExecuteActivity'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmessage_args\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 57\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresponse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 58\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
89 | "\u001b[0;32m~/anaconda3_501/lib/python3.6/site-packages/azureml/dataprep/api/engineapi/engine.py\u001b[0m in \u001b[0;36msend_message\u001b[0;34m(self, op_code, message)\u001b[0m\n\u001b[1;32m 61\u001b[0m \u001b[0mresponse\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_read_response\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 62\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;34m'error'\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mresponse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 63\u001b[0;31m \u001b[0mraise_engine_error\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresponse\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'error'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 64\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mresponse\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'id'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mmessage_id\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 65\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresponse\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'result'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
90 | "\u001b[0;32m~/anaconda3_501/lib/python3.6/site-packages/azureml/dataprep/api/errorhandlers.py\u001b[0m in \u001b[0;36mraise_engine_error\u001b[0;34m(error_response)\u001b[0m\n\u001b[1;32m 67\u001b[0m \u001b[0merror_code\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0merror_response\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'errorCode'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 68\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;34m'ActivityExecutionFailed'\u001b[0m \u001b[0;32min\u001b[0m \u001b[0merror_code\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 69\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mExecutionError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0merror_response\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 70\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0;34m'UnableToPreviewDataSource'\u001b[0m \u001b[0;32min\u001b[0m \u001b[0merror_code\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 71\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mPreviewDataSourceError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0merror_response\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
91 | "\u001b[0;31mExecutionError\u001b[0m: Cannot write cache. Please check if the specified cache folder exists."
92 | ]
93 | }
94 | ]
95 | },
96 | {
97 | "metadata": {
98 | "trusted": true
99 | },
100 | "cell_type": "code",
101 | "source": "# check steps in df_cached\n[s.step_type for s in df_cached.get_steps()]",
102 | "execution_count": null,
103 | "outputs": []
104 | },
105 | {
106 | "metadata": {
107 | "trusted": false
108 | },
109 | "cell_type": "code",
110 | "source": "# check the stored cache data\nos.listdir(cache_dir)",
111 | "execution_count": 5,
112 | "outputs": [
113 | {
114 | "data": {
115 | "text/plain": "['7acc00d7-8e69-471d-b74d-085d0625cd9b.cacheIndex',\n '86e51582-fa4f-4b9e-8e45-439692d0da02']"
116 | },
117 | "execution_count": 5,
118 | "metadata": {},
119 | "output_type": "execute_result"
120 | }
121 | ]
122 | },
123 | {
124 | "metadata": {
125 | "trusted": false
126 | },
127 | "cell_type": "code",
128 | "source": "# run against df_cached will reuse the cache data and skip running all the previous steps again\ndf_cached.head(5)",
129 | "execution_count": 6,
130 | "outputs": [
131 | {
132 | "data": {
133 | "text/html": "\n
\n \n \n | \n stnam | \n fipst | \n leaid | \n leanm10 | \n ncessch | \n schnam10 | \n
\n \n \n \n | 0 | \n ALABAMA | \n 1 | \n 102100 | \n Limestone County | \n 10210000797 | \n Ardmore High Sch | \n
\n \n | 1 | \n ALABAMA | \n 1 | \n 101920 | \n Jefferson County | \n 10192000691 | \n Brighton Middle Sch | \n
\n \n | 2 | \n ALABAMA | \n 1 | \n 101920 | \n Jefferson County | \n 10192000720 | \n Bryan Elem Sch | \n
\n \n | 3 | \n ALABAMA | \n 1 | \n 102010 | \n Lauderdale County | \n 10201000766 | \n Cloverdale Jr High Sch | \n
\n \n | 4 | \n ALABAMA | \n 1 | \n 101920 | \n Jefferson County | \n 10192000701 | \n Erwin Elem Sch | \n
\n \n
\n
",
134 | "text/plain": " stnam fipst leaid leanm10 ncessch \\\n0 ALABAMA 1 102100 Limestone County 10210000797 \n1 ALABAMA 1 101920 Jefferson County 10192000691 \n2 ALABAMA 1 101920 Jefferson County 10192000720 \n3 ALABAMA 1 102010 Lauderdale County 10201000766 \n4 ALABAMA 1 101920 Jefferson County 10192000701 \n\n schnam10 \n0 Ardmore High Sch \n1 Brighton Middle Sch \n2 Bryan Elem Sch \n3 Cloverdale Jr High Sch \n4 Erwin Elem Sch "
135 | },
136 | "execution_count": 6,
137 | "metadata": {},
138 | "output_type": "execute_result"
139 | }
140 | ]
141 | },
142 | {
143 | "metadata": {
144 | "trusted": false
145 | },
146 | "cell_type": "code",
147 | "source": "df1 = df_cached.take(10)\ndf2 = df_cached.skip(10).take(10)\n\n# run against df1 and df2 will reuse the cache data as well\ndataframe1 = df1.to_pandas_dataframe()\ndataframe2 = df2.to_pandas_dataframe()",
148 | "execution_count": 7,
149 | "outputs": []
150 | },
151 | {
152 | "metadata": {
153 | "trusted": false
154 | },
155 | "cell_type": "code",
156 | "source": "# clean up cache data\nimport shutil\nshutil.rmtree(path=cache_dir)",
157 | "execution_count": 8,
158 | "outputs": []
159 | }
160 | ],
161 | "metadata": {
162 | "kernelspec": {
163 | "name": "python36",
164 | "display_name": "Python 3.6",
165 | "language": "python"
166 | },
167 | "language_info": {
168 | "mimetype": "text/x-python",
169 | "nbconvert_exporter": "python",
170 | "name": "python",
171 | "pygments_lexer": "ipython3",
172 | "version": "3.6.6",
173 | "file_extension": ".py",
174 | "codemirror_mode": {
175 | "version": 3,
176 | "name": "ipython"
177 | }
178 | }
179 | },
180 | "nbformat": 4,
181 | "nbformat_minor": 2
182 | }
--------------------------------------------------------------------------------
/join.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "metadata": {},
5 | "cell_type": "markdown",
6 | "source": "# Join\nCopyright (c) Microsoft Corporation. All rights reserved.
\nLicensed under the MIT License.
\n\nIn DataPrep you can easily join two dataflows."
7 | },
8 | {
9 | "metadata": {
10 | "trusted": true
11 | },
12 | "cell_type": "code",
13 | "source": "!pip install azureml",
14 | "execution_count": 1,
15 | "outputs": [
16 | {
17 | "output_type": "stream",
18 | "text": "Requirement already satisfied: azureml in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (0.2.7)\nRequirement already satisfied: pandas in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml) (0.22.0)\nRequirement already satisfied: python-dateutil in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml) (2.7.5)\nRequirement already satisfied: requests in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml) (2.20.1)\nRequirement already satisfied: pytz>=2011k in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from pandas->azureml) (2018.7)\nRequirement already satisfied: numpy>=1.9.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from pandas->azureml) (1.14.6)\nRequirement already satisfied: six>=1.5 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from python-dateutil->azureml) (1.11.0)\nRequirement already satisfied: urllib3<1.25,>=1.21.1 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (1.23)\nRequirement already satisfied: chardet<3.1.0,>=3.0.2 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (3.0.4)\nRequirement already satisfied: idna<2.8,>=2.5 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (2.7)\nRequirement already satisfied: certifi>=2017.4.17 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (2018.10.15)\n",
19 | "name": "stdout"
20 | }
21 | ]
22 | },
23 | {
24 | "metadata": {
25 | "trusted": true
26 | },
27 | "cell_type": "code",
28 | "source": "import azureml.dataprep as dprep",
29 | "execution_count": 2,
30 | "outputs": []
31 | },
32 | {
33 | "metadata": {},
34 | "cell_type": "markdown",
35 | "source": "First let's get the left side of the data into a shape that is ready for the join."
36 | },
37 | {
38 | "metadata": {
39 | "scrolled": false,
40 | "trusted": true
41 | },
42 | "cell_type": "code",
43 | "source": "# get the first dataflow and derive desired key column\ndataflow_l = dprep.read_csv(path='https://dpreptestfiles.blob.core.windows.net/testfiles/BostonWeather.csv')\ndataflow_l = dataflow_l.derive_column_by_example(source_columns='DATE', new_column_name='date_timerange',\n example_data=[('11/11/2015 0:54', 'Nov 11, 2015 | 12AM-2AM'),\n ('2/1/2015 0:54', 'Feb 1, 2015 | 12AM-2AM'),\n ('1/29/2015 20:54', 'Jan 29, 2015 | 8PM-10PM')])\ndataflow_l = dataflow_l.drop_columns(['DATE'])\n\n# convert types and summarize data\ndataflow_l = dataflow_l.set_column_types(type_conversions={'HOURLYDRYBULBTEMPF': dprep.TypeConverter(dprep.FieldType.DECIMAL)})\ndataflow_l = dataflow_l.filter(expression=dprep.f_not(dprep.col('HOURLYDRYBULBTEMPF').is_error()))\ndataflow_l = dataflow_l.summarize(group_by_columns=['date_timerange'],summary_columns=[dprep.SummaryColumnsValue('HOURLYDRYBULBTEMPF', dprep.api.engineapi.typedefinitions.SummaryFunction.MEAN, 'HOURLYDRYBULBTEMPF_Mean')] )\n\n# cache the result so the steps above are not executed every time we pull on the data\nimport os\nfrom pathlib import Path\ncache_dir = str(Path(os.getcwd(), 'dataflow-cache'))\ndataflow_l.cache(directory_path=cache_dir)\ndataflow_l.head(10)",
44 | "execution_count": 3,
45 | "outputs": [
46 | {
47 | "output_type": "error",
48 | "ename": "ExecutionError",
49 | "evalue": "Cannot write cache. Please check if the specified cache folder exists.",
50 | "traceback": [
51 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
52 | "\u001b[0;31mExecutionError\u001b[0m Traceback (most recent call last)",
53 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mpathlib\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mPath\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 17\u001b[0m \u001b[0mcache_dir\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mPath\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgetcwd\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'dataflow-cache'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 18\u001b[0;31m \u001b[0mdataflow_l\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcache\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdirectory_path\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcache_dir\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 19\u001b[0m \u001b[0mdataflow_l\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhead\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m10\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
54 | "\u001b[0;32m~/anaconda3_501/lib/python3.6/site-packages/azureml/dataprep/api/dataflow.py\u001b[0m in \u001b[0;36mcache\u001b[0;34m(self, directory_path)\u001b[0m\n\u001b[1;32m 982\u001b[0m \u001b[0;34m'cachePath'\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mLocalDataSource\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdirectory_path\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0munderlying_value\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 983\u001b[0m })\n\u001b[0;32m--> 984\u001b[0;31m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhead\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 985\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 986\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
55 | "\u001b[0;32m~/anaconda3_501/lib/python3.6/site-packages/azureml/dataprep/api/dataflow.py\u001b[0m in \u001b[0;36mhead\u001b[0;34m(self, count)\u001b[0m\n\u001b[1;32m 337\u001b[0m \u001b[0;34m:\u001b[0m\u001b[0;32mreturn\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mA\u001b[0m \u001b[0mPandas\u001b[0m \u001b[0mDataframe\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 338\u001b[0m \"\"\"\n\u001b[0;32m--> 339\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtake\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcount\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_pandas_dataframe\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mextended_types\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 340\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 341\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mrun_local\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
56 | "\u001b[0;32m~/anaconda3_501/lib/python3.6/site-packages/azureml/dataprep/api/dataflow.py\u001b[0m in \u001b[0;36mto_pandas_dataframe\u001b[0;34m(self, extended_types, nulls_as_nan)\u001b[0m\n\u001b[1;32m 391\u001b[0m self._engine_api.execute_anonymous_blocks(\n\u001b[1;32m 392\u001b[0m ExecuteAnonymousBlocksMessageArguments(blocks=steps_to_block_datas(dataflow_to_execute._steps),\n\u001b[0;32m--> 393\u001b[0;31m project_context=self._parent_package_path))\n\u001b[0m\u001b[1;32m 394\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 395\u001b[0m \u001b[0mintermediate_files\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mp\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mp\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mintermediate_path\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mglob\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'part-*'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
57 | "\u001b[0;32m~/anaconda3_501/lib/python3.6/site-packages/azureml/dataprep/api/_aml_helper.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(op_code, message)\u001b[0m\n\u001b[1;32m 35\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mchanged\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 36\u001b[0m \u001b[0mengine_api_func\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate_environment_variable\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mchanged\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 37\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0msend_message_func\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mop_code\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmessage\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 38\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 39\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mwrapper\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
58 | "\u001b[0;32m~/anaconda3_501/lib/python3.6/site-packages/azureml/dataprep/api/engineapi/api.py\u001b[0m in \u001b[0;36mexecute_anonymous_blocks\u001b[0;34m(self, message_args)\u001b[0m\n\u001b[1;32m 54\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mupdate_aml_env_vars\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mget_engine_api\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 55\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mexecute_anonymous_blocks\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmessage_args\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mtypedefinitions\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mExecuteAnonymousBlocksMessageArguments\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 56\u001b[0;31m \u001b[0mresponse\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend_message\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Engine.ExecuteActivity'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmessage_args\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 57\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresponse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 58\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
59 | "\u001b[0;32m~/anaconda3_501/lib/python3.6/site-packages/azureml/dataprep/api/engineapi/engine.py\u001b[0m in \u001b[0;36msend_message\u001b[0;34m(self, op_code, message)\u001b[0m\n\u001b[1;32m 61\u001b[0m \u001b[0mresponse\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_read_response\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 62\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;34m'error'\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mresponse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 63\u001b[0;31m \u001b[0mraise_engine_error\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresponse\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'error'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 64\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mresponse\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'id'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mmessage_id\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 65\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresponse\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'result'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
60 | "\u001b[0;32m~/anaconda3_501/lib/python3.6/site-packages/azureml/dataprep/api/errorhandlers.py\u001b[0m in \u001b[0;36mraise_engine_error\u001b[0;34m(error_response)\u001b[0m\n\u001b[1;32m 67\u001b[0m \u001b[0merror_code\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0merror_response\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'errorCode'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 68\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;34m'ActivityExecutionFailed'\u001b[0m \u001b[0;32min\u001b[0m \u001b[0merror_code\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 69\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mExecutionError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0merror_response\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 70\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0;34m'UnableToPreviewDataSource'\u001b[0m \u001b[0;32min\u001b[0m \u001b[0merror_code\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 71\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mPreviewDataSourceError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0merror_response\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
61 | "\u001b[0;31mExecutionError\u001b[0m: Cannot write cache. Please check if the specified cache folder exists."
62 | ]
63 | }
64 | ]
65 | },
66 | {
67 | "metadata": {},
68 | "cell_type": "markdown",
69 | "source": "Now let's prepare the data for the right side of the join."
70 | },
71 | {
72 | "metadata": {
73 | "scrolled": false,
74 | "trusted": true
75 | },
76 | "cell_type": "code",
77 | "source": "# get the second dataflow and desired key column\ndataflow_r = dprep.read_csv(path='https://dpreptestfiles.blob.core.windows.net/bike-share/*-hubway-tripdata.csv')\ndataflow_r = dataflow_r.keep_columns(['starttime', 'start station id'])\ndataflow_r = dataflow_r.derive_column_by_example(source_columns='starttime', new_column_name='l_date_timerange',\n example_data=[('2015-01-01 00:21:44', 'Jan 1, 2015 | 12AM-2AM')])\ndataflow_r = dataflow_r.drop_columns('starttime')\n\n# cache the results\ndataflow_r.cache(directory_path=cache_dir)\ndataflow_r.head(10)",
78 | "execution_count": null,
79 | "outputs": []
80 | },
81 | {
82 | "metadata": {},
83 | "cell_type": "markdown",
84 | "source": "There are three ways one can join two dataflows in DataPrep:\n1. Create `JoinBuilder` object for interactive join configuration.\n2. Call ```join()``` on one of the dataflows and pass in the other along with all other arguments.\n3. Call ```Dataflow.join()``` method and pass in two dataflows along with all other arguments.\n\nWe will explore the builder object as it simplifies the determination of correct arguments. "
85 | },
86 | {
87 | "metadata": {
88 | "trusted": true
89 | },
90 | "cell_type": "code",
91 | "source": "# construct a builder for joining dataflow_l with dataflow_r\njoin_builder = dataflow_l.builders.join(right_dataflow=dataflow_r, left_column_prefix='l', right_column_prefix='r')\n\njoin_builder",
92 | "execution_count": null,
93 | "outputs": []
94 | },
95 | {
96 | "metadata": {},
97 | "cell_type": "markdown",
98 | "source": "As you can see, so far the builder has no properties set except default values.\nFrom here you could set each of the options and preview its effect on the join result or use DataPrep to determine some of them.\n\nLet's start with determining appropriate column prefixes for the left and right sides of the join and the lists of columns that would not conflict and therefore don't need to be prefixed. "
99 | },
100 | {
101 | "metadata": {
102 | "scrolled": true,
103 | "trusted": true
104 | },
105 | "cell_type": "code",
106 | "source": "join_builder.detect_column_info()\njoin_builder",
107 | "execution_count": null,
108 | "outputs": []
109 | },
110 | {
111 | "metadata": {},
112 | "cell_type": "markdown",
113 | "source": "You can see that DataPrep has performed a pull on both dataflows to determine the column names in them. Given that `dataflow_r` already had a column starting with `l_`, a new prefix was generated which would not collide with any column names that are already present.\nAdditionally, columns in each dataflow that won't conflict during the join would remain unprefixed.\nThis approach to column naming is crucial for join robustness to schema changes in the data. Let's say that at some time in the future the data consumed by the left dataflow will also have an `l_date_timerange` column in it.\nConfigured as above, the join will still run as expected and the new column will be prefixed with `l2_`, ensuring that if column `l_date_timerange` was consumed by some other future transformation it remains unaffected.\n\nNote: `KEY_generated` is appended to both lists and is reserved for Dataprep use in case Autojoin is performed.\n\n### Autojoin\nAutojoin is a Dataprep feature that determines suitable join arguments given data on both sides. In some cases Autojoin can even derive a key column from a number of available columns in the data.\nHere is how you can use Autojoin:"
114 | },
115 | {
116 | "metadata": {
117 | "trusted": true
118 | },
119 | "cell_type": "code",
120 | "source": "# generate join suggestions\njoin_builder.generate_suggested_join()\n\n# list generated suggestions\njoin_builder.list_join_suggestions()",
121 | "execution_count": null,
122 | "outputs": []
123 | },
124 | {
125 | "metadata": {},
126 | "cell_type": "markdown",
127 | "source": "Now let's select the first suggestion and preview the result of the join."
128 | },
129 | {
130 | "metadata": {
131 | "trusted": true
132 | },
133 | "cell_type": "code",
134 | "source": "# apply first suggestion\njoin_builder.apply_suggestion(0)\n\njoin_builder.preview(10)",
135 | "execution_count": null,
136 | "outputs": []
137 | },
138 | {
139 | "metadata": {},
140 | "cell_type": "markdown",
141 | "source": "Everything looks just as we would expect, so it is time to get our new joined dataflow."
142 | },
143 | {
144 | "metadata": {
145 | "trusted": true
146 | },
147 | "cell_type": "code",
148 | "source": "dataflow_autojoined = join_builder.to_dataflow().drop_columns(['l_date_timerange'])",
149 | "execution_count": null,
150 | "outputs": []
151 | },
152 | {
153 | "metadata": {},
154 | "cell_type": "markdown",
155 | "source": "### Joining two dataflows without pulling the data\n\nIf you don't want to pull on the data and already know what the join should look like, you can always use the `join` method on `Dataflow`. "
156 | },
157 | {
158 | "metadata": {
159 | "trusted": true
160 | },
161 | "cell_type": "code",
162 | "source": "dataflow_joined = dprep.Dataflow.join(left_dataflow=dataflow_l,\n right_dataflow=dataflow_r,\n join_key_pairs=[('date_timerange', 'l_date_timerange')],\n left_column_prefix='l2_',\n right_column_prefix='r_')\n",
163 | "execution_count": null,
164 | "outputs": []
165 | },
166 | {
167 | "metadata": {
168 | "trusted": true
169 | },
170 | "cell_type": "code",
171 | "source": "dataflow_joined.head(10)",
172 | "execution_count": null,
173 | "outputs": []
174 | },
175 | {
176 | "metadata": {
177 | "trusted": true
178 | },
179 | "cell_type": "code",
180 | "source": "dataflow_joined = dataflow_joined.filter(expression=dprep.col('r_start station id') == '67')\ndf = dataflow_joined.to_pandas_dataframe()\ndf",
181 | "execution_count": null,
182 | "outputs": []
183 | },
184 | {
185 | "metadata": {
186 | "trusted": true
187 | },
188 | "cell_type": "code",
189 | "source": "",
190 | "execution_count": null,
191 | "outputs": []
192 | },
193 | {
194 | "metadata": {
195 | "trusted": true
196 | },
197 | "cell_type": "code",
198 | "source": "",
199 | "execution_count": null,
200 | "outputs": []
201 | },
202 | {
203 | "metadata": {
204 | "trusted": true
205 | },
206 | "cell_type": "code",
207 | "source": "",
208 | "execution_count": null,
209 | "outputs": []
210 | },
211 | {
212 | "metadata": {
213 | "trusted": true
214 | },
215 | "cell_type": "code",
216 | "source": "",
217 | "execution_count": null,
218 | "outputs": []
219 | }
220 | ],
221 | "metadata": {
222 | "kernelspec": {
223 | "name": "python36",
224 | "display_name": "Python 3.6",
225 | "language": "python"
226 | },
227 | "language_info": {
228 | "mimetype": "text/x-python",
229 | "nbconvert_exporter": "python",
230 | "name": "python",
231 | "pygments_lexer": "ipython3",
232 | "version": "3.6.6",
233 | "file_extension": ".py",
234 | "codemirror_mode": {
235 | "version": 3,
236 | "name": "ipython"
237 | }
238 | }
239 | },
240 | "nbformat": 4,
241 | "nbformat_minor": 2
242 | }
--------------------------------------------------------------------------------
/split-column-by-example.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "metadata": {},
5 | "cell_type": "markdown",
6 | "source": "# Split column by example\nCopyright (c) Microsoft Corporation. All rights reserved.
\nLicensed under the MIT License."
7 | },
8 | {
9 | "metadata": {},
10 | "cell_type": "markdown",
11 | "source": "DataPrep also offers you a way to easily split a column into multiple columns.\nThe SplitColumnByExampleBuilder class lets you generate a proper split program that will work even when the cases are not trivial, like in example below."
12 | },
13 | {
14 | "metadata": {
15 | "trusted": true
16 | },
17 | "cell_type": "code",
18 | "source": "!pip install azureml",
19 | "execution_count": 1,
20 | "outputs": [
21 | {
22 | "output_type": "stream",
23 | "text": "Requirement already satisfied: azureml in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (0.2.7)\nRequirement already satisfied: requests in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml) (2.19.1)\nRequirement already satisfied: pandas in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml) (0.22.0)\nRequirement already satisfied: python-dateutil in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml) (2.7.3)\nRequirement already satisfied: urllib3<1.24,>=1.21.1 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (1.23)\nRequirement already satisfied: certifi>=2017.4.17 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (2018.8.24)\nRequirement already satisfied: chardet<3.1.0,>=3.0.2 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (3.0.4)\nRequirement already satisfied: idna<2.8,>=2.5 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (2.7)\nRequirement already satisfied: pytz>=2011k in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from pandas->azureml) (2018.5)\nRequirement already satisfied: numpy>=1.9.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from pandas->azureml) (1.14.5)\nRequirement already satisfied: six>=1.5 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from python-dateutil->azureml) (1.11.0)\n",
24 | "name": "stdout"
25 | }
26 | ]
27 | },
28 | {
29 | "metadata": {
30 | "trusted": true
31 | },
32 | "cell_type": "code",
33 | "source": "import azureml.dataprep as dprep",
34 | "execution_count": 3,
35 | "outputs": []
36 | },
37 | {
38 | "metadata": {
39 | "scrolled": true,
40 | "trusted": true
41 | },
42 | "cell_type": "code",
43 | "source": "dataflow = dprep.read_lines(path='https://dpreptestfiles.blob.core.windows.net/testfiles/sample.log')\ndf = dataflow.head(10)",
44 | "execution_count": 4,
45 | "outputs": []
46 | },
47 | {
48 | "metadata": {
49 | "trusted": true
50 | },
51 | "cell_type": "code",
52 | "source": "df['Line'].iloc[0]",
53 | "execution_count": 5,
54 | "outputs": [
55 | {
56 | "output_type": "execute_result",
57 | "execution_count": 5,
58 | "data": {
59 | "text/plain": "'2012-02-03 18:35:34 SampleClass6 [INFO] everything normal for id 577725851'"
60 | },
61 | "metadata": {}
62 | }
63 | ]
64 | },
65 | {
66 | "metadata": {},
67 | "cell_type": "markdown",
68 | "source": "As you can see above, you can't split this particular log file by the space character, as it would create too many columns and, even worse, the number of columns would depend on a string in the 6th column.\nThat's where split_column_by_example could be quite useful."
69 | },
70 | {
71 | "metadata": {
72 | "trusted": true
73 | },
74 | "cell_type": "code",
75 | "source": "b = dataflow.builders.split_column_by_example('Line', keep_delimiters=True)",
76 | "execution_count": 6,
77 | "outputs": []
78 | },
79 | {
80 | "metadata": {
81 | "scrolled": false,
82 | "trusted": true
83 | },
84 | "cell_type": "code",
85 | "source": "b.preview()",
86 | "execution_count": 7,
87 | "outputs": [
88 | {
89 | "output_type": "execute_result",
90 | "execution_count": 7,
91 | "data": {
92 | "text/html": "\n\n
\n \n \n | \n Line | \n Line_1 | \n Line_2 | \n Line_3 | \n Line_4 | \n Line_5 | \n Line_6 | \n Line_7 | \n Line_8 | \n Line_9 | \n Line_10 | \n Line_11 | \n Line_12 | \n
\n \n \n \n | 0 | \n 2012-02-03 18:35:34 SampleClass6 [INFO] everyt... | \n 2012-02-03 | \n | \n 18:35:34 | \n | \n SampleClass | \n 6 | \n [ | \n INFO | \n ] | \n everything normal for id | \n | \n 577725851 | \n
\n \n | 1 | \n 2012-02-03 18:35:34 SampleClass4 [FATAL] syste... | \n 2012-02-03 | \n | \n 18:35:34 | \n | \n SampleClass | \n 4 | \n [ | \n FATAL | \n ] | \n system problem at id | \n | \n 1991281254 | \n
\n \n | 2 | \n 2012-02-03 18:35:34 SampleClass3 [DEBUG] detai... | \n 2012-02-03 | \n | \n 18:35:34 | \n | \n SampleClass | \n 3 | \n [ | \n DEBUG | \n ] | \n detail for id | \n | \n 1304807656 | \n
\n \n | 3 | \n 2012-02-03 18:35:34 SampleClass3 [WARN] missin... | \n 2012-02-03 | \n | \n 18:35:34 | \n | \n SampleClass | \n 3 | \n [ | \n WARN | \n ] | \n missing id | \n | \n 423340895 | \n
\n \n | 4 | \n 2012-02-03 18:35:34 SampleClass5 [TRACE] verbo... | \n 2012-02-03 | \n | \n 18:35:34 | \n | \n SampleClass | \n 5 | \n [ | \n TRACE | \n ] | \n verbose detail for id | \n | \n 2082654978 | \n
\n \n | 5 | \n 2012-02-03 18:35:34 SampleClass0 [ERROR] incor... | \n None | \n None | \n None | \n None | \n None | \n None | \n None | \n None | \n None | \n None | \n None | \n None | \n
\n \n | 6 | \n 2012-02-03 18:35:34 SampleClass9 [TRACE] verbo... | \n 2012-02-03 | \n | \n 18:35:34 | \n | \n SampleClass | \n 9 | \n [ | \n TRACE | \n ] | \n verbose detail for id | \n | \n 438634209 | \n
\n \n | 7 | \n 2012-02-03 18:35:34 SampleClass8 [DEBUG] detai... | \n 2012-02-03 | \n | \n 18:35:34 | \n | \n SampleClass | \n 8 | \n [ | \n DEBUG | \n ] | \n detail for id | \n | \n 2074121310 | \n
\n \n | 8 | \n 2012-02-03 18:55:54 SampleClass4 [DEBUG] detai... | \n 2012-02-03 | \n | \n 18:55:54 | \n | \n SampleClass | \n 4 | \n [ | \n DEBUG | \n ] | \n detail for id | \n | \n 1029178762 | \n
\n \n | 9 | \n 2012-02-03 18:55:54 SampleClass2 [TRACE] verbo... | \n 2012-02-03 | \n | \n 18:55:54 | \n | \n SampleClass | \n 2 | \n [ | \n TRACE | \n ] | \n verbose detail for id | \n | \n 1135460272 | \n
\n \n
\n
",
93 | "text/plain": " Line Line_1 Line_2 \\\n0 2012-02-03 18:35:34 SampleClass6 [INFO] everyt... 2012-02-03 \n1 2012-02-03 18:35:34 SampleClass4 [FATAL] syste... 2012-02-03 \n2 2012-02-03 18:35:34 SampleClass3 [DEBUG] detai... 2012-02-03 \n3 2012-02-03 18:35:34 SampleClass3 [WARN] missin... 2012-02-03 \n4 2012-02-03 18:35:34 SampleClass5 [TRACE] verbo... 2012-02-03 \n5 2012-02-03 18:35:34 SampleClass0 [ERROR] incor... None None \n6 2012-02-03 18:35:34 SampleClass9 [TRACE] verbo... 2012-02-03 \n7 2012-02-03 18:35:34 SampleClass8 [DEBUG] detai... 2012-02-03 \n8 2012-02-03 18:55:54 SampleClass4 [DEBUG] detai... 2012-02-03 \n9 2012-02-03 18:55:54 SampleClass2 [TRACE] verbo... 2012-02-03 \n\n Line_3 Line_4 Line_5 Line_6 Line_7 Line_8 Line_9 \\\n0 18:35:34 SampleClass 6 [ INFO ] \n1 18:35:34 SampleClass 4 [ FATAL ] \n2 18:35:34 SampleClass 3 [ DEBUG ] \n3 18:35:34 SampleClass 3 [ WARN ] \n4 18:35:34 SampleClass 5 [ TRACE ] \n5 None None None None None None None \n6 18:35:34 SampleClass 9 [ TRACE ] \n7 18:35:34 SampleClass 8 [ DEBUG ] \n8 18:55:54 SampleClass 4 [ DEBUG ] \n9 18:55:54 SampleClass 2 [ TRACE ] \n\n Line_10 Line_11 Line_12 \n0 everything normal for id 577725851 \n1 system problem at id 1991281254 \n2 detail for id 1304807656 \n3 missing id 423340895 \n4 verbose detail for id 2082654978 \n5 None None None \n6 verbose detail for id 438634209 \n7 detail for id 2074121310 \n8 detail for id 1029178762 \n9 verbose detail for id 1135460272 "
94 | },
95 | "metadata": {}
96 | }
97 | ]
98 | },
99 | {
100 | "metadata": {},
101 | "cell_type": "markdown",
102 | "source": "A couple of things to take note of here. No examples were given, and yet DataPrep was able to generate a quite reasonable split program. \nWe have passed keep_delimiters=True so we can see all the data split into columns. In practice, though, delimiters are rarely useful, so let's exclude them."
103 | },
104 | {
105 | "metadata": {
106 | "scrolled": true,
107 | "trusted": true
108 | },
109 | "cell_type": "code",
110 | "source": "b.keep_delimiters = False\nb.preview()",
111 | "execution_count": 8,
112 | "outputs": [
113 | {
114 | "output_type": "execute_result",
115 | "execution_count": 8,
116 | "data": {
117 | "text/html": "\n\n
\n \n \n | \n Line | \n Line_1 | \n Line_2 | \n Line_3 | \n Line_4 | \n Line_5 | \n Line_6 | \n Line_7 | \n
\n \n \n \n | 0 | \n 2012-02-03 18:35:34 SampleClass6 [INFO] everyt... | \n 2012-02-03 | \n 18:35:34 | \n SampleClass | \n 6 | \n INFO | \n everything normal for id | \n 577725851 | \n
\n \n | 1 | \n 2012-02-03 18:35:34 SampleClass4 [FATAL] syste... | \n 2012-02-03 | \n 18:35:34 | \n SampleClass | \n 4 | \n FATAL | \n system problem at id | \n 1991281254 | \n
\n \n | 2 | \n 2012-02-03 18:35:34 SampleClass3 [DEBUG] detai... | \n 2012-02-03 | \n 18:35:34 | \n SampleClass | \n 3 | \n DEBUG | \n detail for id | \n 1304807656 | \n
\n \n | 3 | \n 2012-02-03 18:35:34 SampleClass3 [WARN] missin... | \n 2012-02-03 | \n 18:35:34 | \n SampleClass | \n 3 | \n WARN | \n missing id | \n 423340895 | \n
\n \n | 4 | \n 2012-02-03 18:35:34 SampleClass5 [TRACE] verbo... | \n 2012-02-03 | \n 18:35:34 | \n SampleClass | \n 5 | \n TRACE | \n verbose detail for id | \n 2082654978 | \n
\n \n | 5 | \n 2012-02-03 18:35:34 SampleClass0 [ERROR] incor... | \n None | \n None | \n None | \n None | \n None | \n None | \n None | \n
\n \n | 6 | \n 2012-02-03 18:35:34 SampleClass9 [TRACE] verbo... | \n 2012-02-03 | \n 18:35:34 | \n SampleClass | \n 9 | \n TRACE | \n verbose detail for id | \n 438634209 | \n
\n \n | 7 | \n 2012-02-03 18:35:34 SampleClass8 [DEBUG] detai... | \n 2012-02-03 | \n 18:35:34 | \n SampleClass | \n 8 | \n DEBUG | \n detail for id | \n 2074121310 | \n
\n \n | 8 | \n 2012-02-03 18:55:54 SampleClass4 [DEBUG] detai... | \n 2012-02-03 | \n 18:55:54 | \n SampleClass | \n 4 | \n DEBUG | \n detail for id | \n 1029178762 | \n
\n \n | 9 | \n 2012-02-03 18:55:54 SampleClass2 [TRACE] verbo... | \n 2012-02-03 | \n 18:55:54 | \n SampleClass | \n 2 | \n TRACE | \n verbose detail for id | \n 1135460272 | \n
\n \n
\n
",
118 | "text/plain": " Line Line_1 Line_2 \\\n0 2012-02-03 18:35:34 SampleClass6 [INFO] everyt... 2012-02-03 18:35:34 \n1 2012-02-03 18:35:34 SampleClass4 [FATAL] syste... 2012-02-03 18:35:34 \n2 2012-02-03 18:35:34 SampleClass3 [DEBUG] detai... 2012-02-03 18:35:34 \n3 2012-02-03 18:35:34 SampleClass3 [WARN] missin... 2012-02-03 18:35:34 \n4 2012-02-03 18:35:34 SampleClass5 [TRACE] verbo... 2012-02-03 18:35:34 \n5 2012-02-03 18:35:34 SampleClass0 [ERROR] incor... None None \n6 2012-02-03 18:35:34 SampleClass9 [TRACE] verbo... 2012-02-03 18:35:34 \n7 2012-02-03 18:35:34 SampleClass8 [DEBUG] detai... 2012-02-03 18:35:34 \n8 2012-02-03 18:55:54 SampleClass4 [DEBUG] detai... 2012-02-03 18:55:54 \n9 2012-02-03 18:55:54 SampleClass2 [TRACE] verbo... 2012-02-03 18:55:54 \n\n Line_3 Line_4 Line_5 Line_6 Line_7 \n0 SampleClass 6 INFO everything normal for id 577725851 \n1 SampleClass 4 FATAL system problem at id 1991281254 \n2 SampleClass 3 DEBUG detail for id 1304807656 \n3 SampleClass 3 WARN missing id 423340895 \n4 SampleClass 5 TRACE verbose detail for id 2082654978 \n5 None None None None None \n6 SampleClass 9 TRACE verbose detail for id 438634209 \n7 SampleClass 8 DEBUG detail for id 2074121310 \n8 SampleClass 4 DEBUG detail for id 1029178762 \n9 SampleClass 2 TRACE verbose detail for id 1135460272 "
119 | },
120 | "metadata": {}
121 | }
122 | ]
123 | },
124 | {
125 | "metadata": {},
126 | "cell_type": "markdown",
127 | "source": "This looks pretty good already, except for line 5.\nIf we request generation of suggested examples, we will see that line 5 is one of the items the program needs more input on."
128 | },
129 | {
130 | "metadata": {
131 | "trusted": true
132 | },
133 | "cell_type": "code",
134 | "source": "suggestions = b.generate_suggested_examples()\nsuggestions",
135 | "execution_count": 9,
136 | "outputs": [
137 | {
138 | "output_type": "execute_result",
139 | "execution_count": 9,
140 | "data": {
141 | "text/html": "\n\n
\n \n \n | \n Line | \n
\n \n \n \n | 0 | \n 2012-02-03 18:35:34 SampleClass6 [INFO] everyt... | \n
\n \n | 1 | \n 2012-02-03 18:35:34 SampleClass0 [ERROR] incor... | \n
\n \n | 2 | \n | \n
\n \n | 3 | \n java.lang.Exception: 2012-02-03 19:11:02 Sampl... | \n
\n \n | 4 | \n \\tat com.osa.mocklogger.MockLogger$2.run(MockL... | \n
\n \n
\n
",
142 | "text/plain": " Line\n0 2012-02-03 18:35:34 SampleClass6 [INFO] everyt...\n1 2012-02-03 18:35:34 SampleClass0 [ERROR] incor...\n2 \n3 java.lang.Exception: 2012-02-03 19:11:02 Sampl...\n4 \\tat com.osa.mocklogger.MockLogger$2.run(MockL..."
143 | },
144 | "metadata": {}
145 | }
146 | ]
147 | },
148 | {
149 | "metadata": {
150 | "trusted": true
151 | },
152 | "cell_type": "code",
153 | "source": "suggestions.iloc[1]['Line']",
154 | "execution_count": 10,
155 | "outputs": [
156 | {
157 | "output_type": "execute_result",
158 | "execution_count": 10,
159 | "data": {
160 | "text/plain": "'2012-02-03 18:35:34 SampleClass0 [ERROR] incorrect id 1886438513'"
161 | },
162 | "metadata": {}
163 | }
164 | ]
165 | },
166 | {
167 | "metadata": {},
168 | "cell_type": "markdown",
169 | "source": "Having retrieved the source value, we can now provide an example of the desired split.\nNotice that we chose not to split date and time but rather keep them together in one column."
170 | },
171 | {
172 | "metadata": {
173 | "trusted": true
174 | },
175 | "cell_type": "code",
176 | "source": "b.add_example(example=(suggestions['Line'].iloc[1], ['2012-02-03 18:35:34','SampleClass0','ERROR','incorrect id','1886438513']))",
177 | "execution_count": 11,
178 | "outputs": []
179 | },
180 | {
181 | "metadata": {
182 | "scrolled": false,
183 | "trusted": true
184 | },
185 | "cell_type": "code",
186 | "source": "b.preview()",
187 | "execution_count": 12,
188 | "outputs": [
189 | {
190 | "output_type": "execute_result",
191 | "execution_count": 12,
192 | "data": {
193 | "text/html": "\n\n
\n \n \n | \n Line | \n Line_1 | \n Line_2 | \n Line_3 | \n Line_4 | \n Line_5 | \n
\n \n \n \n | 0 | \n 2012-02-03 18:35:34 SampleClass6 [INFO] everyt... | \n 2012-02-03 18:35:34 | \n SampleClass6 | \n INFO | \n everything normal for id | \n 577725851 | \n
\n \n | 1 | \n 2012-02-03 18:35:34 SampleClass4 [FATAL] syste... | \n 2012-02-03 18:35:34 | \n SampleClass4 | \n FATAL | \n system problem at id | \n 1991281254 | \n
\n \n | 2 | \n 2012-02-03 18:35:34 SampleClass3 [DEBUG] detai... | \n 2012-02-03 18:35:34 | \n SampleClass3 | \n DEBUG | \n detail for id | \n 1304807656 | \n
\n \n | 3 | \n 2012-02-03 18:35:34 SampleClass3 [WARN] missin... | \n 2012-02-03 18:35:34 | \n SampleClass3 | \n WARN | \n missing id | \n 423340895 | \n
\n \n | 4 | \n 2012-02-03 18:35:34 SampleClass5 [TRACE] verbo... | \n 2012-02-03 18:35:34 | \n SampleClass5 | \n TRACE | \n verbose detail for id | \n 2082654978 | \n
\n \n | 5 | \n 2012-02-03 18:35:34 SampleClass0 [ERROR] incor... | \n 2012-02-03 18:35:34 | \n SampleClass0 | \n ERROR | \n incorrect id | \n 1886438513 | \n
\n \n | 6 | \n 2012-02-03 18:35:34 SampleClass9 [TRACE] verbo... | \n 2012-02-03 18:35:34 | \n SampleClass9 | \n TRACE | \n verbose detail for id | \n 438634209 | \n
\n \n | 7 | \n 2012-02-03 18:35:34 SampleClass8 [DEBUG] detai... | \n 2012-02-03 18:35:34 | \n SampleClass8 | \n DEBUG | \n detail for id | \n 2074121310 | \n
\n \n | 8 | \n 2012-02-03 18:55:54 SampleClass4 [DEBUG] detai... | \n 2012-02-03 18:55:54 | \n SampleClass4 | \n DEBUG | \n detail for id | \n 1029178762 | \n
\n \n | 9 | \n 2012-02-03 18:55:54 SampleClass2 [TRACE] verbo... | \n 2012-02-03 18:55:54 | \n SampleClass2 | \n TRACE | \n verbose detail for id | \n 1135460272 | \n
\n \n
\n
",
194 | "text/plain": " Line Line_1 \\\n0 2012-02-03 18:35:34 SampleClass6 [INFO] everyt... 2012-02-03 18:35:34 \n1 2012-02-03 18:35:34 SampleClass4 [FATAL] syste... 2012-02-03 18:35:34 \n2 2012-02-03 18:35:34 SampleClass3 [DEBUG] detai... 2012-02-03 18:35:34 \n3 2012-02-03 18:35:34 SampleClass3 [WARN] missin... 2012-02-03 18:35:34 \n4 2012-02-03 18:35:34 SampleClass5 [TRACE] verbo... 2012-02-03 18:35:34 \n5 2012-02-03 18:35:34 SampleClass0 [ERROR] incor... 2012-02-03 18:35:34 \n6 2012-02-03 18:35:34 SampleClass9 [TRACE] verbo... 2012-02-03 18:35:34 \n7 2012-02-03 18:35:34 SampleClass8 [DEBUG] detai... 2012-02-03 18:35:34 \n8 2012-02-03 18:55:54 SampleClass4 [DEBUG] detai... 2012-02-03 18:55:54 \n9 2012-02-03 18:55:54 SampleClass2 [TRACE] verbo... 2012-02-03 18:55:54 \n\n Line_2 Line_3 Line_4 Line_5 \n0 SampleClass6 INFO everything normal for id 577725851 \n1 SampleClass4 FATAL system problem at id 1991281254 \n2 SampleClass3 DEBUG detail for id 1304807656 \n3 SampleClass3 WARN missing id 423340895 \n4 SampleClass5 TRACE verbose detail for id 2082654978 \n5 SampleClass0 ERROR incorrect id 1886438513 \n6 SampleClass9 TRACE verbose detail for id 438634209 \n7 SampleClass8 DEBUG detail for id 2074121310 \n8 SampleClass4 DEBUG detail for id 1029178762 \n9 SampleClass2 TRACE verbose detail for id 1135460272 "
195 | },
196 | "metadata": {}
197 | }
198 | ]
199 | },
200 | {
201 | "metadata": {},
202 | "cell_type": "markdown",
203 | "source": "This looks just like what we need, so let's get a dataflow with the split in it and drop the original column."
204 | },
205 | {
206 | "metadata": {
207 | "trusted": true
208 | },
209 | "cell_type": "code",
210 | "source": "dataflow = b.to_dataflow()\ndataflow = dataflow.drop_columns(['Line'])\ndataflow.head(10)",
211 | "execution_count": 13,
212 | "outputs": [
213 | {
214 | "output_type": "execute_result",
215 | "execution_count": 13,
216 | "data": {
217 | "text/html": "\n\n
\n \n \n | \n Line_1 | \n Line_2 | \n Line_3 | \n Line_4 | \n Line_5 | \n
\n \n \n \n | 0 | \n 2012-02-03 18:35:34 | \n SampleClass6 | \n INFO | \n everything normal for id | \n 577725851 | \n
\n \n | 1 | \n 2012-02-03 18:35:34 | \n SampleClass4 | \n FATAL | \n system problem at id | \n 1991281254 | \n
\n \n | 2 | \n 2012-02-03 18:35:34 | \n SampleClass3 | \n DEBUG | \n detail for id | \n 1304807656 | \n
\n \n | 3 | \n 2012-02-03 18:35:34 | \n SampleClass3 | \n WARN | \n missing id | \n 423340895 | \n
\n \n | 4 | \n 2012-02-03 18:35:34 | \n SampleClass5 | \n TRACE | \n verbose detail for id | \n 2082654978 | \n
\n \n | 5 | \n 2012-02-03 18:35:34 | \n SampleClass0 | \n ERROR | \n incorrect id | \n 1886438513 | \n
\n \n | 6 | \n 2012-02-03 18:35:34 | \n SampleClass9 | \n TRACE | \n verbose detail for id | \n 438634209 | \n
\n \n | 7 | \n 2012-02-03 18:35:34 | \n SampleClass8 | \n DEBUG | \n detail for id | \n 2074121310 | \n
\n \n | 8 | \n 2012-02-03 18:55:54 | \n SampleClass4 | \n DEBUG | \n detail for id | \n 1029178762 | \n
\n \n | 9 | \n 2012-02-03 18:55:54 | \n SampleClass2 | \n TRACE | \n verbose detail for id | \n 1135460272 | \n
\n \n
\n
",
218 | "text/plain": " Line_1 Line_2 Line_3 Line_4 \\\n0 2012-02-03 18:35:34 SampleClass6 INFO everything normal for id \n1 2012-02-03 18:35:34 SampleClass4 FATAL system problem at id \n2 2012-02-03 18:35:34 SampleClass3 DEBUG detail for id \n3 2012-02-03 18:35:34 SampleClass3 WARN missing id \n4 2012-02-03 18:35:34 SampleClass5 TRACE verbose detail for id \n5 2012-02-03 18:35:34 SampleClass0 ERROR incorrect id \n6 2012-02-03 18:35:34 SampleClass9 TRACE verbose detail for id \n7 2012-02-03 18:35:34 SampleClass8 DEBUG detail for id \n8 2012-02-03 18:55:54 SampleClass4 DEBUG detail for id \n9 2012-02-03 18:55:54 SampleClass2 TRACE verbose detail for id \n\n Line_5 \n0 577725851 \n1 1991281254 \n2 1304807656 \n3 423340895 \n4 2082654978 \n5 1886438513 \n6 438634209 \n7 2074121310 \n8 1029178762 \n9 1135460272 "
219 | },
220 | "metadata": {}
221 | }
222 | ]
223 | },
224 | {
225 | "metadata": {
226 | "trusted": true
227 | },
228 | "cell_type": "code",
229 | "source": "",
230 | "execution_count": null,
231 | "outputs": []
232 | }
233 | ],
234 | "metadata": {
235 | "kernelspec": {
236 | "name": "python36",
237 | "display_name": "Python 3.6",
238 | "language": "python"
239 | },
240 | "language_info": {
241 | "mimetype": "text/x-python",
242 | "nbconvert_exporter": "python",
243 | "name": "python",
244 | "pygments_lexer": "ipython3",
245 | "version": "3.6.6",
246 | "file_extension": ".py",
247 | "codemirror_mode": {
248 | "version": 3,
249 | "name": "ipython"
250 | }
251 | }
252 | },
253 | "nbformat": 4,
254 | "nbformat_minor": 2
255 | }
--------------------------------------------------------------------------------