├── data
    ├── elements.csv
    ├── excel.xlsx
    ├── map_func.py
    ├── parquet.parquet
    ├── multiple_separators.csv
    ├── text_lines.txt
    ├── crime0-10.dprep.settings
    ├── crime0-10.csv
    ├── adls-dpreptestfiles.crt
    ├── median_income.csv
    ├── secrets.dprep
    ├── crime0-10.dprep
    └── median_income_transformed.csv
├── README.md
├── column-type-transform.ipynb
├── package-json-representation.ipynb
├── quantile-transformation.ipynb
├── read-pandas-dataframe.ipynb
├── secrets.ipynb
├── external-references.ipynb
├── impute-missing-values.ipynb
├── 0. Import librairie.ipynb
├── smart-read-file-separators.ipynb
├── caching.ipynb
├── join.ipynb
└── split-column-by-example.ipynb


/data/elements.csv:
--------------------------------------------------------------------------------
1 | ID,Symbol,Boiling Point
2 | 1,H,-252.87
3 | 53,I,184.3
4 | 2,He,-268.93


--------------------------------------------------------------------------------
/data/excel.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/retkowsky/Data-Preparation-avec-Azure-ML-service/master/data/excel.xlsx


--------------------------------------------------------------------------------
/data/map_func.py:
--------------------------------------------------------------------------------
def transform(df, index):
    """Replace missing values in the 'MAM_MTH00numvalid_1011' column with 0.

    Args:
        df: Table-like object with a 'MAM_MTH00numvalid_1011' column
            (presumably a pandas DataFrame -- confirm against the caller).
        index: Not used by this function; kept so the signature is unchanged.

    Returns:
        The same ``df`` object that was passed in, with the missing values of
        the 'MAM_MTH00numvalid_1011' column replaced by 0.

    Raises:
        KeyError: If ``df`` has no 'MAM_MTH00numvalid_1011' column.
    """
    column = 'MAM_MTH00numvalid_1011'
    # Assign the filled column back to the frame instead of calling
    # fillna(inplace=True) on the df[column] selection: with pandas
    # Copy-on-Write that selection is a temporary object, so an in-place fill
    # on it would leave df itself unchanged.
    df[column] = df[column].fillna(0)
    return df
4 | 


--------------------------------------------------------------------------------
/data/parquet.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/retkowsky/Data-Preparation-avec-Azure-ML-service/master/data/parquet.parquet


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Data-Preparation-avec-Azure-ML-service
2 | 
3 | Serge Retkowsky | serge.retkowsky@microsoft.com | https://www.linkedin.com/in/serger/
4 | 


--------------------------------------------------------------------------------
/data/multiple_separators.csv:
--------------------------------------------------------------------------------
 1 | ID |CaseNumber| |Completed|
 2 | 10140490 |HY329907| |Y|
 3 | 10139776 |HY329265| |Y|
 4 | 10140270 |HY329253| |N|
 5 | 10139885 |HY329308| |Y|
 6 | 10140379 |HY329556| |N|
 7 | 10140868 |HY330421| |N|
 8 | 10139762 |HY329232| |N|
 9 | 10139722 |HY329228| |Y|
10 | 10139774 |HY329209| |N|
11 | 10139697 |HY329177| |N|


--------------------------------------------------------------------------------
/data/text_lines.txt:
--------------------------------------------------------------------------------
 1 | Date||Minimum temperature||Maximum temperature
 2 | 2015-07-1||-4.1||10.0
 3 | 2015-07-2||-0.8||10.8
 4 | 2015-07-3||-7.0||10.5
 5 | 2015-07-4||-5.5||9.3
 6 | 2015-07-5||-4.7||7.3
 7 | 2015-07-6||-2.4||11.2
 8 | 2015-07-7||-4.7||11.5
 9 | 2015-07-8||-3.0||12.6
10 | 2015-07-9||-1.3||13.8
11 | 2015-07-10||-0.5||9.9
12 | 2015-07-11||3.6||12.5
13 | 2015-07-12||3.1||9.2
14 | 2015-07-13||3.6||13.6
15 | 2015-07-14||4.1||10.0
16 | 2015-07-15||1.1||7.9
17 | 


--------------------------------------------------------------------------------
/data/crime0-10.dprep.settings:
--------------------------------------------------------------------------------
1 | {"project":{"activitiesPaneSize":200,"isActivitiesPaneCollapsed":true,"activeActivityId":"75637565-60ad-4baa-87d3-396a7930cfe7","isInActivityView":true},"75637565-60ad-4baa-87d3-396a7930cfe7.main.visualFilters":[],"75637565-60ad-4baa-87d3-396a7930cfe7.main.currentSliceIndex":0,"75637565-60ad-4baa-87d3-396a7930cfe7.main.typeFilter":[],"75637565-60ad-4baa-87d3-396a7930cfe7.main.columnSearchFilter":{"term":"","matchCase":false,"matchWholeWord":false,"useRegex":false},"75637565-60ad-4baa-87d3-396a7930cfe7.main.columnsSelections":[]}


--------------------------------------------------------------------------------
/data/crime0-10.csv:
--------------------------------------------------------------------------------
 1 | ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,Beat,District,Ward,Community Area,FBI Code,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location
 2 | 10140490,HY329907,07/05/2015 11:50:00 PM,050XX N NEWLAND AVE,0820,THEFT,$500 AND UNDER,STREET,false,false,1613,016,41,10,06,1129230,1933315,2015,07/12/2015 12:42:46 PM,41.973309466,-87.800174996,"(41.973309466, -87.800174996)"
 3 | 10139776,HY329265,07/05/2015 11:30:00 PM,011XX W MORSE AVE,0460,BATTERY,SIMPLE,STREET,false,true,2431,024,49,1,08B,1167370,1946271,2015,07/12/2015 12:42:46 PM,42.008124017,-87.65955018,"(42.008124017, -87.65955018)"
 4 | 10140270,HY329253,07/05/2015 11:20:00 PM,121XX S FRONT AVE,0486,BATTERY,DOMESTIC BATTERY SIMPLE,STREET,false,true,0532,,9,53,08B,,,2015,07/12/2015 12:42:46 PM,,,
 5 | 10139885,HY329308,07/05/2015 11:19:00 PM,051XX W DIVISION ST,0610,BURGLARY,FORCIBLE ENTRY,SMALL RETAIL STORE,false,false,1531,015,37,25,05,1141721,1907465,2015,07/12/2015 12:42:46 PM,41.902152027,-87.754883404,"(41.902152027, -87.754883404)"
 6 | 10140379,HY329556,07/05/2015 11:00:00 PM,012XX W LAKE ST,0930,MOTOR VEHICLE THEFT,THEFT/RECOVERY: AUTOMOBILE,STREET,false,false,1215,012,27,28,07,1168413,1901632,2015,07/12/2015 12:42:46 PM,41.885610142,-87.657008701,"(41.885610142, -87.657008701)"
 7 | 10140868,HY330421,07/05/2015 10:54:00 PM,118XX S PEORIA ST,1320,CRIMINAL DAMAGE,TO VEHICLE,VEHICLE NON-COMMERCIAL,false,false,0524,005,34,53,14,1172409,1826485,2015,07/12/2015 12:42:46 PM,41.6793109,-87.644545209,"(41.6793109, -87.644545209)"
 8 | 10139762,HY329232,07/05/2015 10:42:00 PM,026XX W 37TH PL,1020,ARSON,BY FIRE,VACANT LOT/LAND,false,false,0911,009,12,58,09,1159436,1879658,2015,07/12/2015 12:42:46 PM,41.825500607,-87.690578042,"(41.825500607, -87.690578042)"
 9 | 10139722,HY329228,07/05/2015 10:30:00 PM,016XX S CENTRAL PARK AVE,1811,NARCOTICS,POSS: CANNABIS 30GMS OR LESS,ALLEY,true,false,1021,010,24,29,18,1152687,1891389,2015,07/12/2015 12:42:46 PM,41.857827814,-87.715028789,"(41.857827814, -87.715028789)"
10 | 10139774,HY329209,07/05/2015 10:15:00 PM,048XX N ASHLAND AVE,1310,CRIMINAL DAMAGE,TO PROPERTY,APARTMENT,false,false,2032,020,46,3,14,1164821,1932394,2015,07/12/2015 12:42:46 PM,41.970099796,-87.669324377,"(41.970099796, -87.669324377)"
11 | 10139697,HY329177,07/05/2015 10:10:00 PM,058XX S ARTESIAN AVE,1320,CRIMINAL DAMAGE,TO VEHICLE,ALLEY,false,false,0824,008,16,63,14,1160997,1865851,2015,07/12/2015 12:42:46 PM,41.787580282,-87.685233078,"(41.787580282, -87.685233078)"


--------------------------------------------------------------------------------
/data/adls-dpreptestfiles.crt:
--------------------------------------------------------------------------------
 1 | -----BEGIN PRIVATE KEY-----
 2 | MIIEvwIBADANBgkqhkiG9w0BAQEFAASCBKkwggSlAgEAAoIBAQDmkkyF0BwipZow
 3 | Wd1AMkRkySx0y079JPxpsYhv4i1xXKdoa9bpFqwoXmJpeQM1JWnU4UeZzFeM86qK
 4 | AhQvL4KV4kibcP2ENvu2NKFEdotO3uxPJ+6GlcYwMYzy+tUj008KnnRZfTrR78sJ
 5 | tIl3C6lnVL0ICihksG59P1sskRq3PvOjXLAdEZalwDjZ4ZPoNDZdj6nUjB2l8zqu
 6 | pKAt5mR+bJ9Sox4yrDuNhMmFt5QsRDRe3wUqdV+C9OCWHmjlmsjrYw7p9YmjBDvC
 7 | 5U7mF0Mk/XeYFzj0pkXKQVqBL6xqig+q5ob0szYfg19iDeFhS3iIsRcJGEnRVW/A
 8 | NpsBZyKrAgMBAAECggEBANlvP8C1F8NInhZYuIAwpzTQTh86Fxw8g9h8dijkh2wv
 9 | LyQXBk07d1B+aZoDZ5X32UzKwcX04N9obfvFqBkzWZdVFJmZvUmwvEEActBoZkkT
10 | io+/HX5HweVy5PPCvbsSK6jc8uXtZcnSs4tMeJIOKkvqqnTpd1w00Y1FcQqfMC16
11 | 4p7o8wbt6OFoFAYqcxeVYVwDzCTLZD3+iJaqmntkBkoDndJy52yXQmMq5z1wbQVp
12 | BL6+L9nTvmouy64jiHVSKOx8nnWThYfHsXoPv+rYywjeuK/v3hyaTAwogs36ooEn
13 | SnuTBRvJcumN9Q0XIVlxKMVBcGyyAP+0yNKGz5NQgdECgYEA/I/Uq1E3epPJgEWR
14 | Bub+LpCgwtrw/lgKncb/Q/AiE9qoXobUe4KNU8aGaNMb7uVNLckY7cOluLS6SQb3
15 | Mzwk2Jl0G3vk8rW46tZWvSYB8+zAR2Rz7seUOT9SE5OmvwpnHrnp3nRr1vvVd2bp
16 | Q/ypwMLrwWQN51Kr+oTS74bUbrkCgYEA6bXVIUyao7z2Q3qAr6h+6JEWDbkJA7hJ
17 | BjHIOXvxd1tMoJJX+X9+IE/2XoJaUkGCb0vrM/hi1cyQFmS4Or/J6IWSZu8oBpDr
18 | EBmIK3PF1nrzNvWD28wM46c6ScehyWSm/u4bJWSm9liTX3dv5Kpa6ym7yLKc3c0B
19 | ECpSJM+5SoMCgYEAq585Tukzn/IJPUcIk/4nv5C8DW0l0lAVdr2g/JOTNJajTwik
20 | HwHJ86G1+Elsc9wRpAlBDWCjnm4BIFrBZGl8SEuOoJaCL4PZEotwCbxoG09IIbtb
21 | JGkuifBDX9Y3ux3gkPqYt3e5SC99EVQ3MuHgoIJUHehVolmFUAkuJWIjvNECgYEA
22 | 5pU0VspRuELzZdgzpxvDOooLDDcHodfslGQBfFXBA1Xc4IACtHMJaa/7D3vkyUtA
23 | +bYZtQjX2sEdWDq/WZdoCjXfIBfNkczhXt0R8G0lQFvGIu9QzUchYGrZo3mHMkBQ
24 | Uy1xMw9/e4YgwQwCJcW+Nk7Sq00uX9enuN9IdHFOCykCgYAqAGMK6CH1tlpjvHrf
25 | k+ZhigYxTXBlsVVvK1BIGGaiwzDpn65zeQp4aLOjSZkI1LuRi3tfTiZ321jRd64J
26 | 4lGk5Jurqv5grDmxROX/U50wEYbI9ncu/thU7syUdxDiqxHPI2RMG50mRcm3a55p
27 | ZCNSqkMlcXyA0U1z8C1ILNUsbA==
28 | -----END PRIVATE KEY-----
29 | -----BEGIN CERTIFICATE-----
30 | MIICoTCCAYkCAgPoMA0GCSqGSIb3DQEBBQUAMBQxEjAQBgNVBAMMCUNMSS1Mb2dp
31 | bjAiGA8yMDE4MDcxMzIzMjA0N1oYDzIwMTkwNzEzMjMyMDQ5WjAUMRIwEAYDVQQD
32 | DAlDTEktTG9naW4wggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQDmkkyF
33 | 0BwipZowWd1AMkRkySx0y079JPxpsYhv4i1xXKdoa9bpFqwoXmJpeQM1JWnU4UeZ
34 | zFeM86qKAhQvL4KV4kibcP2ENvu2NKFEdotO3uxPJ+6GlcYwMYzy+tUj008KnnRZ
35 | fTrR78sJtIl3C6lnVL0ICihksG59P1sskRq3PvOjXLAdEZalwDjZ4ZPoNDZdj6nU
36 | jB2l8zqupKAt5mR+bJ9Sox4yrDuNhMmFt5QsRDRe3wUqdV+C9OCWHmjlmsjrYw7p
37 | 9YmjBDvC5U7mF0Mk/XeYFzj0pkXKQVqBL6xqig+q5ob0szYfg19iDeFhS3iIsRcJ
38 | GEnRVW/ANpsBZyKrAgMBAAEwDQYJKoZIhvcNAQEFBQADggEBAI4VlaFb9NsXMLdT
39 | Cw5/pk0Xo2Qi6483RGTy8vzrw88IE7f3juB/JWG+rayjtW5bBRx2fae4/ZIdZ4zg
40 | N2FDKn2PQPAc9m9pcKyUKUvWOC8ixSkrUmeQew0l1AXU0hsPSlJ7/7ZK4efoyB47
41 | hj71fsyKdyKbisZDcUFBq/S8PazdPF0YOD1W/4A2tW0cSMg+jmFWynuUTdWt3SU8
42 | CwBGqdiSKT5faJuYwIWnRXDEQS3ObRn1OFEfFdd4d2sxjxydWKRgnINnGlBdiFAT
43 | KzCozVr+75cO2ErH6x5C0hLQGG5BxXbaijyxyvaRNokTMVVv6OaDEnjzCGfJ72Yf
44 | 2wgitNc=
45 | -----END CERTIFICATE-----
46 | 


--------------------------------------------------------------------------------
/data/median_income.csv:
--------------------------------------------------------------------------------
  1 | median_income
  2 | 4.4896
  3 | 2.1029
  4 | 2.3889
  5 | 3.707
  6 | 6.4788
  7 | 4.4074
  8 | 5.2907
  9 | 1.5156
 10 | 8.4411
 11 | 4.4085
 12 | 2.1439
 13 | 2.8971
 14 | 6.1008
 15 | 3.5258
 16 | 2.7694
 17 | 2.2356
 18 | 1.9509
 19 | 4.0905
 20 | 3.6726
 21 | 3.1696
 22 | 2.5389
 23 | 3.0319
 24 | 4.6779
 25 | 2.9076
 26 | 2.8616
 27 | 1.4722
 28 | 5.6413
 29 | 2.1167
 30 | 4.7308
 31 | 4.8173
 32 | 2.3438
 33 | 1.7333
 34 | 1.4429
 35 | 2.3253
 36 | 2.4022
 37 | 3.4048
 38 | 6.6073
 39 | 4.1080000000000005
 40 | 4.2829
 41 | 1.5727
 42 | 2.5211
 43 | 4.2679
 44 | 4.7328
 45 | 4.7069
 46 | 2.465
 47 | 5.0267
 48 | 2.8043
 49 | 2.4053
 50 | 1.2176
 51 | 2.39
 52 | 3.6364
 53 | 6.0162
 54 | 2.8088
 55 | 3.3984
 56 | 4.5
 57 | 3.9079
 58 | 4.9618
 59 | 2.9344
 60 | 2.4283
 61 | 3.7388
 62 | 1.6021
 63 | 2.3352
 64 | 4.0982
 65 | 1.9531
 66 | 3.2386
 67 | 5.1169
 68 | 4.692
 69 | 4.0
 70 | 6.4238
 71 | 3.7375
 72 | 2.8233
 73 | 2.8009
 74 | 3.767
 75 | 3.6761
 76 | 5.0282
 77 | 3.5296
 78 | 5.215
 79 | 4.0125
 80 | 9.4667
 81 | 5.9062
 82 | 3.9864
 83 | 2.0734
 84 | 2.875
 85 | 3.3611
 86 | 2.8214
 87 | 0.9946
 88 | 4.5446
 89 | 4.6908
 90 | 9.3198
 91 | 1.2826
 92 | 2.4943
 93 | 10.1882
 94 | 4.6731
 95 | 4.375
 96 | 2.8173
 97 | 2.0903
 98 | 2.725
 99 | 2.8547
100 | 2.25
101 | 1.9444
102 | 1.7167
103 | 1.9342
104 | 4.9524
105 | 3.65
106 | 3.0856
107 | 3.2396
108 | 2.9324
109 | 3.495
110 | 1.9818
111 | 4.6964
112 | 3.925
113 | 3.625
114 | 2.9688
115 | 4.0417
116 | 9.7956
117 | 3.8732
118 | 2.6998
119 | 2.006
120 | 4.25
121 | 3.1839999999999997
122 | 5.9658
123 | 2.628
124 | 2.5057
125 | 5.155
126 | 4.6
127 | 4.6681
128 | 5.5942
129 | 5.1104
130 | 3.0759
131 | 3.5757
132 | 3.6845
133 | 6.4667
134 | 5.273
135 | 3.0635
136 | 11.2866
137 | 4.0444
138 | 5.2541
139 | 5.5791
140 | 4.5375
141 | 9.8144
142 | 6.7257
143 | 4.1442
144 | 4.0313
145 | 2.2791
146 | 4.1679
147 | 3.2852
148 | 3.2768
149 | 5.021
150 | 4.875
151 | 4.419
152 | 3.3272
153 | 4.2386
154 | 1.245
155 | 5.152
156 | 4.8125
157 | 2.1638
158 | 7.1621
159 | 1.5372
160 | 10.0481
161 | 3.3869
162 | 5.4591
163 | 4.4318
164 | 6.5044
165 | 4.2865
166 | 3.0461
167 | 11.3283
168 | 2.7026
169 | 3.016
170 | 3.0943
171 | 3.225
172 | 6.187
173 | 3.8158
174 | 3.0147
175 | 15.0
176 | 3.1364
177 | 2.9
178 | 5.5941
179 | 3.4028
180 | 6.0062
181 | 8.3792
182 | 3.8036
183 | 2.0926
184 | 6.7703
185 | 4.2569
186 | 4.744
187 | 9.7037
188 | 5.1292
189 | 2.3148
190 | 3.3021
191 | 1.95
192 | 3.025
193 | 2.6523
194 | 1.2188
195 | 5.827999999999999
196 | 3.1587
197 | 2.45
198 | 2.3851
199 | 2.1221
200 | 3.5313
201 | 3.4821
202 | 7.8252
203 | 5.1878
204 | 3.7459
205 | 6.0097
206 | 2.3194
207 | 4.2061
208 | 2.267
209 | 2.2109
210 | 2.7589
211 | 2.6553
212 | 6.3325
213 | 5.7233
214 | 4.337
215 | 3.9667
216 | 5.8623
217 | 1.6806
218 | 3.5851
219 | 2.9716
220 | 3.9
221 | 2.7431
222 | 3.3621
223 | 1.9464
224 | 7.3518
225 | 4.775
226 | 3.5968
227 | 6.221
228 | 10.0968
229 | 1.9483
230 | 2.0469
231 | 3.725
232 | 3.675
233 | 1.8529
234 | 1.7159
235 | 1.7386
236 | 3.6687
237 | 3.4671
238 | 4.8233
239 | 4.3036
240 | 1.6488
241 | 2.9453
242 | 5.0096
243 | 3.175
244 | 4.2031
245 | 3.1667
246 | 5.7204
247 | 3.375
248 | 6.5483
249 | 4.2206
250 | 2.6631
251 | 3.5363
252 | 


--------------------------------------------------------------------------------
/column-type-transform.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |   "cells": [
 3 |     {
 4 |       "metadata": {},
 5 |       "cell_type": "markdown",
 6 |       "source": "# Column Type Transformations\nCopyright (c) Microsoft Corporation. All rights reserved.<br>\nLicensed under the MIT License."
 7 |     },
 8 |     {
 9 |       "metadata": {},
10 |       "cell_type": "markdown",
11 |       "source": "DataPrep has the ability to transform column types."
12 |     },
13 |     {
14 |       "metadata": {
15 |         "trusted": true
16 |       },
17 |       "cell_type": "code",
18 |       "source": "import azureml.dataprep as dprep\ndataflow = dprep.read_csv(path=r'data\\elements.csv')\ndataflow.head(3)",
19 |       "execution_count": 1,
20 |       "outputs": [
21 |         {
22 |           "output_type": "execute_result",
23 |           "execution_count": 1,
24 |           "data": {
25 |             "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>ID</th>\n      <th>Symbol</th>\n      <th>Boiling Point</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>1</td>\n      <td>H</td>\n      <td>-252.87</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>53</td>\n      <td>I</td>\n      <td>184.3</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>2</td>\n      <td>He</td>\n      <td>-268.93</td>\n    </tr>\n  </tbody>\n</table>\n</div>",
26 |             "text/plain": "   ID Symbol Boiling Point\n0   1      H       -252.87\n1  53      I         184.3\n2   2     He       -268.93"
27 |           },
28 |           "metadata": {}
29 |         }
30 |       ]
31 |     },
32 |     {
33 |       "metadata": {},
34 |       "cell_type": "markdown",
35 |       "source": "#### `to_long(columns)`"
36 |     },
37 |     {
38 |       "metadata": {
39 |         "trusted": true
40 |       },
41 |       "cell_type": "code",
42 |       "source": "# Convert the boiling point to a 64 bit integer.\nintegers_only_dataflow = dataflow.to_long(['Boiling Point'])\nintegers_only_dataflow.head(3)",
43 |       "execution_count": 2,
44 |       "outputs": [
45 |         {
46 |           "output_type": "execute_result",
47 |           "execution_count": 2,
48 |           "data": {
49 |             "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>ID</th>\n      <th>Symbol</th>\n      <th>Boiling Point</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>1</td>\n      <td>H</td>\n      <td>-252</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>53</td>\n      <td>I</td>\n      <td>184</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>2</td>\n      <td>He</td>\n      <td>-268</td>\n    </tr>\n  </tbody>\n</table>\n</div>",
50 |             "text/plain": "   ID Symbol  Boiling Point\n0   1      H           -252\n1  53      I            184\n2   2     He           -268"
51 |           },
52 |           "metadata": {}
53 |         }
54 |       ]
55 |     },
56 |     {
57 |       "metadata": {
58 |         "trusted": true
59 |       },
60 |       "cell_type": "code",
61 |       "source": "",
62 |       "execution_count": null,
63 |       "outputs": []
64 |     }
65 |   ],
66 |   "metadata": {
67 |     "kernelspec": {
68 |       "name": "python36",
69 |       "display_name": "Python 3.6",
70 |       "language": "python"
71 |     },
72 |     "language_info": {
73 |       "mimetype": "text/x-python",
74 |       "nbconvert_exporter": "python",
75 |       "name": "python",
76 |       "pygments_lexer": "ipython3",
77 |       "version": "3.6.6",
78 |       "file_extension": ".py",
79 |       "codemirror_mode": {
80 |         "version": 3,
81 |         "name": "ipython"
82 |       }
83 |     }
84 |   },
85 |   "nbformat": 4,
86 |   "nbformat_minor": 2
87 | }


--------------------------------------------------------------------------------
/package-json-representation.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |   "cells": [
 3 |     {
 4 |       "metadata": {},
 5 |       "cell_type": "markdown",
 6 |       "source": "A DataPrep Package can be saved to and loaded from a JSON string."
 7 |     },
 8 |     {
 9 |       "metadata": {},
10 |       "cell_type": "markdown",
11 |       "source": "# Work with JSON representation of Package\nCopyright (c) Microsoft Corporation. All rights reserved.<br>\nLicensed under the MIT License."
12 |     },
13 |     {
14 |       "metadata": {
15 |         "trusted": true
16 |       },
17 |       "cell_type": "code",
18 |       "source": "# create a Dataflow and pack it into a Package\nimport azureml.dataprep as dprep\ndf = dprep.read_csv(path='https://dpreptestfiles.blob.core.windows.net/testfiles/read_csv_duplicate_headers.csv')\npkg = dprep.Package(df)",
19 |       "execution_count": 1,
20 |       "outputs": []
21 |     },
22 |     {
23 |       "metadata": {
24 |         "trusted": true
25 |       },
26 |       "cell_type": "code",
27 |       "source": "# save Package to JSON string\njson_str = pkg.to_json()\njson_str",
28 |       "execution_count": 2,
29 |       "outputs": [
30 |         {
31 |           "output_type": "execute_result",
32 |           "execution_count": 2,
33 |           "data": {
34 |             "text/plain": "'{\\n  \"schemaVersion\": 63,\\n  \"id\": \"1e865029-9dec-4664-a9ca-effecbace8c9\",\\n  \"activities\": [\\n    {\\n      \"id\": \"9a2f0365-c518-4d54-b68c-e8ca31ef5b22\",\\n      \"name\": \"dataflow\",\\n      \"blocks\": [\\n        {\\n          \"id\": \"c60a8fb8-d189-477a-af3d-d5e163cb9eae\",\\n          \"type\": \"Microsoft.DPrep.GetFilesBlock\",\\n          \"arguments\": {\\n            \"isArchive\": false,\\n            \"path\": {\\n              \"target\": 1,\\n              \"resourceDetails\": [\\n                {\\n                  \"path\": \"https://dpreptestfiles.blob.core.windows.net/testfiles/read_csv_duplicate_headers.csv\",\\n                  \"sas\": null,\\n                  \"storageAccountName\": null,\\n                  \"storageAccountKey\": null\\n                }\\n              ]\\n            }\\n          },\\n          \"localData\": {},\\n          \"isEnabled\": true,\\n          \"name\": null,\\n          \"annotation\": null\\n        },\\n        {\\n          \"id\": \"6af3740f-32ff-4337-8201-b772de830251\",\\n          \"type\": \"Microsoft.DPrep.ParseDelimitedBlock\",\\n          \"arguments\": {\\n            \"columnHeadersMode\": 3,\\n            \"fileEncoding\": 0,\\n            \"handleQuotedLineBreaks\": false,\\n            \"preview\": false,\\n            \"separator\": \",\",\\n            \"skipRows\": 0,\\n            \"skipRowsMode\": 0\\n          },\\n          \"localData\": {},\\n          \"isEnabled\": true,\\n          \"name\": null,\\n          \"annotation\": null\\n        },\\n        {\\n          \"id\": \"f46f529c-0a0d-4a88-b398-734a71f42d48\",\\n          \"type\": \"Microsoft.DPrep.DropColumnsBlock\",\\n          \"arguments\": {\\n            \"columns\": {\\n              \"type\": 0,\\n              \"details\": {\\n                \"selectedColumns\": [\\n                  \"Path\"\\n                ]\\n              }\\n            }\\n          
},\\n          \"localData\": {},\\n          \"isEnabled\": true,\\n          \"name\": null,\\n          \"annotation\": null\\n        }\\n      ],\\n      \"inspectors\": []\\n    }\\n  ],\\n  \"runConfigurations\": []\\n}'"
35 |           },
36 |           "metadata": {}
37 |         }
38 |       ]
39 |     },
40 |     {
41 |       "metadata": {
42 |         "trusted": true
43 |       },
44 |       "cell_type": "code",
45 |       "source": "# load Package from JSON string\npkg_loaded = dprep.Package.from_json(json_str)\ndf_loaded = pkg_loaded.dataflows[0]",
46 |       "execution_count": 3,
47 |       "outputs": []
48 |     },
49 |     {
50 |       "metadata": {
51 |         "trusted": true
52 |       },
53 |       "cell_type": "code",
54 |       "source": "",
55 |       "execution_count": null,
56 |       "outputs": []
57 |     }
58 |   ],
59 |   "metadata": {
60 |     "kernelspec": {
61 |       "name": "python36",
62 |       "display_name": "Python 3.6",
63 |       "language": "python"
64 |     },
65 |     "language_info": {
66 |       "mimetype": "text/x-python",
67 |       "nbconvert_exporter": "python",
68 |       "name": "python",
69 |       "pygments_lexer": "ipython3",
70 |       "version": "3.6.6",
71 |       "file_extension": ".py",
72 |       "codemirror_mode": {
73 |         "version": 3,
74 |         "name": "ipython"
75 |       }
76 |     }
77 |   },
78 |   "nbformat": 4,
79 |   "nbformat_minor": 2
80 | }


--------------------------------------------------------------------------------
/data/secrets.dprep:
--------------------------------------------------------------------------------
  1 | {
  2 |   "schemaVersion": 61,
  3 |   "id": "0ca59762-2256-45e6-b406-e58a4bb280b9",
  4 |   "activities": [
  5 |     {
  6 |       "id": "b308e5b8-9b2a-47f8-9d32-0f542b4a34a4",
  7 |       "name": "read_csv_duplicate_headers",
  8 |       "blocks": [
  9 |         {
 10 |           "id": "8d9ec228-6a4b-4abf-afb7-65f58dda1581",
 11 |           "type": "Microsoft.DPrep.GetFilesBlock",
 12 |           "arguments": {
 13 |             "path": {
 14 |               "target": 1,
 15 |               "resourceDetails": [
 16 |                 {
 17 |                   "path": "https://dpreptestfiles.blob.core.windows.net/testfiles/read_csv_duplicate_headers.csv",
 18 |                   "sas": {
 19 |                     "id": "https://dpreptestfiles.blob.core.windows.net/testfiles/read_csv_duplicate_headers.csv",
 20 |                     "secretType": "AzureMLSecret"
 21 |                   },
 22 |                   "storageAccountName": null,
 23 |                   "storageAccountKey": null
 24 |                 }
 25 |               ]
 26 |             }
 27 |           },
 28 |           "isEnabled": true,
 29 |           "name": null,
 30 |           "annotation": null
 31 |         },
 32 |         {
 33 |           "id": "4ad0460f-ec65-47c0-a0a4-44345404a462",
 34 |           "type": "Microsoft.DPrep.ParseDelimitedBlock",
 35 |           "arguments": {
 36 |             "columnHeadersMode": 3,
 37 |             "fileEncoding": 0,
 38 |             "handleQuotedLineBreaks": false,
 39 |             "preview": false,
 40 |             "separator": ",",
 41 |             "skipRows": 0,
 42 |             "skipRowsMode": 0
 43 |           },
 44 |           "isEnabled": true,
 45 |           "name": null,
 46 |           "annotation": null
 47 |         },
 48 |         {
 49 |           "id": "1a3e11ba-5854-48da-aa47-53af61beb782",
 50 |           "type": "Microsoft.DPrep.DropColumnsBlock",
 51 |           "arguments": {
 52 |             "columns": {
 53 |               "type": 0,
 54 |               "details": {
 55 |                 "selectedColumns": [
 56 |                   "Path"
 57 |                 ]
 58 |               }
 59 |             }
 60 |           },
 61 |           "isEnabled": true,
 62 |           "name": null,
 63 |           "annotation": null
 64 |         }
 65 |       ],
 66 |       "inspectors": []
 67 |     },
 68 |     {
 69 |       "id": "2d1fd227-0e7c-41de-9606-ca7eced82e07",
 70 |       "name": "population",
 71 |       "blocks": [
 72 |         {
 73 |           "id": "27060820-095e-48d1-bdbd-511f7e369105",
 74 |           "type": "Microsoft.DPrep.GetFilesBlock",
 75 |           "arguments": {
 76 |             "path": {
 77 |               "target": 1,
 78 |               "resourceDetails": [
 79 |                 {
 80 |                   "path": "https://dpreptestfiles.blob.core.windows.net/testfiles/population.csv",
 81 |                   "sas": {
 82 |                     "id": "https://dpreptestfiles.blob.core.windows.net/testfiles/population.csv",
 83 |                     "secretType": "AzureMLSecret"
 84 |                   },
 85 |                   "storageAccountName": null,
 86 |                   "storageAccountKey": null
 87 |                 }
 88 |               ]
 89 |             }
 90 |           },
 91 |           "isEnabled": true,
 92 |           "name": null,
 93 |           "annotation": null
 94 |         },
 95 |         {
 96 |           "id": "e7b2a399-9300-4fe5-8959-0d4ae9fc9172",
 97 |           "type": "Microsoft.DPrep.ParseDelimitedBlock",
 98 |           "arguments": {
 99 |             "columnHeadersMode": 3,
100 |             "fileEncoding": 0,
101 |             "handleQuotedLineBreaks": false,
102 |             "preview": false,
103 |             "separator": ",",
104 |             "skipRows": 0,
105 |             "skipRowsMode": 0
106 |           },
107 |           "isEnabled": true,
108 |           "name": null,
109 |           "annotation": null
110 |         },
111 |         {
112 |           "id": "5572e00a-dd5e-41fe-b301-3e66d0f4c5e2",
113 |           "type": "Microsoft.DPrep.DropColumnsBlock",
114 |           "arguments": {
115 |             "columns": {
116 |               "type": 0,
117 |               "details": {
118 |                 "selectedColumns": [
119 |                   "Path"
120 |                 ]
121 |               }
122 |             }
123 |           },
124 |           "isEnabled": true,
125 |           "name": null,
126 |           "annotation": null
127 |         }
128 |       ],
129 |       "inspectors": []
130 |     },
131 |     {
132 |       "id": "ec2c9cf9-beb9-4ebd-b4d2-8ba076c6a3db",
133 |       "name": "top_films",
134 |       "blocks": [
135 |         {
136 |           "id": "6ac0814d-9e5b-4db5-8cc1-f11dc3db531d",
137 |           "type": "Microsoft.DPrep.GetFilesBlock",
138 |           "arguments": {
139 |             "path": {
140 |               "target": 1,
141 |               "resourceDetails": [
142 |                 {
143 |                   "path": "https://dpreptestfiles.blob.core.windows.net/testfiles/TopFilms.csv",
144 |                   "sas": {
145 |                     "id": "https://dpreptestfiles.blob.core.windows.net/testfiles/TopFilms.csv",
146 |                     "secretType": "AzureMLSecret"
147 |                   },
148 |                   "storageAccountName": null,
149 |                   "storageAccountKey": null
150 |                 }
151 |               ]
152 |             }
153 |           },
154 |           "isEnabled": true,
155 |           "name": null,
156 |           "annotation": null
157 |         },
158 |         {
159 |           "id": "0cd162d2-8395-4369-aa78-e431456c9201",
160 |           "type": "Microsoft.DPrep.ParseDelimitedBlock",
161 |           "arguments": {
162 |             "columnHeadersMode": 3,
163 |             "fileEncoding": 0,
164 |             "handleQuotedLineBreaks": false,
165 |             "preview": false,
166 |             "separator": ",",
167 |             "skipRows": 0,
168 |             "skipRowsMode": 0
169 |           },
170 |           "isEnabled": true,
171 |           "name": null,
172 |           "annotation": null
173 |         },
174 |         {
175 |           "id": "ceb32a6b-ba57-4c90-a4d0-5913c211961e",
176 |           "type": "Microsoft.DPrep.DropColumnsBlock",
177 |           "arguments": {
178 |             "columns": {
179 |               "type": 0,
180 |               "details": {
181 |                 "selectedColumns": [
182 |                   "Path"
183 |                 ]
184 |               }
185 |             }
186 |           },
187 |           "isEnabled": true,
188 |           "name": null,
189 |           "annotation": null
190 |         }
191 |       ],
192 |       "inspectors": []
193 |     }
194 |   ],
195 |   "runConfigurations": []
196 | }


--------------------------------------------------------------------------------
/data/crime0-10.dprep:
--------------------------------------------------------------------------------
  1 | {
  2 |   "schemaVersion": 59,
  3 |   "id": "1ba93a7c-e711-464f-9a70-1c491e28a66f",
  4 |   "activities": [
  5 |     {
  6 |       "id": "75637565-60ad-4baa-87d3-396a7930cfe7",
  7 |       "name": "crime0-10",
  8 |       "blocks": [
  9 |         {
 10 |           "id": "ba5a8061-129e-4618-953a-ce3e89c8f2cb",
 11 |           "type": "Microsoft.DPrep.GetFilesBlock",
 12 |           "arguments": {
 13 |             "path": {
 14 |               "target": 0,
 15 |               "resourceDetails": [
 16 |                 {
 17 |                   "path": "./crime0-10.csv"
 18 |                 }
 19 |               ]
 20 |             }
 21 |           },
 22 |           "isEnabled": true,
 23 |           "name": null,
 24 |           "annotation": null
 25 |         },
 26 |         {
 27 |           "id": "1b345643-6b60-4ca1-99f9-2a64ae932a23",
 28 |           "type": "Microsoft.DPrep.ParseDelimitedBlock",
 29 |           "arguments": {
 30 |             "columnHeadersMode": 1,
 31 |             "fileEncoding": 0,
 32 |             "handleQuotedLineBreaks": false,
 33 |             "preview": false,
 34 |             "separator": ",",
 35 |             "skipRowsMode": 0
 36 |           },
 37 |           "isEnabled": true,
 38 |           "name": null,
 39 |           "annotation": null
 40 |         },
 41 |         {
 42 |           "id": "12cf73a2-1487-4915-bfa7-c86be7de08c0",
 43 |           "type": "Microsoft.DPrep.SetColumnTypesBlock",
 44 |           "arguments": {
 45 |             "columnConversion": [
 46 |               {
 47 |                 "column": {
 48 |                   "type": 2,
 49 |                   "details": {
 50 |                     "selectedColumn": "ID"
 51 |                   }
 52 |                 },
 53 |                 "typeProperty": 3
 54 |               },
 55 |               {
 56 |                 "column": {
 57 |                   "type": 2,
 58 |                   "details": {
 59 |                     "selectedColumn": "IUCR"
 60 |                   }
 61 |                 },
 62 |                 "typeProperty": 3
 63 |               },
 64 |               {
 65 |                 "column": {
 66 |                   "type": 2,
 67 |                   "details": {
 68 |                     "selectedColumn": "Domestic"
 69 |                   }
 70 |                 },
 71 |                 "typeProperty": 1
 72 |               },
 73 |               {
 74 |                 "column": {
 75 |                   "type": 2,
 76 |                   "details": {
 77 |                     "selectedColumn": "Beat"
 78 |                   }
 79 |                 },
 80 |                 "typeProperty": 3
 81 |               },
 82 |               {
 83 |                 "column": {
 84 |                   "type": 2,
 85 |                   "details": {
 86 |                     "selectedColumn": "District"
 87 |                   }
 88 |                 },
 89 |                 "typeProperty": 3
 90 |               },
 91 |               {
 92 |                 "column": {
 93 |                   "type": 2,
 94 |                   "details": {
 95 |                     "selectedColumn": "Ward"
 96 |                   }
 97 |                 },
 98 |                 "typeProperty": 3
 99 |               },
100 |               {
101 |                 "column": {
102 |                   "type": 2,
103 |                   "details": {
104 |                     "selectedColumn": "Community Area"
105 |                   }
106 |                 },
107 |                 "typeProperty": 3
108 |               },
109 |               {
110 |                 "column": {
111 |                   "type": 2,
112 |                   "details": {
113 |                     "selectedColumn": "Year"
114 |                   }
115 |                 },
116 |                 "typeProperty": 3
117 |               },
118 |               {
119 |                 "column": {
120 |                   "type": 2,
121 |                   "details": {
122 |                     "selectedColumn": "Longitude"
123 |                   }
124 |                 },
125 |                 "typeProperty": 3
126 |               },
127 |               {
128 |                 "column": {
129 |                   "type": 2,
130 |                   "details": {
131 |                     "selectedColumn": "Arrest"
132 |                   }
133 |                 },
134 |                 "typeProperty": 1
135 |               },
136 |               {
137 |                 "column": {
138 |                   "type": 2,
139 |                   "details": {
140 |                     "selectedColumn": "X Coordinate"
141 |                   }
142 |                 },
143 |                 "typeProperty": 3
144 |               },
145 |               {
146 |                 "column": {
147 |                   "type": 2,
148 |                   "details": {
149 |                     "selectedColumn": "Updated On"
150 |                   }
151 |                 },
152 |                 "typeArguments": {
153 |                   "dateTimeFormats": [
154 |                     "%m/%d/%Y %I:%M:%S %p"
155 |                   ]
156 |                 },
157 |                 "typeProperty": 4
158 |               },
159 |               {
160 |                 "column": {
161 |                   "type": 2,
162 |                   "details": {
163 |                     "selectedColumn": "Date"
164 |                   }
165 |                 },
166 |                 "typeArguments": {
167 |                   "dateTimeFormats": [
168 |                     "%m/%d/%Y %I:%M:%S %p"
169 |                   ]
170 |                 },
171 |                 "typeProperty": 4
172 |               },
173 |               {
174 |                 "column": {
175 |                   "type": 2,
176 |                   "details": {
177 |                     "selectedColumn": "Y Coordinate"
178 |                   }
179 |                 },
180 |                 "typeProperty": 3
181 |               },
182 |               {
183 |                 "column": {
184 |                   "type": 2,
185 |                   "details": {
186 |                     "selectedColumn": "Latitude"
187 |                   }
188 |                 },
189 |                 "typeProperty": 3
190 |               }
191 |             ]
192 |           },
193 |           "isEnabled": true,
194 |           "name": null,
195 |           "annotation": null
196 |         },
197 |         {
198 |           "id": "5f370fdf-2fde-4f18-8069-93ef5800bf0c",
199 |           "type": "Microsoft.DPrep.SampleBlock",
200 |           "arguments": {
201 |             "activeSample": "0afde520-3a41-4fef-8d20-eaa07d588924",
202 |             "samples": [
203 |               {
204 |                 "allowAutoGen": true,
205 |                 "isDisabled": false,
206 |                 "sampleId": "0afde520-3a41-4fef-8d20-eaa07d588924",
207 |                 "sampleName": "Top 10000",
208 |                 "sampleRevision": "d8663336-152a-462f-bb57-686dc7a0843c",
209 |                 "sampleRunner": {
210 |                   "id": null,
211 |                   "type": 0
212 |                 },
213 |                 "sampleStrategy": 0,
214 |                 "topArguments": {
215 |                   "sampleCount": 10000
216 |                 }
217 |               }
218 |             ]
219 |           },
220 |           "isEnabled": true,
221 |           "name": null,
222 |           "annotation": null
223 |         },
224 |         {
225 |           "id": "dfd62543-9285-412b-a930-0aeaaffde699",
226 |           "type": "Microsoft.DPrep.HandlePathColumnBlock",
227 |           "arguments": {
228 |             "pathColumnOperation": 0
229 |           },
230 |           "isEnabled": true,
231 |           "name": null,
232 |           "annotation": null
233 |         }
234 |       ],
235 |       "inspectors": []
236 |     }
237 |   ],
238 |   "runConfigurations": []
239 | }


--------------------------------------------------------------------------------
/quantile-transformation.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |   "cells": [
  3 |     {
  4 |       "metadata": {},
  5 |       "cell_type": "markdown",
  6 |       "source": "# Quantile Transformation\nCopyright (c) Microsoft Corporation. All rights reserved.\nLicensed under the MIT License.\n\nDataPrep has the ability to perform quantile transformation on a numeric column. This transformation can map the data to a normal or uniform distribution. Values bigger than the learnt boundaries will simply be clipped to the learnt boundaries when applying quantile transformation.\n\nLet's load a sample of the median income of California households in different suburbs from the 1990 census data. From the data profile, we can see that the minimum and maximum values are 0.9946 and 15, respectively."
  7 |     },
  8 |     {
  9 |       "metadata": {
 10 |         "trusted": true
 11 |       },
 12 |       "cell_type": "code",
 13 |       "source": "!pip install azureml",
 14 |       "execution_count": 1,
 15 |       "outputs": [
 16 |         {
 17 |           "output_type": "stream",
 18 |           "text": "Requirement already satisfied: azureml in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (0.2.7)\nRequirement already satisfied: python-dateutil in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml) (2.7.3)\nRequirement already satisfied: requests in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml) (2.19.1)\nRequirement already satisfied: pandas in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml) (0.22.0)\nRequirement already satisfied: six>=1.5 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from python-dateutil->azureml) (1.11.0)\nRequirement already satisfied: chardet<3.1.0,>=3.0.2 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (3.0.4)\nRequirement already satisfied: certifi>=2017.4.17 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (2018.8.24)\nRequirement already satisfied: urllib3<1.24,>=1.21.1 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (1.23)\nRequirement already satisfied: idna<2.8,>=2.5 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (2.7)\nRequirement already satisfied: pytz>=2011k in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from pandas->azureml) (2018.5)\nRequirement already satisfied: numpy>=1.9.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from pandas->azureml) (1.14.5)\n",
 19 |           "name": "stdout"
 20 |         }
 21 |       ]
 22 |     },
 23 |     {
 24 |       "metadata": {
 25 |         "trusted": true
 26 |       },
 27 |       "cell_type": "code",
 28 |       "source": "import azureml.dataprep as dprep\n\ndf = dprep.read_csv(path='./data/median_income.csv').set_column_types(type_conversions={\n    'median_income': dprep.TypeConverter(dprep.FieldType.DECIMAL)\n})\ndf.get_profile()",
 29 |       "execution_count": 2,
 30 |       "outputs": [
 31 |         {
 32 |           "output_type": "execute_result",
 33 |           "execution_count": 2,
 34 |           "data": {
 35 |             "text/html": "<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>Type</th>\n      <th>Min</th>\n      <th>Max</th>\n      <th>Count</th>\n      <th>Missing Count</th>\n      <th>Error Count</th>\n      <th>Lower Quartile</th>\n      <th>Median</th>\n      <th>Upper Quartile</th>\n      <th>Standard Deviation</th>\n      <th>Mean</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>median_income</th>\n      <td>FieldType.DECIMAL</td>\n      <td>0.9946</td>\n      <td>15.0</td>\n      <td>250.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>2.6907</td>\n      <td>3.6307</td>\n      <td>4.77335</td>\n      <td>2.026679</td>\n      <td>4.007843</td>\n    </tr>\n  </tbody>\n</table>",
 36 |             "text/plain": "ColumnProfile\n    name: median_income\n    type: FieldType.DECIMAL\n\n    min: 0.9946\n    max: 15.0\n    count: 250.0\n    missing_count: 0.0\n    error_count: 0.0\n\n    lower_quartile: 2.6907\n    median: 3.6307\n    upper_quartile: 4.773350000000001\n    std: 2.026679472255346\n    mean: 4.007842799999996"
 37 |           },
 38 |           "metadata": {}
 39 |         }
 40 |       ]
 41 |     },
 42 |     {
 43 |       "metadata": {},
 44 |       "cell_type": "markdown",
 45 |       "source": "Let's now apply quantile transformation to `median_income` and see how that affects the data. We will apply quantile transformation twice: once mapping the data to a Uniform(0, 1) distribution, and once mapping it to a Normal(0, 1) distribution.\n\nFrom the data profile, we can see that the min and max of the uniform median income are 0 and 1, and that the mean and standard deviation of the normal median income are close to 0 and 1 respectively.\n\n*Note: for the normal distribution, we will clip the values at the ends, as the 0th percentile and the 100th percentile are -Inf and Inf respectively.*"
 46 |     },
 47 |     {
 48 |       "metadata": {
 49 |         "trusted": true
 50 |       },
 51 |       "cell_type": "code",
 52 |       "source": "df = df.quantile_transform(source_column='median_income', new_column='median_income_uniform', quantiles_count=5)\ndf = df.quantile_transform(source_column='median_income', new_column='median_income_normal', \n                           quantiles_count=5, output_distribution=\"Normal\")\ndf.get_profile()",
 53 |       "execution_count": 3,
 54 |       "outputs": [
 55 |         {
 56 |           "output_type": "execute_result",
 57 |           "execution_count": 3,
 58 |           "data": {
 59 |             "text/html": "<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>Type</th>\n      <th>Min</th>\n      <th>Max</th>\n      <th>Count</th>\n      <th>Missing Count</th>\n      <th>Error Count</th>\n      <th>Lower Quartile</th>\n      <th>Median</th>\n      <th>Upper Quartile</th>\n      <th>Standard Deviation</th>\n      <th>Mean</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>median_income</th>\n      <td>FieldType.DECIMAL</td>\n      <td>0.994600</td>\n      <td>15.000000</td>\n      <td>250.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>2.690700</td>\n      <td>3.630700</td>\n      <td>4.773350</td>\n      <td>2.026679</td>\n      <td>4.007843</td>\n    </tr>\n    <tr>\n      <th>median_income_normal</th>\n      <td>FieldType.DECIMAL</td>\n      <td>-7.941345</td>\n      <td>7.941444</td>\n      <td>250.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>-0.671590</td>\n      <td>-0.000337</td>\n      <td>0.667810</td>\n      <td>1.021506</td>\n      <td>-0.060922</td>\n    </tr>\n    <tr>\n      <th>median_income_uniform</th>\n      <td>FieldType.DECIMAL</td>\n      <td>0.000000</td>\n      <td>1.000000</td>\n      <td>250.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.250934</td>\n      <td>0.499866</td>\n      <td>0.747861</td>\n      <td>0.252830</td>\n      <td>0.484762</td>\n    </tr>\n  </tbody>\n</table>",
 60 |             "text/plain": "ColumnProfile\n    name: median_income\n    type: FieldType.DECIMAL\n\n    min: 0.9946\n    max: 15.0\n    count: 250.0\n    missing_count: 0.0\n    error_count: 0.0\n\n    lower_quartile: 2.6907\n    median: 3.6307\n    upper_quartile: 4.773350000000001\n    std: 2.026679472255346\n    mean: 4.007842799999996\n\nColumnProfile\n    name: median_income_normal\n    type: FieldType.DECIMAL\n\n    min: -7.941345326170997\n    max: 7.94144448741598\n    count: 250.0\n    missing_count: 0.0\n    error_count: 0.0\n\n    lower_quartile: -0.6715898847385642\n    median: -0.00033696356609359737\n    upper_quartile: 0.6678101623094225\n    std: 1.021505801777812\n    mean: -0.06092218967843191\n\nColumnProfile\n    name: median_income_uniform\n    type: FieldType.DECIMAL\n\n    min: 0.0\n    max: 1.0\n    count: 250.0\n    missing_count: 0.0\n    error_count: 0.0\n\n    lower_quartile: 0.25093366375866033\n    median: 0.4998655717951272\n    upper_quartile: 0.7478610044020887\n    std: 0.25283034846216024\n    mean: 0.4847624122367444"
 61 |           },
 62 |           "metadata": {}
 63 |         }
 64 |       ]
 65 |     },
 66 |     {
 67 |       "metadata": {},
 68 |       "cell_type": "markdown",
 69 |       "source": "Let's now save the dataflow which we will later load in the operationalization notebook."
 70 |     },
 71 |     {
 72 |       "metadata": {
 73 |         "trusted": true
 74 |       },
 75 |       "cell_type": "code",
 76 |       "source": "from tempfile import mkdtemp\nfrom os import path\n\ntmp_dir = mkdtemp()\npackage_path = path.join(tmp_dir, 'quantile_transform.dprep')\npackage = dprep.Package(arg=df)\npackage.save(package_path)\nprint('Package saved to: \"{}\"'.format(package_path))",
 77 |       "execution_count": 3,
 78 |       "outputs": [
 79 |         {
 80 |           "output_type": "stream",
 81 |           "text": "Package saved to: \"/tmp/tmp29cvg68a/quantile_transform.dprep\"\n",
 82 |           "name": "stdout"
 83 |         }
 84 |       ]
 85 |     },
 86 |     {
 87 |       "metadata": {
 88 |         "trusted": true
 89 |       },
 90 |       "cell_type": "code",
 91 |       "source": "",
 92 |       "execution_count": null,
 93 |       "outputs": []
 94 |     }
 95 |   ],
 96 |   "metadata": {
 97 |     "kernelspec": {
 98 |       "name": "python36",
 99 |       "display_name": "Python 3.6",
100 |       "language": "python"
101 |     },
102 |     "language_info": {
103 |       "mimetype": "text/x-python",
104 |       "nbconvert_exporter": "python",
105 |       "name": "python",
106 |       "pygments_lexer": "ipython3",
107 |       "version": "3.6.6",
108 |       "file_extension": ".py",
109 |       "codemirror_mode": {
110 |         "version": 3,
111 |         "name": "ipython"
112 |       }
113 |     }
114 |   },
115 |   "nbformat": 4,
116 |   "nbformat_minor": 2
117 | }


--------------------------------------------------------------------------------
/read-pandas-dataframe.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |   "cells": [
  3 |     {
  4 |       "metadata": {
  5 |         "trusted": true
  6 |       },
  7 |       "cell_type": "code",
  8 |       "source": "!pip install azureml",
  9 |       "execution_count": 1,
 10 |       "outputs": [
 11 |         {
 12 |           "output_type": "stream",
 13 |           "text": "Requirement already satisfied: azureml in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (0.2.7)\nRequirement already satisfied: pandas in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml) (0.22.0)\nRequirement already satisfied: requests in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml) (2.19.1)\nRequirement already satisfied: python-dateutil in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml) (2.7.3)\nRequirement already satisfied: pytz>=2011k in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from pandas->azureml) (2018.5)\nRequirement already satisfied: numpy>=1.9.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from pandas->azureml) (1.14.5)\nRequirement already satisfied: certifi>=2017.4.17 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (2018.8.24)\nRequirement already satisfied: idna<2.8,>=2.5 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (2.7)\nRequirement already satisfied: urllib3<1.24,>=1.21.1 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (1.23)\nRequirement already satisfied: chardet<3.1.0,>=3.0.2 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (3.0.4)\nRequirement already satisfied: six>=1.5 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from python-dateutil->azureml) (1.11.0)\n",
 14 |           "name": "stdout"
 15 |         }
 16 |       ]
 17 |     },
 18 |     {
 19 |       "metadata": {
 20 |         "trusted": true
 21 |       },
 22 |       "cell_type": "code",
 23 |       "source": "import azureml.dataprep as dprep",
 24 |       "execution_count": 2,
 25 |       "outputs": []
 26 |     },
 27 |     {
 28 |       "metadata": {
 29 |         "trusted": true
 30 |       },
 31 |       "cell_type": "code",
 32 |       "source": "dflow = dprep.read_excel(path='./data/excel.xlsx')\ndflow = dflow.drop_columns(columns=['Column1'])\ndf = dflow.to_pandas_dataframe()\ndf.head(10)",
 33 |       "execution_count": 3,
 34 |       "outputs": [
 35 |         {
 36 |           "output_type": "execute_result",
 37 |           "execution_count": 3,
 38 |           "data": {
 39 |             "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>Column2</th>\n      <th>Column3</th>\n      <th>Column4</th>\n      <th>Column5</th>\n      <th>Column6</th>\n      <th>Column7</th>\n      <th>Column8</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>Iron, IVB</td>\n      <td>6e+07</td>\n      <td>Found</td>\n      <td>1920</td>\n      <td>http://www.lpi.usra.edu/meteor/metbull.php?cod...</td>\n      <td>-19.5833</td>\n      <td>17.9167</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>Iron, IIIAB</td>\n      <td>5.82e+07</td>\n      <td>Found</td>\n      <td>1818</td>\n      <td>http://www.lpi.usra.edu/meteor/metbull.php?cod...</td>\n      <td>76.1333</td>\n      <td>-64.9333</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>Iron, IAB-MG</td>\n      <td>5e+07</td>\n      <td>Found</td>\n      <td>1576</td>\n      <td>http://www.lpi.usra.edu/meteor/metbull.php?cod...</td>\n      <td>-27.4667</td>\n      <td>-60.5833</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>Iron, IAB-MG</td>\n      <td>3e+07</td>\n      <td>Found</td>\n      <td>1891</td>\n      <td>http://www.lpi.usra.edu/meteor/metbull.php?cod...</td>\n      <td>35.05</td>\n      <td>-111.033</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>Iron, IIIE</td>\n      <td>2.8e+07</td>\n      <td>Found</td>\n      <td>1898</td>\n      <td>http://www.lpi.usra.edu/meteor/metbull.php?cod...</td>\n      <td>47</td>\n      <td>88</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>Iron, IVA</td>\n      <td>2.6e+07</td>\n      <td>Found</td>\n      <td>1836</td>\n      
<td>http://www.lpi.usra.edu/meteor/metbull.php?cod...</td>\n      <td>-25.5</td>\n      <td>18</td>\n    </tr>\n    <tr>\n      <th>6</th>\n      <td>Iron, IIIAB</td>\n      <td>2.43e+07</td>\n      <td>Found</td>\n      <td>1852</td>\n      <td>http://www.lpi.usra.edu/meteor/metbull.php?cod...</td>\n      <td>27</td>\n      <td>-105.1</td>\n    </tr>\n    <tr>\n      <th>7</th>\n      <td>Iron, IAB-ung</td>\n      <td>2.4e+07</td>\n      <td>Found</td>\n      <td>1911</td>\n      <td>http://www.lpi.usra.edu/meteor/metbull.php?cod...</td>\n      <td>-30.7833</td>\n      <td>127.55</td>\n    </tr>\n    <tr>\n      <th>8</th>\n      <td>Iron, IIAB</td>\n      <td>2.3e+07</td>\n      <td>Fell</td>\n      <td>1947</td>\n      <td>http://www.lpi.usra.edu/meteor/metbull.php?cod...</td>\n      <td>46.16</td>\n      <td>134.653</td>\n    </tr>\n    <tr>\n      <th>9</th>\n      <td>Iron, ungrouped</td>\n      <td>2.2e+07</td>\n      <td>Found</td>\n      <td>1863</td>\n      <td>http://www.lpi.usra.edu/meteor/metbull.php?cod...</td>\n      <td>26.2</td>\n      <td>-107.833</td>\n    </tr>\n  </tbody>\n</table>\n</div>",
 40 |             "text/plain": "           Column2   Column3 Column4 Column5  \\\n0        Iron, IVB     6e+07   Found    1920   \n1      Iron, IIIAB  5.82e+07   Found    1818   \n2     Iron, IAB-MG     5e+07   Found    1576   \n3     Iron, IAB-MG     3e+07   Found    1891   \n4       Iron, IIIE   2.8e+07   Found    1898   \n5        Iron, IVA   2.6e+07   Found    1836   \n6      Iron, IIIAB  2.43e+07   Found    1852   \n7    Iron, IAB-ung   2.4e+07   Found    1911   \n8       Iron, IIAB   2.3e+07    Fell    1947   \n9  Iron, ungrouped   2.2e+07   Found    1863   \n\n                                             Column6  Column7  Column8  \n0  http://www.lpi.usra.edu/meteor/metbull.php?cod... -19.5833  17.9167  \n1  http://www.lpi.usra.edu/meteor/metbull.php?cod...  76.1333 -64.9333  \n2  http://www.lpi.usra.edu/meteor/metbull.php?cod... -27.4667 -60.5833  \n3  http://www.lpi.usra.edu/meteor/metbull.php?cod...    35.05 -111.033  \n4  http://www.lpi.usra.edu/meteor/metbull.php?cod...       47       88  \n5  http://www.lpi.usra.edu/meteor/metbull.php?cod...    -25.5       18  \n6  http://www.lpi.usra.edu/meteor/metbull.php?cod...       27   -105.1  \n7  http://www.lpi.usra.edu/meteor/metbull.php?cod... -30.7833   127.55  \n8  http://www.lpi.usra.edu/meteor/metbull.php?cod...    46.16  134.653  \n9  http://www.lpi.usra.edu/meteor/metbull.php?cod...     26.2 -107.833  "
 41 |           },
 42 |           "metadata": {}
 43 |         }
 44 |       ]
 45 |     },
 46 |     {
 47 |       "metadata": {},
 48 |       "cell_type": "markdown",
 49 |       "source": "## read_pandas_dataframe\n\nThere are situations where you may already have some data in the form of a pandas DataFrame.\nThe steps taken to get to this DataFrame may be non-trivial or not easy to convert to dprep operations. The 'read_pandas_dataframe' reader can take a DataFrame and use it as the datasource for a Dataflow.\nIt is also required to pass in a path to a directory (that exists) where DataPrep can store the contents of the DataFrame. The files written to this directory will be named 'part-00000' and so on; they are written out in DataPrep's internal row-based file format."
 50 |     },
 51 |     {
 52 |       "metadata": {
 53 |         "trusted": true
 54 |       },
 55 |       "cell_type": "code",
 56 |       "source": "import shutil\ncache_dir = 'df_dflow'\nshutil.rmtree(cache_dir, ignore_errors=True)\ndf_dflow = dprep.read_pandas_dataframe(df, cache_dir)",
 57 |       "execution_count": 5,
 58 |       "outputs": [
 59 |         {
 60 |           "output_type": "error",
 61 |           "ename": "AttributeError",
 62 |           "evalue": "module 'azureml.dataprep' has no attribute 'read_pandas_dataframe'",
 63 |           "traceback": [
 64 |             "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
 65 |             "\u001b[0;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
 66 |             "\u001b[0;32m<ipython-input-5-c1b12711f4ac>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      2\u001b[0m \u001b[0mcache_dir\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'df_dflow'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      3\u001b[0m \u001b[0mshutil\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrmtree\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcache_dir\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mignore_errors\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mdf_dflow\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdprep\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_pandas_dataframe\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcache_dir\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
 67 |             "\u001b[0;31mAttributeError\u001b[0m: module 'azureml.dataprep' has no attribute 'read_pandas_dataframe'"
 68 |           ]
 69 |         }
 70 |       ]
 71 |     },
 72 |     {
 73 |       "metadata": {
 74 |         "trusted": true
 75 |       },
 76 |       "cell_type": "code",
 77 |       "source": "df_dflow.head(10)",
 78 |       "execution_count": null,
 79 |       "outputs": []
 80 |     },
 81 |     {
 82 |       "metadata": {
 83 |         "trusted": true
 84 |       },
 85 |       "cell_type": "code",
 86 |       "source": "",
 87 |       "execution_count": null,
 88 |       "outputs": []
 89 |     }
 90 |   ],
 91 |   "metadata": {
 92 |     "execute_as_test": false,
 93 |     "kernelspec": {
 94 |       "name": "python36",
 95 |       "display_name": "Python 3.6",
 96 |       "language": "python"
 97 |     },
 98 |     "language_info": {
 99 |       "mimetype": "text/x-python",
100 |       "nbconvert_exporter": "python",
101 |       "name": "python",
102 |       "pygments_lexer": "ipython3",
103 |       "version": "3.6.6",
104 |       "file_extension": ".py",
105 |       "codemirror_mode": {
106 |         "version": 3,
107 |         "name": "ipython"
108 |       }
109 |     }
110 |   },
111 |   "nbformat": 4,
112 |   "nbformat_minor": 2
113 | }


--------------------------------------------------------------------------------
/data/median_income_transformed.csv:
--------------------------------------------------------------------------------
  1 | median_income,median_income_uniform,median_income_normal
  2 | 4.4896,0.688927015969381,0.4928112398942898
  3 | 2.1029,0.16242159563866576,-0.9845540061415601
  4 | 2.3889,0.20433495515563627,-0.8262365643809355
  5 | 3.707,0.5167832475474021,0.04208177981426486
  6 | 6.4788,0.7918154943685715,0.8127367410844715
  7 | 4.4074,0.6708459812590735,0.44225037403262457
  8 | 5.2907,0.7627885954411082,0.71530142355699
  9 | 1.5156,0.07635265842077493,-1.430040774223022
 10 | 8.4411,0.8397571522806675,0.9934602851577232
 11 | 4.4085,0.6710879415775812,0.44291928632820865
 12 | 2.1439,0.16843015417081886,-0.960387338010023
 13 | 2.8971,0.3028380993334766,-0.5162551753422808
 14 | 6.1008,0.7825804402531089,0.780937730437047
 15 | 3.5258,0.47180713824983866,-0.07072794893657437
 16 | 2.7694,0.2685175231133089,-0.6173027607303029
 17 | 2.2356,0.18186880825370766,-0.9082661605741446
 18 | 1.9509,0.14014596400726886,-1.0796637810446446
 19 | 4.0905,0.6011394131362455,0.25629744014575906
 20 | 3.6726,0.5092164884958867,0.023104366055624576
 21 | 3.1696,0.3760750376263169,-0.31580558952782856
 22 | 2.5389,0.2263174863708306,-0.7510293827903433
 23 | 3.0319,0.3390668673403568,-0.4150111583096138
 24 | 4.6779,0.7303462232193919,0.61386044030116
 25 | 2.9076,0.3056600731025585,-0.5081899386067169
 26 | 2.8616,0.29329714039991395,-0.5437779548183906
 27 | 1.4722,0.0699923793891787,-1.475847787309027
 28 | 5.6413,0.7713542302900003,0.7433140959507171
 29 | 2.1167,0.1644439885104636,-0.9763562034967139
 30 | 4.7308,0.7419823149003563,0.6494688564486208
 31 | 4.8173,0.7512227895726955,0.6783427156995535
 32 | 2.3438,0.19772554077026783,-0.8497733992585795
 33 | 1.7333,0.10825663872442697,-1.235852818864947
 34 | 1.4429,0.06569845829181076,-1.5086161030849279
 35 | 2.3253,0.19501436192039387,-0.8595652745311925
 36 | 2.4022,0.20628407292338352,-0.8193826233304142
 37 | 3.4048,0.4392872500537518,-0.15277653846062797
 38 | 6.6073,0.7949549241406269,0.8237349930522744
 39 | 4.1080000000000005,0.6049887818397783,0.2662814785234653
 40 | 4.2829,0.6434604724825127,0.3677239840415221
 41 | 1.5727,0.0847206753033589,-1.3740010688161062
 42 | 2.5211,0.22370889266662755,-0.7597270013437459
 43 | 4.2679,0.6401610135937705,0.35888920867930674
 44 | 4.7328,0.7424222427521885,0.6508311116191982
 45 | 4.7069,0.7367251770709602,0.6332817852325006
 46 | 2.465,0.21548742599214485,-0.7875245686101101
 47 | 5.0267,0.7563387163763406,0.6945735910123612
 48 | 2.8043,0.277897226402924,-0.5890996132954226
 49 | 2.4053,0.20673837856849753,-0.8177906162820535
 50 | 1.2176,0.03268069640658888,-1.8427782558151542
 51 | 2.39,0.2044961603845477,-0.8256682268514074
 52 | 3.6364,0.501253794377722,0.0031428016114447123
 53 | 6.0162,0.780513547189172,0.7739287859106961
 54 | 2.8088,0.2791066437325306,-0.5854974399696314
 55 | 3.3984,0.4375671898516448,-0.15714017005326586
 56 | 4.5,0.6912146407989088,0.4992962189785684
 57 | 3.9079,0.5609740002639567,0.1534391166508579
 58 | 4.9618,0.7547531211062519,0.6895237101278
 59 | 2.9344,0.3128628251988819,-0.4877518079508566
 60 | 2.4283,0.21010903335482733,-0.8060429810797759
 61 | 3.7388,0.5237781003915357,0.05963819257139614
 62 | 1.6021,0.089029251421537,-1.346757015256811
 63 | 2.3352,0.1964652089805967,-0.8543151087700283
 64 | 4.0982,0.6028331353658,0.26068721102120757
 65 | 1.9531,0.1404683744650917,-1.078217399304828
 66 | 3.2386,0.39461943668028376,-0.26729910818310126
 67 | 5.1169,0.7585424250568029,0.7016216442421952
 68 | 4.692,0.7334477145748096,0.6232738623769376
 69 | 4.0,0.5812326778408341,0.20504797284322906
 70 | 6.4238,0.7904717695634116,0.8080592738489488
 71 | 3.7375,0.5234921472878448,0.05892015392758171
 72 | 2.8233,0.28300365512792947,-0.5739416160725967
 73 | 2.8009,0.2769834444205546,-0.5918263314054893
 74 | 3.767,0.5299810831023711,0.07522231004048434
 75 | 3.6761,0.5099863622365932,0.025034712726592745
 76 | 5.0282,0.7563753634164814,0.6946905154674345
 77 | 3.5296,0.47282842399483976,-0.06816178418735802
 78 | 5.215,0.7609391414820063,0.70932677431432
 79 | 4.0125,0.583982226914786,0.21209163422132077
 80 | 9.4667,0.8648139551928855,1.1022060793492992
 81 | 5.9062,0.7778260975788522,0.7648719390554228
 82 | 3.9864,0.5782411684483745,0.19739599549652825
 83 | 2.0734,0.15809836449967754,-1.0023041289181829
 84 | 2.875,0.29689851644807563,-0.5333417486766315
 85 | 3.3611,0.4275424639862395,-0.1826343528680402
 86 | 2.8214,0.2824930122554289,-0.5754514381784691
 87 | 0.9946,9.999999977795539e-08,-5.199337582605575
 88 | 4.5446,0.7010250318947692,0.5273508961812362
 89 | 4.6908,0.7331837578637103,0.6224705781821348
 90 | 9.3198,0.861224988395104,1.085839464021226
 91 | 1.2826,0.04220645993317308,-1.725635992615424
 92 | 2.4943,0.2197813470895128,-0.7729318836756727
 93 | 10.1882,0.8824411815005742,1.1872788772739642
 94 | 4.6731,0.7292903963749944,0.6106682834354926
 95 | 4.375,0.6637191500593902,0.42263484449515154
 96 | 2.8173,0.281391098688454,-0.578713956288219
 97 | 2.0903,0.16057506301658947,-0.9920971718209968
 98 | 2.725,0.2565846054611911,-0.6539109002088199
 99 | 2.8547,0.29144270049451715,-0.5491749392049887
100 | 2.25,0.18397913125036633,-0.9003044332369591
101 | 1.9444,0.13919338765461042,-1.0839504359991745
102 | 1.7167,0.10582390526994544,-1.2490472005138797
103 | 1.9342,0.13769857553197723,-1.0907176288996137
104 | 4.9524,0.7545234663213701,0.6887937530976251
105 | 3.65,0.5042453037701816,0.010641599310222728
106 | 3.0856,0.35349924747366146,-0.37589024233948426
107 | 3.2396,0.394888196086863,-0.26660099147199706
108 | 2.9324,0.3123253063857234,-0.48926992156445664
109 | 3.495,0.46352934852719846,-0.09154607537767347
110 | 1.9818,0.14467436543759887,-1.0595514059995472
111 | 4.6964,0.7344155558488406,0.6262226873574688
112 | 3.925,0.5647353833971228,0.16298628506764115
113 | 3.625,0.4984680713824984,-0.003839985024398658
114 | 2.9688,0.3221081487852074,-0.4618117868349911
115 | 4.0417,0.5904051735515374,0.22858735593517016
116 | 9.7956,0.8728494295277418,1.1399643807904891
117 | 3.8732,0.5533412520346663,0.13410759250347515
118 | 2.6998,0.24989741485432906,-0.6748126069650576
119 | 2.006,0.1482208804736502,-1.0440943039822579
120 | 4.25,0.6362236593198715,0.34838284885436693
121 | 3.1839999999999997,0.3799451730810577,-0.3056247863400478
122 | 5.9658,0.7792822066404437,0.7697712668826051
123 | 2.628,0.23937510991265604,-0.7083141049607123
124 | 2.5057,0.2214520194618676,-0.7672985355850374
125 | 5.155,0.7594732598763774,0.7046091860898294
126 | 4.6,0.7132110333905237,0.5627899373325242
127 | 4.6681,0.7281905767454135,0.6073497232884044
128 | 5.5942,0.7702035132295815,0.7395172425174374
129 | 5.1104,0.7583836212161931,0.7011125840899696
130 | 3.0759,0.35089228122984295,-0.38291260834816176
131 | 3.5757,0.4852182326381423,-0.037060878177008386
132 | 3.6845,0.5118340592142888,0.02966793907608783
133 | 6.4667,0.7915198749114363,0.8117061751243242
134 | 5.273,0.7623561603674476,0.7139021637992619
135 | 3.0635,0.3475596645882605,-0.3919172875113563
136 | 11.2866,0.9092765874276221,1.3363135019854007
137 | 4.0444,0.5909990761515111,0.23011572277987793
138 | 5.2541,0.7618944076616745,0.7124095804237914
139 | 5.5791,0.7698345996921648,0.7383022482134728
140 | 4.5375,0.6994632880207644,0.5228574972480653
141 | 9.8144,0.8733087390975055,1.1421720249180776
142 | 6.7257,0.7978475971757347,0.8339577268169219
143 | 4.1442,0.6129514759579427,0.28701994562086086
144 | 4.0313,0.5881175487220095,0.22270526607462063
145 | 2.2791,0.18824374230611404,-0.88438672867467
146 | 4.1679,0.6181646210021556,0.3006639542199363
147 | 3.2852,0.40714362502687595,-0.23489883984002982
148 | 3.2768,0.4048860460116104,-0.24072005796234255
149 | 5.021,0.7561994576238059,0.6941293646505097
150 | 4.875,0.7526324790501087,0.682797132481279
151 | 4.419,0.6733975627997006,0.44931439580522814
152 | 3.3272,0.41843152010320356,-0.20590766210173994
153 | 4.2386,0.6337160705644274,0.341711709817628
154 | 1.245,0.03669617210856439,-1.7903831065617093
155 | 5.152,0.7593999657960959,0.704373718687922
156 | 4.8125,0.7511055190442452,0.6779727645490474
157 | 2.1638,0.1713465033120347,-0.9488576690059357
158 | 7.1621,0.8085094427206763,0.872416661000832
159 | 1.5372,0.0795181429157629,-1.408320169672634
160 | 10.0481,0.8790183479514304,1.1700935990858978
161 | 3.3869,0.43447645667598356,-0.16498865287495923
162 | 5.4591,0.7669028364809068,0.7286850708477646
163 | 4.4318,0.6762131010514274,0.4571353017049396
164 | 6.5044,0.7924409371869732,0.8149199625504349
165 | 4.2865,0.644252342615811,0.3698485817763834
166 | 3.0461,0.3428832509137819,-0.40460687823111086
167 | 11.3283,0.9102953751435343,1.3425761736583905
168 | 2.7026,0.25056439475381626,-0.6727147380915197
169 | 3.016,0.33479359277574705,-0.4267146406076349
170 | 3.0943,0.35583745431090086,-0.3696075717076287
171 | 3.225,0.39096430875080623,-0.2768065942137824
172 | 6.187,0.7846864234931958,0.7881189136255171
173 | 3.8158,0.5407153226870792,0.10223599868489697
174 | 3.0147,0.33444420554719406,-0.42767409705501475
175 | 15.0,0.9999999000000003,5.19933758270342
176 | 3.1364,0.36715222532788644,-0.33940526790689773
177 | 2.9,0.30361750161255635,-0.5140242974144523
178 | 5.5941,0.7702010700935721,0.739509192607556
179 | 3.4028,0.4387497312405934,-0.15413985695864643
180 | 6.0062,0.7802692335882339,0.7731028191405914
181 | 8.3792,0.8382448510908602,0.9872699911993058
182 | 3.8036,0.5380317627909023,0.09547634997560216
183 | 2.0926,0.1609121284952224,-0.9907160390730209
184 | 6.7703,0.798937235835919,0.837831173103705
185 | 4.2569,0.6377414104086929,0.3524281695696263
186 | 4.744,0.7448858387224493,0.6584822133838069
187 | 9.7037,0.87060418753512,1.1292518063856445
188 | 5.1292,0.7588429307859569,0.7025854406375655
189 | 2.3148,0.19347558473533027,-0.8651596484776254
190 | 3.3021,0.41168565899806486,-0.22321096549396113
191 | 1.95,0.1400140688199777,-1.0802561341135346
192 | 3.025,0.33721242743496016,-0.42008295383958816
193 | 2.6523,0.2429362799695175,-0.6968885244090585
194 | 1.2188,0.032856556656310446,-1.8403755990446928
195 | 5.827999999999999,0.7759155652195158,0.7584713308686281
196 | 3.1587,0.3731455600946033,-0.3235336598161438
197 | 2.45,0.21328917287062546,-0.7950604390156254
198 | 2.3851,0.20377806436485135,-0.8282019693528496
199 | 2.1221,0.16523535963421065,-0.9731661745087966
200 | 3.5313,0.47328531498602444,-0.06701390942662634
201 | 3.4821,0.4600623521823264,-0.10027663775339715
202 | 7.8252,0.8247098775988859,0.9334643965884332
203 | 5.1878,0.7602746084874545,0.7071861842977014
204 | 3.7459,0.5253398442655404,0.06356034060479637
205 | 6.0097,0.7803547433485621,0.773391847490294
206 | 2.3194,0.19414971569259623,-0.862705456837268
207 | 4.2061,0.6265672429721525,0.32277517627250846
208 | 2.267,0.18647048478808834,-0.8909780060851004
209 | 2.2109,0.1782490181136057,-0.9220585498888423
210 | 2.7589,0.265695549344227,-0.6258838950234518
211 | 2.6553,0.2433759305938214,-0.6954842771160817
212 | 6.3325,0.7882411863868461,0.8003334507140982
213 | 5.7233,0.7733576018176932,0.7499500376839813
214 | 4.337,0.6553605208745764,0.3998337668630456
215 | 3.9667,0.5739078791078263,0.18633222024962756
216 | 5.8623,0.7767535608707337,0.7612749184789059
217 | 1.6806,0.1005334427574887,-1.2785178769957448
218 | 3.5851,0.48774457105998714,-0.03072463804138443
219 | 2.9716,0.32286067512362926,-0.45971423541894674
220 | 3.9,0.5592362852492191,0.14903320783722215
221 | 2.7431,0.26144915072027514,-0.6388841474573944
222 | 3.3621,0.4278112233928187,-0.18194938611936773
223 | 1.9464,0.13948648807081301,-1.0826293461973695
224 | 7.3518,0.8131440717304732,0.8895420092252918
225 | 4.775,0.750189343040727,0.6750857069958222
226 | 3.5968,0.4908890561169641,-0.022839735128387998
227 | 6.221,0.7855170897363856,0.7909625929859541
228 | 10.0968,0.8802081551879993,1.1760279942069634
229 | 1.9483,0.1397649334662055,-1.0813760586453744
230 | 2.0469,0.15421478398499322,-1.0185228067876306
231 | 3.725,0.5207425982138929,0.05201743189676979
232 | 3.675,0.5097444019180853,0.02442802263182327
233 | 1.8529,0.1257840436133419,-1.1465489456932518
234 | 1.7159,0.10570666510346441,-1.2496885811666654
235 | 1.7386,0.10903335482736382,-1.2316851754273808
236 | 3.6687,0.5083586291848137,0.020953509422343072
237 | 3.4671,0.45603096108363783,-0.11043812057602183
238 | 4.8233,0.7513693777332584,0.6788052852389578
239 | 4.3036,0.6480137257489771,0.3799634475571365
240 | 1.6488,0.0958731461398675,-1.3054305179573955
241 | 2.9453,0.3157923027305955,-0.47949769802476955
242 | 5.0096,0.7559209401187363,0.6932413225605799
243 | 3.175,0.3775263384218447,-0.31198400815546146
244 | 4.2031,0.625907351194404,0.32103311272888746
245 | 3.1667,0.3752956353472371,-0.3178598218476214
246 | 5.7204,0.7732867508734211,0.7497147894780406
247 | 3.375,0.43127821973769076,-0.17312084617136517
248 | 6.5483,0.7935134738950917,0.818672917584347
249 | 4.2206,0.6297567198979367,0.33120908249828585
250 | 2.6631,0.2445190222170115,-0.6918396326662699
251 | 3.5363,0.4746291120189207,-0.06363831319524592
252 | 


--------------------------------------------------------------------------------
/secrets.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |   "cells": [
  3 |     {
  4 |       "metadata": {},
  5 |       "cell_type": "markdown",
  6 |       "source": "Currently, secrets are only persisted for the lifetime of the engine process and they are not part of the dprep file. If you start a new session (and hence a new engine process), load a package, and try to run a dataflow within that package, you will need to call `use_secrets` to register the required secrets to use during execution; otherwise the execution will fail because the required secrets are not available.\n\nIn this notebook, we will:\n1. Load a previously saved package\n2. Call `get_missing_secrets` to determine the missing secrets\n3. Call `use_secrets` and pass in the missing secrets to register them with the engine for this session\n4. Call `head` to see a preview of the data"
  7 |     },
  8 |     {
  9 |       "metadata": {},
 10 |       "cell_type": "markdown",
 11 |       "source": "# Providing Secrets\nCopyright (c) Microsoft Corporation. All rights reserved.<br>\nLicensed under the MIT License."
 12 |     },
 13 |     {
 14 |       "metadata": {
 15 |         "trusted": true
 16 |       },
 17 |       "cell_type": "code",
 18 |       "source": "!pip install azureml",
 19 |       "execution_count": 1,
 20 |       "outputs": [
 21 |         {
 22 |           "output_type": "stream",
 23 |           "text": "Requirement already satisfied: azureml in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (0.2.7)\nRequirement already satisfied: python-dateutil in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml) (2.7.3)\nRequirement already satisfied: pandas in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml) (0.22.0)\nRequirement already satisfied: requests in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml) (2.19.1)\nRequirement already satisfied: six>=1.5 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from python-dateutil->azureml) (1.11.0)\nRequirement already satisfied: pytz>=2011k in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from pandas->azureml) (2018.5)\nRequirement already satisfied: numpy>=1.9.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from pandas->azureml) (1.14.5)\nRequirement already satisfied: urllib3<1.24,>=1.21.1 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (1.23)\nRequirement already satisfied: certifi>=2017.4.17 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (2018.8.24)\nRequirement already satisfied: idna<2.8,>=2.5 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (2.7)\nRequirement already satisfied: chardet<3.1.0,>=3.0.2 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (3.0.4)\n",
 24 |           "name": "stdout"
 25 |         }
 26 |       ]
 27 |     },
 28 |     {
 29 |       "metadata": {
 30 |         "trusted": true
 31 |       },
 32 |       "cell_type": "code",
 33 |       "source": "import azureml.dataprep as dprep\nimport os",
 34 |       "execution_count": 2,
 35 |       "outputs": []
 36 |     },
 37 |     {
 38 |       "metadata": {},
 39 |       "cell_type": "markdown",
 40 |       "source": "Let's load the previously saved package."
 41 |     },
 42 |     {
 43 |       "metadata": {
 44 |         "trusted": true
 45 |       },
 46 |       "cell_type": "code",
 47 |       "source": "package = dprep.Package.open(file_path='./data/secrets.dprep')\ndataflow = package.dataflows[0]",
 48 |       "execution_count": 3,
 49 |       "outputs": []
 50 |     },
 51 |     {
 52 |       "metadata": {},
 53 |       "cell_type": "markdown",
 54 |       "source": "Let's call `get_missing_secrets` to see which required secrets are missing in the engine."
 55 |     },
 56 |     {
 57 |       "metadata": {
 58 |         "trusted": true
 59 |       },
 60 |       "cell_type": "code",
 61 |       "source": "dataflow.get_missing_secrets()",
 62 |       "execution_count": 4,
 63 |       "outputs": [
 64 |         {
 65 |           "output_type": "execute_result",
 66 |           "execution_count": 4,
 67 |           "data": {
 68 |             "text/plain": "['https://dpreptestfiles.blob.core.windows.net/testfiles/read_csv_duplicate_headers.csv']"
 69 |           },
 70 |           "metadata": {}
 71 |         }
 72 |       ]
 73 |     },
 74 |     {
 75 |       "metadata": {},
 76 |       "cell_type": "markdown",
 77 |       "source": "Let's now read the secret from an environment variable, put it in our secrets dictionary, and call `use_secrets` with that dictionary. This will register these secrets in the engine so you don't need to provide them again in this session.\n\n_Note: It is a bad practice to have secrets in files that will be checked into source control._"
 78 |     },
 79 |     {
 80 |       "metadata": {
 81 |         "trusted": true
 82 |       },
 83 |       "cell_type": "code",
 84 |       "source": "sas = os.environ['SCENARIOS_SECRETS']\nsecrets = {\n    'https://dpreptestfiles.blob.core.windows.net/testfiles/read_csv_duplicate_headers.csv': sas\n}\ndataflow.use_secrets(secrets=secrets)",
 85 |       "execution_count": 5,
 86 |       "outputs": [
 87 |         {
 88 |           "output_type": "error",
 89 |           "ename": "KeyError",
 90 |           "evalue": "'SCENARIOS_SECRETS'",
 91 |           "traceback": [
 92 |             "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
 93 |             "\u001b[0;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
 94 |             "\u001b[0;32m<ipython-input-5-f851e2afaf0f>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0msas\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0menviron\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'SCENARIOS_SECRETS'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      2\u001b[0m secrets = {\n\u001b[1;32m      3\u001b[0m     \u001b[0;34m'https://dpreptestfiles.blob.core.windows.net/testfiles/read_csv_duplicate_headers.csv'\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0msas\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      4\u001b[0m }\n\u001b[1;32m      5\u001b[0m \u001b[0mdataflow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0muse_secrets\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msecrets\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0msecrets\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
 95 |             "\u001b[0;32m~/anaconda3_501/lib/python3.6/os.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m    667\u001b[0m         \u001b[0;32mexcept\u001b[0m \u001b[0mKeyError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    668\u001b[0m             \u001b[0;31m# raise KeyError with the original key value\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 669\u001b[0;31m             \u001b[0;32mraise\u001b[0m \u001b[0mKeyError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    670\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdecodevalue\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    671\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
 96 |             "\u001b[0;31mKeyError\u001b[0m: 'SCENARIOS_SECRETS'"
 97 |           ]
 98 |         }
 99 |       ]
100 |     },
101 |     {
102 |       "metadata": {},
103 |       "cell_type": "markdown",
104 |       "source": "We can now call `head` without passing in `secrets` and the engine will happily execute and show us a preview of the data."
105 |     },
106 |     {
107 |       "metadata": {
108 |         "trusted": true
109 |       },
110 |       "cell_type": "code",
111 |       "source": "dataflow.head(5)",
112 |       "execution_count": 6,
113 |       "outputs": [
114 |         {
115 |           "output_type": "error",
116 |           "ename": "MissingSecretsError",
117 |           "evalue": "Required secrets are missing. Please call use_secrets to register the missing secrets.\nMissing secrets:\nhttps://dpreptestfiles.blob.core.windows.net/testfiles/read_csv_duplicate_headers.csv",
118 |           "traceback": [
119 |             "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
120 |             "\u001b[0;31mMissingSecretsError\u001b[0m                       Traceback (most recent call last)",
121 |             "\u001b[0;32m<ipython-input-6-3cfd7913bed6>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdataflow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhead\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m5\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
122 |             "\u001b[0;32m~/anaconda3_501/lib/python3.6/site-packages/azureml/dataprep/api/dataflow.py\u001b[0m in \u001b[0;36mhead\u001b[0;34m(self, count)\u001b[0m\n\u001b[1;32m     94\u001b[0m         \u001b[0;34m:\u001b[0m\u001b[0;32mreturn\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mA\u001b[0m \u001b[0mPandas\u001b[0m \u001b[0mDataframe\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     95\u001b[0m         \"\"\"\n\u001b[0;32m---> 96\u001b[0;31m         \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtake\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcount\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_pandas_dataframe\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mextended_types\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     97\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     98\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0mrun_local\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
123 |             "\u001b[0;32m~/anaconda3_501/lib/python3.6/site-packages/azureml/dataprep/api/dataflow.py\u001b[0m in \u001b[0;36mto_pandas_dataframe\u001b[0;34m(self, extended_types)\u001b[0m\n\u001b[1;32m    145\u001b[0m         })\n\u001b[1;32m    146\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 147\u001b[0;31m         \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_raise_if_missing_secrets\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    148\u001b[0m         self._engine_api.execute_anonymous_blocks(\n\u001b[1;32m    149\u001b[0m             ExecuteAnonymousBlocksMessageArguments(blocks=steps_to_block_datas(dataflow_to_execute._steps),\n",
124 |             "\u001b[0;32m~/anaconda3_501/lib/python3.6/site-packages/azureml/dataprep/api/dataflow.py\u001b[0m in \u001b[0;36m_raise_if_missing_secrets\u001b[0;34m(self, secrets)\u001b[0m\n\u001b[1;32m   1054\u001b[0m                 \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmissing_secret_ids\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmissing_secret_ids\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1055\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1056\u001b[0;31m         \u001b[0;32mraise\u001b[0m \u001b[0mMissingSecretsError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmissing_secrets\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1057\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1058\u001b[0m     \u001b[0;31m# Steps are immutable so we don't need to create a full deepcopy of them when cloning Dataflows.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
125 |             "\u001b[0;31mMissingSecretsError\u001b[0m: Required secrets are missing. Please call use_secrets to register the missing secrets.\nMissing secrets:\nhttps://dpreptestfiles.blob.core.windows.net/testfiles/read_csv_duplicate_headers.csv"
126 |           ]
127 |         }
128 |       ]
129 |     },
130 |     {
131 |       "metadata": {
132 |         "trusted": true
133 |       },
134 |       "cell_type": "code",
135 |       "source": "",
136 |       "execution_count": null,
137 |       "outputs": []
138 |     }
139 |   ],
140 |   "metadata": {
141 |     "execute_as_test": false,
142 |     "kernelspec": {
143 |       "name": "python36",
144 |       "display_name": "Python 3.6",
145 |       "language": "python"
146 |     },
147 |     "language_info": {
148 |       "mimetype": "text/x-python",
149 |       "nbconvert_exporter": "python",
150 |       "name": "python",
151 |       "pygments_lexer": "ipython3",
152 |       "version": "3.6.6",
153 |       "file_extension": ".py",
154 |       "codemirror_mode": {
155 |         "version": 3,
156 |         "name": "ipython"
157 |       }
158 |     }
159 |   },
160 |   "nbformat": 4,
161 |   "nbformat_minor": 2
162 | }


--------------------------------------------------------------------------------
/external-references.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |   "cells": [
  3 |     {
  4 |       "metadata": {},
  5 |       "cell_type": "markdown",
  6 |       "source": "# External References\nCopyright (c) Microsoft Corporation. All rights reserved.<br>\nLicensed under the MIT License."
  7 |     },
  8 |     {
  9 |       "metadata": {},
 10 |       "cell_type": "markdown",
 11 |       "source": "In addition to opening existing Dataflows in code and modifying them, it is also possible to create and persist Dataflows that reference another Dataflow that has been persisted to a DataPrep package. In this case, executing this Dataflow will load the referenced DataPrep package dynamically, execute the referenced Dataflow, and then execute the steps in the referencing Dataflow."
 12 |     },
 13 |     {
 14 |       "metadata": {},
 15 |       "cell_type": "markdown",
 16 |       "source": "To demonstrate, we will create a Dataflow that loads and transforms some data. After that, we will persist this Dataflow to a DataPrep package."
 17 |     },
 18 |     {
 19 |       "metadata": {
 20 |         "trusted": true
 21 |       },
 22 |       "cell_type": "code",
 23 |       "source": "import azureml.dataprep as dprep\nimport tempfile\nimport os\n\ndf = dprep.smart_read_file('./data/fixed_width_file.txt')\ndf = df.drop_errors(['Column7', 'Column8', 'Column9'], dprep.ColumnRelationship.ANY)\ndf = df.set_name('FWF')\npkg = dprep.Package(df)\npkg_path = os.path.join(tempfile.gettempdir(), 'package.dprep')\npkg = pkg.save(pkg_path)",
 24 |       "execution_count": 1,
 25 |       "outputs": [
 26 |         {
 27 |           "output_type": "stream",
 28 |           "text": "/home/nbuser/anaconda3_501/lib/python3.6/site-packages/ipykernel/__main__.py:5: DeprecationWarning: Function smart_read_file is deprecated. Use auto_read_file instead.\n",
 29 |           "name": "stderr"
 30 |         }
 31 |       ]
 32 |     },
 33 |     {
 34 |       "metadata": {},
 35 |       "cell_type": "markdown",
 36 |       "source": "Now that we have a package file, we can create a new Dataflow that references it."
 37 |     },
 38 |     {
 39 |       "metadata": {
 40 |         "trusted": true
 41 |       },
 42 |       "cell_type": "code",
 43 |       "source": "new_df = dprep.Dataflow.reference(dprep.ExternalReference(pkg_path, 'FWF'))\nnew_df.head(10)",
 44 |       "execution_count": 2,
 45 |       "outputs": [
 46 |         {
 47 |           "output_type": "execute_result",
 48 |           "execution_count": 2,
 49 |           "data": {
 50 |             "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>Column1</th>\n      <th>Column2</th>\n      <th>Column3</th>\n      <th>Column4</th>\n      <th>Column5</th>\n      <th>Column6</th>\n      <th>Column7</th>\n      <th>Column8</th>\n      <th>Column9</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>10010.0</td>\n      <td>99999.0</td>\n      <td>JAN MAYEN</td>\n      <td>azureml.dataprep.native.DataPrepError(\"'Micros...</td>\n      <td>JN</td>\n      <td>ENJA</td>\n      <td>70933.0</td>\n      <td>-8667.0</td>\n      <td>90.0</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>10014.0</td>\n      <td>99999.0</td>\n      <td>SOERSTOKKEN</td>\n      <td>azureml.dataprep.native.DataPrepError(\"'Micros...</td>\n      <td>NO</td>\n      <td>ENSO</td>\n      <td>59783.0</td>\n      <td>5350.0</td>\n      <td>500.0</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>10015.0</td>\n      <td>99999.0</td>\n      <td>BRINGELAND</td>\n      <td>azureml.dataprep.native.DataPrepError(\"'Micros...</td>\n      <td>NO</td>\n      <td>ENBL</td>\n      <td>61383.0</td>\n      <td>5867.0</td>\n      <td>3270.0</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>10016.0</td>\n      <td>99999.0</td>\n      <td>RORVIK/RYUM</td>\n      <td>azureml.dataprep.native.DataPrepError(\"'Micros...</td>\n      <td>NO</td>\n      <td></td>\n      <td>64850.0</td>\n      <td>11233.0</td>\n      <td>140.0</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>10017.0</td>\n      <td>99999.0</td>\n      <td>FRIGG</td>\n      <td>azureml.dataprep.native.DataPrepError(\"'Micros...</td>\n      <td>NO</td>\n      
<td>ENFR</td>\n      <td>59933.0</td>\n      <td>2417.0</td>\n      <td>480.0</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>10020.0</td>\n      <td>99999.0</td>\n      <td>VERLEGENHUKEN</td>\n      <td>azureml.dataprep.native.DataPrepError(\"'Micros...</td>\n      <td>SV</td>\n      <td></td>\n      <td>80050.0</td>\n      <td>16250.0</td>\n      <td>80.0</td>\n    </tr>\n    <tr>\n      <th>6</th>\n      <td>10030.0</td>\n      <td>99999.0</td>\n      <td>HORNSUND</td>\n      <td>azureml.dataprep.native.DataPrepError(\"'Micros...</td>\n      <td>SV</td>\n      <td></td>\n      <td>77000.0</td>\n      <td>15500.0</td>\n      <td>120.0</td>\n    </tr>\n    <tr>\n      <th>7</th>\n      <td>10040.0</td>\n      <td>99999.0</td>\n      <td>NY-ALESUND II</td>\n      <td>azureml.dataprep.native.DataPrepError(\"'Micros...</td>\n      <td>SV</td>\n      <td>ENAS</td>\n      <td>78917.0</td>\n      <td>11933.0</td>\n      <td>80.0</td>\n    </tr>\n    <tr>\n      <th>8</th>\n      <td>10050.0</td>\n      <td>99999.0</td>\n      <td>ISFJORD RADIO</td>\n      <td>azureml.dataprep.native.DataPrepError(\"'Micros...</td>\n      <td>NO</td>\n      <td>ENIS</td>\n      <td>78067.0</td>\n      <td>13633.0</td>\n      <td>50.0</td>\n    </tr>\n    <tr>\n      <th>9</th>\n      <td>10060.0</td>\n      <td>99999.0</td>\n      <td>EDGEOYA</td>\n      <td>azureml.dataprep.native.DataPrepError(\"'Micros...</td>\n      <td>NO</td>\n      <td></td>\n      <td>78250.0</td>\n      <td>22783.0</td>\n      <td>140.0</td>\n    </tr>\n  </tbody>\n</table>\n</div>",
 51 |             "text/plain": "   Column1  Column2                         Column3  \\\n0  10010.0  99999.0  JAN MAYEN                        \n1  10014.0  99999.0  SOERSTOKKEN                      \n2  10015.0  99999.0  BRINGELAND                       \n3  10016.0  99999.0  RORVIK/RYUM                      \n4  10017.0  99999.0  FRIGG                            \n5  10020.0  99999.0  VERLEGENHUKEN                    \n6  10030.0  99999.0  HORNSUND                         \n7  10040.0  99999.0  NY-ALESUND II                    \n8  10050.0  99999.0  ISFJORD RADIO                    \n9  10060.0  99999.0  EDGEOYA                          \n\n                                             Column4 Column5 Column6  Column7  \\\n0  azureml.dataprep.native.DataPrepError(\"'Micros...  JN      ENJA    70933.0   \n1  azureml.dataprep.native.DataPrepError(\"'Micros...  NO      ENSO    59783.0   \n2  azureml.dataprep.native.DataPrepError(\"'Micros...  NO      ENBL    61383.0   \n3  azureml.dataprep.native.DataPrepError(\"'Micros...  NO              64850.0   \n4  azureml.dataprep.native.DataPrepError(\"'Micros...  NO      ENFR    59933.0   \n5  azureml.dataprep.native.DataPrepError(\"'Micros...  SV              80050.0   \n6  azureml.dataprep.native.DataPrepError(\"'Micros...  SV              77000.0   \n7  azureml.dataprep.native.DataPrepError(\"'Micros...  SV      ENAS    78917.0   \n8  azureml.dataprep.native.DataPrepError(\"'Micros...  NO      ENIS    78067.0   \n9  azureml.dataprep.native.DataPrepError(\"'Micros...  NO              78250.0   \n\n   Column8  Column9  \n0  -8667.0     90.0  \n1   5350.0    500.0  \n2   5867.0   3270.0  \n3  11233.0    140.0  \n4   2417.0    480.0  \n5  16250.0     80.0  \n6  15500.0    120.0  \n7  11933.0     80.0  \n8  13633.0     50.0  \n9  22783.0    140.0  "
 52 |           },
 53 |           "metadata": {}
 54 |         }
 55 |       ]
 56 |     },
 57 |     {
 58 |       "metadata": {},
 59 |       "cell_type": "markdown",
 60 |       "source": "When executed, the new Dataflow returns the same results as the one we saved in our package. Since this reference is resolved on execution, updating the package file results in the changes being visible when re-executing the referencing Dataflow."
 61 |     },
 62 |     {
 63 |       "metadata": {
 64 |         "trusted": true
 65 |       },
 66 |       "cell_type": "code",
 67 |       "source": "df = df.take(5)\npkg = dprep.Package(df)\npkg.save(pkg_path)\n\nnew_df.head(10)",
 68 |       "execution_count": 3,
 69 |       "outputs": [
 70 |         {
 71 |           "output_type": "execute_result",
 72 |           "execution_count": 3,
 73 |           "data": {
 74 |             "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>Column1</th>\n      <th>Column2</th>\n      <th>Column3</th>\n      <th>Column4</th>\n      <th>Column5</th>\n      <th>Column6</th>\n      <th>Column7</th>\n      <th>Column8</th>\n      <th>Column9</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>10010.0</td>\n      <td>99999.0</td>\n      <td>JAN MAYEN</td>\n      <td>azureml.dataprep.native.DataPrepError(\"'Micros...</td>\n      <td>JN</td>\n      <td>ENJA</td>\n      <td>70933.0</td>\n      <td>-8667.0</td>\n      <td>90.0</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>10014.0</td>\n      <td>99999.0</td>\n      <td>SOERSTOKKEN</td>\n      <td>azureml.dataprep.native.DataPrepError(\"'Micros...</td>\n      <td>NO</td>\n      <td>ENSO</td>\n      <td>59783.0</td>\n      <td>5350.0</td>\n      <td>500.0</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>10015.0</td>\n      <td>99999.0</td>\n      <td>BRINGELAND</td>\n      <td>azureml.dataprep.native.DataPrepError(\"'Micros...</td>\n      <td>NO</td>\n      <td>ENBL</td>\n      <td>61383.0</td>\n      <td>5867.0</td>\n      <td>3270.0</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>10016.0</td>\n      <td>99999.0</td>\n      <td>RORVIK/RYUM</td>\n      <td>azureml.dataprep.native.DataPrepError(\"'Micros...</td>\n      <td>NO</td>\n      <td></td>\n      <td>64850.0</td>\n      <td>11233.0</td>\n      <td>140.0</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>10017.0</td>\n      <td>99999.0</td>\n      <td>FRIGG</td>\n      <td>azureml.dataprep.native.DataPrepError(\"'Micros...</td>\n      <td>NO</td>\n      
<td>ENFR</td>\n      <td>59933.0</td>\n      <td>2417.0</td>\n      <td>480.0</td>\n    </tr>\n  </tbody>\n</table>\n</div>",
 75 |             "text/plain": "   Column1  Column2                         Column3  \\\n0  10010.0  99999.0  JAN MAYEN                        \n1  10014.0  99999.0  SOERSTOKKEN                      \n2  10015.0  99999.0  BRINGELAND                       \n3  10016.0  99999.0  RORVIK/RYUM                      \n4  10017.0  99999.0  FRIGG                            \n\n                                             Column4 Column5 Column6  Column7  \\\n0  azureml.dataprep.native.DataPrepError(\"'Micros...  JN      ENJA    70933.0   \n1  azureml.dataprep.native.DataPrepError(\"'Micros...  NO      ENSO    59783.0   \n2  azureml.dataprep.native.DataPrepError(\"'Micros...  NO      ENBL    61383.0   \n3  azureml.dataprep.native.DataPrepError(\"'Micros...  NO              64850.0   \n4  azureml.dataprep.native.DataPrepError(\"'Micros...  NO      ENFR    59933.0   \n\n   Column8  Column9  \n0  -8667.0     90.0  \n1   5350.0    500.0  \n2   5867.0   3270.0  \n3  11233.0    140.0  \n4   2417.0    480.0  "
 76 |           },
 77 |           "metadata": {}
 78 |         }
 79 |       ]
 80 |     },
 81 |     {
 82 |       "metadata": {},
 83 |       "cell_type": "markdown",
 84 |       "source": "As we can see, even though we did not modify new_df, it now returns only 5 records, as the package was updated with the Dataflow that resulted from calling `df.take(5)`."
 85 |     },
 86 |     {
 87 |       "metadata": {
 88 |         "trusted": true
 89 |       },
 90 |       "cell_type": "code",
 91 |       "source": "",
 92 |       "execution_count": null,
 93 |       "outputs": []
 94 |     },
 95 |     {
 96 |       "metadata": {
 97 |         "trusted": true
 98 |       },
 99 |       "cell_type": "code",
100 |       "source": "",
101 |       "execution_count": null,
102 |       "outputs": []
103 |     }
104 |   ],
105 |   "metadata": {
106 |     "kernelspec": {
107 |       "name": "python36",
108 |       "display_name": "Python 3.6",
109 |       "language": "python"
110 |     },
111 |     "language_info": {
112 |       "mimetype": "text/x-python",
113 |       "nbconvert_exporter": "python",
114 |       "name": "python",
115 |       "pygments_lexer": "ipython3",
116 |       "version": "3.6.6",
117 |       "file_extension": ".py",
118 |       "codemirror_mode": {
119 |         "version": 3,
120 |         "name": "ipython"
121 |       }
122 |     }
123 |   },
124 |   "nbformat": 4,
125 |   "nbformat_minor": 2
126 | }


--------------------------------------------------------------------------------
/impute-missing-values.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |   "cells": [
  3 |     {
  4 |       "metadata": {},
  5 |       "cell_type": "markdown",
  6 |       "source": "# Impute missing values\nCopyright (c) Microsoft Corporation. All rights reserved.<br>\nLicensed under the MIT License."
  7 |     },
  8 |     {
  9 |       "metadata": {},
 10 |       "cell_type": "markdown",
 11 |       "source": "DataPrep has the ability to impute missing values in specified columns. In this case, we will attempt to impute the missing _Latitude_ and _Longitude_ values in the input data."
 12 |     },
 13 |     {
 14 |       "metadata": {
 15 |         "trusted": true
 16 |       },
 17 |       "cell_type": "code",
 18 |       "source": "!pip install azureml",
 19 |       "execution_count": 2,
 20 |       "outputs": [
 21 |         {
 22 |           "output_type": "stream",
 23 |           "text": "Collecting azureml\n  Downloading https://files.pythonhosted.org/packages/ab/e8/76cd2cb6784b9039affd2c659eed1b3f46baf2e6b87a10b072a20b5b0113/azureml-0.2.7-py2.py3-none-any.whl\nRequirement already satisfied: pandas in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml) (0.22.0)\nRequirement already satisfied: requests in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml) (2.20.1)\nRequirement already satisfied: python-dateutil in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml) (2.7.5)\nRequirement already satisfied: pytz>=2011k in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from pandas->azureml) (2018.7)\nRequirement already satisfied: numpy>=1.9.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from pandas->azureml) (1.14.6)\nRequirement already satisfied: urllib3<1.25,>=1.21.1 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (1.23)\nRequirement already satisfied: chardet<3.1.0,>=3.0.2 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (3.0.4)\nRequirement already satisfied: idna<2.8,>=2.5 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (2.7)\nRequirement already satisfied: certifi>=2017.4.17 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (2018.10.15)\nRequirement already satisfied: six>=1.5 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from python-dateutil->azureml) (1.11.0)\nInstalling collected packages: azureml\nSuccessfully installed azureml-0.2.7\n",
 24 |           "name": "stdout"
 25 |         }
 26 |       ]
 27 |     },
 28 |     {
 29 |       "metadata": {
 30 |         "trusted": true
 31 |       },
 32 |       "cell_type": "code",
 33 |       "source": "import azureml.dataprep as dprep",
 34 |       "execution_count": 3,
 35 |       "outputs": []
 36 |     },
 37 |     {
 38 |       "metadata": {
 39 |         "trusted": true
 40 |       },
 41 |       "cell_type": "code",
 42 |       "source": "# loading input data\ndf = dprep.read_csv(r'data\\crime0-10.csv')\ndf = df.keep_columns(['ID', 'Arrest', 'Latitude', 'Longitude'])\ndf = df.to_number(['Latitude', 'Longitude'])\ndf.head(10)",
 43 |       "execution_count": 4,
 44 |       "outputs": [
 45 |         {
 46 |           "output_type": "execute_result",
 47 |           "execution_count": 4,
 48 |           "data": {
 49 |             "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>ID</th>\n      <th>Arrest</th>\n      <th>Latitude</th>\n      <th>Longitude</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>10140490</td>\n      <td>false</td>\n      <td>41.973309</td>\n      <td>-87.800175</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>10139776</td>\n      <td>false</td>\n      <td>42.008124</td>\n      <td>-87.659550</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>10140270</td>\n      <td>false</td>\n      <td>NaN</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>10139885</td>\n      <td>false</td>\n      <td>41.902152</td>\n      <td>-87.754883</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>10140379</td>\n      <td>false</td>\n      <td>41.885610</td>\n      <td>-87.657009</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>10140868</td>\n      <td>false</td>\n      <td>41.679311</td>\n      <td>-87.644545</td>\n    </tr>\n    <tr>\n      <th>6</th>\n      <td>10139762</td>\n      <td>false</td>\n      <td>41.825501</td>\n      <td>-87.690578</td>\n    </tr>\n    <tr>\n      <th>7</th>\n      <td>10139722</td>\n      <td>true</td>\n      <td>41.857828</td>\n      <td>-87.715029</td>\n    </tr>\n    <tr>\n      <th>8</th>\n      <td>10139774</td>\n      <td>false</td>\n      <td>41.970100</td>\n      <td>-87.669324</td>\n    </tr>\n    <tr>\n      <th>9</th>\n      <td>10139697</td>\n      <td>false</td>\n      <td>41.787580</td>\n      <td>-87.685233</td>\n    </tr>\n  </tbody>\n</table>\n</div>",
 50 |             "text/plain": "         ID Arrest   Latitude  Longitude\n0  10140490  false  41.973309 -87.800175\n1  10139776  false  42.008124 -87.659550\n2  10140270  false        NaN        NaN\n3  10139885  false  41.902152 -87.754883\n4  10140379  false  41.885610 -87.657009\n5  10140868  false  41.679311 -87.644545\n6  10139762  false  41.825501 -87.690578\n7  10139722   true  41.857828 -87.715029\n8  10139774  false  41.970100 -87.669324\n9  10139697  false  41.787580 -87.685233"
 51 |           },
 52 |           "metadata": {}
 53 |         }
 54 |       ]
 55 |     },
 56 |     {
 57 |       "metadata": {},
 58 |       "cell_type": "markdown",
 59 |       "source": "The third record from input data has _Latitude_ and _Longitude_ missing. To impute those missing values, we can use `ImputeMissingValuesBuilder` to learn a fixed program which imputes the columns with either a calculated `MIN`, `MAX` or `MEAN` value or a `CUSTOM` value. When `group_by_columns` is specified, missing values will be imputed by group with `MIN`, `MAX` and `MEAN` calculated per group."
 60 |     },
 61 |     {
 62 |       "metadata": {},
 63 |       "cell_type": "markdown",
 64 |       "source": "First, let us quickly check the `MEAN` value of the _Latitude_ column."
 65 |     },
 66 |     {
 67 |       "metadata": {
 68 |         "trusted": true
 69 |       },
 70 |       "cell_type": "code",
 71 |       "source": "df_mean = df.summarize(group_by_columns=['Arrest'],\n                       summary_columns=[dprep.SummaryColumnsValue(column_id='Latitude',\n                                                                 summary_column_name='Latitude_MEAN',\n                                                                 summary_function=dprep.SummaryFunction.MEAN)])\ndf_mean = df_mean.filter(dprep.col('Arrest') == 'false')\ndf_mean.head(1)",
 72 |       "execution_count": 5,
 73 |       "outputs": [
 74 |         {
 75 |           "output_type": "execute_result",
 76 |           "execution_count": 5,
 77 |           "data": {
 78 |             "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>Arrest</th>\n      <th>Latitude_MEAN</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>false</td>\n      <td>41.878961</td>\n    </tr>\n  </tbody>\n</table>\n</div>",
 79 |             "text/plain": "  Arrest  Latitude_MEAN\n0  false      41.878961"
 80 |           },
 81 |           "metadata": {}
 82 |         }
 83 |       ]
 84 |     },
 85 |     {
 86 |       "metadata": {},
 87 |       "cell_type": "markdown",
 88 |       "source": "The `MEAN` value of _Latitude_ looks good, so we will impute _Latitude_ with it. As for _Longitude_, we will impute it using the custom value `42`, based on external knowledge."
 89 |     },
 90 |     {
 91 |       "metadata": {
 92 |         "trusted": true
 93 |       },
 94 |       "cell_type": "code",
 95 |       "source": "# impute with MEAN\nimpute_mean = dprep.ImputeColumnArguments(column_id='Latitude',\n                                          impute_function=dprep.ReplaceValueFunction.MEAN)\n# impute with custom value 42\nimpute_custom = dprep.ImputeColumnArguments(column_id='Longitude',\n                                            custom_impute_value=42)\n# get instance of ImputeMissingValuesBuilder\nimpute_builder = df.builders.impute_missing_values(impute_columns=[impute_mean, impute_custom],\n                                                   group_by_columns=['Arrest'])\n# call learn() to learn a fixed program to impute missing values\nimpute_builder.learn()\n# call to_dataflow() to get a dataflow with impute step added\ndf_imputed = impute_builder.to_dataflow()",
 96 |       "execution_count": 6,
 97 |       "outputs": []
 98 |     },
 99 |     {
100 |       "metadata": {
101 |         "trusted": true
102 |       },
103 |       "cell_type": "code",
104 |       "source": "# check impute result\ndf_imputed.head(10)",
105 |       "execution_count": 7,
106 |       "outputs": [
107 |         {
108 |           "output_type": "execute_result",
109 |           "execution_count": 7,
110 |           "data": {
111 |             "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>ID</th>\n      <th>Arrest</th>\n      <th>Latitude</th>\n      <th>Longitude</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>10140490</td>\n      <td>false</td>\n      <td>41.973309</td>\n      <td>-87.800175</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>10139776</td>\n      <td>false</td>\n      <td>42.008124</td>\n      <td>-87.659550</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>10140270</td>\n      <td>false</td>\n      <td>41.878961</td>\n      <td>42.000000</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>10139885</td>\n      <td>false</td>\n      <td>41.902152</td>\n      <td>-87.754883</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>10140379</td>\n      <td>false</td>\n      <td>41.885610</td>\n      <td>-87.657009</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>10140868</td>\n      <td>false</td>\n      <td>41.679311</td>\n      <td>-87.644545</td>\n    </tr>\n    <tr>\n      <th>6</th>\n      <td>10139762</td>\n      <td>false</td>\n      <td>41.825501</td>\n      <td>-87.690578</td>\n    </tr>\n    <tr>\n      <th>7</th>\n      <td>10139722</td>\n      <td>true</td>\n      <td>41.857828</td>\n      <td>-87.715029</td>\n    </tr>\n    <tr>\n      <th>8</th>\n      <td>10139774</td>\n      <td>false</td>\n      <td>41.970100</td>\n      <td>-87.669324</td>\n    </tr>\n    <tr>\n      <th>9</th>\n      <td>10139697</td>\n      <td>false</td>\n      <td>41.787580</td>\n      <td>-87.685233</td>\n    </tr>\n  </tbody>\n</table>\n</div>",
112 |             "text/plain": "         ID Arrest   Latitude  Longitude\n0  10140490  false  41.973309 -87.800175\n1  10139776  false  42.008124 -87.659550\n2  10140270  false  41.878961  42.000000\n3  10139885  false  41.902152 -87.754883\n4  10140379  false  41.885610 -87.657009\n5  10140868  false  41.679311 -87.644545\n6  10139762  false  41.825501 -87.690578\n7  10139722   true  41.857828 -87.715029\n8  10139774  false  41.970100 -87.669324\n9  10139697  false  41.787580 -87.685233"
113 |           },
114 |           "metadata": {}
115 |         }
116 |       ]
117 |     },
118 |     {
119 |       "metadata": {},
120 |       "cell_type": "markdown",
121 |       "source": "As the result above shows, the missing _Latitude_ has been imputed with the `MEAN` value of the `Arrest=='false'` group, and the missing _Longitude_ has been imputed with `42`."
122 |     },
123 |     {
124 |       "metadata": {
125 |         "trusted": true
126 |       },
127 |       "cell_type": "code",
128 |       "source": "",
129 |       "execution_count": null,
130 |       "outputs": []
131 |     }
132 |   ],
133 |   "metadata": {
134 |     "kernelspec": {
135 |       "name": "python36",
136 |       "display_name": "Python 3.6",
137 |       "language": "python"
138 |     },
139 |     "language_info": {
140 |       "mimetype": "text/x-python",
141 |       "nbconvert_exporter": "python",
142 |       "name": "python",
143 |       "pygments_lexer": "ipython3",
144 |       "version": "3.6.6",
145 |       "file_extension": ".py",
146 |       "codemirror_mode": {
147 |         "version": 3,
148 |         "name": "ipython"
149 |       }
150 |     }
151 |   },
152 |   "nbformat": 4,
153 |   "nbformat_minor": 2
154 | }


--------------------------------------------------------------------------------
/0. Import librairie.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |   "cells": [
  3 |     {
  4 |       "metadata": {
  5 |         "trusted": true
  6 |       },
  7 |       "cell_type": "code",
  8 |       "source": "!pip install azureml-sdk",
  9 |       "execution_count": 1,
 10 |       "outputs": [
 11 |         {
 12 |           "output_type": "stream",
 13 |           "text": "Requirement already satisfied: azureml-sdk in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (1.0.2)\nRequirement already satisfied: azureml-pipeline==1.0.2.* in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-sdk) (1.0.2)\nRequirement already satisfied: azureml-train==1.0.2.* in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-sdk) (1.0.2)\nRequirement already satisfied: azureml-core==1.0.2.* in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-sdk) (1.0.2)\nRequirement already satisfied: azureml-pipeline-core==1.0.2.* in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-pipeline==1.0.2.*->azureml-sdk) (1.0.2)\nRequirement already satisfied: azureml-pipeline-steps==1.0.2.* in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-pipeline==1.0.2.*->azureml-sdk) (1.0.2)\nRequirement already satisfied: azureml-train-core==1.0.2.* in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-train==1.0.2.*->azureml-sdk) (1.0.2)\nRequirement already satisfied: pathspec in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (0.5.9)\nRequirement already satisfied: azure-storage-blob>=1.1.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (1.4.0)\nRequirement already satisfied: SecretStorage<3.0.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (2.3.1)\nRequirement already satisfied: azure-cli-profile>=2.0.26 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (2.1.2)\nRequirement already satisfied: azure-common>=1.1.12 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (1.1.16)\nRequirement already satisfied: msrestazure>=0.4.33 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from 
azureml-core==1.0.2.*->azureml-sdk) (0.5.1)\nRequirement already satisfied: docker in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (3.6.0)\nRequirement already satisfied: azure-mgmt-storage>=1.5.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (3.1.0)\nRequirement already satisfied: backports.tempfile in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (1.0)\nRequirement already satisfied: azure-graphrbac>=0.40.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (0.53.0)\nRequirement already satisfied: azure-mgmt-keyvault>=0.40.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (1.1.0)\nRequirement already satisfied: azure-cli-core>=2.0.38 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (2.0.52)\nRequirement already satisfied: azure-mgmt-containerregistry>=2.0.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (2.4.0)\nRequirement already satisfied: requests>=2.19.1 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (2.20.1)\nRequirement already satisfied: azure-mgmt-resource>=1.2.1 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (2.0.0)\nRequirement already satisfied: azure-mgmt-authorization>=0.40.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (0.51.1)\nRequirement already satisfied: pytz in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (2018.7)\nRequirement already satisfied: azure-storage-nspkg>=3.0.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) 
(3.1.0)\nRequirement already satisfied: ruamel.yaml<=0.15.51,>=0.15.35 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (0.15.51)\nRequirement already satisfied: ndg-httpsclient in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (0.5.1)\nRequirement already satisfied: azure-storage-common>=1.1.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (1.4.0)\nRequirement already satisfied: contextlib2 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (0.5.5)\nRequirement already satisfied: cryptography!=1.9,!=2.0.*,!=2.1.*,!=2.2.* in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (2.3.1)\nRequirement already satisfied: python-dateutil>=2.7.3 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (2.7.5)\nRequirement already satisfied: msrest>=0.5.1 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (0.6.2)\nRequirement already satisfied: PyJWT in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (1.7.1)\nRequirement already satisfied: urllib3<1.24,>=1.23 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (1.23)\nRequirement already satisfied: six>=1.11.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (1.11.0)\nRequirement already satisfied: jsonpickle in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (1.0)\nRequirement already satisfied: certifi in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-pipeline-steps==1.0.2.*->azureml-pipeline==1.0.2.*->azureml-sdk) (2018.10.15)\nRequirement already satisfied: 
azureml-telemetry==1.0.2.* in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-train-core==1.0.2.*->azureml-train==1.0.2.*->azureml-sdk) (1.0.2)\nRequirement already satisfied: azureml-train-restclients-hyperdrive==1.0.2.* in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-train-core==1.0.2.*->azureml-train==1.0.2.*->azureml-sdk) (1.0.2)\nRequirement already satisfied: azure-cli-command-modules-nspkg>=2.0.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azure-cli-profile>=2.0.26->azureml-core==1.0.2.*->azureml-sdk) (2.0.2)\nRequirement already satisfied: adal<2.0.0,>=0.6.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from msrestazure>=0.4.33->azureml-core==1.0.2.*->azureml-sdk) (1.2.0)\nRequirement already satisfied: docker-pycreds>=0.3.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from docker->azureml-core==1.0.2.*->azureml-sdk) (0.4.0)\nRequirement already satisfied: websocket-client>=0.32.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from docker->azureml-core==1.0.2.*->azureml-sdk) (0.54.0)\nRequirement already satisfied: backports.weakref in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from backports.tempfile->azureml-core==1.0.2.*->azureml-sdk) (1.0rc1)\nRequirement already satisfied: azure-mgmt-nspkg>=2.0.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azure-mgmt-keyvault>=0.40.0->azureml-core==1.0.2.*->azureml-sdk) (3.0.2)\nRequirement already satisfied: azure-cli-telemetry in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azure-cli-core>=2.0.38->azureml-core==1.0.2.*->azureml-sdk) (1.0.0)\nRequirement already satisfied: pyopenssl>=17.1.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azure-cli-core>=2.0.38->azureml-core==1.0.2.*->azureml-sdk) (18.0.0)\nRequirement already satisfied: pyyaml~=3.13 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from 
azure-cli-core>=2.0.38->azureml-core==1.0.2.*->azureml-sdk) (3.13)\nRequirement already satisfied: pip in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azure-cli-core>=2.0.38->azureml-core==1.0.2.*->azureml-sdk) (18.1)\nRequirement already satisfied: paramiko>=2.0.8 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azure-cli-core>=2.0.38->azureml-core==1.0.2.*->azureml-sdk) (2.4.2)\nRequirement already satisfied: jmespath in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azure-cli-core>=2.0.38->azureml-core==1.0.2.*->azureml-sdk) (0.9.3)\nRequirement already satisfied: argcomplete>=1.8.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azure-cli-core>=2.0.38->azureml-core==1.0.2.*->azureml-sdk) (1.9.4)\nRequirement already satisfied: pygments in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azure-cli-core>=2.0.38->azureml-core==1.0.2.*->azureml-sdk) (2.2.0)\nRequirement already satisfied: knack==0.5.1 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azure-cli-core>=2.0.38->azureml-core==1.0.2.*->azureml-sdk) (0.5.1)\nRequirement already satisfied: tabulate<=0.8.2,>=0.7.7 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azure-cli-core>=2.0.38->azureml-core==1.0.2.*->azureml-sdk) (0.8.2)\n",
 14 |           "name": "stdout"
 15 |         },
 16 |         {
 17 |           "output_type": "stream",
 18 |           "text": "Requirement already satisfied: antlr4-python3-runtime; python_version >= \"3.0\" in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azure-cli-core>=2.0.38->azureml-core==1.0.2.*->azureml-sdk) (4.7.1)\nRequirement already satisfied: colorama>=0.3.9 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azure-cli-core>=2.0.38->azureml-core==1.0.2.*->azureml-sdk) (0.3.9)\nRequirement already satisfied: azure-cli-nspkg>=2.0.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azure-cli-core>=2.0.38->azureml-core==1.0.2.*->azureml-sdk) (3.0.3)\nRequirement already satisfied: humanfriendly>=4.7 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azure-cli-core>=2.0.38->azureml-core==1.0.2.*->azureml-sdk) (4.17)\nRequirement already satisfied: wheel==0.30.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azure-cli-core>=2.0.38->azureml-core==1.0.2.*->azureml-sdk) (0.30.0)\nRequirement already satisfied: idna<2.8,>=2.5 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests>=2.19.1->azureml-core==1.0.2.*->azureml-sdk) (2.7)\nRequirement already satisfied: chardet<3.1.0,>=3.0.2 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests>=2.19.1->azureml-core==1.0.2.*->azureml-sdk) (3.0.4)\nRequirement already satisfied: azure-nspkg>=2.0.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azure-storage-nspkg>=3.0.0->azureml-core==1.0.2.*->azureml-sdk) (3.0.2)\nRequirement already satisfied: pyasn1>=0.1.1 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from ndg-httpsclient->azureml-core==1.0.2.*->azureml-sdk) (0.4.4)\nRequirement already satisfied: asn1crypto>=0.21.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from cryptography!=1.9,!=2.0.*,!=2.1.*,!=2.2.*->azureml-core==1.0.2.*->azureml-sdk) (0.24.0)\nRequirement already satisfied: cffi!=1.11.3,>=1.7 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from 
cryptography!=1.9,!=2.0.*,!=2.1.*,!=2.2.*->azureml-core==1.0.2.*->azureml-sdk) (1.11.5)\nRequirement already satisfied: requests-oauthlib>=0.5.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from msrest>=0.5.1->azureml-core==1.0.2.*->azureml-sdk) (1.0.0)\nRequirement already satisfied: isodate>=0.6.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from msrest>=0.5.1->azureml-core==1.0.2.*->azureml-sdk) (0.6.0)\nRequirement already satisfied: applicationinsights in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-telemetry==1.0.2.*->azureml-train-core==1.0.2.*->azureml-train==1.0.2.*->azureml-sdk) (0.11.7)\nRequirement already satisfied: portalocker==1.2.1 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azure-cli-telemetry->azure-cli-core>=2.0.38->azureml-core==1.0.2.*->azureml-sdk) (1.2.1)\nRequirement already satisfied: pynacl>=1.0.1 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from paramiko>=2.0.8->azure-cli-core>=2.0.38->azureml-core==1.0.2.*->azureml-sdk) (1.3.0)\nRequirement already satisfied: bcrypt>=3.1.3 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from paramiko>=2.0.8->azure-cli-core>=2.0.38->azureml-core==1.0.2.*->azureml-sdk) (3.1.4)\nRequirement already satisfied: pycparser in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from cffi!=1.11.3,>=1.7->cryptography!=1.9,!=2.0.*,!=2.1.*,!=2.2.*->azureml-core==1.0.2.*->azureml-sdk) (2.19)\nRequirement already satisfied: oauthlib>=0.6.2 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests-oauthlib>=0.5.0->msrest>=0.5.1->azureml-core==1.0.2.*->azureml-sdk) (2.1.0)\n",
 19 |           "name": "stdout"
 20 |         }
 21 |       ]
 22 |     },
 23 |     {
 24 |       "metadata": {
 25 |         "trusted": true
 26 |       },
 27 |       "cell_type": "code",
 28 |       "source": "!pip install --upgrade azureml-dataprep",
 29 |       "execution_count": 2,
 30 |       "outputs": [
 31 |         {
 32 |           "output_type": "stream",
 33 |           "text": "Requirement already up-to-date: azureml-dataprep in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (1.0.4)\nRequirement already satisfied, skipping upgrade: pandas>=0.19.2 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-dataprep) (0.22.0)\nRequirement already satisfied, skipping upgrade: numpy>=1.11.3 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-dataprep) (1.14.6)\nRequirement already satisfied, skipping upgrade: dotnetcore2==2.1.7 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-dataprep) (2.1.7)\nRequirement already satisfied, skipping upgrade: azureml-dataprep-native<12.0.0,>=11.2.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-dataprep) (11.2.0)\nRequirement already satisfied, skipping upgrade: python-dateutil>=2 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from pandas>=0.19.2->azureml-dataprep) (2.7.5)\nRequirement already satisfied, skipping upgrade: pytz>=2011k in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from pandas>=0.19.2->azureml-dataprep) (2018.7)\nRequirement already satisfied, skipping upgrade: distro>=1.2.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from dotnetcore2==2.1.7->azureml-dataprep) (1.3.0)\nRequirement already satisfied, skipping upgrade: six>=1.5 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from python-dateutil>=2->pandas>=0.19.2->azureml-dataprep) (1.11.0)\n",
 34 |           "name": "stdout"
 35 |         }
 36 |       ]
 37 |     },
 38 |     {
 39 |       "metadata": {
 40 |         "trusted": true
 41 |       },
 42 |       "cell_type": "code",
 43 |       "source": "import sys",
 44 |       "execution_count": 3,
 45 |       "outputs": []
 46 |     },
 47 |     {
 48 |       "metadata": {
 49 |         "trusted": true
 50 |       },
 51 |       "cell_type": "code",
 52 |       "source": "sys.version",
 53 |       "execution_count": 4,
 54 |       "outputs": [
 55 |         {
 56 |           "output_type": "execute_result",
 57 |           "execution_count": 4,
 58 |           "data": {
 59 |             "text/plain": "'3.6.6 |Anaconda, Inc.| (default, Jun 28 2018, 17:14:51) \\n[GCC 7.2.0]'"
 60 |           },
 61 |           "metadata": {}
 62 |         }
 63 |       ]
 64 |     },
 65 |     {
 66 |       "metadata": {
 67 |         "trusted": true
 68 |       },
 69 |       "cell_type": "code",
 70 |       "source": "import azureml.core\nprint(\"Version Azure ML service :\", azureml.core.VERSION)",
 71 |       "execution_count": 7,
 72 |       "outputs": [
 73 |         {
 74 |           "output_type": "stream",
 75 |           "text": "Version Azure ML service : 1.0.2\n",
 76 |           "name": "stdout"
 77 |         }
 78 |       ]
 79 |     },
 80 |     {
 81 |       "metadata": {
 82 |         "trusted": true
 83 |       },
 84 |       "cell_type": "code",
 85 |       "source": "",
 86 |       "execution_count": null,
 87 |       "outputs": []
 88 |     }
 89 |   ],
 90 |   "metadata": {
 91 |     "kernelspec": {
 92 |       "name": "python36",
 93 |       "display_name": "Python 3.6",
 94 |       "language": "python"
 95 |     },
 96 |     "language_info": {
 97 |       "mimetype": "text/x-python",
 98 |       "nbconvert_exporter": "python",
 99 |       "name": "python",
100 |       "pygments_lexer": "ipython3",
101 |       "version": "3.6.6",
102 |       "file_extension": ".py",
103 |       "codemirror_mode": {
104 |         "version": 3,
105 |         "name": "ipython"
106 |       }
107 |     }
108 |   },
109 |   "nbformat": 4,
110 |   "nbformat_minor": 2
111 | }


--------------------------------------------------------------------------------
/smart-read-file-separators.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |   "cells": [
  3 |     {
  4 |       "metadata": {},
  5 |       "cell_type": "markdown",
  6 |       "source": "# Smart Read File\nCopyright (c) Microsoft Corporation. All rights reserved.<br>\nLicensed under the MIT License."
  7 |     },
  8 |     {
  9 |       "metadata": {
 10 |         "trusted": true
 11 |       },
 12 |       "cell_type": "code",
 13 |       "source": "!pip install azureml",
 14 |       "execution_count": 1,
 15 |       "outputs": [
 16 |         {
 17 |           "output_type": "stream",
 18 |           "text": "Requirement already satisfied: azureml in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (0.2.7)\nRequirement already satisfied: python-dateutil in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml) (2.7.3)\nRequirement already satisfied: requests in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml) (2.19.1)\nRequirement already satisfied: pandas in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml) (0.22.0)\nRequirement already satisfied: six>=1.5 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from python-dateutil->azureml) (1.11.0)\nRequirement already satisfied: urllib3<1.24,>=1.21.1 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (1.23)\nRequirement already satisfied: idna<2.8,>=2.5 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (2.7)\nRequirement already satisfied: certifi>=2017.4.17 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (2018.8.24)\nRequirement already satisfied: chardet<3.1.0,>=3.0.2 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (3.0.4)\nRequirement already satisfied: pytz>=2011k in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from pandas->azureml) (2018.5)\nRequirement already satisfied: numpy>=1.9.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from pandas->azureml) (1.14.5)\n",
 19 |           "name": "stdout"
 20 |         }
 21 |       ]
 22 |     },
 23 |     {
 24 |       "metadata": {
 25 |         "trusted": true
 26 |       },
 27 |       "cell_type": "code",
 28 |       "source": "import azureml.dataprep as dprep",
 29 |       "execution_count": 2,
 30 |       "outputs": []
 31 |     },
 32 |     {
 33 |       "metadata": {},
 34 |       "cell_type": "markdown",
 35 |       "source": "DataPrep has the ability to load different kinds of text files. The `smart_read_file` entry point can take any text based file (including excel, json and parquet) and auto-detect how to parse the file. It will also attempt to auto-detect the types of each column and apply type transformations to the columns it detects.\n\nThe result will be a Dataflow object that has all the steps added that are required to read the given file(s) and convert their columns to the predicted types. No parameters are required beyond the file path or `FileDataSource` object."
 36 |     },
 37 |     {
 38 |       "metadata": {
 39 |         "trusted": true
 40 |       },
 41 |       "cell_type": "code",
 42 |       "source": "smart_dataflow = dprep.smart_read_file('./data/multiple_separators.csv')\nsmart_dataflow.head(10)",
 43 |       "execution_count": 3,
 44 |       "outputs": [
 45 |         {
 46 |           "output_type": "execute_result",
 47 |           "execution_count": 3,
 48 |           "data": {
 49 |             "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>ID</th>\n      <th>CaseNumber</th>\n      <th>Column3</th>\n      <th>Completed</th>\n      <th>Column5</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>10140490.0</td>\n      <td>HY329907</td>\n      <td></td>\n      <td>Y</td>\n      <td></td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>10139776.0</td>\n      <td>HY329265</td>\n      <td></td>\n      <td>Y</td>\n      <td></td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>10140270.0</td>\n      <td>HY329253</td>\n      <td></td>\n      <td>N</td>\n      <td></td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>10139885.0</td>\n      <td>HY329308</td>\n      <td></td>\n      <td>Y</td>\n      <td></td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>10140379.0</td>\n      <td>HY329556</td>\n      <td></td>\n      <td>N</td>\n      <td></td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>10140868.0</td>\n      <td>HY330421</td>\n      <td></td>\n      <td>N</td>\n      <td></td>\n    </tr>\n    <tr>\n      <th>6</th>\n      <td>10139762.0</td>\n      <td>HY329232</td>\n      <td></td>\n      <td>N</td>\n      <td></td>\n    </tr>\n    <tr>\n      <th>7</th>\n      <td>10139722.0</td>\n      <td>HY329228</td>\n      <td></td>\n      <td>Y</td>\n      <td></td>\n    </tr>\n    <tr>\n      <th>8</th>\n      <td>10139774.0</td>\n      <td>HY329209</td>\n      <td></td>\n      <td>N</td>\n      <td></td>\n    </tr>\n    <tr>\n      <th>9</th>\n      <td>10139697.0</td>\n      <td>HY329177</td>\n      <td></td>\n      <td>N</td>\n      <td></td>\n    </tr>\n  
</tbody>\n</table>\n</div>",
 50 |             "text/plain": "           ID CaseNumber Column3 Completed Column5\n0  10140490.0   HY329907                 Y        \n1  10139776.0   HY329265                 Y        \n2  10140270.0   HY329253                 N        \n3  10139885.0   HY329308                 Y        \n4  10140379.0   HY329556                 N        \n5  10140868.0   HY330421                 N        \n6  10139762.0   HY329232                 N        \n7  10139722.0   HY329228                 Y        \n8  10139774.0   HY329209                 N        \n9  10139697.0   HY329177                 N        "
 51 |           },
 52 |           "metadata": {}
 53 |         }
 54 |       ]
 55 |     },
 56 |     {
 57 |       "metadata": {},
 58 |       "cell_type": "markdown",
 59 |       "source": "Looking at the data, we can see that there are two empty columns either side of the 'Completed' column.\nIf we compare the dataframe to a few rows from the original file:\n```\nID |CaseNumber| |Completed|\n10140490 |HY329907| |Y|\n10139776 |HY329265| |Y|\n```\nWe can see that the `|`'s have disappeared in the dataframe. This is because `|` is a very common separator character in csv files, so `smart_read_file` guessed it was the column separator. For this data we actually want the `|`'s to remain and instead use space as the column separator.\n\nTo achieve this we can use `detect_file_format` which will take a file path or datasource object and give back a `FileFormatBuilder` which has learnt some information about the supplied data.\nThis is what `smart_read_file` is using behind the scenes to 'learn' the contents of the given file and determine how to parse it. With the `FileFormatBuilder` we can take advantage of the intelligent learning aspect of `smart_read_file` but have the chance to modify some of the learnt information."
 60 |     },
 61 |     {
 62 |       "metadata": {
 63 |         "trusted": true
 64 |       },
 65 |       "cell_type": "code",
 66 |       "source": "ffb = dprep.detect_file_format('./data/multiple_separators.csv')\nffb_2 = dprep.detect_file_format('./data/excel.xlsx')\nffb_3 = dprep.detect_file_format('./data/fixed_width_file.txt')\nffb_4 = dprep.detect_file_format('./data/json.json')\n\nprint(ffb.file_format)\nprint(ffb_2.file_format)\nprint(ffb_3.file_format)\nprint(type(ffb_4.file_format))",
 67 |       "execution_count": 4,
 68 |       "outputs": [
 69 |         {
 70 |           "output_type": "stream",
 71 |           "text": "ParseDelimitedProperties\n    separator: '|'\n    headers_mode: PromoteHeadersMode.CONSTANTGROUPED\n    encoding: FileEncoding.UTF8\n    quoting: False\n    skip_rows: 0\n    skip_mode: SkipMode.NONE\n    comment: None\n\nReadExcelProperties\n    sheet_name: None\n    use_headers: False\n    skip_rows: 0\n\nParseFixedWidthProperties\n    offsets: '[7, 13, 43, 46, 52, 58, 65, 73]'\n    headers_mode: PromoteHeadersMode.NONE\n    encoding: FileEncoding.UTF8\n    skip_rows: 0\n    skip_mode: SkipMode.NONE\n\n<class 'azureml.dataprep.api.parseproperties.ReadJsonProperties'>\n",
 72 |           "name": "stdout"
 73 |         }
 74 |       ]
 75 |     },
 76 |     {
 77 |       "metadata": {},
 78 |       "cell_type": "markdown",
 79 |       "source": "After calling `detect_file_format` we get a `FileFormatBuilder` that has had `learn` called on it. This means the `file_format` attribute will be populated with a `<Parse|Read><type>Properties` object, which contains all the information that was learnt about the file. As we can see above, different file types have corresponding file_formats detected. \nContinuing with our delimited example we can change any of these values and then call `ffb.to_dataflow()` to create a `Dataflow` that has the steps required to parse the datasource."
 80 |     },
 81 |     {
 82 |       "metadata": {
 83 |         "scrolled": true,
 84 |         "trusted": true
 85 |       },
 86 |       "cell_type": "code",
 87 |       "source": "ffb.file_format.separator = ' '\ndataflow = ffb.to_dataflow()\ndf = dataflow.to_pandas_dataframe()\ndf",
 88 |       "execution_count": 5,
 89 |       "outputs": [
 90 |         {
 91 |           "output_type": "execute_result",
 92 |           "execution_count": 5,
 93 |           "data": {
 94 |             "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>ID</th>\n      <th>|CaseNumber|</th>\n      <th>|Completed|</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>10140490</td>\n      <td>|HY329907|</td>\n      <td>|Y|</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>10139776</td>\n      <td>|HY329265|</td>\n      <td>|Y|</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>10140270</td>\n      <td>|HY329253|</td>\n      <td>|N|</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>10139885</td>\n      <td>|HY329308|</td>\n      <td>|Y|</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>10140379</td>\n      <td>|HY329556|</td>\n      <td>|N|</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>10140868</td>\n      <td>|HY330421|</td>\n      <td>|N|</td>\n    </tr>\n    <tr>\n      <th>6</th>\n      <td>10139762</td>\n      <td>|HY329232|</td>\n      <td>|N|</td>\n    </tr>\n    <tr>\n      <th>7</th>\n      <td>10139722</td>\n      <td>|HY329228|</td>\n      <td>|Y|</td>\n    </tr>\n    <tr>\n      <th>8</th>\n      <td>10139774</td>\n      <td>|HY329209|</td>\n      <td>|N|</td>\n    </tr>\n    <tr>\n      <th>9</th>\n      <td>10139697</td>\n      <td>|HY329177|</td>\n      <td>|N|</td>\n    </tr>\n  </tbody>\n</table>\n</div>",
 95 |             "text/plain": "         ID |CaseNumber| |Completed|\n0  10140490   |HY329907|         |Y|\n1  10139776   |HY329265|         |Y|\n2  10140270   |HY329253|         |N|\n3  10139885   |HY329308|         |Y|\n4  10140379   |HY329556|         |N|\n5  10140868   |HY330421|         |N|\n6  10139762   |HY329232|         |N|\n7  10139722   |HY329228|         |Y|\n8  10139774   |HY329209|         |N|\n9  10139697   |HY329177|         |N|"
 96 |           },
 97 |           "metadata": {}
 98 |         }
 99 |       ]
100 |     },
101 |     {
102 |       "metadata": {},
103 |       "cell_type": "markdown",
104 |       "source": "The result is our desired dataframe with `|`'s included.\n\nIf we refer back to the original data output by `smart_read_file`, the 'ID' column was also detected as numeric and converted to a number data type, instead of remaining a string like in the data above.\nWe can perform type inference on our new dataflow using the `dataflow.builders` property. This property exposes different builders that can `learn` from a dataflow and `apply` the learning to produce a new dataflow, very similar to the pattern we used above for the `FileFormatBuilder`."
105 |     },
106 |     {
107 |       "metadata": {
108 |         "trusted": true
109 |       },
110 |       "cell_type": "code",
111 |       "source": "ctb = dataflow.builders.set_column_types()\nctb.learn()\nctb.inference_info",
112 |       "execution_count": 6,
113 |       "outputs": [
114 |         {
115 |           "output_type": "execute_result",
116 |           "execution_count": 6,
117 |           "data": {
118 |             "text/plain": "{'|CaseNumber|': [FieldType.STRING],\n '|Completed|': [FieldType.STRING],\n 'ID': [FieldType.DECIMAL]}"
119 |           },
120 |           "metadata": {}
121 |         }
122 |       ]
123 |     },
124 |     {
125 |       "metadata": {},
126 |       "cell_type": "markdown",
127 |       "source": "After learning, `ctb.inference_info` has been populated with information about the inferred types for each column. It is possible for there to be multiple candidate types per column; in this example there is only one type for each column.\n\nThe candidates look correct: we only want to convert `ID` to be a number column (also known as `DECIMAL`), so applying this `ColumnTypesBuilder` should result in a Dataflow with our columns converted to their respective types."
128 |     },
129 |     {
130 |       "metadata": {
131 |         "trusted": true
132 |       },
133 |       "cell_type": "code",
134 |       "source": "converted_dataflow = ctb.to_dataflow()\nconverted_df = converted_dataflow.to_pandas_dataframe()\nconverted_df",
135 |       "execution_count": 7,
136 |       "outputs": [
137 |         {
138 |           "output_type": "execute_result",
139 |           "execution_count": 7,
140 |           "data": {
141 |             "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>ID</th>\n      <th>|CaseNumber|</th>\n      <th>|Completed|</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>10140490.0</td>\n      <td>|HY329907|</td>\n      <td>|Y|</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>10139776.0</td>\n      <td>|HY329265|</td>\n      <td>|Y|</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>10140270.0</td>\n      <td>|HY329253|</td>\n      <td>|N|</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>10139885.0</td>\n      <td>|HY329308|</td>\n      <td>|Y|</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>10140379.0</td>\n      <td>|HY329556|</td>\n      <td>|N|</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>10140868.0</td>\n      <td>|HY330421|</td>\n      <td>|N|</td>\n    </tr>\n    <tr>\n      <th>6</th>\n      <td>10139762.0</td>\n      <td>|HY329232|</td>\n      <td>|N|</td>\n    </tr>\n    <tr>\n      <th>7</th>\n      <td>10139722.0</td>\n      <td>|HY329228|</td>\n      <td>|Y|</td>\n    </tr>\n    <tr>\n      <th>8</th>\n      <td>10139774.0</td>\n      <td>|HY329209|</td>\n      <td>|N|</td>\n    </tr>\n    <tr>\n      <th>9</th>\n      <td>10139697.0</td>\n      <td>|HY329177|</td>\n      <td>|N|</td>\n    </tr>\n  </tbody>\n</table>\n</div>",
142 |             "text/plain": "           ID |CaseNumber| |Completed|\n0  10140490.0   |HY329907|         |Y|\n1  10139776.0   |HY329265|         |Y|\n2  10140270.0   |HY329253|         |N|\n3  10139885.0   |HY329308|         |Y|\n4  10140379.0   |HY329556|         |N|\n5  10140868.0   |HY330421|         |N|\n6  10139762.0   |HY329232|         |N|\n7  10139722.0   |HY329228|         |Y|\n8  10139774.0   |HY329209|         |N|\n9  10139697.0   |HY329177|         |N|"
143 |           },
144 |           "metadata": {}
145 |         }
146 |       ]
147 |     },
148 |     {
149 |       "metadata": {
150 |         "trusted": true
151 |       },
152 |       "cell_type": "code",
153 |       "source": "",
154 |       "execution_count": null,
155 |       "outputs": []
156 |     }
157 |   ],
158 |   "metadata": {
159 |     "kernelspec": {
160 |       "name": "python36",
161 |       "display_name": "Python 3.6",
162 |       "language": "python"
163 |     },
164 |     "language_info": {
165 |       "mimetype": "text/x-python",
166 |       "nbconvert_exporter": "python",
167 |       "name": "python",
168 |       "pygments_lexer": "ipython3",
169 |       "version": "3.6.6",
170 |       "file_extension": ".py",
171 |       "codemirror_mode": {
172 |         "version": 3,
173 |         "name": "ipython"
174 |       }
175 |     }
176 |   },
177 |   "nbformat": 4,
178 |   "nbformat_minor": 2
179 | }


--------------------------------------------------------------------------------
/caching.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |   "cells": [
  3 |     {
  4 |       "metadata": {},
  5 |       "cell_type": "markdown",
  6 |       "source": "# Caching\nCopyright (c) Microsoft Corporation. All rights reserved.<br>\nLicensed under the MIT License."
  7 |     },
  8 |     {
  9 |       "metadata": {},
 10 |       "cell_type": "markdown",
 11 |       "source": "A Dataflow can be cached into a file on disk during a local run by calling `df_cached = df.cache(directory_path)` on the Dataflow object `df`. Doing so, we will run all steps in `df` and save the cached data to the specified `directory_path`. The returned Dataflow `df_cached` has a Caching Step added at the end. Any run on Dataflow `df_cached` will reuse the cached data, and the steps in `df_cached` before the Caching Step will not be run again.\n\nCaching avoids running transforms multiple times, which can make local runs more efficient. Here are common places to use Caching:\n- after reading data from remote\n- after expensive transforms, such as Sort\n- after transforms that change the shape of data, such as Sampling, Filter and Summarize\n\nThe Caching Step will be ignored during a scale-out run invoked by `to_spark_dataframe()`."
 12 |     },
 13 |     {
 14 |       "metadata": {
 15 |         "trusted": true
 16 |       },
 17 |       "cell_type": "code",
 18 |       "source": "# read data and apply transforms\nimport azureml.dataprep as dprep\ndf = dprep.read_csv(path='https://dpreptestfiles.blob.core.windows.net/testfiles/read_csv_duplicate_headers.csv')\ndf = df.take_sample(probability=0.2, seed=7)\ndf = df.skip(1)\ndf = df.sort_asc(columns='schnam10')\ndf = df.keep_columns(['stnam', 'fipst', 'leaid', 'leanm10', 'ncessch', 'schnam10'])\ndf.head(5)",
 19 |       "execution_count": 6,
 20 |       "outputs": [
 21 |         {
 22 |           "output_type": "execute_result",
 23 |           "execution_count": 6,
 24 |           "data": {
 25 |             "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>stnam</th>\n      <th>fipst</th>\n      <th>leaid</th>\n      <th>leanm10</th>\n      <th>ncessch</th>\n      <th>schnam10</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>ALABAMA</td>\n      <td>1</td>\n      <td>102100</td>\n      <td>Limestone County</td>\n      <td>10210000797</td>\n      <td>Ardmore High Sch</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>ALABAMA</td>\n      <td>1</td>\n      <td>101920</td>\n      <td>Jefferson County</td>\n      <td>10192000691</td>\n      <td>Brighton Middle Sch</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>ALABAMA</td>\n      <td>1</td>\n      <td>101920</td>\n      <td>Jefferson County</td>\n      <td>10192000720</td>\n      <td>Bryan Elem Sch</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>ALABAMA</td>\n      <td>1</td>\n      <td>102010</td>\n      <td>Lauderdale County</td>\n      <td>10201000766</td>\n      <td>Cloverdale Jr High Sch</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>ALABAMA</td>\n      <td>1</td>\n      <td>101920</td>\n      <td>Jefferson County</td>\n      <td>10192000701</td>\n      <td>Erwin Elem Sch</td>\n    </tr>\n  </tbody>\n</table>\n</div>",
 26 |             "text/plain": "     stnam fipst   leaid            leanm10      ncessch  \\\n0  ALABAMA     1  102100   Limestone County  10210000797   \n1  ALABAMA     1  101920   Jefferson County  10192000691   \n2  ALABAMA     1  101920   Jefferson County  10192000720   \n3  ALABAMA     1  102010  Lauderdale County  10201000766   \n4  ALABAMA     1  101920   Jefferson County  10192000701   \n\n                 schnam10  \n0        Ardmore High Sch  \n1     Brighton Middle Sch  \n2          Bryan Elem Sch  \n3  Cloverdale Jr High Sch  \n4          Erwin Elem Sch  "
 27 |           },
 28 |           "metadata": {}
 29 |         }
 30 |       ]
 31 |     },
 32 |     {
 33 |       "metadata": {
 34 |         "trusted": true
 35 |       },
 36 |       "cell_type": "code",
 37 |       "source": "# choose a directory to store cache data\nimport os\nfrom pathlib import Path\ncache_dir = str(Path(os.getcwd(), 'dataflow-cache'))\ncache_dir",
 38 |       "execution_count": 7,
 39 |       "outputs": [
 40 |         {
 41 |           "output_type": "execute_result",
 42 |           "execution_count": 7,
 43 |           "data": {
 44 |             "text/plain": "'/home/nbuser/library/dataflow-cache'"
 45 |           },
 46 |           "metadata": {}
 47 |         }
 48 |       ]
 49 |     },
 50 |     {
 51 |       "metadata": {
 52 |         "trusted": true
 53 |       },
 54 |       "cell_type": "code",
 55 |       "source": "# choose a directory to store cache data\ncache_dir = str('dataflow-cache')\ncache_dir",
 56 |       "execution_count": 8,
 57 |       "outputs": [
 58 |         {
 59 |           "output_type": "execute_result",
 60 |           "execution_count": 8,
 61 |           "data": {
 62 |             "text/plain": "'dataflow-cache'"
 63 |           },
 64 |           "metadata": {}
 65 |         }
 66 |       ]
 67 |     },
 68 |     {
 69 |       "metadata": {
 70 |         "trusted": true
 71 |       },
 72 |       "cell_type": "code",
 73 |       "source": "# cache the dataflow\ndf_cached = df.cache(directory_path=cache_dir)",
 74 |       "execution_count": 9,
 75 |       "outputs": [
 76 |         {
 77 |           "output_type": "error",
 78 |           "ename": "ExecutionError",
 79 |           "evalue": "Cannot write cache. Please check if the specified cache folder exists.",
 80 |           "traceback": [
 81 |             "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
 82 |             "\u001b[0;31mExecutionError\u001b[0m                            Traceback (most recent call last)",
 83 |             "\u001b[0;32m<ipython-input-9-5f875f19e519>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0;31m# cache the dataflow\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mdf_cached\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcache\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdirectory_path\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcache_dir\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
 84 |             "\u001b[0;32m~/anaconda3_501/lib/python3.6/site-packages/azureml/dataprep/api/dataflow.py\u001b[0m in \u001b[0;36mcache\u001b[0;34m(self, directory_path)\u001b[0m\n\u001b[1;32m    982\u001b[0m             \u001b[0;34m'cachePath'\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mLocalDataSource\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdirectory_path\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0munderlying_value\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    983\u001b[0m         })\n\u001b[0;32m--> 984\u001b[0;31m         \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhead\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    985\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    986\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
 85 |             "\u001b[0;32m~/anaconda3_501/lib/python3.6/site-packages/azureml/dataprep/api/dataflow.py\u001b[0m in \u001b[0;36mhead\u001b[0;34m(self, count)\u001b[0m\n\u001b[1;32m    337\u001b[0m         \u001b[0;34m:\u001b[0m\u001b[0;32mreturn\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mA\u001b[0m \u001b[0mPandas\u001b[0m \u001b[0mDataframe\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    338\u001b[0m         \"\"\"\n\u001b[0;32m--> 339\u001b[0;31m         \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtake\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcount\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_pandas_dataframe\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mextended_types\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    340\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    341\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0mrun_local\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
 86 |             "\u001b[0;32m~/anaconda3_501/lib/python3.6/site-packages/azureml/dataprep/api/dataflow.py\u001b[0m in \u001b[0;36mto_pandas_dataframe\u001b[0;34m(self, extended_types, nulls_as_nan)\u001b[0m\n\u001b[1;32m    391\u001b[0m             self._engine_api.execute_anonymous_blocks(\n\u001b[1;32m    392\u001b[0m                 ExecuteAnonymousBlocksMessageArguments(blocks=steps_to_block_datas(dataflow_to_execute._steps),\n\u001b[0;32m--> 393\u001b[0;31m                                                        project_context=self._parent_package_path))\n\u001b[0m\u001b[1;32m    394\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    395\u001b[0m             \u001b[0mintermediate_files\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mp\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mp\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mintermediate_path\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mglob\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'part-*'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
 87 |             "\u001b[0;32m~/anaconda3_501/lib/python3.6/site-packages/azureml/dataprep/api/_aml_helper.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(op_code, message)\u001b[0m\n\u001b[1;32m     35\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mchanged\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     36\u001b[0m                 \u001b[0mengine_api_func\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate_environment_variable\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mchanged\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 37\u001b[0;31m             \u001b[0;32mreturn\u001b[0m \u001b[0msend_message_func\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mop_code\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmessage\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     38\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     39\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0mwrapper\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
 88 |             "\u001b[0;32m~/anaconda3_501/lib/python3.6/site-packages/azureml/dataprep/api/engineapi/api.py\u001b[0m in \u001b[0;36mexecute_anonymous_blocks\u001b[0;34m(self, message_args)\u001b[0m\n\u001b[1;32m     54\u001b[0m     \u001b[0;34m@\u001b[0m\u001b[0mupdate_aml_env_vars\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mget_engine_api\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     55\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0mexecute_anonymous_blocks\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmessage_args\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mtypedefinitions\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mExecuteAnonymousBlocksMessageArguments\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 56\u001b[0;31m         \u001b[0mresponse\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend_message\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Engine.ExecuteActivity'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmessage_args\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     57\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0mresponse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     58\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
 89 |             "\u001b[0;32m~/anaconda3_501/lib/python3.6/site-packages/azureml/dataprep/api/engineapi/engine.py\u001b[0m in \u001b[0;36msend_message\u001b[0;34m(self, op_code, message)\u001b[0m\n\u001b[1;32m     61\u001b[0m             \u001b[0mresponse\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_read_response\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     62\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0;34m'error'\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mresponse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 63\u001b[0;31m                 \u001b[0mraise_engine_error\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresponse\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'error'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     64\u001b[0m             \u001b[0;32melif\u001b[0m \u001b[0mresponse\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'id'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mmessage_id\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     65\u001b[0m                 \u001b[0;32mreturn\u001b[0m \u001b[0mresponse\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'result'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
 90 |             "\u001b[0;32m~/anaconda3_501/lib/python3.6/site-packages/azureml/dataprep/api/errorhandlers.py\u001b[0m in \u001b[0;36mraise_engine_error\u001b[0;34m(error_response)\u001b[0m\n\u001b[1;32m     67\u001b[0m     \u001b[0merror_code\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0merror_response\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'errorCode'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     68\u001b[0m     \u001b[0;32mif\u001b[0m \u001b[0;34m'ActivityExecutionFailed'\u001b[0m \u001b[0;32min\u001b[0m \u001b[0merror_code\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 69\u001b[0;31m         \u001b[0;32mraise\u001b[0m \u001b[0mExecutionError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0merror_response\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     70\u001b[0m     \u001b[0;32melif\u001b[0m \u001b[0;34m'UnableToPreviewDataSource'\u001b[0m \u001b[0;32min\u001b[0m \u001b[0merror_code\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     71\u001b[0m         \u001b[0;32mraise\u001b[0m \u001b[0mPreviewDataSourceError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0merror_response\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
 91 |             "\u001b[0;31mExecutionError\u001b[0m: Cannot write cache. Please check if the specified cache folder exists."
 92 |           ]
 93 |         }
 94 |       ]
 95 |     },
 96 |     {
 97 |       "metadata": {
 98 |         "trusted": true
 99 |       },
100 |       "cell_type": "code",
101 |       "source": "# check steps in df_cached\n[s.step_type for s in df_cached.get_steps()]",
102 |       "execution_count": null,
103 |       "outputs": []
104 |     },
105 |     {
106 |       "metadata": {
107 |         "trusted": false
108 |       },
109 |       "cell_type": "code",
110 |       "source": "# check the stored cache data\nos.listdir(cache_dir)",
111 |       "execution_count": 5,
112 |       "outputs": [
113 |         {
114 |           "data": {
115 |             "text/plain": "['7acc00d7-8e69-471d-b74d-085d0625cd9b.cacheIndex',\n '86e51582-fa4f-4b9e-8e45-439692d0da02']"
116 |           },
117 |           "execution_count": 5,
118 |           "metadata": {},
119 |           "output_type": "execute_result"
120 |         }
121 |       ]
122 |     },
123 |     {
124 |       "metadata": {
125 |         "trusted": false
126 |       },
127 |       "cell_type": "code",
128 |       "source": "# run against df_cached will reuse the cache data and skip running all the previous steps again\ndf_cached.head(5)",
129 |       "execution_count": 6,
130 |       "outputs": [
131 |         {
132 |           "data": {
133 |             "text/html": "<div>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>stnam</th>\n      <th>fipst</th>\n      <th>leaid</th>\n      <th>leanm10</th>\n      <th>ncessch</th>\n      <th>schnam10</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>ALABAMA</td>\n      <td>1</td>\n      <td>102100</td>\n      <td>Limestone County</td>\n      <td>10210000797</td>\n      <td>Ardmore High Sch</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>ALABAMA</td>\n      <td>1</td>\n      <td>101920</td>\n      <td>Jefferson County</td>\n      <td>10192000691</td>\n      <td>Brighton Middle Sch</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>ALABAMA</td>\n      <td>1</td>\n      <td>101920</td>\n      <td>Jefferson County</td>\n      <td>10192000720</td>\n      <td>Bryan Elem Sch</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>ALABAMA</td>\n      <td>1</td>\n      <td>102010</td>\n      <td>Lauderdale County</td>\n      <td>10201000766</td>\n      <td>Cloverdale Jr High Sch</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>ALABAMA</td>\n      <td>1</td>\n      <td>101920</td>\n      <td>Jefferson County</td>\n      <td>10192000701</td>\n      <td>Erwin Elem Sch</td>\n    </tr>\n  </tbody>\n</table>\n</div>",
134 |             "text/plain": "     stnam fipst   leaid            leanm10      ncessch  \\\n0  ALABAMA     1  102100   Limestone County  10210000797   \n1  ALABAMA     1  101920   Jefferson County  10192000691   \n2  ALABAMA     1  101920   Jefferson County  10192000720   \n3  ALABAMA     1  102010  Lauderdale County  10201000766   \n4  ALABAMA     1  101920   Jefferson County  10192000701   \n\n                 schnam10  \n0        Ardmore High Sch  \n1     Brighton Middle Sch  \n2          Bryan Elem Sch  \n3  Cloverdale Jr High Sch  \n4          Erwin Elem Sch  "
135 |           },
136 |           "execution_count": 6,
137 |           "metadata": {},
138 |           "output_type": "execute_result"
139 |         }
140 |       ]
141 |     },
142 |     {
143 |       "metadata": {
144 |         "trusted": false
145 |       },
146 |       "cell_type": "code",
147 |       "source": "df1 = df_cached.take(10)\ndf2 = df_cached.skip(10).take(10)\n\n# run against df1 and df2 will reuse the cache data as well\ndataframe1 = df1.to_pandas_dataframe()\ndataframe2 = df2.to_pandas_dataframe()",
148 |       "execution_count": 7,
149 |       "outputs": []
150 |     },
151 |     {
152 |       "metadata": {
153 |         "trusted": false
154 |       },
155 |       "cell_type": "code",
156 |       "source": "# clean up cache data\nimport shutil\nshutil.rmtree(path=cache_dir)",
157 |       "execution_count": 8,
158 |       "outputs": []
159 |     }
160 |   ],
161 |   "metadata": {
162 |     "kernelspec": {
163 |       "name": "python36",
164 |       "display_name": "Python 3.6",
165 |       "language": "python"
166 |     },
167 |     "language_info": {
168 |       "mimetype": "text/x-python",
169 |       "nbconvert_exporter": "python",
170 |       "name": "python",
171 |       "pygments_lexer": "ipython3",
172 |       "version": "3.6.6",
173 |       "file_extension": ".py",
174 |       "codemirror_mode": {
175 |         "version": 3,
176 |         "name": "ipython"
177 |       }
178 |     }
179 |   },
180 |   "nbformat": 4,
181 |   "nbformat_minor": 2
182 | }


--------------------------------------------------------------------------------
/join.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |   "cells": [
  3 |     {
  4 |       "metadata": {},
  5 |       "cell_type": "markdown",
  6 |       "source": "# Join\nCopyright (c) Microsoft Corporation. All rights reserved.<br>\nLicensed under the MIT License.<br>\n\nIn DataPrep you can easily join two dataflows."
  7 |     },
  8 |     {
  9 |       "metadata": {
 10 |         "trusted": true
 11 |       },
 12 |       "cell_type": "code",
 13 |       "source": "!pip install azureml",
 14 |       "execution_count": 1,
 15 |       "outputs": [
 16 |         {
 17 |           "output_type": "stream",
 18 |           "text": "Requirement already satisfied: azureml in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (0.2.7)\nRequirement already satisfied: pandas in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml) (0.22.0)\nRequirement already satisfied: python-dateutil in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml) (2.7.5)\nRequirement already satisfied: requests in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml) (2.20.1)\nRequirement already satisfied: pytz>=2011k in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from pandas->azureml) (2018.7)\nRequirement already satisfied: numpy>=1.9.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from pandas->azureml) (1.14.6)\nRequirement already satisfied: six>=1.5 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from python-dateutil->azureml) (1.11.0)\nRequirement already satisfied: urllib3<1.25,>=1.21.1 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (1.23)\nRequirement already satisfied: chardet<3.1.0,>=3.0.2 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (3.0.4)\nRequirement already satisfied: idna<2.8,>=2.5 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (2.7)\nRequirement already satisfied: certifi>=2017.4.17 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (2018.10.15)\n",
 19 |           "name": "stdout"
 20 |         }
 21 |       ]
 22 |     },
 23 |     {
 24 |       "metadata": {
 25 |         "trusted": true
 26 |       },
 27 |       "cell_type": "code",
 28 |       "source": "import azureml.dataprep as dprep",
 29 |       "execution_count": 2,
 30 |       "outputs": []
 31 |     },
 32 |     {
 33 |       "metadata": {},
 34 |       "cell_type": "markdown",
 35 |       "source": "First let's get the left side of the data into a shape that is ready for the join."
 36 |     },
 37 |     {
 38 |       "metadata": {
 39 |         "scrolled": false,
 40 |         "trusted": true
 41 |       },
 42 |       "cell_type": "code",
 43 |       "source": "# get the first dataflow and derive desired key column\ndataflow_l = dprep.read_csv(path='https://dpreptestfiles.blob.core.windows.net/testfiles/BostonWeather.csv')\ndataflow_l = dataflow_l.derive_column_by_example(source_columns='DATE', new_column_name='date_timerange',\n                                                 example_data=[('11/11/2015 0:54', 'Nov 11, 2015 | 12AM-2AM'),\n                                                              ('2/1/2015 0:54', 'Feb 1, 2015 | 12AM-2AM'),\n                                                              ('1/29/2015 20:54', 'Jan 29, 2015 | 8PM-10PM')])\ndataflow_l = dataflow_l.drop_columns(['DATE'])\n\n# convert types and summarize data\ndataflow_l = dataflow_l.set_column_types(type_conversions={'HOURLYDRYBULBTEMPF': dprep.TypeConverter(dprep.FieldType.DECIMAL)})\ndataflow_l = dataflow_l.filter(expression=dprep.f_not(dprep.col('HOURLYDRYBULBTEMPF').is_error()))\ndataflow_l = dataflow_l.summarize(group_by_columns=['date_timerange'],summary_columns=[dprep.SummaryColumnsValue('HOURLYDRYBULBTEMPF', dprep.api.engineapi.typedefinitions.SummaryFunction.MEAN, 'HOURLYDRYBULBTEMPF_Mean')] )\n\n# cache the result so the steps above are not executed every time we pull on the data\nimport os\nfrom pathlib import Path\ncache_dir = str(Path(os.getcwd(), 'dataflow-cache'))\ndataflow_l.cache(directory_path=cache_dir)\ndataflow_l.head(10)",
 44 |       "execution_count": 3,
 45 |       "outputs": [
 46 |         {
 47 |           "output_type": "error",
 48 |           "ename": "ExecutionError",
 49 |           "evalue": "Cannot write cache. Please check if the specified cache folder exists.",
 50 |           "traceback": [
 51 |             "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
 52 |             "\u001b[0;31mExecutionError\u001b[0m                            Traceback (most recent call last)",
 53 |             "\u001b[0;32m<ipython-input-3-314bcf79d8e4>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m     16\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mpathlib\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mPath\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     17\u001b[0m \u001b[0mcache_dir\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mPath\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgetcwd\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'dataflow-cache'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 18\u001b[0;31m \u001b[0mdataflow_l\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcache\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdirectory_path\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcache_dir\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     19\u001b[0m \u001b[0mdataflow_l\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhead\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m10\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
 54 |             "\u001b[0;32m~/anaconda3_501/lib/python3.6/site-packages/azureml/dataprep/api/dataflow.py\u001b[0m in \u001b[0;36mcache\u001b[0;34m(self, directory_path)\u001b[0m\n\u001b[1;32m    982\u001b[0m             \u001b[0;34m'cachePath'\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mLocalDataSource\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdirectory_path\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0munderlying_value\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    983\u001b[0m         })\n\u001b[0;32m--> 984\u001b[0;31m         \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhead\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    985\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    986\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
 55 |             "\u001b[0;32m~/anaconda3_501/lib/python3.6/site-packages/azureml/dataprep/api/dataflow.py\u001b[0m in \u001b[0;36mhead\u001b[0;34m(self, count)\u001b[0m\n\u001b[1;32m    337\u001b[0m         \u001b[0;34m:\u001b[0m\u001b[0;32mreturn\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mA\u001b[0m \u001b[0mPandas\u001b[0m \u001b[0mDataframe\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    338\u001b[0m         \"\"\"\n\u001b[0;32m--> 339\u001b[0;31m         \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtake\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcount\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_pandas_dataframe\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mextended_types\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    340\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    341\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0mrun_local\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
 56 |             "\u001b[0;32m~/anaconda3_501/lib/python3.6/site-packages/azureml/dataprep/api/dataflow.py\u001b[0m in \u001b[0;36mto_pandas_dataframe\u001b[0;34m(self, extended_types, nulls_as_nan)\u001b[0m\n\u001b[1;32m    391\u001b[0m             self._engine_api.execute_anonymous_blocks(\n\u001b[1;32m    392\u001b[0m                 ExecuteAnonymousBlocksMessageArguments(blocks=steps_to_block_datas(dataflow_to_execute._steps),\n\u001b[0;32m--> 393\u001b[0;31m                                                        project_context=self._parent_package_path))\n\u001b[0m\u001b[1;32m    394\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    395\u001b[0m             \u001b[0mintermediate_files\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mp\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mp\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mintermediate_path\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mglob\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'part-*'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
 57 |             "\u001b[0;32m~/anaconda3_501/lib/python3.6/site-packages/azureml/dataprep/api/_aml_helper.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(op_code, message)\u001b[0m\n\u001b[1;32m     35\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mchanged\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     36\u001b[0m                 \u001b[0mengine_api_func\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate_environment_variable\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mchanged\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 37\u001b[0;31m             \u001b[0;32mreturn\u001b[0m \u001b[0msend_message_func\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mop_code\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmessage\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     38\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     39\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0mwrapper\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
 58 |             "\u001b[0;32m~/anaconda3_501/lib/python3.6/site-packages/azureml/dataprep/api/engineapi/api.py\u001b[0m in \u001b[0;36mexecute_anonymous_blocks\u001b[0;34m(self, message_args)\u001b[0m\n\u001b[1;32m     54\u001b[0m     \u001b[0;34m@\u001b[0m\u001b[0mupdate_aml_env_vars\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mget_engine_api\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     55\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0mexecute_anonymous_blocks\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmessage_args\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mtypedefinitions\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mExecuteAnonymousBlocksMessageArguments\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 56\u001b[0;31m         \u001b[0mresponse\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend_message\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Engine.ExecuteActivity'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmessage_args\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     57\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0mresponse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     58\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
 59 |             "\u001b[0;32m~/anaconda3_501/lib/python3.6/site-packages/azureml/dataprep/api/engineapi/engine.py\u001b[0m in \u001b[0;36msend_message\u001b[0;34m(self, op_code, message)\u001b[0m\n\u001b[1;32m     61\u001b[0m             \u001b[0mresponse\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_read_response\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     62\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0;34m'error'\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mresponse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 63\u001b[0;31m                 \u001b[0mraise_engine_error\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresponse\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'error'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     64\u001b[0m             \u001b[0;32melif\u001b[0m \u001b[0mresponse\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'id'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mmessage_id\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     65\u001b[0m                 \u001b[0;32mreturn\u001b[0m \u001b[0mresponse\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'result'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
 60 |             "\u001b[0;32m~/anaconda3_501/lib/python3.6/site-packages/azureml/dataprep/api/errorhandlers.py\u001b[0m in \u001b[0;36mraise_engine_error\u001b[0;34m(error_response)\u001b[0m\n\u001b[1;32m     67\u001b[0m     \u001b[0merror_code\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0merror_response\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'errorCode'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     68\u001b[0m     \u001b[0;32mif\u001b[0m \u001b[0;34m'ActivityExecutionFailed'\u001b[0m \u001b[0;32min\u001b[0m \u001b[0merror_code\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 69\u001b[0;31m         \u001b[0;32mraise\u001b[0m \u001b[0mExecutionError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0merror_response\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     70\u001b[0m     \u001b[0;32melif\u001b[0m \u001b[0;34m'UnableToPreviewDataSource'\u001b[0m \u001b[0;32min\u001b[0m \u001b[0merror_code\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     71\u001b[0m         \u001b[0;32mraise\u001b[0m \u001b[0mPreviewDataSourceError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0merror_response\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
 61 |             "\u001b[0;31mExecutionError\u001b[0m: Cannot write cache. Please check if the specified cache folder exists."
 62 |           ]
 63 |         }
 64 |       ]
 65 |     },
 66 |     {
 67 |       "metadata": {},
 68 |       "cell_type": "markdown",
 69 |       "source": "Now let's prepare the data for the right side of the join."
 70 |     },
 71 |     {
 72 |       "metadata": {
 73 |         "scrolled": false,
 74 |         "trusted": true
 75 |       },
 76 |       "cell_type": "code",
 77 |       "source": "# get the second dataflow and desired key column\ndataflow_r = dprep.read_csv(path='https://dpreptestfiles.blob.core.windows.net/bike-share/*-hubway-tripdata.csv')\ndataflow_r = dataflow_r.keep_columns(['starttime', 'start station id'])\ndataflow_r = dataflow_r.derive_column_by_example(source_columns='starttime', new_column_name='l_date_timerange',\n                                                 example_data=[('2015-01-01 00:21:44', 'Jan 1, 2015 | 12AM-2AM')])\ndataflow_r = dataflow_r.drop_columns('starttime')\n\n# cache the results\ndataflow_r.cache(directory_path=cache_dir)\ndataflow_r.head(10)",
 78 |       "execution_count": null,
 79 |       "outputs": []
 80 |     },
 81 |     {
 82 |       "metadata": {},
 83 |       "cell_type": "markdown",
 84 |       "source": "There are three ways one can join two dataflows in DataPrep:\n1. Create `JoinBuilder` object for interactive join configuration.\n2. Call ```join()``` on one of the dataflows and pass in the other along with all other arguments.\n3. Call ```Dataflow.join()``` method and pass in two dataflows along with all other arguments.\n\nWe will explore the builder object as it simplifies the determination of correct arguments. "
 85 |     },
 86 |     {
 87 |       "metadata": {
 88 |         "trusted": true
 89 |       },
 90 |       "cell_type": "code",
 91 |       "source": "# construct a builder for joining dataflow_l with dataflow_r\njoin_builder = dataflow_l.builders.join(right_dataflow=dataflow_r, left_column_prefix='l', right_column_prefix='r')\n\njoin_builder",
 92 |       "execution_count": null,
 93 |       "outputs": []
 94 |     },
 95 |     {
 96 |       "metadata": {},
 97 |       "cell_type": "markdown",
 98 |       "source": "As you can see, so far the builder has no properties set except default values.\nFrom here you could set each of the options and preview its effect on the join result or use DataPrep to determine some of them.\n\nLet's start with determining appropriate column prefixes for the left and right sides of the join and lists of columns that would not conflict and therefore don't need to be prefixed.  "
 99 |     },
100 |     {
101 |       "metadata": {
102 |         "scrolled": true,
103 |         "trusted": true
104 |       },
105 |       "cell_type": "code",
106 |       "source": "join_builder.detect_column_info()\njoin_builder",
107 |       "execution_count": null,
108 |       "outputs": []
109 |     },
110 |     {
111 |       "metadata": {},
112 |       "cell_type": "markdown",
113 |       "source": "You can see that DataPrep has performed a pull on both dataflows to determine the column names in them. Given that `dataflow_r` already had a column starting with `l_`, a new prefix was generated which would not collide with any column names that are already present.\nAdditionally, columns in each dataflow that won't conflict during the join would remain unprefixed.\nThis approach to column naming is crucial for join robustness to schema changes in the data. Let's say that at some point in the future the data consumed by the left dataflow will also have an `l_date_timerange` column in it.\nConfigured as above, the join will still run as expected and the new column will be prefixed with `l2_`, ensuring that if column `l_date_timerange` was consumed by some other future transformation it remains unaffected.\n\nNote: `KEY_generated` is appended to both lists and is reserved for Dataprep use in case Autojoin is performed.\n\n### Autojoin\nAutojoin is a Dataprep feature that determines suitable join arguments given data on both sides. In some cases Autojoin can even derive a key column from a number of available columns in the data.\nHere is how you can use Autojoin:"
114 |     },
115 |     {
116 |       "metadata": {
117 |         "trusted": true
118 |       },
119 |       "cell_type": "code",
120 |       "source": "# generate join suggestions\njoin_builder.generate_suggested_join()\n\n# list generated suggestions\njoin_builder.list_join_suggestions()",
121 |       "execution_count": null,
122 |       "outputs": []
123 |     },
124 |     {
125 |       "metadata": {},
126 |       "cell_type": "markdown",
127 |       "source": "Now let's select the first suggestion and preview the result of the join."
128 |     },
129 |     {
130 |       "metadata": {
131 |         "trusted": true
132 |       },
133 |       "cell_type": "code",
134 |       "source": "# apply first suggestion\njoin_builder.apply_suggestion(0)\n\njoin_builder.preview(10)",
135 |       "execution_count": null,
136 |       "outputs": []
137 |     },
138 |     {
139 |       "metadata": {},
140 |       "cell_type": "markdown",
141 |       "source": "Everything looks just as we would expect, so it is time to get our new joined dataflow."
142 |     },
143 |     {
144 |       "metadata": {
145 |         "trusted": true
146 |       },
147 |       "cell_type": "code",
148 |       "source": "dataflow_autojoined = join_builder.to_dataflow().drop_columns(['l_date_timerange'])",
149 |       "execution_count": null,
150 |       "outputs": []
151 |     },
152 |     {
153 |       "metadata": {},
154 |       "cell_type": "markdown",
155 |       "source": "### Joining two dataflows without pulling the data\n\nIf you don't want to pull on data and already know what the join should look like, you can always use the `join` method on Dataflow."
156 |     },
157 |     {
158 |       "metadata": {
159 |         "trusted": true
160 |       },
161 |       "cell_type": "code",
162 |       "source": "dataflow_joined = dprep.Dataflow.join(left_dataflow=dataflow_l,\n                                      right_dataflow=dataflow_r,\n                                      join_key_pairs=[('date_timerange', 'l_date_timerange')],\n                                      left_column_prefix='l2_',\n                                      right_column_prefix='r_')\n",
163 |       "execution_count": null,
164 |       "outputs": []
165 |     },
166 |     {
167 |       "metadata": {
168 |         "trusted": true
169 |       },
170 |       "cell_type": "code",
171 |       "source": "dataflow_joined.head(10)",
172 |       "execution_count": null,
173 |       "outputs": []
174 |     },
175 |     {
176 |       "metadata": {
177 |         "trusted": true
178 |       },
179 |       "cell_type": "code",
180 |       "source": "dataflow_joined = dataflow_joined.filter(expression=dprep.col('r_start station id') == '67')\ndf = dataflow_joined.to_pandas_dataframe()\ndf",
181 |       "execution_count": null,
182 |       "outputs": []
183 |     },
184 |     {
185 |       "metadata": {
186 |         "trusted": true
187 |       },
188 |       "cell_type": "code",
189 |       "source": "",
190 |       "execution_count": null,
191 |       "outputs": []
192 |     },
193 |     {
194 |       "metadata": {
195 |         "trusted": true
196 |       },
197 |       "cell_type": "code",
198 |       "source": "",
199 |       "execution_count": null,
200 |       "outputs": []
201 |     },
202 |     {
203 |       "metadata": {
204 |         "trusted": true
205 |       },
206 |       "cell_type": "code",
207 |       "source": "",
208 |       "execution_count": null,
209 |       "outputs": []
210 |     },
211 |     {
212 |       "metadata": {
213 |         "trusted": true
214 |       },
215 |       "cell_type": "code",
216 |       "source": "",
217 |       "execution_count": null,
218 |       "outputs": []
219 |     }
220 |   ],
221 |   "metadata": {
222 |     "kernelspec": {
223 |       "name": "python36",
224 |       "display_name": "Python 3.6",
225 |       "language": "python"
226 |     },
227 |     "language_info": {
228 |       "mimetype": "text/x-python",
229 |       "nbconvert_exporter": "python",
230 |       "name": "python",
231 |       "pygments_lexer": "ipython3",
232 |       "version": "3.6.6",
233 |       "file_extension": ".py",
234 |       "codemirror_mode": {
235 |         "version": 3,
236 |         "name": "ipython"
237 |       }
238 |     }
239 |   },
240 |   "nbformat": 4,
241 |   "nbformat_minor": 2
242 | }


--------------------------------------------------------------------------------
/split-column-by-example.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |   "cells": [
  3 |     {
  4 |       "metadata": {},
  5 |       "cell_type": "markdown",
  6 |       "source": "# Split column by example\nCopyright (c) Microsoft Corporation. All rights reserved.<br>\nLicensed under the MIT License."
  7 |     },
  8 |     {
  9 |       "metadata": {},
 10 |       "cell_type": "markdown",
 11 |       "source": "DataPrep also offers you a way to easily split a column into multiple columns.\nThe SplitColumnByExampleBuilder class lets you generate a proper split program that will work even when the cases are not trivial, like in example below."
 12 |     },
 13 |     {
 14 |       "metadata": {
 15 |         "trusted": true
 16 |       },
 17 |       "cell_type": "code",
 18 |       "source": "!pip install azureml",
 19 |       "execution_count": 1,
 20 |       "outputs": [
 21 |         {
 22 |           "output_type": "stream",
 23 |           "text": "Requirement already satisfied: azureml in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (0.2.7)\nRequirement already satisfied: requests in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml) (2.19.1)\nRequirement already satisfied: pandas in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml) (0.22.0)\nRequirement already satisfied: python-dateutil in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml) (2.7.3)\nRequirement already satisfied: urllib3<1.24,>=1.21.1 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (1.23)\nRequirement already satisfied: certifi>=2017.4.17 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (2018.8.24)\nRequirement already satisfied: chardet<3.1.0,>=3.0.2 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (3.0.4)\nRequirement already satisfied: idna<2.8,>=2.5 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (2.7)\nRequirement already satisfied: pytz>=2011k in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from pandas->azureml) (2018.5)\nRequirement already satisfied: numpy>=1.9.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from pandas->azureml) (1.14.5)\nRequirement already satisfied: six>=1.5 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from python-dateutil->azureml) (1.11.0)\n",
 24 |           "name": "stdout"
 25 |         }
 26 |       ]
 27 |     },
 28 |     {
 29 |       "metadata": {
 30 |         "trusted": true
 31 |       },
 32 |       "cell_type": "code",
 33 |       "source": "import azureml.dataprep as dprep",
 34 |       "execution_count": 3,
 35 |       "outputs": []
 36 |     },
 37 |     {
 38 |       "metadata": {
 39 |         "scrolled": true,
 40 |         "trusted": true
 41 |       },
 42 |       "cell_type": "code",
 43 |       "source": "dataflow = dprep.read_lines(path='https://dpreptestfiles.blob.core.windows.net/testfiles/sample.log')\ndf = dataflow.head(10)",
 44 |       "execution_count": 4,
 45 |       "outputs": []
 46 |     },
 47 |     {
 48 |       "metadata": {
 49 |         "trusted": true
 50 |       },
 51 |       "cell_type": "code",
 52 |       "source": "df['Line'].iloc[0]",
 53 |       "execution_count": 5,
 54 |       "outputs": [
 55 |         {
 56 |           "output_type": "execute_result",
 57 |           "execution_count": 5,
 58 |           "data": {
 59 |             "text/plain": "'2012-02-03 18:35:34 SampleClass6 [INFO] everything normal for id 577725851'"
 60 |           },
 61 |           "metadata": {}
 62 |         }
 63 |       ]
 64 |     },
 65 |     {
 66 |       "metadata": {},
 67 |       "cell_type": "markdown",
 68 |       "source": "As you can see above, you can't split this particular log file by the space character, as it will create too many columns and, even worse, the number of columns will depend on a string in the 6th column.\nThat's where split_column_by_example could be quite useful."
 69 |     },
 70 |     {
 71 |       "metadata": {
 72 |         "trusted": true
 73 |       },
 74 |       "cell_type": "code",
 75 |       "source": "b = dataflow.builders.split_column_by_example('Line', keep_delimiters=True)",
 76 |       "execution_count": 6,
 77 |       "outputs": []
 78 |     },
 79 |     {
 80 |       "metadata": {
 81 |         "scrolled": false,
 82 |         "trusted": true
 83 |       },
 84 |       "cell_type": "code",
 85 |       "source": "b.preview()",
 86 |       "execution_count": 7,
 87 |       "outputs": [
 88 |         {
 89 |           "output_type": "execute_result",
 90 |           "execution_count": 7,
 91 |           "data": {
 92 |             "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>Line</th>\n      <th>Line_1</th>\n      <th>Line_2</th>\n      <th>Line_3</th>\n      <th>Line_4</th>\n      <th>Line_5</th>\n      <th>Line_6</th>\n      <th>Line_7</th>\n      <th>Line_8</th>\n      <th>Line_9</th>\n      <th>Line_10</th>\n      <th>Line_11</th>\n      <th>Line_12</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>2012-02-03 18:35:34 SampleClass6 [INFO] everyt...</td>\n      <td>2012-02-03</td>\n      <td></td>\n      <td>18:35:34</td>\n      <td></td>\n      <td>SampleClass</td>\n      <td>6</td>\n      <td>[</td>\n      <td>INFO</td>\n      <td>]</td>\n      <td>everything normal for id</td>\n      <td></td>\n      <td>577725851</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>2012-02-03 18:35:34 SampleClass4 [FATAL] syste...</td>\n      <td>2012-02-03</td>\n      <td></td>\n      <td>18:35:34</td>\n      <td></td>\n      <td>SampleClass</td>\n      <td>4</td>\n      <td>[</td>\n      <td>FATAL</td>\n      <td>]</td>\n      <td>system problem at id</td>\n      <td></td>\n      <td>1991281254</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>2012-02-03 18:35:34 SampleClass3 [DEBUG] detai...</td>\n      <td>2012-02-03</td>\n      <td></td>\n      <td>18:35:34</td>\n      <td></td>\n      <td>SampleClass</td>\n      <td>3</td>\n      <td>[</td>\n      <td>DEBUG</td>\n      <td>]</td>\n      <td>detail for id</td>\n      <td></td>\n      <td>1304807656</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>2012-02-03 18:35:34 SampleClass3 [WARN] missin...</td>\n      <td>2012-02-03</td>\n      <td></td>\n      
<td>18:35:34</td>\n      <td></td>\n      <td>SampleClass</td>\n      <td>3</td>\n      <td>[</td>\n      <td>WARN</td>\n      <td>]</td>\n      <td>missing id</td>\n      <td></td>\n      <td>423340895</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>2012-02-03 18:35:34 SampleClass5 [TRACE] verbo...</td>\n      <td>2012-02-03</td>\n      <td></td>\n      <td>18:35:34</td>\n      <td></td>\n      <td>SampleClass</td>\n      <td>5</td>\n      <td>[</td>\n      <td>TRACE</td>\n      <td>]</td>\n      <td>verbose detail for id</td>\n      <td></td>\n      <td>2082654978</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>2012-02-03 18:35:34 SampleClass0 [ERROR] incor...</td>\n      <td>None</td>\n      <td>None</td>\n      <td>None</td>\n      <td>None</td>\n      <td>None</td>\n      <td>None</td>\n      <td>None</td>\n      <td>None</td>\n      <td>None</td>\n      <td>None</td>\n      <td>None</td>\n      <td>None</td>\n    </tr>\n    <tr>\n      <th>6</th>\n      <td>2012-02-03 18:35:34 SampleClass9 [TRACE] verbo...</td>\n      <td>2012-02-03</td>\n      <td></td>\n      <td>18:35:34</td>\n      <td></td>\n      <td>SampleClass</td>\n      <td>9</td>\n      <td>[</td>\n      <td>TRACE</td>\n      <td>]</td>\n      <td>verbose detail for id</td>\n      <td></td>\n      <td>438634209</td>\n    </tr>\n    <tr>\n      <th>7</th>\n      <td>2012-02-03 18:35:34 SampleClass8 [DEBUG] detai...</td>\n      <td>2012-02-03</td>\n      <td></td>\n      <td>18:35:34</td>\n      <td></td>\n      <td>SampleClass</td>\n      <td>8</td>\n      <td>[</td>\n      <td>DEBUG</td>\n      <td>]</td>\n      <td>detail for id</td>\n      <td></td>\n      <td>2074121310</td>\n    </tr>\n    <tr>\n      <th>8</th>\n      <td>2012-02-03 18:55:54 SampleClass4 [DEBUG] detai...</td>\n      <td>2012-02-03</td>\n      <td></td>\n      <td>18:55:54</td>\n      <td></td>\n      <td>SampleClass</td>\n      <td>4</td>\n      <td>[</td>\n      <td>DEBUG</td>\n      <td>]</td>\n      <td>detail 
for id</td>\n      <td></td>\n      <td>1029178762</td>\n    </tr>\n    <tr>\n      <th>9</th>\n      <td>2012-02-03 18:55:54 SampleClass2 [TRACE] verbo...</td>\n      <td>2012-02-03</td>\n      <td></td>\n      <td>18:55:54</td>\n      <td></td>\n      <td>SampleClass</td>\n      <td>2</td>\n      <td>[</td>\n      <td>TRACE</td>\n      <td>]</td>\n      <td>verbose detail for id</td>\n      <td></td>\n      <td>1135460272</td>\n    </tr>\n  </tbody>\n</table>\n</div>",
 93 |             "text/plain": "                                                Line      Line_1 Line_2  \\\n0  2012-02-03 18:35:34 SampleClass6 [INFO] everyt...  2012-02-03          \n1  2012-02-03 18:35:34 SampleClass4 [FATAL] syste...  2012-02-03          \n2  2012-02-03 18:35:34 SampleClass3 [DEBUG] detai...  2012-02-03          \n3  2012-02-03 18:35:34 SampleClass3 [WARN] missin...  2012-02-03          \n4  2012-02-03 18:35:34 SampleClass5 [TRACE] verbo...  2012-02-03          \n5  2012-02-03 18:35:34 SampleClass0 [ERROR] incor...        None   None   \n6  2012-02-03 18:35:34 SampleClass9 [TRACE] verbo...  2012-02-03          \n7  2012-02-03 18:35:34 SampleClass8 [DEBUG] detai...  2012-02-03          \n8  2012-02-03 18:55:54 SampleClass4 [DEBUG] detai...  2012-02-03          \n9  2012-02-03 18:55:54 SampleClass2 [TRACE] verbo...  2012-02-03          \n\n     Line_3 Line_4       Line_5 Line_6 Line_7 Line_8 Line_9  \\\n0  18:35:34         SampleClass      6      [   INFO     ]    \n1  18:35:34         SampleClass      4      [  FATAL     ]    \n2  18:35:34         SampleClass      3      [  DEBUG     ]    \n3  18:35:34         SampleClass      3      [   WARN     ]    \n4  18:35:34         SampleClass      5      [  TRACE     ]    \n5      None   None         None   None   None   None   None   \n6  18:35:34         SampleClass      9      [  TRACE     ]    \n7  18:35:34         SampleClass      8      [  DEBUG     ]    \n8  18:55:54         SampleClass      4      [  DEBUG     ]    \n9  18:55:54         SampleClass      2      [  TRACE     ]    \n\n                    Line_10 Line_11     Line_12  \n0  everything normal for id           577725851  \n1      system problem at id          1991281254  \n2             detail for id          1304807656  \n3                missing id           423340895  \n4     verbose detail for id          2082654978  \n5                      None    None        None  \n6     verbose detail for id           438634209  \n7            
 detail for id          2074121310  \n8             detail for id          1029178762  \n9     verbose detail for id          1135460272  "
 94 |           },
 95 |           "metadata": {}
 96 |         }
 97 |       ]
 98 |     },
 99 |     {
100 |       "metadata": {},
101 |       "cell_type": "markdown",
102 |       "source": "A couple of things to take note of here. No examples were given, and yet DataPrep was able to generate quite a reasonable split program. \nWe have passed keep_delimiters=True so we can see all the data split into columns. In practice, though, delimiters are rarely useful, so let's exclude them."
103 |     },
104 |     {
105 |       "metadata": {
106 |         "scrolled": true,
107 |         "trusted": true
108 |       },
109 |       "cell_type": "code",
110 |       "source": "b.keep_delimiters = False\nb.preview()",
111 |       "execution_count": 8,
112 |       "outputs": [
113 |         {
114 |           "output_type": "execute_result",
115 |           "execution_count": 8,
116 |           "data": {
117 |             "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>Line</th>\n      <th>Line_1</th>\n      <th>Line_2</th>\n      <th>Line_3</th>\n      <th>Line_4</th>\n      <th>Line_5</th>\n      <th>Line_6</th>\n      <th>Line_7</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>2012-02-03 18:35:34 SampleClass6 [INFO] everyt...</td>\n      <td>2012-02-03</td>\n      <td>18:35:34</td>\n      <td>SampleClass</td>\n      <td>6</td>\n      <td>INFO</td>\n      <td>everything normal for id</td>\n      <td>577725851</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>2012-02-03 18:35:34 SampleClass4 [FATAL] syste...</td>\n      <td>2012-02-03</td>\n      <td>18:35:34</td>\n      <td>SampleClass</td>\n      <td>4</td>\n      <td>FATAL</td>\n      <td>system problem at id</td>\n      <td>1991281254</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>2012-02-03 18:35:34 SampleClass3 [DEBUG] detai...</td>\n      <td>2012-02-03</td>\n      <td>18:35:34</td>\n      <td>SampleClass</td>\n      <td>3</td>\n      <td>DEBUG</td>\n      <td>detail for id</td>\n      <td>1304807656</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>2012-02-03 18:35:34 SampleClass3 [WARN] missin...</td>\n      <td>2012-02-03</td>\n      <td>18:35:34</td>\n      <td>SampleClass</td>\n      <td>3</td>\n      <td>WARN</td>\n      <td>missing id</td>\n      <td>423340895</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>2012-02-03 18:35:34 SampleClass5 [TRACE] verbo...</td>\n      <td>2012-02-03</td>\n      <td>18:35:34</td>\n      <td>SampleClass</td>\n      <td>5</td>\n      <td>TRACE</td>\n      <td>verbose detail for id</td>\n   
   <td>2082654978</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>2012-02-03 18:35:34 SampleClass0 [ERROR] incor...</td>\n      <td>None</td>\n      <td>None</td>\n      <td>None</td>\n      <td>None</td>\n      <td>None</td>\n      <td>None</td>\n      <td>None</td>\n    </tr>\n    <tr>\n      <th>6</th>\n      <td>2012-02-03 18:35:34 SampleClass9 [TRACE] verbo...</td>\n      <td>2012-02-03</td>\n      <td>18:35:34</td>\n      <td>SampleClass</td>\n      <td>9</td>\n      <td>TRACE</td>\n      <td>verbose detail for id</td>\n      <td>438634209</td>\n    </tr>\n    <tr>\n      <th>7</th>\n      <td>2012-02-03 18:35:34 SampleClass8 [DEBUG] detai...</td>\n      <td>2012-02-03</td>\n      <td>18:35:34</td>\n      <td>SampleClass</td>\n      <td>8</td>\n      <td>DEBUG</td>\n      <td>detail for id</td>\n      <td>2074121310</td>\n    </tr>\n    <tr>\n      <th>8</th>\n      <td>2012-02-03 18:55:54 SampleClass4 [DEBUG] detai...</td>\n      <td>2012-02-03</td>\n      <td>18:55:54</td>\n      <td>SampleClass</td>\n      <td>4</td>\n      <td>DEBUG</td>\n      <td>detail for id</td>\n      <td>1029178762</td>\n    </tr>\n    <tr>\n      <th>9</th>\n      <td>2012-02-03 18:55:54 SampleClass2 [TRACE] verbo...</td>\n      <td>2012-02-03</td>\n      <td>18:55:54</td>\n      <td>SampleClass</td>\n      <td>2</td>\n      <td>TRACE</td>\n      <td>verbose detail for id</td>\n      <td>1135460272</td>\n    </tr>\n  </tbody>\n</table>\n</div>",
118 |             "text/plain": "                                                Line      Line_1    Line_2  \\\n0  2012-02-03 18:35:34 SampleClass6 [INFO] everyt...  2012-02-03  18:35:34   \n1  2012-02-03 18:35:34 SampleClass4 [FATAL] syste...  2012-02-03  18:35:34   \n2  2012-02-03 18:35:34 SampleClass3 [DEBUG] detai...  2012-02-03  18:35:34   \n3  2012-02-03 18:35:34 SampleClass3 [WARN] missin...  2012-02-03  18:35:34   \n4  2012-02-03 18:35:34 SampleClass5 [TRACE] verbo...  2012-02-03  18:35:34   \n5  2012-02-03 18:35:34 SampleClass0 [ERROR] incor...        None      None   \n6  2012-02-03 18:35:34 SampleClass9 [TRACE] verbo...  2012-02-03  18:35:34   \n7  2012-02-03 18:35:34 SampleClass8 [DEBUG] detai...  2012-02-03  18:35:34   \n8  2012-02-03 18:55:54 SampleClass4 [DEBUG] detai...  2012-02-03  18:55:54   \n9  2012-02-03 18:55:54 SampleClass2 [TRACE] verbo...  2012-02-03  18:55:54   \n\n        Line_3 Line_4 Line_5                    Line_6      Line_7  \n0  SampleClass      6   INFO  everything normal for id   577725851  \n1  SampleClass      4  FATAL      system problem at id  1991281254  \n2  SampleClass      3  DEBUG             detail for id  1304807656  \n3  SampleClass      3   WARN                missing id   423340895  \n4  SampleClass      5  TRACE     verbose detail for id  2082654978  \n5         None   None   None                      None        None  \n6  SampleClass      9  TRACE     verbose detail for id   438634209  \n7  SampleClass      8  DEBUG             detail for id  2074121310  \n8  SampleClass      4  DEBUG             detail for id  1029178762  \n9  SampleClass      2  TRACE     verbose detail for id  1135460272  "
119 |           },
120 |           "metadata": {}
121 |         }
122 |       ]
123 |     },
124 |     {
125 |       "metadata": {},
126 |       "cell_type": "markdown",
127 |       "source": "This looks pretty good already, except for the row at index 5.\nIf we request generation of suggested examples, we will see that this row is one of the items the program needs more input on."
128 |     },
129 |     {
130 |       "metadata": {
131 |         "trusted": true
132 |       },
133 |       "cell_type": "code",
134 |       "source": "suggestions = b.generate_suggested_examples()\nsuggestions",
135 |       "execution_count": 9,
136 |       "outputs": [
137 |         {
138 |           "output_type": "execute_result",
139 |           "execution_count": 9,
140 |           "data": {
141 |             "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>Line</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>2012-02-03 18:35:34 SampleClass6 [INFO] everyt...</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>2012-02-03 18:35:34 SampleClass0 [ERROR] incor...</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td></td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>java.lang.Exception: 2012-02-03 19:11:02 Sampl...</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>\\tat com.osa.mocklogger.MockLogger$2.run(MockL...</td>\n    </tr>\n  </tbody>\n</table>\n</div>",
142 |             "text/plain": "                                                Line\n0  2012-02-03 18:35:34 SampleClass6 [INFO] everyt...\n1  2012-02-03 18:35:34 SampleClass0 [ERROR] incor...\n2                                                   \n3  java.lang.Exception: 2012-02-03 19:11:02 Sampl...\n4  \\tat com.osa.mocklogger.MockLogger$2.run(MockL..."
143 |           },
144 |           "metadata": {}
145 |         }
146 |       ]
147 |     },
148 |     {
149 |       "metadata": {
150 |         "trusted": true
151 |       },
152 |       "cell_type": "code",
153 |       "source": "suggestions.iloc[1]['Line']",
154 |       "execution_count": 10,
155 |       "outputs": [
156 |         {
157 |           "output_type": "execute_result",
158 |           "execution_count": 10,
159 |           "data": {
160 |             "text/plain": "'2012-02-03 18:35:34 SampleClass0 [ERROR] incorrect id  1886438513'"
161 |           },
162 |           "metadata": {}
163 |         }
164 |       ]
165 |     },
166 |     {
167 |       "metadata": {},
168 |       "cell_type": "markdown",
169 |       "source": "Having retrieved the source value, we can now provide an example of the desired split.\nNotice that we chose not to split the date and time but rather keep them together in one column."
170 |     },
171 |     {
172 |       "metadata": {
173 |         "trusted": true
174 |       },
175 |       "cell_type": "code",
176 |       "source": "b.add_example(example=(suggestions['Line'].iloc[1], ['2012-02-03 18:35:34','SampleClass0','ERROR','incorrect id','1886438513']))",
177 |       "execution_count": 11,
178 |       "outputs": []
179 |     },
180 |     {
181 |       "metadata": {
182 |         "scrolled": false,
183 |         "trusted": true
184 |       },
185 |       "cell_type": "code",
186 |       "source": "b.preview()",
187 |       "execution_count": 12,
188 |       "outputs": [
189 |         {
190 |           "output_type": "execute_result",
191 |           "execution_count": 12,
192 |           "data": {
193 |             "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>Line</th>\n      <th>Line_1</th>\n      <th>Line_2</th>\n      <th>Line_3</th>\n      <th>Line_4</th>\n      <th>Line_5</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>2012-02-03 18:35:34 SampleClass6 [INFO] everyt...</td>\n      <td>2012-02-03 18:35:34</td>\n      <td>SampleClass6</td>\n      <td>INFO</td>\n      <td>everything normal for id</td>\n      <td>577725851</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>2012-02-03 18:35:34 SampleClass4 [FATAL] syste...</td>\n      <td>2012-02-03 18:35:34</td>\n      <td>SampleClass4</td>\n      <td>FATAL</td>\n      <td>system problem at id</td>\n      <td>1991281254</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>2012-02-03 18:35:34 SampleClass3 [DEBUG] detai...</td>\n      <td>2012-02-03 18:35:34</td>\n      <td>SampleClass3</td>\n      <td>DEBUG</td>\n      <td>detail for id</td>\n      <td>1304807656</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>2012-02-03 18:35:34 SampleClass3 [WARN] missin...</td>\n      <td>2012-02-03 18:35:34</td>\n      <td>SampleClass3</td>\n      <td>WARN</td>\n      <td>missing id</td>\n      <td>423340895</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>2012-02-03 18:35:34 SampleClass5 [TRACE] verbo...</td>\n      <td>2012-02-03 18:35:34</td>\n      <td>SampleClass5</td>\n      <td>TRACE</td>\n      <td>verbose detail for id</td>\n      <td>2082654978</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>2012-02-03 18:35:34 SampleClass0 [ERROR] incor...</td>\n      <td>2012-02-03 18:35:34</td>\n      <td>SampleClass0</td>\n      
<td>ERROR</td>\n      <td>incorrect id</td>\n      <td>1886438513</td>\n    </tr>\n    <tr>\n      <th>6</th>\n      <td>2012-02-03 18:35:34 SampleClass9 [TRACE] verbo...</td>\n      <td>2012-02-03 18:35:34</td>\n      <td>SampleClass9</td>\n      <td>TRACE</td>\n      <td>verbose detail for id</td>\n      <td>438634209</td>\n    </tr>\n    <tr>\n      <th>7</th>\n      <td>2012-02-03 18:35:34 SampleClass8 [DEBUG] detai...</td>\n      <td>2012-02-03 18:35:34</td>\n      <td>SampleClass8</td>\n      <td>DEBUG</td>\n      <td>detail for id</td>\n      <td>2074121310</td>\n    </tr>\n    <tr>\n      <th>8</th>\n      <td>2012-02-03 18:55:54 SampleClass4 [DEBUG] detai...</td>\n      <td>2012-02-03 18:55:54</td>\n      <td>SampleClass4</td>\n      <td>DEBUG</td>\n      <td>detail for id</td>\n      <td>1029178762</td>\n    </tr>\n    <tr>\n      <th>9</th>\n      <td>2012-02-03 18:55:54 SampleClass2 [TRACE] verbo...</td>\n      <td>2012-02-03 18:55:54</td>\n      <td>SampleClass2</td>\n      <td>TRACE</td>\n      <td>verbose detail for id</td>\n      <td>1135460272</td>\n    </tr>\n  </tbody>\n</table>\n</div>",
194 |             "text/plain": "                                                Line               Line_1  \\\n0  2012-02-03 18:35:34 SampleClass6 [INFO] everyt...  2012-02-03 18:35:34   \n1  2012-02-03 18:35:34 SampleClass4 [FATAL] syste...  2012-02-03 18:35:34   \n2  2012-02-03 18:35:34 SampleClass3 [DEBUG] detai...  2012-02-03 18:35:34   \n3  2012-02-03 18:35:34 SampleClass3 [WARN] missin...  2012-02-03 18:35:34   \n4  2012-02-03 18:35:34 SampleClass5 [TRACE] verbo...  2012-02-03 18:35:34   \n5  2012-02-03 18:35:34 SampleClass0 [ERROR] incor...  2012-02-03 18:35:34   \n6  2012-02-03 18:35:34 SampleClass9 [TRACE] verbo...  2012-02-03 18:35:34   \n7  2012-02-03 18:35:34 SampleClass8 [DEBUG] detai...  2012-02-03 18:35:34   \n8  2012-02-03 18:55:54 SampleClass4 [DEBUG] detai...  2012-02-03 18:55:54   \n9  2012-02-03 18:55:54 SampleClass2 [TRACE] verbo...  2012-02-03 18:55:54   \n\n         Line_2 Line_3                    Line_4      Line_5  \n0  SampleClass6   INFO  everything normal for id   577725851  \n1  SampleClass4  FATAL      system problem at id  1991281254  \n2  SampleClass3  DEBUG             detail for id  1304807656  \n3  SampleClass3   WARN                missing id   423340895  \n4  SampleClass5  TRACE     verbose detail for id  2082654978  \n5  SampleClass0  ERROR              incorrect id  1886438513  \n6  SampleClass9  TRACE     verbose detail for id   438634209  \n7  SampleClass8  DEBUG             detail for id  2074121310  \n8  SampleClass4  DEBUG             detail for id  1029178762  \n9  SampleClass2  TRACE     verbose detail for id  1135460272  "
195 |           },
196 |           "metadata": {}
197 |         }
198 |       ]
199 |     },
200 |     {
201 |       "metadata": {},
202 |       "cell_type": "markdown",
203 |       "source": "This looks just like what we need, so let's get a dataflow with the split in it and drop the original column."
204 |     },
205 |     {
206 |       "metadata": {
207 |         "trusted": true
208 |       },
209 |       "cell_type": "code",
210 |       "source": "dataflow = b.to_dataflow()\ndataflow = dataflow.drop_columns(['Line'])\ndataflow.head(10)",
211 |       "execution_count": 13,
212 |       "outputs": [
213 |         {
214 |           "output_type": "execute_result",
215 |           "execution_count": 13,
216 |           "data": {
217 |             "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>Line_1</th>\n      <th>Line_2</th>\n      <th>Line_3</th>\n      <th>Line_4</th>\n      <th>Line_5</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>2012-02-03 18:35:34</td>\n      <td>SampleClass6</td>\n      <td>INFO</td>\n      <td>everything normal for id</td>\n      <td>577725851</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>2012-02-03 18:35:34</td>\n      <td>SampleClass4</td>\n      <td>FATAL</td>\n      <td>system problem at id</td>\n      <td>1991281254</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>2012-02-03 18:35:34</td>\n      <td>SampleClass3</td>\n      <td>DEBUG</td>\n      <td>detail for id</td>\n      <td>1304807656</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>2012-02-03 18:35:34</td>\n      <td>SampleClass3</td>\n      <td>WARN</td>\n      <td>missing id</td>\n      <td>423340895</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>2012-02-03 18:35:34</td>\n      <td>SampleClass5</td>\n      <td>TRACE</td>\n      <td>verbose detail for id</td>\n      <td>2082654978</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>2012-02-03 18:35:34</td>\n      <td>SampleClass0</td>\n      <td>ERROR</td>\n      <td>incorrect id</td>\n      <td>1886438513</td>\n    </tr>\n    <tr>\n      <th>6</th>\n      <td>2012-02-03 18:35:34</td>\n      <td>SampleClass9</td>\n      <td>TRACE</td>\n      <td>verbose detail for id</td>\n      <td>438634209</td>\n    </tr>\n    <tr>\n      <th>7</th>\n      <td>2012-02-03 18:35:34</td>\n      <td>SampleClass8</td>\n      <td>DEBUG</td>\n      <td>detail for id</td>\n      
<td>2074121310</td>\n    </tr>\n    <tr>\n      <th>8</th>\n      <td>2012-02-03 18:55:54</td>\n      <td>SampleClass4</td>\n      <td>DEBUG</td>\n      <td>detail for id</td>\n      <td>1029178762</td>\n    </tr>\n    <tr>\n      <th>9</th>\n      <td>2012-02-03 18:55:54</td>\n      <td>SampleClass2</td>\n      <td>TRACE</td>\n      <td>verbose detail for id</td>\n      <td>1135460272</td>\n    </tr>\n  </tbody>\n</table>\n</div>",
218 |             "text/plain": "                Line_1        Line_2 Line_3                    Line_4  \\\n0  2012-02-03 18:35:34  SampleClass6   INFO  everything normal for id   \n1  2012-02-03 18:35:34  SampleClass4  FATAL      system problem at id   \n2  2012-02-03 18:35:34  SampleClass3  DEBUG             detail for id   \n3  2012-02-03 18:35:34  SampleClass3   WARN                missing id   \n4  2012-02-03 18:35:34  SampleClass5  TRACE     verbose detail for id   \n5  2012-02-03 18:35:34  SampleClass0  ERROR              incorrect id   \n6  2012-02-03 18:35:34  SampleClass9  TRACE     verbose detail for id   \n7  2012-02-03 18:35:34  SampleClass8  DEBUG             detail for id   \n8  2012-02-03 18:55:54  SampleClass4  DEBUG             detail for id   \n9  2012-02-03 18:55:54  SampleClass2  TRACE     verbose detail for id   \n\n       Line_5  \n0   577725851  \n1  1991281254  \n2  1304807656  \n3   423340895  \n4  2082654978  \n5  1886438513  \n6   438634209  \n7  2074121310  \n8  1029178762  \n9  1135460272  "
219 |           },
220 |           "metadata": {}
221 |         }
222 |       ]
223 |     },
224 |     {
225 |       "metadata": {
226 |         "trusted": true
227 |       },
228 |       "cell_type": "code",
229 |       "source": "",
230 |       "execution_count": null,
231 |       "outputs": []
232 |     }
233 |   ],
234 |   "metadata": {
235 |     "kernelspec": {
236 |       "name": "python36",
237 |       "display_name": "Python 3.6",
238 |       "language": "python"
239 |     },
240 |     "language_info": {
241 |       "mimetype": "text/x-python",
242 |       "nbconvert_exporter": "python",
243 |       "name": "python",
244 |       "pygments_lexer": "ipython3",
245 |       "version": "3.6.6",
246 |       "file_extension": ".py",
247 |       "codemirror_mode": {
248 |         "version": 3,
249 |         "name": "ipython"
250 |       }
251 |     }
252 |   },
253 |   "nbformat": 4,
254 |   "nbformat_minor": 2
255 | }


--------------------------------------------------------------------------------