├── data ├── elements.csv ├── excel.xlsx ├── map_func.py ├── parquet.parquet ├── multiple_separators.csv ├── text_lines.txt ├── crime0-10.dprep.settings ├── crime0-10.csv ├── adls-dpreptestfiles.crt ├── median_income.csv ├── secrets.dprep ├── crime0-10.dprep └── median_income_transformed.csv ├── README.md ├── column-type-transform.ipynb ├── package-json-representation.ipynb ├── quantile-transformation.ipynb ├── read-pandas-dataframe.ipynb ├── secrets.ipynb ├── external-references.ipynb ├── impute-missing-values.ipynb ├── 0. Import librairie.ipynb ├── smart-read-file-separators.ipynb ├── caching.ipynb ├── join.ipynb └── split-column-by-example.ipynb /data/elements.csv: -------------------------------------------------------------------------------- 1 | ID,Symbol,Boiling Point 2 | 1,H,-252.87 3 | 53,I,184.3 4 | 2,He,-268.93 -------------------------------------------------------------------------------- /data/excel.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retkowsky/Data-Preparation-avec-Azure-ML-service/master/data/excel.xlsx -------------------------------------------------------------------------------- /data/map_func.py: -------------------------------------------------------------------------------- 1 | def transform(df, index): 2 | df['MAM_MTH00numvalid_1011'].fillna(0,inplace=True) 3 | return df 4 | -------------------------------------------------------------------------------- /data/parquet.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retkowsky/Data-Preparation-avec-Azure-ML-service/master/data/parquet.parquet -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Data-Preparation-avec-Azure-ML-service 2 | 3 | Serge Retkowsky | serge.retkowsky@microsoft.com | 
https://www.linkedin.com/in/serger/ 4 | -------------------------------------------------------------------------------- /data/multiple_separators.csv: -------------------------------------------------------------------------------- 1 | ID |CaseNumber| |Completed| 2 | 10140490 |HY329907| |Y| 3 | 10139776 |HY329265| |Y| 4 | 10140270 |HY329253| |N| 5 | 10139885 |HY329308| |Y| 6 | 10140379 |HY329556| |N| 7 | 10140868 |HY330421| |N| 8 | 10139762 |HY329232| |N| 9 | 10139722 |HY329228| |Y| 10 | 10139774 |HY329209| |N| 11 | 10139697 |HY329177| |N| -------------------------------------------------------------------------------- /data/text_lines.txt: -------------------------------------------------------------------------------- 1 | Date||Minimum temperature||Maximum temperature 2 | 2015-07-1||-4.1||10.0 3 | 2015-07-2||-0.8||10.8 4 | 2015-07-3||-7.0||10.5 5 | 2015-07-4||-5.5||9.3 6 | 2015-07-5||-4.7||7.3 7 | 2015-07-6||-2.4||11.2 8 | 2015-07-7||-4.7||11.5 9 | 2015-07-8||-3.0||12.6 10 | 2015-07-9||-1.3||13.8 11 | 2015-07-10||-0.5||9.9 12 | 2015-07-11||3.6||12.5 13 | 2015-07-12||3.1||9.2 14 | 2015-07-13||3.6||13.6 15 | 2015-07-14||4.1||10.0 16 | 2015-07-15||1.1||7.9 17 | -------------------------------------------------------------------------------- /data/crime0-10.dprep.settings: -------------------------------------------------------------------------------- 1 | {"project":{"activitiesPaneSize":200,"isActivitiesPaneCollapsed":true,"activeActivityId":"75637565-60ad-4baa-87d3-396a7930cfe7","isInActivityView":true},"75637565-60ad-4baa-87d3-396a7930cfe7.main.visualFilters":[],"75637565-60ad-4baa-87d3-396a7930cfe7.main.currentSliceIndex":0,"75637565-60ad-4baa-87d3-396a7930cfe7.main.typeFilter":[],"75637565-60ad-4baa-87d3-396a7930cfe7.main.columnSearchFilter":{"term":"","matchCase":false,"matchWholeWord":false,"useRegex":false},"75637565-60ad-4baa-87d3-396a7930cfe7.main.columnsSelections":[]} -------------------------------------------------------------------------------- 
/data/crime0-10.csv: -------------------------------------------------------------------------------- 1 | ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,Beat,District,Ward,Community Area,FBI Code,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location 2 | 10140490,HY329907,07/05/2015 11:50:00 PM,050XX N NEWLAND AVE,0820,THEFT,$500 AND UNDER,STREET,false,false,1613,016,41,10,06,1129230,1933315,2015,07/12/2015 12:42:46 PM,41.973309466,-87.800174996,"(41.973309466, -87.800174996)" 3 | 10139776,HY329265,07/05/2015 11:30:00 PM,011XX W MORSE AVE,0460,BATTERY,SIMPLE,STREET,false,true,2431,024,49,1,08B,1167370,1946271,2015,07/12/2015 12:42:46 PM,42.008124017,-87.65955018,"(42.008124017, -87.65955018)" 4 | 10140270,HY329253,07/05/2015 11:20:00 PM,121XX S FRONT AVE,0486,BATTERY,DOMESTIC BATTERY SIMPLE,STREET,false,true,0532,,9,53,08B,,,2015,07/12/2015 12:42:46 PM,,, 5 | 10139885,HY329308,07/05/2015 11:19:00 PM,051XX W DIVISION ST,0610,BURGLARY,FORCIBLE ENTRY,SMALL RETAIL STORE,false,false,1531,015,37,25,05,1141721,1907465,2015,07/12/2015 12:42:46 PM,41.902152027,-87.754883404,"(41.902152027, -87.754883404)" 6 | 10140379,HY329556,07/05/2015 11:00:00 PM,012XX W LAKE ST,0930,MOTOR VEHICLE THEFT,THEFT/RECOVERY: AUTOMOBILE,STREET,false,false,1215,012,27,28,07,1168413,1901632,2015,07/12/2015 12:42:46 PM,41.885610142,-87.657008701,"(41.885610142, -87.657008701)" 7 | 10140868,HY330421,07/05/2015 10:54:00 PM,118XX S PEORIA ST,1320,CRIMINAL DAMAGE,TO VEHICLE,VEHICLE NON-COMMERCIAL,false,false,0524,005,34,53,14,1172409,1826485,2015,07/12/2015 12:42:46 PM,41.6793109,-87.644545209,"(41.6793109, -87.644545209)" 8 | 10139762,HY329232,07/05/2015 10:42:00 PM,026XX W 37TH PL,1020,ARSON,BY FIRE,VACANT LOT/LAND,false,false,0911,009,12,58,09,1159436,1879658,2015,07/12/2015 12:42:46 PM,41.825500607,-87.690578042,"(41.825500607, -87.690578042)" 9 | 10139722,HY329228,07/05/2015 10:30:00 PM,016XX S CENTRAL PARK AVE,1811,NARCOTICS,POSS: 
CANNABIS 30GMS OR LESS,ALLEY,true,false,1021,010,24,29,18,1152687,1891389,2015,07/12/2015 12:42:46 PM,41.857827814,-87.715028789,"(41.857827814, -87.715028789)" 10 | 10139774,HY329209,07/05/2015 10:15:00 PM,048XX N ASHLAND AVE,1310,CRIMINAL DAMAGE,TO PROPERTY,APARTMENT,false,false,2032,020,46,3,14,1164821,1932394,2015,07/12/2015 12:42:46 PM,41.970099796,-87.669324377,"(41.970099796, -87.669324377)" 11 | 10139697,HY329177,07/05/2015 10:10:00 PM,058XX S ARTESIAN AVE,1320,CRIMINAL DAMAGE,TO VEHICLE,ALLEY,false,false,0824,008,16,63,14,1160997,1865851,2015,07/12/2015 12:42:46 PM,41.787580282,-87.685233078,"(41.787580282, -87.685233078)" -------------------------------------------------------------------------------- /data/adls-dpreptestfiles.crt: -------------------------------------------------------------------------------- 1 | -----BEGIN PRIVATE KEY----- 2 | MIIEvwIBADANBgkqhkiG9w0BAQEFAASCBKkwggSlAgEAAoIBAQDmkkyF0BwipZow 3 | Wd1AMkRkySx0y079JPxpsYhv4i1xXKdoa9bpFqwoXmJpeQM1JWnU4UeZzFeM86qK 4 | AhQvL4KV4kibcP2ENvu2NKFEdotO3uxPJ+6GlcYwMYzy+tUj008KnnRZfTrR78sJ 5 | tIl3C6lnVL0ICihksG59P1sskRq3PvOjXLAdEZalwDjZ4ZPoNDZdj6nUjB2l8zqu 6 | pKAt5mR+bJ9Sox4yrDuNhMmFt5QsRDRe3wUqdV+C9OCWHmjlmsjrYw7p9YmjBDvC 7 | 5U7mF0Mk/XeYFzj0pkXKQVqBL6xqig+q5ob0szYfg19iDeFhS3iIsRcJGEnRVW/A 8 | NpsBZyKrAgMBAAECggEBANlvP8C1F8NInhZYuIAwpzTQTh86Fxw8g9h8dijkh2wv 9 | LyQXBk07d1B+aZoDZ5X32UzKwcX04N9obfvFqBkzWZdVFJmZvUmwvEEActBoZkkT 10 | io+/HX5HweVy5PPCvbsSK6jc8uXtZcnSs4tMeJIOKkvqqnTpd1w00Y1FcQqfMC16 11 | 4p7o8wbt6OFoFAYqcxeVYVwDzCTLZD3+iJaqmntkBkoDndJy52yXQmMq5z1wbQVp 12 | BL6+L9nTvmouy64jiHVSKOx8nnWThYfHsXoPv+rYywjeuK/v3hyaTAwogs36ooEn 13 | SnuTBRvJcumN9Q0XIVlxKMVBcGyyAP+0yNKGz5NQgdECgYEA/I/Uq1E3epPJgEWR 14 | Bub+LpCgwtrw/lgKncb/Q/AiE9qoXobUe4KNU8aGaNMb7uVNLckY7cOluLS6SQb3 15 | Mzwk2Jl0G3vk8rW46tZWvSYB8+zAR2Rz7seUOT9SE5OmvwpnHrnp3nRr1vvVd2bp 16 | Q/ypwMLrwWQN51Kr+oTS74bUbrkCgYEA6bXVIUyao7z2Q3qAr6h+6JEWDbkJA7hJ 17 | BjHIOXvxd1tMoJJX+X9+IE/2XoJaUkGCb0vrM/hi1cyQFmS4Or/J6IWSZu8oBpDr 18 | 
EBmIK3PF1nrzNvWD28wM46c6ScehyWSm/u4bJWSm9liTX3dv5Kpa6ym7yLKc3c0B 19 | ECpSJM+5SoMCgYEAq585Tukzn/IJPUcIk/4nv5C8DW0l0lAVdr2g/JOTNJajTwik 20 | HwHJ86G1+Elsc9wRpAlBDWCjnm4BIFrBZGl8SEuOoJaCL4PZEotwCbxoG09IIbtb 21 | JGkuifBDX9Y3ux3gkPqYt3e5SC99EVQ3MuHgoIJUHehVolmFUAkuJWIjvNECgYEA 22 | 5pU0VspRuELzZdgzpxvDOooLDDcHodfslGQBfFXBA1Xc4IACtHMJaa/7D3vkyUtA 23 | +bYZtQjX2sEdWDq/WZdoCjXfIBfNkczhXt0R8G0lQFvGIu9QzUchYGrZo3mHMkBQ 24 | Uy1xMw9/e4YgwQwCJcW+Nk7Sq00uX9enuN9IdHFOCykCgYAqAGMK6CH1tlpjvHrf 25 | k+ZhigYxTXBlsVVvK1BIGGaiwzDpn65zeQp4aLOjSZkI1LuRi3tfTiZ321jRd64J 26 | 4lGk5Jurqv5grDmxROX/U50wEYbI9ncu/thU7syUdxDiqxHPI2RMG50mRcm3a55p 27 | ZCNSqkMlcXyA0U1z8C1ILNUsbA== 28 | -----END PRIVATE KEY----- 29 | -----BEGIN CERTIFICATE----- 30 | MIICoTCCAYkCAgPoMA0GCSqGSIb3DQEBBQUAMBQxEjAQBgNVBAMMCUNMSS1Mb2dp 31 | bjAiGA8yMDE4MDcxMzIzMjA0N1oYDzIwMTkwNzEzMjMyMDQ5WjAUMRIwEAYDVQQD 32 | DAlDTEktTG9naW4wggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQDmkkyF 33 | 0BwipZowWd1AMkRkySx0y079JPxpsYhv4i1xXKdoa9bpFqwoXmJpeQM1JWnU4UeZ 34 | zFeM86qKAhQvL4KV4kibcP2ENvu2NKFEdotO3uxPJ+6GlcYwMYzy+tUj008KnnRZ 35 | fTrR78sJtIl3C6lnVL0ICihksG59P1sskRq3PvOjXLAdEZalwDjZ4ZPoNDZdj6nU 36 | jB2l8zqupKAt5mR+bJ9Sox4yrDuNhMmFt5QsRDRe3wUqdV+C9OCWHmjlmsjrYw7p 37 | 9YmjBDvC5U7mF0Mk/XeYFzj0pkXKQVqBL6xqig+q5ob0szYfg19iDeFhS3iIsRcJ 38 | GEnRVW/ANpsBZyKrAgMBAAEwDQYJKoZIhvcNAQEFBQADggEBAI4VlaFb9NsXMLdT 39 | Cw5/pk0Xo2Qi6483RGTy8vzrw88IE7f3juB/JWG+rayjtW5bBRx2fae4/ZIdZ4zg 40 | N2FDKn2PQPAc9m9pcKyUKUvWOC8ixSkrUmeQew0l1AXU0hsPSlJ7/7ZK4efoyB47 41 | hj71fsyKdyKbisZDcUFBq/S8PazdPF0YOD1W/4A2tW0cSMg+jmFWynuUTdWt3SU8 42 | CwBGqdiSKT5faJuYwIWnRXDEQS3ObRn1OFEfFdd4d2sxjxydWKRgnINnGlBdiFAT 43 | KzCozVr+75cO2ErH6x5C0hLQGG5BxXbaijyxyvaRNokTMVVv6OaDEnjzCGfJ72Yf 44 | 2wgitNc= 45 | -----END CERTIFICATE----- 46 | -------------------------------------------------------------------------------- /data/median_income.csv: -------------------------------------------------------------------------------- 1 | median_income 2 | 4.4896 3 | 2.1029 4 | 2.3889 5 | 
3.707 6 | 6.4788 7 | 4.4074 8 | 5.2907 9 | 1.5156 10 | 8.4411 11 | 4.4085 12 | 2.1439 13 | 2.8971 14 | 6.1008 15 | 3.5258 16 | 2.7694 17 | 2.2356 18 | 1.9509 19 | 4.0905 20 | 3.6726 21 | 3.1696 22 | 2.5389 23 | 3.0319 24 | 4.6779 25 | 2.9076 26 | 2.8616 27 | 1.4722 28 | 5.6413 29 | 2.1167 30 | 4.7308 31 | 4.8173 32 | 2.3438 33 | 1.7333 34 | 1.4429 35 | 2.3253 36 | 2.4022 37 | 3.4048 38 | 6.6073 39 | 4.1080000000000005 40 | 4.2829 41 | 1.5727 42 | 2.5211 43 | 4.2679 44 | 4.7328 45 | 4.7069 46 | 2.465 47 | 5.0267 48 | 2.8043 49 | 2.4053 50 | 1.2176 51 | 2.39 52 | 3.6364 53 | 6.0162 54 | 2.8088 55 | 3.3984 56 | 4.5 57 | 3.9079 58 | 4.9618 59 | 2.9344 60 | 2.4283 61 | 3.7388 62 | 1.6021 63 | 2.3352 64 | 4.0982 65 | 1.9531 66 | 3.2386 67 | 5.1169 68 | 4.692 69 | 4.0 70 | 6.4238 71 | 3.7375 72 | 2.8233 73 | 2.8009 74 | 3.767 75 | 3.6761 76 | 5.0282 77 | 3.5296 78 | 5.215 79 | 4.0125 80 | 9.4667 81 | 5.9062 82 | 3.9864 83 | 2.0734 84 | 2.875 85 | 3.3611 86 | 2.8214 87 | 0.9946 88 | 4.5446 89 | 4.6908 90 | 9.3198 91 | 1.2826 92 | 2.4943 93 | 10.1882 94 | 4.6731 95 | 4.375 96 | 2.8173 97 | 2.0903 98 | 2.725 99 | 2.8547 100 | 2.25 101 | 1.9444 102 | 1.7167 103 | 1.9342 104 | 4.9524 105 | 3.65 106 | 3.0856 107 | 3.2396 108 | 2.9324 109 | 3.495 110 | 1.9818 111 | 4.6964 112 | 3.925 113 | 3.625 114 | 2.9688 115 | 4.0417 116 | 9.7956 117 | 3.8732 118 | 2.6998 119 | 2.006 120 | 4.25 121 | 3.1839999999999997 122 | 5.9658 123 | 2.628 124 | 2.5057 125 | 5.155 126 | 4.6 127 | 4.6681 128 | 5.5942 129 | 5.1104 130 | 3.0759 131 | 3.5757 132 | 3.6845 133 | 6.4667 134 | 5.273 135 | 3.0635 136 | 11.2866 137 | 4.0444 138 | 5.2541 139 | 5.5791 140 | 4.5375 141 | 9.8144 142 | 6.7257 143 | 4.1442 144 | 4.0313 145 | 2.2791 146 | 4.1679 147 | 3.2852 148 | 3.2768 149 | 5.021 150 | 4.875 151 | 4.419 152 | 3.3272 153 | 4.2386 154 | 1.245 155 | 5.152 156 | 4.8125 157 | 2.1638 158 | 7.1621 159 | 1.5372 160 | 10.0481 161 | 3.3869 162 | 5.4591 163 | 4.4318 164 | 6.5044 165 | 4.2865 166 | 3.0461 167 | 
11.3283 168 | 2.7026 169 | 3.016 170 | 3.0943 171 | 3.225 172 | 6.187 173 | 3.8158 174 | 3.0147 175 | 15.0 176 | 3.1364 177 | 2.9 178 | 5.5941 179 | 3.4028 180 | 6.0062 181 | 8.3792 182 | 3.8036 183 | 2.0926 184 | 6.7703 185 | 4.2569 186 | 4.744 187 | 9.7037 188 | 5.1292 189 | 2.3148 190 | 3.3021 191 | 1.95 192 | 3.025 193 | 2.6523 194 | 1.2188 195 | 5.827999999999999 196 | 3.1587 197 | 2.45 198 | 2.3851 199 | 2.1221 200 | 3.5313 201 | 3.4821 202 | 7.8252 203 | 5.1878 204 | 3.7459 205 | 6.0097 206 | 2.3194 207 | 4.2061 208 | 2.267 209 | 2.2109 210 | 2.7589 211 | 2.6553 212 | 6.3325 213 | 5.7233 214 | 4.337 215 | 3.9667 216 | 5.8623 217 | 1.6806 218 | 3.5851 219 | 2.9716 220 | 3.9 221 | 2.7431 222 | 3.3621 223 | 1.9464 224 | 7.3518 225 | 4.775 226 | 3.5968 227 | 6.221 228 | 10.0968 229 | 1.9483 230 | 2.0469 231 | 3.725 232 | 3.675 233 | 1.8529 234 | 1.7159 235 | 1.7386 236 | 3.6687 237 | 3.4671 238 | 4.8233 239 | 4.3036 240 | 1.6488 241 | 2.9453 242 | 5.0096 243 | 3.175 244 | 4.2031 245 | 3.1667 246 | 5.7204 247 | 3.375 248 | 6.5483 249 | 4.2206 250 | 2.6631 251 | 3.5363 252 | -------------------------------------------------------------------------------- /column-type-transform.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "metadata": {}, 5 | "cell_type": "markdown", 6 | "source": "# Column Type Transformations\nCopyright (c) Microsoft Corporation. All rights reserved.
\nLicensed under the MIT License." 7 | }, 8 | { 9 | "metadata": {}, 10 | "cell_type": "markdown", 11 | "source": "DataPrep has the ability to transform column types." 12 | }, 13 | { 14 | "metadata": { 15 | "trusted": true 16 | }, 17 | "cell_type": "code", 18 | "source": "import azureml.dataprep as dprep\ndataflow = dprep.read_csv(path=r'data\\elements.csv')\ndataflow.head(3)", 19 | "execution_count": 1, 20 | "outputs": [ 21 | { 22 | "output_type": "execute_result", 23 | "execution_count": 1, 24 | "data": { 25 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
IDSymbolBoiling Point
01H-252.87
153I184.3
22He-268.93
\n
", 26 | "text/plain": " ID Symbol Boiling Point\n0 1 H -252.87\n1 53 I 184.3\n2 2 He -268.93" 27 | }, 28 | "metadata": {} 29 | } 30 | ] 31 | }, 32 | { 33 | "metadata": {}, 34 | "cell_type": "markdown", 35 | "source": "#### `to_long(columns)`" 36 | }, 37 | { 38 | "metadata": { 39 | "trusted": true 40 | }, 41 | "cell_type": "code", 42 | "source": "# Convert the boiling point to a 64 bit integer.\nintegers_only_dataflow = dataflow.to_long(['Boiling Point'])\nintegers_only_dataflow.head(3)", 43 | "execution_count": 2, 44 | "outputs": [ 45 | { 46 | "output_type": "execute_result", 47 | "execution_count": 2, 48 | "data": { 49 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
IDSymbolBoiling Point
01H-252
153I184
22He-268
\n
", 50 | "text/plain": " ID Symbol Boiling Point\n0 1 H -252\n1 53 I 184\n2 2 He -268" 51 | }, 52 | "metadata": {} 53 | } 54 | ] 55 | }, 56 | { 57 | "metadata": { 58 | "trusted": true 59 | }, 60 | "cell_type": "code", 61 | "source": "", 62 | "execution_count": null, 63 | "outputs": [] 64 | } 65 | ], 66 | "metadata": { 67 | "kernelspec": { 68 | "name": "python36", 69 | "display_name": "Python 3.6", 70 | "language": "python" 71 | }, 72 | "language_info": { 73 | "mimetype": "text/x-python", 74 | "nbconvert_exporter": "python", 75 | "name": "python", 76 | "pygments_lexer": "ipython3", 77 | "version": "3.6.6", 78 | "file_extension": ".py", 79 | "codemirror_mode": { 80 | "version": 3, 81 | "name": "ipython" 82 | } 83 | } 84 | }, 85 | "nbformat": 4, 86 | "nbformat_minor": 2 87 | } -------------------------------------------------------------------------------- /package-json-representation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "metadata": {}, 5 | "cell_type": "markdown", 6 | "source": "A DataPrep Package can be saved to and loaded from a JSON string." 7 | }, 8 | { 9 | "metadata": {}, 10 | "cell_type": "markdown", 11 | "source": "# Work with JSON representation of Package\nCopyright (c) Microsoft Corporation. All rights reserved.
\nLicensed under the MIT License." 12 | }, 13 | { 14 | "metadata": { 15 | "trusted": true 16 | }, 17 | "cell_type": "code", 18 | "source": "# create a Dataflow and pack it into a Package\nimport azureml.dataprep as dprep\ndf = dprep.read_csv(path='https://dpreptestfiles.blob.core.windows.net/testfiles/read_csv_duplicate_headers.csv')\npkg = dprep.Package(df)", 19 | "execution_count": 1, 20 | "outputs": [] 21 | }, 22 | { 23 | "metadata": { 24 | "trusted": true 25 | }, 26 | "cell_type": "code", 27 | "source": "# save Package to JSON string\njson_str = pkg.to_json()\njson_str", 28 | "execution_count": 2, 29 | "outputs": [ 30 | { 31 | "output_type": "execute_result", 32 | "execution_count": 2, 33 | "data": { 34 | "text/plain": "'{\\n \"schemaVersion\": 63,\\n \"id\": \"1e865029-9dec-4664-a9ca-effecbace8c9\",\\n \"activities\": [\\n {\\n \"id\": \"9a2f0365-c518-4d54-b68c-e8ca31ef5b22\",\\n \"name\": \"dataflow\",\\n \"blocks\": [\\n {\\n \"id\": \"c60a8fb8-d189-477a-af3d-d5e163cb9eae\",\\n \"type\": \"Microsoft.DPrep.GetFilesBlock\",\\n \"arguments\": {\\n \"isArchive\": false,\\n \"path\": {\\n \"target\": 1,\\n \"resourceDetails\": [\\n {\\n \"path\": \"https://dpreptestfiles.blob.core.windows.net/testfiles/read_csv_duplicate_headers.csv\",\\n \"sas\": null,\\n \"storageAccountName\": null,\\n \"storageAccountKey\": null\\n }\\n ]\\n }\\n },\\n \"localData\": {},\\n \"isEnabled\": true,\\n \"name\": null,\\n \"annotation\": null\\n },\\n {\\n \"id\": \"6af3740f-32ff-4337-8201-b772de830251\",\\n \"type\": \"Microsoft.DPrep.ParseDelimitedBlock\",\\n \"arguments\": {\\n \"columnHeadersMode\": 3,\\n \"fileEncoding\": 0,\\n \"handleQuotedLineBreaks\": false,\\n \"preview\": false,\\n \"separator\": \",\",\\n \"skipRows\": 0,\\n \"skipRowsMode\": 0\\n },\\n \"localData\": {},\\n \"isEnabled\": true,\\n \"name\": null,\\n \"annotation\": null\\n },\\n {\\n \"id\": \"f46f529c-0a0d-4a88-b398-734a71f42d48\",\\n \"type\": \"Microsoft.DPrep.DropColumnsBlock\",\\n \"arguments\": 
{\\n \"columns\": {\\n \"type\": 0,\\n \"details\": {\\n \"selectedColumns\": [\\n \"Path\"\\n ]\\n }\\n }\\n },\\n \"localData\": {},\\n \"isEnabled\": true,\\n \"name\": null,\\n \"annotation\": null\\n }\\n ],\\n \"inspectors\": []\\n }\\n ],\\n \"runConfigurations\": []\\n}'" 35 | }, 36 | "metadata": {} 37 | } 38 | ] 39 | }, 40 | { 41 | "metadata": { 42 | "trusted": true 43 | }, 44 | "cell_type": "code", 45 | "source": "# load Package from JSON string\npkg_loaded = dprep.Package.from_json(json_str)\ndf_loaded = pkg_loaded.dataflows[0]", 46 | "execution_count": 3, 47 | "outputs": [] 48 | }, 49 | { 50 | "metadata": { 51 | "trusted": true 52 | }, 53 | "cell_type": "code", 54 | "source": "", 55 | "execution_count": null, 56 | "outputs": [] 57 | } 58 | ], 59 | "metadata": { 60 | "kernelspec": { 61 | "name": "python36", 62 | "display_name": "Python 3.6", 63 | "language": "python" 64 | }, 65 | "language_info": { 66 | "mimetype": "text/x-python", 67 | "nbconvert_exporter": "python", 68 | "name": "python", 69 | "pygments_lexer": "ipython3", 70 | "version": "3.6.6", 71 | "file_extension": ".py", 72 | "codemirror_mode": { 73 | "version": 3, 74 | "name": "ipython" 75 | } 76 | } 77 | }, 78 | "nbformat": 4, 79 | "nbformat_minor": 2 80 | } -------------------------------------------------------------------------------- /data/secrets.dprep: -------------------------------------------------------------------------------- 1 | { 2 | "schemaVersion": 61, 3 | "id": "0ca59762-2256-45e6-b406-e58a4bb280b9", 4 | "activities": [ 5 | { 6 | "id": "b308e5b8-9b2a-47f8-9d32-0f542b4a34a4", 7 | "name": "read_csv_duplicate_headers", 8 | "blocks": [ 9 | { 10 | "id": "8d9ec228-6a4b-4abf-afb7-65f58dda1581", 11 | "type": "Microsoft.DPrep.GetFilesBlock", 12 | "arguments": { 13 | "path": { 14 | "target": 1, 15 | "resourceDetails": [ 16 | { 17 | "path": "https://dpreptestfiles.blob.core.windows.net/testfiles/read_csv_duplicate_headers.csv", 18 | "sas": { 19 | "id": 
"https://dpreptestfiles.blob.core.windows.net/testfiles/read_csv_duplicate_headers.csv", 20 | "secretType": "AzureMLSecret" 21 | }, 22 | "storageAccountName": null, 23 | "storageAccountKey": null 24 | } 25 | ] 26 | } 27 | }, 28 | "isEnabled": true, 29 | "name": null, 30 | "annotation": null 31 | }, 32 | { 33 | "id": "4ad0460f-ec65-47c0-a0a4-44345404a462", 34 | "type": "Microsoft.DPrep.ParseDelimitedBlock", 35 | "arguments": { 36 | "columnHeadersMode": 3, 37 | "fileEncoding": 0, 38 | "handleQuotedLineBreaks": false, 39 | "preview": false, 40 | "separator": ",", 41 | "skipRows": 0, 42 | "skipRowsMode": 0 43 | }, 44 | "isEnabled": true, 45 | "name": null, 46 | "annotation": null 47 | }, 48 | { 49 | "id": "1a3e11ba-5854-48da-aa47-53af61beb782", 50 | "type": "Microsoft.DPrep.DropColumnsBlock", 51 | "arguments": { 52 | "columns": { 53 | "type": 0, 54 | "details": { 55 | "selectedColumns": [ 56 | "Path" 57 | ] 58 | } 59 | } 60 | }, 61 | "isEnabled": true, 62 | "name": null, 63 | "annotation": null 64 | } 65 | ], 66 | "inspectors": [] 67 | }, 68 | { 69 | "id": "2d1fd227-0e7c-41de-9606-ca7eced82e07", 70 | "name": "population", 71 | "blocks": [ 72 | { 73 | "id": "27060820-095e-48d1-bdbd-511f7e369105", 74 | "type": "Microsoft.DPrep.GetFilesBlock", 75 | "arguments": { 76 | "path": { 77 | "target": 1, 78 | "resourceDetails": [ 79 | { 80 | "path": "https://dpreptestfiles.blob.core.windows.net/testfiles/population.csv", 81 | "sas": { 82 | "id": "https://dpreptestfiles.blob.core.windows.net/testfiles/population.csv", 83 | "secretType": "AzureMLSecret" 84 | }, 85 | "storageAccountName": null, 86 | "storageAccountKey": null 87 | } 88 | ] 89 | } 90 | }, 91 | "isEnabled": true, 92 | "name": null, 93 | "annotation": null 94 | }, 95 | { 96 | "id": "e7b2a399-9300-4fe5-8959-0d4ae9fc9172", 97 | "type": "Microsoft.DPrep.ParseDelimitedBlock", 98 | "arguments": { 99 | "columnHeadersMode": 3, 100 | "fileEncoding": 0, 101 | "handleQuotedLineBreaks": false, 102 | "preview": false, 103 | 
"separator": ",", 104 | "skipRows": 0, 105 | "skipRowsMode": 0 106 | }, 107 | "isEnabled": true, 108 | "name": null, 109 | "annotation": null 110 | }, 111 | { 112 | "id": "5572e00a-dd5e-41fe-b301-3e66d0f4c5e2", 113 | "type": "Microsoft.DPrep.DropColumnsBlock", 114 | "arguments": { 115 | "columns": { 116 | "type": 0, 117 | "details": { 118 | "selectedColumns": [ 119 | "Path" 120 | ] 121 | } 122 | } 123 | }, 124 | "isEnabled": true, 125 | "name": null, 126 | "annotation": null 127 | } 128 | ], 129 | "inspectors": [] 130 | }, 131 | { 132 | "id": "ec2c9cf9-beb9-4ebd-b4d2-8ba076c6a3db", 133 | "name": "top_films", 134 | "blocks": [ 135 | { 136 | "id": "6ac0814d-9e5b-4db5-8cc1-f11dc3db531d", 137 | "type": "Microsoft.DPrep.GetFilesBlock", 138 | "arguments": { 139 | "path": { 140 | "target": 1, 141 | "resourceDetails": [ 142 | { 143 | "path": "https://dpreptestfiles.blob.core.windows.net/testfiles/TopFilms.csv", 144 | "sas": { 145 | "id": "https://dpreptestfiles.blob.core.windows.net/testfiles/TopFilms.csv", 146 | "secretType": "AzureMLSecret" 147 | }, 148 | "storageAccountName": null, 149 | "storageAccountKey": null 150 | } 151 | ] 152 | } 153 | }, 154 | "isEnabled": true, 155 | "name": null, 156 | "annotation": null 157 | }, 158 | { 159 | "id": "0cd162d2-8395-4369-aa78-e431456c9201", 160 | "type": "Microsoft.DPrep.ParseDelimitedBlock", 161 | "arguments": { 162 | "columnHeadersMode": 3, 163 | "fileEncoding": 0, 164 | "handleQuotedLineBreaks": false, 165 | "preview": false, 166 | "separator": ",", 167 | "skipRows": 0, 168 | "skipRowsMode": 0 169 | }, 170 | "isEnabled": true, 171 | "name": null, 172 | "annotation": null 173 | }, 174 | { 175 | "id": "ceb32a6b-ba57-4c90-a4d0-5913c211961e", 176 | "type": "Microsoft.DPrep.DropColumnsBlock", 177 | "arguments": { 178 | "columns": { 179 | "type": 0, 180 | "details": { 181 | "selectedColumns": [ 182 | "Path" 183 | ] 184 | } 185 | } 186 | }, 187 | "isEnabled": true, 188 | "name": null, 189 | "annotation": null 190 | } 191 | ], 192 | 
"inspectors": [] 193 | } 194 | ], 195 | "runConfigurations": [] 196 | } -------------------------------------------------------------------------------- /data/crime0-10.dprep: -------------------------------------------------------------------------------- 1 | { 2 | "schemaVersion": 59, 3 | "id": "1ba93a7c-e711-464f-9a70-1c491e28a66f", 4 | "activities": [ 5 | { 6 | "id": "75637565-60ad-4baa-87d3-396a7930cfe7", 7 | "name": "crime0-10", 8 | "blocks": [ 9 | { 10 | "id": "ba5a8061-129e-4618-953a-ce3e89c8f2cb", 11 | "type": "Microsoft.DPrep.GetFilesBlock", 12 | "arguments": { 13 | "path": { 14 | "target": 0, 15 | "resourceDetails": [ 16 | { 17 | "path": "./crime0-10.csv" 18 | } 19 | ] 20 | } 21 | }, 22 | "isEnabled": true, 23 | "name": null, 24 | "annotation": null 25 | }, 26 | { 27 | "id": "1b345643-6b60-4ca1-99f9-2a64ae932a23", 28 | "type": "Microsoft.DPrep.ParseDelimitedBlock", 29 | "arguments": { 30 | "columnHeadersMode": 1, 31 | "fileEncoding": 0, 32 | "handleQuotedLineBreaks": false, 33 | "preview": false, 34 | "separator": ",", 35 | "skipRowsMode": 0 36 | }, 37 | "isEnabled": true, 38 | "name": null, 39 | "annotation": null 40 | }, 41 | { 42 | "id": "12cf73a2-1487-4915-bfa7-c86be7de08c0", 43 | "type": "Microsoft.DPrep.SetColumnTypesBlock", 44 | "arguments": { 45 | "columnConversion": [ 46 | { 47 | "column": { 48 | "type": 2, 49 | "details": { 50 | "selectedColumn": "ID" 51 | } 52 | }, 53 | "typeProperty": 3 54 | }, 55 | { 56 | "column": { 57 | "type": 2, 58 | "details": { 59 | "selectedColumn": "IUCR" 60 | } 61 | }, 62 | "typeProperty": 3 63 | }, 64 | { 65 | "column": { 66 | "type": 2, 67 | "details": { 68 | "selectedColumn": "Domestic" 69 | } 70 | }, 71 | "typeProperty": 1 72 | }, 73 | { 74 | "column": { 75 | "type": 2, 76 | "details": { 77 | "selectedColumn": "Beat" 78 | } 79 | }, 80 | "typeProperty": 3 81 | }, 82 | { 83 | "column": { 84 | "type": 2, 85 | "details": { 86 | "selectedColumn": "District" 87 | } 88 | }, 89 | "typeProperty": 3 90 | }, 91 | { 92 | 
"column": { 93 | "type": 2, 94 | "details": { 95 | "selectedColumn": "Ward" 96 | } 97 | }, 98 | "typeProperty": 3 99 | }, 100 | { 101 | "column": { 102 | "type": 2, 103 | "details": { 104 | "selectedColumn": "Community Area" 105 | } 106 | }, 107 | "typeProperty": 3 108 | }, 109 | { 110 | "column": { 111 | "type": 2, 112 | "details": { 113 | "selectedColumn": "Year" 114 | } 115 | }, 116 | "typeProperty": 3 117 | }, 118 | { 119 | "column": { 120 | "type": 2, 121 | "details": { 122 | "selectedColumn": "Longitude" 123 | } 124 | }, 125 | "typeProperty": 3 126 | }, 127 | { 128 | "column": { 129 | "type": 2, 130 | "details": { 131 | "selectedColumn": "Arrest" 132 | } 133 | }, 134 | "typeProperty": 1 135 | }, 136 | { 137 | "column": { 138 | "type": 2, 139 | "details": { 140 | "selectedColumn": "X Coordinate" 141 | } 142 | }, 143 | "typeProperty": 3 144 | }, 145 | { 146 | "column": { 147 | "type": 2, 148 | "details": { 149 | "selectedColumn": "Updated On" 150 | } 151 | }, 152 | "typeArguments": { 153 | "dateTimeFormats": [ 154 | "%m/%d/%Y %I:%M:%S %p" 155 | ] 156 | }, 157 | "typeProperty": 4 158 | }, 159 | { 160 | "column": { 161 | "type": 2, 162 | "details": { 163 | "selectedColumn": "Date" 164 | } 165 | }, 166 | "typeArguments": { 167 | "dateTimeFormats": [ 168 | "%m/%d/%Y %I:%M:%S %p" 169 | ] 170 | }, 171 | "typeProperty": 4 172 | }, 173 | { 174 | "column": { 175 | "type": 2, 176 | "details": { 177 | "selectedColumn": "Y Coordinate" 178 | } 179 | }, 180 | "typeProperty": 3 181 | }, 182 | { 183 | "column": { 184 | "type": 2, 185 | "details": { 186 | "selectedColumn": "Latitude" 187 | } 188 | }, 189 | "typeProperty": 3 190 | } 191 | ] 192 | }, 193 | "isEnabled": true, 194 | "name": null, 195 | "annotation": null 196 | }, 197 | { 198 | "id": "5f370fdf-2fde-4f18-8069-93ef5800bf0c", 199 | "type": "Microsoft.DPrep.SampleBlock", 200 | "arguments": { 201 | "activeSample": "0afde520-3a41-4fef-8d20-eaa07d588924", 202 | "samples": [ 203 | { 204 | "allowAutoGen": true, 205 | 
"isDisabled": false, 206 | "sampleId": "0afde520-3a41-4fef-8d20-eaa07d588924", 207 | "sampleName": "Top 10000", 208 | "sampleRevision": "d8663336-152a-462f-bb57-686dc7a0843c", 209 | "sampleRunner": { 210 | "id": null, 211 | "type": 0 212 | }, 213 | "sampleStrategy": 0, 214 | "topArguments": { 215 | "sampleCount": 10000 216 | } 217 | } 218 | ] 219 | }, 220 | "isEnabled": true, 221 | "name": null, 222 | "annotation": null 223 | }, 224 | { 225 | "id": "dfd62543-9285-412b-a930-0aeaaffde699", 226 | "type": "Microsoft.DPrep.HandlePathColumnBlock", 227 | "arguments": { 228 | "pathColumnOperation": 0 229 | }, 230 | "isEnabled": true, 231 | "name": null, 232 | "annotation": null 233 | } 234 | ], 235 | "inspectors": [] 236 | } 237 | ], 238 | "runConfigurations": [] 239 | } -------------------------------------------------------------------------------- /quantile-transformation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "metadata": {}, 5 | "cell_type": "markdown", 6 | "source": "# Quantile Transformation\nCopyright (c) Microsoft Corporation. All rights reserved.\nLicensed under the MIT License.\n\nDataPrep has the ability to perform quantile transformation on a numeric column. This transformation can transform the data into a normal or uniform distribution. Values bigger than the learnt boundaries will simply be clipped to the learnt boundaries when applying quantile transformation.\n\nLet's load a sample of the median income of California households in different suburbs from the 1990 census data. From the data profile, we can see that the minimum and maximum values are 0.9946 and 15, respectively."
7 | }, 8 | { 9 | "metadata": { 10 | "trusted": true 11 | }, 12 | "cell_type": "code", 13 | "source": "!pip install azureml", 14 | "execution_count": 1, 15 | "outputs": [ 16 | { 17 | "output_type": "stream", 18 | "text": "Requirement already satisfied: azureml in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (0.2.7)\nRequirement already satisfied: python-dateutil in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml) (2.7.3)\nRequirement already satisfied: requests in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml) (2.19.1)\nRequirement already satisfied: pandas in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml) (0.22.0)\nRequirement already satisfied: six>=1.5 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from python-dateutil->azureml) (1.11.0)\nRequirement already satisfied: chardet<3.1.0,>=3.0.2 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (3.0.4)\nRequirement already satisfied: certifi>=2017.4.17 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (2018.8.24)\nRequirement already satisfied: urllib3<1.24,>=1.21.1 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (1.23)\nRequirement already satisfied: idna<2.8,>=2.5 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (2.7)\nRequirement already satisfied: pytz>=2011k in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from pandas->azureml) (2018.5)\nRequirement already satisfied: numpy>=1.9.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from pandas->azureml) (1.14.5)\n", 19 | "name": "stdout" 20 | } 21 | ] 22 | }, 23 | { 24 | "metadata": { 25 | "trusted": true 26 | }, 27 | "cell_type": "code", 28 | "source": "import azureml.dataprep as dprep\n\ndf = dprep.read_csv(path='./data/median_income.csv').set_column_types(type_conversions={\n 'median_income': 
dprep.TypeConverter(dprep.FieldType.DECIMAL)\n})\ndf.get_profile()", 29 | "execution_count": 2, 30 | "outputs": [ 31 | { 32 | "output_type": "execute_result", 33 | "execution_count": 2, 34 | "data": { 35 | "text/html": "\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
TypeMinMaxCountMissing CountError CountLower QuartileMedianUpper QuartileStandard DeviationMean
median_incomeFieldType.DECIMAL0.994615.0250.00.00.02.69073.63074.773352.0266794.007843
", 36 | "text/plain": "ColumnProfile\n name: median_income\n type: FieldType.DECIMAL\n\n min: 0.9946\n max: 15.0\n count: 250.0\n missing_count: 0.0\n error_count: 0.0\n\n lower_quartile: 2.6907\n median: 3.6307\n upper_quartile: 4.773350000000001\n std: 2.026679472255346\n mean: 4.007842799999996" 37 | }, 38 | "metadata": {} 39 | } 40 | ] 41 | }, 42 | { 43 | "metadata": {}, 44 | "cell_type": "markdown", 45 | "source": "Let's now apply quantile transformation to `median_income` and see how that affects the data. We will apply quantile transformation twice, one that maps the data to a Uniform(0, 1) distribution, one that maps it to a Normal(0, 1) distribution.\n\nFrom the data profile, we can see that the min and max of the uniform median income is strictly between 0 and 1 and the mean and standard deviation of the normal median income is close to 1 and 0 respectively.\n\n*note: for normal distribution, we will clip the values at the ends as the 0th percentile and the 100th percentile are -Inf and Inf respectively.*" 46 | }, 47 | { 48 | "metadata": { 49 | "trusted": true 50 | }, 51 | "cell_type": "code", 52 | "source": "df = df.quantile_transform(source_column='median_income', new_column='median_income_uniform', quantiles_count=5)\ndf = df.quantile_transform(source_column='median_income', new_column='median_income_normal', \n quantiles_count=5, output_distribution=\"Normal\")\ndf.get_profile()", 53 | "execution_count": 3, 54 | "outputs": [ 55 | { 56 | "output_type": "execute_result", 57 | "execution_count": 3, 58 | "data": { 59 | "text/html": "\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
TypeMinMaxCountMissing CountError CountLower QuartileMedianUpper QuartileStandard DeviationMean
median_incomeFieldType.DECIMAL0.99460015.000000250.00.00.02.6907003.6307004.7733502.0266794.007843
median_income_normalFieldType.DECIMAL-7.9413457.941444250.00.00.0-0.671590-0.0003370.6678101.021506-0.060922
median_income_uniformFieldType.DECIMAL0.0000001.000000250.00.00.00.2509340.4998660.7478610.2528300.484762
", 60 | "text/plain": "ColumnProfile\n name: median_income\n type: FieldType.DECIMAL\n\n min: 0.9946\n max: 15.0\n count: 250.0\n missing_count: 0.0\n error_count: 0.0\n\n lower_quartile: 2.6907\n median: 3.6307\n upper_quartile: 4.773350000000001\n std: 2.026679472255346\n mean: 4.007842799999996\n\nColumnProfile\n name: median_income_normal\n type: FieldType.DECIMAL\n\n min: -7.941345326170997\n max: 7.94144448741598\n count: 250.0\n missing_count: 0.0\n error_count: 0.0\n\n lower_quartile: -0.6715898847385642\n median: -0.00033696356609359737\n upper_quartile: 0.6678101623094225\n std: 1.021505801777812\n mean: -0.06092218967843191\n\nColumnProfile\n name: median_income_uniform\n type: FieldType.DECIMAL\n\n min: 0.0\n max: 1.0\n count: 250.0\n missing_count: 0.0\n error_count: 0.0\n\n lower_quartile: 0.25093366375866033\n median: 0.4998655717951272\n upper_quartile: 0.7478610044020887\n std: 0.25283034846216024\n mean: 0.4847624122367444" 61 | }, 62 | "metadata": {} 63 | } 64 | ] 65 | }, 66 | { 67 | "metadata": {}, 68 | "cell_type": "markdown", 69 | "source": "Let's now save the dataflow which we will later load in the operationalization notebook." 
70 | }, 71 | { 72 | "metadata": { 73 | "trusted": true 74 | }, 75 | "cell_type": "code", 76 | "source": "from tempfile import mkdtemp\nfrom os import path\n\ntmp_dir = mkdtemp()\npackage_path = path.join(tmp_dir, 'quantile_transform.dprep')\npackage = dprep.Package(arg=df)\npackage.save(package_path)\nprint('Package saved to: \"{}\"'.format(package_path))", 77 | "execution_count": 3, 78 | "outputs": [ 79 | { 80 | "output_type": "stream", 81 | "text": "Package saved to: \"/tmp/tmp29cvg68a/quantile_transform.dprep\"\n", 82 | "name": "stdout" 83 | } 84 | ] 85 | }, 86 | { 87 | "metadata": { 88 | "trusted": true 89 | }, 90 | "cell_type": "code", 91 | "source": "", 92 | "execution_count": null, 93 | "outputs": [] 94 | } 95 | ], 96 | "metadata": { 97 | "kernelspec": { 98 | "name": "python36", 99 | "display_name": "Python 3.6", 100 | "language": "python" 101 | }, 102 | "language_info": { 103 | "mimetype": "text/x-python", 104 | "nbconvert_exporter": "python", 105 | "name": "python", 106 | "pygments_lexer": "ipython3", 107 | "version": "3.6.6", 108 | "file_extension": ".py", 109 | "codemirror_mode": { 110 | "version": 3, 111 | "name": "ipython" 112 | } 113 | } 114 | }, 115 | "nbformat": 4, 116 | "nbformat_minor": 2 117 | } -------------------------------------------------------------------------------- /read-pandas-dataframe.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "metadata": { 5 | "trusted": true 6 | }, 7 | "cell_type": "code", 8 | "source": "!pip install azureml", 9 | "execution_count": 1, 10 | "outputs": [ 11 | { 12 | "output_type": "stream", 13 | "text": "Requirement already satisfied: azureml in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (0.2.7)\nRequirement already satisfied: pandas in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml) (0.22.0)\nRequirement already satisfied: requests in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml) 
(2.19.1)\nRequirement already satisfied: python-dateutil in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml) (2.7.3)\nRequirement already satisfied: pytz>=2011k in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from pandas->azureml) (2018.5)\nRequirement already satisfied: numpy>=1.9.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from pandas->azureml) (1.14.5)\nRequirement already satisfied: certifi>=2017.4.17 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (2018.8.24)\nRequirement already satisfied: idna<2.8,>=2.5 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (2.7)\nRequirement already satisfied: urllib3<1.24,>=1.21.1 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (1.23)\nRequirement already satisfied: chardet<3.1.0,>=3.0.2 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (3.0.4)\nRequirement already satisfied: six>=1.5 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from python-dateutil->azureml) (1.11.0)\n", 14 | "name": "stdout" 15 | } 16 | ] 17 | }, 18 | { 19 | "metadata": { 20 | "trusted": true 21 | }, 22 | "cell_type": "code", 23 | "source": "import azureml.dataprep as dprep", 24 | "execution_count": 2, 25 | "outputs": [] 26 | }, 27 | { 28 | "metadata": { 29 | "trusted": true 30 | }, 31 | "cell_type": "code", 32 | "source": "dflow = dprep.read_excel(path='./data/excel.xlsx')\ndflow = dflow.drop_columns(columns=['Column1'])\ndf = dflow.to_pandas_dataframe()\ndf.head(10)", 33 | "execution_count": 3, 34 | "outputs": [ 35 | { 36 | "output_type": "execute_result", 37 | "execution_count": 3, 38 | "data": { 39 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
Column2Column3Column4Column5Column6Column7Column8
0Iron, IVB6e+07Found1920http://www.lpi.usra.edu/meteor/metbull.php?cod...-19.583317.9167
1Iron, IIIAB5.82e+07Found1818http://www.lpi.usra.edu/meteor/metbull.php?cod...76.1333-64.9333
2Iron, IAB-MG5e+07Found1576http://www.lpi.usra.edu/meteor/metbull.php?cod...-27.4667-60.5833
3Iron, IAB-MG3e+07Found1891http://www.lpi.usra.edu/meteor/metbull.php?cod...35.05-111.033
4Iron, IIIE2.8e+07Found1898http://www.lpi.usra.edu/meteor/metbull.php?cod...4788
5Iron, IVA2.6e+07Found1836http://www.lpi.usra.edu/meteor/metbull.php?cod...-25.518
6Iron, IIIAB2.43e+07Found1852http://www.lpi.usra.edu/meteor/metbull.php?cod...27-105.1
7Iron, IAB-ung2.4e+07Found1911http://www.lpi.usra.edu/meteor/metbull.php?cod...-30.7833127.55
8Iron, IIAB2.3e+07Fell1947http://www.lpi.usra.edu/meteor/metbull.php?cod...46.16134.653
9Iron, ungrouped2.2e+07Found1863http://www.lpi.usra.edu/meteor/metbull.php?cod...26.2-107.833
\n
", 40 | "text/plain": " Column2 Column3 Column4 Column5 \\\n0 Iron, IVB 6e+07 Found 1920 \n1 Iron, IIIAB 5.82e+07 Found 1818 \n2 Iron, IAB-MG 5e+07 Found 1576 \n3 Iron, IAB-MG 3e+07 Found 1891 \n4 Iron, IIIE 2.8e+07 Found 1898 \n5 Iron, IVA 2.6e+07 Found 1836 \n6 Iron, IIIAB 2.43e+07 Found 1852 \n7 Iron, IAB-ung 2.4e+07 Found 1911 \n8 Iron, IIAB 2.3e+07 Fell 1947 \n9 Iron, ungrouped 2.2e+07 Found 1863 \n\n Column6 Column7 Column8 \n0 http://www.lpi.usra.edu/meteor/metbull.php?cod... -19.5833 17.9167 \n1 http://www.lpi.usra.edu/meteor/metbull.php?cod... 76.1333 -64.9333 \n2 http://www.lpi.usra.edu/meteor/metbull.php?cod... -27.4667 -60.5833 \n3 http://www.lpi.usra.edu/meteor/metbull.php?cod... 35.05 -111.033 \n4 http://www.lpi.usra.edu/meteor/metbull.php?cod... 47 88 \n5 http://www.lpi.usra.edu/meteor/metbull.php?cod... -25.5 18 \n6 http://www.lpi.usra.edu/meteor/metbull.php?cod... 27 -105.1 \n7 http://www.lpi.usra.edu/meteor/metbull.php?cod... -30.7833 127.55 \n8 http://www.lpi.usra.edu/meteor/metbull.php?cod... 46.16 134.653 \n9 http://www.lpi.usra.edu/meteor/metbull.php?cod... 26.2 -107.833 " 41 | }, 42 | "metadata": {} 43 | } 44 | ] 45 | }, 46 | { 47 | "metadata": {}, 48 | "cell_type": "markdown", 49 | "source": "## read_pandas_dataframe\n\nThere are situations where you may already have some data in the form of a pandas DataFrame.\nThe steps taken to get to this DataFrame may be non-trivial or not easy to convert to dprep operations. The 'read_pandas_dataframe' reader can take a DataFrame and use it as the datasource for a Dataflow.\nIt is also required to pass in a path to a directory (that exists) where DataPrep can store the contents of the DataFrame. The files written to this directory will be named 'part-00000' and so on, they are written out as DataPrep's internal row based file format." 
50 | }, 51 | { 52 | "metadata": { 53 | "trusted": true 54 | }, 55 | "cell_type": "code", 56 | "source": "import shutil\ncache_dir = 'df_dflow'\nshutil.rmtree(cache_dir, ignore_errors=True)\ndf_dflow = dprep.read_pandas_dataframe(df, cache_dir)", 57 | "execution_count": 5, 58 | "outputs": [ 59 | { 60 | "output_type": "error", 61 | "ename": "AttributeError", 62 | "evalue": "module 'azureml.dataprep' has no attribute 'read_pandas_dataframe'", 63 | "traceback": [ 64 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 65 | "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", 66 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mcache_dir\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'df_dflow'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mshutil\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrmtree\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcache_dir\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mignore_errors\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mdf_dflow\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdprep\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_pandas_dataframe\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcache_dir\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 67 | "\u001b[0;31mAttributeError\u001b[0m: module 'azureml.dataprep' has no attribute 'read_pandas_dataframe'" 68 | ] 69 | } 70 | ] 71 | }, 72 | { 73 | "metadata": { 74 | "trusted": true 75 | }, 76 | "cell_type": "code", 77 | "source": "df_dflow.head(10)", 78 | "execution_count": null, 79 | "outputs": [] 80 | }, 81 | { 82 | "metadata": { 83 | "trusted": true 84 | }, 85 | "cell_type": "code", 86 | "source": "", 87 | "execution_count": null, 88 | "outputs": [] 89 | } 90 | ], 91 | "metadata": { 92 | "execute_as_test": 
false, 93 | "kernelspec": { 94 | "name": "python36", 95 | "display_name": "Python 3.6", 96 | "language": "python" 97 | }, 98 | "language_info": { 99 | "mimetype": "text/x-python", 100 | "nbconvert_exporter": "python", 101 | "name": "python", 102 | "pygments_lexer": "ipython3", 103 | "version": "3.6.6", 104 | "file_extension": ".py", 105 | "codemirror_mode": { 106 | "version": 3, 107 | "name": "ipython" 108 | } 109 | } 110 | }, 111 | "nbformat": 4, 112 | "nbformat_minor": 2 113 | } -------------------------------------------------------------------------------- /data/median_income_transformed.csv: -------------------------------------------------------------------------------- 1 | median_income,median_income_uniform,median_income_normal 2 | 4.4896,0.688927015969381,0.4928112398942898 3 | 2.1029,0.16242159563866576,-0.9845540061415601 4 | 2.3889,0.20433495515563627,-0.8262365643809355 5 | 3.707,0.5167832475474021,0.04208177981426486 6 | 6.4788,0.7918154943685715,0.8127367410844715 7 | 4.4074,0.6708459812590735,0.44225037403262457 8 | 5.2907,0.7627885954411082,0.71530142355699 9 | 1.5156,0.07635265842077493,-1.430040774223022 10 | 8.4411,0.8397571522806675,0.9934602851577232 11 | 4.4085,0.6710879415775812,0.44291928632820865 12 | 2.1439,0.16843015417081886,-0.960387338010023 13 | 2.8971,0.3028380993334766,-0.5162551753422808 14 | 6.1008,0.7825804402531089,0.780937730437047 15 | 3.5258,0.47180713824983866,-0.07072794893657437 16 | 2.7694,0.2685175231133089,-0.6173027607303029 17 | 2.2356,0.18186880825370766,-0.9082661605741446 18 | 1.9509,0.14014596400726886,-1.0796637810446446 19 | 4.0905,0.6011394131362455,0.25629744014575906 20 | 3.6726,0.5092164884958867,0.023104366055624576 21 | 3.1696,0.3760750376263169,-0.31580558952782856 22 | 2.5389,0.2263174863708306,-0.7510293827903433 23 | 3.0319,0.3390668673403568,-0.4150111583096138 24 | 4.6779,0.7303462232193919,0.61386044030116 25 | 2.9076,0.3056600731025585,-0.5081899386067169 26 | 
2.8616,0.29329714039991395,-0.5437779548183906 27 | 1.4722,0.0699923793891787,-1.475847787309027 28 | 5.6413,0.7713542302900003,0.7433140959507171 29 | 2.1167,0.1644439885104636,-0.9763562034967139 30 | 4.7308,0.7419823149003563,0.6494688564486208 31 | 4.8173,0.7512227895726955,0.6783427156995535 32 | 2.3438,0.19772554077026783,-0.8497733992585795 33 | 1.7333,0.10825663872442697,-1.235852818864947 34 | 1.4429,0.06569845829181076,-1.5086161030849279 35 | 2.3253,0.19501436192039387,-0.8595652745311925 36 | 2.4022,0.20628407292338352,-0.8193826233304142 37 | 3.4048,0.4392872500537518,-0.15277653846062797 38 | 6.6073,0.7949549241406269,0.8237349930522744 39 | 4.1080000000000005,0.6049887818397783,0.2662814785234653 40 | 4.2829,0.6434604724825127,0.3677239840415221 41 | 1.5727,0.0847206753033589,-1.3740010688161062 42 | 2.5211,0.22370889266662755,-0.7597270013437459 43 | 4.2679,0.6401610135937705,0.35888920867930674 44 | 4.7328,0.7424222427521885,0.6508311116191982 45 | 4.7069,0.7367251770709602,0.6332817852325006 46 | 2.465,0.21548742599214485,-0.7875245686101101 47 | 5.0267,0.7563387163763406,0.6945735910123612 48 | 2.8043,0.277897226402924,-0.5890996132954226 49 | 2.4053,0.20673837856849753,-0.8177906162820535 50 | 1.2176,0.03268069640658888,-1.8427782558151542 51 | 2.39,0.2044961603845477,-0.8256682268514074 52 | 3.6364,0.501253794377722,0.0031428016114447123 53 | 6.0162,0.780513547189172,0.7739287859106961 54 | 2.8088,0.2791066437325306,-0.5854974399696314 55 | 3.3984,0.4375671898516448,-0.15714017005326586 56 | 4.5,0.6912146407989088,0.4992962189785684 57 | 3.9079,0.5609740002639567,0.1534391166508579 58 | 4.9618,0.7547531211062519,0.6895237101278 59 | 2.9344,0.3128628251988819,-0.4877518079508566 60 | 2.4283,0.21010903335482733,-0.8060429810797759 61 | 3.7388,0.5237781003915357,0.05963819257139614 62 | 1.6021,0.089029251421537,-1.346757015256811 63 | 2.3352,0.1964652089805967,-0.8543151087700283 64 | 4.0982,0.6028331353658,0.26068721102120757 65 | 
1.9531,0.1404683744650917,-1.078217399304828 66 | 3.2386,0.39461943668028376,-0.26729910818310126 67 | 5.1169,0.7585424250568029,0.7016216442421952 68 | 4.692,0.7334477145748096,0.6232738623769376 69 | 4.0,0.5812326778408341,0.20504797284322906 70 | 6.4238,0.7904717695634116,0.8080592738489488 71 | 3.7375,0.5234921472878448,0.05892015392758171 72 | 2.8233,0.28300365512792947,-0.5739416160725967 73 | 2.8009,0.2769834444205546,-0.5918263314054893 74 | 3.767,0.5299810831023711,0.07522231004048434 75 | 3.6761,0.5099863622365932,0.025034712726592745 76 | 5.0282,0.7563753634164814,0.6946905154674345 77 | 3.5296,0.47282842399483976,-0.06816178418735802 78 | 5.215,0.7609391414820063,0.70932677431432 79 | 4.0125,0.583982226914786,0.21209163422132077 80 | 9.4667,0.8648139551928855,1.1022060793492992 81 | 5.9062,0.7778260975788522,0.7648719390554228 82 | 3.9864,0.5782411684483745,0.19739599549652825 83 | 2.0734,0.15809836449967754,-1.0023041289181829 84 | 2.875,0.29689851644807563,-0.5333417486766315 85 | 3.3611,0.4275424639862395,-0.1826343528680402 86 | 2.8214,0.2824930122554289,-0.5754514381784691 87 | 0.9946,9.999999977795539e-08,-5.199337582605575 88 | 4.5446,0.7010250318947692,0.5273508961812362 89 | 4.6908,0.7331837578637103,0.6224705781821348 90 | 9.3198,0.861224988395104,1.085839464021226 91 | 1.2826,0.04220645993317308,-1.725635992615424 92 | 2.4943,0.2197813470895128,-0.7729318836756727 93 | 10.1882,0.8824411815005742,1.1872788772739642 94 | 4.6731,0.7292903963749944,0.6106682834354926 95 | 4.375,0.6637191500593902,0.42263484449515154 96 | 2.8173,0.281391098688454,-0.578713956288219 97 | 2.0903,0.16057506301658947,-0.9920971718209968 98 | 2.725,0.2565846054611911,-0.6539109002088199 99 | 2.8547,0.29144270049451715,-0.5491749392049887 100 | 2.25,0.18397913125036633,-0.9003044332369591 101 | 1.9444,0.13919338765461042,-1.0839504359991745 102 | 1.7167,0.10582390526994544,-1.2490472005138797 103 | 1.9342,0.13769857553197723,-1.0907176288996137 104 | 
4.9524,0.7545234663213701,0.6887937530976251 105 | 3.65,0.5042453037701816,0.010641599310222728 106 | 3.0856,0.35349924747366146,-0.37589024233948426 107 | 3.2396,0.394888196086863,-0.26660099147199706 108 | 2.9324,0.3123253063857234,-0.48926992156445664 109 | 3.495,0.46352934852719846,-0.09154607537767347 110 | 1.9818,0.14467436543759887,-1.0595514059995472 111 | 4.6964,0.7344155558488406,0.6262226873574688 112 | 3.925,0.5647353833971228,0.16298628506764115 113 | 3.625,0.4984680713824984,-0.003839985024398658 114 | 2.9688,0.3221081487852074,-0.4618117868349911 115 | 4.0417,0.5904051735515374,0.22858735593517016 116 | 9.7956,0.8728494295277418,1.1399643807904891 117 | 3.8732,0.5533412520346663,0.13410759250347515 118 | 2.6998,0.24989741485432906,-0.6748126069650576 119 | 2.006,0.1482208804736502,-1.0440943039822579 120 | 4.25,0.6362236593198715,0.34838284885436693 121 | 3.1839999999999997,0.3799451730810577,-0.3056247863400478 122 | 5.9658,0.7792822066404437,0.7697712668826051 123 | 2.628,0.23937510991265604,-0.7083141049607123 124 | 2.5057,0.2214520194618676,-0.7672985355850374 125 | 5.155,0.7594732598763774,0.7046091860898294 126 | 4.6,0.7132110333905237,0.5627899373325242 127 | 4.6681,0.7281905767454135,0.6073497232884044 128 | 5.5942,0.7702035132295815,0.7395172425174374 129 | 5.1104,0.7583836212161931,0.7011125840899696 130 | 3.0759,0.35089228122984295,-0.38291260834816176 131 | 3.5757,0.4852182326381423,-0.037060878177008386 132 | 3.6845,0.5118340592142888,0.02966793907608783 133 | 6.4667,0.7915198749114363,0.8117061751243242 134 | 5.273,0.7623561603674476,0.7139021637992619 135 | 3.0635,0.3475596645882605,-0.3919172875113563 136 | 11.2866,0.9092765874276221,1.3363135019854007 137 | 4.0444,0.5909990761515111,0.23011572277987793 138 | 5.2541,0.7618944076616745,0.7124095804237914 139 | 5.5791,0.7698345996921648,0.7383022482134728 140 | 4.5375,0.6994632880207644,0.5228574972480653 141 | 9.8144,0.8733087390975055,1.1421720249180776 142 | 
6.7257,0.7978475971757347,0.8339577268169219 143 | 4.1442,0.6129514759579427,0.28701994562086086 144 | 4.0313,0.5881175487220095,0.22270526607462063 145 | 2.2791,0.18824374230611404,-0.88438672867467 146 | 4.1679,0.6181646210021556,0.3006639542199363 147 | 3.2852,0.40714362502687595,-0.23489883984002982 148 | 3.2768,0.4048860460116104,-0.24072005796234255 149 | 5.021,0.7561994576238059,0.6941293646505097 150 | 4.875,0.7526324790501087,0.682797132481279 151 | 4.419,0.6733975627997006,0.44931439580522814 152 | 3.3272,0.41843152010320356,-0.20590766210173994 153 | 4.2386,0.6337160705644274,0.341711709817628 154 | 1.245,0.03669617210856439,-1.7903831065617093 155 | 5.152,0.7593999657960959,0.704373718687922 156 | 4.8125,0.7511055190442452,0.6779727645490474 157 | 2.1638,0.1713465033120347,-0.9488576690059357 158 | 7.1621,0.8085094427206763,0.872416661000832 159 | 1.5372,0.0795181429157629,-1.408320169672634 160 | 10.0481,0.8790183479514304,1.1700935990858978 161 | 3.3869,0.43447645667598356,-0.16498865287495923 162 | 5.4591,0.7669028364809068,0.7286850708477646 163 | 4.4318,0.6762131010514274,0.4571353017049396 164 | 6.5044,0.7924409371869732,0.8149199625504349 165 | 4.2865,0.644252342615811,0.3698485817763834 166 | 3.0461,0.3428832509137819,-0.40460687823111086 167 | 11.3283,0.9102953751435343,1.3425761736583905 168 | 2.7026,0.25056439475381626,-0.6727147380915197 169 | 3.016,0.33479359277574705,-0.4267146406076349 170 | 3.0943,0.35583745431090086,-0.3696075717076287 171 | 3.225,0.39096430875080623,-0.2768065942137824 172 | 6.187,0.7846864234931958,0.7881189136255171 173 | 3.8158,0.5407153226870792,0.10223599868489697 174 | 3.0147,0.33444420554719406,-0.42767409705501475 175 | 15.0,0.9999999000000003,5.19933758270342 176 | 3.1364,0.36715222532788644,-0.33940526790689773 177 | 2.9,0.30361750161255635,-0.5140242974144523 178 | 5.5941,0.7702010700935721,0.739509192607556 179 | 3.4028,0.4387497312405934,-0.15413985695864643 180 | 
6.0062,0.7802692335882339,0.7731028191405914 181 | 8.3792,0.8382448510908602,0.9872699911993058 182 | 3.8036,0.5380317627909023,0.09547634997560216 183 | 2.0926,0.1609121284952224,-0.9907160390730209 184 | 6.7703,0.798937235835919,0.837831173103705 185 | 4.2569,0.6377414104086929,0.3524281695696263 186 | 4.744,0.7448858387224493,0.6584822133838069 187 | 9.7037,0.87060418753512,1.1292518063856445 188 | 5.1292,0.7588429307859569,0.7025854406375655 189 | 2.3148,0.19347558473533027,-0.8651596484776254 190 | 3.3021,0.41168565899806486,-0.22321096549396113 191 | 1.95,0.1400140688199777,-1.0802561341135346 192 | 3.025,0.33721242743496016,-0.42008295383958816 193 | 2.6523,0.2429362799695175,-0.6968885244090585 194 | 1.2188,0.032856556656310446,-1.8403755990446928 195 | 5.827999999999999,0.7759155652195158,0.7584713308686281 196 | 3.1587,0.3731455600946033,-0.3235336598161438 197 | 2.45,0.21328917287062546,-0.7950604390156254 198 | 2.3851,0.20377806436485135,-0.8282019693528496 199 | 2.1221,0.16523535963421065,-0.9731661745087966 200 | 3.5313,0.47328531498602444,-0.06701390942662634 201 | 3.4821,0.4600623521823264,-0.10027663775339715 202 | 7.8252,0.8247098775988859,0.9334643965884332 203 | 5.1878,0.7602746084874545,0.7071861842977014 204 | 3.7459,0.5253398442655404,0.06356034060479637 205 | 6.0097,0.7803547433485621,0.773391847490294 206 | 2.3194,0.19414971569259623,-0.862705456837268 207 | 4.2061,0.6265672429721525,0.32277517627250846 208 | 2.267,0.18647048478808834,-0.8909780060851004 209 | 2.2109,0.1782490181136057,-0.9220585498888423 210 | 2.7589,0.265695549344227,-0.6258838950234518 211 | 2.6553,0.2433759305938214,-0.6954842771160817 212 | 6.3325,0.7882411863868461,0.8003334507140982 213 | 5.7233,0.7733576018176932,0.7499500376839813 214 | 4.337,0.6553605208745764,0.3998337668630456 215 | 3.9667,0.5739078791078263,0.18633222024962756 216 | 5.8623,0.7767535608707337,0.7612749184789059 217 | 1.6806,0.1005334427574887,-1.2785178769957448 218 | 
3.5851,0.48774457105998714,-0.03072463804138443 219 | 2.9716,0.32286067512362926,-0.45971423541894674 220 | 3.9,0.5592362852492191,0.14903320783722215 221 | 2.7431,0.26144915072027514,-0.6388841474573944 222 | 3.3621,0.4278112233928187,-0.18194938611936773 223 | 1.9464,0.13948648807081301,-1.0826293461973695 224 | 7.3518,0.8131440717304732,0.8895420092252918 225 | 4.775,0.750189343040727,0.6750857069958222 226 | 3.5968,0.4908890561169641,-0.022839735128387998 227 | 6.221,0.7855170897363856,0.7909625929859541 228 | 10.0968,0.8802081551879993,1.1760279942069634 229 | 1.9483,0.1397649334662055,-1.0813760586453744 230 | 2.0469,0.15421478398499322,-1.0185228067876306 231 | 3.725,0.5207425982138929,0.05201743189676979 232 | 3.675,0.5097444019180853,0.02442802263182327 233 | 1.8529,0.1257840436133419,-1.1465489456932518 234 | 1.7159,0.10570666510346441,-1.2496885811666654 235 | 1.7386,0.10903335482736382,-1.2316851754273808 236 | 3.6687,0.5083586291848137,0.020953509422343072 237 | 3.4671,0.45603096108363783,-0.11043812057602183 238 | 4.8233,0.7513693777332584,0.6788052852389578 239 | 4.3036,0.6480137257489771,0.3799634475571365 240 | 1.6488,0.0958731461398675,-1.3054305179573955 241 | 2.9453,0.3157923027305955,-0.47949769802476955 242 | 5.0096,0.7559209401187363,0.6932413225605799 243 | 3.175,0.3775263384218447,-0.31198400815546146 244 | 4.2031,0.625907351194404,0.32103311272888746 245 | 3.1667,0.3752956353472371,-0.3178598218476214 246 | 5.7204,0.7732867508734211,0.7497147894780406 247 | 3.375,0.43127821973769076,-0.17312084617136517 248 | 6.5483,0.7935134738950917,0.818672917584347 249 | 4.2206,0.6297567198979367,0.33120908249828585 250 | 2.6631,0.2445190222170115,-0.6918396326662699 251 | 3.5363,0.4746291120189207,-0.06363831319524592 252 | -------------------------------------------------------------------------------- /secrets.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "metadata": {}, 5 | 
"cell_type": "markdown", 6 | "source": "Currently, secrets are only persisted for the lifetime of the engine process and they are not part of the dprep file. If you started a new session (hence start an engine process) and load a package and try to run a dataflow within that package, you will need to call `use_secrets` to register the required secrets to use during execution, otherwise the execution will fail as the required secrets are not available.\n\nIn this notebook, we will:\n1. Loading a previously saved package\n2. Call `get_missing_secrets` to determine the missing secrets\n3. Call `use_secrets` and pass in the missing secrets to register it with the engine for this session\n4. Call `head` to see the a preview of the data" 7 | }, 8 | { 9 | "metadata": {}, 10 | "cell_type": "markdown", 11 | "source": "# Providing Secrets\nCopyright (c) Microsoft Corporation. All rights reserved.
\nLicensed under the MIT License." 12 | }, 13 | { 14 | "metadata": { 15 | "trusted": true 16 | }, 17 | "cell_type": "code", 18 | "source": "!pip install azureml", 19 | "execution_count": 1, 20 | "outputs": [ 21 | { 22 | "output_type": "stream", 23 | "text": "Requirement already satisfied: azureml in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (0.2.7)\nRequirement already satisfied: python-dateutil in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml) (2.7.3)\nRequirement already satisfied: pandas in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml) (0.22.0)\nRequirement already satisfied: requests in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml) (2.19.1)\nRequirement already satisfied: six>=1.5 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from python-dateutil->azureml) (1.11.0)\nRequirement already satisfied: pytz>=2011k in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from pandas->azureml) (2018.5)\nRequirement already satisfied: numpy>=1.9.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from pandas->azureml) (1.14.5)\nRequirement already satisfied: urllib3<1.24,>=1.21.1 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (1.23)\nRequirement already satisfied: certifi>=2017.4.17 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (2018.8.24)\nRequirement already satisfied: idna<2.8,>=2.5 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (2.7)\nRequirement already satisfied: chardet<3.1.0,>=3.0.2 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (3.0.4)\n", 24 | "name": "stdout" 25 | } 26 | ] 27 | }, 28 | { 29 | "metadata": { 30 | "trusted": true 31 | }, 32 | "cell_type": "code", 33 | "source": "import azureml.dataprep as dprep\nimport os", 34 | "execution_count": 2, 35 | "outputs": [] 36 | }, 37 | { 38 | "metadata": {}, 39 | 
"cell_type": "markdown", 40 | "source": "Let's load the previously saved package." 41 | }, 42 | { 43 | "metadata": { 44 | "trusted": true 45 | }, 46 | "cell_type": "code", 47 | "source": "package = dprep.Package.open(file_path='./data/secrets.dprep')\ndataflow = package.dataflows[0]", 48 | "execution_count": 3, 49 | "outputs": [] 50 | }, 51 | { 52 | "metadata": {}, 53 | "cell_type": "markdown", 54 | "source": "Let's call `get_missing_secrets` to see what secrets are required missing in the engine." 55 | }, 56 | { 57 | "metadata": { 58 | "trusted": true 59 | }, 60 | "cell_type": "code", 61 | "source": "dataflow.get_missing_secrets()", 62 | "execution_count": 4, 63 | "outputs": [ 64 | { 65 | "output_type": "execute_result", 66 | "execution_count": 4, 67 | "data": { 68 | "text/plain": "['https://dpreptestfiles.blob.core.windows.net/testfiles/read_csv_duplicate_headers.csv']" 69 | }, 70 | "metadata": {} 71 | } 72 | ] 73 | }, 74 | { 75 | "metadata": {}, 76 | "cell_type": "markdown", 77 | "source": "Let's now read the secrets from an environment variable and put it in our secret dictionary and call `use_secrets` with the secrets. 
This will register these secrets in the engine so you don't need to provide them again in this session.\n\n_Note: It is a bad practice to have secrets in files that will be checked into source control._" 78 | }, 79 | { 80 | "metadata": { 81 | "trusted": true 82 | }, 83 | "cell_type": "code", 84 | "source": "sas = os.environ['SCENARIOS_SECRETS']\nsecrets = {\n 'https://dpreptestfiles.blob.core.windows.net/testfiles/read_csv_duplicate_headers.csv': sas\n}\ndataflow.use_secrets(secrets=secrets)", 85 | "execution_count": 5, 86 | "outputs": [ 87 | { 88 | "output_type": "error", 89 | "ename": "KeyError", 90 | "evalue": "'SCENARIOS_SECRETS'", 91 | "traceback": [ 92 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 93 | "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", 94 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0msas\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0menviron\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'SCENARIOS_SECRETS'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m secrets = {\n\u001b[1;32m 3\u001b[0m \u001b[0;34m'https://dpreptestfiles.blob.core.windows.net/testfiles/read_csv_duplicate_headers.csv'\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0msas\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m }\n\u001b[1;32m 5\u001b[0m \u001b[0mdataflow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0muse_secrets\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msecrets\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0msecrets\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 95 | "\u001b[0;32m~/anaconda3_501/lib/python3.6/os.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 667\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mKeyError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 668\u001b[0m \u001b[0;31m# 
raise KeyError with the original key value\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 669\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mKeyError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 670\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdecodevalue\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 671\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 96 | "\u001b[0;31mKeyError\u001b[0m: 'SCENARIOS_SECRETS'" 97 | ] 98 | } 99 | ] 100 | }, 101 | { 102 | "metadata": {}, 103 | "cell_type": "markdown", 104 | "source": "We can now call `head` without passing in `secrets` and the engine will happily execute and show us a preview of the data." 105 | }, 106 | { 107 | "metadata": { 108 | "trusted": true 109 | }, 110 | "cell_type": "code", 111 | "source": "dataflow.head(5)", 112 | "execution_count": 6, 113 | "outputs": [ 114 | { 115 | "output_type": "error", 116 | "ename": "MissingSecretsError", 117 | "evalue": "Required secrets are missing. 
Please call use_secrets to register the missing secrets.\nMissing secrets:\nhttps://dpreptestfiles.blob.core.windows.net/testfiles/read_csv_duplicate_headers.csv", 118 | "traceback": [ 119 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 120 | "\u001b[0;31mMissingSecretsError\u001b[0m Traceback (most recent call last)", 121 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdataflow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhead\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m5\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 122 | "\u001b[0;32m~/anaconda3_501/lib/python3.6/site-packages/azureml/dataprep/api/dataflow.py\u001b[0m in \u001b[0;36mhead\u001b[0;34m(self, count)\u001b[0m\n\u001b[1;32m 94\u001b[0m \u001b[0;34m:\u001b[0m\u001b[0;32mreturn\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mA\u001b[0m \u001b[0mPandas\u001b[0m \u001b[0mDataframe\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 95\u001b[0m \"\"\"\n\u001b[0;32m---> 96\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtake\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcount\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_pandas_dataframe\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mextended_types\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 97\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 98\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mrun_local\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 123 | "\u001b[0;32m~/anaconda3_501/lib/python3.6/site-packages/azureml/dataprep/api/dataflow.py\u001b[0m in \u001b[0;36mto_pandas_dataframe\u001b[0;34m(self, extended_types)\u001b[0m\n\u001b[1;32m 
145\u001b[0m })\n\u001b[1;32m 146\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 147\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_raise_if_missing_secrets\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 148\u001b[0m self._engine_api.execute_anonymous_blocks(\n\u001b[1;32m 149\u001b[0m ExecuteAnonymousBlocksMessageArguments(blocks=steps_to_block_datas(dataflow_to_execute._steps),\n", 124 | "\u001b[0;32m~/anaconda3_501/lib/python3.6/site-packages/azureml/dataprep/api/dataflow.py\u001b[0m in \u001b[0;36m_raise_if_missing_secrets\u001b[0;34m(self, secrets)\u001b[0m\n\u001b[1;32m 1054\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmissing_secret_ids\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmissing_secret_ids\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1055\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1056\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mMissingSecretsError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmissing_secrets\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1057\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1058\u001b[0m \u001b[0;31m# Steps are immutable so we don't need to create a full deepcopy of them when cloning Dataflows.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 125 | "\u001b[0;31mMissingSecretsError\u001b[0m: Required secrets are missing. 
Please call use_secrets to register the missing secrets.\nMissing secrets:\nhttps://dpreptestfiles.blob.core.windows.net/testfiles/read_csv_duplicate_headers.csv" 126 | ] 127 | } 128 | ] 129 | }, 130 | { 131 | "metadata": { 132 | "trusted": true 133 | }, 134 | "cell_type": "code", 135 | "source": "", 136 | "execution_count": null, 137 | "outputs": [] 138 | } 139 | ], 140 | "metadata": { 141 | "execute_as_test": false, 142 | "kernelspec": { 143 | "name": "python36", 144 | "display_name": "Python 3.6", 145 | "language": "python" 146 | }, 147 | "language_info": { 148 | "mimetype": "text/x-python", 149 | "nbconvert_exporter": "python", 150 | "name": "python", 151 | "pygments_lexer": "ipython3", 152 | "version": "3.6.6", 153 | "file_extension": ".py", 154 | "codemirror_mode": { 155 | "version": 3, 156 | "name": "ipython" 157 | } 158 | } 159 | }, 160 | "nbformat": 4, 161 | "nbformat_minor": 2 162 | } -------------------------------------------------------------------------------- /external-references.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "metadata": {}, 5 | "cell_type": "markdown", 6 | "source": "# External References\nCopyright (c) Microsoft Corporation. All rights reserved.
\nLicensed under the MIT License." 7 | }, 8 | { 9 | "metadata": {}, 10 | "cell_type": "markdown", 11 | "source": "In addition to opening existing Dataflows in code and modifying them, it is also possible to create and persist Dataflows that reference another Dataflow that has been persisted to a DataPrep package. In this case, executing this Dataflow will load the referenced DataPrep package dynamically, execute the referenced Dataflow, and then execute the steps in the referencing Dataflow." 12 | }, 13 | { 14 | "metadata": {}, 15 | "cell_type": "markdown", 16 | "source": "To demonstrate, we will create a Dataflow that loads and transforms some data. After that, we will persist this Dataflow to a DataPrep package." 17 | }, 18 | { 19 | "metadata": { 20 | "trusted": true 21 | }, 22 | "cell_type": "code", 23 | "source": "import azureml.dataprep as dprep\nimport tempfile\nimport os\n\ndf = dprep.smart_read_file('./data/fixed_width_file.txt')\ndf = df.drop_errors(['Column7', 'Column8', 'Column9'], dprep.ColumnRelationship.ANY)\ndf = df.set_name('FWF')\npkg = dprep.Package(df)\npkg_path = os.path.join(tempfile.gettempdir(), 'package.dprep')\npkg = pkg.save(pkg_path)", 24 | "execution_count": 1, 25 | "outputs": [ 26 | { 27 | "output_type": "stream", 28 | "text": "/home/nbuser/anaconda3_501/lib/python3.6/site-packages/ipykernel/__main__.py:5: DeprecationWarning: Function smart_read_file is deprecated. Use auto_read_file instead.\n", 29 | "name": "stderr" 30 | } 31 | ] 32 | }, 33 | { 34 | "metadata": {}, 35 | "cell_type": "markdown", 36 | "source": "Now that we have a package file, we can create a new Dataflow that references it." 
37 | }, 38 | { 39 | "metadata": { 40 | "trusted": true 41 | }, 42 | "cell_type": "code", 43 | "source": "new_df = dprep.Dataflow.reference(dprep.ExternalReference(pkg_path, 'FWF'))\nnew_df.head(10)", 44 | "execution_count": 2, 45 | "outputs": [ 46 | { 47 | "output_type": "execute_result", 48 | "execution_count": 2, 49 | "data": { 50 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
Column1Column2Column3Column4Column5Column6Column7Column8Column9
010010.099999.0JAN MAYENazureml.dataprep.native.DataPrepError(\"'Micros...JNENJA70933.0-8667.090.0
110014.099999.0SOERSTOKKENazureml.dataprep.native.DataPrepError(\"'Micros...NOENSO59783.05350.0500.0
210015.099999.0BRINGELANDazureml.dataprep.native.DataPrepError(\"'Micros...NOENBL61383.05867.03270.0
310016.099999.0RORVIK/RYUMazureml.dataprep.native.DataPrepError(\"'Micros...NO64850.011233.0140.0
410017.099999.0FRIGGazureml.dataprep.native.DataPrepError(\"'Micros...NOENFR59933.02417.0480.0
510020.099999.0VERLEGENHUKENazureml.dataprep.native.DataPrepError(\"'Micros...SV80050.016250.080.0
610030.099999.0HORNSUNDazureml.dataprep.native.DataPrepError(\"'Micros...SV77000.015500.0120.0
710040.099999.0NY-ALESUND IIazureml.dataprep.native.DataPrepError(\"'Micros...SVENAS78917.011933.080.0
810050.099999.0ISFJORD RADIOazureml.dataprep.native.DataPrepError(\"'Micros...NOENIS78067.013633.050.0
910060.099999.0EDGEOYAazureml.dataprep.native.DataPrepError(\"'Micros...NO78250.022783.0140.0
\n
", 51 | "text/plain": " Column1 Column2 Column3 \\\n0 10010.0 99999.0 JAN MAYEN \n1 10014.0 99999.0 SOERSTOKKEN \n2 10015.0 99999.0 BRINGELAND \n3 10016.0 99999.0 RORVIK/RYUM \n4 10017.0 99999.0 FRIGG \n5 10020.0 99999.0 VERLEGENHUKEN \n6 10030.0 99999.0 HORNSUND \n7 10040.0 99999.0 NY-ALESUND II \n8 10050.0 99999.0 ISFJORD RADIO \n9 10060.0 99999.0 EDGEOYA \n\n Column4 Column5 Column6 Column7 \\\n0 azureml.dataprep.native.DataPrepError(\"'Micros... JN ENJA 70933.0 \n1 azureml.dataprep.native.DataPrepError(\"'Micros... NO ENSO 59783.0 \n2 azureml.dataprep.native.DataPrepError(\"'Micros... NO ENBL 61383.0 \n3 azureml.dataprep.native.DataPrepError(\"'Micros... NO 64850.0 \n4 azureml.dataprep.native.DataPrepError(\"'Micros... NO ENFR 59933.0 \n5 azureml.dataprep.native.DataPrepError(\"'Micros... SV 80050.0 \n6 azureml.dataprep.native.DataPrepError(\"'Micros... SV 77000.0 \n7 azureml.dataprep.native.DataPrepError(\"'Micros... SV ENAS 78917.0 \n8 azureml.dataprep.native.DataPrepError(\"'Micros... NO ENIS 78067.0 \n9 azureml.dataprep.native.DataPrepError(\"'Micros... NO 78250.0 \n\n Column8 Column9 \n0 -8667.0 90.0 \n1 5350.0 500.0 \n2 5867.0 3270.0 \n3 11233.0 140.0 \n4 2417.0 480.0 \n5 16250.0 80.0 \n6 15500.0 120.0 \n7 11933.0 80.0 \n8 13633.0 50.0 \n9 22783.0 140.0 " 52 | }, 53 | "metadata": {} 54 | } 55 | ] 56 | }, 57 | { 58 | "metadata": {}, 59 | "cell_type": "markdown", 60 | "source": "When executed, the new Dataflow returns the same results as the one we saved in our package. Since this reference is resolved on execution, updating the package file results in the changes being visible when re-executing the referencing Dataflow." 61 | }, 62 | { 63 | "metadata": { 64 | "trusted": true 65 | }, 66 | "cell_type": "code", 67 | "source": "df = df.take(5)\npkg = dprep.Package(df)\npkg.save(pkg_path)\n\nnew_df.head(10)", 68 | "execution_count": 3, 69 | "outputs": [ 70 | { 71 | "output_type": "execute_result", 72 | "execution_count": 3, 73 | "data": { 74 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
Column1Column2Column3Column4Column5Column6Column7Column8Column9
010010.099999.0JAN MAYENazureml.dataprep.native.DataPrepError(\"'Micros...JNENJA70933.0-8667.090.0
110014.099999.0SOERSTOKKENazureml.dataprep.native.DataPrepError(\"'Micros...NOENSO59783.05350.0500.0
210015.099999.0BRINGELANDazureml.dataprep.native.DataPrepError(\"'Micros...NOENBL61383.05867.03270.0
310016.099999.0RORVIK/RYUMazureml.dataprep.native.DataPrepError(\"'Micros...NO64850.011233.0140.0
410017.099999.0FRIGGazureml.dataprep.native.DataPrepError(\"'Micros...NOENFR59933.02417.0480.0
\n
", 75 | "text/plain": " Column1 Column2 Column3 \\\n0 10010.0 99999.0 JAN MAYEN \n1 10014.0 99999.0 SOERSTOKKEN \n2 10015.0 99999.0 BRINGELAND \n3 10016.0 99999.0 RORVIK/RYUM \n4 10017.0 99999.0 FRIGG \n\n Column4 Column5 Column6 Column7 \\\n0 azureml.dataprep.native.DataPrepError(\"'Micros... JN ENJA 70933.0 \n1 azureml.dataprep.native.DataPrepError(\"'Micros... NO ENSO 59783.0 \n2 azureml.dataprep.native.DataPrepError(\"'Micros... NO ENBL 61383.0 \n3 azureml.dataprep.native.DataPrepError(\"'Micros... NO 64850.0 \n4 azureml.dataprep.native.DataPrepError(\"'Micros... NO ENFR 59933.0 \n\n Column8 Column9 \n0 -8667.0 90.0 \n1 5350.0 500.0 \n2 5867.0 3270.0 \n3 11233.0 140.0 \n4 2417.0 480.0 " 76 | }, 77 | "metadata": {} 78 | } 79 | ] 80 | }, 81 | { 82 | "metadata": {}, 83 | "cell_type": "markdown", 84 | "source": "As we can see, even though we did not modify new_df, it now returns only 5 records, as the package was updated with the Dataflow that resulted from calling `df.take(5)`." 85 | }, 86 | { 87 | "metadata": { 88 | "trusted": true 89 | }, 90 | "cell_type": "code", 91 | "source": "", 92 | "execution_count": null, 93 | "outputs": [] 94 | }, 95 | { 96 | "metadata": { 97 | "trusted": true 98 | }, 99 | "cell_type": "code", 100 | "source": "", 101 | "execution_count": null, 102 | "outputs": [] 103 | } 104 | ], 105 | "metadata": { 106 | "kernelspec": { 107 | "name": "python36", 108 | "display_name": "Python 3.6", 109 | "language": "python" 110 | }, 111 | "language_info": { 112 | "mimetype": "text/x-python", 113 | "nbconvert_exporter": "python", 114 | "name": "python", 115 | "pygments_lexer": "ipython3", 116 | "version": "3.6.6", 117 | "file_extension": ".py", 118 | "codemirror_mode": { 119 | "version": 3, 120 | "name": "ipython" 121 | } 122 | } 123 | }, 124 | "nbformat": 4, 125 | "nbformat_minor": 2 126 | } -------------------------------------------------------------------------------- /impute-missing-values.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "metadata": {}, 5 | "cell_type": "markdown", 6 | "source": "# Impute missing values\nCopyright (c) Microsoft Corporation. All rights reserved.
\nLicensed under the MIT License." 7 | }, 8 | { 9 | "metadata": {}, 10 | "cell_type": "markdown", 11 | "source": "DataPrep has the ability to impute missing values in specified columns. In this case, we will attempt to impute the missing _Latitude_ and _Longitude_ values in the input data." 12 | }, 13 | { 14 | "metadata": { 15 | "trusted": true 16 | }, 17 | "cell_type": "code", 18 | "source": "!pip install azureml", 19 | "execution_count": 2, 20 | "outputs": [ 21 | { 22 | "output_type": "stream", 23 | "text": "Collecting azureml\n Downloading https://files.pythonhosted.org/packages/ab/e8/76cd2cb6784b9039affd2c659eed1b3f46baf2e6b87a10b072a20b5b0113/azureml-0.2.7-py2.py3-none-any.whl\nRequirement already satisfied: pandas in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml) (0.22.0)\nRequirement already satisfied: requests in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml) (2.20.1)\nRequirement already satisfied: python-dateutil in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml) (2.7.5)\nRequirement already satisfied: pytz>=2011k in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from pandas->azureml) (2018.7)\nRequirement already satisfied: numpy>=1.9.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from pandas->azureml) (1.14.6)\nRequirement already satisfied: urllib3<1.25,>=1.21.1 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (1.23)\nRequirement already satisfied: chardet<3.1.0,>=3.0.2 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (3.0.4)\nRequirement already satisfied: idna<2.8,>=2.5 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (2.7)\nRequirement already satisfied: certifi>=2017.4.17 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (2018.10.15)\nRequirement already satisfied: six>=1.5 in 
/home/nbuser/anaconda3_501/lib/python3.6/site-packages (from python-dateutil->azureml) (1.11.0)\nInstalling collected packages: azureml\nSuccessfully installed azureml-0.2.7\n", 24 | "name": "stdout" 25 | } 26 | ] 27 | }, 28 | { 29 | "metadata": { 30 | "trusted": true 31 | }, 32 | "cell_type": "code", 33 | "source": "import azureml.dataprep as dprep", 34 | "execution_count": 3, 35 | "outputs": [] 36 | }, 37 | { 38 | "metadata": { 39 | "trusted": true 40 | }, 41 | "cell_type": "code", 42 | "source": "# loading input data\ndf = dprep.read_csv(r'data\\crime0-10.csv')\ndf = df.keep_columns(['ID', 'Arrest', 'Latitude', 'Longitude'])\ndf = df.to_number(['Latitude', 'Longitude'])\ndf.head(10)", 43 | "execution_count": 4, 44 | "outputs": [ 45 | { 46 | "output_type": "execute_result", 47 | "execution_count": 4, 48 | "data": { 49 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
IDArrestLatitudeLongitude
010140490false41.973309-87.800175
110139776false42.008124-87.659550
210140270falseNaNNaN
310139885false41.902152-87.754883
410140379false41.885610-87.657009
510140868false41.679311-87.644545
610139762false41.825501-87.690578
710139722true41.857828-87.715029
810139774false41.970100-87.669324
910139697false41.787580-87.685233
\n
", 50 | "text/plain": " ID Arrest Latitude Longitude\n0 10140490 false 41.973309 -87.800175\n1 10139776 false 42.008124 -87.659550\n2 10140270 false NaN NaN\n3 10139885 false 41.902152 -87.754883\n4 10140379 false 41.885610 -87.657009\n5 10140868 false 41.679311 -87.644545\n6 10139762 false 41.825501 -87.690578\n7 10139722 true 41.857828 -87.715029\n8 10139774 false 41.970100 -87.669324\n9 10139697 false 41.787580 -87.685233" 51 | }, 52 | "metadata": {} 53 | } 54 | ] 55 | }, 56 | { 57 | "metadata": {}, 58 | "cell_type": "markdown", 59 | "source": "The third record from input data has _Latitude_ and _Longitude_ missing. To impute those missing values, we can use `ImputeMissingValuesBuilder` to learn a fixed program which imputes the columns with either a calculated `MIN`, `MAX` or `MEAN` value or a `CUSTOM` value. When `group_by_columns` is specified, missing values will be imputed by group with `MIN`, `MAX` and `MEAN` calculated per group." 60 | }, 61 | { 62 | "metadata": {}, 63 | "cell_type": "markdown", 64 | "source": "First, let us quickly check the `MEAN` value of the _Latitude_ column." 65 | }, 66 | { 67 | "metadata": { 68 | "trusted": true 69 | }, 70 | "cell_type": "code", 71 | "source": "df_mean = df.summarize(group_by_columns=['Arrest'],\n summary_columns=[dprep.SummaryColumnsValue(column_id='Latitude',\n summary_column_name='Latitude_MEAN',\n summary_function=dprep.SummaryFunction.MEAN)])\ndf_mean = df_mean.filter(dprep.col('Arrest') == 'false')\ndf_mean.head(1)", 72 | "execution_count": 5, 73 | "outputs": [ 74 | { 75 | "output_type": "execute_result", 76 | "execution_count": 5, 77 | "data": { 78 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
ArrestLatitude_MEAN
0false41.878961
\n
", 79 | "text/plain": " Arrest Latitude_MEAN\n0 false 41.878961" 80 | }, 81 | "metadata": {} 82 | } 83 | ] 84 | }, 85 | { 86 | "metadata": {}, 87 | "cell_type": "markdown", 88 | "source": "The `MEAN` value of _Latitude_ looks good. So we will impute _Latitude_ with it. As for `Longitude`, we will impute it using `42` based on external knowledge." 89 | }, 90 | { 91 | "metadata": { 92 | "trusted": true 93 | }, 94 | "cell_type": "code", 95 | "source": "# impute with MEAN\nimpute_mean = dprep.ImputeColumnArguments(column_id='Latitude',\n impute_function=dprep.ReplaceValueFunction.MEAN)\n# impute with custom value 42\nimpute_custom = dprep.ImputeColumnArguments(column_id='Longitude',\n custom_impute_value=42)\n# get instance of ImputeMissingValuesBuilder\nimpute_builder = df.builders.impute_missing_values(impute_columns=[impute_mean, impute_custom],\n group_by_columns=['Arrest'])\n# call learn() to learn a fixed program to impute missing values\nimpute_builder.learn()\n# call to_dataflow() to get a dataflow with impute step added\ndf_imputed = impute_builder.to_dataflow()", 96 | "execution_count": 6, 97 | "outputs": [] 98 | }, 99 | { 100 | "metadata": { 101 | "trusted": true 102 | }, 103 | "cell_type": "code", 104 | "source": "# check impute result\ndf_imputed.head(10)", 105 | "execution_count": 7, 106 | "outputs": [ 107 | { 108 | "output_type": "execute_result", 109 | "execution_count": 7, 110 | "data": { 111 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
IDArrestLatitudeLongitude
010140490false41.973309-87.800175
110139776false42.008124-87.659550
210140270false41.87896142.000000
310139885false41.902152-87.754883
410140379false41.885610-87.657009
510140868false41.679311-87.644545
610139762false41.825501-87.690578
710139722true41.857828-87.715029
810139774false41.970100-87.669324
910139697false41.787580-87.685233
\n
", 112 | "text/plain": " ID Arrest Latitude Longitude\n0 10140490 false 41.973309 -87.800175\n1 10139776 false 42.008124 -87.659550\n2 10140270 false 41.878961 42.000000\n3 10139885 false 41.902152 -87.754883\n4 10140379 false 41.885610 -87.657009\n5 10140868 false 41.679311 -87.644545\n6 10139762 false 41.825501 -87.690578\n7 10139722 true 41.857828 -87.715029\n8 10139774 false 41.970100 -87.669324\n9 10139697 false 41.787580 -87.685233" 113 | }, 114 | "metadata": {} 115 | } 116 | ] 117 | }, 118 | { 119 | "metadata": {}, 120 | "cell_type": "markdown", 121 | "source": "As the result above shows, the missing _Latitude_ has been imputed with the `MEAN` value of the `Arrest=='false'` group, and the missing _Longitude_ has been imputed with `42`." 122 | }, 123 | { 124 | "metadata": { 125 | "trusted": true 126 | }, 127 | "cell_type": "code", 128 | "source": "", 129 | "execution_count": null, 130 | "outputs": [] 131 | } 132 | ], 133 | "metadata": { 134 | "kernelspec": { 135 | "name": "python36", 136 | "display_name": "Python 3.6", 137 | "language": "python" 138 | }, 139 | "language_info": { 140 | "mimetype": "text/x-python", 141 | "nbconvert_exporter": "python", 142 | "name": "python", 143 | "pygments_lexer": "ipython3", 144 | "version": "3.6.6", 145 | "file_extension": ".py", 146 | "codemirror_mode": { 147 | "version": 3, 148 | "name": "ipython" 149 | } 150 | } 151 | }, 152 | "nbformat": 4, 153 | "nbformat_minor": 2 154 | } -------------------------------------------------------------------------------- /0. 
Import librairie.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "metadata": { 5 | "trusted": true 6 | }, 7 | "cell_type": "code", 8 | "source": "!pip install azureml-sdk", 9 | "execution_count": 1, 10 | "outputs": [ 11 | { 12 | "output_type": "stream", 13 | "text": "Requirement already satisfied: azureml-sdk in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (1.0.2)\nRequirement already satisfied: azureml-pipeline==1.0.2.* in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-sdk) (1.0.2)\nRequirement already satisfied: azureml-train==1.0.2.* in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-sdk) (1.0.2)\nRequirement already satisfied: azureml-core==1.0.2.* in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-sdk) (1.0.2)\nRequirement already satisfied: azureml-pipeline-core==1.0.2.* in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-pipeline==1.0.2.*->azureml-sdk) (1.0.2)\nRequirement already satisfied: azureml-pipeline-steps==1.0.2.* in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-pipeline==1.0.2.*->azureml-sdk) (1.0.2)\nRequirement already satisfied: azureml-train-core==1.0.2.* in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-train==1.0.2.*->azureml-sdk) (1.0.2)\nRequirement already satisfied: pathspec in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (0.5.9)\nRequirement already satisfied: azure-storage-blob>=1.1.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (1.4.0)\nRequirement already satisfied: SecretStorage<3.0.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (2.3.1)\nRequirement already satisfied: azure-cli-profile>=2.0.26 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from 
azureml-core==1.0.2.*->azureml-sdk) (2.1.2)\nRequirement already satisfied: azure-common>=1.1.12 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (1.1.16)\nRequirement already satisfied: msrestazure>=0.4.33 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (0.5.1)\nRequirement already satisfied: docker in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (3.6.0)\nRequirement already satisfied: azure-mgmt-storage>=1.5.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (3.1.0)\nRequirement already satisfied: backports.tempfile in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (1.0)\nRequirement already satisfied: azure-graphrbac>=0.40.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (0.53.0)\nRequirement already satisfied: azure-mgmt-keyvault>=0.40.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (1.1.0)\nRequirement already satisfied: azure-cli-core>=2.0.38 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (2.0.52)\nRequirement already satisfied: azure-mgmt-containerregistry>=2.0.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (2.4.0)\nRequirement already satisfied: requests>=2.19.1 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (2.20.1)\nRequirement already satisfied: azure-mgmt-resource>=1.2.1 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (2.0.0)\nRequirement already satisfied: azure-mgmt-authorization>=0.40.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) 
(0.51.1)\nRequirement already satisfied: pytz in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (2018.7)\nRequirement already satisfied: azure-storage-nspkg>=3.0.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (3.1.0)\nRequirement already satisfied: ruamel.yaml<=0.15.51,>=0.15.35 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (0.15.51)\nRequirement already satisfied: ndg-httpsclient in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (0.5.1)\nRequirement already satisfied: azure-storage-common>=1.1.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (1.4.0)\nRequirement already satisfied: contextlib2 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (0.5.5)\nRequirement already satisfied: cryptography!=1.9,!=2.0.*,!=2.1.*,!=2.2.* in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (2.3.1)\nRequirement already satisfied: python-dateutil>=2.7.3 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (2.7.5)\nRequirement already satisfied: msrest>=0.5.1 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (0.6.2)\nRequirement already satisfied: PyJWT in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (1.7.1)\nRequirement already satisfied: urllib3<1.24,>=1.23 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (1.23)\nRequirement already satisfied: six>=1.11.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (1.11.0)\nRequirement already satisfied: jsonpickle in 
/home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-core==1.0.2.*->azureml-sdk) (1.0)\nRequirement already satisfied: certifi in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-pipeline-steps==1.0.2.*->azureml-pipeline==1.0.2.*->azureml-sdk) (2018.10.15)\nRequirement already satisfied: azureml-telemetry==1.0.2.* in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-train-core==1.0.2.*->azureml-train==1.0.2.*->azureml-sdk) (1.0.2)\nRequirement already satisfied: azureml-train-restclients-hyperdrive==1.0.2.* in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-train-core==1.0.2.*->azureml-train==1.0.2.*->azureml-sdk) (1.0.2)\nRequirement already satisfied: azure-cli-command-modules-nspkg>=2.0.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azure-cli-profile>=2.0.26->azureml-core==1.0.2.*->azureml-sdk) (2.0.2)\nRequirement already satisfied: adal<2.0.0,>=0.6.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from msrestazure>=0.4.33->azureml-core==1.0.2.*->azureml-sdk) (1.2.0)\nRequirement already satisfied: docker-pycreds>=0.3.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from docker->azureml-core==1.0.2.*->azureml-sdk) (0.4.0)\nRequirement already satisfied: websocket-client>=0.32.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from docker->azureml-core==1.0.2.*->azureml-sdk) (0.54.0)\nRequirement already satisfied: backports.weakref in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from backports.tempfile->azureml-core==1.0.2.*->azureml-sdk) (1.0rc1)\nRequirement already satisfied: azure-mgmt-nspkg>=2.0.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azure-mgmt-keyvault>=0.40.0->azureml-core==1.0.2.*->azureml-sdk) (3.0.2)\nRequirement already satisfied: azure-cli-telemetry in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azure-cli-core>=2.0.38->azureml-core==1.0.2.*->azureml-sdk) 
(1.0.0)\nRequirement already satisfied: pyopenssl>=17.1.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azure-cli-core>=2.0.38->azureml-core==1.0.2.*->azureml-sdk) (18.0.0)\nRequirement already satisfied: pyyaml~=3.13 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azure-cli-core>=2.0.38->azureml-core==1.0.2.*->azureml-sdk) (3.13)\nRequirement already satisfied: pip in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azure-cli-core>=2.0.38->azureml-core==1.0.2.*->azureml-sdk) (18.1)\nRequirement already satisfied: paramiko>=2.0.8 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azure-cli-core>=2.0.38->azureml-core==1.0.2.*->azureml-sdk) (2.4.2)\nRequirement already satisfied: jmespath in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azure-cli-core>=2.0.38->azureml-core==1.0.2.*->azureml-sdk) (0.9.3)\nRequirement already satisfied: argcomplete>=1.8.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azure-cli-core>=2.0.38->azureml-core==1.0.2.*->azureml-sdk) (1.9.4)\nRequirement already satisfied: pygments in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azure-cli-core>=2.0.38->azureml-core==1.0.2.*->azureml-sdk) (2.2.0)\nRequirement already satisfied: knack==0.5.1 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azure-cli-core>=2.0.38->azureml-core==1.0.2.*->azureml-sdk) (0.5.1)\nRequirement already satisfied: tabulate<=0.8.2,>=0.7.7 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azure-cli-core>=2.0.38->azureml-core==1.0.2.*->azureml-sdk) (0.8.2)\n", 14 | "name": "stdout" 15 | }, 16 | { 17 | "output_type": "stream", 18 | "text": "Requirement already satisfied: antlr4-python3-runtime; python_version >= \"3.0\" in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azure-cli-core>=2.0.38->azureml-core==1.0.2.*->azureml-sdk) (4.7.1)\nRequirement already satisfied: colorama>=0.3.9 in 
/home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azure-cli-core>=2.0.38->azureml-core==1.0.2.*->azureml-sdk) (0.3.9)\nRequirement already satisfied: azure-cli-nspkg>=2.0.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azure-cli-core>=2.0.38->azureml-core==1.0.2.*->azureml-sdk) (3.0.3)\nRequirement already satisfied: humanfriendly>=4.7 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azure-cli-core>=2.0.38->azureml-core==1.0.2.*->azureml-sdk) (4.17)\nRequirement already satisfied: wheel==0.30.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azure-cli-core>=2.0.38->azureml-core==1.0.2.*->azureml-sdk) (0.30.0)\nRequirement already satisfied: idna<2.8,>=2.5 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests>=2.19.1->azureml-core==1.0.2.*->azureml-sdk) (2.7)\nRequirement already satisfied: chardet<3.1.0,>=3.0.2 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests>=2.19.1->azureml-core==1.0.2.*->azureml-sdk) (3.0.4)\nRequirement already satisfied: azure-nspkg>=2.0.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azure-storage-nspkg>=3.0.0->azureml-core==1.0.2.*->azureml-sdk) (3.0.2)\nRequirement already satisfied: pyasn1>=0.1.1 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from ndg-httpsclient->azureml-core==1.0.2.*->azureml-sdk) (0.4.4)\nRequirement already satisfied: asn1crypto>=0.21.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from cryptography!=1.9,!=2.0.*,!=2.1.*,!=2.2.*->azureml-core==1.0.2.*->azureml-sdk) (0.24.0)\nRequirement already satisfied: cffi!=1.11.3,>=1.7 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from cryptography!=1.9,!=2.0.*,!=2.1.*,!=2.2.*->azureml-core==1.0.2.*->azureml-sdk) (1.11.5)\nRequirement already satisfied: requests-oauthlib>=0.5.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from msrest>=0.5.1->azureml-core==1.0.2.*->azureml-sdk) (1.0.0)\nRequirement already 
satisfied: isodate>=0.6.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from msrest>=0.5.1->azureml-core==1.0.2.*->azureml-sdk) (0.6.0)\nRequirement already satisfied: applicationinsights in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-telemetry==1.0.2.*->azureml-train-core==1.0.2.*->azureml-train==1.0.2.*->azureml-sdk) (0.11.7)\nRequirement already satisfied: portalocker==1.2.1 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azure-cli-telemetry->azure-cli-core>=2.0.38->azureml-core==1.0.2.*->azureml-sdk) (1.2.1)\nRequirement already satisfied: pynacl>=1.0.1 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from paramiko>=2.0.8->azure-cli-core>=2.0.38->azureml-core==1.0.2.*->azureml-sdk) (1.3.0)\nRequirement already satisfied: bcrypt>=3.1.3 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from paramiko>=2.0.8->azure-cli-core>=2.0.38->azureml-core==1.0.2.*->azureml-sdk) (3.1.4)\nRequirement already satisfied: pycparser in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from cffi!=1.11.3,>=1.7->cryptography!=1.9,!=2.0.*,!=2.1.*,!=2.2.*->azureml-core==1.0.2.*->azureml-sdk) (2.19)\nRequirement already satisfied: oauthlib>=0.6.2 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests-oauthlib>=0.5.0->msrest>=0.5.1->azureml-core==1.0.2.*->azureml-sdk) (2.1.0)\n", 19 | "name": "stdout" 20 | } 21 | ] 22 | }, 23 | { 24 | "metadata": { 25 | "trusted": true 26 | }, 27 | "cell_type": "code", 28 | "source": "!pip install --upgrade azureml-dataprep", 29 | "execution_count": 2, 30 | "outputs": [ 31 | { 32 | "output_type": "stream", 33 | "text": "Requirement already up-to-date: azureml-dataprep in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (1.0.4)\nRequirement already satisfied, skipping upgrade: pandas>=0.19.2 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-dataprep) (0.22.0)\nRequirement already satisfied, skipping upgrade: numpy>=1.11.3 in 
/home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-dataprep) (1.14.6)\nRequirement already satisfied, skipping upgrade: dotnetcore2==2.1.7 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-dataprep) (2.1.7)\nRequirement already satisfied, skipping upgrade: azureml-dataprep-native<12.0.0,>=11.2.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml-dataprep) (11.2.0)\nRequirement already satisfied, skipping upgrade: python-dateutil>=2 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from pandas>=0.19.2->azureml-dataprep) (2.7.5)\nRequirement already satisfied, skipping upgrade: pytz>=2011k in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from pandas>=0.19.2->azureml-dataprep) (2018.7)\nRequirement already satisfied, skipping upgrade: distro>=1.2.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from dotnetcore2==2.1.7->azureml-dataprep) (1.3.0)\nRequirement already satisfied, skipping upgrade: six>=1.5 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from python-dateutil>=2->pandas>=0.19.2->azureml-dataprep) (1.11.0)\n", 34 | "name": "stdout" 35 | } 36 | ] 37 | }, 38 | { 39 | "metadata": { 40 | "trusted": true 41 | }, 42 | "cell_type": "code", 43 | "source": "import sys", 44 | "execution_count": 3, 45 | "outputs": [] 46 | }, 47 | { 48 | "metadata": { 49 | "trusted": true 50 | }, 51 | "cell_type": "code", 52 | "source": "sys.version", 53 | "execution_count": 4, 54 | "outputs": [ 55 | { 56 | "output_type": "execute_result", 57 | "execution_count": 4, 58 | "data": { 59 | "text/plain": "'3.6.6 |Anaconda, Inc.| (default, Jun 28 2018, 17:14:51) \\n[GCC 7.2.0]'" 60 | }, 61 | "metadata": {} 62 | } 63 | ] 64 | }, 65 | { 66 | "metadata": { 67 | "trusted": true 68 | }, 69 | "cell_type": "code", 70 | "source": "import azureml.core\nprint(\"Version Azure ML service :\", azureml.core.VERSION)", 71 | "execution_count": 7, 72 | "outputs": [ 73 | { 74 | "output_type": "stream", 75 | 
"text": "Version Azure ML service : 1.0.2\n", 76 | "name": "stdout" 77 | } 78 | ] 79 | }, 80 | { 81 | "metadata": { 82 | "trusted": true 83 | }, 84 | "cell_type": "code", 85 | "source": "", 86 | "execution_count": null, 87 | "outputs": [] 88 | } 89 | ], 90 | "metadata": { 91 | "kernelspec": { 92 | "name": "python36", 93 | "display_name": "Python 3.6", 94 | "language": "python" 95 | }, 96 | "language_info": { 97 | "mimetype": "text/x-python", 98 | "nbconvert_exporter": "python", 99 | "name": "python", 100 | "pygments_lexer": "ipython3", 101 | "version": "3.6.6", 102 | "file_extension": ".py", 103 | "codemirror_mode": { 104 | "version": 3, 105 | "name": "ipython" 106 | } 107 | } 108 | }, 109 | "nbformat": 4, 110 | "nbformat_minor": 2 111 | } -------------------------------------------------------------------------------- /smart-read-file-separators.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "metadata": {}, 5 | "cell_type": "markdown", 6 | "source": "# Smart Read File\nCopyright (c) Microsoft Corporation. All rights reserved.
\nLicensed under the MIT License." 7 | }, 8 | { 9 | "metadata": { 10 | "trusted": true 11 | }, 12 | "cell_type": "code", 13 | "source": "!pip install azureml", 14 | "execution_count": 1, 15 | "outputs": [ 16 | { 17 | "output_type": "stream", 18 | "text": "Requirement already satisfied: azureml in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (0.2.7)\nRequirement already satisfied: python-dateutil in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml) (2.7.3)\nRequirement already satisfied: requests in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml) (2.19.1)\nRequirement already satisfied: pandas in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml) (0.22.0)\nRequirement already satisfied: six>=1.5 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from python-dateutil->azureml) (1.11.0)\nRequirement already satisfied: urllib3<1.24,>=1.21.1 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (1.23)\nRequirement already satisfied: idna<2.8,>=2.5 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (2.7)\nRequirement already satisfied: certifi>=2017.4.17 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (2018.8.24)\nRequirement already satisfied: chardet<3.1.0,>=3.0.2 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (3.0.4)\nRequirement already satisfied: pytz>=2011k in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from pandas->azureml) (2018.5)\nRequirement already satisfied: numpy>=1.9.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from pandas->azureml) (1.14.5)\n", 19 | "name": "stdout" 20 | } 21 | ] 22 | }, 23 | { 24 | "metadata": { 25 | "trusted": true 26 | }, 27 | "cell_type": "code", 28 | "source": "import azureml.dataprep as dprep", 29 | "execution_count": 2, 30 | "outputs": [] 31 | }, 32 | { 33 | "metadata": {}, 34 | "cell_type": 
"markdown", 35 | "source": "DataPrep has the ability to load different kinds of text files. The `smart_read_file` entry point can take any text based file (including excel, json and parquet) and auto-detect how to parse the file. It will also attempt to auto-detect the types of each column and apply type transformations to the columns it detects.\n\nThe result will be a Dataflow object that has all the steps added that are required to read the given file(s) and convert their columns to the predicted types. No parameters are required beyond the file path or `FileDataSource` object." 36 | }, 37 | { 38 | "metadata": { 39 | "trusted": true 40 | }, 41 | "cell_type": "code", 42 | "source": "smart_dataflow = dprep.smart_read_file('./data/multiple_separators.csv')\nsmart_dataflow.head(10)", 43 | "execution_count": 3, 44 | "outputs": [ 45 | { 46 | "output_type": "execute_result", 47 | "execution_count": 3, 48 | "data": { 49 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
IDCaseNumberColumn3CompletedColumn5
010140490.0HY329907Y
110139776.0HY329265Y
210140270.0HY329253N
310139885.0HY329308Y
410140379.0HY329556N
510140868.0HY330421N
610139762.0HY329232N
710139722.0HY329228Y
810139774.0HY329209N
910139697.0HY329177N
\n
", 50 | "text/plain": " ID CaseNumber Column3 Completed Column5\n0 10140490.0 HY329907 Y \n1 10139776.0 HY329265 Y \n2 10140270.0 HY329253 N \n3 10139885.0 HY329308 Y \n4 10140379.0 HY329556 N \n5 10140868.0 HY330421 N \n6 10139762.0 HY329232 N \n7 10139722.0 HY329228 Y \n8 10139774.0 HY329209 N \n9 10139697.0 HY329177 N " 51 | }, 52 | "metadata": {} 53 | } 54 | ] 55 | }, 56 | { 57 | "metadata": {}, 58 | "cell_type": "markdown", 59 | "source": "Looking at the data, we can see that there are two empty columns either side of the 'Completed' column.\nIf we compare the dataframe to a few rows from the original file:\n```\nID |CaseNumber| |Completed|\n10140490 |HY329907| |Y|\n10139776 |HY329265| |Y|\n```\nWe can see that the `|`'s have disappeared in the dataframe. This is because `|` is a very common separator character in csv files, so `smart_read_file` guessed it was the column separator. For this data we actually want the `|`'s to remain and instead use space as the column separator.\n\nTo achieve this we can use `detect_file_format` which will take a file path or datasource object and give back a `FileFormatBuilder` which has learnt some information about the supplied data.\nThis is what `smart_read_file` is using behind the scenes to 'learn' the contents of the given file and determine how to parse it. With the `FileFormatBuilder` we can take advantage of the intelligent learning aspect of `smart_read_file` but have the chance to modify some of the learnt information."
60 | }, 61 | { 62 | "metadata": { 63 | "trusted": true 64 | }, 65 | "cell_type": "code", 66 | "source": "ffb = dprep.detect_file_format('./data/multiple_separators.csv')\nffb_2 = dprep.detect_file_format('./data/excel.xlsx')\nffb_3 = dprep.detect_file_format('./data/fixed_width_file.txt')\nffb_4 = dprep.detect_file_format('./data/json.json')\n\nprint(ffb.file_format)\nprint(ffb_2.file_format)\nprint(ffb_3.file_format)\nprint(type(ffb_4.file_format))", 67 | "execution_count": 4, 68 | "outputs": [ 69 | { 70 | "output_type": "stream", 71 | "text": "ParseDelimitedProperties\n separator: '|'\n headers_mode: PromoteHeadersMode.CONSTANTGROUPED\n encoding: FileEncoding.UTF8\n quoting: False\n skip_rows: 0\n skip_mode: SkipMode.NONE\n comment: None\n\nReadExcelProperties\n sheet_name: None\n use_headers: False\n skip_rows: 0\n\nParseFixedWidthProperties\n offsets: '[7, 13, 43, 46, 52, 58, 65, 73]'\n headers_mode: PromoteHeadersMode.NONE\n encoding: FileEncoding.UTF8\n skip_rows: 0\n skip_mode: SkipMode.NONE\n\n\n", 72 | "name": "stdout" 73 | } 74 | ] 75 | }, 76 | { 77 | "metadata": {}, 78 | "cell_type": "markdown", 79 | "source": "After calling `detect_file_format` we get a `FileFormatBuilder` that has had `learn` called on it. This means the `file_format` attribute will be populated with a `Properties` object, which contains all the information that was learnt about the file. As we can see above, different file types have corresponding file formats detected. \nContinuing with our delimited example we can change any of these values and then call `ffb.to_dataflow()` to create a `Dataflow` that has the steps required to parse the datasource."
80 | }, 81 | { 82 | "metadata": { 83 | "scrolled": true, 84 | "trusted": true 85 | }, 86 | "cell_type": "code", 87 | "source": "ffb.file_format.separator = ' '\ndataflow = ffb.to_dataflow()\ndf = dataflow.to_pandas_dataframe()\ndf", 88 | "execution_count": 5, 89 | "outputs": [ 90 | { 91 | "output_type": "execute_result", 92 | "execution_count": 5, 93 | "data": { 94 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
ID|CaseNumber||Completed|
010140490|HY329907||Y|
110139776|HY329265||Y|
210140270|HY329253||N|
310139885|HY329308||Y|
410140379|HY329556||N|
510140868|HY330421||N|
610139762|HY329232||N|
710139722|HY329228||Y|
810139774|HY329209||N|
910139697|HY329177||N|
\n
", 95 | "text/plain": " ID |CaseNumber| |Completed|\n0 10140490 |HY329907| |Y|\n1 10139776 |HY329265| |Y|\n2 10140270 |HY329253| |N|\n3 10139885 |HY329308| |Y|\n4 10140379 |HY329556| |N|\n5 10140868 |HY330421| |N|\n6 10139762 |HY329232| |N|\n7 10139722 |HY329228| |Y|\n8 10139774 |HY329209| |N|\n9 10139697 |HY329177| |N|" 96 | }, 97 | "metadata": {} 98 | } 99 | ] 100 | }, 101 | { 102 | "metadata": {}, 103 | "cell_type": "markdown", 104 | "source": "The result is our desired dataframe with `|`'s included.\n\nIf we refer back to the original data output by `smart_read_file` the 'ID' column was also detected as numeric and converted to a number data type, instead of remaining a string like in the data above.\nWe can perform type inference on our new dataflow using the `dataflow.builders` property. This property exposes different builders that can `learn` from a dataflow and `apply` the learning to produce a new dataflow, very similar to the pattern we used above for the `FileFormatBuilder`." 
105 | }, 106 | { 107 | "metadata": { 108 | "trusted": true 109 | }, 110 | "cell_type": "code", 111 | "source": "ctb = dataflow.builders.set_column_types()\nctb.learn()\nctb.inference_info", 112 | "execution_count": 6, 113 | "outputs": [ 114 | { 115 | "output_type": "execute_result", 116 | "execution_count": 6, 117 | "data": { 118 | "text/plain": "{'|CaseNumber|': [FieldType.STRING],\n '|Completed|': [FieldType.STRING],\n 'ID': [FieldType.DECIMAL]}" 119 | }, 120 | "metadata": {} 121 | } 122 | ] 123 | }, 124 | { 125 | "metadata": {}, 126 | "cell_type": "markdown", 127 | "source": "After learning, `ctb.inference_info` has been populated with information about the inferred types for each column. It is possible for there to be multiple candidate types per column; in this example there is only one type for each column.\n\nThe candidates look correct; we only want to convert `ID` to be a number column (also known as `DECIMAL`), so applying this `ColumnTypesBuilder` should result in a Dataflow with our columns converted to their respective types." 128 | }, 129 | { 130 | "metadata": { 131 | "trusted": true 132 | }, 133 | "cell_type": "code", 134 | "source": "converted_dataflow = ctb.to_dataflow()\nconverted_df = converted_dataflow.to_pandas_dataframe()\nconverted_df", 135 | "execution_count": 7, 136 | "outputs": [ 137 | { 138 | "output_type": "execute_result", 139 | "execution_count": 7, 140 | "data": { 141 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
ID|CaseNumber||Completed|
010140490.0|HY329907||Y|
110139776.0|HY329265||Y|
210140270.0|HY329253||N|
310139885.0|HY329308||Y|
410140379.0|HY329556||N|
510140868.0|HY330421||N|
610139762.0|HY329232||N|
710139722.0|HY329228||Y|
810139774.0|HY329209||N|
910139697.0|HY329177||N|
\n
", 142 | "text/plain": " ID |CaseNumber| |Completed|\n0 10140490.0 |HY329907| |Y|\n1 10139776.0 |HY329265| |Y|\n2 10140270.0 |HY329253| |N|\n3 10139885.0 |HY329308| |Y|\n4 10140379.0 |HY329556| |N|\n5 10140868.0 |HY330421| |N|\n6 10139762.0 |HY329232| |N|\n7 10139722.0 |HY329228| |Y|\n8 10139774.0 |HY329209| |N|\n9 10139697.0 |HY329177| |N|" 143 | }, 144 | "metadata": {} 145 | } 146 | ] 147 | }, 148 | { 149 | "metadata": { 150 | "trusted": true 151 | }, 152 | "cell_type": "code", 153 | "source": "", 154 | "execution_count": null, 155 | "outputs": [] 156 | } 157 | ], 158 | "metadata": { 159 | "kernelspec": { 160 | "name": "python36", 161 | "display_name": "Python 3.6", 162 | "language": "python" 163 | }, 164 | "language_info": { 165 | "mimetype": "text/x-python", 166 | "nbconvert_exporter": "python", 167 | "name": "python", 168 | "pygments_lexer": "ipython3", 169 | "version": "3.6.6", 170 | "file_extension": ".py", 171 | "codemirror_mode": { 172 | "version": 3, 173 | "name": "ipython" 174 | } 175 | } 176 | }, 177 | "nbformat": 4, 178 | "nbformat_minor": 2 179 | } -------------------------------------------------------------------------------- /caching.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "metadata": {}, 5 | "cell_type": "markdown", 6 | "source": "# Caching\nCopyright (c) Microsoft Corporation. All rights reserved.
\nLicensed under the MIT License." 7 | }, 8 | { 9 | "metadata": {}, 10 | "cell_type": "markdown", 11 | "source": "A Dataflow can be cached into a file on disk during a local run by calling `df_cached = df.cache(directory_path)` on the Dataflow object `df`. Doing so, we will run all steps in `df` and save the cached data to the specified `directory_path`. The returned Dataflow `df_cached` has a Caching Step added at the end. Any run on Dataflow `df_cached` will reuse the cached data. And steps in `df_cached` before Caching Step will not be run again.\n\nCaching avoids running transforms multiple times, which can make local runs more efficient. Here are common places to use Caching:\n- after reading data from remote\n- after expensive transforms, such as Sort\n- after transforms that change the shape of data, such as Sampling, Filter and Summarize\n\nCaching Step will be ignored during scale-out run invoked by `to_spark_dataframe()`." 12 | }, 13 | { 14 | "metadata": { 15 | "trusted": true 16 | }, 17 | "cell_type": "code", 18 | "source": "# read data and apply transforms\nimport azureml.dataprep as dprep\ndf = dprep.read_csv(path='https://dpreptestfiles.blob.core.windows.net/testfiles/read_csv_duplicate_headers.csv')\ndf = df.take_sample(probability=0.2, seed=7)\ndf = df.skip(1)\ndf = df.sort_asc(columns='schnam10')\ndf = df.keep_columns(['stnam', 'fipst', 'leaid', 'leanm10', 'ncessch', 'schnam10'])\ndf.head(5)", 19 | "execution_count": 6, 20 | "outputs": [ 21 | { 22 | "output_type": "execute_result", 23 | "execution_count": 6, 24 | "data": { 25 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
stnamfipstleaidleanm10ncesschschnam10
0ALABAMA1102100Limestone County10210000797Ardmore High Sch
1ALABAMA1101920Jefferson County10192000691Brighton Middle Sch
2ALABAMA1101920Jefferson County10192000720Bryan Elem Sch
3ALABAMA1102010Lauderdale County10201000766Cloverdale Jr High Sch
4ALABAMA1101920Jefferson County10192000701Erwin Elem Sch
\n
", 26 | "text/plain": " stnam fipst leaid leanm10 ncessch \\\n0 ALABAMA 1 102100 Limestone County 10210000797 \n1 ALABAMA 1 101920 Jefferson County 10192000691 \n2 ALABAMA 1 101920 Jefferson County 10192000720 \n3 ALABAMA 1 102010 Lauderdale County 10201000766 \n4 ALABAMA 1 101920 Jefferson County 10192000701 \n\n schnam10 \n0 Ardmore High Sch \n1 Brighton Middle Sch \n2 Bryan Elem Sch \n3 Cloverdale Jr High Sch \n4 Erwin Elem Sch " 27 | }, 28 | "metadata": {} 29 | } 30 | ] 31 | }, 32 | { 33 | "metadata": { 34 | "trusted": true 35 | }, 36 | "cell_type": "code", 37 | "source": "# choose a directory to store cache data\nimport os\nfrom pathlib import Path\ncache_dir = str(Path(os.getcwd(), 'dataflow-cache'))\ncache_dir", 38 | "execution_count": 7, 39 | "outputs": [ 40 | { 41 | "output_type": "execute_result", 42 | "execution_count": 7, 43 | "data": { 44 | "text/plain": "'/home/nbuser/library/dataflow-cache'" 45 | }, 46 | "metadata": {} 47 | } 48 | ] 49 | }, 50 | { 51 | "metadata": { 52 | "trusted": true 53 | }, 54 | "cell_type": "code", 55 | "source": "# choose a directory to store cache data\ncache_dir = str('dataflow-cache')\ncache_dir", 56 | "execution_count": 8, 57 | "outputs": [ 58 | { 59 | "output_type": "execute_result", 60 | "execution_count": 8, 61 | "data": { 62 | "text/plain": "'dataflow-cache'" 63 | }, 64 | "metadata": {} 65 | } 66 | ] 67 | }, 68 | { 69 | "metadata": { 70 | "trusted": true 71 | }, 72 | "cell_type": "code", 73 | "source": "# cache the dataflow\ndf_cached = df.cache(directory_path=cache_dir)", 74 | "execution_count": 9, 75 | "outputs": [ 76 | { 77 | "output_type": "error", 78 | "ename": "ExecutionError", 79 | "evalue": "Cannot write cache. 
Please check if the specified cache folder exists.", 80 | "traceback": [ 81 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 82 | "\u001b[0;31mExecutionError\u001b[0m Traceback (most recent call last)", 83 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# cache the dataflow\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mdf_cached\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcache\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdirectory_path\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcache_dir\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 84 | "\u001b[0;32m~/anaconda3_501/lib/python3.6/site-packages/azureml/dataprep/api/dataflow.py\u001b[0m in \u001b[0;36mcache\u001b[0;34m(self, directory_path)\u001b[0m\n\u001b[1;32m 982\u001b[0m \u001b[0;34m'cachePath'\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mLocalDataSource\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdirectory_path\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0munderlying_value\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 983\u001b[0m })\n\u001b[0;32m--> 984\u001b[0;31m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhead\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 985\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 986\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 85 | "\u001b[0;32m~/anaconda3_501/lib/python3.6/site-packages/azureml/dataprep/api/dataflow.py\u001b[0m in \u001b[0;36mhead\u001b[0;34m(self, count)\u001b[0m\n\u001b[1;32m 337\u001b[0m \u001b[0;34m:\u001b[0m\u001b[0;32mreturn\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mA\u001b[0m \u001b[0mPandas\u001b[0m \u001b[0mDataframe\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 
338\u001b[0m \"\"\"\n\u001b[0;32m--> 339\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtake\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcount\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_pandas_dataframe\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mextended_types\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 340\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 341\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mrun_local\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 86 | "\u001b[0;32m~/anaconda3_501/lib/python3.6/site-packages/azureml/dataprep/api/dataflow.py\u001b[0m in \u001b[0;36mto_pandas_dataframe\u001b[0;34m(self, extended_types, nulls_as_nan)\u001b[0m\n\u001b[1;32m 391\u001b[0m self._engine_api.execute_anonymous_blocks(\n\u001b[1;32m 392\u001b[0m ExecuteAnonymousBlocksMessageArguments(blocks=steps_to_block_datas(dataflow_to_execute._steps),\n\u001b[0;32m--> 393\u001b[0;31m project_context=self._parent_package_path))\n\u001b[0m\u001b[1;32m 394\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 395\u001b[0m \u001b[0mintermediate_files\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mp\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mp\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mintermediate_path\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mglob\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'part-*'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 87 | "\u001b[0;32m~/anaconda3_501/lib/python3.6/site-packages/azureml/dataprep/api/_aml_helper.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(op_code, message)\u001b[0m\n\u001b[1;32m 35\u001b[0m \u001b[0;32mif\u001b[0m 
\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mchanged\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 36\u001b[0m \u001b[0mengine_api_func\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate_environment_variable\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mchanged\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 37\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0msend_message_func\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mop_code\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmessage\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 38\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 39\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mwrapper\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 88 | "\u001b[0;32m~/anaconda3_501/lib/python3.6/site-packages/azureml/dataprep/api/engineapi/api.py\u001b[0m in \u001b[0;36mexecute_anonymous_blocks\u001b[0;34m(self, message_args)\u001b[0m\n\u001b[1;32m 54\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mupdate_aml_env_vars\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mget_engine_api\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 55\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mexecute_anonymous_blocks\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmessage_args\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mtypedefinitions\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mExecuteAnonymousBlocksMessageArguments\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 56\u001b[0;31m \u001b[0mresponse\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend_message\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Engine.ExecuteActivity'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmessage_args\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 57\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresponse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 58\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 89 | "\u001b[0;32m~/anaconda3_501/lib/python3.6/site-packages/azureml/dataprep/api/engineapi/engine.py\u001b[0m in \u001b[0;36msend_message\u001b[0;34m(self, op_code, message)\u001b[0m\n\u001b[1;32m 61\u001b[0m \u001b[0mresponse\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_read_response\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 62\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;34m'error'\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mresponse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 63\u001b[0;31m \u001b[0mraise_engine_error\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresponse\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'error'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 64\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mresponse\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'id'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mmessage_id\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 65\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresponse\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'result'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 90 | "\u001b[0;32m~/anaconda3_501/lib/python3.6/site-packages/azureml/dataprep/api/errorhandlers.py\u001b[0m in \u001b[0;36mraise_engine_error\u001b[0;34m(error_response)\u001b[0m\n\u001b[1;32m 
67\u001b[0m \u001b[0merror_code\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0merror_response\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'errorCode'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 68\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;34m'ActivityExecutionFailed'\u001b[0m \u001b[0;32min\u001b[0m \u001b[0merror_code\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 69\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mExecutionError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0merror_response\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 70\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0;34m'UnableToPreviewDataSource'\u001b[0m \u001b[0;32min\u001b[0m \u001b[0merror_code\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 71\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mPreviewDataSourceError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0merror_response\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 91 | "\u001b[0;31mExecutionError\u001b[0m: Cannot write cache. Please check if the specified cache folder exists." 
92 | ] 93 | } 94 | ] 95 | }, 96 | { 97 | "metadata": { 98 | "trusted": true 99 | }, 100 | "cell_type": "code", 101 | "source": "# check steps in df_cached\n[s.step_type for s in df_cached.get_steps()]", 102 | "execution_count": null, 103 | "outputs": [] 104 | }, 105 | { 106 | "metadata": { 107 | "trusted": false 108 | }, 109 | "cell_type": "code", 110 | "source": "# check the stored cache data\nos.listdir(cache_dir)", 111 | "execution_count": 5, 112 | "outputs": [ 113 | { 114 | "data": { 115 | "text/plain": "['7acc00d7-8e69-471d-b74d-085d0625cd9b.cacheIndex',\n '86e51582-fa4f-4b9e-8e45-439692d0da02']" 116 | }, 117 | "execution_count": 5, 118 | "metadata": {}, 119 | "output_type": "execute_result" 120 | } 121 | ] 122 | }, 123 | { 124 | "metadata": { 125 | "trusted": false 126 | }, 127 | "cell_type": "code", 128 | "source": "# run against df_cached will reuse the cache data and skip running all the previous steps again\ndf_cached.head(5)", 129 | "execution_count": 6, 130 | "outputs": [ 131 | { 132 | "data": { 133 | "text/html": "
\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
stnamfipstleaidleanm10ncesschschnam10
0ALABAMA1102100Limestone County10210000797Ardmore High Sch
1ALABAMA1101920Jefferson County10192000691Brighton Middle Sch
2ALABAMA1101920Jefferson County10192000720Bryan Elem Sch
3ALABAMA1102010Lauderdale County10201000766Cloverdale Jr High Sch
4ALABAMA1101920Jefferson County10192000701Erwin Elem Sch
\n
", 134 | "text/plain": " stnam fipst leaid leanm10 ncessch \\\n0 ALABAMA 1 102100 Limestone County 10210000797 \n1 ALABAMA 1 101920 Jefferson County 10192000691 \n2 ALABAMA 1 101920 Jefferson County 10192000720 \n3 ALABAMA 1 102010 Lauderdale County 10201000766 \n4 ALABAMA 1 101920 Jefferson County 10192000701 \n\n schnam10 \n0 Ardmore High Sch \n1 Brighton Middle Sch \n2 Bryan Elem Sch \n3 Cloverdale Jr High Sch \n4 Erwin Elem Sch " 135 | }, 136 | "execution_count": 6, 137 | "metadata": {}, 138 | "output_type": "execute_result" 139 | } 140 | ] 141 | }, 142 | { 143 | "metadata": { 144 | "trusted": false 145 | }, 146 | "cell_type": "code", 147 | "source": "df1 = df_cached.take(10)\ndf2 = df_cached.skip(10).take(10)\n\n# run against df1 and df2 will reuse the cache data as well\ndataframe1 = df1.to_pandas_dataframe()\ndataframe2 = df2.to_pandas_dataframe()", 148 | "execution_count": 7, 149 | "outputs": [] 150 | }, 151 | { 152 | "metadata": { 153 | "trusted": false 154 | }, 155 | "cell_type": "code", 156 | "source": "# clean up cache data\nimport shutil\nshutil.rmtree(path=cache_dir)", 157 | "execution_count": 8, 158 | "outputs": [] 159 | } 160 | ], 161 | "metadata": { 162 | "kernelspec": { 163 | "name": "python36", 164 | "display_name": "Python 3.6", 165 | "language": "python" 166 | }, 167 | "language_info": { 168 | "mimetype": "text/x-python", 169 | "nbconvert_exporter": "python", 170 | "name": "python", 171 | "pygments_lexer": "ipython3", 172 | "version": "3.6.6", 173 | "file_extension": ".py", 174 | "codemirror_mode": { 175 | "version": 3, 176 | "name": "ipython" 177 | } 178 | } 179 | }, 180 | "nbformat": 4, 181 | "nbformat_minor": 2 182 | } -------------------------------------------------------------------------------- /join.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "metadata": {}, 5 | "cell_type": "markdown", 6 | "source": "# Join\nCopyright (c) Microsoft Corporation. 
All rights reserved.
\nLicensed under the MIT License.
\n\nIn DataPrep you can easily join two dataflows." 7 | }, 8 | { 9 | "metadata": { 10 | "trusted": true 11 | }, 12 | "cell_type": "code", 13 | "source": "!pip install azureml", 14 | "execution_count": 1, 15 | "outputs": [ 16 | { 17 | "output_type": "stream", 18 | "text": "Requirement already satisfied: azureml in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (0.2.7)\nRequirement already satisfied: pandas in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml) (0.22.0)\nRequirement already satisfied: python-dateutil in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml) (2.7.5)\nRequirement already satisfied: requests in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml) (2.20.1)\nRequirement already satisfied: pytz>=2011k in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from pandas->azureml) (2018.7)\nRequirement already satisfied: numpy>=1.9.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from pandas->azureml) (1.14.6)\nRequirement already satisfied: six>=1.5 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from python-dateutil->azureml) (1.11.0)\nRequirement already satisfied: urllib3<1.25,>=1.21.1 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (1.23)\nRequirement already satisfied: chardet<3.1.0,>=3.0.2 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (3.0.4)\nRequirement already satisfied: idna<2.8,>=2.5 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (2.7)\nRequirement already satisfied: certifi>=2017.4.17 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (2018.10.15)\n", 19 | "name": "stdout" 20 | } 21 | ] 22 | }, 23 | { 24 | "metadata": { 25 | "trusted": true 26 | }, 27 | "cell_type": "code", 28 | "source": "import azureml.dataprep as dprep", 29 | "execution_count": 2, 30 | "outputs": [] 31 | }, 32 | { 33 | "metadata": {}, 34 | 
"cell_type": "markdown", 35 | "source": "First let's get the left side of the data into a shape that is ready for the join." 36 | }, 37 | { 38 | "metadata": { 39 | "scrolled": false, 40 | "trusted": true 41 | }, 42 | "cell_type": "code", 43 | "source": "# get the first dataflow and derive desired key column\ndataflow_l = dprep.read_csv(path='https://dpreptestfiles.blob.core.windows.net/testfiles/BostonWeather.csv')\ndataflow_l = dataflow_l.derive_column_by_example(source_columns='DATE', new_column_name='date_timerange',\n example_data=[('11/11/2015 0:54', 'Nov 11, 2015 | 12AM-2AM'),\n ('2/1/2015 0:54', 'Feb 1, 2015 | 12AM-2AM'),\n ('1/29/2015 20:54', 'Jan 29, 2015 | 8PM-10PM')])\ndataflow_l = dataflow_l.drop_columns(['DATE'])\n\n# convert types and summarize data\ndataflow_l = dataflow_l.set_column_types(type_conversions={'HOURLYDRYBULBTEMPF': dprep.TypeConverter(dprep.FieldType.DECIMAL)})\ndataflow_l = dataflow_l.filter(expression=dprep.f_not(dprep.col('HOURLYDRYBULBTEMPF').is_error()))\ndataflow_l = dataflow_l.summarize(group_by_columns=['date_timerange'],summary_columns=[dprep.SummaryColumnsValue('HOURLYDRYBULBTEMPF', dprep.api.engineapi.typedefinitions.SummaryFunction.MEAN, 'HOURLYDRYBULBTEMPF_Mean')] )\n\n# cache the result so the steps above are not executed every time we pull on the data\nimport os\nfrom pathlib import Path\ncache_dir = str(Path(os.getcwd(), 'dataflow-cache'))\ndataflow_l.cache(directory_path=cache_dir)\ndataflow_l.head(10)", 44 | "execution_count": 3, 45 | "outputs": [ 46 | { 47 | "output_type": "error", 48 | "ename": "ExecutionError", 49 | "evalue": "Cannot write cache. 
Please check if the specified cache folder exists.", 50 | "traceback": [ 51 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 52 | "\u001b[0;31mExecutionError\u001b[0m Traceback (most recent call last)", 53 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mpathlib\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mPath\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 17\u001b[0m \u001b[0mcache_dir\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mPath\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgetcwd\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'dataflow-cache'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 18\u001b[0;31m \u001b[0mdataflow_l\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcache\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdirectory_path\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcache_dir\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 19\u001b[0m \u001b[0mdataflow_l\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhead\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m10\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 54 | "\u001b[0;32m~/anaconda3_501/lib/python3.6/site-packages/azureml/dataprep/api/dataflow.py\u001b[0m in \u001b[0;36mcache\u001b[0;34m(self, directory_path)\u001b[0m\n\u001b[1;32m 982\u001b[0m \u001b[0;34m'cachePath'\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mLocalDataSource\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdirectory_path\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0munderlying_value\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 983\u001b[0m })\n\u001b[0;32m--> 984\u001b[0;31m 
\u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhead\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 985\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 986\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 55 | "\u001b[0;32m~/anaconda3_501/lib/python3.6/site-packages/azureml/dataprep/api/dataflow.py\u001b[0m in \u001b[0;36mhead\u001b[0;34m(self, count)\u001b[0m\n\u001b[1;32m 337\u001b[0m \u001b[0;34m:\u001b[0m\u001b[0;32mreturn\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mA\u001b[0m \u001b[0mPandas\u001b[0m \u001b[0mDataframe\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 338\u001b[0m \"\"\"\n\u001b[0;32m--> 339\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtake\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcount\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_pandas_dataframe\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mextended_types\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 340\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 341\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mrun_local\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 56 | "\u001b[0;32m~/anaconda3_501/lib/python3.6/site-packages/azureml/dataprep/api/dataflow.py\u001b[0m in \u001b[0;36mto_pandas_dataframe\u001b[0;34m(self, extended_types, nulls_as_nan)\u001b[0m\n\u001b[1;32m 391\u001b[0m self._engine_api.execute_anonymous_blocks(\n\u001b[1;32m 392\u001b[0m ExecuteAnonymousBlocksMessageArguments(blocks=steps_to_block_datas(dataflow_to_execute._steps),\n\u001b[0;32m--> 393\u001b[0;31m project_context=self._parent_package_path))\n\u001b[0m\u001b[1;32m 394\u001b[0m 
\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 395\u001b[0m \u001b[0mintermediate_files\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mp\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mp\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mintermediate_path\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mglob\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'part-*'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 57 | "\u001b[0;32m~/anaconda3_501/lib/python3.6/site-packages/azureml/dataprep/api/_aml_helper.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(op_code, message)\u001b[0m\n\u001b[1;32m 35\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mchanged\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 36\u001b[0m \u001b[0mengine_api_func\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate_environment_variable\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mchanged\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 37\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0msend_message_func\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mop_code\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmessage\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 38\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 39\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mwrapper\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 58 | "\u001b[0;32m~/anaconda3_501/lib/python3.6/site-packages/azureml/dataprep/api/engineapi/api.py\u001b[0m in \u001b[0;36mexecute_anonymous_blocks\u001b[0;34m(self, message_args)\u001b[0m\n\u001b[1;32m 54\u001b[0m 
\u001b[0;34m@\u001b[0m\u001b[0mupdate_aml_env_vars\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mget_engine_api\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 55\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mexecute_anonymous_blocks\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmessage_args\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mtypedefinitions\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mExecuteAnonymousBlocksMessageArguments\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 56\u001b[0;31m \u001b[0mresponse\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend_message\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Engine.ExecuteActivity'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmessage_args\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 57\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresponse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 58\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 59 | "\u001b[0;32m~/anaconda3_501/lib/python3.6/site-packages/azureml/dataprep/api/engineapi/engine.py\u001b[0m in \u001b[0;36msend_message\u001b[0;34m(self, op_code, message)\u001b[0m\n\u001b[1;32m 61\u001b[0m \u001b[0mresponse\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_read_response\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 62\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;34m'error'\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mresponse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 63\u001b[0;31m 
\u001b[0mraise_engine_error\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresponse\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'error'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 64\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mresponse\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'id'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mmessage_id\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 65\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresponse\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'result'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 60 | "\u001b[0;32m~/anaconda3_501/lib/python3.6/site-packages/azureml/dataprep/api/errorhandlers.py\u001b[0m in \u001b[0;36mraise_engine_error\u001b[0;34m(error_response)\u001b[0m\n\u001b[1;32m 67\u001b[0m \u001b[0merror_code\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0merror_response\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'errorCode'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 68\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;34m'ActivityExecutionFailed'\u001b[0m \u001b[0;32min\u001b[0m \u001b[0merror_code\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 69\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mExecutionError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0merror_response\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 70\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0;34m'UnableToPreviewDataSource'\u001b[0m \u001b[0;32min\u001b[0m \u001b[0merror_code\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 71\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mPreviewDataSourceError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0merror_response\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 61 | "\u001b[0;31mExecutionError\u001b[0m: Cannot write cache. 
Please check if the specified cache folder exists." 62 | ] 63 | } 64 | ] 65 | }, 66 | { 67 | "metadata": {}, 68 | "cell_type": "markdown", 69 | "source": "Now let's prepare the data for the right side of the join." 70 | }, 71 | { 72 | "metadata": { 73 | "scrolled": false, 74 | "trusted": true 75 | }, 76 | "cell_type": "code", 77 | "source": "# get the second dataflow and desired key column\ndataflow_r = dprep.read_csv(path='https://dpreptestfiles.blob.core.windows.net/bike-share/*-hubway-tripdata.csv')\ndataflow_r = dataflow_r.keep_columns(['starttime', 'start station id'])\ndataflow_r = dataflow_r.derive_column_by_example(source_columns='starttime', new_column_name='l_date_timerange',\n example_data=[('2015-01-01 00:21:44', 'Jan 1, 2015 | 12AM-2AM')])\ndataflow_r = dataflow_r.drop_columns('starttime')\n\n# cache the results\ndataflow_r.cache(directory_path=cache_dir)\ndataflow_r.head(10)", 78 | "execution_count": null, 79 | "outputs": [] 80 | }, 81 | { 82 | "metadata": {}, 83 | "cell_type": "markdown", 84 | "source": "There are three ways one can join two dataflows in DataPrep:\n1. Create `JoinBuilder` object for interactive join configuration.\n2. Call ```join()``` on one of the dataflows and pass in the other along with all other arguments.\n3. Call ```Dataflow.join()``` method and pass in two dataflows along with all other arguments.\n\nWe will explore the builder object as it simplifies the determination of correct arguments. 
" 85 | }, 86 | { 87 | "metadata": { 88 | "trusted": true 89 | }, 90 | "cell_type": "code", 91 | "source": "# construct a builder for joining dataflow_l with dataflow_r\njoin_builder = dataflow_l.builders.join(right_dataflow=dataflow_r, left_column_prefix='l', right_column_prefix='r')\n\njoin_builder", 92 | "execution_count": null, 93 | "outputs": [] 94 | }, 95 | { 96 | "metadata": {}, 97 | "cell_type": "markdown", 98 | "source": "As you can see, so far the builder has no properties set except default values.\nFrom here you could set each of the options and preview its effect on the join result or use DataPrep to determine some of them.\n\nLet's start with determining appropriate column prefixes for the left and right sides of the join and lists of columns that would not conflict and therefore don't need to be prefixed. " 99 | }, 100 | { 101 | "metadata": { 102 | "scrolled": true, 103 | "trusted": true 104 | }, 105 | "cell_type": "code", 106 | "source": "join_builder.detect_column_info()\njoin_builder", 107 | "execution_count": null, 108 | "outputs": [] 109 | }, 110 | { 111 | "metadata": {}, 112 | "cell_type": "markdown", 113 | "source": "You can see that DataPrep has performed a pull on both dataflows to determine the column names in them. Given that `dataflow_r` already had a column starting with `l_`, a new prefix was generated which would not collide with any column names that are already present.\nAdditionally, columns in each dataflow that won't conflict during the join would remain unprefixed.\nThis approach to column naming is crucial for join robustness to schema changes in the data. 
Let's say that at some time in the future the data consumed by the left dataflow will also have an `l_date_timerange` column in it.\nConfigured as above, the join will still run as expected and the new column will be prefixed with `l2_`, ensuring that if column `l_date_timerange` was consumed by some other future transformation it remains unaffected.\n\nNote: `KEY_generated` is appended to both lists and is reserved for Dataprep use in case Autojoin is performed.\n\n### Autojoin\nAutojoin is a Dataprep feature that determines suitable join arguments given data on both sides. In some cases Autojoin can even derive a key column from a number of available columns in the data.\nHere is how you can use Autojoin:" 114 | }, 115 | { 116 | "metadata": { 117 | "trusted": true 118 | }, 119 | "cell_type": "code", 120 | "source": "# generate join suggestions\njoin_builder.generate_suggested_join()\n\n# list generated suggestions\njoin_builder.list_join_suggestions()", 121 | "execution_count": null, 122 | "outputs": [] 123 | }, 124 | { 125 | "metadata": {}, 126 | "cell_type": "markdown", 127 | "source": "Now let's select the first suggestion and preview the result of the join." 128 | }, 129 | { 130 | "metadata": { 131 | "trusted": true 132 | }, 133 | "cell_type": "code", 134 | "source": "# apply first suggestion\njoin_builder.apply_suggestion(0)\n\njoin_builder.preview(10)", 135 | "execution_count": null, 136 | "outputs": [] 137 | }, 138 | { 139 | "metadata": {}, 140 | "cell_type": "markdown", 141 | "source": "Everything looks just as we would expect, so it is time to get our new joined dataflow." 
142 | }, 143 | { 144 | "metadata": { 145 | "trusted": true 146 | }, 147 | "cell_type": "code", 148 | "source": "dataflow_autojoined = join_builder.to_dataflow().drop_columns(['l_date_timerange'])", 149 | "execution_count": null, 150 | "outputs": [] 151 | }, 152 | { 153 | "metadata": {}, 154 | "cell_type": "markdown", 155 | "source": "### Joining two dataflows without pulling the data\n\nIf you don't want to pull on data and already know what the join should look like, you can always use the join method on Dataflow." 156 | }, 157 | { 158 | "metadata": { 159 | "trusted": true 160 | }, 161 | "cell_type": "code", 162 | "source": "dataflow_joined = dprep.Dataflow.join(left_dataflow=dataflow_l,\n right_dataflow=dataflow_r,\n join_key_pairs=[('date_timerange', 'l_date_timerange')],\n left_column_prefix='l2_',\n right_column_prefix='r_')\n", 163 | "execution_count": null, 164 | "outputs": [] 165 | }, 166 | { 167 | "metadata": { 168 | "trusted": true 169 | }, 170 | "cell_type": "code", 171 | "source": "dataflow_joined.head(10)", 172 | "execution_count": null, 173 | "outputs": [] 174 | }, 175 | { 176 | "metadata": { 177 | "trusted": true 178 | }, 179 | "cell_type": "code", 180 | "source": "dataflow_joined = dataflow_joined.filter(expression=dprep.col('r_start station id') == '67')\ndf = dataflow_joined.to_pandas_dataframe()\ndf", 181 | "execution_count": null, 182 | "outputs": [] 183 | }, 184 | { 185 | "metadata": { 186 | "trusted": true 187 | }, 188 | "cell_type": "code", 189 | "source": "", 190 | "execution_count": null, 191 | "outputs": [] 192 | }, 193 | { 194 | "metadata": { 195 | "trusted": true 196 | }, 197 | "cell_type": "code", 198 | "source": "", 199 | "execution_count": null, 200 | "outputs": [] 201 | }, 202 | { 203 | "metadata": { 204 | "trusted": true 205 | }, 206 | "cell_type": "code", 207 | "source": "", 208 | "execution_count": null, 209 | "outputs": [] 210 | }, 211 | { 212 | "metadata": { 213 | "trusted": true 214 | }, 215 | "cell_type": "code", 216 | "source": "", 217 | 
"execution_count": null, 218 | "outputs": [] 219 | } 220 | ], 221 | "metadata": { 222 | "kernelspec": { 223 | "name": "python36", 224 | "display_name": "Python 3.6", 225 | "language": "python" 226 | }, 227 | "language_info": { 228 | "mimetype": "text/x-python", 229 | "nbconvert_exporter": "python", 230 | "name": "python", 231 | "pygments_lexer": "ipython3", 232 | "version": "3.6.6", 233 | "file_extension": ".py", 234 | "codemirror_mode": { 235 | "version": 3, 236 | "name": "ipython" 237 | } 238 | } 239 | }, 240 | "nbformat": 4, 241 | "nbformat_minor": 2 242 | } -------------------------------------------------------------------------------- /split-column-by-example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "metadata": {}, 5 | "cell_type": "markdown", 6 | "source": "# Split column by example\nCopyright (c) Microsoft Corporation. All rights reserved.
\nLicensed under the MIT License." 7 | }, 8 | { 9 | "metadata": {}, 10 | "cell_type": "markdown", 11 | "source": "DataPrep also offers you a way to easily split a column into multiple columns.\nThe SplitColumnByExampleBuilder class lets you generate a proper split program that will work even when the cases are not trivial, like in example below." 12 | }, 13 | { 14 | "metadata": { 15 | "trusted": true 16 | }, 17 | "cell_type": "code", 18 | "source": "!pip install azureml", 19 | "execution_count": 1, 20 | "outputs": [ 21 | { 22 | "output_type": "stream", 23 | "text": "Requirement already satisfied: azureml in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (0.2.7)\nRequirement already satisfied: requests in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml) (2.19.1)\nRequirement already satisfied: pandas in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml) (0.22.0)\nRequirement already satisfied: python-dateutil in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from azureml) (2.7.3)\nRequirement already satisfied: urllib3<1.24,>=1.21.1 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (1.23)\nRequirement already satisfied: certifi>=2017.4.17 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (2018.8.24)\nRequirement already satisfied: chardet<3.1.0,>=3.0.2 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (3.0.4)\nRequirement already satisfied: idna<2.8,>=2.5 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from requests->azureml) (2.7)\nRequirement already satisfied: pytz>=2011k in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from pandas->azureml) (2018.5)\nRequirement already satisfied: numpy>=1.9.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from pandas->azureml) (1.14.5)\nRequirement already satisfied: six>=1.5 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from 
python-dateutil->azureml) (1.11.0)\n", 24 | "name": "stdout" 25 | } 26 | ] 27 | }, 28 | { 29 | "metadata": { 30 | "trusted": true 31 | }, 32 | "cell_type": "code", 33 | "source": "import azureml.dataprep as dprep", 34 | "execution_count": 3, 35 | "outputs": [] 36 | }, 37 | { 38 | "metadata": { 39 | "scrolled": true, 40 | "trusted": true 41 | }, 42 | "cell_type": "code", 43 | "source": "dataflow = dprep.read_lines(path='https://dpreptestfiles.blob.core.windows.net/testfiles/sample.log')\ndf = dataflow.head(10)", 44 | "execution_count": 4, 45 | "outputs": [] 46 | }, 47 | { 48 | "metadata": { 49 | "trusted": true 50 | }, 51 | "cell_type": "code", 52 | "source": "df['Line'].iloc[0]", 53 | "execution_count": 5, 54 | "outputs": [ 55 | { 56 | "output_type": "execute_result", 57 | "execution_count": 5, 58 | "data": { 59 | "text/plain": "'2012-02-03 18:35:34 SampleClass6 [INFO] everything normal for id 577725851'" 60 | }, 61 | "metadata": {} 62 | } 63 | ] 64 | }, 65 | { 66 | "metadata": {}, 67 | "cell_type": "markdown", 68 | "source": "As you can see above, you can't split this particular log file by the space character, as that would create too many columns and, even worse, the number of columns would depend on a string in the 6th column.\nThat's where split_column_by_example could be quite useful." 69 | }, 70 | { 71 | "metadata": { 72 | "trusted": true 73 | }, 74 | "cell_type": "code", 75 | "source": "b = dataflow.builders.split_column_by_example('Line', keep_delimiters=True)", 76 | "execution_count": 6, 77 | "outputs": [] 78 | }, 79 | { 80 | "metadata": { 81 | "scrolled": false, 82 | "trusted": true 83 | }, 84 | "cell_type": "code", 85 | "source": "b.preview()", 86 | "execution_count": 7, 87 | "outputs": [ 88 | { 89 | "output_type": "execute_result", 90 | "execution_count": 7, 91 | "data": { 92 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
LineLine_1Line_2Line_3Line_4Line_5Line_6Line_7Line_8Line_9Line_10Line_11Line_12
02012-02-03 18:35:34 SampleClass6 [INFO] everyt...2012-02-0318:35:34SampleClass6[INFO]everything normal for id577725851
12012-02-03 18:35:34 SampleClass4 [FATAL] syste...2012-02-0318:35:34SampleClass4[FATAL]system problem at id1991281254
22012-02-03 18:35:34 SampleClass3 [DEBUG] detai...2012-02-0318:35:34SampleClass3[DEBUG]detail for id1304807656
32012-02-03 18:35:34 SampleClass3 [WARN] missin...2012-02-0318:35:34SampleClass3[WARN]missing id423340895
42012-02-03 18:35:34 SampleClass5 [TRACE] verbo...2012-02-0318:35:34SampleClass5[TRACE]verbose detail for id2082654978
52012-02-03 18:35:34 SampleClass0 [ERROR] incor...NoneNoneNoneNoneNoneNoneNoneNoneNoneNoneNoneNone
62012-02-03 18:35:34 SampleClass9 [TRACE] verbo...2012-02-0318:35:34SampleClass9[TRACE]verbose detail for id438634209
72012-02-03 18:35:34 SampleClass8 [DEBUG] detai...2012-02-0318:35:34SampleClass8[DEBUG]detail for id2074121310
82012-02-03 18:55:54 SampleClass4 [DEBUG] detai...2012-02-0318:55:54SampleClass4[DEBUG]detail for id1029178762
92012-02-03 18:55:54 SampleClass2 [TRACE] verbo...2012-02-0318:55:54SampleClass2[TRACE]verbose detail for id1135460272
\n
", 93 | "text/plain": " Line Line_1 Line_2 \\\n0 2012-02-03 18:35:34 SampleClass6 [INFO] everyt... 2012-02-03 \n1 2012-02-03 18:35:34 SampleClass4 [FATAL] syste... 2012-02-03 \n2 2012-02-03 18:35:34 SampleClass3 [DEBUG] detai... 2012-02-03 \n3 2012-02-03 18:35:34 SampleClass3 [WARN] missin... 2012-02-03 \n4 2012-02-03 18:35:34 SampleClass5 [TRACE] verbo... 2012-02-03 \n5 2012-02-03 18:35:34 SampleClass0 [ERROR] incor... None None \n6 2012-02-03 18:35:34 SampleClass9 [TRACE] verbo... 2012-02-03 \n7 2012-02-03 18:35:34 SampleClass8 [DEBUG] detai... 2012-02-03 \n8 2012-02-03 18:55:54 SampleClass4 [DEBUG] detai... 2012-02-03 \n9 2012-02-03 18:55:54 SampleClass2 [TRACE] verbo... 2012-02-03 \n\n Line_3 Line_4 Line_5 Line_6 Line_7 Line_8 Line_9 \\\n0 18:35:34 SampleClass 6 [ INFO ] \n1 18:35:34 SampleClass 4 [ FATAL ] \n2 18:35:34 SampleClass 3 [ DEBUG ] \n3 18:35:34 SampleClass 3 [ WARN ] \n4 18:35:34 SampleClass 5 [ TRACE ] \n5 None None None None None None None \n6 18:35:34 SampleClass 9 [ TRACE ] \n7 18:35:34 SampleClass 8 [ DEBUG ] \n8 18:55:54 SampleClass 4 [ DEBUG ] \n9 18:55:54 SampleClass 2 [ TRACE ] \n\n Line_10 Line_11 Line_12 \n0 everything normal for id 577725851 \n1 system problem at id 1991281254 \n2 detail for id 1304807656 \n3 missing id 423340895 \n4 verbose detail for id 2082654978 \n5 None None None \n6 verbose detail for id 438634209 \n7 detail for id 2074121310 \n8 detail for id 1029178762 \n9 verbose detail for id 1135460272 " 94 | }, 95 | "metadata": {} 96 | } 97 | ] 98 | }, 99 | { 100 | "metadata": {}, 101 | "cell_type": "markdown", 102 | "source": "Couple things to take note of here. No examples were given, and yet DataPrep was able to generate quite reasonable split program. \nWe have passed keep_delimiters=True so we can see all the data split into columns. In practice, though, delimiters are rarely useful, so let's exclude them." 
103 | }, 104 | { 105 | "metadata": { 106 | "scrolled": true, 107 | "trusted": true 108 | }, 109 | "cell_type": "code", 110 | "source": "b.keep_delimiters = False\nb.preview()", 111 | "execution_count": 8, 112 | "outputs": [ 113 | { 114 | "output_type": "execute_result", 115 | "execution_count": 8, 116 | "data": { 117 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
LineLine_1Line_2Line_3Line_4Line_5Line_6Line_7
02012-02-03 18:35:34 SampleClass6 [INFO] everyt...2012-02-0318:35:34SampleClass6INFOeverything normal for id577725851
12012-02-03 18:35:34 SampleClass4 [FATAL] syste...2012-02-0318:35:34SampleClass4FATALsystem problem at id1991281254
22012-02-03 18:35:34 SampleClass3 [DEBUG] detai...2012-02-0318:35:34SampleClass3DEBUGdetail for id1304807656
32012-02-03 18:35:34 SampleClass3 [WARN] missin...2012-02-0318:35:34SampleClass3WARNmissing id423340895
42012-02-03 18:35:34 SampleClass5 [TRACE] verbo...2012-02-0318:35:34SampleClass5TRACEverbose detail for id2082654978
52012-02-03 18:35:34 SampleClass0 [ERROR] incor...NoneNoneNoneNoneNoneNoneNone
62012-02-03 18:35:34 SampleClass9 [TRACE] verbo...2012-02-0318:35:34SampleClass9TRACEverbose detail for id438634209
72012-02-03 18:35:34 SampleClass8 [DEBUG] detai...2012-02-0318:35:34SampleClass8DEBUGdetail for id2074121310
82012-02-03 18:55:54 SampleClass4 [DEBUG] detai...2012-02-0318:55:54SampleClass4DEBUGdetail for id1029178762
92012-02-03 18:55:54 SampleClass2 [TRACE] verbo...2012-02-0318:55:54SampleClass2TRACEverbose detail for id1135460272
\n
", 118 | "text/plain": " Line Line_1 Line_2 \\\n0 2012-02-03 18:35:34 SampleClass6 [INFO] everyt... 2012-02-03 18:35:34 \n1 2012-02-03 18:35:34 SampleClass4 [FATAL] syste... 2012-02-03 18:35:34 \n2 2012-02-03 18:35:34 SampleClass3 [DEBUG] detai... 2012-02-03 18:35:34 \n3 2012-02-03 18:35:34 SampleClass3 [WARN] missin... 2012-02-03 18:35:34 \n4 2012-02-03 18:35:34 SampleClass5 [TRACE] verbo... 2012-02-03 18:35:34 \n5 2012-02-03 18:35:34 SampleClass0 [ERROR] incor... None None \n6 2012-02-03 18:35:34 SampleClass9 [TRACE] verbo... 2012-02-03 18:35:34 \n7 2012-02-03 18:35:34 SampleClass8 [DEBUG] detai... 2012-02-03 18:35:34 \n8 2012-02-03 18:55:54 SampleClass4 [DEBUG] detai... 2012-02-03 18:55:54 \n9 2012-02-03 18:55:54 SampleClass2 [TRACE] verbo... 2012-02-03 18:55:54 \n\n Line_3 Line_4 Line_5 Line_6 Line_7 \n0 SampleClass 6 INFO everything normal for id 577725851 \n1 SampleClass 4 FATAL system problem at id 1991281254 \n2 SampleClass 3 DEBUG detail for id 1304807656 \n3 SampleClass 3 WARN missing id 423340895 \n4 SampleClass 5 TRACE verbose detail for id 2082654978 \n5 None None None None None \n6 SampleClass 9 TRACE verbose detail for id 438634209 \n7 SampleClass 8 DEBUG detail for id 2074121310 \n8 SampleClass 4 DEBUG detail for id 1029178762 \n9 SampleClass 2 TRACE verbose detail for id 1135460272 " 119 | }, 120 | "metadata": {} 121 | } 122 | ] 123 | }, 124 | { 125 | "metadata": {}, 126 | "cell_type": "markdown", 127 | "source": "This looks pretty good already, except for line 5.\nIf we request generation of suggested examples, we will see that line 5 is one of the items the program needs more input on." 128 | }, 129 | { 130 | "metadata": { 131 | "trusted": true 132 | }, 133 | "cell_type": "code", 134 | "source": "suggestions = b.generate_suggested_examples()\nsuggestions", 135 | "execution_count": 9, 136 | "outputs": [ 137 | { 138 | "output_type": "execute_result", 139 | "execution_count": 9, 140 | "data": { 141 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
Line
02012-02-03 18:35:34 SampleClass6 [INFO] everyt...
12012-02-03 18:35:34 SampleClass0 [ERROR] incor...
2
3java.lang.Exception: 2012-02-03 19:11:02 Sampl...
4\\tat com.osa.mocklogger.MockLogger$2.run(MockL...
\n
", 142 | "text/plain": " Line\n0 2012-02-03 18:35:34 SampleClass6 [INFO] everyt...\n1 2012-02-03 18:35:34 SampleClass0 [ERROR] incor...\n2 \n3 java.lang.Exception: 2012-02-03 19:11:02 Sampl...\n4 \\tat com.osa.mocklogger.MockLogger$2.run(MockL..." 143 | }, 144 | "metadata": {} 145 | } 146 | ] 147 | }, 148 | { 149 | "metadata": { 150 | "trusted": true 151 | }, 152 | "cell_type": "code", 153 | "source": "suggestions.iloc[1]['Line']", 154 | "execution_count": 10, 155 | "outputs": [ 156 | { 157 | "output_type": "execute_result", 158 | "execution_count": 10, 159 | "data": { 160 | "text/plain": "'2012-02-03 18:35:34 SampleClass0 [ERROR] incorrect id 1886438513'" 161 | }, 162 | "metadata": {} 163 | } 164 | ] 165 | }, 166 | { 167 | "metadata": {}, 168 | "cell_type": "markdown", 169 | "source": "Having retrieved the source value, we can now provide an example of the desired split.\nNotice that we chose not to split date and time but rather keep them together in one column." 170 | }, 171 | { 172 | "metadata": { 173 | "trusted": true 174 | }, 175 | "cell_type": "code", 176 | "source": "b.add_example(example=(suggestions['Line'].iloc[1], ['2012-02-03 18:35:34','SampleClass0','ERROR','incorrect id','1886438513']))", 177 | "execution_count": 11, 178 | "outputs": [] 179 | }, 180 | { 181 | "metadata": { 182 | "scrolled": false, 183 | "trusted": true 184 | }, 185 | "cell_type": "code", 186 | "source": "b.preview()", 187 | "execution_count": 12, 188 | "outputs": [ 189 | { 190 | "output_type": "execute_result", 191 | "execution_count": 12, 192 | "data": { 193 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
LineLine_1Line_2Line_3Line_4Line_5
02012-02-03 18:35:34 SampleClass6 [INFO] everyt...2012-02-03 18:35:34SampleClass6INFOeverything normal for id577725851
12012-02-03 18:35:34 SampleClass4 [FATAL] syste...2012-02-03 18:35:34SampleClass4FATALsystem problem at id1991281254
22012-02-03 18:35:34 SampleClass3 [DEBUG] detai...2012-02-03 18:35:34SampleClass3DEBUGdetail for id1304807656
32012-02-03 18:35:34 SampleClass3 [WARN] missin...2012-02-03 18:35:34SampleClass3WARNmissing id423340895
42012-02-03 18:35:34 SampleClass5 [TRACE] verbo...2012-02-03 18:35:34SampleClass5TRACEverbose detail for id2082654978
52012-02-03 18:35:34 SampleClass0 [ERROR] incor...2012-02-03 18:35:34SampleClass0ERRORincorrect id1886438513
62012-02-03 18:35:34 SampleClass9 [TRACE] verbo...2012-02-03 18:35:34SampleClass9TRACEverbose detail for id438634209
72012-02-03 18:35:34 SampleClass8 [DEBUG] detai...2012-02-03 18:35:34SampleClass8DEBUGdetail for id2074121310
82012-02-03 18:55:54 SampleClass4 [DEBUG] detai...2012-02-03 18:55:54SampleClass4DEBUGdetail for id1029178762
92012-02-03 18:55:54 SampleClass2 [TRACE] verbo...2012-02-03 18:55:54SampleClass2TRACEverbose detail for id1135460272
\n
", 194 | "text/plain": " Line Line_1 \\\n0 2012-02-03 18:35:34 SampleClass6 [INFO] everyt... 2012-02-03 18:35:34 \n1 2012-02-03 18:35:34 SampleClass4 [FATAL] syste... 2012-02-03 18:35:34 \n2 2012-02-03 18:35:34 SampleClass3 [DEBUG] detai... 2012-02-03 18:35:34 \n3 2012-02-03 18:35:34 SampleClass3 [WARN] missin... 2012-02-03 18:35:34 \n4 2012-02-03 18:35:34 SampleClass5 [TRACE] verbo... 2012-02-03 18:35:34 \n5 2012-02-03 18:35:34 SampleClass0 [ERROR] incor... 2012-02-03 18:35:34 \n6 2012-02-03 18:35:34 SampleClass9 [TRACE] verbo... 2012-02-03 18:35:34 \n7 2012-02-03 18:35:34 SampleClass8 [DEBUG] detai... 2012-02-03 18:35:34 \n8 2012-02-03 18:55:54 SampleClass4 [DEBUG] detai... 2012-02-03 18:55:54 \n9 2012-02-03 18:55:54 SampleClass2 [TRACE] verbo... 2012-02-03 18:55:54 \n\n Line_2 Line_3 Line_4 Line_5 \n0 SampleClass6 INFO everything normal for id 577725851 \n1 SampleClass4 FATAL system problem at id 1991281254 \n2 SampleClass3 DEBUG detail for id 1304807656 \n3 SampleClass3 WARN missing id 423340895 \n4 SampleClass5 TRACE verbose detail for id 2082654978 \n5 SampleClass0 ERROR incorrect id 1886438513 \n6 SampleClass9 TRACE verbose detail for id 438634209 \n7 SampleClass8 DEBUG detail for id 2074121310 \n8 SampleClass4 DEBUG detail for id 1029178762 \n9 SampleClass2 TRACE verbose detail for id 1135460272 " 195 | }, 196 | "metadata": {} 197 | } 198 | ] 199 | }, 200 | { 201 | "metadata": {}, 202 | "cell_type": "markdown", 203 | "source": "This looks just like what we need, so let's get a dataflow with the split in it and drop the original column." 204 | }, 205 | { 206 | "metadata": { 207 | "trusted": true 208 | }, 209 | "cell_type": "code", 210 | "source": "dataflow = b.to_dataflow()\ndataflow = dataflow.drop_columns(['Line'])\ndataflow.head(10)", 211 | "execution_count": 13, 212 | "outputs": [ 213 | { 214 | "output_type": "execute_result", 215 | "execution_count": 13, 216 | "data": { 217 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
Line_1Line_2Line_3Line_4Line_5
02012-02-03 18:35:34SampleClass6INFOeverything normal for id577725851
12012-02-03 18:35:34SampleClass4FATALsystem problem at id1991281254
22012-02-03 18:35:34SampleClass3DEBUGdetail for id1304807656
32012-02-03 18:35:34SampleClass3WARNmissing id423340895
42012-02-03 18:35:34SampleClass5TRACEverbose detail for id2082654978
52012-02-03 18:35:34SampleClass0ERRORincorrect id1886438513
62012-02-03 18:35:34SampleClass9TRACEverbose detail for id438634209
72012-02-03 18:35:34SampleClass8DEBUGdetail for id2074121310
82012-02-03 18:55:54SampleClass4DEBUGdetail for id1029178762
92012-02-03 18:55:54SampleClass2TRACEverbose detail for id1135460272
\n
", 218 | "text/plain": " Line_1 Line_2 Line_3 Line_4 \\\n0 2012-02-03 18:35:34 SampleClass6 INFO everything normal for id \n1 2012-02-03 18:35:34 SampleClass4 FATAL system problem at id \n2 2012-02-03 18:35:34 SampleClass3 DEBUG detail for id \n3 2012-02-03 18:35:34 SampleClass3 WARN missing id \n4 2012-02-03 18:35:34 SampleClass5 TRACE verbose detail for id \n5 2012-02-03 18:35:34 SampleClass0 ERROR incorrect id \n6 2012-02-03 18:35:34 SampleClass9 TRACE verbose detail for id \n7 2012-02-03 18:35:34 SampleClass8 DEBUG detail for id \n8 2012-02-03 18:55:54 SampleClass4 DEBUG detail for id \n9 2012-02-03 18:55:54 SampleClass2 TRACE verbose detail for id \n\n Line_5 \n0 577725851 \n1 1991281254 \n2 1304807656 \n3 423340895 \n4 2082654978 \n5 1886438513 \n6 438634209 \n7 2074121310 \n8 1029178762 \n9 1135460272 " 219 | }, 220 | "metadata": {} 221 | } 222 | ] 223 | }, 224 | { 225 | "metadata": { 226 | "trusted": true 227 | }, 228 | "cell_type": "code", 229 | "source": "", 230 | "execution_count": null, 231 | "outputs": [] 232 | } 233 | ], 234 | "metadata": { 235 | "kernelspec": { 236 | "name": "python36", 237 | "display_name": "Python 3.6", 238 | "language": "python" 239 | }, 240 | "language_info": { 241 | "mimetype": "text/x-python", 242 | "nbconvert_exporter": "python", 243 | "name": "python", 244 | "pygments_lexer": "ipython3", 245 | "version": "3.6.6", 246 | "file_extension": ".py", 247 | "codemirror_mode": { 248 | "version": 3, 249 | "name": "ipython" 250 | } 251 | } 252 | }, 253 | "nbformat": 4, 254 | "nbformat_minor": 2 255 | } --------------------------------------------------------------------------------