├── .github └── workflows │ └── constraints.yml ├── LICENSE ├── README.md ├── github-actions ├── README.md ├── constraints-fail.json └── constraints-succeed.json ├── java └── demo1 │ ├── .gitignore │ ├── build.gradle.kts │ ├── gradle │ └── wrapper │ │ ├── gradle-wrapper.jar │ │ └── gradle-wrapper.properties │ ├── gradlew │ ├── gradlew.bat │ ├── settings.gradle │ └── src │ └── main │ ├── java │ └── com │ │ └── whylogs │ │ └── examples │ │ └── WhyLogsDemo.java │ └── resources │ └── com │ └── whylogs │ └── examples │ └── Fire_Department_Calls_for_Service.csv ├── python ├── .gitignore ├── .whylogs.yaml ├── .whylogs_mlflow.yaml ├── DatasetDrift.ipynb ├── GettingStarted.ipynb ├── Kafka.ipynb ├── Logging_Images.ipynb ├── MLFlow Integration Example.ipynb ├── MediaSpendDataset.csv ├── RAPIDS GPU Integration Example.ipynb ├── S3 example.ipynb ├── Streaming Mode - whylogs.ipynb ├── WhyLabs_Platform.ipynb ├── WhyLabs_Platform_with_log_reference.ipynb ├── flower2.jpg ├── lending_club_1000.csv ├── lending_club_demo.csv ├── logging_example.ipynb ├── mlflow.db ├── profile.bin ├── requirements.txt └── whylogs.yaml └── scala ├── .gitignore ├── Fire_Department_Calls_for_Service.csv ├── build.sbt ├── project └── build.properties └── src └── main └── scala ├── WhyLogsDemo.scala ├── WhyLogsScalaLendingClubToWhylabsExample.scala └── WhylabsDatabricks.sh /.github/workflows/constraints.yml: -------------------------------------------------------------------------------- 1 | 2 | on: [push] 3 | 4 | jobs: 5 | whylogs_constraints: 6 | runs-on: ubuntu-latest 7 | name: validates data against whylogs constraints 8 | steps: 9 | # To use this repository's private action, 10 | # you must check out the repository 11 | - name: Checkout 12 | uses: actions/checkout@v2 13 | - name: expect constraints to fail step 14 | uses: whylabs/whylogs_action@v1 15 | id: expect-failure 16 | with: 17 | constraintsfile: 'github-actions/constraints-fail.json' 18 | datafile: 'python/lending_club_1000.csv' 19 | 
expect_failure: 'True' 20 | - name: expect constraints to succeed step 21 | uses: whylabs/whylogs_action@v1 22 | id: expect-success 23 | with: 24 | constraintsfile: 'github-actions/constraints-succeed.json' 25 | datafile: 'python/lending_club_1000.csv' 26 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 
35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # WhyLogs Examples 2 | 3 | This repository contains sample code for whylogs 0.7.x and older*. 4 | 5 | For whylogs 1.x and newer, please visit the examples folder at our repository: 6 | 7 | https://github.com/whylabs/whylogs/tree/mainline/python/examples 8 | 9 | *the source code for the library's versions 0.7.x and older are located in the following repos: 10 | 11 | * Python: https://github.com/whylabs/whylogs/tree/maintenance/0.7.x 12 | * Java: https://github.com/whylabs/whylogs/tree/maintenance/0.7.x/java 13 | -------------------------------------------------------------------------------- /github-actions/README.md: -------------------------------------------------------------------------------- 1 | 2 | whylogs can help monitor your ML datasets as part of your GitOps CI/CD pipeline. 3 | 4 | The github action used in this example is defined in `.github/workflows/constraints.yml`. 
5 | It specifies what actions to take whenever commits are pushed to this repo. 6 | 7 | This directory contains whylogs constraints that are applied to a dataset as part of the Github action. 8 | Constraints assert that a logged value or summary statistic is within an expected range. 9 | 10 | ```yaml 11 | - name: expect constraints to fail step 12 | uses: whylabs/whylogs_action@v1 13 | id: expect-failure 14 | with: 15 | constraintsfile: 'github-actions/constraints-fail.json' 16 | datafile: 'python/lending_club_1000.csv' 17 | expect_failure: 'True' 18 | - name: expect constraints to succeed step 19 | uses: whylabs/whylogs_action@v1 20 | id: expect-success 21 | with: 22 | constraintsfile: 'github-actions/constraints-succeed.json' 23 | datafile: 'python/lending_club_1000.csv' 24 | ``` 25 | 26 | We define two steps in our action. The first runs a set of constraints that are expected to fail. That 27 | is done just to check that the constraint logic is working as expected. The second step applies a set of constraints that are expected to succeed. 28 | 29 | ## Action Tags 30 | 31 | `uses:` references the prepackaged action in the`whylabs/whylogs_action` repo. 32 | That tells github how to run whylogs on parameters you supply. This is a tag common to all Github actions. 33 | 34 | `constraintsfile:` points to a file of constraints defined in this repo. 35 | 36 | `datafile:` points to a file containing data to which the constraints should be applied. Format 37 | is anything that the pandas package can load, but CSV works well. 38 | 39 | `expect_failure:` indicates whether the action is expected to fail or not. Actions are usually 40 | written to expect success; we include this flag for completeness. 41 | 42 | ## Constraint Definition 43 | 44 | whylogs constraints are specified in JSON. 45 | Each constraint is bound to a column in the data, and each column may have multiple constraints. 46 | Standard boolean comparison operators are supported -- LT, LE, EQ, NE, GE, GT. 
47 | We are actively extending whylogs to support other constraint operators, for example, 48 | to match regex on strings or to test image features. 49 | 50 | Example: 51 | ``` 52 | { 53 | "valueConstraints": { 54 | "loan_amnt": { 55 | "constraints": [ 56 | { 57 | "value": 548250.0, 58 | "op": "LT" 59 | }, 60 | { 61 | "value": 2500.0, 62 | "op": "LT", 63 | "verbose": true 64 | } 65 | ] 66 | } 67 | }, 68 | "summaryConstraints": { 69 | "annual_inc": { 70 | "constraints": [ 71 | { 72 | "firstField": "min", 73 | "value": 0.0, 74 | "op": "GE" 75 | } 76 | ] 77 | } 78 | } 79 | } 80 | ``` 81 | 82 | This example shows the definition of two types of constraints; `valueConstraints` and `summaryConstraints`. 83 | Value constraints are applied to every value that is logged for a feature. At a minimum, 84 | Value constraints must specify a comparison operator and a literal value. 85 | Summary constraints are applied to Whylogs feature summaries. 86 | They compare fields of the summary to static literals or to another field in the summary, 87 | 88 | Constraints may be marked 'verbose' which will log every failure. 89 | ``` 90 | INFO - value constraint value GT 2500.0 failed on value 2500.0 91 | ``` 92 | Verbose logging helps identify why a constraint is failing to validate, but can be very chatty if there are a lots of failures. 
93 | 94 | 95 | 96 | -------------------------------------------------------------------------------- /github-actions/constraints-fail.json: -------------------------------------------------------------------------------- 1 | { 2 | "valueConstraints": { 3 | "loan_amnt": { 4 | "constraints": [ 5 | { 6 | "name": "value LT 548250", 7 | "value": 548250.0, 8 | "op": "LT", 9 | "verbose": false 10 | }, 11 | { 12 | "name": "value GT 2500.0", 13 | "value": 2500.0, 14 | "op": "GT", 15 | "verbose": true 16 | } 17 | ] 18 | }, 19 | "fico_range_high": { 20 | "constraints": [ 21 | { 22 | "name": "value GT 4000", 23 | "value": 4000.0, 24 | "op": "LT", 25 | "verbose": true 26 | } 27 | ] 28 | } 29 | }, 30 | "summaryConstraints": { 31 | "annual_inc": { 32 | "constraints": [ 33 | { 34 | "name": "summary min GE 0/None", 35 | "first_field": "min", 36 | "value": 0.0, 37 | "op": "GE", 38 | "verbose": false 39 | } 40 | ] 41 | } 42 | } 43 | } -------------------------------------------------------------------------------- /github-actions/constraints-succeed.json: -------------------------------------------------------------------------------- 1 | { 2 | "summaryConstraints": { 3 | "annual_inc": { 4 | "constraints": [ 5 | { 6 | "name": "summary min GE 0/None", 7 | "first_field": "min", 8 | "value": 0.0, 9 | "op": "GE", 10 | "verbose": false 11 | } 12 | ] 13 | } 14 | } 15 | } -------------------------------------------------------------------------------- /java/demo1/.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | .gradle 3 | build 4 | output 5 | -------------------------------------------------------------------------------- /java/demo1/build.gradle.kts: -------------------------------------------------------------------------------- 1 | plugins { 2 | java 3 | } 4 | 5 | group = "com.whylogs.example" 6 | version = "1.0-SNAPSHOT" 7 | 8 | repositories { 9 | mavenCentral() 10 | } 11 | 12 | dependencies { 13 | 
implementation("ai.whylabs:whylogs-core:0.0.2b3") 14 | implementation("org.apache.commons:commons-csv:1.8") 15 | } 16 | -------------------------------------------------------------------------------- /java/demo1/gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/whylabs/whylogs-examples/167be1e91f335ca2b77a09aa6ef99090099bfcae/java/demo1/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /java/demo1/gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | distributionBase=GRADLE_USER_HOME 2 | distributionPath=wrapper/dists 3 | distributionUrl=https\://services.gradle.org/distributions/gradle-6.1-bin.zip 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | -------------------------------------------------------------------------------- /java/demo1/gradlew: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | 3 | # 4 | # Copyright 2015 the original author or authors. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # https://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | # 18 | 19 | ############################################################################## 20 | ## 21 | ## Gradle start up script for UN*X 22 | ## 23 | ############################################################################## 24 | 25 | # Attempt to set APP_HOME 26 | # Resolve links: $0 may be a link 27 | PRG="$0" 28 | # Need this for relative symlinks. 29 | while [ -h "$PRG" ] ; do 30 | ls=`ls -ld "$PRG"` 31 | link=`expr "$ls" : '.*-> \(.*\)$'` 32 | if expr "$link" : '/.*' > /dev/null; then 33 | PRG="$link" 34 | else 35 | PRG=`dirname "$PRG"`"/$link" 36 | fi 37 | done 38 | SAVED="`pwd`" 39 | cd "`dirname \"$PRG\"`/" >/dev/null 40 | APP_HOME="`pwd -P`" 41 | cd "$SAVED" >/dev/null 42 | 43 | APP_NAME="Gradle" 44 | APP_BASE_NAME=`basename "$0"` 45 | 46 | # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 47 | DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"' 48 | 49 | # Use the maximum available, or set MAX_FD != -1 to use that value. 50 | MAX_FD="maximum" 51 | 52 | warn () { 53 | echo "$*" 54 | } 55 | 56 | die () { 57 | echo 58 | echo "$*" 59 | echo 60 | exit 1 61 | } 62 | 63 | # OS specific support (must be 'true' or 'false'). 64 | cygwin=false 65 | msys=false 66 | darwin=false 67 | nonstop=false 68 | case "`uname`" in 69 | CYGWIN* ) 70 | cygwin=true 71 | ;; 72 | Darwin* ) 73 | darwin=true 74 | ;; 75 | MINGW* ) 76 | msys=true 77 | ;; 78 | NONSTOP* ) 79 | nonstop=true 80 | ;; 81 | esac 82 | 83 | CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar 84 | 85 | # Determine the Java command to use to start the JVM. 86 | if [ -n "$JAVA_HOME" ] ; then 87 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then 88 | # IBM's JDK on AIX uses strange locations for the executables 89 | JAVACMD="$JAVA_HOME/jre/sh/java" 90 | else 91 | JAVACMD="$JAVA_HOME/bin/java" 92 | fi 93 | if [ ! 
-x "$JAVACMD" ] ; then 94 | die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME 95 | 96 | Please set the JAVA_HOME variable in your environment to match the 97 | location of your Java installation." 98 | fi 99 | else 100 | JAVACMD="java" 101 | which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 102 | 103 | Please set the JAVA_HOME variable in your environment to match the 104 | location of your Java installation." 105 | fi 106 | 107 | # Increase the maximum file descriptors if we can. 108 | if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then 109 | MAX_FD_LIMIT=`ulimit -H -n` 110 | if [ $? -eq 0 ] ; then 111 | if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then 112 | MAX_FD="$MAX_FD_LIMIT" 113 | fi 114 | ulimit -n $MAX_FD 115 | if [ $? -ne 0 ] ; then 116 | warn "Could not set maximum file descriptor limit: $MAX_FD" 117 | fi 118 | else 119 | warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT" 120 | fi 121 | fi 122 | 123 | # For Darwin, add options to specify how the application appears in the dock 124 | if $darwin; then 125 | GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\"" 126 | fi 127 | 128 | # For Cygwin or MSYS, switch paths to Windows format before running java 129 | if [ "$cygwin" = "true" -o "$msys" = "true" ] ; then 130 | APP_HOME=`cygpath --path --mixed "$APP_HOME"` 131 | CLASSPATH=`cygpath --path --mixed "$CLASSPATH"` 132 | JAVACMD=`cygpath --unix "$JAVACMD"` 133 | 134 | # We build the pattern for arguments to be converted via cygpath 135 | ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null` 136 | SEP="" 137 | for dir in $ROOTDIRSRAW ; do 138 | ROOTDIRS="$ROOTDIRS$SEP$dir" 139 | SEP="|" 140 | done 141 | OURCYGPATTERN="(^($ROOTDIRS))" 142 | # Add a user-defined pattern to the cygpath arguments 143 | if [ "$GRADLE_CYGPATTERN" != "" ] ; then 144 | 
OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)" 145 | fi 146 | # Now convert the arguments - kludge to limit ourselves to /bin/sh 147 | i=0 148 | for arg in "$@" ; do 149 | CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -` 150 | CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option 151 | 152 | if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition 153 | eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"` 154 | else 155 | eval `echo args$i`="\"$arg\"" 156 | fi 157 | i=`expr $i + 1` 158 | done 159 | case $i in 160 | 0) set -- ;; 161 | 1) set -- "$args0" ;; 162 | 2) set -- "$args0" "$args1" ;; 163 | 3) set -- "$args0" "$args1" "$args2" ;; 164 | 4) set -- "$args0" "$args1" "$args2" "$args3" ;; 165 | 5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;; 166 | 6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;; 167 | 7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;; 168 | 8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;; 169 | 9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;; 170 | esac 171 | fi 172 | 173 | # Escape application args 174 | save () { 175 | for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done 176 | echo " " 177 | } 178 | APP_ARGS=`save "$@"` 179 | 180 | # Collect all arguments for the java command, following the shell quoting and substitution rules 181 | eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS" 182 | 183 | exec "$JAVACMD" "$@" 184 | -------------------------------------------------------------------------------- /java/demo1/gradlew.bat: -------------------------------------------------------------------------------- 1 | @rem 2 | @rem Copyright 2015 the original author or authors. 
3 | @rem 4 | @rem Licensed under the Apache License, Version 2.0 (the "License"); 5 | @rem you may not use this file except in compliance with the License. 6 | @rem You may obtain a copy of the License at 7 | @rem 8 | @rem https://www.apache.org/licenses/LICENSE-2.0 9 | @rem 10 | @rem Unless required by applicable law or agreed to in writing, software 11 | @rem distributed under the License is distributed on an "AS IS" BASIS, 12 | @rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | @rem See the License for the specific language governing permissions and 14 | @rem limitations under the License. 15 | @rem 16 | 17 | @if "%DEBUG%" == "" @echo off 18 | @rem ########################################################################## 19 | @rem 20 | @rem Gradle startup script for Windows 21 | @rem 22 | @rem ########################################################################## 23 | 24 | @rem Set local scope for the variables with windows NT shell 25 | if "%OS%"=="Windows_NT" setlocal 26 | 27 | set DIRNAME=%~dp0 28 | if "%DIRNAME%" == "" set DIRNAME=. 29 | set APP_BASE_NAME=%~n0 30 | set APP_HOME=%DIRNAME% 31 | 32 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 33 | set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m" 34 | 35 | @rem Find java.exe 36 | if defined JAVA_HOME goto findJavaFromJavaHome 37 | 38 | set JAVA_EXE=java.exe 39 | %JAVA_EXE% -version >NUL 2>&1 40 | if "%ERRORLEVEL%" == "0" goto init 41 | 42 | echo. 43 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 44 | echo. 45 | echo Please set the JAVA_HOME variable in your environment to match the 46 | echo location of your Java installation. 47 | 48 | goto fail 49 | 50 | :findJavaFromJavaHome 51 | set JAVA_HOME=%JAVA_HOME:"=% 52 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe 53 | 54 | if exist "%JAVA_EXE%" goto init 55 | 56 | echo. 
57 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 58 | echo. 59 | echo Please set the JAVA_HOME variable in your environment to match the 60 | echo location of your Java installation. 61 | 62 | goto fail 63 | 64 | :init 65 | @rem Get command-line arguments, handling Windows variants 66 | 67 | if not "%OS%" == "Windows_NT" goto win9xME_args 68 | 69 | :win9xME_args 70 | @rem Slurp the command line arguments. 71 | set CMD_LINE_ARGS= 72 | set _SKIP=2 73 | 74 | :win9xME_args_slurp 75 | if "x%~1" == "x" goto execute 76 | 77 | set CMD_LINE_ARGS=%* 78 | 79 | :execute 80 | @rem Setup the command line 81 | 82 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar 83 | 84 | @rem Execute Gradle 85 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS% 86 | 87 | :end 88 | @rem End local scope for the variables with windows NT shell 89 | if "%ERRORLEVEL%"=="0" goto mainEnd 90 | 91 | :fail 92 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of 93 | rem the _cmd.exe /c_ return code! 
94 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 95 | exit /b 1 96 | 97 | :mainEnd 98 | if "%OS%"=="Windows_NT" endlocal 99 | 100 | :omega 101 | -------------------------------------------------------------------------------- /java/demo1/settings.gradle: -------------------------------------------------------------------------------- 1 | rootProject.name = 'demo1' 2 | 3 | -------------------------------------------------------------------------------- /java/demo1/src/main/java/com/whylogs/examples/WhyLogsDemo.java: -------------------------------------------------------------------------------- 1 | package com.whylogs.examples; 2 | 3 | import com.whylogs.core.DatasetProfile; 4 | import org.apache.commons.csv.CSVFormat; 5 | import org.apache.commons.csv.CSVParser; 6 | import org.apache.commons.csv.CSVRecord; 7 | 8 | import java.io.InputStreamReader; 9 | import java.io.OutputStream; 10 | import java.nio.file.Files; 11 | import java.nio.file.Path; 12 | import java.nio.file.Paths; 13 | import java.nio.file.StandardOpenOption; 14 | import java.time.Instant; 15 | import java.time.LocalDate; 16 | import java.time.ZoneOffset; 17 | import java.time.format.DateTimeFormatter; 18 | import java.util.Collections; 19 | import java.util.HashMap; 20 | import java.util.Map; 21 | import java.util.UUID; 22 | 23 | /** 24 | * An example of processing a CSV dataset. 25 | * 26 | * Here we demonstrate how you can extract data from a CSV file and track it with WhyLogs. We group 27 | * the data by year here and run profiling for each year. 28 | * 29 | * In practice, if the data is sorted by date, you can write the data to disk as soon as you see the timestamp 30 | * increase (in this case, you see the value of the following year in the dataset). In that way you can 31 | * guarantee constant memory usage. 
32 | */ 33 | public class WhyLogsDemo { 34 | 35 | public static final String DATE_COLUMN = "Call Date"; 36 | public static final CSVFormat CSV_FORMAT = CSVFormat.DEFAULT 37 | .withFirstRecordAsHeader() 38 | .withNullString("") 39 | .withDelimiter(','); 40 | public static final String INPUT_FILE_NAME = "Fire_Department_Calls_for_Service.csv"; 41 | public static final DateTimeFormatter DATE_TIME_FORMAT = DateTimeFormatter.ofPattern("MM/dd/yyyy"); 42 | 43 | public static void main(String[] args) throws Exception { 44 | final String sessionId = UUID.randomUUID().toString(); 45 | final Instant now = Instant.now(); 46 | 47 | // map for storing the result 48 | final Map result = new HashMap<>(); 49 | 50 | try (final InputStreamReader is = new InputStreamReader(WhyLogsDemo.class.getResourceAsStream(INPUT_FILE_NAME))) { 51 | final CSVParser parser = new CSVParser(is, CSV_FORMAT); 52 | 53 | // iterate through records 54 | for (final CSVRecord record : parser) { 55 | // extract date time 56 | final Instant dataTime = parseAndTruncateToYear(record.get(DATE_COLUMN)); 57 | 58 | // create new dataset profile 59 | final DatasetProfile profile = result.computeIfAbsent(dataTime, 60 | t -> new DatasetProfile(sessionId, now, t, Collections.emptyMap(), Collections.emptyMap())); 61 | 62 | // track multiple features 63 | profile.track(record.toMap()); 64 | } 65 | } 66 | 67 | System.out.println("Number of profiles: " + result.size()); 68 | 69 | // write to a folder called "output" 70 | final Path output = Paths.get("output"); 71 | Files.createDirectories(output); 72 | 73 | for (Map.Entry entry : result.entrySet()) { 74 | final DatasetProfile profile = entry.getValue(); 75 | // associate the year with filename 76 | final String fileName = String.format("profile_%s.bin", entry.getKey().atZone(ZoneOffset.UTC).getYear()); 77 | 78 | // write out the output 79 | try (final OutputStream os = 80 | Files.newOutputStream(output.resolve(fileName), StandardOpenOption.WRITE, 
StandardOpenOption.CREATE)) { 81 | profile.toProtobuf().build().writeDelimitedTo(os); 82 | } 83 | } 84 | } 85 | 86 | /** 87 | * Parse a text to an Instant object. This is used to extract data from the CSV and map 88 | * them into DatasetProfile's dataset_timestamp 89 | * 90 | * @param text input text 91 | * @return time in UTC as {@link Instant} 92 | */ 93 | private static Instant parseAndTruncateToYear(String text) { 94 | return LocalDate.parse(text, DATE_TIME_FORMAT) 95 | .atStartOfDay() 96 | .withDayOfMonth(1) 97 | .withMonth(1) 98 | .atZone(ZoneOffset.UTC).toInstant(); 99 | } 100 | } 101 | -------------------------------------------------------------------------------- /python/.gitignore: -------------------------------------------------------------------------------- 1 | whylogs-output 2 | mlruns 3 | .ipynb_checkpoints/ 4 | -------------------------------------------------------------------------------- /python/.whylogs.yaml: -------------------------------------------------------------------------------- 1 | # .whylogs.yaml 2 | 3 | # Example WhyLogs YAML configuration 4 | project: example-project 5 | pipeline: example-pipeline 6 | verbose: false 7 | writers: 8 | # Save out the full protobuf datasketches data locally 9 | - formats: 10 | - protobuf 11 | output_path: whylogs-output 12 | # Template variables can be accessed via $variable or ${variable} 13 | path_template: $name/dataset_profile 14 | filename_template: datase_profile-$dataset_timestamp 15 | type: local 16 | # Save out the flat summary data locally, separately from the protobuf 17 | - formats: 18 | - flat 19 | - json 20 | output_path: whylogs-output 21 | path_template: $name/dataset_summary 22 | filename_template: dataset_summary-$dataset_timestamp 23 | type: local 24 | -------------------------------------------------------------------------------- /python/.whylogs_mlflow.yaml: -------------------------------------------------------------------------------- 1 | project: example-project 2 | 
pipeline: example-pipeline 3 | verbose: false 4 | writers: 5 | - data_collection_consent: true 6 | formats: ['protobuf'] 7 | output_path: whylogs-output 8 | type: local 9 | - data_collection_consent: true 10 | formats: ['protobuf'] 11 | output_path: mlflow 12 | type: mlflow -------------------------------------------------------------------------------- /python/Kafka.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "similar-corporation", 6 | "metadata": {}, 7 | "source": [ 8 | "This notebook assumes kafka (and zookeeper) have been started and are available at localhost:9092.\n", 9 | "\n", 10 | "https://medium.com/better-programming/your-local-event-driven-environment-using-dockerised-kafka-cluster-6e84af09cd95\n", 11 | "\n", 12 | "```\n", 13 | "$ docker-compose up -d\n", 14 | "```\n", 15 | "\n", 16 | "You can explicitly create Kafka topics with appropriate replication and partition config.\n", 17 | "\n", 18 | "```\n", 19 | "% docker exec -ti kafka bash\n", 20 | "root@kafka:/# kafka-topics --create --bootstrap-server localhost:9092 --replication-factor 1 --partitions 1 --topic whylogs-stream\n", 21 | "```" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 1, 27 | "id": "genuine-recipient", 28 | "metadata": {}, 29 | "outputs": [ 30 | { 31 | "name": "stdout", 32 | "output_type": "stream", 33 | "text": [ 34 | "Requirement already satisfied: kafka-python in /Users/chris/opt/miniconda3/envs/jupyter/lib/python3.8/site-packages (2.0.2)\n" 35 | ] 36 | } 37 | ], 38 | "source": [ 39 | "%matplotlib inline\n", 40 | "import warnings\n", 41 | "warnings.simplefilter(\"ignore\")\n", 42 | "\n", 43 | "!pip install kafka-python\n", 44 | "import datetime\n", 45 | "import os.path\n", 46 | "import pandas as pd\n", 47 | "import numpy as np" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "id": "recorded-stamp", 53 | "metadata": {}, 54 | "source": [ 55 | 
"Load some sample data that we will feed into a Kafka topic." 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 2, 61 | "id": "brutal-breath", 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "data_file = \"lending_club_demo.csv\"\n", 66 | "full_data = pd.read_csv(os.path.join(data_file))\n", 67 | "full_data['issue_d'].describe()\n", 68 | "\n", 69 | "data = full_data[full_data['issue_d'] == 'Jan-2017']" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "id": "heated-venue", 75 | "metadata": {}, 76 | "source": [ 77 | "Load some data into a Kafka topic." 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 3, 83 | "id": "rotary-manhattan", 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "from kafka import KafkaProducer\n", 88 | "import json\n", 89 | "producer = KafkaProducer(bootstrap_servers='localhost:9092',\n", 90 | " value_serializer=lambda v: json.dumps(v).encode('utf-8'))\n", 91 | "\n", 92 | "for i, row in data.iterrows():\n", 93 | " producer.send('whylogs-stream', row.to_dict())" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 4, 99 | "id": "intimate-circulation", 100 | "metadata": {}, 101 | "outputs": [ 102 | { 103 | "name": "stdout", 104 | "output_type": "stream", 105 | "text": [ 106 | "topic whylogs-stream - partition 0\n" 107 | ] 108 | } 109 | ], 110 | "source": [ 111 | "import json\n", 112 | "from kafka import KafkaConsumer, TopicPartition\n", 113 | "\n", 114 | "consumer = KafkaConsumer(bootstrap_servers='localhost:9092', \n", 115 | " value_deserializer=lambda x: json.loads(x.decode('utf-8')))\n", 116 | "\n", 117 | "# consumer.seek_to_beginning workaround\n", 118 | "# https://github.com/dpkp/kafka-python/issues/601#issuecomment-331419097\n", 119 | "assignments = []\n", 120 | "topics=['whylogs-stream']\n", 121 | "for topic in topics:\n", 122 | " partitions = consumer.partitions_for_topic(topic)\n", 123 | " for p in partitions:\n", 124 | " print(f'topic 
{topic} - partition {p}')\n", 125 |             "        assignments.append(TopicPartition(topic, p))\n", 126 |             "consumer.assign(assignments)" 127 |         ] 128 |     }, 129 |     { 130 |         "cell_type": "markdown", 131 |         "id": "swedish-monkey", 132 |         "metadata": {}, 133 |         "source": [ 134 |             "A long-running, stand-alone python consumer might use this code to read events from a Kafka topic.\n", 135 |             "We don't use this in the Notebook because it does not terminate.\n", 136 |             "\n", 137 |             "```\n", 138 |             "import datetime\n", 139 |             "consumer.seek_to_beginning();\n", 140 |             "total = 0\n", 141 |             "with session.logger(dataset_name=\"another-dataset\", dataset_timestamp=datetime.datetime(2020, 9, 22, 0, 0)) as logger:\n", 142 |             "    for record in consumer:\n", 143 |             "        total += 1\n", 144 |             "        print(f'total {total}')\n", 145 |             "        logger.log(record.value)\n", 146 |             "```\n", 147 |             "\n", 148 |             "For Notebooks it is better to poll for data and exit when the partition is exhausted.\n", 149 |             "\n", 150 |             "For demonstration purposes, we reset all partitions to the beginning."
151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": 11, 156 | "id": "seeing-spider", 157 | "metadata": {}, 158 | "outputs": [ 159 | { 160 | "name": "stdout", 161 | "output_type": "stream", 162 | "text": [ 163 | "TopicPartition(topic='whylogs-stream', partition=0) - 100\n", 164 | "TopicPartition(topic='whylogs-stream', partition=0) - 100\n", 165 | "TopicPartition(topic='whylogs-stream', partition=0) - 67\n", 166 | "TopicPartition(topic='whylogs-stream', partition=0) - 42\n", 167 | "total 309\n" 168 | ] 169 | } 170 | ], 171 | "source": [ 172 | "from whylogs import get_or_create_session\n", 173 | "\n", 174 | "session = get_or_create_session()\n", 175 | "\n", 176 | "consumer.seek_to_beginning();\n", 177 | "with session.logger(dataset_name=\"another-dataset\") as logger:\n", 178 | " total = 0 \n", 179 | " while True:\n", 180 | " finished = True\n", 181 | " record = consumer.poll(timeout_ms=500, max_records=100, update_offsets=True)\n", 182 | " for k,v in record.items():\n", 183 | " print(f'{k} - {len(v)}')\n", 184 | " total += len(v)\n", 185 | " df = pd.DataFrame([row.value for row in v])\n", 186 | " logger.log_dataframe(df)\n", 187 | " finished = False\n", 188 | " if finished:\n", 189 | " print(f\"total {total}\")\n", 190 | " break\n" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": 12, 196 | "id": "adjusted-blast", 197 | "metadata": {}, 198 | "outputs": [ 199 | { 200 | "name": "stdout", 201 | "output_type": "stream", 202 | "text": [ 203 | "whylogs-output/another-dataset/dataset_summary/freq_numbers/dataset_summary-batch.json\n", 204 | "whylogs-output/another-dataset/dataset_summary/json/dataset_summary-batch.json\n", 205 | "whylogs-output/another-dataset/dataset_summary/flat_table/dataset_summary-batch.csv\n", 206 | "whylogs-output/another-dataset/dataset_summary/histogram/dataset_summary-batch.json\n", 207 | "whylogs-output/another-dataset/dataset_summary/frequent_strings/dataset_summary-batch.json\n", 208 | 
"whylogs-output/another-dataset/dataset_profile/protobuf/datase_profile-batch.bin\n" 209 | ] 210 | } 211 | ], 212 | "source": [ 213 | "!find whylogs-output -type f " 214 | ] 215 | } 216 | ], 217 | "metadata": { 218 | "kernelspec": { 219 | "display_name": "Python 3", 220 | "language": "python", 221 | "name": "python3" 222 | }, 223 | "language_info": { 224 | "codemirror_mode": { 225 | "name": "ipython", 226 | "version": 3 227 | }, 228 | "file_extension": ".py", 229 | "mimetype": "text/x-python", 230 | "name": "python", 231 | "nbconvert_exporter": "python", 232 | "pygments_lexer": "ipython3", 233 | "version": "3.8.5" 234 | } 235 | }, 236 | "nbformat": 4, 237 | "nbformat_minor": 5 238 | } 239 | -------------------------------------------------------------------------------- /python/S3 example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Saving Profiles to S3 \n", 8 | "---" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 43, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "from whylogs import get_or_create_session\n", 18 | "import pandas as pd" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 44, 24 | "metadata": {}, 25 | "outputs": [ 26 | { 27 | "name": "stdout", 28 | "output_type": "stream", 29 | "text": [ 30 | "The autoreload extension is already loaded. To reload it, use:\n", 31 | " %reload_ext autoreload\n" 32 | ] 33 | } 34 | ], 35 | "source": [ 36 | "\n", 37 | "%load_ext autoreload\n", 38 | "%autoreload 2" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "## Create a mock s3 server \n" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "For this example we will create a fake s3 server using moto lib. You should remove this section if you have you own bucket setup on aws. 
Make sure you have your aws configuration set. By default this mock server creates a server in region \"us-east-1\"" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 45, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "BUCKET=\"super_awesome_bucket\"" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 46, 67 | "metadata": {}, 68 | "outputs": [ 69 | { 70 | "data": { 71 | "text/plain": [ 72 | "s3.Bucket(name='super_awesome_bucket')" 73 | ] 74 | }, 75 | "execution_count": 46, 76 | "metadata": {}, 77 | "output_type": "execute_result" 78 | } 79 | ], 80 | "source": [ 81 | "from moto import mock_s3\n", 82 | "from moto.s3.responses import DEFAULT_REGION_NAME\n", 83 | "import boto3\n", 84 | "\n", 85 | "mocks3 = mock_s3()\n", 86 | "mocks3.start()\n", 87 | "res = boto3.resource('s3', region_name=DEFAULT_REGION_NAME)\n", 88 | "res.create_bucket(Bucket=BUCKET)\n" 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": {}, 94 | "source": [ 95 | "## Load Data" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "We can go by our usual way, load a example csv data" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 47, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "df = pd.read_csv(\"lending_club_1000.csv\")" 112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "metadata": {}, 117 | "source": [ 118 | "## Config File\n", 119 | "---\n", 120 | "Seting up whylogs to save your data on s3 can be in several ways. Simplest is to simply create a config file,where each data format can be saved to a specific location. 
As shown below " 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 48, 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "CONFIG = \"\"\"\n", 130 | "project: s3_example_project\n", 131 | "pipeline: latest_results\n", 132 | "verbose: false\n", 133 | "writers:\n", 134 | "- formats:\n", 135 | " - protobuf\n", 136 | " output_path: s3://super_awesome_bucket/\n", 137 | " path_template: $name/dataset_summary\n", 138 | " filename_template: dataset_summary\n", 139 | " type: s3\n", 140 | "- formats:\n", 141 | " - flat\n", 142 | " output_path: s3://super_awesome_bucket/\n", 143 | " path_template: $name/dataset_summary\n", 144 | " filename_template: dataset_summary\n", 145 | " type: s3\n", 146 | "- formats:\n", 147 | " - json\n", 148 | " output_path: s3://super_awesome_bucket/\n", 149 | " path_template: $name/dataset_summary\n", 150 | " filename_template: dataset_summary\n", 151 | " type: s3\n", 152 | "\"\"\"" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 49, 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [ 161 | "config_path=\".whylogs.yaml\"\n", 162 | "with open(\".whylogs.yaml\",\"w\") as file:\n", 163 | " file.write(CONFIG)\n" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": {}, 169 | "source": [ 170 | "Checking the content:" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": 50, 176 | "metadata": { 177 | "scrolled": true 178 | }, 179 | "outputs": [ 180 | { 181 | "name": "stdout", 182 | "output_type": "stream", 183 | "text": [ 184 | "\r\n", 185 | "project: s3_example_project\r\n", 186 | "pipeline: latest_results\r\n", 187 | "verbose: false\r\n", 188 | "writers:\r\n", 189 | "- formats:\r\n", 190 | " - protobuf\r\n", 191 | " output_path: s3://super_awesome_bucket/\r\n", 192 | " path_template: $name/dataset_summary\r\n", 193 | " filename_template: dataset_summary\r\n", 194 | " type: s3\r\n", 195 | "- formats:\r\n", 196 | " - flat\r\n", 197 
| " output_path: s3://super_awesome_bucket/\r\n", 198 | " path_template: $name/dataset_summary\r\n", 199 | " filename_template: dataset_summary\r\n", 200 | " type: s3\r\n", 201 | "- formats:\r\n", 202 | " - json\r\n", 203 | " output_path: s3://super_awesome_bucket/\r\n", 204 | " path_template: $name/dataset_summary\r\n", 205 | " filename_template: dataset_summary\r\n", 206 | " type: s3\r\n" 207 | ] 208 | } 209 | ], 210 | "source": [ 211 | "%cat .whylogs.yaml" 212 | ] 213 | }, 214 | { 215 | "cell_type": "markdown", 216 | "metadata": {}, 217 | "source": [ 218 | "If you have a custom name for your config file or place it in a special location you can use the helper function" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": 51, 224 | "metadata": {}, 225 | "outputs": [ 226 | { 227 | "name": "stdout", 228 | "output_type": "stream", 229 | "text": [ 230 | "cache: 1\n", 231 | "pipeline: latest_results\n", 232 | "project: s3_example_project\n", 233 | "verbose: false\n", 234 | "with_rotation_time: null\n", 235 | "writers:\n", 236 | "- filename_template: \n", 237 | " formats:\n", 238 | " - OutputFormat.protobuf\n", 239 | " output_path: s3://super_awesome_bucket/\n", 240 | " path_template: \n", 241 | "- filename_template: \n", 242 | " formats:\n", 243 | " - OutputFormat.flat\n", 244 | " output_path: s3://super_awesome_bucket/\n", 245 | " path_template: \n", 246 | "- filename_template: \n", 247 | " formats:\n", 248 | " - OutputFormat.json\n", 249 | " output_path: s3://super_awesome_bucket/\n", 250 | " path_template: \n", 251 | "\n" 252 | ] 253 | } 254 | ], 255 | "source": [ 256 | "from whylogs.app.session import load_config, session_from_config\n", 257 | "config = load_config(\".whylogs.yaml\")\n", 258 | "session = session_from_config(config)\n", 259 | "print(session.get_config().to_yaml())" 260 | ] 261 | }, 262 | { 263 | "cell_type": "markdown", 264 | "metadata": {}, 265 | "source": [ 266 | "Otherwise if the file is located in your home directory or 
current location you are running, you can simply run `get_or_create_session()`" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": 52, 272 | "metadata": { 273 | "scrolled": true 274 | }, 275 | "outputs": [ 276 | { 277 | "name": "stdout", 278 | "output_type": "stream", 279 | "text": [ 280 | "cache: 1\n", 281 | "pipeline: latest_results\n", 282 | "project: s3_example_project\n", 283 | "verbose: false\n", 284 | "with_rotation_time: null\n", 285 | "writers:\n", 286 | "- filename_template: \n", 287 | " formats:\n", 288 | " - OutputFormat.protobuf\n", 289 | " output_path: s3://super_awesome_bucket/\n", 290 | " path_template: \n", 291 | "- filename_template: \n", 292 | " formats:\n", 293 | " - OutputFormat.flat\n", 294 | " output_path: s3://super_awesome_bucket/\n", 295 | " path_template: \n", 296 | "- filename_template: \n", 297 | " formats:\n", 298 | " - OutputFormat.json\n", 299 | " output_path: s3://super_awesome_bucket/\n", 300 | " path_template: \n", 301 | "\n" 302 | ] 303 | } 304 | ], 305 | "source": [ 306 | "session= get_or_create_session()\n", 307 | "print(session.get_config().to_yaml())" 308 | ] 309 | }, 310 | { 311 | "cell_type": "markdown", 312 | "metadata": {}, 313 | "source": [ 314 | "## Loggin Data \n", 315 | "--- \n", 316 | "The data can be save by simply closing a logger, or one a logger is out of scope." 
317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": 53, 322 | "metadata": {}, 323 | "outputs": [], 324 | "source": [ 325 | "with session.logger(\"dataset_test_s3\") as logger:\n", 326 | " logger.log_dataframe(df)\n" 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": 54, 332 | "metadata": {}, 333 | "outputs": [ 334 | { 335 | "data": { 336 | "text/plain": [ 337 | "['dataset_test_s3/dataset_summary/flat_table/dataset_summary.csv',\n", 338 | " 'dataset_test_s3/dataset_summary/freq_numbers/dataset_summary.json',\n", 339 | " 'dataset_test_s3/dataset_summary/frequent_strings/dataset_summary.json',\n", 340 | " 'dataset_test_s3/dataset_summary/histogram/dataset_summary.json',\n", 341 | " 'dataset_test_s3/dataset_summary/json/dataset_summary.json',\n", 342 | " 'dataset_test_s3/dataset_summary/protobuf/dataset_summary.bin']" 343 | ] 344 | }, 345 | "execution_count": 54, 346 | "metadata": {}, 347 | "output_type": "execute_result" 348 | } 349 | ], 350 | "source": [ 351 | "client = boto3.client('s3')\n", 352 | "objects = client.list_objects(Bucket=BUCKET)\n", 353 | "[obj[\"Key\"] for obj in objects[\"Contents\"]]\n" 354 | ] 355 | }, 356 | { 357 | "cell_type": "markdown", 358 | "metadata": {}, 359 | "source": [ 360 | "You can define the configure for were the data is save through a configuration file or creating a custom writer.\n" 361 | ] 362 | }, 363 | { 364 | "cell_type": "markdown", 365 | "metadata": {}, 366 | "source": [ 367 | "### Close mock s3 server " 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": 55, 373 | "metadata": {}, 374 | "outputs": [], 375 | "source": [ 376 | "mocks3.stop()" 377 | ] 378 | }, 379 | { 380 | "cell_type": "code", 381 | "execution_count": null, 382 | "metadata": {}, 383 | "outputs": [], 384 | "source": [] 385 | } 386 | ], 387 | "metadata": { 388 | "kernelspec": { 389 | "display_name": "Python 3", 390 | "language": "python", 391 | "name": "python3" 392 | }, 393 | 
"language_info": { 394 | "codemirror_mode": { 395 | "name": "ipython", 396 | "version": 3 397 | }, 398 | "file_extension": ".py", 399 | "mimetype": "text/x-python", 400 | "name": "python", 401 | "nbconvert_exporter": "python", 402 | "pygments_lexer": "ipython3", 403 | "version": "3.8.5" 404 | } 405 | }, 406 | "nbformat": 4, 407 | "nbformat_minor": 4 408 | } 409 | -------------------------------------------------------------------------------- /python/Streaming Mode - whylogs.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "In this notebook, we will explore how to use Python in a streaming and distributed manner\n", 8 | "\n", 9 | "## Loading the dataset\n", 10 | "\n", 11 | "To simulate streaming data, we will load data into a Pandas dataframe. Then, we will iterate via each `Row` object, which is a dictionary object.\n", 12 | "\n", 13 | "`whylogs.DatasetProfile.track` method accepts dictionary of `[feature_name, value]`." 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 1, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "import datetime\n", 23 | "import os.path\n", 24 | "import pandas as pd" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 2, 30 | "metadata": {}, 31 | "outputs": [ 32 | { 33 | "data": { 34 | "text/html": [ 35 | "
\n", 36 | "\n", 49 | "\n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " 
\n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | "
idmember_idloan_amntfunded_amntfunded_amnt_invtermint_rateinstallmentgradesub_grade...hardship_payoff_balance_amounthardship_last_payment_amountdisbursement_methoddebt_settlement_flagdebt_settlement_flag_datesettlement_statussettlement_datesettlement_amountsettlement_percentagesettlement_term
1059996596008NaN15000.015000.015000.036 months15.99527.29CC5...NaNNaNCashNNaNNaNNaNNaNNaNNaN
1060196703051NaN14575.014575.014575.036 months25.49583.29EE4...NaNNaNCashNNaNNaNNaNNaNNaNNaN
1060296960509NaN5000.05000.05000.036 months8.24157.24BB1...NaNNaNCashNNaNNaNNaNNaNNaNNaN
1060397463966NaN13200.013200.013200.060 months13.99307.08CC3...NaNNaNCashNNaNNaNNaNNaNNaNNaN
1060596841832NaN9500.09500.09500.036 months8.24298.75BB1...NaNNaNCashNNaNNaNNaNNaNNaNNaN
..................................................................
1091495617334NaN6500.06500.06250.036 months5.32195.75AA1...NaNNaNCashNNaNNaNNaNNaNNaNNaN
1091595129874NaN15000.015000.015000.060 months15.99364.70CC5...NaNNaNCashNNaNNaNNaNNaNNaNNaN
1091696187258NaN40000.040000.040000.036 months7.491244.07AA4...NaNNaNCashNNaNNaNNaNNaNNaNNaN
1091794469381NaN5050.05050.05050.036 months21.49191.54DD5...NaNNaNCashNNaNNaNNaNNaNNaNNaN
1091894480548NaN7350.07350.07350.036 months12.74246.74CC1...NaNNaNCashNNaNNaNNaNNaNNaNNaN
\n", 343 | "

309 rows × 150 columns

\n", 344 | "
" 345 | ], 346 | "text/plain": [ 347 | " id member_id loan_amnt funded_amnt funded_amnt_inv \\\n", 348 | "10599 96596008 NaN 15000.0 15000.0 15000.0 \n", 349 | "10601 96703051 NaN 14575.0 14575.0 14575.0 \n", 350 | "10602 96960509 NaN 5000.0 5000.0 5000.0 \n", 351 | "10603 97463966 NaN 13200.0 13200.0 13200.0 \n", 352 | "10605 96841832 NaN 9500.0 9500.0 9500.0 \n", 353 | "... ... ... ... ... ... \n", 354 | "10914 95617334 NaN 6500.0 6500.0 6250.0 \n", 355 | "10915 95129874 NaN 15000.0 15000.0 15000.0 \n", 356 | "10916 96187258 NaN 40000.0 40000.0 40000.0 \n", 357 | "10917 94469381 NaN 5050.0 5050.0 5050.0 \n", 358 | "10918 94480548 NaN 7350.0 7350.0 7350.0 \n", 359 | "\n", 360 | " term int_rate installment grade sub_grade ... \\\n", 361 | "10599 36 months 15.99 527.29 C C5 ... \n", 362 | "10601 36 months 25.49 583.29 E E4 ... \n", 363 | "10602 36 months 8.24 157.24 B B1 ... \n", 364 | "10603 60 months 13.99 307.08 C C3 ... \n", 365 | "10605 36 months 8.24 298.75 B B1 ... \n", 366 | "... ... ... ... ... ... ... \n", 367 | "10914 36 months 5.32 195.75 A A1 ... \n", 368 | "10915 60 months 15.99 364.70 C C5 ... \n", 369 | "10916 36 months 7.49 1244.07 A A4 ... \n", 370 | "10917 36 months 21.49 191.54 D D5 ... \n", 371 | "10918 36 months 12.74 246.74 C C1 ... \n", 372 | "\n", 373 | " hardship_payoff_balance_amount hardship_last_payment_amount \\\n", 374 | "10599 NaN NaN \n", 375 | "10601 NaN NaN \n", 376 | "10602 NaN NaN \n", 377 | "10603 NaN NaN \n", 378 | "10605 NaN NaN \n", 379 | "... ... ... \n", 380 | "10914 NaN NaN \n", 381 | "10915 NaN NaN \n", 382 | "10916 NaN NaN \n", 383 | "10917 NaN NaN \n", 384 | "10918 NaN NaN \n", 385 | "\n", 386 | " disbursement_method debt_settlement_flag debt_settlement_flag_date \\\n", 387 | "10599 Cash N NaN \n", 388 | "10601 Cash N NaN \n", 389 | "10602 Cash N NaN \n", 390 | "10603 Cash N NaN \n", 391 | "10605 Cash N NaN \n", 392 | "... ... ... ... 
\n", 393 | "10914 Cash N NaN \n", 394 | "10915 Cash N NaN \n", 395 | "10916 Cash N NaN \n", 396 | "10917 Cash N NaN \n", 397 | "10918 Cash N NaN \n", 398 | "\n", 399 | " settlement_status settlement_date settlement_amount \\\n", 400 | "10599 NaN NaN NaN \n", 401 | "10601 NaN NaN NaN \n", 402 | "10602 NaN NaN NaN \n", 403 | "10603 NaN NaN NaN \n", 404 | "10605 NaN NaN NaN \n", 405 | "... ... ... ... \n", 406 | "10914 NaN NaN NaN \n", 407 | "10915 NaN NaN NaN \n", 408 | "10916 NaN NaN NaN \n", 409 | "10917 NaN NaN NaN \n", 410 | "10918 NaN NaN NaN \n", 411 | "\n", 412 | " settlement_percentage settlement_term \n", 413 | "10599 NaN NaN \n", 414 | "10601 NaN NaN \n", 415 | "10602 NaN NaN \n", 416 | "10603 NaN NaN \n", 417 | "10605 NaN NaN \n", 418 | "... ... ... \n", 419 | "10914 NaN NaN \n", 420 | "10915 NaN NaN \n", 421 | "10916 NaN NaN \n", 422 | "10917 NaN NaN \n", 423 | "10918 NaN NaN \n", 424 | "\n", 425 | "[309 rows x 150 columns]" 426 | ] 427 | }, 428 | "execution_count": 2, 429 | "metadata": {}, 430 | "output_type": "execute_result" 431 | } 432 | ], 433 | "source": [ 434 | "data_file = \"lending_club_demo.csv\"\n", 435 | "full_data = pd.read_csv(data_file)\n", 436 | "full_data['issue_d'].describe()\n", 437 | "\n", 438 | "data = full_data[full_data['issue_d'] == 'Jan-2017']\n", 439 | "data" 440 | ] 441 | }, 442 | { 443 | "cell_type": "markdown", 444 | "metadata": {}, 445 | "source": [ 446 | "## Creating a whylogs session\n", 447 | "\n", 448 | "Let's now explore import a function from whylogs that allows us to create a logging session.\n", 449 | "\n", 450 | "This session can be connected with multiple writers that output the results of our profiling locally in JSON, a flat CSV, or binary protobuf format as well as writers to an AWS S3 bucket in the cloud. Further writing functionality will be added as well.\n", 451 | "\n", 452 | "Let's create a default session below." 
453 | ] 454 | }, 455 | { 456 | "cell_type": "code", 457 | "execution_count": 3, 458 | "metadata": {}, 459 | "outputs": [], 460 | "source": [ 461 | "from whylogs import get_or_create_session\n", 462 | "\n", 463 | "session = get_or_create_session()" 464 | ] 465 | }, 466 | { 467 | "cell_type": "markdown", 468 | "metadata": {}, 469 | "source": [ 470 | "## Creating a logger\n", 471 | "\n", 472 | "We can create a logger for a specific dataset timestamp. This often represents a window of data or a batch of data.\n" 473 | ] 474 | }, 475 | { 476 | "cell_type": "code", 477 | "execution_count": 4, 478 | "metadata": {}, 479 | "outputs": [], 480 | "source": [ 481 | "logger= session.logger(dataset_name=\"dataset\", dataset_timestamp=datetime.datetime(2020, 9, 22, 0, 0))" 482 | ] 483 | }, 484 | { 485 | "cell_type": "markdown", 486 | "metadata": {}, 487 | "source": [ 488 | "## Log streaming data\n", 489 | "We'll stream through the dataframe and call `logger.log`.\n", 490 | "\n", 491 | "In practice, you'll call this on individual data points" 492 | ] 493 | }, 494 | { 495 | "cell_type": "code", 496 | "execution_count": 5, 497 | "metadata": {}, 498 | "outputs": [], 499 | "source": [ 500 | "for i, r in data.iterrows():\n", 501 | " logger.log(r)" 502 | ] 503 | }, 504 | { 505 | "cell_type": "code", 506 | "execution_count": 6, 507 | "metadata": {}, 508 | "outputs": [ 509 | { 510 | "data": { 511 | "text/plain": [ 512 | "" 513 | ] 514 | }, 515 | "execution_count": 6, 516 | "metadata": {}, 517 | "output_type": "execute_result" 518 | } 519 | ], 520 | "source": [ 521 | "# close the logger to write to dist\n", 522 | "logger.close()" 523 | ] 524 | }, 525 | { 526 | "cell_type": "markdown", 527 | "metadata": {}, 528 | "source": [ 529 | "## Another logger\n", 530 | "We'll create another logger and write data to the new logger, but with a different timestamp" 531 | ] 532 | }, 533 | { 534 | "cell_type": "code", 535 | "execution_count": 7, 536 | "metadata": {}, 537 | "outputs": [], 538 | "source": [ 
539 | "with session.logger(dataset_name=\"dataset\", dataset_timestamp=datetime.datetime(2020, 9, 21, 0, 0)) as logger:\n", 540 | " for i, r in data.iterrows():\n", 541 | " logger.log(r)" 542 | ] 543 | }, 544 | { 545 | "cell_type": "markdown", 546 | "metadata": {}, 547 | "source": [ 548 | "## Merging data\n", 549 | "Once data is written to disk, we can then merge the entries together to get a summary view.\n", 550 | "\n", 551 | "If you run a distributed systems, this means that you can collect your `whylogs` data into a cloud storage such as S3 and then aggregate them." 552 | ] 553 | }, 554 | { 555 | "cell_type": "code", 556 | "execution_count": 8, 557 | "metadata": {}, 558 | "outputs": [], 559 | "source": [ 560 | "import glob" 561 | ] 562 | }, 563 | { 564 | "cell_type": "code", 565 | "execution_count": 9, 566 | "metadata": {}, 567 | "outputs": [ 568 | { 569 | "data": { 570 | "text/plain": [ 571 | "['whylogs-output/dataset/dataset_profile/protobuf/datase_profile-1600732800000.bin',\n", 572 | " 'whylogs-output/dataset/dataset_profile/protobuf/datase_profile-1600646400000.bin']" 573 | ] 574 | }, 575 | "execution_count": 9, 576 | "metadata": {}, 577 | "output_type": "execute_result" 578 | } 579 | ], 580 | "source": [ 581 | "binaries = glob.glob('whylogs-output/dataset/**/*.bin', recursive=True)\n", 582 | "binaries" 583 | ] 584 | }, 585 | { 586 | "cell_type": "code", 587 | "execution_count": 10, 588 | "metadata": {}, 589 | "outputs": [], 590 | "source": [ 591 | "from whylogs import DatasetProfile\n", 592 | "# currently, whylogs writer writes non-delimited files\n", 593 | "profiles = [DatasetProfile.read_protobuf(x, delimited_file=False) for x in binaries]" 594 | ] 595 | }, 596 | { 597 | "cell_type": "code", 598 | "execution_count": 11, 599 | "metadata": {}, 600 | "outputs": [], 601 | "source": [ 602 | "from functools import reduce\n", 603 | "merged = reduce(lambda x, y: x.merge(y), profiles)" 604 | ] 605 | }, 606 | { 607 | "cell_type": "markdown", 608 | "metadata": {}, 
609 | "source": [ 610 | "## Quick check with the merged data\n", 611 | "We can check the counter to see if the merged data reflect the \"merge\" here" 612 | ] 613 | }, 614 | { 615 | "cell_type": "code", 616 | "execution_count": 12, 617 | "metadata": {}, 618 | "outputs": [ 619 | { 620 | "name": "stdout", 621 | "output_type": "stream", 622 | "text": [ 623 | "First DTI count: 309\n", 624 | "Second DTI count: 309\n", 625 | "Merged count: 618\n" 626 | ] 627 | } 628 | ], 629 | "source": [ 630 | "print(\"First DTI count: \", profiles[0].columns['dti'].counters.count)\n", 631 | "print(\"Second DTI count: \", profiles[1].columns['dti'].counters.count)\n", 632 | "print(\"Merged count: \", merged.columns['dti'].counters.count)" 633 | ] 634 | } 635 | ], 636 | "metadata": { 637 | "kernelspec": { 638 | "display_name": "whylogs", 639 | "language": "python", 640 | "name": "whylogs" 641 | }, 642 | "language_info": { 643 | "codemirror_mode": { 644 | "name": "ipython", 645 | "version": 3 646 | }, 647 | "file_extension": ".py", 648 | "mimetype": "text/x-python", 649 | "name": "python", 650 | "nbconvert_exporter": "python", 651 | "pygments_lexer": "ipython3", 652 | "version": "3.7.7" 653 | } 654 | }, 655 | "nbformat": 4, 656 | "nbformat_minor": 4 657 | } 658 | -------------------------------------------------------------------------------- /python/WhyLabs_Platform.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "013a0fd4-31f9-4f3f-bf3a-1efc9640422f", 6 | "metadata": {}, 7 | "source": [ 8 | "# Install basic requirements" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "id": "ad907ce3-0c3b-49e4-86f1-eae9de934f7b", 15 | "metadata": { 16 | "collapsed": true, 17 | "jupyter": { 18 | "outputs_hidden": true 19 | }, 20 | "tags": [] 21 | }, 22 | "outputs": [ 23 | { 24 | "name": "stdout", 25 | "output_type": "stream", 26 | "text": [ 27 | "Requirement already 
satisfied: whylogs in /Users/andy/miniconda3/envs/whylogs/lib/python3.8/site-packages (0.6.5)\n", 28 | "Requirement already satisfied: pandas in /Users/andy/miniconda3/envs/whylogs/lib/python3.8/site-packages (1.3.3)\n", 29 | "Requirement already satisfied: pytz>=2017.3 in /Users/andy/miniconda3/envs/whylogs/lib/python3.8/site-packages (from pandas) (2021.1)\n", 30 | "Requirement already satisfied: python-dateutil>=2.7.3 in /Users/andy/miniconda3/envs/whylogs/lib/python3.8/site-packages (from pandas) (2.8.2)\n", 31 | "Requirement already satisfied: numpy>=1.17.3 in /Users/andy/miniconda3/envs/whylogs/lib/python3.8/site-packages (from pandas) (1.21.2)\n", 32 | "Requirement already satisfied: six>=1.5 in /Users/andy/miniconda3/envs/whylogs/lib/python3.8/site-packages (from python-dateutil>=2.7.3->pandas) (1.16.0)\n", 33 | "Requirement already satisfied: tqdm<5.0.0,>=4.60.0 in /Users/andy/miniconda3/envs/whylogs/lib/python3.8/site-packages (from whylogs) (4.62.3)\n", 34 | "Requirement already satisfied: xlrd<3.0.0,>=2.0.1 in /Users/andy/miniconda3/envs/whylogs/lib/python3.8/site-packages (from whylogs) (2.0.1)\n", 35 | "Requirement already satisfied: whylabs-client in /Users/andy/miniconda3/envs/whylogs/lib/python3.8/site-packages (from whylogs) (0.1)\n", 36 | "Requirement already satisfied: smart-open>=4.1.2 in /Users/andy/miniconda3/envs/whylogs/lib/python3.8/site-packages (from whylogs) (5.2.1)\n", 37 | "Requirement already satisfied: pyyaml>=5.3.1 in /Users/andy/miniconda3/envs/whylogs/lib/python3.8/site-packages (from whylogs) (5.4.1)\n", 38 | "Requirement already satisfied: whylabs-datasketches>=2.2.0b1 in /Users/andy/miniconda3/envs/whylogs/lib/python3.8/site-packages (from whylogs) (2.2.0b1)\n", 39 | "Requirement already satisfied: boto3>=1.14.1 in /Users/andy/miniconda3/envs/whylogs/lib/python3.8/site-packages (from whylogs) (1.18.45)\n", 40 | "Requirement already satisfied: scikit-learn<0.25.0,>=0.24.2 in 
/Users/andy/miniconda3/envs/whylogs/lib/python3.8/site-packages (from whylogs) (0.24.2)\n", 41 | "Requirement already satisfied: botocore>=1.17.44 in /Users/andy/miniconda3/envs/whylogs/lib/python3.8/site-packages (from whylogs) (1.21.45)\n", 42 | "Requirement already satisfied: mlflow<1.14,>=1.13 in /Users/andy/miniconda3/envs/whylogs/lib/python3.8/site-packages (from whylogs) (1.13.1)\n", 43 | "Requirement already satisfied: click>=7.1.2 in /Users/andy/.local/lib/python3.8/site-packages (from whylogs) (7.1.2)\n", 44 | "Requirement already satisfied: puremagic in /Users/andy/miniconda3/envs/whylogs/lib/python3.8/site-packages (from whylogs) (1.10)\n", 45 | "Requirement already satisfied: matplotlib<4.0.0,>=3.0.3 in /Users/andy/miniconda3/envs/whylogs/lib/python3.8/site-packages (from whylogs) (3.4.3)\n", 46 | "Requirement already satisfied: protobuf>=3.15.5 in /Users/andy/miniconda3/envs/whylogs/lib/python3.8/site-packages (from whylogs) (3.18.0)\n", 47 | "Requirement already satisfied: marshmallow>=3.7.1 in /Users/andy/miniconda3/envs/whylogs/lib/python3.8/site-packages (from whylogs) (3.13.0)\n", 48 | "Requirement already satisfied: openpyxl<4.0.0,>=3.0.7 in /Users/andy/miniconda3/envs/whylogs/lib/python3.8/site-packages (from whylogs) (3.0.7)\n", 49 | "Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /Users/andy/miniconda3/envs/whylogs/lib/python3.8/site-packages (from boto3>=1.14.1->whylogs) (0.10.0)\n", 50 | "Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /Users/andy/miniconda3/envs/whylogs/lib/python3.8/site-packages (from boto3>=1.14.1->whylogs) (0.5.0)\n", 51 | "Requirement already satisfied: urllib3<1.27,>=1.25.4 in /Users/andy/miniconda3/envs/whylogs/lib/python3.8/site-packages (from botocore>=1.17.44->whylogs) (1.26.6)\n", 52 | "Requirement already satisfied: pyparsing>=2.2.1 in /Users/andy/.local/lib/python3.8/site-packages (from matplotlib<4.0.0,>=3.0.3->whylogs) (2.4.7)\n", 53 | "Requirement already satisfied: pillow>=6.2.0 in 
/Users/andy/miniconda3/envs/whylogs/lib/python3.8/site-packages (from matplotlib<4.0.0,>=3.0.3->whylogs) (8.3.2)\n", 54 | "Requirement already satisfied: cycler>=0.10 in /Users/andy/miniconda3/envs/whylogs/lib/python3.8/site-packages (from matplotlib<4.0.0,>=3.0.3->whylogs) (0.10.0)\n", 55 | "Requirement already satisfied: kiwisolver>=1.0.1 in /Users/andy/miniconda3/envs/whylogs/lib/python3.8/site-packages (from matplotlib<4.0.0,>=3.0.3->whylogs) (1.3.2)\n", 56 | "Requirement already satisfied: gitpython>=2.1.0 in /Users/andy/miniconda3/envs/whylogs/lib/python3.8/site-packages (from mlflow<1.14,>=1.13->whylogs) (3.1.24)\n", 57 | "Requirement already satisfied: docker>=4.0.0 in /Users/andy/miniconda3/envs/whylogs/lib/python3.8/site-packages (from mlflow<1.14,>=1.13->whylogs) (5.0.2)\n", 58 | "Requirement already satisfied: querystring-parser in /Users/andy/miniconda3/envs/whylogs/lib/python3.8/site-packages (from mlflow<1.14,>=1.13->whylogs) (1.2.4)\n", 59 | "Requirement already satisfied: Flask in /Users/andy/miniconda3/envs/whylogs/lib/python3.8/site-packages (from mlflow<1.14,>=1.13->whylogs) (2.0.1)\n", 60 | "Requirement already satisfied: gunicorn in /Users/andy/miniconda3/envs/whylogs/lib/python3.8/site-packages (from mlflow<1.14,>=1.13->whylogs) (20.1.0)\n", 61 | "Requirement already satisfied: cloudpickle in /Users/andy/miniconda3/envs/whylogs/lib/python3.8/site-packages (from mlflow<1.14,>=1.13->whylogs) (2.0.0)\n", 62 | "Requirement already satisfied: sqlalchemy in /Users/andy/miniconda3/envs/whylogs/lib/python3.8/site-packages (from mlflow<1.14,>=1.13->whylogs) (1.4.23)\n", 63 | "Requirement already satisfied: sqlparse>=0.3.1 in /Users/andy/miniconda3/envs/whylogs/lib/python3.8/site-packages (from mlflow<1.14,>=1.13->whylogs) (0.4.2)\n", 64 | "Requirement already satisfied: azure-storage-blob>=12.0.0 in /Users/andy/miniconda3/envs/whylogs/lib/python3.8/site-packages (from mlflow<1.14,>=1.13->whylogs) (12.9.0)\n", 65 | "Requirement already satisfied: 
requests>=2.17.3 in /Users/andy/miniconda3/envs/whylogs/lib/python3.8/site-packages (from mlflow<1.14,>=1.13->whylogs) (2.26.0)\n", 66 | "Requirement already satisfied: entrypoints in /Users/andy/miniconda3/envs/whylogs/lib/python3.8/site-packages (from mlflow<1.14,>=1.13->whylogs) (0.3)\n", 67 | "Requirement already satisfied: prometheus-flask-exporter in /Users/andy/miniconda3/envs/whylogs/lib/python3.8/site-packages (from mlflow<1.14,>=1.13->whylogs) (0.18.2)\n", 68 | "Requirement already satisfied: databricks-cli>=0.8.7 in /Users/andy/miniconda3/envs/whylogs/lib/python3.8/site-packages (from mlflow<1.14,>=1.13->whylogs) (0.15.0)\n", 69 | "Requirement already satisfied: alembic<=1.4.1 in /Users/andy/miniconda3/envs/whylogs/lib/python3.8/site-packages (from mlflow<1.14,>=1.13->whylogs) (1.4.1)\n", 70 | "Requirement already satisfied: Mako in /Users/andy/miniconda3/envs/whylogs/lib/python3.8/site-packages (from alembic<=1.4.1->mlflow<1.14,>=1.13->whylogs) (1.1.5)\n", 71 | "Requirement already satisfied: python-editor>=0.3 in /Users/andy/miniconda3/envs/whylogs/lib/python3.8/site-packages (from alembic<=1.4.1->mlflow<1.14,>=1.13->whylogs) (1.0.4)\n", 72 | "Requirement already satisfied: msrest>=0.6.21 in /Users/andy/miniconda3/envs/whylogs/lib/python3.8/site-packages (from azure-storage-blob>=12.0.0->mlflow<1.14,>=1.13->whylogs) (0.6.21)\n", 73 | "Requirement already satisfied: azure-core<2.0.0,>=1.10.0 in /Users/andy/miniconda3/envs/whylogs/lib/python3.8/site-packages (from azure-storage-blob>=12.0.0->mlflow<1.14,>=1.13->whylogs) (1.18.0)\n", 74 | "Requirement already satisfied: cryptography>=2.1.4 in /Users/andy/miniconda3/envs/whylogs/lib/python3.8/site-packages (from azure-storage-blob>=12.0.0->mlflow<1.14,>=1.13->whylogs) (3.4.8)\n", 75 | "Requirement already satisfied: cffi>=1.12 in /Users/andy/miniconda3/envs/whylogs/lib/python3.8/site-packages (from cryptography>=2.1.4->azure-storage-blob>=12.0.0->mlflow<1.14,>=1.13->whylogs) (1.14.6)\n", 76 | "Requirement 
already satisfied: pycparser in /Users/andy/miniconda3/envs/whylogs/lib/python3.8/site-packages (from cffi>=1.12->cryptography>=2.1.4->azure-storage-blob>=12.0.0->mlflow<1.14,>=1.13->whylogs) (2.20)\n", 77 | "Requirement already satisfied: tabulate>=0.7.7 in /Users/andy/miniconda3/envs/whylogs/lib/python3.8/site-packages (from databricks-cli>=0.8.7->mlflow<1.14,>=1.13->whylogs) (0.8.9)\n", 78 | "Requirement already satisfied: websocket-client>=0.32.0 in /Users/andy/miniconda3/envs/whylogs/lib/python3.8/site-packages (from docker>=4.0.0->mlflow<1.14,>=1.13->whylogs) (1.2.1)\n", 79 | "Requirement already satisfied: gitdb<5,>=4.0.1 in /Users/andy/miniconda3/envs/whylogs/lib/python3.8/site-packages (from gitpython>=2.1.0->mlflow<1.14,>=1.13->whylogs) (4.0.7)\n", 80 | "Requirement already satisfied: typing-extensions>=3.7.4.3 in /Users/andy/miniconda3/envs/whylogs/lib/python3.8/site-packages (from gitpython>=2.1.0->mlflow<1.14,>=1.13->whylogs) (3.10.0.2)\n", 81 | "Requirement already satisfied: smmap<5,>=3.0.1 in /Users/andy/miniconda3/envs/whylogs/lib/python3.8/site-packages (from gitdb<5,>=4.0.1->gitpython>=2.1.0->mlflow<1.14,>=1.13->whylogs) (4.0.0)\n", 82 | "Requirement already satisfied: certifi>=2017.4.17 in /Users/andy/miniconda3/envs/whylogs/lib/python3.8/site-packages (from msrest>=0.6.21->azure-storage-blob>=12.0.0->mlflow<1.14,>=1.13->whylogs) (2021.5.30)\n", 83 | "Requirement already satisfied: requests-oauthlib>=0.5.0 in /Users/andy/miniconda3/envs/whylogs/lib/python3.8/site-packages (from msrest>=0.6.21->azure-storage-blob>=12.0.0->mlflow<1.14,>=1.13->whylogs) (1.3.0)\n", 84 | "Requirement already satisfied: isodate>=0.6.0 in /Users/andy/miniconda3/envs/whylogs/lib/python3.8/site-packages (from msrest>=0.6.21->azure-storage-blob>=12.0.0->mlflow<1.14,>=1.13->whylogs) (0.6.0)\n", 85 | "Requirement already satisfied: et-xmlfile in /Users/andy/miniconda3/envs/whylogs/lib/python3.8/site-packages (from openpyxl<4.0.0,>=3.0.7->whylogs) (1.1.0)\n", 86 | 
"Requirement already satisfied: idna<4,>=2.5 in /Users/andy/miniconda3/envs/whylogs/lib/python3.8/site-packages (from requests>=2.17.3->mlflow<1.14,>=1.13->whylogs) (3.2)\n", 87 | "Requirement already satisfied: charset-normalizer~=2.0.0 in /Users/andy/miniconda3/envs/whylogs/lib/python3.8/site-packages (from requests>=2.17.3->mlflow<1.14,>=1.13->whylogs) (2.0.6)\n", 88 | "Requirement already satisfied: oauthlib>=3.0.0 in /Users/andy/miniconda3/envs/whylogs/lib/python3.8/site-packages (from requests-oauthlib>=0.5.0->msrest>=0.6.21->azure-storage-blob>=12.0.0->mlflow<1.14,>=1.13->whylogs) (3.1.1)\n", 89 | "Requirement already satisfied: scipy>=0.19.1 in /Users/andy/miniconda3/envs/whylogs/lib/python3.8/site-packages (from scikit-learn<0.25.0,>=0.24.2->whylogs) (1.7.1)\n", 90 | "Requirement already satisfied: threadpoolctl>=2.0.0 in /Users/andy/miniconda3/envs/whylogs/lib/python3.8/site-packages (from scikit-learn<0.25.0,>=0.24.2->whylogs) (2.2.0)\n", 91 | "Requirement already satisfied: joblib>=0.11 in /Users/andy/miniconda3/envs/whylogs/lib/python3.8/site-packages (from scikit-learn<0.25.0,>=0.24.2->whylogs) (1.0.1)\n", 92 | "Requirement already satisfied: greenlet!=0.4.17 in /Users/andy/miniconda3/envs/whylogs/lib/python3.8/site-packages (from sqlalchemy->mlflow<1.14,>=1.13->whylogs) (1.1.1)\n", 93 | "Requirement already satisfied: Jinja2>=3.0 in /Users/andy/miniconda3/envs/whylogs/lib/python3.8/site-packages (from Flask->mlflow<1.14,>=1.13->whylogs) (3.0.1)\n", 94 | "Requirement already satisfied: Werkzeug>=2.0 in /Users/andy/miniconda3/envs/whylogs/lib/python3.8/site-packages (from Flask->mlflow<1.14,>=1.13->whylogs) (2.0.1)\n", 95 | "Requirement already satisfied: itsdangerous>=2.0 in /Users/andy/miniconda3/envs/whylogs/lib/python3.8/site-packages (from Flask->mlflow<1.14,>=1.13->whylogs) (2.0.1)\n", 96 | "Requirement already satisfied: MarkupSafe>=2.0 in /Users/andy/miniconda3/envs/whylogs/lib/python3.8/site-packages (from 
Jinja2>=3.0->Flask->mlflow<1.14,>=1.13->whylogs) (2.0.1)\n", 97 | "Requirement already satisfied: setuptools>=3.0 in /Users/andy/miniconda3/envs/whylogs/lib/python3.8/site-packages (from gunicorn->mlflow<1.14,>=1.13->whylogs) (58.0.4)\n", 98 | "Requirement already satisfied: prometheus-client in /Users/andy/miniconda3/envs/whylogs/lib/python3.8/site-packages (from prometheus-flask-exporter->mlflow<1.14,>=1.13->whylogs) (0.11.0)\n", 99 | "Collecting argparse\n", 100 | " Using cached argparse-1.4.0-py2.py3-none-any.whl (23 kB)\n", 101 | "Installing collected packages: argparse\n", 102 | "Successfully installed argparse-1.4.0\n", 103 | "Note: you may need to restart the kernel to use updated packages.\n" 104 | ] 105 | } 106 | ], 107 | "source": [ 108 | "pip install -U whylogs pandas" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 2, 114 | "id": "8369d3a8-9bf2-4043-a45a-13838498f211", 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "import whylogs\n", 119 | "import pandas as pd" 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "id": "a244145c-ea35-4ab6-b03e-cf5f864ed94c", 125 | "metadata": {}, 126 | "source": [ 127 | "# Load example data batches\n", 128 | "\n", 129 | "The example data is prepared from our public S3 bucket. You can use your own data if you want if you have multiple batches of data." 
130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 3, 135 | "id": "b78028ea-c7cb-494f-a303-071f1c345dfc", 136 | "metadata": {}, 137 | "outputs": [ 138 | { 139 | "name": "stdout", 140 | "output_type": "stream", 141 | "text": [ 142 | "Loading data from https://whylabs-public.s3.us-west-2.amazonaws.com/demo_batches/input_batch_1.csv\n", 143 | "Loading data from https://whylabs-public.s3.us-west-2.amazonaws.com/demo_batches/input_batch_2.csv\n", 144 | "Loading data from https://whylabs-public.s3.us-west-2.amazonaws.com/demo_batches/input_batch_3.csv\n", 145 | "Loading data from https://whylabs-public.s3.us-west-2.amazonaws.com/demo_batches/input_batch_4.csv\n", 146 | "Loading data from https://whylabs-public.s3.us-west-2.amazonaws.com/demo_batches/input_batch_5.csv\n", 147 | "Loading data from https://whylabs-public.s3.us-west-2.amazonaws.com/demo_batches/input_batch_6.csv\n", 148 | "Loading data from https://whylabs-public.s3.us-west-2.amazonaws.com/demo_batches/input_batch_7.csv\n" 149 | ] 150 | } 151 | ], 152 | "source": [ 153 | "pdfs = []\n", 154 | "for i in range(1, 8):\n", 155 | " path = f\"https://whylabs-public.s3.us-west-2.amazonaws.com/demo_batches/input_batch_{i}.csv\"\n", 156 | " print(f\"Loading data from {path}\")\n", 157 | " df = pd.read_csv(path)\n", 158 | " pdfs.append(df)" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 4, 164 | "id": "67b81ab4-a456-4d2d-9547-ad0d772e0aaa", 165 | "metadata": {}, 166 | "outputs": [ 167 | { 168 | "data": { 169 | "text/html": [ 170 | "
\n", 171 | "\n", 184 | "\n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " 
\n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | "
Unnamed: 0idmember_idloan_amntfunded_amntfunded_amnt_invint_rateinstallmentannual_incdesc...hardship_loan_statusorig_projected_additional_accrued_interesthardship_payoff_balance_amounthardship_last_payment_amountdebt_settlement_flag_datesettlement_statussettlement_datesettlement_amountsettlement_percentagesettlement_term
count407.0000004.070000e+020.0407.000000407.000000407.000000407.000000407.000000407.0000000.0...0.00.00.00.00.00.00.00.00.00.0
mean12548.7174451.158631e+08NaN14203.74692914203.74692914202.94840313.514054418.02034478818.956069NaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
std125.3547721.207642e+06NaN9351.1423749351.1423749350.9978745.446881271.09653155864.939403NaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
min12325.0000001.121538e+08NaN1000.0000001000.0000001000.0000005.32000034.2200000.000000NaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
25%12442.5000001.150769e+08NaN7000.0000007000.0000007000.0000009.930000235.58000043325.000000NaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
50%12550.0000001.157004e+08NaN12000.00000012000.00000012000.00000012.620000357.25000063300.000000NaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
75%12653.5000001.168245e+08NaN20000.00000020000.00000020000.00000016.020000553.51500095000.000000NaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
max12862.0000001.181592e+08NaN40000.00000040000.00000040000.00000030.9900001417.710000495000.000000NaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", 406 | "

8 rows × 126 columns

\n", 407 | "
" 408 | ], 409 | "text/plain": [ 410 | " Unnamed: 0 id member_id loan_amnt funded_amnt \\\n", 411 | "count 407.000000 4.070000e+02 0.0 407.000000 407.000000 \n", 412 | "mean 12548.717445 1.158631e+08 NaN 14203.746929 14203.746929 \n", 413 | "std 125.354772 1.207642e+06 NaN 9351.142374 9351.142374 \n", 414 | "min 12325.000000 1.121538e+08 NaN 1000.000000 1000.000000 \n", 415 | "25% 12442.500000 1.150769e+08 NaN 7000.000000 7000.000000 \n", 416 | "50% 12550.000000 1.157004e+08 NaN 12000.000000 12000.000000 \n", 417 | "75% 12653.500000 1.168245e+08 NaN 20000.000000 20000.000000 \n", 418 | "max 12862.000000 1.181592e+08 NaN 40000.000000 40000.000000 \n", 419 | "\n", 420 | " funded_amnt_inv int_rate installment annual_inc desc ... \\\n", 421 | "count 407.000000 407.000000 407.000000 407.000000 0.0 ... \n", 422 | "mean 14202.948403 13.514054 418.020344 78818.956069 NaN ... \n", 423 | "std 9350.997874 5.446881 271.096531 55864.939403 NaN ... \n", 424 | "min 1000.000000 5.320000 34.220000 0.000000 NaN ... \n", 425 | "25% 7000.000000 9.930000 235.580000 43325.000000 NaN ... \n", 426 | "50% 12000.000000 12.620000 357.250000 63300.000000 NaN ... \n", 427 | "75% 20000.000000 16.020000 553.515000 95000.000000 NaN ... \n", 428 | "max 40000.000000 30.990000 1417.710000 495000.000000 NaN ... 
\n", 429 | "\n", 430 | " hardship_loan_status orig_projected_additional_accrued_interest \\\n", 431 | "count 0.0 0.0 \n", 432 | "mean NaN NaN \n", 433 | "std NaN NaN \n", 434 | "min NaN NaN \n", 435 | "25% NaN NaN \n", 436 | "50% NaN NaN \n", 437 | "75% NaN NaN \n", 438 | "max NaN NaN \n", 439 | "\n", 440 | " hardship_payoff_balance_amount hardship_last_payment_amount \\\n", 441 | "count 0.0 0.0 \n", 442 | "mean NaN NaN \n", 443 | "std NaN NaN \n", 444 | "min NaN NaN \n", 445 | "25% NaN NaN \n", 446 | "50% NaN NaN \n", 447 | "75% NaN NaN \n", 448 | "max NaN NaN \n", 449 | "\n", 450 | " debt_settlement_flag_date settlement_status settlement_date \\\n", 451 | "count 0.0 0.0 0.0 \n", 452 | "mean NaN NaN NaN \n", 453 | "std NaN NaN NaN \n", 454 | "min NaN NaN NaN \n", 455 | "25% NaN NaN NaN \n", 456 | "50% NaN NaN NaN \n", 457 | "75% NaN NaN NaN \n", 458 | "max NaN NaN NaN \n", 459 | "\n", 460 | " settlement_amount settlement_percentage settlement_term \n", 461 | "count 0.0 0.0 0.0 \n", 462 | "mean NaN NaN NaN \n", 463 | "std NaN NaN NaN \n", 464 | "min NaN NaN NaN \n", 465 | "25% NaN NaN NaN \n", 466 | "50% NaN NaN NaN \n", 467 | "75% NaN NaN NaN \n", 468 | "max NaN NaN NaN \n", 469 | "\n", 470 | "[8 rows x 126 columns]" 471 | ] 472 | }, 473 | "execution_count": 4, 474 | "metadata": {}, 475 | "output_type": "execute_result" 476 | } 477 | ], 478 | "source": [ 479 | "pdfs[0].describe()" 480 | ] 481 | }, 482 | { 483 | "cell_type": "markdown", 484 | "id": "834d6471-8490-48ea-bb52-6be31662dc97", 485 | "metadata": {}, 486 | "source": [ 487 | "# Configure whylogs\n", 488 | "\n", 489 | "`whylogs`, by default, does not send statistics to WhyLabs.\n", 490 | "\n", 491 | "There are a few small steps you need to set up. 
If you haven't got the access key, please onboard with WhyLabs.\n", 492 | "\n", 493 | "**WhyLabs only requires whylogs API - your raw data never leaves your premise.**" 494 | ] 495 | }, 496 | { 497 | "cell_type": "code", 498 | "execution_count": 5, 499 | "id": "1f8ee6e6-20cc-4967-b854-a5a8b9fc47e7", 500 | "metadata": {}, 501 | "outputs": [], 502 | "source": [ 503 | "from whylogs.app import Session\n", 504 | "from whylogs.app.writers import WhyLabsWriter\n", 505 | "import os\n", 506 | "import datetime" 507 | ] 508 | }, 509 | { 510 | "cell_type": "code", 511 | "execution_count": 6, 512 | "id": "31371bc6-4ec8-4518-84a0-4718b19d1506", 513 | "metadata": {}, 514 | "outputs": [ 515 | { 516 | "name": "stdout", 517 | "output_type": "stream", 518 | "text": [ 519 | "Enter your WhyLabs Org ID\n" 520 | ] 521 | }, 522 | { 523 | "name": "stdin", 524 | "output_type": "stream", 525 | "text": [ 526 | " org-5953\n" 527 | ] 528 | }, 529 | { 530 | "name": "stdout", 531 | "output_type": "stream", 532 | "text": [ 533 | "Enter your WhyLabs API key\n" 534 | ] 535 | }, 536 | { 537 | "name": "stdin", 538 | "output_type": "stream", 539 | "text": [ 540 | " ································································\n" 541 | ] 542 | }, 543 | { 544 | "name": "stdout", 545 | "output_type": "stream", 546 | "text": [ 547 | "Using API Key ID: naGzCisIJt\n" 548 | ] 549 | } 550 | ], 551 | "source": [ 552 | "import getpass\n", 553 | "\n", 554 | "# set your org-id here\n", 555 | "print(\"Enter your WhyLabs Org ID\")\n", 556 | "os.environ[\"WHYLABS_DEFAULT_ORG_ID\"] = input()\n", 557 | "# set your API key here\n", 558 | "print(\"Enter your WhyLabs API key\")\n", 559 | "os.environ[\"WHYLABS_API_KEY\"] = getpass.getpass()\n", 560 | "print(\"Using API Key ID: \", os.environ[\"WHYLABS_API_KEY\"][0:10])" 561 | ] 562 | }, 563 | { 564 | "cell_type": "markdown", 565 | "id": "38db51e3-39b6-4877-9318-c11726ff34d2", 566 | "metadata": {}, 567 | "source": [ 568 | "## Creating session\n", 569 | "\n", 570 | "Once 
the environments are set, let's create a whylogs session with a WhyLabs writer.\n", 571 | "\n", 572 | "Note that you can add your local writer or S3 writer if you want here. Check out the API docs for more information." 573 | ] 574 | }, 575 | { 576 | "cell_type": "code", 577 | "execution_count": 7, 578 | "id": "a4e01c22-e2f1-46be-97dc-2b080e680088", 579 | "metadata": {}, 580 | "outputs": [], 581 | "source": [ 582 | "# create WhyLabs session\n", 583 | "writer = WhyLabsWriter(\"\", formats=[])\n", 584 | "session = Session(project=\"demo-project\", pipeline=\"demo-pipeline\", writers=[writer])" 585 | ] 586 | }, 587 | { 588 | "cell_type": "markdown", 589 | "id": "8d7bbb9c-9b18-4e12-bab2-88a5176eeba7", 590 | "metadata": {}, 591 | "source": [ 592 | "## Logging to WhyLabs\n", 593 | "\n", 594 | "Ensure you have a **model ID** (also called **dataset ID**) before you start!\n", 595 | "\n", 596 | "### Dataset Timestamp\n", 597 | "* To avoid confusion, it's recommended that you use UTC\n", 598 | "* If you don't set `dataset_timestamp` parameter, it'll default to `UTC` now\n", 599 | "* WhyLabs supports real time visualization when the timestamp is **within the last 7 days**. Anything older than than will be picked up when we run our batch processing\n", 600 | "* **If you log two profiles for the same day with different timestamps (12:00 vs 12:01), they are merged to the same batch**\n", 601 | "\n", 602 | "### Logging Different Batches of Data\n", 603 | "* We'll give the profiles different **dates**\n", 604 | "* Create a new logger for each date. 
Note that the logger needs to be `closed` to flush out the data" 605 | ] 606 | }, 607 | { 608 | "cell_type": "code", 609 | "execution_count": 8, 610 | "id": "3a006a2b-8403-477f-a1f5-a37ea33b160c", 611 | "metadata": {}, 612 | "outputs": [ 613 | { 614 | "name": "stdout", 615 | "output_type": "stream", 616 | "text": [ 617 | "Enter your model ID from WhyLabs:\n" 618 | ] 619 | }, 620 | { 621 | "name": "stdin", 622 | "output_type": "stream", 623 | "text": [ 624 | " model-5\n" 625 | ] 626 | }, 627 | { 628 | "name": "stdout", 629 | "output_type": "stream", 630 | "text": [ 631 | "Log data frame for 2021-09-30 04:30:22.845881+00:00\n", 632 | "Log data frame for 2021-09-29 04:30:25.273786+00:00\n" 633 | ] 634 | }, 635 | { 636 | "name": "stderr", 637 | "output_type": "stream", 638 | "text": [ 639 | "Using API key ID: naGzCisIJt\n" 640 | ] 641 | }, 642 | { 643 | "name": "stdout", 644 | "output_type": "stream", 645 | "text": [ 646 | "Log data frame for 2021-09-28 04:30:27.638109+00:00\n", 647 | "Log data frame for 2021-09-27 04:30:29.872950+00:00\n", 648 | "Log data frame for 2021-09-26 04:30:32.003965+00:00\n", 649 | "Log data frame for 2021-09-25 04:30:33.789872+00:00\n", 650 | "Log data frame for 2021-09-24 04:30:36.016256+00:00\n" 651 | ] 652 | } 653 | ], 654 | "source": [ 655 | "print(\"Enter your model ID from WhyLabs:\")\n", 656 | "model_id = input()\n", 657 | "for i, df in enumerate(pdfs):\n", 658 | " # walking backwards. 
Each dataset has to map to a date to show up as a different batch\n", 659 | " # in WhyLabs\n", 660 | " dt = datetime.datetime.now(tz=datetime.timezone.utc) - datetime.timedelta(days=i)\n", 661 | " \n", 662 | " # Create new logger for date\n", 663 | " with session.logger(tags={\"datasetId\": model_id}, dataset_timestamp=dt) as ylog:\n", 664 | " print(\"Log data frame for \", dt)\n", 665 | " ylog.log_dataframe(df)" 666 | ] 667 | }, 668 | { 669 | "cell_type": "code", 670 | "execution_count": 9, 671 | "id": "4b0ae3f5-395b-4a24-bde3-2779176ab9ee", 672 | "metadata": {}, 673 | "outputs": [], 674 | "source": [ 675 | "# Ensure everything is flushed\n", 676 | "session.close()" 677 | ] 678 | }, 679 | { 680 | "cell_type": "markdown", 681 | "id": "b2c81d63-f420-4a36-8960-ef093b2f895f", 682 | "metadata": {}, 683 | "source": [ 684 | "## Voila\n", 685 | "\n", 686 | "* Now check the application to see if your **statistics** are in!!\n", 687 | "* Also, run the above cell again for the same model ID, do you see the statistics changes in WhyLabs? Especially the counters?" 
688 | ] 689 | } 690 | ], 691 | "metadata": { 692 | "kernelspec": { 693 | "display_name": "whylogs", 694 | "language": "python", 695 | "name": "whylogs" 696 | }, 697 | "language_info": { 698 | "codemirror_mode": { 699 | "name": "ipython", 700 | "version": 3 701 | }, 702 | "file_extension": ".py", 703 | "mimetype": "text/x-python", 704 | "name": "python", 705 | "nbconvert_exporter": "python", 706 | "pygments_lexer": "ipython3", 707 | "version": "3.8.11" 708 | } 709 | }, 710 | "nbformat": 4, 711 | "nbformat_minor": 5 712 | } 713 | -------------------------------------------------------------------------------- /python/WhyLabs_Platform_with_log_reference.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "013a0fd4-31f9-4f3f-bf3a-1efc9640422f", 6 | "metadata": {}, 7 | "source": [ 8 | "# Install basic requirements" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "id": "ad907ce3-0c3b-49e4-86f1-eae9de934f7b", 15 | "metadata": { 16 | "collapsed": true, 17 | "jupyter": { 18 | "outputs_hidden": true 19 | }, 20 | "tags": [] 21 | }, 22 | "outputs": [ 23 | { 24 | "name": "stdout", 25 | "output_type": "stream", 26 | "text": [ 27 | "Defaulting to user installation because normal site-packages is not writeable\n", 28 | "\u001b[33mWARNING: Ignoring invalid distribution -andas (/home/jamie/.local/lib/python3.8/site-packages)\u001b[0m\n", 29 | "\u001b[33mWARNING: Ignoring invalid distribution -andas (/home/jamie/.local/lib/python3.8/site-packages)\u001b[0m\n", 30 | "Requirement already satisfied: whylogs in /home/jamie/.local/lib/python3.8/site-packages (0.6.25.dev0)\n", 31 | "Requirement already satisfied: pandas in /home/jamie/.local/lib/python3.8/site-packages (1.4.0)\n", 32 | "Requirement already satisfied: puremagic<2.0,>=1.10 in /home/jamie/.local/lib/python3.8/site-packages (from whylogs) (1.11)\n", 33 | "Requirement already satisfied: 
jsonschema>=3.2.0 in /home/jamie/.local/lib/python3.8/site-packages (from whylogs) (4.4.0)\n", 34 | "Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.8/dist-packages/python_dateutil-2.8.2-py3.8.egg (from whylogs) (2.8.2)\n", 35 | "Requirement already satisfied: boto3>=1.14.1 in /home/jamie/.local/lib/python3.8/site-packages (from whylogs) (1.18.9)\n", 36 | "Requirement already satisfied: botocore>=1.17.44 in /home/jamie/.local/lib/python3.8/site-packages (from whylogs) (1.21.9)\n", 37 | "Requirement already satisfied: requests>=2.22.0 in /home/jamie/.local/lib/python3.8/site-packages (from whylogs) (2.27.1)\n", 38 | "Requirement already satisfied: whylabs-client<0.2.0,>=0.1.1.dev0 in /home/jamie/.local/lib/python3.8/site-packages (from whylogs) (0.1.1.dev0)\n", 39 | "Requirement already satisfied: numpy>=1.18.0 in /home/jamie/.local/lib/python3.8/site-packages (from whylogs) (1.22.0)\n", 40 | "Requirement already satisfied: scipy<2.0.0,>=1.5.4 in /home/jamie/.local/lib/python3.8/site-packages (from whylogs) (1.6.3)\n", 41 | "Requirement already satisfied: whylabs-datasketches>=2.2.0b1 in /home/jamie/.local/lib/python3.8/site-packages (from whylogs) (2.2.0b1)\n", 42 | "Requirement already satisfied: smart-open>=4.1.2 in /home/jamie/.local/lib/python3.8/site-packages (from whylogs) (4.1.2)\n", 43 | "Requirement already satisfied: matplotlib<4.0.0,>=3.0.3 in /home/jamie/.local/lib/python3.8/site-packages (from whylogs) (3.3.3)\n", 44 | "Requirement already satisfied: marshmallow>=3.7.1 in /home/jamie/.local/lib/python3.8/site-packages (from whylogs) (3.14.1)\n", 45 | "Requirement already satisfied: click>=7.1.2 in /home/jamie/.local/lib/python3.8/site-packages (from whylogs) (7.1.2)\n", 46 | "Requirement already satisfied: tqdm<5.0.0,>=4.60.0 in /home/jamie/.local/lib/python3.8/site-packages (from whylogs) (4.62.3)\n", 47 | "Requirement already satisfied: pyyaml>=5.3.1 in /home/jamie/.local/lib/python3.8/site-packages (from whylogs) 
(6.0)\n", 48 | "Requirement already satisfied: protobuf>=3.15.5 in /home/jamie/.local/lib/python3.8/site-packages (from whylogs) (3.19.3)\n", 49 | "Requirement already satisfied: pytz>=2020.1 in /home/jamie/.local/lib/python3.8/site-packages (from pandas) (2021.3)\n", 50 | "Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/jamie/.local/lib/python3.8/site-packages (from boto3>=1.14.1->whylogs) (0.5.0)\n", 51 | "Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/jamie/.local/lib/python3.8/site-packages (from boto3>=1.14.1->whylogs) (0.10.0)\n", 52 | "Requirement already satisfied: urllib3<1.27,>=1.25.4 in /usr/lib/python3/dist-packages (from botocore>=1.17.44->whylogs) (1.25.8)\n", 53 | "Requirement already satisfied: attrs>=17.4.0 in /home/jamie/.local/lib/python3.8/site-packages (from jsonschema>=3.2.0->whylogs) (21.2.0)\n", 54 | "Requirement already satisfied: importlib-resources>=1.4.0 in /home/jamie/.local/lib/python3.8/site-packages (from jsonschema>=3.2.0->whylogs) (5.4.0)\n", 55 | "Requirement already satisfied: pyrsistent!=0.17.0,!=0.17.1,!=0.17.2,>=0.14.0 in /usr/lib/python3/dist-packages (from jsonschema>=3.2.0->whylogs) (0.15.5)\n", 56 | "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.3 in /usr/lib/python3/dist-packages (from matplotlib<4.0.0,>=3.0.3->whylogs) (2.4.6)\n", 57 | "Requirement already satisfied: kiwisolver>=1.0.1 in /home/jamie/.local/lib/python3.8/site-packages (from matplotlib<4.0.0,>=3.0.3->whylogs) (1.3.1)\n", 58 | "Requirement already satisfied: cycler>=0.10 in /home/jamie/.local/lib/python3.8/site-packages (from matplotlib<4.0.0,>=3.0.3->whylogs) (0.10.0)\n", 59 | "Requirement already satisfied: pillow>=6.2.0 in /usr/lib/python3/dist-packages (from matplotlib<4.0.0,>=3.0.3->whylogs) (7.0.0)\n", 60 | "Requirement already satisfied: six>=1.5 in /usr/lib/python3/dist-packages (from python-dateutil>=2.8.1->whylogs) (1.14.0)\n", 61 | "Requirement already satisfied: charset-normalizer~=2.0.0 
in /home/jamie/.local/lib/python3.8/site-packages (from requests>=2.22.0->whylogs) (2.0.7)\n", 62 | "Requirement already satisfied: certifi>=2017.4.17 in /usr/lib/python3/dist-packages (from requests>=2.22.0->whylogs) (2019.11.28)\n", 63 | "Requirement already satisfied: idna<4,>=2.5 in /usr/lib/python3/dist-packages (from requests>=2.22.0->whylogs) (2.8)\n", 64 | "Requirement already satisfied: zipp>=3.1.0 in /home/jamie/.local/lib/python3.8/site-packages (from importlib-resources>=1.4.0->jsonschema>=3.2.0->whylogs) (3.7.0)\n", 65 | "\u001b[33mWARNING: Ignoring invalid distribution -andas (/home/jamie/.local/lib/python3.8/site-packages)\u001b[0m\n", 66 | "\u001b[33mWARNING: Ignoring invalid distribution -andas (/home/jamie/.local/lib/python3.8/site-packages)\u001b[0m\n", 67 | "\u001b[33mWARNING: Ignoring invalid distribution -andas (/home/jamie/.local/lib/python3.8/site-packages)\u001b[0m\n", 68 | "Note: you may need to restart the kernel to use updated packages.\n" 69 | ] 70 | } 71 | ], 72 | "source": [ 73 | "pip install -U whylogs pandas" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 2, 79 | "id": "8369d3a8-9bf2-4043-a45a-13838498f211", 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "import whylogs\n", 84 | "import pandas as pd" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "id": "a244145c-ea35-4ab6-b03e-cf5f864ed94c", 90 | "metadata": {}, 91 | "source": [ 92 | "# Load example data batches\n", 93 | "\n", 94 | "The example data is prepared from our public S3 bucket. You can use your own data if you want if you have multiple batches of data." 
95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 3, 100 | "id": "b78028ea-c7cb-494f-a303-071f1c345dfc", 101 | "metadata": {}, 102 | "outputs": [ 103 | { 104 | "name": "stdout", 105 | "output_type": "stream", 106 | "text": [ 107 | "Loading data from https://whylabs-public.s3.us-west-2.amazonaws.com/demo_batches/input_batch_1.csv\n", 108 | "Loading data from https://whylabs-public.s3.us-west-2.amazonaws.com/demo_batches/input_batch_2.csv\n", 109 | "Loading data from https://whylabs-public.s3.us-west-2.amazonaws.com/demo_batches/input_batch_3.csv\n", 110 | "Loading data from https://whylabs-public.s3.us-west-2.amazonaws.com/demo_batches/input_batch_4.csv\n", 111 | "Loading data from https://whylabs-public.s3.us-west-2.amazonaws.com/demo_batches/input_batch_5.csv\n", 112 | "Loading data from https://whylabs-public.s3.us-west-2.amazonaws.com/demo_batches/input_batch_6.csv\n", 113 | "Loading data from https://whylabs-public.s3.us-west-2.amazonaws.com/demo_batches/input_batch_7.csv\n" 114 | ] 115 | } 116 | ], 117 | "source": [ 118 | "pdfs = []\n", 119 | "for i in range(1, 8):\n", 120 | " path = f\"https://whylabs-public.s3.us-west-2.amazonaws.com/demo_batches/input_batch_{i}.csv\"\n", 121 | " print(f\"Loading data from {path}\")\n", 122 | " df = pd.read_csv(path)\n", 123 | " pdfs.append(df)" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 4, 129 | "id": "67b81ab4-a456-4d2d-9547-ad0d772e0aaa", 130 | "metadata": {}, 131 | "outputs": [ 132 | { 133 | "data": { 134 | "text/html": [ 135 | "
\n", 136 | "\n", 149 | "\n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " 
\n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | "
Unnamed: 0idmember_idloan_amntfunded_amntfunded_amnt_invint_rateinstallmentannual_incdesc...hardship_loan_statusorig_projected_additional_accrued_interesthardship_payoff_balance_amounthardship_last_payment_amountdebt_settlement_flag_datesettlement_statussettlement_datesettlement_amountsettlement_percentagesettlement_term
count407.0000004.070000e+020.0407.000000407.000000407.000000407.000000407.000000407.0000000.0...0.00.00.00.00.00.00.00.00.00.0
mean12548.7174451.158631e+08NaN14203.74692914203.74692914202.94840313.514054418.02034478818.956069NaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
std125.3547721.207642e+06NaN9351.1423749351.1423749350.9978745.446881271.09653155864.939403NaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
min12325.0000001.121538e+08NaN1000.0000001000.0000001000.0000005.32000034.2200000.000000NaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
25%12442.5000001.150769e+08NaN7000.0000007000.0000007000.0000009.930000235.58000043325.000000NaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
50%12550.0000001.157004e+08NaN12000.00000012000.00000012000.00000012.620000357.25000063300.000000NaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
75%12653.5000001.168245e+08NaN20000.00000020000.00000020000.00000016.020000553.51500095000.000000NaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
max12862.0000001.181592e+08NaN40000.00000040000.00000040000.00000030.9900001417.710000495000.000000NaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", 371 | "

8 rows × 126 columns

\n", 372 | "
" 373 | ], 374 | "text/plain": [ 375 | " Unnamed: 0 id member_id loan_amnt funded_amnt \\\n", 376 | "count 407.000000 4.070000e+02 0.0 407.000000 407.000000 \n", 377 | "mean 12548.717445 1.158631e+08 NaN 14203.746929 14203.746929 \n", 378 | "std 125.354772 1.207642e+06 NaN 9351.142374 9351.142374 \n", 379 | "min 12325.000000 1.121538e+08 NaN 1000.000000 1000.000000 \n", 380 | "25% 12442.500000 1.150769e+08 NaN 7000.000000 7000.000000 \n", 381 | "50% 12550.000000 1.157004e+08 NaN 12000.000000 12000.000000 \n", 382 | "75% 12653.500000 1.168245e+08 NaN 20000.000000 20000.000000 \n", 383 | "max 12862.000000 1.181592e+08 NaN 40000.000000 40000.000000 \n", 384 | "\n", 385 | " funded_amnt_inv int_rate installment annual_inc desc ... \\\n", 386 | "count 407.000000 407.000000 407.000000 407.000000 0.0 ... \n", 387 | "mean 14202.948403 13.514054 418.020344 78818.956069 NaN ... \n", 388 | "std 9350.997874 5.446881 271.096531 55864.939403 NaN ... \n", 389 | "min 1000.000000 5.320000 34.220000 0.000000 NaN ... \n", 390 | "25% 7000.000000 9.930000 235.580000 43325.000000 NaN ... \n", 391 | "50% 12000.000000 12.620000 357.250000 63300.000000 NaN ... \n", 392 | "75% 20000.000000 16.020000 553.515000 95000.000000 NaN ... \n", 393 | "max 40000.000000 30.990000 1417.710000 495000.000000 NaN ... 
\n", 394 | "\n", 395 | " hardship_loan_status orig_projected_additional_accrued_interest \\\n", 396 | "count 0.0 0.0 \n", 397 | "mean NaN NaN \n", 398 | "std NaN NaN \n", 399 | "min NaN NaN \n", 400 | "25% NaN NaN \n", 401 | "50% NaN NaN \n", 402 | "75% NaN NaN \n", 403 | "max NaN NaN \n", 404 | "\n", 405 | " hardship_payoff_balance_amount hardship_last_payment_amount \\\n", 406 | "count 0.0 0.0 \n", 407 | "mean NaN NaN \n", 408 | "std NaN NaN \n", 409 | "min NaN NaN \n", 410 | "25% NaN NaN \n", 411 | "50% NaN NaN \n", 412 | "75% NaN NaN \n", 413 | "max NaN NaN \n", 414 | "\n", 415 | " debt_settlement_flag_date settlement_status settlement_date \\\n", 416 | "count 0.0 0.0 0.0 \n", 417 | "mean NaN NaN NaN \n", 418 | "std NaN NaN NaN \n", 419 | "min NaN NaN NaN \n", 420 | "25% NaN NaN NaN \n", 421 | "50% NaN NaN NaN \n", 422 | "75% NaN NaN NaN \n", 423 | "max NaN NaN NaN \n", 424 | "\n", 425 | " settlement_amount settlement_percentage settlement_term \n", 426 | "count 0.0 0.0 0.0 \n", 427 | "mean NaN NaN NaN \n", 428 | "std NaN NaN NaN \n", 429 | "min NaN NaN NaN \n", 430 | "25% NaN NaN NaN \n", 431 | "50% NaN NaN NaN \n", 432 | "75% NaN NaN NaN \n", 433 | "max NaN NaN NaN \n", 434 | "\n", 435 | "[8 rows x 126 columns]" 436 | ] 437 | }, 438 | "execution_count": 4, 439 | "metadata": {}, 440 | "output_type": "execute_result" 441 | } 442 | ], 443 | "source": [ 444 | "pdfs[0].describe()" 445 | ] 446 | }, 447 | { 448 | "cell_type": "markdown", 449 | "id": "834d6471-8490-48ea-bb52-6be31662dc97", 450 | "metadata": {}, 451 | "source": [ 452 | "# Configure whylogs\n", 453 | "\n", 454 | "`whylogs`, by default, does not send statistics to WhyLabs.\n", 455 | "\n", 456 | "There are a few small steps you need to set up. 
If you haven't got the access key, please onboard with WhyLabs here: .\n", 457 | "\n", 458 | "**WhyLabs only requires whylogs API - your raw data never leaves your premise.**" 459 | ] 460 | }, 461 | { 462 | "cell_type": "code", 463 | "execution_count": 5, 464 | "id": "1f8ee6e6-20cc-4967-b854-a5a8b9fc47e7", 465 | "metadata": {}, 466 | "outputs": [], 467 | "source": [ 468 | "from whylogs.app import Session\n", 469 | "from whylogs.app.writers import WhyLabsWriter\n", 470 | "import os\n", 471 | "import datetime" 472 | ] 473 | }, 474 | { 475 | "cell_type": "code", 476 | "execution_count": null, 477 | "id": "31371bc6-4ec8-4518-84a0-4718b19d1506", 478 | "metadata": {}, 479 | "outputs": [], 480 | "source": [ 481 | "import getpass\n", 482 | "\n", 483 | "# set your org-id here\n", 484 | "print(\"Enter your WhyLabs Org ID\")\n", 485 | "os.environ[\"WHYLABS_DEFAULT_ORG_ID\"] = input()\n", 486 | "# set your API key here\n", 487 | "print(\"Enter your WhyLabs API key\")\n", 488 | "os.environ[\"WHYLABS_API_KEY\"] = getpass.getpass()\n", 489 | "print(\"Using API Key ID: \", os.environ[\"WHYLABS_API_KEY\"][0:10])" 490 | ] 491 | }, 492 | { 493 | "cell_type": "markdown", 494 | "id": "38db51e3-39b6-4877-9318-c11726ff34d2", 495 | "metadata": {}, 496 | "source": [ 497 | "## Creating session\n", 498 | "\n", 499 | "Once the environments are set, let's create a whylogs session with a WhyLabs writer.\n", 500 | "\n", 501 | "Note that you can add your local writer or S3 writer if you want here. Check out the API docs for more information." 
502 | ] 503 | }, 504 | { 505 | "cell_type": "code", 506 | "execution_count": 7, 507 | "id": "a4e01c22-e2f1-46be-97dc-2b080e680088", 508 | "metadata": {}, 509 | "outputs": [], 510 | "source": [ 511 | "# create WhyLabs session\n", 512 | "writer = WhyLabsWriter()\n", 513 | "session = Session(writers=[writer])" 514 | ] 515 | }, 516 | { 517 | "cell_type": "markdown", 518 | "id": "8d7bbb9c-9b18-4e12-bab2-88a5176eeba7", 519 | "metadata": {}, 520 | "source": [ 521 | "## Logging to WhyLabs\n", 522 | "\n", 523 | "Ensure you have a **model ID** (also called **dataset ID**) before you start!\n", 524 | "\n", 525 | "### Dataset Timestamp\n", 526 | "* To avoid confusion, it's recommended that you use UTC\n", 527 | "* If you don't set `dataset_timestamp` parameter, it'll default to `UTC` now\n", 528 | "* WhyLabs supports real time visualization when the timestamp is **within the last 7 days**. Anything older than that will be picked up when we run our batch processing\n", 529 | "* **If you log two profiles for the same day with different timestamps (12:00 vs 12:01), they are merged to the same batch**\n", 530 | "\n", 531 | "### Logging Different Batches of Data\n", 532 | "* We'll give the profiles different **dates**\n", 533 | "* Create a new logger for each date. Note that the logger needs to be `closed` to flush out the data" 534 | ] 535 | }, 536 | { 537 | "cell_type": "code", 538 | "execution_count": null, 539 | "id": "3a006a2b-8403-477f-a1f5-a37ea33b160c", 540 | "metadata": {}, 541 | "outputs": [], 542 | "source": [ 543 | "print(\"Enter your model ID from WhyLabs:\")\n", 544 | "model_id = input()\n", 545 | "reference_profile = None\n", 546 | "for i, df in enumerate(pdfs):\n", 547 | " # walking backwards. 
Each dataset has to map to a date to show up as a different batch\n", 548 | " # in WhyLabs\n", 549 | " dt = datetime.datetime.now(tz=datetime.timezone.utc) - datetime.timedelta(days=i)\n", 550 | " \n", 551 | " # Create new logger for date\n", 552 | " with session.logger(tags={\"datasetId\": model_id}, dataset_timestamp=dt) as ylog:\n", 553 | " print(\"Log data frame for \", dt)\n", 554 | " ylog.log_dataframe(df)\n", 555 | " # we will keep a reference to the first profile for us as a baseline for monitoring\n", 556 | " if (i==0):\n", 557 | " reference_profile = ylog.profile" 558 | ] 559 | }, 560 | { 561 | "cell_type": "markdown", 562 | "id": "bf53e9ca", 563 | "metadata": {}, 564 | "source": [] 565 | }, 566 | { 567 | "cell_type": "code", 568 | "execution_count": 9, 569 | "id": "4b0ae3f5-395b-4a24-bde3-2779176ab9ee", 570 | "metadata": {}, 571 | "outputs": [], 572 | "source": [ 573 | "# Ensure everything is flushed\n", 574 | "session.close()" 575 | ] 576 | }, 577 | { 578 | "cell_type": "markdown", 579 | "id": "3c3cfcf1", 580 | "metadata": {}, 581 | "source": [ 582 | "We still have a reference to the first profile, for this demo we will use this dataframe's profile and upload it as a reference profile for monitoring on Whylabs" 583 | ] 584 | }, 585 | { 586 | "cell_type": "code", 587 | "execution_count": null, 588 | "id": "1fa34a96", 589 | "metadata": {}, 590 | "outputs": [], 591 | "source": [ 592 | "# You can rename the reference profile alias, this will show up when choosing a baseline on the monitoring settings page of Whylabs\n", 593 | "reference_profile_alias = \"demo-reference-profile\"\n", 594 | "reference_profile.to_summary()" 595 | ] 596 | }, 597 | { 598 | "cell_type": "markdown", 599 | "id": "7ad55e9e", 600 | "metadata": {}, 601 | "source": [ 602 | "The reference profile can be uploaded using a whylabs_client directly. First, we need to reference the profile as a file on disk, so write it out." 
603 | ] 604 | }, 605 | { 606 | "cell_type": "code", 607 | "execution_count": null, 608 | "id": "fe653d51", 609 | "metadata": {}, 610 | "outputs": [], 611 | "source": [ 612 | "import tempfile\n", 613 | "\n", 614 | "# write out the profile we just \n", 615 | "tmp_dir = tempfile.mkdtemp()\n", 616 | "profile_path = os.path.join(tmp_dir, \"reference-profile.bin\")\n", 617 | "reference_profile.write_protobuf(profile_path)\n", 618 | "print(f\"Reference profile written to temporary file in preparation to upload to Whylabs as a reference profile: {profile_path}\")" 619 | ] 620 | }, 621 | { 622 | "cell_type": "markdown", 623 | "id": "b682878f", 624 | "metadata": {}, 625 | "source": [ 626 | "The whylabs_client will construct a request to upload this as a reference profile, using the org-id, model-id and api-key entered above." 627 | ] 628 | }, 629 | { 630 | "cell_type": "code", 631 | "execution_count": null, 632 | "id": "156c1bc5", 633 | "metadata": {}, 634 | "outputs": [], 635 | "source": [ 636 | "import requests\n", 637 | "import whylabs_client\n", 638 | "from whylabs_client.api.log_api import LogApi\n", 639 | "from whylabs_client.model.log_reference_request import LogReferenceRequest\n", 640 | "\n", 641 | "# Now setup some of the inputs required to make the request to upload to Whylabs using the whylabs_client\n", 642 | "whylabs_api_endpoint = \"https://api.whylabsapp.com\"\n", 643 | "api_key = os.environ[\"WHYLABS_API_KEY\"]\n", 644 | "print(f\"Using API key ID: {api_key[:10]} and endpoint {whylabs_api_endpoint}\")\n", 645 | "config = whylabs_client.Configuration(host=whylabs_api_endpoint, api_key={\"ApiKeyAuth\": api_key}, discard_unknown_keys=True)\n", 646 | "api_log_client = whylabs_client.ApiClient(config)\n", 647 | "log_api = LogApi(api_log_client)\n", 648 | "\n", 649 | "org_id = reference_profile.tags.get(\"orgId\", os.environ.get(\"WHYLABS_DEFAULT_ORG_ID\"))\n", 650 | "dataset_id = reference_profile.tags.get(\"datasetId\", 
os.environ.get(\"WHYLABS_DEFAULT_DATASET_ID\"))\n", 651 | "dataset_timestamp = int(reference_profile.dataset_timestamp.timestamp() * 1000)\n", 652 | "alias = reference_profile_alias\n", 653 | "\n", 654 | "try:\n", 655 | " with open(profile_path, \"rb\") as f:\n", 656 | " request = LogReferenceRequest(dataset_timestamp=dataset_timestamp, alias=alias)\n", 657 | " print(f\"Making initial call to log_reference to get upload url for {alias} and in [{org_id}] for [{dataset_id}] using request: {request}\")\n", 658 | " async_result = log_api.log_reference(org_id=org_id, model_id=dataset_id, log_reference_request=request, async_req=True)\n", 659 | " result = async_result.get()\n", 660 | " upload_url = result[\"upload_url\"]\n", 661 | " print(f\"got async_result from log_reference, upload url is: {upload_url[:140]}\")\n", 662 | " print(f\"About to upload reference profile...\")\n", 663 | " http_response = requests.put(upload_url, data=f.read())\n", 664 | " if http_response.ok:\n", 665 | " print(f\"Done uploading reference profile with alias: {alias} to: {upload_url[:140]} with API token ID: {api_key[:10]}\")\n", 666 | " else:\n", 667 | " print(\n", 668 | " f\"Failed to upload reference profile with alias: {alias} to: {upload_url[:140]} with API token ID: {api_key[:10]} to \"\n", 669 | " + f\"{whylabs_api_endpoint}: unexpected HTTP status {http_response}\"\n", 670 | " )\n", 671 | "except Exception as e:\n", 672 | " print(f\"Failed to upload reference profile: {e}.\")" 673 | ] 674 | }, 675 | { 676 | "cell_type": "markdown", 677 | "id": "b2c81d63-f420-4a36-8960-ef093b2f895f", 678 | "metadata": {}, 679 | "source": [ 680 | "## Voila\n", 681 | "\n", 682 | "* Now check the application to see if your **statistics** are in!!\n", 683 | "* Check the monitoring settings page, if you change the toggle from Baseline from trailing window to \"Compare to reference profile\", you can select the reference profile we just uploaded, which should show up with the text from our 'alias'\n" 684 | ] 
685 | }, 686 | { 687 | "cell_type": "code", 688 | "execution_count": null, 689 | "id": "2e14cfc8", 690 | "metadata": {}, 691 | "outputs": [], 692 | "source": [ 693 | "from IPython.display import display, Markdown\n", 694 | "url = f\"https://hub.whylabsapp.com/models/{dataset_id}/monitor-settings\"\n", 695 | "content = Markdown(f\"url here: {url}\")\n", 696 | "display(content)" 697 | ] 698 | } 699 | ], 700 | "metadata": { 701 | "interpreter": { 702 | "hash": "892921edc4b5a5c5d6099af55101733d18e39dc83f9447864be65f12184b4035" 703 | }, 704 | "kernelspec": { 705 | "display_name": "whylogs", 706 | "language": "python", 707 | "name": "python3" 708 | }, 709 | "language_info": { 710 | "codemirror_mode": { 711 | "name": "ipython", 712 | "version": 3 713 | }, 714 | "file_extension": ".py", 715 | "mimetype": "text/x-python", 716 | "name": "python", 717 | "nbconvert_exporter": "python", 718 | "pygments_lexer": "ipython3", 719 | "version": "3.8.10" 720 | } 721 | }, 722 | "nbformat": 4, 723 | "nbformat_minor": 5 724 | } 725 | -------------------------------------------------------------------------------- /python/flower2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/whylabs/whylogs-examples/167be1e91f335ca2b77a09aa6ef99090099bfcae/python/flower2.jpg -------------------------------------------------------------------------------- /python/logging_example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%matplotlib inline\n", 10 | "\n", 11 | "import pandas as pd\n", 12 | "import numpy as np\n", 13 | "import matplotlib.pyplot as plt" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": {}, 20 | "outputs": [ 21 | { 22 | "name": "stdout", 23 | "output_type": "stream", 24 | "text": [ 25 | "2020-09-22 
16:47:19,276 - whylogs.logs - DEBUG - whylogs.logs logging -> stdout at level DEBUG\n" 26 | ] 27 | } 28 | ], 29 | "source": [ 30 | "# Just a simple convenience function to send the internal python\n", 31 | "# logs to stdout. Definitely not required\n", 32 | "from whylogs.logs import display_logging\n", 33 | "display_logging('debug')" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": {}, 39 | "source": [ 40 | "## Load data" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 3, 46 | "metadata": {}, 47 | "outputs": [ 48 | { 49 | "data": { 50 | "text/html": [ 51 | "
\n", 52 | "\n", 65 | "\n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | "
idmember_idloan_amntfunded_amntfunded_amnt_invtermint_rateinstallmentgradesub_grade...hardship_payoff_balance_amounthardship_last_payment_amountdisbursement_methoddebt_settlement_flagdebt_settlement_flag_datesettlement_statussettlement_datesettlement_amountsettlement_percentagesettlement_term
090671227NaN4800.04800.04800.036 months13.49162.87CC2...NaNNaNCashNNaNNaNNaNNaNNaNNaN
190060135NaN21600.021600.021600.060 months9.49453.54BB2...NaNNaNCashNNaNNaNNaNNaNNaNNaN
290501423NaN24200.024200.024200.036 months9.49775.09BB2...NaNNaNCashNNaNNaNNaNNaNNaNNaN
390186302NaN3600.03600.03600.036 months11.49118.70BB5...NaNNaNCashNNaNNaNNaNNaNNaNNaN
490805192NaN8000.08000.08000.036 months10.49259.99BB3...NaNNaNCashNNaNNaNNaNNaNNaNNaN
\n", 215 | "

5 rows × 151 columns

\n", 216 | "
" 217 | ], 218 | "text/plain": [ 219 | " id member_id loan_amnt funded_amnt funded_amnt_inv term \\\n", 220 | "0 90671227 NaN 4800.0 4800.0 4800.0 36 months \n", 221 | "1 90060135 NaN 21600.0 21600.0 21600.0 60 months \n", 222 | "2 90501423 NaN 24200.0 24200.0 24200.0 36 months \n", 223 | "3 90186302 NaN 3600.0 3600.0 3600.0 36 months \n", 224 | "4 90805192 NaN 8000.0 8000.0 8000.0 36 months \n", 225 | "\n", 226 | " int_rate installment grade sub_grade ... hardship_payoff_balance_amount \\\n", 227 | "0 13.49 162.87 C C2 ... NaN \n", 228 | "1 9.49 453.54 B B2 ... NaN \n", 229 | "2 9.49 775.09 B B2 ... NaN \n", 230 | "3 11.49 118.70 B B5 ... NaN \n", 231 | "4 10.49 259.99 B B3 ... NaN \n", 232 | "\n", 233 | " hardship_last_payment_amount disbursement_method debt_settlement_flag \\\n", 234 | "0 NaN Cash N \n", 235 | "1 NaN Cash N \n", 236 | "2 NaN Cash N \n", 237 | "3 NaN Cash N \n", 238 | "4 NaN Cash N \n", 239 | "\n", 240 | " debt_settlement_flag_date settlement_status settlement_date \\\n", 241 | "0 NaN NaN NaN \n", 242 | "1 NaN NaN NaN \n", 243 | "2 NaN NaN NaN \n", 244 | "3 NaN NaN NaN \n", 245 | "4 NaN NaN NaN \n", 246 | "\n", 247 | " settlement_amount settlement_percentage settlement_term \n", 248 | "0 NaN NaN NaN \n", 249 | "1 NaN NaN NaN \n", 250 | "2 NaN NaN NaN \n", 251 | "3 NaN NaN NaN \n", 252 | "4 NaN NaN NaN \n", 253 | "\n", 254 | "[5 rows x 151 columns]" 255 | ] 256 | }, 257 | "execution_count": 3, 258 | "metadata": {}, 259 | "output_type": "execute_result" 260 | } 261 | ], 262 | "source": [ 263 | "# Load some data\n", 264 | "df = pd.read_csv('lending_club_1000.csv')\n", 265 | "# Split into a test & training set\n", 266 | "df_training = df.sample(int(len(df) * 0.8), replace=False, random_state=123)\n", 267 | "df_test = df.drop(df_training.index)\n", 268 | "df.head()" 269 | ] 270 | }, 271 | { 272 | "cell_type": "markdown", 273 | "metadata": {}, 274 | "source": [ 275 | "## Log dataset sketches" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | 
"execution_count": 4, 281 | "metadata": {}, 282 | "outputs": [ 283 | { 284 | "name": "stdout", 285 | "output_type": "stream", 286 | "text": [ 287 | "2020-09-22 16:47:19,339 - whylogs.app.config - DEBUG - Attempting to load config file: None\n", 288 | "2020-09-22 16:47:19,340 - whylogs.app.config - DEBUG - Attempting to load config file: .whylogs.yaml\n" 289 | ] 290 | } 291 | ], 292 | "source": [ 293 | "from whylogs import get_or_create_session\n", 294 | "\n", 295 | "session = get_or_create_session()" 296 | ] 297 | }, 298 | { 299 | "cell_type": "markdown", 300 | "metadata": {}, 301 | "source": [ 302 | "#### Log dataframe" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": 5, 308 | "metadata": {}, 309 | "outputs": [ 310 | { 311 | "data": { 312 | "text/plain": [ 313 | "" 314 | ] 315 | }, 316 | "execution_count": 5, 317 | "metadata": {}, 318 | "output_type": "execute_result" 319 | } 320 | ], 321 | "source": [ 322 | "session.log_dataframe(df_training, 'training.data')\n", 323 | "# Then you could do whatever training or calculations you'd like" 324 | ] 325 | }, 326 | { 327 | "cell_type": "markdown", 328 | "metadata": {}, 329 | "source": [ 330 | "### Inspect profiles/statistics" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": 6, 336 | "metadata": {}, 337 | "outputs": [ 338 | { 339 | "data": { 340 | "text/html": [ 341 | "
\n", 342 | "\n", 355 | "\n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " 
\n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | "
columncountnull_countbool_countnumeric_countmaxmeanminstddevnunique_numbers...ununique_str_upperquantile_0.0000quantile_0.0100quantile_0.0500quantile_0.2500quantile_0.5000quantile_0.7500quantile_0.9500quantile_0.9900quantile_1.0000
0num_il_tl200.00.00.0199.043.009.8341710.008.29065734.0...0.00.000.0000001.0000004.0000007.00000014.00000028.00000042.00000043.000000
1open_acc_6m200.00.00.0199.08.001.3567840.001.4207498.0...0.00.000.0000000.0000000.0000001.0000002.0000004.0000007.0000008.000000
2avg_cur_bal200.00.00.0199.072812.0013079.467337244.0014001.002777199.0...0.0244.00425.0000001252.0000003039.0000008200.00000017591.00000043647.00000068086.00000072812.000000
3dti_joint200.00.00.04.020.6514.89250012.353.8639224.0...0.012.3512.35000012.35000013.22000013.35000020.65000020.65000020.65000020.650000
4num_accts_ever_120_pd200.00.00.0199.07.000.5427140.001.2296578.0...0.00.000.0000000.0000000.0000000.0000001.0000003.0000007.0000007.000000
..................................................................
146sec_app_collections_12_mths_ex_med200.00.00.00.00.000.0000000.000.0000000.0...0.0NaNNaNNaNNaNNaNNaNNaNNaNNaN
147emp_length200.00.00.00.00.000.0000000.000.0000000.0...11.0NaNNaNNaNNaNNaNNaNNaNNaNNaN
148last_pymnt_amnt200.00.00.0199.035304.765068.3704520.007696.468449194.0...0.00.007.980000118.699997334.100006771.2299807585.50976622287.58007832954.30859435304.761719
149total_pymnt_inv200.00.00.0199.052583.9715089.0573370.0010349.878426198.0...0.00.00828.9000242734.1599127149.43017612359.34960920929.97070335261.21875051942.23046952583.968750
150debt_settlement_flag200.00.00.00.00.000.0000000.000.0000000.0...2.0NaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", 649 | "

151 rows × 32 columns

\n", 650 | "
" 651 | ], 652 | "text/plain": [ 653 | " column count null_count bool_count \\\n", 654 | "0 num_il_tl 200.0 0.0 0.0 \n", 655 | "1 open_acc_6m 200.0 0.0 0.0 \n", 656 | "2 avg_cur_bal 200.0 0.0 0.0 \n", 657 | "3 dti_joint 200.0 0.0 0.0 \n", 658 | "4 num_accts_ever_120_pd 200.0 0.0 0.0 \n", 659 | ".. ... ... ... ... \n", 660 | "146 sec_app_collections_12_mths_ex_med 200.0 0.0 0.0 \n", 661 | "147 emp_length 200.0 0.0 0.0 \n", 662 | "148 last_pymnt_amnt 200.0 0.0 0.0 \n", 663 | "149 total_pymnt_inv 200.0 0.0 0.0 \n", 664 | "150 debt_settlement_flag 200.0 0.0 0.0 \n", 665 | "\n", 666 | " numeric_count max mean min stddev \\\n", 667 | "0 199.0 43.00 9.834171 0.00 8.290657 \n", 668 | "1 199.0 8.00 1.356784 0.00 1.420749 \n", 669 | "2 199.0 72812.00 13079.467337 244.00 14001.002777 \n", 670 | "3 4.0 20.65 14.892500 12.35 3.863922 \n", 671 | "4 199.0 7.00 0.542714 0.00 1.229657 \n", 672 | ".. ... ... ... ... ... \n", 673 | "146 0.0 0.00 0.000000 0.00 0.000000 \n", 674 | "147 0.0 0.00 0.000000 0.00 0.000000 \n", 675 | "148 199.0 35304.76 5068.370452 0.00 7696.468449 \n", 676 | "149 199.0 52583.97 15089.057337 0.00 10349.878426 \n", 677 | "150 0.0 0.00 0.000000 0.00 0.000000 \n", 678 | "\n", 679 | " nunique_numbers ... ununique_str_upper quantile_0.0000 \\\n", 680 | "0 34.0 ... 0.0 0.00 \n", 681 | "1 8.0 ... 0.0 0.00 \n", 682 | "2 199.0 ... 0.0 244.00 \n", 683 | "3 4.0 ... 0.0 12.35 \n", 684 | "4 8.0 ... 0.0 0.00 \n", 685 | ".. ... ... ... ... \n", 686 | "146 0.0 ... 0.0 NaN \n", 687 | "147 0.0 ... 11.0 NaN \n", 688 | "148 194.0 ... 0.0 0.00 \n", 689 | "149 198.0 ... 0.0 0.00 \n", 690 | "150 0.0 ... 2.0 NaN \n", 691 | "\n", 692 | " quantile_0.0100 quantile_0.0500 quantile_0.2500 quantile_0.5000 \\\n", 693 | "0 0.000000 1.000000 4.000000 7.000000 \n", 694 | "1 0.000000 0.000000 0.000000 1.000000 \n", 695 | "2 425.000000 1252.000000 3039.000000 8200.000000 \n", 696 | "3 12.350000 12.350000 13.220000 13.350000 \n", 697 | "4 0.000000 0.000000 0.000000 0.000000 \n", 698 | ".. ... 
... ... ... \n", 699 | "146 NaN NaN NaN NaN \n", 700 | "147 NaN NaN NaN NaN \n", 701 | "148 7.980000 118.699997 334.100006 771.229980 \n", 702 | "149 828.900024 2734.159912 7149.430176 12359.349609 \n", 703 | "150 NaN NaN NaN NaN \n", 704 | "\n", 705 | " quantile_0.7500 quantile_0.9500 quantile_0.9900 quantile_1.0000 \n", 706 | "0 14.000000 28.000000 42.000000 43.000000 \n", 707 | "1 2.000000 4.000000 7.000000 8.000000 \n", 708 | "2 17591.000000 43647.000000 68086.000000 72812.000000 \n", 709 | "3 20.650000 20.650000 20.650000 20.650000 \n", 710 | "4 1.000000 3.000000 7.000000 7.000000 \n", 711 | ".. ... ... ... ... \n", 712 | "146 NaN NaN NaN NaN \n", 713 | "147 NaN NaN NaN NaN \n", 714 | "148 7585.509766 22287.580078 32954.308594 35304.761719 \n", 715 | "149 20929.970703 35261.218750 51942.230469 52583.968750 \n", 716 | "150 NaN NaN NaN NaN \n", 717 | "\n", 718 | "[151 rows x 32 columns]" 719 | ] 720 | }, 721 | "execution_count": 6, 722 | "metadata": {}, 723 | "output_type": "execute_result" 724 | } 725 | ], 726 | "source": [ 727 | "# You can also capture the logger response and interact with the generated\n", 728 | "# profiles\n", 729 | "\n", 730 | "# Log the test data\n", 731 | "prof = session.log_dataframe(df_test, 'test.data')\n", 732 | "summary = prof.flat_summary()\n", 733 | "stats_df = summary['summary']\n", 734 | "stats_df" 735 | ] 736 | }, 737 | { 738 | "cell_type": "code", 739 | "execution_count": 7, 740 | "metadata": {}, 741 | "outputs": [ 742 | { 743 | "data": { 744 | "text/plain": [ 745 | "" 746 | ] 747 | }, 748 | "execution_count": 7, 749 | "metadata": {}, 750 | "output_type": "execute_result" 751 | }, 752 | { 753 | "data": { 754 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAXAAAAD7CAYAAABzGc+QAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/d3fzzAAAACXBIWXMAAAsTAAALEwEAmpwYAAANpklEQVR4nO3dfaxkdX3H8fenrJoULQ/dy3Zj0UsIkvBPV3tLabCtSmt5MIJJYyQN0lSzxpRGjGmzatL65/qcNGls1kLdphTjA1YSaCsSW2NSsBeKPEoAu0RwWS61LbRNNMC3f8zZMl7u3Tt778ze/e6+X8lkzvmdM3e+38zks2fOnN9OqgpJUj8/tdkFSJLWxwCXpKYMcElqygCXpKYMcElqygCXpKbWDPAkpyf5RpL7k9yX5H3D+EeSPJ7kruF28ezLlSQdlLWuA0+yHdheVXcmeQVwB3AZ8Hbgv6vqEzOvUpL0IlvW2qGq9gP7h+VnkjwAvHI9T7Z169aan59fz0Ml6bh1xx13PFVVc8vH1wzwcUnmgdcCtwPnA1cleSewCHygqv7jUI+fn59ncXHxcJ5Sko57SR5daXziLzGTvBz4MnB1VT0NfAY4E9jB6Aj9k6s8bmeSxSSLS0tLh1u3JGkVEwV4kpcwCu/rquoGgKo6UFXPVdXzwGeBc1d6bFXtqaqFqlqYm3vRJwBJ0jpNchVKgGuAB6rqU2Pj28d2extw7/TLkyStZpJz4OcDVwD3JLlrGPsQcHmSHUAB+4D3zKA+SdIqJrkK5VtAVth08/TLkSRNypmYktSUAS5JTRngktSUAS5JTR3WTMzj1fyumybab9/uS2ZciSS9wCNwSWrKAJekpgxwSWrKAJekpgxwSWrKAJekpgxwSWqqzXXgXostST/JI3BJasoAl6SmDHBJasoAl6SmDHBJasoAl6SmDHBJasoAl6SmDHBJasoAl6SmDHBJasoAl6SmDHBJasoAl6SmDHBJasoAl6SmDHBJasoAl6SmDHBJasoAl6SmDHBJamrNAE9yepJvJLk/yX1J3jeMn5rkliQPDfenzL5cSdJBkxyBPwt8oKrOAc4Dfj/JOcAu4NaqOgu4dViXJB0hawZ4Ve2vqjuH5WeAB4BXApcCe4fd9gKXzahGSdIKDusceJJ54LXA7cC2qto/bHoC2Dbd0iRJhzJxgCd5OfBl4Oqqenp8W1UVUKs8bmeSxSSLS0tLGypWkvSCiQI8yUsYhfd1VXXDMHwgyfZh+3bgyZUeW1V7qmqhqhbm5uamUbMkicmuQglwDfBAVX1qbNONwJXD8pXAV6dfniRpNVsm2Od84ArgniR3DWMfAnYDX0jyLuBR4O0zqVCStKI1A7yqvgVklc0XTLccSdKknIkpSU0Z4JLUlAEuSU0Z4JLU1CRXobQyv+umzS5Bko4Ij8AlqSkDXJKaMsAlqSkDXJKaMsAlqSkDXJKaMsAlqSkDXJKaMsAlqSkDXJKaMsAlqSkDXJKaMsAlqSkDXJKaMsAlqSkDXJKaOuZ+0KGDSX90Yt/uS2ZciaTOPAKXpKYMcElqygCXpKYMcElqygCXpKYMcElqygCXpKYMcElqyok8xwAnBknHJ4/AJakpA1ySmjLAJampNQM8ybVJnkxy79jYR5I8nuSu4XbxbMuUJC03yRH454ALVxj/dFXtGG43T7csSdJa1gzwqvom8MMjUIsk6TBs5Bz4VUnuHk6xnDK1iiRJE1lvgH8GOBPYAewHPrnajkl2JllMsri0tLTOp5MkLbeuAK+qA1X1XFU9D3wWOPcQ++6pqoWqWpibm1tvnZKkZdYV4Em2j62+Dbh3tX0lSbOx5lT6JNcDbwC2JnkM+BPgDUl2AAXsA94zuxIlSStZM8Cr6vIVhq+ZQS2SpMPgTExJasoAl6SmDHBJasoAl6SmDHBJasoAl6SmDHBJasoAl6SmDHBJasoAl6SmDHBJasoAl6SmDHBJasoAl6SmDHBJasoAl6SmDHBJaso
Al6SmDHBJasoAl6SmDHBJasoAl6SmDHBJasoAl6SmDHBJasoAl6SmDHBJasoAl6SmDHBJasoAl6Smtmx2AVrd/K6bNruENU1a477dl8y4Eun44xG4JDVlgEtSUwa4JDVlgEtSU2sGeJJrkzyZ5N6xsVOT3JLkoeH+lNmWKUlabpIj8M8BFy4b2wXcWlVnAbcO65KkI2jNAK+qbwI/XDZ8KbB3WN4LXDbdsiRJa1nvOfBtVbV/WH4C2DaleiRJE9rwl5hVVUCttj3JziSLSRaXlpY2+nSSpMF6A/xAku0Aw/2Tq+1YVXuqaqGqFubm5tb5dJKk5dYb4DcCVw7LVwJfnU45kqRJTXIZ4fXAPwNnJ3ksybuA3cBvJnkI+I1hXZJ0BK35n1lV1eWrbLpgyrVIkg6DMzElqSkDXJKaMsAlqSl/0EEv4g9JSD14BC5JTRngktSUAS5JTRngktSUAS5JTRngktSUAS5JTRngktSUAS5JTRngktSUAS5JTRngktSUAS5JTRngktSUAS5JTRngktSUAS5JTRngktSUAS5JTRngktSUAS5JTRngktSUAS5JTRngktSUAS5JTW3Z7AJ0fJjfddNE++3bfckx8bzSkeARuCQ1ZYBLUlMGuCQ1ZYBLUlMb+hIzyT7gGeA54NmqWphGUZKktU3jKpQ3VtVTU/g7kqTD4CkUSWpqowFewNeS3JFk5zQKkiRNZqOnUF5fVY8nOQ24Jcl3q+qb4zsMwb4T4FWvetUGn06ajUkn/ByOzZoc5OSl48eGjsCr6vHh/kngK8C5K+yzp6oWqmphbm5uI08nSRqz7gBPcmKSVxxcBt4M3DutwiRJh7aRUyjbgK8kOfh3/qaq/n4qVUmS1rTuAK+q7wG/MMVaJEmHwcsIJakpA1ySmjLAJakpf9BBamIW16qrN4/AJakpA1ySmjLAJakpA1ySmjLAJakpA1ySmjLAJakpA1ySmnIij7TJnKCj9fIIXJKaMsAlqSkDXJKaMsAlqSkDXJKaMsAlqSkDXJKaMsAlqSkn8khqb9LJUPt2XzLjSo4sj8AlqSkDXJKaMsAlqSkDXJKaMsAlqSkDXJKaMsAlqSmvA5c0NdP+cYppX7e9mT+eMYtr0D0Cl6SmDHBJasoAl6SmDHBJampDAZ7kwiQPJnk4ya5pFSVJWtu6AzzJCcCfARcB5wCXJzlnWoVJkg5tI0fg5wIPV9X3qurHwOeBS6dTliRpLRsJ8FcC3x9bf2wYkyQdAamq9T0w+W3gwqp697B+BfDLVXXVsv12AjuH1bOBB5f9qa3AU+sq4uh1rPV0rPUD9tSFPY28uqrmlg9uZCbm48DpY+s/P4z9hKraA+xZ7Y8kWayqhQ3UcdQ51no61voBe+rCng5tI6dQ/gU4K8kZSV4KvAO4cRpFSZLWtu4j8Kp6NslVwD8AJwDXVtV9U6tMknRIG/rPrKrqZuDmDdaw6umVxo61no61fsCeurCnQ1j3l5iSpM3lVHpJamrmAZ7k5CRfSvLdJA8k+ZVh/A+GsfuSfGxs/w8OU/MfTPJbs65vPVbqKcmOJLcluSvJYpJzh32T5E+Hnu5O8rrNrn+5JGcPdR+8PZ3k6iSnJrklyUPD/SnD/p17+vjwut2d5CtJTh57zFH93lutp7HtH0hSSbYO621fp2Fby4w4xHtv+hlRVTO9AXuBdw/LLwVOBt4IfB142TB+2nB/DvAd4GXAGcAjwAmzrnFKPX0NuGgYuxj4x7HlvwMCnAfcvtn1r9HbCcATwKuBjwG7hvFdwEePgZ7eDGwZxj861lOL995KPQ3rpzO6oOBRYOsx8Dq1zohVepp6Rsz0CDzJScCvAdcAVNWPq+o/gfcCu6vqR8P4k8NDLgU+X1U/qqp/Ax5mNGX/qHGIngr4mWG3k4AfDMuXAn9VI7cBJyfZfmSrPiwXAI9U1aOMat87jO8FLhuW2/ZUVV+rqmeH8dsYzV+ABu+9ZcZfJ4BPA3/E6H14UNvXicY
Zscx4T1PPiFmfQjkDWAL+Msm/JvmLJCcCrwF+NcntSf4pyS8N+3eYnr9aT1cDH0/yfeATwAeH/Tv0NO4dwPXD8raq2j8sPwFsG5Y79zTu9xgd+UDjnpJcCjxeVd9Ztk/bnuidEePGe7qaKWfErAN8C/A64DNV9Vrgfxh9FN8CnMro48IfAl9IkhnXMi2r9fRe4P1VdTrwfoYj9E4ympD1VuCLy7fV6LNeu0uWVuspyYeBZ4HrNqOujRjvKclPAx8C/nhzq9qYFV6nzhkBrNjT1DNi1gH+GPBYVd0+rH+JUfg9BtwwfGT4NvA8o/8fYKLp+ZtstZ6uBG4Yxr7ICx/rOvR00EXAnVV1YFg/cPCj3HB/8GNs555I8rvAW4DfGf5hgr49ncnoU+F3kuxjVPedSX6Ovj1B74w4aHlPU8+ImQZ4VT0BfD/J2cPQBcD9wN8y+pKCJK9h9EXgU4ym4r8jycuSnAGcBXx7ljUerkP09APg14exNwEPDcs3Au8cvmk+D/ivsdMSR5vL+clTDTcyetMx3H91bLxlT0kuZHSu+K1V9b9j+x31770x/99TVd1TVadV1XxVzTMKvtcN79O2rxONM2LM8p6mnxFH4FvYHcAicDejF+UURi/GXwP3AncCbxrb/8OMvll+kOEb26PttkpPrwfuYPQN+e3ALw77htEPXzwC3AMsbHb9q/R0IvDvwEljYz8L3Dq80b4OnHoM9PQwo/ONdw23P2/23ntRT8u27+OFq1A6v07dM2KlnqaeEc7ElKSmnIkpSU0Z4JLUlAEuSU0Z4JLUlAEuSU0Z4JLUlAEuSU0Z4JLU1P8B4NichWHMVIwAAAAASUVORK5CYII=\n", 755 | "text/plain": [ 756 | "
" 757 | ] 758 | }, 759 | "metadata": { 760 | "needs_background": "light" 761 | }, 762 | "output_type": "display_data" 763 | } 764 | ], 765 | "source": [ 766 | "# See one of the inspected histograms\n", 767 | "hist_data = summary['hist']['fico_range_high']\n", 768 | "bins = hist_data['bin_edges']\n", 769 | "n = hist_data['counts']\n", 770 | "bin_width = np.diff(bins)\n", 771 | "\n", 772 | "plt.bar(bins[0:-1], n, bin_width, align='edge')" 773 | ] 774 | }, 775 | { 776 | "cell_type": "markdown", 777 | "metadata": {}, 778 | "source": [ 779 | "## Load logged data" 780 | ] 781 | }, 782 | { 783 | "cell_type": "code", 784 | "execution_count": 8, 785 | "metadata": {}, 786 | "outputs": [], 787 | "source": [ 788 | "import glob" 789 | ] 790 | }, 791 | { 792 | "cell_type": "markdown", 793 | "metadata": {}, 794 | "source": [ 795 | "### Load flat table statistics" 796 | ] 797 | }, 798 | { 799 | "cell_type": "code", 800 | "execution_count": 9, 801 | "metadata": {}, 802 | "outputs": [ 803 | { 804 | "data": { 805 | "text/html": [ 806 | "
\n", 807 | "\n", 820 | "\n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | " \n", 860 | " \n", 861 | " \n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | " \n", 866 | " \n", 867 | " \n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | " \n", 874 | " \n", 875 | " \n", 876 | " \n", 877 | " \n", 878 | " \n", 879 | " \n", 880 | " \n", 881 | " \n", 882 | " \n", 883 | " \n", 884 | " \n", 885 | " \n", 886 | " \n", 887 | " \n", 888 | " \n", 889 | " \n", 890 | " \n", 891 | " \n", 892 | " \n", 893 | " \n", 894 | " \n", 895 | " \n", 896 | " \n", 897 | " \n", 898 | " \n", 899 | " \n", 900 | " \n", 901 | " \n", 902 | " \n", 903 | " \n", 904 | " \n", 905 | " \n", 906 | " \n", 907 | " \n", 908 | " \n", 909 | " \n", 910 | " \n", 911 | " \n", 912 | " \n", 913 | " \n", 914 | " \n", 915 | " \n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | " \n", 932 | " \n", 933 | " \n", 934 | " \n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | " \n", 952 | " \n", 953 | " \n", 954 | " \n", 955 | " \n", 956 | " \n", 957 | " \n", 958 | " \n", 959 | " \n", 960 | " \n", 961 | " \n", 962 | " \n", 963 | " \n", 964 | " \n", 965 | " \n", 966 | " \n", 967 | " \n", 968 | " \n", 969 | " \n", 970 | " \n", 971 | " \n", 972 | " 
\n", 973 | " \n", 974 | " \n", 975 | " \n", 976 | " \n", 977 | " \n", 978 | " \n", 979 | " \n", 980 | " \n", 981 | " \n", 982 | " \n", 983 | " \n", 984 | " \n", 985 | " \n", 986 | " \n", 987 | " \n", 988 | " \n", 989 | " \n", 990 | " \n", 991 | " \n", 992 | " \n", 993 | " \n", 994 | " \n", 995 | " \n", 996 | " \n", 997 | " \n", 998 | " \n", 999 | " \n", 1000 | " \n", 1001 | " \n", 1002 | " \n", 1003 | " \n", 1004 | " \n", 1005 | " \n", 1006 | " \n", 1007 | " \n", 1008 | " \n", 1009 | " \n", 1010 | " \n", 1011 | " \n", 1012 | " \n", 1013 | " \n", 1014 | " \n", 1015 | " \n", 1016 | " \n", 1017 | " \n", 1018 | " \n", 1019 | " \n", 1020 | " \n", 1021 | " \n", 1022 | " \n", 1023 | " \n", 1024 | " \n", 1025 | " \n", 1026 | " \n", 1027 | " \n", 1028 | " \n", 1029 | " \n", 1030 | " \n", 1031 | " \n", 1032 | " \n", 1033 | " \n", 1034 | " \n", 1035 | " \n", 1036 | " \n", 1037 | " \n", 1038 | " \n", 1039 | " \n", 1040 | " \n", 1041 | " \n", 1042 | " \n", 1043 | " \n", 1044 | " \n", 1045 | " \n", 1046 | " \n", 1047 | " \n", 1048 | " \n", 1049 | " \n", 1050 | " \n", 1051 | " \n", 1052 | " \n", 1053 | " \n", 1054 | " \n", 1055 | " \n", 1056 | " \n", 1057 | " \n", 1058 | " \n", 1059 | " \n", 1060 | " \n", 1061 | " \n", 1062 | " \n", 1063 | " \n", 1064 | " \n", 1065 | " \n", 1066 | " \n", 1067 | " \n", 1068 | " \n", 1069 | " \n", 1070 | " \n", 1071 | " \n", 1072 | " \n", 1073 | " \n", 1074 | " \n", 1075 | " \n", 1076 | " \n", 1077 | " \n", 1078 | " \n", 1079 | " \n", 1080 | " \n", 1081 | " \n", 1082 | " \n", 1083 | " \n", 1084 | " \n", 1085 | " \n", 1086 | " \n", 1087 | " \n", 1088 | " \n", 1089 | " \n", 1090 | " \n", 1091 | " \n", 1092 | " \n", 1093 | " \n", 1094 | " \n", 1095 | " \n", 1096 | " \n", 1097 | " \n", 1098 | " \n", 1099 | " \n", 1100 | " \n", 1101 | " \n", 1102 | " \n", 1103 | " \n", 1104 | " \n", 1105 | " \n", 1106 | " \n", 1107 | " \n", 1108 | " \n", 1109 | " \n", 1110 | " \n", 1111 | " \n", 1112 | " \n", 1113 | "
columncountnull_countbool_countnumeric_countmaxmeanminstddevnunique_numbers...ununique_str_upperquantile_0.0000quantile_0.0100quantile_0.0500quantile_0.2500quantile_0.5000quantile_0.7500quantile_0.9500quantile_0.9900quantile_1.0000
0funded_amnt200.00.00.0199.040000.0016479.8994971000.009811.38494279.0...0.01000.0000001000.0000003325.0000009600.0015000.00000023000.00000035000.00000040000.00000040000.000000
1mo_sin_rcnt_tl200.00.00.0199.046.006.1959800.006.64973526.0...0.00.0000000.0000000.0000002.004.0000008.00000021.00000035.00000046.000000
2open_il_12m200.00.00.0199.04.000.6783920.000.8451205.0...0.00.0000000.0000000.0000000.000.0000001.0000002.0000003.0000004.000000
3installment200.00.00.0199.01300.55486.01809034.96283.607183180.0...0.034.95999936.150002112.139999271.75413.000000668.8599851069.4399411204.5699461300.550049
4bc_open_to_buy200.00.00.0198.088250.0011172.8434340.0014448.281979194.0...0.00.0000000.000000118.0000002011.005719.00000015374.00000042950.00000085587.00000088250.000000
..................................................................
146num_rev_tl_bal_gt_0200.00.00.0199.018.005.9798990.003.35742819.0...0.00.0000001.0000002.0000004.005.0000008.00000013.00000017.00000018.000000
147last_pymnt_d200.00.00.00.00.000.0000000.000.0000000.0...30.0NaNNaNNaNNaNNaNNaNNaNNaNNaN
148percent_bc_gt_75200.00.00.0198.0100.0040.3823230.0033.93326126.0...0.00.0000000.0000000.0000007.7033.29999966.699997100.000000100.000000100.000000
149debt_settlement_flag200.00.00.00.00.000.0000000.000.0000000.0...2.0NaNNaNNaNNaNNaNNaNNaNNaNNaN
150mo_sin_old_il_acct200.00.00.0195.0269.00127.1487183.0049.477824114.0...0.03.0000005.00000028.000000110.00132.000000153.000000209.000000264.000000269.000000
\n", 1114 | "

151 rows × 32 columns

\n", 1115 | "
" 1116 | ], 1117 | "text/plain": [ 1118 | " column count null_count bool_count numeric_count \\\n", 1119 | "0 funded_amnt 200.0 0.0 0.0 199.0 \n", 1120 | "1 mo_sin_rcnt_tl 200.0 0.0 0.0 199.0 \n", 1121 | "2 open_il_12m 200.0 0.0 0.0 199.0 \n", 1122 | "3 installment 200.0 0.0 0.0 199.0 \n", 1123 | "4 bc_open_to_buy 200.0 0.0 0.0 198.0 \n", 1124 | ".. ... ... ... ... ... \n", 1125 | "146 num_rev_tl_bal_gt_0 200.0 0.0 0.0 199.0 \n", 1126 | "147 last_pymnt_d 200.0 0.0 0.0 0.0 \n", 1127 | "148 percent_bc_gt_75 200.0 0.0 0.0 198.0 \n", 1128 | "149 debt_settlement_flag 200.0 0.0 0.0 0.0 \n", 1129 | "150 mo_sin_old_il_acct 200.0 0.0 0.0 195.0 \n", 1130 | "\n", 1131 | " max mean min stddev nunique_numbers ... \\\n", 1132 | "0 40000.00 16479.899497 1000.00 9811.384942 79.0 ... \n", 1133 | "1 46.00 6.195980 0.00 6.649735 26.0 ... \n", 1134 | "2 4.00 0.678392 0.00 0.845120 5.0 ... \n", 1135 | "3 1300.55 486.018090 34.96 283.607183 180.0 ... \n", 1136 | "4 88250.00 11172.843434 0.00 14448.281979 194.0 ... \n", 1137 | ".. ... ... ... ... ... ... \n", 1138 | "146 18.00 5.979899 0.00 3.357428 19.0 ... \n", 1139 | "147 0.00 0.000000 0.00 0.000000 0.0 ... \n", 1140 | "148 100.00 40.382323 0.00 33.933261 26.0 ... \n", 1141 | "149 0.00 0.000000 0.00 0.000000 0.0 ... \n", 1142 | "150 269.00 127.148718 3.00 49.477824 114.0 ... \n", 1143 | "\n", 1144 | " ununique_str_upper quantile_0.0000 quantile_0.0100 quantile_0.0500 \\\n", 1145 | "0 0.0 1000.000000 1000.000000 3325.000000 \n", 1146 | "1 0.0 0.000000 0.000000 0.000000 \n", 1147 | "2 0.0 0.000000 0.000000 0.000000 \n", 1148 | "3 0.0 34.959999 36.150002 112.139999 \n", 1149 | "4 0.0 0.000000 0.000000 118.000000 \n", 1150 | ".. ... ... ... ... 
\n", 1151 | "146 0.0 0.000000 1.000000 2.000000 \n", 1152 | "147 30.0 NaN NaN NaN \n", 1153 | "148 0.0 0.000000 0.000000 0.000000 \n", 1154 | "149 2.0 NaN NaN NaN \n", 1155 | "150 0.0 3.000000 5.000000 28.000000 \n", 1156 | "\n", 1157 | " quantile_0.2500 quantile_0.5000 quantile_0.7500 quantile_0.9500 \\\n", 1158 | "0 9600.00 15000.000000 23000.000000 35000.000000 \n", 1159 | "1 2.00 4.000000 8.000000 21.000000 \n", 1160 | "2 0.00 0.000000 1.000000 2.000000 \n", 1161 | "3 271.75 413.000000 668.859985 1069.439941 \n", 1162 | "4 2011.00 5719.000000 15374.000000 42950.000000 \n", 1163 | ".. ... ... ... ... \n", 1164 | "146 4.00 5.000000 8.000000 13.000000 \n", 1165 | "147 NaN NaN NaN NaN \n", 1166 | "148 7.70 33.299999 66.699997 100.000000 \n", 1167 | "149 NaN NaN NaN NaN \n", 1168 | "150 110.00 132.000000 153.000000 209.000000 \n", 1169 | "\n", 1170 | " quantile_0.9900 quantile_1.0000 \n", 1171 | "0 40000.000000 40000.000000 \n", 1172 | "1 35.000000 46.000000 \n", 1173 | "2 3.000000 4.000000 \n", 1174 | "3 1204.569946 1300.550049 \n", 1175 | "4 85587.000000 88250.000000 \n", 1176 | ".. ... ... 
\n", 1177 | "146 17.000000 18.000000 \n", 1178 | "147 NaN NaN \n", 1179 | "148 100.000000 100.000000 \n", 1180 | "149 NaN NaN \n", 1181 | "150 264.000000 269.000000 \n", 1182 | "\n", 1183 | "[151 rows x 32 columns]" 1184 | ] 1185 | }, 1186 | "execution_count": 9, 1187 | "metadata": {}, 1188 | "output_type": "execute_result" 1189 | } 1190 | ], 1191 | "source": [ 1192 | "# Load the flat table statistics from the 'test.data' dataset\n", 1193 | "fnames = glob.glob('whylogs-output/test.data/dataset_summary/flat_table/dataset_summary*.csv')\n", 1194 | "fnames.sort()\n", 1195 | "# Load the most recent file\n", 1196 | "test_stats = pd.read_csv(fnames[-1])\n", 1197 | "test_stats" 1198 | ] 1199 | }, 1200 | { 1201 | "cell_type": "markdown", 1202 | "metadata": {}, 1203 | "source": [ 1204 | "### Load the full dataset profile sketch" 1205 | ] 1206 | }, 1207 | { 1208 | "cell_type": "code", 1209 | "execution_count": 10, 1210 | "metadata": {}, 1211 | "outputs": [ 1212 | { 1213 | "data": { 1214 | "text/plain": [ 1215 | "" 1216 | ] 1217 | }, 1218 | "execution_count": 10, 1219 | "metadata": {}, 1220 | "output_type": "execute_result" 1221 | } 1222 | ], 1223 | "source": [ 1224 | "from whylogs import DatasetProfile\n", 1225 | "\n", 1226 | "# Load a dataset profile from the 'test.data' dataset\n", 1227 | "fnames = glob.glob('whylogs-output/test.data/dataset_profile/protobuf/*.bin')\n", 1228 | "fnames.sort()\n", 1229 | "\n", 1230 | "test_prof = DatasetProfile.read_protobuf(fnames[-1], delimited_file=False)\n", 1231 | "test_prof" 1232 | ] 1233 | }, 1234 | { 1235 | "cell_type": "markdown", 1236 | "metadata": {}, 1237 | "source": [ 1238 | "---" 1239 | ] 1240 | }, 1241 | { 1242 | "cell_type": "code", 1243 | "execution_count": 11, 1244 | "metadata": {}, 1245 | "outputs": [ 1246 | { 1247 | "name": "stdout", 1248 | "output_type": "stream", 1249 | "text": [ 1250 | "2020-09-22 16:47:24,053 - whylogs.app.config - DEBUG - Attempting to load config file: None\n", 1251 | "2020-09-22 16:47:24,054 - 
whylogs.app.config - DEBUG - Attempting to load config file: .whylogs.yaml\n" 1252 | ] 1253 | } 1254 | ], 1255 | "source": [ 1256 | "# Not necessary, but you can reset the WhyLogs session if you want\n", 1257 | "from whylogs import reset_default_session\n", 1258 | "reset_default_session()" 1259 | ] 1260 | }, 1261 | { 1262 | "cell_type": "code", 1263 | "execution_count": null, 1264 | "metadata": {}, 1265 | "outputs": [], 1266 | "source": [] 1267 | } 1268 | ], 1269 | "metadata": { 1270 | "kernelspec": { 1271 | "display_name": "whylogs", 1272 | "language": "python", 1273 | "name": "whylogs" 1274 | }, 1275 | "language_info": { 1276 | "codemirror_mode": { 1277 | "name": "ipython", 1278 | "version": 3 1279 | }, 1280 | "file_extension": ".py", 1281 | "mimetype": "text/x-python", 1282 | "name": "python", 1283 | "nbconvert_exporter": "python", 1284 | "pygments_lexer": "ipython3", 1285 | "version": "3.7.7" 1286 | } 1287 | }, 1288 | "nbformat": 4, 1289 | "nbformat_minor": 4 1290 | } 1291 | -------------------------------------------------------------------------------- /python/mlflow.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/whylabs/whylogs-examples/167be1e91f335ca2b77a09aa6ef99090099bfcae/python/mlflow.db -------------------------------------------------------------------------------- /python/profile.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/whylabs/whylogs-examples/167be1e91f335ca2b77a09aa6ef99090099bfcae/python/profile.bin -------------------------------------------------------------------------------- /python/requirements.txt: -------------------------------------------------------------------------------- 1 | boto3 2 | certifi 3 | chardet 4 | matplotlib 5 | numpy 6 | whylogs 7 | -------------------------------------------------------------------------------- /python/whylogs.yaml: 
-------------------------------------------------------------------------------- 1 | # .whylogs.yaml 2 | 3 | # Example WhyLogs YAML configuration 4 | project: example-project 5 | pipeline: example-pipeline 6 | verbose: false 7 | writers: 8 | # Save out the full protobuf datasketches data locally 9 | - formats: 10 | - protobuf 11 | output_path: whylogs-output 12 | # Template variables can be accessed via $variable or ${variable} 13 | path_template: $name/dataset_profile 14 | filename_template: datase_profile-$dataset_timestamp 15 | type: local 16 | # Save out the flat summary data locally, separately from the protobuf 17 | - formats: 18 | - flat 19 | - json 20 | output_path: whylogs-output 21 | path_template: $name/dataset_summary 22 | filename_template: dataset_summary-$dataset_timestamp 23 | type: local 24 | -------------------------------------------------------------------------------- /scala/.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | target/* 3 | target -------------------------------------------------------------------------------- /scala/build.sbt: -------------------------------------------------------------------------------- 1 | name := "scala" 2 | 3 | version := "0.1" 4 | 5 | scalaVersion := "2.12.12" 6 | 7 | libraryDependencies += "ai.whylabs" %% "whylogs-spark" % "0.0.2b3" 8 | -------------------------------------------------------------------------------- /scala/project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version = 1.3.13 -------------------------------------------------------------------------------- /scala/src/main/scala/WhyLogsDemo.scala: -------------------------------------------------------------------------------- 1 | import java.time.LocalDateTime 2 | 3 | import org.apache.spark.sql.functions._ 4 | import org.apache.spark.sql.{SaveMode, SparkSession} 5 | import com.whylogs.spark.WhyLogs._ 6 | 7 | object 
WhyLogsDemo extends App { 8 | 9 | val spark = SparkSession 10 | .builder() 11 | .master("local[*, 3]") 12 | .appName("SparkTesting-" + LocalDateTime.now().toString) 13 | .config("spark.ui.enabled", "false") 14 | .getOrCreate() 15 | 16 | val raw_df = spark.read 17 | .option("header", "true") 18 | .csv("Fire_Department_Calls_for_Service.csv") 19 | 20 | val df = 21 | raw_df.withColumn("call_date", to_timestamp(col("Call Date"), "MM/dd/yyyy")) // 'yyyy' = calendar year; 'YYYY' is week-based year and produces wrong dates around New Year 22 | df.printSchema() 23 | 24 | val profiles = df 25 | .newProfilingSession("FireDepartment") // start a new WhyLogs profiling job 26 | .withTimeColumn("call_date") // split dataset by call_date 27 | .groupBy("City", "Priority") // tag and group the data with categorical information 28 | .aggProfiles() // runs the aggregation. returns a dataframe of entries 29 | 30 | profiles.write 31 | .mode(SaveMode.Overwrite) 32 | .parquet("profiles_parquet") 33 | 34 | } 35 | -------------------------------------------------------------------------------- /scala/src/main/scala/WhyLogsScalaLendingClubToWhylabsExample.scala: -------------------------------------------------------------------------------- 1 | // Tested on Databricks cluster running as scala notebook: 2 | // * cluster version: 8.3 (includes Apache Spark 3.1.1, Scala 2.12) 3 | // * installed whylogs jar: whylogs_spark_bundle_3_1_1_scala_2_12_0_1_21aeb7b2_20210903_224257_1_all-d1b20.jar 4 | // * from: https://oss.sonatype.org/content/repositories/snapshots/ai/whylabs/whylogs-spark-bundle_3.1.1-scala_2.12/0.1-21aeb7b2-SNAPSHOT/whylogs-spark-bundle_3.1.1-scala_2.12-0.1-21aeb7b2-20210903.224257-1-all.jar 5 | 6 | import java.time.LocalDateTime 7 | 8 | import org.apache.spark.sql.functions._ 9 | import org.apache.spark.sql.{SaveMode, SparkSession} 10 | import com.whylogs.spark.WhyLogs._ 11 | 12 | // COMMAND ---------- 13 | 14 | // For demo purposes we will create a time column with yesterday's date, so that Whylabs ingestion sees this as a recent dataset profile 15 | // and it shows up
in the default dashboard of the last 7 days on Whylabs. 16 | def unixEpochTimeForNumberOfDaysAgo(numDaysAgo: Int): Long = { 17 | import java.time._ 18 | val numDaysAgoDateTime: LocalDateTime = LocalDateTime.now().minusDays(numDaysAgo) 19 | val zdt: ZonedDateTime = numDaysAgoDateTime.atZone(ZoneId.of("America/Los_Angeles")) 20 | val numDaysAgoDateTimeInMillis = zdt.toInstant.toEpochMilli 21 | val unixEpochTime = numDaysAgoDateTimeInMillis / 1000L 22 | unixEpochTime 23 | } 24 | 25 | val timestamp_yesterday = unixEpochTimeForNumberOfDaysAgo(1) 26 | println(timestamp_yesterday) 27 | val timeColumn = "dataset_timestamp" 28 | 29 | 30 | // COMMAND ---------- 31 | 32 | import org.apache.spark.sql.functions._ 33 | import org.apache.spark.sql.types.DataTypes 34 | val spark = SparkSession 35 | .builder() 36 | .master("local[*, 3]") 37 | .appName("SparkTesting-" + LocalDateTime.now().toString) 38 | .config("spark.ui.enabled", "false") 39 | .getOrCreate() 40 | 41 | // the file location below is using the lending_club_1000.csv uploaded onto a Databricks dbfs 42 | // e.g. from here https://github.com/whylabs/whylogs/blob/mainline/testdata/lending_club_1000.csv 43 | // you will need to update that location based on a dataset you use or follow this example. 44 | val input_dataset_location = "dbfs:/FileStore/tables/lending_club_1000.csv" 45 | 46 | val raw_df = spark.read 47 | .option("header", "true") 48 | .option("inferSchema", "true") 49 | .csv(input_dataset_location) 50 | 51 | // Here we add an artificial column for time. It is required that there is a TimestampType column for profiling with this API 52 | val df = raw_df.withColumn(timeColumn, lit(timestamp_yesterday).cast(DataTypes.TimestampType)) 53 | df.printSchema() 54 | 55 | 56 | // COMMAND ---------- 57 | 58 | val session = df.newProfilingSession("LendingClubScala") // start a new WhyLogs profiling job 59 | .withTimeColumn(timeColumn) 60 | val profiles = session 61 | .aggProfiles() // runs the aggregation.
returns a dataframe of entries 62 | 63 | // COMMAND ---------- 64 | 65 | // optionally you might write the dataset profiles out somewhere before uploading to Whylabs 66 | profiles.write 67 | .mode(SaveMode.Overwrite) 68 | .parquet("dbfs:/FileStore/tables/whylogs_demo_profiles_parquet") 69 | 70 | // COMMAND ---------- 71 | 72 | // Replace the following parameters below with your values after signing up for an account at https://whylabs.ai/ 73 | // You can find Organization Id on https://hub.whylabsapp.com/settings/access-tokens and the value looks something like: org-123abc 74 | // also the settings page allows you to create new apiKeys; you will need an apiKey to upload to your account in Whylabs 75 | // The modelId below specifies which model this profile is for, by default an initial model-1 is created but you will update this 76 | // if you create a new model here https://hub.whylabsapp.com/settings/model-management 77 | session.log( 78 | orgId = "replace_with_your_orgId", 79 | modelId = "model-1", 80 | apiKey = "replace_with_your_api_key") 81 | 82 | -------------------------------------------------------------------------------- /scala/src/main/scala/WhylabsDatabricks.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | PROPERTIES_FILE="/databricks/spark/dbconf/java/extra.security" 3 | # by default the disabled algorithms include this set and GCM, which has known performance issues on parquet files 4 | DISABLED_ALGOS="SSLv3, RC4, DES, MD5withRSA, DH keySize < 1024, EC keySize < 224, 3DES_EDE_CBC, anon, NULL" 5 | echo "Configure Databricks java for Whylabs access and allow GCM" 6 | if [[ -f "${PROPERTIES_FILE}" ]]; then 7 | echo "setting jdk.tls.disabledAlgorithms..." 8 | echo "jdk.tls.disabledAlgorithms=${DISABLED_ALGOS}" | tee "${PROPERTIES_FILE}" 9 | else 10 | >&2 echo "ERROR failed to find ${PROPERTIES_FILE}" 11 | fi 12 | --------------------------------------------------------------------------------