├── .gitignore ├── LICENSE ├── README.md ├── build.gradle ├── gradle └── wrapper │ ├── gradle-wrapper.jar │ └── gradle-wrapper.properties ├── gradlew ├── gradlew.bat ├── python_analysis.ipynb ├── src └── main │ └── java │ └── test_dataframes │ ├── CheckResult.java │ ├── TestDFLib.java │ ├── TestDatavec.java │ ├── TestDuckDb.kt │ ├── TestJoinery.java │ ├── TestKotlinDataFrame.kt │ ├── TestKrangl.kt │ ├── TestMorpheus.java │ └── TestTablesaw.java └── urb_cpop1_1_Data.csv /.gitignore: -------------------------------------------------------------------------------- 1 | bin 2 | .ipynb_checkpoints 3 | .classpath 4 | .project 5 | .settings 6 | .gradle 7 | .idea/ 8 | build/ 9 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. 
You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. 
(Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Java dataframes test 2 | This is the companion repository to the following Medium post: [Doing cool data science in Java: how 3 DataFrame libraries stack up](https://medium.com/@thijser/doing-cool-data-science-in-java-how-3-dataframe-libraries-stack-up-5e6ccb7b437) 3 | 4 | ## Data 5 | The data was extracted from [Eurostat](http://appsso.eurostat.ec.europa.eu/nui/show.do?dataset=urb_cpop1&lang=en) at the beginning of September 2018. I opened the extracted CSV in LibreOffice and saved it again because there were some illegal UTF-8 characters in the Eurostat output that some CSV importers couldn't handle directly. 
6 | 7 | # Results [June 2025] 8 | 9 | | Library | Maintained | Version | Time (ms) | 10 | |---------------------------------------------------------|------------|-----------|-----------| 11 | | [DuckDb](https://github.com/duckdb/duckdb-java) | Y | 1.3.0 | 93 | 12 | | [DFLib](https://github.com/dflib/dflib) | Y | 1.3.0 | 226 | 13 | | [Kotlin DataFrame](https://github.com/Kotlin/dataframe) | Y | 1.0-beta2 | 816 | 14 | | [Tablesaw](https://github.com/jtablesaw/tablesaw) | Y | 0.44.1 | 820 | 15 | | Joinery | N | 1.9 | 1,478 | 16 | | Krangl | N | 0.18.4 | 1,796 | 17 | | Morpheus | N | 0.9.23 | * | 18 | 19 | \* Morpheus is no longer maintained and doesn't seem to work on later Java versions (error related to accessing `sun.util.calendar.ZoneInfo`) 20 | 21 | ## Code 22 | The code for each library is in the corresponding `Test{libraryname}.java` or `Test{libraryname}.kt` file. They all use `CheckResult.java` to do a basic correctness check for the top-growing cities. 23 | 24 | As described in the [Medium post](https://medium.com/@thijser/doing-cool-data-science-in-java-how-3-dataframe-libraries-stack-up-5e6ccb7b437), I couldn't find a good way to do the pivot step in [datavec](https://deeplearning4j.org/docs/latest/datavec-overview), but I included the code I wrote up until that point. 
25 | -------------------------------------------------------------------------------- /build.gradle: -------------------------------------------------------------------------------- 1 | buildscript { 2 | ext.kotlin_version = '2.1.21' 3 | repositories { 4 | mavenCentral() 5 | } 6 | 7 | dependencies { 8 | classpath "org.jetbrains.kotlin:kotlin-gradle-plugin:$kotlin_version" 9 | } 10 | } 11 | 12 | apply plugin: 'java' 13 | apply plugin: 'kotlin' 14 | apply plugin: 'eclipse' 15 | 16 | sourceCompatibility = '21' 17 | 18 | repositories { 19 | mavenCentral() 20 | maven { url 'https://maven.scijava.org/content/repositories/public/' } 21 | } 22 | 23 | dependencies { 24 | implementation 'tech.tablesaw:tablesaw-core:0.44.1' 25 | // needed for tablesaw 26 | implementation 'com.google.guava:guava:31.1-jre' 27 | 28 | implementation 'joinery:joinery-dataframe:1.9' 29 | // For the CSV import joinery needs this dependency too: 30 | implementation 'org.apache.poi:poi:3.17' 31 | 32 | implementation 'com.zavtech:morpheus-core:0.9.23' 33 | 34 | implementation 'org.datavec:datavec-api:1.0.0-beta2' 35 | implementation 'org.datavec:datavec-local:1.0.0-beta2' 36 | 37 | implementation 'com.github.holgerbrandl:krangl:0.18.4' 38 | implementation "org.jetbrains.kotlin:kotlin-stdlib-jdk8:$kotlin_version" 39 | 40 | implementation "org.jetbrains.kotlinx:dataframe:1.0.0-Beta2" 41 | implementation 'org.duckdb:duckdb_jdbc:1.3.0.0' 42 | implementation 'org.dflib:dflib-csv:1.3.0' 43 | } 44 | 45 | -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mathijs81/java-dataframes/0c04f136813b50ab02ea7ef39b14865f45e9c032/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.properties: 
-------------------------------------------------------------------------------- 1 | distributionBase=GRADLE_USER_HOME 2 | distributionPath=wrapper/dists 3 | distributionUrl=https\://services.gradle.org/distributions/gradle-8.13-bin.zip 4 | networkTimeout=10000 5 | validateDistributionUrl=true 6 | zipStoreBase=GRADLE_USER_HOME 7 | zipStorePath=wrapper/dists 8 | -------------------------------------------------------------------------------- /gradlew: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # 4 | # Copyright © 2015-2021 the original authors. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # https://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | # SPDX-License-Identifier: Apache-2.0 19 | # 20 | 21 | ############################################################################## 22 | # 23 | # Gradle start up script for POSIX generated by Gradle. 24 | # 25 | # Important for running: 26 | # 27 | # (1) You need a POSIX-compliant shell to run this script. 
If your /bin/sh is 28 | # noncompliant, but you have some other compliant shell such as ksh or 29 | # bash, then to run this script, type that shell name before the whole 30 | # command line, like: 31 | # 32 | # ksh Gradle 33 | # 34 | # Busybox and similar reduced shells will NOT work, because this script 35 | # requires all of these POSIX shell features: 36 | # * functions; 37 | # * expansions «$var», «${var}», «${var:-default}», «${var+SET}», 38 | # «${var#prefix}», «${var%suffix}», and «$( cmd )»; 39 | # * compound commands having a testable exit status, especially «case»; 40 | # * various built-in commands including «command», «set», and «ulimit». 41 | # 42 | # Important for patching: 43 | # 44 | # (2) This script targets any POSIX shell, so it avoids extensions provided 45 | # by Bash, Ksh, etc; in particular arrays are avoided. 46 | # 47 | # The "traditional" practice of packing multiple parameters into a 48 | # space-separated string is a well documented source of bugs and security 49 | # problems, so this is (mostly) avoided, by progressively accumulating 50 | # options in "$@", and eventually passing that to Java. 51 | # 52 | # Where the inherited environment variables (DEFAULT_JVM_OPTS, JAVA_OPTS, 53 | # and GRADLE_OPTS) rely on word-splitting, this is performed explicitly; 54 | # see the in-line comments for details. 55 | # 56 | # There are tweaks for specific operating systems such as AIX, CygWin, 57 | # Darwin, MinGW, and NonStop. 58 | # 59 | # (3) This script is generated from the Groovy template 60 | # https://github.com/gradle/gradle/blob/HEAD/platforms/jvm/plugins-application/src/main/resources/org/gradle/api/internal/plugins/unixStartScript.txt 61 | # within the Gradle project. 62 | # 63 | # You can find Gradle at https://github.com/gradle/gradle/. 
64 | # 65 | ############################################################################## 66 | 67 | # Attempt to set APP_HOME 68 | 69 | # Resolve links: $0 may be a link 70 | app_path=$0 71 | 72 | # Need this for daisy-chained symlinks. 73 | while 74 | APP_HOME=${app_path%"${app_path##*/}"} # leaves a trailing /; empty if no leading path 75 | [ -h "$app_path" ] 76 | do 77 | ls=$( ls -ld "$app_path" ) 78 | link=${ls#*' -> '} 79 | case $link in #( 80 | /*) app_path=$link ;; #( 81 | *) app_path=$APP_HOME$link ;; 82 | esac 83 | done 84 | 85 | # This is normally unused 86 | # shellcheck disable=SC2034 87 | APP_BASE_NAME=${0##*/} 88 | # Discard cd standard output in case $CDPATH is set (https://github.com/gradle/gradle/issues/25036) 89 | APP_HOME=$( cd -P "${APP_HOME:-./}" > /dev/null && printf '%s\n' "$PWD" ) || exit 90 | 91 | # Use the maximum available, or set MAX_FD != -1 to use that value. 92 | MAX_FD=maximum 93 | 94 | warn () { 95 | echo "$*" 96 | } >&2 97 | 98 | die () { 99 | echo 100 | echo "$*" 101 | echo 102 | exit 1 103 | } >&2 104 | 105 | # OS specific support (must be 'true' or 'false'). 106 | cygwin=false 107 | msys=false 108 | darwin=false 109 | nonstop=false 110 | case "$( uname )" in #( 111 | CYGWIN* ) cygwin=true ;; #( 112 | Darwin* ) darwin=true ;; #( 113 | MSYS* | MINGW* ) msys=true ;; #( 114 | NONSTOP* ) nonstop=true ;; 115 | esac 116 | 117 | CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar 118 | 119 | 120 | # Determine the Java command to use to start the JVM. 121 | if [ -n "$JAVA_HOME" ] ; then 122 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then 123 | # IBM's JDK on AIX uses strange locations for the executables 124 | JAVACMD=$JAVA_HOME/jre/sh/java 125 | else 126 | JAVACMD=$JAVA_HOME/bin/java 127 | fi 128 | if [ ! -x "$JAVACMD" ] ; then 129 | die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME 130 | 131 | Please set the JAVA_HOME variable in your environment to match the 132 | location of your Java installation." 
133 | fi 134 | else 135 | JAVACMD=java 136 | if ! command -v java >/dev/null 2>&1 137 | then 138 | die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 139 | 140 | Please set the JAVA_HOME variable in your environment to match the 141 | location of your Java installation." 142 | fi 143 | fi 144 | 145 | # Increase the maximum file descriptors if we can. 146 | if ! "$cygwin" && ! "$darwin" && ! "$nonstop" ; then 147 | case $MAX_FD in #( 148 | max*) 149 | # In POSIX sh, ulimit -H is undefined. That's why the result is checked to see if it worked. 150 | # shellcheck disable=SC2039,SC3045 151 | MAX_FD=$( ulimit -H -n ) || 152 | warn "Could not query maximum file descriptor limit" 153 | esac 154 | case $MAX_FD in #( 155 | '' | soft) :;; #( 156 | *) 157 | # In POSIX sh, ulimit -n is undefined. That's why the result is checked to see if it worked. 158 | # shellcheck disable=SC2039,SC3045 159 | ulimit -n "$MAX_FD" || 160 | warn "Could not set maximum file descriptor limit to $MAX_FD" 161 | esac 162 | fi 163 | 164 | # Collect all arguments for the java command, stacking in reverse order: 165 | # * args from the command line 166 | # * the main class name 167 | # * -classpath 168 | # * -D...appname settings 169 | # * --module-path (only if needed) 170 | # * DEFAULT_JVM_OPTS, JAVA_OPTS, and GRADLE_OPTS environment variables. 
171 | 172 | # For Cygwin or MSYS, switch paths to Windows format before running java 173 | if "$cygwin" || "$msys" ; then 174 | APP_HOME=$( cygpath --path --mixed "$APP_HOME" ) 175 | CLASSPATH=$( cygpath --path --mixed "$CLASSPATH" ) 176 | 177 | JAVACMD=$( cygpath --unix "$JAVACMD" ) 178 | 179 | # Now convert the arguments - kludge to limit ourselves to /bin/sh 180 | for arg do 181 | if 182 | case $arg in #( 183 | -*) false ;; # don't mess with options #( 184 | /?*) t=${arg#/} t=/${t%%/*} # looks like a POSIX filepath 185 | [ -e "$t" ] ;; #( 186 | *) false ;; 187 | esac 188 | then 189 | arg=$( cygpath --path --ignore --mixed "$arg" ) 190 | fi 191 | # Roll the args list around exactly as many times as the number of 192 | # args, so each arg winds up back in the position where it started, but 193 | # possibly modified. 194 | # 195 | # NB: a `for` loop captures its iteration list before it begins, so 196 | # changing the positional parameters here affects neither the number of 197 | # iterations, nor the values presented in `arg`. 198 | shift # remove old arg 199 | set -- "$@" "$arg" # push replacement arg 200 | done 201 | fi 202 | 203 | 204 | # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 205 | DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"' 206 | 207 | # Collect all arguments for the java command: 208 | # * DEFAULT_JVM_OPTS, JAVA_OPTS, and optsEnvironmentVar are not allowed to contain shell fragments, 209 | # and any embedded shellness will be escaped. 210 | # * For example: A user cannot expect ${Hostname} to be expanded, as it is an environment variable and will be 211 | # treated as '${Hostname}' itself on the command line. 212 | 213 | set -- \ 214 | "-Dorg.gradle.appname=$APP_BASE_NAME" \ 215 | -classpath "$CLASSPATH" \ 216 | org.gradle.wrapper.GradleWrapperMain \ 217 | "$@" 218 | 219 | # Stop when "xargs" is not available. 220 | if ! 
command -v xargs >/dev/null 2>&1 221 | then 222 | die "xargs is not available" 223 | fi 224 | 225 | # Use "xargs" to parse quoted args. 226 | # 227 | # With -n1 it outputs one arg per line, with the quotes and backslashes removed. 228 | # 229 | # In Bash we could simply go: 230 | # 231 | # readarray ARGS < <( xargs -n1 <<<"$var" ) && 232 | # set -- "${ARGS[@]}" "$@" 233 | # 234 | # but POSIX shell has neither arrays nor command substitution, so instead we 235 | # post-process each arg (as a line of input to sed) to backslash-escape any 236 | # character that might be a shell metacharacter, then use eval to reverse 237 | # that process (while maintaining the separation between arguments), and wrap 238 | # the whole thing up as a single "set" statement. 239 | # 240 | # This will of course break if any of these variables contains a newline or 241 | # an unmatched quote. 242 | # 243 | 244 | eval "set -- $( 245 | printf '%s\n' "$DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS" | 246 | xargs -n1 | 247 | sed ' s~[^-[:alnum:]+,./:=@_]~\\&~g; ' | 248 | tr '\n' ' ' 249 | )" '"$@"' 250 | 251 | exec "$JAVACMD" "$@" 252 | -------------------------------------------------------------------------------- /gradlew.bat: -------------------------------------------------------------------------------- 1 | @rem 2 | @rem Copyright 2015 the original author or authors. 3 | @rem 4 | @rem Licensed under the Apache License, Version 2.0 (the "License"); 5 | @rem you may not use this file except in compliance with the License. 6 | @rem You may obtain a copy of the License at 7 | @rem 8 | @rem https://www.apache.org/licenses/LICENSE-2.0 9 | @rem 10 | @rem Unless required by applicable law or agreed to in writing, software 11 | @rem distributed under the License is distributed on an "AS IS" BASIS, 12 | @rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | @rem See the License for the specific language governing permissions and 14 | @rem limitations under the License. 
15 | @rem 16 | @rem SPDX-License-Identifier: Apache-2.0 17 | @rem 18 | 19 | @if "%DEBUG%"=="" @echo off 20 | @rem ########################################################################## 21 | @rem 22 | @rem Gradle startup script for Windows 23 | @rem 24 | @rem ########################################################################## 25 | 26 | @rem Set local scope for the variables with windows NT shell 27 | if "%OS%"=="Windows_NT" setlocal 28 | 29 | set DIRNAME=%~dp0 30 | if "%DIRNAME%"=="" set DIRNAME=. 31 | @rem This is normally unused 32 | set APP_BASE_NAME=%~n0 33 | set APP_HOME=%DIRNAME% 34 | 35 | @rem Resolve any "." and ".." in APP_HOME to make it shorter. 36 | for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi 37 | 38 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 39 | set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m" 40 | 41 | @rem Find java.exe 42 | if defined JAVA_HOME goto findJavaFromJavaHome 43 | 44 | set JAVA_EXE=java.exe 45 | %JAVA_EXE% -version >NUL 2>&1 46 | if %ERRORLEVEL% equ 0 goto execute 47 | 48 | echo. 1>&2 49 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 1>&2 50 | echo. 1>&2 51 | echo Please set the JAVA_HOME variable in your environment to match the 1>&2 52 | echo location of your Java installation. 1>&2 53 | 54 | goto fail 55 | 56 | :findJavaFromJavaHome 57 | set JAVA_HOME=%JAVA_HOME:"=% 58 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe 59 | 60 | if exist "%JAVA_EXE%" goto execute 61 | 62 | echo. 1>&2 63 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 1>&2 64 | echo. 1>&2 65 | echo Please set the JAVA_HOME variable in your environment to match the 1>&2 66 | echo location of your Java installation. 
1>&2 67 | 68 | goto fail 69 | 70 | :execute 71 | @rem Setup the command line 72 | 73 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar 74 | 75 | 76 | @rem Execute Gradle 77 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %* 78 | 79 | :end 80 | @rem End local scope for the variables with windows NT shell 81 | if %ERRORLEVEL% equ 0 goto mainEnd 82 | 83 | :fail 84 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of 85 | rem the _cmd.exe /c_ return code! 86 | set EXIT_CODE=%ERRORLEVEL% 87 | if %EXIT_CODE% equ 0 set EXIT_CODE=1 88 | if not ""=="%GRADLE_EXIT_CONSOLE%" exit %EXIT_CODE% 89 | exit /b %EXIT_CODE% 90 | 91 | :mainEnd 92 | if "%OS%"=="Windows_NT" endlocal 93 | 94 | :omega 95 | -------------------------------------------------------------------------------- /python_analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "0.23.4\n", 13 | "53500\n" 14 | ] 15 | }, 16 | { 17 | "data": { 18 | "text/html": [ 19 | "
\n", 20 | "\n", 33 | "\n", 34 | " \n", 35 | " \n", 36 | " \n", 37 | " \n", 38 | " \n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | "
TIMECITIESINDIC_URValueFlag and Footnotes
02008GermanyPopulation on the 1st of January, total82217837NaN
12008GermanyPopulation on the 1st of January, male40274292NaN
22008GermanyPopulation on the 1st of January, female41943545NaN
32008GermanyPopulation on the 1st of January, 0-4 years, t...3469044NaN
42008GermanyPopulation on the 1st of January, 0-4 years, male1780414NaN
\n", 87 | "
" 88 | ], 89 | "text/plain": [ 90 | " TIME CITIES INDIC_UR Value \\\n", 91 | "0 2008 Germany Population on the 1st of January, total 82217837 \n", 92 | "1 2008 Germany Population on the 1st of January, male 40274292 \n", 93 | "2 2008 Germany Population on the 1st of January, female 41943545 \n", 94 | "3 2008 Germany Population on the 1st of January, 0-4 years, t... 3469044 \n", 95 | "4 2008 Germany Population on the 1st of January, 0-4 years, male 1780414 \n", 96 | "\n", 97 | " Flag and Footnotes \n", 98 | "0 NaN \n", 99 | "1 NaN \n", 100 | "2 NaN \n", 101 | "3 NaN \n", 102 | "4 NaN " 103 | ] 104 | }, 105 | "execution_count": 1, 106 | "metadata": {}, 107 | "output_type": "execute_result" 108 | } 109 | ], 110 | "source": [ 111 | "import pandas as pd\n", 112 | "\n", 113 | "data = pd.read_csv('urb_cpop1_1_Data.csv')\n", 114 | "print(pd.__version__) # I ran with 0.23.4\n", 115 | "print(len(data))\n", 116 | "data.head()" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 2, 122 | "metadata": {}, 123 | "outputs": [ 124 | { 125 | "data": { 126 | "text/html": [ 127 | "
\n", 128 | "\n", 141 | "\n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | "
TIME2008200920102011201220132014201520162017
key
's-Hertogenbosch:Population on the 1st of January, 0-4 years, maleNaNNaN4121.04131.04181.04255.04295.0NaNNaNNaN
's-Hertogenbosch:Population on the 1st of January, 0-4 years, total7885.07915.08127.08211.08292.08313.08290.0NaNNaNNaN
's-Hertogenbosch:Population on the 1st of January, female69434.070060.070979.071586.072085.072541.072877.0NaNNaNNaN
's-Hertogenbosch:Population on the 1st of January, male67047.067715.068628.069200.069808.070276.070856.0NaNNaNNaN
's-Hertogenbosch:Population on the 1st of January, total136481.0137775.0139607.0140786.0141893.0142817.0143733.0NaNNaNNaN
\n", 238 | "
" 239 | ], 240 | "text/plain": [ 241 | "TIME 2008 2009 \\\n", 242 | "key \n", 243 | "'s-Hertogenbosch:Population on the 1st of Janua... NaN NaN \n", 244 | "'s-Hertogenbosch:Population on the 1st of Janua... 7885.0 7915.0 \n", 245 | "'s-Hertogenbosch:Population on the 1st of Janua... 69434.0 70060.0 \n", 246 | "'s-Hertogenbosch:Population on the 1st of Janua... 67047.0 67715.0 \n", 247 | "'s-Hertogenbosch:Population on the 1st of Janua... 136481.0 137775.0 \n", 248 | "\n", 249 | "TIME 2010 2011 \\\n", 250 | "key \n", 251 | "'s-Hertogenbosch:Population on the 1st of Janua... 4121.0 4131.0 \n", 252 | "'s-Hertogenbosch:Population on the 1st of Janua... 8127.0 8211.0 \n", 253 | "'s-Hertogenbosch:Population on the 1st of Janua... 70979.0 71586.0 \n", 254 | "'s-Hertogenbosch:Population on the 1st of Janua... 68628.0 69200.0 \n", 255 | "'s-Hertogenbosch:Population on the 1st of Janua... 139607.0 140786.0 \n", 256 | "\n", 257 | "TIME 2012 2013 \\\n", 258 | "key \n", 259 | "'s-Hertogenbosch:Population on the 1st of Janua... 4181.0 4255.0 \n", 260 | "'s-Hertogenbosch:Population on the 1st of Janua... 8292.0 8313.0 \n", 261 | "'s-Hertogenbosch:Population on the 1st of Janua... 72085.0 72541.0 \n", 262 | "'s-Hertogenbosch:Population on the 1st of Janua... 69808.0 70276.0 \n", 263 | "'s-Hertogenbosch:Population on the 1st of Janua... 141893.0 142817.0 \n", 264 | "\n", 265 | "TIME 2014 2015 2016 2017 \n", 266 | "key \n", 267 | "'s-Hertogenbosch:Population on the 1st of Janua... 4295.0 NaN NaN NaN \n", 268 | "'s-Hertogenbosch:Population on the 1st of Janua... 8290.0 NaN NaN NaN \n", 269 | "'s-Hertogenbosch:Population on the 1st of Janua... 72877.0 NaN NaN NaN \n", 270 | "'s-Hertogenbosch:Population on the 1st of Janua... 70856.0 NaN NaN NaN \n", 271 | "'s-Hertogenbosch:Population on the 1st of Janua... 
143733.0 NaN NaN NaN " 272 | ] 273 | }, 274 | "execution_count": 2, 275 | "metadata": {}, 276 | "output_type": "execute_result" 277 | } 278 | ], 279 | "source": [ 280 | "# Remove rows that have no value\n", 281 | "filtered = data.drop(data[pd.isna(data.Value) | (data.Value == \":\")].index)\n", 282 | "filtered['key'] = filtered['CITIES'] + ':' + filtered['INDIC_UR']\n", 283 | "filtered['Value'] = pd.to_numeric(filtered['Value'])\n", 284 | "\n", 285 | "# pivot(...) will fail because there are some cities (e.g. Barcelona, Bilbao) that have multiple entries\n", 286 | "# for the same year\n", 287 | "cities = filtered.pivot_table(index='key', columns='TIME', values='Value', aggfunc=\"mean\")\n", 288 | "cities.head()" 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": 3, 294 | "metadata": {}, 295 | "outputs": [ 296 | { 297 | "data": { 298 | "text/html": [ 299 | "
\n", 300 | "\n", 313 | "\n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " 
\n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | "
TIME2008200920102011201220132014201520162017
key
United Kingdom:Population on the 1st of January, totalNaNNaNNaNNaNNaNNaNNaN64853393.065379044.065844142.0
Portugal:Population on the 1st of January, total10627250.010637713.010573479.010572721.010542398.010487289.010427301.010374822.010341330.010309573.0
London (greater city):Population on the 1st of January, total7668300.07753600.08002000.08173941.08256400.08362500.08477600.08606201.08730803.08797330.0
Slovakia:Population on the 1st of January, total5412254.05424925.05435273.05397036.05404322.05410836.05415949.05421349.05426252.05435343.0
Greater Manchester:Population on the 1st of January, totalNaNNaN2650800.02682528.02693800.02708600.02723900.02744508.02769152.02789822.0
West Midlands urban area:Population on the 1st of January, totalNaNNaN2390000.02419500.02431200.02446600.02462300.02479550.02500093.02527245.0
Latvia:Population on the 1st of January, total2270894.02261294.0NaN2070371.02044813.02023825.02001468.01986096.01968957.01950116.0
Lisboa (greater city):Population on the 1st of January, total1790389.01784236.01857112.01863069.01860256.01849472.01837852.01835785.01835894.01842352.0
Birmingham:Population on the 1st of January, total1019200.01028700.01055600.01073045.01079900.01088900.01096800.01106334.01117938.01132600.0
Greater Glasgow:Population on the 1st of January, totalNaNNaNNaNNaNNaNNaNNaNNaN986575.0996545.0
\n", 475 | "
" 476 | ], 477 | "text/plain": [ 478 | "TIME 2008 2009 \\\n", 479 | "key \n", 480 | "United Kingdom:Population on the 1st of January... NaN NaN \n", 481 | "Portugal:Population on the 1st of January, total 10627250.0 10637713.0 \n", 482 | "London (greater city):Population on the 1st of ... 7668300.0 7753600.0 \n", 483 | "Slovakia:Population on the 1st of January, total 5412254.0 5424925.0 \n", 484 | "Greater Manchester:Population on the 1st of Jan... NaN NaN \n", 485 | "West Midlands urban area:Population on the 1st ... NaN NaN \n", 486 | "Latvia:Population on the 1st of January, total 2270894.0 2261294.0 \n", 487 | "Lisboa (greater city):Population on the 1st of ... 1790389.0 1784236.0 \n", 488 | "Birmingham:Population on the 1st of January, total 1019200.0 1028700.0 \n", 489 | "Greater Glasgow:Population on the 1st of Januar... NaN NaN \n", 490 | "\n", 491 | "TIME 2010 2011 \\\n", 492 | "key \n", 493 | "United Kingdom:Population on the 1st of January... NaN NaN \n", 494 | "Portugal:Population on the 1st of January, total 10573479.0 10572721.0 \n", 495 | "London (greater city):Population on the 1st of ... 8002000.0 8173941.0 \n", 496 | "Slovakia:Population on the 1st of January, total 5435273.0 5397036.0 \n", 497 | "Greater Manchester:Population on the 1st of Jan... 2650800.0 2682528.0 \n", 498 | "West Midlands urban area:Population on the 1st ... 2390000.0 2419500.0 \n", 499 | "Latvia:Population on the 1st of January, total NaN 2070371.0 \n", 500 | "Lisboa (greater city):Population on the 1st of ... 1857112.0 1863069.0 \n", 501 | "Birmingham:Population on the 1st of January, total 1055600.0 1073045.0 \n", 502 | "Greater Glasgow:Population on the 1st of Januar... NaN NaN \n", 503 | "\n", 504 | "TIME 2012 2013 \\\n", 505 | "key \n", 506 | "United Kingdom:Population on the 1st of January... NaN NaN \n", 507 | "Portugal:Population on the 1st of January, total 10542398.0 10487289.0 \n", 508 | "London (greater city):Population on the 1st of ... 
8256400.0 8362500.0 \n", 509 | "Slovakia:Population on the 1st of January, total 5404322.0 5410836.0 \n", 510 | "Greater Manchester:Population on the 1st of Jan... 2693800.0 2708600.0 \n", 511 | "West Midlands urban area:Population on the 1st ... 2431200.0 2446600.0 \n", 512 | "Latvia:Population on the 1st of January, total 2044813.0 2023825.0 \n", 513 | "Lisboa (greater city):Population on the 1st of ... 1860256.0 1849472.0 \n", 514 | "Birmingham:Population on the 1st of January, total 1079900.0 1088900.0 \n", 515 | "Greater Glasgow:Population on the 1st of Januar... NaN NaN \n", 516 | "\n", 517 | "TIME 2014 2015 \\\n", 518 | "key \n", 519 | "United Kingdom:Population on the 1st of January... NaN 64853393.0 \n", 520 | "Portugal:Population on the 1st of January, total 10427301.0 10374822.0 \n", 521 | "London (greater city):Population on the 1st of ... 8477600.0 8606201.0 \n", 522 | "Slovakia:Population on the 1st of January, total 5415949.0 5421349.0 \n", 523 | "Greater Manchester:Population on the 1st of Jan... 2723900.0 2744508.0 \n", 524 | "West Midlands urban area:Population on the 1st ... 2462300.0 2479550.0 \n", 525 | "Latvia:Population on the 1st of January, total 2001468.0 1986096.0 \n", 526 | "Lisboa (greater city):Population on the 1st of ... 1837852.0 1835785.0 \n", 527 | "Birmingham:Population on the 1st of January, total 1096800.0 1106334.0 \n", 528 | "Greater Glasgow:Population on the 1st of Januar... NaN NaN \n", 529 | "\n", 530 | "TIME 2016 2017 \n", 531 | "key \n", 532 | "United Kingdom:Population on the 1st of January... 65379044.0 65844142.0 \n", 533 | "Portugal:Population on the 1st of January, total 10341330.0 10309573.0 \n", 534 | "London (greater city):Population on the 1st of ... 8730803.0 8797330.0 \n", 535 | "Slovakia:Population on the 1st of January, total 5426252.0 5435343.0 \n", 536 | "Greater Manchester:Population on the 1st of Jan... 2769152.0 2789822.0 \n", 537 | "West Midlands urban area:Population on the 1st ... 
2500093.0 2527245.0 \n", 538 | "Latvia:Population on the 1st of January, total 1968957.0 1950116.0 \n", 539 | "Lisboa (greater city):Population on the 1st of ... 1835894.0 1842352.0 \n", 540 | "Birmingham:Population on the 1st of January, total 1117938.0 1132600.0 \n", 541 | "Greater Glasgow:Population on the 1st of Januar... 986575.0 996545.0 " 542 | ] 543 | }, 544 | "execution_count": 3, 545 | "metadata": {}, 546 | "output_type": "execute_result" 547 | } 548 | ], 549 | "source": [ 550 | "# Show biggest cities in 2017\n", 551 | "\n", 552 | "# just 'total' would also give us \"Population on the 1st of January, 0-4 years, total\" items\n", 553 | "key_filter = 'January, total'\n", 554 | "\n", 555 | "cities.filter(like=key_filter,axis=0).sort_values(by=[2017], ascending=False).head(10)" 556 | ] 557 | }, 558 | { 559 | "cell_type": "code", 560 | "execution_count": 4, 561 | "metadata": {}, 562 | "outputs": [ 563 | { 564 | "data": { 565 | "text/html": [ 566 | "
\n", 567 | "\n", 580 | "\n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " 
\n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | "
TIME2008200920102011201220132014201520162017growth
key
Bournemouth:Population on the 1st of January, total164600.0NaN176800.0183491.0185100.0187700.0190100.0268124.5271606.0269698.053.623303
Oulu:Population on the 1st of January, total131585.0133550.0139133.0141671.0143909.0190847.0193798.0196291.0198525.0NaN42.687213
Derry & Strabane Local Government District:Population on the 1st of January, total109100.0109600.0108400.0107877.0108400.0108600.0108900.0149336.0149808.0150320.038.199262
Southampton:Population on the 1st of January, total234100.0NaN231600.0236882.0237600.0240800.0243700.0311890.0316571.5316379.036.688903
Blackpool:Population on the 1st of January, total140600.0NaN142700.0142065.0142000.0141700.0141000.0194661.5194388.5195034.036.221794
Valencia:Population on the 1st of January, total807200.0814208.0809267.0792054.0797028.0792303.0786424.01085048.51089284.5NaN34.601374
Granada:Population on the 1st of January, totalNaNNaN239154.0241003.0239017.0237818.0237540.0317253.5317160.0NaN32.617477
Pamplona/Iru�a:Population on the 1st of January, total197275.0198491.0197488.0195943.0197604.0196955.0196166.0257629.0257984.0NaN30.632747
Milano (greater city):Population on the 1st of January, totalNaNNaN3154102.03854555.03875801.03925767.04038864.04061382.04074585.0NaN29.183679
Stoke-on-trent:Population on the 1st of January, total239300.0238900.0246600.0249008.0249300.0250100.0250600.0314612.0316315.0318791.028.270479
\n", 754 | "
" 755 | ], 756 | "text/plain": [ 757 | "TIME 2008 2009 \\\n", 758 | "key \n", 759 | "Bournemouth:Population on the 1st of January, t... 164600.0 NaN \n", 760 | "Oulu:Population on the 1st of January, total 131585.0 133550.0 \n", 761 | "Derry & Strabane Local Government District:Popu... 109100.0 109600.0 \n", 762 | "Southampton:Population on the 1st of January, t... 234100.0 NaN \n", 763 | "Blackpool:Population on the 1st of January, total 140600.0 NaN \n", 764 | "Valencia:Population on the 1st of January, total 807200.0 814208.0 \n", 765 | "Granada:Population on the 1st of January, total NaN NaN \n", 766 | "Pamplona/Iru�a:Population on the 1st of January... 197275.0 198491.0 \n", 767 | "Milano (greater city):Population on the 1st of ... NaN NaN \n", 768 | "Stoke-on-trent:Population on the 1st of January... 239300.0 238900.0 \n", 769 | "\n", 770 | "TIME 2010 2011 \\\n", 771 | "key \n", 772 | "Bournemouth:Population on the 1st of January, t... 176800.0 183491.0 \n", 773 | "Oulu:Population on the 1st of January, total 139133.0 141671.0 \n", 774 | "Derry & Strabane Local Government District:Popu... 108400.0 107877.0 \n", 775 | "Southampton:Population on the 1st of January, t... 231600.0 236882.0 \n", 776 | "Blackpool:Population on the 1st of January, total 142700.0 142065.0 \n", 777 | "Valencia:Population on the 1st of January, total 809267.0 792054.0 \n", 778 | "Granada:Population on the 1st of January, total 239154.0 241003.0 \n", 779 | "Pamplona/Iru�a:Population on the 1st of January... 197488.0 195943.0 \n", 780 | "Milano (greater city):Population on the 1st of ... 3154102.0 3854555.0 \n", 781 | "Stoke-on-trent:Population on the 1st of January... 246600.0 249008.0 \n", 782 | "\n", 783 | "TIME 2012 2013 \\\n", 784 | "key \n", 785 | "Bournemouth:Population on the 1st of January, t... 185100.0 187700.0 \n", 786 | "Oulu:Population on the 1st of January, total 143909.0 190847.0 \n", 787 | "Derry & Strabane Local Government District:Popu... 
108400.0 108600.0 \n", 788 | "Southampton:Population on the 1st of January, t... 237600.0 240800.0 \n", 789 | "Blackpool:Population on the 1st of January, total 142000.0 141700.0 \n", 790 | "Valencia:Population on the 1st of January, total 797028.0 792303.0 \n", 791 | "Granada:Population on the 1st of January, total 239017.0 237818.0 \n", 792 | "Pamplona/Iru�a:Population on the 1st of January... 197604.0 196955.0 \n", 793 | "Milano (greater city):Population on the 1st of ... 3875801.0 3925767.0 \n", 794 | "Stoke-on-trent:Population on the 1st of January... 249300.0 250100.0 \n", 795 | "\n", 796 | "TIME 2014 2015 \\\n", 797 | "key \n", 798 | "Bournemouth:Population on the 1st of January, t... 190100.0 268124.5 \n", 799 | "Oulu:Population on the 1st of January, total 193798.0 196291.0 \n", 800 | "Derry & Strabane Local Government District:Popu... 108900.0 149336.0 \n", 801 | "Southampton:Population on the 1st of January, t... 243700.0 311890.0 \n", 802 | "Blackpool:Population on the 1st of January, total 141000.0 194661.5 \n", 803 | "Valencia:Population on the 1st of January, total 786424.0 1085048.5 \n", 804 | "Granada:Population on the 1st of January, total 237540.0 317253.5 \n", 805 | "Pamplona/Iru�a:Population on the 1st of January... 196166.0 257629.0 \n", 806 | "Milano (greater city):Population on the 1st of ... 4038864.0 4061382.0 \n", 807 | "Stoke-on-trent:Population on the 1st of January... 250600.0 314612.0 \n", 808 | "\n", 809 | "TIME 2016 2017 \\\n", 810 | "key \n", 811 | "Bournemouth:Population on the 1st of January, t... 271606.0 269698.0 \n", 812 | "Oulu:Population on the 1st of January, total 198525.0 NaN \n", 813 | "Derry & Strabane Local Government District:Popu... 149808.0 150320.0 \n", 814 | "Southampton:Population on the 1st of January, t... 
316571.5 316379.0 \n", 815 | "Blackpool:Population on the 1st of January, total 194388.5 195034.0 \n", 816 | "Valencia:Population on the 1st of January, total 1089284.5 NaN \n", 817 | "Granada:Population on the 1st of January, total 317160.0 NaN \n", 818 | "Pamplona/Iru�a:Population on the 1st of January... 257984.0 NaN \n", 819 | "Milano (greater city):Population on the 1st of ... 4074585.0 NaN \n", 820 | "Stoke-on-trent:Population on the 1st of January... 316315.0 318791.0 \n", 821 | "\n", 822 | "TIME growth \n", 823 | "key \n", 824 | "Bournemouth:Population on the 1st of January, t... 53.623303 \n", 825 | "Oulu:Population on the 1st of January, total 42.687213 \n", 826 | "Derry & Strabane Local Government District:Popu... 38.199262 \n", 827 | "Southampton:Population on the 1st of January, t... 36.688903 \n", 828 | "Blackpool:Population on the 1st of January, total 36.221794 \n", 829 | "Valencia:Population on the 1st of January, total 34.601374 \n", 830 | "Granada:Population on the 1st of January, total 32.617477 \n", 831 | "Pamplona/Iru�a:Population on the 1st of January... 30.632747 \n", 832 | "Milano (greater city):Population on the 1st of ... 29.183679 \n", 833 | "Stoke-on-trent:Population on the 1st of January... 
28.270479 " 834 | ] 835 | }, 836 | "execution_count": 4, 837 | "metadata": {}, 838 | "output_type": "execute_result" 839 | } 840 | ], 841 | "source": [ 842 | "# add a growth % 2010 - 2016 column \n", 843 | "cities[\"growth\"] = (cities[2016] / cities[2010] - 1) * 100\n", 844 | "cities.filter(like=key_filter,axis=0).sort_values(by=[\"growth\"], ascending=False).head(10)" 845 | ] 846 | }, 847 | { 848 | "cell_type": "code", 849 | "execution_count": null, 850 | "metadata": {}, 851 | "outputs": [], 852 | "source": [] 853 | } 854 | ], 855 | "metadata": { 856 | "kernelspec": { 857 | "display_name": "Python 3", 858 | "language": "python", 859 | "name": "python3" 860 | }, 861 | "language_info": { 862 | "codemirror_mode": { 863 | "name": "ipython", 864 | "version": 3 865 | }, 866 | "file_extension": ".py", 867 | "mimetype": "text/x-python", 868 | "name": "python", 869 | "nbconvert_exporter": "python", 870 | "pygments_lexer": "ipython3", 871 | "version": "3.6.6" 872 | } 873 | }, 874 | "nbformat": 4, 875 | "nbformat_minor": 2 876 | } 877 | -------------------------------------------------------------------------------- /src/main/java/test_dataframes/CheckResult.java: -------------------------------------------------------------------------------- 1 | package test_dataframes; 2 | 3 | import java.util.List; 4 | 5 | import com.google.common.base.Verify; 6 | 7 | public class CheckResult { 8 | public static void checkResult(List highestGrowth) { 9 | String[] expected = { "Bournemouth", "Oulu", "Derry & Strabane", "Southampton", "Blackpool", 10 | "Valencia", "Granada" }; 11 | Verify.verify(highestGrowth.size() >= expected.length, "Provide at least %s items, got %s", 12 | expected.length, highestGrowth.size()); 13 | 14 | for (int i = 0; i < expected.length; i++) { 15 | Verify.verify(((String) highestGrowth.get(i)).startsWith(expected[i]), 16 | "Expected item %s to start with %s, but was %s", i, expected[i], 17 | highestGrowth.get(i)); 18 | } 19 | } 20 | } 21 | 
-------------------------------------------------------------------------------- /src/main/java/test_dataframes/TestDFLib.java: -------------------------------------------------------------------------------- 1 | package test_dataframes; 2 | 3 | import com.google.common.base.Stopwatch; 4 | import org.apache.commons.csv.CSVFormat; 5 | import org.dflib.DataFrame; 6 | import org.dflib.Printers; 7 | import org.dflib.ValueMapper; 8 | import org.dflib.csv.Csv; 9 | import org.dflib.print.Printer; 10 | 11 | import static org.dflib.Exp.*; 12 | 13 | /** 14 | * Test the API of DFLib to do some basic dataframe manipulations. 15 | *

16 | * https://github.com/dflib/dflib 17 | *

18 | * See https://medium.com/@thijser/doing-cool-data-science-in-java-how-3-dataframe-libraries-stack-up-5e6ccb7b437 19 | * for more information. 20 | */ 21 | public class TestDFLib { 22 | public static void main(String[] args) { 23 | 24 | Printer printer = Printers.tabular(10, 100); 25 | 26 | DataFrame data = Csv.loader() 27 | .format(CSVFormat.DEFAULT.builder().setNullString(":").build()) 28 | .col("Value", ValueMapper.stringToInt()) 29 | .load("urb_cpop1_1_Data.csv"); 30 | System.out.println(printer.toString(data)); 31 | 32 | Stopwatch watch = Stopwatch.createStarted(); 33 | DataFrame filtered = data.rows($col("Value").isNotNull()).select(); 34 | 35 | DataFrame cities = filtered.group("CITIES", "INDIC_UR", "TIME") 36 | .cols("CITIES", "INDIC_UR", "TIME", "Mean [Value]") 37 | .agg($col("CITIES"), $col("INDIC_UR"), $col("TIME"), $int("Value").avg().castAsInt()); 38 | 39 | System.out.println(printer.toString(cities)); 40 | 41 | // Need to transpose/pivot now too 42 | DataFrame finalTable = cities 43 | .cols("key").merge(concat($str("CITIES"), ":", $str("INDIC_UR"))) 44 | .pivot().rows("key").cols("TIME").vals("Mean [Value]"); 45 | 46 | // sortDescendingOn puts N/A values first unfortunately, so let's remove them 47 | // before determining and printing. 
48 | DataFrame existing2017 = finalTable 49 | .rowsExcept($int("2017").isNull()).select() 50 | .rows($str("key").endsWith("January, total")).select() 51 | .sort($int("2017").desc()); 52 | System.out.println(printer.toString(existing2017)); 53 | 54 | // Add growth column 55 | 56 | DataFrame finalTable1 = finalTable 57 | .cols("growth").merge($int("2016").castAsDouble().div($int("2010")).sub(1).mul(100)); 58 | 59 | DataFrame highestGrowthTable = finalTable1 60 | .rows($str("key").endsWith("January, total")).select() 61 | .rowsExcept($col("growth").isNull()).select() 62 | .sort($double("growth").desc()); 63 | 64 | System.out.println(printer.toString(highestGrowthTable)); 65 | CheckResult.checkResult(highestGrowthTable.getColumn("key").toList()); 66 | 67 | System.out.println("Total time: " + watch); 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /src/main/java/test_dataframes/TestDatavec.java: -------------------------------------------------------------------------------- 1 | package test_dataframes; 2 | 3 | import static org.datavec.api.transform.condition.ConditionOp.Equal; 4 | 5 | import java.io.File; 6 | import java.util.ArrayList; 7 | import java.util.List; 8 | 9 | import org.datavec.api.records.reader.RecordReader; 10 | import org.datavec.api.records.reader.impl.csv.CSVRecordReader; 11 | import org.datavec.api.split.FileSplit; 12 | import org.datavec.api.transform.ReduceOp; 13 | import org.datavec.api.transform.TransformProcess; 14 | import org.datavec.api.transform.condition.column.StringColumnCondition; 15 | import org.datavec.api.transform.reduce.Reducer; 16 | import org.datavec.api.transform.schema.Schema; 17 | import org.datavec.api.writable.IntWritable; 18 | import org.datavec.api.writable.Writable; 19 | import org.datavec.local.transforms.LocalTransformExecutor; 20 | 21 | /** 22 | * Test the API of DataVec to do some basic dataframe manipulations (unfinished). 
23 | * 24 | * https://deeplearning4j.org/docs/latest/datavec-overview 25 | */ 26 | public class TestDatavec { 27 | public static void main(String[] args) throws Exception { 28 | int numLinesToSkip = 1; 29 | char delimiter = ','; 30 | RecordReader recordReader = new CSVRecordReader(numLinesToSkip,delimiter); 31 | recordReader.initialize(new FileSplit(new File("urb_cpop1_1_Data.csv"))); 32 | 33 | // It seems we need to know in advance what the fields and their order 34 | // are here... 35 | Schema csvSchema = new Schema.Builder() 36 | .addColumnInteger("TIME") 37 | .addColumnsString("CITIES", "INDIC_UR","Value","Flag and Footnotes") 38 | .build(); 39 | 40 | TransformProcess tp = new TransformProcess.Builder(csvSchema) 41 | .conditionalReplaceValueTransform("Value", new IntWritable(0), new StringColumnCondition("Value", Equal, ":")) 42 | .convertToInteger("Value") 43 | .reduce(new Reducer.Builder(ReduceOp.TakeLast) 44 | .keyColumns("CITIES", "INDIC_UR", "TIME") 45 | .meanColumns("Value") 46 | .build()) 47 | // Here we also need to know in advance the range of items 48 | .integerToOneHot("TIME", 2008, 2017) 49 | 50 | // Now we have one-hot encoded years, with the Value column separately. 51 | // We would have to either do a conditionalCopyValueTransform Value -> year column 52 | // for every year separately, or we probably have to modify integerToOneHot 53 | // to copy our Value column instead of 1-hot to make the proper pivot. 54 | // IntegerToOneHotTransform is > 200 lines, so it's not trivial to create 55 | // such a custom transform. 
56 | 57 | .build(); 58 | 59 | 60 | List> csvData = new ArrayList<>(); 61 | while(recordReader.hasNext()) { 62 | csvData.add(recordReader.next()); 63 | } 64 | printHead(csvData, csvSchema); 65 | 66 | List> transformedData = LocalTransformExecutor.execute(csvData, tp); 67 | 68 | printHead(transformedData, tp.getFinalSchema()); 69 | } 70 | 71 | private static void printHead(List> data, Schema schema) { 72 | for (int j = 0; j < schema.getColumnNames().size(); j++) { 73 | System.out.printf("%20s", schema.getColumnNames().get(j)); 74 | } 75 | System.out.println(); 76 | for (int i = 0; i < Math.min(10, data.size()); i++) { 77 | List row = data.get(i); 78 | for (int j = 0; j < row.size(); j++) { 79 | System.out.printf("%20s", row.get(j).toString()); 80 | } 81 | System.out.println(); 82 | } 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /src/main/java/test_dataframes/TestDuckDb.kt: -------------------------------------------------------------------------------- 1 | package test_dataframes 2 | 3 | import com.google.common.base.Stopwatch 4 | import tech.tablesaw.api.Table 5 | import java.sql.DriverManager 6 | 7 | 8 | /** 9 | * Test duckdb to do some basic dataframe manipulations. 10 | * 11 | * See https://medium.com/@thijser/doing-cool-data-science-in-java-how-3-dataframe-libraries-stack-up-5e6ccb7b437 12 | * for more information. 
13 | */ 14 | fun main() { 15 | val conn = DriverManager.getConnection("jdbc:duckdb:") 16 | val stmt = conn.createStatement() 17 | var rs = stmt.executeQuery("SELECT * FROM 'urb_cpop1_1_Data.csv'") 18 | Table.read().db(rs).print().also { println(it) } 19 | 20 | val watch = Stopwatch.createStarted() 21 | stmt.execute( 22 | """ 23 | CREATE TEMP TABLE t1 AS ( 24 | WITH cities AS ( 25 | SELECT CITIES || ':' || INDIC_UR as key, 26 | CAST(Value AS INTEGER) as Value, 27 | * EXCLUDE (CITIES, INDIC_UR, Value) 28 | FROM 'urb_cpop1_1_Data.csv' WHERE Value != ':'), 29 | pivot_table AS ( 30 | PIVOT cities 31 | ON TIME 32 | USING AVG(Value) 33 | GROUP BY key 34 | ) 35 | SELECT *, ("2016"::REAL / "2010"::REAL - 1.0 ) * 100.0 as growth 36 | FROM pivot_table 37 | WHERE suffix(key, 'January, total') 38 | ORDER BY growth DESC 39 | ) 40 | """ 41 | ) 42 | rs = stmt.executeQuery("SELECT * FROM t1") 43 | Table.read().db(rs).print().also { println(it) } 44 | val result = stmt.executeQuery("SELECT key FROM t1").use { r -> 45 | mutableListOf().apply { 46 | while (r.next()) { 47 | this += r.getString("key") 48 | } 49 | } 50 | } 51 | CheckResult.checkResult(result) 52 | println("Total time: $watch") 53 | } -------------------------------------------------------------------------------- /src/main/java/test_dataframes/TestJoinery.java: -------------------------------------------------------------------------------- 1 | package test_dataframes; 2 | 3 | import java.util.Collections; 4 | import java.util.List; 5 | import java.util.function.Consumer; 6 | 7 | import joinery.DataFrame; 8 | import joinery.DataFrame.KeyFunction; 9 | import joinery.DataFrame.RowFunction; 10 | import joinery.impl.Aggregation.Mean; 11 | 12 | import com.google.common.base.Stopwatch; 13 | import com.google.common.collect.Iterables; 14 | import com.google.common.collect.Lists; 15 | 16 | /** 17 | * Test the API of joinery to do some basic dataframe manipulations. 
18 | * 19 | * https://github.com/cardillo/joinery 20 | * 21 | * See https://medium.com/@thijser/doing-cool-data-science-in-java-how-3-dataframe-libraries-stack-up-5e6ccb7b437 22 | * for more information. 23 | */ 24 | public class TestJoinery { 25 | public static void main(String[] args) throws Exception { 26 | DataFrame frame = DataFrame.readCsv("urb_cpop1_1_Data.csv"); 27 | System.out.println(frame.head(5)); 28 | 29 | Stopwatch watch = Stopwatch.createStarted(); 30 | 31 | // Remove ":" values from the value column 32 | int valueColIndex = Iterables.indexOf(frame.columns(), name -> name.equals("Value")); 33 | frame = frame.transform(new RowFunction() { 34 | @Override 35 | public List> apply(List values) { 36 | if (":".equals(values.get(valueColIndex))) { 37 | values = Lists.newArrayList(values); 38 | values.set(valueColIndex, null); 39 | } 40 | return Collections.singletonList(values); 41 | } 42 | }); 43 | // This will make the Value column type "Long" now: 44 | frame.convert(); 45 | 46 | // Pivot the table to get year into columns. Would be nice if joinery provides 47 | // a better method to get the index of a column name without having to resort to a 48 | // guava function, e.g. frame.columnIndex("CITIES") 49 | int citiesIndex = Iterables.indexOf(frame.columns(), name -> name.equals("CITIES")); 50 | int typeIndex = Iterables.indexOf(frame.columns(), name -> name.equals("INDIC_UR")); 51 | int timeIndex = Iterables.indexOf(frame.columns(), name -> name.equals("TIME")); 52 | 53 | DataFrame numberFrame = frame.pivot( 54 | (KeyFunction)(values -> values.get(citiesIndex) + " - " + values.get(typeIndex)), 55 | values -> values.get(timeIndex), 56 | Collections.singletonMap(valueColIndex, 57 | new Mean<>()) 58 | ); 59 | 60 | // Print top 20 items in 2017 61 | // sortBy supports something like "-2017" but this doesn't work because the 62 | // column names are of type Long and not String 63 | // Can use the int based indexing though. 
64 | int _2017index = Iterables.indexOf(numberFrame.columns(), colName -> ((Number)colName).intValue() == 2017); 65 | // Remove NaN values created by mean() function 66 | // The key names are now part of the index and not cells anymore, that makes it a bit more tricky to filter, 67 | // as the Predicate of the select() function doesn't get the row name. 68 | System.out.println(filterTotalKeys(numberFrame).select( 69 | row -> !Double.isNaN((Double) row.get(_2017index))).sortBy(-_2017index).head(10)); 70 | 71 | // Add growth column 72 | int _2010index = Iterables.indexOf(numberFrame.columns(), colName -> ((Number)colName).intValue() == 2010); 73 | int _2016index = Iterables.indexOf(numberFrame.columns(), colName -> ((Number)colName).intValue() == 2016); 74 | 75 | // Great API to add a new calculated column: 76 | numberFrame = numberFrame.add("growth", 77 | row -> (row.get(_2016index).doubleValue() / row.get(_2010index).doubleValue() - 1) 78 | * 100); 79 | 80 | int growthIndex = Iterables.indexOf(numberFrame.columns(), colName -> "growth".equals(colName)); 81 | DataFrame highestGrowthFrame = filterTotalKeys(numberFrame). 82 | select(row -> !Double.isNaN((Double) row.get(growthIndex))). 83 | sortBy("-growth"); 84 | System.out.println(highestGrowthFrame.head(10)); 85 | CheckResult.checkResult(Lists.newArrayList(highestGrowthFrame.index())); 86 | 87 | System.out.println("Total time: " + watch); 88 | } 89 | 90 | private static DataFrame filterTotalKeys(DataFrame df) { 91 | DataFrame result = new DataFrame<>(df.columns()); 92 | // I can't find a function like "getRowName(int rowNumber)". 93 | // Iterating over df.index() is also not possible because there's also 94 | // no function like List getRow(int rowNumber), we need to use 95 | // the forEach function. 
96 | List rowNames = Lists.newArrayList(df.index()); 97 | 98 | df.forEach(new Consumer>() { 99 | int index = 0; 100 | 101 | @Override 102 | public void accept(List row) { 103 | String rowName = (String) rowNames.get(index); 104 | if (rowName.endsWith("January, total")) { 105 | result.append(rowName, row); 106 | } 107 | index++; 108 | } 109 | }); 110 | return result; 111 | } 112 | } 113 | -------------------------------------------------------------------------------- /src/main/java/test_dataframes/TestKotlinDataFrame.kt: -------------------------------------------------------------------------------- 1 | package test_dataframes 2 | 3 | import com.google.common.base.Stopwatch 4 | import org.jetbrains.kotlinx.dataframe.DataFrame 5 | import org.jetbrains.kotlinx.dataframe.io.* 6 | import org.jetbrains.kotlinx.dataframe.api.* 7 | /** 8 | * Test the API of Kotlin Dataframes to do some basic dataframe manipulations. 9 | * 10 | * See https://medium.com/@thijser/doing-cool-data-science-in-java-how-3-dataframe-libraries-stack-up-5e6ccb7b437 11 | * for more information. 
12 | */
fun main() {
    // NOTE(review): the type arguments of the string column accessors ("name"<T>()) were missing
    // from the listing and have been reconstructed from how each value is used - confirm.
    val keyColumn = "key"
    val df = DataFrame.readCsv(fileOrUrl = "urb_cpop1_1_Data.csv", delimiter = ',')
    df.print()
    val watch = Stopwatch.createStarted()
    // remove missing values indicated with ":", add the combined key, convert Value to Int
    val filtered = df.filter { "Value"<String>() != ":" }
        .add(keyColumn) { "CITIES"<String>() + ":" + "INDIC_UR"<String>() }
        .convert { "Value"<String>() }.toInt()

    // one row per key, one column per TIME value, duplicates replaced by their mean
    var cities = filtered.groupBy(keyColumn).pivot("TIME", inward = false).mean { "Value"<Int>() }
    cities.print()

    cities = cities.filter { keyColumn<String>().endsWith("January, total") }.sortByDesc("2017")
    cities.print()

    // growth from 2010 to 2016 in percent; rows lacking either year are dropped first
    val highestGrowthTable =
        cities.filter { "2010"<Double?>() != null && "2016"<Double?>() != null }
            .add("growth") { ("2016"<Double>() / "2010"<Double>() - 1.0) * 100.0 }
            .sortByDesc("growth")
    highestGrowthTable.print()

    CheckResult.checkResult(highestGrowthTable[keyColumn].toList())
    println("Total time: $watch")
}
-------------------------------------------------------------------------------- /src/main/java/test_dataframes/TestKrangl.kt: -------------------------------------------------------------------------------- 1 | package test_dataframes 2 | 3 | import com.google.common.base.Stopwatch 4 | import krangl.* 5 | 6 | /** 7 | * Test the API of krangl to do some basic dataframe manipulations. 8 | * 9 | * https://github.com/holgerbrandl/krangl 10 | * 11 | * See https://medium.com/@thijser/doing-cool-data-science-in-java-how-3-dataframe-libraries-stack-up-5e6ccb7b437 12 | * for more information.
13 | */
fun main() {
    // NOTE(review): the type arguments on isMatching/asType were missing from the listing and
    // have been reconstructed as String from the surrounding usage - confirm.
    val data = DataFrame.readCSV("urb_cpop1_1_Data.csv")

    val watch = Stopwatch.createStarted()
    // remove missing values indicated with ":", convert column to IntCol
    val withoutMissing = data.filter { !(it["Value"] eq ":") }
    val filtered = withoutMissing.addColumn("Value") {
        it["Value"].map(String::toInt)
    }

    // replace duplicated rows with mean value, create pivot table
    val averaged = filtered.groupBy("CITIES", "INDIC_UR", "TIME")
        .summarize("Value" to { it["Value"].mean() })
    val cities = averaged.spread("TIME", "Value").filter {
        it["INDIC_UR"].isMatching<String> { endsWith("January, total") }
    }

    println(cities.select("CITIES", "2017").sortedByDescending("2017").head(10))

    // growth from 2010 to 2016 in percent, highest first
    val highestGrowthTable = cities.addColumn("growth") {
        (it["2016"] / it["2010"] - 1.0) * 100.0
    }.sortedByDescending("growth")

    println(highestGrowthTable.select("CITIES", "growth").head(10))

    CheckResult.checkResult(highestGrowthTable["CITIES"].asType<String>().toList())
    println("Total time: $watch")
}
-------------------------------------------------------------------------------- /src/main/java/test_dataframes/TestMorpheus.java: -------------------------------------------------------------------------------- 1 | package test_dataframes; 2 | 3 | import com.google.common.base.Stopwatch; 4 | import com.google.common.collect.Lists; 5 | import com.google.common.primitives.Doubles; 6 | import com.zavtech.morpheus.frame.DataFrame; 7 | import com.zavtech.morpheus.frame.DataFrameRow; 8 | import com.zavtech.morpheus.util.Tuple; 9 | 10 | /** 11 | * Test the API of Morpheus to do some basic dataframe manipulations. 12 | * 13 | * https://github.com/zavtech/morpheus-core 14 | * 15 | * See https://medium.com/@thijser/doing-cool-data-science-in-java-how-3-dataframe-libraries-stack-up-5e6ccb7b437 16 | * for more information.
17 | */
public class TestMorpheus {
    public static void main(String[] args) {
        // NOTE(review): the generic type arguments in this class were missing from the listing.
        // <Integer, String> for the CSV frame and <Object, String> for the pivoted frame are
        // reconstructed from the usage below (row keys are cast to String) - confirm.
        DataFrame<Integer, String> frame = DataFrame.read().csv("urb_cpop1_1_Data.csv");
        frame.out().print(5);
        System.out.println();

        Stopwatch watch = Stopwatch.createStarted();

        // Keep the textual column as "ValueStr" and derive an Integer "Value" column from it,
        // mapping the ":" placeholder to null.
        frame.cols().replaceKey("Value", "ValueStr");
        frame.cols().add("Value", Integer.class, value -> {
            String raw = value.row().getValue("ValueStr");
            return ":".equals(raw) ? null : Integer.parseInt(raw);
        });

        // Build the pivot by hand: one row per "city - indicator", one column per year,
        // each cell holding the mean of the grouped values.
        DataFrame<Object, String> pivoted = DataFrame.empty();
        frame.rows().select(row -> row.getInt("Value") > 0).rows().groupBy(
            row -> Tuple.of(row.getValue("CITIES") + " - " + row.getValue("INDIC_UR"),
                row.getValue("TIME"))).forEach(1, (tuple, groupedFrame) -> {
                    pivoted.rows().add(tuple.item(0));
                    DataFrameRow<Object, String> pivotRow = pivoted.row(tuple.item(0));

                    int groupSize = groupedFrame.rowCount();
                    int sum = 0;
                    for (int i = 0; i < groupSize; i++) {
                        sum += groupedFrame.col("Value").getInt(i);
                    }

                    // All rows of a group share the same TIME, so the first row names the column.
                    String yearStr = Integer.toString(groupedFrame.col("TIME").getInt(0));
                    pivoted.cols().add(yearStr, Double.class);
                    pivotRow.setDouble(yearStr, sum / (double) groupSize);
                });

        // Print top 10 items in 2017
        pivoted.rows().sort(false, "2017");

        pivoted.rows().select(
            row -> ((String) row.key()).endsWith("January, total")).
            out().print(10);
        System.out.println();

        // Add growth column; a non-finite result (NaN or infinite) is reported as 0.0
        pivoted.cols().add("growth", Double.class, value -> {
            double from = value.row().getDouble("2010");
            double to = value.row().getDouble("2016");
            double growth = (to / from - 1) * 100.0;
            return Doubles.isFinite(growth) ? growth : 0.0;
        });
        DataFrame<Object, String> output = pivoted.rows().sort(false, "growth").rows().select(
            row -> ((String) row.key()).endsWith("January, total"));
        output.out().print(10);
        System.out.println();
        CheckResult.checkResult(Lists.newArrayList(output.rows().keyArray()));

        System.out.println("Total time: " + watch);
    }
}
77 | -------------------------------------------------------------------------------- /src/main/java/test_dataframes/TestTablesaw.java: -------------------------------------------------------------------------------- 1 | package test_dataframes; 2 | 3 | import static tech.tablesaw.aggregate.AggregateFunctions.mean; 4 | 5 | import tech.tablesaw.api.DoubleColumn; 6 | import tech.tablesaw.api.StringColumn; 7 | import tech.tablesaw.api.Table; 8 | import tech.tablesaw.io.csv.CsvReadOptions; 9 | 10 | import com.google.common.base.Stopwatch; 11 | 12 | /** 13 | * Test the API of tablesaw to do some basic dataframe manipulations. 14 | * 15 | * https://github.com/jtablesaw/tablesaw 16 | * 17 | * See https://medium.com/@thijser/doing-cool-data-science-in-java-how-3-dataframe-libraries-stack-up-5e6ccb7b437 18 | * for more information.
19 | */
public class TestTablesaw {
    public static void main(String[] args) {
        // Reading with ":" as the missing value indicator turns those cells into missing values
        CsvReadOptions readOptions =
                CsvReadOptions.builder("urb_cpop1_1_Data.csv").missingValueIndicator(":").build();
        Table data = Table.read().csv(readOptions);
        System.out.println(data.print(5));

        Stopwatch watch = Stopwatch.createStarted();
        Table filtered = data.where(data.column("Value").isNotMissing());

        Table cities = filtered.summarize("Value", mean).by("CITIES", "INDIC_UR", "TIME");
        System.out.println(cities.print(10));

        // The summary is still in long form, so build a combined key and pivot on it
        StringColumn cityNames = filtered.stringColumn("CITIES");
        StringColumn indicators = filtered.stringColumn("INDIC_UR");
        StringColumn key = cityNames.join(":", indicators).setName("key");
        filtered.addColumns(key);
        Table finalTable = filtered.pivot("key", "TIME", "Value", mean);

        // sortDescendingOn puts N/A values first unfortunately, so rows without a 2017
        // value are dropped before sorting and printing.
        Table existing2017 = finalTable.dropWhere(finalTable.column("2017").isMissing());
        Table top2017 = filterTotalKeys(existing2017).sortDescendingOn("2017");
        System.out.println(top2017.print(20));

        // Add growth column: (2016 / 2010 - 1) * 100
        DoubleColumn ratio = finalTable.doubleColumn("2016").divide(finalTable.doubleColumn("2010"));
        DoubleColumn growthColumn = ratio.subtract(1).multiply(100);
        growthColumn.setName("growth");
        finalTable.addColumns(growthColumn);

        Table withGrowth = finalTable.dropWhere(finalTable.column("growth").isMissing());
        Table highestGrowthTable = filterTotalKeys(withGrowth).sortDescendingOn("growth");
        System.out.println(highestGrowthTable.print(20));
        CheckResult.checkResult(highestGrowthTable.column("key").asList());

        System.out.println("Total time: " + watch);
    }

    /** Keeps only the rows whose "key" column ends with "January, total". */
    private static Table filterTotalKeys(Table existing2017) {
        StringColumn keys = existing2017.stringColumn("key");
        return existing2017.where(keys.endsWith("January, total"));
    }
}
63 | --------------------------------------------------------------------------------