├── src └── practicalli │ ├── covid_19.clj │ └── design_journal.clj ├── .gitignore ├── test └── practicalli │ └── covid_19_test.clj ├── deps.edn └── README.md /src/practicalli/covid_19.clj: -------------------------------------------------------------------------------- 1 | (ns practicalli.covid-19 2 | (:require [clojure.java.io :as io] 3 | [clojure.data.csv :as csv])) 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | /classes 3 | /checkouts 4 | *.jar 5 | *.class 6 | /.cpcache 7 | /.lein-* 8 | /.nrepl-history 9 | /.nrepl-port 10 | .hgignore 11 | .hg/ 12 | -------------------------------------------------------------------------------- /test/practicalli/covid_19_test.clj: -------------------------------------------------------------------------------- 1 | (ns practicalli.covid-19-test 2 | (:require [clojure.test :refer [deftest is testing]] 3 | [practicalli.covid-19 :as SUT])) 4 | -------------------------------------------------------------------------------- /deps.edn: -------------------------------------------------------------------------------- 1 | {:paths 2 | ["src" "test" "resources"] 3 | 4 | :deps 5 | {org.clojure/clojure {:mvn/version "1.10.1"} 6 | org.clojure/data.csv {:mvn/version "1.0.0"} 7 | 8 | com.mitchtalmadge/ascii-data {:mvn/version "1.4.0"}} 9 | 10 | :aliases 11 | {:test 12 | {:extra-paths ["test"] 13 | :extra-deps {org.clojure/test.check {:mvn/version "0.10.0"}}} 14 | 15 | :runner 16 | {:extra-deps {com.cognitect/test-runner 17 | {:git/url "https://github.com/cognitect-labs/test-runner" 18 | :sha "f7ef16dc3b8332b0d77bc0274578ad5270fbfedd"}} 19 | :main-opts ["-m" "cognitect.test-runner" 20 | "-d" "test"]} 21 | :jar 22 | {:extra-deps {seancorfield/depstar {:mvn/version "0.5.2"}} 23 | :main-opts ["-m" "hf.depstar.jar" "corvid-19.jar"]} 24 | 25 | :install 26 | {:extra-deps {deps-deploy {:mvn/version 
"0.0.9"}} 27 | :main-opts ["-m" "deps-deploy.deps-deploy" "install" "corvid-19.jar"]} 28 | 29 | :deploy 30 | {:extra-deps {deps-deploy {:mvn/version "0.0.9"}} 31 | :main-opts ["-m" "deps-deploy.deps-deploy" "deploy" "corvid-19.jar"]}}} 32 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # corvid-19 2 | A simple data science project, working with data from [Johns Hopkins University](https://github.com/CSSEGISandData/COVID-19). 3 | 4 | Based on the [Baby steps with Covid-19 data for Clojure programmers](https://dragan.rocks/articles/20/Corona-1-Baby-steps-with-Covid-19-for-programmers) by [dragan.rocks](https://dragan.rocks/articles/20/Corona-1-Baby-steps-with-Covid-19-for-programmers). Please consider sponsoring the excellent work that Dragan Djuric produces. 5 | 6 | ## Usage 7 | Open the `src/practicalli/design_journal.clj` file from the project in your favorite Clojure editor and start the REPL. Starting at the top of the page, evaluate the individual expressions and experiment. 
8 | 9 | ## Deployment 10 | Build a deployable jar of this library: 11 | 12 | $ clojure -A:jar 13 | 14 | Install it locally: 15 | 16 | $ clojure -A:install 17 | 18 | Deploy to Clojars 19 | 20 | $ clojure -A:deploy 21 | 22 | > Configure your clojars.org account detail in the environment variables: 23 | > `CLOJARS_USERNAME` and `CLOJARS_PASSWORD` 24 | 25 | ## License 26 | 27 | Copyright © 2020 Practicalli 28 | 29 | Distributed under the Creative Commons Attribution Share-Alike 4.0 International 30 | -------------------------------------------------------------------------------- /src/practicalli/design_journal.clj: -------------------------------------------------------------------------------- 1 | (ns practicalli.design-journal 2 | (:require [clojure.java.io :as io] 3 | [clojure.data.csv :as csv])) 4 | 5 | 6 | ;; Accessing the data 7 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 8 | 9 | ;; path to data 10 | ;; https://github.com/open-covid-19/data/blob/master/output/world.csv 11 | ;; https://raw.githubusercontent.com/open-covid-19/data/master/output/world.csv 12 | 13 | (def data "https://raw.githubusercontent.com/open-covid-19/data/master/output/world.csv") 14 | 15 | (slurp data) 16 | 17 | 18 | (def data-cache (slurp data)) 19 | 20 | data-cache 21 | 22 | ;; Not a good approach for large data sets: slurp reads the entire content into one string 23 | 24 | 25 | ;; For larger files, using a buffered reader is recommended. 
26 | ;; The Clojure java.io API provides a reader function 27 | ;; https://clojure.github.io/clojure/clojure.java.io-api.html 28 | 29 | ;; Add requires to source code namespace 30 | ;; (:require [clojure.java.io :as io]) 31 | 32 | (require '[clojure.java.io :as io]) 33 | 34 | (io/reader data) 35 | ;; => #object[java.io.BufferedReader 0x2f606d07 "java.io.BufferedReader@2f606d07"] 36 | 37 | ;; Get the data from the reader as a string with slurp 38 | 39 | (slurp (io/reader data)) 40 | 41 | 42 | ;; Save the data file locally 43 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 44 | ;; To save hitting GitHub with multiple calls during development, 45 | ;; copy the file to the `resources` directory. 46 | ;; https://raw.githubusercontent.com/open-covid-19/data/master/output/world.csv 47 | 48 | (def data-local "resources/world.csv") 49 | 50 | (slurp (io/reader data-local)) 51 | 52 | 53 | ;; Alternatively, you can use the resource function, 54 | ;; which returns the location (a URL) of a file on the classpath. 55 | ;; The `resources` directory is included in the classpath 56 | ;; via the `deps.edn` configuration. 57 | ;; `resource` is a nice abstraction to use instead of `reader`, 58 | ;; especially in a web application. 59 | 60 | (io/resource "world.csv") 61 | ;; => #object[java.net.URL 0x3f239a8c "file:/home/practicalli/projects/clojure/data-science/corvid-19/resources/world.csv"] 62 | 63 | 64 | ;; Working with files and directories in Clojure 65 | ;; http://clojure-doc.org/articles/cookbooks/files_and_directories.html 66 | ;; https://www.tutorialspoint.com/clojure/clojure_file_io.htm 67 | 68 | 69 | ;; Parsing CSV data 70 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 71 | ;; clojure.data.csv is a library to transform data between 72 | ;; CSV format and Clojure data structures (hash-maps, vectors, etc) 73 | ;; It contains `read-csv` and `write-csv` functions, that's all. 
74 | 75 | ;; Add dependency to parse CSV files (restart REPL) 76 | ;; https://github.com/clojure/data.csv 77 | ;; org.clojure/data.csv {:mvn/version "1.0.0"} 78 | 79 | ;; see the section on laziness for using this library effectively 80 | ;; https://github.com/clojure/data.csv#laziness 81 | 82 | (require '[clojure.data.csv :as csv]) 83 | 84 | ;; clojure.data.csv/read-csv function will create a lazy sequence 85 | ;; of the data from the CSV file, in a Clojure data structure. 86 | 87 | (csv/read-csv 88 | (slurp (io/reader data-local))) 89 | 90 | ;; as we are calling read-csv in a top level expression, 91 | ;; it becomes eager and we get the result. 92 | 93 | 94 | ;; Helper function for loading CSV files by geographical name 95 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 96 | ;; There are data sets for particular geographical regions, 97 | ;; China, USA and the whole world. 98 | 99 | ;; A helper function will read resources/LOCATION.csv and parse its content with read-csv 100 | 101 | (defn csv->clj [location] 102 | (csv/read-csv 103 | (slurp 104 | (io/reader 105 | (format "resources/%s.csv" location))))) 106 | 107 | 108 | ;; format is used to create the respective filename for the location 109 | (format "resources/%s.csv" "world") 110 | ;; => "resources/world.csv" 111 | 112 | ;; NOTE: original article uses io/resource which does not work locally 113 | 114 | ;; As the data set is not that large, bind it for convenience 115 | 116 | (def covid-world (csv->clj "world")) 117 | #_(def covid-china (csv->clj "china")) 118 | #_(def covid-usa (csv->clj "usa ")) 119 | ;; NOTE(review): the "usa " string above ends with a space; if that form is enabled, format builds the path resources/usa .csv 120 | ;; covid-world 121 | 122 | 123 | ;; Deconstruct the data 124 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 125 | 126 | 127 | ;; Example data 128 | ;; (["Date" "CountryCode" "CountryName" "Confirmed" "Deaths" "Latitude" "Longitude"] 129 | ;; ["2019-12-31" "AE" "United Arab Emirates" "0" "0" "23.424076" "53.847818"] 130 | ;; ["2019-12-31" "AF" "Afghanistan" "0" "0" "33.93911" "67.709953"] 131 | ;; ["2019-12-31" "AM" "Armenia" "0" "0" "40.069099" "45.038189"] 132 | ;; 
["2019-12-31" "AT" "Austria" "0" "0" "47.516231" "14.550072"] 133 | ;; ["2019-12-31" "AU" "Australia" "0" "0" "-25.274398" "133.775136"] 134 | ;; ["2019-12-31" "AZ" "Azerbaijan" "0" "0" "40.143105" "47.576927"] 135 | ;; ["2019-12-31" "BE" "Belgium" "0" "0" "50.503887" "4.469936"] 136 | 137 | ;; The heading data is at the start of the file as a vector 138 | ;; Each additional vector is an observation taken on a specific date 139 | 140 | 141 | ;; headings 142 | 143 | (first covid-world) 144 | ;; => ["Date" "CountryCode" "CountryName" "Confirmed" "Deaths" "Latitude" "Longitude"] 145 | 146 | ;; Observations 147 | 148 | (second covid-world) 149 | ;; => ["2019-12-31" "AE" "United Arab Emirates" "0" "0" "23.424076" "53.847818"] 150 | 151 | ;; total number of observations 152 | 153 | (count (rest covid-world)) 154 | ;; => 5609 155 | 156 | 157 | ;; Countries with observations 158 | 159 | (distinct 160 | (map 161 | (fn [data] (second (rest data))) 162 | (rest covid-world))) 163 | 164 | ;; an alternative way to select the third column (CountryName) 165 | 166 | (distinct 167 | (map 168 | (fn [data] (last (take 3 data))) 169 | (rest covid-world))) 170 | 171 | ;; the same, using the anonymous function shorthand 172 | 173 | (distinct 174 | (map 175 | #(last (take 3 %)) 176 | (rest covid-world))) 177 | 178 | 179 | ;; total countries 180 | ;; if we just want the totals, then we can use the abbreviated country names (the CountryCode column) 181 | 182 | (count 183 | (distinct 184 | (map 185 | second 186 | (rest covid-world)))) 187 | ;; => 152 188 | 189 | 190 | 191 | ;; How complete is our data? 192 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 193 | 194 | ;; One of the biggest challenges of data science is getting good data. 195 | ;; You should at least understand the basic quality of data 196 | ;; before delving deep into the science. 197 | 198 | ;; At least the data is in a proper CSV format and not just ad hoc tables! 
199 | 200 | ;; divide the total number of observations by the number of countries 201 | ;; if the remainder is not zero, 202 | ;; some countries must be missing observations for some dates 203 | 204 | (rem (count (next covid-world)) 205 | (count (distinct (map second (rest covid-world))))) 206 | ;; => 137 207 | 208 | 209 | ;; finding out how many data points are missing 210 | 211 | (count (next covid-world)) 212 | ;; => 5609 213 | 214 | ;; count the number of dates 215 | (count (distinct (map first (rest covid-world)))) 216 | ;; => 81 217 | 218 | 219 | ;; Given 81 dates and 152 countries, how many observations should we have in total? 220 | 221 | (* (count (distinct (map first (rest covid-world)))) 222 | (count (distinct (map second (rest covid-world))))) 223 | ;; => 12312 224 | 225 | ;; missing observations 226 | (- 12312 5609) 227 | ;; => 6703 228 | 229 | 230 | ;; How many observations have zero confirmed cases? 231 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 232 | 233 | ;; Which column has the number of cases 234 | (first covid-world) 235 | ;; => ["Date" "CountryCode" "CountryName" "Confirmed" "Deaths" "Latitude" "Longitude"] 236 | 237 | ;; extract the Confirmed column (index 3) from every observation 238 | 239 | (map #(nth % 3) (rest covid-world)) 240 | 241 | ;; The Confirmed data is strings of numbers, 242 | ;; ideally the Confirmed and Deaths should be numbers. 243 | 244 | ;; without changing the data format, filtering is harder: zero? only accepts numbers, so realizing the next expression throws a ClassCastException on the string values 245 | 246 | (filter zero? 
(map #(nth % 3) (rest covid-world))) 247 | 248 | ;; comparing each value with the string "0" in an anonymous function works 249 | 250 | (filter (fn [value] (= "0" value)) (map #(nth % 3) (rest covid-world))) 251 | 252 | ;; total number of observations with zero confirmed cases 253 | 254 | (count (filter (fn [value] (= "0" value)) (map #(nth % 3) (rest covid-world)))) 255 | 256 | 257 | ;; Converting the data 258 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 259 | 260 | (def covid-world-data (next covid-world)) 261 | 262 | ;; manual conversion: Confirmed and Deaths are parsed into longs; Latitude and Longitude are not carried over 263 | 264 | (def covid-world-data-converted 265 | (map (fn [[date country-code country-name confirmed death]] 266 | [date country-code country-name (Long/parseLong confirmed) (Long/parseLong death)]) 267 | covid-world-data)) 268 | 269 | covid-world-data-converted 270 | 271 | 272 | (count (filter zero? (map #(nth % 3) covid-world-data-converted))) 273 | 274 | ;; So in about half the observations there are no confirmed cases 275 | 276 | ;; Alternative: 277 | ;; semantic-csv can convert strings into Clojure types 278 | ;; as it parses the file. 279 | ;; This library creates maps from the CSV data rather than vectors 280 | ;; https://github.com/metasoarous/semantic-csv 281 | 282 | 283 | ;; Understanding the data a little more 284 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 285 | 286 | ;; There are several reasons for a zero recording on any given day 287 | ;; - pandemic had not reached a country by a particular date 288 | ;; - no tests were recorded on that date, or not published 289 | 290 | ;; how many observations happened on each day? 
291 | 292 | (def date-frequencies 293 | (sort-by first (frequencies (map first covid-world-data-converted))) ) 294 | 295 | 296 | (take 10 date-frequencies) 297 | 298 | ;; The following dates 299 | 300 | (take 10 (drop 10 date-frequencies)) 301 | 302 | (take 20 (drop 60 date-frequencies)) 303 | 304 | (drop-while #(= 66 (second %)) date-frequencies) 305 | 306 | ;; 66 countries were reporting up until March, 307 | ;; then there was a significant drop (change in reporting, discarding zero recordings) 308 | ;; March 11 2020 the pandemic was announced by the World Health Organisation 309 | ;; after that the numbers of observations increased markedly. 310 | 311 | 312 | ;; How much data for a particular country 313 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 314 | 315 | (def country-frequencies 316 | (sort-by first (frequencies (map second covid-world-data-converted)))) 317 | 318 | (take 10 country-frequencies) 319 | 320 | 321 | ;; narrowing to a few specific countries 322 | 323 | ;; create a set of countries to investigate 324 | 325 | (def selective-countries 326 | #{"IT" "FR" "ES" "CN"}) 327 | 328 | 329 | ;; filter for just those countries (the set itself is called as the membership test) 330 | 331 | (filter (fn [[_ code]] (selective-countries code)) 332 | covid-world-data-converted) 333 | 334 | (take 10 335 | (filter (fn [[_ code]] (selective-countries code)) 336 | covid-world-data-converted)) 337 | 338 | 339 | 340 | ;; Helper functions 341 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 342 | 343 | ;; take-countries: keeps the rows of data whose second element (the country code) is a member of country-set 344 | (defn take-countries [data country-set] 345 | (filter (fn [[_ code]] (country-set code)) data)) 346 | ;; date-freqs: counts the rows per first element (the date), returned as [date count] pairs sorted by date 347 | (defn date-freqs [data] 348 | (sort-by first (frequencies (map first data)))) 349 | ;; country-freqs: counts the rows per second element (the country code), returned as [code count] pairs sorted by code 350 | (defn country-freqs [data] 351 | (sort-by first (frequencies (map second data)))) 352 | 353 | (def my-countries 354 | (country-freqs 355 | (take-countries covid-world-data-converted 356 | #{"IT" "FR" "ES" "CN" "US" "RS" "DE"}))) 357 | 358 | 359 | (take 10 my-countries) 360 | 361 | 362 | 363 | 364 | ;; Plotting the data - with ascii 
graphs 365 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 366 | ;; ASCII-Data is a small Java library 367 | ;; for producing nice looking ASCII line-graphs and tables. 368 | ;; https://github.com/MitchTalmadge/ASCII-Data/wiki 369 | 370 | ;; Include the library in the namespace 371 | 372 | (import 'com.mitchtalmadge.asciidata.graph.ASCIIGraph) 373 | 374 | 375 | ;; Get the results from Serbia 376 | ;; just the confirmed cases 377 | ;; from the first reported case (ignore the initial zero reports) 378 | 379 | (def serbia-data 380 | (drop-while zero? 381 | (map #(nth % 3) 382 | (take-countries covid-world-data-converted #{"RS"})))) 383 | 384 | ;; first attempt at printing a graph 385 | 386 | (println 387 | (.plot (ASCIIGraph/fromSeries (double-array serbia-data)))) 388 | 389 | ;; The raw counts are plotted on a linear scale, so the graph doesn't reflect growth correctly 390 | 391 | ;; We are interested in growth of confirmed cases, 392 | ;; not the absolute numbers. 393 | ;; So plot the logarithm of the values instead 394 | 395 | ;; A logarithm helper function 396 | 397 | (defn logarithm ^double [^double x] 398 | (Math/log x)) 399 | 400 | 401 | (println (.plot (ASCIIGraph/fromSeries 402 | (double-array (map logarithm serbia-data))))) 403 | 404 | ;; Okay, now we can see the growth over time. 405 | 406 | 407 | ;; Make a helper function from this plotting code 408 | 409 | (defn log-plot 410 | "Returns the ASCIIGraph .plot output for the natural logarithm of each value in series-data. Values should be positive, as the logarithm of 0 is -Infinity." 411 | [series-data] 412 | (.plot (ASCIIGraph/fromSeries 413 | (double-array (map logarithm series-data))))) 414 | 415 | 416 | ;; Plotting other countries 417 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 418 | 419 | (defn confirmed-cases 420 | "Extracts sequence of confirmed cases for a specified country. 
421 | Observations before the first case was reported are not included 422 | 423 | Arguments: 424 | Data source as sequence of vectors with confirmed as integer values, 425 | Country code as a string 426 | 427 | Returns: Sequence of confirmed cases as integer values" 428 | 429 | [data-source country-code] 430 | 431 | (drop-while zero? (map #(nth % 3) 432 | (take-countries data-source #{country-code}))) ) 433 | 434 | ;; Italy 435 | 436 | (reverse (map logarithm (confirmed-cases covid-world-data-converted "IT"))) 437 | 438 | (println (log-plot (confirmed-cases covid-world-data-converted "IT"))) 439 | 440 | 441 | 442 | ;; China 443 | 444 | (println (log-plot (confirmed-cases covid-world-data-converted "CN"))) 445 | 446 | 447 | 448 | ;; Explicitly plotting the change 449 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 450 | 451 | ;; Show the range of changes of confirmed cases 452 | 453 | (defn absolute-plot 454 | "Returns the ASCIIGraph .plot output for series-data plotted as given, without taking logarithms." 455 | [series-data] 456 | (.plot (ASCIIGraph/fromSeries 457 | (double-array series-data)))) 458 | 459 | (defn abolute-series 460 | "Change in confirmed cases between consecutive observations for country-code, divided by 1000. Each value is an observation minus the observation before it (0 before the first one), and the result starts with a leading 0 baseline, so it has one more element than (confirmed-cases data-source country-code). The previous version subtracted the previously computed difference instead of the previous observation, which gave wrong values from the third observation on. The misspelled name is kept so existing callers keep working." 461 | [data-source country-code] 462 | (let [cases (confirmed-cases data-source country-code)] 463 | (map #(/ % 1000) 464 | (cons 0 (map - cases (cons 0 cases)))))) 465 | 466 | (println 467 | (absolute-plot 468 | (abolute-series covid-world-data-converted "CN"))) 469 | --------------------------------------------------------------------------------