├── .gitignore ├── CHANGELOG.md ├── LICENSE ├── README.md ├── bb.edn ├── bin └── launchpad ├── data ├── marketing.csv ├── titanic │ ├── test.csv │ ├── titanic.zip │ └── train.csv └── tweets_sentiment.feather ├── deps.edn ├── deps.local.edn ├── doc └── intro.md ├── docs ├── gorilla-notes │ └── js │ │ └── compiled │ │ └── main.js ├── interactions_ols.html ├── notespace-files │ └── tree.svg ├── polyglot_kmeans.html ├── tune-titanic.html ├── userguide-advanced.html ├── userguide-categrical.html ├── userguide-experiment-tracking.html ├── userguide-intro.html ├── userguide-models.html ├── userguide-sklearnclj.html ├── userguide-third_party.html ├── userguide-titanic.html ├── userguide-transformers.html └── userguide-unsupervised.html ├── render_all.clj ├── render_titanic.clj ├── render_tune-titanic.clj ├── resources ├── .keep └── logback.xml ├── src └── scicloj │ └── ml │ ├── advanced.clj │ ├── categorical.clj │ ├── experiment_tracking.clj │ ├── interactions_ols.clj │ ├── intro.clj │ ├── models.clj │ ├── nested_cv.clj │ ├── polyglot_kmeans.clj │ ├── sklearnclj.clj │ ├── third_party.clj │ ├── titanic.clj │ ├── transformers.clj │ ├── tune_titanic.clj │ ├── ug_utils.clj │ ├── ug_utils_clerk.clj │ └── unsupervised.clj ├── submission.csv └── test └── scicloj └── ml └── tutorials_test.clj /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | /classes 3 | /checkouts 4 | *.jar 5 | *.class 6 | /.calva/output-window/ 7 | /.cpcache 8 | /.lein-* 9 | /.lsp/sqlite*.db 10 | /.nrepl-history 11 | /.nrepl-port 12 | /.rebel_readline_history 13 | /.socket-repl-port 14 | .hgignore 15 | .hg/ 16 | /.cache/ 17 | /.classpath 18 | /.clj-kondo/ 19 | /.lsp/ 20 | /.project 21 | /.settings/ 22 | /.clerk/ 23 | /.vscode/ 24 | /cache_dir/ 25 | /docs/scicloj/ 26 | /public/ 27 | /runs/ 28 | -------------------------------------------------------------------------------- /CHANGELOG.md: 
-------------------------------------------------------------------------------- 1 | # Change Log 2 | All notable changes to this project will be documented in this file. This change log follows the conventions of [keepachangelog.com](http://keepachangelog.com/). 3 | 4 | ## [Unreleased] 5 | ### Changed 6 | - Add a new arity to `make-widget-async` to provide a different widget shape. 7 | 8 | ## [0.1.1] - 2021-09-06 9 | ### Changed 10 | - Documentation on how to make the widgets. 11 | 12 | ### Removed 13 | - `make-widget-sync` - we're all async, all the time. 14 | 15 | ### Fixed 16 | - Fixed widget maker to keep working when daylight savings switches over. 17 | 18 | ## 0.1.0 - 2021-09-06 19 | ### Added 20 | - Files from the new template. 21 | - Widget maker public API - `make-widget-sync`. 22 | 23 | [Unreleased]: https://github.com/scicloj/ml.tutorials/compare/0.1.1...HEAD 24 | [0.1.1]: https://github.com/scicloj/ml.tutorials/compare/0.1.0...0.1.1 25 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | THE ACCOMPANYING PROGRAM IS PROVIDED UNDER THE TERMS OF THIS ECLIPSE PUBLIC 2 | LICENSE ("AGREEMENT"). ANY USE, REPRODUCTION OR DISTRIBUTION OF THE PROGRAM 3 | CONSTITUTES RECIPIENT'S ACCEPTANCE OF THIS AGREEMENT. 4 | 5 | 1. DEFINITIONS 6 | 7 | "Contribution" means: 8 | 9 | a) in the case of the initial Contributor, the initial code and 10 | documentation distributed under this Agreement, and 11 | 12 | b) in the case of each subsequent Contributor: 13 | 14 | i) changes to the Program, and 15 | 16 | ii) additions to the Program; 17 | 18 | where such changes and/or additions to the Program originate from and are 19 | distributed by that particular Contributor. A Contribution 'originates' from 20 | a Contributor if it was added to the Program by such Contributor itself or 21 | anyone acting on such Contributor's behalf. 
Contributions do not include 22 | additions to the Program which: (i) are separate modules of software 23 | distributed in conjunction with the Program under their own license 24 | agreement, and (ii) are not derivative works of the Program. 25 | 26 | "Contributor" means any person or entity that distributes the Program. 27 | 28 | "Licensed Patents" mean patent claims licensable by a Contributor which are 29 | necessarily infringed by the use or sale of its Contribution alone or when 30 | combined with the Program. 31 | 32 | "Program" means the Contributions distributed in accordance with this 33 | Agreement. 34 | 35 | "Recipient" means anyone who receives the Program under this Agreement, 36 | including all Contributors. 37 | 38 | 2. GRANT OF RIGHTS 39 | 40 | a) Subject to the terms of this Agreement, each Contributor hereby grants 41 | Recipient a non-exclusive, worldwide, royalty-free copyright license to 42 | reproduce, prepare derivative works of, publicly display, publicly perform, 43 | distribute and sublicense the Contribution of such Contributor, if any, and 44 | such derivative works, in source code and object code form. 45 | 46 | b) Subject to the terms of this Agreement, each Contributor hereby grants 47 | Recipient a non-exclusive, worldwide, royalty-free patent license under 48 | Licensed Patents to make, use, sell, offer to sell, import and otherwise 49 | transfer the Contribution of such Contributor, if any, in source code and 50 | object code form. This patent license shall apply to the combination of the 51 | Contribution and the Program if, at the time the Contribution is added by the 52 | Contributor, such addition of the Contribution causes such combination to be 53 | covered by the Licensed Patents. The patent license shall not apply to any 54 | other combinations which include the Contribution. No hardware per se is 55 | licensed hereunder. 
56 | 57 | c) Recipient understands that although each Contributor grants the licenses 58 | to its Contributions set forth herein, no assurances are provided by any 59 | Contributor that the Program does not infringe the patent or other 60 | intellectual property rights of any other entity. Each Contributor disclaims 61 | any liability to Recipient for claims brought by any other entity based on 62 | infringement of intellectual property rights or otherwise. As a condition to 63 | exercising the rights and licenses granted hereunder, each Recipient hereby 64 | assumes sole responsibility to secure any other intellectual property rights 65 | needed, if any. For example, if a third party patent license is required to 66 | allow Recipient to distribute the Program, it is Recipient's responsibility 67 | to acquire that license before distributing the Program. 68 | 69 | d) Each Contributor represents that to its knowledge it has sufficient 70 | copyright rights in its Contribution, if any, to grant the copyright license 71 | set forth in this Agreement. 72 | 73 | 3. 
REQUIREMENTS 74 | 75 | A Contributor may choose to distribute the Program in object code form under 76 | its own license agreement, provided that: 77 | 78 | a) it complies with the terms and conditions of this Agreement; and 79 | 80 | b) its license agreement: 81 | 82 | i) effectively disclaims on behalf of all Contributors all warranties and 83 | conditions, express and implied, including warranties or conditions of title 84 | and non-infringement, and implied warranties or conditions of merchantability 85 | and fitness for a particular purpose; 86 | 87 | ii) effectively excludes on behalf of all Contributors all liability for 88 | damages, including direct, indirect, special, incidental and consequential 89 | damages, such as lost profits; 90 | 91 | iii) states that any provisions which differ from this Agreement are offered 92 | by that Contributor alone and not by any other party; and 93 | 94 | iv) states that source code for the Program is available from such 95 | Contributor, and informs licensees how to obtain it in a reasonable manner on 96 | or through a medium customarily used for software exchange. 97 | 98 | When the Program is made available in source code form: 99 | 100 | a) it must be made available under this Agreement; and 101 | 102 | b) a copy of this Agreement must be included with each copy of the Program. 103 | 104 | Contributors may not remove or alter any copyright notices contained within 105 | the Program. 106 | 107 | Each Contributor must identify itself as the originator of its Contribution, 108 | if any, in a manner that reasonably allows subsequent Recipients to identify 109 | the originator of the Contribution. 110 | 111 | 4. COMMERCIAL DISTRIBUTION 112 | 113 | Commercial distributors of software may accept certain responsibilities with 114 | respect to end users, business partners and the like. 
While this license is 115 | intended to facilitate the commercial use of the Program, the Contributor who 116 | includes the Program in a commercial product offering should do so in a 117 | manner which does not create potential liability for other Contributors. 118 | Therefore, if a Contributor includes the Program in a commercial product 119 | offering, such Contributor ("Commercial Contributor") hereby agrees to defend 120 | and indemnify every other Contributor ("Indemnified Contributor") against any 121 | losses, damages and costs (collectively "Losses") arising from claims, 122 | lawsuits and other legal actions brought by a third party against the 123 | Indemnified Contributor to the extent caused by the acts or omissions of such 124 | Commercial Contributor in connection with its distribution of the Program in 125 | a commercial product offering. The obligations in this section do not apply 126 | to any claims or Losses relating to any actual or alleged intellectual 127 | property infringement. In order to qualify, an Indemnified Contributor must: 128 | a) promptly notify the Commercial Contributor in writing of such claim, and 129 | b) allow the Commercial Contributor to control, and cooperate with the 130 | Commercial Contributor in, the defense and any related settlement 131 | negotiations. The Indemnified Contributor may participate in any such claim 132 | at its own expense. 133 | 134 | For example, a Contributor might include the Program in a commercial product 135 | offering, Product X. That Contributor is then a Commercial Contributor. If 136 | that Commercial Contributor then makes performance claims, or offers 137 | warranties related to Product X, those performance claims and warranties are 138 | such Commercial Contributor's responsibility alone. 
Under this section, the 139 | Commercial Contributor would have to defend claims against the other 140 | Contributors related to those performance claims and warranties, and if a 141 | court requires any other Contributor to pay any damages as a result, the 142 | Commercial Contributor must pay those damages. 143 | 144 | 5. NO WARRANTY 145 | 146 | EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, THE PROGRAM IS PROVIDED ON 147 | AN "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER 148 | EXPRESS OR IMPLIED INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OR 149 | CONDITIONS OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY OR FITNESS FOR A 150 | PARTICULAR PURPOSE. Each Recipient is solely responsible for determining the 151 | appropriateness of using and distributing the Program and assumes all risks 152 | associated with its exercise of rights under this Agreement , including but 153 | not limited to the risks and costs of program errors, compliance with 154 | applicable laws, damage to or loss of data, programs or equipment, and 155 | unavailability or interruption of operations. 156 | 157 | 6. DISCLAIMER OF LIABILITY 158 | 159 | EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, NEITHER RECIPIENT NOR ANY 160 | CONTRIBUTORS SHALL HAVE ANY LIABILITY FOR ANY DIRECT, INDIRECT, INCIDENTAL, 161 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING WITHOUT LIMITATION 162 | LOST PROFITS), HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 163 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 164 | ARISING IN ANY WAY OUT OF THE USE OR DISTRIBUTION OF THE PROGRAM OR THE 165 | EXERCISE OF ANY RIGHTS GRANTED HEREUNDER, EVEN IF ADVISED OF THE POSSIBILITY 166 | OF SUCH DAMAGES. 167 | 168 | 7. 
GENERAL 169 | 170 | If any provision of this Agreement is invalid or unenforceable under 171 | applicable law, it shall not affect the validity or enforceability of the 172 | remainder of the terms of this Agreement, and without further action by the 173 | parties hereto, such provision shall be reformed to the minimum extent 174 | necessary to make such provision valid and enforceable. 175 | 176 | If Recipient institutes patent litigation against any entity (including a 177 | cross-claim or counterclaim in a lawsuit) alleging that the Program itself 178 | (excluding combinations of the Program with other software or hardware) 179 | infringes such Recipient's patent(s), then such Recipient's rights granted 180 | under Section 2(b) shall terminate as of the date such litigation is filed. 181 | 182 | All Recipient's rights under this Agreement shall terminate if it fails to 183 | comply with any of the material terms or conditions of this Agreement and 184 | does not cure such failure in a reasonable period of time after becoming 185 | aware of such noncompliance. If all Recipient's rights under this Agreement 186 | terminate, Recipient agrees to cease use and distribution of the Program as 187 | soon as reasonably practicable. However, Recipient's obligations under this 188 | Agreement and any licenses granted by Recipient relating to the Program shall 189 | continue and survive. 190 | 191 | Everyone is permitted to copy and distribute copies of this Agreement, but in 192 | order to avoid inconsistency the Agreement is copyrighted and may only be 193 | modified in the following manner. The Agreement Steward reserves the right to 194 | publish new versions (including revisions) of this Agreement from time to 195 | time. No one other than the Agreement Steward has the right to modify this 196 | Agreement. The Eclipse Foundation is the initial Agreement Steward. 
The 197 | Eclipse Foundation may assign the responsibility to serve as the Agreement 198 | Steward to a suitable separate entity. Each new version of the Agreement will 199 | be given a distinguishing version number. The Program (including 200 | Contributions) may always be distributed subject to the version of the 201 | Agreement under which it was received. In addition, after a new version of 202 | the Agreement is published, Contributor may elect to distribute the Program 203 | (including its Contributions) under the new version. Except as expressly 204 | stated in Sections 2(a) and 2(b) above, Recipient receives no rights or 205 | licenses to the intellectual property of any Contributor under this 206 | Agreement, whether expressly, by implication, estoppel or otherwise. All 207 | rights in the Program not expressly granted under this Agreement are 208 | reserved. 209 | 210 | This Agreement is governed by the laws of the State of New York and the 211 | intellectual property laws of the United States of America. No party to this 212 | Agreement will bring a legal action under this Agreement more than one year 213 | after the cause of action arose. Each party waives its rights to a jury trial 214 | in any resulting litigation. 215 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | >[!NOTE] 2 | >*** 3 | >The usage of the shim `scicloj.ml` is now considered deprecated. The underlying libraries should be used directly or via 4 | >[noj](https://github.com/scicloj/noj), a new library that combines several of these libraries without remapping the namespaces. 5 | >It also contains updated versions of several of the tutorials here. 
6 | >The code inside the tutorials is still valid and mostly working, but the functions are in different namespaces when 7 | >used without `scicloj.ml` 8 | >*** 9 | 10 | 11 | # Tutorials for [scicloj.ml](https://github.com/scicloj/scicloj.ml) 12 | 13 | The Clojure machine learning library scicloj.ml is documented here: 14 | 15 | * [Userguide - introduction](https://scicloj.github.io/scicloj.ml-tutorials/userguide-intro.html) 16 | * [Userguide - advanced](https://scicloj.github.io/scicloj.ml-tutorials/userguide-advanced.html) 17 | * [Userguide - categorical](https://scicloj.github.io/scicloj.ml-tutorials/userguide-categrical.html) 18 | * [Reference of ML models](https://scicloj.github.io/scicloj.ml-tutorials/userguide-models.html) 19 | * [Reference of transformer functions](https://scicloj.github.io/scicloj.ml-tutorials/userguide-transformers.html) 20 | * [Example usage - predict titanic survival](https://scicloj.github.io/scicloj.ml-tutorials/userguide-titanic.html) 21 | * [Example usage - hyperparameter tuning of a pipeline](https://scicloj.github.io/scicloj.ml-tutorials/tune-titanic.html) 22 | * [How to use sklearn models](https://scicloj.github.io/scicloj.ml-tutorials/userguide-sklearnclj.html) 23 | * [Reference of other libraries integrated with scicloj.ml](https://scicloj.github.io/scicloj.ml-tutorials/userguide-third_party.html) 24 | * [kmeans in Python vs Clojure](https://scicloj.github.io/scicloj.ml-tutorials/polyglot_kmeans.html) 25 | * [Experiment tracking](https://scicloj.github.io/scicloj.ml-tutorials/userguide-experiment-tracking.html) 26 | * [Unsupervised learning](https://scicloj.github.io/scicloj.ml-tutorials/userguide-unsupervised.html) 27 | * [Variable interaction in linear regression](https://scicloj.github.io/scicloj.ml-tutorials/interactions_ols.html) 28 | 29 | 30 | The source files for this documentation, written using [notespace](https://github.com/scicloj/notespace) 31 | and [Clerk](https://github.com/nextjournal/clerk), are in this repository. 
32 | -------------------------------------------------------------------------------- /bb.edn: -------------------------------------------------------------------------------- 1 | {:deps {com.lambdaisland/launchpad {:mvn/version "0.9.49-alpha"}}} 2 | 3 | -------------------------------------------------------------------------------- /bin/launchpad: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bb 2 | 3 | (require '[lambdaisland.launchpad :as launchpad]) 4 | 5 | (launchpad/main {}) 6 | 7 | ;; (launchpad/main {:steps (into [(partial launchpad/ensure-java-version 17)] 8 | ;; launchpad/default-steps)}) 9 | -------------------------------------------------------------------------------- /data/marketing.csv: -------------------------------------------------------------------------------- 1 | youtube,facebook,newspaper,sales 2 | 276.12,45.35999999999999,83.04,26.52 3 | 53.4,47.16,54.12,12.48 4 | 20.639999999999997,55.08,83.16,11.16 5 | 181.79999999999998,49.559999999999995,70.2,22.2 6 | 216.96,12.96,70.08,15.48 7 | 10.44,58.67999999999999,90,8.64 8 | 69,39.35999999999999,28.2,14.16 9 | 144.24,23.52,13.92,15.839999999999998 10 | 10.319999999999999,2.52,1.2,5.76 11 | 239.76,3.12,25.439999999999998,12.719999999999999 12 | 79.32,6.96,29.04,10.319999999999999 13 | 257.64,28.799999999999997,4.8,20.88 14 | 28.56,42.12,79.08,11.04 15 | 117,9.12,8.64,11.639999999999999 16 | 244.92,39.48,55.199999999999996,22.8 17 | 234.48,57.24,63.48,26.88 18 | 81.36,43.92,136.79999999999998,15 19 | 337.67999999999995,47.52,66.96,29.279999999999998 20 | 83.04,24.599999999999998,21.96,13.56 21 | 176.76000000000002,28.679999999999996,22.92,17.52 22 | 262.08,33.239999999999995,64.08,21.599999999999998 23 | 284.88,6.119999999999999,28.2,15 24 | 15.839999999999998,19.08,59.519999999999996,6.72 25 | 273.96,20.279999999999998,31.439999999999998,18.599999999999998 26 | 74.75999999999999,15.12,21.96,11.639999999999999 27 | 
315.47999999999996,4.2,23.4,14.399999999999999 28 | 171.48,35.16,15.12,18 29 | 288.12,20.04,27.479999999999997,19.08 30 | 298.56,32.52,27.479999999999997,22.679999999999996 31 | 84.71999999999998,19.2,48.959999999999994,12.6 32 | 351.47999999999996,33.96,51.84,25.679999999999996 33 | 135.48,20.88,46.32,14.28 34 | 116.64,1.7999999999999998,36,11.52 35 | 318.72,24,0.36,20.88 36 | 114.84,1.68,8.88,11.4 37 | 348.84,4.919999999999999,10.2,15.36 38 | 320.28,52.559999999999995,6,30.479999999999997 39 | 89.64,59.279999999999994,54.84,17.639999999999997 40 | 51.72,32.04,42.12,12.12 41 | 273.59999999999997,45.24,38.4,25.8 42 | 243,26.76,37.92,19.92 43 | 212.4,40.08,46.440000000000005,20.52 44 | 352.32,33.239999999999995,2.16,24.84 45 | 248.28,10.08,31.679999999999996,15.48 46 | 30.12,30.839999999999996,51.959999999999994,10.2 47 | 210.11999999999998,27,37.8,17.88 48 | 107.64,11.88,42.84,12.719999999999999 49 | 287.88,49.8,22.2,27.84 50 | 272.64,18.96,59.879999999999995,17.76 51 | 80.28,14.04,44.16,11.639999999999999 52 | 239.76,3.7199999999999998,41.52,13.68 53 | 120.48,11.52,4.32,12.839999999999998 54 | 259.68,50.04,47.52,27.12 55 | 219.11999999999998,55.440000000000005,70.44,25.439999999999998 56 | 315.23999999999995,34.56,19.08,24.24 57 | 238.68,59.279999999999994,72,28.439999999999998 58 | 8.76,33.72,49.68,6.6 59 | 163.43999999999997,23.04,19.92,15.839999999999998 60 | 252.96,59.519999999999996,45.24,28.56 61 | 252.83999999999997,35.4,11.16,22.08 62 | 64.2,2.4,25.679999999999996,9.719999999999999 63 | 313.56,51.24,65.64,29.04 64 | 287.16,18.599999999999998,32.76,18.84 65 | 123.24,35.52,10.08,16.8 66 | 157.32,51.35999999999999,34.68,21.599999999999998 67 | 82.8,11.16,1.08,11.16 68 | 37.8,29.52,2.64,11.4 69 | 167.16,17.4,12.239999999999998,16.08 70 | 284.88,33,13.2,22.679999999999996 71 | 260.16,52.68,32.64,26.76 72 | 238.92,36.72,46.440000000000005,21.96 73 | 131.76,17.16,38.04,14.879999999999999 74 | 32.16,39.6,23.16,10.56 75 | 155.28,6.84,37.56,13.2 76 | 
256.08,29.52,15.719999999999999,20.4 77 | 20.279999999999998,52.440000000000005,107.28,10.44 78 | 33,1.92,24.84,8.28 79 | 144.6,34.199999999999996,17.04,17.04 80 | 6.48,35.879999999999995,11.28,6.359999999999999 81 | 139.2,9.24,27.720000000000002,13.2 82 | 91.68,32.04,26.76,14.16 83 | 287.76,4.919999999999999,44.279999999999994,14.76 84 | 90.36,24.36,39,13.56 85 | 82.08,53.4,42.72,16.32 86 | 256.2,51.6,40.559999999999995,26.04 87 | 231.83999999999997,22.08,78.84,18.24 88 | 91.55999999999999,33,19.2,14.399999999999999 89 | 132.84,48.72,75.84,19.2 90 | 105.96,30.599999999999998,88.08,15.48 91 | 131.76,57.35999999999999,61.67999999999999,20.04 92 | 161.16,5.88,11.16,13.44 93 | 34.32,1.7999999999999998,39.6,8.76 94 | 261.23999999999995,40.199999999999996,70.8,23.279999999999998 95 | 301.08,43.8,86.75999999999999,26.639999999999997 96 | 128.88,16.8,13.08,13.799999999999999 97 | 195.96,37.92,63.48,20.279999999999998 98 | 237.11999999999998,4.2,7.08,14.04 99 | 221.88,25.2,26.4,18.599999999999998 100 | 347.64,50.76,61.44,30.479999999999997 101 | 162.23999999999998,50.04,55.08,20.639999999999997 102 | 266.88,5.159999999999999,59.75999999999999,14.04 103 | 355.67999999999995,43.559999999999995,121.08,28.56 104 | 336.23999999999995,12.12,25.679999999999996,17.76 105 | 225.48,20.639999999999997,21.479999999999997,17.639999999999997 106 | 285.84,41.16,6.359999999999999,24.84 107 | 165.48,55.68,70.8,23.04 108 | 30,13.2,35.64,8.64 109 | 108.48,0.36,27.84,10.44 110 | 15.719999999999999,0.48,30.72,6.359999999999999 111 | 306.48,32.279999999999994,6.6,23.76 112 | 270.96,9.839999999999998,67.8,16.08 113 | 290.03999999999996,45.6,27.84,26.16 114 | 210.83999999999997,18.48,2.88,16.919999999999998 115 | 251.51999999999998,24.720000000000002,12.839999999999998,19.08 116 | 93.84,56.16,41.4,17.52 117 | 90.11999999999999,42,63.24,15.12 118 | 167.04,17.16,30.72,14.639999999999999 119 | 91.68,0.96,17.76,11.28 120 | 150.84,44.279999999999994,95.04,19.08 121 | 
23.279999999999998,19.2,26.76,7.919999999999999 122 | 169.56,32.16,55.440000000000005,18.599999999999998 123 | 22.56,26.04,60.48,8.4 124 | 268.8,2.88,18.72,13.92 125 | 147.72,41.52,14.879999999999999,18.24 126 | 275.4,38.76,89.04,23.639999999999997 127 | 104.64,14.16,31.08,12.719999999999999 128 | 9.36,46.68,60.72,7.919999999999999 129 | 96.24,0,11.04,10.56 130 | 264.36,58.8,3.84,29.639999999999997 131 | 71.52,14.399999999999999,51.72,11.639999999999999 132 | 0.84,47.52,10.44,1.92 133 | 318.23999999999995,3.48,51.6,15.239999999999998 134 | 10.08,32.64,2.52,6.84 135 | 263.76,40.199999999999996,54.12,23.52 136 | 44.279999999999994,46.32,78.71999999999998,12.96 137 | 57.959999999999994,56.4,10.2,13.92 138 | 30.72,46.8,11.16,11.4 139 | 328.44,34.68,71.64,24.96 140 | 51.6,31.08,24.599999999999998,11.52 141 | 221.88,52.68,2.04,24.84 142 | 88.08,20.4,15.48,13.08 143 | 232.43999999999997,42.48,90.71999999999998,23.04 144 | 264.59999999999997,39.84,45.48,24.12 145 | 125.51999999999998,6.84,41.279999999999994,12.48 146 | 115.44,17.76,46.68,13.68 147 | 168.36,2.28,10.799999999999999,12.360000000000001 148 | 288.12,8.76,10.44,15.839999999999998 149 | 291.84,58.8,53.16,30.479999999999997 150 | 45.6,48.35999999999999,14.28,13.08 151 | 53.64,30.96,24.720000000000002,12.12 152 | 336.84,16.68,44.4,19.32 153 | 145.2,10.08,58.44,13.92 154 | 237.11999999999998,27.96,17.04,19.92 155 | 205.56,47.64,45.24,22.8 156 | 225.36,25.32,11.4,18.72 157 | 4.919999999999999,13.92,6.84,3.84 158 | 112.68,52.199999999999996,60.599999999999994,18.36 159 | 179.76000000000002,1.56,29.16,12.12 160 | 14.04,44.279999999999994,54.24,8.76 161 | 158.04,22.08,41.52,15.48 162 | 207,21.720000000000002,36.839999999999996,17.28 163 | 102.84,42.959999999999994,59.16,15.96 164 | 226.08,21.720000000000002,30.72,17.88 165 | 196.2,44.16,8.88,21.599999999999998 166 | 140.64,17.639999999999997,6.48,14.28 167 | 281.4,4.08,101.75999999999999,14.28 168 | 21.479999999999997,45.12,25.92,9.6 169 | 
248.16,6.24,23.279999999999998,14.639999999999999 170 | 258.48,28.32,69.12,20.52 171 | 341.16,12.719999999999999,7.68,18 172 | 60,13.92,22.08,10.08 173 | 197.4,25.08,56.879999999999995,17.4 174 | 23.52,24.12,20.4,9.12 175 | 202.08,8.52,15.36,14.04 176 | 266.88,4.08,15.719999999999999,13.799999999999999 177 | 332.28,58.67999999999999,50.16,32.4 178 | 298.08,36.239999999999995,24.36,24.24 179 | 204.23999999999998,9.36,42.24,14.04 180 | 332.03999999999996,2.76,28.439999999999998,14.16 181 | 198.72,12,21.12,15.12 182 | 187.92,3.12,9.96,12.6 183 | 262.2,6.48,32.879999999999995,14.639999999999999 184 | 67.44,6.84,35.64,10.44 185 | 345.12,51.6,86.16,31.439999999999998 186 | 304.56,25.56,36,21.12 187 | 246,54.12,23.52,27.12 188 | 167.4,2.52,31.92,12.360000000000001 189 | 229.32,34.44,21.84,20.76 190 | 343.2,16.68,4.44,19.08 191 | 22.439999999999998,14.52,28.08,8.04 192 | 47.4,49.32,6.96,12.96 193 | 90.6,12.96,7.199999999999999,11.88 194 | 20.639999999999997,4.919999999999999,37.92,7.08 195 | 200.16,50.4,4.32,23.52 196 | 179.64,42.72,7.199999999999999,20.76 197 | 45.84,4.44,16.56,9.12 198 | 113.04,5.88,9.719999999999999,11.639999999999999 199 | 212.4,11.16,7.68,15.36 200 | 340.32,50.4,79.44,30.599999999999998 201 | 278.52,10.319999999999999,10.44,16.08 202 | -------------------------------------------------------------------------------- /data/titanic/titanic.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scicloj/scicloj.ml-tutorials/951cde0b8bd0a1b22ec856d28a6122d69d34836f/data/titanic/titanic.zip -------------------------------------------------------------------------------- /data/tweets_sentiment.feather: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scicloj/scicloj.ml-tutorials/951cde0b8bd0a1b22ec856d28a6122d69d34836f/data/tweets_sentiment.feather -------------------------------------------------------------------------------- 
/deps.edn: -------------------------------------------------------------------------------- 1 | {:paths ["src" "resources"] 2 | 3 | 4 | :deps { 5 | io.github.nextjournal/clerk {:git/sha "a6bfc832a182ef3068d60a318985681ddb913595" 6 | :git/url "https://github.com/nextjournal/clerk.git"} 7 | 8 | ;; {:mvn/version "0.11.603"} 9 | org.clojure/clojure {:mvn/version "1.11.1"} 10 | 11 | scicloj/scicloj.ml {:mvn/version "0.2.2"} 12 | 13 | org.scicloj/scicloj.ml.clj-djl {:mvn/version "0.1.11"} 14 | scicloj/sklearn-clj {:mvn/version "0.3.7"} 15 | 16 | org.apache.arrow/arrow-vector {:mvn/version "6.0.0"} 17 | org.lz4/lz4-java {:mvn/version "1.8.0"} 18 | com.github.luben/zstd-jni {:mvn/version "1.5.1-1"} 19 | org.clojure/tools.logging {:mvn/version "1.2.4"} 20 | com.fasterxml.jackson.core/jackson-databind {:mvn/version 21 | "2.13.2"} 22 | com.fasterxml.jackson.core/jackson-core {:mvn/version 23 | "2.13.2"} 24 | 25 | com.fasterxml.jackson.core/jackson-annotations {:mvn/version 26 | "2.13.2"} 27 | 28 | ch.qos.logback/logback-classic {:mvn/version "1.4.4"} 29 | scicloj/notespace {:mvn/version "3-beta9"} 30 | 31 | dk.simongray/datalinguist {:mvn/version "0.1.163"} 32 | applied-science/waqi {:git/url "https://github.com/applied-science/waqi/" 33 | :sha "faefe5dfd1b161ff70089924591ac2d699527811"} 34 | clj-python/libpython-clj {:mvn/version "2.020"} 35 | scicloj/clojisr {:mvn/version "1.0.0-BETA20"} 36 | 37 | generateme/fastmath {:mvn/version "2.1.6"} 38 | uncomplicate/neanderthal {:mvn/version "0.43.0"} 39 | aerial.hanami/aerial.hanami {:mvn/version "0.12.9"} 40 | net.clojars.behrica/cluster_eval {:git/url "https://github.com/behrica/cluster-eval.git" 41 | :sha "ca34283a67bf18c8025955865fb567bd6e2e9a9a"}} 42 | ;; appliedsciencestudio/rdata {:git/url "https://github.com/appliedsciencestudio/rdata/" 43 | ;; :sha "151e6dead06b38995f1f30b09d954a060f7a2a9c"} 44 | 45 | 46 | 47 | 48 | :aliases 49 | 50 | 51 | { 52 | :jdk-17 53 | {:jvm-opts ["--add-modules" "jdk.incubator.foreign" 54 | 
"--enable-native-access=ALL-UNNAMED"]} 55 | 56 | :reveal {:extra-deps {vlaaad/reveal {:mvn/version "1.3.250"}} 57 | :ns-default vlaaad.reveal 58 | :exec-fn repl} 59 | :reveal-nrepl-middleware 60 | {:extra-deps {vlaaad/reveal {:mvn/version "1.3.194"}} 61 | :main-opts ["-m" "nrepl.cmdline" 62 | "--middleware" "[vlaaad.reveal.nrepl/middleware,cider.nrepl/cider-middleware]"]} 63 | 64 | :jar {:replace-deps {com.github.seancorfield/depstar {:mvn/version "2.1.278"}} 65 | :exec-fn hf.depstar/jar 66 | :exec-args {:jar "ml.tutorials.jar" :sync-pom true}} 67 | :install {:replace-deps {slipset/deps-deploy {:mvn/version "0.1.5"}} 68 | :exec-fn deps-deploy.deps-deploy/deploy 69 | :exec-args {:installer :local :artifact "ml.tutorials.jar"}} 70 | :deploy {:replace-deps {slipset/deps-deploy {:mvn/version "0.1.5"}} 71 | :exec-fn deps-deploy.deps-deploy/deploy 72 | :exec-args {:installer :remote :artifact "ml.tutorials.jar"}}}} 73 | -------------------------------------------------------------------------------- /deps.local.edn: -------------------------------------------------------------------------------- 1 | {;; regular deps.edn stuff will work in here 2 | :deps {} 3 | :aliases {} 4 | 5 | ;; but some extra keys are supported to influence launchpad itself 6 | :launchpad/aliases [:jdk-17 :test] ; additional aliases, will be added to whatever 7 | ; aliases you specify on the command line 8 | :launchpad/main-opts ["--emacs"]} ; additional CLI flags, so you can encode your 9 | ; own preferences 10 | ; which shadow builds to start, although it may 11 | ; be preferable to configure this as part of 12 | ; specific aliases in your main deps.edn 13 | -------------------------------------------------------------------------------- /doc/intro.md: -------------------------------------------------------------------------------- 1 | # Introduction to ml.tutorials 2 | 3 | TODO: write [great documentation](http://jacobian.org/writing/what-to-write/) 4 | 
-------------------------------------------------------------------------------- /docs/interactions_ols.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 |

Loading ...

18 |
19 | 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /docs/notespace-files/tree.svg: -------------------------------------------------------------------------------- 1 | 2 | 4 | 6 | 7 | 9 | 10 | CART 11 | 12 | 13 | 14 | 1 15 | 16 | petal_length ≤ 2.45 17 | size = 150 18 | impurity reduction = 0.3333 19 | 20 | 21 | 22 | 2 23 | 24 | species = 0 25 | size = 50 26 | deviance = 3.8466 27 | 28 | 29 | 30 | 1->2 31 | 32 | 33 | True 34 | 35 | 36 | 37 | 3 38 | 39 | petal_width ≤ 1.75 40 | size = 100 41 | impurity reduction = 0.3897 42 | 43 | 44 | 45 | 1->3 46 | 47 | 48 | False 49 | 50 | 51 | 52 | 6 53 | 54 | sepal_length ≤ 7.1 55 | size = 54 56 | impurity reduction = 0.0311 57 | 58 | 59 | 60 | 3->6 61 | 62 | 63 | 64 | 65 | 66 | 7 67 | 68 | species = 2 69 | size = 46 70 | deviance = 12.0834 71 | 72 | 73 | 74 | 3->7 75 | 76 | 77 | 78 | 79 | 80 | 12 81 | 82 | petal_width ≤ 1.65 83 | size = 53 84 | impurity reduction = 0.0141 85 | 86 | 87 | 88 | 6->12 89 | 90 | 91 | 92 | 93 | 94 | 13 95 | 96 | species = 2 97 | size = 1 98 | deviance = 1.3863 99 | 100 | 101 | 102 | 6->13 103 | 104 | 105 | 106 | 107 | 108 | 24 109 | 110 | species = 1 111 | size = 51 112 | deviance = 24.9439 113 | 114 | 115 | 116 | 12->24 117 | 118 | 119 | 120 | 121 | 122 | 25 123 | 124 | sepal_width ≤ 2.75 125 | size = 2 126 | impurity reduction = 0.5000 127 | 128 | 129 | 130 | 12->25 131 | 132 | 133 | 134 | 135 | 136 | 50 137 | 138 | species = 2 139 | size = 1 140 | deviance = 1.3863 141 | 142 | 143 | 144 | 25->50 145 | 146 | 147 | 148 | 149 | 150 | 51 151 | 152 | species = 1 153 | size = 1 154 | deviance = 1.3863 155 | 156 | 157 | 158 | 25->51 159 | 160 | 161 | 162 | 163 | 164 | -------------------------------------------------------------------------------- /docs/tune-titanic.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 |

Loading ...

18 |
19 | 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /docs/userguide-categrical.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 |

Loading ...

18 |
19 | 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /docs/userguide-intro.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 |

Loading ...

18 |
19 | 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /render_all.clj: -------------------------------------------------------------------------------- 1 | (ns render-all 2 | (:require [notespace.cli :as cli] 3 | [notespace.api :as note] 4 | [clojure.java.shell] 5 | [nextjournal.clerk :as clerk])) 6 | 7 | 8 | (def nss [ 9 | {:ns 'scicloj.ml.intro :output-file "docs/userguide-intro.html"} 10 | {:ns 'scicloj.ml.advanced :output-file "docs/userguide-advanced.html"} 11 | {:ns 'scicloj.ml.categorical :output-file "docs/userguide-categrical.html"} 12 | {:ns 'scicloj.ml.transformers :output-file "docs/userguide-transformers.html"} 13 | {:ns 'scicloj.ml.titanic :output-file "docs/userguide-titanic.html"} 14 | {:ns 'scicloj.ml.tune-titanic :output-file "docs/tune-titanic.html"} 15 | {:ns 'scicloj.ml.sklearnclj :output-file "docs/userguide-sklearnclj.html"} 16 | {:ns 'scicloj.ml.third-party :output-file "docs/userguide-third_party.html"} 17 | {:ns 'scicloj.ml.experiment-tracking :output-file "docs/userguide-experiment-tracking.html"} 18 | {:ns 'scicloj.ml.unsupervised :output-file "docs/userguide-unsupervised.html"} 19 | {:ns 'scicloj.ml.interactions-ols :output-file "docs/interactions_ols.html"}]) 20 | 21 | 22 | (note/init :port 5678) 23 | 24 | (run! 25 | 26 | #(do 27 | (println "render ns: " %) 28 | (cli/eval-and-render-a-notespace %)) 29 | nss) 30 | 31 | (require '[nextjournal.clerk :as clerk]) 32 | 33 | (clerk/build! {:paths ["src/scicloj/ml/models.clj"] 34 | :bundle? true 35 | :out-path "output"}) 36 | 37 | (println 38 | (clojure.java.shell/sh "mv" "output/index.html" "docs/userguide-models.html")) 39 | 40 | (clerk/build! {:paths ["src/scicloj/ml/polyglot_kmeans.clj"] 41 | :bundle? 
true 42 | :out-path "output"}) 43 | 44 | (println 45 | (clojure.java.shell/sh "mv" "output/index.html" "docs/polyglot_kmeans.html")) 46 | 47 | (System/exit 0) 48 | -------------------------------------------------------------------------------- /render_titanic.clj: -------------------------------------------------------------------------------- 1 | (ns render-titanic 2 | (:require [notespace.cli :as cli] 3 | [notespace.api :as note])) 4 | 5 | (note/init :port 5678) 6 | 7 | (cli/eval-and-render-a-notespace {:ns 'scicloj.ml.titanic}) 8 | (System/exit 0) 9 | -------------------------------------------------------------------------------- /render_tune-titanic.clj: -------------------------------------------------------------------------------- 1 | (ns render-tune-titanic 2 | (:require [notespace.cli :as cli] 3 | [notespace.api :as note])) 4 | ;; Script: evaluate and render the scicloj.ml.tune-titanic notespace, then exit the JVM. 5 | (note/init :port 5678) 6 | 7 | (cli/eval-and-render-a-notespace {:ns 'scicloj.ml.tune-titanic}) 8 | (System/exit 0) 9 | -------------------------------------------------------------------------------- /resources/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scicloj/scicloj.ml-tutorials/951cde0b8bd0a1b22ec856d28a6122d69d34836f/resources/.keep -------------------------------------------------------------------------------- /resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /src/scicloj/ml/categorical.clj: -------------------------------------------------------------------------------- 1 | (ns scicloj.ml.categorical 2 | 3 | (:require [notespace.api :as note] 4 | [notespace.kinds :as kind])) 5 | 6 | 7 | (comment 8 | (note/init-with-browser) 9 | (note/eval-and-realize-this-notespace)) 10 | 11 | (require
'[scicloj.ml.core :as ml] 12 | '[scicloj.ml.metamorph :as mm] 13 | '[scicloj.ml.dataset :as ds]) 14 | 15 | ["# Handling of categorical variables"] 16 | 17 | ["We keep important information in the metadata of the column, 18 | which can be inspected"] 19 | 20 | ["## categorical -> number"] 21 | ["Categorical columns can be converted to numbers, which is needed by several ML models."] 22 | 23 | (def ds-cat 24 | (ds/dataset {:a [:x :y :x]})) 25 | 26 | ["inspect column metadata and observe datatype :keyword"] 27 | (-> ds-cat :a meta) 28 | 29 | ["convert categorical columns to numeric"] 30 | (def ds-number 31 | (ds/categorical->number 32 | ds-cat :all {} :int)) 33 | 34 | ^kind/dataset 35 | ds-number 36 | 37 | ["metadata has changed as well, int now, and with a lookup table"] 38 | (-> ds-number :a meta) 39 | 40 | 41 | 42 | 43 | ["## categorical -> one-hot"] 44 | ["Categorical columns can be converted to one-hot columns as well, which is needed by several ML models."] 45 | (def ds-one-hot 46 | (ds/categorical->one-hot 47 | ds-cat :all {} :int)) 48 | 49 | ^kind/dataset 50 | ds-one-hot 51 | 52 | 53 | ["we can go back as well"] 54 | (-> ds-one-hot ds/reverse-map-categorical-xforms) 55 | 56 | 57 | ["inspect metadata after conversion"] 58 | (-> ds-one-hot :a-y meta) 59 | 60 | 61 | ["we can go back"] 62 | (-> ds-one-hot ds/reverse-map-categorical-xforms) 63 | -------------------------------------------------------------------------------- /src/scicloj/ml/experiment_tracking.clj: -------------------------------------------------------------------------------- 1 | (ns scicloj.ml.experiment-tracking 2 | (:require 3 | [scicloj.ml.ug-utils :as utils] 4 | [notespace.api :as note] 5 | [notespace.kinds :as kind])) 6 | 7 | (comment 8 | (note/init-with-browser) 9 | (note/eval-this-notespace) 10 | (note/reread-this-notespace) 11 | (note/render-static-html "docs/userguide-experiment-tracking.html") 12 | (note/init)) 13 | 14 | (require '[scicloj.ml.core :as ml] 15 | '[scicloj.ml.metamorph
:as mm] 16 | '[scicloj.ml.dataset :as ds] 17 | '[scicloj.metamorph.ml.tools :refer [dissoc-in]] 18 | '[taoensso.nippy :as nippy]) 19 | 20 | 21 | 22 | (defonce ds (ds/dataset "https://raw.githubusercontent.com/techascent/tech.ml/master/test/data/iris.csv" {:key-fn keyword})) 23 | 24 | (defn create-base-pipe-decl [node-size] 25 | [[:tech.v3.dataset.metamorph/set-inference-target [:species]] 26 | [:tech.v3.dataset.metamorph/categorical->number [:species]] 27 | {:metamorph/id :model} [:scicloj.metamorph.ml/model {:model-type :smile.classification/random-forest 28 | :node-size node-size}]]) 29 | ["## Run evaluation"] 30 | 31 | ["We create 6 pipelines, do a simple :holdout split and keep all results. In order to save memory, 32 | as we needed to do, if we would have thousands or more evaluations, we keep the minimal information."] 33 | 34 | (def pipes (map create-base-pipe-decl [1 5 10 20 50 100])) 35 | (def split (ds/split->seq ds :holdout)) 36 | 37 | 38 | 39 | (def evaluation-result 40 | (ml/evaluate-pipelines 41 | pipes split 42 | ml/classification-accuracy 43 | :accuracy 44 | {:evaluation-handler-fn utils/select-minimal-result 45 | 46 | :return-best-crossvalidation-only false 47 | :return-best-pipeline-only false})) 48 | 49 | ["So we get here 6 evaluation results"] 50 | evaluation-result 51 | 52 | ["simplified as list:"] 53 | 54 | (->> evaluation-result flatten 55 | (map (comp :metric :test-transform))) 56 | 57 | ["## Attach a simple result handler"] 58 | 59 | ["A result handler is a function which takes a full map representing a single evalution result and does what ever is needed. 60 | It can be a function with side effects, and it should return the minimal metric infomation as documented."] 61 | 62 | ["The function will be called for each evalution result, so in our case 6 times. 
We use a simple function for now, 63 | which prints the current declartive pipeline."] 64 | 65 | (def evaluation-result 66 | (ml/evaluate-pipelines 67 | pipes split 68 | ml/classification-accuracy 69 | :accuracy 70 | {;:result-dissoc-in-seq ml/result-dissoc-in-seq--all 71 | ;; :result-dissoc-in-seq [] 72 | :return-best-crossvalidation-only false 73 | :return-best-pipeline-only false 74 | :evaluation-handler-fn 75 | (fn [result] 76 | (clojure.pprint/pprint (:pipe-decl result)) 77 | result)})) 78 | 79 | ["repl output: "] 80 | ^kind/code 81 | [[:tech.v3.dataset.metamorph/set-inference-target [:species]] 82 | [:tech.v3.dataset.metamorph/categorical->number [:species]] 83 | [:scicloj.metamorph.ml/model 84 | {:model-type :smile.classification/random-forest, :node-size 1}]] 85 | [[:tech.v3.dataset.metamorph/set-inference-target [:species]] 86 | [:tech.v3.dataset.metamorph/categorical->number [:species]] 87 | [:scicloj.metamorph.ml/model 88 | {:model-type :smile.classification/random-forest, :node-size 5}]] 89 | [[:tech.v3.dataset.metamorph/set-inference-target [:species]] 90 | [:tech.v3.dataset.metamorph/categorical->number [:species]] 91 | [:scicloj.metamorph.ml/model 92 | {:model-type :smile.classification/random-forest, :node-size 10}]] 93 | [[:tech.v3.dataset.metamorph/set-inference-target [:species]] 94 | [:tech.v3.dataset.metamorph/categorical->number [:species]] 95 | [:scicloj.metamorph.ml/model 96 | {:model-type :smile.classification/random-forest, :node-size 20}]] 97 | ["...."] 98 | 99 | ["The callback function can now implement whatever needed to store the evaluation results, for example on disk. 
100 | "] 101 | 102 | 103 | ["Write results to disk"] 104 | 105 | 106 | (def created-files (atom [])) 107 | (def last-result (atom {})) 108 | 109 | (def evaluation-result 110 | (ml/evaluate-pipelines 111 | pipes split 112 | ml/classification-accuracy 113 | :accuracy 114 | {:evaluation-handler-fn 115 | (fn [result] 116 | ;; example-nippy-handler only builds a handler fn; it must be called with `result` to actually write the nippy file 117 | (let [reduced-result-fn (fn [result] (scicloj.metamorph.ml/reduce-result result 118 | [[:fit-ctx :model :model-data :model-as-bytes] 119 | [:train-transform :ctx :model :model-data :model-as-bytes] 120 | 121 | 122 | [:test-transform :ctx :model :model-data :model-as-bytes]]))] 123 | ((scicloj.metamorph.ml.evaluation-handler/example-nippy-handler 124 | created-files "/tmp" 125 | reduced-result-fn) result) 126 | (reset! last-result (reduced-result-fn result)) 127 | (reduced-result-fn result))) 128 | 129 | 130 | :attach-fn-sources {:ns (find-ns 'scicloj.ml.experiment-tracking) 131 | :pipe-fns-clj-file "src/scicloj/ml/experiment_tracking.clj"}})) 132 | 133 | ["This creates one nippy file for each evaluation, containing all data of the evaluations."] 134 | 135 | (deref last-result) 136 | -------------------------------------------------------------------------------- /src/scicloj/ml/interactions_ols.clj: -------------------------------------------------------------------------------- 1 | (ns scicloj.ml.interactions-ols 2 | (:require 3 | [notespace.api :as note] 4 | [notespace.kinds :as kind] 5 | [notespace.view :as view] 6 | [tablecloth.api :as tc] 7 | [scicloj.ml.core] 8 | [scicloj.sklearn-clj.ml] 9 | [clojure.string :as str] 10 | [scicloj.ml.ug-utils :refer :all] 11 | [clojure.java.io :as io] 12 | [fastmath.stats :as fmstats])) 13 | 14 | (require '[scicloj.ml.core :as ml] 15 | '[scicloj.ml.metamorph :as mm] 16 | '[scicloj.ml.dataset :refer [dataset add-column]] 17 | '[scicloj.ml.dataset :as ds] 18 | '[tech.v3.dataset.math :as std-math] 19 | '[tech.v3.datatype.functional :as dtf] 20 | '[scicloj.metamorph.ml.toydata :as datasets]) 21 | 22 | 23 | (comment 24
| (note/init-with-browser) 25 | (note/eval-this-notespace) 26 | (note/render-static-html "docs/interactions_ols.html")) 27 | 28 | ["This example shows how to do interactions in linear regression with `scicloj.ml`"] 29 | 30 | ["Taking ideas from: " 31 | 32 | "http://www.sthda.com/english/articles/40-regression-analysis/164-interaction-effect-in-multiple-regression-essentials/#comments-list"] 33 | 34 | (defn pp-str [x] 35 | (with-out-str (clojure.pprint/pprint x))) 36 | 37 | ["First we load the data:"] 38 | (def marketing (tc/dataset "data/marketing.csv" {:key-fn keyword})) 39 | 40 | ["## Additive model"] 41 | 42 | ["First we build an additive model, whose model equation is 'sales = b0 + b1 * youtube + b2 * facebook'"] 43 | 44 | (def additive-pipeline 45 | (ml/pipeline 46 | (mm/set-inference-target :sales) 47 | (mm/drop-columns [:newspaper]) 48 | {:metamorph/id :model} 49 | (mm/model {:model-type :smile.regression/ordinary-least-square}))) 50 | 51 | 52 | ["We evaluate it, "] 53 | (def evaluations 54 | (ml/evaluate-pipelines 55 | [additive-pipeline] 56 | (ds/split->seq marketing :holdout) 57 | ml/rmse 58 | :loss 59 | {:other-metrices [{:name :r2 60 | :metric-fn fmstats/r2-determination}]})) 61 | 62 | 63 | ["and print the result:"] 64 | ^kind/hiccup 65 | (text->hiccup 66 | (str 67 | (-> evaluations flatten first :fit-ctx :model ml/thaw-model str))) 68 | 69 | ["We have the following metrics:"] 70 | ["RMSE"] 71 | (-> evaluations flatten first :test-transform :metric) 72 | 73 | ["R2"] 74 | (-> evaluations flatten first :test-transform :other-metrices first :metric) 75 | 76 | ["## Interaction effects"] 77 | ["Now we add interaction effects to it, resulting in this model equation: 'sales = b0 + b1 * youtube + b2 * facebook + b3 * (youtube * facebook)'"] 78 | (def pipe-interaction 79 | (ml/pipeline 80 | (mm/drop-columns [:newspaper]) 81 | (mm/add-column :youtube*facebook (fn [ds] (dtf/* (ds :youtube) (ds :facebook)))) 82 | (mm/set-inference-target :sales) 83 |
{:metamorph/id :model}(mm/model {:model-type :smile.regression/ordinary-least-square}))) 84 | 85 | ["Again we evaluate the model,"] 86 | (def evaluations 87 | (ml/evaluate-pipelines 88 | [pipe-interaction] 89 | (ds/split->seq marketing :holdout) 90 | ml/rmse 91 | :loss 92 | {:other-metrices [{:name :r2 93 | :metric-fn fmstats/r2-determination}]})) 94 | 95 | 96 | ["and print it and the performance metrics:"] 97 | ^kind/hiccup 98 | (text->hiccup 99 | (str 100 | (-> evaluations flatten first :fit-ctx :model ml/thaw-model str))) 101 | 102 | ["As the multiplication of 'youtube * facebook' is as well statistically relevant, it 103 | suggests that there is indeed an interaction between these 2 predictor variables youtube and facebook."] 104 | 105 | ["RMSE"] 106 | (-> evaluations flatten first :test-transform :metric) 107 | 108 | ["R2"] 109 | (-> evaluations flatten first :test-transform :other-metrices first :metric) 110 | 111 | ["RMSE and R2 of the interaction model are slightly better." 112 | "These results suggest that the model with the interaction term is better than the model that contains only main effects. 113 | So, for this specific data, we should go for the model with the interaction term. 114 | "] 115 | -------------------------------------------------------------------------------- /src/scicloj/ml/intro.clj: -------------------------------------------------------------------------------- 1 | (ns scicloj.ml.intro 2 | (:require 3 | [notespace.api :as note] 4 | [notespace.kinds :as kind])) 5 | 6 | 7 | (comment 8 | (note/init-with-browser) 9 | (note/eval-this-notespace) 10 | (note/reread-this-notespace) 11 | (note/render-static-html "docs/userguide-intro.html") 12 | (note/init)) 13 | 14 | 15 | 16 | ["# Clojure and machine learning "] 17 | 18 | ["In order to practice machine learning and create an ecosystem of models around it, 19 | we need 3 components."] 20 | 21 | ["1. A standard way to manage tabular data in memory."] 22 | ["2.
Various machine learning models"] 23 | ["3. A standard way to express steps of data manipulations including train/predict of a model"] 24 | 25 | 26 | ["The Clojure language and core libraries do not have build-in, specific support for this, 27 | so some libraries are required. "] 28 | 29 | ["## Representing training data"] 30 | 31 | ["In the last 2 years the Clojure data science landscape was shaped 32 | by the appearance and maturation of a new library to manage tabular data."] 33 | 34 | ["This library is [tech.ml.dataset](https://github.com/techascent/tech.ml.dataset). 35 | It defines a in-memory tabular data structure and operations on it. It is a remarkable piece of software, 36 | highly optimized and linking in its root to native memory and allow zero-copy integration's outside Clojure."] 37 | 38 | ["As it was organically growing over time, it's API is functional and complete, 39 | but lacks consistency in some parts. 40 | "] 41 | 42 | ["This was addressed by an other library, layering on top of it, called 43 | `tablecloth`. It is available [here](https://github.com/scicloj/tablecloth)"] 44 | 45 | 46 | ["So we have now a very reliable, mature, easy to use library to store and manipulate tabular data, including text."] 47 | 48 | ["## Models"] 49 | ["Models are the core of most machine learning libraries. In scicloj.ml we 50 | rely on an common **abstraction** for all 51 | machine learning models and one Java library [Smile](https://github.com/haifengl/smile) providing models, 52 | which we bridge into Clojure via the abstraction. 53 | So we use Java models internally, but without the need for Java 54 | interop by the user. 
55 | 56 | Documentation for existing models is appearing here: 57 | https://scicloj.github.io/scicloj.ml-tutorials/userguide-models.html 58 | 59 | The abstraction is independent from Smile, so we could makes bridges to other libraries, even in non JVM languages (python, R) 60 | 61 | 62 | "] 63 | 64 | ["## Data transformation pipelines."] 65 | 66 | ["In order to apply machine learning, the data needs to be transformed from its original form , 67 | (often as a data file), into the form required by the model. 68 | Sometimes these transformation are simple, like re-encode data, 69 | sometimes they are very complex. In some contexts this is as well called 70 | feature engineering, which can result in arbitrary 71 | complex dataset transformations. 72 | This transformations are mostly dataset to dataset transformations. 73 | "] 74 | 75 | 76 | ["These pipelines need to be repeatable and self-contained, 77 | as they need to run several times with different data or in variants 78 | for either cross validation or hyper-parameter tuning."] 79 | 80 | ["Clojure and the `tablecloth` library contains already 81 | the concept of running a pipeline"] 82 | 83 | ["These simpler form of a pipeline in Clojure and Tablecloth, can just make use of the fact that all tablecloth 84 | functions take a dataset as the first parameter and return a dataset. 85 | So they can be chained together with the pipe (`->`) operator of Clojure, 86 | example:"] 87 | 88 | (require '[scicloj.ml.dataset :as ds]) 89 | (def my-data 90 | (-> (ds/dataset "https://raw.githubusercontent.com/techascent/tech.ml.dataset/master/test/data/stocks.csv" {:key-fn keyword}) 91 | (ds/select-columns [:symbol :price]) 92 | (ds/add-or-replace-column :symbol (fn [ds] (map clojure.string/lower-case (ds :symbol)))))) 93 | 94 | ["This form of pipeline works to manipulate a dataset, 95 | but has three disadvantages."] 96 | 97 | [" 98 | 1. `->` is a macro, so we cannot compose pipelines easily 99 | 100 | 2. 
We move a dataset object through the pipeline steps, so the only object we have nicely inside the pipeline, accessible to all steps, is the dataset itself. But sometimes we need non-tabular, auxiliary, data to be shared across the pipeline steps, which is not possible with passing a dataset only.Using this simple pipelines, would force to hold auxiliary data in a global state of some form. This makes is very hard to execute pipelines repeatedly, as they are not self-contained. 101 | 102 | 3. These simpler pipeline concepts have no notion of running a pipeline in several modes. In machine learning a pipeline need to behave differently in `fit` and in `transform`. (often called `train` vs `predict`). The models learns from data in the `fit` and it applies what it has learned in `transform`. 103 | "] 104 | 105 | ["Due to this, the idea of the `metamorph` pipeline concept was born."] 106 | ["It addresses all three shortcomings of the simpler pipeline."] 107 | 108 | ["Metamorph is documented here: [metamorph](https://github.com/scicloj/metamorph)"] 109 | 110 | 111 | ["As we see in the metamorph documentation, a pipeline can be composed of functions, which adhere to some simple standards 112 | regarding input and output, as explained here: https://github.com/scicloj/metamorph#compliant-operations"] 113 | 114 | ["Tablecloth contains such operations in the `tablecloth.pipeline` 115 | namespace. All functions of the `tablecloth.api` namespace are replicated 116 | there, but metamorph compliant"] 117 | 118 | ["## scicloj.ml"] 119 | 120 | ["The Clojure ML ecosystem is based on different libraries working 121 | together, as typic and idiomatic in Clojure"] 122 | 123 | ["Some existing libraries are used internally in scicloj.ml, to create a 124 | complete machine learning library, but this is hidden from the user, 125 | and is listed here only for completeness."] 126 | 127 | [" 128 | 1. `tablecloth` - for general manipulation of the dataset 129 | 1. 
`tech.v3.dataset` - to finally prepare a dataset for the machine learning models 130 | 1. `metamorph.ml` - for running pipelines and machine learning core functions 131 | 1. `Smile` Java machine learning library containing lots of models 132 | "] 133 | 134 | 135 | 136 | ["These libraries can be used standalone as well. `tech.ml` was changed in order 137 | to work with scicloj.ml in an incompatible way. 138 | So it is re-released under a new name `metamorph.ml`. 139 | The others can be used by scicloj.ml without any change. 140 | "] 141 | 142 | 143 | ["In order to give easier access to the various libraries, the scicloj.ml 144 | library was created. It unifies the access to the libraries above 145 | in three simple namespaces. 146 | "] 147 | 148 | ["## Machine learning using scicloj.ml"] 149 | 150 | ["The setup for the following code needs a single dependency in deps.edn or project.clj"] 151 | 152 | [" 153 | {:deps { 154 | scicloj/scicloj.ml {:mvn/version \"0.1.0-beta2\"}} } 155 | "] 156 | 157 | 158 | ["This library acts as a facade to the four libraries above, and arranges the functions in a simple way in these namespaces:"] 159 | 160 | ^kind/md-nocode 161 | [" 162 | 163 | | namespace | purpose | 164 | |-----------------------|----------------------------------------------------------| 165 | | scicloj.ml.core | core functionality for machine learning | 166 | | scicloj.ml.dataset | functions to manipulate a dataset | 167 | | scicloj.ml.metamorph | metamorph compliant functions to be used in ml pipelines | 168 | 169 | "] 170 | 171 | 172 | 173 | ["To start we need to require a few namespaces"] 174 | 175 | (require '[scicloj.ml.core :as ml] 176 | '[scicloj.ml.metamorph :as mm] 177 | '[scicloj.ml.dataset :refer [dataset add-column] :as ds]) 178 | 179 | 180 | 181 | ["First we load the data."] 182 | (def titanic-train 183 | (-> 184 | (ds/dataset "https://github.com/scicloj/metamorph-examples/raw/main/data/titanic/train.csv" 185 | {:key-fn keyword 186 | :parser-fn
:string}))) 187 | 188 | 189 | (def titanic-test 190 | (-> 191 | (ds/dataset "https://github.com/scicloj/metamorph-examples/raw/main/data/titanic/test.csv" 192 | {:key-fn keyword 193 | :parser-fn :string}) 194 | (ds/add-column :Survived [""] :cycle))) 195 | 196 | ["Then we define the pipeline and it steps. Inside the pipeline we only use functions 197 | from namespace scicloj.ml.metamorph"] 198 | 199 | ["In scicloj.ml the model functions receives a single dataset, 200 | in which the inference target column is marked as such. The model 201 | to use is a parameter of the `model` function. All built-in 202 | models are listed here: https://scicloj.github.io/scicloj.ml-tutorials/userguide-models.html"] 203 | 204 | 205 | ["In the titanic dataset the `survived` column is a categorical variable. 206 | All target variables for classification need to be transformed first 207 | into numbers, the model can work with. This is done by the function 208 | `categorical->number`. The mapping for this is stored in the dataset on the column 209 | and can be later retrieved to transform the numeric prediction back to its 210 | categorical form."] 211 | 212 | 213 | ["In `scicloj.ml` we pass a whole dataset to a model, and we need to mark 214 | the inference target via function `set-inference-target`. 215 | All other columns are used then as feature columns. 216 | To restric the feature column, I simply remove most of them and keep only one, :Pclass"] 217 | 218 | ["Now the dataset is ready for the model, which is called in the last step. 
219 | It is a logistic regression model, which gets trained to predict column 220 | :Survived from column :Pclass"] 221 | 222 | (def pipe-fn 223 | (ml/pipeline 224 | (mm/select-columns [:Survived :Pclass]) 225 | (mm/categorical->number [:Survived :Pclass]) 226 | (mm/set-inference-target :Survived) 227 | (mm/model {:model-type :smile.classification/logistic-regression}))) 228 | 229 | ["So the `ml/pipeline` function returns a function, which can be called with the ctx map."] 230 | 231 | ["We execute the pipeline in mode :fit, 232 | which will execute all pipeline steps and train as well the model. "] 233 | 234 | (def trained-ctx 235 | (pipe-fn {:metamorph/data titanic-train 236 | :metamorph/mode :fit})) 237 | 238 | ["Now we have a trained model inside trained-ctx. This is a usual map, so can be inspected in the repl. 239 | As the model is based on Smile, the trained-ctx contains the java class representing the trained model. 240 | "] 241 | 242 | ["Now we execute the pipeline in mode :transform, 243 | which will make a prediction "] 244 | 245 | ["We combine the previously obtained context 246 | (which contains the trained model)", 247 | "with the test data and mode :transform"] 248 | 249 | (def test-ctx 250 | (pipe-fn 251 | (assoc trained-ctx 252 | :metamorph/data titanic-test 253 | :metamorph/mode :transform))) 254 | 255 | 256 | 257 | ["Prediction is now part of the ctx obtained. 258 | The internally called `predict` function of `metamorph.ml` returns always the raw prediction of the model, 259 | which we can easily transform into the original categories. 260 | "] 261 | 262 | 263 | 264 | ;; ^kind/dataset 265 | (-> test-ctx :metamorph/data 266 | (ds/column-values->categorical :Survived)) 267 | 268 | 269 | 270 | 271 | 272 | ["This shows the predicted survival. 
"] 273 | 274 | ["The documentation of `mm/model` here https://scicloj.github.io/scicloj.ml/scicloj.ml.metamorph.html#var-model" 275 | "documents this special behavior of the function, which does something different in mode :fit vs mode :transform"] 276 | 277 | ["Any form of feature-engineering takes now the same form. 278 | We will successively 279 | add more and more steps into the pipeline to improve the model."] 280 | 281 | ["This can be build-in functions or custom functions as we see later"] 282 | 283 | 284 | (+ 1 1 (+ 2 2)) 285 | -------------------------------------------------------------------------------- /src/scicloj/ml/models.clj: -------------------------------------------------------------------------------- 1 | ^{:nextjournal.clerk/visibility {:code :hide :result :hide} 2 | :nextjournal.clerk/toc true} 3 | (ns scicloj.ml.models 4 | (:require 5 | [nextjournal.clerk :as clerk] 6 | [scicloj.ml.ug-utils :as utils] 7 | [scicloj.ml.dataset :as ds] 8 | [scicloj.ml.ug-utils-clerk :as utils-clerk] 9 | [tablecloth.api :as tc])) 10 | 11 | ^{:nextjournal.clerk/visibility {:code :hide :result :hide}} 12 | (comment 13 | (clerk/show! "src/scicloj/ml/models.clj") 14 | (clerk/halt!) 15 | (clerk/build-static-app! {:paths ["src/scicloj/ml/models.clj"] 16 | :bundle? false}) 17 | (clerk/clear-cache!) 18 | (clerk/serve! {:browse? true}) 19 | (clerk/serve! {:browse? true :watch-paths ["src/scicloj/ml/"]})) 20 | 21 | ^{:nextjournal.clerk/visibility {:code :hide :result :hide}} 22 | (require '[scicloj.ml.core :as ml] 23 | '[scicloj.ml.metamorph :as mm] 24 | '[tech.v3.datatype.functional :as dtf] 25 | '[scicloj.metamorph.ml.toydata :as datasets]) 26 | 27 | ^{:nextjournal.clerk/visibility {:code :hide :result :hide}} 28 | (clerk/add-viewers! [{:pred tc/dataset? 
29 | :transform-fn (clerk/update-val #(clerk/table {:head (tc/column-names %) 30 | :rows (tc/rows % :as-seq)}))}]) 31 | ^{:nextjournal.clerk/visibility {:code :hide :result :hide}} 32 | (def build-in-models 33 | (->> 34 | (ml/model-definition-names) 35 | (filter #(contains? #{"fastmath.cluster" 36 | "smile.classification" 37 | "smile.regression" 38 | "smile.manifold" 39 | "smile.projections" 40 | "xgboost"} 41 | (namespace %))) 42 | sort)) 43 | 44 | ^{:nextjournal.clerk/visibility {:code :hide :result :hide}} 45 | (defn make-iris-pipeline [model-options] 46 | (ml/pipeline 47 | (mm/set-inference-target :species) 48 | (mm/categorical->number [:species]) 49 | (mm/model model-options))) 50 | 51 | 52 | ;; # Models 53 | 54 | ;; scicloj.ml uses the plugin `scicloj.ml.smile` and 55 | ;; `scicloj.ml.xgboost` by default, 56 | ;; which gives access to " (count build-in-models) " models from the java libraries 57 | ;; [Smile](https://haifengl.github.io/), 58 | ;; [Xgboost](https://xgboost.readthedocs.io/en/latest/jvm/index.html) and [fastmath](https://github.com/generateme/fastmath) 59 | 60 | ;; More models are avilable via other plugins. 61 | 62 | ;; Below is a list of all such models, and which parameter they take. 63 | 64 | ;; All models are available in the same way: 65 | 66 | 67 | 68 | ;; The documentation below points as well to the javadoc and user-guide chapter (for Smile models) 69 | 70 | ;; The full list of build in models is: 71 | ^{:nextjournal.clerk/visibility {:code :hide}} 72 | (clerk/html 73 | [:ul 74 | 75 | (map 76 | #(vector :li [:a {:href (str "#" (str %))} (str %)]) 77 | build-in-models)]) 78 | 79 | 80 | ;; ## Smile classification models 81 | 82 | ^{:nextjournal.clerk/visibility {:code :hide}} 83 | (clerk/html 84 | (utils-clerk/render-key-info :smile.classification/ada-boost)) 85 | ;; In this example we will use the capability of the Ada boost classifier 86 | ;; to give us the importance of variables. 
(def ada-pipe-fn
  ;; The model step gets the explicit id :model (the convention used by the
  ;; other pipelines in this file), so the fitted model can be looked up by
  ;; key instead of by its position among the context values.
  (ml/pipeline
   (mm/set-inference-target :class)
   (mm/categorical->number [:class])
   {:metamorph/id :model}
   (mm/model
    {:model-type :smile.classification/ada-boost})))


;; We run the pipeline in :fit. As we just explore the data,
;; no train/test split is needed.

(def trained-ctx
  (ml/fit-pipe df
               ada-pipe-fn))

;; Next we take the model out of the pipeline:
(def model
  (-> trained-ctx :model ml/thaw-model))
(def ^:nextjournal.clerk/no-cache iris
  ;; The no-cache marker sits on the var symbol: metadata attached to the
  ;; init expression is not visible to Clerk.
  (datasets/iris-ds))
(let [;; two unlabeled points to classify
      unlabeled (ds/dataset
                 {:x1 [3 5]
                  :x2 [7 5]
                  :y [nil nil]})
      ;; reuse the fitted context, swapping in the new data and the mode
      transform-ctx (assoc trained-ctx-knn
                           :metamorph/data unlabeled
                           :metamorph/mode :transform)
      predicted (:metamorph/data (knn-pipe-fn transform-ctx))]
  ;; map the numeric predictions in :y back to their categorical values
  (seq (ds/column-values->categorical predicted :y)))
(def rf-pipe
  (make-iris-pipeline
   {:model-type :smile.classification/random-forest}))

(clerk/vl
 ;; Uses the standardised dataset, like every other feature pair below.
 (utils/surface-plot iris-std [:sepal_length :sepal_width] rf-pipe :smile.classification/random-forest))

(clerk/vl
 (utils/surface-plot iris-std [:sepal_length :petal_length] rf-pipe :smile.classification/random-forest))

(clerk/vl
 (utils/surface-plot iris-std [:sepal_length :petal_width] rf-pipe :smile.classification/random-forest))
(clerk/vl
 (utils/surface-plot iris-std [:sepal_width :petal_length] rf-pipe :smile.classification/random-forest))
(clerk/vl
 (utils/surface-plot iris-std [:sepal_width :petal_width] rf-pipe :smile.classification/random-forest))
(clerk/vl
 (utils/surface-plot iris-std [:petal_length :petal_width] rf-pipe :smile.classification/random-forest))
(def coefs-vs-lambda
  ;; One map per (lambda, predictor) pair:
  ;; {:log-lambda .. :coefficient .. :predictor ..}
  (for [lambda (range 1 100000 100)
        :let [lasso-model (-> (ml/fit-pipe diabetes (make-pipe-fn lambda))
                              :model
                              (ml/thaw-model))
              predictor-names (map
                               #(first (.variables %))
                               (seq (.. lasso-model formula predictors)))
              coefficients (-> lasso-model .coefficients seq)]
        [coefficient predictor] (map vector coefficients predictor-names)]
    (hash-map :log-lambda (dtf/log10 lambda)
              :coefficient coefficient
              :predictor predictor)))
;; This shows that an increasing lambda shrinks the coefficients of more and
;; more variables to zero. This plot can be used as well to find important
;; variables, namely the ones whose coefficients stay non-zero even with a large lambda.
(def diabetes-test-prediction
  ;; Run the fitted pipeline in :transform on the test rows and keep only
  ;; the predicted :disease-progression column.
  (let [transformed-ctx (ml/transform-pipe diabetes-test ols-pipe-fn fitted)
        predicted-ds (:metamorph/data transformed-ctx)]
    (:disease-progression predicted-ds)))
(let [bmi-values (:bmi diabetes-test)
      ;; pair every y value with its bmi and tag it with the series type
      ->points (fn [series series-type]
                 (map #(hash-map :disease-progression %1 :bmi %2 :type series-type)
                      series
                      bmi-values))
      ;; both layers share size and encoding and differ only in data and mark
      ->layer (fn [points mark-type]
                {:data {:values points}

                 :width 500
                 :height 500
                 :mark {:type mark-type}
                 :encoding {:x {:field :bmi :type "quantitative"}
                            :y {:field :disease-progression :type "quantitative"}
                            :color {:field :type}}})]
  (clerk/vl
   {:layer [(->layer (->points diabetes-test-trueth :truth) "circle")
            (->layer (->points diabetes-test-prediction :prediction) "line")]}))
;; This shows nicely that different model types have different capabilities
;; to separate and therefore classify data.


;; ## Ensembles

;; An ensemble combines several pipelines and their predictions
;; and calculates a common prediction.
;; `scicloj.ml` allows creating an ensemble where each model gives a vote,
;; and the majority becomes the final prediction.
;;
(defn make-iris-pipeline-ensemble
  "Returns a metamorph pipeline for the iris data which keeps the columns
  :species, :sepal_length and :sepal_width, marks :species as inference
  target, converts it to a number and adds a model step of the given
  `model-type` under the id :model."
  [model-type]
  (ml/pipeline
   (mm/select-columns [:species :sepal_length :sepal_width])
   (mm/set-inference-target :species)
   (mm/categorical->number [:species])
   {:metamorph/id :model}
   (mm/model
    {:model-type model-type})))
(defn nested-cv
  "Nested cross-validation of `pipelines` on `data`.

  `data` is split into `outer-k` folds. For every outer fold the pipelines
  are evaluated (`ml/evaluate-pipelines` with `metric-fn` and
  `loss-or-accuracy`) on an `inner-k`-fold split of the outer *training*
  data only. The first result of that evaluation is then run in mode
  :transform on the held-out outer test data and scored with `metric-fn`.

  Returns a lazy seq with one map per outer fold:
  {:pipe-fn <selected pipeline fn> :fit-ctx <its fitted context> :metric <outer score>}

  NOTE(review): the target column is hard-coded to :survived, and
  `(-> evaluation first first)` is assumed to be the best-ranked result;
  confirm against `evaluate-pipelines`."
  [data pipelines metric-fn loss-or-accuracy outer-k inner-k]
  ;; https://www.youtube.com/watch?v=DuDtXtKNpZs
  (let [k-folds (tc/split->seq data :kfold {:k outer-k})]
    (for [{train :train test :test} k-folds]
      (let [;; Model selection must only see the outer training split;
            ;; the outer test split is reserved for the final score below.
            inner-k-fold (tc/split->seq train :kfold {:k inner-k})
            evaluation (ml/evaluate-pipelines
                        pipelines
                        inner-k-fold
                        metric-fn
                        loss-or-accuracy)
            selected (-> evaluation first first)
            fit-ctx (:fit-ctx selected)
            best-pipe-fn (:pipe-fn selected)
            transform-ctx (best-pipe-fn
                           (merge fit-ctx
                                  {:metamorph/data test :metamorph/mode :transform}))
            metric (metric-fn
                    (-> transform-ctx :model :scicloj.metamorph.ml/target-ds :survived dt/->vector)
                    (-> transform-ctx :metamorph/data :survived dt/->vector))]
        {:pipe-fn best-pipe-fn
         :fit-ctx fit-ctx
         :metric metric}))))
(def blobs
  ;; Synthetic data generated by sklearn's make_blobs.
  ;; NOTE(review): the Python snippet shown at the top of this notebook does
  ;; not pass n_features, while 50 is requested here; confirm which is intended.
  (make_blobs :n_samples 200
              :n_features 50
              :centers 3
              :cluster_std 2.75
              :random_state 42))

(def scaler (StandardScaler))
(def features (first blobs)) ; first element of the make_blobs result
(def scaled-features (py. scaler fit_transform features))
(def k-means (KMeans
              :init "random"
              :n_clusters 3
              :n_init 10
              :max_iter 300
              :random_state 42))
;; fit is called before inertia_ is read below, mirroring the Python snippet
(py. k-means fit scaled-features)
(py.- k-means inertia_)

(println :python
         (py.- k-means inertia_))
(def fitted-ctx-2
  ;; Same two steps as before (scale all columns, then k-means), with the
  ;; cluster step referenced through the `clustering` alias required above.
  (let [scale-step (mm/std-scale :all {})
        cluster-step (clustering/cluster
                      :k-means
                      [3 300]
                      :cluster)]
    (ml/fit
     data
     scale-step
     {:metamorph/id :k-means}
     cluster-step)))
(def distortion-2
  ;; Same declarative pipeline as `decl-pipe`, written inline. Like
  ;; `distortion-1`, the result is the :distortion entry of the clustering
  ;; info, not the whole info map.
  (->> [[:mm/std-scale :all {}]
        {:metamorph/id :k-means}
        [:scicloj.ml.smile.clustering/cluster
         :k-means
         [3 300]
         :cluster]]
       ml/->pipeline
       (ml/fit-pipe data)
       :k-means
       :info
       :distortion))
(def model-attributes
  (get-in fitted-ctx [:model :model-data :attributes]))

^kind/hiccup-nocode
[:dl (for [[attr-name attr-value] model-attributes]
       ;; one term/definition pair per model attribute
       [:span
        [:dt attr-name]
        [:dd (clojure.pprint/write attr-value :stream nil)]])]
(def house-price
  ;; Load the training csv with kebab-case keyword column names, fill missing
  ;; values in string columns with "NA", then convert the string columns
  ;; to numbers.
  (let [raw (ds/dataset
             "http://d2l-data.s3-accelerate.amazonaws.com/kaggle_house_pred_train.csv" {:key-fn csk/->kebab-case-keyword})
        filled (ds/replace-missing raw :type/string "NA")]
    (ds/categorical->number filled #(ds/select-columns % :type/string))))
(def fit-result
  ;; Fit on the train split, predict the test split, and report the error
  ;; returned by `ml/mae` together with the model explanation ordered by
  ;; :gain, descending.
  (let [fitted-ctx (ml/fit-pipe train-ds pipe-fn)
        predicted-price (-> (ml/transform-pipe test-ds pipe-fn fitted-ctx)
                            :metamorph/data
                            :sale-price)
        actual-price (:sale-price test-ds)
        error (ml/mae predicted-price actual-price)
        gains (ds/order-by (ml/explain (:model fitted-ctx)) :gain :desc)]
    {:error error
     :gains gains}))
;; Pipeline for the house price data: cleaning, scaling, one-hot encoding
;; and a clj-djl model step.
(def pipe
  (ml/pipeline

   (mm/drop-columns ["Id"])
   (mm/set-inference-target "SalePrice")
   ;; missing values: 0 for numerical columns, "None" for all others
   (mm/replace-missing :type/numerical :value 0)
   (mm/replace-missing :!type/numerical :value "None")
   ;; columns selected by `numeric-features` become (x - mean) / standard deviation
   (ml/lift update-columns numeric-features
            #(dfn// (dfn/- % (dfn/mean %))
                    (dfn/standard-deviation %)))
   ;; NOTE(review): :full presumably relies on the :metamorph.ml/full-ds entry
   ;; passed in the context when fitting; confirm.
   (mm/transform-one-hot :!type/numerical :full)
   ;; the target is divided by its mean
   (mm/update-column "SalePrice"
                     #(dfn// % (dfn/mean %)))

   ;; NOTE(review): the inference target is set a second time; presumably an
   ;; earlier step drops the marker. Confirm before removing either call.
   (mm/set-inference-target "SalePrice")

   ;; NOTE(review): 311 in :initial-shape presumably equals the number of
   ;; feature columns after one-hot encoding; TODO confirm.
   (mm/model {:model-type :clj-djl/djl
              :batchsize 64
              :model-spec {:name "mlp" :block-fn net}
              :model-cfg (cfg)
              :initial-shape (nd/shape 1 311)
              :nepoch 1})))
["# Fasttext text classification from DJL"]
:as rnd] 26 | '[scicloj.ml.xgboost]) 27 | 28 | 29 | 30 | ["## Introduction "] 31 | 32 | [" In this example, we will train a model which is able to predict the survival of passengers from the Titanic dataset." 33 | "In a real analysis, this would contain as well explorative analysis of the data, which I will skip here, 34 | as the purpose is to showcase machine learning with scicloj.ml, which is about model evaluation and selection."] 35 | 36 | 37 | 38 | ["### Read data"] 39 | 40 | (def data (ds/dataset "data/titanic/train.csv" {:key-fn csk/->kebab-case-keyword})) 41 | 42 | 43 | 44 | ["Column info:"] 45 | (ds/info data) 46 | 47 | 48 | ["We can explore the association between the categorical columns of the dataset 49 | with the :survived using cramers-v-corrected:"] 50 | (def categorical-feature-columns [:pclass :sex :age :parch 51 | :embarked]) 52 | (map 53 | #(hash-map 54 | % 55 | (stats/cramers-v-corrected 56 | (get data %) 57 | (:survived data))) 58 | categorical-feature-columns) 59 | 60 | ["In this dataset, :sex seems to be the best predictor for survival."] 61 | 62 | ["Association between the select variables:"] 63 | (for [c1 categorical-feature-columns c2 categorical-feature-columns] 64 | {[c1 c2] 65 | (stats/cramers-v-corrected (get data c1) (get data c2))}) 66 | 67 | 68 | ["This shows how much the columns are correlated. "] 69 | 70 | ["## clean some of the features"] 71 | 72 | ["The following functions will be used in the pipeline. They clean the 73 | features to make them better predictors."] 74 | 75 | (defn categorize-cabin [data] 76 | (-> data 77 | (ds/add-or-replace-column 78 | :cabin 79 | (map 80 | #(if (empty?
%)
             :unknown
             (keyword (subs % 0 1)))
          (:cabin data)))))


(defn categorize-age
  "Adds an :age-group column derived from the :age column of `data`.

  Buckets: below 10 -> :child, below 18 -> :teen, below 60 -> :adult,
  60 and above -> :elderly. A value that satisfies none of the
  comparisons falls back to :other."
  [data]
  (-> data
      (ds/add-or-replace-column
       :age-group
       (map
        #(cond
           (< % 10) :child
           (< % 18) :teen
           (< % 60) :adult
           ;; `>=` rather than `>`: an age of exactly 60 used to match no
           ;; branch and was labelled :other.
           (>= % 60) :elderly
           :else :other)
        (:age data)))))

["We want to create a new column :title which might help in the score.
This is an example of custom function, which creates a new column from existing columns,
which is a typical case of feature engineering."]

(defn name->title
  "Replaces the :name column of `dataset` by a :title column.

  For every name the text before the first \".\" is taken, and of that the
  part after the last \",\" (trimmed) is used as the title.
  Presumably names look like \"Surname, Title. Given names\" -- verify
  against the input data."
  [dataset]
  (-> dataset
      (ds/add-or-replace-column
       :title
       (map
        #(-> %
             (str/split #"\.")
             first
             (str/split #"\,")
             last
             str/trim)
        ;; Read the names from the dataset passed in, not from the global
        ;; `data` var: inside the pipeline this function is called with
        ;; splits and with test data, whose rows differ from `data`.
        (:name dataset)))
      (ds/drop-columns :name)))

;; Lookup table collapsing the raw title strings into two groups, :a and :b.
;; A title that is missing from this table is mapped to nil by
;; `categorize-title`.
(def title-map
  {"Major" :a
   "Col" :a
   "Rev" :a
   "Ms" :b
   "Miss" :b
   "Jonkheer" :a
   "Don" :a
   "Mlle" :b
   "Mr" :a
   "Master" :a
   "Capt" :a
   "Mrs" :b
   "Lady" :b
   "Sir" :a
   "Dr" :a
   "the Countess" :b
   "Mme" :b})

(defn categorize-title
  "Replaces the :title column of `data` by its `title-map` lookup result."
  [data]
  (-> data
      (ds/add-or-replace-column
       :title
       (map title-map (:title data)))))

["The final pipeline contains the functions we did before."]


;; => _unnamed [2 1]:
;; | :a |
;; |----|
;; | |
;; | |

(def pipeline-fn
  (ml/pipeline
   (mm/replace-missing :embarked :value "S")
   (mm/replace-missing :age :value tech.v3.datatype.functional/mean)
   (mm/update-column :parch str)
   (ml/lift categorize-age)
   (ml/lift name->title)
   (ml/lift categorize-title)
   (ml/lift categorize-cabin)
   (mm/select-columns [:age-group
                       :cabin
                       :embarked
                       :fare
                       :parch
                       :pclass
                       :sex
                       :survived
                       :title])

   (fn [ctx]
     (assoc ctx :categorical-ds
(:metamorph/data ctx))) 176 | 177 | 178 | (mm/categorical->number [:survived :pclass :sex :embarked 179 | :title :age-group :cabin :parch] {} :int64) 180 | 181 | (mm/set-inference-target :survived))) 182 | 183 | 184 | ["Transformed data"] 185 | (-> 186 | (pipeline-fn {:metamorph/data data :metamorph/mode :fit}) 187 | :metamorph/data) 188 | 189 | 190 | ["The following splits the dataset in three pieces, 191 | train, val and test to predict on later. 192 | "] 193 | 194 | 195 | 196 | 197 | 198 | (def ds-split (first (ds/split->seq data :holdout {:ratio [0.8 0.2] 199 | :split-names [:train-val :test]}))) 200 | 201 | 202 | ["Create a sequence of train/test (k-fold with k=10) splits used to evaluate the pipeline."] 203 | (def train-val-splits 204 | (ds/split->seq 205 | (:train-val ds-split) 206 | :kfold 207 | {:k 10})) 208 | 209 | 210 | 211 | 212 | ["The full pipeline definition including the random forrest model."] 213 | 214 | (def full-pipeline-fn 215 | (ml/pipeline 216 | pipeline-fn 217 | ;; we overwrite the id, so the model function will store 218 | ;; it's output (the model) in the pipeline ctx under key :model 219 | {:metamorph/id :model} 220 | (mm/model {:model-type :smile.classification/random-forest}))) 221 | 222 | 223 | 224 | 225 | 226 | ["Evaluate the (single) pipeline function using the train/test split"] 227 | (def evaluations 228 | (ml/evaluate-pipelines 229 | [full-pipeline-fn] 230 | train-val-splits 231 | ml/classification-accuracy 232 | :accuracy)) 233 | 234 | 235 | ["The default k-fold splits makes 10 folds, 236 | so we train 10 models, each having its own loss."] 237 | 238 | ["The `evaluate-pipelines` fn averages the models per pipe-fn, 239 | and returns the best. 
So we get a single model back, as we only have one pipe fn"]

["Often we consider the model with the lowest loss to be the best."]

["Return a single model only (as a list of 1) , namely the best over all
pipeline functions
and all cross validations is the default behavoiur, but can be changed
with the `tune options`."]

["They controll as well which information is returned."]

["`tech.ml` stores the models in the context in a serialzed form,
and the function `thaw-model` can be used to get the original model back.
This is a Java class in the case of
model :smile.classification/random.forest, but this depends on the
which `model` function is in the pipeline"]

["We can get for example, the models like this:"]

;; One map per evaluation result, holding the thawed model, the metric taken
;; from :test-transform and the fitted context, ordered from the highest
;; metric to the lowest.
(def models
  (->> evaluations
       flatten
       (map
        #(hash-map :model (ml/thaw-model (get-in % [:fit-ctx :model]))
                   :metric ((comp :metric :test-transform) %)
                   :fit-ctx (:fit-ctx %)))

       ;; The maps built above have no :mean key, so sorting by :mean left
       ;; the order untouched; :metric is the key that carries the accuracy.
       (sort-by :metric)
       reverse))


["The accuracy of the best trained model is:"]
(-> models first :metric)

["The one with the highest accuracy is then:"]
(-> models first :model)


["We can get the predictions on new-data, which for classification contain as well
the posterior probabilities per class."]

["We do this by running the pipeline again, this time with new data and merging
:mode transform"]

;; Re-run the full pipeline in :transform mode on the held-out :test split,
;; starting from the fitted context of the first (best) model.
(def predictions
  (->
   (full-pipeline-fn
    (assoc
     (:fit-ctx (first models))
     :metamorph/data (:test ds-split)
     :metamorph/mode :transform))
   :metamorph/data))

^kind/dataset
predictions


["Out of the predictions and the truth, we can construct the
confusion matrix."]

(def trueth
  (->
   (full-pipeline-fn {:metamorph/data (:test ds-split) :metamorph/mode
:fit}) 303 | :metamorph/data 304 | tech.v3.dataset.modelling/labels)) 305 | 306 | ^kind/dataset 307 | (-> 308 | (ml/confusion-map (:survived predictions) 309 | (:survived trueth) 310 | :none) 311 | (ml/confusion-map->ds)) 312 | 313 | ["### Hyper parameter tuning"] 314 | 315 | ["This defines a pipeline with options. The options gets passed to the model function, 316 | so become hyper-parameters of the model. 317 | 318 | The `use-age?` options is used to make a conditional pipeline. As the use-age? variable becomes part of the grid to search in, 319 | we tune it as well. 320 | This is an example how pipeline-options can be grid searched in the same way then hyper-parameters of the model. 321 | 322 | "] 323 | (defn make-pipeline-fn [options] 324 | 325 | (ml/pipeline 326 | pipeline-fn 327 | {:metamorph/id :model} 328 | (mm/model 329 | (merge options 330 | {:model-type :smile.classification/random-forest})))) 331 | 332 | ["Use sobol optimization, to find som grid points, 333 | which cover in a smart way the hyper-parameter space."] 334 | 335 | (def search-grid 336 | (->> 337 | (ml/sobol-gridsearch {:trees (ml/linear 100 500 10) 338 | :mtry (ml/categorical [0 2 4]) 339 | :split-rule (ml/categorical [:gini :entropy]) 340 | :max-depth (ml/linear 1 50 10) 341 | :node-size (ml/linear 1 10 10)}) 342 | 343 | (take 500))) 344 | 345 | 346 | ["Generate the pipeline-fns we want to evaluate."] 347 | (def pipeline-fns (map make-pipeline-fn search-grid)) 348 | 349 | (defn xgboost-pipe [opts] 350 | (ml/pipeline 351 | pipeline-fn 352 | {:metamorph/id :model} 353 | (mm/model 354 | (merge opts 355 | {:model-type :xgboost/classification})))) 356 | 357 | (def xgboost-pipes 358 | (->> 359 | (ml/sobol-gridsearch 360 | (ml/hyperparameters :xgboost/classification)) 361 | (take 500) 362 | (map xgboost-pipe))) 363 | 364 | 365 | ;; (ml/fit-pipe (:train (first train-val-splits)) xgboost-pipe) 366 | 367 | ["Evaluate all pipelines and keep results"] 368 | (def evaluations 369 | 370 | 
(ml/evaluate-pipelines 371 | (take 10 372 | (concat xgboost-pipes xgboost-pipes)) 373 | train-val-splits 374 | ml/classification-accuracy 375 | :accuracy 376 | {:return-best-pipeline-only false 377 | :return-best-crossvalidation-only false 378 | ;; :evaluation-handler-fn (fn [m] 379 | ;; (println (:metric m))) 380 | 381 | 382 | :map-fn :map})) 383 | 384 | 385 | 386 | 387 | 388 | 389 | 390 | ["Get the key information from the evaluations and sort by the metric function used, 391 | accuracy here."] 392 | 393 | (def models 394 | (->> evaluations 395 | flatten 396 | (map 397 | #(assoc 398 | (select-keys % [:test-transform :fit-ctx :pipe-fn]) 399 | 400 | :model (ml/thaw-model (get-in % [:fit-ctx :model])))) 401 | (sort-by (comp :metric :test-transform)) 402 | reverse)) 403 | 404 | 405 | 406 | 407 | ["As we did several pipelines and several x-fold cross validation, we have quite some models trained in total "] 408 | (count models) 409 | 410 | ["As we sorted by mean accuracy, the first evaluation result is the best model,"] 411 | (def best-model (first models)) 412 | 413 | ["which is: "] 414 | (:model best-model) 415 | 416 | ["with a mean accuracy of " (-> best-model :test-transform :mean)] 417 | ["and a accuracy of " (-> best-model :test-transform :metric)] 418 | 419 | 420 | (println "mean acc: " (-> best-model :test-transform :mean)) 421 | (println "acc: " (-> best-model :test-transform :metric)) 422 | 423 | 424 | ["using options: "] 425 | (-> best-model :fit-ctx :model :options) 426 | (clojure.pprint/pprint (-> best-model :fit-ctx :model :options)) 427 | 428 | (def test-data (ds/dataset "data/titanic/test.csv" 429 | {:key-fn csk/->kebab-case-keyword})) 430 | 431 | 432 | 433 | (def predition-on-test 434 | (full-pipeline-fn 435 | (assoc (:fit-ctx best-model) 436 | :metamorph/data (ds/add-column test-data :survived nil) 437 | :metamorph/mode :transform))) 438 | 439 | 440 | (def prediction-ds 441 | (-> 442 | (predition-on-test :metamorph/data) 443 | (ds/add-column 
:passenger-id (:passenger-id test-data)) 444 | (ds/convert-types [:survived] :int) 445 | (ds/select-columns [:passenger-id :survived 0 1]))) 446 | 447 | ^kind/dataset 448 | prediction-ds 449 | 450 | 451 | 452 | 453 | 454 | ["# Create Subimssion file to Kaggle"] 455 | 456 | (def submission-ds 457 | (-> prediction-ds 458 | (ds/select-columns [:passenger-id :survived]) 459 | (ds/rename-columns {:passenger-id "PassengerId" 460 | :survived "Survived"}))) 461 | 462 | (ds/write-csv! submission-ds "submission.csv") 463 | 464 | 465 | ["### Learning curve"] 466 | 467 | 468 | 469 | (def training-curve-splits 470 | (map 471 | #(hash-map :train (ds/head (:train-val ds-split) %) 472 | :test (:test ds-split)) 473 | (range 5 (ds/row-count (:train-val ds-split)) 10))) 474 | 475 | 476 | 477 | (def training-curve-evaluations 478 | (ml/evaluate-pipelines [(:pipe-fn (first models))] 479 | training-curve-splits 480 | ml/classification-accuracy 481 | :accuracy 482 | {:map-fn :map 483 | :return-best-pipeline-only false 484 | :return-best-crossvalidation-only false 485 | :evaluation-handler-fn identity})) 486 | 487 | (def train-counts 488 | (->> training-curve-evaluations flatten (map #(-> % :fit-ctx :metamorph/data ds/row-count)))) 489 | 490 | 491 | 492 | (def test-metrices 493 | (->> training-curve-evaluations flatten (map #(-> % :test-transform :metric)))) 494 | 495 | (def train-metrices 496 | (->> training-curve-evaluations flatten (map #(-> % :train-transform :metric)))) 497 | 498 | (def traing-curve-plot-data 499 | (reverse 500 | (sort-by :metric 501 | (flatten 502 | (map 503 | #(vector (zipmap [:count :metric :type] [%1 %2 :test]) 504 | (zipmap [:count :metric :type] [%1 %3 :train])) 505 | train-counts 506 | test-metrices 507 | train-metrices))))) 508 | 509 | 510 | ^kind/vega 511 | { 512 | :data {:values traing-curve-plot-data} 513 | 514 | :width 500 515 | :height 500 516 | :mark {:type "line"} 517 | :encoding {:x {:field :count :type "quantitative"} 518 | :y {:field :metric :type 
"quantitative"} 519 | :color {:field :type}}} 520 | 521 | 522 | 523 | 524 | 525 | (comment 526 | (->> 527 | (map 528 | #(hash-map :test-metric %1 529 | :train-metric %2 530 | :better? (if (> %1 %2) :test :train)) 531 | (->> training-curve-evaluations flatten (map :metric)) 532 | (->> training-curve-evaluations flatten (map #(get-in % [:train-prediction :metric])))) 533 | (map :better?) 534 | frequencies) 535 | 536 | (println 537 | (-> (ds/dataset {:x ["A" "B" "C" "D" "E" "F"] :y (range)}) 538 | (ds/categorical->one-hot [:x] {} :int) 539 | (ds/set-inference-target :y) 540 | (scicloj.metamorph.ml/train {:model-type :smile.regression/ordinary-least-square}) 541 | ml/thaw-model))) 542 | 543 | 544 | -------------------------------------------------------------------------------- /src/scicloj/ml/transformers.clj: -------------------------------------------------------------------------------- 1 | (ns scicloj.ml.transformers 2 | (:require 3 | [notespace.api :as note] 4 | [notespace.kinds :as kind] 5 | [scicloj.ml.metamorph :as mm])) 6 | 7 | 8 | 9 | (comment 10 | (note/init-with-browser) 11 | (note/eval-this-notespace) 12 | (note/render-static-html "docs/userguide-transformers.html")) 13 | 14 | 15 | (require '[scicloj.ml.core :as ml] 16 | '[scicloj.ml.dataset :as ds] 17 | '[scicloj.ml.metamorph :as mm]) 18 | 19 | 20 | 21 | ^kind/hidden 22 | (defn docu-fn [v] 23 | (let [m (meta v)] 24 | (kind/override 25 | [ 26 | (str "## Transformer " "**" (:name m) "**") 27 | "----------------------------------------------------------" 28 | "__Clojure doc__:\n" 29 | (:doc m) 30 | "----------------------------------------------------------"] 31 | 32 | kind/md-nocode))) 33 | 34 | 35 | 36 | 37 | (docu-fn (var mm/count-vectorize)) 38 | 39 | ["In the following we transform the text given in a dataset into a 40 | map of token counts applying some default text normalization."] 41 | (def data (ds/dataset {:text ["Hello Clojure world, hello ML word !" 
42 | "ML with Clojure is fun"]})) 43 | 44 | 45 | ^kind/dataset-grid 46 | data 47 | 48 | ["_"] 49 | 50 | (def fitted-ctx 51 | (ml/fit data 52 | (mm/count-vectorize :text :bow))) 53 | 54 | 55 | 56 | fitted-ctx 57 | 58 | (def bow-ds 59 | (:metamorph/data fitted-ctx)) 60 | 61 | ^kind/dataset 62 | bow-ds 63 | 64 | 65 | ["A custom tokenizer can be specified by either passing options to 66 | `scicloj.ml.smile.nlp/default-tokenize` "] 67 | 68 | 69 | (def fitted-ctx 70 | (ml/fit 71 | data 72 | (mm/count-vectorize :text :bow {:stopwords ["clojure"] 73 | :stemmer :none}))) 74 | 75 | 76 | fitted-ctx 77 | 78 | ["or passing in a implementation of a tokenizer function"] 79 | 80 | (def fitted-ctx 81 | (ml/fit 82 | data 83 | (mm/count-vectorize 84 | :text :bow 85 | {:text->bow-fn (fn [text options] 86 | {:a 1 :b 2})}))) 87 | 88 | fitted-ctx 89 | 90 | 91 | 92 | (docu-fn (var mm/bow->SparseArray)) 93 | ["Now we convert the bag-of-words map to a sparse array of class 94 | `smile.util.SparseArray` 95 | 96 | "] 97 | (def ctx-sparse 98 | (ml/fit 99 | bow-ds 100 | (mm/bow->SparseArray :bow :sparse))) 101 | 102 | ctx-sparse 103 | 104 | 105 | ^kind/dataset 106 | (:metamorph/data ctx-sparse) 107 | 108 | ["The SparseArray instances look like this:"] 109 | (zipmap 110 | (:text bow-ds) 111 | (map seq 112 | (-> ctx-sparse :metamorph/data :sparse))) 113 | 114 | (docu-fn (var mm/bow->sparse-array)) 115 | ["Now we convert the bag-of-words map to a sparse array of class 116 | `java primitive int array` 117 | "] 118 | (def ctx-sparse 119 | (ml/fit 120 | bow-ds 121 | (mm/bow->sparse-array :bow :sparse))) 122 | 123 | ctx-sparse 124 | 125 | ["We see as well the sparse representation as indices against the vocabulary 126 | of the non-zero counts."] 127 | 128 | (zipmap 129 | (:text bow-ds) 130 | (map seq 131 | (-> ctx-sparse :metamorph/data :sparse))) 132 | 133 | 134 | 135 | 136 | ["In both ->sparse function we can control the vocabulary via 137 | the option to pass in a different / custom functions which 
creates 138 | the vocabulary from the bow maps."] 139 | 140 | (def ctx-sparse 141 | (ml/fit 142 | bow-ds 143 | (mm/bow->SparseArray 144 | :bow :sparse 145 | {:create-vocab-fn 146 | (fn [bow] (scicloj.ml.smile.nlp/->vocabulary-top-n bow 1))}))) 147 | 148 | 149 | ctx-sparse 150 | 151 | (def ctx-sparse 152 | (ml/fit 153 | bow-ds 154 | (mm/bow->SparseArray 155 | :bow :sparse 156 | {:create-vocab-fn 157 | (fn [_] 158 | ["hello" "fun"])}))) 159 | 160 | 161 | ctx-sparse 162 | 163 | 164 | (docu-fn (var mm/bow->tfidf)) 165 | ["Here we calculate the tf-idf score from the bag of words:"] 166 | 167 | ^kind/dataset 168 | (ml/pipe-it 169 | bow-ds 170 | (mm/bow->tfidf :bow :tfidf {})) 171 | 172 | 173 | 174 | (docu-fn (var mm/model)) 175 | ["The `model` transformer allows to execute all machine learning models.clj 176 | which register themself inside the `metamorph.ml` system via the function 177 | `scicloj.metamorph.ml/define-model!`. 178 | The build in models are listed here: 179 | https://scicloj.github.io/scicloj.ml/userguide-models.html 180 | 181 | "] 182 | 183 | ["We use the Iris data for this example:"] 184 | 185 | (def iris 186 | (-> 187 | (ds/dataset 188 | "https://raw.githubusercontent.com/scicloj/metamorph.ml/main/test/data/iris.csv" {:key-fn keyword}) 189 | (tech.v3.dataset.print/print-range 5))) 190 | 191 | 192 | 193 | ^kind/dataset 194 | iris 195 | 196 | (def train-test 197 | (ds/train-test-split iris)) 198 | 199 | ["The pipeline consists in specifying the inference target, 200 | transform target to categorical and the model function"] 201 | (def pipe-fn 202 | (ml/pipeline 203 | (mm/set-inference-target :species) 204 | (mm/categorical->number [:species]) 205 | {:metamorph/id :model} 206 | (mm/model {:model-type :smile.classification/logistic-regression}))) 207 | 208 | ["First we run the training "] 209 | (def fitted-ctx 210 | (ml/fit 211 | (:train-ds train-test) 212 | pipe-fn)) 213 | 214 | 215 | ^kind/hidden 216 | (defn dissoc-in [m ks] 217 | (let [parent-path 
(butlast ks) 218 | leaf-key (last ks)] 219 | (if (= (count ks) 1) 220 | (dissoc m leaf-key) 221 | (update-in m parent-path dissoc leaf-key)))) 222 | 223 | (dissoc-in fitted-ctx [:model :model-data]) 224 | 225 | ["and then prediction on test"] 226 | 227 | (def transformed-ctx 228 | (ml/transform-pipe (:test-ds train-test) pipe-fn fitted-ctx)) 229 | 230 | (-> transformed-ctx 231 | (dissoc-in [:model :model-data]) 232 | (update-in [:metamorph/data ] #(tech.v3.dataset.print/print-range % 5))) 233 | 234 | 235 | ["and we get the predictions: "] 236 | ^kind/dataset 237 | (-> transformed-ctx 238 | :metamorph/data 239 | (ds/reverse-map-categorical-xforms) 240 | (ds/select-columns :species) 241 | (ds/head)) 242 | 243 | 244 | (docu-fn (var mm/std-scale)) 245 | ["We can use the std-scale transformer to center and scale data."] 246 | ["Lets take some example data:"] 247 | (def data 248 | (ds/dataset 249 | [ 250 | [100 0.001] 251 | [8 0.05] 252 | [50 0.005] 253 | [88 0.07] 254 | [4 0.1]] 255 | {:layout :as-row})) 256 | 257 | ^kind/dataset 258 | data 259 | 260 | ["Now we can center each column arround 0 and scale 261 | it by the standard deviation of the column"] 262 | 263 | ^kind/dataset 264 | (ml/pipe-it 265 | data 266 | (mm/std-scale [0 1] {})) 267 | 268 | 269 | (docu-fn (var mm/min-max-scale)) 270 | 271 | ["The min-max scaler scales columns in a specified interval, 272 | by default from -0.5 to 0.5"] 273 | 274 | ^kind/dataset 275 | (ml/pipe-it 276 | data 277 | (mm/min-max-scale [0 1] {})) 278 | 279 | (docu-fn (var mm/reduce-dimensions)) 280 | 281 | ["#### PCA example"] 282 | 283 | ["In this example we run PCA on some data."] 284 | 285 | (require '[scicloj.metamorph.ml.toydata :as toydata]) 286 | 287 | ["We use the sonar dataset which has 60 columns of quantitative data, 288 | which are certain measurements from a sonar device. 
289 | The original purpose of the dataset is to learn to detect rock vs metal 290 | from the measurements"] 291 | (def sonar 292 | (toydata/sonar-ds)) 293 | 294 | ^kind/dataset 295 | sonar 296 | 297 | (def col-names (map #(keyword (str "x" %)) 298 | (range 60))) 299 | 300 | ["First we create and run a pipeline which does the PCA." 301 | "In this pipeline we do not fix the number of columns, as we want to 302 | plot the result for all numbers of components (up to 60) "] 303 | 304 | (def fitted-ctx 305 | (ml/fit 306 | sonar 307 | (mm/reduce-dimensions :pca-cov 60 308 | col-names 309 | {}))) 310 | 311 | 312 | ["The next function transforms the result from the fitted pipeline 313 | into vega lite compatible format for plotting"] 314 | ["It accesses the underlying Smile Java object to get the data on 315 | the cumulative variance for each PCA component."] 316 | (defn create-plot-data [ctx] 317 | (map 318 | #(hash-map :principal-component %1 319 | :cumulative-variance %2) 320 | (range) 321 | (-> ctx vals (nth 2) :fit-result :model bean :cumulativeVarianceProportion))) 322 | 323 | ["Next we plot the cumulative variance over the component index:"] 324 | ^kind/vega 325 | {:$schema "https://vega.github.io/schema/vega-lite/v5.json" 326 | :width 850 327 | :data {:values 328 | (create-plot-data fitted-ctx)} 329 | :mark "line" , 330 | :encoding 331 | {:x {:field :principal-component, :type "nominal"}, 332 | :y {:field :cumulative-variance, :type "quantitative"}}} 333 | 334 | ["From the plot we see, that transforming the data via PCA and reducing 335 | it from 60 dimensions to about 25 would still preserve the full variance."] 336 | ["Looking at this plot, we could now make a decision, how many dimensions 337 | to keep."] 338 | ["We could for example decide, that keeping 60 % of the variance 339 | is enough, which would result in keeping the first 2 dimensions."] 340 | 341 | ["So our pipeline becomes:"] 342 | 343 | 344 | (def fitted-ctx 345 | (ml/fit 346 | sonar 347 | 
(mm/reduce-dimensions :pca-cov 2 348 | col-names 349 | {}) 350 | 351 | (mm/select-columns [:material "pca-cov-0" "pca-cov-1"]) 352 | (mm/shuffle))) 353 | 354 | ^kind/dataset 355 | (:metamorph/data fitted-ctx) 356 | 357 | ["As the data is now 2-dimensional, it is easy to plot:"] 358 | 359 | (def scatter-plot-data 360 | (-> fitted-ctx 361 | :metamorph/data 362 | (ds/select-columns [:material "pca-cov-0" "pca-cov-1"]) 363 | (ds/rows :as-maps))) 364 | 365 | 366 | ^kind/vega 367 | {:$schema "https://vega.github.io/schema/vega-lite/v5.json" 368 | :data {:values scatter-plot-data} 369 | :width 500 370 | :height 500 371 | 372 | :mark :circle 373 | :encoding 374 | {:x {:field "pca-cov-0" :type "quantitative"} 375 | :y {:field "pca-cov-1" :type "quantitative"} 376 | :color {:field :material}}} 377 | 378 | ["The plot shows that the reduction to 2 dimensions does not create 379 | linear separable areas of `M` and `R`. So a linear model will not be 380 | able to predict well the material from the 2 PCA components."] 381 | 382 | ["It even seems, that the reduction to 2 dimensions removes 383 | too much information for predicting of the material for any type of model."] 384 | -------------------------------------------------------------------------------- /src/scicloj/ml/tune_titanic.clj: -------------------------------------------------------------------------------- 1 | (ns scicloj.ml.tune-titanic 2 | (:require 3 | [notespace.api :as note] 4 | [notespace.kinds :as kind])) 5 | 6 | (comment 7 | (note/init-with-browser) 8 | (note/eval-this-notespace) 9 | (note/reread-this-notespace) 10 | (note/render-static-html "docs/tune-titanic.html") 11 | (note/init)) 12 | 13 | 14 | ["This is the Clojure version of https://www.moritzkoerber.com/posts/preprocessing-hyperparameters/"] 15 | 16 | (require '[scicloj.ml.dataset :as ds] 17 | '[scicloj.ml.core :as ml] 18 | '[scicloj.ml.metamorph :as mm] 19 | '[camel-snake-kebab.core :as csk] 20 | '[scicloj.metamorph.ml.evaluation-handler :as eval-hn] 
21 | '[tech.v3.datatype.functional :as dtfunc]) 22 | 23 | (def categorical-features [:pclass :sex :embarked]) 24 | (def numeric-features [:age :parch :fare]) 25 | 26 | (defn map->vec [m] (flatten (into [] m))) 27 | 28 | ["Preproceesing Pipelines including feature engineering"] 29 | 30 | (def data 31 | (-> (ds/dataset "data/titanic/train.csv" 32 | {:key-fn csk/->kebab-case-keyword}) 33 | (ds/select-columns (concat categorical-features numeric-features [:survived])) 34 | (ds/replace-missing categorical-features :value "missing") 35 | (ds/categorical->one-hot categorical-features))) 36 | 37 | 38 | (defn replace-missing [options] 39 | (fn [ctx] 40 | ( (apply mm/replace-missing numeric-features (map->vec (:replace-missing-options options))) ctx))) 41 | 42 | (defn maybe-std-scale [options] 43 | (fn [ctx] 44 | (if (-> options :scaling-options :scale?) 45 | ((mm/std-scale numeric-features {}) 46 | ctx) 47 | ctx))) 48 | 49 | (defn assoc-pipe-opts [options] 50 | (fn [ctx] 51 | (assoc ctx :pipe-options options))) 52 | 53 | 54 | (defn make-decl-pipeline[model-type options] 55 | [[::assoc-pipe-opts options] 56 | [::replace-missing options] 57 | [:mm/categorical->number [:survived ] {} :int64] 58 | [::maybe-std-scale options] 59 | [:mm/set-inference-target :survived] 60 | {:metamorph/id :model} [:mm/model (merge (:model-options options) {:model-type model-type})]]) 61 | 62 | 63 | 64 | 65 | (def logistic-regression-pipelines 66 | (map 67 | #(make-decl-pipeline :smile.classification/logistic-regression %) 68 | (ml/sobol-gridsearch {:scaling-options {:scale? (ml/categorical [true false])} 69 | :replace-missing-options {:value (ml/categorical [dtfunc/mean dtfunc/median])} 70 | :model-options {:lambda (ml/categorical [0.1 0.2 0.5 0.7 1]) 71 | :tolerance (ml/categorical [0.1 0.01 0.001 0.0001])}}))) 72 | 73 | (def random-forrest-pipelines 74 | (map 75 | #(make-decl-pipeline :smile.classification/random-forest %) 76 | (ml/sobol-gridsearch {:scaling-options {:scale? 
(ml/categorical [true false])}
                         :replace-missing-options {:value (ml/categorical [dtfunc/mean dtfunc/median])}
                         :model-options {:trees (ml/categorical [5 50 100 250])
                                         :max-depth (ml/categorical [5 8 10])}})))

;; NOTE(review): only the random forest pipelines are evaluated here;
;; `logistic-regression-pipelines` is defined above but not included --
;; confirm whether that is intended.
(def all-pipelines (concat random-forrest-pipelines))



;; The declarative pipelines turned into pipeline functions.
(def pipe-fns
  (mapv ml/->pipeline all-pipelines))

["Simple split"]
(def splits (ds/split->seq data :holdout {:ratio 0.8}))
(def train-ds ((first splits) :train))
(def holdout-ds ((first splits) :test))

["Tune hyperparameter by evaluating all pipelines/models "]

;; Was `[atom []]`, i.e. a vector holding the `atom` function and an empty
;; vector; an atom wrapping an empty vector is what was evidently meant.
(def files (atom []))

;; Evaluates every pipeline on a k-fold split of the training data and keeps
;; only the best pipeline / best cross-validation result.
(def best-evaluation
  (ml/evaluate-pipelines
   all-pipelines
   ;; Options are passed as a map, matching the `{:k 5}` call further down
   ;; in this file; the bare `5` used before was not an options map.
   (ds/split->seq train-ds :kfold {:k 5})
   ml/classification-accuracy
   :accuracy
   {;; :attach-fn-sources {:ns (find-ns 'scicloj.ml.tune-titanic)
    ;;                     :pipe-fns-clj-file "src/scicloj/ml/tune_titanic.clj"}
    :return-best-crossvalidation-only true
    :return-best-pipeline-only true}))

(def best-accuracy (-> best-evaluation first first :train-transform :metric))


(def best-options (-> best-evaluation first first :fit-ctx :pipe-options))

(def best-pipe-fn
  (-> best-evaluation first first :pipe-fn))

best-pipe-fn

(def best-pipe-decl
  (-> best-evaluation first first :pipe-decl))






["## All information on best found pipeline"]

["best accuracy found on train data: " (-> best-evaluation first first :train-transform :metric)]
["best accuracy found on test data: " (-> best-evaluation first first :test-transform :metric)]

["best options (found on train data): "]
best-options

["best pipeline (found on train data)"]
best-pipe-decl

["pipe sources information"]
(->
 (ml/get-nice-source-info best-pipe-decl
                          (find-ns 'scicloj.ml.tune-titanic)
                          (-> #'data meta
:file)) 141 | (update :classpath #(take 20 %))) 142 | 143 | 144 | 145 | 146 | 147 | (def predicted-survival-hold-out 148 | (-> 149 | (best-pipe-fn 150 | (merge (-> best-evaluation first first :fit-ctx) 151 | {:metamorph/data holdout-ds :metamorph/mode :transform})) 152 | :metamorph/data 153 | ds/reverse-map-categorical-xforms 154 | :survived)) 155 | 156 | ["Classication accuracy on holdout data: "] 157 | (ml/classification-accuracy predicted-survival-hold-out 158 | (holdout-ds :survived)) 159 | 160 | ["Confusion matrix on holdout data"] 161 | ^kind/dataset 162 | (-> 163 | (ml/confusion-map predicted-survival-hold-out 164 | (holdout-ds :survived)) 165 | (ml/confusion-map->ds)) 166 | 167 | ["Smile model object:"] 168 | (ml/thaw-model 169 | (-> best-evaluation first first :fit-ctx :model)) 170 | 171 | 172 | 173 | 174 | ["Feature importance:"] 175 | 176 | (seq 177 | (.importance 178 | (ml/thaw-model 179 | (-> best-evaluation first first :fit-ctx :model)))) 180 | 181 | 182 | 183 | ["## nested cross validation"] 184 | 185 | 186 | 187 | (require '[scicloj.ml.nested-cv :as nested-cv]) 188 | 189 | 190 | (def nested-cv-result 191 | (doall 192 | (nested-cv/nested-cv data all-pipelines 193 | ml/classification-accuracy 194 | :accuracy 10 5))) 195 | 196 | 197 | ["nested cv best models metrics"] 198 | (map :metric nested-cv-result) 199 | 200 | (def final-model-by-cv 201 | (let [inner-k-fold (ds/split->seq data :kfold {:k 5}) 202 | evaluation (ml/evaluate-pipelines 203 | all-pipelines 204 | inner-k-fold 205 | ml/classification-accuracy 206 | :accuracy) 207 | fit-ctx (-> evaluation first first :fit-ctx) 208 | best-pipefn (-> evaluation first first :pipe-fn)] 209 | {:best-pipe-fn best-pipefn 210 | :fit-ctx fit-ctx})) 211 | 212 | (def final-model 213 | ((:best-pipe-fn final-model-by-cv) 214 | {:metamorph/data data :metamorph/mode :fit})) 215 | 216 | ["Final best model"] 217 | (ml/thaw-model (:model final-model)) 218 | 219 | ["trained with best hyper paramter"] 220 | (-> final-model 
:pipe-options) 221 | -------------------------------------------------------------------------------- /src/scicloj/ml/ug_utils.clj: -------------------------------------------------------------------------------- 1 | (ns scicloj.ml.ug-utils 2 | (:require [clojure.string :as str] 3 | [notespace.kinds :as kind] 4 | [notespace.view :as view] 5 | [scicloj.ml.core :as ml] 6 | [scicloj.ml.metamorph :as mm] 7 | [tech.v3.dataset :as ds] 8 | [tech.v3.dataset.modelling :as ds-mod] 9 | [tablecloth.api :as tc] 10 | [libpython-clj2.python :as py] 11 | [tech.v3.datatype.functional :as dtf] 12 | [clj-http.client :as client])) 13 | 14 | 15 | (defn kroki [s type format] 16 | (client/post "https://kroki.io/" {:content-type :json 17 | :as :byte-array 18 | :form-params 19 | {:diagram_source s 20 | :diagram_type (name type) 21 | :output_format (name format)}})) 22 | (py/initialize!) 23 | (def doc->markdown (py/import-module "docstring_to_markdown")) 24 | 25 | 26 | 27 | (def model-keys 28 | (keys @scicloj.ml.core/model-definitions*)) 29 | 30 | (def model-options 31 | (map 32 | :options 33 | (vals @scicloj.ml.core/model-definitions*))) 34 | 35 | (defn dataset->md-hiccup [mds] 36 | (let [height (* 46 (- (count (str/split-lines (str mds))) 2)) 37 | height-limit (min height 800)] 38 | [:div {:class "table table-striped table-hover table-condensed table-responsive"} 39 | ;; :style {:height (str height-limit "px")} 40 | 41 | (view/markdowns->hiccup mds)])) 42 | 43 | 44 | (defmethod kind/kind->behaviour ::dataset-nocode 45 | [_] 46 | {:render-src? 
false 47 | :value->hiccup #'dataset->md-hiccup}) 48 | 49 | (defn docu-options [model-key] 50 | (kind/override 51 | (-> 52 | (tc/dataset 53 | (or 54 | (get-in @scicloj.ml.core/model-definitions* [model-key :options]) 55 | {:name [] :type [] :default []})) 56 | 57 | (tc/reorder-columns :name :type :default)) 58 | 59 | ::dataset-nocode)) 60 | 61 | 62 | 63 | 64 | ;; (-> 65 | ;; (tc/dataset 66 | ;; (get-in @scicloj.ml.core/model-definitions* [:corenlp/crf :options] )) 67 | ;; (tc/reorder-columns :name :type :default) 68 | ;; ) 69 | 70 | (defn text->hiccup 71 | "Convert newlines to [:br]'s." 72 | [text] 73 | (->> (str/split text #"\n") 74 | (interpose [:br]) 75 | (map #(if (string? %) 76 | % 77 | (with-meta % {:key (gensym "br-")}))))) 78 | 79 | (defn docu-doc-string [model-key] 80 | (try 81 | (view/markdowns->hiccup 82 | (py/py. doc->markdown convert 83 | (or 84 | (get-in @scicloj.ml.core/model-definitions* [model-key :documentation :doc-string] ) ""))) 85 | (catch Exception e ""))) 86 | 87 | 88 | 89 | 90 | (defn anchor-or-nothing [x text] 91 | (if (empty? x) 92 | [:div ""] 93 | [:div 94 | [:a {:href x} text]])) 95 | 96 | 97 | 98 | (defn render-key-info [prefix] 99 | (->> @scicloj.ml.core/model-definitions* 100 | (sort-by first) 101 | (filter #(str/starts-with? 
(first %) (str prefix))) 102 | (map 103 | (fn [[key definition]] 104 | [:div 105 | [:h3 {:id (str key)} (str key)] 106 | (anchor-or-nothing (:javadoc (:documentation definition)) "javadoc") 107 | (anchor-or-nothing (:user-guide (:documentation definition)) "user guide") 108 | 109 | ;; [:span (text->hiccup (or 110 | ;; (get-in @scicloj.ml.core/model-definitions* [key :documentation :description] ) ""))] 111 | 112 | [:span 113 | (dataset->md-hiccup (docu-options key))] 114 | 115 | [:span 116 | (docu-doc-string key)] 117 | 118 | [:hr] 119 | ;; [:div "Example:"] 120 | ;; [:div 121 | ;; [:p/code {:code (str 122 | ;; (get-in definition [:documentation :code-example] 123 | ;; "" )) 124 | ;; :bg-class "bg-light"}]] 125 | 126 | [:hr]])))) 127 | 128 | 129 | (text->hiccup (or 130 | (get-in @scicloj.ml.core/model-definitions* 131 | [:smile.manifold/tsne :documentation :description]) "")) 132 | 133 | 134 | (defn remove-deep [key-set data] 135 | (clojure.walk/prewalk (fn [node] (if (map? node) 136 | (apply dissoc node key-set) 137 | node)) 138 | data)) 139 | (defn stepped-range [start end n-steps] 140 | (let [diff (- end start)] 141 | (range start end (/ diff n-steps)))) 142 | 143 | (defn surface-plot [iris cols raw-pipe-fn model-name] 144 | (let [ 145 | pipe-fn 146 | (ml/pipeline 147 | (mm/select-columns (concat [:species] cols)) 148 | raw-pipe-fn) 149 | 150 | fitted-ctx 151 | (pipe-fn 152 | {:metamorph/data iris 153 | :metamorph/mode :fit}) 154 | ;; getting plot boundaries 155 | min-x (- (-> (get iris (first cols)) dtf/reduce-min) 0.2) 156 | min-y (- (-> (get iris (second cols)) dtf/reduce-min) 0.2) 157 | max-x (+ (-> (get iris (first cols)) dtf/reduce-max) 0.2) 158 | max-y (+ (-> (get iris (second cols)) dtf/reduce-max) 0.2) 159 | 160 | 161 | ;; make a grid for the decision surface 162 | grid 163 | (for [x1 (stepped-range min-x max-x 100) 164 | x2 (stepped-range min-y max-y 100)] 165 | 166 | {(first cols) x1 167 | (second cols) x2 168 | :species nil}) 169 | 170 | grid-ds 
(tc/dataset grid) 171 | 172 | 173 | ;; predict for all grid points 174 | prediction-grid 175 | (-> 176 | (pipe-fn 177 | (merge 178 | fitted-ctx 179 | {:metamorph/data grid-ds 180 | :metamorph/mode :transform})) 181 | :metamorph/data 182 | (ds-mod/column-values->categorical :species) 183 | seq) 184 | 185 | grid-ds-prediction 186 | (tc/add-column grid-ds :predicted-species prediction-grid) 187 | 188 | 189 | ;; predict the iris data points from data set 190 | prediction-iris 191 | (-> 192 | (pipe-fn 193 | (merge 194 | fitted-ctx 195 | {:metamorph/data iris 196 | :metamorph/mode :transform})) 197 | :metamorph/data 198 | 199 | (ds-mod/column-values->categorical :species) 200 | seq) 201 | 202 | ds-prediction 203 | (tc/add-column iris :true-species (:species iris) 204 | prediction-iris)] 205 | 206 | ;; create a 2 layer Vega lite specification 207 | {:layer 208 | [ 209 | 210 | {:data {:values (seq (tc/rows grid-ds-prediction :as-maps))} 211 | :title (str "Decision surfaces for model: " model-name) 212 | :width 500 213 | :height 500 214 | :mark {:type "square" :opacity 0.9 :strokeOpacity 0.1 :stroke nil}, 215 | :encoding {:x {:field (first cols) 216 | :type "quantitative" 217 | :scale {:domain [min-x max-x]} 218 | :axis {:format "2.2" 219 | :labelOverlap true}} 220 | 221 | :y {:field (second cols) :type "quantitative" 222 | :axis {:format "2.2" 223 | :labelOverlap true} 224 | :scale {:domain [min-y max-y]}} 225 | 226 | :color {:field :predicted-species}}} 227 | 228 | 229 | {:data {:values (seq (tc/rows ds-prediction :as-maps))} 230 | 231 | :width 500 232 | :height 500 233 | :mark {:type "circle" :opacity 1 :strokeOpacity 1}, 234 | :encoding {:x {:field (first cols) 235 | :type "quantitative" 236 | :axis {:format "2.2" 237 | :labelOverlap true} 238 | :scale {:domain [min-x max-x]}} 239 | 240 | :y {:field (second cols) :type "quantitative" 241 | :axis {:format "2.2" 242 | :labelOverlap true} 243 | :scale {:domain [min-y max-y]}} 244 | 245 | 246 | :fill {:field :true-species} 
;; :legend nil 247 | 248 | :stroke { :value :black} 249 | :size {:value 300}}}]})) 250 | 251 | (defn select-paths-from-set "Recursively keeps only those map entries of `data` whose key path (built by conj-ing each key onto `current-path`) is contained in `path-set`. Sequential values are mapped element-wise without extending the path; any other value is returned unchanged." [current-path path-set data] 252 | (cond 253 | (map? data) (into {} 254 | (remove nil?) 255 | (for [[k v] data] 256 | (let [p (conj current-path k)] 257 | (when (contains? path-set p) 258 | [k (select-paths-from-set p path-set v)])))) 259 | (sequential? data) (mapv (partial select-paths-from-set current-path path-set) data) 260 | :else data)) 261 | 262 | (defn select-paths "Keeps only the nested entries of `data` addressed by `paths` (a collection of key sequences). Every non-empty prefix of each path is put into the lookup set so that the intermediate maps survive the filtering." [data paths] 263 | (select-paths-from-set [] 264 | (into #{} 265 | (mapcat #(take-while seq (iterate butlast %))) 266 | paths) 267 | data)) 268 | 269 | (defn select-minimal-result "Reduces `result` to its [:train-transform :metric] and [:test-transform :metric] entries." [result] 270 | (select-paths result [[:train-transform :metric] 271 | [:test-transform :metric]])) 272 | -------------------------------------------------------------------------------- /src/scicloj/ml/ug_utils_clerk.clj: -------------------------------------------------------------------------------- 1 | (ns scicloj.ml.ug-utils-clerk 2 | (:require 3 | [clojure.string :as str] [clojure.walk :as walk] 4 | [nextjournal.clerk :as clerk] 5 | [scicloj.ml.core :as ml] 6 | [scicloj.ml.ug-utils :as utils] 7 | [tablecloth.api :as tc])) 8 | 9 | (defn docu-options "Dataset built from the :options registered for `model-key` in `scicloj.ml.core/model-definitions*`, with the columns ordered :name, :type, :default. Falls back to an empty three-column dataset when no options are found." [model-key] 10 | 11 | (-> 12 | (tc/dataset 13 | (or 14 | (get-in @scicloj.ml.core/model-definitions* [model-key :options]) 15 | {:name [] :type [] :default []})) 16 | 17 | (tc/reorder-columns :name :type :default))) 18 | 19 | 20 | 21 | (defn stringify-enum "Walks `form` and replaces every java.lang.Enum instance with its `str` representation; all other nodes are left untouched." [form] 22 | (walk/postwalk (fn [x] (if (instance? Enum x) (str x) x)) 23 | form)) 24 | 25 | (defn render-key-info [prefix] 26 | (vec (concat [:span] 27 | (->> @scicloj.ml.core/model-definitions* 28 | (sort-by first) 29 | (filter #(str/starts-with? 
(first %) (str prefix))) 30 | (mapv 31 | (fn [[key definition]] 32 | [:div 33 | ;; (clerk/md (format "### %s" (str key))) 34 | [:h3 {:id (str key)} (str key)] 35 | (utils/anchor-or-nothing (:javadoc (:documentation definition)) "javadoc") 36 | (utils/anchor-or-nothing (:user-guide (:documentation definition)) "user guide") 37 | 38 | ;; [:span (text->hiccup (or 39 | ;; (get-in @scicloj.ml.core/model-definitions* [key :documentation :description] ) ""))] 40 | 41 | [:span 42 | 43 | (let [docu-ds (docu-options key)] 44 | (if (tc/empty-ds? docu-ds) 45 | "" 46 | (-> 47 | docu-ds 48 | (tc/rows :as-maps) 49 | seq 50 | stringify-enum 51 | (clerk/table))))] 52 | [:span 53 | (utils/docu-doc-string key)] 54 | 55 | [:hr] 56 | [:hr]])))))) 57 | -------------------------------------------------------------------------------- /src/scicloj/ml/unsupervised.clj: -------------------------------------------------------------------------------- 1 | (ns scicloj.ml.unsupervised 2 | (:require 3 | [notespace.api :as note] 4 | [notespace.kinds :as kind] 5 | [net.clojars.behrica.cluster_eval :as cluster-eval])) 6 | 7 | 8 | 9 | 10 | (comment 11 | (note/init-with-browser) 12 | (note/eval-this-notespace) 13 | (note/reread-this-notespace) 14 | (note/render-static-html "docs/userguide-unsupervised.html") 15 | (note/init)) 16 | 17 | (require '[scicloj.ml.core :as ml] 18 | '[scicloj.ml.metamorph :as mm] 19 | '[scicloj.ml.dataset :as ds]) 20 | 21 | ["# Cluster Iris data"] 22 | 23 | (def iris 24 | (-> 25 | (ds/dataset 26 | "https://raw.githubusercontent.com/scicloj/metamorph.ml/main/test/data/iris.csv" {:key-fn keyword}))) 27 | 28 | 29 | 30 | 31 | ["## k-means clustering"] 32 | 33 | (def fit-ctx 34 | (ml/fit 35 | iris 36 | (mm/select-columns [:petal_length :petal_width]) 37 | {:metamorph/id :model} 38 | (mm/model {:model-type :fastmath/cluster 39 | :clustering-method :k-means 40 | :clustering-method-args [3]}))) 41 | 42 | (def iris-with-cluster 43 | (ds/add-column iris :cluster 44 | (-> fit-ctx :model 
:model-data :clustering))) 45 | 46 | (def centroids 47 | (map 48 | (fn [[petal-length petal-width]] 49 | (hash-map :petal_length petal-length 50 | :petal_width petal-width)) 51 | (-> fit-ctx :model :model-data :representatives))) 52 | 53 | ^kind/vega 54 | {:height 300 55 | :width 300 56 | 57 | :title "2D result of iris k-means clustering with cluster centroids (n=3)" 58 | :layer [{ 59 | :$schema "https://vega.github.io/schema/vega-lite/v5.json" 60 | :data {:values (ds/rows iris-with-cluster :as-maps)} 61 | :description "Iris data " 62 | :encoding {:x {:field :petal_length :type "quantitative"} 63 | :y {:field :petal_width :type "quantitative"} 64 | :color {:field :cluster}} 65 | :mark "point"} 66 | { 67 | :data {:values centroids} 68 | :description "Iris data " 69 | :encoding {:x {:field :petal_length :type "quantitative"} 70 | :y {:field :petal_width :type "quantitative"}} 71 | 72 | :mark {:type "point" :shape :triangle-up :color :black 73 | :filled true 74 | :size 200}}]} 75 | 76 | 77 | 78 | ["## Ellbow plot"] 79 | 80 | ["### Calculate distortion over n"] 81 | 82 | (defn make-pipe [n] 83 | (ml/pipeline 84 | (mm/drop-columns [:species]) 85 | {:metamorph/id :model} 86 | (mm/model {:model-type :fastmath/cluster 87 | :clustering-method :k-means 88 | :clustering-method-args [n]}))) 89 | 90 | 91 | 92 | (def eval-results 93 | (ml/evaluate-pipelines 94 | (map make-pipe (range 2 10)) 95 | [{:train iris}] 96 | (fn [ctx] 97 | 0) 98 | :loss 99 | {:return-best-pipeline-only false})) 100 | 101 | 102 | 103 | (defn fastmath->cluster-data [model-data] 104 | (let [ 105 | cluster-values 106 | (concat 107 | (-> model-data :data) 108 | (-> model-data :representatives)) 109 | 110 | cluster 111 | (concat 112 | (-> model-data :clustering) 113 | (range (-> model-data :representatives count))) 114 | 115 | centroid? 
116 | (concat 117 | (repeat (-> model-data :data count) false) 118 | (repeat (-> model-data :representatives count) true))] 119 | 120 | {:values cluster-values 121 | :cluster cluster 122 | :centroid? centroid?})) 123 | 124 | 125 | 126 | (def ellbow-plot-data-distortion "One map per flattened entry of `eval-results`: :n is the first :clustering-method-args value of the fitted model's options, :distortion is read from the fitted model's :model-data :info." 127 | (map #(hash-map :n %1 128 | :distortion %2) 129 | (->> eval-results flatten (map #(first (get-in % [:fit-ctx :model :options :clustering-method-args])))) 130 | (->> eval-results flatten (map #(get-in % [:fit-ctx :model :model-data :info :distortion]))))) 131 | 132 | 133 | ["### Calculate silouhette score over n"] 134 | 135 | (def eval-results-silhouete "Evaluates the `make-pipe` pipelines for n in (range 2 10) on the iris data as the only :train split, using the \"calcularSilhouette\" cluster index of the fitted model data as the metric; all pipeline results are returned." 136 | (ml/evaluate-pipelines 137 | (map make-pipe (range 2 10)) 138 | [{:train iris}] 139 | (fn [ctx] 140 | (let [metric 141 | (cluster-eval/cluster-index 142 | (fastmath->cluster-data (-> ctx :model :model-data)) 143 | "calcularSilhouette")] 144 | metric)) 145 | :loss 146 | {:return-best-pipeline-only false})) 147 | 148 | 149 | (def ellbow-plot-data-silhoute "One map per flattened entry of `eval-results-silhouete`: :n is the first :clustering-method-args value, :silhoute is the [:train-transform :metric] value." 150 | (map #(hash-map :n %1 151 | :silhoute %2) 152 | (->> eval-results-silhouete flatten (map #(first (get-in % [:fit-ctx :model :options :clustering-method-args])))) 153 | (->> eval-results-silhouete flatten (map #(get-in % [:train-transform :metric]))))) 154 | 155 | 156 | ["Ellbow plots for distortion and silhoute score"] 157 | 158 | ^kind/vega 159 | {:hconcat [ 160 | {:$schema "https://vega.github.io/schema/vega-lite/v5.json" 161 | :width 200 162 | :height 200 163 | :title "Ellbow plot of distortion for various n" 164 | :data {:values ellbow-plot-data-distortion} 165 | :description "Distortion of the k-means clustering of the iris data for n = 2 to 9 clusters." 
166 | :encoding {:x {:field "n" :type :ordinal} 167 | :y {:field :distortion :type "quantitative"}} 168 | :mark {:point true :type "line"}} 169 | 170 | {:$schema "https://vega.github.io/schema/vega-lite/v5.json" 171 | :width 200 172 | :height 200 173 | :title "Ellbow plot of Silhoutte score for various n" 174 | :data {:values ellbow-plot-data-silhoute} 175 | 176 | :encoding {:x {:field "n" :type :ordinal} 177 | :y {:field :silhoute :type "quantitative"}} 178 | :mark {:point true :type "line"}}]} 179 | -------------------------------------------------------------------------------- /submission.csv: -------------------------------------------------------------------------------- 1 | PassengerId,Survived 2 | 892,0 3 | 893,1 4 | 894,0 5 | 895,0 6 | 896,1 7 | 897,0 8 | 898,1 9 | 899,0 10 | 900,1 11 | 901,0 12 | 902,0 13 | 903,0 14 | 904,1 15 | 905,0 16 | 906,1 17 | 907,1 18 | 908,0 19 | 909,0 20 | 910,1 21 | 911,1 22 | 912,0 23 | 913,0 24 | 914,1 25 | 915,0 26 | 916,1 27 | 917,0 28 | 918,1 29 | 919,0 30 | 920,1 31 | 921,0 32 | 922,0 33 | 923,0 34 | 924,1 35 | 925,0 36 | 926,1 37 | 927,0 38 | 928,0 39 | 929,0 40 | 930,0 41 | 931,1 42 | 932,0 43 | 933,1 44 | 934,0 45 | 935,1 46 | 936,1 47 | 937,0 48 | 938,1 49 | 939,0 50 | 940,1 51 | 941,1 52 | 942,0 53 | 943,0 54 | 944,1 55 | 945,1 56 | 946,0 57 | 947,0 58 | 948,0 59 | 949,0 60 | 950,0 61 | 951,1 62 | 952,0 63 | 953,0 64 | 954,0 65 | 955,1 66 | 956,1 67 | 957,1 68 | 958,1 69 | 959,0 70 | 960,0 71 | 961,1 72 | 962,0 73 | 963,0 74 | 964,0 75 | 965,0 76 | 966,1 77 | 967,0 78 | 968,0 79 | 969,1 80 | 970,0 81 | 971,0 82 | 972,1 83 | 973,0 84 | 974,0 85 | 975,0 86 | 976,0 87 | 977,0 88 | 978,1 89 | 979,0 90 | 980,0 91 | 981,1 92 | 982,0 93 | 983,0 94 | 984,1 95 | 985,0 96 | 986,0 97 | 987,0 98 | 988,1 99 | 989,0 100 | 990,1 101 | 991,0 102 | 992,1 103 | 993,0 104 | 994,0 105 | 995,0 106 | 996,0 107 | 997,0 108 | 998,0 109 | 999,0 110 | 1000,0 111 | 1001,0 112 | 1002,0 113 | 1003,1 114 | 1004,1 115 | 1005,1 116 | 1006,1 117 | 
1007,0 118 | 1008,0 119 | 1009,1 120 | 1010,1 121 | 1011,1 122 | 1012,1 123 | 1013,0 124 | 1014,1 125 | 1015,0 126 | 1016,0 127 | 1017,1 128 | 1018,0 129 | 1019,1 130 | 1020,0 131 | 1021,0 132 | 1022,0 133 | 1023,0 134 | 1024,0 135 | 1025,0 136 | 1026,0 137 | 1027,0 138 | 1028,0 139 | 1029,0 140 | 1030,0 141 | 1031,0 142 | 1032,0 143 | 1033,1 144 | 1034,1 145 | 1035,0 146 | 1036,0 147 | 1037,0 148 | 1038,0 149 | 1039,0 150 | 1040,0 151 | 1041,0 152 | 1042,1 153 | 1043,0 154 | 1044,0 155 | 1045,1 156 | 1046,0 157 | 1047,0 158 | 1048,1 159 | 1049,0 160 | 1050,1 161 | 1051,0 162 | 1052,1 163 | 1053,1 164 | 1054,1 165 | 1055,0 166 | 1056,0 167 | 1057,1 168 | 1058,0 169 | 1059,0 170 | 1060,1 171 | 1061,0 172 | 1062,0 173 | 1063,0 174 | 1064,0 175 | 1065,0 176 | 1066,0 177 | 1067,1 178 | 1068,1 179 | 1069,1 180 | 1070,1 181 | 1071,1 182 | 1072,0 183 | 1073,0 184 | 1074,1 185 | 1075,0 186 | 1076,1 187 | 1077,0 188 | 1078,1 189 | 1079,0 190 | 1080,0 191 | 1081,0 192 | 1082,0 193 | 1083,0 194 | 1084,0 195 | 1085,0 196 | 1086,1 197 | 1087,0 198 | 1088,1 199 | 1089,0 200 | 1090,0 201 | 1091,0 202 | 1092,1 203 | 1093,1 204 | 1094,0 205 | 1095,1 206 | 1096,0 207 | 1097,0 208 | 1098,0 209 | 1099,0 210 | 1100,1 211 | 1101,0 212 | 1102,0 213 | 1103,0 214 | 1104,0 215 | 1105,1 216 | 1106,0 217 | 1107,0 218 | 1108,1 219 | 1109,1 220 | 1110,1 221 | 1111,0 222 | 1112,1 223 | 1113,0 224 | 1114,1 225 | 1115,0 226 | 1116,1 227 | 1117,1 228 | 1118,0 229 | 1119,0 230 | 1120,0 231 | 1121,0 232 | 1122,0 233 | 1123,1 234 | 1124,0 235 | 1125,0 236 | 1126,1 237 | 1127,0 238 | 1128,0 239 | 1129,0 240 | 1130,1 241 | 1131,1 242 | 1132,1 243 | 1133,1 244 | 1134,1 245 | 1135,0 246 | 1136,0 247 | 1137,0 248 | 1138,1 249 | 1139,0 250 | 1140,1 251 | 1141,0 252 | 1142,1 253 | 1143,0 254 | 1144,0 255 | 1145,0 256 | 1146,0 257 | 1147,0 258 | 1148,0 259 | 1149,0 260 | 1150,1 261 | 1151,0 262 | 1152,0 263 | 1153,0 264 | 1154,1 265 | 1155,1 266 | 1156,0 267 | 1157,0 268 | 1158,0 269 | 1159,0 270 | 1160,1 271 
| 1161,0 272 | 1162,1 273 | 1163,0 274 | 1164,1 275 | 1165,1 276 | 1166,0 277 | 1167,1 278 | 1168,0 279 | 1169,0 280 | 1170,0 281 | 1171,0 282 | 1172,0 283 | 1173,1 284 | 1174,0 285 | 1175,1 286 | 1176,1 287 | 1177,0 288 | 1178,0 289 | 1179,0 290 | 1180,0 291 | 1181,0 292 | 1182,0 293 | 1183,1 294 | 1184,0 295 | 1185,0 296 | 1186,0 297 | 1187,0 298 | 1188,1 299 | 1189,1 300 | 1190,0 301 | 1191,0 302 | 1192,0 303 | 1193,0 304 | 1194,0 305 | 1195,0 306 | 1196,0 307 | 1197,1 308 | 1198,0 309 | 1199,1 310 | 1200,1 311 | 1201,0 312 | 1202,0 313 | 1203,0 314 | 1204,0 315 | 1205,0 316 | 1206,1 317 | 1207,1 318 | 1208,1 319 | 1209,0 320 | 1210,0 321 | 1211,0 322 | 1212,0 323 | 1213,0 324 | 1214,0 325 | 1215,1 326 | 1216,1 327 | 1217,0 328 | 1218,1 329 | 1219,0 330 | 1220,0 331 | 1221,0 332 | 1222,1 333 | 1223,1 334 | 1224,0 335 | 1225,1 336 | 1226,0 337 | 1227,0 338 | 1228,0 339 | 1229,0 340 | 1230,0 341 | 1231,0 342 | 1232,0 343 | 1233,0 344 | 1234,0 345 | 1235,1 346 | 1236,0 347 | 1237,1 348 | 1238,0 349 | 1239,1 350 | 1240,0 351 | 1241,1 352 | 1242,1 353 | 1243,0 354 | 1244,0 355 | 1245,0 356 | 1246,1 357 | 1247,0 358 | 1248,1 359 | 1249,0 360 | 1250,0 361 | 1251,1 362 | 1252,0 363 | 1253,1 364 | 1254,1 365 | 1255,0 366 | 1256,1 367 | 1257,0 368 | 1258,0 369 | 1259,0 370 | 1260,1 371 | 1261,0 372 | 1262,0 373 | 1263,1 374 | 1264,0 375 | 1265,0 376 | 1266,1 377 | 1267,1 378 | 1268,0 379 | 1269,0 380 | 1270,0 381 | 1271,0 382 | 1272,0 383 | 1273,0 384 | 1274,0 385 | 1275,1 386 | 1276,0 387 | 1277,1 388 | 1278,0 389 | 1279,0 390 | 1280,0 391 | 1281,1 392 | 1282,1 393 | 1283,1 394 | 1284,0 395 | 1285,0 396 | 1286,0 397 | 1287,1 398 | 1288,0 399 | 1289,1 400 | 1290,0 401 | 1291,0 402 | 1292,1 403 | 1293,0 404 | 1294,1 405 | 1295,0 406 | 1296,0 407 | 1297,0 408 | 1298,0 409 | 1299,0 410 | 1300,1 411 | 1301,1 412 | 1302,0 413 | 1303,1 414 | 1304,1 415 | 1305,0 416 | 1306,1 417 | 1307,0 418 | 1308,0 419 | 1309,1 420 | 
-------------------------------------------------------------------------------- /test/scicloj/ml/tutorials_test.clj: -------------------------------------------------------------------------------- 1 | (ns scicloj.ml.tutorials-test 2 | (:require [clojure.test :refer [deftest is testing]])) 3 | 4 | ;; Placeholder test: the assertion below fails on purpose (see FIXME) until real tests exist. 5 | (deftest a-test 6 | (testing "FIXME, I fail." 7 | (is (= 0 1)))) 8 | --------------------------------------------------------------------------------