├── .gitignore
├── CHANGELOG.md
├── LICENSE
├── README.md
├── bb.edn
├── bin
    └── launchpad
├── data
    ├── marketing.csv
    ├── titanic
    │   ├── test.csv
    │   ├── titanic.zip
    │   └── train.csv
    └── tweets_sentiment.feather
├── deps.edn
├── deps.local.edn
├── doc
    └── intro.md
├── docs
    ├── gorilla-notes
    │   └── js
    │   │   └── compiled
    │   │       └── main.js
    ├── interactions_ols.html
    ├── notespace-files
    │   └── tree.svg
    ├── polyglot_kmeans.html
    ├── tune-titanic.html
    ├── userguide-advanced.html
    ├── userguide-categrical.html
    ├── userguide-experiment-tracking.html
    ├── userguide-intro.html
    ├── userguide-models.html
    ├── userguide-sklearnclj.html
    ├── userguide-third_party.html
    ├── userguide-titanic.html
    ├── userguide-transformers.html
    └── userguide-unsupervised.html
├── render_all.clj
├── render_titanic.clj
├── render_tune-titanic.clj
├── resources
    ├── .keep
    └── logback.xml
├── src
    └── scicloj
    │   └── ml
    │       ├── advanced.clj
    │       ├── categorical.clj
    │       ├── experiment_tracking.clj
    │       ├── interactions_ols.clj
    │       ├── intro.clj
    │       ├── models.clj
    │       ├── nested_cv.clj
    │       ├── polyglot_kmeans.clj
    │       ├── sklearnclj.clj
    │       ├── third_party.clj
    │       ├── titanic.clj
    │       ├── transformers.clj
    │       ├── tune_titanic.clj
    │       ├── ug_utils.clj
    │       ├── ug_utils_clerk.clj
    │       └── unsupervised.clj
├── submission.csv
└── test
    └── scicloj
        └── ml
            └── tutorials_test.clj


/.gitignore:
--------------------------------------------------------------------------------
 1 | /target
 2 | /classes
 3 | /checkouts
 4 | *.jar
 5 | *.class
 6 | /.calva/output-window/
 7 | /.cpcache
 8 | /.lein-*
 9 | /.lsp/sqlite*.db
10 | /.nrepl-history
11 | /.nrepl-port
12 | /.rebel_readline_history
13 | /.socket-repl-port
14 | .hgignore
15 | .hg/
16 | /.cache/
17 | /.classpath
18 | /.clj-kondo/
19 | /.lsp/
20 | /.project
21 | /.settings/
22 | /.clerk/
23 | /.vscode/
24 | /cache_dir/
25 | /docs/scicloj/
26 | /public/
27 | /runs/
28 | 


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
 1 | # Change Log
 2 | All notable changes to this project will be documented in this file. This change log follows the conventions of [keepachangelog.com](http://keepachangelog.com/).
 3 | 
 4 | ## [Unreleased]
 5 | ### Changed
 6 | - Add a new arity to `make-widget-async` to provide a different widget shape.
 7 | 
 8 | ## [0.1.1] - 2021-09-06
 9 | ### Changed
10 | - Documentation on how to make the widgets.
11 | 
12 | ### Removed
13 | - `make-widget-sync` - we're all async, all the time.
14 | 
15 | ### Fixed
16 | - Fixed widget maker to keep working when daylight savings switches over.
17 | 
18 | ## 0.1.0 - 2021-09-06
19 | ### Added
20 | - Files from the new template.
21 | - Widget maker public API - `make-widget-sync`.
22 | 
23 | [Unreleased]: https://github.com/scicloj/ml.tutorials/compare/0.1.1...HEAD
24 | [0.1.1]: https://github.com/scicloj/ml.tutorials/compare/0.1.0...0.1.1
25 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 | THE ACCOMPANYING PROGRAM IS PROVIDED UNDER THE TERMS OF THIS ECLIPSE PUBLIC
  2 | LICENSE ("AGREEMENT"). ANY USE, REPRODUCTION OR DISTRIBUTION OF THE PROGRAM
  3 | CONSTITUTES RECIPIENT'S ACCEPTANCE OF THIS AGREEMENT.
  4 | 
  5 | 1. DEFINITIONS
  6 | 
  7 | "Contribution" means:
  8 | 
  9 | a) in the case of the initial Contributor, the initial code and
 10 | documentation distributed under this Agreement, and
 11 | 
 12 | b) in the case of each subsequent Contributor:
 13 | 
 14 | i) changes to the Program, and
 15 | 
 16 | ii) additions to the Program;
 17 | 
 18 | where such changes and/or additions to the Program originate from and are
 19 | distributed by that particular Contributor. A Contribution 'originates' from
 20 | a Contributor if it was added to the Program by such Contributor itself or
 21 | anyone acting on such Contributor's behalf. Contributions do not include
 22 | additions to the Program which: (i) are separate modules of software
 23 | distributed in conjunction with the Program under their own license
 24 | agreement, and (ii) are not derivative works of the Program.
 25 | 
 26 | "Contributor" means any person or entity that distributes the Program.
 27 | 
 28 | "Licensed Patents" mean patent claims licensable by a Contributor which are
 29 | necessarily infringed by the use or sale of its Contribution alone or when
 30 | combined with the Program.
 31 | 
 32 | "Program" means the Contributions distributed in accordance with this
 33 | Agreement.
 34 | 
 35 | "Recipient" means anyone who receives the Program under this Agreement,
 36 | including all Contributors.
 37 | 
 38 | 2. GRANT OF RIGHTS
 39 | 
 40 | a) Subject to the terms of this Agreement, each Contributor hereby grants
 41 | Recipient a non-exclusive, worldwide, royalty-free copyright license to
 42 | reproduce, prepare derivative works of, publicly display, publicly perform,
 43 | distribute and sublicense the Contribution of such Contributor, if any, and
 44 | such derivative works, in source code and object code form.
 45 | 
 46 | b) Subject to the terms of this Agreement, each Contributor hereby grants
 47 | Recipient a non-exclusive, worldwide, royalty-free patent license under
 48 | Licensed Patents to make, use, sell, offer to sell, import and otherwise
 49 | transfer the Contribution of such Contributor, if any, in source code and
 50 | object code form.  This patent license shall apply to the combination of the
 51 | Contribution and the Program if, at the time the Contribution is added by the
 52 | Contributor, such addition of the Contribution causes such combination to be
 53 | covered by the Licensed Patents. The patent license shall not apply to any
 54 | other combinations which include the Contribution. No hardware per se is
 55 | licensed hereunder.
 56 | 
 57 | c) Recipient understands that although each Contributor grants the licenses
 58 | to its Contributions set forth herein, no assurances are provided by any
 59 | Contributor that the Program does not infringe the patent or other
 60 | intellectual property rights of any other entity. Each Contributor disclaims
 61 | any liability to Recipient for claims brought by any other entity based on
 62 | infringement of intellectual property rights or otherwise. As a condition to
 63 | exercising the rights and licenses granted hereunder, each Recipient hereby
 64 | assumes sole responsibility to secure any other intellectual property rights
 65 | needed, if any. For example, if a third party patent license is required to
 66 | allow Recipient to distribute the Program, it is Recipient's responsibility
 67 | to acquire that license before distributing the Program.
 68 | 
 69 | d) Each Contributor represents that to its knowledge it has sufficient
 70 | copyright rights in its Contribution, if any, to grant the copyright license
 71 | set forth in this Agreement.
 72 | 
 73 | 3. REQUIREMENTS
 74 | 
 75 | A Contributor may choose to distribute the Program in object code form under
 76 | its own license agreement, provided that:
 77 | 
 78 | a) it complies with the terms and conditions of this Agreement; and
 79 | 
 80 | b) its license agreement:
 81 | 
 82 | i) effectively disclaims on behalf of all Contributors all warranties and
 83 | conditions, express and implied, including warranties or conditions of title
 84 | and non-infringement, and implied warranties or conditions of merchantability
 85 | and fitness for a particular purpose;
 86 | 
 87 | ii) effectively excludes on behalf of all Contributors all liability for
 88 | damages, including direct, indirect, special, incidental and consequential
 89 | damages, such as lost profits;
 90 | 
 91 | iii) states that any provisions which differ from this Agreement are offered
 92 | by that Contributor alone and not by any other party; and
 93 | 
 94 | iv) states that source code for the Program is available from such
 95 | Contributor, and informs licensees how to obtain it in a reasonable manner on
 96 | or through a medium customarily used for software exchange.
 97 | 
 98 | When the Program is made available in source code form:
 99 | 
100 | a) it must be made available under this Agreement; and
101 | 
102 | b) a copy of this Agreement must be included with each copy of the Program.
103 | 
104 | Contributors may not remove or alter any copyright notices contained within
105 | the Program.
106 | 
107 | Each Contributor must identify itself as the originator of its Contribution,
108 | if any, in a manner that reasonably allows subsequent Recipients to identify
109 | the originator of the Contribution.
110 | 
111 | 4. COMMERCIAL DISTRIBUTION
112 | 
113 | Commercial distributors of software may accept certain responsibilities with
114 | respect to end users, business partners and the like. While this license is
115 | intended to facilitate the commercial use of the Program, the Contributor who
116 | includes the Program in a commercial product offering should do so in a
117 | manner which does not create potential liability for other Contributors.
118 | Therefore, if a Contributor includes the Program in a commercial product
119 | offering, such Contributor ("Commercial Contributor") hereby agrees to defend
120 | and indemnify every other Contributor ("Indemnified Contributor") against any
121 | losses, damages and costs (collectively "Losses") arising from claims,
122 | lawsuits and other legal actions brought by a third party against the
123 | Indemnified Contributor to the extent caused by the acts or omissions of such
124 | Commercial Contributor in connection with its distribution of the Program in
125 | a commercial product offering.  The obligations in this section do not apply
126 | to any claims or Losses relating to any actual or alleged intellectual
127 | property infringement. In order to qualify, an Indemnified Contributor must:
128 | a) promptly notify the Commercial Contributor in writing of such claim, and
129 | b) allow the Commercial Contributor to control, and cooperate with the
130 | Commercial Contributor in, the defense and any related settlement
131 | negotiations. The Indemnified Contributor may participate in any such claim
132 | at its own expense.
133 | 
134 | For example, a Contributor might include the Program in a commercial product
135 | offering, Product X. That Contributor is then a Commercial Contributor. If
136 | that Commercial Contributor then makes performance claims, or offers
137 | warranties related to Product X, those performance claims and warranties are
138 | such Commercial Contributor's responsibility alone. Under this section, the
139 | Commercial Contributor would have to defend claims against the other
140 | Contributors related to those performance claims and warranties, and if a
141 | court requires any other Contributor to pay any damages as a result, the
142 | Commercial Contributor must pay those damages.
143 | 
144 | 5. NO WARRANTY
145 | 
146 | EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, THE PROGRAM IS PROVIDED ON
147 | AN "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER
148 | EXPRESS OR IMPLIED INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OR
149 | CONDITIONS OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY OR FITNESS FOR A
150 | PARTICULAR PURPOSE. Each Recipient is solely responsible for determining the
151 | appropriateness of using and distributing the Program and assumes all risks
152 | associated with its exercise of rights under this Agreement , including but
153 | not limited to the risks and costs of program errors, compliance with
154 | applicable laws, damage to or loss of data, programs or equipment, and
155 | unavailability or interruption of operations.
156 | 
157 | 6. DISCLAIMER OF LIABILITY
158 | 
159 | EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, NEITHER RECIPIENT NOR ANY
160 | CONTRIBUTORS SHALL HAVE ANY LIABILITY FOR ANY DIRECT, INDIRECT, INCIDENTAL,
161 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING WITHOUT LIMITATION
162 | LOST PROFITS), HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
163 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
164 | ARISING IN ANY WAY OUT OF THE USE OR DISTRIBUTION OF THE PROGRAM OR THE
165 | EXERCISE OF ANY RIGHTS GRANTED HEREUNDER, EVEN IF ADVISED OF THE POSSIBILITY
166 | OF SUCH DAMAGES.
167 | 
168 | 7. GENERAL
169 | 
170 | If any provision of this Agreement is invalid or unenforceable under
171 | applicable law, it shall not affect the validity or enforceability of the
172 | remainder of the terms of this Agreement, and without further action by the
173 | parties hereto, such provision shall be reformed to the minimum extent
174 | necessary to make such provision valid and enforceable.
175 | 
176 | If Recipient institutes patent litigation against any entity (including a
177 | cross-claim or counterclaim in a lawsuit) alleging that the Program itself
178 | (excluding combinations of the Program with other software or hardware)
179 | infringes such Recipient's patent(s), then such Recipient's rights granted
180 | under Section 2(b) shall terminate as of the date such litigation is filed.
181 | 
182 | All Recipient's rights under this Agreement shall terminate if it fails to
183 | comply with any of the material terms or conditions of this Agreement and
184 | does not cure such failure in a reasonable period of time after becoming
185 | aware of such noncompliance. If all Recipient's rights under this Agreement
186 | terminate, Recipient agrees to cease use and distribution of the Program as
187 | soon as reasonably practicable. However, Recipient's obligations under this
188 | Agreement and any licenses granted by Recipient relating to the Program shall
189 | continue and survive.
190 | 
191 | Everyone is permitted to copy and distribute copies of this Agreement, but in
192 | order to avoid inconsistency the Agreement is copyrighted and may only be
193 | modified in the following manner. The Agreement Steward reserves the right to
194 | publish new versions (including revisions) of this Agreement from time to
195 | time. No one other than the Agreement Steward has the right to modify this
196 | Agreement. The Eclipse Foundation is the initial Agreement Steward. The
197 | Eclipse Foundation may assign the responsibility to serve as the Agreement
198 | Steward to a suitable separate entity. Each new version of the Agreement will
199 | be given a distinguishing version number. The Program (including
200 | Contributions) may always be distributed subject to the version of the
201 | Agreement under which it was received. In addition, after a new version of
202 | the Agreement is published, Contributor may elect to distribute the Program
203 | (including its Contributions) under the new version. Except as expressly
204 | stated in Sections 2(a) and 2(b) above, Recipient receives no rights or
205 | licenses to the intellectual property of any Contributor under this
206 | Agreement, whether expressly, by implication, estoppel or otherwise. All
207 | rights in the Program not expressly granted under this Agreement are
208 | reserved.
209 | 
210 | This Agreement is governed by the laws of the State of New York and the
211 | intellectual property laws of the United States of America. No party to this
212 | Agreement will bring a legal action under this Agreement more than one year
213 | after the cause of action arose. Each party waives its rights to a jury trial
214 | in any resulting litigation.
215 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | >[!NOTE]
 2 | >***
 3 | >The usage of the shim `scicloj.ml` is now considered deprecated. The underlying libraries should be used directly or via
 4 | >[noj](https://github.com/scicloj/noj), a new library that combines several of these libraries without remapping the namespaces.
 5 | >It also contains updated versions of several of the tutorials here.
 6 | >The code inside the tutorials is still valid and mostly working, but the functions are in different namespaces when
 7 | >used without `scicloj.ml`.
 8 | >***
 9 | 
10 | 
11 | # Tutorials for [scicloj.ml](https://github.com/scicloj/scicloj.ml)
12 | 
13 | The Clojure machine learning library scicloj.ml is documented here: 
14 | 
15 | * [Userguide - introduction](https://scicloj.github.io/scicloj.ml-tutorials/userguide-intro.html)
16 | * [Userguide - advanced](https://scicloj.github.io/scicloj.ml-tutorials/userguide-advanced.html)
17 | * [Userguide - categorical](https://scicloj.github.io/scicloj.ml-tutorials/userguide-categrical.html)
18 | * [Reference of ML models](https://scicloj.github.io/scicloj.ml-tutorials/userguide-models.html)
19 | * [Reference of transformer functions](https://scicloj.github.io/scicloj.ml-tutorials/userguide-transformers.html)
20 | * [Example usage - predict titanic survival](https://scicloj.github.io/scicloj.ml-tutorials/userguide-titanic.html)
21 | * [Example usage - hyperparameter tuning of a pipeline](https://scicloj.github.io/scicloj.ml-tutorials/tune-titanic.html)
22 | * [How to use sklearn models](https://scicloj.github.io/scicloj.ml-tutorials/userguide-sklearnclj.html)
23 | * [Reference of other libraries integrated with scicloj.ml](https://scicloj.github.io/scicloj.ml-tutorials/userguide-third_party.html)
24 | * [kmeans in Python vs Clojure](https://scicloj.github.io/scicloj.ml-tutorials/polyglot_kmeans.html)
25 | * [Experiment tracking](https://scicloj.github.io/scicloj.ml-tutorials/userguide-experiment-tracking.html)
26 | * [Unsupervised learning](https://scicloj.github.io/scicloj.ml-tutorials/userguide-unsupervised.html)
27 | * [Variable interaction in linear regression](https://scicloj.github.io/scicloj.ml-tutorials/interactions_ols.html)
28 | 
29 | 
30 | The source files for this documentation, written using [notespace](https://github.com/scicloj/notespace)
31 | and [Clerk](https://github.com/nextjournal/clerk), are in this repository.
32 | 


--------------------------------------------------------------------------------
/bb.edn:
--------------------------------------------------------------------------------
1 | {:deps {com.lambdaisland/launchpad {:mvn/version "0.9.49-alpha"}}} ; Babashka deps: launchpad, required by the bin/launchpad script
2 | 
3 | 


--------------------------------------------------------------------------------
/bin/launchpad:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bb
2 | ;; Babashka script: loads lambdaisland.launchpad and hands control to its main function.
3 | (require '[lambdaisland.launchpad :as launchpad])
4 | ;; Run launchpad with an empty options map.
5 | (launchpad/main {})
6 | ;; Disabled variant: prepends an `ensure-java-version 17` step to launchpad's default steps.
7 | ;; (launchpad/main {:steps (into [(partial launchpad/ensure-java-version 17)]
8 | ;;                               launchpad/default-steps)})
9 | 


--------------------------------------------------------------------------------
/data/marketing.csv:
--------------------------------------------------------------------------------
  1 | youtube,facebook,newspaper,sales
  2 | 276.12,45.35999999999999,83.04,26.52
  3 | 53.4,47.16,54.12,12.48
  4 | 20.639999999999997,55.08,83.16,11.16
  5 | 181.79999999999998,49.559999999999995,70.2,22.2
  6 | 216.96,12.96,70.08,15.48
  7 | 10.44,58.67999999999999,90,8.64
  8 | 69,39.35999999999999,28.2,14.16
  9 | 144.24,23.52,13.92,15.839999999999998
 10 | 10.319999999999999,2.52,1.2,5.76
 11 | 239.76,3.12,25.439999999999998,12.719999999999999
 12 | 79.32,6.96,29.04,10.319999999999999
 13 | 257.64,28.799999999999997,4.8,20.88
 14 | 28.56,42.12,79.08,11.04
 15 | 117,9.12,8.64,11.639999999999999
 16 | 244.92,39.48,55.199999999999996,22.8
 17 | 234.48,57.24,63.48,26.88
 18 | 81.36,43.92,136.79999999999998,15
 19 | 337.67999999999995,47.52,66.96,29.279999999999998
 20 | 83.04,24.599999999999998,21.96,13.56
 21 | 176.76000000000002,28.679999999999996,22.92,17.52
 22 | 262.08,33.239999999999995,64.08,21.599999999999998
 23 | 284.88,6.119999999999999,28.2,15
 24 | 15.839999999999998,19.08,59.519999999999996,6.72
 25 | 273.96,20.279999999999998,31.439999999999998,18.599999999999998
 26 | 74.75999999999999,15.12,21.96,11.639999999999999
 27 | 315.47999999999996,4.2,23.4,14.399999999999999
 28 | 171.48,35.16,15.12,18
 29 | 288.12,20.04,27.479999999999997,19.08
 30 | 298.56,32.52,27.479999999999997,22.679999999999996
 31 | 84.71999999999998,19.2,48.959999999999994,12.6
 32 | 351.47999999999996,33.96,51.84,25.679999999999996
 33 | 135.48,20.88,46.32,14.28
 34 | 116.64,1.7999999999999998,36,11.52
 35 | 318.72,24,0.36,20.88
 36 | 114.84,1.68,8.88,11.4
 37 | 348.84,4.919999999999999,10.2,15.36
 38 | 320.28,52.559999999999995,6,30.479999999999997
 39 | 89.64,59.279999999999994,54.84,17.639999999999997
 40 | 51.72,32.04,42.12,12.12
 41 | 273.59999999999997,45.24,38.4,25.8
 42 | 243,26.76,37.92,19.92
 43 | 212.4,40.08,46.440000000000005,20.52
 44 | 352.32,33.239999999999995,2.16,24.84
 45 | 248.28,10.08,31.679999999999996,15.48
 46 | 30.12,30.839999999999996,51.959999999999994,10.2
 47 | 210.11999999999998,27,37.8,17.88
 48 | 107.64,11.88,42.84,12.719999999999999
 49 | 287.88,49.8,22.2,27.84
 50 | 272.64,18.96,59.879999999999995,17.76
 51 | 80.28,14.04,44.16,11.639999999999999
 52 | 239.76,3.7199999999999998,41.52,13.68
 53 | 120.48,11.52,4.32,12.839999999999998
 54 | 259.68,50.04,47.52,27.12
 55 | 219.11999999999998,55.440000000000005,70.44,25.439999999999998
 56 | 315.23999999999995,34.56,19.08,24.24
 57 | 238.68,59.279999999999994,72,28.439999999999998
 58 | 8.76,33.72,49.68,6.6
 59 | 163.43999999999997,23.04,19.92,15.839999999999998
 60 | 252.96,59.519999999999996,45.24,28.56
 61 | 252.83999999999997,35.4,11.16,22.08
 62 | 64.2,2.4,25.679999999999996,9.719999999999999
 63 | 313.56,51.24,65.64,29.04
 64 | 287.16,18.599999999999998,32.76,18.84
 65 | 123.24,35.52,10.08,16.8
 66 | 157.32,51.35999999999999,34.68,21.599999999999998
 67 | 82.8,11.16,1.08,11.16
 68 | 37.8,29.52,2.64,11.4
 69 | 167.16,17.4,12.239999999999998,16.08
 70 | 284.88,33,13.2,22.679999999999996
 71 | 260.16,52.68,32.64,26.76
 72 | 238.92,36.72,46.440000000000005,21.96
 73 | 131.76,17.16,38.04,14.879999999999999
 74 | 32.16,39.6,23.16,10.56
 75 | 155.28,6.84,37.56,13.2
 76 | 256.08,29.52,15.719999999999999,20.4
 77 | 20.279999999999998,52.440000000000005,107.28,10.44
 78 | 33,1.92,24.84,8.28
 79 | 144.6,34.199999999999996,17.04,17.04
 80 | 6.48,35.879999999999995,11.28,6.359999999999999
 81 | 139.2,9.24,27.720000000000002,13.2
 82 | 91.68,32.04,26.76,14.16
 83 | 287.76,4.919999999999999,44.279999999999994,14.76
 84 | 90.36,24.36,39,13.56
 85 | 82.08,53.4,42.72,16.32
 86 | 256.2,51.6,40.559999999999995,26.04
 87 | 231.83999999999997,22.08,78.84,18.24
 88 | 91.55999999999999,33,19.2,14.399999999999999
 89 | 132.84,48.72,75.84,19.2
 90 | 105.96,30.599999999999998,88.08,15.48
 91 | 131.76,57.35999999999999,61.67999999999999,20.04
 92 | 161.16,5.88,11.16,13.44
 93 | 34.32,1.7999999999999998,39.6,8.76
 94 | 261.23999999999995,40.199999999999996,70.8,23.279999999999998
 95 | 301.08,43.8,86.75999999999999,26.639999999999997
 96 | 128.88,16.8,13.08,13.799999999999999
 97 | 195.96,37.92,63.48,20.279999999999998
 98 | 237.11999999999998,4.2,7.08,14.04
 99 | 221.88,25.2,26.4,18.599999999999998
100 | 347.64,50.76,61.44,30.479999999999997
101 | 162.23999999999998,50.04,55.08,20.639999999999997
102 | 266.88,5.159999999999999,59.75999999999999,14.04
103 | 355.67999999999995,43.559999999999995,121.08,28.56
104 | 336.23999999999995,12.12,25.679999999999996,17.76
105 | 225.48,20.639999999999997,21.479999999999997,17.639999999999997
106 | 285.84,41.16,6.359999999999999,24.84
107 | 165.48,55.68,70.8,23.04
108 | 30,13.2,35.64,8.64
109 | 108.48,0.36,27.84,10.44
110 | 15.719999999999999,0.48,30.72,6.359999999999999
111 | 306.48,32.279999999999994,6.6,23.76
112 | 270.96,9.839999999999998,67.8,16.08
113 | 290.03999999999996,45.6,27.84,26.16
114 | 210.83999999999997,18.48,2.88,16.919999999999998
115 | 251.51999999999998,24.720000000000002,12.839999999999998,19.08
116 | 93.84,56.16,41.4,17.52
117 | 90.11999999999999,42,63.24,15.12
118 | 167.04,17.16,30.72,14.639999999999999
119 | 91.68,0.96,17.76,11.28
120 | 150.84,44.279999999999994,95.04,19.08
121 | 23.279999999999998,19.2,26.76,7.919999999999999
122 | 169.56,32.16,55.440000000000005,18.599999999999998
123 | 22.56,26.04,60.48,8.4
124 | 268.8,2.88,18.72,13.92
125 | 147.72,41.52,14.879999999999999,18.24
126 | 275.4,38.76,89.04,23.639999999999997
127 | 104.64,14.16,31.08,12.719999999999999
128 | 9.36,46.68,60.72,7.919999999999999
129 | 96.24,0,11.04,10.56
130 | 264.36,58.8,3.84,29.639999999999997
131 | 71.52,14.399999999999999,51.72,11.639999999999999
132 | 0.84,47.52,10.44,1.92
133 | 318.23999999999995,3.48,51.6,15.239999999999998
134 | 10.08,32.64,2.52,6.84
135 | 263.76,40.199999999999996,54.12,23.52
136 | 44.279999999999994,46.32,78.71999999999998,12.96
137 | 57.959999999999994,56.4,10.2,13.92
138 | 30.72,46.8,11.16,11.4
139 | 328.44,34.68,71.64,24.96
140 | 51.6,31.08,24.599999999999998,11.52
141 | 221.88,52.68,2.04,24.84
142 | 88.08,20.4,15.48,13.08
143 | 232.43999999999997,42.48,90.71999999999998,23.04
144 | 264.59999999999997,39.84,45.48,24.12
145 | 125.51999999999998,6.84,41.279999999999994,12.48
146 | 115.44,17.76,46.68,13.68
147 | 168.36,2.28,10.799999999999999,12.360000000000001
148 | 288.12,8.76,10.44,15.839999999999998
149 | 291.84,58.8,53.16,30.479999999999997
150 | 45.6,48.35999999999999,14.28,13.08
151 | 53.64,30.96,24.720000000000002,12.12
152 | 336.84,16.68,44.4,19.32
153 | 145.2,10.08,58.44,13.92
154 | 237.11999999999998,27.96,17.04,19.92
155 | 205.56,47.64,45.24,22.8
156 | 225.36,25.32,11.4,18.72
157 | 4.919999999999999,13.92,6.84,3.84
158 | 112.68,52.199999999999996,60.599999999999994,18.36
159 | 179.76000000000002,1.56,29.16,12.12
160 | 14.04,44.279999999999994,54.24,8.76
161 | 158.04,22.08,41.52,15.48
162 | 207,21.720000000000002,36.839999999999996,17.28
163 | 102.84,42.959999999999994,59.16,15.96
164 | 226.08,21.720000000000002,30.72,17.88
165 | 196.2,44.16,8.88,21.599999999999998
166 | 140.64,17.639999999999997,6.48,14.28
167 | 281.4,4.08,101.75999999999999,14.28
168 | 21.479999999999997,45.12,25.92,9.6
169 | 248.16,6.24,23.279999999999998,14.639999999999999
170 | 258.48,28.32,69.12,20.52
171 | 341.16,12.719999999999999,7.68,18
172 | 60,13.92,22.08,10.08
173 | 197.4,25.08,56.879999999999995,17.4
174 | 23.52,24.12,20.4,9.12
175 | 202.08,8.52,15.36,14.04
176 | 266.88,4.08,15.719999999999999,13.799999999999999
177 | 332.28,58.67999999999999,50.16,32.4
178 | 298.08,36.239999999999995,24.36,24.24
179 | 204.23999999999998,9.36,42.24,14.04
180 | 332.03999999999996,2.76,28.439999999999998,14.16
181 | 198.72,12,21.12,15.12
182 | 187.92,3.12,9.96,12.6
183 | 262.2,6.48,32.879999999999995,14.639999999999999
184 | 67.44,6.84,35.64,10.44
185 | 345.12,51.6,86.16,31.439999999999998
186 | 304.56,25.56,36,21.12
187 | 246,54.12,23.52,27.12
188 | 167.4,2.52,31.92,12.360000000000001
189 | 229.32,34.44,21.84,20.76
190 | 343.2,16.68,4.44,19.08
191 | 22.439999999999998,14.52,28.08,8.04
192 | 47.4,49.32,6.96,12.96
193 | 90.6,12.96,7.199999999999999,11.88
194 | 20.639999999999997,4.919999999999999,37.92,7.08
195 | 200.16,50.4,4.32,23.52
196 | 179.64,42.72,7.199999999999999,20.76
197 | 45.84,4.44,16.56,9.12
198 | 113.04,5.88,9.719999999999999,11.639999999999999
199 | 212.4,11.16,7.68,15.36
200 | 340.32,50.4,79.44,30.599999999999998
201 | 278.52,10.319999999999999,10.44,16.08
202 | 


--------------------------------------------------------------------------------
/data/titanic/titanic.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scicloj/scicloj.ml-tutorials/951cde0b8bd0a1b22ec856d28a6122d69d34836f/data/titanic/titanic.zip


--------------------------------------------------------------------------------
/data/tweets_sentiment.feather:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scicloj/scicloj.ml-tutorials/951cde0b8bd0a1b22ec856d28a6122d69d34836f/data/tweets_sentiment.feather


--------------------------------------------------------------------------------
/deps.edn:
--------------------------------------------------------------------------------
 1 | {:paths ["src" "resources"]
 2 | 
 3 |  ;; Top-level dependencies of the tutorials.
 4 |  :deps {
 5 |         io.github.nextjournal/clerk {:git/sha "a6bfc832a182ef3068d60a318985681ddb913595"
 6 |                                      :git/url "https://github.com/nextjournal/clerk.git"}
 7 | 
 8 |         ;; {:mvn/version "0.11.603"}
 9 |         org.clojure/clojure {:mvn/version "1.11.1"}
10 | 
11 |         scicloj/scicloj.ml             {:mvn/version "0.2.2"}
12 | 
13 |         org.scicloj/scicloj.ml.clj-djl {:mvn/version "0.1.11"}
14 |         scicloj/sklearn-clj            {:mvn/version "0.3.7"}
15 | 
16 |         org.apache.arrow/arrow-vector  {:mvn/version  "6.0.0"}
17 |         org.lz4/lz4-java               {:mvn/version "1.8.0"}
18 |         com.github.luben/zstd-jni      {:mvn/version "1.5.1-1"}
19 |         org.clojure/tools.logging {:mvn/version "1.2.4"}
20 |         com.fasterxml.jackson.core/jackson-databind {:mvn/version
21 |                                                      "2.13.2"}
22 |         com.fasterxml.jackson.core/jackson-core {:mvn/version
23 |                                                  "2.13.2"}
24 | 
25 |         com.fasterxml.jackson.core/jackson-annotations {:mvn/version
26 |                                                         "2.13.2"}
27 | 
28 |         ch.qos.logback/logback-classic {:mvn/version "1.4.4"}
29 |         scicloj/notespace              {:mvn/version "3-beta9"}
30 | 
31 |         dk.simongray/datalinguist      {:mvn/version "0.1.163"}
32 |         applied-science/waqi           {:git/url "https://github.com/applied-science/waqi/"
33 |                                         :sha "faefe5dfd1b161ff70089924591ac2d699527811"} ; NOTE(review): uses :sha while the clerk coordinate above uses :git/sha -- consider unifying
34 |         clj-python/libpython-clj       {:mvn/version "2.020"}
35 |         scicloj/clojisr                {:mvn/version "1.0.0-BETA20"}
36 | 
37 |         generateme/fastmath            {:mvn/version "2.1.6"}
38 |         uncomplicate/neanderthal       {:mvn/version "0.43.0"}
39 |         aerial.hanami/aerial.hanami    {:mvn/version "0.12.9"}
40 |         net.clojars.behrica/cluster_eval {:git/url "https://github.com/behrica/cluster-eval.git"
41 |                                           :sha "ca34283a67bf18c8025955865fb567bd6e2e9a9a"}} ; NOTE(review): :sha here as well, see the clerk coordinate's :git/sha
42 |         ;; appliedsciencestudio/rdata     {:git/url "https://github.com/appliedsciencestudio/rdata/"
43 |         ;;                                 :sha "151e6dead06b38995f1f30b09d954a060f7a2a9c"}
44 | 
45 | 
46 | 
47 |  ;; Optional aliases.
48 |  :aliases
49 | 
50 |  ;; :jdk-17 - JVM options (for JDK 17, per the alias name): adds the jdk.incubator.foreign module and enables native access for ALL-UNNAMED.
51 |  {
52 |   :jdk-17
53 |            {:jvm-opts ["--add-modules" "jdk.incubator.foreign"
54 |                        "--enable-native-access=ALL-UNNAMED"]}
55 |   ;; :reveal - adds vlaaad/reveal and uses vlaaad.reveal/repl as the exec function.
56 |   :reveal {:extra-deps {vlaaad/reveal {:mvn/version "1.3.250"}}
57 |            :ns-default vlaaad.reveal
58 |            :exec-fn repl}
59 |   :reveal-nrepl-middleware ; runs nrepl.cmdline with the reveal and CIDER middleware; NOTE(review): pins reveal 1.3.194 while :reveal pins 1.3.250
60 |   {:extra-deps {vlaaad/reveal {:mvn/version "1.3.194"}}
61 |    :main-opts  ["-m" "nrepl.cmdline"
62 |                 "--middleware" "[vlaaad.reveal.nrepl/middleware,cider.nrepl/cider-middleware]"]}
63 |   ;; Packaging: :jar builds ml.tutorials.jar with depstar; :install and :deploy pass that jar to deps-deploy with the :local / :remote installer.
64 |   :jar {:replace-deps {com.github.seancorfield/depstar {:mvn/version "2.1.278"}}
65 |         :exec-fn hf.depstar/jar
66 |         :exec-args {:jar "ml.tutorials.jar" :sync-pom true}}
67 |   :install {:replace-deps {slipset/deps-deploy {:mvn/version "0.1.5"}}
68 |             :exec-fn deps-deploy.deps-deploy/deploy
69 |             :exec-args {:installer :local :artifact "ml.tutorials.jar"}}
70 |   :deploy {:replace-deps {slipset/deps-deploy {:mvn/version "0.1.5"}}
71 |            :exec-fn deps-deploy.deps-deploy/deploy
72 |            :exec-args {:installer :remote :artifact "ml.tutorials.jar"}}}}
73 | 


--------------------------------------------------------------------------------
/deps.local.edn:
--------------------------------------------------------------------------------
 1 | {;; regular deps.edn keys (:deps, :aliases) will work in here
 2 |  :deps {}
 3 |  :aliases {}
 4 | 
 5 |  ;; some extra keys are supported to influence launchpad itself
 6 |  :launchpad/aliases [:jdk-17 :test] ; additional aliases, added to whatever aliases you specify on the command line
 7 |                              ; NOTE(review): no :test alias is declared in this file or in deps.edn's :aliases — confirm where it is defined
 8 |  :launchpad/main-opts ["--emacs"]} ; additional CLI flags, so you can encode your
 9 |                                   ; own preferences
10 |   ; NOTE(review): orphaned comment kept from the original file — it describes no key set here:
11 |                                 ; "which shadow builds to start, although it may be preferable to
12 |                                 ; configure this as part of specific aliases in your main deps.edn"
13 | 


--------------------------------------------------------------------------------
/doc/intro.md:
--------------------------------------------------------------------------------
1 | # Introduction to ml.tutorials
2 | 
3 | TODO: write [great documentation](http://jacobian.org/writing/what-to-write/)
4 | 


--------------------------------------------------------------------------------
/docs/interactions_ols.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html lang="en-US">
 3 |     <head>
 4 |         <meta charset="UTF-8">
 5 |         <meta name="viewport" content="width=device-width, initial-scale=1">
 6 |         <link href="https://stackpath.bootstrapcdn.com/bootswatch/4.5.0/sandstone/bootstrap.min.css" rel="stylesheet" type="text/css">
 7 |         <link href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/9.6.0/styles/solarized-light.min.css" rel="stylesheet" type="text/css">
 8 |         <link href="https://cdnjs.cloudflare.com/ajax/libs/ag-grid/24.0.0/styles/ag-grid.min.css" rel="stylesheet" type="text/css">
 9 |         <link href="https://cdnjs.cloudflare.com/ajax/libs/ag-grid/24.0.0/styles/ag-theme-balham.min.css" rel="stylesheet" type="text/css">
10 |         <link href="https://unpkg.com/leaflet@1.6.0/dist/leaflet.css" rel="stylesheet" type="text/css">
11 | 
12 |         <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/katex@0.12.0/dist/katex.min.css" integrity="sha384-AfEj0r4/OFrOo5t7NnNe46zW/tFgW6x/bCJG8FqQCEo3+Aro6EYUG4+cU+KJWu/X" crossorigin="anonymous">
13 |         <!-- The loading of KaTeX is deferred to speed up page rendering -->
14 |         <script defer src="https://cdn.jsdelivr.net/npm/katex@0.12.0/dist/katex.min.js" integrity="sha384-g7c+Jr9ZivxKLnZTDUhnkOnsh30B4H0rpLUpJ4jAIKs4fnJI+sEnkvrMWph2EDg4" crossorigin="anonymous"></script>
15 |     </head>
16 |     <body>
17 |         <p id="loading">Loading ...</p>
18 |         <div id="app"></div>
19 |     </body>
20 |     <script id="state" type="text">"{:options {:reverse-notes? false, :header? false, :notes-in-cards? false, :initially-collapse? false, :auto-scroll? false, :port 5678, :custom-header [:div {:style {:font-style \"italic\", :font-family \"\\\"Lucida Console\\\", Courier, monospace\"}} \"(notespace)\" [:p \"Sun Dec 18 23:13:45 CET 2022\"] nil [:hr]], :custom-footer [:div [:hr] [:hr]]}, :ids [\"1832\" \"1834\" \"1836\" \"1838\" \"1840\" \"1842\" \"1844\" \"1846\" \"1848\" \"1850\" \"1852\" \"1854\" \"1856\" \"1858\" \"1860\" \"1862\" \"1864\" \"1866\" \"1868\" \"1870\" \"1872\" \"1874\" \"1876\" \"1878\" \"1880\" \"1882\" \"1884\" \"1886\" \"1888\" \"1890\" \"1892\" \"1894\"], :id->content {\"1836\" [:div [:p] nil nil [:p/markdown \"This examples how, how to do interactions in linear regression with `scicloj.ml`\"]], \"1874\" [:div [:p] [:div [:p/code {:code \"(def pipe-interaction\\n  (ml/pipeline\\n   (mm/drop-columns [:newspaper])\\n   (mm/add-column :youtube*facebook (fn [ds] (dtf/* (ds :youtube) (ds :facebook))))\\n   (mm/set-inference-target :sales)\\n   {:metamorph/id :model}(mm/model {:model-type :smile.regression/ordinary-least-square})))\", :bg-class \"bg-light\"}]] nil nil], \"1880\" [:div [:p] nil nil [:p/markdown \"and print it and the performance metrices:\"]], \"1856\" [:div [:p] nil nil [:p/markdown \"and print the result:\"]], \"1866\" [:div [:p] nil nil [:p/markdown \"R2\"]], \"1888\" [:div [:p] [:div [:p/code {:code \"(-> evaluations flatten first :test-transform :metric)\", :bg-class \"bg-light\"}]] nil [:p/code {:code \"0.910278084211143\\n\"}]], \"1876\" [:div [:p] nil nil [:p/markdown \"Again we evaluate the model,\"]], \"1864\" [:div [:p] [:div [:p/code {:code \"(-> evaluations flatten first :test-transform :metric)\", :bg-class \"bg-light\"}]] nil [:p/code {:code \"1.9444172617257074\\n\"}]], \"1868\" [:div [:p] [:div [:p/code {:code \"(-> evaluations flatten first :test-transform :other-metrices first :metric)\", :bg-class 
\"bg-light\"}]] nil [:p/code {:code \"0.8919962073759851\\n\"}]], \"1838\" [:div [:p] nil nil [:p/markdown \"Taking ideas from: \\nhttp://www.sthda.com/english/articles/40-regression-analysis/164-interaction-effect-in-multiple-regression-essentials/#comments-list\"]], \"1884\" [:div [:p] nil nil [:p/markdown \"As the multiplcation of 'youtube * facebook' is as well statistically relevant, it\\nsuggests that there is indeed an interaction between these 2 predictor variables youtube and facebook.\"]], \"1842\" [:div [:p] nil nil [:p/markdown \"First we load the data:\"]], \"1832\" [:div [:p] [:div [:p/code {:code \"(require '[scicloj.ml.core :as ml]\\n         '[scicloj.ml.metamorph :as mm]\\n         '[scicloj.ml.dataset :refer [dataset add-column]]\\n         '[scicloj.ml.dataset :as ds]\\n         '[tech.v3.dataset.math :as std-math]\\n         '[tech.v3.datatype.functional :as dtf]\\n         '[scicloj.metamorph.ml.toydata :as datasets])\", :bg-class \"bg-light\"}]] nil nil], \"1858\" [:div [:p] [:div [:p/code {:code \"^kind/hiccup\\n(text->hiccup\\n (str\\n  (-> evaluations flatten first :fit-ctx :model ml/thaw-model str)))\", :bg-class \"bg-light\"}]] nil (\"Linear Model:\" [:br] \"\" [:br] \"Residuals:\" [:br] \"       Min          1Q      Median          3Q         Max\" [:br] \"  -10.6397     -1.0434      0.3164      1.4446      3.3400\" [:br] \"\" [:br] \"Coefficients:\" [:br] \"                  Estimate Std. Error    t value   Pr(>|t|)\" [:br] \"Intercept           3.5202     0.4340     8.1103     0.0000 ***\" [:br] \"youtube             0.0454     0.0017    26.9969     0.0000 ***\" [:br] \"facebook            0.1894     0.0103    18.3898     0.0000 ***\" [:br] \"---------------------------------------------------------------------\" [:br] \"Significance codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 
0.1 ' ' 1\" [:br] \"\" [:br] \"Residual standard error: 2.0550 on 130 degrees of freedom\" [:br] \"Multiple R-squared: 0.8994,    Adjusted R-squared: 0.8978\" [:br] \"F-statistic: 580.9835 on 3 and 130 DF,  p-value: 1.496e-65\")], \"1878\" [:div [:p] [:div [:p/code {:code \"(def evaluations\\n  (ml/evaluate-pipelines\\n   [pipe-interaction]\\n   (ds/split->seq marketing :holdout)\\n   ml/rmse\\n   :loss\\n   {:other-metrices [{:name :r2\\n                      :metric-fn fmstats/r2-determination}]}))\", :bg-class \"bg-light\"}]] nil nil], \"1854\" [:div [:p] [:div [:p/code {:code \"(def evaluations\\n  (ml/evaluate-pipelines\\n   [additive-pipeline]\\n   (ds/split->seq marketing :holdout)\\n   ml/rmse\\n   :loss\\n   {:other-metrices [{:name :r2\\n                      :metric-fn fmstats/r2-determination}]}))\", :bg-class \"bg-light\"}]] nil nil], \"1894\" [:div [:p] nil nil [:p/markdown \"RMSE and R2 of the intercation model are sligtly better.\\nThese results suggest that the model with the interaction term is better than the model that contains only main effects.\\nSo, for this specific data, we should go for the model with the interaction model.\\n\"]], \"1840\" [:div [:p] [:div [:p/code {:code \"(defn pp-str [x]\\n  (with-out-str (clojure.pprint/pprint x)))\", :bg-class \"bg-light\"}]] nil nil], \"1886\" [:div [:p] nil nil [:p/markdown \"RMSE\"]], \"1846\" [:div [:p] nil nil [:p/markdown \"## Additive model\"]], \"1848\" [:div [:p] nil nil [:p/markdown \"Firts we build an additive model, which model equation is 'sales = b0 + b1 * youtube + b2 * facebook'\"]], \"1870\" [:div [:p] nil nil [:p/markdown \"## Interaction effects\"]], \"1882\" [:div [:p] [:div [:p/code {:code \"^kind/hiccup\\n(text->hiccup\\n (str\\n  (-> evaluations flatten first :fit-ctx :model ml/thaw-model str)))\", :bg-class \"bg-light\"}]] nil (\"Linear Model:\" [:br] \"\" [:br] \"Residuals:\" [:br] \"       Min          1Q      Median          3Q         Max\" [:br] \"   -7.3251     -0.4438   
   0.2614      0.7721      1.7986\" [:br] \"\" [:br] \"Coefficients:\" [:br] \"                  Estimate Std. Error    t value   Pr(>|t|)\" [:br] \"Intercept           7.9497     0.3797    20.9361     0.0000 ***\" [:br] \"youtube             0.0201     0.0021     9.5757     0.0000 ***\" [:br] \"facebook            0.0261     0.0112     2.3422     0.0207 *\" [:br] \"youtube*facebook     0.0009     0.0001    15.3438     0.0000 ***\" [:br] \"---------------------------------------------------------------------\" [:br] \"Significance codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1\" [:br] \"\" [:br] \"Residual standard error: 1.2414 on 129 degrees of freedom\" [:br] \"Multiple R-squared: 0.9641,    Adjusted R-squared: 0.9632\" [:br] \"F-statistic: 1153.5786 on 4 and 129 DF,  p-value: 6.073e-93\")], \"1852\" [:div [:p] nil nil [:p/markdown \"We evaluate it, \"]], \"1850\" [:div [:p] [:div [:p/code {:code \"(def additive-pipeline\\n  (ml/pipeline\\n   (mm/set-inference-target :sales)\\n   (mm/drop-columns [:newspaper])\\n   {:metamorph/id :model}\\n   (mm/model {:model-type :smile.regression/ordinary-least-square})))\", :bg-class \"bg-light\"}]] nil nil], \"1860\" [:div [:p] nil nil [:p/markdown \"We have the following metrices:\"]], \"1892\" [:div [:p] [:div [:p/code {:code \"(-> evaluations flatten first :test-transform :other-metrices first :metric)\", :bg-class \"bg-light\"}]] nil [:p/code {:code \"0.9771083289130588\\n\"}]], \"1844\" [:div [:p] [:div [:p/code {:code \"(def marketing (tc/dataset \\\"data/marketing.csv\\\" {:key-fn keyword}))\", :bg-class \"bg-light\"}]] nil nil], \"1872\" [:div [:p] nil nil [:p/markdown \"Now we add interaction effects to it, resulting in this model equation: 'sales = b0 + b1 * youtube + b2 * facebook + b3 * (youtube * facebook)'\"]], \"1862\" [:div [:p] nil nil [:p/markdown \"RMSE\"]], \"1834\" [:div [:p] [:div [:p/code {:code \"(comment\\n  (note/init-with-browser)\\n  (note/eval-this-notespace)\\n  (note/render-static-html 
\\\"docs/interactions_ols.html\\\"))\", :bg-class \"bg-light\"}]] nil [:p/code {:code \"nil\\n\"}]], \"1890\" [:div [:p] nil nil [:p/markdown \"R2\"]]}}"</script>
21 |     <script src="gorilla-notes/js/compiled/main.js"></script>
22 | </html>
23 | 


--------------------------------------------------------------------------------
/docs/notespace-files/tree.svg:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0" encoding="UTF-8" standalone="no"?>
  2 | <!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
  3 |  "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
  4 | <!-- Generated by graphviz version 2.49.3 (20211027.1534)
  5 |  -->
  6 | <!-- Title: CART Pages: 1 -->
  7 | <svg width="602pt" height="616pt"
  8 |  viewBox="0.00 0.00 601.92 615.77" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
  9 | <g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 611.77)">
 10 | <title>CART</title>
 11 | <polygon fill="white" stroke="transparent" points="-4,4 -4,-611.77 597.92,-611.77 597.92,4 -4,4"/>
 12 | <!-- 1 -->
 13 | <g id="node1" class="node">
 14 | <title>1</title>
 15 | <path fill="transparent" stroke="black" d="M374.46,-607.77C374.46,-607.77 218.46,-607.77 218.46,-607.77 212.46,-607.77 206.46,-601.77 206.46,-595.77 206.46,-595.77 206.46,-566.77 206.46,-566.77 206.46,-560.77 212.46,-554.77 218.46,-554.77 218.46,-554.77 374.46,-554.77 374.46,-554.77 380.46,-554.77 386.46,-560.77 386.46,-566.77 386.46,-566.77 386.46,-595.77 386.46,-595.77 386.46,-601.77 380.46,-607.77 374.46,-607.77"/>
 16 | <text text-anchor="start" x="238.46" y="-592.57" font-family="Helvetica,sans-Serif" font-size="14.00">petal_length ≤ 2.45</text>
 17 | <text text-anchor="start" x="264.96" y="-577.57" font-family="Helvetica,sans-Serif" font-size="14.00">size = 150</text>
 18 | <text text-anchor="start" x="214.46" y="-562.57" font-family="Helvetica,sans-Serif" font-size="14.00">impurity reduction = 0.3333</text>
 19 | </g>
 20 | <!-- 2 -->
 21 | <g id="node2" class="node">
 22 | <title>2</title>
 23 | <ellipse fill="transparent" stroke="black" cx="197.46" cy="-481.29" rx="90.52" ry="37.45"/>
 24 | <text text-anchor="start" x="162.46" y="-492.59" font-family="Helvetica,sans-Serif" font-size="14.00">species = 0</text>
 25 | <text text-anchor="start" x="169.96" y="-477.59" font-family="Helvetica,sans-Serif" font-size="14.00">size = 50</text>
 26 | <text text-anchor="start" x="141.46" y="-462.59" font-family="Helvetica,sans-Serif" font-size="14.00">deviance = 3.8466</text>
 27 | </g>
 28 | <!-- 1&#45;&gt;2 -->
 29 | <g id="edge1" class="edge">
 30 | <title>1&#45;&gt;2</title>
 31 | <path fill="none" stroke="black" d="M270.45,-554.52C260.86,-545.03 249.74,-534.03 239.12,-523.52"/>
 32 | <polygon fill="black" stroke="black" points="241.43,-520.88 231.86,-516.34 236.51,-525.86 241.43,-520.88"/>
 33 | <text text-anchor="middle" x="231.99" y="-537.64" font-family="Helvetica,sans-Serif" font-size="14.00">True</text>
 34 | </g>
 35 | <!-- 3 -->
 36 | <g id="node3" class="node">
 37 | <title>3</title>
 38 | <path fill="transparent" stroke="black" d="M474.46,-507.79C474.46,-507.79 318.46,-507.79 318.46,-507.79 312.46,-507.79 306.46,-501.79 306.46,-495.79 306.46,-495.79 306.46,-466.79 306.46,-466.79 306.46,-460.79 312.46,-454.79 318.46,-454.79 318.46,-454.79 474.46,-454.79 474.46,-454.79 480.46,-454.79 486.46,-460.79 486.46,-466.79 486.46,-466.79 486.46,-495.79 486.46,-495.79 486.46,-501.79 480.46,-507.79 474.46,-507.79"/>
 39 | <text text-anchor="start" x="340.96" y="-492.59" font-family="Helvetica,sans-Serif" font-size="14.00">petal_width ≤ 1.75</text>
 40 | <text text-anchor="start" x="364.96" y="-477.59" font-family="Helvetica,sans-Serif" font-size="14.00">size = 100</text>
 41 | <text text-anchor="start" x="314.46" y="-462.59" font-family="Helvetica,sans-Serif" font-size="14.00">impurity reduction = 0.3897</text>
 42 | </g>
 43 | <!-- 1&#45;&gt;3 -->
 44 | <g id="edge2" class="edge">
 45 | <title>1&#45;&gt;3</title>
 46 | <path fill="none" stroke="black" d="M322.73,-554.52C335.02,-542.49 349.8,-528.01 362.88,-515.19"/>
 47 | <polygon fill="black" stroke="black" points="365.67,-517.36 370.36,-507.86 360.77,-512.36 365.67,-517.36"/>
 48 | <text text-anchor="middle" x="370.11" y="-529.16" font-family="Helvetica,sans-Serif" font-size="14.00">False</text>
 49 | </g>
 50 | <!-- 6 -->
 51 | <g id="node4" class="node">
 52 | <title>6</title>
 53 | <path fill="transparent" stroke="black" d="M373.46,-396.84C373.46,-396.84 217.46,-396.84 217.46,-396.84 211.46,-396.84 205.46,-390.84 205.46,-384.84 205.46,-384.84 205.46,-355.84 205.46,-355.84 205.46,-349.84 211.46,-343.84 217.46,-343.84 217.46,-343.84 373.46,-343.84 373.46,-343.84 379.46,-343.84 385.46,-349.84 385.46,-355.84 385.46,-355.84 385.46,-384.84 385.46,-384.84 385.46,-390.84 379.46,-396.84 373.46,-396.84"/>
 54 | <text text-anchor="start" x="239.46" y="-381.64" font-family="Helvetica,sans-Serif" font-size="14.00">sepal_length ≤ 7.1</text>
 55 | <text text-anchor="start" x="267.96" y="-366.64" font-family="Helvetica,sans-Serif" font-size="14.00">size = 54</text>
 56 | <text text-anchor="start" x="213.46" y="-351.64" font-family="Helvetica,sans-Serif" font-size="14.00">impurity reduction = 0.0311</text>
 57 | </g>
 58 | <!-- 3&#45;&gt;6 -->
 59 | <g id="edge3" class="edge">
 60 | <title>3&#45;&gt;6</title>
 61 | <path fill="none" stroke="black" d="M372.78,-454.75C358.95,-439.83 341.34,-420.83 326.42,-404.74"/>
 62 | <polygon fill="black" stroke="black" points="328.71,-402.05 319.34,-397.1 323.57,-406.81 328.71,-402.05"/>
 63 | </g>
 64 | <!-- 7 -->
 65 | <g id="node5" class="node">
 66 | <title>7</title>
 67 | <ellipse fill="transparent" stroke="black" cx="498.46" cy="-370.34" rx="95.42" ry="37.45"/>
 68 | <text text-anchor="start" x="463.46" y="-381.64" font-family="Helvetica,sans-Serif" font-size="14.00">species = 2</text>
 69 | <text text-anchor="start" x="470.96" y="-366.64" font-family="Helvetica,sans-Serif" font-size="14.00">size = 46</text>
 70 | <text text-anchor="start" x="438.96" y="-351.64" font-family="Helvetica,sans-Serif" font-size="14.00">deviance = 12.0834</text>
 71 | </g>
 72 | <!-- 3&#45;&gt;7 -->
 73 | <g id="edge4" class="edge">
 74 | <title>3&#45;&gt;7</title>
 75 | <path fill="none" stroke="black" d="M420.37,-454.75C431.9,-442.43 446.03,-427.34 459.08,-413.4"/>
 76 | <polygon fill="black" stroke="black" points="461.92,-415.49 466.2,-405.8 456.81,-410.7 461.92,-415.49"/>
 77 | </g>
 78 | <!-- 12 -->
 79 | <g id="node6" class="node">
 80 | <title>12</title>
 81 | <path fill="transparent" stroke="black" d="M274.46,-285.88C274.46,-285.88 118.46,-285.88 118.46,-285.88 112.46,-285.88 106.46,-279.88 106.46,-273.88 106.46,-273.88 106.46,-244.88 106.46,-244.88 106.46,-238.88 112.46,-232.88 118.46,-232.88 118.46,-232.88 274.46,-232.88 274.46,-232.88 280.46,-232.88 286.46,-238.88 286.46,-244.88 286.46,-244.88 286.46,-273.88 286.46,-273.88 286.46,-279.88 280.46,-285.88 274.46,-285.88"/>
 82 | <text text-anchor="start" x="140.96" y="-270.68" font-family="Helvetica,sans-Serif" font-size="14.00">petal_width ≤ 1.65</text>
 83 | <text text-anchor="start" x="168.96" y="-255.68" font-family="Helvetica,sans-Serif" font-size="14.00">size = 53</text>
 84 | <text text-anchor="start" x="114.46" y="-240.68" font-family="Helvetica,sans-Serif" font-size="14.00">impurity reduction = 0.0141</text>
 85 | </g>
 86 | <!-- 6&#45;&gt;12 -->
 87 | <g id="edge5" class="edge">
 88 | <title>6&#45;&gt;12</title>
 89 | <path fill="none" stroke="black" d="M272.25,-343.79C258.69,-328.87 241.43,-309.88 226.81,-293.78"/>
 90 | <polygon fill="black" stroke="black" points="229.18,-291.19 219.87,-286.15 224,-295.9 229.18,-291.19"/>
 91 | </g>
 92 | <!-- 13 -->
 93 | <g id="node7" class="node">
 94 | <title>13</title>
 95 | <ellipse fill="transparent" stroke="black" cx="395.46" cy="-259.38" rx="90.52" ry="37.45"/>
 96 | <text text-anchor="start" x="360.46" y="-270.68" font-family="Helvetica,sans-Serif" font-size="14.00">species = 2</text>
 97 | <text text-anchor="start" x="371.46" y="-255.68" font-family="Helvetica,sans-Serif" font-size="14.00">size = 1</text>
 98 | <text text-anchor="start" x="339.46" y="-240.68" font-family="Helvetica,sans-Serif" font-size="14.00">deviance = 1.3863</text>
 99 | </g>
100 | <!-- 6&#45;&gt;13 -->
101 | <g id="edge6" class="edge">
102 | <title>6&#45;&gt;13</title>
103 | <path fill="none" stroke="black" d="M318.9,-343.79C330.21,-331.48 344.05,-316.39 356.85,-302.45"/>
104 | <polygon fill="black" stroke="black" points="359.65,-304.58 363.83,-294.84 354.49,-299.84 359.65,-304.58"/>
105 | </g>
106 | <!-- 24 -->
107 | <g id="node8" class="node">
108 | <title>24</title>
109 | <ellipse fill="transparent" stroke="black" cx="95.46" cy="-148.43" rx="95.42" ry="37.45"/>
110 | <text text-anchor="start" x="60.46" y="-159.73" font-family="Helvetica,sans-Serif" font-size="14.00">species = 1</text>
111 | <text text-anchor="start" x="67.96" y="-144.73" font-family="Helvetica,sans-Serif" font-size="14.00">size = 51</text>
112 | <text text-anchor="start" x="35.96" y="-129.73" font-family="Helvetica,sans-Serif" font-size="14.00">deviance = 24.9439</text>
113 | </g>
114 | <!-- 12&#45;&gt;24 -->
115 | <g id="edge7" class="edge">
116 | <title>12&#45;&gt;24</title>
117 | <path fill="none" stroke="black" d="M172.78,-232.84C161.37,-220.53 147.38,-205.44 134.45,-191.49"/>
118 | <polygon fill="black" stroke="black" points="136.77,-188.84 127.4,-183.89 131.64,-193.6 136.77,-188.84"/>
119 | </g>
120 | <!-- 25 -->
121 | <g id="node9" class="node">
122 | <title>25</title>
123 | <path fill="transparent" stroke="black" d="M376.46,-174.93C376.46,-174.93 220.46,-174.93 220.46,-174.93 214.46,-174.93 208.46,-168.93 208.46,-162.93 208.46,-162.93 208.46,-133.93 208.46,-133.93 208.46,-127.93 214.46,-121.93 220.46,-121.93 220.46,-121.93 376.46,-121.93 376.46,-121.93 382.46,-121.93 388.46,-127.93 388.46,-133.93 388.46,-133.93 388.46,-162.93 388.46,-162.93 388.46,-168.93 382.46,-174.93 376.46,-174.93"/>
124 | <text text-anchor="start" x="241.46" y="-159.73" font-family="Helvetica,sans-Serif" font-size="14.00">sepal_width ≤ 2.75</text>
125 | <text text-anchor="start" x="274.46" y="-144.73" font-family="Helvetica,sans-Serif" font-size="14.00">size = 2</text>
126 | <text text-anchor="start" x="216.46" y="-129.73" font-family="Helvetica,sans-Serif" font-size="14.00">impurity reduction = 0.5000</text>
127 | </g>
128 | <!-- 12&#45;&gt;25 -->
129 | <g id="edge8" class="edge">
130 | <title>12&#45;&gt;25</title>
131 | <path fill="none" stroke="black" d="M220.37,-232.84C234.34,-217.92 252.12,-198.92 267.19,-182.83"/>
132 | <polygon fill="black" stroke="black" points="270.06,-184.88 274.34,-175.19 264.95,-180.1 270.06,-184.88"/>
133 | </g>
134 | <!-- 50 -->
135 | <g id="node10" class="node">
136 | <title>50</title>
137 | <ellipse fill="transparent" stroke="black" cx="198.46" cy="-37.48" rx="90.52" ry="37.45"/>
138 | <text text-anchor="start" x="163.46" y="-48.78" font-family="Helvetica,sans-Serif" font-size="14.00">species = 2</text>
139 | <text text-anchor="start" x="174.46" y="-33.78" font-family="Helvetica,sans-Serif" font-size="14.00">size = 1</text>
140 | <text text-anchor="start" x="142.46" y="-18.78" font-family="Helvetica,sans-Serif" font-size="14.00">deviance = 1.3863</text>
141 | </g>
142 | <!-- 25&#45;&gt;50 -->
143 | <g id="edge9" class="edge">
144 | <title>25&#45;&gt;50</title>
145 | <path fill="none" stroke="black" d="M275.01,-121.89C263.71,-109.57 249.86,-94.48 237.07,-80.54"/>
146 | <polygon fill="black" stroke="black" points="239.43,-77.94 230.09,-72.94 234.27,-82.67 239.43,-77.94"/>
147 | </g>
148 | <!-- 51 -->
149 | <g id="node11" class="node">
150 | <title>51</title>
151 | <ellipse fill="transparent" stroke="black" cx="397.46" cy="-37.48" rx="90.52" ry="37.45"/>
152 | <text text-anchor="start" x="362.46" y="-48.78" font-family="Helvetica,sans-Serif" font-size="14.00">species = 1</text>
153 | <text text-anchor="start" x="373.46" y="-33.78" font-family="Helvetica,sans-Serif" font-size="14.00">size = 1</text>
154 | <text text-anchor="start" x="341.46" y="-18.78" font-family="Helvetica,sans-Serif" font-size="14.00">deviance = 1.3863</text>
155 | </g>
156 | <!-- 25&#45;&gt;51 -->
157 | <g id="edge10" class="edge">
158 | <title>25&#45;&gt;51</title>
159 | <path fill="none" stroke="black" d="M321.67,-121.89C332.86,-109.57 346.57,-94.48 359.24,-80.54"/>
160 | <polygon fill="black" stroke="black" points="362.01,-82.69 366.15,-72.94 356.83,-77.98 362.01,-82.69"/>
161 | </g>
162 | </g>
163 | </svg>
164 | 


--------------------------------------------------------------------------------
/docs/tune-titanic.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html lang="en-US">
 3 |     <head>
 4 |         <meta charset="UTF-8">
 5 |         <meta name="viewport" content="width=device-width, initial-scale=1">
 6 |         <link href="https://stackpath.bootstrapcdn.com/bootswatch/4.5.0/sandstone/bootstrap.min.css" rel="stylesheet" type="text/css">
 7 |         <link href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/9.6.0/styles/solarized-light.min.css" rel="stylesheet" type="text/css">
 8 |         <link href="https://cdnjs.cloudflare.com/ajax/libs/ag-grid/24.0.0/styles/ag-grid.min.css" rel="stylesheet" type="text/css">
 9 |         <link href="https://cdnjs.cloudflare.com/ajax/libs/ag-grid/24.0.0/styles/ag-theme-balham.min.css" rel="stylesheet" type="text/css">
10 |         <link href="https://unpkg.com/leaflet@1.6.0/dist/leaflet.css" rel="stylesheet" type="text/css">
11 | 
12 |         <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/katex@0.12.0/dist/katex.min.css" integrity="sha384-AfEj0r4/OFrOo5t7NnNe46zW/tFgW6x/bCJG8FqQCEo3+Aro6EYUG4+cU+KJWu/X" crossorigin="anonymous">
13 |         <!-- The loading of KaTeX is deferred to speed up page rendering -->
14 |         <script defer src="https://cdn.jsdelivr.net/npm/katex@0.12.0/dist/katex.min.js" integrity="sha384-g7c+Jr9ZivxKLnZTDUhnkOnsh30B4H0rpLUpJ4jAIKs4fnJI+sEnkvrMWph2EDg4" crossorigin="anonymous"></script>
15 |     </head>
16 |     <body>
17 |         <p id="loading">Loading ...</p>
18 |         <div id="app"></div>
19 |     </body>
20 |     <script id="state" type="text">"{:options {:reverse-notes? false, :header? false, :notes-in-cards? false, :initially-collapse? false, :auto-scroll? false, :port 5678, :custom-header [:div {:style {:font-style \"italic\", :font-family \"\\\"Lucida Console\\\", Courier, monospace\"}} \"(notespace)\" [:p \"Sun Dec 18 23:13:15 CET 2022\"] nil [:hr]], :custom-footer [:div [:hr] [:hr]]}, :ids [\"1341\" \"1343\" \"1345\" \"1347\" \"1349\" \"1351\" \"1353\" \"1355\" \"1357\" \"1359\" \"1361\" \"1363\" \"1365\" \"1367\" \"1369\" \"1371\" \"1373\" \"1375\" \"1377\" \"1379\" \"1381\" \"1383\" \"1385\" \"1387\" \"1389\" \"1391\" \"1393\" \"1395\" \"1397\" \"1399\" \"1401\" \"1403\" \"1405\" \"1407\" \"1409\" \"1411\" \"1413\" \"1415\" \"1417\" \"1419\" \"1421\" \"1423\" \"1425\" \"1427\" \"1429\" \"1431\" \"1433\" \"1435\" \"1437\" \"1439\" \"1441\" \"1443\" \"1445\" \"1447\" \"1449\" \"1451\" \"1453\"], :id->content {\"1451\" [:div [:p] nil nil [:p/markdown \"trained with best hyper paramter\"]], \"1345\" [:div [:p] [:div [:p/code {:code \"(require  '[scicloj.ml.dataset :as ds]\\n          '[scicloj.ml.core :as ml]\\n          '[scicloj.ml.metamorph :as mm]\\n          '[camel-snake-kebab.core :as csk]\\n          '[scicloj.metamorph.ml.evaluation-handler :as eval-hn]\\n          '[tech.v3.datatype.functional :as dtfunc])\", :bg-class \"bg-light\"}]] nil nil], \"1365\" [:div [:p] [:div [:p/code {:code \"(def logistic-regression-pipelines\\n  (map\\n   #(make-decl-pipeline :smile.classification/logistic-regression %)\\n   (ml/sobol-gridsearch {:scaling-options {:scale? 
(ml/categorical [true false])}\\n                         :replace-missing-options {:value (ml/categorical [dtfunc/mean dtfunc/median])}\\n                         :model-options {:lambda (ml/categorical [0.1 0.2 0.5 0.7 1])\\n                                         :tolerance (ml/categorical [0.1 0.01 0.001 0.0001])}})))\", :bg-class \"bg-light\"}]] nil nil], \"1449\" [:div [:p] [:div [:p/code {:code \"(ml/thaw-model (:model  final-model))\", :bg-class \"bg-light\"}]] nil [:p/code {:code \"#object[smile.classification.RandomForest 0x5eb7a939 \\\"smile.classification.RandomForest@5eb7a939\\\"]\\n\"}]], \"1409\" [:div [:p] [:div [:p/code {:code \"best-pipe-decl\", :bg-class \"bg-light\"}]] nil [:p/code {:code \"[[:scicloj.ml.tune-titanic/assoc-pipe-opts\\n  {:scaling-options {:scale? false},\\n   :replace-missing-options\\n   {:value\\n    #object[tech.v3.datatype.functional$mean 0x7e25d9d1 \\\"tech.v3.datatype.functional$mean@7e25d9d1\\\"]},\\n   :model-options {:trees 250, :max-depth 10}}]\\n [:scicloj.ml.tune-titanic/replace-missing\\n  {:scaling-options {:scale? false},\\n   :replace-missing-options\\n   {:value\\n    #object[tech.v3.datatype.functional$mean 0x7e25d9d1 \\\"tech.v3.datatype.functional$mean@7e25d9d1\\\"]},\\n   :model-options {:trees 250, :max-depth 10}}]\\n [:mm/categorical->number [:survived] {} :int64]\\n [:scicloj.ml.tune-titanic/maybe-std-scale\\n  {:scaling-options {:scale? 
false},\\n   :replace-missing-options\\n   {:value\\n    #object[tech.v3.datatype.functional$mean 0x7e25d9d1 \\\"tech.v3.datatype.functional$mean@7e25d9d1\\\"]},\\n   :model-options {:trees 250, :max-depth 10}}]\\n [:mm/set-inference-target :survived]\\n #:metamorph{:id :model}\\n [:mm/model\\n  {:trees 250,\\n   :max-depth 10,\\n   :model-type :smile.classification/random-forest}]]\\n\"}]], \"1383\" [:div [:p] [:div [:p/code {:code \"(def files [atom []])\", :bg-class \"bg-light\"}]] nil nil], \"1395\" [:div [:p] [:div [:p/code {:code \"(def best-pipe-decl\\n  (-> best-evaluation first first :pipe-decl))\", :bg-class \"bg-light\"}]] nil nil], \"1363\" [:div [:p] [:div [:p/code {:code \"(defn make-decl-pipeline[model-type options]\\n  [[::assoc-pipe-opts options]\\n   [::replace-missing options]\\n   [:mm/categorical->number [:survived ] {} :int64]\\n   [::maybe-std-scale options]\\n   [:mm/set-inference-target :survived]\\n   {:metamorph/id :model} [:mm/model (merge (:model-options options) {:model-type model-type})]])\", :bg-class \"bg-light\"}]] nil nil], \"1391\" [:div [:p] [:div [:p/code {:code \"(def best-pipe-fn\\n  (-> best-evaluation first first :pipe-fn))\", :bg-class \"bg-light\"}]] nil nil], \"1453\" [:div [:p] [:div [:p/code {:code \"(-> final-model :pipe-options)\", :bg-class \"bg-light\"}]] nil [:p/code {:code \"{:scaling-options {:scale? 
true},\\n :replace-missing-options\\n {:value\\n  #object[tech.v3.datatype.functional$median 0x499d3e41 \\\"tech.v3.datatype.functional$median@499d3e41\\\"]},\\n :model-options {:trees 50, :max-depth 10}}\\n\"}]], \"1425\" [:div [:p] nil nil [:p/markdown \"Smile model object:\"]], \"1443\" [:div [:p] [:div [:p/code {:code \"(def final-model-by-cv\\n (let [inner-k-fold (ds/split->seq data :kfold {:k 5})\\n       evaluation (ml/evaluate-pipelines\\n                   all-pipelines\\n                   inner-k-fold\\n                   ml/classification-accuracy\\n                   :accuracy)\\n       fit-ctx (-> evaluation first first :fit-ctx)\\n       best-pipefn (-> evaluation first first :pipe-fn)]\\n   {:best-pipe-fn best-pipefn\\n    :fit-ctx fit-ctx}))\", :bg-class \"bg-light\"}]] nil nil], \"1407\" [:div [:p] nil nil [:p/markdown \"best pipeline (found on train data)\"]], \"1423\" [:div [:p] [:div [:p/code {:code \"^kind/dataset\\n(->\\n (ml/confusion-map predicted-survival-hold-out\\n                   (holdout-ds :survived))\\n (ml/confusion-map->ds))\", :bg-class \"bg-light\"}]] nil [:div {:class \"table table-striped table-hover table-condensed table-responsive\", :style {:height \"230px\"}} [:p/markdown \"_unnamed [3 3]:\\n\\n| :column-name |      0 |       1 |\\n|--------------|--------|---------|\\n|  column-name |      0 |       1 |\\n|            0 | 0.9159 | 0.08411 |\\n|            1 | 0.3750 |  0.6250 |\\n\"]]], \"1349\" [:div [:p] [:div [:p/code {:code \"(def  numeric-features [:age :parch :fare])\", :bg-class \"bg-light\"}]] nil nil], \"1373\" [:div [:p] nil nil [:p/markdown \"Simple split\"]], \"1399\" [:div [:p] nil nil [:p/markdown \"best accuracy found on train data: \\n0.8734622144112478\"]], \"1347\" [:div [:p] [:div [:p/code {:code \"(def  categorical-features  [:pclass :sex :embarked])\", :bg-class \"bg-light\"}]] nil nil], \"1387\" [:div [:p] [:div [:p/code {:code \"(def best-accuracy (-> best-evaluation first first :train-transform 
:metric))\", :bg-class \"bg-light\"}]] nil nil], \"1385\" [:div [:p] [:div [:p/code {:code \"(def best-evaluation\\n  (ml/evaluate-pipelines\\n   all-pipelines\\n   (ds/split->seq train-ds :kfold 5)\\n   ml/classification-accuracy\\n   :accuracy\\n   {;; :attach-fn-sources {:ns (find-ns 'scicloj.ml.tune-titanic)\\n    ;;                         :pipe-fns-clj-file \\\"src/scicloj/ml/tune_titanic.clj\\\"}\\n    :return-best-crossvalidation-only true\\n    :return-best-pipeline-only true}))\", :bg-class \"bg-light\"}]] nil nil], \"1429\" [:div [:p] nil nil [:p/markdown \"Feature importance:\"]], \"1341\" [:div [:p] [:div [:p/code {:code \"(comment\\n  (note/init-with-browser)\\n  (note/eval-this-notespace)\\n  (note/reread-this-notespace)\\n  (note/render-static-html \\\"docs/tune-titanic.html\\\")\\n  (note/init))\", :bg-class \"bg-light\"}]] nil [:p/code {:code \"nil\\n\"}]], \"1375\" [:div [:p] [:div [:p/code {:code \"(def splits (ds/split->seq data :holdout {:ratio 0.8}))\", :bg-class \"bg-light\"}]] nil nil], \"1355\" [:div [:p] [:div [:p/code {:code \"(def data\\n  (-> (ds/dataset \\\"data/titanic/train.csv\\\"\\n                  {:key-fn csk/->kebab-case-keyword})\\n      (ds/select-columns (concat categorical-features numeric-features [:survived]))\\n      (ds/replace-missing categorical-features :value \\\"missing\\\")\\n      (ds/categorical->one-hot categorical-features)))\", :bg-class \"bg-light\"}]] nil nil], \"1361\" [:div [:p] [:div [:p/code {:code \"(defn assoc-pipe-opts [options]\\n  (fn [ctx]\\n    (assoc ctx :pipe-options options)))\", :bg-class \"bg-light\"}]] nil nil], \"1445\" [:div [:p] [:div [:p/code {:code \"(def final-model\\n  ((:best-pipe-fn final-model-by-cv)\\n   {:metamorph/data data :metamorph/mode :fit}))\", :bg-class \"bg-light\"}]] nil nil], \"1431\" [:div [:p] [:div [:p/code {:code \"(seq\\n (.importance\\n  (ml/thaw-model\\n   (-> best-evaluation first first :fit-ctx :model))))\", :bg-class \"bg-light\"}]] nil [:p/code {:code 
\"(91.72936078911897\\n 6.804519064404864\\n 117.00237798409798\\n 5.096989319475346\\n 10.150868122594787\\n 4.240011278391342\\n 22.305792702068686\\n 33.949740317735156\\n 1.0232068587001921\\n 0.9722581226876843\\n 2.552121205979241\\n 0.0)\\n\"}]], \"1357\" [:div [:p] [:div [:p/code {:code \"(defn replace-missing [options]\\n  (fn [ctx]\\n    ( (apply mm/replace-missing numeric-features (map->vec (:replace-missing-options options))) ctx)))\", :bg-class \"bg-light\"}]] nil nil], \"1393\" [:div [:p] [:div [:p/code {:code \"best-pipe-fn\", :bg-class \"bg-light\"}]] nil [:p/code {:code \"#object[scicloj.metamorph.core$pipeline$local_pipeline__44848 0x33c73b03 \\\"scicloj.metamorph.core$pipeline$local_pipeline__44848@33c73b03\\\"]\\n\"}]], \"1379\" [:div [:p] [:div [:p/code {:code \"(def holdout-ds ((first splits) :test))\", :bg-class \"bg-light\"}]] nil nil], \"1437\" [:div [:p] [:div [:p/code {:code \"(def nested-cv-result\\n (doall\\n  (nested-cv/nested-cv data all-pipelines\\n                       ml/classification-accuracy\\n                       :accuracy 10 5)))\", :bg-class \"bg-light\"}]] nil nil], \"1427\" [:div [:p] [:div [:p/code {:code \"(ml/thaw-model\\n (-> best-evaluation first first :fit-ctx :model))\", :bg-class \"bg-light\"}]] nil [:p/code {:code \"#object[smile.classification.RandomForest 0x732fb0b7 \\\"smile.classification.RandomForest@732fb0b7\\\"]\\n\"}]], \"1381\" [:div [:p] nil nil [:p/markdown \"Tune hyperparameter by evaluating all pipelines/models \"]], \"1421\" [:div [:p] nil nil [:p/markdown \"Confusion matrix on holdout data\"]], \"1401\" [:div [:p] nil nil [:p/markdown \"best accuracy found on test data: \\n0.8461538461538461\"]], \"1343\" [:div [:p] nil nil [:p/markdown \"This is the Clojure version of https://www.moritzkoerber.com/posts/preprocessing-hyperparameters/\"]], \"1389\" [:div [:p] [:div [:p/code {:code \"(def best-options (-> best-evaluation first first :fit-ctx :pipe-options))\", :bg-class \"bg-light\"}]] nil nil], 
\"1353\" [:div [:p] nil nil [:p/markdown \"Preproceesing Pipelines including feature engineering\"]], \"1417\" [:div [:p] nil nil [:p/markdown \"Classication accuracy on holdout data: \"]], \"1413\" [:div [:p] [:div [:p/code {:code \"(->\\n (ml/get-nice-source-info best-pipe-decl\\n                          (find-ns 'scicloj.ml.tune-titanic)\\n                          (-> #'data meta :file))\\n (update :classpath #(take 20 %)))\", :bg-class \"bg-light\"}]] nil [:p/code {:code \"{:fn-sources\\n #:mm{categorical->number\\n      {:source-str\\n       \\\"(defn categorical->number\\\\n  \\\\\\\"Convert columns into a discrete , numeric representation\\\\n  See tech.v3.dataset.categorical/fit-categorical-map.\\\\\\\"\\\\n  ([filter-fn-or-ds]\\\\n  (tech.v3.dataset.metamorph/categorical->number filter-fn-or-ds))\\\\n  ([filter-fn-or-ds table-args]\\\\n  (tech.v3.dataset.metamorph/categorical->number filter-fn-or-ds table-args))\\\\n  ([filter-fn-or-ds table-args result-datatype]\\\\n  (tech.v3.dataset.metamorph/categorical->number filter-fn-or-ds table-args result-datatype)))\\\",\\n       :source-form\\n       (defn\\n        categorical->number\\n        \\\"Convert columns into a discrete , numeric representation\\\\n  See tech.v3.dataset.categorical/fit-categorical-map.\\\"\\n        ([filter-fn-or-ds]\\n         (tech.v3.dataset.metamorph/categorical->number\\n          filter-fn-or-ds))\\n        ([filter-fn-or-ds table-args]\\n         (tech.v3.dataset.metamorph/categorical->number\\n          filter-fn-or-ds\\n          table-args))\\n        ([filter-fn-or-ds table-args result-datatype]\\n         (tech.v3.dataset.metamorph/categorical->number\\n          filter-fn-or-ds\\n          table-args\\n          result-datatype)))},\\n      model\\n      {:source-str\\n       \\\"(defn model\\\\n  \\\\\\\"Executes a machine learning model in train/predict (depending on :mode)\\\\n  from the `metamorph.ml` model registry.\\\\n\\\\n  The model is passed between both 
invocation via the shared context ctx in a\\\\n  key (a step indentifier) which is passed in key `:metamorph/id` and guarantied to be unique for each\\\\n  pipeline step.\\\\n\\\\n  The function writes and reads into this common context key.\\\\n\\\\n  Options:\\\\n  - `:model-type` - Keyword for the model to use\\\\n\\\\n  Further options get passed to `train` functions and are model specific.\\\\n\\\\n  See here for an overview for the models build into scicloj.ml:\\\\n\\\\n\\\\n  https://scicloj.github.io/scicloj.ml-tutorials/userguide-models.html\\\\n\\\\n  Other libraries might contribute other models,\\\\n  which are documented as part of the library.\\\\n\\\\n\\\\n  metamorph                            | .\\\\n  -------------------------------------|----------------------------------------------------------------------------\\\\n  Behaviour in mode :fit               | Calls `scicloj.metamorph.ml/train` using data in `:metamorph/data` and `options`and stores trained model in ctx under key in `:metamorph/id`\\\\n  Behaviour in mode :transform         | Reads trained model from ctx and calls `scicloj.metamorph.ml/predict` with the model in $id and data in `:metamorph/data`\\\\n  Reads keys from ctx                  | In mode `:transform` : Reads trained model to use for prediction from key in `:metamorph/id`.\\\\n  Writes keys to ctx                   | In mode `:fit` : Stores trained model in key $id and writes feature-ds and target-ds before prediction into ctx at `:scicloj.metamorph.ml/feature-ds` /`:scicloj.metamorph.ml/target-ds`\\\\n\\\\n\\\\n\\\\n\\\\n  See as well:\\\\n\\\\n  * `scicloj.metamorph.ml/train`\\\\n  * `scicloj.metamorph.ml/predict`\\\\n\\\\n  \\\\\\\"\\\\n  ([options]\\\\n  (scicloj.metamorph.ml/model options)))\\\",\\n       :source-form\\n       (defn\\n        model\\n        \\\"Executes a machine learning model in train/predict (depending on :mode)\\\\n  from the `metamorph.ml` model registry.\\\\n\\\\n  The model is passed between 
both invocation via the shared context ctx in a\\\\n  key (a step indentifier) which is passed in key `:metamorph/id` and guarantied to be unique for each\\\\n  pipeline step.\\\\n\\\\n  The function writes and reads into this common context key.\\\\n\\\\n  Options:\\\\n  - `:model-type` - Keyword for the model to use\\\\n\\\\n  Further options get passed to `train` functions and are model specific.\\\\n\\\\n  See here for an overview for the models build into scicloj.ml:\\\\n\\\\n\\\\n  https://scicloj.github.io/scicloj.ml-tutorials/userguide-models.html\\\\n\\\\n  Other libraries might contribute other models,\\\\n  which are documented as part of the library.\\\\n\\\\n\\\\n  metamorph                            | .\\\\n  -------------------------------------|----------------------------------------------------------------------------\\\\n  Behaviour in mode :fit               | Calls `scicloj.metamorph.ml/train` using data in `:metamorph/data` and `options`and stores trained model in ctx under key in `:metamorph/id`\\\\n  Behaviour in mode :transform         | Reads trained model from ctx and calls `scicloj.metamorph.ml/predict` with the model in $id and data in `:metamorph/data`\\\\n  Reads keys from ctx                  | In mode `:transform` : Reads trained model to use for prediction from key in `:metamorph/id`.\\\\n  Writes keys to ctx                   | In mode `:fit` : Stores trained model in key $id and writes feature-ds and target-ds before prediction into ctx at `:scicloj.metamorph.ml/feature-ds` /`:scicloj.metamorph.ml/target-ds`\\\\n\\\\n\\\\n\\\\n\\\\n  See as well:\\\\n\\\\n  * `scicloj.metamorph.ml/train`\\\\n  * `scicloj.metamorph.ml/predict`\\\\n\\\\n  \\\"\\n        ([options] (scicloj.metamorph.ml/model options)))},\\n      set-inference-target\\n      {:source-str\\n       \\\"(defn set-inference-target\\\\n  \\\\\\\"Set the inference target on the column.  
This sets the :column-type member\\\\n  of the column metadata to :inference-target?.\\\\\\\"\\\\n  ([target-name-or-target-name-seq]\\\\n  (tech.v3.dataset.metamorph/set-inference-target target-name-or-target-name-seq)))\\\",\\n       :source-form\\n       (defn\\n        set-inference-target\\n        \\\"Set the inference target on the column.  This sets the :column-type member\\\\n  of the column metadata to :inference-target?.\\\"\\n        ([target-name-or-target-name-seq]\\n         (tech.v3.dataset.metamorph/set-inference-target\\n          target-name-or-target-name-seq)))}},\\n :classpath\\n (\\\"src\\\"\\n  \\\"resources\\\"\\n  \\\"/home/carsten/.m2/repository/aerial/hanami/aerial.hanami/0.12.9/aerial.hanami-0.12.9.jar\\\"\\n  \\\"/home/carsten/.gitlibs/libs/applied-science/waqi/faefe5dfd1b161ff70089924591ac2d699527811/resources\\\"\\n  \\\"/home/carsten/.gitlibs/libs/applied-science/waqi/faefe5dfd1b161ff70089924591ac2d699527811/src\\\"\\n  \\\"/home/carsten/.m2/repository/ch/qos/logback/logback-classic/1.4.4/logback-classic-1.4.4.jar\\\"\\n  \\\"/home/carsten/.m2/repository/clj-python/libpython-clj/2.020/libpython-clj-2.020.jar\\\"\\n  \\\"/home/carsten/.m2/repository/com/fasterxml/jackson/core/jackson-annotations/2.13.2/jackson-annotations-2.13.2.jar\\\"\\n  \\\"/home/carsten/.m2/repository/com/fasterxml/jackson/core/jackson-core/2.13.2/jackson-core-2.13.2.jar\\\"\\n  \\\"/home/carsten/.m2/repository/com/fasterxml/jackson/core/jackson-databind/2.13.2/jackson-databind-2.13.2.jar\\\"\\n  \\\"/home/carsten/.m2/repository/com/github/luben/zstd-jni/1.5.1-1/zstd-jni-1.5.1-1.jar\\\"\\n  \\\"/home/carsten/.m2/repository/dk/simongray/datalinguist/0.1.163/datalinguist-0.1.163.jar\\\"\\n  \\\"/home/carsten/.m2/repository/generateme/fastmath/2.1.6/fastmath-2.1.6.jar\\\"\\n  \\\"/home/carsten/.gitlibs/libs/io.github.nextjournal/clerk/a6bfc832a182ef3068d60a318985681ddb913595/src\\\"\\n  
\\\"/home/carsten/.gitlibs/libs/io.github.nextjournal/clerk/a6bfc832a182ef3068d60a318985681ddb913595/resources\\\"\\n  \\\"/home/carsten/.gitlibs/libs/io.github.nextjournal/clerk/a6bfc832a182ef3068d60a318985681ddb913595/bb\\\"\\n  \\\"/home/carsten/.gitlibs/libs/net.clojars.behrica/cluster_eval/ca34283a67bf18c8025955865fb567bd6e2e9a9a/src\\\"\\n  \\\"/home/carsten/.gitlibs/libs/net.clojars.behrica/cluster_eval/ca34283a67bf18c8025955865fb567bd6e2e9a9a/resources\\\"\\n  \\\"/home/carsten/.gitlibs/libs/net.clojars.behrica/cluster_eval/ca34283a67bf18c8025955865fb567bd6e2e9a9a/target/classes\\\"\\n  \\\"/home/carsten/.m2/repository/org/apache/arrow/arrow-vector/6.0.0/arrow-vector-6.0.0.jar\\\")}\\n\"}]], \"1441\" [:div [:p] [:div [:p/code {:code \"(map :metric nested-cv-result)\", :bg-class \"bg-light\"}]] nil [:p/code {:code \"(0.8666666666666667\\n 0.8666666666666667\\n 0.8666666666666667\\n 0.8444444444444444\\n 0.8777777777777778\\n 0.888888888888889\\n 0.8666666666666667\\n 0.8555555555555556\\n 0.8333333333333333\\n 0.9135802469135802)\\n\"}]], \"1433\" [:div [:p] nil nil [:p/markdown \"## nested cross validation\"]], \"1439\" [:div [:p] nil nil [:p/markdown \"nested cv best models metrics\"]], \"1415\" [:div [:p] [:div [:p/code {:code \"(def predicted-survival-hold-out\\n  (->\\n   (best-pipe-fn\\n    (merge (-> best-evaluation first first :fit-ctx)\\n           {:metamorph/data holdout-ds :metamorph/mode :transform}))\\n   :metamorph/data\\n   ds/reverse-map-categorical-xforms\\n   :survived))\", :bg-class \"bg-light\"}]] nil nil], \"1369\" [:div [:p] [:div [:p/code {:code \"(def all-pipelines (concat random-forrest-pipelines))\", :bg-class \"bg-light\"}]] nil nil], \"1447\" [:div [:p] nil nil [:p/markdown \"Final best model\"]], \"1435\" [:div [:p] [:div [:p/code {:code \"(require '[scicloj.ml.nested-cv :as nested-cv])\", :bg-class \"bg-light\"}]] nil nil], \"1367\" [:div [:p] [:div [:p/code {:code \"(def random-forrest-pipelines\\n  (map\\n   
#(make-decl-pipeline :smile.classification/random-forest %)\\n   (ml/sobol-gridsearch {:scaling-options {:scale? (ml/categorical [true false])}\\n                         :replace-missing-options {:value (ml/categorical [dtfunc/mean dtfunc/median])}\\n                         :model-options {:trees (ml/categorical [5 50 100 250])\\n                                         :max-depth (ml/categorical [5 8 10])}})))\", :bg-class \"bg-light\"}]] nil nil], \"1397\" [:div [:p] nil nil [:p/markdown \"## All information on best found pipeline\"]], \"1419\" [:div [:p] [:div [:p/code {:code \"(ml/classification-accuracy predicted-survival-hold-out\\n                           (holdout-ds :survived))\", :bg-class \"bg-light\"}]] nil [:p/code {:code \"0.7988826815642458\\n\"}]], \"1411\" [:div [:p] nil nil [:p/markdown \"pipe sources information\"]], \"1403\" [:div [:p] nil nil [:p/markdown \"best options (found on train data): \"]], \"1377\" [:div [:p] [:div [:p/code {:code \"(def train-ds ((first splits) :train))\", :bg-class \"bg-light\"}]] nil nil], \"1371\" [:div [:p] [:div [:p/code {:code \"(def pipe-fns\\n  (mapv ml/->pipeline all-pipelines))\", :bg-class \"bg-light\"}]] nil nil], \"1359\" [:div [:p] [:div [:p/code {:code \"(defn maybe-std-scale [options]\\n  (fn [ctx]\\n    (if (-> options :scaling-options :scale?)\\n      ((mm/std-scale numeric-features {})\\n       ctx)\\n      ctx)))\", :bg-class \"bg-light\"}]] nil nil], \"1405\" [:div [:p] [:div [:p/code {:code \"best-options\", :bg-class \"bg-light\"}]] nil [:p/code {:code \"{:scaling-options {:scale? false},\\n :replace-missing-options\\n {:value\\n  #object[tech.v3.datatype.functional$mean 0x7e25d9d1 \\\"tech.v3.datatype.functional$mean@7e25d9d1\\\"]},\\n :model-options {:trees 250, :max-depth 10}}\\n\"}]], \"1351\" [:div [:p] [:div [:p/code {:code \"(defn map->vec [m] (flatten (into [] m)))\", :bg-class \"bg-light\"}]] nil nil]}}"</script>
21 |     <script src="gorilla-notes/js/compiled/main.js"></script>
22 | </html>
23 | 


--------------------------------------------------------------------------------
/docs/userguide-categrical.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html lang="en-US">
 3 |     <head>
 4 |         <meta charset="UTF-8">
 5 |         <meta name="viewport" content="width=device-width, initial-scale=1">
 6 |         <link href="https://stackpath.bootstrapcdn.com/bootswatch/4.5.0/sandstone/bootstrap.min.css" rel="stylesheet" type="text/css">
 7 |         <link href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/9.6.0/styles/solarized-light.min.css" rel="stylesheet" type="text/css">
 8 |         <link href="https://cdnjs.cloudflare.com/ajax/libs/ag-grid/24.0.0/styles/ag-grid.min.css" rel="stylesheet" type="text/css">
 9 |         <link href="https://cdnjs.cloudflare.com/ajax/libs/ag-grid/24.0.0/styles/ag-theme-balham.min.css" rel="stylesheet" type="text/css">
10 |         <link href="https://unpkg.com/leaflet@1.6.0/dist/leaflet.css" rel="stylesheet" type="text/css">
11 | 
12 |         <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/katex@0.12.0/dist/katex.min.css" integrity="sha384-AfEj0r4/OFrOo5t7NnNe46zW/tFgW6x/bCJG8FqQCEo3+Aro6EYUG4+cU+KJWu/X" crossorigin="anonymous">
13 |         <!-- The loading of KaTeX is deferred to speed up page rendering -->
14 |         <script defer src="https://cdn.jsdelivr.net/npm/katex@0.12.0/dist/katex.min.js" integrity="sha384-g7c+Jr9ZivxKLnZTDUhnkOnsh30B4H0rpLUpJ4jAIKs4fnJI+sEnkvrMWph2EDg4" crossorigin="anonymous"></script>
15 |     </head>
16 |     <body>
17 |         <p id="loading">Loading ...</p>
18 |         <div id="app"></div>
19 |     </body>
20 |     <script id="state" type="text">"{:options {:reverse-notes? false, :header? false, :notes-in-cards? false, :initially-collapse? false, :auto-scroll? false, :port 5678, :custom-header [:div {:style {:font-style \"italic\", :font-family \"\\\"Lucida Console\\\", Courier, monospace\"}} \"(notespace)\" [:p \"Sun Dec 18 23:11:59 CET 2022\"] nil [:hr]], :custom-footer [:div [:hr] [:hr]]}, :ids [\"672\" \"674\" \"676\" \"678\" \"680\" \"682\" \"684\" \"686\" \"688\" \"690\" \"692\" \"694\" \"696\" \"698\" \"700\" \"702\" \"704\" \"706\" \"708\" \"710\" \"712\" \"714\" \"716\" \"718\"], :id->content {\"696\" [:div [:p] nil nil [:p/markdown \"metadata has changed as well, int now, and with a lookup table\"]], \"682\" [:div [:p] nil nil [:p/markdown \"Categorical columns can be converted too numbers, which is needed by several ML models.\"]], \"704\" [:div [:p] [:div [:p/code {:code \"(def ds-one-hot\\n  (ds/categorical->one-hot\\n   ds-cat :all {} :int))\", :bg-class \"bg-light\"}]] nil nil], \"688\" [:div [:p] [:div [:p/code {:code \"(-> ds-cat :a meta)\", :bg-class \"bg-light\"}]] nil [:p/code {:code \"{:categorical? true, :name :a, :datatype :keyword, :n-elems 3}\\n\"}]], \"674\" [:div [:p] [:div [:p/code {:code \"(require '[scicloj.ml.core :as ml]\\n         '[scicloj.ml.metamorph :as mm]\\n         '[scicloj.ml.dataset  :as ds])\", :bg-class \"bg-light\"}]] nil nil], \"700\" [:div [:p] nil nil [:p/markdown \"## categorical -> one-hot\"]], \"684\" [:div [:p] [:div [:p/code {:code \"(def ds-cat\\n  (ds/dataset {:a [:x :y :x]}))\", :bg-class \"bg-light\"}]] nil nil], \"714\" [:div [:p] [:div [:p/code {:code \"(-> ds-one-hot :a-y meta)\", :bg-class \"bg-light\"}]] nil [:p/code {:code \"{:categorical? 
true,\\n :name :a-y,\\n :datatype :int,\\n :n-elems 3,\\n :one-hot-map\\n {:one-hot-table {:y :a-y, :x :a-x},\\n  :src-column :a,\\n  :result-datatype :int}}\\n\"}]], \"712\" [:div [:p] nil nil [:p/markdown \"inspect metadata after conversion\"]], \"716\" [:div [:p] nil nil [:p/markdown \"we can go back\"]], \"706\" [:div [:p] [:div [:p/code {:code \"^kind/dataset\\nds-one-hot\", :bg-class \"bg-light\"}]] nil [:div {:class \"table table-striped table-hover table-condensed table-responsive\", :style {:height \"230px\"}} [:p/markdown \"_unnamed [3 2]:\\n\\n| :a-y | :a-x |\\n|-----:|-----:|\\n|    0 |    1 |\\n|    1 |    0 |\\n|    0 |    1 |\\n\"]]], \"686\" [:div [:p] nil nil [:p/markdown \"inspect column metadata and observe datatype :keyword\"]], \"678\" [:div [:p] nil nil [:p/markdown \"We keep important information in the metadata of the column,\\nwhich can be inspected\"]], \"680\" [:div [:p] nil nil [:p/markdown \"## categorical -> number\"]], \"718\" [:div [:p] [:div [:p/code {:code \"(-> ds-one-hot ds/reverse-map-categorical-xforms)\", :bg-class \"bg-light\"}]] nil [:p/code {:code \"_unnamed [3 1]:\\n\\n| :a |\\n|----|\\n| :x |\\n| :y |\\n| :x |\\n\\n\"}]], \"702\" [:div [:p] nil nil [:p/markdown \"Categorical columns can be converted to one-hot columns as well, which is needed by several ML models.\"]], \"710\" [:div [:p] [:div [:p/code {:code \"(-> ds-one-hot ds/reverse-map-categorical-xforms)\", :bg-class \"bg-light\"}]] nil [:p/code {:code \"_unnamed [3 1]:\\n\\n| :a |\\n|----|\\n| :x |\\n| :y |\\n| :x |\\n\\n\"}]], \"694\" [:div [:p] [:div [:p/code {:code \"^kind/dataset\\nds-number\", :bg-class \"bg-light\"}]] nil [:div {:class \"table table-striped table-hover table-condensed table-responsive\", :style {:height \"230px\"}} [:p/markdown \"_unnamed [3 1]:\\n\\n| :a |\\n|---:|\\n|  1 |\\n|  0 |\\n|  1 |\\n\"]]], \"698\" [:div [:p] [:div [:p/code {:code \"(-> ds-number :a meta)\", :bg-class \"bg-light\"}]] nil [:p/code {:code \"{:categorical? 
true,\\n :name :a,\\n :datatype :int,\\n :n-elems 3,\\n :categorical-map\\n {:lookup-table {:y 0, :x 1}, :src-column :a, :result-datatype :int}}\\n\"}]], \"676\" [:div [:p] nil nil [:p/markdown \"# Handling of categorical variables\"]], \"690\" [:div [:p] nil nil [:p/markdown \"convert categorical columns to numeric\"]], \"708\" [:div [:p] nil nil [:p/markdown \"we can go back as well\"]], \"692\" [:div [:p] [:div [:p/code {:code \"(def ds-number\\n  (ds/categorical->number\\n   ds-cat :all {} :int))\", :bg-class \"bg-light\"}]] nil nil], \"672\" [:div [:p] [:div [:p/code {:code \"(comment\\n  (note/init-with-browser)\\n  (note/eval-and-realize-this-notespace))\", :bg-class \"bg-light\"}]] nil [:p/code {:code \"nil\\n\"}]]}}"</script>
21 |     <script src="gorilla-notes/js/compiled/main.js"></script>
22 | </html>
23 | 


--------------------------------------------------------------------------------
/docs/userguide-intro.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html lang="en-US">
 3 |     <head>
 4 |         <meta charset="UTF-8">
 5 |         <meta name="viewport" content="width=device-width, initial-scale=1">
 6 |         <link href="https://stackpath.bootstrapcdn.com/bootswatch/4.5.0/sandstone/bootstrap.min.css" rel="stylesheet" type="text/css">
 7 |         <link href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/9.6.0/styles/solarized-light.min.css" rel="stylesheet" type="text/css">
 8 |         <link href="https://cdnjs.cloudflare.com/ajax/libs/ag-grid/24.0.0/styles/ag-grid.min.css" rel="stylesheet" type="text/css">
 9 |         <link href="https://cdnjs.cloudflare.com/ajax/libs/ag-grid/24.0.0/styles/ag-theme-balham.min.css" rel="stylesheet" type="text/css">
10 |         <link href="https://unpkg.com/leaflet@1.6.0/dist/leaflet.css" rel="stylesheet" type="text/css">
11 | 
12 |         <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/katex@0.12.0/dist/katex.min.css" integrity="sha384-AfEj0r4/OFrOo5t7NnNe46zW/tFgW6x/bCJG8FqQCEo3+Aro6EYUG4+cU+KJWu/X" crossorigin="anonymous">
13 |         <!-- The loading of KaTeX is deferred to speed up page rendering -->
14 |         <script defer src="https://cdn.jsdelivr.net/npm/katex@0.12.0/dist/katex.min.js" integrity="sha384-g7c+Jr9ZivxKLnZTDUhnkOnsh30B4H0rpLUpJ4jAIKs4fnJI+sEnkvrMWph2EDg4" crossorigin="anonymous"></script>
15 |     </head>
16 |     <body>
17 |         <p id="loading">Loading ...</p>
18 |         <div id="app"></div>
19 |     </body>
20 |     <script id="state" type="text">"{:options {:reverse-notes? false, :header? false, :notes-in-cards? false, :initially-collapse? false, :auto-scroll? false, :port 5678, :custom-header [:div {:style {:font-style \"italic\", :font-family \"\\\"Lucida Console\\\", Courier, monospace\"}} \"(notespace)\" [:p \"Sun Dec 18 23:11:48 CET 2022\"] nil [:hr]], :custom-footer [:div [:hr] [:hr]]}, :ids [\"68\" \"70\" \"72\" \"74\" \"76\" \"78\" \"80\" \"82\" \"84\" \"86\" \"88\" \"90\" \"92\" \"94\" \"96\" \"98\" \"100\" \"102\" \"104\" \"106\" \"108\" \"110\" \"112\" \"114\" \"116\" \"118\" \"120\" \"122\" \"124\" \"126\" \"128\" \"130\" \"132\" \"134\" \"136\" \"138\" \"140\" \"142\" \"144\" \"146\" \"148\" \"150\" \"152\" \"154\" \"156\" \"158\" \"160\" \"162\" \"164\" \"166\" \"168\" \"170\" \"172\" \"174\" \"176\" \"178\" \"180\" \"182\" \"184\" \"186\" \"188\" \"190\" \"192\" \"194\" \"196\"], :id->content {\"180\" [:div [:p] nil nil [:p/markdown \"We combine the previously obtained context\\n (which contains the trained model)\\nwith the test data and mode :transform\"]], \"88\" [:div [:p] nil nil [:p/markdown \"As it was organically growing over time, it's API is functional and complete,\\nbut lacks consistency in some parts.\\n\"]], \"108\" [:div [:p] [:div [:p/code {:code \"(require '[scicloj.ml.dataset :as ds])\", :bg-class \"bg-light\"}]] nil nil], \"188\" [:div [:p] nil nil [:p/markdown \"This shows the predicted survival. \"]], \"158\" [:div [:p] nil nil [:p/markdown \"Then we define the pipeline and it steps. 
Inside the pipeline we only use functions\\nfrom namespace scicloj.ml.metamorph\"]], \"118\" [:div [:p] nil nil [:p/markdown \"It addresses all three shortcomings of the simpler pipeline.\"]], \"196\" [:div [:p] [:div [:p/code {:code \"(+ 1 1 (+ 2 2))\", :bg-class \"bg-light\"}]] nil [:p/code {:code \"6\\n\"}]], \"82\" [:div [:p] nil nil [:p/markdown \"## Representing training data\"]], \"116\" [:div [:p] nil nil [:p/markdown \"Due to this, the idea of the `metamorph` pipeline concept was born.\"]], \"68\" [:div [:p] [:div [:p/code {:code \"(comment\\n  (note/init-with-browser)\\n  (note/eval-this-notespace)\\n  (note/reread-this-notespace)\\n  (note/render-static-html \\\"docs/userguide-intro.html\\\")\\n  (note/init))\", :bg-class \"bg-light\"}]] nil [:p/code {:code \"nil\\n\"}]], \"156\" [:div [:p] [:div [:p/code {:code \"(def titanic-test\\n  (->\\n   (ds/dataset \\\"https://github.com/scicloj/metamorph-examples/raw/main/data/titanic/test.csv\\\"\\n               {:key-fn keyword\\n                :parser-fn :string})\\n   (ds/add-column :Survived [\\\"\\\"] :cycle)))\", :bg-class \"bg-light\"}]] nil nil], \"96\" [:div [:p] nil nil [:p/markdown \"Models are the core of most machine learning libraries. In scicloj.ml we\\n rely on a common **abstraction** for all\\nmachine learning models and one Java library [Smile](https://github.com/haifengl/smile) providing models,\\nwhich we bridge into Clojure via the abstraction.\\nSo we use Java models internally, but without the need for Java\\ninterop by the user.\\n\\nDocumentation for existing models is appearing here:\\nhttps://scicloj.github.io/scicloj.ml-tutorials/userguide-models.html\\n\\nThe abstraction is independent from Smile, so we could make bridges to other libraries, even in non JVM languages (python, R)\\n\\n\\n\"]], \"80\" [:div [:p] nil nil [:p/markdown \"The Clojure language and core libraries do not have built-in, specific support for this,\\nso some libraries are required. 
\"]], \"162\" [:div [:p] nil nil [:p/markdown \"In the titanic dataset the `survived` column is a categorical variable.\\nAll target variables for classification need to be transformed first\\ninto numbers, the model can work with. This is done by the function\\n`categorical->number`. The mapping for this is stored in the dataset on the column\\nand can be later retrieved to transform the numeric prediction back to its\\ncategorical form.\"]], \"192\" [:div [:p] nil nil [:p/markdown \"Any form of feature-engineering takes now the same form.\\nWe will successively\\nadd more and more steps into the pipeline to improve the model.\"]], \"98\" [:div [:p] nil nil [:p/markdown \"## Data transformation pipelines.\"]], \"78\" [:div [:p] nil nil [:p/markdown \"3. A standard way to express steps of data manipulations including train/predict of a model\"]], \"172\" [:div [:p] nil nil [:p/markdown \"We execute the pipeline in mode :fit,\\nwhich will execute all pipeline steps and train as well the model. \"]], \"176\" [:div [:p] nil nil [:p/markdown \"Now we have a trained model inside trained-ctx. 
This is a usual map, so can be inspected in the repl.\\n As the model is based on Smile, the trained-ctx contains the java class representing the trained model.\\n\"]], \"128\" [:div [:p] nil nil [:p/markdown \"The Clojure ML ecosystem is based on different libraries working\\ntogether, as typical and idiomatic in Clojure\"]], \"186\" [:div [:p] [:div [:p/code {:code \"(-> test-ctx :metamorph/data\\n    (ds/column-values->categorical :Survived))\", :bg-class \"bg-light\"}]] nil [:p/code {:code \"#tech.v3.dataset.column<string>[418]\\n:Survived\\n[0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0...]\\n\"}]], \"152\" [:div [:p] nil nil [:p/markdown \"First we load the data.\"]], \"184\" [:div [:p] nil nil [:p/markdown \"Prediction is now part of the ctx obtained.\\nThe internally called `predict` function of `metamorph.ml` returns always the raw prediction of the model,\\nwhich we can easily transform into the original categories.\\n\"]], \"100\" [:div [:p] nil nil [:p/markdown \"In order to apply machine learning, the data needs to be transformed from its original form,\\n(often as a data file), into the form required by the model.\\n Sometimes these transformations are simple, like re-encoding data,\\nsometimes they are very complex. In some contexts this is as well called\\n feature engineering, which can result in arbitrarily\\ncomplex dataset transformations.\\nThese transformations are mostly dataset to dataset transformations.\\n\"]], \"106\" [:div [:p] nil nil [:p/markdown \"This simpler form of a pipeline in Clojure and Tablecloth can just make use of the fact that all tablecloth\\n functions take a dataset as the first parameter and return a dataset.\\nSo they can be chained together with the pipe (`->`) operator of Clojure,\\n example:\"]], \"90\" [:div [:p] nil nil [:p/markdown \"This was addressed by another library, layering on top of it, called\\n`tablecloth`. 
It is available [here](https://github.com/scicloj/tablecloth)\"]], \"190\" [:div [:p] nil nil [:p/markdown \"The documentation of `mm/model` here https://scicloj.github.io/scicloj.ml/scicloj.ml.metamorph.html#var-model\\ndocuments this special behavior of the function, which does something different in mode :fit vs mode :transform\"]], \"112\" [:div [:p] nil nil [:p/markdown \"This form of pipeline works to manipulate a dataset,\\nbut has three disadvantages.\"]], \"148\" [:div [:p] nil nil [:p/markdown \"To start we need to require a few namespaces\"]], \"140\" [:div [:p] nil nil [:p/markdown \"The setup for the following code needs a single dependency in deps.edn or project.clj\"]], \"136\" [:div [:p] nil nil [:p/markdown \"In order to give easier access to the various libraries, the scicloj.ml\\n library was created. It unifies the access to the libraries above\\nin three simple namespaces.\\n\"]], \"166\" [:div [:p] nil nil [:p/markdown \"Now the dataset is ready for the model, which is called in the last step.\\nIt is a logistic regression model, which gets trained to predict column\\n:Survived from column :Pclass\"]], \"194\" [:div [:p] nil nil [:p/markdown \"This can be built-in functions or custom functions as we see later\"]], \"168\" [:div [:p] [:div [:p/code {:code \"(def pipe-fn\\n  (ml/pipeline\\n   (mm/select-columns [:Survived :Pclass])\\n   (mm/categorical->number [:Survived :Pclass])\\n   (mm/set-inference-target :Survived)\\n   (mm/model {:model-type :smile.classification/logistic-regression})))\", :bg-class \"bg-light\"}]] nil nil], \"76\" [:div [:p] nil nil [:p/markdown \"2. 
Various machine learning models\"]], \"154\" [:div [:p] [:div [:p/code {:code \"(def titanic-train\\n  (->\\n   (ds/dataset \\\"https://github.com/scicloj/metamorph-examples/raw/main/data/titanic/train.csv\\\"\\n               {:key-fn keyword\\n                :parser-fn :string})))\", :bg-class \"bg-light\"}]] nil nil], \"110\" [:div [:p] [:div [:p/code {:code \"(def my-data\\n  (-> (ds/dataset \\\"https://raw.githubusercontent.com/techascent/tech.ml.dataset/master/test/data/stocks.csv\\\" {:key-fn keyword})\\n      (ds/select-columns [:symbol :price])\\n      (ds/add-or-replace-column :symbol (fn [ds] (map clojure.string/lower-case  (ds :symbol))))))\", :bg-class \"bg-light\"}]] nil nil], \"94\" [:div [:p] nil nil [:p/markdown \"## Models\"]], \"174\" [:div [:p] [:div [:p/code {:code \"(def trained-ctx\\n  (pipe-fn {:metamorph/data titanic-train\\n            :metamorph/mode :fit}))\", :bg-class \"bg-light\"}]] nil nil], \"130\" [:div [:p] nil nil [:p/markdown \"Some existing libraries are used internally in scicloj.ml, to create a\\ncomplete machine learning library, but this is hidden from the user,\\nand is listed here only for completeness.\"]], \"122\" [:div [:p] nil nil [:p/markdown \"As we see in the metamorph documentation, a pipeline can be composed of functions, which adhere to some simple standards\\nregarding input and output, as explained here: https://github.com/scicloj/metamorph#compliant-operations\"]], \"120\" [:div [:p] nil nil [:p/markdown \"Metamorph is documented here: [metamorph](https://github.com/scicloj/metamorph)\"]], \"70\" [:div [:p] nil nil [:p/markdown \"# Clojure and machine learning \"]], \"144\" [:div [:p] nil nil [:p/markdown \"This library acts as a facade to the four libraries above, and arranges the functions in a simple way in these namespaces:\"]], \"134\" [:div [:p] nil nil [:p/markdown \"These libraries can be used standalone as well. 
`tech.ml` was changed  in order\\nto work with scicloj.ml in an incompatible way.\\nSo it is re-released under a new name `metamorph.ml`.\\nThe others can be used by scicloj.ml without any change.\\n \"]], \"146\" [:div [:p] nil nil [:p/markdown \"\\n\\n| namespace             | purpose                                                  |\\n|-----------------------|----------------------------------------------------------|\\n| scicloj.ml.core       | core functionality for machine learning                  |\\n| scicloj.ml.dataset    | functions to manipulate a dataset                        |\\n| scicloj.ml.metamorph  | metamorph compliant functions to be used in ml pipelines |\\n\\n \"]], \"84\" [:div [:p] nil nil [:p/markdown \"In the last 2 years the Clojure data science landscape was shaped\\nby the appearance and maturation of a new library to manage tabular data.\"]], \"104\" [:div [:p] nil nil [:p/markdown \"Clojure and the `tablecloth` library contains already\\nthe concept of running a pipeline\"]], \"102\" [:div [:p] nil nil [:p/markdown \"These pipelines need to be repeatable and self-contained,\\nas they need to run several times with different data or in variants\\nfor either cross validation or hyper-parameter tuning.\"]], \"74\" [:div [:p] nil nil [:p/markdown \"1. A standard way to manage tabular data in memory.\"]], \"142\" [:div [:p] nil nil [:p/markdown \"\\n{:deps {\\n        scicloj/scicloj.ml {:mvn/version \\\"0.1.0-beta2\\\"}} }\\n\"]], \"124\" [:div [:p] nil nil [:p/markdown \"Tablecloth contains such operations in the `tablecloth.pipeline`\\nnamespace. All functions of the `tablecloth.api` namespace are replicated\\nthere, but metamorph compliant\"]], \"138\" [:div [:p] nil nil [:p/markdown \"## Machine learning using scicloj.ml\"]], \"114\" [:div [:p] nil nil [:p/markdown \"\\n1. `->` is a macro, so we cannot compose pipelines easily\\n\\n2. 
We move a dataset object through the pipeline steps, so the only object we have nicely inside the pipeline, accessible to all steps, is the dataset itself.  But sometimes we need non-tabular, auxiliary, data to be shared across the pipeline steps, which is not possible with passing a dataset only.Using this simple pipelines, would force to hold auxiliary data in a global state of some form. This makes is very hard to execute pipelines repeatedly, as they are not self-contained.\\n\\n3. These simpler pipeline concepts have no notion of running a pipeline in several modes. In machine learning a pipeline need to behave differently in `fit` and in `transform`. (often called `train` vs `predict`). The models learns from data in the `fit` and it applies what it has learned in `transform`.\\n\"]], \"182\" [:div [:p] [:div [:p/code {:code \"(def test-ctx\\n  (pipe-fn\\n   (assoc trained-ctx\\n          :metamorph/data titanic-test\\n          :metamorph/mode :transform)))\", :bg-class \"bg-light\"}]] nil nil], \"126\" [:div [:p] nil nil [:p/markdown \"## scicloj.ml\"]], \"178\" [:div [:p] nil nil [:p/markdown \"Now we execute the pipeline in mode :transform,\\nwhich will make a prediction \"]], \"72\" [:div [:p] nil nil [:p/markdown \"In order to practice machine learning and create an ecosystem of models around it,\\nwe need 3 components.\"]], \"170\" [:div [:p] nil nil [:p/markdown \"So the `ml/pipeline` function returns a function, which can be called with the ctx map.\"]], \"164\" [:div [:p] nil nil [:p/markdown \"In `scicloj.ml` we pass a whole dataset to a model, and we need to mark\\nthe inference target via function `set-inference-target`.\\nAll other columns are used then as feature columns.\\nTo restric the feature column, I simply remove most of them and keep only one, :Pclass\"]], \"86\" [:div [:p] nil nil [:p/markdown \"This library is [tech.ml.dataset](https://github.com/techascent/tech.ml.dataset).\\n It defines a in-memory tabular data structure and 
operations on it. It is a remarkable piece of software,\\nhighly optimized and linking in its root to native memory and allow zero-copy integration's outside Clojure.\"]], \"92\" [:div [:p] nil nil [:p/markdown \"So we have now a very reliable, mature, easy to use library to store and manipulate tabular data, including text.\"]], \"160\" [:div [:p] nil nil [:p/markdown \"In scicloj.ml the model functions receives a single dataset,\\nin which the inference target column is marked as such. The model\\nto use is a parameter of the `model` function. All built-in\\nmodels are listed here: https://scicloj.github.io/scicloj.ml-tutorials/userguide-models.html\"]], \"150\" [:div [:p] [:div [:p/code {:code \"(require '[scicloj.ml.core :as ml]\\n         '[scicloj.ml.metamorph :as mm]\\n         '[scicloj.ml.dataset :refer [dataset add-column] :as ds])\", :bg-class \"bg-light\"}]] nil nil], \"132\" [:div [:p] nil nil [:p/markdown \"\\n1. `tablecloth` - for general manipulation of the dataset\\n1. `tech.v3.dataset` - to finally prepare a dataset for the machine learing models\\n1. `metamorph.ml` - for running pipelines and machine learning core functions\\n1. `Smile`  Java machine learning library containing lots of models\\n\"]]}}"</script>
21 |     <script src="gorilla-notes/js/compiled/main.js"></script>
22 | </html>
23 | 


--------------------------------------------------------------------------------
/render_all.clj:
--------------------------------------------------------------------------------
 1 | (ns render-all
 2 |   (:require [notespace.cli :as cli]
 3 |             [notespace.api :as note]
 4 |             [clojure.java.shell]
 5 |             [nextjournal.clerk :as clerk]))
 6 | 
 7 |   
 8 | (def nss [
 9 |           {:ns 'scicloj.ml.intro                :output-file "docs/userguide-intro.html"}
10 |           {:ns 'scicloj.ml.advanced             :output-file "docs/userguide-advanced.html"}
11 |           {:ns 'scicloj.ml.categorical          :output-file "docs/userguide-categrical.html"}
12 |           {:ns 'scicloj.ml.transformers         :output-file "docs/userguide-transformers.html"}
13 |           {:ns 'scicloj.ml.titanic              :output-file "docs/userguide-titanic.html"}
14 |           {:ns 'scicloj.ml.tune-titanic         :output-file "docs/tune-titanic.html"}
15 |           {:ns 'scicloj.ml.sklearnclj           :output-file "docs/userguide-sklearnclj.html"}
16 |           {:ns 'scicloj.ml.third-party          :output-file "docs/userguide-third_party.html"}
17 |           {:ns 'scicloj.ml.experiment-tracking  :output-file "docs/userguide-experiment-tracking.html"}
18 |           {:ns 'scicloj.ml.unsupervised         :output-file "docs/userguide-unsupervised.html"}
19 |           {:ns 'scicloj.ml.interactions-ols     :output-file "docs/interactions_ols.html"}])
20 | 
21 | 
22 | (note/init :port 5678)
23 | 
24 | ;; Render every notespace listed in `nss`, announcing each entry before
25 | ;; it is evaluated and rendered.
26 | (doseq [ns-entry nss]
27 |   (println "render ns: " ns-entry)
28 |   (cli/eval-and-render-a-notespace ns-entry))
29 | 
30 | 
31 | (require '[nextjournal.clerk :as clerk])
32 | 
33 | (clerk/build! {:paths ["src/scicloj/ml/models.clj"]
34 |                :bundle? true
35 |                :out-path "output"})
36 | 
37 | (println
38 |  (clojure.java.shell/sh "mv" "output/index.html" "docs/userguide-models.html"))
39 | 
40 | (clerk/build! {:paths ["src/scicloj/ml/polyglot_kmeans.clj"]
41 |                :bundle? true
42 |                :out-path "output"})
43 | 
44 | (println
45 |  (clojure.java.shell/sh "mv" "output/index.html" "docs/polyglot_kmeans.html"))
46 | 
47 | (System/exit 0)
48 | 


--------------------------------------------------------------------------------
/render_titanic.clj:
--------------------------------------------------------------------------------
1 | (ns render-titanic
2 |   (:require [notespace.cli :as cli]
3 |             [notespace.api :as note]))
4 | 
5 | (note/init :port 5678) ; initialise notespace, passing port 5678
6 | 
7 | (cli/eval-and-render-a-notespace {:ns 'scicloj.ml.titanic}) ; evaluate and render the scicloj.ml.titanic notespace
8 | (System/exit 0) ; terminate the JVM with exit status 0 once the call above returns
9 | 


--------------------------------------------------------------------------------
/render_tune-titanic.clj:
--------------------------------------------------------------------------------
1 | (ns render-tune-titanic ; script namespace, named after this file (render_tune-titanic.clj)
2 |   (:require [notespace.cli :as cli]
3 |             [notespace.api :as note]))
4 | 
5 | (note/init :port 5678)
6 | 
7 | (cli/eval-and-render-a-notespace {:ns 'scicloj.ml.tune-titanic})
8 | (System/exit 0)
9 | 


--------------------------------------------------------------------------------
/resources/.keep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scicloj/scicloj.ml-tutorials/951cde0b8bd0a1b22ec856d28a6122d69d34836f/resources/.keep


--------------------------------------------------------------------------------
/resources/logback.xml:
--------------------------------------------------------------------------------
 1 | <configuration>
 2 | 
 3 |   <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
 4 |     <!-- encoders are assigned the type
 5 |          ch.qos.logback.classic.encoder.PatternLayoutEncoder by default -->
 6 |     <encoder>
 7 |       <pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
 8 |     </encoder>
 9 |   </appender>
10 |   <logger name="smile" level="ERROR"/>
11 |   <root level="INFO">
12 |     <appender-ref ref="STDOUT" />
13 |   </root>
14 | </configuration>
15 | 


--------------------------------------------------------------------------------
/src/scicloj/ml/categorical.clj:
--------------------------------------------------------------------------------
 1 | (ns scicloj.ml.categorical
 2 | 
 3 |   (:require [notespace.api :as note]
 4 |             [notespace.kinds :as kind]))
 5 | 
 6 | 
 7 | (comment
 8 |   (note/init-with-browser)
 9 |   (note/eval-and-realize-this-notespace))
10 | 
11 | (require '[scicloj.ml.core :as ml]
12 |          '[scicloj.ml.metamorph :as mm]
13 |          '[scicloj.ml.dataset  :as ds])
14 | 
15 | ["# Handling of categorical variables"]
16 | 
17 | ["We keep important information in the metadata of the column,
18 | which can be inspected"]
19 | 
20 | ["## categorical -> number"]
21 | ["Categorical columns can be converted to numbers, which is needed by several ML models."]
22 | 
23 | (def ds-cat
24 |   (ds/dataset {:a [:x :y :x]}))
25 | 
26 | ["inspect column metadata and observe datatype :keyword"]
27 | (-> ds-cat :a meta)
28 | 
29 | ["convert categorical columns to numeric"]
30 | (def ds-number
31 |   (ds/categorical->number
32 |    ds-cat :all {} :int))
33 | 
34 | ^kind/dataset
35 | ds-number
36 | 
37 | ["metadata has changed as well, int now, and with a lookup table"]
38 | (-> ds-number :a meta)
39 | 
40 | 
41 | 
42 | 
43 | ["## categorical -> one-hot"]
44 | ["Categorical columns can be converted to one-hot columns as well, which is needed by several ML models."]
45 | (def ds-one-hot
46 |   (ds/categorical->one-hot
47 |    ds-cat :all {} :int))
48 | 
49 | ^kind/dataset
50 | ds-one-hot
51 | 
52 | 
53 | ["we can go back as well"]
54 | (-> ds-one-hot ds/reverse-map-categorical-xforms)
55 | 
56 | 
57 | ["inspect metadata after conversion"]
58 | (-> ds-one-hot :a-y meta)
59 | 
60 | 
61 | ["we can go back"]
62 | (-> ds-one-hot ds/reverse-map-categorical-xforms)
63 | 


--------------------------------------------------------------------------------
/src/scicloj/ml/experiment_tracking.clj:
--------------------------------------------------------------------------------
  1 | (ns scicloj.ml.experiment-tracking
  2 |   (:require
  3 |    [scicloj.ml.ug-utils :as utils]
  4 |    [notespace.api :as note]
  5 |    [notespace.kinds :as kind]))
  6 |    
  7 | (comment
  8 |   (note/init-with-browser)
  9 |   (note/eval-this-notespace)
 10 |   (note/reread-this-notespace)
 11 |   (note/render-static-html "docs/userguide-experiment-tracking.html")
 12 |   (note/init))
 13 | 
 14 | (require '[scicloj.ml.core :as ml]
 15 |          '[scicloj.ml.metamorph :as mm]
 16 |          '[scicloj.ml.dataset  :as ds]
 17 |          '[scicloj.metamorph.ml.tools :refer [dissoc-in]]
 18 |          '[taoensso.nippy :as nippy])
 19 | 
 20 | 
 21 | 
 22 | (defonce ds (ds/dataset "https://raw.githubusercontent.com/techascent/tech.ml/master/test/data/iris.csv" {:key-fn keyword}))
 23 | 
 24 | (defn create-base-pipe-decl [node-size]
 25 |   ;; Declarative pipeline: mark and encode :species, then a random-forest model step using `node-size`.
 26 |   (let [target [:species], forest {:model-type :smile.classification/random-forest :node-size node-size}]
 27 |     [[:tech.v3.dataset.metamorph/set-inference-target target] [:tech.v3.dataset.metamorph/categorical->number target]
 28 |      {:metamorph/id :model} [:scicloj.metamorph.ml/model forest]]))
 29 | ["## Run evaluation"]
 30 | 
 31 | ["We create 6 pipelines, do a simple :holdout split and keep all results. In order to save memory,
 32 | as we would need to do if we had thousands or more evaluations, we keep only the minimal information."]
 33 | 
 34 | (def pipes (map create-base-pipe-decl [1 5 10 20 50 100]))
 35 | (def split (ds/split->seq ds :holdout))
 36 | 
 37 | 
 38 | 
 39 | (def  evaluation-result
 40 |   (ml/evaluate-pipelines
 41 |    pipes split
 42 |    ml/classification-accuracy
 43 |    :accuracy
 44 |    {:evaluation-handler-fn utils/select-minimal-result
 45 | 
 46 |     :return-best-crossvalidation-only false
 47 |     :return-best-pipeline-only false}))
 48 | 
 49 | ["So we get here 6 evaluation results"]
 50 | evaluation-result
 51 | 
 52 | ["simplified as list:"]
 53 | 
 54 | (->> evaluation-result flatten
 55 |      (map (comp :metric :test-transform)))
 56 | 
 57 | ["## Attach a simple result handler"]
 58 | 
 59 | ["A result handler is a function which takes a full map representing a single evaluation result and does whatever is needed.
 60 | It can be a function with side effects, and it should return the minimal metric information as documented."]
 61 | 
 62 | ["The function will be called for each evaluation result, so in our case 6 times. We use a simple function for now,
 63 | which prints the current declarative pipeline."]
 64 | 
 65 | (def  evaluation-result
 66 |   (ml/evaluate-pipelines
 67 |    pipes split
 68 |    ml/classification-accuracy
 69 |    :accuracy
 70 |    {;:result-dissoc-in-seq ml/result-dissoc-in-seq--all
 71 |     ;; :result-dissoc-in-seq []
 72 |     :return-best-crossvalidation-only false
 73 |     :return-best-pipeline-only false
 74 |     :evaluation-handler-fn
 75 |     (fn [result]
 76 |       (clojure.pprint/pprint (:pipe-decl result))
 77 |       result)}))
 78 | 
 79 | ["repl output: "]
 80 | ^kind/code
 81 | [[:tech.v3.dataset.metamorph/set-inference-target [:species]]
 82 |  [:tech.v3.dataset.metamorph/categorical->number [:species]]
 83 |  [:scicloj.metamorph.ml/model
 84 |   {:model-type :smile.classification/random-forest, :node-size 1}]]
 85 | [[:tech.v3.dataset.metamorph/set-inference-target [:species]]
 86 |  [:tech.v3.dataset.metamorph/categorical->number [:species]]
 87 |  [:scicloj.metamorph.ml/model
 88 |   {:model-type :smile.classification/random-forest, :node-size 5}]]
 89 | [[:tech.v3.dataset.metamorph/set-inference-target [:species]]
 90 |  [:tech.v3.dataset.metamorph/categorical->number [:species]]
 91 |  [:scicloj.metamorph.ml/model
 92 |   {:model-type :smile.classification/random-forest, :node-size 10}]]
 93 | [[:tech.v3.dataset.metamorph/set-inference-target [:species]]
 94 |  [:tech.v3.dataset.metamorph/categorical->number [:species]]
 95 |  [:scicloj.metamorph.ml/model
 96 |   {:model-type :smile.classification/random-forest, :node-size 20}]]
 97 | ["...."]
 98 | 
 99 | ["The callback function can now implement whatever is needed to store the evaluation results, for example on disk.
100 | "]
101 | 
102 | 
103 | ["Write results to disk"]
104 | 
105 | 
106 | (def created-files (atom []))
107 | (def last-result (atom {}))
108 | 
109 | (def evaluation-result
110 |   (ml/evaluate-pipelines
111 |    pipes split
112 |    ml/classification-accuracy
113 |    :accuracy
114 |    {:evaluation-handler-fn
115 |     (fn [result]
116 |       ;; reduced-result-fn hands the three :model-as-bytes paths to reduce-result (presumably to strip them — confirm).
117 |       (let [reduced-result-fn (fn [result] (scicloj.metamorph.ml/reduce-result result
118 |                                             [[:fit-ctx :model :model-data :model-as-bytes]
119 |                                              [:train-transform :ctx :model :model-data :model-as-bytes]
120 |                                              [:test-transform :ctx :model :model-data :model-as-bytes]]))
121 |             ;; example-nippy-handler never sees `result`, so it can only build a handler; it has to be invoked.
122 |             nippy-handler (scicloj.metamorph.ml.evaluation-handler/example-nippy-handler
123 |                            created-files "/tmp" reduced-result-fn)
124 |             reduced-result (reduced-result-fn result)]
125 |         (nippy-handler result)               ; called for its side effect; expected to record a file in `created-files`
126 |         (reset! last-result reduced-result)  ; keep the most recent reduced result for inspection below
127 |         reduced-result))
128 | 
129 |     ;; NOTE(review): presumably attaches the pipeline function sources to each result — confirm.
130 |     :attach-fn-sources {:ns (find-ns 'scicloj.ml.experiment-tracking)
131 |                         :pipe-fns-clj-file "src/scicloj/ml/experiment_tracking.clj"}}))
132 | 
133 | ["This creates one nippy file for each evaluation, containing all data of the evaluations."]
134 | 
135 | (deref last-result)
136 | 


--------------------------------------------------------------------------------
/src/scicloj/ml/interactions_ols.clj:
--------------------------------------------------------------------------------
  1 | (ns scicloj.ml.interactions-ols
  2 |   (:require
  3 |    [notespace.api :as note]
  4 |    [notespace.kinds :as kind]
  5 |    [notespace.view :as view]
  6 |    [tablecloth.api :as tc]
  7 |    [scicloj.ml.core]
  8 |    [scicloj.sklearn-clj.ml]
  9 |    [clojure.string :as str]
 10 |    [scicloj.ml.ug-utils :refer :all]
 11 |    [clojure.java.io :as io]
 12 |    [fastmath.stats :as fmstats]))
 13 | 
 14 | (require '[scicloj.ml.core :as ml]
 15 |          '[scicloj.ml.metamorph :as mm]
 16 |          '[scicloj.ml.dataset :refer [dataset add-column]]
 17 |          '[scicloj.ml.dataset :as ds]
 18 |          '[tech.v3.dataset.math :as std-math]
 19 |          '[tech.v3.datatype.functional :as dtf]
 20 |          '[scicloj.metamorph.ml.toydata :as datasets])
 21 | 
 22 | 
 23 | (comment
 24 |   (note/init-with-browser)
 25 |   (note/eval-this-notespace)
 26 |   (note/render-static-html "docs/interactions_ols.html"))
 27 | 
 28 | ["This example shows how to model interactions in linear regression with `scicloj.ml`"]
 29 | 
 30 | ["Taking ideas from: "
 31 | 
 32 |  "http://www.sthda.com/english/articles/40-regression-analysis/164-interaction-effect-in-multiple-regression-essentials/#comments-list"]
 33 | 
 34 | (defn pp-str [x] ; pretty-prints x and returns the printed text as a string
 35 |   (-> x clojure.pprint/pprint with-out-str))
 36 | 
 37 | ["First we load the data:"]
 38 | (def marketing (tc/dataset "data/marketing.csv" {:key-fn keyword}))
 39 | 
 40 | ["## Additive model"]
 41 | 
 42 | ["First we build an additive model, whose model equation is 'sales = b0 + b1 * youtube + b2 * facebook'"]
 43 | 
 44 | (def additive-pipeline
 45 |   (ml/pipeline
 46 |    (mm/set-inference-target :sales)
 47 |    (mm/drop-columns [:newspaper])
 48 |    {:metamorph/id :model}
 49 |    (mm/model {:model-type :smile.regression/ordinary-least-square})))
 50 | 
 51 | 
 52 | ["We evaluate it, "]
 53 | (def evaluations
 54 |   (ml/evaluate-pipelines
 55 |    [additive-pipeline]
 56 |    (ds/split->seq marketing :holdout)
 57 |    ml/rmse
 58 |    :loss
 59 |    {:other-metrices [{:name :r2
 60 |                       :metric-fn fmstats/r2-determination}]}))
 61 | 
 62 | 
 63 | ["and print the result:"]
 64 | ^kind/hiccup
 65 | (text->hiccup
 66 |  (str
 67 |   (-> evaluations flatten first :fit-ctx :model ml/thaw-model str)))
 68 | 
 69 | ["We have the following metrics:"]
 70 | ["RMSE"]
 71 | (-> evaluations flatten first :test-transform :metric)
 72 | 
 73 | ["R2"]
 74 | (-> evaluations flatten first :test-transform :other-metrices first :metric)
 75 | 
 76 | ["## Interaction effects"]
 77 | ["Now we add interaction effects to it, resulting in this model equation: 'sales = b0 + b1 * youtube + b2 * facebook + b3 * (youtube * facebook)'"]
 78 | (def pipe-interaction
 79 |   (ml/pipeline
 80 |    (mm/drop-columns [:newspaper])
 81 |    (mm/add-column :youtube*facebook (fn [ds] (dtf/* (ds :youtube) (ds :facebook))))
 82 |    (mm/set-inference-target :sales)
 83 |    {:metamorph/id :model}(mm/model {:model-type :smile.regression/ordinary-least-square})))
 84 | 
 85 | ["Again we evaluate the model,"]
 86 | (def evaluations
 87 |   (ml/evaluate-pipelines
 88 |    [pipe-interaction]
 89 |    (ds/split->seq marketing :holdout)
 90 |    ml/rmse
 91 |    :loss
 92 |    {:other-metrices [{:name :r2
 93 |                       :metric-fn fmstats/r2-determination}]}))
 94 | 
 95 | 
 96 | ["and print it and the performance metrics:"]
 97 | ^kind/hiccup
 98 | (text->hiccup
 99 |  (str
100 |   (-> evaluations flatten first :fit-ctx :model ml/thaw-model str)))
101 | 
102 | ["As the multiplication term 'youtube * facebook' is statistically significant as well, it
103 | suggests that there is indeed an interaction between these 2 predictor variables youtube and facebook."]
104 | 
105 | ["RMSE"]
106 | (-> evaluations flatten first :test-transform :metric)
107 | 
108 | ["R2"]
109 | (-> evaluations flatten first :test-transform :other-metrices first :metric)
110 | 
111 | ["RMSE and R2 of the interaction model are slightly better."
112 |  "These results suggest that the model with the interaction term is better than the model that contains only main effects.
113 | So, for this specific data, we should go for the model with the interaction term.
114 | "]
115 | 


--------------------------------------------------------------------------------
/src/scicloj/ml/intro.clj:
--------------------------------------------------------------------------------
  1 | (ns scicloj.ml.intro
  2 |   (:require
  3 |    [notespace.api :as note]
  4 |    [notespace.kinds :as kind]))
  5 | 
  6 | 
  7 | (comment
  8 |   (note/init-with-browser)
  9 |   (note/eval-this-notespace)
 10 |   (note/reread-this-notespace)
 11 |   (note/render-static-html "docs/userguide-intro.html")
 12 |   (note/init))
 13 | 
 14 | 
 15 | 
 16 | ["# Clojure and machine learning "]
 17 | 
 18 | ["In order to practice machine learning and create an ecosystem of models around it,
 19 | we need 3 components."]
 20 | 
 21 | ["1. A standard way to manage tabular data in memory."]
 22 | ["2. Various machine learning models"]
 23 | ["3. A standard way to express steps of data manipulations including train/predict of a model"]
 24 | 
 25 | 
 26 | ["The Clojure language and core libraries do not have built-in, specific support for this,
 27 | so some libraries are required. "]
 28 | 
 29 | ["## Representing training data"]
 30 | 
 31 | ["In the last 2 years the Clojure data science landscape was shaped
 32 | by the appearance and maturation of a new library to manage tabular data."]
 33 | 
 34 | ["This library is [tech.ml.dataset](https://github.com/techascent/tech.ml.dataset).
 35 |  It defines an in-memory tabular data structure and operations on it. It is a remarkable piece of software,
 36 | highly optimized and linking at its root to native memory, which allows zero-copy integrations outside Clojure."]
 37 | 
 38 | ["As it was organically growing over time, its API is functional and complete,
 39 | but lacks consistency in some parts.
 40 | "]
 41 | 
 42 | ["This was addressed by another library, layering on top of it, called
 43 | `tablecloth`. It is available [here](https://github.com/scicloj/tablecloth)"]
 44 | 
 45 | 
 46 | ["So we have now a very reliable, mature, easy to use library to store and manipulate tabular data, including text."]
 47 | 
 48 | ["## Models"]
 49 | ["Models are the core of most machine learning libraries. In scicloj.ml we
 50 |  rely on a common **abstraction** for all
 51 | machine learning models and one Java library [Smile](https://github.com/haifengl/smile) providing models,
 52 | which we bridge into Clojure via the abstraction.
 53 | So we use Java models internally, but without the need for Java
 54 | interop by the user.
 55 | 
 56 | Documentation for existing models is appearing here:
 57 | https://scicloj.github.io/scicloj.ml-tutorials/userguide-models.html
 58 | 
 59 | The abstraction is independent from Smile, so we could make bridges to other libraries, even in non-JVM languages (Python, R).
 60 | 
 61 | 
 62 | "]
 63 | 
 64 | ["## Data transformation pipelines."]
 65 | 
 66 | ["In order to apply machine learning, the data needs to be transformed from its original form
 67 | (often a data file) into the form required by the model.
 68 |  Sometimes these transformations are simple, like re-encoding data,
 69 | sometimes they are very complex. In some contexts this is also called
 70 |  feature engineering, which can result in arbitrarily
 71 | complex dataset transformations.
 72 | These transformations are mostly dataset-to-dataset transformations.
 73 | "]
 74 | 
 75 | 
 76 | ["These pipelines need to be repeatable and self-contained,
 77 | as they need to run several times with different data or in variants
 78 | for either cross validation or hyper-parameter tuning."]
 79 | 
 80 | ["Clojure and the `tablecloth` library already contain
 81 | the concept of running a pipeline"]
 82 | 
 83 | ["This simpler form of a pipeline in Clojure and Tablecloth can just make use of the fact that all tablecloth
 84 |  functions take a dataset as the first parameter and return a dataset.
 85 | So they can be chained together with the pipe (`->`) operator of Clojure,
 86 |  example:"]
 87 | 
 88 | (require '[scicloj.ml.dataset :as ds])
 89 | (def my-data
 90 |   (-> (ds/dataset "https://raw.githubusercontent.com/techascent/tech.ml.dataset/master/test/data/stocks.csv" {:key-fn keyword})
 91 |       (ds/select-columns [:symbol :price])
 92 |       (ds/add-or-replace-column :symbol (fn [ds] (map clojure.string/lower-case  (ds :symbol))))))
 93 | 
 94 | ["This form of pipeline works to manipulate a dataset,
 95 | but has three disadvantages."]
 96 | 
 97 | ["
 98 | 1. `->` is a macro, so we cannot compose pipelines easily
 99 | 
100 | 2. We move a dataset object through the pipeline steps, so the only object we have nicely inside the pipeline, accessible to all steps, is the dataset itself.  But sometimes we need non-tabular, auxiliary, data to be shared across the pipeline steps, which is not possible with passing a dataset only. Using these simple pipelines would force us to hold auxiliary data in a global state of some form. This makes it very hard to execute pipelines repeatedly, as they are not self-contained.
101 | 
102 | 3. These simpler pipeline concepts have no notion of running a pipeline in several modes. In machine learning a pipeline needs to behave differently in `fit` and in `transform` (often called `train` vs `predict`). The model learns from data in `fit` and applies what it has learned in `transform`.
103 | "]
104 | 
105 | ["Due to this, the idea of the `metamorph` pipeline concept was born."]
106 | ["It addresses all three shortcomings of the simpler pipeline."]
107 | 
108 | ["Metamorph is documented here: [metamorph](https://github.com/scicloj/metamorph)"]
109 | 
110 | 
111 | ["As we see in the metamorph documentation, a pipeline can be composed of functions, which adhere to some simple standards
112 | regarding input and output, as explained here: https://github.com/scicloj/metamorph#compliant-operations"]
113 | 
114 | ["Tablecloth contains such operations in the `tablecloth.pipeline`
115 | namespace. All functions of the `tablecloth.api` namespace are replicated
116 | there, but metamorph compliant"]
117 | 
118 | ["## scicloj.ml"]
119 | 
120 | ["The Clojure ML ecosystem is based on different libraries working
121 | together, as is typical and idiomatic in Clojure"]
122 | 
123 | ["Some existing libraries are used internally in scicloj.ml, to create a
124 | complete machine learning library, but this is hidden from the user,
125 | and is listed here only for completeness."]
126 | 
127 | ["
128 | 1. `tablecloth` - for general manipulation of the dataset
129 | 1. `tech.v3.dataset` - to finally prepare a dataset for the machine learning models
130 | 1. `metamorph.ml` - for running pipelines and machine learning core functions
131 | 1. `Smile`  Java machine learning library containing lots of models
132 | "]
133 | 
134 | 
135 | 
136 | ["These libraries can be used standalone as well. `tech.ml` was changed  in order
137 | to work with scicloj.ml in an incompatible way.
138 | So it is re-released under a new name `metamorph.ml`.
139 | The others can be used by scicloj.ml without any change.
140 |  "]
141 | 
142 | 
143 | ["In order to give easier access to the various libraries, the scicloj.ml
144 |  library was created. It unifies the access to the libraries above
145 | in three simple namespaces.
146 | "]
147 | 
148 | ["## Machine learning using scicloj.ml"]
149 | 
150 | ["The setup for the following code needs a single dependency in deps.edn or project.clj"]
151 | 
152 | ["
153 | {:deps {
154 |         scicloj/scicloj.ml {:mvn/version \"0.1.0-beta2\"}} }
155 | "]
156 | 
157 | 
158 | ["This library acts as a facade to the four libraries above, and arranges the functions in a simple way in these namespaces:"]
159 | 
160 | ^kind/md-nocode
161 | ["
162 | 
163 | | namespace             | purpose                                                  |
164 | |-----------------------|----------------------------------------------------------|
165 | | scicloj.ml.core       | core functionality for machine learning                  |
166 | | scicloj.ml.dataset    | functions to manipulate a dataset                        |
167 | | scicloj.ml.metamorph  | metamorph compliant functions to be used in ml pipelines |
168 | 
169 |  "]
170 | 
171 | 
172 | 
173 | ["To start we need to require a few namespaces"]
174 | 
175 | (require '[scicloj.ml.core :as ml]
176 |          '[scicloj.ml.metamorph :as mm]
177 |          '[scicloj.ml.dataset :refer [dataset add-column] :as ds])
178 | 
179 | 
180 | 
181 | ["First we load the data."]
182 | (def titanic-train
183 |   (->
184 |    (ds/dataset "https://github.com/scicloj/metamorph-examples/raw/main/data/titanic/train.csv"
185 |                {:key-fn keyword
186 |                 :parser-fn :string})))
187 | 
188 | 
189 | (def titanic-test
190 |   (->
191 |    (ds/dataset "https://github.com/scicloj/metamorph-examples/raw/main/data/titanic/test.csv"
192 |                {:key-fn keyword
193 |                 :parser-fn :string})
194 |    (ds/add-column :Survived [""] :cycle)))
195 | 
196 | ["Then we define the pipeline and its steps. Inside the pipeline we only use functions
197 | from namespace scicloj.ml.metamorph"]
198 | 
199 | ["In scicloj.ml the model function receives a single dataset,
200 | in which the inference target column is marked as such. The model
201 | to use is a parameter of the `model` function. All built-in
202 | models are listed here: https://scicloj.github.io/scicloj.ml-tutorials/userguide-models.html"]
203 | 
204 | 
205 | ["In the titanic dataset the `Survived` column is a categorical variable.
206 | All target variables for classification need to be transformed first
207 | into numbers the model can work with. This is done by the function
208 | `categorical->number`. The mapping for this is stored in the dataset on the column
209 | and can be later retrieved to transform the numeric prediction back to its
210 | categorical form."]
211 | 
212 | 
213 | ["In `scicloj.ml` we pass a whole dataset to a model, and we need to mark
214 | the inference target via function `set-inference-target`.
215 | All other columns are used then as feature columns.
216 | To restrict the feature columns, I simply remove most of them and keep only one, :Pclass"]
217 | 
218 | ["Now the dataset is ready for the model, which is called in the last step.
219 | It is a logistic regression model, which gets trained to predict column
220 | :Survived from column :Pclass"]
221 | 
222 | (def pipe-fn
223 |   (ml/pipeline
224 |    (mm/select-columns [:Survived :Pclass])
225 |    (mm/categorical->number [:Survived :Pclass])
226 |    (mm/set-inference-target :Survived)
227 |    (mm/model {:model-type :smile.classification/logistic-regression})))
228 | 
229 | ["So the `ml/pipeline` function returns a function, which can be called with the ctx map."]
230 | 
231 | ["We execute the pipeline in mode :fit,
232 | which will execute all pipeline steps and train the model as well. "]
233 | 
234 | (def trained-ctx
235 |   (pipe-fn {:metamorph/data titanic-train
236 |             :metamorph/mode :fit}))
237 | 
238 | ["Now we have a trained model inside trained-ctx. This is a regular map, so it can be inspected in the repl.
239 |  As the model is based on Smile, the trained-ctx contains the java class representing the trained model.
240 | "]
241 | 
242 | ["Now we execute the pipeline in mode :transform,
243 | which will make a prediction "]
244 | 
245 | ["We combine the previously obtained context
246 |  (which contains the trained model)",
247 |  "with the test data and mode :transform"]
248 | 
249 | (def test-ctx
250 |   (pipe-fn
251 |    (assoc trained-ctx
252 |           :metamorph/data titanic-test
253 |           :metamorph/mode :transform)))
254 | 
255 | 
256 | 
257 | ["Prediction is now part of the ctx obtained.
258 | The internally called `predict` function of `metamorph.ml` returns always the raw prediction of the model,
259 | which we can easily transform into the original categories.
260 | "]
261 | 
262 | 
263 | 
264 | ;; ^kind/dataset
265 | (-> test-ctx :metamorph/data
266 |     (ds/column-values->categorical :Survived))
267 | 
268 | 
269 | 
270 | 
271 | 
272 | ["This shows the predicted survival. "]
273 | 
274 | ["The documentation of `mm/model` here https://scicloj.github.io/scicloj.ml/scicloj.ml.metamorph.html#var-model"
275 |  "documents this special behavior of the function, which does something different in mode :fit vs mode :transform"]
276 | 
277 | ["Any form of feature-engineering now takes the same form.
278 | We will successively
279 | add more and more steps into the pipeline to improve the model."]
280 | 
281 | ["These can be built-in functions or custom functions, as we will see later"]
282 | 
283 | 
284 | (+ 1 1 (+ 2 2))
285 | 


--------------------------------------------------------------------------------
/src/scicloj/ml/models.clj:
--------------------------------------------------------------------------------
  1 | ^{:nextjournal.clerk/visibility {:code :hide :result :hide}
  2 |   :nextjournal.clerk/toc true}
  3 | (ns scicloj.ml.models
  4 |   (:require
  5 |    [nextjournal.clerk :as clerk]
  6 |    [scicloj.ml.ug-utils :as utils]
  7 |    [scicloj.ml.dataset :as ds]
  8 |    [scicloj.ml.ug-utils-clerk :as utils-clerk]
  9 |    [tablecloth.api :as tc]))
 10 | 
 11 | ^{:nextjournal.clerk/visibility {:code :hide :result :hide}}
 12 | (comment
 13 |   (clerk/show! "src/scicloj/ml/models.clj")
 14 |   (clerk/halt!)
 15 |   (clerk/build-static-app! {:paths ["src/scicloj/ml/models.clj"]
 16 |                             :bundle? false})
 17 |   (clerk/clear-cache!)
 18 |   (clerk/serve! {:browse? true})
 19 |   (clerk/serve! {:browse? true :watch-paths ["src/scicloj/ml/"]}))
 20 | 
 21 | ^{:nextjournal.clerk/visibility {:code :hide :result :hide}}
 22 | (require '[scicloj.ml.core :as ml]
 23 |          '[scicloj.ml.metamorph :as mm]
 24 |          '[tech.v3.datatype.functional :as dtf]
 25 |          '[scicloj.metamorph.ml.toydata :as datasets])
 26 | 
 27 | ^{:nextjournal.clerk/visibility {:code :hide :result :hide}}
 28 | (clerk/add-viewers! [{:pred tc/dataset?
 29 |                       :transform-fn (clerk/update-val #(clerk/table {:head (tc/column-names %)
 30 |                                                                      :rows (tc/rows % :as-seq)}))}])
 31 | ^{:nextjournal.clerk/visibility {:code :hide :result :hide}}
 32 | (def build-in-models
 33 |   (->>
 34 |    (ml/model-definition-names)
 35 |    (filter #(contains? #{"fastmath.cluster"
 36 |                          "smile.classification"
 37 |                          "smile.regression"
 38 |                          "smile.manifold"
 39 |                          "smile.projections"
 40 |                          "xgboost"}
 41 |                        (namespace %)))
 42 |    sort))
 43 | 
 44 | ^{:nextjournal.clerk/visibility {:code :hide :result :hide}}
 45 | (defn make-iris-pipeline [model-options]
 46 |   (ml/pipeline
 47 |    (mm/set-inference-target :species)
 48 |    (mm/categorical->number [:species])
 49 |    (mm/model model-options)))
 50 | 
 51 | 
 52 | ;; # Models
 53 | 
 54 | ;; scicloj.ml uses the plugins `scicloj.ml.smile` and
 55 | ;; `scicloj.ml.xgboost` by default,
 56 | ;; which give access to a large number of models (listed below) from the Java libraries
 57 | ;; [Smile](https://haifengl.github.io/),
 58 | ;; [Xgboost](https://xgboost.readthedocs.io/en/latest/jvm/index.html) and [fastmath](https://github.com/generateme/fastmath).
 59 | 
 60 | ;; More models are available via other plugins.
 61 | 
 62 | ;; Below is a list of all such models, and which parameters they take.
 63 | 
 64 | ;; All models are available in the same way:
 65 | 
 66 | 
 67 | 
 68 | ;; The documentation below points as well to the javadoc and user-guide chapter (for Smile models)
 69 | 
 70 | ;; The full list of built-in models is:
 71 | ^{:nextjournal.clerk/visibility {:code :hide}}
 72 | (clerk/html
 73 |  [:ul
 74 | 
 75 |   (map
 76 |    #(vector :li [:a {:href (str "#" (str %))} (str %)])
 77 |    build-in-models)])
 78 | 
 79 | 
 80 | ;; ## Smile classification models
 81 | 
 82 | ^{:nextjournal.clerk/visibility {:code :hide}}
 83 | (clerk/html
 84 |  (utils-clerk/render-key-info :smile.classification/ada-boost))
 85 | ;; In this example we will use the capability of the Ada boost classifier
 86 | ;; to give us the importance of variables.
 87 | 
 88 | ;; As data we take here the Wisconsin Breast Cancer dataset, which has 30 variables.
 89 | 
 90 | (def df
 91 |   (->
 92 |    (datasets/breast-cancer-ds)))
 93 | 
 94 | 
 95 | ;; To get an overview of the dataset, we print its summary:
 96 | 
 97 | (-> df ds/info)
 98 | 
 99 | 
100 | ;; Then we create a metamorph  pipeline with the ada boost model:
101 | 
102 | (def ada-pipe-fn
103 |   (ml/pipeline
104 |    (mm/set-inference-target :class)
105 |    (mm/categorical->number [:class])
106 |    ;; name the model step, so the trained model can be looked up by key
107 |    {:metamorph/id :model}
108 |    (mm/model {:model-type :smile.classification/ada-boost})))
109 | 
110 | ;; We run the pipeline in mode :fit. As we just explore the data,
111 | ;; no train/test split is needed.
112 | 
113 | (def trained-ctx
114 |   (ml/fit-pipe df ada-pipe-fn))
115 | 
116 | ;; Next we take the model out of the fitted context, using the id
117 | ;; given to the model step in the pipeline:
118 | (def model
119 |   (-> trained-ctx :model ml/thaw-model))
120 | 
121 | ;; The variable importance can be obtained from the trained model,
122 | (def var-importances
123 |   (mapv
124 |    #(hash-map :variable %1
125 |               :importance %2)
126 |    (map
127 |     #(first (.variables %))
128 |     (.. model formula predictors))
129 |    (.importance model)))
130 | 
131 | 
132 | ;; and we plot the variables:
133 | 
134 | (clerk/vl
135 |  {
136 |   :data {:values
137 |           var-importances}
138 |   :width  800
139 |   :height 500
140 |   :mark {:type "bar"}
141 |   :encoding {:x {:field :variable :type "nominal" :sort "-y"}
142 |              :y {:field :importance :type "quantitative"}}})
143 | 
144 | 
145 | ^{:nextjournal.clerk/visibility {:code :hide}}
146 | (clerk/html
147 |  (utils-clerk/render-key-info ":smile.classification/decision-tree"))
148 | 
149 | ;; A decision tree learns a set of rules from the data in the form
150 | ;; of a tree, which we will plot in this example.
151 | ;; We use the iris dataset:
152 | ^:nextjournal.clerk/no-cache
153 | (def iris ; the no-cache marker sits on the whole form, where Clerk looks for it
154 |   (datasets/iris-ds))
155 | 
156 | 
157 | 
158 | ;; We make a pipe only containing the model, as the dataset is ready to
159 | ;; be used by `scicloj.ml`
160 | (def trained-pipe-tree
161 |   (ml/fit-pipe
162 |    iris
163 |    (ml/pipeline
164 |     {:metamorph/id :model}
165 |     (mm/model
166 |      {:model-type :smile.classification/decision-tree}))))
167 | 
168 | ;; We extract the Java object of the trained model.
169 | 
170 | (def tree-model
171 |   (-> trained-pipe-tree :model ml/thaw-model))
172 | 
173 | 
174 | ;; The model has a .dot function, which returns a GraphViz textual
175 | ;; representation of the decision tree, which we render to svg using the
176 | ;; [kroki](https://kroki.io/) service.
177 | 
178 | (clerk/html
179 |  (String. (:body (utils/kroki (.dot tree-model) :graphviz :svg)) "UTF-8"))
180 | 
181 | ^{:nextjournal.clerk/visibility {:code :hide}}
182 | (clerk/html (utils-clerk/render-key-info ":smile.classification/discrete-naive-bayes"))
183 | 
184 | ^{:nextjournal.clerk/visibility {:code :hide}}
185 | (clerk/html (utils-clerk/render-key-info ":smile.classification/gradient-tree-boost"))
186 | 
187 | ^{:nextjournal.clerk/visibility {:code :hide}}
188 | (clerk/html (utils-clerk/render-key-info ":smile.classification/knn"))
189 | ;; In this example we use a knn model to classify some dummy data.
190 | ;; The training data is this:
191 | 
192 | (def df-knn
193 |   (ds/dataset {:x1 [7 7 3 1]
194 |                :x2 [7 4 4 4]
195 |                :y [ :bad :bad :good :good]}))
196 | 
197 | 
198 | 
199 | ;; Then we construct a pipeline with the knn model,
200 | ;; using 3 neighbors for decision.
201 | 
202 | (def knn-pipe-fn
203 |   (ml/pipeline
204 |    (mm/set-inference-target :y)
205 |    (mm/categorical->number [:y])
206 |    (mm/model
207 |     {:model-type :smile.classification/knn
208 |      :k 3})))
209 | 
210 | ;; We run the pipeline in mode fit:
211 | 
212 | (def trained-ctx-knn
213 |   (knn-pipe-fn {:metamorph/data df-knn
214 |                 :metamorph/mode :fit}))
215 | 
216 | 
217 | ;; Then we run the pipeline in mode :transform with some test data
218 | ;; and take the prediction and convert it from numeric into categorical:
219 | 
220 | (->
221 |  trained-ctx-knn
222 |  (merge
223 |   {:metamorph/data (ds/dataset
224 |                     {:x1 [3 5]
225 |                      :x2 [7 5]
226 |                      :y [nil nil]})
227 |    :metamorph/mode :transform})
228 |  knn-pipe-fn
229 |  :metamorph/data
230 |  (ds/column-values->categorical :y)
231 |  seq)
232 | 
233 | ^{:nextjournal.clerk/visibility {:code :hide}}
234 | (clerk/html (utils-clerk/render-key-info ":smile.classification/logistic-regression"))
235 | 
236 | ^{:nextjournal.clerk/visibility {:code :hide}}
237 | (clerk/html (utils-clerk/render-key-info ":smile.classification/maxent-binomial"))
238 | 
239 | ^{:nextjournal.clerk/visibility {:code :hide}}
240 | (clerk/html (utils-clerk/render-key-info ":smile.classification/maxent-multinomial"))
241 | 
242 | ^{:nextjournal.clerk/visibility {:code :hide}}
243 | (clerk/html (utils-clerk/render-key-info ":smile.classification/random-forest"))
244 | ;; The following code plots the decision surfaces of the random forest
245 |  ;; model on pairs of features.
246 | 
247 | ;; We use the Iris dataset for this.
248 | 
249 | (def iris-test
250 |   (ds/dataset
251 |    "https://raw.githubusercontent.com/scicloj/metamorph.ml/main/test/data/iris.csv" {:key-fn keyword}))
252 | 
253 | 
254 | 
255 | 
256 | ;; Standardise the data:
257 | (def iris-std
258 |   (ml/pipe-it
259 |    iris-test
260 |    (mm/std-scale [:sepal_length :sepal_width :petal_length :petal_width] {})))
261 | 
262 | 
263 | 
264 | 
265 | 
266 | 
267 | ;; The next function creates a vega specification for the random forest
268 | ;; decision surface for a given pair of column names.
269 | 
270 | 
271 | 
272 | 
273 | (def rf-pipe
274 |   (make-iris-pipeline
275 |                       {:model-type :smile.classification/random-forest}))
276 | 
277 | (clerk/vl (utils/surface-plot iris [:sepal_length :sepal_width] rf-pipe :smile.classification/random-forest))
278 | 
279 | (clerk/vl
280 |  (utils/surface-plot iris-std [:sepal_length :petal_length] rf-pipe :smile.classification/random-forest))
281 | 
282 | (clerk/vl
283 |  (utils/surface-plot iris-std [:sepal_length :petal_width] rf-pipe :smile.classification/random-forest))
284 | (clerk/vl
285 |  (utils/surface-plot iris-std [:sepal_width :petal_length] rf-pipe :smile.classification/random-forest))
286 | (clerk/vl
287 |  (utils/surface-plot iris-std [:sepal_width :petal_width] rf-pipe :smile.classification/random-forest))
288 | (clerk/vl
289 |  (utils/surface-plot iris-std [:petal_length :petal_width] rf-pipe :smile.classification/random-forest))
290 | 
291 | 
292 | ^{:nextjournal.clerk/visibility {:code :hide}}
293 | (clerk/html (utils-clerk/render-key-info ":smile.classification/sparse-logistic-regression"))
294 | 
295 | ^{:nextjournal.clerk/visibility {:code :hide}}
296 | (clerk/html (utils-clerk/render-key-info ":smile.classification/sparse-svm"))
297 | 
298 | ^{:nextjournal.clerk/visibility {:code :hide}}
299 | (clerk/html (utils-clerk/render-key-info ":smile.classification/svm"))
300 | 
301 | ;; ## Smile regression models
302 | ^{:nextjournal.clerk/visibility {:code :hide}}
303 | (clerk/html (utils-clerk/render-key-info ":smile.regression/elastic-net"))
304 | 
305 | 
306 | ^{:nextjournal.clerk/visibility {:code :hide}}
307 | (clerk/html (utils-clerk/render-key-info ":smile.regression/gradient-tree-boost"))
308 | 
309 | ^{:nextjournal.clerk/visibility {:code :hide}}
310 | (clerk/html (utils-clerk/render-key-info ":smile.regression/lasso"))
311 | 
312 | ;; We use the diabetes dataset and will show how Lasso regression
313 | ;; shrinks the coefficients of the different variables depending on lambda.
314 | 
315 | ;; First we make a function to create pipelines with different lambdas
316 | (defn make-pipe-fn [lambda]
317 |   (ml/pipeline
318 |    (mm/update-column :disease-progression (fn [col] (map #(double %) col)))
319 |    (mm/convert-types :disease-progression :float32)
320 |    (mm/set-inference-target :disease-progression)
321 |    {:metamorph/id :model} (mm/model {:model-type :smile.regression/lasso
322 |                                      :lambda (double lambda)})))
323 | 
324 | ;; Now we go over a sequence of lambdas, fit a pipeline for each of them
325 | ;; and store the coefficients for each predictor variable:
326 | (def diabetes (datasets/diabetes-ds))
327 | 
328 | (def coefs-vs-lambda
329 |   ;; One map per (lambda, predictor) pair. `mapcat` joins the per-lambda
330 |   ;; results by exactly one level, which is all `flatten` was used for.
331 |   (mapcat
332 |    (fn [lambda]
333 |      (let [fitted
334 |            (ml/fit-pipe
335 |             diabetes
336 |             (make-pipe-fn lambda))
337 | 
338 |            ;; the Smile Java object of the fitted lasso model
339 |            model-instance
340 |            (-> fitted :model ml/thaw-model)
341 | 
342 |            ;; loop-invariant, so computed once per lambda
343 |            log-lambda
344 |            (dtf/log10 lambda)
345 |            predictors
346 |            (map #(first (.variables %))
347 |                 (seq (.. model-instance formula predictors)))]
348 |        (map
349 |         #(hash-map :log-lambda log-lambda
350 |                    :coefficient %1
351 |                    :predictor %2)
352 |         (-> model-instance .coefficients seq)
353 |         predictors)))
354 |    (range 1 100000 100)))
355 | 
356 | ;; Then we plot the coefficients over the log of lambda.
357 | (clerk/vl
358 |  {
359 |   :data {:values coefs-vs-lambda}
360 | 
361 |   :width 500
362 |   :height 500
363 |   :mark {:type "line"}
364 |   :encoding {:x {:field :log-lambda :type "quantitative"}
365 |              :y {:field :coefficient :type "quantitative"}
366 |              :color {:field :predictor}}})
367 | 
368 | ;; This shows that an increasing lambda shrinks more and more variables
369 | ;; to zero. This plot can be used as well to find important variables,
370 | ;; namely the ones whose coefficients stay non-zero even with a large lambda.
371 | 
372 | ^{:nextjournal.clerk/visibility {:code :hide}}
373 | (clerk/html
374 |  (utils-clerk/render-key-info ":smile.regression/ordinary-least-square"))
375 | 
376 | ;; In this example we will explore the relationship between the
377 | ;; body mass index (bmi) and a diabetes indicator.
378 | 
379 | ;; First we load the data and split into train and test sets.
380 | ;;
381 | ^{:nextjournal.clerk/viewer :hide-result}
382 | (def diabetes (datasets/diabetes-ds))
383 | 
384 | ^{:nextjournal.clerk/viewer :hide-result}
385 | (def diabetes-train
386 |   (ds/head diabetes 422))
387 | 
388 | ^{:nextjournal.clerk/viewer :hide-result}
389 | (def diabetes-test
390 |   (ds/tail diabetes 20))
391 | 
392 | 
393 | 
394 | ;; Next we create the pipeline, converting the target variable to
395 | ;; a float value, as needed by the model.
396 | 
397 | (def ols-pipe-fn
398 |   (ml/pipeline
399 |    (mm/select-columns [:bmi :disease-progression])
400 |    (mm/convert-types :disease-progression :float32)
401 |    (mm/set-inference-target :disease-progression)
402 |    {:metamorph/id :model} (mm/model {:model-type :smile.regression/ordinary-least-square})))
403 | 
404 | ;; We can then fit the model, by running the pipeline in mode :fit
405 | 
406 | (def fitted
407 |   (ml/fit diabetes-train ols-pipe-fn))
408 | 
409 | 
410 | ;; Next we run the pipe-fn in :transform and extract the prediction
411 | ;; for the disease progression:
412 | (def diabetes-test-prediction
413 |   (-> diabetes-test
414 |       (ml/transform-pipe ols-pipe-fn fitted)
415 |       :metamorph/data
416 |       :disease-progression))
417 | 
418 | ;; The truth is available in the test dataset.
419 | (def diabetes-test-trueth
420 |   (-> diabetes-test
421 |       :disease-progression))
422 | 
423 | 
424 | 
425 | 
426 | ;; The smile Java object of the LinearModel is in the pipeline as well:
427 | 
428 | (def model-instance
429 |   (-> fitted :model  (ml/thaw-model)))
430 | 
431 | ;; This object contains all information regarding the model fit
432 | ;; such as coefficients and formula:
433 | (-> model-instance .coefficients seq)
434 | (-> model-instance .formula str)
435 | 
436 | ;; Smile generates as well a String with the result of the linear
437 | ;; regression as part of the toString() method of class LinearModel:
438 | 
439 | (clerk/code
440 |  (str model-instance))
441 | 
442 | 
443 | 
444 | ;; This tells us that there is a statistically significant
445 | ;; (positive) correlation between the bmi and the diabetes
446 | ;; disease progression in this data.
447 | 
448 | 
449 | ;; At the end we can plot the truth and the prediction on the test data,
450 | ;; and observe the linear nature of the model.
451 | 
452 | (clerk/vl
453 |  {:layer [
454 |           {:data {:values (map #(hash-map :disease-progression %1 :bmi %2 :type :truth)
455 |                                diabetes-test-trueth
456 |                                (:bmi  diabetes-test))}
457 | 
458 |            :width 500
459 |            :height 500
460 |            :mark {:type "circle"}
461 |            :encoding {:x {:field :bmi :type "quantitative"}
462 |                       :y {:field :disease-progression :type "quantitative"}
463 |                       :color {:field :type}}}
464 | 
465 |           {:data {:values (map #(hash-map :disease-progression %1 :bmi %2 :type :prediction)
466 |                                diabetes-test-prediction
467 |                                (:bmi diabetes-test))}
468 | 
469 |            :width 500
470 |            :height 500
471 |            :mark {:type "line"}
472 |            :encoding {:x {:field :bmi :type "quantitative"}
473 |                       :y {:field :disease-progression :type "quantitative"}
474 |                       :color {:field :type}}}]})
475 | 
476 | 
477 | ^{:nextjournal.clerk/visibility {:code :hide}}
478 | (clerk/html (utils-clerk/render-key-info ":smile.regression/random-forest"))
479 | 
480 | ^{:nextjournal.clerk/visibility {:code :hide}}
481 | (clerk/html (utils-clerk/render-key-info ":smile.regression/ridge"))
482 | 
483 | 
484 | ;; ## Xgboost model
485 | ^{:nextjournal.clerk/visibility {:code :hide}}
486 | (clerk/html (utils-clerk/render-key-info ":xgboost"))
487 | 
488 | ;; ## Fastmath clustering
489 | ^{:nextjournal.clerk/visibility {:code :hide}}
490 | (clerk/html (utils-clerk/render-key-info :fastmath.cluster))
491 | 
492 | ;; ## Smile projections
493 | ^{:nextjournal.clerk/visibility {:code :hide}}
494 | (clerk/html (utils-clerk/render-key-info :smile.projections))
495 | 
496 | ;; ## Smile manifold
497 | ^{:nextjournal.clerk/visibility {:code :hide}}
498 | (clerk/html (utils-clerk/render-key-info :smile.manifold))
499 | 
500 | 
501 | ;; # Compare decision surfaces of different models
502 | 
503 | ;; In the following we see the decision surfaces of some models on the
504 | ;; same data from the Iris dataset using 2 columns :sepal_width and sepal_length:
505 | ^{:nextjournal.clerk/visibility {:code :hide}}
506 | (mapv #(clerk/vl (utils/surface-plot iris-std [:sepal_length :sepal_width] (make-iris-pipeline %) (:model-type %)))
507 |      [
508 |       {:model-type :smile.classification/ada-boost}
509 |       {:model-type :smile.classification/decision-tree}
510 |       {:model-type :smile.classification/gradient-tree-boost}
511 |       {:model-type :smile.classification/knn}
512 |       {:model-type :smile.classification/logistic-regression}
513 |       {:model-type :smile.classification/random-forest}
514 |       {:model-type :smile.classification/linear-discriminant-analysis}
515 |       {:model-type :smile.classification/regularized-discriminant-analysis}
516 |       {:model-type :smile.classification/quadratic-discriminant-analysis}
517 |       {:model-type :xgboost/classification}])
518 | 
519 | 
520 | 
521 | ;; This shows nicely that different model types have different capabilities to
522 | ;; separate and therefore classify data.
523 | 
524 | 
525 | ;; ## Ensembles
526 | 
527 | ;; An ensemble combines several pipelines and their predictions
528 | ;; and calculates a common prediction.
529 | ;; `scicloj.ml` allows creating an ensemble where each model gives a vote,
530 | ;; and the majority becomes the final prediction.
531 | ;;
532 | 
533 | 
534 | ;;  First we make three pipelines, which only differ in the model type.
535 | ;;  The pipelines could be completely different, but need to accept the same input data and
536 | ;;  produce the same predictions (target column name and type).
537 | ;;
538 | 
539 | 
540 | (defn make-iris-pipeline-ensemble
541 |   "Returns a pipeline function which selects the columns :species,
542 |   :sepal_length and :sepal_width, sets :species as inference target,
543 |   converts it to numbers and runs a model of the given `model-type`.
544 |   The model step is registered under the id :model."
545 |   [model-type]
546 |   (ml/pipeline
547 |    (mm/select-columns [:species :sepal_length :sepal_width])
548 |    (mm/set-inference-target :species)
549 |    (mm/categorical->number [:species])
550 |    {:metamorph/id :model}
551 |    (mm/model
552 |     {:model-type model-type})))
553 | 
554 | 
555 | 
556 | 
557 | 
558 | (def tree-pipeline
559 |   (make-iris-pipeline-ensemble :smile.classification/decision-tree))
560 |  
561 | 
562 | (def knn-pipeline
563 |   (make-iris-pipeline-ensemble :smile.classification/knn))
564 |   
565 | 
566 | (def logistic-regression-pipeline
567 |   (make-iris-pipeline-ensemble :smile.classification/logistic-regression))
568 |   
569 | 
570 | ;;  Now we can construct an ensemble, using the function `ensemble-pipe`
571 | 
572 | (def ensemble (ml/ensemble-pipe [tree-pipeline
573 |                                  knn-pipeline
574 |                                  logistic-regression-pipeline]))
575 | 
576 | ;;  This ensemble is as any other metamorph pipeline,
577 | ;;  so we can train and predict as usual:
578 | 
579 | 
580 | (def fitted-ctx-ensemble
581 |   (ml/fit-pipe iris-std ensemble))
582 | 
583 | 
584 | (def transformed-ctx-ensemble
585 |   (ml/transform-pipe iris-std ensemble fitted-ctx-ensemble))
586 | 
587 | 
588 | ;;  Frequency of predictions
589 | 
590 | 
591 | (->
592 |  transformed-ctx-ensemble
593 |  :metamorph/data
594 |  (ds/reverse-map-categorical-xforms)
595 |  :species
596 |  frequencies)
597 | 
598 | ;;  The surface plot of the ensemble
599 | 
600 | 
601 | (clerk/vl (utils/surface-plot iris-std
602 |                         [:sepal_length :sepal_width]
603 |                         ensemble "voting ensemble"))
604 | 


--------------------------------------------------------------------------------
/src/scicloj/ml/nested_cv.clj:
--------------------------------------------------------------------------------
 1 | (ns scicloj.ml.nested-cv
 2 |   (:require [tablecloth.api :as tc]
 3 |             [scicloj.metamorph.ml :as ml]
 4 |             [scicloj.metamorph.ml.classification :as clf]
 5 |             [tech.v3.datatype :as dt]))
 6 | 
 7 | 
 8 | (defn nested-cv
 9 |   "Nested cross-validation (see https://www.youtube.com/watch?v=DuDtXtKNpZs).
10 |   For each of the `outer-k` folds of `data`, `pipelines` are evaluated on
11 |   `inner-k` folds cut from the outer train split only; the first-ranked
12 |   result is then scored with `metric-fn` on the held-out outer test split.
13 |   Returns a lazy seq with one map {:pipe-fn :fit-ctx :metric} per outer fold.
14 |   NOTE: the target column is hard-coded to :survived."
15 |   [data pipelines metric-fn loss-or-accuracy outer-k inner-k]
16 |   (let [k-folds (tc/split->seq data :kfold {:k outer-k})]
17 |     (for [{train :train test :test} k-folds]
18 |       ;; the inner folds must come from `train`: selecting models on `test`
19 |       ;; would leak the hold-out data into model selection
20 |       (let [inner-k-fold (tc/split->seq train :kfold {:k inner-k})
21 |             evaluation (ml/evaluate-pipelines pipelines inner-k-fold metric-fn loss-or-accuracy)
22 |             {fit-ctx :fit-ctx best-pipe-fn :pipe-fn} (-> evaluation first first)
23 |             transform-ctx (best-pipe-fn
24 |                            (merge fit-ctx
25 |                                   {:metamorph/data test :metamorph/mode :transform}))
26 |             metric (metric-fn
27 |                     (-> transform-ctx :model :scicloj.metamorph.ml/target-ds :survived dt/->vector)
28 |                     (-> transform-ctx :metamorph/data :survived dt/->vector))]
29 |         {:pipe-fn best-pipe-fn :fit-ctx fit-ctx :metric metric}))))
30 | 


--------------------------------------------------------------------------------
/src/scicloj/ml/polyglot_kmeans.clj:
--------------------------------------------------------------------------------
  1 | (ns scicloj.ml.polyglot-kmeans
  2 |   (:require
  3 |    [scicloj.sklearn-clj.metamorph]
  4 |    [nextjournal.clerk :as clerk]
  5 |    [libpython-clj2.require :refer [require-python]]
  6 |    [libpython-clj2.python :as py :refer [py.- py.]]))
  7 | 
  8 | (comment
  9 |   (clerk/serve! {:browse? true}) ; the option key is :browse?, not :browser
 10 |   (clerk/build-static-app! {:paths ["src/scicloj/ml/polyglot_kmeans.clj"]
 11 |                             :bundle? false})
 12 |   (clerk/clear-cache!))
 13 | 
 14 | ^{::clerk/visibility #{:hide}}
 15 | (clerk/code
 16 |  "
 17 | from sklearn.datasets import make_blobs
 18 | from sklearn.cluster import KMeans
 19 | from sklearn.preprocessing import StandardScaler
 20 | 
 21 | features, true_labels = make_blobs(
 22 |     n_samples=200,
 23 |     centers=3,
 24 |     cluster_std=2.75,
 25 |     random_state=42
 26 | )
 27 | 
 28 | scaler = StandardScaler()
 29 | scaled_features = scaler.fit_transform(features)
 30 | 
 31 | kmeans = KMeans(
 32 |     init=\"random\",
 33 |     n_clusters=3,
 34 |     n_init=10,
 35 |     max_iter=300,
 36 |     random_state=42)
 37 | 
 38 | kmeans.fit(scaled_features)
 39 | 
 40 | kmeans.inertia_
 41 | ")
 42 | 
 43 | 
 44 | 
 45 | ;; # 1. Use libpython-clj
 46 | ;; This is using the same python classes as above
 47 | ;; So it is "the same code"
 48 | ;;
 49 | (require-python '[sklearn.datasets :refer [make_blobs]]
 50 |                 '[sklearn.preprocessing :refer [StandardScaler]]
 51 |                 '[sklearn.cluster :refer [KMeans]])
 52 | 
 53 | 
 54 | 
 55 | (def blobs
 56 |   (make_blobs :n_samples 200
 57 |               :n_features 50
 58 |               :centers 3
 59 |               :cluster_std 2.75
 60 |               :random_state 42))
 61 | 
 62 | (def scaler (StandardScaler))
 63 | (def features (first blobs))
 64 | (def scaled-features (py. scaler fit_transform features))
 65 | (def k-means (KMeans
 66 |               :init "random"
 67 |               :n_clusters 3
 68 |               :n_init 10
 69 |               :max_iter 300
 70 |               :random_state 42))
 71 | (py. k-means fit scaled-features)
 72 | (py.- k-means inertia_)
 73 | 
 74 | (println :python
 75 |  (py.- k-means inertia_))
 76 | 
 77 | 
 78 | ;; # 2. use sklearn-clj
 79 | ;; This library allows using all estimators/models from sklearn.
 80 | ;; It uses libpython-clj, but "hidden" behind sklearn-clj
 81 | ;;
 82 | 
 83 | (require '[scicloj.ml.sklearnclj])
 84 | (require '[scicloj.ml.dataset :as ds]
 85 |          '[scicloj.ml.metamorph :as mm]
 86 |          '[scicloj.ml.core :as ml]
 87 |          '[scicloj.sklearn-clj.metamorph :as sklearn-clj])
 88 | 
 89 | 
 90 | (def data (-> blobs first py/->jvm ds/dataset))
 91 | 
 92 | (def fitted-ctx-1
 93 |   (ml/fit
 94 |    data
 95 |    (mm/std-scale  :all {})
 96 |    {:metamorph/id :k-means}
 97 |    (sklearn-clj/estimate
 98 |     :sklearn.cluster "KMeans"
 99 |     {:init "random"
100 |      :n_clusters 3
101 |      :n_init 10
102 |      :max_iter 300
103 |      :random_state 42})))
104 | (-> fitted-ctx-1 :k-means :attributes :inertia_)
105 | 
106 | 
107 | ;; # 3. use Clojure only pipeline
108 | ;;  So no python interop in use
109 | ;;  It uses clustering algorithms from JVM library Smile
110 | 
111 | (require '[scicloj.ml.smile.clustering :as clustering])
112 | 
113 | (def fitted-ctx-2
114 |   (ml/fit
115 |    data
116 |    (mm/std-scale  :all {})
117 |    {:metamorph/id :k-means}
118 |    (scicloj.ml.smile.clustering/cluster
119 |     :k-means
120 |     [3 300]
121 |     :cluster)))
122 | 
123 | (-> fitted-ctx-2 :k-means  :info :distortion)
124 | 
125 | 
126 | ;; # 4. use declarative Clojure only pipeline
127 | ;; same as 3), only using metamorph declarative pipelines
128 | 
129 | 
130 | 
131 | (def decl-pipe
132 |   [[:mm/std-scale :all {}]
133 |    {:metamorph/id :k-means}
134 |    [:scicloj.ml.smile.clustering/cluster
135 |     :k-means
136 |     [3 300]
137 |     :cluster]])
138 | 
139 | (def distortion-1
140 |   (->> decl-pipe
141 |        ml/->pipeline
142 |        (ml/fit-pipe data)
143 |        :k-means
144 |        :info
145 |        :distortion))
146 | 
147 | 
148 | (frequencies
149 |  (repeatedly 1000 (fn []
150 |                     (->> decl-pipe
151 |                          ml/->pipeline
152 |                          (ml/fit-pipe data)
153 |                          :k-means
154 |                          :info
155 |                          :distortion))))
156 | 
157 | 
158 | 
159 | 
160 | ;; # 5. in one threading macro, no variables declared
161 | ;; same as 4., but written more compact
162 | 
163 | (def distortion-2
164 |   (->> [[:mm/std-scale :all {}]
165 |         {:metamorph/id :k-means}
166 |         [:scicloj.ml.smile.clustering/cluster
167 |          :k-means
168 |          [3 300]
169 |          :cluster]]
170 |        ml/->pipeline
171 |        (ml/fit-pipe data)
172 |        :k-means
173 |        :info ; the info map alone is not the distortion, so pick the value out
174 |        :distortion))


--------------------------------------------------------------------------------
/src/scicloj/ml/sklearnclj.clj:
--------------------------------------------------------------------------------
  1 | (ns scicloj.ml.sklearnclj
  2 |   (:require
  3 |    [notespace.api :as note]
  4 |    [notespace.kinds :as kind]
  5 |    [scicloj.sklearn-clj.ml]
  6 |    [scicloj.ml.ug-utils]))
  7 |    
  8 | 
  9 | 
 10 | 
 11 | (comment
 12 |   (note/init-with-browser)
 13 |   (note/eval-this-notespace)
 14 |   (note/reread-this-notespace)
 15 |   (note/render-static-html "docs/userguide-sklearnclj.html")
 16 |   (note/init))
 17 | 
 18 |   
 19 | ["# sklearn-clj"]
 20 | 
 21 | ["The [scicloj.ml](https://github.com/scicloj/scicloj.ml) plugin [sklearn-clj](https://github.com/scicloj/sklearn-clj)
 22 |  gives easy access to all models from [scikit-learn](https://scikit-learn.org/stable/)"]
 23 | 
 24 | ["After [libpython.clj](https://github.com/clj-python/libpython-clj)
 25 |  has been setup with the python package sklearn installed,
 26 | the following lines show how to use any sklearn model in a usual `scicloj.ml` pipeline:"]
 27 | 
 28 | (require '[scicloj.ml.core :as ml]
 29 |          '[scicloj.ml.metamorph :as mm]
 30 |          '[scicloj.ml.dataset :as ds]
 31 |          '[tech.v3.dataset.tensor :as dst]
 32 |          '[scicloj.sklearn-clj :as sklearn-clj]
 33 |          '[scicloj.sklearn-clj.ml]
 34 |          '[scicloj.metamorph.ml.toydata :as toydata]
 35 |          '[libpython-clj2.python :refer [py.-] :as py])
 36 |          
 37 | 
 38 | ["Example: logistic regression"]
 39 | 
 40 | (def ds (dst/tensor->dataset [[0 0 0 ] [1 1 1 ] [2 2 2]]))
 41 | 
 42 | ["Make pipe with sklearn model 'logistic-regression'"]
 43 | (def pipe
 44 |   (ml/pipeline
 45 |    (mm/set-inference-target 2)
 46 |    {:metamorph/id :model}
 47 |    (mm/model {:model-type :sklearn.classification/logistic-regression
 48 |               :max-iter 100})))
 49 | 
 50 | 
 51 | ["Train model"]
 52 | (def fitted-ctx
 53 |   (pipe {:metamorph/data ds
 54 |          :metamorph/mode :fit}))
 55 | 
 56 | ["Predict on new data"]
 57 | (->
 58 |  (ml/transform-pipe
 59 |   (dst/tensor->dataset [[3 4 5]])
 60 |   pipe
 61 |   fitted-ctx)
 62 |  :metamorph/data)
 63 | 
 64 | ["Access model details via python interop (libpython-clj)"]
 65 | (-> fitted-ctx :model :model-data :model
 66 |     (py.- coef_)
 67 |     (py/->jvm))
 68 | 
 69 | 
 70 | 
 71 | 
 72 | 
 73 | ["All model attributes are as well in the context"]
 74 | 
 75 | (def model-attributes
 76 |   (-> fitted-ctx :model :model-data :attributes))
 77 | 
 78 | ^kind/hiccup-nocode
 79 | [:dl (map
 80 |       (fn [[k v]]
 81 |         [:span
 82 |          (vector :dt k)
 83 |          (vector :dd  (clojure.pprint/write v :stream nil))])
 84 |       model-attributes)]
 85 | 
 86 | 
 87 | 
 88 | ["# Models"]
 89 | 
["Below all models are listed with their parameters and the original documentation.

The parameters are given as Clojure keys in kebab-case. As the document texts are imported from Python,
they refer to the Python spelling of the parameter. But the translation between the two should be obvious."]
 94 | 
 95 | ^kind/hiccup-nocode
 96 | [:ul
 97 | 
 98 | 
 99 |  (->>
100 |   (ml/model-definition-names)
101 |   (filter #(contains? #{"sklearn.classification"
102 |                         "sklearn.regression"}
103 | 
104 |                      (namespace %)))
105 |   sort
106 |   (map
107 |    #(vector :li [:a {:href (str "#" (str %))} (str %)])))]
108 |   
109 | 
110 | 
111 | 
112 | ["## Sklearn classification"]
113 | ^kind/hiccup-nocode
114 | (scicloj.ml.ug-utils/render-key-info ":sklearn.classification")
115 | 
116 | 
117 | ["## Sklearn regression"]
118 | ^kind/hiccup-nocode
119 | (scicloj.ml.ug-utils/render-key-info ":sklearn.regression")
120 | 


--------------------------------------------------------------------------------
/src/scicloj/ml/third_party.clj:
--------------------------------------------------------------------------------
  1 | (ns scicloj.ml.third-party
  2 |  (:require [notespace.api :as note]
  3 |            [notespace.kinds :as kind]
  4 |            [scicloj.ml.ug-utils :refer :all]
  5 |            [dk.simongray.datalinguist.ml.crf]
  6 |            [scicloj.ml.clj-djl.mmml]
  7 |            [scicloj.ml.clj-djl.fasttext]
  8 |            [tech.v3.libs.arrow :as arrow]))
  9 | 
 10 | (comment
 11 |   (note/init-with-browser)
 12 |   (note/eval-this-notespace)
 13 |   (note/reread-this-notespace)
 14 |   (note/render-static-html "docs/userguide-third_party.html")
 15 |   (note/init))
 16 | 
 17 | 
 18 | (require '[scicloj.ml.core :as ml]
 19 |          '[scicloj.ml.metamorph :as mm]
 20 |          '[scicloj.ml.dataset  :as ds]
 21 |          '[tech.v3.datatype.functional :as dfn]
 22 |          '[clojure.tools.namespace.find :as ns-find]
 23 |          '[clojure.java.classpath :as cp]
 24 |          '[scicloj.ml.xgboost]
 25 |          '[camel-snake-kebab.core :as csk])
 26 | 
 27 | 
 28 | 
 29 | 
 30 | 
 31 | 
 32 | ["# xgboost"]
 33 | ["## Example code"]
 34 | 
 35 | (def house-price
 36 |   (->
 37 |    (ds/dataset
 38 |     "http://d2l-data.s3-accelerate.amazonaws.com/kaggle_house_pred_train.csv" {:key-fn csk/->kebab-case-keyword})
 39 |    (ds/replace-missing :type/string "NA")
 40 |    (ds/categorical->number  #(ds/select-columns % :type/string))))
 41 | 
 42 | 
 43 | (def split (first (ds/split->seq house-price :holdout)))
 44 | 
 45 | (def train-ds (:train split))
 46 | (def test-ds (:test split))
 47 | 
 48 | 
 49 | (def pipe-fn
 50 |   (ml/pipeline
 51 |    (mm/replace-missing :type/numerical :value 0)
 52 |    (mm/set-inference-target :sale-price)
 53 |    {:metamorph/id :model} (mm/model {:model-type :xgboost/linear-regression})))
 54 | 
 55 | (def fit-result
 56 |   (let [fitted-ctx
 57 |         (ml/fit-pipe train-ds pipe-fn)
 58 |         test-predictions
 59 |         (ml/transform-pipe test-ds pipe-fn fitted-ctx)
 60 |         error
 61 |         (ml/mae (-> test-predictions  :metamorph/data :sale-price)
 62 |                 (-> test-ds :sale-price))]
 63 |     {:error error
 64 |      :gains (->
 65 |              (ml/explain (-> fitted-ctx :model))
 66 |              (ds/order-by :gain :desc))}))
 67 | 
 68 | 
 69 | 
 70 | ["error:"]
 71 | (:error fit-result)
 72 | 
 73 | ["Feature importance - gain"]
 74 | 
 75 | ^kind/dataset
 76 | (:gains fit-result)
 77 | 
 78 | ["## Reference"]
 79 | 
 80 | ^kind/hiccup-nocode (render-key-info ":xgboost")
 81 | 
 82 | ["# Deep learning models via clj-djl "]
 83 | 
 84 | 
 85 | 
 86 | (def train-ds
 87 |   (ds/dataset
 88 |    "http://d2l-data.s3-accelerate.amazonaws.com/kaggle_house_pred_train.csv"))
 89 | 
 90 | 
 91 | (def test-ds
 92 |   (->
 93 |    (ds/dataset
 94 |     "http://d2l-data.s3-accelerate.amazonaws.com/kaggle_house_pred_test.csv")
 95 |    (ds/add-column "SalePrice" 0)))
 96 | 
 97 | (defn numeric-features [ds]
 98 |   (ds/intersection (ds/numeric ds)
 99 |                    (ds/feature ds)))
100 | 
(defn update-columns
  "Applies `update-fn` to a set of columns of `dataframe`.

  `col-name-seq-or-fn` is either a sequence of column names, or a selector
  function that is called with `dataframe` and whose result's column names
  are used."
  [dataframe col-name-seq-or-fn update-fn]
  (let [column-names (if-not (fn? col-name-seq-or-fn)
                       col-name-seq-or-fn
                       (-> dataframe col-name-seq-or-fn ds/column-names))]
    (ds/update-columns dataframe column-names update-fn)))
109 | 
110 | 
111 | 
112 | 
113 | (require
114 |  '[clj-djl.nn :as nn]
115 |  '[clj-djl.training :as t]
116 |  '[clj-djl.training.loss :as loss]
117 |  '[clj-djl.training.optimizer :as optimizer]
118 |  '[clj-djl.training.tracker :as tracker]
119 |  '[clj-djl.training.listener :as listener]
120 |  '[clj-djl.ndarray :as nd]
121 |  '[clj-djl.nn.parameter :as param])
122 | 
123 | (def  learning-rate 0.05)
124 | (defn net [] (nn/sequential {:blocks (nn/linear {:units 1})
125 |                              :initializer (nn/normal-initializer)
126 |                              :parameter param/weight}))
127 | 
128 | (defn cfg [] (t/training-config {:loss (loss/l2-loss)
129 |                                  :optimizer (optimizer/sgd
130 |                                              {:tracker (tracker/fixed learning-rate)})
131 |                                  :evaluator (t/accuracy)
132 |                                  :listeners (listener/logging)}))
133 | 
134 | 
135 | 
136 | (def pipe
137 |   (ml/pipeline
138 | 
139 |    (mm/drop-columns ["Id"])
140 |    (mm/set-inference-target "SalePrice")
141 |    (mm/replace-missing :type/numerical :value 0)
142 |    (mm/replace-missing :!type/numerical :value "None")
143 |    (ml/lift update-columns numeric-features
144 |             #(dfn// (dfn/- % (dfn/mean %))
145 |                     (dfn/standard-deviation %)))
146 |    (mm/transform-one-hot :!type/numerical :full)
147 |    (mm/update-column "SalePrice"
148 |                      #(dfn// % (dfn/mean %)))
149 | 
150 |    (mm/set-inference-target "SalePrice")
151 | 
152 |    (mm/model {:model-type :clj-djl/djl
153 |               :batchsize 64
154 |               :model-spec {:name "mlp" :block-fn net}
155 |               :model-cfg (cfg)
156 |               :initial-shape (nd/shape 1 311)
157 |               :nepoch 1})))
158 | 
159 | 
160 | 
161 | 
162 | (def trained-pipeline
163 |   (pipe {:metamorph/data train-ds
164 |          :metamorph/mode :fit
165 |          :metamorph.ml/full-ds (ds/concat train-ds test-ds)}))
166 | 
167 | 
168 |          
169 | (def predicted-pipeline
170 |   (pipe
171 |    (merge trained-pipeline
172 |           {:metamorph/data test-ds
173 |            :metamorph/mode :transform})))
174 | 
175 | 
176 | 
177 | 
178 | ( get
179 |  (:metamorph/data predicted-pipeline)
180 |  "SalePrice")
181 | 
182 | 
183 | ^kind/hiccup-nocode
184 | (render-key-info ":clj-djl/djl")
185 | 
186 | 
["# A NER model from Stanford CoreNLP"]
188 | 
189 | ^kind/hiccup-nocode
190 | (render-key-info ":corenlp")
191 | 
192 | 
["# Fasttext text classification from DJL"]
194 | 
195 | ^kind/hiccup-nocode
196 | (render-key-info ":clj-djl/fasttext")
197 | 
198 | (def tweets
199 |   (->
200 |    (ds/dataset "data/tweets_sentiment.csv" {:key-fn keyword})
201 |    (ds/drop-columns [:id])))
202 | ;; (def tweets
203 | ;;   (arrow/stream->dataset "data/tweets_sentiment.feather"))
204 | 
205 | 
206 | ;; (require  '[tech.v3.libs.arrow])
207 | 
208 | 
209 | 
210 | ^kind/dataset
211 | tweets
212 | 
213 | (def split (first (ds/split->seq
214 |                    (ds/shuffle tweets)
215 |                    :holdout)))
216 | 
217 | 
218 | 
219 | (def model
220 |   (ml/train (-> (:train split)
221 |                 (tech.v3.dataset.modelling/set-inference-target :label))
222 |             {:model-type :clj-djl/fasttext
223 |              :ft-training-config {:epoch 1}}))
224 | 
225 | (def
226 |   prob-distribution
227 |   (ml/predict (:test split) (assoc model
228 |                                    :top-k 3)))
229 | prob-distribution
230 | 


--------------------------------------------------------------------------------
/src/scicloj/ml/titanic.clj:
--------------------------------------------------------------------------------
  1 | (ns scicloj.ml.titanic
  2 |   (:require
  3 |    [notespace.api :as note]
  4 |    [notespace.kinds :as kind]))
  5 | 
  6 | (comment
  7 |   (note/init-with-browser)
  8 |   (note/eval-this-notespace)
  9 |   (note/reread-this-notespace)
 10 |   (note/render-static-html "docs/userguide-titanic.html")
 11 | 
 12 |   (note/init))
 13 |   
 14 | 
 15 | 
 16 | (require '[scicloj.ml.dataset :as ds]
 17 |          '[tech.v3.dataset.math :as ds-math]
 18 |          '[tech.v3.datatype.functional :as dfn]
 19 |          '[scicloj.ml.core :as ml]
 20 |          '[scicloj.ml.metamorph :as mm]
 21 |          '[camel-snake-kebab.core :as csk]
 22 |          '[scicloj.metamorph.ml.loss :as loss]
 23 |          '[clojure.string :as str]
 24 |          '[fastmath.stats :as stats]
 25 |          '[fastmath.random :as rnd]
 26 |          '[scicloj.ml.xgboost])
 27 |           
 28 | 
 29 | 
 30 | ["## Introduction "]
 31 | 
 32 | [" In this example, we will train a model which is able to predict the survival of passengers from the Titanic dataset."
 33 |  "In a real analysis, this would contain as well explorative analysis of the data, which I will skip here,
 34 | as the purpose is to showcase machine learning with scicloj.ml, which is about model evaluation and selection."]
 35 |  
 36 | 
 37 | 
 38 | ["### Read data"]
 39 | 
 40 | (def data (ds/dataset "data/titanic/train.csv" {:key-fn csk/->kebab-case-keyword}))
 41 | 
 42 | 
 43 | 
 44 | ["Column info:"]
 45 | (ds/info data)
 46 | 
 47 | 
 48 | ["We can explore the association between the categorical columns of the dataset
 49 | with the :survived using cramers-v-corrected:"]
 50 | (def categorical-feature-columns [:pclass :sex :age :parch
 51 |                                     :embarked])
 52 | (map
 53 |  #(hash-map
 54 |    %
 55 |    (stats/cramers-v-corrected
 56 |     (get  data %)
 57 |     (:survived data)))
 58 |  categorical-feature-columns)
 59 |  
 60 | ["In this dataset, :sex seems to be the best predictor for survival."]
 61 | 
 62 | ["Association between the select variables:"]
 63 | (for [c1 categorical-feature-columns c2 categorical-feature-columns]
 64 |   {[c1 c2]
 65 |    (stats/cramers-v-corrected (get data c1) (get data  c2))})
 66 |   
 67 | 
 68 | ["This shows how much the columns are correlated. "]
 69 | 
 70 | ["## clean some of the features"]
 71 | 
["The following functions will be used in the pipeline. They clean the
features to make them better predictors."]
 74 | 
 75 | (defn categorize-cabin [data]
 76 |   (-> data
 77 |       (ds/add-or-replace-column
 78 |        :cabin
 79 |        (map
 80 |         #(if (empty? %)
 81 |            :unknown
 82 |            (keyword (subs
 83 |                      %
 84 |                      0 1)))
 85 |         (:cabin data)))))
 86 |         
 87 | 
 88 | (defn categorize-age [data]
 89 |   (->
 90 |    data
 91 |    (ds/add-or-replace-column
 92 |     :age-group
 93 |     (map
 94 |      #(cond
 95 |         (< % 10) :child
 96 |         (< % 18) :teen
 97 |         (< % 60) :adult
 98 |         (> % 60) :elderly
 99 |         true :other)
100 |      (:age data)))))
101 | 
102 | ["We want to create a new column :title which might help in the score.
103 | This is an example of custom function, which creates a new column from existing columns,
104 | which is a typical case of feature engineering."]
105 | 
(defn name->title
  "Adds a :title column extracted from the :name column of `dataset` and
  drops :name afterwards.

  The title is the trimmed text between the last comma and the first period
  that precede it, e.g. \"Braund, Mr. Owen Harris\" -> \"Mr\"."
  [dataset]
  (let [extract-title (fn [full-name]
                        (-> full-name
                            (str/split #"\.")
                            first
                            (str/split #"\,")
                            last
                            str/trim))]
    (-> dataset
        (ds/add-or-replace-column
         :title
         ;; Read the names from the `dataset` argument. The previous version
         ;; read them from the global `data` var, so every split or new
         ;; dataset got the titles of the full training file.
         (map extract-title (:name dataset)))
        (ds/drop-columns :name))))
118 | 
119 | (def title-map
120 |   {"Major" :a
121 |    "Col" :a
122 |    "Rev" :a
123 |    "Ms" :b
124 |    "Miss" :b
125 |    "Jonkheer" :a
126 |    "Don" :a
127 |    "Mlle" :b
128 |    "Mr" :a
129 |    "Master" :a
130 |    "Capt" :a
131 |    "Mrs" :b
132 |    "Lady" :b
133 |    "Sir" :a
134 |    "Dr" :a
135 |    "the Countess" :b
136 |    "Mme" :b})
137 | 
(defn categorize-title
  "Replaces each value of the :title column of `data` by its entry in
  `title-map`; titles without an entry become nil."
  [data]
  (let [title-groups (map #(get title-map %) (:title data))]
    (ds/add-or-replace-column data :title title-groups)))
144 | 
145 | ["The final pipeline contains the functions we did before."]
146 | 
147 | 
148 | ;; => _unnamed [2 1]:
149 | ;;    | :a |
150 | ;;    |----|
151 | ;;    |    |
152 | ;;    |    |
153 | 
154 | (def pipeline-fn
155 |   (ml/pipeline
156 |    (mm/replace-missing :embarked :value "S")
157 |    (mm/replace-missing :age :value tech.v3.datatype.functional/mean)
158 |    (mm/update-column :parch str)
159 |    (ml/lift categorize-age)
160 |    (ml/lift name->title)
161 |    (ml/lift categorize-title)
162 |    (ml/lift categorize-cabin)
163 |    (mm/select-columns [:age-group
164 |                        :cabin
165 |                        :embarked
166 |                        :fare
167 |                        :parch
168 |                        :pclass
169 |                        :sex
170 |                        :survived
171 |                        :title])
172 | 
173 |    (fn [ctx]
174 |      (assoc ctx :categorical-ds
175 |             (:metamorph/data ctx)))
176 | 
177 | 
178 |    (mm/categorical->number [:survived :pclass :sex :embarked
179 |                             :title :age-group :cabin :parch] {} :int64)
180 | 
181 |    (mm/set-inference-target :survived)))
182 | 
183 | 
184 | ["Transformed data"]
185 | (->
186 |  (pipeline-fn {:metamorph/data data :metamorph/mode :fit})
187 |  :metamorph/data)
188 | 
189 | 
190 | ["The following splits the dataset in three pieces,
191 |  train, val and test to predict on later.
192 | "]
193 | 
194 | 
195 | 
196 | 
197 | 
198 | (def ds-split (first (ds/split->seq data :holdout {:ratio [0.8 0.2]
199 |                                                    :split-names [:train-val :test]})))
200 |                                     
201 | 
202 | ["Create a sequence of train/test  (k-fold with k=10) splits used to evaluate the pipeline."]
203 | (def train-val-splits
204 |     (ds/split->seq
205 |      (:train-val ds-split)
206 |      :kfold
207 |      {:k 10}))
208 | 
209 | 
210 | 
211 | 
["The full pipeline definition including the random forest model."]
213 | 
214 | (def full-pipeline-fn
215 |   (ml/pipeline
216 |    pipeline-fn
217 |    ;; we overwrite the id, so the model function will store
218 |    ;; it's output (the model) in the pipeline ctx under key :model
219 |    {:metamorph/id :model}
220 |    (mm/model {:model-type :smile.classification/random-forest})))
221 | 
222 | 
223 | 
224 | 
225 | 
226 | ["Evaluate the (single) pipeline function using the train/test split"]
227 | (def evaluations
228 |   (ml/evaluate-pipelines
229 |    [full-pipeline-fn]
230 |    train-val-splits
231 |    ml/classification-accuracy
232 |    :accuracy))
233 | 
234 | 
235 | ["The default k-fold splits makes 10 folds,
236 | so we train 10 models, each having its own loss."]
237 | 
238 | ["The `evaluate-pipelines` fn averages the models per pipe-fn,
239 | and returns the best.
240 | So we get a single model back, as we only have one pipe fn"]
241 | 
242 | ["Often we consider the model with the lowest loss to be the best."]
243 | 
["Returning a single model only (as a list of 1), namely the best over all
 pipeline functions
and all cross validations, is the default behaviour, but can be changed
with the `tune options`."]
248 | 
["They control as well which information is returned."]
250 | 
["`tech.ml` stores the models in the context in a serialized form,
and the function `thaw-model` can be used to get the original model back.
This is a Java class in the case of
 model :smile.classification/random-forest, but this depends on
which `model` function is in the pipeline"]
256 | 
257 | ["We can get for example,  the models like this:"]
258 | 
;; One map per evaluation result, holding the thawed model, its test metric
;; and the fitted context, ordered with the highest metric first.
(def models
  (->> evaluations
       flatten
       (map
        #(hash-map :model (ml/thaw-model (get-in % [:fit-ctx :model]))
                   :metric (-> % :test-transform :metric)
                   :fit-ctx (:fit-ctx %)))
       ;; Sort on :metric. The maps built above have no :mean key, so the
       ;; previous `(sort-by :mean)` compared nil with nil and never
       ;; reordered anything.
       (sort-by :metric)
       reverse))
269 | 
270 | 
271 | ["The accuracy of the best trained model is:"]
272 | (-> models first :metric)
273 | 
274 | ["The one with the highest accuracy is then:"]
275 | (-> models first :model)
276 | 
277 | 
278 | ["We can get the predictions on new-data, which for classification contain as well
279 | the posterior probabilities per class."]
280 | 
281 | ["We do this by running the pipeline again, this time with new data and merging
282 | :mode transform"]
283 | 
284 | (def predictions
285 |   (->
286 |    (full-pipeline-fn
287 |     (assoc
288 |      (:fit-ctx (first models))
289 |      :metamorph/data (:test ds-split)
290 |      :metamorph/mode :transform))
291 |    :metamorph/data))
292 | 
293 | ^kind/dataset
294 | predictions
295 | 
296 | 
297 | ["Out of the predictions and the truth, we can construct the
298 |  confusion matrix."]
299 | 
;; True labels of the held-out test split.
;; Only the preprocessing pipeline is run here: the model step of
;; `full-pipeline-fn` does not change :metamorph/data in :fit mode, so using
;; it would train a throw-away random forest on the test split for nothing.
(def trueth
  (->
   (pipeline-fn {:metamorph/data (:test ds-split) :metamorph/mode :fit})
   :metamorph/data
   tech.v3.dataset.modelling/labels))
305 | 
306 | ^kind/dataset
307 | (->
308 |  (ml/confusion-map (:survived predictions)
309 |                    (:survived trueth)
310 |                    :none)
311 |  (ml/confusion-map->ds))
312 | 
313 | ["### Hyper parameter tuning"]
314 | 
["This defines a pipeline with options. The options get passed to the model function,
so they become hyper-parameters of the model.

The `use-age?` option is used to make a conditional pipeline. As the use-age? variable becomes part of the grid to search in,
we tune it as well.
This is an example of how pipeline options can be grid searched in the same way as hyper-parameters of the model.

"]
(defn make-pipeline-fn
  "Builds a pipeline of `pipeline-fn` followed by a random-forest model step
  stored under the id :model. `options` are passed to the model; its
  :model-type is always forced to :smile.classification/random-forest."
  [options]
  (let [model-options (assoc options
                             :model-type :smile.classification/random-forest)]
    (ml/pipeline
     pipeline-fn
     {:metamorph/id :model}
     (mm/model model-options))))
331 | 
["Use sobol optimization, to find some grid points,
which cover in a smart way the hyper-parameter space."]
334 | 
335 | (def search-grid
336 |   (->>
337 |    (ml/sobol-gridsearch {:trees (ml/linear 100 500 10)
338 |                          :mtry (ml/categorical [0 2 4])
339 |                          :split-rule (ml/categorical [:gini :entropy])
340 |                          :max-depth (ml/linear 1 50 10)
341 |                          :node-size (ml/linear 1 10 10)})
342 | 
343 |    (take 500)))
344 |   
345 | 
346 | ["Generate the pipeline-fns we want to evaluate."]
347 | (def pipeline-fns (map make-pipeline-fn search-grid))
348 | 
(defn xgboost-pipe
  "Builds a pipeline of `pipeline-fn` followed by an xgboost classification
  model step stored under the id :model. `opts` are passed to the model; its
  :model-type is always forced to :xgboost/classification."
  [opts]
  (let [model-options (assoc opts :model-type :xgboost/classification)]
    (ml/pipeline
     pipeline-fn
     {:metamorph/id :model}
     (mm/model model-options))))
356 | 
357 | (def xgboost-pipes
358 |   (->>
359 |    (ml/sobol-gridsearch
360 |     (ml/hyperparameters :xgboost/classification))
361 |    (take 500)
362 |    (map xgboost-pipe)))
363 | 
364 | 
365 | ;; (ml/fit-pipe (:train (first train-val-splits)) xgboost-pipe)
366 | 
367 | ["Evaluate all  pipelines and keep results"]
368 | (def evaluations
369 | 
370 |   (ml/evaluate-pipelines
371 |    (take 10
372 |          (concat xgboost-pipes xgboost-pipes))
373 |    train-val-splits
374 |    ml/classification-accuracy
375 |    :accuracy
376 |    {:return-best-pipeline-only false
377 |     :return-best-crossvalidation-only false
378 |     ;; :evaluation-handler-fn (fn [m]
379 |     ;;                          (println (:metric m)))
380 | 
381 | 
382 |     :map-fn :map}))
383 |     
384 |     
385 | 
386 | 
387 | 
388 | 
389 | 
390 | ["Get the key information from the evaluations and sort by the metric function used,
391 |  accuracy here."]
392 | 
;; Key information of every evaluation result plus the thawed model,
;; ordered by the test metric with the highest value first.
(def models
  (->> (flatten evaluations)
       (map (fn [evaluation]
              (-> evaluation
                  (select-keys [:test-transform :fit-ctx :pipe-fn])
                  (assoc :model
                         (ml/thaw-model
                          (get-in evaluation [:fit-ctx :model]))))))
       (sort-by #(get-in % [:test-transform :metric]))
       reverse))
403 | 
404 | 
405 | 
406 | 
407 | ["As we did several pipelines and several x-fold cross validation, we have quite some models trained in total "]
408 | (count models)
409 | 
410 | ["As we sorted by mean accuracy, the first evaluation result is the best model,"]
411 | (def best-model (first models))
412 | 
413 | ["which is: "]
414 | (:model best-model)
415 | 
416 | ["with a mean accuracy of "  (-> best-model :test-transform :mean)]
417 | ["and a accuracy of "  (-> best-model :test-transform :metric)]
418 | 
419 | 
420 | (println "mean acc: " (-> best-model :test-transform :mean))
421 | (println "acc: " (-> best-model :test-transform :metric))
422 | 
423 | 
424 | ["using options: "]
425 | (-> best-model :fit-ctx :model :options)
426 | (clojure.pprint/pprint (-> best-model :fit-ctx :model :options))
427 | 
428 | (def test-data (ds/dataset "data/titanic/test.csv"
429 |                            {:key-fn csk/->kebab-case-keyword}))
430 | 
431 | 
432 | 
433 | (def predition-on-test
434 |   (full-pipeline-fn
435 |    (assoc (:fit-ctx best-model)
436 |           :metamorph/data (ds/add-column test-data :survived nil)
437 |           :metamorph/mode :transform)))
438 | 
439 | 
440 | (def prediction-ds
441 |   (->
442 |    (predition-on-test :metamorph/data)
443 |    (ds/add-column :passenger-id (:passenger-id test-data))
444 |    (ds/convert-types [:survived] :int)
445 |    (ds/select-columns [:passenger-id :survived 0 1])))
446 | 
447 | ^kind/dataset
448 | prediction-ds
449 | 
450 | 
451 | 
452 | 
453 | 
["# Create submission file for Kaggle"]
455 | 
456 | (def submission-ds
457 |   (-> prediction-ds
458 |       (ds/select-columns [:passenger-id :survived])
459 |       (ds/rename-columns {:passenger-id "PassengerId"
460 |                           :survived "Survived"})))
461 | 
462 | (ds/write-csv! submission-ds "submission.csv")
463 | 
464 | 
465 | ["### Learning curve"]
466 | 
467 | 
468 | 
;; Splits for the learning curve: growing heads of the train-val data
;; (5, 15, 25, ... rows) as :train, always the same held-out :test set.
(def training-curve-splits
  (let [train-val (:train-val ds-split)
        held-out  (:test ds-split)]
    (for [n-rows (range 5 (ds/row-count train-val) 10)]
      {:train (ds/head train-val n-rows)
       :test held-out})))
474 | 
475 | 
476 | 
477 | (def training-curve-evaluations
478 |   (ml/evaluate-pipelines [(:pipe-fn (first models))]
479 |                          training-curve-splits
480 |                          ml/classification-accuracy
481 |                          :accuracy
482 |                          {:map-fn :map
483 |                           :return-best-pipeline-only false
484 |                           :return-best-crossvalidation-only false
485 |                           :evaluation-handler-fn identity}))
486 |                           
;; Row count of the data in each learning-curve evaluation's fitted context.
(def train-counts
  (map #(ds/row-count (get-in % [:fit-ctx :metamorph/data]))
       (flatten training-curve-evaluations)))

;; Metric on the test data for each learning-curve evaluation.
(def test-metrices
  (map #(get-in % [:test-transform :metric])
       (flatten training-curve-evaluations)))

;; Metric on the training data for each learning-curve evaluation.
(def train-metrices
  (map #(get-in % [:train-transform :metric])
       (flatten training-curve-evaluations)))
497 | 
;; Long-format plot data: per training size one :test and one :train point,
;; ordered by :metric descending.
(def traing-curve-plot-data
  (->> (mapcat (fn [n-rows test-metric train-metric]
                 [{:count n-rows :metric test-metric :type :test}
                  {:count n-rows :metric train-metric :type :train}])
               train-counts
               test-metrices
               train-metrices)
       (sort-by :metric)
       reverse))
508 | 
509 | 
510 | ^kind/vega
511 | {
512 |  :data {:values traing-curve-plot-data}
513 | 
514 |  :width 500
515 |  :height 500
516 |  :mark {:type "line"}
517 |  :encoding {:x {:field :count :type "quantitative"}
518 |             :y {:field :metric :type "quantitative"}
519 |             :color {:field :type}}}
520 | 
521 | 
522 | 
523 | 
524 | 
525 | (comment
526 |   (->>
527 |    (map
528 |     #(hash-map :test-metric %1
529 |                :train-metric %2
530 |                :better? (if (> %1 %2) :test :train))
531 |     (->> training-curve-evaluations flatten (map :metric))
532 |     (->> training-curve-evaluations flatten (map #(get-in % [:train-prediction :metric]))))
533 |    (map :better?)
534 |    frequencies)
535 | 
536 |   (println
537 |    (-> (ds/dataset {:x ["A" "B" "C" "D" "E" "F"] :y (range)})
538 |        (ds/categorical->one-hot [:x] {} :int)
539 |        (ds/set-inference-target :y)
540 |        (scicloj.metamorph.ml/train {:model-type :smile.regression/ordinary-least-square})
541 |        ml/thaw-model)))
542 | 
543 |   
544 | 


--------------------------------------------------------------------------------
/src/scicloj/ml/transformers.clj:
--------------------------------------------------------------------------------
  1 | (ns scicloj.ml.transformers
  2 |   (:require
  3 |    [notespace.api :as note]
  4 |    [notespace.kinds :as kind]
  5 |    [scicloj.ml.metamorph :as mm]))
  6 |    
  7 |   
  8 | 
  9 | (comment
 10 |   (note/init-with-browser)
 11 |   (note/eval-this-notespace)
 12 |   (note/render-static-html "docs/userguide-transformers.html"))
 13 |   
 14 | 
 15 | (require '[scicloj.ml.core :as ml]
 16 |          '[scicloj.ml.dataset :as ds]
 17 |          '[scicloj.ml.metamorph :as mm])
 18 | 
 19 | 
 20 | 
 21 | ^kind/hidden
 22 | (defn docu-fn [v]
 23 |   (let [m (meta v)]
 24 |     (kind/override
 25 |      [
 26 |       (str  "## Transformer " "**" (:name m) "**")
 27 |       "----------------------------------------------------------"
 28 |       "__Clojure doc__:\n"
 29 |       (:doc m)
 30 |       "----------------------------------------------------------"]
 31 |       
 32 |      kind/md-nocode)))
 33 |      
 34 | 
 35 | 
 36 | 
 37 | (docu-fn (var mm/count-vectorize))
 38 | 
 39 | ["In the following we transform the text given in a dataset into a
 40 |  map of token counts applying some default text normalization."]
 41 | (def data (ds/dataset {:text ["Hello Clojure world, hello ML word !"
 42 |                               "ML with Clojure is fun"]}))
 43 |                               
 44 | 
 45 | ^kind/dataset-grid
 46 | data
 47 | 
 48 | ["_"]
 49 | 
 50 | (def fitted-ctx
 51 |   (ml/fit data
 52 |           (mm/count-vectorize :text :bow)))
 53 | 
 54 | 
 55 | 
 56 | fitted-ctx
 57 | 
 58 | (def bow-ds
 59 |  (:metamorph/data fitted-ctx))
 60 | 
 61 | ^kind/dataset
 62 | bow-ds
 63 | 
 64 | 
 65 | ["A custom tokenizer can be specified by either passing options to
 66 | `scicloj.ml.smile.nlp/default-tokenize` "]
 67 | 
 68 | 
 69 | (def fitted-ctx
 70 |   (ml/fit
 71 |    data
 72 |    (mm/count-vectorize :text :bow {:stopwords ["clojure"]
 73 |                                    :stemmer :none})))
 74 |                                    
 75 | 
 76 | fitted-ctx
 77 | 
 78 | ["or passing in a implementation of a tokenizer function"]
 79 | 
 80 | (def fitted-ctx
 81 |   (ml/fit
 82 |    data
 83 |    (mm/count-vectorize
 84 |     :text :bow
 85 |     {:text->bow-fn (fn [text options]
 86 |                      {:a 1 :b 2})})))
 87 |                       
 88 | fitted-ctx
 89 | 
 90 | 
 91 | 
 92 | (docu-fn (var mm/bow->SparseArray))
 93 | ["Now we convert the bag-of-words map to a sparse array of class
 94 |  `smile.util.SparseArray`
 95 | 
 96 | "]
 97 | (def ctx-sparse
 98 |   (ml/fit
 99 |    bow-ds
100 |    (mm/bow->SparseArray :bow :sparse)))
101 | 
102 | ctx-sparse
103 | 
104 | 
105 | ^kind/dataset
106 | (:metamorph/data ctx-sparse)
107 | 
108 | ["The SparseArray instances look like this:"]
109 | (zipmap
110 |  (:text bow-ds)
111 |  (map seq
112 |       (-> ctx-sparse :metamorph/data :sparse)))
113 | 
114 | (docu-fn (var mm/bow->sparse-array))
115 | ["Now we convert the bag-of-words map to a sparse array of class
116 |  `java primitive int array`
117 | "]
118 | (def ctx-sparse
119 |   (ml/fit
120 |    bow-ds
121 |    (mm/bow->sparse-array :bow :sparse)))
122 | 
123 | ctx-sparse
124 | 
125 | ["We see as well the sparse representation as indices against the vocabulary
126 | of the non-zero counts."]
127 | 
128 | (zipmap
129 |  (:text bow-ds)
130 |  (map seq
131 |       (-> ctx-sparse :metamorph/data :sparse)))
132 | 
133 | 
134 | 
135 | 
["In both ->sparse functions we can control the vocabulary via
the option to pass in a different / custom function which creates
the vocabulary from the bow maps."]
139 | 
140 | (def ctx-sparse
141 |   (ml/fit
142 |    bow-ds
143 |    (mm/bow->SparseArray
144 |     :bow :sparse
145 |     {:create-vocab-fn
146 |      (fn [bow] (scicloj.ml.smile.nlp/->vocabulary-top-n bow 1))})))
147 |      
148 | 
149 | ctx-sparse
150 | 
151 | (def ctx-sparse
152 |   (ml/fit
153 |    bow-ds
154 |    (mm/bow->SparseArray
155 |     :bow :sparse
156 |     {:create-vocab-fn
157 |      (fn [_]
158 |        ["hello" "fun"])})))
159 |        
160 | 
161 | ctx-sparse
162 | 
163 | 
164 | (docu-fn (var mm/bow->tfidf))
165 | ["Here we calculate the tf-idf score from the bag of words:"]
166 | 
167 | ^kind/dataset
168 | (ml/pipe-it
169 |  bow-ds
170 |  (mm/bow->tfidf :bow :tfidf {}))
171 | 
172 | 
173 | 
174 | (docu-fn (var mm/model))
["The `model` transformer allows to execute all machine learning models
which register themselves inside the `metamorph.ml` system via the function
`scicloj.metamorph.ml/define-model!`.
The built-in models are listed here:
https://scicloj.github.io/scicloj.ml/userguide-models.html

"]
182 | 
183 | ["We use the Iris data for this example:"]
184 | 
185 | (def iris
186 |   (->
187 |    (ds/dataset
188 |     "https://raw.githubusercontent.com/scicloj/metamorph.ml/main/test/data/iris.csv" {:key-fn keyword})
189 |    (tech.v3.dataset.print/print-range 5)))
190 |    
191 |   
192 | 
193 | ^kind/dataset
194 | iris
195 | 
196 | (def train-test
197 |   (ds/train-test-split iris))
198 | 
199 | ["The pipeline consists in specifying the inference target,
200 |  transform target to categorical and the model function"]
201 | (def pipe-fn
202 |   (ml/pipeline
203 |    (mm/set-inference-target :species)
204 |    (mm/categorical->number [:species])
205 |    {:metamorph/id :model}
206 |    (mm/model {:model-type :smile.classification/logistic-regression})))
207 | 
208 | ["First we run the training "]
209 | (def fitted-ctx
210 |   (ml/fit
211 |    (:train-ds train-test)
212 |    pipe-fn))
213 |    
214 | 
215 | ^kind/hidden
(defn dissoc-in
  "Returns `m` without the entry found at the nested key path `ks`.

  `m` is returned unchanged when `ks` is empty or when the parent path of
  the entry does not exist, so no empty or nil-valued intermediate entries
  are created."
  [m ks]
  (let [parent-path (butlast ks)
        leaf-key (last ks)]
    (cond
      ;; nothing to remove; `update-in` with an empty path would add a nil key
      (empty? ks) m
      (empty? parent-path) (dissoc m leaf-key)
      ;; only descend when the parent exists; `update-in` on a missing path
      ;; would otherwise create it with a nil value
      (some? (get-in m parent-path)) (update-in m parent-path dissoc leaf-key)
      :else m)))
222 | 
223 | (dissoc-in  fitted-ctx [:model :model-data])
224 | 
225 | ["and then prediction on test"]
226 | 
227 | (def transformed-ctx
228 |   (ml/transform-pipe (:test-ds train-test) pipe-fn fitted-ctx))
229 | 
230 | (-> transformed-ctx
231 |     (dissoc-in [:model :model-data])
232 |     (update-in [:metamorph/data ] #(tech.v3.dataset.print/print-range % 5)))
233 |     
234 | 
235 | ["and we get the predictions: "]
236 | ^kind/dataset
237 | (-> transformed-ctx
238 |     :metamorph/data
239 |     (ds/reverse-map-categorical-xforms)
240 |     (ds/select-columns :species)
241 |     (ds/head))
242 | 
243 | 
244 | (docu-fn (var mm/std-scale))
245 | ["We can use the std-scale transformer to center and scale data."]
246 | ["Let's take some example data:"]
247 | (def data
248 |   (ds/dataset
249 |    [
250 |     [100 0.001]
251 |     [8   0.05]
252 |     [50  0.005]
253 |     [88  0.07]
254 |     [4   0.1]]
255 |    {:layout :as-row}))
256 | 
257 | ^kind/dataset
258 | data
259 | 
260 | ["Now we can center each column around 0 and scale
261 | it by the standard deviation of the column"]
262 | 
263 | ^kind/dataset
264 | (ml/pipe-it
265 |  data
266 |  (mm/std-scale [0 1] {}))
267 | 
268 | 
269 | (docu-fn (var mm/min-max-scale))
270 | 
271 | ["The min-max scaler scales columns in a specified interval,
272 | by default from -0.5 to 0.5"]
273 | 
274 | ^kind/dataset
275 | (ml/pipe-it
276 |  data
277 |  (mm/min-max-scale [0 1] {}))
278 | 
279 | (docu-fn (var mm/reduce-dimensions))
280 | 
281 | ["#### PCA example"]
282 | 
283 | ["In this example we run PCA on some data."]
284 | 
285 | (require '[scicloj.metamorph.ml.toydata :as toydata])
286 | 
287 | ["We use the sonar dataset which has 60 columns of quantitative data,
288 | which are certain measurements from a sonar device.
289 | The original purpose of the dataset is to learn to detect rock vs metal
290 |  from the measurements"]
291 | (def sonar
292 |   (toydata/sonar-ds))
293 | 
294 | ^kind/dataset
295 | sonar
296 | 
297 | ;; Keywords :x0 ... :x59, one per index in (range 60).
298 | (def col-names
299 |   (for [idx (range 60)]
300 |     (keyword (str "x" idx))))
299 | 
300 | ["First we create and run  a pipeline which does the PCA."
301 |  "In this pipeline we do not fix the number of columns, as we want to
302 | plot the result for all numbers of components (up to 60) "]
303 |  
304 | (def fitted-ctx
305 |   (ml/fit
306 |    sonar
307 |    (mm/reduce-dimensions :pca-cov 60
308 |                          col-names
309 |                          {})))
310 | 
311 | 
312 | ["The next function transforms the result from the fitted pipeline
313 | into vega lite compatible format for plotting"]
314 | ["It accesses the underlying Smile Java object to get the data on
315 | the cumulative variance for each PCA component."]
316 | (defn create-plot-data
317 |   "Builds the rows for the cumulative-variance plot from a fitted context.
318 | 
319 |   Returns a seq of maps with :principal-component (0-based index) and
320 |   :cumulative-variance, the latter read from the `cumulativeVarianceProportion`
321 |   bean property of the object stored under [:fit-result :model].
322 | 
323 |   The fitted step is located by looking for the context entry that carries
324 |   :fit-result instead of taking the third value of the context map, which
325 |   only works as long as the map happens to keep that entry in third position."
326 |   [ctx]
327 |   (let [fit-result (->> (vals (dissoc ctx :metamorph/data))
328 |                         (filter map?)
329 |                         (some :fit-result))]
330 |     (map-indexed
331 |      (fn [idx cumulative-variance]
332 |        {:principal-component idx
333 |         :cumulative-variance cumulative-variance})
334 |      (-> fit-result :model bean :cumulativeVarianceProportion))))
322 | 
323 | ["Next we plot the cumulative variance over the component index:"]
324 | ^kind/vega
325 | {:$schema "https://vega.github.io/schema/vega-lite/v5.json"
326 |  :width 850
327 |  :data {:values
328 |         (create-plot-data fitted-ctx)}
329 |  :mark "line" ,
330 |  :encoding
331 |  {:x {:field :principal-component, :type "nominal"},
332 |   :y {:field :cumulative-variance, :type "quantitative"}}}
333 | 
334 | ["From the plot we see, that transforming the data via PCA and reducing
335 | it from 60 dimensions to about 25 would still preserve the full variance."]
336 | ["Looking at this plot, we could now make a decision, how many dimensions
337 |  to keep."]
338 | ["We could for example decide, that keeping 60 % of the variance
339 | is enough, which would result in keeping the first 2 dimensions."]
340 | 
341 | ["So our pipeline becomes:"]
342 | 
343 | 
344 | (def fitted-ctx
345 |   (ml/fit
346 |    sonar
347 |    (mm/reduce-dimensions :pca-cov 2
348 |                          col-names
349 |                          {})
350 |                          
351 |    (mm/select-columns  [:material "pca-cov-0" "pca-cov-1"])
352 |    (mm/shuffle)))
353 | 
354 | ^kind/dataset
355 | (:metamorph/data fitted-ctx)
356 | 
357 | ["As the data is now 2-dimensional, it is easy to plot:"]
358 | 
359 | ;; Rows (as maps) of the reduced data, restricted to :material and the two PCA columns.
360 | (def scatter-plot-data
361 |   (let [reduced-ds (:metamorph/data fitted-ctx)
362 |         plot-cols [:material "pca-cov-0" "pca-cov-1"]]
363 |     (ds/rows (ds/select-columns reduced-ds plot-cols) :as-maps)))
364 | 
365 | 
366 | ^kind/vega
367 | {:$schema "https://vega.github.io/schema/vega-lite/v5.json"
368 |  :data {:values scatter-plot-data}
369 |  :width 500
370 |  :height 500
371 | 
372 |  :mark :circle
373 |  :encoding
374 |  {:x {:field "pca-cov-0"  :type "quantitative"}
375 |   :y {:field "pca-cov-1"  :type "quantitative"}
376 |   :color {:field :material}}}
377 | 
378 | ["The plot shows that the reduction to 2 dimensions does not create
379 | linearly separable areas of `M` and `R`. So a linear model will not be
380 |  able to predict the material well from the 2 PCA components."]
381 | 
382 | ["It even seems that the reduction to 2 dimensions removes
383 | too much information for predicting the material with any type of model."]
384 | 


--------------------------------------------------------------------------------
/src/scicloj/ml/tune_titanic.clj:
--------------------------------------------------------------------------------
  1 | (ns scicloj.ml.tune-titanic
  2 |   (:require
  3 |    [notespace.api :as note]
  4 |    [notespace.kinds :as kind]))
  5 |   
  6 | (comment
  7 |   (note/init-with-browser)
  8 |   (note/eval-this-notespace)
  9 |   (note/reread-this-notespace)
 10 |   (note/render-static-html "docs/tune-titanic.html")
 11 |   (note/init))
 12 | 
 13 | 
 14 | ["This is the Clojure version of https://www.moritzkoerber.com/posts/preprocessing-hyperparameters/"]
 15 | 
 16 | (require  '[scicloj.ml.dataset :as ds]
 17 |           '[scicloj.ml.core :as ml]
 18 |           '[scicloj.ml.metamorph :as mm]
 19 |           '[camel-snake-kebab.core :as csk]
 20 |           '[scicloj.metamorph.ml.evaluation-handler :as eval-hn]
 21 |           '[tech.v3.datatype.functional :as dtfunc])
 22 | 
 23 | (def  categorical-features  [:pclass :sex :embarked])
 24 | (def  numeric-features [:age :parch :fare])
 25 | 
 26 | (defn map->vec
 27 |   "Turns the map `m` into a flat vector of alternating keys and values,
 28 |   e.g. {:a 1 :b 2} => [:a 1 :b 2], suitable for `apply` with keyword args.
 29 | 
 30 |   Only one level is flattened, so collection-valued entries stay intact
 31 |   (a recursive `flatten` would splice them into the key/value sequence)."
 32 |   [m]
 33 |   (into [] cat m))
 27 | 
 28 | ["Preprocessing pipelines including feature engineering"]
 29 | 
 30 | ;; Titanic training data: keep the feature columns plus :survived, fill missing
 31 | ;; categorical values with "missing", then one-hot encode the categorical features.
 32 | (def data
 33 |   (let [wanted-columns (concat categorical-features numeric-features [:survived])
 34 |         raw (ds/dataset "data/titanic/train.csv"
 35 |                         {:key-fn csk/->kebab-case-keyword})]
 36 |     (-> raw
 37 |         (ds/select-columns wanted-columns)
 38 |         (ds/replace-missing categorical-features :value "missing")
 39 |         (ds/categorical->one-hot categorical-features))))
 36 | 
 37 | 
 38 | (defn replace-missing
 39 |   "Returns a pipeline step that applies `mm/replace-missing` to the numeric
 40 |   features, passing the entries of (:replace-missing-options options) as
 41 |   trailing key/value arguments."
 42 |   [options]
 43 |   (fn [ctx]
 44 |     (let [extra-args (map->vec (:replace-missing-options options))
 45 |           step (apply mm/replace-missing numeric-features extra-args)]
 46 |       (step ctx))))
 41 | 
 42 | (defn maybe-std-scale
 43 |   "Returns a pipeline step that standard-scales the numeric features when
 44 |   (get-in options [:scaling-options :scale?]) is truthy and otherwise passes
 45 |   the context through untouched."
 46 |   [options]
 47 |   (let [scale? (get-in options [:scaling-options :scale?])]
 48 |     (fn [ctx]
 49 |       (if-not scale?
 50 |         ctx
 51 |         (let [scale-step (mm/std-scale numeric-features {})]
 52 |           (scale-step ctx))))))
 48 | 
 49 | (defn assoc-pipe-opts
 50 |   "Returns a pipeline step that stores `options` in the context under
 51 |   :pipe-options, so the options travel with the context."
 52 |   [options]
 53 |   #(assoc % :pipe-options options))
 52 | 
 53 | 
 54 | (defn make-decl-pipeline
 55 |   "Builds the declarative pipeline (a vector of step declarations) for
 56 |   `model-type`, parameterized by the `options` map. The final :mm/model
 57 |   step, registered under the id :model, receives (:model-options options)
 58 |   with :model-type set to `model-type`."
 59 |   [model-type options]
 60 |   (let [model-opts (assoc (:model-options options) :model-type model-type)]
 61 |     [[::assoc-pipe-opts options]
 62 |      [::replace-missing options]
 63 |      [:mm/categorical->number [:survived] {} :int64]
 64 |      [::maybe-std-scale options]
 65 |      [:mm/set-inference-target :survived]
 66 |      {:metamorph/id :model}
 67 |      [:mm/model model-opts]]))
 61 | 
 62 | 
 63 | 
 64 | 
 65 | ;; One declarative logistic-regression pipeline per option combination returned
 66 | ;; by `ml/sobol-gridsearch` over scaling, missing-value and model options.
 67 | (def logistic-regression-pipelines
 68 |   (let [search-space {:scaling-options {:scale? (ml/categorical [true false])}
 69 |                       :replace-missing-options {:value (ml/categorical [dtfunc/mean dtfunc/median])}
 70 |                       :model-options {:lambda (ml/categorical [0.1 0.2 0.5 0.7 1])
 71 |                                       :tolerance (ml/categorical [0.1 0.01 0.001 0.0001])}}]
 72 |     (map (fn [options]
 73 |            (make-decl-pipeline :smile.classification/logistic-regression options))
 74 |          (ml/sobol-gridsearch search-space))))
 72 | 
 73 | ;; One declarative random-forest pipeline per option combination returned by
 74 | ;; `ml/sobol-gridsearch` over scaling, missing-value and model options.
 75 | (def random-forrest-pipelines
 76 |   (let [search-space {:scaling-options {:scale? (ml/categorical [true false])}
 77 |                       :replace-missing-options {:value (ml/categorical [dtfunc/mean dtfunc/median])}
 78 |                       :model-options {:trees (ml/categorical [5 50 100 250])
 79 |                                       :max-depth (ml/categorical [5 8 10])}}]
 80 |     (map (fn [options]
 81 |            (make-decl-pipeline :smile.classification/random-forest options))
 82 |          (ml/sobol-gridsearch search-space))))
 80 | 
 81 | (def all-pipelines (concat random-forrest-pipelines))
 82 | 
 83 | 
 84 | 
 85 | (def pipe-fns
 86 |   (mapv ml/->pipeline all-pipelines))
 87 | 
 88 | ["Simple split"]
 89 | ;; A single :holdout split with ratio 0.8; the :train and :test parts of its
 90 | ;; first element become the tuning data and the hold-out data.
 91 | (def splits (ds/split->seq data :holdout {:ratio 0.8}))
 92 | (def train-ds (:train (first splits)))
 93 | (def holdout-ds (:test (first splits)))
 92 | 
 93 | ["Tune hyperparameter by evaluating all pipelines/models "]
 94 | 
 95 | ;; An atom holding an (initially empty) vector. The former `[atom []]` was a
 96 | ;; plain vector containing the `atom` function, not an atom.
 97 | (def files (atom []))
 96 | ;; Evaluates every pipeline on a 5-fold split of the training data, scored by
 97 | ;; classification accuracy; the option flags ask for the best pipeline and the
 98 | ;; best cross-validation result only.
 99 | (def best-evaluation
100 |   (ml/evaluate-pipelines
101 |    all-pipelines
102 |    ;; the split options are a map; the fold count goes under :k
103 |    (ds/split->seq train-ds :kfold {:k 5})
104 |    ml/classification-accuracy
105 |    :accuracy
106 |    {;; :attach-fn-sources {:ns (find-ns 'scicloj.ml.tune-titanic)
107 |     ;;                         :pipe-fns-clj-file "src/scicloj/ml/tune_titanic.clj"}
108 |     :return-best-crossvalidation-only true
109 |     :return-best-pipeline-only true}))
106 | 
107 | ;; All values below are read from (ffirst best-evaluation).
108 | (def best-accuracy (get-in (ffirst best-evaluation) [:train-transform :metric]))
109 | 
110 | 
111 | (def best-options (get-in (ffirst best-evaluation) [:fit-ctx :pipe-options]))
112 | 
113 | (def best-pipe-fn
114 |   (:pipe-fn (ffirst best-evaluation)))
115 | 
116 | best-pipe-fn
117 | 
118 | (def best-pipe-decl
119 |   (:pipe-decl (ffirst best-evaluation)))
119 | 
120 | 
121 | 
122 | 
123 | 
124 | 
125 | ["## All information on best found pipeline"]
126 | 
127 | ["best accuracy found on train data: " (-> best-evaluation first first :train-transform :metric)]
128 | ["best accuracy found on test data: " (-> best-evaluation first first :test-transform :metric)]
129 | 
130 | ["best options (found on train data): "]
131 | best-options
132 | 
133 | ["best pipeline (found on train data)"]
134 | best-pipe-decl
135 | 
136 | ["pipe sources information"]
137 | (->
138 |  (ml/get-nice-source-info best-pipe-decl
139 |                           (find-ns 'scicloj.ml.tune-titanic)
140 |                           (-> #'data meta :file))
141 |  (update :classpath #(take 20 %)))
142 | 
143 | 
144 | 
145 | 
146 | 
147 | ;; Predictions for the hold-out data: run the best pipeline in :transform mode
148 | ;; on the fitted context, reverse the categorical mapping and take :survived.
149 | (def predicted-survival-hold-out
150 |   (let [fit-ctx (:fit-ctx (ffirst best-evaluation))
151 |         transform-ctx (merge fit-ctx
152 |                              {:metamorph/data holdout-ds
153 |                               :metamorph/mode :transform})]
154 |     (-> (best-pipe-fn transform-ctx)
155 |         :metamorph/data
156 |         ds/reverse-map-categorical-xforms
157 |         :survived)))
155 | 
156 | ["Classification accuracy on holdout data: "]
157 | (ml/classification-accuracy predicted-survival-hold-out
158 |                            (holdout-ds :survived))
159 | 
160 | ["Confusion matrix on holdout data"]
161 | ^kind/dataset
162 | (->
163 |  (ml/confusion-map predicted-survival-hold-out
164 |                    (holdout-ds :survived))
165 |  (ml/confusion-map->ds))
166 | 
167 | ["Smile model object:"]
168 | (ml/thaw-model
169 |  (-> best-evaluation first first :fit-ctx :model))
170 | 
171 | 
172 | 
173 | 
174 | ["Feature importance:"]
175 | 
176 | (seq
177 |  (.importance
178 |   (ml/thaw-model
179 |    (-> best-evaluation first first :fit-ctx :model))))
180 | 
181 | 
182 | 
183 | ["## nested cross validation"]
184 | 
185 | 
186 | 
187 | (require '[scicloj.ml.nested-cv :as nested-cv])
188 | 
189 | 
190 | (def nested-cv-result
191 |  (doall
192 |   (nested-cv/nested-cv data all-pipelines
193 |                        ml/classification-accuracy
194 |                        :accuracy 10 5)))
195 | 
196 | 
197 | ["nested cv best models metrics"]
198 | (map :metric nested-cv-result)
199 | 
200 | ;; Re-runs the pipeline evaluation on a 5-fold split of the full data and keeps
201 | ;; the pipeline function and fitted context of (ffirst evaluation).
202 | (def final-model-by-cv
203 |   (let [folds (ds/split->seq data :kfold {:k 5})
204 |         winner (ffirst (ml/evaluate-pipelines all-pipelines
205 |                                               folds
206 |                                               ml/classification-accuracy
207 |                                               :accuracy))]
208 |     {:best-pipe-fn (:pipe-fn winner)
209 |      :fit-ctx (:fit-ctx winner)}))
211 | 
212 | (def final-model
213 |   ((:best-pipe-fn final-model-by-cv)
214 |    {:metamorph/data data :metamorph/mode :fit}))
215 | 
216 | ["Final best model"]
217 | (ml/thaw-model (:model  final-model))
218 | 
219 | ["trained with best hyperparameters"]
220 | (-> final-model :pipe-options)
221 | 


--------------------------------------------------------------------------------
/src/scicloj/ml/ug_utils.clj:
--------------------------------------------------------------------------------
  1 | (ns scicloj.ml.ug-utils
  2 |   (:require [clojure.string :as str]
  3 |             [clojure.walk]
  4 |             [notespace.kinds :as kind]
  5 |             [notespace.view :as view]
  6 |             [scicloj.ml.core :as ml]
  7 |             [scicloj.ml.metamorph :as mm]
  8 |             [tech.v3.dataset :as ds]
  9 |             [tech.v3.dataset.modelling :as ds-mod]
 10 |             [tablecloth.api :as tc]
 11 |             [libpython-clj2.python :as py]
 12 |             [tech.v3.datatype.functional :as dtf]
 13 |             [clj-http.client :as client]))
 14 | 
 15 | (defn kroki
 16 |   "POSTs the diagram source `s` to https://kroki.io/ as JSON, together with the
 17 |   diagram `type` and output `format` (both passed through `name`), requesting
 18 |   the response body as a byte array. Returns the clj-http response."
 19 |   [s type format]
 20 |   (let [payload {:diagram_source s
 21 |                  :diagram_type (name type)
 22 |                  :output_format (name format)}
 23 |         request {:content-type :json
 24 |                  :as :byte-array
 25 |                  :form-params payload}]
 26 |     (client/post "https://kroki.io/" request)))
 22 | (py/initialize!)
 23 | (def doc->markdown (py/import-module "docstring_to_markdown"))
 24 | 
 25 | 
 26 | 
 27 | ;; Keys of all model definitions registered when this namespace is loaded.
 28 | (def model-keys
 29 |   (-> ml/model-definitions* deref keys))
 29 | 
 30 | ;; The :options entry of every model definition registered at load time.
 31 | (def model-options
 32 |   (for [definition (vals @ml/model-definitions*)]
 33 |     (:options definition)))
 34 | 
 35 | (defn dataset->md-hiccup
 36 |   "Wraps the hiccup produced by `view/markdowns->hiccup` for `mds` in a :div
 37 |   carrying the table CSS classes."
 38 |   [mds]
 39 |   ;; The former height / height-limit locals rendered `mds` to a string only to
 40 |   ;; feed a commented-out :style attribute, so they have been dropped.
 41 |   [:div {:class "table table-striped table-hover table-condensed table-responsive"}
 42 |    (view/markdowns->hiccup mds)])
 42 | 
 43 | 
 44 | (defmethod kind/kind->behaviour ::dataset-nocode
 45 |   [_]
 46 |   {:render-src?   false
 47 |    :value->hiccup #'dataset->md-hiccup})
 48 | 
 49 | (defn docu-options
 50 |   "Returns the options of the model registered under `model-key` as a dataset
 51 |   with the columns ordered :name, :type, :default (built from empty columns
 52 |   when the model declares no options), overridden to the ::dataset-nocode kind."
 53 |   [model-key]
 54 |   (let [empty-options {:name [] :type [] :default []}
 55 |         options (or (get-in @ml/model-definitions* [model-key :options])
 56 |                     empty-options)
 57 |         options-ds (tc/reorder-columns (tc/dataset options) :name :type :default)]
 58 |     (kind/override options-ds ::dataset-nocode)))
 60 |    
 61 |   
 62 | 
 63 | 
 64 | ;; (->
 65 | ;;  (tc/dataset
 66 | ;;   (get-in @scicloj.ml.core/model-definitions* [:corenlp/crf :options] ))
 67 | ;; (tc/reorder-columns :name :type :default)
 68 | ;;  )
 69 | 
 70 | (defn text->hiccup
 71 |   "Splits `text` at newlines and returns its lines with a [:br] element
 72 |   between them; every [:br] carries a fresh :key in its metadata."
 73 |   [text]
 74 |   (let [keyed (fn [element]
 75 |                 (if-not (string? element)
 76 |                   (with-meta element {:key (gensym "br-")})
 77 |                   element))]
 78 |     (map keyed (interpose [:br] (str/split text #"\n")))))
 78 | 
 79 | (defn docu-doc-string
 80 |   "Renders the :doc-string of the model registered under `model-key` as hiccup:
 81 |   the doc string (or \"\" when absent) is passed to `convert` of the Python
 82 |   `docstring_to_markdown` module and the result to `view/markdowns->hiccup`.
 83 |   Any Exception yields \"\" instead (best effort)."
 84 |   [model-key]
 85 |   (try
 86 |     (let [doc-string (or (get-in @ml/model-definitions*
 87 |                                  [model-key :documentation :doc-string])
 88 |                          "")
 89 |           markdown (py/py. doc->markdown convert doc-string)]
 90 |       (view/markdowns->hiccup markdown))
 91 |     (catch Exception _ "")))
 86 | 
 87 | 
 88 | 
 89 | 
 90 | (defn anchor-or-nothing
 91 |   "Returns a :div holding a link to `x` labelled `text`, or a :div with an
 92 |   empty string when `x` is empty (nil or \"\")."
 93 |   [x text]
 94 |   (if (seq x)
 95 |     [:div
 96 |      [:a {:href x} text]]
 97 |     [:div ""]))
 95 |     
 96 |   
 97 | 
 98 | (defn render-key-info
 99 |   "Returns a seq of hiccup :div blocks, one per registered model whose key (in
100 |   its string form) starts with `prefix`, sorted by key. Each block shows the
101 |   key as an :h3 heading, the optional javadoc / user guide links, the options
102 |   table and the rendered doc string, followed by two :hr elements."
103 |   [prefix]
104 |   (let [prefix-str (str prefix)
105 |         matching (->> @ml/model-definitions*
106 |                       (sort-by first)
107 |                       (filter (fn [[model-key _]]
108 |                                 (str/starts-with? (str model-key) prefix-str))))]
109 |     (for [[model-key definition] matching
110 |           :let [documentation (:documentation definition)]]
111 |       [:div
112 |        [:h3 {:id (str model-key)} (str model-key)]
113 |        (anchor-or-nothing (:javadoc documentation) "javadoc")
114 |        (anchor-or-nothing (:user-guide documentation) "user guide")
115 | 
116 |        [:span
117 |         (dataset->md-hiccup (docu-options model-key))]
118 | 
119 |        [:span
120 |         (docu-doc-string model-key)]
121 | 
122 |        [:hr]
123 |        [:hr]])))
127 |            
128 | 
129 | (text->hiccup (or
130 |                (get-in @scicloj.ml.core/model-definitions*
131 |                        [:smile.manifold/tsne :documentation :description]) ""))
132 | 
133 | 
134 | (defn remove-deep
135 |   "Walks `data` top-down and removes every key contained in `key-set` from
136 |   each map encountered, at any nesting depth."
137 |   [key-set data]
138 |   (letfn [(strip-keys [node]
139 |             (if-not (map? node)
140 |               node
141 |               (apply dissoc node key-set)))]
142 |     (clojure.walk/prewalk strip-keys data)))
139 | (defn stepped-range
140 |   "Returns `n-steps` evenly spaced values starting at `start` with spacing
141 |   (end - start) / n-steps; `end` itself is excluded.
142 | 
143 |   Each value is computed as start + i * step rather than by repeated
144 |   addition, so floating-point rounding cannot accumulate and produce an
145 |   extra element just below `end`."
146 |   [start end n-steps]
147 |   (let [step (/ (- end start) n-steps)]
148 |     (map #(+ start (* % step)) (range n-steps))))
142 | 
143 | (defn surface-plot
144 |   "Builds a two-layer Vega-Lite spec showing the decision surface of a model
145 |   over two columns of the `iris` dataset.
146 | 
147 |   `cols` holds the two feature columns used as x and y axis, `raw-pipe-fn` is
148 |   appended to a pipeline that first selects :species plus `cols`, and
149 |   `model-name` ends up in the plot title. The pipeline is fitted on `iris`
150 |   and then run in :transform mode on a grid built with `stepped-range`
151 |   (100 steps per axis, data range padded by 0.2). Layer one draws the grid
152 |   coloured by :predicted-species, layer two the original points filled by
153 |   :true-species."
154 |   [iris cols raw-pipe-fn model-name]
155 |   (let [x-col (first cols)
156 |         y-col (second cols)
157 | 
158 |         pipe-fn (ml/pipeline
159 |                  (mm/select-columns (concat [:species] cols))
160 |                  raw-pipe-fn)
161 | 
162 |         fitted-ctx (pipe-fn {:metamorph/data iris
163 |                              :metamorph/mode :fit})
164 | 
165 |         ;; plot boundaries: column minimum / maximum padded by 0.2
166 |         min-x (- (dtf/reduce-min (get iris x-col)) 0.2)
167 |         min-y (- (dtf/reduce-min (get iris y-col)) 0.2)
168 |         max-x (+ (dtf/reduce-max (get iris x-col)) 0.2)
169 |         max-y (+ (dtf/reduce-max (get iris y-col)) 0.2)
170 | 
171 |         ;; runs the fitted pipeline in :transform mode on `ds` and returns the
172 |         ;; categorical :species values of the result as a seq
173 |         predict-species (fn [ds]
174 |                           (-> (pipe-fn (merge fitted-ctx
175 |                                               {:metamorph/data ds
176 |                                                :metamorph/mode :transform}))
177 |                               :metamorph/data
178 |                               (ds-mod/column-values->categorical :species)
179 |                               seq))
180 | 
181 |         ;; grid for the decision surface, with an empty :species column
182 |         grid-ds (tc/dataset
183 |                  (for [x1 (stepped-range min-x max-x 100)
184 |                        x2 (stepped-range min-y max-y 100)]
185 |                    {x-col x1
186 |                     y-col x2
187 |                     :species nil}))
188 | 
189 |         ;; predictions for all grid points
190 |         grid-ds-prediction (tc/add-column grid-ds
191 |                                           :predicted-species
192 |                                           (predict-species grid-ds))
193 | 
194 |         ;; predictions for the iris data points themselves
195 |         prediction-iris (predict-species iris)
196 | 
197 |         ;; NOTE(review): `prediction-iris` is passed as the 4th argument of
198 |         ;; `tc/add-column`; it is not added as a column of its own. Confirm
199 |         ;; whether a :predicted-species column was intended here.
200 |         ds-prediction (tc/add-column iris :true-species (:species iris)
201 |                                      prediction-iris)
202 | 
203 |         ;; quantitative axis encoding shared by both layers
204 |         axis (fn [field domain]
205 |                {:field field
206 |                 :type "quantitative"
207 |                 :scale {:domain domain}
208 |                 :axis {:format "2.2"
209 |                        :labelOverlap true}})
210 |         x-axis (axis x-col [min-x max-x])
211 |         y-axis (axis y-col [min-y max-y])]
212 | 
213 |     ;; two-layer Vega-Lite specification: surface first, data points on top
214 |     {:layer
215 |      [{:data {:values (seq (tc/rows grid-ds-prediction :as-maps))}
216 |        :title (str "Decision surfaces for model: " model-name)
217 |        :width 500
218 |        :height 500
219 |        :mark {:type "square" :opacity 0.9 :strokeOpacity 0.1 :stroke nil}
220 |        :encoding {:x x-axis
221 |                   :y y-axis
222 |                   :color {:field :predicted-species}}}
223 | 
224 |       {:data {:values (seq (tc/rows ds-prediction :as-maps))}
225 |        :width 500
226 |        :height 500
227 |        :mark {:type "circle" :opacity 1 :strokeOpacity 1}
228 |        :encoding {:x x-axis
229 |                   :y y-axis
230 |                   :fill {:field :true-species} ;; :legend nil
231 |                   :stroke {:value :black}
232 |                   :size {:value 300}}}]}))
250 | 
251 | (defn select-paths-from-set
252 |   "Recursively filters `data` down to the key paths contained in `path-set`.
253 | 
254 |   `current-path` is the vector of keys leading to `data`. For a map, only the
255 |   entries whose path (current path plus key) is in `path-set` are kept, and
256 |   their values are filtered in turn. Sequential collections are mapped
257 |   element-wise into a vector without extending the path. Anything else is
258 |   returned unchanged."
259 |   [current-path path-set data]
260 |   (cond
261 |     (map? data)
262 |     (into {}
263 |           (for [[k v] data
264 |                 :let [path (conj current-path k)]
265 |                 :when (contains? path-set path)]
266 |             [k (select-paths-from-set path path-set v)]))
267 | 
268 |     (sequential? data)
269 |     (mapv #(select-paths-from-set current-path path-set %) data)
270 | 
271 |     :else data))
261 | 
262 | (defn select-paths
263 |   "Returns `data` reduced to the entries lying on one of the key `paths` (each
264 |   a sequence of keys). Every non-empty prefix of every path is put into the
265 |   path set handed to `select-paths-from-set`, so the maps leading to a
266 |   selected entry are kept as well."
267 |   [data paths]
268 |   (let [prefixes (fn [path] (take-while seq (iterate butlast path)))
269 |         path-set (set (mapcat prefixes paths))]
270 |     (select-paths-from-set [] path-set data)))
268 | 
269 | (defn select-minimal-result
270 |   "Reduces an evaluation `result` to the :metric entries found under
271 |   :train-transform and :test-transform."
272 |   [result]
273 |   (let [metric-paths [[:train-transform :metric]
274 |                       [:test-transform :metric]]]
275 |     (select-paths result metric-paths)))
272 | 


--------------------------------------------------------------------------------
/src/scicloj/ml/ug_utils_clerk.clj:
--------------------------------------------------------------------------------
 1 | (ns scicloj.ml.ug-utils-clerk
 2 |   (:require
 3 |    [clojure.string :as str]
 4 |    [nextjournal.clerk :as clerk]
 5 |    [scicloj.ml.core :as ml]
 6 |    [scicloj.ml.ug-utils :as utils]
 7 |    [tablecloth.api :as tc]))
 8 | 
 9 | (defn docu-options
10 |   "Returns the options of the model registered under `model-key` as a dataset
11 |   with the columns ordered :name, :type, :default (built from empty columns
12 |   when the model declares no options)."
13 |   [model-key]
14 |   (let [empty-options {:name [] :type [] :default []}
15 |         options (or (get-in @ml/model-definitions* [model-key :options])
16 |                     empty-options)]
17 |     (tc/reorder-columns (tc/dataset options) :name :type :default)))
18 | 
19 | 
20 | 
21 | (defn stringify-enum
22 |   "Walks `form` bottom-up and replaces every java.lang.Enum instance by its
23 |   string representation; all other values are kept as they are."
24 |   [form]
25 |   (letfn [(enum->str [x]
26 |             (cond-> x
27 |               (instance? Enum x) str))]
28 |     (clojure.walk/postwalk enum->str form)))
24 | 
25 | (defn render-key-info
26 |   "Returns a hiccup vector starting with :span that holds one :div per
27 |   registered model whose key (in its string form) starts with `prefix`,
28 |   sorted by key. Each :div shows the key as an :h3 heading, the optional
29 |   javadoc / user guide links, the options as a clerk table (an empty string
30 |   when there are none) and the rendered doc string, followed by two :hr."
31 |   [prefix]
32 |   (let [prefix-str (str prefix)
33 |         ;; options rendered as a clerk table, enums turned into strings first
34 |         options-table (fn [model-key]
35 |                         (let [docu-ds (docu-options model-key)]
36 |                           (if (tc/empty-ds? docu-ds)
37 |                             ""
38 |                             (clerk/table
39 |                              (stringify-enum
40 |                               (seq (tc/rows docu-ds :as-maps)))))))
41 |         model->div (fn [[model-key definition]]
42 |                      (let [documentation (:documentation definition)]
43 |                        [:div
44 |                         [:h3 {:id (str model-key)} (str model-key)]
45 |                         (utils/anchor-or-nothing (:javadoc documentation) "javadoc")
46 |                         (utils/anchor-or-nothing (:user-guide documentation) "user guide")
47 |                         [:span (options-table model-key)]
48 |                         [:span (utils/docu-doc-string model-key)]
49 |                         [:hr]
50 |                         [:hr]]))]
51 |     (into [:span]
52 |           (comp
53 |            (filter (fn [[model-key _]]
54 |                      (str/starts-with? (str model-key) prefix-str)))
55 |            (map model->div))
56 |           (sort-by first @ml/model-definitions*))))
57 | 


--------------------------------------------------------------------------------
/src/scicloj/ml/unsupervised.clj:
--------------------------------------------------------------------------------
  1 | (ns scicloj.ml.unsupervised
  2 |   (:require
  3 |    [notespace.api :as note]
  4 |    [notespace.kinds :as kind]
  5 |    [net.clojars.behrica.cluster_eval :as cluster-eval]))
  6 | 
  7 | 
  8 | 
  9 | 
 10 | (comment
 11 |   (note/init-with-browser)
 12 |   (note/eval-this-notespace)
 13 |   (note/reread-this-notespace)
 14 |   (note/render-static-html "docs/userguide-unsupervised.html")
 15 |   (note/init))
 16 | 
 17 | (require '[scicloj.ml.core :as ml]
 18 |          '[scicloj.ml.metamorph :as mm]
 19 |          '[scicloj.ml.dataset  :as ds])
 20 | 
 21 | ["# Cluster Iris data"]
 22 | 
 23 | ;; Iris data from the metamorph.ml test resources, with keywordized column names.
 24 | (def iris
 25 |   (ds/dataset
 26 |    "https://raw.githubusercontent.com/scicloj/metamorph.ml/main/test/data/iris.csv"
 27 |    {:key-fn keyword}))
 28 | 
 29 | 
 30 | 
 31 | ["## k-means clustering"]
 32 | 
 33 | (def fit-ctx
 34 |   (ml/fit
 35 |    iris
 36 |    (mm/select-columns [:petal_length :petal_width])
 37 |    {:metamorph/id :model}
 38 |    (mm/model {:model-type :fastmath/cluster
 39 |               :clustering-method :k-means
 40 |               :clustering-method-args [3]})))
 41 | 
 42 | (def iris-with-cluster
 43 |   (ds/add-column iris :cluster
 44 |                  (-> fit-ctx :model :model-data :clustering)))
 45 | 
 46 | ;; One map per cluster representative of the fitted model, keyed like the two
 47 | ;; clustered columns.
 48 | (def centroids
 49 |   (for [[petal-length petal-width] (-> fit-ctx :model :model-data :representatives)]
 50 |     {:petal_length petal-length
 51 |      :petal_width petal-width}))
 52 | 
 53 | ^kind/vega
 54 | {:height 300
 55 |  :width 300
 56 | 
 57 |  :title "2D result of iris k-means clustering with cluster centroids (n=3)"
 58 |  :layer [{
 59 |           :$schema "https://vega.github.io/schema/vega-lite/v5.json"
 60 |           :data {:values (ds/rows iris-with-cluster :as-maps)}
 61 |           :description "Iris data "
 62 |           :encoding {:x {:field :petal_length :type "quantitative"}
 63 |                      :y {:field :petal_width :type "quantitative"}
 64 |                      :color {:field :cluster}}
 65 |           :mark "point"}
 66 |          {
 67 |           :data {:values centroids}
 68 |           :description "Iris data "
 69 |           :encoding {:x {:field :petal_length :type "quantitative"}
 70 |                      :y {:field :petal_width :type "quantitative"}}
 71 | 
 72 |           :mark {:type "point" :shape :triangle-up :color :black
 73 |                  :filled true
 74 |                  :size 200}}]}
 75 | 
 76 |          
 77 | 
 78 | ["## Elbow plot"]
 79 | 
 80 | ["### Calculate distortion over n"]
 81 | 
 82 | (defn make-pipe [n]
 83 |   (ml/pipeline
 84 |    (mm/drop-columns [:species])
 85 |    {:metamorph/id :model}
 86 |    (mm/model {:model-type :fastmath/cluster
 87 |               :clustering-method :k-means
 88 |               :clustering-method-args [n]})))
 89 | 
 90 | 
 91 | 
 92 | ;; Evaluates one k-means pipeline per n in (range 2 10) on the full iris data.
 93 | ;; The metric function always returns 0; the distortion is read from the
 94 | ;; fitted model data afterwards.
 95 | (def eval-results
 96 |   (let [pipelines (map make-pipe (range 2 10))
 97 |         zero-metric (fn [_ctx] 0)]
 98 |     (ml/evaluate-pipelines
 99 |      pipelines
100 |      [{:train iris}]
101 |      zero-metric
102 |      :loss
103 |      {:return-best-pipeline-only false})))
100 | 
101 | 
102 | 
103 | (defn fastmath->cluster-data
104 |   "Converts clustering `model-data` (with :data, :representatives and
105 |   :clustering) into a map of three parallel sequences that list the data
106 |   points first and the representatives after them:
107 | 
108 |   :values    - the data points followed by the representatives
109 |   :cluster   - the :clustering values followed by the index of each representative
110 |   :centroid? - false for every data point, true for every representative"
111 |   [model-data]
112 |   (let [{:keys [data representatives clustering]} model-data
113 |         n-points (count data)
114 |         n-centroids (count representatives)]
115 |     {:values (concat data representatives)
116 |      :cluster (concat clustering (range n-centroids))
117 |      :centroid? (concat (repeat n-points false)
118 |                         (repeat n-centroids true))}))
123 | 
124 | 
125 | 
126 | ;; One {:n ... :distortion ...} map per evaluation result: n is the first
127 | ;; clustering-method argument, the distortion comes from the model data.
128 | (def ellbow-plot-data-distortion
129 |   (for [result (flatten eval-results)
130 |         :let [model (get-in result [:fit-ctx :model])]]
131 |     {:n (first (get-in model [:options :clustering-method-args]))
132 |      :distortion (get-in model [:model-data :info :distortion])}))
131 |         
132 | 
133 | ["### Calculate silhouette score over n"]
134 | 
135 | ;; Same pipelines as for the distortion, but the metric is the cluster index
136 | ;; "calcularSilhouette" computed from the fitted model data.
137 | (def eval-results-silhouete
138 |   (let [silhouette (fn [ctx]
139 |                      (cluster-eval/cluster-index
140 |                       (fastmath->cluster-data (get-in ctx [:model :model-data]))
141 |                       "calcularSilhouette"))]
142 |     (ml/evaluate-pipelines
143 |      (map make-pipe (range 2 10))
144 |      [{:train iris}]
145 |      silhouette
146 |      :loss
147 |      {:return-best-pipeline-only false})))
147 | 
148 | 
149 | ;; One {:n ... :silhoute ...} map per evaluation result: n is the first
150 | ;; clustering-method argument, the score is the :train-transform metric.
151 | (def ellbow-plot-data-silhoute
152 |   (for [result (flatten eval-results-silhouete)]
153 |     {:n (first (get-in result [:fit-ctx :model :options :clustering-method-args]))
154 |      :silhoute (get-in result [:train-transform :metric])}))
154 | 
155 | 
156 | ["Elbow plots for distortion and silhouette score"]
157 | 
158 | ^kind/vega
159 | ;; Two charts side by side: distortion over n (left) and silhouette score over n (right).
160 | {:hconcat [{:$schema "https://vega.github.io/schema/vega-lite/v5.json"
161 |             :width 200
162 |             :height 200
163 |             :title "Elbow plot of distortion for various n"
164 |             :data {:values ellbow-plot-data-distortion}
165 |             ;; the former description was left over from a stock-price example
166 |             :description "Distortion of the k-means clustering for various n."
167 |             :encoding {:x {:field "n" :type :ordinal}
168 |                        :y {:field :distortion :type "quantitative"}}
169 |             :mark {:point true :type "line"}}
170 | 
171 |            {:$schema "https://vega.github.io/schema/vega-lite/v5.json"
172 |             :width 200
173 |             :height 200
174 |             :title "Elbow plot of silhouette score for various n"
175 |             :data {:values ellbow-plot-data-silhoute}
176 | 
177 |             :encoding {:x {:field "n" :type :ordinal}
178 |                        :y {:field :silhoute :type "quantitative"}}
179 |             :mark {:point true :type "line"}}]}
179 | 


--------------------------------------------------------------------------------
/submission.csv:
--------------------------------------------------------------------------------
  1 | PassengerId,Survived
  2 | 892,0
  3 | 893,1
  4 | 894,0
  5 | 895,0
  6 | 896,1
  7 | 897,0
  8 | 898,1
  9 | 899,0
 10 | 900,1
 11 | 901,0
 12 | 902,0
 13 | 903,0
 14 | 904,1
 15 | 905,0
 16 | 906,1
 17 | 907,1
 18 | 908,0
 19 | 909,0
 20 | 910,1
 21 | 911,1
 22 | 912,0
 23 | 913,0
 24 | 914,1
 25 | 915,0
 26 | 916,1
 27 | 917,0
 28 | 918,1
 29 | 919,0
 30 | 920,1
 31 | 921,0
 32 | 922,0
 33 | 923,0
 34 | 924,1
 35 | 925,0
 36 | 926,1
 37 | 927,0
 38 | 928,0
 39 | 929,0
 40 | 930,0
 41 | 931,1
 42 | 932,0
 43 | 933,1
 44 | 934,0
 45 | 935,1
 46 | 936,1
 47 | 937,0
 48 | 938,1
 49 | 939,0
 50 | 940,1
 51 | 941,1
 52 | 942,0
 53 | 943,0
 54 | 944,1
 55 | 945,1
 56 | 946,0
 57 | 947,0
 58 | 948,0
 59 | 949,0
 60 | 950,0
 61 | 951,1
 62 | 952,0
 63 | 953,0
 64 | 954,0
 65 | 955,1
 66 | 956,1
 67 | 957,1
 68 | 958,1
 69 | 959,0
 70 | 960,0
 71 | 961,1
 72 | 962,0
 73 | 963,0
 74 | 964,0
 75 | 965,0
 76 | 966,1
 77 | 967,0
 78 | 968,0
 79 | 969,1
 80 | 970,0
 81 | 971,0
 82 | 972,1
 83 | 973,0
 84 | 974,0
 85 | 975,0
 86 | 976,0
 87 | 977,0
 88 | 978,1
 89 | 979,0
 90 | 980,0
 91 | 981,1
 92 | 982,0
 93 | 983,0
 94 | 984,1
 95 | 985,0
 96 | 986,0
 97 | 987,0
 98 | 988,1
 99 | 989,0
100 | 990,1
101 | 991,0
102 | 992,1
103 | 993,0
104 | 994,0
105 | 995,0
106 | 996,0
107 | 997,0
108 | 998,0
109 | 999,0
110 | 1000,0
111 | 1001,0
112 | 1002,0
113 | 1003,1
114 | 1004,1
115 | 1005,1
116 | 1006,1
117 | 1007,0
118 | 1008,0
119 | 1009,1
120 | 1010,1
121 | 1011,1
122 | 1012,1
123 | 1013,0
124 | 1014,1
125 | 1015,0
126 | 1016,0
127 | 1017,1
128 | 1018,0
129 | 1019,1
130 | 1020,0
131 | 1021,0
132 | 1022,0
133 | 1023,0
134 | 1024,0
135 | 1025,0
136 | 1026,0
137 | 1027,0
138 | 1028,0
139 | 1029,0
140 | 1030,0
141 | 1031,0
142 | 1032,0
143 | 1033,1
144 | 1034,1
145 | 1035,0
146 | 1036,0
147 | 1037,0
148 | 1038,0
149 | 1039,0
150 | 1040,0
151 | 1041,0
152 | 1042,1
153 | 1043,0
154 | 1044,0
155 | 1045,1
156 | 1046,0
157 | 1047,0
158 | 1048,1
159 | 1049,0
160 | 1050,1
161 | 1051,0
162 | 1052,1
163 | 1053,1
164 | 1054,1
165 | 1055,0
166 | 1056,0
167 | 1057,1
168 | 1058,0
169 | 1059,0
170 | 1060,1
171 | 1061,0
172 | 1062,0
173 | 1063,0
174 | 1064,0
175 | 1065,0
176 | 1066,0
177 | 1067,1
178 | 1068,1
179 | 1069,1
180 | 1070,1
181 | 1071,1
182 | 1072,0
183 | 1073,0
184 | 1074,1
185 | 1075,0
186 | 1076,1
187 | 1077,0
188 | 1078,1
189 | 1079,0
190 | 1080,0
191 | 1081,0
192 | 1082,0
193 | 1083,0
194 | 1084,0
195 | 1085,0
196 | 1086,1
197 | 1087,0
198 | 1088,1
199 | 1089,0
200 | 1090,0
201 | 1091,0
202 | 1092,1
203 | 1093,1
204 | 1094,0
205 | 1095,1
206 | 1096,0
207 | 1097,0
208 | 1098,0
209 | 1099,0
210 | 1100,1
211 | 1101,0
212 | 1102,0
213 | 1103,0
214 | 1104,0
215 | 1105,1
216 | 1106,0
217 | 1107,0
218 | 1108,1
219 | 1109,1
220 | 1110,1
221 | 1111,0
222 | 1112,1
223 | 1113,0
224 | 1114,1
225 | 1115,0
226 | 1116,1
227 | 1117,1
228 | 1118,0
229 | 1119,0
230 | 1120,0
231 | 1121,0
232 | 1122,0
233 | 1123,1
234 | 1124,0
235 | 1125,0
236 | 1126,1
237 | 1127,0
238 | 1128,0
239 | 1129,0
240 | 1130,1
241 | 1131,1
242 | 1132,1
243 | 1133,1
244 | 1134,1
245 | 1135,0
246 | 1136,0
247 | 1137,0
248 | 1138,1
249 | 1139,0
250 | 1140,1
251 | 1141,0
252 | 1142,1
253 | 1143,0
254 | 1144,0
255 | 1145,0
256 | 1146,0
257 | 1147,0
258 | 1148,0
259 | 1149,0
260 | 1150,1
261 | 1151,0
262 | 1152,0
263 | 1153,0
264 | 1154,1
265 | 1155,1
266 | 1156,0
267 | 1157,0
268 | 1158,0
269 | 1159,0
270 | 1160,1
271 | 1161,0
272 | 1162,1
273 | 1163,0
274 | 1164,1
275 | 1165,1
276 | 1166,0
277 | 1167,1
278 | 1168,0
279 | 1169,0
280 | 1170,0
281 | 1171,0
282 | 1172,0
283 | 1173,1
284 | 1174,0
285 | 1175,1
286 | 1176,1
287 | 1177,0
288 | 1178,0
289 | 1179,0
290 | 1180,0
291 | 1181,0
292 | 1182,0
293 | 1183,1
294 | 1184,0
295 | 1185,0
296 | 1186,0
297 | 1187,0
298 | 1188,1
299 | 1189,1
300 | 1190,0
301 | 1191,0
302 | 1192,0
303 | 1193,0
304 | 1194,0
305 | 1195,0
306 | 1196,0
307 | 1197,1
308 | 1198,0
309 | 1199,1
310 | 1200,1
311 | 1201,0
312 | 1202,0
313 | 1203,0
314 | 1204,0
315 | 1205,0
316 | 1206,1
317 | 1207,1
318 | 1208,1
319 | 1209,0
320 | 1210,0
321 | 1211,0
322 | 1212,0
323 | 1213,0
324 | 1214,0
325 | 1215,1
326 | 1216,1
327 | 1217,0
328 | 1218,1
329 | 1219,0
330 | 1220,0
331 | 1221,0
332 | 1222,1
333 | 1223,1
334 | 1224,0
335 | 1225,1
336 | 1226,0
337 | 1227,0
338 | 1228,0
339 | 1229,0
340 | 1230,0
341 | 1231,0
342 | 1232,0
343 | 1233,0
344 | 1234,0
345 | 1235,1
346 | 1236,0
347 | 1237,1
348 | 1238,0
349 | 1239,1
350 | 1240,0
351 | 1241,1
352 | 1242,1
353 | 1243,0
354 | 1244,0
355 | 1245,0
356 | 1246,1
357 | 1247,0
358 | 1248,1
359 | 1249,0
360 | 1250,0
361 | 1251,1
362 | 1252,0
363 | 1253,1
364 | 1254,1
365 | 1255,0
366 | 1256,1
367 | 1257,0
368 | 1258,0
369 | 1259,0
370 | 1260,1
371 | 1261,0
372 | 1262,0
373 | 1263,1
374 | 1264,0
375 | 1265,0
376 | 1266,1
377 | 1267,1
378 | 1268,0
379 | 1269,0
380 | 1270,0
381 | 1271,0
382 | 1272,0
383 | 1273,0
384 | 1274,0
385 | 1275,1
386 | 1276,0
387 | 1277,1
388 | 1278,0
389 | 1279,0
390 | 1280,0
391 | 1281,1
392 | 1282,1
393 | 1283,1
394 | 1284,0
395 | 1285,0
396 | 1286,0
397 | 1287,1
398 | 1288,0
399 | 1289,1
400 | 1290,0
401 | 1291,0
402 | 1292,1
403 | 1293,0
404 | 1294,1
405 | 1295,0
406 | 1296,0
407 | 1297,0
408 | 1298,0
409 | 1299,0
410 | 1300,1
411 | 1301,1
412 | 1302,0
413 | 1303,1
414 | 1304,1
415 | 1305,0
416 | 1306,1
417 | 1307,0
418 | 1308,0
419 | 1309,1
420 | 


--------------------------------------------------------------------------------
/test/scicloj/ml/tutorials_test.clj:
--------------------------------------------------------------------------------
1 | (ns scicloj.ml.tutorials-test
2 |   (:require [clojure.test :refer :all]
3 |             [scicloj.ml.tutorials :refer :all]))
4 | ;; NOTE(review): no scicloj/ml/tutorials.clj appears in the listed source tree - confirm this require resolves.
5 | (deftest a-test ; placeholder test; it exercises no project code
6 |   (testing "FIXME, I fail."
7 |     (is (= 0 1)))) ; 0 never equals 1, so this assertion always fails until replaced with a real check
8 | 


--------------------------------------------------------------------------------