├── .gitignore
├── CHANGELOG.md
├── LICENSE
├── README.md
├── bb.edn
├── bin
└── launchpad
├── data
├── marketing.csv
├── titanic
│ ├── test.csv
│ ├── titanic.zip
│ └── train.csv
└── tweets_sentiment.feather
├── deps.edn
├── deps.local.edn
├── doc
└── intro.md
├── docs
├── gorilla-notes
│ └── js
│ │ └── compiled
│ │ └── main.js
├── interactions_ols.html
├── notespace-files
│ └── tree.svg
├── polyglot_kmeans.html
├── tune-titanic.html
├── userguide-advanced.html
├── userguide-categrical.html
├── userguide-experiment-tracking.html
├── userguide-intro.html
├── userguide-models.html
├── userguide-sklearnclj.html
├── userguide-third_party.html
├── userguide-titanic.html
├── userguide-transformers.html
└── userguide-unsupervised.html
├── render_all.clj
├── render_titanic.clj
├── render_tune-titanic.clj
├── resources
├── .keep
└── logback.xml
├── src
└── scicloj
│ └── ml
│ ├── advanced.clj
│ ├── categorical.clj
│ ├── experiment_tracking.clj
│ ├── interactions_ols.clj
│ ├── intro.clj
│ ├── models.clj
│ ├── nested_cv.clj
│ ├── polyglot_kmeans.clj
│ ├── sklearnclj.clj
│ ├── third_party.clj
│ ├── titanic.clj
│ ├── transformers.clj
│ ├── tune_titanic.clj
│ ├── ug_utils.clj
│ ├── ug_utils_clerk.clj
│ └── unsupervised.clj
├── submission.csv
└── test
└── scicloj
└── ml
└── tutorials_test.clj
/.gitignore:
--------------------------------------------------------------------------------
1 | /target
2 | /classes
3 | /checkouts
4 | *.jar
5 | *.class
6 | /.calva/output-window/
7 | /.cpcache
8 | /.lein-*
9 | /.lsp/sqlite*.db
10 | /.nrepl-history
11 | /.nrepl-port
12 | /.rebel_readline_history
13 | /.socket-repl-port
14 | .hgignore
15 | .hg/
16 | /.cache/
17 | /.classpath
18 | /.clj-kondo/
19 | /.lsp/
20 | /.project
21 | /.settings/
22 | /.clerk/
23 | /.vscode/
24 | /cache_dir/
25 | /docs/scicloj/
26 | /public/
27 | /runs/
28 |
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # Change Log
2 | All notable changes to this project will be documented in this file. This change log follows the conventions of [keepachangelog.com](http://keepachangelog.com/).
3 |
4 | ## [Unreleased]
5 | ### Changed
6 | - Add a new arity to `make-widget-async` to provide a different widget shape.
7 |
8 | ## [0.1.1] - 2021-09-06
9 | ### Changed
10 | - Documentation on how to make the widgets.
11 |
12 | ### Removed
13 | - `make-widget-sync` - we're all async, all the time.
14 |
15 | ### Fixed
16 | - Fixed widget maker to keep working when daylight savings switches over.
17 |
18 | ## 0.1.0 - 2021-09-06
19 | ### Added
20 | - Files from the new template.
21 | - Widget maker public API - `make-widget-sync`.
22 |
23 | [Unreleased]: https://github.com/scicloj/ml.tutorials/compare/0.1.1...HEAD
24 | [0.1.1]: https://github.com/scicloj/ml.tutorials/compare/0.1.0...0.1.1
25 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | THE ACCOMPANYING PROGRAM IS PROVIDED UNDER THE TERMS OF THIS ECLIPSE PUBLIC
2 | LICENSE ("AGREEMENT"). ANY USE, REPRODUCTION OR DISTRIBUTION OF THE PROGRAM
3 | CONSTITUTES RECIPIENT'S ACCEPTANCE OF THIS AGREEMENT.
4 |
5 | 1. DEFINITIONS
6 |
7 | "Contribution" means:
8 |
9 | a) in the case of the initial Contributor, the initial code and
10 | documentation distributed under this Agreement, and
11 |
12 | b) in the case of each subsequent Contributor:
13 |
14 | i) changes to the Program, and
15 |
16 | ii) additions to the Program;
17 |
18 | where such changes and/or additions to the Program originate from and are
19 | distributed by that particular Contributor. A Contribution 'originates' from
20 | a Contributor if it was added to the Program by such Contributor itself or
21 | anyone acting on such Contributor's behalf. Contributions do not include
22 | additions to the Program which: (i) are separate modules of software
23 | distributed in conjunction with the Program under their own license
24 | agreement, and (ii) are not derivative works of the Program.
25 |
26 | "Contributor" means any person or entity that distributes the Program.
27 |
28 | "Licensed Patents" mean patent claims licensable by a Contributor which are
29 | necessarily infringed by the use or sale of its Contribution alone or when
30 | combined with the Program.
31 |
32 | "Program" means the Contributions distributed in accordance with this
33 | Agreement.
34 |
35 | "Recipient" means anyone who receives the Program under this Agreement,
36 | including all Contributors.
37 |
38 | 2. GRANT OF RIGHTS
39 |
40 | a) Subject to the terms of this Agreement, each Contributor hereby grants
41 | Recipient a non-exclusive, worldwide, royalty-free copyright license to
42 | reproduce, prepare derivative works of, publicly display, publicly perform,
43 | distribute and sublicense the Contribution of such Contributor, if any, and
44 | such derivative works, in source code and object code form.
45 |
46 | b) Subject to the terms of this Agreement, each Contributor hereby grants
47 | Recipient a non-exclusive, worldwide, royalty-free patent license under
48 | Licensed Patents to make, use, sell, offer to sell, import and otherwise
49 | transfer the Contribution of such Contributor, if any, in source code and
50 | object code form. This patent license shall apply to the combination of the
51 | Contribution and the Program if, at the time the Contribution is added by the
52 | Contributor, such addition of the Contribution causes such combination to be
53 | covered by the Licensed Patents. The patent license shall not apply to any
54 | other combinations which include the Contribution. No hardware per se is
55 | licensed hereunder.
56 |
57 | c) Recipient understands that although each Contributor grants the licenses
58 | to its Contributions set forth herein, no assurances are provided by any
59 | Contributor that the Program does not infringe the patent or other
60 | intellectual property rights of any other entity. Each Contributor disclaims
61 | any liability to Recipient for claims brought by any other entity based on
62 | infringement of intellectual property rights or otherwise. As a condition to
63 | exercising the rights and licenses granted hereunder, each Recipient hereby
64 | assumes sole responsibility to secure any other intellectual property rights
65 | needed, if any. For example, if a third party patent license is required to
66 | allow Recipient to distribute the Program, it is Recipient's responsibility
67 | to acquire that license before distributing the Program.
68 |
69 | d) Each Contributor represents that to its knowledge it has sufficient
70 | copyright rights in its Contribution, if any, to grant the copyright license
71 | set forth in this Agreement.
72 |
73 | 3. REQUIREMENTS
74 |
75 | A Contributor may choose to distribute the Program in object code form under
76 | its own license agreement, provided that:
77 |
78 | a) it complies with the terms and conditions of this Agreement; and
79 |
80 | b) its license agreement:
81 |
82 | i) effectively disclaims on behalf of all Contributors all warranties and
83 | conditions, express and implied, including warranties or conditions of title
84 | and non-infringement, and implied warranties or conditions of merchantability
85 | and fitness for a particular purpose;
86 |
87 | ii) effectively excludes on behalf of all Contributors all liability for
88 | damages, including direct, indirect, special, incidental and consequential
89 | damages, such as lost profits;
90 |
91 | iii) states that any provisions which differ from this Agreement are offered
92 | by that Contributor alone and not by any other party; and
93 |
94 | iv) states that source code for the Program is available from such
95 | Contributor, and informs licensees how to obtain it in a reasonable manner on
96 | or through a medium customarily used for software exchange.
97 |
98 | When the Program is made available in source code form:
99 |
100 | a) it must be made available under this Agreement; and
101 |
102 | b) a copy of this Agreement must be included with each copy of the Program.
103 |
104 | Contributors may not remove or alter any copyright notices contained within
105 | the Program.
106 |
107 | Each Contributor must identify itself as the originator of its Contribution,
108 | if any, in a manner that reasonably allows subsequent Recipients to identify
109 | the originator of the Contribution.
110 |
111 | 4. COMMERCIAL DISTRIBUTION
112 |
113 | Commercial distributors of software may accept certain responsibilities with
114 | respect to end users, business partners and the like. While this license is
115 | intended to facilitate the commercial use of the Program, the Contributor who
116 | includes the Program in a commercial product offering should do so in a
117 | manner which does not create potential liability for other Contributors.
118 | Therefore, if a Contributor includes the Program in a commercial product
119 | offering, such Contributor ("Commercial Contributor") hereby agrees to defend
120 | and indemnify every other Contributor ("Indemnified Contributor") against any
121 | losses, damages and costs (collectively "Losses") arising from claims,
122 | lawsuits and other legal actions brought by a third party against the
123 | Indemnified Contributor to the extent caused by the acts or omissions of such
124 | Commercial Contributor in connection with its distribution of the Program in
125 | a commercial product offering. The obligations in this section do not apply
126 | to any claims or Losses relating to any actual or alleged intellectual
127 | property infringement. In order to qualify, an Indemnified Contributor must:
128 | a) promptly notify the Commercial Contributor in writing of such claim, and
129 | b) allow the Commercial Contributor to control, and cooperate with the
130 | Commercial Contributor in, the defense and any related settlement
131 | negotiations. The Indemnified Contributor may participate in any such claim
132 | at its own expense.
133 |
134 | For example, a Contributor might include the Program in a commercial product
135 | offering, Product X. That Contributor is then a Commercial Contributor. If
136 | that Commercial Contributor then makes performance claims, or offers
137 | warranties related to Product X, those performance claims and warranties are
138 | such Commercial Contributor's responsibility alone. Under this section, the
139 | Commercial Contributor would have to defend claims against the other
140 | Contributors related to those performance claims and warranties, and if a
141 | court requires any other Contributor to pay any damages as a result, the
142 | Commercial Contributor must pay those damages.
143 |
144 | 5. NO WARRANTY
145 |
146 | EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, THE PROGRAM IS PROVIDED ON
147 | AN "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER
148 | EXPRESS OR IMPLIED INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OR
149 | CONDITIONS OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY OR FITNESS FOR A
150 | PARTICULAR PURPOSE. Each Recipient is solely responsible for determining the
151 | appropriateness of using and distributing the Program and assumes all risks
152 | associated with its exercise of rights under this Agreement , including but
153 | not limited to the risks and costs of program errors, compliance with
154 | applicable laws, damage to or loss of data, programs or equipment, and
155 | unavailability or interruption of operations.
156 |
157 | 6. DISCLAIMER OF LIABILITY
158 |
159 | EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, NEITHER RECIPIENT NOR ANY
160 | CONTRIBUTORS SHALL HAVE ANY LIABILITY FOR ANY DIRECT, INDIRECT, INCIDENTAL,
161 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING WITHOUT LIMITATION
162 | LOST PROFITS), HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
163 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
164 | ARISING IN ANY WAY OUT OF THE USE OR DISTRIBUTION OF THE PROGRAM OR THE
165 | EXERCISE OF ANY RIGHTS GRANTED HEREUNDER, EVEN IF ADVISED OF THE POSSIBILITY
166 | OF SUCH DAMAGES.
167 |
168 | 7. GENERAL
169 |
170 | If any provision of this Agreement is invalid or unenforceable under
171 | applicable law, it shall not affect the validity or enforceability of the
172 | remainder of the terms of this Agreement, and without further action by the
173 | parties hereto, such provision shall be reformed to the minimum extent
174 | necessary to make such provision valid and enforceable.
175 |
176 | If Recipient institutes patent litigation against any entity (including a
177 | cross-claim or counterclaim in a lawsuit) alleging that the Program itself
178 | (excluding combinations of the Program with other software or hardware)
179 | infringes such Recipient's patent(s), then such Recipient's rights granted
180 | under Section 2(b) shall terminate as of the date such litigation is filed.
181 |
182 | All Recipient's rights under this Agreement shall terminate if it fails to
183 | comply with any of the material terms or conditions of this Agreement and
184 | does not cure such failure in a reasonable period of time after becoming
185 | aware of such noncompliance. If all Recipient's rights under this Agreement
186 | terminate, Recipient agrees to cease use and distribution of the Program as
187 | soon as reasonably practicable. However, Recipient's obligations under this
188 | Agreement and any licenses granted by Recipient relating to the Program shall
189 | continue and survive.
190 |
191 | Everyone is permitted to copy and distribute copies of this Agreement, but in
192 | order to avoid inconsistency the Agreement is copyrighted and may only be
193 | modified in the following manner. The Agreement Steward reserves the right to
194 | publish new versions (including revisions) of this Agreement from time to
195 | time. No one other than the Agreement Steward has the right to modify this
196 | Agreement. The Eclipse Foundation is the initial Agreement Steward. The
197 | Eclipse Foundation may assign the responsibility to serve as the Agreement
198 | Steward to a suitable separate entity. Each new version of the Agreement will
199 | be given a distinguishing version number. The Program (including
200 | Contributions) may always be distributed subject to the version of the
201 | Agreement under which it was received. In addition, after a new version of
202 | the Agreement is published, Contributor may elect to distribute the Program
203 | (including its Contributions) under the new version. Except as expressly
204 | stated in Sections 2(a) and 2(b) above, Recipient receives no rights or
205 | licenses to the intellectual property of any Contributor under this
206 | Agreement, whether expressly, by implication, estoppel or otherwise. All
207 | rights in the Program not expressly granted under this Agreement are
208 | reserved.
209 |
210 | This Agreement is governed by the laws of the State of New York and the
211 | intellectual property laws of the United States of America. No party to this
212 | Agreement will bring a legal action under this Agreement more than one year
213 | after the cause of action arose. Each party waives its rights to a jury trial
214 | in any resulting litigation.
215 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | >[!NOTE]
2 | >***
3 | >The usage of the shim `scicloj.ml` is now considered deprecated. The underlying libraries should be used directly or via
4 | >[noj](https://github.com/scicloj/noj), a new library that combines several of these libraries without remapping the namespaces.
5 | >It also contains updated versions of several of the tutorials here.
6 | >The code inside the tutorials is still valid and mostly working, but the functions are in different namespaces when
7 | >used without `scicloj.ml`.
8 | >***
9 |
10 |
11 | # Tutorials for [scicloj.ml](https://github.com/scicloj/scicloj.ml)
12 |
13 | The Clojure machine learning library scicloj.ml is documented here:
14 |
15 | * [Userguide - introduction](https://scicloj.github.io/scicloj.ml-tutorials/userguide-intro.html)
16 | * [Userguide - advanced](https://scicloj.github.io/scicloj.ml-tutorials/userguide-advanced.html)
17 | * [Userguide - categorical](https://scicloj.github.io/scicloj.ml-tutorials/userguide-categrical.html)
18 | * [Reference of ML models](https://scicloj.github.io/scicloj.ml-tutorials/userguide-models.html)
19 | * [Reference of transformer functions](https://scicloj.github.io/scicloj.ml-tutorials/userguide-transformers.html)
20 | * [Example usage - predict titanic survival](https://scicloj.github.io/scicloj.ml-tutorials/userguide-titanic.html)
21 | * [Example usage - hyperparameter tuning of a pipeline](https://scicloj.github.io/scicloj.ml-tutorials/tune-titanic.html)
22 | * [How to use sklearn models](https://scicloj.github.io/scicloj.ml-tutorials/userguide-sklearnclj.html)
23 | * [Reference of other libraries integrated with scicloj.ml](https://scicloj.github.io/scicloj.ml-tutorials/userguide-third_party.html)
24 | * [kmeans in Python vs Clojure](https://scicloj.github.io/scicloj.ml-tutorials/polyglot_kmeans.html)
25 | * [Experiment tracking](https://scicloj.github.io/scicloj.ml-tutorials/userguide-experiment-tracking.html)
26 | * [Unsupervised learning](https://scicloj.github.io/scicloj.ml-tutorials/userguide-unsupervised.html)
27 | * [Variable interaction in linear regression](https://scicloj.github.io/scicloj.ml-tutorials/interactions_ols.html)
28 |
29 |
30 | The source files for this documentation, written with [notespace](https://github.com/scicloj/notespace)
31 | and [Clerk](https://github.com/nextjournal/clerk), are in this repository.
32 |
--------------------------------------------------------------------------------
/bb.edn:
--------------------------------------------------------------------------------
1 | {:deps {com.lambdaisland/launchpad {:mvn/version "0.9.49-alpha"}}}
2 |
3 |
--------------------------------------------------------------------------------
/bin/launchpad:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bb
2 |
3 | (require '[lambdaisland.launchpad :as launchpad])
4 |
5 | (launchpad/main {})
6 |
7 | ;; (launchpad/main {:steps (into [(partial launchpad/ensure-java-version 17)]
8 | ;; launchpad/default-steps)})
9 |
--------------------------------------------------------------------------------
/data/marketing.csv:
--------------------------------------------------------------------------------
1 | youtube,facebook,newspaper,sales
2 | 276.12,45.35999999999999,83.04,26.52
3 | 53.4,47.16,54.12,12.48
4 | 20.639999999999997,55.08,83.16,11.16
5 | 181.79999999999998,49.559999999999995,70.2,22.2
6 | 216.96,12.96,70.08,15.48
7 | 10.44,58.67999999999999,90,8.64
8 | 69,39.35999999999999,28.2,14.16
9 | 144.24,23.52,13.92,15.839999999999998
10 | 10.319999999999999,2.52,1.2,5.76
11 | 239.76,3.12,25.439999999999998,12.719999999999999
12 | 79.32,6.96,29.04,10.319999999999999
13 | 257.64,28.799999999999997,4.8,20.88
14 | 28.56,42.12,79.08,11.04
15 | 117,9.12,8.64,11.639999999999999
16 | 244.92,39.48,55.199999999999996,22.8
17 | 234.48,57.24,63.48,26.88
18 | 81.36,43.92,136.79999999999998,15
19 | 337.67999999999995,47.52,66.96,29.279999999999998
20 | 83.04,24.599999999999998,21.96,13.56
21 | 176.76000000000002,28.679999999999996,22.92,17.52
22 | 262.08,33.239999999999995,64.08,21.599999999999998
23 | 284.88,6.119999999999999,28.2,15
24 | 15.839999999999998,19.08,59.519999999999996,6.72
25 | 273.96,20.279999999999998,31.439999999999998,18.599999999999998
26 | 74.75999999999999,15.12,21.96,11.639999999999999
27 | 315.47999999999996,4.2,23.4,14.399999999999999
28 | 171.48,35.16,15.12,18
29 | 288.12,20.04,27.479999999999997,19.08
30 | 298.56,32.52,27.479999999999997,22.679999999999996
31 | 84.71999999999998,19.2,48.959999999999994,12.6
32 | 351.47999999999996,33.96,51.84,25.679999999999996
33 | 135.48,20.88,46.32,14.28
34 | 116.64,1.7999999999999998,36,11.52
35 | 318.72,24,0.36,20.88
36 | 114.84,1.68,8.88,11.4
37 | 348.84,4.919999999999999,10.2,15.36
38 | 320.28,52.559999999999995,6,30.479999999999997
39 | 89.64,59.279999999999994,54.84,17.639999999999997
40 | 51.72,32.04,42.12,12.12
41 | 273.59999999999997,45.24,38.4,25.8
42 | 243,26.76,37.92,19.92
43 | 212.4,40.08,46.440000000000005,20.52
44 | 352.32,33.239999999999995,2.16,24.84
45 | 248.28,10.08,31.679999999999996,15.48
46 | 30.12,30.839999999999996,51.959999999999994,10.2
47 | 210.11999999999998,27,37.8,17.88
48 | 107.64,11.88,42.84,12.719999999999999
49 | 287.88,49.8,22.2,27.84
50 | 272.64,18.96,59.879999999999995,17.76
51 | 80.28,14.04,44.16,11.639999999999999
52 | 239.76,3.7199999999999998,41.52,13.68
53 | 120.48,11.52,4.32,12.839999999999998
54 | 259.68,50.04,47.52,27.12
55 | 219.11999999999998,55.440000000000005,70.44,25.439999999999998
56 | 315.23999999999995,34.56,19.08,24.24
57 | 238.68,59.279999999999994,72,28.439999999999998
58 | 8.76,33.72,49.68,6.6
59 | 163.43999999999997,23.04,19.92,15.839999999999998
60 | 252.96,59.519999999999996,45.24,28.56
61 | 252.83999999999997,35.4,11.16,22.08
62 | 64.2,2.4,25.679999999999996,9.719999999999999
63 | 313.56,51.24,65.64,29.04
64 | 287.16,18.599999999999998,32.76,18.84
65 | 123.24,35.52,10.08,16.8
66 | 157.32,51.35999999999999,34.68,21.599999999999998
67 | 82.8,11.16,1.08,11.16
68 | 37.8,29.52,2.64,11.4
69 | 167.16,17.4,12.239999999999998,16.08
70 | 284.88,33,13.2,22.679999999999996
71 | 260.16,52.68,32.64,26.76
72 | 238.92,36.72,46.440000000000005,21.96
73 | 131.76,17.16,38.04,14.879999999999999
74 | 32.16,39.6,23.16,10.56
75 | 155.28,6.84,37.56,13.2
76 | 256.08,29.52,15.719999999999999,20.4
77 | 20.279999999999998,52.440000000000005,107.28,10.44
78 | 33,1.92,24.84,8.28
79 | 144.6,34.199999999999996,17.04,17.04
80 | 6.48,35.879999999999995,11.28,6.359999999999999
81 | 139.2,9.24,27.720000000000002,13.2
82 | 91.68,32.04,26.76,14.16
83 | 287.76,4.919999999999999,44.279999999999994,14.76
84 | 90.36,24.36,39,13.56
85 | 82.08,53.4,42.72,16.32
86 | 256.2,51.6,40.559999999999995,26.04
87 | 231.83999999999997,22.08,78.84,18.24
88 | 91.55999999999999,33,19.2,14.399999999999999
89 | 132.84,48.72,75.84,19.2
90 | 105.96,30.599999999999998,88.08,15.48
91 | 131.76,57.35999999999999,61.67999999999999,20.04
92 | 161.16,5.88,11.16,13.44
93 | 34.32,1.7999999999999998,39.6,8.76
94 | 261.23999999999995,40.199999999999996,70.8,23.279999999999998
95 | 301.08,43.8,86.75999999999999,26.639999999999997
96 | 128.88,16.8,13.08,13.799999999999999
97 | 195.96,37.92,63.48,20.279999999999998
98 | 237.11999999999998,4.2,7.08,14.04
99 | 221.88,25.2,26.4,18.599999999999998
100 | 347.64,50.76,61.44,30.479999999999997
101 | 162.23999999999998,50.04,55.08,20.639999999999997
102 | 266.88,5.159999999999999,59.75999999999999,14.04
103 | 355.67999999999995,43.559999999999995,121.08,28.56
104 | 336.23999999999995,12.12,25.679999999999996,17.76
105 | 225.48,20.639999999999997,21.479999999999997,17.639999999999997
106 | 285.84,41.16,6.359999999999999,24.84
107 | 165.48,55.68,70.8,23.04
108 | 30,13.2,35.64,8.64
109 | 108.48,0.36,27.84,10.44
110 | 15.719999999999999,0.48,30.72,6.359999999999999
111 | 306.48,32.279999999999994,6.6,23.76
112 | 270.96,9.839999999999998,67.8,16.08
113 | 290.03999999999996,45.6,27.84,26.16
114 | 210.83999999999997,18.48,2.88,16.919999999999998
115 | 251.51999999999998,24.720000000000002,12.839999999999998,19.08
116 | 93.84,56.16,41.4,17.52
117 | 90.11999999999999,42,63.24,15.12
118 | 167.04,17.16,30.72,14.639999999999999
119 | 91.68,0.96,17.76,11.28
120 | 150.84,44.279999999999994,95.04,19.08
121 | 23.279999999999998,19.2,26.76,7.919999999999999
122 | 169.56,32.16,55.440000000000005,18.599999999999998
123 | 22.56,26.04,60.48,8.4
124 | 268.8,2.88,18.72,13.92
125 | 147.72,41.52,14.879999999999999,18.24
126 | 275.4,38.76,89.04,23.639999999999997
127 | 104.64,14.16,31.08,12.719999999999999
128 | 9.36,46.68,60.72,7.919999999999999
129 | 96.24,0,11.04,10.56
130 | 264.36,58.8,3.84,29.639999999999997
131 | 71.52,14.399999999999999,51.72,11.639999999999999
132 | 0.84,47.52,10.44,1.92
133 | 318.23999999999995,3.48,51.6,15.239999999999998
134 | 10.08,32.64,2.52,6.84
135 | 263.76,40.199999999999996,54.12,23.52
136 | 44.279999999999994,46.32,78.71999999999998,12.96
137 | 57.959999999999994,56.4,10.2,13.92
138 | 30.72,46.8,11.16,11.4
139 | 328.44,34.68,71.64,24.96
140 | 51.6,31.08,24.599999999999998,11.52
141 | 221.88,52.68,2.04,24.84
142 | 88.08,20.4,15.48,13.08
143 | 232.43999999999997,42.48,90.71999999999998,23.04
144 | 264.59999999999997,39.84,45.48,24.12
145 | 125.51999999999998,6.84,41.279999999999994,12.48
146 | 115.44,17.76,46.68,13.68
147 | 168.36,2.28,10.799999999999999,12.360000000000001
148 | 288.12,8.76,10.44,15.839999999999998
149 | 291.84,58.8,53.16,30.479999999999997
150 | 45.6,48.35999999999999,14.28,13.08
151 | 53.64,30.96,24.720000000000002,12.12
152 | 336.84,16.68,44.4,19.32
153 | 145.2,10.08,58.44,13.92
154 | 237.11999999999998,27.96,17.04,19.92
155 | 205.56,47.64,45.24,22.8
156 | 225.36,25.32,11.4,18.72
157 | 4.919999999999999,13.92,6.84,3.84
158 | 112.68,52.199999999999996,60.599999999999994,18.36
159 | 179.76000000000002,1.56,29.16,12.12
160 | 14.04,44.279999999999994,54.24,8.76
161 | 158.04,22.08,41.52,15.48
162 | 207,21.720000000000002,36.839999999999996,17.28
163 | 102.84,42.959999999999994,59.16,15.96
164 | 226.08,21.720000000000002,30.72,17.88
165 | 196.2,44.16,8.88,21.599999999999998
166 | 140.64,17.639999999999997,6.48,14.28
167 | 281.4,4.08,101.75999999999999,14.28
168 | 21.479999999999997,45.12,25.92,9.6
169 | 248.16,6.24,23.279999999999998,14.639999999999999
170 | 258.48,28.32,69.12,20.52
171 | 341.16,12.719999999999999,7.68,18
172 | 60,13.92,22.08,10.08
173 | 197.4,25.08,56.879999999999995,17.4
174 | 23.52,24.12,20.4,9.12
175 | 202.08,8.52,15.36,14.04
176 | 266.88,4.08,15.719999999999999,13.799999999999999
177 | 332.28,58.67999999999999,50.16,32.4
178 | 298.08,36.239999999999995,24.36,24.24
179 | 204.23999999999998,9.36,42.24,14.04
180 | 332.03999999999996,2.76,28.439999999999998,14.16
181 | 198.72,12,21.12,15.12
182 | 187.92,3.12,9.96,12.6
183 | 262.2,6.48,32.879999999999995,14.639999999999999
184 | 67.44,6.84,35.64,10.44
185 | 345.12,51.6,86.16,31.439999999999998
186 | 304.56,25.56,36,21.12
187 | 246,54.12,23.52,27.12
188 | 167.4,2.52,31.92,12.360000000000001
189 | 229.32,34.44,21.84,20.76
190 | 343.2,16.68,4.44,19.08
191 | 22.439999999999998,14.52,28.08,8.04
192 | 47.4,49.32,6.96,12.96
193 | 90.6,12.96,7.199999999999999,11.88
194 | 20.639999999999997,4.919999999999999,37.92,7.08
195 | 200.16,50.4,4.32,23.52
196 | 179.64,42.72,7.199999999999999,20.76
197 | 45.84,4.44,16.56,9.12
198 | 113.04,5.88,9.719999999999999,11.639999999999999
199 | 212.4,11.16,7.68,15.36
200 | 340.32,50.4,79.44,30.599999999999998
201 | 278.52,10.319999999999999,10.44,16.08
202 |
--------------------------------------------------------------------------------
/data/titanic/titanic.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scicloj/scicloj.ml-tutorials/951cde0b8bd0a1b22ec856d28a6122d69d34836f/data/titanic/titanic.zip
--------------------------------------------------------------------------------
/data/tweets_sentiment.feather:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scicloj/scicloj.ml-tutorials/951cde0b8bd0a1b22ec856d28a6122d69d34836f/data/tweets_sentiment.feather
--------------------------------------------------------------------------------
/deps.edn:
--------------------------------------------------------------------------------
1 | {:paths ["src" "resources"]
2 |
3 |
4 | :deps {
5 | io.github.nextjournal/clerk {:git/sha "a6bfc832a182ef3068d60a318985681ddb913595"
6 | :git/url "https://github.com/nextjournal/clerk.git"}
7 |
8 | ;; {:mvn/version "0.11.603"}
9 | org.clojure/clojure {:mvn/version "1.11.1"}
10 |
11 | scicloj/scicloj.ml {:mvn/version "0.2.2"}
12 |
13 | org.scicloj/scicloj.ml.clj-djl {:mvn/version "0.1.11"}
14 | scicloj/sklearn-clj {:mvn/version "0.3.7"}
15 |
16 | org.apache.arrow/arrow-vector {:mvn/version "6.0.0"}
17 | org.lz4/lz4-java {:mvn/version "1.8.0"}
18 | com.github.luben/zstd-jni {:mvn/version "1.5.1-1"}
19 | org.clojure/tools.logging {:mvn/version "1.2.4"}
20 | com.fasterxml.jackson.core/jackson-databind {:mvn/version
21 | "2.13.2"}
22 | com.fasterxml.jackson.core/jackson-core {:mvn/version
23 | "2.13.2"}
24 |
25 | com.fasterxml.jackson.core/jackson-annotations {:mvn/version
26 | "2.13.2"}
27 |
28 | ch.qos.logback/logback-classic {:mvn/version "1.4.4"}
29 | scicloj/notespace {:mvn/version "3-beta9"}
30 |
31 | dk.simongray/datalinguist {:mvn/version "0.1.163"}
32 | applied-science/waqi {:git/url "https://github.com/applied-science/waqi/"
33 | :sha "faefe5dfd1b161ff70089924591ac2d699527811"}
34 | clj-python/libpython-clj {:mvn/version "2.020"}
35 | scicloj/clojisr {:mvn/version "1.0.0-BETA20"}
36 |
37 | generateme/fastmath {:mvn/version "2.1.6"}
38 | uncomplicate/neanderthal {:mvn/version "0.43.0"}
39 | aerial.hanami/aerial.hanami {:mvn/version "0.12.9"}
40 | net.clojars.behrica/cluster_eval {:git/url "https://github.com/behrica/cluster-eval.git"
41 | :sha "ca34283a67bf18c8025955865fb567bd6e2e9a9a"}}
42 | ;; appliedsciencestudio/rdata {:git/url "https://github.com/appliedsciencestudio/rdata/"
43 | ;; :sha "151e6dead06b38995f1f30b09d954a060f7a2a9c"}
44 |
45 |
46 |
47 |
48 | :aliases
49 |
50 |
51 | {
52 | :jdk-17
53 | {:jvm-opts ["--add-modules" "jdk.incubator.foreign"
54 | "--enable-native-access=ALL-UNNAMED"]}
55 |
56 | :reveal {:extra-deps {vlaaad/reveal {:mvn/version "1.3.250"}}
57 | :ns-default vlaaad.reveal
58 | :exec-fn repl}
59 | :reveal-nrepl-middleware
60 | {:extra-deps {vlaaad/reveal {:mvn/version "1.3.194"}}
61 | :main-opts ["-m" "nrepl.cmdline"
62 | "--middleware" "[vlaaad.reveal.nrepl/middleware,cider.nrepl/cider-middleware]"]}
63 |
64 | :jar {:replace-deps {com.github.seancorfield/depstar {:mvn/version "2.1.278"}}
65 | :exec-fn hf.depstar/jar
66 | :exec-args {:jar "ml.tutorials.jar" :sync-pom true}}
67 | :install {:replace-deps {slipset/deps-deploy {:mvn/version "0.1.5"}}
68 | :exec-fn deps-deploy.deps-deploy/deploy
69 | :exec-args {:installer :local :artifact "ml.tutorials.jar"}}
70 | :deploy {:replace-deps {slipset/deps-deploy {:mvn/version "0.1.5"}}
71 | :exec-fn deps-deploy.deps-deploy/deploy
72 | :exec-args {:installer :remote :artifact "ml.tutorials.jar"}}}}
73 |
--------------------------------------------------------------------------------
/deps.local.edn:
--------------------------------------------------------------------------------
1 | {;; regular deps.edn stuff will work in here
2 | :deps {}
3 | :aliases {}
4 |
5 | ;; but some extra keys are supported to influence launchpad itself
6 | :launchpad/aliases [:jdk-17 :test] ; additional aliases, will be added to whatever
7 | ; aliases you specify on the command line
8 | :launchpad/main-opts ["--emacs"]} ; additional CLI flags, so you can encode your
9 | ; own preferences
10 | ; (template leftover, presumably for :launchpad/shadow-build-ids, which is not set here:) which shadow builds to start, although it may
11 | ; be preferable to configure this as part of
12 | ; specific aliases in your main deps.edn
13 |
--------------------------------------------------------------------------------
/doc/intro.md:
--------------------------------------------------------------------------------
1 | # Introduction to ml.tutorials
2 |
3 | TODO: write [great documentation](http://jacobian.org/writing/what-to-write/)
4 |
--------------------------------------------------------------------------------
/docs/interactions_ols.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 | Loading ...
18 |
19 |
20 |
21 |
22 |
23 |
--------------------------------------------------------------------------------
/docs/notespace-files/tree.svg:
--------------------------------------------------------------------------------
1 |
2 |
4 |
6 |
7 |
9 |
10 | CART
11 |
12 |
13 |
14 | 1
15 |
16 | petal_length ≤ 2.45
17 | size = 150
18 | impurity reduction = 0.3333
19 |
20 |
21 |
22 | 2
23 |
24 | species = 0
25 | size = 50
26 | deviance = 3.8466
27 |
28 |
29 |
30 | 1->2
31 |
32 |
33 | True
34 |
35 |
36 |
37 | 3
38 |
39 | petal_width ≤ 1.75
40 | size = 100
41 | impurity reduction = 0.3897
42 |
43 |
44 |
45 | 1->3
46 |
47 |
48 | False
49 |
50 |
51 |
52 | 6
53 |
54 | sepal_length ≤ 7.1
55 | size = 54
56 | impurity reduction = 0.0311
57 |
58 |
59 |
60 | 3->6
61 |
62 |
63 |
64 |
65 |
66 | 7
67 |
68 | species = 2
69 | size = 46
70 | deviance = 12.0834
71 |
72 |
73 |
74 | 3->7
75 |
76 |
77 |
78 |
79 |
80 | 12
81 |
82 | petal_width ≤ 1.65
83 | size = 53
84 | impurity reduction = 0.0141
85 |
86 |
87 |
88 | 6->12
89 |
90 |
91 |
92 |
93 |
94 | 13
95 |
96 | species = 2
97 | size = 1
98 | deviance = 1.3863
99 |
100 |
101 |
102 | 6->13
103 |
104 |
105 |
106 |
107 |
108 | 24
109 |
110 | species = 1
111 | size = 51
112 | deviance = 24.9439
113 |
114 |
115 |
116 | 12->24
117 |
118 |
119 |
120 |
121 |
122 | 25
123 |
124 | sepal_width ≤ 2.75
125 | size = 2
126 | impurity reduction = 0.5000
127 |
128 |
129 |
130 | 12->25
131 |
132 |
133 |
134 |
135 |
136 | 50
137 |
138 | species = 2
139 | size = 1
140 | deviance = 1.3863
141 |
142 |
143 |
144 | 25->50
145 |
146 |
147 |
148 |
149 |
150 | 51
151 |
152 | species = 1
153 | size = 1
154 | deviance = 1.3863
155 |
156 |
157 |
158 | 25->51
159 |
160 |
161 |
162 |
163 |
164 |
--------------------------------------------------------------------------------
/docs/tune-titanic.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 | Loading ...
18 |
19 |
20 |
21 |
22 |
23 |
--------------------------------------------------------------------------------
/docs/userguide-categrical.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 | Loading ...
18 |
19 |
20 |
21 |
22 |
23 |
--------------------------------------------------------------------------------
/docs/userguide-intro.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 | Loading ...
18 |
19 |
20 |
21 |
22 |
23 |
--------------------------------------------------------------------------------
/render_all.clj:
--------------------------------------------------------------------------------
1 | (ns render-all
2 | (:require [notespace.cli :as cli]
3 | [notespace.api :as note]
4 | [clojure.java.shell]
5 | [nextjournal.clerk :as clerk]))
6 |
7 |
8 | (def nss [
9 | {:ns 'scicloj.ml.intro :output-file "docs/userguide-intro.html"}
10 | {:ns 'scicloj.ml.advanced :output-file "docs/userguide-advanced.html"}
11 | {:ns 'scicloj.ml.categorical :output-file "docs/userguide-categrical.html"}
12 | {:ns 'scicloj.ml.transformers :output-file "docs/userguide-transformers.html"}
13 | {:ns 'scicloj.ml.titanic :output-file "docs/userguide-titanic.html"}
14 | {:ns 'scicloj.ml.tune-titanic :output-file "docs/tune-titanic.html"}
15 | {:ns 'scicloj.ml.sklearnclj :output-file "docs/userguide-sklearnclj.html"}
16 | {:ns 'scicloj.ml.third-party :output-file "docs/userguide-third_party.html"}
17 | {:ns 'scicloj.ml.experiment-tracking :output-file "docs/userguide-experiment-tracking.html"}
18 | {:ns 'scicloj.ml.unsupervised :output-file "docs/userguide-unsupervised.html"}
19 | {:ns 'scicloj.ml.interactions-ols :output-file "docs/interactions_ols.html"}])
20 |
21 |
22 | (note/init :port 5678)
23 |
24 | ;; Evaluate and render every notespace entry listed in `nss`,
25 | ;; printing each entry before it is processed.
26 | (doseq [notebook nss]
27 |   (println "render ns: " notebook)
28 |   (cli/eval-and-render-a-notespace notebook))
29 |
30 |
31 | (require '[nextjournal.clerk :as clerk])
32 |
33 | ;; Build each Clerk notebook as a bundled page into output/ and move the
34 | ;; generated index.html to its final location under docs/.
35 | (doseq [[notebook-path html-target]
36 |         [["src/scicloj/ml/models.clj" "docs/userguide-models.html"]
37 |          ["src/scicloj/ml/polyglot_kmeans.clj" "docs/polyglot_kmeans.html"]]]
38 |   (clerk/build! {:paths [notebook-path]
39 |                  :bundle? true
40 |                  :out-path "output"})
41 |   (println
42 |    (clojure.java.shell/sh "mv" "output/index.html" html-target)))
43 |
44 |
45 |
46 |
47 | (System/exit 0)
48 |
--------------------------------------------------------------------------------
/render_titanic.clj:
--------------------------------------------------------------------------------
1 | (ns render-titanic
2 | (:require [notespace.cli :as cli]
3 | [notespace.api :as note]))
4 |
5 | (note/init :port 5678)
6 |
7 | (cli/eval-and-render-a-notespace {:ns 'scicloj.ml.titanic})
8 | (System/exit 0)
9 |
--------------------------------------------------------------------------------
/render_tune-titanic.clj:
--------------------------------------------------------------------------------
1 | ;; Script: renders the scicloj.ml.tune-titanic notespace, then exits the JVM.
2 | (ns render-tune-titanic
3 |   (:require [notespace.cli :as cli]
4 |             [notespace.api :as note]))
5 |
6 | (note/init :port 5678)
7 | (cli/eval-and-render-a-notespace {:ns 'scicloj.ml.tune-titanic})
8 | (System/exit 0)
9 |
--------------------------------------------------------------------------------
/resources/.keep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scicloj/scicloj.ml-tutorials/951cde0b8bd0a1b22ec856d28a6122d69d34836f/resources/.keep
--------------------------------------------------------------------------------
/resources/logback.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
6 |
7 | %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
--------------------------------------------------------------------------------
/src/scicloj/ml/categorical.clj:
--------------------------------------------------------------------------------
1 | (ns scicloj.ml.categorical
2 |
3 | (:require [notespace.api :as note]
4 | [notespace.kinds :as kind]))
5 |
6 |
7 | (comment
8 | (note/init-with-browser)
9 | (note/eval-and-realize-this-notespace))
10 |
11 | (require '[scicloj.ml.core :as ml]
12 | '[scicloj.ml.metamorph :as mm]
13 | '[scicloj.ml.dataset :as ds])
14 |
15 | ["# Handling of categorical variables"]
16 |
17 | ["We keep important information in the metadata of the column,
18 | which can be inspected"]
19 |
20 | ["## categorical -> number"]
21 | ["Categorical columns can be converted to numbers, which is needed by several ML models."]
22 |
23 | (def ds-cat
24 | (ds/dataset {:a [:x :y :x]}))
25 |
26 | ["inspect column metadata and observe datatype :keyword"]
27 | (-> ds-cat :a meta)
28 |
29 | ["convert categorical columns to numeric"]
30 | (def ds-number
31 | (ds/categorical->number
32 | ds-cat :all {} :int))
33 |
34 | ^kind/dataset
35 | ds-number
36 |
37 | ["metadata has changed as well, int now, and with a lookup table"]
38 | (-> ds-number :a meta)
39 |
40 |
41 |
42 |
43 | ["## categorical -> one-hot"]
44 | ["Categorical columns can be converted to one-hot columns as well, which is needed by several ML models."]
45 | (def ds-one-hot
46 | (ds/categorical->one-hot
47 | ds-cat :all {} :int))
48 |
49 | ^kind/dataset
50 | ds-one-hot
51 |
52 |
53 | ["we can go back as well"]
54 | (-> ds-one-hot ds/reverse-map-categorical-xforms)
55 |
56 |
57 | ["inspect metadata after conversion"]
58 | (-> ds-one-hot :a-y meta)
59 |
60 |
61 | ["we can go back"]
62 | (-> ds-one-hot ds/reverse-map-categorical-xforms)
63 |
--------------------------------------------------------------------------------
/src/scicloj/ml/experiment_tracking.clj:
--------------------------------------------------------------------------------
1 | (ns scicloj.ml.experiment-tracking
2 | (:require
3 | [scicloj.ml.ug-utils :as utils]
4 | [notespace.api :as note]
5 | [notespace.kinds :as kind]))
6 |
7 | (comment
8 | (note/init-with-browser)
9 | (note/eval-this-notespace)
10 | (note/reread-this-notespace)
11 | (note/render-static-html "docs/userguide-experiment-tracking.html")
12 | (note/init))
13 |
14 | (require '[scicloj.ml.core :as ml]
15 | '[scicloj.ml.metamorph :as mm]
16 | '[scicloj.ml.dataset :as ds]
17 | '[scicloj.metamorph.ml.tools :refer [dissoc-in]]
18 | '[taoensso.nippy :as nippy])
19 |
20 |
21 |
22 | (defonce ds (ds/dataset "https://raw.githubusercontent.com/techascent/tech.ml/master/test/data/iris.csv" {:key-fn keyword}))
23 |
24 | (defn create-base-pipe-decl [node-size]
25 | [[:tech.v3.dataset.metamorph/set-inference-target [:species]]
26 | [:tech.v3.dataset.metamorph/categorical->number [:species]]
27 | {:metamorph/id :model} [:scicloj.metamorph.ml/model {:model-type :smile.classification/random-forest
28 | :node-size node-size}]])
29 | ["## Run evaluation"]
30 |
31 | ["We create 6 pipelines, do a simple :holdout split and keep all results. In order to save memory,
32 | as we would need to do if we had thousands or more evaluations, we keep only the minimal information."]
33 |
34 | (def pipes (map create-base-pipe-decl [1 5 10 20 50 100]))
35 | (def split (ds/split->seq ds :holdout))
36 |
37 |
38 |
39 | (def evaluation-result
40 | (ml/evaluate-pipelines
41 | pipes split
42 | ml/classification-accuracy
43 | :accuracy
44 | {:evaluation-handler-fn utils/select-minimal-result
45 |
46 | :return-best-crossvalidation-only false
47 | :return-best-pipeline-only false}))
48 |
49 | ["So we get here 6 evaluation results"]
50 | evaluation-result
51 |
52 | ["simplified as list:"]
53 |
54 | (->> evaluation-result flatten
55 | (map (comp :metric :test-transform)))
56 |
57 | ["## Attach a simple result handler"]
58 |
59 | ["A result handler is a function which takes a full map representing a single evaluation result and does whatever is needed.
60 | It can be a function with side effects, and it should return the minimal metric information as documented."]
61 |
62 | ["The function will be called for each evaluation result, so in our case 6 times. We use a simple function for now,
63 | which prints the current declarative pipeline."]
64 |
65 | (def evaluation-result
66 |   (ml/evaluate-pipelines
67 |    pipes split
68 |    ml/classification-accuracy
69 |    :accuracy
70 |    {;:result-dissoc-in-seq ml/result-dissoc-in-seq--all
71 |     ;; :result-dissoc-in-seq []
72 |     :return-best-crossvalidation-only false
73 |     :return-best-pipeline-only false
74 |     :evaluation-handler-fn
75 |     (fn [result] ; prints the pipeline declaration of the result and passes the result through unchanged
76 |       ((requiring-resolve 'clojure.pprint/pprint) (:pipe-decl result)) ; resolved lazily: this file never requires clojure.pprint
77 |       result)}))
78 |
79 | ["repl output: "]
80 | ^kind/code
81 | [[:tech.v3.dataset.metamorph/set-inference-target [:species]]
82 | [:tech.v3.dataset.metamorph/categorical->number [:species]]
83 | [:scicloj.metamorph.ml/model
84 | {:model-type :smile.classification/random-forest, :node-size 1}]]
85 | [[:tech.v3.dataset.metamorph/set-inference-target [:species]]
86 | [:tech.v3.dataset.metamorph/categorical->number [:species]]
87 | [:scicloj.metamorph.ml/model
88 | {:model-type :smile.classification/random-forest, :node-size 5}]]
89 | [[:tech.v3.dataset.metamorph/set-inference-target [:species]]
90 | [:tech.v3.dataset.metamorph/categorical->number [:species]]
91 | [:scicloj.metamorph.ml/model
92 | {:model-type :smile.classification/random-forest, :node-size 10}]]
93 | [[:tech.v3.dataset.metamorph/set-inference-target [:species]]
94 | [:tech.v3.dataset.metamorph/categorical->number [:species]]
95 | [:scicloj.metamorph.ml/model
96 | {:model-type :smile.classification/random-forest, :node-size 20}]]
97 | ["...."]
98 |
99 | ["The callback function can now implement whatever is needed to store the evaluation results, for example on disk.
100 | "]
101 |
102 |
103 | ["Write results to disk"]
104 |
105 |
106 | (def created-files (atom []))
107 | (def last-result (atom {}))
108 |
109 | (def evaluation-result
110 |   (ml/evaluate-pipelines
111 |    pipes split
112 |    ml/classification-accuracy
113 |    :accuracy
114 |    {:evaluation-handler-fn
115 |     (fn [result]
116 |       ;; `reduced-result-fn` applies `reduce-result` with the three :model-as-bytes paths.
117 |       (let [reduced-result-fn (fn [result] (scicloj.metamorph.ml/reduce-result result
118 |                                 [[:fit-ctx :model :model-data :model-as-bytes]
119 |                                  [:train-transform :ctx :model :model-data :model-as-bytes]
120 |                                  [:test-transform :ctx :model :model-data :model-as-bytes]]))
121 |             ;; NOTE(review): `example-nippy-handler` is assumed to return a handler fn, so it
122 |             ;; must be applied to `result` for a file to be written - confirm against metamorph.ml.
123 |             nippy-handler (scicloj.metamorph.ml.evaluation-handler/example-nippy-handler
124 |                            created-files "/tmp"
125 |                            reduced-result-fn)]
126 |         (nippy-handler result)
127 |         ;; `reset!` returns the new value, so the reduced result is also the handler's return value.
128 |         (reset! last-result (reduced-result-fn result))))
129 |
130 |     :attach-fn-sources {:ns (find-ns 'scicloj.ml.experiment-tracking)
131 |                         :pipe-fns-clj-file "src/scicloj/ml/experiment_tracking.clj"}}))
132 |
133 | ["This creates one nippy file for each evaluation, containing all data of the evaluations."]
134 |
135 | (deref last-result)
136 |
--------------------------------------------------------------------------------
/src/scicloj/ml/interactions_ols.clj:
--------------------------------------------------------------------------------
1 | (ns scicloj.ml.interactions-ols
2 | (:require
3 | [notespace.api :as note]
4 | [notespace.kinds :as kind]
5 | [notespace.view :as view]
6 | [tablecloth.api :as tc]
7 | [scicloj.ml.core]
8 | [scicloj.sklearn-clj.ml]
9 | [clojure.string :as str]
10 | [scicloj.ml.ug-utils :refer :all]
11 | [clojure.java.io :as io]
12 | [fastmath.stats :as fmstats]))
13 |
14 | (require '[scicloj.ml.core :as ml]
15 | '[scicloj.ml.metamorph :as mm]
16 | '[scicloj.ml.dataset :refer [dataset add-column]]
17 | '[scicloj.ml.dataset :as ds]
18 | '[tech.v3.dataset.math :as std-math]
19 | '[tech.v3.datatype.functional :as dtf]
20 | '[scicloj.metamorph.ml.toydata :as datasets])
21 |
22 |
23 | (comment
24 | (note/init-with-browser)
25 | (note/eval-this-notespace)
26 | (note/render-static-html "docs/interactions_ols.html"))
27 |
28 | ["This example shows how to model interactions in linear regression with `scicloj.ml`"]
29 |
30 | ["Taking ideas from: "
31 |
32 | "http://www.sthda.com/english/articles/40-regression-analysis/164-interaction-effect-in-multiple-regression-essentials/#comments-list"]
33 |
34 | (defn pp-str "Returns the pretty-printed representation of `x` as a string." [x]
35 |   (with-out-str ((requiring-resolve 'clojure.pprint/pprint) x))) ; resolved lazily: this file never requires clojure.pprint
36 |
37 | ["First we load the data:"]
38 | (def marketing (tc/dataset "data/marketing.csv" {:key-fn keyword}))
39 |
40 | ["## Additive model"]
41 |
42 | ["First we build an additive model, whose model equation is 'sales = b0 + b1 * youtube + b2 * facebook'"]
43 |
44 | (def additive-pipeline
45 | (ml/pipeline
46 | (mm/set-inference-target :sales)
47 | (mm/drop-columns [:newspaper])
48 | {:metamorph/id :model}
49 | (mm/model {:model-type :smile.regression/ordinary-least-square})))
50 |
51 |
52 | ["We evaluate it, "]
53 | (def evaluations
54 | (ml/evaluate-pipelines
55 | [additive-pipeline]
56 | (ds/split->seq marketing :holdout)
57 | ml/rmse
58 | :loss
59 | {:other-metrices [{:name :r2
60 | :metric-fn fmstats/r2-determination}]}))
61 |
62 |
63 | ["and print the result:"]
64 | ^kind/hiccup
65 | (text->hiccup
66 | (str
67 | (-> evaluations flatten first :fit-ctx :model ml/thaw-model str)))
68 |
69 | ["We have the following metrics:"]
70 | ["RMSE"]
71 | (-> evaluations flatten first :test-transform :metric)
72 |
73 | ["R2"]
74 | (-> evaluations flatten first :test-transform :other-metrices first :metric)
75 |
76 | ["## Interaction effects"]
77 | ["Now we add interaction effects to it, resulting in this model equation: 'sales = b0 + b1 * youtube + b2 * facebook + b3 * (youtube * facebook)'"]
78 | (def pipe-interaction
79 | (ml/pipeline
80 | (mm/drop-columns [:newspaper])
81 | (mm/add-column :youtube*facebook (fn [ds] (dtf/* (ds :youtube) (ds :facebook))))
82 | (mm/set-inference-target :sales)
83 | {:metamorph/id :model}(mm/model {:model-type :smile.regression/ordinary-least-square})))
84 |
85 | ["Again we evaluate the model,"]
86 | (def evaluations
87 | (ml/evaluate-pipelines
88 | [pipe-interaction]
89 | (ds/split->seq marketing :holdout)
90 | ml/rmse
91 | :loss
92 | {:other-metrices [{:name :r2
93 | :metric-fn fmstats/r2-determination}]}))
94 |
95 |
96 | ["and print it and the performance metrics:"]
97 | ^kind/hiccup
98 | (text->hiccup
99 | (str
100 | (-> evaluations flatten first :fit-ctx :model ml/thaw-model str)))
101 |
102 | ["As the multiplication of 'youtube * facebook' is statistically relevant as well, it
103 | suggests that there is indeed an interaction between these 2 predictor variables youtube and facebook."]
104 |
105 | ["RMSE"]
106 | (-> evaluations flatten first :test-transform :metric)
107 |
108 | ["R2"]
109 | (-> evaluations flatten first :test-transform :other-metrices first :metric)
110 |
111 | ["RMSE and R2 of the interaction model are slightly better."
112 | "These results suggest that the model with the interaction term is better than the model that contains only main effects.
113 | So, for this specific data, we should go for the model with the interaction term.
114 | "]
115 |
--------------------------------------------------------------------------------
/src/scicloj/ml/intro.clj:
--------------------------------------------------------------------------------
1 | (ns scicloj.ml.intro
2 | (:require
3 | [notespace.api :as note]
4 | [notespace.kinds :as kind]))
5 |
6 |
7 | (comment
8 | (note/init-with-browser)
9 | (note/eval-this-notespace)
10 | (note/reread-this-notespace)
11 | (note/render-static-html "docs/userguide-intro.html")
12 | (note/init))
13 |
14 |
15 |
16 | ["# Clojure and machine learning "]
17 |
18 | ["In order to practice machine learning and create an ecosystem of models around it,
19 | we need 3 components."]
20 |
21 | ["1. A standard way to manage tabular data in memory."]
22 | ["2. Various machine learning models"]
23 | ["3. A standard way to express steps of data manipulations including train/predict of a model"]
24 |
25 |
26 | ["The Clojure language and core libraries do not have built-in, specific support for this,
27 | so some libraries are required. "]
28 |
29 | ["## Representing training data"]
30 |
31 | ["In the last 2 years the Clojure data science landscape was shaped
32 | by the appearance and maturation of a new library to manage tabular data."]
33 |
34 | ["This library is [tech.ml.dataset](https://github.com/techascent/tech.ml.dataset).
35 | It defines a in-memory tabular data structure and operations on it. It is a remarkable piece of software,
36 | highly optimized and linking in its root to native memory and allow zero-copy integration's outside Clojure."]
37 |
38 | ["As it was organically growing over time, its API is functional and complete,
39 | but lacks consistency in some parts.
40 | "]
41 |
42 | ["This was addressed by another library, layering on top of it, called
43 | `tablecloth`. It is available [here](https://github.com/scicloj/tablecloth)"]
44 |
45 |
46 | ["So we have now a very reliable, mature, easy to use library to store and manipulate tabular data, including text."]
47 |
48 | ["## Models"]
49 | ["Models are the core of most machine learning libraries. In scicloj.ml we
50 | rely on a common **abstraction** for all
51 | machine learning models and one Java library [Smile](https://github.com/haifengl/smile) providing models,
52 | which we bridge into Clojure via the abstraction.
53 | So we use Java models internally, but without the need for Java
54 | interop by the user.
55 |
56 | Documentation for existing models is appearing here:
57 | https://scicloj.github.io/scicloj.ml-tutorials/userguide-models.html
58 |
59 | The abstraction is independent from Smile, so we could make bridges to other libraries, even in non-JVM languages (Python, R).
60 |
61 |
62 | "]
63 |
64 | ["## Data transformation pipelines."]
65 |
66 | ["In order to apply machine learning, the data needs to be transformed from its original form
67 | (often a data file) into the form required by the model.
68 | Sometimes these transformations are simple, like re-encoding data,
69 | sometimes they are very complex. In some contexts this is also called
70 | feature engineering, which can result in arbitrarily
71 | complex dataset transformations.
72 | These transformations are mostly dataset-to-dataset transformations.
73 | "]
74 |
75 |
76 | ["These pipelines need to be repeatable and self-contained,
77 | as they need to run several times with different data or in variants
78 | for either cross validation or hyper-parameter tuning."]
79 |
80 | ["Clojure and the `tablecloth` library already contain
81 | the concept of running a pipeline"]
82 |
83 | ["This simpler form of a pipeline in Clojure and Tablecloth can just make use of the fact that all tablecloth
84 | functions take a dataset as the first parameter and return a dataset.
85 | So they can be chained together with the pipe (`->`) operator of Clojure,
86 | example:"]
87 |
88 | (require '[scicloj.ml.dataset :as ds])
89 | (def my-data
90 | (-> (ds/dataset "https://raw.githubusercontent.com/techascent/tech.ml.dataset/master/test/data/stocks.csv" {:key-fn keyword})
91 | (ds/select-columns [:symbol :price])
92 | (ds/add-or-replace-column :symbol (fn [ds] (map clojure.string/lower-case (ds :symbol))))))
93 |
94 | ["This form of pipeline works to manipulate a dataset,
95 | but has three disadvantages."]
96 |
97 | ["
98 | 1. `->` is a macro, so we cannot compose pipelines easily
99 |
100 | 2. We move a dataset object through the pipeline steps, so the only object we have nicely inside the pipeline, accessible to all steps, is the dataset itself. But sometimes we need non-tabular, auxiliary data to be shared across the pipeline steps, which is not possible when passing a dataset only. Using these simple pipelines would force us to hold auxiliary data in a global state of some form. This makes it very hard to execute pipelines repeatedly, as they are not self-contained.
101 |
102 | 3. These simpler pipeline concepts have no notion of running a pipeline in several modes. In machine learning a pipeline needs to behave differently in `fit` and in `transform` (often called `train` vs `predict`). The model learns from data in `fit` and applies what it has learned in `transform`.
103 | "]
104 |
105 | ["Due to this, the idea of the `metamorph` pipeline concept was born."]
106 | ["It addresses all three shortcomings of the simpler pipeline."]
107 |
108 | ["Metamorph is documented here: [metamorph](https://github.com/scicloj/metamorph)"]
109 |
110 |
111 | ["As we see in the metamorph documentation, a pipeline can be composed of functions, which adhere to some simple standards
112 | regarding input and output, as explained here: https://github.com/scicloj/metamorph#compliant-operations"]
113 |
114 | ["Tablecloth contains such operations in the `tablecloth.pipeline`
115 | namespace. All functions of the `tablecloth.api` namespace are replicated
116 | there, but metamorph compliant"]
117 |
118 | ["## scicloj.ml"]
119 |
120 | ["The Clojure ML ecosystem is based on different libraries working
121 | together, as is typical and idiomatic in Clojure"]
122 |
123 | ["Some existing libraries are used internally in scicloj.ml, to create a
124 | complete machine learning library, but this is hidden from the user,
125 | and is listed here only for completeness."]
126 |
127 | ["
128 | 1. `tablecloth` - for general manipulation of the dataset
129 | 1. `tech.v3.dataset` - to finally prepare a dataset for the machine learning models
130 | 1. `metamorph.ml` - for running pipelines and machine learning core functions
131 | 1. `Smile` Java machine learning library containing lots of models
132 | "]
133 |
134 |
135 |
136 | ["These libraries can be used standalone as well. `tech.ml` was changed in order
137 | to work with scicloj.ml in an incompatible way.
138 | So it is re-released under a new name `metamorph.ml`.
139 | The others can be used by scicloj.ml without any change.
140 | "]
141 |
142 |
143 | ["In order to give easier access to the various libraries, the scicloj.ml
144 | library was created. It unifies the access to the libraries above
145 | in three simple namespaces.
146 | "]
147 |
148 | ["## Machine learning using scicloj.ml"]
149 |
150 | ["The setup for the following code needs a single dependency in deps.edn or project.clj"]
151 |
152 | ["
153 | {:deps {
154 | scicloj/scicloj.ml {:mvn/version \"0.1.0-beta2\"}} }
155 | "]
156 |
157 |
158 | ["This library acts as a facade to the four libraries above, and arranges the functions in a simple way in these namespaces:"]
159 |
160 | ^kind/md-nocode
161 | ["
162 |
163 | | namespace | purpose |
164 | |-----------------------|----------------------------------------------------------|
165 | | scicloj.ml.core | core functionality for machine learning |
166 | | scicloj.ml.dataset | functions to manipulate a dataset |
167 | | scicloj.ml.metamorph  | metamorph compliant functions to be used in ml pipelines |
168 |
169 | "]
170 |
171 |
172 |
173 | ["To start we need to require a few namespaces"]
174 |
175 | (require '[scicloj.ml.core :as ml]
176 | '[scicloj.ml.metamorph :as mm]
177 | '[scicloj.ml.dataset :refer [dataset add-column] :as ds])
178 |
179 |
180 |
181 | ["First we load the data."]
182 | (def titanic-train
183 | (->
184 | (ds/dataset "https://github.com/scicloj/metamorph-examples/raw/main/data/titanic/train.csv"
185 | {:key-fn keyword
186 | :parser-fn :string})))
187 |
188 |
189 | (def titanic-test
190 | (->
191 | (ds/dataset "https://github.com/scicloj/metamorph-examples/raw/main/data/titanic/test.csv"
192 | {:key-fn keyword
193 | :parser-fn :string})
194 | (ds/add-column :Survived [""] :cycle)))
195 |
196 | ["Then we define the pipeline and its steps. Inside the pipeline we only use functions
197 | from namespace scicloj.ml.metamorph"]
198 |
199 | ["In scicloj.ml the model function receives a single dataset,
200 | in which the inference target column is marked as such. The model
201 | to use is a parameter of the `model` function. All built-in
202 | models are listed here: https://scicloj.github.io/scicloj.ml-tutorials/userguide-models.html"]
203 |
204 |
205 | ["In the titanic dataset the `survived` column is a categorical variable.
206 | All target variables for classification need to be transformed first
207 | into numbers, the model can work with. This is done by the function
208 | `categorical->number`. The mapping for this is stored in the dataset on the column
209 | and can be later retrieved to transform the numeric prediction back to its
210 | categorical form."]
211 |
212 |
213 | ["In `scicloj.ml` we pass a whole dataset to a model, and we need to mark
214 | the inference target via function `set-inference-target`.
215 | All other columns are used then as feature columns.
216 | To restrict the feature columns, I simply remove most of them and keep only one, :Pclass"]
217 |
218 | ["Now the dataset is ready for the model, which is called in the last step.
219 | It is a logistic regression model, which gets trained to predict column
220 | :Survived from column :Pclass"]
221 |
222 | (def pipe-fn
223 | (ml/pipeline
224 | (mm/select-columns [:Survived :Pclass])
225 | (mm/categorical->number [:Survived :Pclass])
226 | (mm/set-inference-target :Survived)
227 | (mm/model {:model-type :smile.classification/logistic-regression})))
228 |
229 | ["So the `ml/pipeline` function returns a function, which can be called with the ctx map."]
230 |
231 | ["We execute the pipeline in mode :fit,
232 | which will execute all pipeline steps and train as well the model. "]
233 |
234 | (def trained-ctx
235 | (pipe-fn {:metamorph/data titanic-train
236 | :metamorph/mode :fit}))
237 |
238 | ["Now we have a trained model inside trained-ctx. This is a usual map, so can be inspected in the repl.
239 | As the model is based on Smile, the trained-ctx contains the java class representing the trained model.
240 | "]
241 |
242 | ["Now we execute the pipeline in mode :transform,
243 | which will make a prediction "]
244 |
245 | ["We combine the previously obtained context
246 | (which contains the trained model)",
247 | "with the test data and mode :transform"]
248 |
249 | (def test-ctx
250 | (pipe-fn
251 | (assoc trained-ctx
252 | :metamorph/data titanic-test
253 | :metamorph/mode :transform)))
254 |
255 |
256 |
257 | ["Prediction is now part of the ctx obtained.
258 | The internally called `predict` function of `metamorph.ml` returns always the raw prediction of the model,
259 | which we can easily transform into the original categories.
260 | "]
261 |
262 |
263 |
264 | ;; ^kind/dataset
265 | (-> test-ctx :metamorph/data
266 | (ds/column-values->categorical :Survived))
267 |
268 |
269 |
270 |
271 |
272 | ["This shows the predicted survival. "]
273 |
274 | ["The documentation of `mm/model` here https://scicloj.github.io/scicloj.ml/scicloj.ml.metamorph.html#var-model"
275 | "documents this special behavior of the function, which does something different in mode :fit vs mode :transform"]
276 |
277 | ["Any form of feature-engineering takes now the same form.
278 | We will successively
279 | add more and more steps into the pipeline to improve the model."]
280 |
281 | ["These can be built-in functions or custom functions, as we will see later"]
282 |
283 |
284 | (+ 1 1 (+ 2 2))
285 |
--------------------------------------------------------------------------------
/src/scicloj/ml/models.clj:
--------------------------------------------------------------------------------
1 | ^{:nextjournal.clerk/visibility {:code :hide :result :hide}
2 | :nextjournal.clerk/toc true}
3 | (ns scicloj.ml.models
4 | (:require
5 | [nextjournal.clerk :as clerk]
6 | [scicloj.ml.ug-utils :as utils]
7 | [scicloj.ml.dataset :as ds]
8 | [scicloj.ml.ug-utils-clerk :as utils-clerk]
9 | [tablecloth.api :as tc]))
10 |
11 | ^{:nextjournal.clerk/visibility {:code :hide :result :hide}}
12 | (comment
13 | (clerk/show! "src/scicloj/ml/models.clj")
14 | (clerk/halt!)
15 | (clerk/build-static-app! {:paths ["src/scicloj/ml/models.clj"]
16 | :bundle? false})
17 | (clerk/clear-cache!)
18 | (clerk/serve! {:browse? true})
19 | (clerk/serve! {:browse? true :watch-paths ["src/scicloj/ml/"]}))
20 |
21 | ^{:nextjournal.clerk/visibility {:code :hide :result :hide}}
22 | (require '[scicloj.ml.core :as ml]
23 | '[scicloj.ml.metamorph :as mm]
24 | '[tech.v3.datatype.functional :as dtf]
25 | '[scicloj.metamorph.ml.toydata :as datasets])
26 |
27 | ^{:nextjournal.clerk/visibility {:code :hide :result :hide}}
28 | (clerk/add-viewers! [{:pred tc/dataset?
29 | :transform-fn (clerk/update-val #(clerk/table {:head (tc/column-names %)
30 | :rows (tc/rows % :as-seq)}))}])
31 | ^{:nextjournal.clerk/visibility {:code :hide :result :hide}}
32 | (def build-in-models
33 |   ;; Sorted names returned by `ml/model-definition-names`, restricted to those
34 |   ;; whose namespace part is one of the strings in the set below.
35 |   (let [default-model-namespaces #{"fastmath.cluster"
36 |                                    "smile.classification"
37 |                                    "smile.regression"
38 |                                    "smile.manifold"
39 |                                    "smile.projections"
40 |                                    "xgboost"}]
41 |     (sort (filter (fn [model-name] (contains? default-model-namespaces (namespace model-name)))
42 |                   (ml/model-definition-names)))))
43 |
44 | ^{:nextjournal.clerk/visibility {:code :hide :result :hide}}
45 | (defn make-iris-pipeline [model-options]
46 | (ml/pipeline
47 | (mm/set-inference-target :species)
48 | (mm/categorical->number [:species])
49 | (mm/model model-options)))
50 |
51 |
52 | ;; # Models
53 |
54 | ;; scicloj.ml uses the plugin `scicloj.ml.smile` and
55 | ;; `scicloj.ml.xgboost` by default,
56 | ;; which gives access to " (count build-in-models) " models from the java libraries
57 | ;; [Smile](https://haifengl.github.io/),
58 | ;; [Xgboost](https://xgboost.readthedocs.io/en/latest/jvm/index.html) and [fastmath](https://github.com/generateme/fastmath)
59 |
60 | ;; More models are available via other plugins.
61 |
62 | ;; Below is a list of all such models, and which parameter they take.
63 |
64 | ;; All models are available in the same way:
65 |
66 |
67 |
68 | ;; The documentation below points as well to the javadoc and user-guide chapter (for Smile models)
69 |
70 | ;; The full list of built-in models is:
71 | ^{:nextjournal.clerk/visibility {:code :hide}}
72 | (clerk/html
73 |  ;; one list item per model key, linking to the anchor named after the key
74 |  [:ul
75 |   (for [model-key build-in-models
76 |         :let [label (str model-key)]]
77 |     [:li [:a {:href (str "#" label)} label]])])
78 |
79 |
80 | ;; ## Smile classification models
81 |
82 | ^{:nextjournal.clerk/visibility {:code :hide}}
83 | (clerk/html
84 | (utils-clerk/render-key-info :smile.classification/ada-boost))
85 | ;; In this example we will use the capability of the Ada boost classifier
86 | ;; to give us the importance of variables.
87 |
88 | ;; As data we take here the Wisconsin Breast Cancer dataset, which has 30 variables.
89 |
90 | (def df
91 | (->
92 | (datasets/breast-cancer-ds)))
93 |
94 |
95 | ;; To get an overview of the dataset, we print its summary:
96 |
97 | (-> df ds/info)
98 |
99 |
100 | ;; Then we create a metamorph pipeline with the ada boost model:
101 |
102 | (def ada-pipe-fn
103 |   (ml/pipeline
104 |    (mm/set-inference-target :class)
105 |    (mm/categorical->number [:class])
106 |    ;; a fixed step id lets us look the trained model up by key below
107 |    {:metamorph/id :model} (mm/model {:model-type :smile.classification/ada-boost})))
108 |
109 |
110 | ;; We run the pipeline in :fit. As we just explore the data,
111 | ;; no train/test split is needed.
112 |
113 | (def trained-ctx
114 |   (ml/fit-pipe df
115 |                ada-pipe-fn))
116 |
117 | ;; Next we take the model out of the pipeline:
118 | (def model
119 |   (-> trained-ctx :model ml/thaw-model))
120 |
121 | ;; The variable importance can be obtained from the trained model,
122 | (def var-importances
123 |   ;; pair the first variable of each formula predictor with the model's importance values
124 |   (let [variable-names (map (fn [term] (first (.variables term)))
125 |                             (.. model formula predictors))
126 |         importances (.importance model)]
127 |     (mapv (fn [variable importance]
128 |             (hash-map :variable variable :importance importance))
129 |           variable-names importances)))
130 |
131 |
132 | ;; and we plot the variables:
133 |
134 | (clerk/vl
135 | {
136 | :data {:values
137 | var-importances}
138 | :width 800
139 | :height 500
140 | :mark {:type "bar"}
141 | :encoding {:x {:field :variable :type "nominal" :sort "-y"}
142 | :y {:field :importance :type "quantitative"}}})
143 |
144 |
145 | ^{:nextjournal.clerk/visibility {:code :hide}}
146 | (clerk/html
147 | (utils-clerk/render-key-info ":smile.classification/decision-tree"))
148 |
149 | ;; A decision tree learns a set of rules from the data in the form
150 | ;; of a tree, which we will plot in this example.
151 | ;; We use the iris dataset:
152 |
153 |
154 | ^:nextjournal.clerk/no-cache
155 | (def iris ; not cached: the flag above is attached to this def form, not to the init expression
156 |   (datasets/iris-ds))
157 |
158 | ;; We make a pipe only containing the model, as the dataset is ready to
159 | ;; be used by `scicloj.ml`
160 | (def trained-pipe-tree
161 | (ml/fit-pipe
162 | iris
163 | (ml/pipeline
164 | {:metamorph/id :model}
165 | (mm/model
166 | {:model-type :smile.classification/decision-tree}))))
167 |
168 | ;; We extract the Java object of the trained model.
169 |
170 | (def tree-model
171 | (-> trained-pipe-tree :model ml/thaw-model))
172 |
173 |
174 | ;; The model has a .dot function, which returns a GraphViz textual
175 | ;; representation of the decision tree, which we render to svg using the
176 | ;; [kroki](https://kroki.io/) service.
177 |
178 | (clerk/html
179 | (String. (:body (utils/kroki (.dot tree-model) :graphviz :svg)) "UTF-8"))
180 |
181 | ^{:nextjournal.clerk/visibility {:code :hide}}
182 | (clerk/html (utils-clerk/render-key-info ":smile.classification/discrete-naive-bayes"))
183 |
184 | ^{:nextjournal.clerk/visibility {:code :hide}}
185 | (clerk/html (utils-clerk/render-key-info ":smile.classification/gradient-tree-boost"))
186 |
187 | ^{:nextjournal.clerk/visibility {:code :hide}}
188 | (clerk/html (utils-clerk/render-key-info ":smile.classification/knn"))
189 | ;; In this example we use a knn model to classify some dummy data.
190 | ;; The training data is this:
191 |
192 | (def df-knn
193 | (ds/dataset {:x1 [7 7 3 1]
194 | :x2 [7 4 4 4]
195 | :y [ :bad :bad :good :good]}))
196 |
197 |
198 |
199 | ;; Then we construct a pipeline with the knn model,
200 | ;; using 3 neighbors for decision.
201 |
202 | (def knn-pipe-fn
203 | (ml/pipeline
204 | (mm/set-inference-target :y)
205 | (mm/categorical->number [:y])
206 | (mm/model
207 | {:model-type :smile.classification/knn
208 | :k 3})))
209 |
210 | ;; We run the pipeline in mode fit:
211 |
212 | (def trained-ctx-knn
213 | (knn-pipe-fn {:metamorph/data df-knn
214 | :metamorph/mode :fit}))
215 |
216 |
217 | ;; Then we run the pipeline in mode :transform with some test data
218 | ;; and take the prediction and convert it from numeric into categorical:
219 |
220 | (->
221 | trained-ctx-knn
222 | (merge
223 | {:metamorph/data (ds/dataset
224 | {:x1 [3 5]
225 | :x2 [7 5]
226 | :y [nil nil]})
227 | :metamorph/mode :transform})
228 | knn-pipe-fn
229 | :metamorph/data
230 | (ds/column-values->categorical :y)
231 | seq)
232 |
233 | ^{:nextjournal.clerk/visibility {:code :hide}}
234 | (clerk/html (utils-clerk/render-key-info ":smile.classification/logistic-regression"))
235 |
236 | ^{:nextjournal.clerk/visibility {:code :hide}}
237 | (clerk/html (utils-clerk/render-key-info ":smile.classification/maxent-binomial"))
238 |
239 | ^{:nextjournal.clerk/visibility {:code :hide}}
240 | (clerk/html (utils-clerk/render-key-info ":smile.classification/maxent-multinomial"))
241 |
242 | ^{:nextjournal.clerk/visibility {:code :hide}}
243 | (clerk/html (utils-clerk/render-key-info ":smile.classification/random-forest"))
244 | ;; The following code plots the decision surfaces of the random forest
245 | ;; model on pairs of features.
246 |
247 | ;; We use the Iris dataset for this.
248 |
249 | (def iris-test
250 | (ds/dataset
251 | "https://raw.githubusercontent.com/scicloj/metamorph.ml/main/test/data/iris.csv" {:key-fn keyword}))
252 |
253 |
254 |
255 |
256 | ;; Standardise the data:
257 | (def iris-std
258 | (ml/pipe-it
259 | iris-test
260 | (mm/std-scale [:sepal_length :sepal_width :petal_length :petal_width] {})))
261 |
262 |
263 |
264 |
265 |
266 |
267 | ;; The next function creates a vega specification for the random forest
268 | ;; decision surface for a given pair of column names.
269 |
270 |
271 |
272 |
273 | (def rf-pipe
274 |   (make-iris-pipeline
275 |    {:model-type :smile.classification/random-forest}))
276 |
277 | (clerk/vl ; uses the standardised data, like the other feature pairs
278 |  (utils/surface-plot iris-std [:sepal_length :sepal_width] rf-pipe :smile.classification/random-forest))
279 | (clerk/vl
280 |  (utils/surface-plot iris-std [:sepal_length :petal_length] rf-pipe :smile.classification/random-forest))
281 |
282 | (clerk/vl
283 |  (utils/surface-plot iris-std [:sepal_length :petal_width] rf-pipe :smile.classification/random-forest))
284 | (clerk/vl
285 |  (utils/surface-plot iris-std [:sepal_width :petal_length] rf-pipe :smile.classification/random-forest))
286 | (clerk/vl
287 |  (utils/surface-plot iris-std [:sepal_width :petal_width] rf-pipe :smile.classification/random-forest))
288 | (clerk/vl
289 |  (utils/surface-plot iris-std [:petal_length :petal_width] rf-pipe :smile.classification/random-forest))
290 |
291 |
292 | ^{:nextjournal.clerk/visibility {:code :hide}}
293 | (clerk/html (utils-clerk/render-key-info ":smile.classification/sparse-logistic-regression"))
294 |
295 | ^{:nextjournal.clerk/visibility {:code :hide}}
296 | (clerk/html (utils-clerk/render-key-info ":smile.classification/sparse-svm"))
297 |
298 | ^{:nextjournal.clerk/visibility {:code :hide}}
299 | (clerk/html (utils-clerk/render-key-info ":smile.classification/svm"))
300 |
301 | ;; ## Smile regression models
302 | ^{:nextjournal.clerk/visibility {:code :hide}}
303 | (clerk/html (utils-clerk/render-key-info ":smile.regression/elastic-net"))
304 |
305 |
306 | ^{:nextjournal.clerk/visibility {:code :hide}}
307 | (clerk/html (utils-clerk/render-key-info ":smile.regression/gradient-tree-boost"))
308 |
309 | ^{:nextjournal.clerk/visibility {:code :hide}}
310 | (clerk/html (utils-clerk/render-key-info ":smile.regression/lasso"))
311 |
312 | ;; We use the diabetes dataset and will show how Lasso regression
313 | ;; regularizes the different variables depending on lambda.
314 |
315 | ;; First we make a function to create pipelines with different lambdas
316 | (defn make-pipe-fn [lambda] ; lasso regression pipeline for the given lambda
317 |   (ml/pipeline
318 |    (mm/update-column :disease-progression (fn [column] (map double column)))
319 |    (mm/convert-types :disease-progression :float32)
320 |    (mm/set-inference-target :disease-progression)
321 |    {:metamorph/id :model}
322 |    (mm/model {:model-type :smile.regression/lasso :lambda (double lambda)})))
323 |
324 | ;; Now we go over a sequence of lambdas and fit a pipeline for all of them
325 | ;; and store the coefficients for each predictor variable:
326 | (def diabetes (datasets/diabetes-ds))
327 |
328 | (def coefs-vs-lambda
329 |   ;; One map per (lambda, predictor) pair, collected over all lambdas.
330 |   (mapcat
331 |    (fn [lambda]
332 |      (let [;; fit a fresh lasso pipeline for this lambda
333 |            fitted-ctx (ml/fit-pipe diabetes (make-pipe-fn lambda))
334 |
335 |            ;; the thawed model object stored under the :model step
336 |            lasso-model (ml/thaw-model (:model fitted-ctx))
337 |
338 |            ;; first variable of each predictor term of the formula
339 |            predictor-names (map (fn [term] (first (.variables term)))
340 |                                 (seq (.. lasso-model formula predictors)))
341 |
342 |            coefficients (seq (.coefficients lasso-model))
343 |            log-lambda (dtf/log10 lambda)]
344 |
345 |        ;; pair every coefficient with its predictor
346 |        (map (fn [coefficient predictor]
347 |               (hash-map :log-lambda log-lambda
348 |                         :coefficient coefficient
349 |                         :predictor predictor))
350 |             coefficients
351 |             predictor-names)))
352 |    ;; lambdas 1, 101, 201, ... below 100000
353 |    (range 1 100000 100)))
354 |
355 |
356 | ;; Then we plot the coefficients over the log of lambda.
357 | (clerk/vl
358 | {
359 | :data {:values coefs-vs-lambda}
360 |
361 | :width 500
362 | :height 500
363 | :mark {:type "line"}
364 | :encoding {:x {:field :log-lambda :type "quantitative"}
365 | :y {:field :coefficient :type "quantitative"}
366 | :color {:field :predictor}}})
367 |
368 | ;; This shows that an increasing lambda shrinks more and more coefficients
369 | ;; to zero. This plot can be used as well to find important variables,
370 | ;; namely the ones whose coefficients stay non-zero even with large lambda.
371 |
372 | ^{:nextjournal.clerk/visibility {:code :hide}}
373 | (clerk/html
374 | (utils-clerk/render-key-info ":smile.regression/ordinary-least-square"))
375 |
376 | ;; In this example we will explore the relationship between the
377 | ;; body mass index (bmi) and a diabetes indicator.
378 |
379 | ;; First we load the data and split into train and test sets.
380 | ;;
381 | ^{:nextjournal.clerk/viewer :hide-result}
382 | (def diabetes (datasets/diabetes-ds))
383 |
384 | ^{:nextjournal.clerk/viewer :hide-result}
385 | (def diabetes-train
386 | (ds/head diabetes 422))
387 |
388 | ^{:nextjournal.clerk/viewer :hide-result}
389 | (def diabetes-test
390 | (ds/tail diabetes 20))
391 |
392 |
393 |
394 | ;; Next we create the pipeline, converting the target variable to
395 | ;; a float value, as needed by the model.
396 |
397 | (def ols-pipe-fn
398 | (ml/pipeline
399 | (mm/select-columns [:bmi :disease-progression])
400 | (mm/convert-types :disease-progression :float32)
401 | (mm/set-inference-target :disease-progression)
402 | {:metamorph/id :model} (mm/model {:model-type :smile.regression/ordinary-least-square})))
403 |
404 | ;; We can then fit the model, by running the pipeline in mode :fit
405 |
406 | (def fitted
407 | (ml/fit diabetes-train ols-pipe-fn))
408 |
409 |
410 | ;; Next we run the pipe-fn in :transform and extract the prediction
411 | ;; for the disease progression:
412 | (def diabetes-test-prediction
413 | (-> diabetes-test
414 | (ml/transform-pipe ols-pipe-fn fitted)
415 | :metamorph/data
416 | :disease-progression))
417 |
418 | ;; The truth is available in the test dataset.
419 | (def diabetes-test-trueth
420 | (-> diabetes-test
421 | :disease-progression))
422 |
423 |
424 |
425 |
426 | ;; The smile Java object of the LinearModel is in the pipeline as well:
427 |
428 | (def model-instance
429 | (-> fitted :model (ml/thaw-model)))
430 |
431 | ;; This object contains all information regarding the model fit
432 | ;; such as coefficients and formula:
433 | (-> model-instance .coefficients seq)
434 | (-> model-instance .formula str)
435 |
436 | ;; Smile generates as well a String with the result of the linear
437 | ;; regression as part of the toString() method of class LinearModel:
438 |
439 | (clerk/code
440 | (str model-instance))
441 |
442 |
443 |
444 | ;; This tells us that there is a statistically significant
445 | ;; (positive) correlation between the bmi and the diabetes
446 | ;; disease progression in this data.
447 |
448 |
449 | ;; At the end we can plot the truth and the prediction on the test data,
450 | ;; and observe the linear nature of the model.
451 |
452 | (clerk/vl
453 | {:layer [
454 | {:data {:values (map #(hash-map :disease-progression %1 :bmi %2 :type :truth)
455 | diabetes-test-trueth
456 | (:bmi diabetes-test))}
457 |
458 | :width 500
459 | :height 500
460 | :mark {:type "circle"}
461 | :encoding {:x {:field :bmi :type "quantitative"}
462 | :y {:field :disease-progression :type "quantitative"}
463 | :color {:field :type}}}
464 |
465 | {:data {:values (map #(hash-map :disease-progression %1 :bmi %2 :type :prediction)
466 | diabetes-test-prediction
467 | (:bmi diabetes-test))}
468 |
469 | :width 500
470 | :height 500
471 | :mark {:type "line"}
472 | :encoding {:x {:field :bmi :type "quantitative"}
473 | :y {:field :disease-progression :type "quantitative"}
474 | :color {:field :type}}}]})
475 |
476 |
477 | ^{:nextjournal.clerk/visibility {:code :hide}}
478 | (clerk/html (utils-clerk/render-key-info ":smile.regression/random-forest"))
479 |
480 | ^{:nextjournal.clerk/visibility {:code :hide}}
481 | (clerk/html (utils-clerk/render-key-info ":smile.regression/ridge"))
482 |
483 |
484 | ;; ## Xgboost model
485 | ^{:nextjournal.clerk/visibility {:code :hide}}
486 | (clerk/html (utils-clerk/render-key-info ":xgboost"))
487 |
488 | ;; ## Fastmath clustering
489 | ^{:nextjournal.clerk/visibility {:code :hide}}
490 | (clerk/html (utils-clerk/render-key-info :fastmath.cluster))
491 |
492 | ;; ## Smile projections
493 | ^{:nextjournal.clerk/visibility {:code :hide}}
494 | (clerk/html (utils-clerk/render-key-info :smile.projections))
495 |
496 | ;; ## Smile manifold
497 | ^{:nextjournal.clerk/visibility {:code :hide}}
498 | (clerk/html (utils-clerk/render-key-info :smile.manifold))
499 |
500 |
501 | ;; # Compare decision surfaces of different models
502 |
503 | ;; In the following we see the decision surfaces of some models on the
504 | ;; same data from the Iris dataset using 2 columns :sepal_width and :sepal_length:
505 | ^{:nextjournal.clerk/visibility {:code :hide}}
506 | (mapv (fn [model-type] ; one decision-surface plot per model type
507 |         (clerk/vl (utils/surface-plot iris-std [:sepal_length :sepal_width] (make-iris-pipeline {:model-type model-type}) model-type)))
508 |       [:smile.classification/ada-boost
509 |        :smile.classification/decision-tree
510 |        :smile.classification/gradient-tree-boost
511 |        :smile.classification/knn
512 |        :smile.classification/logistic-regression
513 |        :smile.classification/random-forest
514 |        :smile.classification/linear-discriminant-analysis
515 |        :smile.classification/regularized-discriminant-analysis
516 |        :smile.classification/quadratic-discriminant-analysis
517 |        :xgboost/classification])
518 |
519 |
520 |
521 | ;; This shows nicely that different model types have different capabilities
522 | ;; to separate and therefore classify data.
523 |
524 |
525 | ;; ## Ensembles
526 |
527 | ;; An ensemble combines several pipelines and their predictions
528 | ;; and calculates a common prediction.
529 | ;; `scicloj.ml` allows creating an ensemble where each model gives a vote,
530 | ;; and the majority becomes the final prediction.
531 | ;;
532 |
533 |
534 | ;; First we make three pipelines, which only differ in the model type.
535 | ;; The pipelines could be completely different, but need to accept the same input data and
536 | ;; produce the same predictions (target column name and type)
537 | ;;
538 |
539 |
540 | (defn make-iris-pipeline-ensemble
541 |   "Returns an iris classification pipeline for the given `model-type`.
542 |   It keeps only :species, :sepal_length and :sepal_width, marks :species as the
543 |   inference target, converts it to numbers and registers the model step as :model."
544 |   [model-type]
545 |   (ml/pipeline
546 |    (mm/select-columns [:species :sepal_length :sepal_width])
547 |    (mm/set-inference-target :species)
548 |    (mm/categorical->number [:species])
549 |    {:metamorph/id :model}
550 |    (mm/model
551 |     {:model-type model-type})))
552 |
553 |
554 |
555 |
556 |
557 |
558 | (def tree-pipeline
559 | (make-iris-pipeline-ensemble :smile.classification/decision-tree))
560 |
561 |
562 | (def knn-pipeline
563 | (make-iris-pipeline-ensemble :smile.classification/knn))
564 |
565 |
566 | (def logistic-regression-pipeline
567 | (make-iris-pipeline-ensemble :smile.classification/logistic-regression))
568 |
569 |
570 | ;; Now we can construct an ensemble, using function `ensemble-pipe`
571 |
572 | (def ensemble (ml/ensemble-pipe [tree-pipeline
573 | knn-pipeline
574 | logistic-regression-pipeline]))
575 |
576 | ;; This ensemble is as any other metamorph pipeline,
577 | ;; so we can train and predict as usual:
578 |
579 |
580 | (def fitted-ctx-ensemble
581 | (ml/fit-pipe iris-std ensemble))
582 |
583 |
584 | (def transformed-ctx-ensemble
585 | (ml/transform-pipe iris-std ensemble fitted-ctx-ensemble))
586 |
587 |
588 | ;; Frequency of predictions
589 |
590 |
591 | (->
592 | transformed-ctx-ensemble
593 | :metamorph/data
594 | (ds/reverse-map-categorical-xforms)
595 | :species
596 | frequencies)
597 |
598 | ;; The surface plot of the ensemble
599 |
600 |
601 | (clerk/vl (utils/surface-plot iris-std
602 | [:sepal_length :sepal_width]
603 | ensemble "voting ensemble"))
604 |
--------------------------------------------------------------------------------
/src/scicloj/ml/nested_cv.clj:
--------------------------------------------------------------------------------
1 | (ns scicloj.ml.nested-cv
2 | (:require [tablecloth.api :as tc]
3 | [scicloj.metamorph.ml :as ml]
4 | [scicloj.metamorph.ml.classification :as clf]
5 | [tech.v3.datatype :as dt]))
6 |
7 |
8 | (defn nested-cv
9 |   "Nested cross-validation (see https://www.youtube.com/watch?v=DuDtXtKNpZs).
10 |   Splits `data` into `outer-k` folds; for each fold the `pipelines` are evaluated
11 |   with an `inner-k`-fold split of the outer training part only, and the first result
12 |   of `ml/evaluate-pipelines` is then scored with `metric-fn` on the outer test part.
13 |   Returns a seq of maps {:pipe-fn :fit-ctx :metric}, one per outer fold.
14 |   NOTE: expects the model step under id :model and a target column named :survived."
15 |   [data pipelines metric-fn loss-or-accuracy outer-k inner-k]
16 |   (for [{train :train test :test} (tc/split->seq data :kfold {:k outer-k})]
17 |     (let [;; model selection must only see the outer training data
18 |           inner-k-fold (tc/split->seq train :kfold {:k inner-k})
19 |           evaluation (ml/evaluate-pipelines pipelines inner-k-fold metric-fn loss-or-accuracy)
20 |           best (-> evaluation first first)
21 |           fit-ctx (:fit-ctx best)
22 |           best-pipe-fn (:pipe-fn best)
23 |           transform-ctx (best-pipe-fn (merge fit-ctx {:metamorph/data test :metamorph/mode :transform}))
24 |           metric (metric-fn
25 |                   (-> transform-ctx :model :scicloj.metamorph.ml/target-ds :survived dt/->vector)
26 |                   (-> transform-ctx :metamorph/data :survived dt/->vector))]
27 |       {:pipe-fn best-pipe-fn
28 |        :fit-ctx fit-ctx :metric metric})))
29 |
30 |
--------------------------------------------------------------------------------
/src/scicloj/ml/polyglot_kmeans.clj:
--------------------------------------------------------------------------------
1 | (ns scicloj.ml.polyglot-kmeans
2 | (:require
3 | [scicloj.sklearn-clj.metamorph]
4 | [nextjournal.clerk :as clerk]
5 | [libpython-clj2.require :refer [require-python]]
6 | [libpython-clj2.python :as py :refer [py.- py.]]))
7 |
8 | (comment ; REPL helpers: start Clerk, build the static page, clear the cache
9 |   (clerk/serve! {:browse? true})
10 |   (clerk/build-static-app! {:paths ["src/scicloj/ml/polyglot_kmeans.clj"]
11 |                             :bundle? false})
12 |   (clerk/clear-cache!))
13 |
14 | ^{::clerk/visibility #{:hide}}
15 | (clerk/code
16 | "
17 | from sklearn.datasets import make_blobs
18 | from sklearn.cluster import KMeans
19 | from sklearn.preprocessing import StandardScaler
20 |
21 | features, true_labels = make_blobs(
22 | n_samples=200,
23 | centers=3,
24 | cluster_std=2.75,
25 | random_state=42
26 | )
27 |
28 | scaler = StandardScaler()
29 | scaled_features = scaler.fit_transform(features)
30 |
31 | kmeans = KMeans(
32 | init=\"random\",
33 | n_clusters=3,
34 | n_init=10,
35 | max_iter=300,
36 | random_state=42)
37 |
38 | kmeans.fit(scaled_features)
39 |
40 | kmeans.inertia_
41 | ")
42 |
43 |
44 |
45 | ;; # 1. Use libpython-clj
46 | ;; This is using the same python classes as above
47 | ;; So it is "the same code"
48 | ;;
49 | (require-python '[sklearn.datasets :refer [make_blobs]]
50 | '[sklearn.preprocessing :refer [StandardScaler]]
51 | '[sklearn.cluster :refer [KMeans]])
52 |
53 |
54 |
55 | (def blobs
56 | (make_blobs :n_samples 200
57 | :n_features 50
58 | :centers 3
59 | :cluster_std 2.75
60 | :random_state 42))
61 |
62 | (def scaler (StandardScaler))
63 | (def features (first blobs))
64 | (def scaled-features (py. scaler fit_transform features))
65 | (def k-means (KMeans
66 | :init "random"
67 | :n_clusters 3
68 | :n_init 10
69 | :max_iter 300
70 | :random_state 42))
71 | (py. k-means fit scaled-features)
72 | (py.- k-means inertia_)
73 |
74 | (println :python
75 | (py.- k-means inertia_))
76 |
77 |
78 | ;; # 2. use sklearn-clj
79 | ;; This library allows using all estimators/models from sklearn
80 | ;; It uses libpython-clj, but "hidden" behind sklearn-clj
81 | ;;
82 |
83 | (require '[scicloj.ml.sklearnclj])
84 | (require '[scicloj.ml.dataset :as ds]
85 | '[scicloj.ml.metamorph :as mm]
86 | '[scicloj.ml.core :as ml]
87 | '[scicloj.sklearn-clj.metamorph :as sklearn-clj])
88 |
89 |
90 | (def data (-> blobs first py/->jvm ds/dataset))
91 |
92 | (def fitted-ctx-1
93 | (ml/fit
94 | data
95 | (mm/std-scale :all {})
96 | {:metamorph/id :k-means}
97 | (sklearn-clj/estimate
98 | :sklearn.cluster "KMeans"
99 | {:init "random"
100 | :n_clusters 3
101 | :n_init 10
102 | :max_iter 300
103 | :random_state 42})))
104 | (-> fitted-ctx-1 :k-means :attributes :inertia_)
105 |
106 |
107 | ;; # 3. use Clojure only pipeline
108 | ;; So no python interop in use
109 | ;; It uses clustering algorithms from JVM library Smile
110 |
111 | (require '[scicloj.ml.smile.clustering :as clustering])
112 |
113 | (def fitted-ctx-2
114 | (ml/fit
115 | data
116 | (mm/std-scale :all {})
117 | {:metamorph/id :k-means}
118 | (scicloj.ml.smile.clustering/cluster
119 | :k-means
120 | [3 300]
121 | :cluster)))
122 |
123 | (-> fitted-ctx-2 :k-means :info :distortion)
124 |
125 |
126 | ;; # 4. use declarative Clojure only pipeline
127 | ;; same as 3), only using metamorph declarative pipelines
128 |
129 |
130 |
131 | (def decl-pipe
132 | [[:mm/std-scale :all {}]
133 | {:metamorph/id :k-means}
134 | [:scicloj.ml.smile.clustering/cluster
135 | :k-means
136 | [3 300]
137 | :cluster]])
138 |
139 | (def distortion-1
140 | (->> decl-pipe
141 | ml/->pipeline
142 | (ml/fit-pipe data)
143 | :k-means
144 | :info
145 | :distortion))
146 |
147 |
148 | (frequencies
149 | (repeatedly 1000 (fn []
150 | (->> decl-pipe
151 | ml/->pipeline
152 | (ml/fit-pipe data)
153 | :k-means
154 | :info
155 | :distortion))))
156 |
157 |
158 |
159 |
160 | ;; # 5. in one threading macro, no variables declared
161 | ;; same as 4., but written more compact
162 |
163 | (def distortion-2 ; distortion of the fitted :k-means step, like `distortion-1`
164 |   (->> [[:mm/std-scale :all {}]
165 |         {:metamorph/id :k-means}
166 |         [:scicloj.ml.smile.clustering/cluster
167 |          :k-means
168 |          [3 300]
169 |          :cluster]]
170 |        ml/->pipeline
171 |        (ml/fit-pipe data)
172 |        :k-means
173 |        :info :distortion))
174 |
--------------------------------------------------------------------------------
/src/scicloj/ml/sklearnclj.clj:
--------------------------------------------------------------------------------
1 | (ns scicloj.ml.sklearnclj
2 | (:require
3 | [notespace.api :as note]
4 | [notespace.kinds :as kind]
5 | [scicloj.sklearn-clj.ml]
6 | [scicloj.ml.ug-utils]))
7 |
8 |
9 |
10 |
11 | (comment
12 | (note/init-with-browser)
13 | (note/eval-this-notespace)
14 | (note/reread-this-notespace)
15 | (note/render-static-html "docs/userguide-sklearnclj.html")
16 | (note/init))
17 |
18 |
19 | ["# sklearn-clj"]
20 |
21 | ["The [scicloj.ml](https://github.com/scicloj/scicloj.ml) plugin [sklearn-clj](https://github.com/scicloj/sklearn-clj)
22 | gives easy access to all models from [scikit-learn](https://scikit-learn.org/stable/)"]
23 |
24 | ["After [libpython.clj](https://github.com/clj-python/libpython-clj)
25 | has been setup with the python package sklearn installed,
26 | the following lines show how to use any sklearn model in a usual `scicloj.ml` pipeline:"]
27 |
28 | (require '[scicloj.ml.core :as ml]
29 | '[scicloj.ml.metamorph :as mm]
30 | '[scicloj.ml.dataset :as ds]
31 | '[tech.v3.dataset.tensor :as dst]
32 | '[scicloj.sklearn-clj :as sklearn-clj]
33 | '[scicloj.sklearn-clj.ml]
34 | '[scicloj.metamorph.ml.toydata :as toydata]
35 | '[libpython-clj2.python :refer [py.-] :as py])
36 |
37 |
38 | ["Example: logistic regression"]
39 |
40 | (def ds (dst/tensor->dataset [[0 0 0 ] [1 1 1 ] [2 2 2]]))
41 |
42 | ["Make pipe with sklearn model 'logistic-regression'"]
43 | (def pipe
44 | (ml/pipeline
45 | (mm/set-inference-target 2)
46 | {:metamorph/id :model}
47 | (mm/model {:model-type :sklearn.classification/logistic-regression
48 | :max-iter 100})))
49 |
50 |
51 | ["Train model"]
52 | (def fitted-ctx
53 | (pipe {:metamorph/data ds
54 | :metamorph/mode :fit}))
55 |
56 | ["Predict on new data"]
57 | (->
58 | (ml/transform-pipe
59 | (dst/tensor->dataset [[3 4 5]])
60 | pipe
61 | fitted-ctx)
62 | :metamorph/data)
63 |
64 | ["Access model details via python interop (libpython-clj)"]
65 | (-> fitted-ctx :model :model-data :model
66 | (py.- coef_)
67 | (py/->jvm))
68 |
69 |
70 |
71 |
72 |
73 | ["All model attributes are as well in the context"]
74 |
75 | (def model-attributes
76 | (-> fitted-ctx :model :model-data :attributes))
77 |
78 | ^kind/hiccup-nocode
79 | [:dl (map
80 | (fn [[k v]]
81 | [:span
82 | (vector :dt k)
83 | (vector :dd (clojure.pprint/write v :stream nil))])
84 | model-attributes)]
85 |
86 |
87 |
88 | ["# Models"]
89 |
90 | ["Below all models are listed with their parameters and the original documentation.
91 |
92 | The parameters are given as Clojure keys in kebab-case. As the document texts are imported from python
93 | they refer to the python spelling of the parameter. But the translation between the two should be obvious."]
94 |
95 | ^kind/hiccup-nocode
96 | [:ul
97 |
98 |
99 | (->>
100 | (ml/model-definition-names)
101 | (filter #(contains? #{"sklearn.classification"
102 | "sklearn.regression"}
103 |
104 | (namespace %)))
105 | sort
106 | (map
107 | #(vector :li [:a {:href (str "#" (str %))} (str %)])))]
108 |
109 |
110 |
111 |
112 | ["## Sklearn classification"]
113 | ^kind/hiccup-nocode
114 | (scicloj.ml.ug-utils/render-key-info ":sklearn.classification")
115 |
116 |
117 | ["## Sklearn regression"]
118 | ^kind/hiccup-nocode
119 | (scicloj.ml.ug-utils/render-key-info ":sklearn.regression")
120 |
--------------------------------------------------------------------------------
/src/scicloj/ml/third_party.clj:
--------------------------------------------------------------------------------
1 | (ns scicloj.ml.third-party
2 | (:require [notespace.api :as note]
3 | [notespace.kinds :as kind]
4 | [scicloj.ml.ug-utils :refer :all]
5 | [dk.simongray.datalinguist.ml.crf]
6 | [scicloj.ml.clj-djl.mmml]
7 | [scicloj.ml.clj-djl.fasttext]
8 | [tech.v3.libs.arrow :as arrow]))
9 |
10 | (comment
11 | (note/init-with-browser)
12 | (note/eval-this-notespace)
13 | (note/reread-this-notespace)
14 | (note/render-static-html "docs/userguide-third_party.html")
15 | (note/init))
16 |
17 |
18 | (require '[scicloj.ml.core :as ml]
19 | '[scicloj.ml.metamorph :as mm]
20 | '[scicloj.ml.dataset :as ds]
21 | '[tech.v3.datatype.functional :as dfn]
22 | '[clojure.tools.namespace.find :as ns-find]
23 | '[clojure.java.classpath :as cp]
24 | '[scicloj.ml.xgboost]
25 | '[camel-snake-kebab.core :as csk])
26 |
27 |
28 |
29 |
30 |
31 |
32 | ["# xgboost"]
33 | ["## Example code"]
34 |
35 | (def house-price
36 | (->
37 | (ds/dataset
38 | "http://d2l-data.s3-accelerate.amazonaws.com/kaggle_house_pred_train.csv" {:key-fn csk/->kebab-case-keyword})
39 | (ds/replace-missing :type/string "NA")
40 | (ds/categorical->number #(ds/select-columns % :type/string))))
41 |
42 |
43 | (def split (first (ds/split->seq house-price :holdout)))
44 |
45 | (def train-ds (:train split))
46 | (def test-ds (:test split))
47 |
48 |
49 | (def pipe-fn
50 | (ml/pipeline
51 | (mm/replace-missing :type/numerical :value 0)
52 | (mm/set-inference-target :sale-price)
53 | {:metamorph/id :model} (mm/model {:model-type :xgboost/linear-regression})))
54 |
55 | (def fit-result
56 | (let [fitted-ctx
57 | (ml/fit-pipe train-ds pipe-fn)
58 | test-predictions
59 | (ml/transform-pipe test-ds pipe-fn fitted-ctx)
60 | error
61 | (ml/mae (-> test-predictions :metamorph/data :sale-price)
62 | (-> test-ds :sale-price))]
63 | {:error error
64 | :gains (->
65 | (ml/explain (-> fitted-ctx :model))
66 | (ds/order-by :gain :desc))}))
67 |
68 |
69 |
70 | ["error:"]
71 | (:error fit-result)
72 |
73 | ["Feature importance - gain"]
74 |
75 | ^kind/dataset
76 | (:gains fit-result)
77 |
78 | ["## Reference"]
79 |
80 | ^kind/hiccup-nocode (render-key-info ":xgboost")
81 |
82 | ["# Deep learning models via clj-djl "]
83 |
84 |
85 |
86 | (def train-ds
87 | (ds/dataset
88 | "http://d2l-data.s3-accelerate.amazonaws.com/kaggle_house_pred_train.csv"))
89 |
90 |
91 | (def test-ds
92 | (->
93 | (ds/dataset
94 | "http://d2l-data.s3-accelerate.amazonaws.com/kaggle_house_pred_test.csv")
95 | (ds/add-column "SalePrice" 0)))
96 |
97 | (defn numeric-features [ds] ; intersection of `ds/numeric` and `ds/feature` applied to `ds`
98 |   (let [numeric-cols (ds/numeric ds) feature-cols (ds/feature ds)]
99 |     (ds/intersection numeric-cols feature-cols)))
100 |
101 | (defn update-columns
102 |   "Applies `update-fn` to several columns of `dataframe`. The columns are given either
103 |   as a seq of column names or as a selector function called with the dataframe."
104 |   [dataframe col-name-seq-or-fn update-fn]
105 |   (let [selected-names (if (fn? col-name-seq-or-fn)
106 |                          (ds/column-names (col-name-seq-or-fn dataframe))
107 |                          col-name-seq-or-fn)]
108 |     (ds/update-columns dataframe selected-names update-fn)))
109 |
110 |
111 |
112 |
113 | (require
114 | '[clj-djl.nn :as nn]
115 | '[clj-djl.training :as t]
116 | '[clj-djl.training.loss :as loss]
117 | '[clj-djl.training.optimizer :as optimizer]
118 | '[clj-djl.training.tracker :as tracker]
119 | '[clj-djl.training.listener :as listener]
120 | '[clj-djl.ndarray :as nd]
121 | '[clj-djl.nn.parameter :as param])
122 |
123 | (def learning-rate 0.05)
124 | (defn net [] (nn/sequential {:blocks (nn/linear {:units 1})
125 | :initializer (nn/normal-initializer)
126 | :parameter param/weight}))
127 |
128 | (defn cfg [] (t/training-config {:loss (loss/l2-loss)
129 | :optimizer (optimizer/sgd
130 | {:tracker (tracker/fixed learning-rate)})
131 | :evaluator (t/accuracy)
132 | :listeners (listener/logging)}))
133 |
134 |
135 |
136 | (def pipe
137 | (ml/pipeline
138 |
139 | (mm/drop-columns ["Id"])
140 | (mm/set-inference-target "SalePrice")
141 | (mm/replace-missing :type/numerical :value 0)
142 | (mm/replace-missing :!type/numerical :value "None")
143 | (ml/lift update-columns numeric-features
144 | #(dfn// (dfn/- % (dfn/mean %))
145 | (dfn/standard-deviation %)))
146 | (mm/transform-one-hot :!type/numerical :full)
147 | (mm/update-column "SalePrice"
148 | #(dfn// % (dfn/mean %)))
149 |
150 | (mm/set-inference-target "SalePrice")
151 |
152 | (mm/model {:model-type :clj-djl/djl
153 | :batchsize 64
154 | :model-spec {:name "mlp" :block-fn net}
155 | :model-cfg (cfg)
156 | :initial-shape (nd/shape 1 311)
157 | :nepoch 1})))
158 |
159 |
160 |
161 |
162 | (def trained-pipeline
163 | (pipe {:metamorph/data train-ds
164 | :metamorph/mode :fit
165 | :metamorph.ml/full-ds (ds/concat train-ds test-ds)}))
166 |
167 |
168 |
169 | (def predicted-pipeline
170 | (pipe
171 | (merge trained-pipeline
172 | {:metamorph/data test-ds
173 | :metamorph/mode :transform})))
174 |
175 |
176 |
177 |
178 | ( get
179 | (:metamorph/data predicted-pipeline)
180 | "SalePrice")
181 |
182 |
183 | ^kind/hiccup-nocode
184 | (render-key-info ":clj-djl/djl")
185 |
186 |
187 | ["# A NER model from Stanford CoreNLP"]
188 |
189 | ^kind/hiccup-nocode
190 | (render-key-info ":corenlp")
191 |
192 |
193 | ["# Fasttext text classification from DJL"]
194 |
195 | ^kind/hiccup-nocode
196 | (render-key-info ":clj-djl/fasttext")
197 |
198 | (def tweets
199 | (->
200 | (ds/dataset "data/tweets_sentiment.csv" {:key-fn keyword})
201 | (ds/drop-columns [:id])))
202 | ;; (def tweets
203 | ;; (arrow/stream->dataset "data/tweets_sentiment.feather"))
204 |
205 |
206 | ;; (require '[tech.v3.libs.arrow])
207 |
208 |
209 |
210 | ^kind/dataset
211 | tweets
212 |
213 | (def split (first (ds/split->seq
214 | (ds/shuffle tweets)
215 | :holdout)))
216 |
217 |
218 |
219 | (def model
220 | (ml/train (-> (:train split)
221 | (tech.v3.dataset.modelling/set-inference-target :label))
222 | {:model-type :clj-djl/fasttext
223 | :ft-training-config {:epoch 1}}))
224 |
225 | (def
226 | prob-distribution
227 | (ml/predict (:test split) (assoc model
228 | :top-k 3)))
229 | prob-distribution
230 |
--------------------------------------------------------------------------------
/src/scicloj/ml/titanic.clj:
--------------------------------------------------------------------------------
1 | (ns scicloj.ml.titanic
2 | (:require
3 | [notespace.api :as note]
4 | [notespace.kinds :as kind]))
5 |
6 | (comment
7 | (note/init-with-browser)
8 | (note/eval-this-notespace)
9 | (note/reread-this-notespace)
10 | (note/render-static-html "docs/userguide-titanic.html")
11 |
12 | (note/init))
13 |
14 |
15 |
16 | (require '[scicloj.ml.dataset :as ds]
17 | '[tech.v3.dataset.math :as ds-math]
18 | '[tech.v3.datatype.functional :as dfn]
19 | '[scicloj.ml.core :as ml]
20 | '[scicloj.ml.metamorph :as mm]
21 | '[camel-snake-kebab.core :as csk]
22 | '[scicloj.metamorph.ml.loss :as loss]
23 | '[clojure.string :as str]
24 | '[fastmath.stats :as stats]
25 | '[fastmath.random :as rnd]
26 | '[scicloj.ml.xgboost])
27 |
28 |
29 |
30 | ["## Introduction "]
31 |
32 | [" In this example, we will train a model which is able to predict the survival of passengers from the Titanic dataset."
33 | "In a real analysis, this would contain as well explorative analysis of the data, which I will skip here,
34 | as the purpose is to showcase machine learning with scicloj.ml, which is about model evaluation and selection."]
35 |
36 |
37 |
38 | ["### Read data"]
39 |
40 | (def data (ds/dataset "data/titanic/train.csv" {:key-fn csk/->kebab-case-keyword}))
41 |
42 |
43 |
44 | ["Column info:"]
45 | (ds/info data)
46 |
47 |
48 | ["We can explore the association between the categorical columns of the dataset
49 | with the :survived using cramers-v-corrected:"]
50 | (def categorical-feature-columns [:pclass :sex :age :parch
51 | :embarked])
52 | (map
53 | #(hash-map
54 | %
55 | (stats/cramers-v-corrected
56 | (get data %)
57 | (:survived data)))
58 | categorical-feature-columns)
59 |
60 | ["In this dataset, :sex seems to be the best predictor for survival."]
61 |
62 | ["Association between the select variables:"]
63 | (for [c1 categorical-feature-columns c2 categorical-feature-columns]
64 | {[c1 c2]
65 | (stats/cramers-v-corrected (get data c1) (get data c2))})
66 |
67 |
68 | ["This shows how much the columns are correlated. "]
69 |
70 | ["## clean some of the features"]
71 |
72 | ["The following functions will be used in the pipeline. They clean the
73 | features to make them better predictors."]
74 |
75 | (defn categorize-cabin
76 |   "Replaces the :cabin column of `data` with a keyword made of each value's
77 |   first character, or :unknown when the value is empty."
78 |   [data]
79 |   (let [cabin->deck (fn [cabin]
80 |                       (if (empty? cabin)
81 |                         :unknown
82 |                         (keyword (subs cabin 0 1))))]
83 |     (ds/add-or-replace-column data
84 |                               :cabin
85 |                               (map cabin->deck (:cabin data)))))
86 |
87 |
88 | (defn categorize-age
89 |   "Adds (or replaces) an :age-group column derived from the :age column of `data`:
90 |   below 10 -> :child, below 18 -> :teen, below 60 -> :adult, 60 and above -> :elderly."
91 |   [data]
92 |   (ds/add-or-replace-column
93 |    data :age-group
94 |    (map #(cond
95 |            (< % 10) :child
96 |            (< % 18) :teen
97 |            (< % 60) :adult
98 |            (>= % 60) :elderly ; `>=` so that an age of exactly 60 is :elderly, not :other
99 |            :else :other)
100 |         (:age data))))
101 |
102 | ["We want to create a new column :title which might help in the score.
103 | This is an example of custom function, which creates a new column from existing columns,
104 | which is a typical case of feature engineering."]
105 |
106 | (defn name->title
107 |   "Adds a :title column parsed from each :name of `dataset` (the trimmed text after
108 |   the last comma of whatever precedes the first period), then drops the :name column."
109 |   [dataset]
110 |   (-> dataset
111 |       (ds/add-or-replace-column
112 |        :title
113 |        ;; Names are read from the `dataset` argument, not from the global `data` var.
114 |        (map #(-> % (str/split #"\.") first
115 |                  (str/split #"\,") last str/trim)
116 |             (dataset :name)))
117 |       (ds/drop-columns :name)))
118 |
119 | (def title-map
120 | {"Major" :a
121 | "Col" :a
122 | "Rev" :a
123 | "Ms" :b
124 | "Miss" :b
125 | "Jonkheer" :a
126 | "Don" :a
127 | "Mlle" :b
128 | "Mr" :a
129 | "Master" :a
130 | "Capt" :a
131 | "Mrs" :b
132 | "Lady" :b
133 | "Sir" :a
134 | "Dr" :a
135 | "the Countess" :b
136 | "Mme" :b})
137 |
138 | (defn categorize-title
139 |   "Replaces each :title value of `data` with its group looked up in `title-map`
140 |   (nil for titles that are not in the map)."
141 |   [data]
142 |   (let [title-groups (map title-map (:title data))]
143 |     (ds/add-or-replace-column data :title title-groups)))
144 |
145 | ["The final pipeline contains the functions we did before."]
146 |
147 |
148 | ;; => _unnamed [2 1]:
149 | ;; | :a |
150 | ;; |----|
151 | ;; | |
152 | ;; | |
153 |
154 | (def pipeline-fn
155 | (ml/pipeline
156 | (mm/replace-missing :embarked :value "S")
157 | (mm/replace-missing :age :value tech.v3.datatype.functional/mean)
158 | (mm/update-column :parch str)
159 | (ml/lift categorize-age)
160 | (ml/lift name->title)
161 | (ml/lift categorize-title)
162 | (ml/lift categorize-cabin)
163 | (mm/select-columns [:age-group
164 | :cabin
165 | :embarked
166 | :fare
167 | :parch
168 | :pclass
169 | :sex
170 | :survived
171 | :title])
172 |
173 | (fn [ctx]
174 | (assoc ctx :categorical-ds
175 | (:metamorph/data ctx)))
176 |
177 |
178 | (mm/categorical->number [:survived :pclass :sex :embarked
179 | :title :age-group :cabin :parch] {} :int64)
180 |
181 | (mm/set-inference-target :survived)))
182 |
183 |
184 | ["Transformed data"]
185 | (->
186 | (pipeline-fn {:metamorph/data data :metamorph/mode :fit})
187 | :metamorph/data)
188 |
189 |
190 | ["The following splits the dataset in three pieces,
191 | train, val and test to predict on later.
192 | "]
193 |
194 |
195 |
196 |
197 |
198 | (def ds-split (first (ds/split->seq data :holdout {:ratio [0.8 0.2]
199 | :split-names [:train-val :test]})))
200 |
201 |
202 | ["Create a sequence of train/test (k-fold with k=10) splits used to evaluate the pipeline."]
203 | (def train-val-splits
204 | (ds/split->seq
205 | (:train-val ds-split)
206 | :kfold
207 | {:k 10}))
208 |
209 |
210 |
211 |
212 | ["The full pipeline definition including the random forest model."]
213 |
214 | (def full-pipeline-fn
215 | (ml/pipeline
216 | pipeline-fn
217 | ;; we overwrite the id, so the model function will store
218 | ;; it's output (the model) in the pipeline ctx under key :model
219 | {:metamorph/id :model}
220 | (mm/model {:model-type :smile.classification/random-forest})))
221 |
222 |
223 |
224 |
225 |
226 | ["Evaluate the (single) pipeline function using the train/test split"]
227 | (def evaluations
228 | (ml/evaluate-pipelines
229 | [full-pipeline-fn]
230 | train-val-splits
231 | ml/classification-accuracy
232 | :accuracy))
233 |
234 |
235 | ["The default k-fold splits makes 10 folds,
236 | so we train 10 models, each having its own loss."]
237 |
238 | ["The `evaluate-pipelines` fn averages the models per pipe-fn,
239 | and returns the best.
240 | So we get a single model back, as we only have one pipe fn"]
241 |
242 | ["Often we consider the model with the lowest loss to be the best."]
243 |
244 | ["Returning a single model only (as a list of 1), namely the best over all
245 | pipeline functions
246 | and all cross validations, is the default behaviour, but can be changed
247 | with the `tune options`."]
248 |
249 | ["They control as well which information is returned."]
250 |
251 | ["`tech.ml` stores the models in the context in a serialized form,
252 | and the function `thaw-model` can be used to get the original model back.
253 | This is a Java class in the case of
254 | model :smile.classification/random-forest, but this depends on
255 | which `model` function is in the pipeline"]
256 |
257 | ["We can get for example, the models like this:"]
258 |
259 | (def models
260 |   "One map per evaluation result (thawed model, test metric, fit context),
261 |   ordered from highest to lowest :metric."
262 |   (->> evaluations
263 |        flatten
264 |        (map #(hash-map :model (ml/thaw-model (get-in % [:fit-ctx :model]))
265 |                        :metric ((comp :metric :test-transform) %)
266 |                        :fit-ctx (:fit-ctx %)))
267 |        (sort-by :metric) ; the maps built above carry :metric; they have no :mean key
268 |        reverse))
269 |
270 |
271 | ["The accuracy of the best trained model is:"]
272 | (-> models first :metric)
273 |
274 | ["The one with the highest accuracy is then:"]
275 | (-> models first :model)
276 |
277 |
278 | ["We can get the predictions on new-data, which for classification contain as well
279 | the posterior probabilities per class."]
280 |
281 | ["We do this by running the pipeline again, this time with new data and merging
282 | :mode transform"]
283 |
284 | (def predictions
285 | (->
286 | (full-pipeline-fn
287 | (assoc
288 | (:fit-ctx (first models))
289 | :metamorph/data (:test ds-split)
290 | :metamorph/mode :transform))
291 | :metamorph/data))
292 |
293 | ^kind/dataset
294 | predictions
295 |
296 |
297 | ["Out of the predictions and the truth, we can construct the
298 | confusion matrix."]
299 |
300 | (def trueth
301 | (->
302 | (full-pipeline-fn {:metamorph/data (:test ds-split) :metamorph/mode :fit})
303 | :metamorph/data
304 | tech.v3.dataset.modelling/labels))
305 |
306 | ^kind/dataset
307 | (->
308 | (ml/confusion-map (:survived predictions)
309 | (:survived trueth)
310 | :none)
311 | (ml/confusion-map->ds))
312 |
313 | ["### Hyper parameter tuning"]
314 |
315 | ["This defines a pipeline with options. The options gets passed to the model function,
316 | so become hyper-parameters of the model.
317 |
318 | The `use-age?` options is used to make a conditional pipeline. As the use-age? variable becomes part of the grid to search in,
319 | we tune it as well.
320 | This is an example of how pipeline-options can be grid searched in the same way as hyper-parameters of the model.
321 |
322 | "]
323 | (defn make-pipeline-fn [options]
324 |
325 | (ml/pipeline
326 | pipeline-fn
327 | {:metamorph/id :model}
328 | (mm/model
329 | (merge options
330 | {:model-type :smile.classification/random-forest}))))
331 |
332 | ["Use sobol optimization, to find some grid points,
333 | which cover in a smart way the hyper-parameter space."]
334 |
335 | (def search-grid
336 | (->>
337 | (ml/sobol-gridsearch {:trees (ml/linear 100 500 10)
338 | :mtry (ml/categorical [0 2 4])
339 | :split-rule (ml/categorical [:gini :entropy])
340 | :max-depth (ml/linear 1 50 10)
341 | :node-size (ml/linear 1 10 10)})
342 |
343 | (take 500)))
344 |
345 |
346 | ["Generate the pipeline-fns we want to evaluate."]
347 | (def pipeline-fns (map make-pipeline-fn search-grid))
348 |
349 | (defn xgboost-pipe [opts]
350 | (ml/pipeline
351 | pipeline-fn
352 | {:metamorph/id :model}
353 | (mm/model
354 | (merge opts
355 | {:model-type :xgboost/classification}))))
356 |
357 | (def xgboost-pipes
358 | (->>
359 | (ml/sobol-gridsearch
360 | (ml/hyperparameters :xgboost/classification))
361 | (take 500)
362 | (map xgboost-pipe)))
363 |
364 |
365 | ;; (ml/fit-pipe (:train (first train-val-splits)) xgboost-pipe)
366 |
367 | ["Evaluate all pipelines and keep results"]
368 | (def evaluations
369 |
370 | (ml/evaluate-pipelines
371 | (take 10
372 | (concat xgboost-pipes xgboost-pipes))
373 | train-val-splits
374 | ml/classification-accuracy
375 | :accuracy
376 | {:return-best-pipeline-only false
377 | :return-best-crossvalidation-only false
378 | ;; :evaluation-handler-fn (fn [m]
379 | ;; (println (:metric m)))
380 |
381 |
382 | :map-fn :map}))
383 |
384 |
385 |
386 |
387 |
388 |
389 |
390 | ["Get the key information from the evaluations and sort by the metric function used,
391 | accuracy here."]
392 |
393 | (def models
394 | (->> evaluations
395 | flatten
396 | (map
397 | #(assoc
398 | (select-keys % [:test-transform :fit-ctx :pipe-fn])
399 |
400 | :model (ml/thaw-model (get-in % [:fit-ctx :model]))))
401 | (sort-by (comp :metric :test-transform))
402 | reverse))
403 |
404 |
405 |
406 |
407 | ["As we did several pipelines and several x-fold cross validation, we have quite some models trained in total "]
408 | (count models)
409 |
410 | ["As we sorted by mean accuracy, the first evaluation result is the best model,"]
411 | (def best-model (first models))
412 |
413 | ["which is: "]
414 | (:model best-model)
415 |
416 | ["with a mean accuracy of " (-> best-model :test-transform :mean)]
417 | ["and a accuracy of " (-> best-model :test-transform :metric)]
418 |
419 |
420 | (println "mean acc: " (-> best-model :test-transform :mean))
421 | (println "acc: " (-> best-model :test-transform :metric))
422 |
423 |
424 | ["using options: "]
425 | (-> best-model :fit-ctx :model :options)
426 | (clojure.pprint/pprint (-> best-model :fit-ctx :model :options))
427 |
428 | (def test-data (ds/dataset "data/titanic/test.csv"
429 | {:key-fn csk/->kebab-case-keyword}))
430 |
431 |
432 |
433 | (def predition-on-test
434 | (full-pipeline-fn
435 | (assoc (:fit-ctx best-model)
436 | :metamorph/data (ds/add-column test-data :survived nil)
437 | :metamorph/mode :transform)))
438 |
439 |
440 | (def prediction-ds
441 | (->
442 | (predition-on-test :metamorph/data)
443 | (ds/add-column :passenger-id (:passenger-id test-data))
444 | (ds/convert-types [:survived] :int)
445 | (ds/select-columns [:passenger-id :survived 0 1])))
446 |
447 | ^kind/dataset
448 | prediction-ds
449 |
450 |
451 |
452 |
453 |
454 | ["# Create Submission file for Kaggle"]
455 |
456 | (def submission-ds
457 | (-> prediction-ds
458 | (ds/select-columns [:passenger-id :survived])
459 | (ds/rename-columns {:passenger-id "PassengerId"
460 | :survived "Survived"})))
461 |
462 | (ds/write-csv! submission-ds "submission.csv")
463 |
464 |
465 | ["### Learning curve"]
466 |
467 |
468 |
469 | (def training-curve-splits
470 | (map
471 | #(hash-map :train (ds/head (:train-val ds-split) %)
472 | :test (:test ds-split))
473 | (range 5 (ds/row-count (:train-val ds-split)) 10)))
474 |
475 |
476 |
477 | (def training-curve-evaluations
478 | (ml/evaluate-pipelines [(:pipe-fn (first models))]
479 | training-curve-splits
480 | ml/classification-accuracy
481 | :accuracy
482 | {:map-fn :map
483 | :return-best-pipeline-only false
484 | :return-best-crossvalidation-only false
485 | :evaluation-handler-fn identity}))
486 |
487 | (def train-counts
488 | (->> training-curve-evaluations flatten (map #(-> % :fit-ctx :metamorph/data ds/row-count))))
489 |
490 |
491 |
492 | (def test-metrices
493 | (->> training-curve-evaluations flatten (map #(-> % :test-transform :metric))))
494 |
495 | (def train-metrices
496 | (->> training-curve-evaluations flatten (map #(-> % :train-transform :metric))))
497 |
498 | (def traing-curve-plot-data
499 | (reverse
500 | (sort-by :metric
501 | (flatten
502 | (map
503 | #(vector (zipmap [:count :metric :type] [%1 %2 :test])
504 | (zipmap [:count :metric :type] [%1 %3 :train]))
505 | train-counts
506 | test-metrices
507 | train-metrices)))))
508 |
509 |
510 | ^kind/vega
511 | {
512 | :data {:values traing-curve-plot-data}
513 |
514 | :width 500
515 | :height 500
516 | :mark {:type "line"}
517 | :encoding {:x {:field :count :type "quantitative"}
518 | :y {:field :metric :type "quantitative"}
519 | :color {:field :type}}}
520 |
521 |
522 |
523 |
524 |
525 | (comment
526 | (->>
527 | (map
528 | #(hash-map :test-metric %1
529 | :train-metric %2
530 | :better? (if (> %1 %2) :test :train))
531 | (->> training-curve-evaluations flatten (map :metric))
532 | (->> training-curve-evaluations flatten (map #(get-in % [:train-prediction :metric]))))
533 | (map :better?)
534 | frequencies)
535 |
536 | (println
537 | (-> (ds/dataset {:x ["A" "B" "C" "D" "E" "F"] :y (range)})
538 | (ds/categorical->one-hot [:x] {} :int)
539 | (ds/set-inference-target :y)
540 | (scicloj.metamorph.ml/train {:model-type :smile.regression/ordinary-least-square})
541 | ml/thaw-model)))
542 |
543 |
544 |
--------------------------------------------------------------------------------
/src/scicloj/ml/transformers.clj:
--------------------------------------------------------------------------------
1 | (ns scicloj.ml.transformers
2 | (:require
3 | [notespace.api :as note]
4 | [notespace.kinds :as kind]
5 | [scicloj.ml.metamorph :as mm]))
6 |
7 |
8 |
9 | (comment
10 | (note/init-with-browser)
11 | (note/eval-this-notespace)
12 | (note/render-static-html "docs/userguide-transformers.html"))
13 |
14 |
15 | (require '[scicloj.ml.core :as ml]
16 | '[scicloj.ml.dataset :as ds]
17 | '[scicloj.ml.metamorph :as mm])
18 |
19 |
20 |
21 | ^kind/hidden
22 | (defn docu-fn [v]
23 | (let [m (meta v)]
24 | (kind/override
25 | [
26 | (str "## Transformer " "**" (:name m) "**")
27 | "----------------------------------------------------------"
28 | "__Clojure doc__:\n"
29 | (:doc m)
30 | "----------------------------------------------------------"]
31 |
32 | kind/md-nocode)))
33 |
34 |
35 |
36 |
37 | (docu-fn (var mm/count-vectorize))
38 |
39 | ["In the following we transform the text given in a dataset into a
40 | map of token counts applying some default text normalization."]
41 | (def data (ds/dataset {:text ["Hello Clojure world, hello ML word !"
42 | "ML with Clojure is fun"]}))
43 |
44 |
45 | ^kind/dataset-grid
46 | data
47 |
48 | ["_"]
49 |
50 | (def fitted-ctx
51 | (ml/fit data
52 | (mm/count-vectorize :text :bow)))
53 |
54 |
55 |
56 | fitted-ctx
57 |
58 | (def bow-ds
59 | (:metamorph/data fitted-ctx))
60 |
61 | ^kind/dataset
62 | bow-ds
63 |
64 |
65 | ["A custom tokenizer can be specified by either passing options to
66 | `scicloj.ml.smile.nlp/default-tokenize` "]
67 |
68 |
69 | (def fitted-ctx
70 | (ml/fit
71 | data
72 | (mm/count-vectorize :text :bow {:stopwords ["clojure"]
73 | :stemmer :none})))
74 |
75 |
76 | fitted-ctx
77 |
78 | ["or passing in an implementation of a tokenizer function"]
79 |
80 | (def fitted-ctx
81 | (ml/fit
82 | data
83 | (mm/count-vectorize
84 | :text :bow
85 | {:text->bow-fn (fn [text options]
86 | {:a 1 :b 2})})))
87 |
88 | fitted-ctx
89 |
90 |
91 |
92 | (docu-fn (var mm/bow->SparseArray))
93 | ["Now we convert the bag-of-words map to a sparse array of class
94 | `smile.util.SparseArray`
95 |
96 | "]
97 | (def ctx-sparse
98 | (ml/fit
99 | bow-ds
100 | (mm/bow->SparseArray :bow :sparse)))
101 |
102 | ctx-sparse
103 |
104 |
105 | ^kind/dataset
106 | (:metamorph/data ctx-sparse)
107 |
108 | ["The SparseArray instances look like this:"]
109 | (zipmap
110 | (:text bow-ds)
111 | (map seq
112 | (-> ctx-sparse :metamorph/data :sparse)))
113 |
114 | (docu-fn (var mm/bow->sparse-array))
115 | ["Now we convert the bag-of-words map to a sparse array of class
116 | `java primitive int array`
117 | "]
118 | (def ctx-sparse
119 | (ml/fit
120 | bow-ds
121 | (mm/bow->sparse-array :bow :sparse)))
122 |
123 | ctx-sparse
124 |
125 | ["We see as well the sparse representation as indices against the vocabulary
126 | of the non-zero counts."]
127 |
128 | (zipmap
129 | (:text bow-ds)
130 | (map seq
131 | (-> ctx-sparse :metamorph/data :sparse)))
132 |
133 |
134 |
135 |
136 | ["In both ->sparse function we can control the vocabulary via
137 | the option to pass in a different / custom functions which creates
138 | the vocabulary from the bow maps."]
139 |
140 | (def ctx-sparse
141 | (ml/fit
142 | bow-ds
143 | (mm/bow->SparseArray
144 | :bow :sparse
145 | {:create-vocab-fn
146 | (fn [bow] (scicloj.ml.smile.nlp/->vocabulary-top-n bow 1))})))
147 |
148 |
149 | ctx-sparse
150 |
151 | (def ctx-sparse
152 | (ml/fit
153 | bow-ds
154 | (mm/bow->SparseArray
155 | :bow :sparse
156 | {:create-vocab-fn
157 | (fn [_]
158 | ["hello" "fun"])})))
159 |
160 |
161 | ctx-sparse
162 |
163 |
164 | (docu-fn (var mm/bow->tfidf))
165 | ["Here we calculate the tf-idf score from the bag of words:"]
166 |
167 | ^kind/dataset
168 | (ml/pipe-it
169 | bow-ds
170 | (mm/bow->tfidf :bow :tfidf {}))
171 |
172 |
173 |
174 | (docu-fn (var mm/model))
175 | ["The `model` transformer allows to execute all machine learning models
176 | which register themselves inside the `metamorph.ml` system via the function
177 | `scicloj.metamorph.ml/define-model!`.
178 | The built-in models are listed here:
179 | https://scicloj.github.io/scicloj.ml/userguide-models.html
180 |
181 | "]
182 |
183 | ["We use the Iris data for this example:"]
184 |
185 | (def iris
186 | (->
187 | (ds/dataset
188 | "https://raw.githubusercontent.com/scicloj/metamorph.ml/main/test/data/iris.csv" {:key-fn keyword})
189 | (tech.v3.dataset.print/print-range 5)))
190 |
191 |
192 |
193 | ^kind/dataset
194 | iris
195 |
196 | (def train-test
197 | (ds/train-test-split iris))
198 |
199 | ["The pipeline consists in specifying the inference target,
200 | transform target to categorical and the model function"]
201 | (def pipe-fn
202 | (ml/pipeline
203 | (mm/set-inference-target :species)
204 | (mm/categorical->number [:species])
205 | {:metamorph/id :model}
206 | (mm/model {:model-type :smile.classification/logistic-regression})))
207 |
208 | ["First we run the training "]
209 | (def fitted-ctx
210 | (ml/fit
211 | (:train-ds train-test)
212 | pipe-fn))
213 |
214 |
215 | ^kind/hidden
216 | (defn dissoc-in
217 |   "Removes the entry at key path `ks` from the nested map `m`; a one-key path acts like `dissoc`."
218 |   [m ks]
219 |   (if-not (= 1 (count ks))
220 |     (update-in m (butlast ks) dissoc (last ks))
221 |     (dissoc m (first ks))))
222 |
223 | (dissoc-in fitted-ctx [:model :model-data])
224 |
225 | ["and then prediction on test"]
226 |
227 | (def transformed-ctx
228 | (ml/transform-pipe (:test-ds train-test) pipe-fn fitted-ctx))
229 |
230 | (-> transformed-ctx
231 | (dissoc-in [:model :model-data])
232 | (update-in [:metamorph/data ] #(tech.v3.dataset.print/print-range % 5)))
233 |
234 |
235 | ["and we get the predictions: "]
236 | ^kind/dataset
237 | (-> transformed-ctx
238 | :metamorph/data
239 | (ds/reverse-map-categorical-xforms)
240 | (ds/select-columns :species)
241 | (ds/head))
242 |
243 |
244 | (docu-fn (var mm/std-scale))
245 | ["We can use the std-scale transformer to center and scale data."]
246 | ["Lets take some example data:"]
247 | (def data
248 | (ds/dataset
249 | [
250 | [100 0.001]
251 | [8 0.05]
252 | [50 0.005]
253 | [88 0.07]
254 | [4 0.1]]
255 | {:layout :as-row}))
256 |
257 | ^kind/dataset
258 | data
259 |
260 | ["Now we can center each column around 0 and scale
261 | it by the standard deviation of the column"]
262 |
263 | ^kind/dataset
264 | (ml/pipe-it
265 | data
266 | (mm/std-scale [0 1] {}))
267 |
268 |
269 | (docu-fn (var mm/min-max-scale))
270 |
271 | ["The min-max scaler scales columns in a specified interval,
272 | by default from -0.5 to 0.5"]
273 |
274 | ^kind/dataset
275 | (ml/pipe-it
276 | data
277 | (mm/min-max-scale [0 1] {}))
278 |
279 | (docu-fn (var mm/reduce-dimensions))
280 |
281 | ["#### PCA example"]
282 |
283 | ["In this example we run PCA on some data."]
284 |
285 | (require '[scicloj.metamorph.ml.toydata :as toydata])
286 |
287 | ["We use the sonar dataset which has 60 columns of quantitative data,
288 | which are certain measurements from a sonar device.
289 | The original purpose of the dataset is to learn to detect rock vs metal
290 | from the measurements"]
291 | (def sonar
292 | (toydata/sonar-ds))
293 |
294 | ^kind/dataset
295 | sonar
296 |
297 | (def col-names (map #(keyword (str "x" %))
298 | (range 60)))
299 |
300 | ["First we create and run a pipeline which does the PCA."
301 | "In this pipeline we do not fix the number of columns, as we want to
302 | plot the result for all numbers of components (up to 60) "]
303 |
304 | (def fitted-ctx
305 | (ml/fit
306 | sonar
307 | (mm/reduce-dimensions :pca-cov 60
308 | col-names
309 | {})))
310 |
311 |
312 | ["The next function transforms the result from the fitted pipeline
313 | into vega lite compatible format for plotting"]
314 | ["It accesses the underlying Smile Java object to get the data on
315 | the cumulative variance for each PCA component."]
316 | (defn create-plot-data [ctx]
317 | (map
318 | #(hash-map :principal-component %1
319 | :cumulative-variance %2)
320 | (range)
321 | (-> ctx vals (nth 2) :fit-result :model bean :cumulativeVarianceProportion)))
322 |
323 | ["Next we plot the cumulative variance over the component index:"]
324 | ^kind/vega
325 | {:$schema "https://vega.github.io/schema/vega-lite/v5.json"
326 | :width 850
327 | :data {:values
328 | (create-plot-data fitted-ctx)}
329 | :mark "line" ,
330 | :encoding
331 | {:x {:field :principal-component, :type "nominal"},
332 | :y {:field :cumulative-variance, :type "quantitative"}}}
333 |
334 | ["From the plot we see, that transforming the data via PCA and reducing
335 | it from 60 dimensions to about 25 would still preserve the full variance."]
336 | ["Looking at this plot, we could now make a decision, how many dimensions
337 | to keep."]
338 | ["We could for example decide, that keeping 60 % of the variance
339 | is enough, which would result in keeping the first 2 dimensions."]
340 |
341 | ["So our pipeline becomes:"]
342 |
343 |
344 | (def fitted-ctx
345 | (ml/fit
346 | sonar
347 | (mm/reduce-dimensions :pca-cov 2
348 | col-names
349 | {})
350 |
351 | (mm/select-columns [:material "pca-cov-0" "pca-cov-1"])
352 | (mm/shuffle)))
353 |
354 | ^kind/dataset
355 | (:metamorph/data fitted-ctx)
356 |
357 | ["As the data is now 2-dimensional, it is easy to plot:"]
358 |
359 | (def scatter-plot-data
360 | (-> fitted-ctx
361 | :metamorph/data
362 | (ds/select-columns [:material "pca-cov-0" "pca-cov-1"])
363 | (ds/rows :as-maps)))
364 |
365 |
366 | ^kind/vega
367 | {:$schema "https://vega.github.io/schema/vega-lite/v5.json"
368 | :data {:values scatter-plot-data}
369 | :width 500
370 | :height 500
371 |
372 | :mark :circle
373 | :encoding
374 | {:x {:field "pca-cov-0" :type "quantitative"}
375 | :y {:field "pca-cov-1" :type "quantitative"}
376 | :color {:field :material}}}
377 |
378 | ["The plot shows that the reduction to 2 dimensions does not create
379 | linear separable areas of `M` and `R`. So a linear model will not be
380 | able to predict well the material from the 2 PCA components."]
381 |
382 | ["It even seems, that the reduction to 2 dimensions removes
383 | too much information for predicting of the material for any type of model."]
384 |
--------------------------------------------------------------------------------
/src/scicloj/ml/tune_titanic.clj:
--------------------------------------------------------------------------------
1 | (ns scicloj.ml.tune-titanic
2 | (:require
3 | [notespace.api :as note]
4 | [notespace.kinds :as kind]))
5 |
6 | (comment
7 | (note/init-with-browser)
8 | (note/eval-this-notespace)
9 | (note/reread-this-notespace)
10 | (note/render-static-html "docs/tune-titanic.html")
11 | (note/init))
12 |
13 |
14 | ["This is the Clojure version of https://www.moritzkoerber.com/posts/preprocessing-hyperparameters/"]
15 |
16 | (require '[scicloj.ml.dataset :as ds]
17 | '[scicloj.ml.core :as ml]
18 | '[scicloj.ml.metamorph :as mm]
19 | '[camel-snake-kebab.core :as csk]
20 | '[scicloj.metamorph.ml.evaluation-handler :as eval-hn]
21 | '[tech.v3.datatype.functional :as dtfunc])
22 |
23 | (def categorical-features [:pclass :sex :embarked])
24 | (def numeric-features [:age :parch :fare])
25 |
26 | (defn map->vec "Flattens map `m` one level into [k1 v1 k2 v2 ...]; collection values stay intact." [m] (into [] cat m))
27 |
28 | ["Preprocessing pipelines including feature engineering"]
29 |
30 | (def data
31 | (-> (ds/dataset "data/titanic/train.csv"
32 | {:key-fn csk/->kebab-case-keyword})
33 | (ds/select-columns (concat categorical-features numeric-features [:survived]))
34 | (ds/replace-missing categorical-features :value "missing")
35 | (ds/categorical->one-hot categorical-features)))
36 |
37 |
38 | (defn replace-missing [options]
39 | (fn [ctx]
40 | ( (apply mm/replace-missing numeric-features (map->vec (:replace-missing-options options))) ctx)))
41 |
42 | (defn maybe-std-scale
43 |   "Returns a pipeline step that std-scales `numeric-features` when the
44 |   [:scaling-options :scale?] entry of `options` is truthy; otherwise ctx passes through."
45 |   [options]
46 |   (fn [ctx]
47 |     (cond-> ctx (get-in options [:scaling-options :scale?]) ((mm/std-scale numeric-features {})))))
48 |
49 | (defn assoc-pipe-opts [options]
50 | (fn [ctx]
51 | (assoc ctx :pipe-options options)))
52 |
53 |
54 | (defn make-decl-pipeline[model-type options]
55 | [[::assoc-pipe-opts options]
56 | [::replace-missing options]
57 | [:mm/categorical->number [:survived ] {} :int64]
58 | [::maybe-std-scale options]
59 | [:mm/set-inference-target :survived]
60 | {:metamorph/id :model} [:mm/model (merge (:model-options options) {:model-type model-type})]])
61 |
62 |
63 |
64 |
65 | (def logistic-regression-pipelines
66 | (map
67 | #(make-decl-pipeline :smile.classification/logistic-regression %)
68 | (ml/sobol-gridsearch {:scaling-options {:scale? (ml/categorical [true false])}
69 | :replace-missing-options {:value (ml/categorical [dtfunc/mean dtfunc/median])}
70 | :model-options {:lambda (ml/categorical [0.1 0.2 0.5 0.7 1])
71 | :tolerance (ml/categorical [0.1 0.01 0.001 0.0001])}})))
72 |
73 | (def random-forrest-pipelines
74 | (map
75 | #(make-decl-pipeline :smile.classification/random-forest %)
76 | (ml/sobol-gridsearch {:scaling-options {:scale? (ml/categorical [true false])}
77 | :replace-missing-options {:value (ml/categorical [dtfunc/mean dtfunc/median])}
78 | :model-options {:trees (ml/categorical [5 50 100 250])
79 | :max-depth (ml/categorical [5 8 10])}})))
80 |
81 | (def all-pipelines (concat random-forrest-pipelines))
82 |
83 |
84 |
85 | (def pipe-fns
86 | (mapv ml/->pipeline all-pipelines))
87 |
88 | ["Simple split"]
89 | (def splits (ds/split->seq data :holdout {:ratio 0.8}))
90 | (def train-ds ((first splits) :train))
91 | (def holdout-ds ((first splits) :test))
92 |
93 | ["Tune hyperparameter by evaluating all pipelines/models "]
94 |
95 | (def files (atom [])) ; an atom holding an initially empty vector
96 | (def best-evaluation
97 | (ml/evaluate-pipelines
98 | all-pipelines
99 | (ds/split->seq train-ds :kfold 5)
100 | ml/classification-accuracy
101 | :accuracy
102 | {;; :attach-fn-sources {:ns (find-ns 'scicloj.ml.tune-titanic)
103 | ;; :pipe-fns-clj-file "src/scicloj/ml/tune_titanic.clj"}
104 | :return-best-crossvalidation-only true
105 | :return-best-pipeline-only true}))
106 |
107 | (def best-accuracy (-> best-evaluation first first :train-transform :metric))
108 |
109 |
110 | (def best-options (-> best-evaluation first first :fit-ctx :pipe-options))
111 |
112 | (def best-pipe-fn
113 | (-> best-evaluation first first :pipe-fn))
114 |
115 | best-pipe-fn
116 |
117 | (def best-pipe-decl
118 | (-> best-evaluation first first :pipe-decl))
119 |
120 |
121 |
122 |
123 |
124 |
125 | ["## All information on best found pipeline"]
126 |
127 | ["best accuracy found on train data: " (-> best-evaluation first first :train-transform :metric)]
128 | ["best accuracy found on test data: " (-> best-evaluation first first :test-transform :metric)]
129 |
130 | ["best options (found on train data): "]
131 | best-options
132 |
133 | ["best pipeline (found on train data)"]
134 | best-pipe-decl
135 |
136 | ["pipe sources information"]
137 | (->
138 | (ml/get-nice-source-info best-pipe-decl
139 | (find-ns 'scicloj.ml.tune-titanic)
140 | (-> #'data meta :file))
141 | (update :classpath #(take 20 %)))
142 |
143 |
144 |
145 |
146 |
147 | (def predicted-survival-hold-out
148 | (->
149 | (best-pipe-fn
150 | (merge (-> best-evaluation first first :fit-ctx)
151 | {:metamorph/data holdout-ds :metamorph/mode :transform}))
152 | :metamorph/data
153 | ds/reverse-map-categorical-xforms
154 | :survived))
155 |
156 | ["Classification accuracy on holdout data: "]
157 | (ml/classification-accuracy predicted-survival-hold-out
158 | (holdout-ds :survived))
159 |
160 | ["Confusion matrix on holdout data"]
161 | ^kind/dataset
162 | (->
163 | (ml/confusion-map predicted-survival-hold-out
164 | (holdout-ds :survived))
165 | (ml/confusion-map->ds))
166 |
167 | ["Smile model object:"]
168 | (ml/thaw-model
169 | (-> best-evaluation first first :fit-ctx :model))
170 |
171 |
172 |
173 |
174 | ["Feature importance:"]
175 |
176 | (seq
177 | (.importance
178 | (ml/thaw-model
179 | (-> best-evaluation first first :fit-ctx :model))))
180 |
181 |
182 |
183 | ["## nested cross validation"]
184 |
185 |
186 |
187 | (require '[scicloj.ml.nested-cv :as nested-cv])
188 |
189 |
190 | (def nested-cv-result
191 | (doall
192 | (nested-cv/nested-cv data all-pipelines
193 | ml/classification-accuracy
194 | :accuracy 10 5)))
195 |
196 |
197 | ["nested cv best models metrics"]
198 | (map :metric nested-cv-result)
199 |
200 | ;; Evaluates all pipelines on a 5-fold split of `data` and keeps, from the first
201 | ;; entry of the result, the pipeline function and the context fitted during evaluation.
202 | (def final-model-by-cv
203 |   (let [folds (ds/split->seq data :kfold {:k 5})
204 |         head  (ffirst (ml/evaluate-pipelines all-pipelines
205 |                                             folds
206 |                                             ml/classification-accuracy
207 |                                             :accuracy))
208 |         {:keys [fit-ctx pipe-fn]} head]
209 |     {:best-pipe-fn pipe-fn
210 |      :fit-ctx      fit-ctx}))
211 |
212 | ;; Context returned by running the selected pipeline function in :fit mode on all of `data`.
213 | (def final-model
214 |   (let [refit (:best-pipe-fn final-model-by-cv)]
215 |     (refit {:metamorph/data data :metamorph/mode :fit})))
216 | ["Final best model"]
217 | (ml/thaw-model (:model final-model)) ; thaws the :model entry of the refitted context
218 |
219 | ["trained with best hyper parameter"]
220 | (-> final-model :pipe-options) ; NOTE(review): nothing visible here puts :pipe-options into the fitted context - confirm this is not nil
221 |
--------------------------------------------------------------------------------
/src/scicloj/ml/ug_utils.clj:
--------------------------------------------------------------------------------
1 | (ns scicloj.ml.ug-utils
2 | (:require [clojure.string :as str]
3 | [notespace.kinds :as kind]
4 | [notespace.view :as view]
5 | [scicloj.ml.core :as ml]
6 | [scicloj.ml.metamorph :as mm]
7 | [tech.v3.dataset :as ds]
8 | [tech.v3.dataset.modelling :as ds-mod]
9 | [tablecloth.api :as tc]
10 | [libpython-clj2.python :as py]
11 | [tech.v3.datatype.functional :as dtf]
12 | [clj-http.client :as client]))
13 |
14 |
15 | (defn kroki
16 |   "POSTs the diagram source `s` to https://kroki.io/ as a JSON request and returns the
17 |   clj-http response, requested with `:as :byte-array`. `type` and `format` go through `name`."
18 |   [s type format]
19 |   (let [diagram {:diagram_source s :diagram_type (name type) :output_format (name format)}
20 |         request {:content-type :json :as :byte-array :form-params diagram}]
21 |     (client/post "https://kroki.io/" request)))
22 | (py/initialize!) ; runs libpython-clj's initialize! as a side effect of loading this namespace
23 | (def doc->markdown (py/import-module "docstring_to_markdown")) ; Python module whose `convert` is called in docu-doc-string
24 |
25 |
26 | ;; Keys of all model definitions held in scicloj.ml.core/model-definitions* when this namespace loads.
27 | (def model-keys
28 |   (-> ml/model-definitions* deref keys))
29 | ;; The :options entry of every model definition, in the order `vals` yields them.
30 | (def model-options
31 |   (->> @ml/model-definitions*
32 |        vals
33 |        (map :options)))
34 |
35 | (defn dataset->md-hiccup
36 |   "Wraps the hiccup produced by `view/markdowns->hiccup` for `mds` in a :div carrying
37 |   the table CSS classes. The former height / height-limit locals were dropped: they were
38 |   only referenced by a commented-out :style entry, yet stringified `mds` on every call."
39 |   [mds]
40 |   [:div {:class "table table-striped table-hover table-condensed table-responsive"}
41 |    (view/markdowns->hiccup mds)])
42 |
43 | ;; Behaviour of the ::dataset-nocode kind: :render-src? is false and values are rendered through dataset->md-hiccup.
44 | (defmethod kind/kind->behaviour ::dataset-nocode
45 | [_]
46 | {:render-src? false
47 | :value->hiccup #'dataset->md-hiccup})
48 |
49 | (defn docu-options
50 |   "Returns the options declared for `model-key` as a dataset with columns ordered
51 |   :name, :type, :default, tagged with the ::dataset-nocode kind. Without declared
52 |   options it falls back to the map {:name [] :type [] :default []}."
53 |   [model-key]
54 |   (let [declared   (get-in @ml/model-definitions* [model-key :options])
55 |         options-ds (-> (or declared {:name [] :type [] :default []})
56 |                        tc/dataset
57 |                        (tc/reorder-columns :name :type :default))]
58 |     ;; attach the kind declared by the defmethod for ::dataset-nocode
59 |     (kind/override options-ds ::dataset-nocode)))
60 |
61 |
62 |
63 |
64 | ;; (->
65 | ;; (tc/dataset
66 | ;; (get-in @scicloj.ml.core/model-definitions* [:corenlp/crf :options] ))
67 | ;; (tc/reorder-columns :name :type :default)
68 | ;; )
69 |
70 | (defn text->hiccup
71 |   "Splits `text` at newlines and returns a lazy seq of the pieces with a `[:br]`
72 |   between neighbours; every `[:br]` gets a fresh gensym'ed `:key` in its metadata."
73 |   [text]
74 |   (for [piece (interpose [:br] (str/split text #"\n"))]
75 |     (if-not (string? piece)
76 |       (with-meta piece {:key (gensym "br-")})
77 |       piece)))
78 |
79 | (defn docu-doc-string
80 |   "Passes the :doc-string of `model-key`'s documentation (or \"\" when missing) to `convert`
81 |   of the Python module in `doc->markdown` and renders the result as hiccup; any Exception yields \"\"."
82 |   [model-key]
83 |   (try
84 |     (let [raw-doc (or (get-in @ml/model-definitions* [model-key :documentation :doc-string]) "")]
85 |       (view/markdowns->hiccup (py/py. doc->markdown convert raw-doc)))
86 |     (catch Exception _ "")))
87 |
88 |
89 |
90 | (defn anchor-or-nothing
91 |   "Returns a :div holding a link to `x` labelled `text`, or a :div with an
92 |   empty string when `x` is empty or nil."
93 |   [x text]
94 |   (if (seq x)
95 |     [:div [:a {:href x} text]]
96 |     [:div ""]))
97 | ;; Returns a lazy seq of hiccup :div blocks, one per model definition (sorted by key) whose key passes str/starts-with? against (str prefix).
98 | (defn render-key-info [prefix]
99 | (->> @scicloj.ml.core/model-definitions*
100 | (sort-by first)
101 | (filter #(str/starts-with? (first %) (str prefix)))
102 | (map
103 | (fn [[key definition]]
104 | [:div
105 | [:h3 {:id (str key)} (str key)]
106 | (anchor-or-nothing (:javadoc (:documentation definition)) "javadoc")
107 | (anchor-or-nothing (:user-guide (:documentation definition)) "user guide")
108 |
109 | ;; [:span (text->hiccup (or
110 | ;; (get-in @scicloj.ml.core/model-definitions* [key :documentation :description] ) ""))]
111 | ;; options table of the model, rendered through dataset->md-hiccup
112 | [:span
113 | (dataset->md-hiccup (docu-options key))]
114 | ;; doc string of the model converted to hiccup by docu-doc-string
115 | [:span
116 | (docu-doc-string key)]
117 |
118 | [:hr]
119 | ;; [:div "Example:"]
120 | ;; [:div
121 | ;; [:p/code {:code (str
122 | ;; (get-in definition [:documentation :code-example]
123 | ;; "" ))
124 | ;; :bg-class "bg-light"}]]
125 | ;; NOTE(review): a second [:hr] follows the one above, so two rules are emitted per model.
126 | [:hr]]))))
127 |
128 | ;; NOTE(review): evaluated at namespace load and its value is discarded - looks like leftover REPL scratch; consider a (comment ...) wrapper.
129 | (text->hiccup (or
130 | (get-in @scicloj.ml.core/model-definitions*
131 | [:smile.manifold/tsne :documentation :description]) ""))
132 | ;; clojure.walk is not in this namespace's :require list; load it here so the qualified
133 | ;; reference below does not depend on another library having loaded it first.
134 | (require 'clojure.walk)
135 | (defn remove-deep
136 |   "Returns `data` with every key in `key-set` dissoc'ed from each map met while walking it top-down."
137 |   [key-set data]
138 |   (clojure.walk/prewalk #(if (map? %) (apply dissoc % key-set) %) data))
139 | ;; Returns (range start end step) with step = (end - start) / n-steps.
140 | (defn stepped-range [start end n-steps]
141 |   (range start end (/ (- end start) n-steps)))
142 | ;; Fits raw-pipe-fn (after selecting :species plus cols) on iris and returns a two-layer Vega-Lite spec: grid predictions as squares, iris rows as circles.
143 | (defn surface-plot [iris cols raw-pipe-fn model-name]
144 | (let [
145 | pipe-fn
146 | (ml/pipeline
147 | (mm/select-columns (concat [:species] cols))
148 | raw-pipe-fn)
149 | ;; fit once on the given data; fitted-ctx is merged into both :transform calls below
150 | fitted-ctx
151 | (pipe-fn
152 | {:metamorph/data iris
153 | :metamorph/mode :fit})
154 | ;; getting plot boundaries
155 | min-x (- (-> (get iris (first cols)) dtf/reduce-min) 0.2)
156 | min-y (- (-> (get iris (second cols)) dtf/reduce-min) 0.2)
157 | max-x (+ (-> (get iris (first cols)) dtf/reduce-max) 0.2)
158 | max-y (+ (-> (get iris (second cols)) dtf/reduce-max) 0.2)
159 |
160 |
161 | ;; make a grid for the decision surface
162 | grid
163 | (for [x1 (stepped-range min-x max-x 100)
164 | x2 (stepped-range min-y max-y 100)]
165 |
166 | {(first cols) x1
167 | (second cols) x2
168 | :species nil})
169 |
170 | grid-ds (tc/dataset grid)
171 |
172 |
173 | ;; predict for all grid points
174 | prediction-grid
175 | (->
176 | (pipe-fn
177 | (merge
178 | fitted-ctx
179 | {:metamorph/data grid-ds
180 | :metamorph/mode :transform}))
181 | :metamorph/data
182 | (ds-mod/column-values->categorical :species)
183 | seq)
184 |
185 | grid-ds-prediction
186 | (tc/add-column grid-ds :predicted-species prediction-grid)
187 |
188 |
189 | ;; predict the iris data points from data set
190 | prediction-iris
191 | (->
192 | (pipe-fn
193 | (merge
194 | fitted-ctx
195 | {:metamorph/data iris
196 | :metamorph/mode :transform}))
197 | :metamorph/data
198 |
199 | (ds-mod/column-values->categorical :species)
200 | seq)
201 | ;; NOTE(review): prediction-iris is passed as a 4th argument to tc/add-column; presumably that slot is a size strategy, so the predictions may be unused - confirm.
202 | ds-prediction
203 | (tc/add-column iris :true-species (:species iris)
204 | prediction-iris)]
205 |
206 | ;; create a 2 layer Vega lite specification
207 | {:layer
208 | [
209 | ;; layer 1: predicted species for every grid point, drawn as squares
210 | {:data {:values (seq (tc/rows grid-ds-prediction :as-maps))}
211 | :title (str "Decision surfaces for model: " model-name)
212 | :width 500
213 | :height 500
214 | :mark {:type "square" :opacity 0.9 :strokeOpacity 0.1 :stroke nil},
215 | :encoding {:x {:field (first cols)
216 | :type "quantitative"
217 | :scale {:domain [min-x max-x]}
218 | :axis {:format "2.2"
219 | :labelOverlap true}}
220 |
221 | :y {:field (second cols) :type "quantitative"
222 | :axis {:format "2.2"
223 | :labelOverlap true}
224 | :scale {:domain [min-y max-y]}}
225 |
226 | :color {:field :predicted-species}}}
227 | ;; layer 2: the iris rows as circles, filled by :true-species with a black stroke
228 |
229 | {:data {:values (seq (tc/rows ds-prediction :as-maps))}
230 |
231 | :width 500
232 | :height 500
233 | :mark {:type "circle" :opacity 1 :strokeOpacity 1},
234 | :encoding {:x {:field (first cols)
235 | :type "quantitative"
236 | :axis {:format "2.2"
237 | :labelOverlap true}
238 | :scale {:domain [min-x max-x]}}
239 |
240 | :y {:field (second cols) :type "quantitative"
241 | :axis {:format "2.2"
242 | :labelOverlap true}
243 | :scale {:domain [min-y max-y]}}
244 |
245 |
246 | :fill {:field :true-species} ;; :legend nil
247 |
248 | :stroke { :value :black}
249 | :size {:value 300}}}]}))
250 | ;; Keeps only map entries whose path (current-path + key) is in path-set, recursing
251 | ;; into kept values; sequential data is processed element-wise into a vector.
252 | (defn select-paths-from-set [current-path path-set data]
253 |   (letfn [(keep-entry [[k v]]
254 |             (let [path (conj current-path k)]
255 |               (when (contains? path-set path)
256 |                 [k (select-paths-from-set path path-set v)])))]
257 |     (cond
258 |       (map? data)        (into {} (keep keep-entry) data)
259 |       (sequential? data) (mapv #(select-paths-from-set current-path path-set %) data)
260 |       :default           data)))
261 | ;; Keeps from data only what lies on one of the given key paths; every non-empty prefix of a path is allowed as well.
262 | (defn select-paths [data paths]
263 |   (letfn [(prefixes [path]
264 |             (take-while seq (iterate butlast path)))]
265 |     (select-paths-from-set []
266 |                            (into #{} (mapcat prefixes) paths)
267 |                            data)))
268 | ;; Reduces a result to its [:train-transform :metric] and [:test-transform :metric] paths.
269 | (defn select-minimal-result [result]
270 |   (let [metric-paths [[:train-transform :metric] [:test-transform :metric]]]
271 |     (select-paths result metric-paths)))
272 |
--------------------------------------------------------------------------------
/src/scicloj/ml/ug_utils_clerk.clj:
--------------------------------------------------------------------------------
1 | (ns scicloj.ml.ug-utils-clerk
2 | (:require
3 | [clojure.string :as str]
4 | [nextjournal.clerk :as clerk]
5 | [scicloj.ml.core :as ml]
6 | [scicloj.ml.ug-utils :as utils]
7 | [tablecloth.api :as tc]))
8 |
9 | (defn docu-options
10 |   "Returns the options declared for `model-key` as a dataset whose columns are
11 |   ordered :name, :type, :default. Without declared options it falls back to
12 |   the map {:name [] :type [] :default []}."
13 |   [model-key]
14 |   (let [declared (get-in @ml/model-definitions* [model-key :options])]
15 |     (-> (or declared {:name [] :type [] :default []})
16 |         tc/dataset
17 |         (tc/reorder-columns :name :type :default))))
18 | ;; clojure.walk is not in this namespace's :require list, so load it here explicitly.
19 | (require 'clojure.walk)
20 | (defn stringify-enum
21 |   "Returns `form` with every java.lang.Enum value inside it replaced by its `str`."
22 |   [form]
23 |   (clojure.walk/postwalk #(if (instance? Enum %) (str %) %) form))
24 | ;; Returns a hiccup vector starting with :span that holds one :div per model definition (sorted by key) whose key passes str/starts-with? against (str prefix).
25 | (defn render-key-info [prefix]
26 | (vec (concat [:span]
27 | (->> @scicloj.ml.core/model-definitions*
28 | (sort-by first)
29 | (filter #(str/starts-with? (first %) (str prefix)))
30 | (mapv
31 | (fn [[key definition]]
32 | [:div
33 | ;; (clerk/md (format "### %s" (str key)))
34 | [:h3 {:id (str key)} (str key)]
35 | (utils/anchor-or-nothing (:javadoc (:documentation definition)) "javadoc")
36 | (utils/anchor-or-nothing (:user-guide (:documentation definition)) "user guide")
37 |
38 | ;; [:span (text->hiccup (or
39 | ;; (get-in @scicloj.ml.core/model-definitions* [key :documentation :description] ) ""))]
40 | ;; options as a clerk table, or "" when the options dataset is empty; Enum values are stringified first
41 | [:span
42 |
43 | (let [docu-ds (docu-options key)]
44 | (if (tc/empty-ds? docu-ds)
45 | ""
46 | (->
47 | docu-ds
48 | (tc/rows :as-maps)
49 | seq
50 | stringify-enum
51 | (clerk/table))))]
52 | [:span
53 | (utils/docu-doc-string key)]
54 | ;; NOTE(review): two consecutive [:hr] elements are emitted per model.
55 | [:hr]
56 | [:hr]]))))))
57 |
--------------------------------------------------------------------------------
/src/scicloj/ml/unsupervised.clj:
--------------------------------------------------------------------------------
1 | (ns scicloj.ml.unsupervised
2 | (:require
3 | [notespace.api :as note]
4 | [notespace.kinds :as kind]
5 | [net.clojars.behrica.cluster_eval :as cluster-eval]))
6 |
7 |
8 |
9 |
10 | (comment
11 | (note/init-with-browser)
12 | (note/eval-this-notespace)
13 | (note/reread-this-notespace)
14 | (note/render-static-html "docs/userguide-unsupervised.html")
15 | (note/init))
16 |
17 | (require '[scicloj.ml.core :as ml]
18 | '[scicloj.ml.metamorph :as mm]
19 | '[scicloj.ml.dataset :as ds])
20 |
21 | ["# Cluster Iris data"]
22 | ;; Iris data read from the CSV in the metamorph.ml repository; :key-fn keyword is applied to the column names.
23 | (def iris
24 |   (ds/dataset
25 |    "https://raw.githubusercontent.com/scicloj/metamorph.ml/main/test/data/iris.csv"
26 |    {:key-fn keyword}))
27 |
28 |
29 |
30 |
31 | ["## k-means clustering"]
32 | ;; Fits on iris: select the two petal columns, then a :fastmath/cluster model (:k-means, method args [3]) under the id :model.
33 | (def fit-ctx
34 | (ml/fit
35 | iris
36 | (mm/select-columns [:petal_length :petal_width])
37 | {:metamorph/id :model}
38 | (mm/model {:model-type :fastmath/cluster
39 | :clustering-method :k-means
40 | :clustering-method-args [3]})))
41 | ;; iris plus a :cluster column filled with the fitted model's :clustering values.
42 | (def iris-with-cluster
43 |   (let [assignments (get-in fit-ctx [:model :model-data :clustering])]
44 |     (ds/add-column iris :cluster assignments)))
45 | ;; One map per entry of the fitted model's :representatives, keyed by the two
46 | ;; petal column names so it can be plotted on the same axes as the iris rows.
47 | (def centroids
48 |   (let [representatives (get-in fit-ctx [:model :model-data :representatives])]
49 |     (for [[petal-length petal-width] representatives]
50 |       (hash-map :petal_length petal-length
51 |                 :petal_width petal-width))))
52 | ;; Two-layer Vega-Lite plot: iris rows coloured by :cluster, with the centroids on top as filled black triangles.
53 | ^kind/vega
54 | {:height 300
55 | :width 300
56 |
57 | :title "2D result of iris k-means clustering with cluster centroids (n=3)"
58 | :layer [{
59 | :$schema "https://vega.github.io/schema/vega-lite/v5.json"
60 | :data {:values (ds/rows iris-with-cluster :as-maps)}
61 | :description "Iris data "
62 | :encoding {:x {:field :petal_length :type "quantitative"}
63 | :y {:field :petal_width :type "quantitative"}
64 | :color {:field :cluster}}
65 | :mark "point"}
66 | {
67 | :data {:values centroids}
68 | :description "Iris data "
69 | :encoding {:x {:field :petal_length :type "quantitative"}
70 | :y {:field :petal_width :type "quantitative"}}
71 | ;; NOTE(review): this layer plots `centroids`, yet its :description above repeats "Iris data ".
72 | :mark {:type "point" :shape :triangle-up :color :black
73 | :filled true
74 | :size 200}}]}
75 |
76 |
77 |
78 | ["## Elbow plot"]
79 |
80 | ["### Calculate distortion over n"]
81 |
82 | (defn make-pipe
83 |   "Returns a pipeline that drops :species and then holds a :fastmath/cluster k-means model with `n` as its only method argument, under the id :model."
84 |   [n]
85 |   (let [kmeans {:model-type :fastmath/cluster :clustering-method :k-means :clustering-method-args [n]}]
86 |     (ml/pipeline (mm/drop-columns [:species])
87 |                  {:metamorph/id :model}
88 |                  (mm/model kmeans))))
89 |
90 |
91 | ;; Evaluates one k-means pipeline per n in 2..9 on the whole iris data; the metric fn returns a constant 0.
92 | (def eval-results
93 |   (let [pipelines (map make-pipe (range 2 10))
94 |         splits    [{:train iris}]
95 |         no-metric (fn [_ctx] 0)]
96 |     (ml/evaluate-pipelines pipelines
97 |                            splits
98 |                            no-metric
99 |                            :loss {:return-best-pipeline-only false})))
100 |
101 |
102 |
103 | (defn fastmath->cluster-data
104 |   "Turns `model-data` into three parallel sequences in which the entries of
105 |   :data come first and those of :representatives last:
106 |
107 |   :values    - :data followed by :representatives
108 |   :cluster   - :clustering followed by 0..k-1 for the k representatives
109 |   :centroid? - false for every :data entry, true for every representative"
110 |   [model-data]
111 |   (let [{:keys [data representatives clustering]} model-data
112 |         n-points    (count data)
113 |         n-centroids (count representatives)
114 |         ;; each representative gets its own index as cluster number
115 |         centroid-clusters (range n-centroids)
116 |         point-flags       (repeat n-points false)
117 |         centroid-flags    (repeat n-centroids true)]
118 |     {:values    (concat data representatives)
119 |      :cluster   (concat clustering centroid-clusters)
120 |      :centroid? (concat point-flags centroid-flags)}))
121 |
122 |
123 |
124 |
125 | ;; Pairs the first clustering-method argument (n) of every evaluated pipeline with the :distortion found in its fitted model data.
126 | (def ellbow-plot-data-distortion
127 |   (let [runs (flatten eval-results)]
128 |     (map (fn [n distortion] (hash-map :n n :distortion distortion))
129 |          (map #(first (get-in % [:fit-ctx :model :options :clustering-method-args])) runs)
130 |          (map #(get-in % [:fit-ctx :model :model-data :info :distortion]) runs))))
131 |
132 |
133 | ["### Calculate silhouette score over n"]
134 |
135 | ;; Evaluates one k-means pipeline per n in 2..9 on the whole iris data; the metric is
136 | ;; cluster-eval's "calcularSilhouette" index computed from the fitted model data.
137 | (def eval-results-silhouete
138 |   (let [silhouette (fn [ctx]
139 |                      (-> (get-in ctx [:model :model-data])
140 |                          fastmath->cluster-data
141 |                          (cluster-eval/cluster-index "calcularSilhouette")))]
142 |     (ml/evaluate-pipelines (map make-pipe (range 2 10))
143 |                            [{:train iris}]
144 |                            silhouette
145 |                            :loss
146 |                            {:return-best-pipeline-only false})))
147 |
148 | ;; Pairs the first clustering-method argument (n) of every evaluated pipeline with its [:train-transform :metric] value.
149 | (def ellbow-plot-data-silhoute
150 |   (let [runs (flatten eval-results-silhouete)]
151 |     (map (fn [n score] (hash-map :n n :silhoute score))
152 |          (map #(first (get-in % [:fit-ctx :model :options :clustering-method-args])) runs)
153 |          (map #(get-in % [:train-transform :metric]) runs))))
154 |
155 |
156 | ["Elbow plots for distortion and silhouette score"]
157 | ;; Two line charts side by side over the number of clusters n: distortion (left) and silhouette score (right).
158 | ^kind/vega
159 | {:hconcat [
160 |            {:$schema "https://vega.github.io/schema/vega-lite/v5.json"
161 |             :width 200
162 |             :height 200
163 |             :title "Elbow plot of distortion for various n"
164 |             :data {:values ellbow-plot-data-distortion}
165 |             :description "Distortion of the k-means clustering for each number of clusters n."
166 |             :encoding {:x {:field "n" :type :ordinal}
167 |                        :y {:field :distortion :type "quantitative"}}
168 |             :mark {:point true :type "line"}}
169 |            ;; the :silhoute field name is kept as is: it must match the keys in ellbow-plot-data-silhoute
170 |            {:$schema "https://vega.github.io/schema/vega-lite/v5.json"
171 |             :width 200
172 |             :height 200
173 |             :title "Elbow plot of silhouette score for various n"
174 |             :data {:values ellbow-plot-data-silhoute}
175 |
176 |             :encoding {:x {:field "n" :type :ordinal}
177 |                        :y {:field :silhoute :type "quantitative"}}
178 |             :mark {:point true :type "line"}}]}
179 |
--------------------------------------------------------------------------------
/submission.csv:
--------------------------------------------------------------------------------
1 | PassengerId,Survived
2 | 892,0
3 | 893,1
4 | 894,0
5 | 895,0
6 | 896,1
7 | 897,0
8 | 898,1
9 | 899,0
10 | 900,1
11 | 901,0
12 | 902,0
13 | 903,0
14 | 904,1
15 | 905,0
16 | 906,1
17 | 907,1
18 | 908,0
19 | 909,0
20 | 910,1
21 | 911,1
22 | 912,0
23 | 913,0
24 | 914,1
25 | 915,0
26 | 916,1
27 | 917,0
28 | 918,1
29 | 919,0
30 | 920,1
31 | 921,0
32 | 922,0
33 | 923,0
34 | 924,1
35 | 925,0
36 | 926,1
37 | 927,0
38 | 928,0
39 | 929,0
40 | 930,0
41 | 931,1
42 | 932,0
43 | 933,1
44 | 934,0
45 | 935,1
46 | 936,1
47 | 937,0
48 | 938,1
49 | 939,0
50 | 940,1
51 | 941,1
52 | 942,0
53 | 943,0
54 | 944,1
55 | 945,1
56 | 946,0
57 | 947,0
58 | 948,0
59 | 949,0
60 | 950,0
61 | 951,1
62 | 952,0
63 | 953,0
64 | 954,0
65 | 955,1
66 | 956,1
67 | 957,1
68 | 958,1
69 | 959,0
70 | 960,0
71 | 961,1
72 | 962,0
73 | 963,0
74 | 964,0
75 | 965,0
76 | 966,1
77 | 967,0
78 | 968,0
79 | 969,1
80 | 970,0
81 | 971,0
82 | 972,1
83 | 973,0
84 | 974,0
85 | 975,0
86 | 976,0
87 | 977,0
88 | 978,1
89 | 979,0
90 | 980,0
91 | 981,1
92 | 982,0
93 | 983,0
94 | 984,1
95 | 985,0
96 | 986,0
97 | 987,0
98 | 988,1
99 | 989,0
100 | 990,1
101 | 991,0
102 | 992,1
103 | 993,0
104 | 994,0
105 | 995,0
106 | 996,0
107 | 997,0
108 | 998,0
109 | 999,0
110 | 1000,0
111 | 1001,0
112 | 1002,0
113 | 1003,1
114 | 1004,1
115 | 1005,1
116 | 1006,1
117 | 1007,0
118 | 1008,0
119 | 1009,1
120 | 1010,1
121 | 1011,1
122 | 1012,1
123 | 1013,0
124 | 1014,1
125 | 1015,0
126 | 1016,0
127 | 1017,1
128 | 1018,0
129 | 1019,1
130 | 1020,0
131 | 1021,0
132 | 1022,0
133 | 1023,0
134 | 1024,0
135 | 1025,0
136 | 1026,0
137 | 1027,0
138 | 1028,0
139 | 1029,0
140 | 1030,0
141 | 1031,0
142 | 1032,0
143 | 1033,1
144 | 1034,1
145 | 1035,0
146 | 1036,0
147 | 1037,0
148 | 1038,0
149 | 1039,0
150 | 1040,0
151 | 1041,0
152 | 1042,1
153 | 1043,0
154 | 1044,0
155 | 1045,1
156 | 1046,0
157 | 1047,0
158 | 1048,1
159 | 1049,0
160 | 1050,1
161 | 1051,0
162 | 1052,1
163 | 1053,1
164 | 1054,1
165 | 1055,0
166 | 1056,0
167 | 1057,1
168 | 1058,0
169 | 1059,0
170 | 1060,1
171 | 1061,0
172 | 1062,0
173 | 1063,0
174 | 1064,0
175 | 1065,0
176 | 1066,0
177 | 1067,1
178 | 1068,1
179 | 1069,1
180 | 1070,1
181 | 1071,1
182 | 1072,0
183 | 1073,0
184 | 1074,1
185 | 1075,0
186 | 1076,1
187 | 1077,0
188 | 1078,1
189 | 1079,0
190 | 1080,0
191 | 1081,0
192 | 1082,0
193 | 1083,0
194 | 1084,0
195 | 1085,0
196 | 1086,1
197 | 1087,0
198 | 1088,1
199 | 1089,0
200 | 1090,0
201 | 1091,0
202 | 1092,1
203 | 1093,1
204 | 1094,0
205 | 1095,1
206 | 1096,0
207 | 1097,0
208 | 1098,0
209 | 1099,0
210 | 1100,1
211 | 1101,0
212 | 1102,0
213 | 1103,0
214 | 1104,0
215 | 1105,1
216 | 1106,0
217 | 1107,0
218 | 1108,1
219 | 1109,1
220 | 1110,1
221 | 1111,0
222 | 1112,1
223 | 1113,0
224 | 1114,1
225 | 1115,0
226 | 1116,1
227 | 1117,1
228 | 1118,0
229 | 1119,0
230 | 1120,0
231 | 1121,0
232 | 1122,0
233 | 1123,1
234 | 1124,0
235 | 1125,0
236 | 1126,1
237 | 1127,0
238 | 1128,0
239 | 1129,0
240 | 1130,1
241 | 1131,1
242 | 1132,1
243 | 1133,1
244 | 1134,1
245 | 1135,0
246 | 1136,0
247 | 1137,0
248 | 1138,1
249 | 1139,0
250 | 1140,1
251 | 1141,0
252 | 1142,1
253 | 1143,0
254 | 1144,0
255 | 1145,0
256 | 1146,0
257 | 1147,0
258 | 1148,0
259 | 1149,0
260 | 1150,1
261 | 1151,0
262 | 1152,0
263 | 1153,0
264 | 1154,1
265 | 1155,1
266 | 1156,0
267 | 1157,0
268 | 1158,0
269 | 1159,0
270 | 1160,1
271 | 1161,0
272 | 1162,1
273 | 1163,0
274 | 1164,1
275 | 1165,1
276 | 1166,0
277 | 1167,1
278 | 1168,0
279 | 1169,0
280 | 1170,0
281 | 1171,0
282 | 1172,0
283 | 1173,1
284 | 1174,0
285 | 1175,1
286 | 1176,1
287 | 1177,0
288 | 1178,0
289 | 1179,0
290 | 1180,0
291 | 1181,0
292 | 1182,0
293 | 1183,1
294 | 1184,0
295 | 1185,0
296 | 1186,0
297 | 1187,0
298 | 1188,1
299 | 1189,1
300 | 1190,0
301 | 1191,0
302 | 1192,0
303 | 1193,0
304 | 1194,0
305 | 1195,0
306 | 1196,0
307 | 1197,1
308 | 1198,0
309 | 1199,1
310 | 1200,1
311 | 1201,0
312 | 1202,0
313 | 1203,0
314 | 1204,0
315 | 1205,0
316 | 1206,1
317 | 1207,1
318 | 1208,1
319 | 1209,0
320 | 1210,0
321 | 1211,0
322 | 1212,0
323 | 1213,0
324 | 1214,0
325 | 1215,1
326 | 1216,1
327 | 1217,0
328 | 1218,1
329 | 1219,0
330 | 1220,0
331 | 1221,0
332 | 1222,1
333 | 1223,1
334 | 1224,0
335 | 1225,1
336 | 1226,0
337 | 1227,0
338 | 1228,0
339 | 1229,0
340 | 1230,0
341 | 1231,0
342 | 1232,0
343 | 1233,0
344 | 1234,0
345 | 1235,1
346 | 1236,0
347 | 1237,1
348 | 1238,0
349 | 1239,1
350 | 1240,0
351 | 1241,1
352 | 1242,1
353 | 1243,0
354 | 1244,0
355 | 1245,0
356 | 1246,1
357 | 1247,0
358 | 1248,1
359 | 1249,0
360 | 1250,0
361 | 1251,1
362 | 1252,0
363 | 1253,1
364 | 1254,1
365 | 1255,0
366 | 1256,1
367 | 1257,0
368 | 1258,0
369 | 1259,0
370 | 1260,1
371 | 1261,0
372 | 1262,0
373 | 1263,1
374 | 1264,0
375 | 1265,0
376 | 1266,1
377 | 1267,1
378 | 1268,0
379 | 1269,0
380 | 1270,0
381 | 1271,0
382 | 1272,0
383 | 1273,0
384 | 1274,0
385 | 1275,1
386 | 1276,0
387 | 1277,1
388 | 1278,0
389 | 1279,0
390 | 1280,0
391 | 1281,1
392 | 1282,1
393 | 1283,1
394 | 1284,0
395 | 1285,0
396 | 1286,0
397 | 1287,1
398 | 1288,0
399 | 1289,1
400 | 1290,0
401 | 1291,0
402 | 1292,1
403 | 1293,0
404 | 1294,1
405 | 1295,0
406 | 1296,0
407 | 1297,0
408 | 1298,0
409 | 1299,0
410 | 1300,1
411 | 1301,1
412 | 1302,0
413 | 1303,1
414 | 1304,1
415 | 1305,0
416 | 1306,1
417 | 1307,0
418 | 1308,0
419 | 1309,1
420 |
--------------------------------------------------------------------------------
/test/scicloj/ml/tutorials_test.clj:
--------------------------------------------------------------------------------
1 | (ns scicloj.ml.tutorials-test
2 | (:require [clojure.test :refer :all]
3 | [scicloj.ml.tutorials :refer :all]))
4 | ;; NOTE(review): template placeholder - (= 0 1) is never true, so this test always fails; the ns form also requires scicloj.ml.tutorials, which is not among the files listed under src/scicloj/ml - confirm.
5 | (deftest a-test
6 | (testing "FIXME, I fail."
7 | (is (= 0 1))))
8 |
--------------------------------------------------------------------------------