is one of"
44 | @echo " pdf to make LaTeX files and run them through pdflatex"
45 | @echo " html to make standalone HTML files"
46 | @echo " exe        to run jupyter notebooks, except those in deep_learning, which require a GPU."
47 | @echo " clean rm BUILDDIR, auto_gallery, rst files"
48 | @echo " cleanall rm BUILDDIR, auto_gallery, rst files and clear output of notebooks"
49 | @echo " dirhtml to make HTML files named index.html in directories"
50 | @echo " singlehtml to make a single large HTML file"
51 | @echo " epub to make an epub"
52 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
53 | @echo " latexpdf to make LaTeX files and run them through pdflatex"
54 | @echo " text to make text files"
55 | @echo " changes to make an overview of all changed/added/deprecated items"
56 | @echo " linkcheck to check all external links for integrity"
57 | @echo " doctest to run all doctests embedded in the documentation (if enabled)"
58 | @echo " coverage to run coverage check of the documentation (if enabled)"
59 |
60 | # Rule to convert notebook to rst
61 | #.ipynb.rst:
62 | %.rst : %.ipynb
63 | jupyter nbconvert --to rst $<
64 | mv $@ $@.filtered
65 | cat $@.filtered|bin/filter_fix_rst.py > $@
66 | rm -f $@.filtered
67 |
68 | # jupyter nbconvert --to rst --stdout $< | bin/filter_fix_rst.py > $@
69 | # jupyter nbconvert --to rst $< --output $@
70 |
71 | debug:
72 | @echo $(RST)
73 |
74 |
75 | rst: $(RST)
76 |
77 | clean:
78 | rm -rf $(BUILDDIR)/*
79 | rm -rf auto_gallery/
80 | rm -f $(RST)
81 | rm -rf $(NTBOOK_FILES)
82 |
83 | cleanall:
84 | rm -rf $(BUILDDIR)/*
85 | rm -rf auto_gallery/
86 | rm -f $(RST)
87 | rm -rf $(NTBOOK_FILES)
88 | for nb in $(NTBOOK) ; do jupyter nbconvert --clear-output $$nb; done
89 |
90 | exe:
91 | @echo "Execute notebooks"
92 | for nb in $(NTBOOK_TO_EXE) ; do jupyter nbconvert --to notebook --execute $$nb --output $$(basename $$nb); done
93 | # $(EXEIPYNB) $(NTBOOK)
94 | # @echo toto nbconvert --to notebook --execute $< --output $(basename $<)
95 |
96 | html: rst
97 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
98 | @echo
99 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
100 |
101 | dirhtml: rst
102 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
103 | @echo
104 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
105 |
106 | singlehtml: rst
107 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
108 | @echo
109 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
110 |
111 | docx: rst
112 | $(SPHINXBUILD) -b docx $(ALLSPHINXOPTS) $(BUILDDIR)/docx
113 | @echo
114 | @echo "Build finished. The docx page is in $(BUILDDIR)/docx."
115 |
116 | epub: rst
117 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
118 | @echo
119 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub."
120 |
121 | latex: rst
122 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
123 | @echo
124 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
125 | @echo "Run \`make' in that directory to run these through (pdf)latex" \
126 | "(use \`make latexpdf' here to do that automatically)."
127 |
128 | latexpdf: rst
129 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
130 | @echo "Running LaTeX files through pdflatex..."
131 | $(MAKE) -C $(BUILDDIR)/latex all-pdf
132 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
133 | cp build/latex/StatisticsMachineLearningPython.pdf StatisticsMachineLearningPython.pdf
134 |
135 | pdf: latexpdf
136 |
137 | text: rst
138 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
139 | @echo
140 | @echo "Build finished. The text files are in $(BUILDDIR)/text."
141 |
142 | changes: rst
143 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
144 | @echo
145 | @echo "The overview file is in $(BUILDDIR)/changes."
146 |
147 | linkcheck: rst
148 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
149 | @echo
150 | @echo "Link check complete; look for any errors in the above output " \
151 | "or in $(BUILDDIR)/linkcheck/output.txt."
152 |
153 |
--------------------------------------------------------------------------------
/info.rst:
--------------------------------------------------------------------------------
1 | gh-pages
2 | --------
3 |
4 | TODO: do it with: circleci
5 |
6 | - https://circleci.com/blog/deploying-documentation-to-github-pages-with-continuous-integration/
7 | - https://github.com/jklukas/docs-on-gh-pages
8 |
9 |
10 | Publishing sphinx-generated docs on github:
11 |
12 | https://daler.github.io/sphinxdoc-test/includeme.html
13 |
14 |
15 |
16 | Upload to github
17 | ----------------
18 |
19 |
20 | "$WD/build/html" contains the pystsamsl website. Now we start to upload to github server. Clone from github to a temporary directory, and checkout gh-pages branch
21 |
22 | First time
23 | ```
24 | WD=~/git/pystatsml
25 | cd ~/git
26 | mv pystatsml_gh-pages pystatsml_gh-pages.bak
27 | git clone git@github.com:duchesnay/pystatsml.git pystatsml_gh-pages
28 | git symbolic-ref HEAD refs/heads/gh-pages
29 | rm .git/index
30 | git clean -fdx
31 | cp -r $WD/build/html/* ./
32 | cp -r $WD/auto_gallery ./
33 | git add .
34 | git add -f auto_gallery
35 | git add -f _sources
36 | git add -f _static
37 | git add -f _images
38 | touch .nojekyll
39 | gedit index.html # see below
40 | git commit -am "gh-pages First commit"
41 | git push origin gh-pages
42 | firefox index.html
43 | ```
44 |
45 | Update
46 | ```
47 | WD=~/git/pystatsml
48 | cd $WD
49 | make pdf html singlehtml
50 | cd ~/git/pystatsml_gh-pages
51 | git checkout gh-pages
52 | rsync -avu $WD/build/html/* ./
53 | rsync -avu $WD/auto_gallery ./
54 | git add .
55 | git add -f auto_gallery
56 | git add -f _sources
57 | git add -f _static
58 | git add -f _images
59 | meld index.html index.html.save
60 | #gedit # see below
61 | git commit -am "gh-pages update commit"
62 | git push origin gh-pages
63 | firefox index.html
64 | ```
65 |
66 | Then edit ``index.html``:
67 | ```
68 | gedit index.html
69 | ```
70 |
71 | Replace the placeholder title:
72 | ```
73 | Phantom
74 | ```
75 | by
76 | ```
77 | Statistics and Machine Learning in Python
78 |
79 | Edouard Duchesnay, Tommy Löfstedt, Feki Younes
80 | ```
88 |
89 | Then
90 |
91 | ```
92 | git commit -am "Title and authors"
93 | git push origin gh-pages
94 | firefox $WD/build/html/index.html
95 | ```
96 |
97 | Now, you can visit your updated website at https://duchesnay.github.io/pystatsml.
98 |
99 |
100 | ML Resources
101 | ------------
102 |
103 | - **my_tech_resources**
104 | https://github.com/JamesLavin/my_tech_resources
105 |
106 | - **Practical Machine Learning Course Notes (in R)**
107 | https://sux13.github.io/DataScienceSpCourseNotes/8_PREDMACHLEARN/Practical_Machine_Learning_Course_Notes.html
108 |
109 | - **Computational Statistics in Python**
110 | https://people.duke.edu/~ccc14/sta-663/index.html
111 |
112 | - **scipy-lectures**
113 |
114 | https://github.com/scipy-lectures/scipy-lecture-notes
115 |
116 | - **Scientific Python & Software engineering best practices**
117 | https://github.com/paris-saclay-cds/python-workshop
118 |
119 | - **Deep Learning course in python**
120 | https://github.com/m2dsupsdlclass/lectures-labs
121 |
122 | - **Others**
123 | https://github.com/justmarkham/DAT4
124 |
125 | http://statweb.stanford.edu/~jtaylo/courses/stats202/index.html
126 |
127 | http://www.dataschool.io/
128 |
129 | https://onlinecourses.science.psu.edu/stat857/node/141
130 |
131 | https://github.com/rasbt/python-machine-learning-book
132 |
133 | https://onlinecourses.science.psu.edu/stat505/
134 |
135 | http://www.kdnuggets.com/2016/04/top-10-ipython-nb-tutorials.html
136 |
137 |
138 | Jupyter Notebooks
139 | -----------------
140 |
141 | https://jupyterbook.org/advanced/advanced.html#jupyter-cell-tags
142 |
143 |
144 | Markdown
145 | --------
146 | http://daringfireball.net/projects/markdown/basics
147 |
148 | R with Jupyter
149 | ~~~~~~~~~~~~~~~
150 |
151 | conda install -c r r-essentials
152 |
153 | Sphinx
154 | ------
155 |
156 | http://sphinx-doc.org/
157 |
158 | IPython notebooks + Sphinx
159 | --------------------------
160 |
161 | http://sphinx-ipynb.readthedocs.org/en/latest/howto.html
162 |
163 |
164 | nbsphinx: Jupyter Notebook Tools for Sphinx
165 |
166 | https://nbsphinx.readthedocs.io/en/0.3.3/
167 |
168 | nbsphinx is a Sphinx extension that provides a source parser for *.ipynb files. Custom Sphinx directives are used to show Jupyter Notebook code cells (and of course their results) in both HTML and LaTeX output. Un-evaluated notebooks – i.e. notebooks without stored output cells – will be automatically executed during the Sphinx build process.
169 |
170 | conda install -c conda-forge nbsphinx
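
To enable it, add the extension in ``conf.py`` (minimal sketch)::

    # conf.py
    extensions = [
        'nbsphinx',
        # ... other extensions
    ]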
171 |
172 | sphinx-gallery
173 | --------------
174 |
175 | https://sphinx-gallery.readthedocs.io/en/latest/
176 |
177 | ``pip install sphinx-gallery``
178 |
179 | http://www.scipy-lectures.org
180 |
181 | https://github.com/scipy-lectures/scipy-lecture-notes
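
Minimal ``conf.py`` setup sketch (the directory names here are placeholders, not necessarily the ones used in this repository)::

    extensions = ['sphinx_gallery.gen_gallery']
    sphinx_gallery_conf = {
        'examples_dirs': 'examples',     # scripts to convert
        'gallery_dirs': 'auto_gallery',  # output directory
    }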
182 |
183 | strip jupyter output before submission
184 | --------------------------------------
185 |
186 | https://github.com/kynan/nbstripout
187 |
188 | ``conda install -c conda-forge nbstripout``
189 |
190 | Set up the git filter and attributes as described in the manual installation instructions below:
191 |
192 | ``cd pystatsml``
193 | ``nbstripout --install``
194 |
195 |
196 | rst
197 | ---
198 |
199 | http://docutils.sourceforge.net/rst.html
200 | http://docutils.sourceforge.net/docs/ref/rst/
201 |
202 |
203 |
204 | R vs Python
205 | -----------
206 |
207 | https://www.datacamp.com/community/tutorials/r-or-python-for-data-analysis
208 | http://pandas.pydata.org/pandas-docs/stable/comparison_with_r.html
209 |
210 | Mail to share the course
211 | ------------------------
212 |
213 | Please find below the link to my Machine Learning course in Python; it is a draft version:
214 | ftp://ftp.cea.fr//pub/unati/people/educhesnay/pystatml/StatisticsMachineLearningPython.pdf
215 |
216 | Below is the link to GitHub:
217 | https://github.com/duchesnay/pystatsml
218 |
219 |
220 | git clone https://github.com/duchesnay/pystatsml.git
221 |
222 |
223 | Basically, it uses Jupyter notebooks and pure Python; everything is converted to rst and assembled into html or pdf using Sphinx.
224 |
225 | It is a draft version, not finished yet, with many spelling mistakes.
226 |
227 | Please fork and submit pull requests if you are willing to contribute.
228 |
229 |
230 |
231 |
--------------------------------------------------------------------------------
/utils/ml_non_linear_prediction.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Thu Mar 31 09:54:25 2016
4 |
5 | @author: edouard.duchesnay@cea.fr
6 | """
7 |
8 | '''
9 | SVM & Kernel methods
10 | ====================
11 | '''
12 | import numpy as np
13 | from numpy.linalg import norm
14 |
15 | from mpl_toolkits.mplot3d import Axes3D
16 | import matplotlib.pyplot as plt
17 | import sklearn.metrics as metrics
18 | #%matplotlib inline
19 | #%matplotlib qt
20 |
21 |
22 |
23 | class KernDensity:
24 | def __init__(self, sigma=1):
25 | self.sigma = sigma
26 |
27 | def fit(self, X, y, alphas=None):
28 | self.X = X
29 | self.y = y
30 | if alphas is None:
31 | alphas = np.ones(X.shape[0])
32 | self.alphas = alphas
33 |
34 | def predict(self, X):
35 | y_pred = np.zeros((X.shape[0]))
36 | for j, x in enumerate(X):
37 | for i in range(self.X.shape[0]):
38 | #print(j, i, x)
39 | y_pred[j] += self.alphas[i] * self.y[i] * np.exp( - (norm(self.X[i, :] - x) ** 2) / (2 * self.sigma ** 2))
40 | return(y_pred)
41 |
42 |
43 | ## Plot 3D
44 | def plot3d(coord_x, coord_y, coord_z, points, y, zlim=None, ax=None, fig=None, xylabelsize=33):
45 | # Plot
46 | from matplotlib import cm
47 | if fig is None:
48 | fig = plt.figure()
49 | if ax is None:
50 | ax = fig.add_subplot(111, projection='3d')
51 | z_min = np.min(coord_z) - np.max(coord_z) * 2
52 | ax.plot_surface(coord_x, coord_y, coord_z, rstride=2, cstride=2,
53 | #vmin=Z.min(), vmax=Z.max(),
54 | cmap=cm.coolwarm,
55 | linewidth=1, antialiased=True)
56 | cset = ax.contourf(coord_x, coord_y, coord_z, zdir='z', offset=z_min-10,
57 | cmap=cm.coolwarm)
58 | argmin = coord_x.ravel()[coord_z.argmin()], coord_y.ravel()[coord_z.argmin()]
59 | print("argmin", argmin)
60 | # add point and cross at defined point
61 | colors = {-1:'b', 1:'r'}
62 | for lev in np.unique(y):
63 | pts = points[y==lev, :]
64 | ax.plot(pts[:, 0], pts[:, 1], 'o', color=colors[lev], zs=[z_min]*pts.shape[0], ms=10)
65 | ax.set_xlabel(r'$x^0$', size=xylabelsize)
66 | ax.set_ylabel(r'$x^1$', size=xylabelsize)
67 | #ax.set_zlabel(r'$Kernel density$', size=xylabelsize)
68 | ax.set_zlim(z_min, np.max(coord_z))
69 | return ax, z_min, argmin
70 |
71 |
72 | ## Dataset
73 | ##########
74 |
75 | im = np.array(
76 | [[ 1., 1., 1., 1., 0., 0., 0., 0.],
77 | [ 1., 1., 1., 1., 0., 0., 0., 0.],
78 | [ 1., 1., 1., 1., 0., 0., 0., 0.],
79 | [ 1., 1., 1., 1., 1., 0., 0., 0.],
80 | [ 0., 0., 0., 1., 1., 1., 1., 1.],
81 | [ 0., 0., 0., 0., 1., 1., 1., 1.],
82 | [ 0., 0., 0., 0., 1., 1., 1., 1.],
83 | [ 0., 0., 0., 0., 1., 1., 1., 1.]])
84 |
85 | x0, y0 = np.where(im == 0)
86 | x1, y1 = np.where(im == 1)
87 |
88 | X = np.column_stack([
89 | np.concatenate([x0, x1]),
90 | np.concatenate([y0, y1])])
91 | y = np.array([-1] * len(x0) + [1] * len(x1))
92 |
93 | xmin, xmax, ymin, ymax = 0, im.shape[0]-1, 0, im.shape[1]-1
94 | coord_x, coord_y = np.mgrid[xmin:xmax:50j, ymin:ymax:50j]
95 | XX = np.column_stack([coord_x.ravel(), coord_y.ravel()])
96 |
97 |
98 | # Kernel mapping
99 | ################
100 |
101 | kde = KernDensity(sigma=.2)
102 | kde.fit(X, y)
103 | y_pred_kde = kde.predict(XX)
104 | coord_z_kde = y_pred_kde.reshape(coord_x.shape)
105 | points = X
106 |
107 | # View 2D
108 | if False:
109 | plt.imshow(np.rot90(coord_z_kde), cmap=plt.cm.coolwarm, extent=[xmin, xmax, ymin, ymax], aspect='auto')
110 | plt.plot(X[y==1, 0], X[y==1, 1], 'o', color='r')#, zs=[z_min], ms=20)
111 | plt.plot(X[y==-1, 0], X[y==-1, 1], 'o', color='b')#, zs=[z_min], ms=20)
112 |
113 |
114 | fig = plt.figure(figsize=(30, 15))
115 |
116 | ax=fig.add_subplot(121, projection='3d')
117 | ax, z_min, argmin = plot3d(coord_x, coord_y, coord_z_kde, points=X, y=y, ax=ax, fig=fig)
118 | plt.title(r'$x \rightarrow K(x_i, x) = \exp\left(-\frac{||x_i - x_j||^2}{2\sigma^2}\right)$', size=33)
119 | # set camera to fixed point of view
120 | print(ax.azim, ax.elev, ax.dist)
121 | #(-152.49214958606902, 21.717791411042867, 10)
122 | #ax.view_init(azim=-152, elev=21) #Reproduce view
123 | #ax.view_init(azim=-14.1935483871, elev=29.6875, dist=10)
124 |
125 | # SV
126 | #####
127 |
128 | from sklearn.svm import SVC
129 | #1.0 / X.shape[1] 0.5
130 | #(1/(2 *.2)) : 2.5
131 | clf = SVC(kernel='rbf')#, gamma=1)
132 | clf.fit(X, y)
133 | clf.support_vectors_.shape
134 |
135 | print(clf.support_.shape)
136 |
137 | np.all(X[clf.support_,:] == clf.support_vectors_)
138 |
139 | Xsv = clf.support_vectors_
140 | y_sv = y[clf.support_]
141 |
142 | y_pred_svm = clf.predict(XX)
143 | #self = KernDensity(sigma=.2)
144 | #self.fit(X, y)
145 | #y_pred = self.predict(XX)
146 | coord_z_svm = y_pred_svm.reshape(coord_x.shape)
147 |
148 | # View 2D
149 | if False:
150 | plt.imshow(np.rot90(coord_z_svm), cmap=plt.cm.coolwarm, extent=[xmin, xmax, ymin, ymax], aspect='auto')
151 | plt.plot(Xsv[y_sv==1, 0], Xsv[y_sv==1, 1], 'o', color='r')#, zs=[z_min], ms=20)
152 | plt.plot(Xsv[y_sv==-1, 0], Xsv[y_sv==-1, 1], 'o', color='b')#, zs=[z_min], ms=20)
153 |
154 |
155 |
156 | #fig = plt.figure(figsize=(15, 15))
157 | ax=fig.add_subplot(122, projection='3d')
158 | ax, z_min, argmin = plot3d(coord_x, coord_y, coord_z_svm, points=Xsv, y=y_sv, ax=ax, fig=fig)
159 | plt.title(r'$f(x) = sign \left(\sum_{i \in SV}\alpha_i y_i \exp\left(-\frac{||x_i - x_j||^2}{2\sigma^2}\right)\right)$', size=33)
160 | # set camera to fixed point of view
161 | #ax.azim, ax.elev, ax.dist
162 | #(-152.49214958606902, 21.717791411042867, 10)
163 | #ax.view_init(azim=-152, elev=21) #Reproduce view
164 |
165 | ############
166 |
167 | import numpy as np
168 | from sklearn.svm import SVC
169 | from sklearn import datasets
170 | import matplotlib.pyplot as plt
171 |
172 | # dataset
173 | X, y = datasets.make_classification(n_samples=10, n_features=2,n_redundant=0,
174 | n_classes=2,
175 | random_state=1,
176 | shuffle=False)
177 | clf = SVC(kernel='rbf')#, gamma=1)
178 | clf.fit(X, y)
179 | print("#Errors: %i" % np.sum(y != clf.predict(X)))
180 |
181 | clf.decision_function(X)
182 |
183 | # Useful internals:
184 | # Array of support vectors
185 | clf.support_vectors_
186 |
187 | # indices of support vectors within original X
188 | np.all(X[clf.support_,:] == clf.support_vectors_)
189 |
190 |
191 | ########################
192 |
193 |
194 | from sklearn.ensemble import RandomForestClassifier
195 |
196 | forest = RandomForestClassifier(n_estimators = 100)
197 | forest.fit(X, y)
198 |
199 | print("#Errors: %i" % np.sum(y != forest.predict(X)))
200 |
201 |
202 |
--------------------------------------------------------------------------------
/python_lang/python_lang_solutions.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Sat Jan 16 10:03:29 2016
4 |
5 | @author: edouard.duchesnay@gmail.com
6 | """
7 |
8 | ###############################################################################
9 | # Exercise 1: functions
10 | # ~~~~~~~~~~~~~~~~~~~~~
11 | #
12 | # Create a function that acts as a simple calculator. If the operation is
13 | # not specified, default to addition. If the operation is misspecified,
14 | # return a prompt message. Ex: ``calc(4, 5, "multiply")`` returns 20,
15 | # ``calc(3, 5)`` returns 8, ``calc(1, 2, "something")`` returns an error
16 | # message.
17 | #
18 |
19 | def calc(a, b, op='add'):
20 | if op == 'add':
21 | return a + b
22 | elif op == 'sub':
23 | return a - b
24 | else:
25 | print('valid operations are add and sub')
26 |
27 |
28 | # call the function
29 | calc(10, 4, op='add') # returns 14
30 | calc(10, 4, 'add') # also returns 14: unnamed arguments are inferred by position
31 | calc(10, 4) # also returns 14: default for 'op' is 'add'
32 | calc(10, 4, 'sub') # returns 6
33 | calc(10, 4, 'div') # prints 'valid operations are add and sub'
34 |
35 | a, b, op = 2, 3, "+"
36 |
37 |
38 | def calc2(a, b, op='+'):
39 | st = "%.f %s %.f" % (a, op, b)
40 | return eval(st)
41 |
42 |
43 | calc2(3, 3, "+")
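
# An added sketch covering the "multiply" case mentioned in the exercise
# statement (not part of the original solution):
def calc3(a, b, op='add'):
    operations = {'add': a + b, 'sub': a - b, 'multiply': a * b}
    if op not in operations:
        return 'valid operations are add, sub and multiply'
    return operations[op]

calc3(4, 5, 'multiply')  # returns 20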
44 |
45 |
46 | ###############################################################################
47 | # Exercise 2: functions + list + loop
48 | # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
49 | #
50 | # Given a list of numbers, return a list where all adjacent duplicate
51 | # elements have been reduced to a single element. Ex: ``[1, 2, 2, 3, 2]``
52 | # returns ``[1, 2, 3, 2]``. You may create a new list or modify the passed
53 | # in list.
54 | #
55 | # Remove all duplicate values (adjacent or not) Ex: ``[1, 2, 2, 3, 2]``
56 | # returns ``[1, 2, 3]``
57 | #
58 |
59 |
60 | def remove_adjacent_duplicates(original_list):
61 | new_list = []
62 | new_list.append(original_list[0])
63 | for num in original_list[1:]:
64 | if num != new_list[-1]:
65 | new_list.append(num)
66 | return new_list
67 |
68 | remove_adjacent_duplicates([1, 2, 2, 3, 2])
69 |
70 | def remove_duplicates(original_list):
71 | new_list = []
72 | for num in original_list:
73 | if num not in new_list:
74 | new_list.append(num)
75 | return new_list
76 |
77 | remove_duplicates([3, 2, 2, 1, 2])
78 |
79 | # or this solution, which might modify the order
80 |
81 | def remove_duplicates(original_list):
82 | return(list(set(original_list)))
83 |
84 | remove_duplicates([3, 2, 2, 1, 2])
85 |
86 |
87 | ###############################################################################
88 | # Exercise 3: File I/O
89 | # ~~~~~~~~~~~~~~~~~~~~
90 | #
91 | # 1. Copy/paste the BSD 4-clause license (https://en.wikipedia.org/wiki/BSD_licenses)
92 | # into a text file. Read the file and count the occurrences of each
93 | # word within the file. Store each word's occurrence count in a dictionary.
94 | #
95 | # 2. Write an executable python command ``count_words.py`` that parses
96 | # a list of input files provided after the ``--input`` parameter.
97 | # The dictionary of occurrences is saved in a csv file provided by ``--output``,
98 | # with default value word_count.csv.
99 | # Use:
100 | # - open
101 | # - regular expressions
102 | # - argparse (https://docs.python.org/3/howto/argparse.html)
103 |
104 |
105 | bsd_4clause = """
106 | Copyright (c) ,
107 | All rights reserved.
108 |
109 | Redistribution and use in source and binary forms, with or without
110 | modification, are permitted provided that the following conditions are met:
111 | 1. Redistributions of source code must retain the above copyright
112 | notice, this list of conditions and the following disclaimer.
113 | 2. Redistributions in binary form must reproduce the above copyright
114 | notice, this list of conditions and the following disclaimer in the
115 | documentation and/or other materials provided with the distribution.
116 | 3. All advertising materials mentioning features or use of this software
117 | must display the following acknowledgement:
118 | This product includes software developed by the .
119 | 4. Neither the name of the nor the
120 | names of its contributors may be used to endorse or promote products
121 | derived from this software without specific prior written permission.
122 |
123 | THIS SOFTWARE IS PROVIDED BY ''AS IS'' AND ANY
124 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
125 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
126 | DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY
127 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
128 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
129 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
130 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
131 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
132 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
133 | """
134 |
135 | import os
136 | import tempfile
137 |
138 | tmpfilename = os.path.join(tempfile.gettempdir(),
139 | "bsd.txt")
140 |
141 | fd = open(tmpfilename, "w")
142 | fd.write(bsd_4clause)
143 | fd.close()
144 |
145 | fd = open(tmpfilename, "r")
146 |
147 | count = dict()
148 | for line in fd:
149 | line = line.lower()
150 | for word in line.split():
151 | if not word in count:
152 | count[word] = 1
153 | else:
154 | count[word] += 1
155 |
156 | print(count)
157 |
158 | """
159 | Kept as a comment to deal with the missing urllib2 import (urllib2 is Python 2 only)
160 |
161 | import urllib2
162 | url = "https://www.gnu.org/licenses/gpl-3.0.txt"
163 | f = urllib2.urlopen(url)
164 | content = f.read()
165 | f.close()
166 | content = content.replace("\n", " ")
167 | content = content.lower()
168 | c = content.split(' ')
169 | print(len(c))
170 | from collections import Counter
171 | print(Counter(c))
172 | """
173 |
174 | ###############################################################################
175 | # Exercise 4: OOP
176 | # ~~~~~~~~~~~~~~~
177 | #
178 | # 1. Create a class ``Employee`` with 2 attributes provided in the
179 | # constructor: ``name`` and ``years_of_service``, and one method
180 | # ``salary`` which returns ``1500 + 100 * years_of_service``.
181 | #
182 | # 2. Create a subclass ``Manager`` which redefines the ``salary`` method
183 | # as ``2500 + 120 * years_of_service``.
184 | #
185 | # 3. Create a small dictionary database where the key is the
186 | # employee's name. Populate the database with: samples =
187 | # Employee('lucy', 3), Employee('john', 1), Manager('julie', 10),
188 | # Manager('paul', 3)
189 | #
190 | # 4. Return a table of (name, salary) rows, i.e. a list of lists [[name,
191 | # salary]]
192 | #
193 | # 5. Compute the average salary
194 |
195 | import pandas as pd
196 |
197 |
198 | class Employee:
199 | def __init__(self, name, years_of_service):
200 | self.name = name
201 | self.years_of_service = years_of_service
202 |
203 | def salary(self):
204 | return 1500 + 100 * self.years_of_service
205 |
206 |
207 | class Manager(Employee):
208 | def salary(self):
209 | return 2500 + 120 * self.years_of_service
210 |
211 |
212 | samples = [Employee("lucy", 3),
213 |            Employee("john", 1),
214 |            Manager('julie', 10),
215 |            Manager('paul', 3)]
216 |
217 | employees = {e.name: e for e in samples}
218 |
219 | employees.keys()
220 |
221 | df = pd.DataFrame([[name, obj.salary()] for name, obj in employees.items()],
222 | columns=['name', 'salary'])
223 |
224 | [[name, employees[name].salary()] for name
225 | in employees]
226 |
227 | sum([e.salary() for e in employees.values()]) / len(employees)
228 |
--------------------------------------------------------------------------------
/introduction/python_ecosystem.rst:
--------------------------------------------------------------------------------
1 | Python ecosystem for data-science
2 | ---------------------------------
3 |
4 | .. RST https://thomas-cokelaer.info/tutorials/sphinx/rest_syntax.html
5 |
6 | .. image:: images/python_ecosystem.png
7 | :scale: 100
8 | :align: center
9 |
10 | Python language
11 | ~~~~~~~~~~~~~~~
12 |
13 | - Interpreted
14 | - Garbage collected (does not prevent memory leaks)
15 | - Dynamically-typed language (Java is statically typed)
16 |
17 |
18 | Anaconda
19 | ~~~~~~~~
20 |
21 | Anaconda is a Python distribution that ships most of the Python tools and libraries.
22 |
23 | **Installation**
24 |
25 |
26 | 1. Download anaconda (Python 3.x) http://continuum.io/downloads
27 |
28 | 2. Install it, on Linux
29 | ::
30 |
31 | bash Anaconda3-2.4.1-Linux-x86_64.sh
32 |
33 | 3. Add the Anaconda path to your PATH variable in your ``.bashrc`` file:
34 | ::
35 |
36 | export PATH="${HOME}/anaconda3/bin:$PATH"
37 |
38 | **Managing with ``conda``**
39 |
40 |
41 | Update conda package and environment manager to current version
42 |
43 | ::
44 |
45 | conda update conda
46 |
47 |
48 | Install additional packages. These commands install the Qt back-end (fixes a temporary issue when running Spyder)
49 |
50 | ::
51 |
52 | conda install pyqt
53 | conda install PyOpenGL
54 | conda update --all
55 |
56 |
57 | Install seaborn for graphics
58 |
59 | ::
60 |
61 | conda install seaborn
62 | # install a specific version from the anaconda channel
63 | conda install -c anaconda pyqt=4.11.4
64 |
65 | List installed packages
66 |
67 | ::
68 |
69 | conda list
70 |
71 | Search available packages
72 |
73 | ::
74 |
75 | conda search pyqt
76 | conda search scikit-learn
77 |
78 |
79 |
80 | **Environments**
81 |
82 |
83 | - A conda environment is a directory that contains a specific collection of conda packages that you have installed.
84 | - Control the package environment for a specific purpose: collaborating with someone else, delivering an application to your client, etc.
85 | - Switch between environments
86 |
87 | List of all environments
88 |
89 | ::
90 |
90 |     conda info --envs
91 |
92 | 1. Create new environment
93 | 2. Activate
94 | 3. Install new package
95 |
96 | ::
97 |
98 | conda create --name test
99 | # Or
100 | conda env create -f environment.yml
101 | source activate test
102 | conda info --envs
103 | conda list
104 | conda search -f numpy
105 | conda install numpy
106 |
107 | **Miniconda**
108 |
109 | Anaconda without the collection of (>700) packages.
110 | With Miniconda you download only the packages you want with the conda command: ``conda install PACKAGENAME``
111 |
112 |
113 |
114 | 1. Download Miniconda (Python 3.x) https://conda.io/miniconda.html
115 |
116 | 2. Install it, on Linux
117 |
118 | ::
119 |
120 | bash Miniconda3-latest-Linux-x86_64.sh
121 |
122 | 3. Add the Miniconda path to your PATH variable in your ``.bashrc`` file:
123 |
124 | ::
125 |
126 | export PATH=${HOME}/miniconda3/bin:$PATH
127 |
128 | 4. Install required packages
129 |
130 | ::
131 |
132 | conda install -y scipy
133 | conda install -y pandas
134 | conda install -y matplotlib
135 | conda install -y statsmodels
136 | conda install -y scikit-learn
137 | conda install -y sqlite
138 | conda install -y spyder
139 | conda install -y jupyter
140 |
141 |
142 | Commands
143 | ~~~~~~~~
144 |
145 | **python**: the Python interpreter. On the dos/unix command line, execute a whole file::
146 |
147 | python file.py
148 |
149 | Interactive mode::
150 |
151 | python
152 |
153 | Quit with ``CTRL-D``
154 |
155 | **ipython**: advanced interactive python interpreter::
156 |
157 | ipython
158 |
159 | Quit with ``CTRL-D``
160 |
161 | **pip**: alternative for package management (``-U`` to update, ``--user`` to install in the user directory):
162 |
163 | ::
164 |
165 | pip install -U --user seaborn
166 |
167 | For neuroimaging:
168 |
169 | ::
170 |
171 | pip install -U --user nibabel
172 | pip install -U --user nilearn
173 |
174 |
175 | **spyder**: IDE (integrated development environment):
176 |
177 | - Syntax highlighting.
178 | - Code introspection for code completion (use ``TAB``).
179 | - Support for multiple Python consoles (including IPython).
180 | - Explore and edit variables from a GUI.
181 | - Debugging.
182 | - Navigate in the code (go to function definition) with ``CTRL`` + click.
183 |
184 | 3 or 4 panels:
185 |
186 | +-------------+-------------------------+
187 | | text editor | help/variable explorer |
188 | +-------------+-------------------------+
189 | | | ipython interpreter |
190 | +-------------+-------------------------+
191 |
192 | Shortcuts:
193 | - ``F9`` run line/selection
194 |
195 | Libraries
196 | ~~~~~~~~~
197 |
198 | scipy.org: https://www.scipy.org/
199 |
200 |
201 | **Numpy**: Basic numerical operations. Matrix operations plus some basic solvers::
202 |
203 | import numpy as np
204 | X = np.array([[1, 2], [3, 4]])
205 | #v = np.array([1, 2]).reshape((2, 1))
206 | v = np.array([1, 2])
207 | np.dot(X, v) # no broadcasting
208 | X * v # broadcasting
209 | np.dot(v, X)
210 | X - X.mean(axis=0)
211 |
212 | **Scipy**: General scientific library with advanced solvers::
213 |
214 | import scipy
215 | import scipy.linalg
216 | scipy.linalg.svd(X, full_matrices=False)
217 |
218 | **Matplotlib**: visualization::
219 |
220 | import numpy as np
221 | import matplotlib.pyplot as plt
222 | #%matplotlib qt
223 | x = np.linspace(0, 10, 50)
224 | sinus = np.sin(x)
225 | plt.plot(x, sinus)
226 | plt.show()
227 |
228 | **Pandas**: Manipulation of structured data (tables). Input/output of excel files, etc.
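
A minimal usage sketch (the small DataFrame below is made up for illustration)::

    import pandas as pd
    df = pd.DataFrame({"name": ["alice", "bob", "carl"], "age": [31, 44, 27]})
    print(df.describe())    # summary statistics of numerical columns
    print(df[df.age > 30])  # row selection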
229 |
230 | **Statsmodels**: Advanced statistics
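
A minimal OLS sketch (simulated data, for illustration only)::

    import numpy as np
    import statsmodels.api as sm
    x = np.random.randn(100)
    y = 2 * x + np.random.randn(100)
    model = sm.OLS(y, sm.add_constant(x)).fit()
    print(model.summary())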
231 |
232 | **Scikit-learn**: Machine learning
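
A minimal classification sketch (iris dataset shipped with scikit-learn)::

    from sklearn import datasets
    from sklearn.linear_model import LogisticRegression
    X, y = datasets.load_iris(return_X_y=True)
    clf = LogisticRegression(max_iter=1000).fit(X, y)
    print(clf.score(X, y))  # training accuracy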
233 |
234 | .. http://truben.no/table/
235 |
236 | +--------------+-----------------------------+----------------------+----------------+-------------------+--------------+-----------------+------------------+
237 | | library | Arrays data, Num. comp, I/O | Structured data, I/O | Solvers: basic | Solvers: advanced | Stats: basic | Stats: advanced | Machine learning |
238 | +==============+=============================+======================+================+===================+==============+=================+==================+
239 | | Numpy | X | | X | | | | |
240 | +--------------+-----------------------------+----------------------+----------------+-------------------+--------------+-----------------+------------------+
241 | | Scipy | | | X | X | X | | |
242 | +--------------+-----------------------------+----------------------+----------------+-------------------+--------------+-----------------+------------------+
243 | | Pandas | | X | | | | | |
244 | +--------------+-----------------------------+----------------------+----------------+-------------------+--------------+-----------------+------------------+
245 | | Statsmodels  |                             |                      |                |                   | X            | X               |                  |
246 | +--------------+-----------------------------+----------------------+----------------+-------------------+--------------+-----------------+------------------+
247 | | Scikit-learn | | | | | | | X |
248 | +--------------+-----------------------------+----------------------+----------------+-------------------+--------------+-----------------+------------------+
249 |
250 |
--------------------------------------------------------------------------------
/R/ml_dimensionality_reduction_exo.R:
--------------------------------------------------------------------------------
1 | ######
2 | ## PCA
3 | ######
4 |
5 | # Write a class `BasicPCA` with two methods: `fit(X)` that estimates the data mean
6 | # and principal component directions, and `transform(X)` that projects new data
7 | # onto the principal components.
8 | #
9 | # Check that your `BasicPCA` performs similarly to the one from sklearn:
10 | # `from sklearn.decomposition import PCA`
11 |
12 |
13 | BasicPCA <- function(X, scale=FALSE){
14 | obj = list()
15 | Xc <- scale(X, center=TRUE, scale=scale)
16 | obj$mean <- attr(Xc, "scaled:center")
17 | s <- svd(Xc, nu = 0)
18 | # v [K x P] a matrix whose columns contain the right singular vectors of x
19 | obj$V = s$v
20 | obj$var = 1 / (nrow(X) - 1) * s$d ^2
21 | return(obj)
22 | }
23 |
24 | BasicPCA.transform <- function(obj, X){
25 | Xc <- scale(X, center=obj$mean, scale=FALSE)
26 | return(Xc %*% obj$V)
27 | }
28 |
29 | # https://tgmstat.wordpress.com/2013/11/28/computing-and-visualizing-pca-in-r/
30 | # dataset
31 | n_samples = 10
32 | experience = rnorm(n_samples)
33 | salary = 1500 + experience + .5 * rnorm(n_samples)
34 | other = rnorm(n_samples)
35 | X = cbind(experience, salary, other)
36 |
37 | # Optional: standardize data
38 | Xcs = scale(X, center=TRUE, scale=FALSE)
39 | attr(Xcs, "scaled:center") = NULL
40 | attr(Xcs, "scaled:scale") = NULL
41 |
42 | basic_pca = BasicPCA(Xcs)
43 | BasicPCA.transform(basic_pca, Xcs)
44 |
45 | # PCA with prcomp
46 | pca = prcomp(Xcs, center=TRUE, scale.=FALSE)
47 | names(pca)
48 |
49 | # Compare
50 | all(pca$rotation == basic_pca$V)
51 | all(predict(pca, Xcs) == BasicPCA.transform(basic_pca, Xcs))
52 |
53 | # "https://raw.github.com/neurospin/pystatsml/master/data/iris.csv"
54 | #
55 | # Describe the data set. Should the dataset be standardized?
56 | #
57 | # Retrieve the explained variance ratio. Determine $K$ the number of components.
58 | #
59 | # Print the $K$ principal components direction and correlation of the $K$ principal
60 | # components with original variables. Interpret the contribution of original variables
61 | # into the PC.
62 | #
63 | # Plot samples projected into the $K$ first PCs.
64 | #
65 | # Color samples with their species.
66 | #
67 |
68 | url = 'ftp://ftp.cea.fr/pub/unati/people/educhesnay/pystatml/data/iris.csv'
69 | data = read.csv(url)
70 | #setwd("/home/ed203246/git/pystatsml/notebooks")
71 | data = read.csv("../data/iris.csv")
72 |
73 | # Describe the data set. Should the dataset be standardized?
74 |
75 | summary(data)
76 | # sepal_length sepal_width petal_length petal_width species
77 | # Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100 setosa :50
78 | # 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300 versicolor:50
79 | # Median :5.800 Median :3.000 Median :4.350 Median :1.300 virginica :50
80 | # Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
81 | # 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
82 | # Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
83 |
84 | numcols = colnames(data)[unlist(lapply(data, is.numeric))]
85 | apply(data[, numcols], 2, sd)
86 | #sepal_length sepal_width petal_length petal_width
87 | #0.8280661 0.4358663 1.7652982 0.7622377
88 |
89 |
90 | # Describe the structure of correlation among variables.
91 | X = data[, numcols]
92 | cor(X)
93 |
94 | # Compute a PCA with the maximum number of components.
95 | Xcs = scale(X, center=TRUE, scale=TRUE)
96 | attr(Xcs, "scaled:center") = NULL
97 | attr(Xcs, "scaled:scale") = NULL
98 | apply(Xcs, 2, sd)
99 | apply(Xcs, 2, mean)
100 |
101 | # Compute a PCA with the maximum number of components.
102 | pca = prcomp(Xcs)
103 |
104 | # Variance ratio by component
105 | (pca$sdev ** 2) / sum(pca$sdev ** 2)
106 | #[1] 0.729624454 0.228507618 0.036689219 0.005178709
107 |
108 | # cumulative explained variance
109 | cumsum(pca$sdev ** 2) / sum(pca$sdev ** 2)
110 |
111 | # K = 2
112 | names(pca)
113 | pca$rotation
114 |
115 | PC = predict(pca, Xcs)
116 | t(cor(Xcs, PC[, 1:2]))
117 | # sepal_length sepal_width petal_length petal_width
118 | # PC1 0.8901688 -0.4601427 0.99155518 0.96497896
119 | # PC2 -0.3608299 -0.8827163 -0.02341519 -0.06399985
120 |
121 | data = cbind(data, PC)
122 |
123 | # Plot samples projected into the K first PCs
124 | # Color samples with their species.
125 | library(ggplot2)
126 |
127 | qplot(PC1, PC2, data=data, colour=species)
128 |
129 | ####################################################################
130 | ## MDS
131 | ####################################################################
132 |
133 | ##############
134 | ## eurodist ##
135 | ##############
136 |
137 | # Perform a similar analysis on the eurodist dataset in R, using:
138 | # - MDS: cmdscale.
139 | # - Euclidean pairwise distance: dist
140 | #
141 | #url = 'ftp://ftp.cea.fr/pub/unati/people/educhesnay/pystatml/data/eurodist.csv'
142 | #data = read.csv(url)
143 |
144 | setwd("~/git/pystatsml/notebooks")
145 | #url = 'ftp://ftp.cea.fr/pub/unati/people/educhesnay/pystatml/data/eurodist.csv'
146 | data = read.csv("../data/eurodist.csv")
147 |
148 | city = data[["city"]]
149 | D = data[, 2:ncol(data)]
150 |
151 | print(data[1:5, 1:5])
152 |
153 | # Arbitrary choice of K=2 components
154 | mds = cmdscale(D, k=2, eig=T)
155 |
156 | # Recover the coordinates of the cities in a Euclidean coordinate system whose orientation is arbitrary.
157 | print(as.matrix(dist(mds$points))[1:5, 1:5])
158 |
159 | plot(mds$points[,1], -mds$points[,2])
160 | text(mds$points[,1], -mds$points[,2], city, cex=0.8)
161 |
162 |
163 | # Apply MDS using cmdscale
164 | k_range = 1:min(5, nrow(D) - 1)
165 | stress <- rep(0, max(k_range))
166 | for (kk in k_range){
167 | mds <- cmdscale(D, k=kk, eig=T)
168 | stress[kk] = (sum((D - as.matrix(dist(mds$points))) ^ 2)) ^ 0.5
169 | }
170 | plot(k_range, stress, type="l", xlab="k", ylab="stress")
171 | #cbind(1:max.k,P.k)
172 |
173 | # Ressources
174 | # http://people.stat.sc.edu/Hitchcock/chapter5_R_examples.txt
175 |
176 | ##########
177 | ## iris ##
178 | ##########
179 |
180 | # Perform a similar analysis on the iris dataset in R, using:
181 | # - MDS: cmdscale.
182 | # - Euclidean pairwise distance: dist
183 | #
184 | #url = 'ftp://ftp.cea.fr/pub/unati/people/educhesnay/pystatml/data/iris.csv'
185 | #data = read.csv(url)
186 |
187 | setwd("~/git/pystatsml/notebooks")
188 | #url = 'ftp://ftp.cea.fr/pub/unati/people/educhesnay/pystatml/data/iris.csv'
189 | data = read.csv("../data/iris.csv")
190 |
191 | species = data[["species"]]
192 | X = scale(data[, 1:4])
193 | attr(X, "scaled:center") = NULL
194 | attr(X, "scaled:scale") = NULL
195 | D = as.matrix(dist(X))
196 | print(D[1:5, 1:5])
197 |
198 | # Select K
199 | k_range = 1:min(5, nrow(D) - 1)
200 | stress <- rep(0, max(k_range))
201 | for (kk in k_range){
202 | mds <- cmdscale(D, k=kk, eig=T)
203 | stress[kk] = (sum((D - as.matrix(dist(mds$points))) ^ 2)) ^ 0.5
204 | }
205 | plot(k_range, stress, type="l", xlab="k", ylab="stress")
206 |
207 | K = 2 # components
208 | mds = cmdscale(D, k=K , eig=T)
209 |
210 | # Recover the coordinates of the samples in a Euclidean coordinate system whose orientation is arbitrary.
211 | print(as.matrix(dist(mds$points))[1:5, 1:5])
212 |
213 | plot(mds$points[,1], -mds$points[,2], col=species)
214 |
215 | # PCA with prcomp
216 | pca = prcomp(X, center=TRUE, scale.=FALSE)
217 | names(pca)
218 | PC = predict(pca, X)[, 1:K]
219 |
220 | # Compute correlation between PCA and MDS components
221 | cor(cbind(mds$points, PC))
222 |
223 | # 1.000000e+00 1.551000e-16 1.000000e+00 4.766625e-16
224 | # 1.551000e-16 1.000000e+00 4.474091e-16 -1.000000e+00
225 | # PC1 1.000000e+00 4.474091e-16 1.000000e+00 1.842964e-16
226 | # PC2 4.766625e-16 -1.000000e+00 1.842964e-16 1.000000e+00
227 |
228 |
229 | ####################################################################
230 | ## isomap
231 | ####################################################################
232 | install.packages("vegan")
233 |
234 | s_curve = read.csv("../data/s_curve.csv")
235 | colnames(s_curve)
236 |
237 | X = as.matrix(s_curve[, c("x", "y", "z")])
238 | color = s_curve[["color"]]
239 | D <- dist(X, method="euclidean")
240 |
241 | library(vegan)
242 |
243 | iso = isomap(D, ndim=2, k=10)
244 |
245 | #install.packages("ggplot2")
246 | library(ggplot2)
247 |
248 | qplot(iso$points[,1], iso$points[,2], col=color) + scale_colour_gradientn(colours=rainbow(4))
249 | # Alternative palette: + scale_fill_distiller(palette = "Spectral")
--------------------------------------------------------------------------------
/utils/ml_processing_pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Mon Apr 11 15:40:35 2016
4 |
5 | @author: edouard.duchesnay@cea.fr
6 | """
7 | from sklearn import preprocessing
8 | preprocessing.OneHotEncoder
9 |
10 |
11 | '''
12 | Regression pipelines
13 | ====================
14 | '''
15 | import numpy as np
16 | from sklearn import datasets
17 | import sklearn.linear_model as lm
18 | from sklearn import preprocessing
19 | from sklearn.model_selection import cross_val_score
20 | from sklearn.feature_selection import SelectKBest
21 | from sklearn.feature_selection import f_regression
22 | from sklearn.pipeline import Pipeline
23 | from sklearn.model_selection import GridSearchCV
24 | import sklearn.metrics as metrics
25 |
26 | # Datasets
27 | n_samples, n_features, noise_sd = 100, 100, 20
28 | X, y, coef = datasets.make_regression(n_samples=n_samples, n_features=n_features,
29 | noise=noise_sd, n_informative=5,
30 | random_state=42, coef=True)
31 |
32 | # Use this to tune the noise parameter such that snr < 5
33 | print("SNR:", np.std(np.dot(X, coef)) / noise_sd)
34 |
35 | print("=============================")
36 | print("== Basic linear regression ==")
37 | print("=============================")
38 |
39 | scores = cross_val_score(estimator=lm.LinearRegression(), X=X, y=y, cv=5)
40 | print("Test r2:%.2f" % scores.mean())
41 |
42 | print("==============================================")
43 | print("== Scaler + anova filter + ridge regression ==")
44 | print("==============================================")
45 |
46 | anova_ridge = Pipeline([
47 | ('standardscaler', preprocessing.StandardScaler()),
48 | ('selectkbest', SelectKBest(f_regression)),
49 | ('ridge', lm.Ridge())
50 | ])
51 | param_grid = {'selectkbest__k':np.arange(10, 110, 10),
52 | 'ridge__alpha':[.001, .01, .1, 1, 10, 100] }
53 |
54 | # Expected to be executed in IPython; for plain Python remove the %time magics.
55 | print("----------------------------")
56 | print("-- Parallelize inner loop --")
57 | print("----------------------------")
58 |
59 | anova_ridge_cv = GridSearchCV(anova_ridge, cv=5, param_grid=param_grid, n_jobs=-1)
60 | %time scores = cross_val_score(estimator=anova_ridge_cv, X=X, y=y, cv=5)
61 | print("Test r2:%.2f" % scores.mean())
62 |
63 | print("----------------------------")
64 | print("-- Parallelize outer loop --")
65 | print("----------------------------")
66 |
67 | anova_ridge_cv = GridSearchCV(anova_ridge, cv=5, param_grid=param_grid)
68 | %time scores = cross_val_score(estimator=anova_ridge_cv, X=X, y=y, cv=5, n_jobs=-1)
69 | print("Test r2:%.2f" % scores.mean())
70 |
71 |
72 | print("=====================================")
73 | print("== Scaler + Elastic-net regression ==")
74 | print("=====================================")
75 |
76 | alphas = [.0001, .001, .01, .1, 1, 10, 100, 1000]
77 | l1_ratio = [.1, .5, .9]
78 |
79 | print("----------------------------")
80 | print("-- Parallelize outer loop --")
81 | print("----------------------------")
82 |
83 | enet = Pipeline([
84 | ('standardscaler', preprocessing.StandardScaler()),
85 | ('enet', lm.ElasticNet(max_iter=10000)),
86 | ])
87 | param_grid = {'enet__alpha':alphas ,
88 | 'enet__l1_ratio':l1_ratio}
89 | enet_cv = GridSearchCV(enet, cv=5, param_grid=param_grid)
90 | %time scores = cross_val_score(estimator=enet_cv, X=X, y=y, cv=5, n_jobs=-1)
91 | print("Test r2:%.2f" % scores.mean())
92 |
93 | print("-----------------------------------------------")
94 | print("-- Parallelize outer loop + built-in CV --")
95 | print("-- Remark: scaler is only done on outer loop --")
96 | print("-----------------------------------------------")
97 |
98 | enet_cv = Pipeline([
99 | ('standardscaler', preprocessing.StandardScaler()),
100 | ('enet', lm.ElasticNetCV(max_iter=10000, l1_ratio=l1_ratio, alphas=alphas)),
101 | ])
102 |
103 | %time scores = cross_val_score(estimator=enet_cv, X=X, y=y, cv=5)
104 | print("Test r2:%.2f" % scores.mean())
105 |
106 | '''
107 | Classification pipelines
108 | ========================
109 | '''
110 | import numpy as np
111 | from sklearn import datasets
112 | import sklearn.linear_model as lm
113 | from sklearn import preprocessing
114 | from sklearn.model_selection import cross_val_score
115 | from sklearn.feature_selection import SelectKBest
116 | from sklearn.feature_selection import f_classif
117 | from sklearn.pipeline import Pipeline
118 | from sklearn.model_selection import GridSearchCV
119 | import sklearn.metrics as metrics
120 |
121 | # Datasets
122 | n_samples, n_features, noise_sd = 100, 100, 20
123 | X, y = datasets.make_classification(n_samples=n_samples, n_features=n_features,
124 | n_informative=5, random_state=42)
125 |
126 |
127 | def balanced_acc(estimator, X, y, **kwargs):
128 | '''
129 | Balanced accuracy scorer
130 | '''
131 | return metrics.recall_score(y, estimator.predict(X), average=None).mean()
132 |
133 | print("===============================")
134 | print("== Basic logistic regression ==")
135 | print("===============================")
136 |
137 | scores = cross_val_score(estimator=lm.LogisticRegression(C=1e8, class_weight='balanced'),
138 | X=X, y=y, cv=5, scoring=balanced_acc)
139 | print("Test bACC:%.2f" % scores.mean())
140 |
141 | print("=======================================================")
142 | print("== Scaler + anova filter + ridge logistic regression ==")
143 | print("=======================================================")
144 |
145 | anova_ridge = Pipeline([
146 | ('standardscaler', preprocessing.StandardScaler()),
147 | ('selectkbest', SelectKBest(f_classif)),
148 | ('ridge', lm.LogisticRegression(penalty='l2', class_weight='balanced'))
149 | ])
150 | param_grid = {'selectkbest__k':np.arange(10, 110, 10),
151 | 'ridge__C':[.0001, .001, .01, .1, 1, 10, 100, 1000, 10000]}
152 |
153 |
154 | # Expected to be executed in IPython; for plain Python remove the %time magics.
155 | print("----------------------------")
156 | print("-- Parallelize inner loop --")
157 | print("----------------------------")
158 |
159 | anova_ridge_cv = GridSearchCV(anova_ridge, cv=5, param_grid=param_grid,
160 | scoring=balanced_acc, n_jobs=-1)
161 | %time scores = cross_val_score(estimator=anova_ridge_cv, X=X, y=y, cv=5,\
162 | scoring=balanced_acc)
163 | print("Test bACC:%.2f" % scores.mean())
164 |
165 | print("----------------------------")
166 | print("-- Parallelize outer loop --")
167 | print("----------------------------")
168 |
169 | anova_ridge_cv = GridSearchCV(anova_ridge, cv=5, param_grid=param_grid,
170 | scoring=balanced_acc)
171 | %time scores = cross_val_score(estimator=anova_ridge_cv, X=X, y=y, cv=5,\
172 | scoring=balanced_acc, n_jobs=-1)
173 | print("Test bACC:%.2f" % scores.mean())
174 |
175 |
176 | print("========================================")
177 | print("== Scaler + lasso logistic regression ==")
178 | print("========================================")
179 |
180 | Cs = np.array([.0001, .001, .01, .1, 1, 10, 100, 1000, 10000])
181 | alphas = 1 / Cs
182 | l1_ratio = [.1, .5, .9]
183 |
184 | print("----------------------------")
185 | print("-- Parallelize outer loop --")
186 | print("----------------------------")
187 |
188 | lasso = Pipeline([
189 | ('standardscaler', preprocessing.StandardScaler()),
190 | ('lasso', lm.LogisticRegression(penalty='l1', class_weight='balanced')),
191 | ])
192 | param_grid = {'lasso__C':Cs}
193 | enet_cv = GridSearchCV(lasso, cv=5, param_grid=param_grid, scoring=balanced_acc)
194 | %time scores = cross_val_score(estimator=enet_cv, X=X, y=y, cv=5,\
195 | scoring=balanced_acc, n_jobs=-1)
196 | print("Test bACC:%.2f" % scores.mean())
197 |
198 |
199 | print("-----------------------------------------------")
200 | print("-- Parallelize outer loop + built-in CV --")
201 | print("-- Remark: scaler is only done on outer loop --")
202 | print("-----------------------------------------------")
203 |
204 | lasso_cv = Pipeline([
205 | ('standardscaler', preprocessing.StandardScaler()),
206 | ('lasso', lm.LogisticRegressionCV(Cs=Cs, scoring=balanced_acc)),
207 | ])
208 |
209 | %time scores = cross_val_score(estimator=lasso_cv, X=X, y=y, cv=5)
210 | print("Test bACC:%.2f" % scores.mean())
211 |
212 |
213 | print("=============================================")
214 | print("== Scaler + Elasticnet logistic regression ==")
215 | print("=============================================")
216 |
217 | print("----------------------------")
218 | print("-- Parallelize outer loop --")
219 | print("----------------------------")
220 |
221 | enet = Pipeline([
222 | ('standardscaler', preprocessing.StandardScaler()),
223 | ('enet', lm.SGDClassifier(loss="log", penalty="elasticnet",
224 | alpha=0.0001, l1_ratio=0.15, class_weight='balanced')),
225 | ])
226 |
227 | param_grid = {'enet__alpha':alphas,
228 | 'enet__l1_ratio':l1_ratio}
229 |
230 | enet_cv = GridSearchCV(enet, cv=5, param_grid=param_grid, scoring=balanced_acc)
231 | %time scores = cross_val_score(estimator=enet_cv, X=X, y=y, cv=5,\
232 | scoring=balanced_acc, n_jobs=-1)
233 | print("Test bACC:%.2f" % scores.mean())
234 |
--------------------------------------------------------------------------------
/machine_learning/ml_supervized_nonlinear.py:
--------------------------------------------------------------------------------
1 | '''
2 | Non-linear models
3 | =================
4 |
5 | Here we focus on non-linear models for classification. Nevertheless, each
6 | classification model has its regression counterpart.
7 | '''
8 |
9 | # get_ipython().run_line_magic('matplotlib', 'inline')
10 | import matplotlib.pyplot as plt
11 |
12 | import numpy as np
13 | import pandas as pd
14 | import seaborn as sns
15 | import matplotlib.pyplot as plt
16 |
17 | from sklearn.svm import SVC
18 | from sklearn.preprocessing import StandardScaler
19 |
20 | from sklearn import datasets
21 | from sklearn import metrics
22 | from sklearn.model_selection import train_test_split
23 |
24 | np.set_printoptions(precision=2)
25 | pd.set_option('display.precision', 2)
26 |
27 | # %%
28 | # Support Vector Machines (SVM)
29 | # -----------------------------
30 | #
31 | # SVMs are kernel-based methods that require only a user-specified kernel function
32 | # :math:`K(x_i, x_j)`, i.e., a **similarity function** over pairs of data
33 | # points :math:`(x_i, x_j)`, mapping them into a kernel (dual) space in which learning
34 | # algorithms operate linearly, i.e. every operation on points is a linear
35 | # combination of :math:`K(x_i, x_j)`.
36 | # Outline of the SVM algorithm:
37 | #
38 | # 1. Map points :math:`x` into kernel space using a kernel function:
39 | # :math:`x \rightarrow K(x, .)`.
40 | # 2. The learning algorithm operates linearly, i.e. by dot products in the
41 | #    kernel space :math:`K(., x_i) \cdot K(., x_j)`.
42 | #    - Using the kernel trick (Mercer’s Theorem), the dot product in the high
43 | #      dimensional space is replaced by a simpler operation such that
44 | #      :math:`K(., x_i) \cdot K(., x_j) = K(x_i, x_j)`.
45 | #      Thus we only need to compute a similarity measure for each pair of
46 | #      points and store them in an :math:`N \times N` Gram matrix.
47 | #    - Finally, the learning process consists of estimating the :math:`\alpha_i` of
48 | #      the decision function that minimises the hinge loss (of :math:`f(x)`)
49 | #      plus some penalty, evaluated on all training points.
50 | #
51 | # .. math::
52 | #
53 | # f(x) = \text{sign} \left(\sum_i^N \alpha_i~y_i~K(x_i, x)\right).
54 | #
55 | # 3. Predict a new point $x$ using the decision function.
56 | #
57 | # .. figure:: ../images/svm_rbf_kernel_mapping_and_decision_function.png
58 | # :alt: Support Vector Machines.
59 | #
60 | # Gaussian kernel (RBF, Radial Basis Function):
61 | #
62 | # One of the most commonly used kernel is the Radial Basis Function (RBF) Kernel.
63 | # For a pair of points :math:`x_i, x_j` the RBF kernel is defined as:
64 | #
65 | # .. raw:: latex
66 | #
67 | # \begin{align}
68 | # K(x_i, x_j) &= \exp\left(-\frac{\|x_i - x_j\|^2}{2\sigma^2}\right)\\
69 | # &= \exp\left(-\gamma~\|x_i - x_j\|^2\right)
70 | # \end{align}
71 | #
72 | # Where :math:`\sigma` (or :math:`\gamma`) defines the kernel width parameter.
73 | # Basically, we consider a Gaussian function centered on each training sample
74 | # :math:`x_i`. It has a ready interpretation as a similarity measure, as it
75 | # decreases with the squared Euclidean distance between the two feature vectors.
76 | #
77 | # Non-linear SVMs also exist for regression problems.
78 |
79 |
80 | # %%
81 | # dataset
82 |
83 | X, y = datasets.load_breast_cancer(return_X_y=True)
84 | X_train, X_test, y_train, y_test = \
85 | train_test_split(X, y, test_size=0.5, stratify=y, random_state=42)
86 |
87 | # %%
88 | # Preprocessing: input features have unequal variances, which requires scaling for SVM.
89 |
90 | ax = sns.displot(x=X_train.std(axis=0), kind="kde", bw_adjust=.2, cut=0,
91 | fill=True, height=3, aspect=1.5,)
92 | _ = ax.set_xlabels("Std-dev").tight_layout()
93 |
94 | scaler = StandardScaler()
95 | X_train = scaler.fit_transform(X_train)
96 | X_test = scaler.transform(X_test)  # reuse the scaler fitted on the training set
97 |
98 | # %%
99 | # Fit-predict
100 | # Probability is a logistic function of the decision_function
101 |
102 | svm = SVC(kernel='rbf', probability=True).fit(X_train, y_train)
103 | y_pred = svm.predict(X_test)
104 | y_score = svm.decision_function(X_test)
105 | y_prob = svm.predict_proba(X_test)[:, 1]
106 |
107 | ax = sns.relplot(x=y_score, y=y_prob, hue=y_pred, height=2, aspect=1.5)
108 | _ = ax.set_axis_labels("decision function", "Probability").tight_layout()
109 |
110 | # %% Scores
111 |
112 | print("bAcc: %.2f, AUC: %.2f (AUC with proba: %.2f)" % (
113 | metrics.balanced_accuracy_score(y_true=y_test, y_pred=y_pred),
114 | metrics.roc_auc_score(y_true=y_test, y_score=y_score),
115 | metrics.roc_auc_score(y_true=y_test, y_score=y_prob)))
116 |
117 | # Useful internals: indices of support vectors within the original X
118 | np.all(X_train[svm.support_, :] == svm.support_vectors_)
119 |
120 |
121 | # %%
122 | # Random forest
123 | # -------------
124 | #
125 | # Decision tree
126 | # ~~~~~~~~~~~~~
127 | #
128 | # A tree can be "learned" by splitting the training dataset into subsets based on a test of a feature's value.
129 | # Each internal node represents a "test" on a feature resulting in a split of the current sample. At each step, the algorithm selects the feature and a cutoff value that maximise a given metric. Different metrics exist for regression trees (the target is continuous) and classification trees (the target is qualitative).
130 | # This process is repeated on each derived subset in a recursive manner called recursive partitioning. The recursion is completed when the subset at a node has all the same value of the target variable, or when splitting no longer adds value to the predictions. This general principle is implemented by many recursive partitioning tree algorithms.
131 | #
132 | # .. figure:: ../images/classification_tree.png
133 | # :width: 400
134 | # :alt: Classification tree.
135 | #
136 | # Decision trees are simple to understand and interpret, however they tend to overfit the training set. Leo Breiman proposed random forests to deal with this issue.
137 | #
138 | # A single decision tree usually overfits the data it is learning from because it learns from only one pathway of decisions, so its predictions are usually not accurate on new data.
139 | #
140 | # Forest
141 | # ~~~~~~
142 | #
143 | # A random forest is a meta estimator that fits a number of **decision tree learners** on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting.
144 | # Random forest models reduce the risk of overfitting by introducing randomness:
145 | #
146 | # .. figure:: ../images/random_forest.png
147 | # :width: 300
148 | # :alt: Random forest.
149 | #
150 | # - building multiple trees (n_estimators)
151 | # - drawing observations with replacement (i.e., a bootstrapped sample)
152 | # - splitting nodes on the best split among a random subset of the features selected at every node
153 | #
154 |
155 | from sklearn.ensemble import RandomForestClassifier
156 |
157 | forest = RandomForestClassifier(n_estimators=100)
158 | forest.fit(X_train, y_train)
159 |
160 | y_pred = forest.predict(X_test)
161 | y_prob = forest.predict_proba(X_test)[:, 1]
162 |
163 |
164 | # %% Scores
165 |
166 | print("bAcc: %.2f, AUC: %.2f " % (
167 | metrics.balanced_accuracy_score(y_true=y_test, y_pred=y_pred),
168 | metrics.roc_auc_score(y_true=y_test, y_score=y_prob)))
169 |
170 | # %%
171 | # Extra Trees (Low Variance)
172 | # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
173 | #
174 | # Extra Trees is like Random Forest in that it builds multiple trees and splits nodes using random subsets of features, but with two key differences: it does not bootstrap observations (it samples without replacement), and nodes are split on random splits rather than on the best splits. In summary, ExtraTrees:
175 | #
176 | # - builds multiple trees with bootstrap = False by default, which means it samples without replacement
177 | # - splits nodes on random splits among a random subset of the features selected at every node
178 | #
179 | # In Extra Trees, randomness does not come from bootstrapping the data, but from the random splits of all observations. The name ExtraTrees stands for Extremely Randomized Trees (see the sketch below).
178 |
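# %%
# A minimal Extra Trees usage sketch (not in the original script; it reuses the
# same scaled train/test split as above, with illustrative hyper-parameters):

from sklearn.ensemble import ExtraTreesClassifier

extra_trees = ExtraTreesClassifier(n_estimators=100, random_state=42)
extra_trees.fit(X_train, y_train)

y_pred = extra_trees.predict(X_test)
y_prob = extra_trees.predict_proba(X_test)[:, 1]

print("bAcc: %.2f, AUC: %.2f " % (
    metrics.balanced_accuracy_score(y_true=y_test, y_pred=y_pred),
    metrics.roc_auc_score(y_true=y_test, y_score=y_prob)))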
179 |
180 | # %%
181 | # Gradient boosting
182 | # -----------------
183 | #
184 | # Gradient boosting is a meta estimator that fits a sequence of **weak learners**.
185 | # Each learner aims to reduce the residuals (errors) produced by the previous learner.
186 | # The two main hyper-parameters are:
187 | #
188 | # - The **learning rate** (*lr*) controls over-fitting:
189 | #   decreasing the *lr* limits the capacity of a learner to overfit the residuals, i.e.,
190 | #   it slows down the learning speed and thus increases the **regularisation**
191 | #   (illustrated in the sketch after the example below).
192 | #
193 | # - The **sub-sampling fraction** controls the fraction of samples used for
194 | #   fitting the learners. Values smaller than 1 lead to **Stochastic Gradient Boosting**.
195 | #   It thus controls over-fitting by reducing variance and increasing bias.
195 | #
196 | # .. figure:: ../images/gradient_boosting.png
197 | # :width: 500
198 | # :alt: Gradient boosting.
199 | #
200 |
201 |
202 | from sklearn.ensemble import GradientBoostingClassifier
203 |
204 | gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1,
205 | subsample=0.5, random_state=0)
206 | gb.fit(X_train, y_train)
207 |
208 | y_pred = gb.predict(X_test)
209 | y_prob = gb.predict_proba(X_test)[:, 1]
210 |
211 | print("bAcc: %.2f, AUC: %.2f " % (
212 | metrics.balanced_accuracy_score(y_true=y_test, y_pred=y_pred),
213 | metrics.roc_auc_score(y_true=y_test, y_score=y_prob)))
214 |
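
# %%
# To illustrate the regularisation effect of the learning rate described above, a
# small sketch (hyper-parameter values are illustrative, not from the original
# script) compares a few learning rates with 5-fold cross-validation on the
# training set:

from sklearn.model_selection import cross_val_score

for lr in [1.0, 0.1, 0.01]:  # illustrative learning rates
    scores = cross_val_score(
        GradientBoostingClassifier(n_estimators=100, learning_rate=lr,
                                   subsample=0.5, random_state=0),
        X_train, y_train, cv=5, scoring="balanced_accuracy")
    print("lr=%.2f, CV bAcc: %.2f (+/- %.2f)" % (lr, scores.mean(), scores.std()))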
--------------------------------------------------------------------------------
/conf.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #
3 | # Machine Learning documentation build configuration file, created by
4 | # sphinx-quickstart on Mon Nov 30 16:25:34 2015.
5 | #
6 | # This file is execfile()d with the current directory set to its
7 | # containing dir.
8 | #
9 | # Note that not all possible configuration values are present in this
10 | # autogenerated file.
11 | #
12 | # All configuration values have a default; values that are commented out
13 | # serve to show the default.
14 |
15 | import sys
16 | import os
17 | import shlex
18 |
19 | # If extensions (or modules to document with autodoc) are in another directory,
20 | # add these directories to sys.path here. If the directory is relative to the
21 | # documentation root, use os.path.abspath to make it absolute, like shown here.
22 | #sys.path.insert(0, os.path.abspath('.'))
23 |
24 | # -- General configuration ------------------------------------------------
25 |
26 | # If your documentation needs a minimal Sphinx version, state it here.
27 | #needs_sphinx = '1.0'
28 |
29 | # Add any Sphinx extension module names here, as strings. They can be
30 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
31 | # ones.
32 | extensions = [
33 | 'sphinx.ext.mathjax',
34 | 'sphinx_gallery.gen_gallery',
35 | 'docxbuilder',
36 | ]
37 |
38 | # Add any paths that contain templates here, relative to this directory.
39 | templates_path = ['_templates']
40 |
41 | # The suffix(es) of source filenames.
42 | # You can specify multiple suffix as a list of string:
43 | # source_suffix = ['.rst', '.md']
44 | source_suffix = '.rst'
45 |
46 | # The encoding of source files.
47 | #source_encoding = 'utf-8-sig'
48 |
49 | # The master toctree document.
50 | master_doc = 'index'
51 |
52 | # General information about the project.
53 | project = u'Statistics and Machine Learning in Python'
54 | copyright = u'2020, Edouard Duchesnay, NeuroSpin CEA Université Paris-Saclay, France'
55 | author = u'Edouard Duchesnay, Tommy Löfstedt, Younes Feki'
56 |
57 | # The version info for the project you're documenting, acts as replacement for
58 | # |version| and |release|, also used in various other places throughout the
59 | # built documents.
60 | #
61 | # The short X.Y version.
62 | version = '0.5'
63 | # The full version, including alpha/beta/rc tags.
64 | release = '0.5'
65 |
66 | # The language for content autogenerated by Sphinx. Refer to documentation
67 | # for a list of supported languages.
68 | #
69 | # This is also used if you do content translation via gettext catalogs.
70 | # Usually you set "language" from the command line for these cases.
71 | language = None
72 |
73 | # There are two options for replacing |today|: either, you set today to some
74 | # non-false value, then it is used:
75 | #today = ''
76 | # Else, today_fmt is used as the format for a strftime call.
77 | #today_fmt = '%B %d, %Y'
78 |
79 | # List of patterns, relative to source directory, that match files and
80 | # directories to ignore when looking for source files.
81 | exclude_patterns = ["notebooks/notebooks"]
82 |
83 | # The reST default role (used for this markup: `text`) to use for all
84 | # documents.
85 | #default_role = None
86 |
87 | # If true, '()' will be appended to :func: etc. cross-reference text.
88 | #add_function_parentheses = True
89 |
90 | # If true, the current module name will be prepended to all description
91 | # unit titles (such as .. function::).
92 | #add_module_names = True
93 |
94 | # If true, sectionauthor and moduleauthor directives will be shown in the
95 | # output. They are ignored by default.
96 | #show_authors = False
97 |
98 | # The name of the Pygments (syntax highlighting) style to use.
99 | pygments_style = 'sphinx'
100 |
101 | # A list of ignored prefixes for module index sorting.
102 | #modindex_common_prefix = []
103 |
104 | # If true, keep warnings as "system message" paragraphs in the built documents.
105 | keep_warnings = False
106 |
107 | # If true, `todo` and `todoList` produce output, else they produce nothing.
108 | todo_include_todos = False
109 |
110 |
111 | # -- Options for HTML output ----------------------------------------------
112 |
113 | # The theme to use for HTML and HTML Help pages. See the documentation for
114 | # a list of builtin themes.
115 | html_theme = 'alabaster'
116 |
117 | # Theme options are theme-specific and customize the look and feel of a theme
118 | # further. For a list of options available for each theme, see the
119 | # documentation.
120 | #html_theme_options = {}
121 |
122 | # Add any paths that contain custom themes here, relative to this directory.
123 | #html_theme_path = []
124 |
125 | # The name for this set of Sphinx documents. If None, it defaults to
126 | # " v documentation".
127 | #html_title = None
128 |
129 | # A shorter title for the navigation bar. Default is the same as html_title.
130 | #html_short_title = None
131 |
132 | # The name of an image file (relative to this directory) to place at the top
133 | # of the sidebar.
134 | #html_logo = None
135 |
136 | # The name of an image file (within the static path) to use as favicon of the
137 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
138 | # pixels large.
139 | #html_favicon = None
140 |
141 | # Add any paths that contain custom static files (such as style sheets) here,
142 | # relative to this directory. They are copied after the builtin static files,
143 | # so a file named "default.css" will overwrite the builtin "default.css".
144 | html_static_path = ['_static']
145 |
146 | # Add any extra paths that contain custom files (such as robots.txt or
147 | # .htaccess) here, relative to this directory. These files are copied
148 | # directly to the root of the documentation.
149 | #html_extra_path = []
150 |
151 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
152 | # using the given strftime format.
153 | #html_last_updated_fmt = '%b %d, %Y'
154 |
155 | # If true, SmartyPants will be used to convert quotes and dashes to
156 | # typographically correct entities.
157 | #html_use_smartypants = True
158 |
159 | # Custom sidebar templates, maps document names to template names.
160 | #html_sidebars = {}
161 |
162 | # Additional templates that should be rendered to pages, maps page names to
163 | # template names.
164 | #html_additional_pages = {}
165 |
166 | # If false, no module index is generated.
167 | #html_domain_indices = True
168 |
169 | # If false, no index is generated.
170 | #html_use_index = True
171 |
172 | # If true, the index is split into individual pages for each letter.
173 | #html_split_index = False
174 |
175 | # If true, links to the reST sources are added to the pages.
176 | #html_show_sourcelink = True
177 |
178 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
179 | #html_show_sphinx = True
180 |
181 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
182 | #html_show_copyright = True
183 |
184 | # If true, an OpenSearch description file will be output, and all pages will
185 | # contain a tag referring to it. The value of this option must be the
186 | # base URL from which the finished HTML is served.
187 | html_use_opensearch = 'https://duchesnay.github.io/pystatsml/'
188 |
189 | # This is the file name suffix for HTML files (e.g. ".xhtml").
190 | #html_file_suffix = None
191 |
192 | # Language to be used for generating the HTML full-text search index.
193 | # Sphinx supports the following languages:
194 | # 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja'
195 | # 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr'
196 | #html_search_language = 'en'
197 |
198 | # A dictionary with options for the search language support, empty by default.
199 | # Now only 'ja' uses this config value
200 | #html_search_options = {'type': 'default'}
201 |
202 | # The name of a javascript file (relative to the configuration directory) that
203 | # implements a search results scorer. If empty, the default will be used.
204 | #html_search_scorer = 'scorer.js'
205 |
206 | # Output file base name for HTML help builder.
207 | htmlhelp_basename = 'StatisticsMachineLearningPython'
208 |
209 | # -- Options for LaTeX output ---------------------------------------------
210 |
211 | latex_elements = {
212 | # The paper size ('letterpaper' or 'a4paper').
213 | 'papersize': 'a4paper',
214 |
215 | # The font size ('10pt', '11pt' or '12pt').
216 | #'pointsize': '10pt',
217 | 'pointsize': '11pt',
218 | # Additional stuff for the LaTeX preamble.
219 | # 'preamble': '''
220 | # \\usepackage{amsfonts}
221 | # ''',
222 | 'preamble': r'''
223 | \usepackage{charter}
224 | \usepackage[defaultsans]{lato}
225 | \usepackage{inconsolata}
226 | ''',
227 |
228 | # Latex figure (float) alignment
229 | #'figure_align': 'htbp',
230 | }
231 |
232 | # Grouping the document tree into LaTeX files. List of tuples
233 | # (source start file, target name, title,
234 | # author, documentclass [howto, manual, or own class]).
235 | latex_documents = [
236 | (master_doc, 'StatisticsMachineLearningPython.tex', u'Statistics and Machine Learning in Python',
237 | # (master_doc, 'StatisticsMachineLearningPython.tex', u'Python fundamentals and advanced',
238 | u'Edouard Duchesnay, Tommy Löfstedt, Feki Younes', 'manual'),
239 | ]
240 |
241 | # The name of an image file (relative to this directory) to place at the top of
242 | # the title page.
243 | #latex_logo = None
244 |
245 | # For "manual" documents, if this is true, then toplevel headings are parts,
246 | # not chapters.
247 | #latex_use_parts = False
248 |
249 | # If true, show page references after internal links.
250 | #latex_show_pagerefs = False
251 |
252 | # If true, show URL addresses after external links.
253 | # latex_show_urls = True
254 |
255 | # Documents to append as an appendix to all manuals.
256 | #latex_appendices = []
257 |
258 | # If false, no module index is generated.
259 | #latex_domain_indices = True
260 |
261 |
262 | # -- Options for manual page output ---------------------------------------
263 |
264 | # One entry per manual page. List of tuples
265 | # (source start file, name, description, authors, manual section).
266 | man_pages = [
267 | (master_doc, 'statisticsmachinelearning', u'Statistics and Machine Learning in Python',
268 | [author], 1)
269 | ]
270 |
271 | # If true, show URL addresses after external links.
272 | #man_show_urls = False
273 |
274 |
275 | # -- Options for Texinfo output -------------------------------------------
276 |
277 | # Grouping the document tree into Texinfo files. List of tuples
278 | # (source start file, target name, title, author,
279 | # dir menu entry, description, category)
280 | texinfo_documents = [
281 | (master_doc, 'StatisticsMachineLearningPython', u'Statistics and Machine Learning in Python',
282 | author, 'MachineLearning', 'One line description of project.',
283 | 'Miscellaneous'),
284 | ]
285 |
286 | # Documents to append as an appendix to all manuals.
287 | #texinfo_appendices = []
288 |
289 | # If false, no module index is generated.
290 | #texinfo_domain_indices = True
291 |
292 | # How to display URL addresses: 'footnote', 'no', or 'inline'.
293 | #texinfo_show_urls = 'footnote'
294 |
295 | # If true, do not generate a @detailmenu in the "Top" node's menu.
296 | #texinfo_no_detailmenu = False
297 |
298 |
299 | # -- Options for sphinx gallery -------------------------------------------
300 |
301 | sphinx_gallery_conf = {
302 | # path to your examples scripts
303 | 'examples_dirs' : ['python_lang', 'scientific_python', 'statistics', 'machine_learning', 'labs'],
304 | 'filename_pattern': '/',
305 | # path where to save gallery generated examples
306 | 'gallery_dirs' : ['auto_gallery', 'auto_gallery', 'auto_gallery', 'auto_gallery', 'auto_gallery'],
307 | 'backreferences_dir': False}
308 |
309 |
310 |
--------------------------------------------------------------------------------
/machine_learning/decomposition_solutions.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Dimension reduction and feature extraction\n",
8 | "\n",
9 | "## Principal Component Analysis\n",
10 | "\n",
11 | "### Implement PCA\n",
12 | "\n",
13 |     "- Write a class `BasicPCA` with two methods: `fit(X)` that estimates the data mean and principal component directions, and `transform(X)` that projects new data onto the principal components.\n",
14 | "\n",
15 |     "- Check that your `BasicPCA` performs similarly to the one from sklearn:\n",
16 | "`from sklearn.decomposition import PCA`"
17 | ]
18 | },
19 | {
20 | "cell_type": "code",
21 | "execution_count": null,
22 | "metadata": {
23 | "execution": {
24 | "iopub.execute_input": "2020-10-11T22:53:14.585085Z",
25 | "iopub.status.busy": "2020-10-11T22:53:14.584709Z",
26 | "iopub.status.idle": "2020-10-11T22:53:15.274591Z",
27 | "shell.execute_reply": "2020-10-11T22:53:15.274226Z"
28 | }
29 | },
30 | "outputs": [],
31 | "source": [
32 | "import numpy as np\n",
33 | "import scipy\n",
34 | "import matplotlib.pyplot as plt\n",
35 | "import seaborn as sns\n",
36 | "%matplotlib inline\n",
37 | "#%matplotlib qt\n",
38 | "\n",
39 | "np.random.seed(42)\n",
40 | "\n",
41 | "\n",
42 | "import numpy as np\n",
43 | "from sklearn.decomposition import PCA\n",
44 | "\n",
45 | "\n",
46 | "class BasicPCA():\n",
47 | " def fit(self, X):\n",
48 | " # U : Unitary matrix having left singular vectors as columns.\n",
49 | " # Of shape (n_samples,n_samples) or (n_samples,n_comps), depending on\n",
50 | " # full_matrices.\n",
51 | " #\n",
52 | " # s : The singular values, sorted in non-increasing order. Of shape (n_comps,), \n",
53 | " # with n_comps = min(n_samples, n_features).\n",
54 | " #\n",
55 | " # Vh: Unitary matrix having right singular vectors as rows. \n",
56 | " # Of shape (n_features, n_features) or (n_comps, n_features) depending on full_matrices.\n",
57 | " self.mean = X.mean(axis=0)\n",
58 | " Xc = X - self.mean # Centering is required\n",
59 | " U, s, V = scipy.linalg.svd(Xc, full_matrices=False)\n",
60 | " self.explained_variance_ = (s ** 2) / X.shape[0]\n",
61 | " self.explained_variance_ratio_ = (self.explained_variance_ /\n",
62 | " self.explained_variance_.sum())\n",
63 | " self.princ_comp_dir = V\n",
64 | "\n",
65 | " def transform(self, X):\n",
66 | " Xc = X - self.mean\n",
67 | " return(np.dot(Xc, self.princ_comp_dir.T))\n",
68 | "\n",
69 | "# test\n",
70 | "np.random.seed(42)\n",
71 | " \n",
72 | "# dataset\n",
73 | "n_samples = 100\n",
74 | "experience = np.random.normal(size=n_samples)\n",
75 | "salary = 1500 + experience + np.random.normal(size=n_samples, scale=.5)\n",
76 | "X = np.column_stack([experience, salary])\n",
77 | "\n",
79 | "pca = PCA(n_components=2)\n",
80 | "pca.fit(X)\n",
81 | "\n",
82 | "basic_pca = BasicPCA()\n",
83 | "basic_pca.fit(X)\n",
84 | "\n",
85 | "print(pca.explained_variance_ratio_)\n",
86 |     "assert np.allclose(np.abs(basic_pca.transform(X)), np.abs(pca.transform(X)))  # equal up to sign and float precision\n"
87 | ]
88 | },
89 | {
90 | "cell_type": "markdown",
91 | "metadata": {},
92 | "source": [
93 | "### Apply PCA on iris dataset\n",
94 | "\n",
95 |     "Apply sklearn PCA on the `iris` dataset available at: 'https://github.com/duchesnay/pystatsml/raw/master/datasets/iris.csv'."
96 | ]
97 | },
98 | {
99 | "cell_type": "code",
100 | "execution_count": null,
101 | "metadata": {
102 | "execution": {
103 | "iopub.execute_input": "2020-10-11T22:53:15.278801Z",
104 | "iopub.status.busy": "2020-10-11T22:53:15.278467Z",
105 | "iopub.status.idle": "2020-10-11T22:53:16.236441Z",
106 | "shell.execute_reply": "2020-10-11T22:53:16.234869Z"
107 | }
108 | },
109 | "outputs": [],
110 | "source": [
111 | "import matplotlib.pyplot as plt\n",
112 | "\n",
113 | "from sklearn.decomposition import PCA\n",
114 | "# https://tgmstat.wordpress.com/2013/11/28/computing-and-visualizing-pca-in-r/\n",
115 | "\n",
116 | "import numpy as np\n",
117 | "import pandas as pd\n",
118 | "\n",
119 | "try:\n",
120 |     " df = pd.read_csv('datasets/iris.csv')\n",
121 | "except:\n",
122 | " url = 'https://github.com/duchesnay/pystatsml/raw/master/datasets/iris.csv'\n",
123 | " df = pd.read_csv(url)\n",
124 | "\n",
125 | "print(df.head())"
126 | ]
127 | },
128 | {
129 | "cell_type": "markdown",
130 | "metadata": {},
131 | "source": [
132 |     "Describe the dataset. Should the dataset be standardized?"
133 | ]
134 | },
135 | {
136 | "cell_type": "code",
137 | "execution_count": null,
138 | "metadata": {
139 | "execution": {
140 | "iopub.execute_input": "2020-10-11T22:53:16.256201Z",
141 | "iopub.status.busy": "2020-10-11T22:53:16.255386Z",
142 | "iopub.status.idle": "2020-10-11T22:53:16.269795Z",
143 | "shell.execute_reply": "2020-10-11T22:53:16.269211Z"
144 | }
145 | },
146 | "outputs": [],
147 | "source": [
148 | "print(df.describe())"
149 | ]
150 | },
151 | {
152 | "cell_type": "markdown",
153 | "metadata": {},
154 | "source": [
155 | "Describe the structure of correlation among variables."
156 | ]
157 | },
158 | {
159 | "cell_type": "code",
160 | "execution_count": null,
161 | "metadata": {
162 | "execution": {
163 | "iopub.execute_input": "2020-10-11T22:53:16.273240Z",
164 | "iopub.status.busy": "2020-10-11T22:53:16.272789Z",
165 | "iopub.status.idle": "2020-10-11T22:53:16.275060Z",
166 | "shell.execute_reply": "2020-10-11T22:53:16.274585Z"
167 | }
168 | },
169 | "outputs": [],
170 | "source": [
171 | "X = np.array(df.iloc[:, :4])\n",
172 | "#np.around(np.corrcoef(X.T), 3)"
173 | ]
174 | },
175 | {
176 | "cell_type": "code",
177 | "execution_count": null,
178 | "metadata": {
179 | "execution": {
180 | "iopub.execute_input": "2020-10-11T22:53:16.279201Z",
181 | "iopub.status.busy": "2020-10-11T22:53:16.278783Z",
182 | "iopub.status.idle": "2020-10-11T22:53:16.283272Z",
183 | "shell.execute_reply": "2020-10-11T22:53:16.282896Z"
184 | }
185 | },
186 | "outputs": [],
187 | "source": [
188 | "# Center and standardize\n",
189 | "\n",
190 | "X = np.array(df.iloc[:, :4])\n",
191 | "X -= np.mean(X, axis=0)\n",
192 | "X /= np.std(X, axis=0, ddof=1)\n",
193 | "np.around(np.corrcoef(X.T), 3)"
194 | ]
195 | },
196 | {
197 | "cell_type": "markdown",
198 | "metadata": {},
199 | "source": [
200 | "Compute a PCA with the maximum number of components."
201 | ]
202 | },
203 | {
204 | "cell_type": "code",
205 | "execution_count": null,
206 | "metadata": {
207 | "execution": {
208 | "iopub.execute_input": "2020-10-11T22:53:16.286362Z",
209 | "iopub.status.busy": "2020-10-11T22:53:16.285897Z",
210 | "iopub.status.idle": "2020-10-11T22:53:16.288689Z",
211 | "shell.execute_reply": "2020-10-11T22:53:16.288349Z"
212 | }
213 | },
214 | "outputs": [],
215 | "source": [
216 | "pca = PCA(n_components=X.shape[1])\n",
217 | "pca.fit(X)"
218 | ]
219 | },
220 | {
221 | "cell_type": "markdown",
222 | "metadata": {},
223 | "source": [
224 | "Retrieve the explained variance ratio. Determine $K$ the number of components."
225 | ]
226 | },
227 | {
228 | "cell_type": "code",
229 | "execution_count": null,
230 | "metadata": {
231 | "execution": {
232 | "iopub.execute_input": "2020-10-11T22:53:16.291425Z",
233 | "iopub.status.busy": "2020-10-11T22:53:16.291098Z",
234 | "iopub.status.idle": "2020-10-11T22:53:16.293764Z",
235 | "shell.execute_reply": "2020-10-11T22:53:16.294048Z"
236 | }
237 | },
238 | "outputs": [],
239 | "source": [
240 | "print(pca.explained_variance_ratio_)\n",
241 | "\n",
242 | "K = 2\n",
243 | "pca = PCA(n_components=X.shape[1])\n",
244 | "pca.fit(X)\n",
245 | "PC = pca.transform(X)\n",
246 | "#print(PC)"
247 | ]
248 | },
249 | {
250 | "cell_type": "markdown",
251 | "metadata": {},
252 | "source": [
253 |     "Print the $K$ principal component directions and the correlations of the $K$ principal\n",
254 |     "components with the original variables. Interpret the contribution of the original variables\n",
255 |     "to the PCs.\n"
256 | ]
257 | },
258 | {
259 | "cell_type": "code",
260 | "execution_count": null,
261 | "metadata": {
262 | "execution": {
263 | "iopub.execute_input": "2020-10-11T22:53:16.297928Z",
264 | "iopub.status.busy": "2020-10-11T22:53:16.297500Z",
265 | "iopub.status.idle": "2020-10-11T22:53:16.302829Z",
266 | "shell.execute_reply": "2020-10-11T22:53:16.302482Z"
267 | }
268 | },
269 | "outputs": [],
270 | "source": [
271 | "print(pca.components_)\n",
272 | "CorPC = pd.DataFrame(\n",
273 | " [[np.corrcoef(X[:, j], PC[:, k])[0, 1] for j in range(X.shape[1])]\n",
274 | " for k in range(K)],\n",
275 | " columns = df.columns[:4],\n",
276 | " index = [\"PC %i\"%k for k in range(K)]\n",
277 | ")\n",
278 | "\n",
279 | "print(CorPC)"
280 | ]
281 | },
282 | {
283 | "cell_type": "markdown",
284 | "metadata": {},
285 | "source": [
286 |     "Plot samples projected onto the first $K$ PCs. Color samples by their species."
287 | ]
288 | },
289 | {
290 | "cell_type": "code",
291 | "execution_count": null,
292 | "metadata": {
293 | "execution": {
294 | "iopub.execute_input": "2020-10-11T22:53:16.316818Z",
295 | "iopub.status.busy": "2020-10-11T22:53:16.316510Z",
296 | "iopub.status.idle": "2020-10-11T22:53:16.396495Z",
297 | "shell.execute_reply": "2020-10-11T22:53:16.396182Z"
298 | }
299 | },
300 | "outputs": [],
301 | "source": [
302 | "colors = {'setosa':'r', 'versicolor':'g', 'virginica':'blue'}\n",
303 | "print(df[\"species\"].unique())\n",
304 | "#plt.scatter(df['experience'], df['salary'], c=df['education'].apply(lambda x: colors[x]), s=100)\n",
305 | "plt.scatter(PC[:, 0], PC[:, 1], c=df[\"species\"].apply(lambda x: colors[x]))\n",
306 | "plt.xlabel(\"PC1\")\n",
307 | "plt.ylabel(\"PC2\")"
308 | ]
309 | },
310 | {
311 | "cell_type": "markdown",
312 | "metadata": {},
313 | "source": [
314 |     "Pairwise plot"
315 | ]
316 | },
317 | {
318 | "cell_type": "code",
319 | "execution_count": null,
320 | "metadata": {
321 | "execution": {
322 | "iopub.execute_input": "2020-10-11T22:53:16.442119Z",
323 | "iopub.status.busy": "2020-10-11T22:53:16.441495Z",
324 | "iopub.status.idle": "2020-10-11T22:53:23.105722Z",
325 | "shell.execute_reply": "2020-10-11T22:53:23.106018Z"
326 | }
327 | },
328 | "outputs": [],
329 | "source": [
330 | "import seaborn as sns\n",
331 | "\n",
332 | "df[\"PC1\"] = PC[:, 0]\n",
333 | "df[\"PC2\"] = PC[:, 1]\n",
334 | "\n",
335 | "ax = sns.pairplot(df, hue=\"species\")"
336 | ]
337 | },
338 | {
339 | "cell_type": "code",
340 | "execution_count": null,
341 | "metadata": {},
342 | "outputs": [],
343 | "source": []
344 | }
345 | ],
346 | "metadata": {
347 | "anaconda-cloud": {},
348 | "kernelspec": {
349 | "display_name": "Python 3",
350 | "language": "python",
351 | "name": "python3"
352 | },
353 | "language_info": {
354 | "codemirror_mode": {
355 | "name": "ipython",
356 | "version": 3
357 | },
358 | "file_extension": ".py",
359 | "mimetype": "text/x-python",
360 | "name": "python",
361 | "nbconvert_exporter": "python",
362 | "pygments_lexer": "ipython3",
363 | "version": "3.7.9"
364 | }
365 | },
366 | "nbformat": 4,
367 | "nbformat_minor": 2
368 | }
369 |
--------------------------------------------------------------------------------
/scientific_python/scipy_matplotlib.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Data visualization: matplotlib & seaborn \n",
8 | "\n",
9 | "\n",
10 | "## Basic plots"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": null,
16 | "metadata": {
17 | "execution": {
18 | "iopub.execute_input": "2020-10-11T22:54:06.283262Z",
19 | "iopub.status.busy": "2020-10-11T22:54:06.281496Z",
20 | "iopub.status.idle": "2020-10-11T22:54:06.619890Z",
21 | "shell.execute_reply": "2020-10-11T22:54:06.619484Z"
22 | }
23 | },
24 | "outputs": [],
25 | "source": [
26 | "import numpy as np\n",
27 | "import matplotlib.pyplot as plt\n",
28 | "import seaborn as sns\n",
29 | "\n",
30 | "# inline plot (for jupyter)\n",
31 | "%matplotlib inline\n",
32 | "\n",
33 | "plt.figure(figsize=(9, 3))\n",
34 | "x = np.linspace(0, 10, 50)\n",
35 | "sinus = np.sin(x)\n",
36 | "\n",
37 | "plt.plot(x, sinus)\n",
38 | "plt.show()"
39 | ]
40 | },
41 | {
42 | "cell_type": "code",
43 | "execution_count": null,
44 | "metadata": {
45 | "execution": {
46 | "iopub.execute_input": "2020-10-11T22:54:06.631218Z",
47 | "iopub.status.busy": "2020-10-11T22:54:06.630138Z",
48 | "iopub.status.idle": "2020-10-11T22:54:06.715538Z",
49 | "shell.execute_reply": "2020-10-11T22:54:06.715894Z"
50 | }
51 | },
52 | "outputs": [],
53 | "source": [
54 | "plt.figure(figsize=(9, 3))\n",
55 | "\n",
56 | "plt.plot(x, sinus, \"o\")\n",
57 | "plt.show()\n",
58 |     "# see help(plt.plot) for color / marker abbreviations"
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": null,
64 | "metadata": {
65 | "execution": {
66 | "iopub.execute_input": "2020-10-11T22:54:06.728139Z",
67 | "iopub.status.busy": "2020-10-11T22:54:06.727746Z",
68 | "iopub.status.idle": "2020-10-11T22:54:06.834198Z",
69 | "shell.execute_reply": "2020-10-11T22:54:06.833848Z"
70 | }
71 | },
72 | "outputs": [],
73 | "source": [
74 | "# Rapid multiplot\n",
75 | "\n",
76 | "plt.figure(figsize=(9, 3))\n",
77 | "cosinus = np.cos(x)\n",
78 | "plt.plot(x, sinus, \"-b\", x, sinus, \"ob\", x, cosinus, \"-r\", x, cosinus, \"or\")\n",
79 | "plt.xlabel('this is x!')\n",
80 | "plt.ylabel('this is y!')\n",
81 | "plt.title('My First Plot')\n",
82 | "plt.show()"
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": null,
88 | "metadata": {
89 | "execution": {
90 | "iopub.execute_input": "2020-10-11T22:54:06.847651Z",
91 | "iopub.status.busy": "2020-10-11T22:54:06.846622Z",
92 | "iopub.status.idle": "2020-10-11T22:54:06.953662Z",
93 | "shell.execute_reply": "2020-10-11T22:54:06.953293Z"
94 | }
95 | },
96 | "outputs": [],
97 | "source": [
98 | "# Step by step\n",
99 | "\n",
100 | "plt.figure(figsize=(9, 3))\n",
101 | "plt.plot(x, sinus, label='sinus', color='blue', linestyle='--', linewidth=2)\n",
102 | "plt.plot(x, cosinus, label='cosinus', color='red', linestyle='-', linewidth=2)\n",
103 | "plt.legend()\n",
104 | "plt.show()"
105 | ]
106 | },
107 | {
108 | "cell_type": "markdown",
109 | "metadata": {},
110 | "source": [
111 | "## Scatter (2D) plots\n",
112 | "\n",
113 | "Load dataset"
114 | ]
115 | },
116 | {
117 | "cell_type": "code",
118 | "execution_count": null,
119 | "metadata": {
120 | "execution": {
121 | "iopub.execute_input": "2020-10-11T22:54:06.956572Z",
122 | "iopub.status.busy": "2020-10-11T22:54:06.956237Z",
123 | "iopub.status.idle": "2020-10-11T22:54:07.103716Z",
124 | "shell.execute_reply": "2020-10-11T22:54:07.103342Z"
125 | }
126 | },
127 | "outputs": [],
128 | "source": [
129 | "import pandas as pd\n",
130 | "try:\n",
131 | " salary = pd.read_csv(\"../datasets/salary_table.csv\")\n",
132 | "except:\n",
133 | " url = 'https://github.com/duchesnay/pystatsml/raw/master/datasets/salary_table.csv'\n",
134 | " salary = pd.read_csv(url)\n",
135 | "\n",
136 | "df = salary\n",
137 | "print(df.head())"
138 | ]
139 | },
140 | {
141 | "cell_type": "markdown",
142 | "metadata": {},
143 | "source": [
144 | "### Simple scatter with colors"
145 | ]
146 | },
147 | {
148 | "cell_type": "code",
149 | "execution_count": null,
150 | "metadata": {},
151 | "outputs": [],
152 | "source": [
153 | "plt.figure(figsize=(3, 3), dpi=100)\n",
154 | "_ = sns.scatterplot(x=\"experience\", y=\"salary\", hue=\"education\", data=salary)"
155 | ]
156 | },
157 | {
158 | "cell_type": "markdown",
159 | "metadata": {},
160 | "source": [
161 | "Legend outside"
162 | ]
163 | },
164 | {
165 | "cell_type": "code",
166 | "execution_count": null,
167 | "metadata": {},
168 | "outputs": [],
169 | "source": [
170 | "ax = sns.relplot(x=\"experience\", y=\"salary\", hue=\"education\", data=salary)"
171 | ]
172 | },
173 | {
174 | "cell_type": "markdown",
175 | "metadata": {},
176 | "source": [
177 | "### Linear model"
178 | ]
179 | },
180 | {
181 | "cell_type": "code",
182 | "execution_count": null,
183 | "metadata": {},
184 | "outputs": [],
185 | "source": [
186 | "ax = sns.lmplot(x=\"experience\", y=\"salary\", hue=\"education\", data=salary)"
187 | ]
188 | },
189 | {
190 | "cell_type": "markdown",
191 | "metadata": {},
192 | "source": [
193 | "### Scatter plot with colors and symbols"
194 | ]
195 | },
196 | {
197 | "cell_type": "code",
198 | "execution_count": null,
199 | "metadata": {},
200 | "outputs": [],
201 | "source": [
202 | "ax = sns.relplot(x=\"experience\", y=\"salary\", hue=\"education\", style='management', data=salary)"
203 | ]
204 | },
205 | {
206 | "cell_type": "markdown",
207 | "metadata": {},
208 | "source": [
209 | "## Saving Figures"
210 | ]
211 | },
212 | {
213 | "cell_type": "code",
214 | "execution_count": null,
215 | "metadata": {
216 | "execution": {
217 | "iopub.execute_input": "2020-10-11T22:54:07.420427Z",
218 | "iopub.status.busy": "2020-10-11T22:54:07.419445Z",
219 | "iopub.status.idle": "2020-10-11T22:54:07.649956Z",
220 | "shell.execute_reply": "2020-10-11T22:54:07.649633Z"
221 | }
222 | },
223 | "outputs": [],
224 | "source": [
225 | "### bitmap format\n",
226 | "plt.plot(x, sinus)\n",
227 | "plt.savefig(\"sinus.png\")\n",
228 | "plt.close()\n",
229 | "\n",
230 | "# Prefer vectorial format (SVG: Scalable Vector Graphics) can be edited with \n",
231 | "# Inkscape, Adobe Illustrator, Blender, etc.\n",
232 | "plt.plot(x, sinus)\n",
233 | "plt.savefig(\"sinus.svg\")\n",
234 | "plt.close()\n",
235 | "\n",
236 | "# Or pdf\n",
237 | "plt.plot(x, sinus)\n",
238 | "plt.savefig(\"sinus.pdf\")\n",
239 | "plt.close()"
240 | ]
241 | },
242 | {
243 | "cell_type": "markdown",
244 | "metadata": {},
245 | "source": [
246 | "### Boxplot and violin plot: one factor\n",
247 | "\n",
248 |     "Box plots are non-parametric: they display variation in samples of a statistical population without making any assumptions of the underlying statistical distribution."
251 | ]
252 | },
253 | {
254 | "cell_type": "code",
255 | "execution_count": null,
256 | "metadata": {},
257 | "outputs": [],
258 | "source": [
259 | "ax = sns.boxplot(x=\"management\", y=\"salary\", data=salary)\n",
260 | "ax = sns.stripplot(x=\"management\", y=\"salary\", data=salary, jitter=True, color=\"black\")"
261 | ]
262 | },
263 | {
264 | "cell_type": "code",
265 | "execution_count": null,
266 | "metadata": {},
267 | "outputs": [],
268 | "source": [
269 | "ax = sns.violinplot(x=\"management\", y=\"salary\", data=salary)\n",
270 | "ax = sns.stripplot(x=\"management\", y=\"salary\", data=salary, jitter=True, color=\"white\")"
271 | ]
272 | },
273 | {
274 | "cell_type": "markdown",
275 | "metadata": {},
276 | "source": [
277 | "### Boxplot and violin plot: two factors"
278 | ]
279 | },
280 | {
281 | "cell_type": "code",
282 | "execution_count": null,
283 | "metadata": {},
284 | "outputs": [],
285 | "source": [
286 | "ax = sns.boxplot(x=\"management\", y=\"salary\", hue=\"education\", data=salary)\n",
287 | "ax = sns.stripplot(x=\"management\", y=\"salary\", hue=\"education\", data=salary, jitter=True, dodge=True, linewidth=1)"
288 | ]
289 | },
290 | {
291 | "cell_type": "code",
292 | "execution_count": null,
293 | "metadata": {
294 | "execution": {
295 | "iopub.execute_input": "2020-10-11T22:54:07.652516Z",
296 | "iopub.status.busy": "2020-10-11T22:54:07.652175Z",
297 | "iopub.status.idle": "2020-10-11T22:54:08.055323Z",
298 | "shell.execute_reply": "2020-10-11T22:54:08.054906Z"
299 | }
300 | },
301 | "outputs": [],
302 | "source": [
303 | "ax = sns.violinplot(x=\"management\", y=\"salary\", hue=\"education\", data=salary)\n",
304 | "ax = sns.stripplot(x=\"management\", y=\"salary\", hue=\"education\", data=salary, jitter=True, dodge=True, linewidth=1)"
305 | ]
306 | },
307 | {
308 | "cell_type": "markdown",
309 | "metadata": {},
310 | "source": [
311 | "### Distributions and density plot\n",
312 | "\n",
313 | "[Distributions with seaborn](https://seaborn.pydata.org/tutorial/distributions.html)\n"
314 | ]
315 | },
316 | {
317 | "cell_type": "code",
318 | "execution_count": null,
319 | "metadata": {},
320 | "outputs": [],
321 | "source": [
322 | "ax = sns.displot(x=\"salary\", hue=\"management\", kind=\"kde\", data=salary, fill=True)"
323 | ]
324 | },
325 | {
326 | "cell_type": "markdown",
327 | "metadata": {},
328 | "source": [
329 | "## Multiple axis"
330 | ]
331 | },
332 | {
333 | "cell_type": "code",
334 | "execution_count": null,
335 | "metadata": {},
336 | "outputs": [],
337 | "source": [
338 | "fig, axes = plt.subplots(3, 1, figsize=(9, 9), sharex=True)\n",
339 | "\n",
340 | "i = 0\n",
341 | "for edu, d in salary.groupby(['education']):\n",
342 | " sns.kdeplot(x=\"salary\", hue=\"management\", data=d, fill=True, ax=axes[i], palette=\"muted\")\n",
343 | " axes[i].set_title(edu)\n",
344 | " i += 1"
345 | ]
346 | },
347 | {
348 | "cell_type": "markdown",
349 | "metadata": {},
350 | "source": [
351 | "## Pairwise scatter plots"
352 | ]
353 | },
354 | {
355 | "cell_type": "code",
356 | "execution_count": null,
357 | "metadata": {},
358 | "outputs": [],
359 | "source": [
360 | "ax = sns.pairplot(salary, hue=\"management\")"
361 | ]
362 | },
363 | {
364 | "cell_type": "markdown",
365 | "metadata": {},
366 | "source": [
367 | "## Time series"
368 | ]
369 | },
370 | {
371 | "cell_type": "code",
372 | "execution_count": null,
373 | "metadata": {
374 | "execution": {
375 | "iopub.execute_input": "2020-10-11T22:54:10.349932Z",
376 | "iopub.status.busy": "2020-10-11T22:54:10.349585Z",
377 | "iopub.status.idle": "2020-10-11T22:54:11.426751Z",
378 | "shell.execute_reply": "2020-10-11T22:54:11.426337Z"
379 | }
380 | },
381 | "outputs": [],
382 | "source": [
383 | "import seaborn as sns\n",
384 | "sns.set(style=\"darkgrid\")\n",
385 | "\n",
386 | "# Load an example dataset with long-form data\n",
387 | "fmri = sns.load_dataset(\"fmri\")\n",
388 | "\n",
389 | "# Plot the responses for different events and regions\n",
390 |     "# pointplot does not accept a style parameter; lineplot does\n",
391 |     "ax = sns.lineplot(x=\"timepoint\", y=\"signal\",\n",
392 |     "                  hue=\"region\", style=\"event\",\n",
393 |     "                  data=fmri)"
393 | ]
394 | }
395 | ],
396 | "metadata": {
397 | "anaconda-cloud": {},
398 | "kernelspec": {
399 | "display_name": "Python 3",
400 | "language": "python",
401 | "name": "python3"
402 | },
403 | "language_info": {
404 | "codemirror_mode": {
405 | "name": "ipython",
406 | "version": 3
407 | },
408 | "file_extension": ".py",
409 | "mimetype": "text/x-python",
410 | "name": "python",
411 | "nbconvert_exporter": "python",
412 | "pygments_lexer": "ipython3",
413 | "version": "3.7.9"
414 | }
415 | },
416 | "nbformat": 4,
417 | "nbformat_minor": 2
418 | }
419 |
--------------------------------------------------------------------------------
/utils/time_series.py:
--------------------------------------------------------------------------------
1 | '''
2 | # Time Series in python
3 |
4 | Two libraries:
5 |
6 | - Pandas: https://pandas.pydata.org/pandas-docs/stable/timeseries.html
7 | - statsmodels: http://www.statsmodels.org/devel/tsa.html
8 | '''
9 |
10 | '''
11 | ## Stationarity
12 |
13 | A TS is said to be stationary if its statistical properties such as mean, variance remain constant over time.
14 |
15 | - constant mean
16 | - constant variance
17 | - an autocovariance that does not depend on time.
18 |
19 | What makes a TS non-stationary? There are two major reasons behind non-stationarity of a TS:
20 | 
21 | 1. Trend – varying mean over time. For example, the number of airline passengers typically grows over time on average.
22 | 
23 | 2. Seasonality – variations at specific time-frames, e.g., people might have a tendency to buy cars in a particular month because of pay increments or festivals.
24 | 
25 | A quick way to check stationarity, the augmented Dickey-Fuller test, is sketched below.
24 | '''
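
'''
A quick sketch of the stationarity check mentioned above (not in the original script):
the augmented Dickey-Fuller (ADF) test from statsmodels, whose null hypothesis is that
the series has a unit root (i.e., is non-stationary). Toy data are used here.
'''

import numpy as np
from statsmodels.tsa.stattools import adfuller

rng = np.random.RandomState(42)
white_noise = rng.normal(size=200)     # stationary by construction
random_walk = np.cumsum(white_noise)   # non-stationary (unit root)

for name, series in [("white noise", white_noise), ("random walk", random_walk)]:
    stat, pvalue = adfuller(series)[:2]
    print("%s: ADF statistic=%.2f, p-value=%.3f" % (name, stat, pvalue))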
25 |
26 | '''
27 | ## Pandas Time Series Data Structure
28 |
29 | A Series is similar to a list or an array in Python.
30 | It represents a series of values (numeric or otherwise) such as a column of data.
31 | It provides additional functionality, methods, and operators, which make it a more powerful version of a list.
32 | '''
33 |
34 | import pandas as pd
35 | import numpy as np
36 |
37 | # Create a Series from a list
38 | ser = pd.Series([1, 3])
39 | print(ser)
40 |
41 | # String as index
42 | prices = {'apple': 4.99,
43 | 'banana': 1.99,
44 | 'orange': 3.99}
45 | ser = pd.Series(prices)
46 | print(ser)
47 |
48 | x = pd.Series(np.arange(1,3), index=[x for x in 'ab'])
49 | print(x)
50 | print(x['b'])
51 |
52 | '''
53 | ## Time Series Analysis of Google Trends
54 |
55 | source: https://www.datacamp.com/community/tutorials/time-series-analysis-tutorial
56 |
57 | Get Google Trends data of keywords such as 'diet' and 'gym' and see how they vary over time while learning about trends and seasonality in time series data.
58 |
59 | In the Facebook Live code along session on the 4th of January, we checked out Google trends data of keywords 'diet', 'gym' and 'finance' to see how they vary over time. We asked ourselves if there could be more searches for these terms in January when we're all trying to turn over a new leaf?
60 |
61 | In this tutorial, you'll go through the code that we put together during the session step by step. You're not going to do much mathematics but you are going to do the following:
62 |
63 | - Read data
64 | - Recode data
65 | - Exploratory Data Analysis
66 |
67 | '''
68 |
69 |
70 | '''
71 | ## Read data
72 | '''
73 |
74 | import numpy as np
75 | import pandas as pd
76 | import matplotlib.pyplot as plt
77 | import seaborn as sns
78 |
79 | # Plot appears on its own windows
80 | # %matplotlib qt  (IPython magic; only valid in an IPython/Jupyter session)
81 | # Tools / Preferences / Ipython Console / Graphics / Graphics Backend / Backend: “automatic”
82 | # Interactive Matplotlib Jupyter Notebook
83 | # %matplotlib inline
84 |
85 | try:
86 | url = "https://raw.githubusercontent.com/datacamp/datacamp_facebook_live_ny_resolution/master/data/multiTimeline.csv"
87 | df = pd.read_csv(url, skiprows=2)
88 | except:
89 | df = pd.read_csv("../data/multiTimeline.csv", skiprows=2)
90 |
91 | print(df.head())
92 |
93 | # Rename columns
94 | df.columns = ['month', 'diet', 'gym', 'finance']
95 |
96 | # Describe
97 | print(df.describe())
98 |
99 | '''
100 | ## Recode data
101 |
102 | Next, you'll turn the 'month' column into a DateTime data type and make it the index of the DataFrame.
103 |
104 | Note that you do this because you saw in the result of the .info() method that the 'Month' column was actually of data type object. That generic data type encapsulates everything from strings to integers, etc., which is not what you want when working with time series data. That's why you'll use .to_datetime() to convert the 'month' column in your DataFrame to a DateTime.
105 |
106 | Be careful! Make sure to include the inplace argument when you're setting the index of the DataFrame df so that you actually alter the original index and set it to the 'month' column.
107 | '''
108 | df.month = pd.to_datetime(df.month)
109 | df.set_index('month', inplace=True)
110 |
111 | print(df.head())
112 |
113 | '''
114 | ## Exploratory Data Analysis
115 |
116 | You can use a built-in pandas visualization method .plot() to plot your
117 | data as 3 line plots on a single
118 | figure (one for each column, namely, 'diet', 'gym', and 'finance').
119 | '''
120 | df.plot()
121 | plt.xlabel('Year');
122 |
123 | # change figure parameters
124 | # df.plot(figsize=(20,10), linewidth=5, fontsize=20)
125 |
126 | # Plot single column
127 | # df[['diet']].plot(figsize=(20,10), linewidth=5, fontsize=20)
128 | # plt.xlabel('Year', fontsize=20);
129 |
130 | '''
131 | Note that this data is relative. As you can read on Google trends:
132 |
133 | Numbers represent search interest relative to the highest point on the chart
134 | for the given region and time.
135 | A value of 100 is the peak popularity for the term.
136 | A value of 50 means that the term is half as popular.
137 | Likewise a score of 0 means the term was less than 1% as popular as the peak.
138 |
139 | '''
140 |
141 |
142 | '''
143 | ## Resampling, Smoothing, Windowing, Rolling average: Trends
144 |
145 | Rolling average: for each time point, take the average of the points on either side of it.
146 | Note that the number of points is specified by a window size.
147 |
148 | Remove Seasonality with pandas Series.
149 |
150 | See: http://pandas.pydata.org/pandas-docs/stable/timeseries.html
151 | 'A': year-end (annual) frequency
152 | '''
153 | diet = df['diet']
154 |
155 | diet_resamp_yr = diet.resample('A').mean()
156 | diet_roll_yr = diet.rolling(12).mean()
157 |
158 | ax = diet.plot(alpha=0.5, style='-') # store axis (ax) for later plots
159 | diet_resamp_yr.plot(style=':', label='Resample at year frequency', ax=ax)
160 | diet_roll_yr.plot(style='--', label='Rolling average (smooth), window size=12', ax=ax)
161 | ax.legend()
162 |
163 |
164 | '''
165 | Rolling average (smoothing) with Numpy
166 | '''
167 |
168 | x = np.asarray(df[['diet']])
169 | win = 12
170 | win_half = int(win / 2)
171 | # print([((idx-win_half), (idx+win_half)) for idx in np.arange(win_half, len(x))])
172 |
173 | diet_smooth = np.array([x[(idx-win_half):(idx+win_half)].mean() for idx in np.arange(win_half, len(x))])
174 | plt.plot(diet_smooth)
175 |
176 | '''
177 | Trends Plot Diet and Gym
178 |
179 | Build a new DataFrame which is the concatenation of the smoothed diet and gym data
180 | '''
181 | gym = df['gym']
182 |
183 | df_avg = pd.concat([diet.rolling(12).mean(), gym.rolling(12).mean()], axis=1)
184 | df_avg.plot()
185 | plt.xlabel('Year')
186 |
187 | '''
188 | Detrending
189 | '''
190 |
191 | df_dtrend = df[["diet", "gym"]] - df_avg
192 | df_dtrend.plot()
193 | plt.xlabel('Year')
194 |
195 | '''
196 | ## First-order differencing: Seasonal Patterns
197 |
198 | '''
199 |
200 | # diff = original - shifted data
201 | # (exclude first term for some implementation details)
202 | assert np.all((diet.diff() == diet - diet.shift())[1:])
203 |
204 | df.diff().plot()
205 | plt.xlabel('Year')
206 |
207 | '''
208 | ## Periodicity and Correlation
209 | '''
210 |
211 | df.plot()
212 | plt.xlabel('Year');
213 | print(df.corr())
214 |
215 | '''
216 | Plot correlation matrix
217 | '''
218 |
219 | sns.heatmap(df.corr(), cmap="coolwarm")
220 |
221 |
222 | '''
223 | 'diet' and 'gym' are negatively correlated!
224 | Remember that you have a seasonal and a trend component.
225 | From the correlation coefficient, 'diet' and 'gym' are negatively correlated:
226 |
227 | - trend components are negatively correlated.
228 | - seasonal components would be positively correlated.
229 | 
230 | The actual correlation coefficient captures both of these effects.
231 |
232 | Seasonal correlation: correlation of the first-order differences of these time series
233 | '''
234 |
235 | df.diff().plot()
236 | plt.xlabel('Year');
237 |
238 | print(df.diff().corr())
239 |
240 | '''
241 | Plot correlation matrix
242 | '''
243 |
244 | sns.heatmap(df.diff().corr(), cmap="coolwarm")
245 |
246 | '''
247 | Decomposing a time series into trend, seasonality and residuals
248 | '''
249 |
250 | from statsmodels.tsa.seasonal import seasonal_decompose
251 |
252 | x = gym
253 |
254 | x = x.astype(float) # force float
255 | decomposition = seasonal_decompose(x)
256 | trend = decomposition.trend
257 | seasonal = decomposition.seasonal
258 | residual = decomposition.resid
259 |
260 | plt.subplot(411)
261 | plt.plot(x, label='Original')
262 | plt.legend(loc='best')
263 | plt.subplot(412)
264 | plt.plot(trend, label='Trend')
265 | plt.legend(loc='best')
266 | plt.subplot(413)
267 | plt.plot(seasonal,label='Seasonality')
268 | plt.legend(loc='best')
269 | plt.subplot(414)
270 | plt.plot(residual, label='Residuals')
271 | plt.legend(loc='best')
272 | plt.tight_layout()
273 |
274 |
275 | '''
276 | ## Autocorrelation
277 |
278 | A time series is periodic if it repeats itself at equally spaced intervals, say, every 12 months.
279 | Autocorrelation Function (ACF): a measure of the correlation between the TS and a
280 | lagged version of itself. For instance, at lag 5 the ACF would compare the series at time instants t1...t2
281 | with the series at instants t1-5...t2-5 (t1-5 and t2-5 being the end points).
282 |
283 | Plot
284 | '''
285 | from pandas.plotting import autocorrelation_plot
286 | # from pandas.tools.plotting import autocorrelation_plot  # older pandas (< 0.25)
287 |
288 | x = df["diet"].astype(float)
289 | autocorrelation_plot(x)
290 |
291 | '''
292 | Compute Autocorrelation Function (ACF)
293 | '''
294 |
295 | from statsmodels.tsa.stattools import acf
296 |
297 | x_diff = x.diff().dropna() # first item is NA
298 | lag_acf = acf(x_diff, nlags=36)
299 | plt.plot(lag_acf)
300 | plt.title('Autocorrelation Function')
301 |
302 | '''
303 | ACF peaks every 12 months: Time series is correlated with itself shifted by 12 months.
304 | '''
305 |
306 | '''
307 | ## Time Series Forecasting with Python using Autoregressive Moving Average (ARMA) models
308 |
309 | Source:
310 |
311 | - https://www.packtpub.com/mapt/book/big_data_and_business_intelligence/9781783553358/7/ch07lvl1sec77/arma-models
312 |
313 | - http://en.wikipedia.org/wiki/Autoregressive%E2%80%93moving-average_model
314 |
315 | - ARIMA: https://www.analyticsvidhya.com/blog/2016/02/time-series-forecasting-codes-python/
316 |
317 | ARMA models are often used to forecast a time series.
318 | These models combine autoregressive and moving average models.
319 | In moving average models, we assume that a variable is the sum of the mean of the
320 | time series and a linear combination of noise components.
321 |
322 | The autoregressive and moving average models can have different orders. In general, we can define an ARMA model with p autoregressive terms and q moving average terms as follows:
323 |
324 | $$
325 | x_t = \sum_{i=1}^p a_i x_{t-i} + \sum_{i=1}^q b_i \varepsilon_{t-i} + \varepsilon_t
326 | $$
327 | '''
328 |
329 | '''
330 | ### Choosing p and q
331 |
332 | Plot the partial autocorrelation functions for an estimate of p, and likewise using the autocorrelation functions for an estimate of q.
333 |
334 | Partial Autocorrelation Function (PACF): This measures the correlation between the TS with a lagged version of itself but after eliminating the variations already explained by the intervening comparisons. Eg at lag 5, it will check the correlation but remove the effects already explained by lags 1 to 4.
335 | '''
336 | from statsmodels.tsa.stattools import acf, pacf
337 |
338 | x = df["gym"].astype(float)
339 |
340 | x_diff = x.diff().dropna() # first item is NA
341 | # ACF and PACF plots:
342 |
343 | lag_acf = acf(x_diff, nlags=20)
344 | lag_pacf = pacf(x_diff, nlags=20, method='ols')
345 |
346 | #Plot ACF:
347 | plt.subplot(121)
348 | plt.plot(lag_acf)
349 | plt.axhline(y=0,linestyle='--',color='gray')
350 | plt.axhline(y=-1.96/np.sqrt(len(x_diff)),linestyle='--',color='gray')
351 | plt.axhline(y=1.96/np.sqrt(len(x_diff)),linestyle='--',color='gray')
352 | plt.title('Autocorrelation Function (q=1)')
353 |
354 | #Plot PACF:
355 | plt.subplot(122)
356 | plt.plot(lag_pacf)
357 | plt.axhline(y=0,linestyle='--',color='gray')
358 | plt.axhline(y=-1.96/np.sqrt(len(x_diff)),linestyle='--',color='gray')
359 | plt.axhline(y=1.96/np.sqrt(len(x_diff)),linestyle='--',color='gray')
360 | plt.title('Partial Autocorrelation Function (p=1)')
361 | plt.tight_layout()
362 |
363 | '''
364 | In this plot, the two dotted lines on either side of 0 are the confidence intervals.
365 | These can be used to determine the p and q values as follows:
366 |
367 | - p: The lag value where the PACF chart crosses the upper confidence interval for the first time, in this case p=1.
368 |
369 | - q: The lag value where the ACF chart crosses the upper confidence interval for the first time, in this case q=1.
370 | '''
371 |
372 | '''
373 | ### Fit ARMA model with statsmodels
374 |
375 | 1. Define the model by calling `ARMA()` and passing in the p and q parameters.
376 |
377 | 2. The model is prepared on the training data by calling the `fit()` function.
378 |
379 | 3. Predictions can be made by calling the `predict()` function and specifying the index of the time or times to be predicted.
380 | '''
381 |
382 | from statsmodels.tsa.arima_model import ARMA  # removed in statsmodels >= 0.13; use statsmodels.tsa.arima.model.ARIMA(x, order=(p, 0, q)) instead
383 |
384 |
385 | model = ARMA(x, order=(1,1)).fit() # fit model
386 |
387 | print(model.summary())
388 | plt.plot(x)
389 | plt.plot(model.predict(), color='red')
390 | plt.title('RSS: %.4f'% sum((model.fittedvalues-x)**2))
--------------------------------------------------------------------------------