├── .gitignore
├── LICENSE
├── README.md
├── SimpleImageDataset
    ├── building00.jpg
    ├── building01.jpg
    ├── building02.jpg
    ├── building03.jpg
    ├── building04.jpg
    ├── building05.jpg
    ├── building06.jpg
    ├── building07.jpg
    ├── building08.jpg
    ├── building09.jpg
    ├── building10.jpg
    ├── building11.jpg
    ├── building12.jpg
    ├── building13.jpg
    ├── building14.jpg
    ├── building15.jpg
    ├── building16.jpg
    ├── building17.jpg
    ├── building18.jpg
    ├── building19.jpg
    ├── building20.jpg
    ├── building21.jpg
    ├── building22.jpg
    ├── building23.jpg
    ├── building24.jpg
    ├── building25.jpg
    ├── building26.jpg
    ├── building27.jpg
    ├── building28.jpg
    ├── building29.jpg
    ├── scene00.jpg
    ├── scene01.jpg
    ├── scene02.jpg
    ├── scene03.jpg
    ├── scene04.jpg
    ├── scene05.jpg
    ├── scene06.jpg
    ├── scene07.jpg
    ├── scene08.jpg
    ├── scene09.jpg
    ├── scene10.jpg
    ├── scene11.jpg
    ├── scene12.jpg
    ├── scene13.jpg
    ├── scene14.jpg
    ├── scene15.jpg
    ├── scene16.jpg
    ├── scene17.jpg
    ├── scene18.jpg
    ├── scene19.jpg
    ├── scene20.jpg
    ├── scene21.jpg
    ├── scene22.jpg
    ├── scene23.jpg
    ├── scene24.jpg
    ├── scene25.jpg
    ├── scene26.jpg
    ├── scene27.jpg
    ├── scene28.jpg
    ├── scene29.jpg
    ├── text00.jpg
    ├── text01.jpg
    ├── text02.jpg
    ├── text03.jpg
    ├── text04.jpg
    ├── text05.jpg
    ├── text06.jpg
    ├── text07.jpg
    ├── text08.jpg
    ├── text09.jpg
    ├── text10.jpg
    ├── text11.jpg
    ├── text12.jpg
    ├── text13.jpg
    ├── text14.jpg
    ├── text15.jpg
    ├── text16.jpg
    ├── text17.jpg
    ├── text18.jpg
    ├── text19.jpg
    ├── text20.jpg
    ├── text21.jpg
    ├── text22.jpg
    ├── text23.jpg
    ├── text24.jpg
    ├── text25.jpg
    ├── text26.jpg
    ├── text27.jpg
    ├── text28.jpg
    └── text29.jpg
├── ch01
    ├── analyze_webstats.py
    ├── data
    │   └── web_traffic.tsv
    ├── gen_webstats.py
    ├── performance_test.py
    └── utils.py
├── ch02
    ├── README.rst
    ├── chapter.py
    ├── data
    │   └── seeds.tsv
    ├── extra
    │   └── create_tsv.py
    ├── figure1.py
    ├── figure2.py
    ├── figure4_5_no_sklearn.py
    ├── figure4_5_sklearn.py
    ├── heldout.py
    ├── knn.py
    ├── load.py
    ├── seeds_knn.py
    ├── seeds_knn_increasing_k.py
    ├── seeds_knn_sklearn.py
    ├── seeds_threshold.py
    ├── simple_threshold.py
    ├── stump.py
    ├── tests
    │   └── test_load.py
    └── threshold.py
├── ch03
    ├── README.md
    ├── data
    │   └── toy
    │   │   ├── 01.txt
    │   │   ├── 02.txt
    │   │   ├── 03.txt
    │   │   ├── 04.txt
    │   │   └── 05.txt
    ├── noise_analysis.py
    ├── plot_kmeans_example.py
    ├── rel_post_01.py
    ├── rel_post_20news.py
    ├── tfidf.py
    └── utils.py
├── ch04
    ├── .gitignore
    ├── README.rst
    ├── blei_lda.py
    ├── build_lda.py
    ├── data
    │   ├── .gitignore
    │   ├── download_ap.sh
    │   ├── download_wp.sh
    │   └── preprocess-wikidata.sh
    ├── wikitopics_create.py
    ├── wikitopics_create_hdp.py
    ├── wikitopics_plot.py
    └── wordcloud.py
├── ch05
    ├── PosTagFreqVectorizer.py
    ├── README.md
    ├── chose_instances.py
    ├── classify.py
    ├── data.py
    ├── log_reg_example.py
    ├── so_xml_to_tsv.py
    └── utils.py
├── ch06
    ├── 01_start.py
    ├── 02_tuning.py
    ├── 03_clean.py
    ├── 04_sent.py
    ├── README.md
    ├── data
    │   ├── corpus.csv
    │   ├── missing.tsv
    │   └── not_authorized.tsv
    ├── install.py
    ├── twitterauth.py
    └── utils.py
├── ch07
    ├── .gitignore
    ├── README.rst
    ├── boston1.py
    ├── boston1numpy.py
    ├── boston_cv_penalized.py
    ├── data
    │   ├── .gitignore
    │   └── download.sh
    ├── figure1_2.py
    ├── figure3.py
    ├── figure4.py
    ├── lasso_path_plot.py
    ├── lr10k.py
    └── predict10k_en.py
├── ch08
    ├── README.rst
    ├── all_correlations.py
    ├── apriori
    │   ├── .gitignore
    │   ├── apriori.py
    │   ├── apriori_example.py
    │   ├── apriori_naive.py
    │   ├── download.sh
    │   └── histogram.py
    ├── averaged.py
    ├── chapter.py
    ├── corrneighbours.py
    ├── data
    │   ├── .gitignore
    │   └── download.sh
    ├── figure3.py
    ├── load_ml100k.py
    ├── norm.py
    ├── regression.py
    ├── similar_movie.py
    └── stacked.py
├── ch09
    ├── 01_fft_based_classifier.py
    ├── 02_ceps_based_classifier.py
    ├── Makefile
    ├── ceps.py
    ├── fft.py
    └── utils.py
├── ch10
    ├── .gitignore
    ├── README.rst
    ├── chapter.py
    ├── download.sh
    ├── features.py
    ├── figure10.py
    ├── large_classification.py
    ├── lena-ring.py
    ├── neighbors.py
    ├── scene00.jpg
    ├── simple_classification.py
    ├── threshold.py
    └── thresholded_figure.py
├── ch11
    ├── demo_corr.py
    ├── demo_mds.py
    ├── demo_mi.py
    ├── demo_pca.py
    ├── demo_rfe.py
    └── utils.py
└── ch12
    ├── .gitignore
    ├── README.rst
    ├── chapter.py
    ├── features.py
    ├── image-classification.py
    ├── jugfile.py
    ├── run-image-classification.sh
    ├── run-jugfile.sh
    └── setup-aws.txt


/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2013 Luis Pedro Coelho
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of
 6 | this software and associated documentation files (the "Software"), to deal in
 7 | the Software without restriction, including without limitation the rights to
 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
 9 | the Software, and to permit persons to whom the Software is furnished to do so,
10 | subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | Building Machine Learning Systems with Python
 2 | =============================================
 3 | 
 4 | Source Code for the book Building Machine Learning Systems with Python by [Luis
 5 | Pedro Coelho](http://luispedro.org) and [Willi Richert](http://twotoreal.com).
 6 | 
 7 | The book was published in 2013 (second edition in 2015) by Packt Publishing and
 8 | is available [from their
 9 | website](http://www.packtpub.com/building-machine-learning-systems-with-python/book).
10 | 
11 | The code in the repository corresponds to the second edition. Code for the
12 | first edition is available in [first\_edition
13 | branch](https://github.com/luispedro/BuildingMachineLearningSystemsWithPython/tree/first_edition).
14 | 
15 | 


--------------------------------------------------------------------------------
/SimpleImageDataset/building00.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/building00.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/building01.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/building01.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/building02.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/building02.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/building03.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/building03.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/building04.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/building04.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/building05.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/building05.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/building06.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/building06.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/building07.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/building07.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/building08.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/building08.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/building09.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/building09.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/building10.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/building10.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/building11.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/building11.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/building12.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/building12.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/building13.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/building13.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/building14.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/building14.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/building15.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/building15.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/building16.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/building16.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/building17.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/building17.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/building18.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/building18.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/building19.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/building19.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/building20.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/building20.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/building21.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/building21.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/building22.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/building22.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/building23.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/building23.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/building24.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/building24.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/building25.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/building25.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/building26.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/building26.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/building27.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/building27.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/building28.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/building28.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/building29.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/building29.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/scene00.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/scene00.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/scene01.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/scene01.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/scene02.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/scene02.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/scene03.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/scene03.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/scene04.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/scene04.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/scene05.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/scene05.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/scene06.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/scene06.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/scene07.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/scene07.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/scene08.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/scene08.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/scene09.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/scene09.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/scene10.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/scene10.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/scene11.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/scene11.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/scene12.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/scene12.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/scene13.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/scene13.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/scene14.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/scene14.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/scene15.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/scene15.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/scene16.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/scene16.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/scene17.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/scene17.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/scene18.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/scene18.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/scene19.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/scene19.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/scene20.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/scene20.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/scene21.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/scene21.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/scene22.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/scene22.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/scene23.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/scene23.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/scene24.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/scene24.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/scene25.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/scene25.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/scene26.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/scene26.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/scene27.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/scene27.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/scene28.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/scene28.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/scene29.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/scene29.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/text00.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/text00.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/text01.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/text01.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/text02.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/text02.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/text03.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/text03.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/text04.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/text04.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/text05.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/text05.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/text06.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/text06.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/text07.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/text07.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/text08.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/text08.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/text09.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/text09.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/text10.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/text10.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/text11.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/text11.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/text12.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/text12.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/text13.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/text13.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/text14.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/text14.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/text15.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/text15.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/text16.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/text16.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/text17.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/text17.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/text18.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/text18.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/text19.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/text19.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/text20.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/text20.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/text21.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/text21.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/text22.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/text22.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/text23.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/text23.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/text24.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/text24.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/text25.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/text25.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/text26.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/text26.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/text27.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/text27.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/text28.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/text28.jpg


--------------------------------------------------------------------------------
/SimpleImageDataset/text29.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/SimpleImageDataset/text29.jpg


--------------------------------------------------------------------------------
/ch01/analyze_webstats.py:
--------------------------------------------------------------------------------
  1 | # This code is supporting material for the book
  2 | # Building Machine Learning Systems with Python
  3 | # by Willi Richert and Luis Pedro Coelho
  4 | # published by PACKT Publishing
  5 | #
  6 | # It is made available under the MIT License
  7 | 
  8 | import os
  9 | from utils import DATA_DIR, CHART_DIR
 10 | import scipy as sp
 11 | import matplotlib.pyplot as plt
 12 | 
 13 | sp.random.seed(3)  # to reproduce the data later on
 14 | 
 15 | data = sp.genfromtxt(os.path.join(DATA_DIR, "web_traffic.tsv"), delimiter="\t")
 16 | print(data[:10])
 17 | print(data.shape)
 18 | 
 19 | # all examples will have three classes in this file
 20 | colors = ['g', 'k', 'b', 'm', 'r']
 21 | linestyles = ['-', '-.', '--', ':', '-']
 22 | 
 23 | x = data[:, 0]
 24 | y = data[:, 1]
 25 | print("Number of invalid entries:", sp.sum(sp.isnan(y)))
 26 | x = x[~sp.isnan(y)]
 27 | y = y[~sp.isnan(y)]
 28 | 
 29 | 
 30 | def plot_models(x, y, models, fname, mx=None, ymax=None, xmin=None):
 31 |     ''' plot input data '''
 32 | 
 33 |     plt.figure(num=None, figsize=(8, 6))
 34 |     plt.clf()
 35 |     plt.scatter(x, y, s=10)
 36 |     plt.title("Web traffic over the last month")
 37 |     plt.xlabel("Time")
 38 |     plt.ylabel("Hits/hour")
 39 |     plt.xticks(
 40 |         [w * 7 * 24 for w in range(10)], ['week %i' % w for w in range(10)])
 41 | 
 42 |     if models:
 43 |         if mx is None:
 44 |             mx = sp.linspace(0, x[-1], 1000)
 45 |         for model, style, color in zip(models, linestyles, colors):
 46 |             # print "Model:",model
 47 |             # print "Coeffs:",model.coeffs
 48 |             plt.plot(mx, model(mx), linestyle=style, linewidth=2, c=color)
 49 | 
 50 |         plt.legend(["d=%i" % m.order for m in models], loc="upper left")
 51 | 
 52 |     plt.autoscale(tight=True)
 53 |     plt.ylim(ymin=0)
 54 |     if ymax:
 55 |         plt.ylim(ymax=ymax)
 56 |     if xmin:
 57 |         plt.xlim(xmin=xmin)
 58 |     plt.grid(True, linestyle='-', color='0.75')
 59 |     plt.savefig(fname)
 60 | 
 61 | # first look at the data
 62 | plot_models(x, y, None, os.path.join(CHART_DIR, "1400_01_01.png"))
 63 | 
 64 | # create and plot models
 65 | fp1, res1, rank1, sv1, rcond1 = sp.polyfit(x, y, 1, full=True)
 66 | print("Model parameters of fp1: %s" % fp1)
 67 | print("Error of the model of fp1:", res1)
 68 | f1 = sp.poly1d(fp1)
 69 | 
 70 | fp2, res2, rank2, sv2, rcond2 = sp.polyfit(x, y, 2, full=True)
 71 | print("Model parameters of fp2: %s" % fp2)
 72 | print("Error of the model of fp2:", res2)
 73 | f2 = sp.poly1d(fp2)
 74 | f3 = sp.poly1d(sp.polyfit(x, y, 3))
 75 | f10 = sp.poly1d(sp.polyfit(x, y, 10))
 76 | f100 = sp.poly1d(sp.polyfit(x, y, 100))
 77 | 
 78 | plot_models(x, y, [f1], os.path.join(CHART_DIR, "1400_01_02.png"))
 79 | plot_models(x, y, [f1, f2], os.path.join(CHART_DIR, "1400_01_03.png"))
 80 | plot_models(
 81 |     x, y, [f1, f2, f3, f10, f100], os.path.join(CHART_DIR, "1400_01_04.png"))
 82 | 
 83 | # fit and plot a model using the knowledge about inflection point
 84 | inflection = 3.5 * 7 * 24
 85 | xa = x[:inflection]
 86 | ya = y[:inflection]
 87 | xb = x[inflection:]
 88 | yb = y[inflection:]
 89 | 
 90 | fa = sp.poly1d(sp.polyfit(xa, ya, 1))
 91 | fb = sp.poly1d(sp.polyfit(xb, yb, 1))
 92 | 
 93 | plot_models(x, y, [fa, fb], os.path.join(CHART_DIR, "1400_01_05.png"))
 94 | 
 95 | 
 96 | def error(f, x, y):
 97 |     return sp.sum((f(x) - y) ** 2)
 98 | 
 99 | print("Errors for the complete data set:")
100 | for f in [f1, f2, f3, f10, f100]:
101 |     print("Error d=%i: %f" % (f.order, error(f, x, y)))
102 | 
103 | print("Errors for only the time after inflection point")
104 | for f in [f1, f2, f3, f10, f100]:
105 |     print("Error d=%i: %f" % (f.order, error(f, xb, yb)))
106 | 
107 | print("Error inflection=%f" % (error(fa, xa, ya) + error(fb, xb, yb)))
108 | 
109 | 
110 | # extrapolating into the future
111 | plot_models(
112 |     x, y, [f1, f2, f3, f10, f100],
113 |     os.path.join(CHART_DIR, "1400_01_06.png"),
114 |     mx=sp.linspace(0 * 7 * 24, 6 * 7 * 24, 100),
115 |     ymax=10000, xmin=0 * 7 * 24)
116 | 
117 | print("Trained only on data after inflection point")
118 | fb1 = fb
119 | fb2 = sp.poly1d(sp.polyfit(xb, yb, 2))
120 | fb3 = sp.poly1d(sp.polyfit(xb, yb, 3))
121 | fb10 = sp.poly1d(sp.polyfit(xb, yb, 10))
122 | fb100 = sp.poly1d(sp.polyfit(xb, yb, 100))
123 | 
124 | print("Errors for only the time after inflection point")
125 | for f in [fb1, fb2, fb3, fb10, fb100]:
126 |     print("Error d=%i: %f" % (f.order, error(f, xb, yb)))
127 | 
128 | plot_models(
129 |     x, y, [fb1, fb2, fb3, fb10, fb100],
130 |     os.path.join(CHART_DIR, "1400_01_07.png"),
131 |     mx=sp.linspace(0 * 7 * 24, 6 * 7 * 24, 100),
132 |     ymax=10000, xmin=0 * 7 * 24)
133 | 
134 | # separating training from testing data
135 | frac = 0.3
136 | split_idx = int(frac * len(xb))
137 | shuffled = sp.random.permutation(list(range(len(xb))))
138 | test = sorted(shuffled[:split_idx])
139 | train = sorted(shuffled[split_idx:])
140 | fbt1 = sp.poly1d(sp.polyfit(xb[train], yb[train], 1))
141 | fbt2 = sp.poly1d(sp.polyfit(xb[train], yb[train], 2))
142 | print("fbt2(x)= \n%s" % fbt2)
143 | print("fbt2(x)-100,000= \n%s" % (fbt2-100000))
144 | fbt3 = sp.poly1d(sp.polyfit(xb[train], yb[train], 3))
145 | fbt10 = sp.poly1d(sp.polyfit(xb[train], yb[train], 10))
146 | fbt100 = sp.poly1d(sp.polyfit(xb[train], yb[train], 100))
147 | 
148 | print("Test errors for only the time after inflection point")
149 | for f in [fbt1, fbt2, fbt3, fbt10, fbt100]:
150 |     print("Error d=%i: %f" % (f.order, error(f, xb[test], yb[test])))
151 | 
152 | plot_models(
153 |     x, y, [fbt1, fbt2, fbt3, fbt10, fbt100],
154 |     os.path.join(CHART_DIR, "1400_01_08.png"),
155 |     mx=sp.linspace(0 * 7 * 24, 6 * 7 * 24, 100),
156 |     ymax=10000, xmin=0 * 7 * 24)
157 | 
158 | from scipy.optimize import fsolve
159 | print(fbt2)
160 | print(fbt2 - 100000)
161 | reached_max = fsolve(fbt2 - 100000, x0=800) / (7 * 24)
162 | print("100,000 hits/hour expected at week %f" % reached_max[0])
163 | 


--------------------------------------------------------------------------------
/ch01/gen_webstats.py:
--------------------------------------------------------------------------------
 1 | # This code is supporting material for the book
 2 | # Building Machine Learning Systems with Python
 3 | # by Willi Richert and Luis Pedro Coelho
 4 | # published by PACKT Publishing
 5 | #
 6 | # It is made available under the MIT License
 7 | 
 8 | # This script generates web traffic data for our hypothetical
 9 | # web startup "MLASS" in chapter 01
10 | 
11 | import os
12 | import scipy as sp
13 | from scipy.stats import gamma
14 | import matplotlib.pyplot as plt
15 | 
16 | from utils import DATA_DIR, CHART_DIR
17 | 
18 | sp.random.seed(3)  # to reproduce the data later on
19 | 
20 | x = sp.arange(1, 31*24)
21 | y = sp.array(200*(sp.sin(2*sp.pi*x/(7*24))), dtype=int)
22 | y += gamma.rvs(15, loc=0, scale=100, size=len(x))
23 | y += 2 * sp.exp(x/100.0)
24 | y = sp.ma.array(y, mask=[y<0])
25 | print(sum(y), sum(y<0))
26 | 
27 | plt.scatter(x, y)
28 | plt.title("Web traffic over the last month")
29 | plt.xlabel("Time")
30 | plt.ylabel("Hits/hour")
31 | plt.xticks([w*7*24 for w in range(5)], 
32 |            ['week %i' %(w+1) for w in range(5)])
33 | plt.autoscale(tight=True)
34 | plt.grid()
35 | plt.savefig(os.path.join(CHART_DIR, "1400_01_01.png"))
36 | 
37 | sp.savetxt(os.path.join(DATA_DIR, "web_traffic.tsv"), 
38 |            list(zip(x, y)), delimiter="\t", fmt="%s")
39 | 


--------------------------------------------------------------------------------
/ch01/performance_test.py:
--------------------------------------------------------------------------------
 1 | # This code is supporting material for the book
 2 | # Building Machine Learning Systems with Python
 3 | # by Willi Richert and Luis Pedro Coelho
 4 | # published by PACKT Publishing
 5 | #
 6 | # It is made available under the MIT License
 7 | 
 8 | 
 9 | import timeit
10 | 
11 | normal_py_sec = timeit.timeit('sum(x*x for x in range(1000))',
12 |                               number=10000)
13 | naive_np_sec = timeit.timeit('sum(na*na)',
14 |                              setup="import numpy as np; na=np.arange(1000)",
15 |                              number=10000)
16 | good_np_sec = timeit.timeit('na.dot(na)',
17 |                             setup="import numpy as np; na=np.arange(1000)",
18 |                             number=10000)
19 | 
20 | print("Normal Python: %f sec" % normal_py_sec)
21 | print("Naive NumPy: %f sec" % naive_np_sec)
22 | print("Good NumPy: %f sec" % good_np_sec)
23 | 


--------------------------------------------------------------------------------
/ch01/utils.py:
--------------------------------------------------------------------------------
 1 | # This code is supporting material for the book
 2 | # Building Machine Learning Systems with Python
 3 | # by Willi Richert and Luis Pedro Coelho
 4 | # published by PACKT Publishing
 5 | #
 6 | # It is made available under the MIT License
 7 | 
 8 | import os
 9 | 
10 | DATA_DIR = os.path.join(
11 |     os.path.dirname(os.path.realpath(__file__)), "data")
12 | 
13 | CHART_DIR = os.path.join(
14 |     os.path.dirname(os.path.realpath(__file__)), "charts")
15 | 
16 | for d in [DATA_DIR, CHART_DIR]:
17 |     if not os.path.exists(d):
18 |         os.mkdir(d)
19 | 
20 | 


--------------------------------------------------------------------------------
/ch02/README.rst:
--------------------------------------------------------------------------------
 1 | =========
 2 | Chapter 2
 3 | =========
 4 | 
 5 | Support code for *Chapter 2: Learning How to Classify with Real-world
 6 | Examples*. The directory data contains the seeds dataset, originally downloaded
 7 | from https://archive.ics.uci.edu/ml/datasets/seeds
 8 | 
 9 | chapter.py
10 |     The code as printed in the book.
11 | 
12 | figure1.py
13 |     Figure 1 in the book: all 2-by-2 scatter plots
14 | 
15 | figure2.py
16 |     Figure 2 in the book: threshold & decision area
17 | 
18 | figure4_5_sklearn.py
19 |     Figures 4 and 5 in the book: Knn decision borders before and after feature
20 |     normalization. This also produces a version of the figure using 11
21 |     neighbors (not in the book), which shows that the result is smoother, not
22 |     as sensitive to exact positions of each datapoint.
23 | 
24 | figure4_5_no_sklearn.py
25 |     Alternative code for Figures 4 and 5 without using scikit-learn
26 |     
27 | load.py
28 |     Code to load the seeds data
29 | 
30 | simple_threshold.py
31 |     Code from the book: finds the first partition, between Setosa and the other classes.
32 | 
33 | stump.py
34 |     Code from the book: finds the second partition, between Virginica and Versicolor.
35 | 
36 | threshold.py
37 |     Functional implementation of a threshold classifier
38 | 
39 | heldout.py
40 |     Evalute the threshold model on heldout data
41 | 
42 | seeds_knn_sklearn.py
43 |     Demonstrate cross-validation and feature normalization using scikit-learn
44 |     
45 | seeds_threshold.py
46 |     Test thresholding model on the seeds dataset (result mention in book, but no code)
47 | 
48 | seeds_knn_increasing_k.py
49 |     Test effect of increasing num_neighbors on accuracy.
50 | 
51 | knn.py
52 |     Implementation of K-Nearest neighbor without using scikit-learn.
53 | 
54 | seeds_knn.py
55 |     Demonstrate cross-validation (without scikit-learn)
56 | 


--------------------------------------------------------------------------------
/ch02/chapter.py:
--------------------------------------------------------------------------------
  1 | # This code is supporting material for the book
  2 | # Building Machine Learning Systems with Python
  3 | # by Willi Richert and Luis Pedro Coelho
  4 | # published by PACKT Publishing
  5 | #
  6 | # It is made available under the MIT License
  7 | 
  8 | 
  9 | from matplotlib import pyplot as plt
 10 | import numpy as np
 11 | 
 12 | # We load the data with load_iris from sklearn
 13 | from sklearn.datasets import load_iris
 14 | data = load_iris()
 15 | 
 16 | # load_iris returns an object with several fields
 17 | features = data.data
 18 | feature_names = data.feature_names
 19 | target = data.target
 20 | target_names = data.target_names
 21 | 
 22 | for t in range(3):
 23 |  if t == 0:
 24 |      c = 'r'
 25 |      marker = '>'
 26 |  elif t == 1:
 27 |      c = 'g'
 28 |      marker = 'o'
 29 |  elif t == 2:
 30 |      c = 'b'
 31 |      marker = 'x'
 32 |  plt.scatter(features[target == t, 0],
 33 |             features[target == t, 1],
 34 |             marker=marker,
 35 |             c=c)
 36 | # We use NumPy fancy indexing to get an array of strings:
 37 | labels = target_names[target]
 38 | 
 39 | # The petal length is the feature at position 2
 40 | plength = features[:, 2]
 41 | 
 42 | # Build an array of booleans:
 43 | is_setosa = (labels == 'setosa')
 44 | 
 45 | # This is the important step:
 46 | max_setosa =plength[is_setosa].max()
 47 | min_non_setosa = plength[~is_setosa].min()
 48 | print('Maximum of setosa: {0}.'.format(max_setosa))
 49 | 
 50 | print('Minimum of others: {0}.'.format(min_non_setosa))
 51 | 
 52 | # ~ is the boolean negation operator
 53 | features = features[~is_setosa]
 54 | labels = labels[~is_setosa]
 55 | # Build a new target variable, is_virigina
 56 | is_virginica = (labels == 'virginica')
 57 | 
 58 | # Initialize best_acc to impossibly low value
 59 | best_acc = -1.0
 60 | for fi in range(features.shape[1]):
 61 |     # We are going to test all possible thresholds
 62 |     thresh = features[:,fi]
 63 |     for t in thresh:
 64 | 
 65 |         # Get the vector for feature `fi`
 66 |         feature_i = features[:, fi]
 67 |         # apply threshold `t`
 68 |         pred = (feature_i > t)
 69 |         acc = (pred == is_virginica).mean()
 70 |         rev_acc = (pred == ~is_virginica).mean()
 71 |         if rev_acc > acc:
 72 |             reverse = True
 73 |             acc = rev_acc
 74 |         else:
 75 |             reverse = False
 76 | 
 77 |         if acc > best_acc:
 78 |             best_acc = acc
 79 |             best_fi = fi
 80 |             best_t = t
 81 |             best_reverse = reverse
 82 | 
 83 | print(best_fi, best_t, best_reverse, best_acc)
 84 | 
 85 | def is_virginica_test(fi, t, reverse, example):
 86 |     'Apply threshold model to a new example'
 87 |     test = example[fi] > t
 88 |     if reverse:
 89 |         test = not test
 90 |     return test
 91 | from threshold import fit_model, predict
 92 | 
 93 | # ning accuracy was 96.0%.
 94 | # ing accuracy was 90.0% (N = 50).
 95 | correct = 0.0
 96 | 
 97 | for ei in range(len(features)):
 98 |     # select all but the one at position `ei`:
 99 |     training = np.ones(len(features), bool)
100 |     training[ei] = False
101 |     testing = ~training
102 |     model = fit_model(features[training], is_virginica[training])
103 |     predictions = predict(model, features[testing])
104 |     correct += np.sum(predictions == is_virginica[testing])
105 | acc = correct/float(len(features))
106 | print('Accuracy: {0:.1%}'.format(acc))
107 | 
108 | 
109 | ###########################################
110 | ############## SEEDS DATASET ##############
111 | ###########################################
112 | 
113 | from load import load_dataset
114 | 
115 | feature_names = [
116 |     'area',
117 |     'perimeter',
118 |     'compactness',
119 |     'length of kernel',
120 |     'width of kernel',
121 |     'asymmetry coefficien',
122 |     'length of kernel groove',
123 | ]
124 | features, labels = load_dataset('seeds')
125 | 
126 | 
127 | 
128 | from sklearn.neighbors import KNeighborsClassifier
129 | classifier = KNeighborsClassifier(n_neighbors=1)
130 | from sklearn.cross_validation import KFold
131 | 
132 | kf = KFold(len(features), n_folds=5, shuffle=True)
133 | means = []
134 | for training,testing in kf:
135 |    # We learn a model for this fold with `fit` and then apply it to the
136 |    # testing data with `predict`:
137 |    classifier.fit(features[training], labels[training])
138 |    prediction = classifier.predict(features[testing])
139 | 
140 |    # np.mean on an array of booleans returns fraction
141 |  # of correct decisions for this fold:
142 |    curmean = np.mean(prediction == labels[testing])
143 |    means.append(curmean)
144 | print('Mean accuracy: {:.1%}'.format(np.mean(means)))
145 | 
146 | 
147 | from sklearn.pipeline import Pipeline
148 | from sklearn.preprocessing import StandardScaler
149 | 
150 | classifier = KNeighborsClassifier(n_neighbors=1)
151 | classifier = Pipeline([('norm', StandardScaler()), ('knn', classifier)])
152 | 
153 | means = []
154 | for training,testing in kf:
155 |     # We learn a model for this fold with `fit` and then apply it to the
156 |     # testing data with `predict`:
157 |     classifier.fit(features[training], labels[training])
158 |     prediction = classifier.predict(features[testing])
159 | 
160 |     # np.mean on an array of booleans returns fraction
161 |     # of correct decisions for this fold:
162 |     curmean = np.mean(prediction == labels[testing])
163 |     means.append(curmean)
164 | print('Mean accuracy: {:.1%}'.format(np.mean(means)))
165 | 


--------------------------------------------------------------------------------
/ch02/extra/create_tsv.py:
--------------------------------------------------------------------------------
 1 | # This code is supporting material for the book
 2 | # Building Machine Learning Systems with Python
 3 | # by Willi Richert and Luis Pedro Coelho
 4 | # published by PACKT Publishing
 5 | #
 6 | # It is made available under the MIT License
 7 | 
 8 | import milksets.seeds
 9 | 
10 | 
11 | def save_as_tsv(fname, module):
12 |     features, labels = module.load()
13 |     nlabels = [module.label_names[ell] for ell in labels]
14 |     with open(fname, 'w') as ofile:
15 |         for f, n in zip(features, nlabels):
16 |             print >>ofile, "\t".join(map(str, f) + [n])
17 | 
18 | save_as_tsv('seeds.tsv', milksets.seeds)
19 | 


--------------------------------------------------------------------------------
/ch02/figure1.py:
--------------------------------------------------------------------------------
 1 | # This code is supporting material for the book
 2 | # Building Machine Learning Systems with Python
 3 | # by Willi Richert and Luis Pedro Coelho
 4 | # published by PACKT Publishing
 5 | #
 6 | # It is made available under the MIT License
 7 | 
 8 | from matplotlib import pyplot as plt
 9 | 
10 | # We load the data with load_iris from sklearn
11 | from sklearn.datasets import load_iris
12 | 
13 | # load_iris returns an object with several fields
14 | data = load_iris()
15 | features = data.data
16 | feature_names = data.feature_names
17 | target = data.target
18 | target_names = data.target_names
19 | 
20 | fig,axes = plt.subplots(2, 3)
21 | pairs = [(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)]
22 | 
23 | # Set up 3 different pairs of (color, marker)
24 | color_markers = [
25 |         ('r', '>'),
26 |         ('g', 'o'),
27 |         ('b', 'x'),
28 |         ]
29 | for i, (p0, p1) in enumerate(pairs):
30 |     ax = axes.flat[i]
31 | 
32 |     for t in range(3):
33 |         # Use a different color/marker for each class `t`
34 |         c,marker = color_markers[t]
35 |         ax.scatter(features[target == t, p0], features[
36 |                     target == t, p1], marker=marker, c=c)
37 |     ax.set_xlabel(feature_names[p0])
38 |     ax.set_ylabel(feature_names[p1])
39 |     ax.set_xticks([])
40 |     ax.set_yticks([])
41 | fig.tight_layout()
42 | fig.savefig('figure1.png')
43 | 


--------------------------------------------------------------------------------
/ch02/figure2.py:
--------------------------------------------------------------------------------
 1 | # This code is supporting material for the book
 2 | # Building Machine Learning Systems with Python
 3 | # by Willi Richert and Luis Pedro Coelho
 4 | # published by PACKT Publishing
 5 | #
 6 | # It is made available under the MIT License
 7 | 
 8 | COLOUR_FIGURE = False
 9 | 
10 | from matplotlib import pyplot as plt
11 | from sklearn.datasets import load_iris
12 | data = load_iris()
13 | features = data.data
14 | feature_names = data.feature_names
15 | target = data.target
16 | target_names = data.target_names
17 | 
18 | # We use NumPy fancy indexing to get an array of strings:
19 | labels = target_names[target]
20 | 
21 | is_setosa = (labels == 'setosa')
22 | features = features[~is_setosa]
23 | labels = labels[~is_setosa]
24 | is_virginica = (labels == 'virginica')
25 | 
26 | # Hand fixed thresholds:
27 | t = 1.65
28 | t2 = 1.75
29 | 
30 | # Features to use: 3 & 2
31 | f0, f1 = 3, 2
32 | 
33 | if COLOUR_FIGURE:
34 |     area1c = (1., .8, .8)
35 |     area2c = (.8, .8, 1.)
36 | else:
37 |     area1c = (1., 1, 1)
38 |     area2c = (.7, .7, .7)
39 | 
40 | # Plot from 90% of smallest value to 110% of largest value
41 | # (all feature values are positive, otherwise this would not work very well)
42 | 
43 | x0 = features[:, f0].min() * .9
44 | x1 = features[:, f0].max() * 1.1
45 | 
46 | y0 = features[:, f1].min() * .9
47 | y1 = features[:, f1].max() * 1.1
48 | 
49 | fig,ax = plt.subplots()
50 | ax.fill_between([t, x1], [y0, y0], [y1, y1], color=area2c)
51 | ax.fill_between([x0, t], [y0, y0], [y1, y1], color=area1c)
52 | ax.plot([t, t], [y0, y1], 'k--', lw=2)
53 | ax.plot([t2, t2], [y0, y1], 'k:', lw=2)
54 | ax.scatter(features[is_virginica, f0],
55 |             features[is_virginica, f1], c='b', marker='o', s=40)
56 | ax.scatter(features[~is_virginica, f0],
57 |             features[~is_virginica, f1], c='r', marker='x', s=40)
58 | ax.set_ylim(y0, y1)
59 | ax.set_xlim(x0, x1)
60 | ax.set_xlabel(feature_names[f0])
61 | ax.set_ylabel(feature_names[f1])
62 | fig.tight_layout()
63 | fig.savefig('figure2.png')
64 | 


--------------------------------------------------------------------------------
/ch02/figure4_5_no_sklearn.py:
--------------------------------------------------------------------------------
 1 | # This code is supporting material for the book
 2 | # Building Machine Learning Systems with Python
 3 | # by Willi Richert and Luis Pedro Coelho
 4 | # published by PACKT Publishing
 5 | #
 6 | # It is made available under the MIT License
 7 | 
 8 | COLOUR_FIGURE = False
 9 | 
10 | from matplotlib import pyplot as plt
11 | from matplotlib.colors import ListedColormap
12 | from load import load_dataset
13 | import numpy as np
14 | from knn import fit_model, predict
15 | 
16 | feature_names = [
17 |     'area',
18 |     'perimeter',
19 |     'compactness',
20 |     'length of kernel',
21 |     'width of kernel',
22 |     'asymmetry coefficien',
23 |     'length of kernel groove',
24 | ]
25 | 
26 | 
27 | def plot_decision(features, labels):
28 |     '''Plots decision boundary for KNN
29 | 
30 |     Parameters
31 |     ----------
32 |     features : ndarray
33 |     labels : sequence
34 | 
35 |     Returns
36 |     -------
37 |     fig : Matplotlib Figure
38 |     ax  : Matplotlib Axes
39 |     '''
40 |     y0, y1 = features[:, 2].min() * .9, features[:, 2].max() * 1.1
41 |     x0, x1 = features[:, 0].min() * .9, features[:, 0].max() * 1.1
42 |     X = np.linspace(x0, x1, 100)
43 |     Y = np.linspace(y0, y1, 100)
44 |     X, Y = np.meshgrid(X, Y)
45 | 
46 |     model = fit_model(1, features[:, (0, 2)], np.array(labels))
47 |     C = predict(
48 |         model, np.vstack([X.ravel(), Y.ravel()]).T).reshape(X.shape)
49 |     if COLOUR_FIGURE:
50 |         cmap = ListedColormap([(1., .6, .6), (.6, 1., .6), (.6, .6, 1.)])
51 |     else:
52 |         cmap = ListedColormap([(1., 1., 1.), (.2, .2, .2), (.6, .6, .6)])
53 |     fig,ax = plt.subplots()
54 |     ax.set_xlim(x0, x1)
55 |     ax.set_ylim(y0, y1)
56 |     ax.set_xlabel(feature_names[0])
57 |     ax.set_ylabel(feature_names[2])
58 |     ax.pcolormesh(X, Y, C, cmap=cmap)
59 |     if COLOUR_FIGURE:
60 |         cmap = ListedColormap([(1., .0, .0), (.0, 1., .0), (.0, .0, 1.)])
61 |         ax.scatter(features[:, 0], features[:, 2], c=labels, cmap=cmap)
62 |     else:
63 |         for lab, ma in zip(range(3), "Do^"):
64 |             ax.plot(features[labels == lab, 0], features[
65 |                      labels == lab, 2], ma, c=(1., 1., 1.))
66 |     return fig,ax
67 | 
68 | 
69 | features, labels = load_dataset('seeds')
70 | names = sorted(set(labels))
71 | labels = np.array([names.index(ell) for ell in labels])
72 | 
73 | fig,ax = plot_decision(features, labels)
74 | fig.savefig('figure4.png')
75 | 
76 | features -= features.mean(0)
77 | features /= features.std(0)
78 | fig,ax = plot_decision(features, labels)
79 | fig.savefig('figure5.png')
80 | 


--------------------------------------------------------------------------------
/ch02/figure4_5_sklearn.py:
--------------------------------------------------------------------------------
 1 | # This code is supporting material for the book
 2 | # Building Machine Learning Systems with Python
 3 | # by Willi Richert and Luis Pedro Coelho
 4 | # published by PACKT Publishing
 5 | #
 6 | # It is made available under the MIT License
 7 | 
 8 | COLOUR_FIGURE = False
 9 | 
10 | from matplotlib import pyplot as plt
11 | from matplotlib.colors import ListedColormap
12 | from load import load_dataset
13 | import numpy as np
14 | from sklearn.neighbors import KNeighborsClassifier
15 | 
16 | feature_names = [
17 |     'area',
18 |     'perimeter',
19 |     'compactness',
20 |     'length of kernel',
21 |     'width of kernel',
22 |     'asymmetry coefficien',
23 |     'length of kernel groove',
24 | ]
25 | 
26 | 
27 | def plot_decision(features, labels, num_neighbors=1):
28 |     '''Plots decision boundary for KNN
29 | 
30 |     Parameters
31 |     ----------
32 |     features : ndarray
33 |     labels : sequence
34 | 
35 |     Returns
36 |     -------
37 |     fig : Matplotlib Figure
38 |     ax  : Matplotlib Axes
39 |     '''
40 |     y0, y1 = features[:, 2].min() * .9, features[:, 2].max() * 1.1
41 |     x0, x1 = features[:, 0].min() * .9, features[:, 0].max() * 1.1
42 |     X = np.linspace(x0, x1, 1000)
43 |     Y = np.linspace(y0, y1, 1000)
44 |     X, Y = np.meshgrid(X, Y)
45 | 
46 |     model = KNeighborsClassifier(num_neighbors)
47 |     model.fit(features[:, (0,2)], labels)
48 |     C = model.predict(np.vstack([X.ravel(), Y.ravel()]).T).reshape(X.shape)
49 |     if COLOUR_FIGURE:
50 |         cmap = ListedColormap([(1., .7, .7), (.7, 1., .7), (.7, .7, 1.)])
51 |     else:
52 |         cmap = ListedColormap([(1., 1., 1.), (.2, .2, .2), (.6, .6, .6)])
53 |     fig,ax = plt.subplots()
54 |     ax.set_xlim(x0, x1)
55 |     ax.set_ylim(y0, y1)
56 |     ax.set_xlabel(feature_names[0])
57 |     ax.set_ylabel(feature_names[2])
58 |     ax.pcolormesh(X, Y, C, cmap=cmap)
59 |     if COLOUR_FIGURE:
60 |         cmap = ListedColormap([(1., .0, .0), (.1, .6, .1), (.0, .0, 1.)])
61 |         ax.scatter(features[:, 0], features[:, 2], c=labels, cmap=cmap)
62 |     else:
63 |         for lab, ma in zip(range(3), "Do^"):
64 |             ax.plot(features[labels == lab, 0], features[
65 |                      labels == lab, 2], ma, c=(1., 1., 1.), ms=6)
66 |     return fig,ax
67 | 
68 | 
69 | features, labels = load_dataset('seeds')
70 | names = sorted(set(labels))
71 | labels = np.array([names.index(ell) for ell in labels])
72 | 
73 | fig,ax = plot_decision(features, labels)
74 | fig.tight_layout()
75 | fig.savefig('figure4sklearn.png')
76 | 
77 | features -= features.mean(0)
78 | features /= features.std(0)
79 | fig,ax = plot_decision(features, labels)
80 | fig.tight_layout()
81 | fig.savefig('figure5sklearn.png')
82 | 
83 | fig,ax = plot_decision(features, labels, 11)
84 | fig.tight_layout()
85 | fig.savefig('figure5sklearn_with_11_neighbors.png')
86 | 


--------------------------------------------------------------------------------
/ch02/heldout.py:
--------------------------------------------------------------------------------
 1 | # This code is supporting material for the book
 2 | # Building Machine Learning Systems with Python
 3 | # by Willi Richert and Luis Pedro Coelho
 4 | # published by PACKT Publishing
 5 | #
 6 | # It is made available under the MIT License
 7 | 
 8 | # This script demonstrates the difference between the training accuracy and
 9 | # testing (held-out) accuracy.
10 | 
11 | import numpy as np
12 | from sklearn.datasets import load_iris
13 | from threshold import fit_model, accuracy
14 | 
15 | data = load_iris()
16 | features = data['data']
17 | labels = data['target_names'][data['target']]
18 | 
19 | # We are going to remove the setosa examples as they are too easy:
20 | is_setosa = (labels == 'setosa')
21 | features = features[~is_setosa]
22 | labels = labels[~is_setosa]
23 | 
24 | # Now we classify virginica vs non-virginica
25 | is_virginica = (labels == 'virginica')
26 | 
27 | # Split the data in two: testing and training
28 | testing = np.tile([True, False], 50) # testing = [True,False,True,False,True,False...]
29 | 
30 | # Training is the negation of testing: i.e., datapoints not used for testing,
31 | # will be used for training
32 | training = ~testing
33 | 
34 | model = fit_model(features[training], is_virginica[training])
35 | train_accuracy = accuracy(features[training], is_virginica[training], model)
36 | test_accuracy = accuracy(features[testing], is_virginica[testing], model)
37 | 
38 | print('''\
39 | Training accuracy was {0:.1%}.
40 | Testing accuracy was {1:.1%} (N = {2}).
41 | '''.format(train_accuracy, test_accuracy, testing.sum()))
42 | 


--------------------------------------------------------------------------------
/ch02/knn.py:
--------------------------------------------------------------------------------
 1 | # This code is supporting material for the book
 2 | # Building Machine Learning Systems with Python
 3 | # by Willi Richert and Luis Pedro Coelho
 4 | # published by PACKT Publishing
 5 | #
 6 | # It is made available under the MIT License
 7 | 
 8 | import numpy as np
 9 | 
10 | # This function was called ``learn_model`` in the first edition
11 | def fit_model(k, features, labels):
12 |     '''Learn a k-nn model'''
13 |     # There is no model in k-nn, just a copy of the inputs
14 |     return k, features.copy(), labels.copy()
15 | 
16 | 
17 | def plurality(xs):
18 |     '''Find the most common element in a collection'''
19 |     from collections import defaultdict
20 |     counts = defaultdict(int)
21 |     for x in xs:
22 |         counts[x] += 1
23 |     maxv = max(counts.values())
24 |     for k, v in counts.items():
25 |         if v == maxv:
26 |             return k
27 | 
28 | # This function was called ``apply_model`` in the first edition
29 | def predict(model, features):
30 |     '''Apply k-nn model'''
31 |     k, train_feats, labels = model
32 |     results = []
33 |     for f in features:
34 |         label_dist = []
35 |         # Compute all distances:
36 |         for t, ell in zip(train_feats, labels):
37 |             label_dist.append((np.linalg.norm(f - t), ell))
38 |         label_dist.sort(key=lambda d_ell: d_ell[0])
39 |         label_dist = label_dist[:k]
40 |         results.append(plurality([ell for _, ell in label_dist]))
41 |     return np.array(results)
42 | 
43 | 
44 | def accuracy(features, labels, model):
45 |     preds = predict(model, features)
46 |     return np.mean(preds == labels)
47 | 


--------------------------------------------------------------------------------
/ch02/load.py:
--------------------------------------------------------------------------------
 1 | # This code is supporting material for the book
 2 | # Building Machine Learning Systems with Python
 3 | # by Willi Richert and Luis Pedro Coelho
 4 | # published by PACKT Publishing
 5 | #
 6 | # It is made available under the MIT License
 7 | 
 8 | import numpy as np
 9 | 
10 | 
11 | def load_dataset(dataset_name):
12 |     '''
13 |     data,labels = load_dataset(dataset_name)
14 | 
15 |     Load a given dataset
16 | 
17 |     Returns
18 |     -------
19 |     data : numpy ndarray
20 |     labels : list of str
21 |     '''
22 |     data = []
23 |     labels = []
24 |     with open('./data/{0}.tsv'.format(dataset_name)) as ifile:
25 |         for line in ifile:
26 |             tokens = line.strip().split('\t')
27 |             data.append([float(tk) for tk in tokens[:-1]])
28 |             labels.append(tokens[-1])
29 |     data = np.array(data)
30 |     labels = np.array(labels)
31 |     return data, labels
32 | 


--------------------------------------------------------------------------------
/ch02/seeds_knn.py:
--------------------------------------------------------------------------------
 1 | # This code is supporting material for the book
 2 | # Building Machine Learning Systems with Python
 3 | # by Willi Richert and Luis Pedro Coelho
 4 | # published by PACKT Publishing
 5 | #
 6 | # It is made available under the MIT License
 7 | 
 8 | from load import load_dataset
 9 | import numpy as np
10 | from knn import fit_model, accuracy
11 | 
12 | features, labels = load_dataset('seeds')
13 | 
14 | 
15 | def cross_validate(features, labels):
16 |     '''Compute cross-validation errors'''
17 |     error = 0.0
18 |     for fold in range(10):
19 |         training = np.ones(len(features), bool)
20 |         training[fold::10] = 0
21 |         testing = ~training
22 |         model = fit_model(1, features[training], labels[training])
23 |         test_error = accuracy(features[testing], labels[testing], model)
24 |         error += test_error
25 | 
26 |     return error / 10.0
27 | 
28 | error = cross_validate(features, labels)
29 | print('Ten fold cross-validated error was {0:.1%}.'.format(error))
30 | 
31 | # Z-score (whiten) the features
32 | features -= features.mean(0)
33 | features /= features.std(0)
34 | error = cross_validate(features, labels)
35 | print(
36 |     'Ten fold cross-validated error after z-scoring was {0:.1%}.'.format(error))
37 | 


--------------------------------------------------------------------------------
/ch02/seeds_knn_increasing_k.py:
--------------------------------------------------------------------------------
 1 | # This code is supporting material for the book
 2 | # Building Machine Learning Systems with Python
 3 | # by Willi Richert and Luis Pedro Coelho
 4 | # published by PACKT Publishing
 5 | #
 6 | # It is made available under the MIT License
 7 | 
 8 | # Basic imports
 9 | from __future__ import print_function
10 | import numpy as np
11 | from matplotlib import pyplot as plt
12 | from load import load_dataset
13 | 
14 | 
15 | from sklearn.neighbors import KNeighborsClassifier
16 | 
17 | from sklearn.cross_validation import cross_val_score
18 | from sklearn.pipeline import Pipeline
19 | from sklearn.preprocessing import StandardScaler
20 | 
21 | 
22 | features, labels = load_dataset('seeds')
23 | 
24 | # Values of k to consider: all in 1 .. 160
25 | ks = np.arange(1,161)
26 | 
27 | # We build a classifier object here with the default number of neighbors
28 | # (It happens to be 5, but it does not matter as we will be changing it below
29 | classifier = KNeighborsClassifier()
30 | classifier = Pipeline([('norm', StandardScaler()), ('knn', classifier)])
31 | 
32 | # accuracies will hold our results
33 | accuracies = []
34 | for k in ks:
35 |     # set the classifier parameter
36 |     classifier.set_params(knn__n_neighbors=k)
37 |     crossed = cross_val_score(classifier, features, labels)
38 | 
39 |     # Save only the average
40 |     accuracies.append(crossed.mean())
41 | 
42 | accuracies = np.array(accuracies)
43 | 
44 | # Scale the accuracies by 100 to plot as a percentage instead of as a fraction
45 | plt.plot(ks, accuracies*100)
46 | plt.xlabel('Value for k (nr. of neighbors)')
47 | plt.ylabel('Accuracy (%)')
48 | plt.savefig('figure6.png')
49 | 


--------------------------------------------------------------------------------
/ch02/seeds_knn_sklearn.py:
--------------------------------------------------------------------------------
 1 | # This code is supporting material for the book
 2 | # Building Machine Learning Systems with Python
 3 | # by Willi Richert and Luis Pedro Coelho
 4 | # published by PACKT Publishing
 5 | #
 6 | # It is made available under the MIT License
 7 | 
 8 | # Basic imports
 9 | from __future__ import print_function
10 | import numpy as np
11 | from load import load_dataset
12 | 
13 | 
14 | # Import sklearn implementation of KNN
15 | from sklearn.neighbors import KNeighborsClassifier
16 | 
17 | features, labels = load_dataset('seeds')
18 | classifier = KNeighborsClassifier(n_neighbors=4)
19 | 
20 | 
21 | n = len(features)
22 | correct = 0.0
23 | for ei in range(n):
24 |     training = np.ones(n, bool)
25 |     training[ei] = 0
26 |     testing = ~training
27 |     classifier.fit(features[training], labels[training])
28 |     pred = classifier.predict(features[ei])
29 |     correct += (pred == labels[ei])
30 | print('Result of leave-one-out: {}'.format(correct/n))
31 | 
32 | # Import KFold object
33 | from sklearn.cross_validation import KFold
34 | 
35 | # means will hold the mean for each fold
36 | means = []
37 | 
38 | # kf is a generator of pairs (training,testing) so that each iteration
39 | # implements a separate fold.
40 | kf = KFold(len(features), n_folds=3, shuffle=True)
41 | for training,testing in kf:
42 |     # We learn a model for this fold with `fit` and then apply it to the
43 |     # testing data with `predict`:
44 |     classifier.fit(features[training], labels[training])
45 |     prediction = classifier.predict(features[testing])
46 | 
47 |     # np.mean on an array of booleans returns the fraction of correct decisions
48 |     # for this fold:
49 |     curmean = np.mean(prediction == labels[testing])
50 |     means.append(curmean)
51 | print('Result of cross-validation using KFold: {}'.format(means))
52 | 
53 | # The function cross_val_score does the same thing as the loop above with a
54 | # single function call
55 | 
56 | from sklearn.cross_validation import cross_val_score
57 | crossed = cross_val_score(classifier, features, labels)
58 | print('Result of cross-validation using cross_val_score: {}'.format(crossed))
59 | 
60 | # The results above use the features as is, which we learned was not optimal
61 | # except if the features happen to all be in the same scale. We can pre-scale
62 | # the features as explained in the main text:
63 | 
64 | from sklearn.pipeline import Pipeline
65 | from sklearn.preprocessing import StandardScaler
66 | classifier = Pipeline([('norm', StandardScaler()), ('knn', classifier)])
67 | crossed = cross_val_score(classifier, features, labels)
68 | print('Result with prescaling: {}'.format(crossed))
69 | 
70 | 
71 | # Now, generate & print a cross-validated confusion matrix for the same result
72 | from sklearn.metrics import confusion_matrix
73 | names = list(set(labels))
74 | labels = np.array([names.index(ell) for ell in labels])
75 | preds = labels.copy()
76 | preds[:] = -1
77 | for train, test in kf:
78 |     classifier.fit(features[train], labels[train])
79 |     preds[test] = classifier.predict(features[test])
80 | 
81 | cmat = confusion_matrix(labels, preds)
82 | print()
83 | print('Confusion matrix: [rows represent true outcome, columns predicted outcome]')
84 | print(cmat)
85 | 
86 | # The explicit float() conversion is necessary in Python 2
87 | # (Otherwise, result is rounded to 0)
88 | acc = cmat.trace()/float(cmat.sum())
89 | print('Accuracy: {0:.1%}'.format(acc))
90 | 
91 | 


--------------------------------------------------------------------------------
/ch02/seeds_threshold.py:
--------------------------------------------------------------------------------
 1 | # This code is supporting material for the book
 2 | # Building Machine Learning Systems with Python
 3 | # by Willi Richert and Luis Pedro Coelho
 4 | # published by PACKT Publishing
 5 | #
 6 | # It is made available under the MIT License
 7 | 
 8 | from load import load_dataset
 9 | import numpy as np
10 | from threshold import fit_model, accuracy
11 | 
12 | features, labels = load_dataset('seeds')
13 | 
14 | # Turn the labels into a binary array
15 | labels = (labels == 'Canadian')
16 | 
17 | error = 0.0
18 | for fold in range(10):
19 |     training = np.ones(len(features), bool)
20 | 
21 |     # numpy magic to make an array with 10% of 0s starting at fold
22 |     training[fold::10] = 0
23 | 
24 |     # whatever is not training is for testing
25 |     testing = ~training
26 | 
27 |     model = fit_model(features[training], labels[training])
28 |     test_error = accuracy(features[testing], labels[testing], model)
29 |     error += test_error
30 | 
31 | error /= 10.0
32 | 
33 | print('Ten fold cross-validated error was {0:.1%}.'.format(error))
34 | 


--------------------------------------------------------------------------------
/ch02/simple_threshold.py:
--------------------------------------------------------------------------------
 1 | # This code is supporting material for the book
 2 | # Building Machine Learning Systems with Python
 3 | # by Willi Richert and Luis Pedro Coelho
 4 | # published by PACKT Publishing
 5 | #
 6 | # It is made available under the MIT License
 7 | 
 8 | from sklearn.datasets import load_iris
 9 | 
10 | data = load_iris()
11 | features = data['data']
12 | target = data['target']
13 | target_names = data['target_names']
14 | labels = target_names[target]
15 | plength = features[:, 2]
16 | 
17 | # To use numpy operations to get setosa features,
18 | # we build a boolean array
19 | is_setosa = (labels == 'setosa')
20 | 
21 | max_setosa = plength[is_setosa].max()
22 | min_non_setosa = plength[~is_setosa].min()
23 | 
24 | print('Maximum of setosa: {0}.'.format(max_setosa))
25 | print('Minimum of others: {0}.'.format(min_non_setosa))
26 | 


--------------------------------------------------------------------------------
/ch02/stump.py:
--------------------------------------------------------------------------------
 1 | # This code is supporting material for the book
 2 | # Building Machine Learning Systems with Python
 3 | # by Willi Richert and Luis Pedro Coelho
 4 | # published by PACKT Publishing
 5 | #
 6 | # It is made available under the MIT License
 7 | 
 8 | from sklearn.datasets import load_iris
 9 | data = load_iris()
10 | features = data.data
11 | labels = data.target_names[data.target]
12 | 
13 | 
14 | is_setosa = (labels == 'setosa')
15 | features = features[~is_setosa]
16 | labels = labels[~is_setosa]
17 | is_virginica = (labels == 'virginica')
18 | 
19 | 
20 | # Initialize to a value that is worse than any possible test
21 | best_acc = -1.0
22 | 
23 | # Loop over all the features
24 | for fi in range(features.shape[1]):
25 |     # Test every possible threshold value for feature fi
26 |     thresh = features[:, fi].copy()
27 | 
28 |     # Test them in order
29 |     thresh.sort()
30 |     for t in thresh:
31 | 
32 |         # Generate predictions using t as a threshold
33 |         pred = (features[:, fi] > t)
34 | 
35 |         # Accuracy is the fraction of predictions that match reality
36 |         acc = (pred == is_virginica).mean()
37 | 
38 |         # We test whether negating the test is a better threshold:
39 |         acc_neg = ((~pred) == is_virginica).mean()
40 |         if acc_neg > acc:
41 |             acc = acc_neg
42 |             negated = True
43 |         else:
44 |             negated = False
45 | 
46 |         # If this is better than previous best, then this is now the new best:
47 | 
48 |         if acc > best_acc:
49 |             best_acc = acc
50 |             best_fi = fi
51 |             best_t = t
52 |             best_is_negated = negated
53 | 
54 | print('Best threshold is {0} on feature {1} (index {2}), which achieves accuracy of {3:.1%}.'.format(
55 |     best_t, data.feature_names[best_fi], best_fi, best_acc))
56 | 


--------------------------------------------------------------------------------
/ch02/tests/test_load.py:
--------------------------------------------------------------------------------
 1 | # This code is supporting material for the book
 2 | # Building Machine Learning Systems with Python
 3 | # by Willi Richert and Luis Pedro Coelho
 4 | # published by PACKT Publishing
 5 | #
 6 | # It is made available under the MIT License
 7 | 
 8 | from load import load_dataset
 9 | 
10 | 
11 | def test_iris():
12 |     features, labels = load_dataset('iris')
13 |     assert len(features[0]) == 4
14 |     assert len(features)
15 |     assert len(features) == len(labels)
16 | 
17 | 
18 | def test_seeds():
19 |     features, labels = load_dataset('seeds')
20 |     assert len(features[0]) == 7
21 |     assert len(features)
22 |     assert len(features) == len(labels)
23 | 


--------------------------------------------------------------------------------
/ch02/threshold.py:
--------------------------------------------------------------------------------
 1 | # This code is supporting material for the book
 2 | # Building Machine Learning Systems with Python
 3 | # by Willi Richert and Luis Pedro Coelho
 4 | # published by PACKT Publishing
 5 | #
 6 | # It is made available under the MIT License
 7 | 
 8 | import numpy as np
 9 | 
10 | 
11 | # This function was called ``learn_model`` in the first edition
12 | def fit_model(features, labels):
13 |     '''Learn a simple threshold model'''
14 |     best_acc = -1.0
15 |     # Loop over all the features:
16 |     for fi in range(features.shape[1]):
17 |         thresh = features[:, fi].copy()
18 |         # test all feature values in order:
19 |         thresh.sort()
20 |         for t in thresh:
21 |             pred = (features[:, fi] > t)
22 | 
23 |             # Measure the accuracy of this 
24 |             acc = (pred == labels).mean()
25 | 
26 |             rev_acc = (pred == ~labels).mean()
27 |             if rev_acc > acc:
28 |                 acc = rev_acc
29 |                 reverse = True
30 |             else:
31 |                 reverse = False
32 |             if acc > best_acc:
33 |                 best_acc = acc
34 |                 best_fi = fi
35 |                 best_t = t
36 |                 best_reverse = reverse
37 | 
38 |     # A model is a threshold and an index
39 |     return best_t, best_fi, best_reverse
40 | 
41 | 
42 | # This function was called ``apply_model`` in the first edition
43 | def predict(model, features):
44 |     '''Apply a learned model'''
45 |     # A model is a pair as returned by fit_model
46 |     t, fi, reverse = model
47 |     if reverse:
48 |         return features[:, fi] <= t
49 |     else:
50 |         return features[:, fi] > t
51 | 
52 | def accuracy(features, labels, model):
53 |     '''Compute the accuracy of the model'''
54 |     preds = predict(model, features)
55 |     return np.mean(preds == labels)
56 | 


--------------------------------------------------------------------------------
/ch03/README.md:
--------------------------------------------------------------------------------
1 | Chapter 3 - Clustering - Finding Related Posts
2 | ==============================================
3 | 
4 | For this chapter you will need the '20news' dataset from
5 | http://mlcomp.org/datasets/379. To get the data you will need to
6 | register, but it is totally free. When being logged in, you will 
7 | see a ZIP download link.
8 | 


--------------------------------------------------------------------------------
/ch03/data/toy/01.txt:
--------------------------------------------------------------------------------
1 | This is a toy post about machine learning. Actually, it contains not much interesting stuff.


--------------------------------------------------------------------------------
/ch03/data/toy/02.txt:
--------------------------------------------------------------------------------
1 | Imaging databases provide storage capabilities.


--------------------------------------------------------------------------------
/ch03/data/toy/03.txt:
--------------------------------------------------------------------------------
1 | Most imaging databases save images permanently.
2 | 


--------------------------------------------------------------------------------
/ch03/data/toy/04.txt:
--------------------------------------------------------------------------------
1 | Imaging databases store data.


--------------------------------------------------------------------------------
/ch03/data/toy/05.txt:
--------------------------------------------------------------------------------
1 | Imaging databases store data. Imaging databases store data. Imaging databases store data.


--------------------------------------------------------------------------------
/ch03/noise_analysis.py:
--------------------------------------------------------------------------------
 1 | # This code is supporting material for the book
 2 | # Building Machine Learning Systems with Python
 3 | # by Willi Richert and Luis Pedro Coelho
 4 | # published by PACKT Publishing
 5 | #
 6 | # It is made available under the MIT License
 7 | 
 8 | import sklearn.datasets
 9 | 
10 | groups = [
11 |     'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware',
12 |     'comp.sys.mac.hardware', 'comp.windows.x', 'sci.space']
13 | train_data = sklearn.datasets.fetch_20newsgroups(subset="train",
14 |                                                  categories=groups)
15 | 
16 | labels = train_data.target
17 | num_clusters = 50  # sp.unique(labels).shape[0]
18 | 
19 | import nltk.stem
20 | english_stemmer = nltk.stem.SnowballStemmer('english')
21 | 
22 | from sklearn.feature_extraction.text import TfidfVectorizer
23 | 
24 | 
25 | class StemmedTfidfVectorizer(TfidfVectorizer):
26 | 
27 |     def build_analyzer(self):
28 |         analyzer = super(TfidfVectorizer, self).build_analyzer()
29 |         return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))
30 | 
31 | vectorizer = StemmedTfidfVectorizer(min_df=10, max_df=0.5,
32 |                                     stop_words='english', decode_error='ignore'
33 |                                     )
34 | vectorized = vectorizer.fit_transform(train_data.data)
35 | 
36 | post_group = zip(train_data.data, train_data.target)
37 | # Create a list of tuples that can be sorted by
38 | # the length of the posts
39 | all = [(len(post[0]), post[0], train_data.target_names[post[1]])
40 |        for post in post_group]
41 | graphics = sorted([post for post in all if post[2] == 'comp.graphics'])
42 | print(graphics[5])
43 | # (245, 'From: SITUNAYA@IBM3090.BHAM.AC.UK\nSubject: test....(sorry)\nOrganization:
44 | # The University of Birmingham, United Kingdom\nLines: 1\nNNTP-Posting-Host: ibm3090.bham.ac.uk
45 | # \n\n==============================================================================\n',
46 | # 'comp.graphics')
47 | 
48 | noise_post = graphics[5][1]
49 | 
50 | analyzer = vectorizer.build_analyzer()
51 | print(list(analyzer(noise_post)))
52 | 
53 | useful = set(analyzer(noise_post)).intersection(vectorizer.get_feature_names())
54 | print(sorted(useful))
55 | # ['ac', 'birmingham', 'host', 'kingdom', 'nntp', 'sorri', 'test', 'uk', 'unit', 'univers']
56 | 
57 | for term in sorted(useful):
58 |     print('IDF(%s)=%.2f' % (term,
59 |                             vectorizer._tfidf.idf_[vectorizer.vocabulary_[term]]))
60 | # IDF(ac)=3.51
61 | # IDF(birmingham)=6.77
62 | # IDF(host)=1.74
63 | # IDF(kingdom)=6.68
64 | # IDF(nntp)=1.77
65 | # IDF(sorri)=4.14
66 | # IDF(test)=3.83
67 | # IDF(uk)=3.70
68 | # IDF(unit)=4.42
69 | # IDF(univers)=1.91
70 | 


--------------------------------------------------------------------------------
/ch03/plot_kmeans_example.py:
--------------------------------------------------------------------------------
  1 | # This code is supporting material for the book
  2 | # Building Machine Learning Systems with Python
  3 | # by Willi Richert and Luis Pedro Coelho
  4 | # published by PACKT Publishing
  5 | #
  6 | # It is made available under the MIT License
  7 | 
  8 | # inspired by http://scikit-
  9 | # learn.org/dev/auto_examples/cluster/plot_kmeans_digits.html#example-
 10 | # cluster-plot-kmeans-digits-py
 11 | 
 12 | import os
 13 | import scipy as sp
 14 | from scipy.stats import norm
 15 | from matplotlib import pylab
 16 | from sklearn.cluster import KMeans
 17 | 
 18 | from utils import CHART_DIR
 19 | 
 20 | seed = 2
 21 | sp.random.seed(seed)  # to reproduce the data later on
 22 | 
 23 | num_clusters = 3
 24 | 
 25 | 
 26 | def plot_clustering(x, y, title, mx=None, ymax=None, xmin=None, km=None):
 27 |     pylab.figure(num=None, figsize=(8, 6))
 28 |     if km:
 29 |         pylab.scatter(x, y, s=50, c=km.predict(list(zip(x, y))))
 30 |     else:
 31 |         pylab.scatter(x, y, s=50)
 32 | 
 33 |     pylab.title(title)
 34 |     pylab.xlabel("Occurrence word 1")
 35 |     pylab.ylabel("Occurrence word 2")
 36 | 
 37 |     pylab.autoscale(tight=True)
 38 |     pylab.ylim(ymin=0, ymax=1)
 39 |     pylab.xlim(xmin=0, xmax=1)
 40 |     pylab.grid(True, linestyle='-', color='0.75')
 41 | 
 42 |     return pylab
 43 | 
 44 | 
 45 | xw1 = norm(loc=0.3, scale=.15).rvs(20)
 46 | yw1 = norm(loc=0.3, scale=.15).rvs(20)
 47 | 
 48 | xw2 = norm(loc=0.7, scale=.15).rvs(20)
 49 | yw2 = norm(loc=0.7, scale=.15).rvs(20)
 50 | 
 51 | xw3 = norm(loc=0.2, scale=.15).rvs(20)
 52 | yw3 = norm(loc=0.8, scale=.15).rvs(20)
 53 | 
 54 | x = sp.append(sp.append(xw1, xw2), xw3)
 55 | y = sp.append(sp.append(yw1, yw2), yw3)
 56 | 
 57 | i = 1
 58 | plot_clustering(x, y, "Vectors")
 59 | pylab.savefig(os.path.join(CHART_DIR, "1400_03_0%i.png" % i))
 60 | pylab.clf()
 61 | 
 62 | i += 1
 63 | 
 64 | # 1 iteration ####################
 65 | 
 66 | mx, my = sp.meshgrid(sp.arange(0, 1, 0.001), sp.arange(0, 1, 0.001))
 67 | 
 68 | km = KMeans(init='random', n_clusters=num_clusters, verbose=1,
 69 |             n_init=1, max_iter=1,
 70 |             random_state=seed)
 71 | km.fit(sp.array(list(zip(x, y))))
 72 | 
 73 | Z = km.predict(sp.c_[mx.ravel(), my.ravel()]).reshape(mx.shape)
 74 | 
 75 | plot_clustering(x, y, "Clustering iteration 1", km=km)
 76 | pylab.imshow(Z, interpolation='nearest',
 77 |              extent=(mx.min(), mx.max(), my.min(), my.max()),
 78 |              cmap=pylab.cm.Blues,
 79 |              aspect='auto', origin='lower')
 80 | 
 81 | c1a, c1b, c1c = km.cluster_centers_
 82 | pylab.scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1],
 83 |               marker='x', linewidth=2, s=100, color='black')
 84 | pylab.savefig(os.path.join(CHART_DIR, "1400_03_0%i.png" % i))
 85 | pylab.clf()
 86 | 
 87 | i += 1
 88 | 
 89 | # 2 iterations ####################
 90 | km = KMeans(init='random', n_clusters=num_clusters, verbose=1,
 91 |             n_init=1, max_iter=2,
 92 |             random_state=seed)
 93 | km.fit(sp.array(list(zip(x, y))))
 94 | 
 95 | Z = km.predict(sp.c_[mx.ravel(), my.ravel()]).reshape(mx.shape)
 96 | 
 97 | plot_clustering(x, y, "Clustering iteration 2", km=km)
 98 | pylab.imshow(Z, interpolation='nearest',
 99 |              extent=(mx.min(), mx.max(), my.min(), my.max()),
100 |              cmap=pylab.cm.Blues,
101 |              aspect='auto', origin='lower')
102 | 
103 | c2a, c2b, c2c = km.cluster_centers_
104 | pylab.scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1],
105 |               marker='x', linewidth=2, s=100, color='black')
106 | 
107 | pylab.gca().add_patch(
108 |     pylab.Arrow(c1a[0], c1a[1], c2a[0] - c1a[0], c2a[1] - c1a[1], width=0.1))
109 | pylab.gca().add_patch(
110 |     pylab.Arrow(c1b[0], c1b[1], c2b[0] - c1b[0], c2b[1] - c1b[1], width=0.1))
111 | pylab.gca().add_patch(
112 |     pylab.Arrow(c1c[0], c1c[1], c2c[0] - c1c[0], c2c[1] - c1c[1], width=0.1))
113 | 
114 | pylab.savefig(os.path.join(CHART_DIR, "1400_03_0%i.png" % i))
115 | pylab.clf()
116 | 
117 | i += 1
118 | 
119 | # 3 iterations ####################
120 | km = KMeans(init='random', n_clusters=num_clusters, verbose=1,
121 |             n_init=1, max_iter=10,
122 |             random_state=seed)
123 | km.fit(sp.array(list(zip(x, y))))
124 | 
125 | Z = km.predict(sp.c_[mx.ravel(), my.ravel()]).reshape(mx.shape)
126 | 
127 | plot_clustering(x, y, "Clustering iteration 10", km=km)
128 | pylab.imshow(Z, interpolation='nearest',
129 |              extent=(mx.min(), mx.max(), my.min(), my.max()),
130 |              cmap=pylab.cm.Blues,
131 |              aspect='auto', origin='lower')
132 | 
133 | pylab.scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1],
134 |               marker='x', linewidth=2, s=100, color='black')
135 | pylab.savefig(os.path.join(CHART_DIR, "1400_03_0%i.png" % i))
136 | pylab.clf()
137 | 
138 | i += 1
139 | 


--------------------------------------------------------------------------------
/ch03/rel_post_01.py:
--------------------------------------------------------------------------------
 1 | # This code is supporting material for the book
 2 | # Building Machine Learning Systems with Python
 3 | # by Willi Richert and Luis Pedro Coelho
 4 | # published by PACKT Publishing
 5 | #
 6 | # It is made available under the MIT License
 7 | 
 8 | import os
 9 | import sys
10 | 
11 | import scipy as sp
12 | 
13 | from sklearn.feature_extraction.text import CountVectorizer
14 | 
15 | from utils import DATA_DIR
16 | 
17 | TOY_DIR = os.path.join(DATA_DIR, "toy")
18 | posts = [open(os.path.join(TOY_DIR, f)).read() for f in os.listdir(TOY_DIR)]
19 | 
20 | new_post = "imaging databases"
21 | 
22 | import nltk.stem
23 | english_stemmer = nltk.stem.SnowballStemmer('english')
24 | 
25 | 
26 | class StemmedCountVectorizer(CountVectorizer):
27 | 
28 |     def build_analyzer(self):
29 |         analyzer = super(StemmedCountVectorizer, self).build_analyzer()
30 |         return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))
31 | 
32 | # vectorizer = CountVectorizer(min_df=1, stop_words='english',
33 | # preprocessor=stemmer)
34 | vectorizer = StemmedCountVectorizer(min_df=1, stop_words='english')
35 | 
36 | from sklearn.feature_extraction.text import TfidfVectorizer
37 | 
38 | 
39 | class StemmedTfidfVectorizer(TfidfVectorizer):
40 | 
41 |     def build_analyzer(self):
42 |         analyzer = super(StemmedTfidfVectorizer, self).build_analyzer()
43 |         return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))
44 | 
45 | vectorizer = StemmedTfidfVectorizer(
46 |     min_df=1, stop_words='english', decode_error='ignore')
47 | 
48 | X_train = vectorizer.fit_transform(posts)
49 | 
50 | num_samples, num_features = X_train.shape
51 | print("#samples: %d, #features: %d" % (num_samples, num_features))
52 | 
53 | new_post_vec = vectorizer.transform([new_post])
54 | print(new_post_vec, type(new_post_vec))
55 | print(new_post_vec.toarray())
56 | print(vectorizer.get_feature_names())
57 | 
58 | 
59 | def dist_raw(v1, v2):
60 |     delta = v1 - v2
61 |     return sp.linalg.norm(delta.toarray())
62 | 
63 | 
64 | def dist_norm(v1, v2):
65 |     v1_normalized = v1 / sp.linalg.norm(v1.toarray())
66 |     v2_normalized = v2 / sp.linalg.norm(v2.toarray())
67 | 
68 |     delta = v1_normalized - v2_normalized
69 | 
70 |     return sp.linalg.norm(delta.toarray())
71 | 
72 | dist = dist_norm
73 | 
74 | best_dist = sys.maxsize
75 | best_i = None
76 | 
77 | for i in range(0, num_samples):
78 |     post = posts[i]
79 |     if post == new_post:
80 |         continue
81 |     post_vec = X_train.getrow(i)
82 |     d = dist(post_vec, new_post_vec)
83 | 
84 |     print("=== Post %i with dist=%.2f: %s" % (i, d, post))
85 | 
86 |     if d < best_dist:
87 |         best_dist = d
88 |         best_i = i
89 | 
90 | print("Best post is %i with dist=%.2f" % (best_i, best_dist))
91 | 


--------------------------------------------------------------------------------
/ch03/rel_post_20news.py:
--------------------------------------------------------------------------------
  1 | # This code is supporting material for the book
  2 | # Building Machine Learning Systems with Python
  3 | # by Willi Richert and Luis Pedro Coelho
  4 | # published by PACKT Publishing
  5 | #
  6 | # It is made available under the MIT License
  7 | 
  8 | import sklearn.datasets
  9 | import scipy as sp
 10 | 
 11 | new_post = \
 12 |     """Disk drive problems. Hi, I have a problem with my hard disk.
 13 | After 1 year it is working only sporadically now.
 14 | I tried to format it, but now it doesn't boot any more.
 15 | Any ideas? Thanks.
 16 | """
 17 | 
 18 | print("""\
 19 | Dear reader of the 1st edition of 'Building Machine Learning Systems with Python'!
 20 | For the 2nd edition we introduced a couple of changes that will result into
 21 | results that differ from the results in the 1st edition.
 22 | E.g. we now fully rely on scikit's fetch_20newsgroups() instead of requiring
 23 | you to download the data manually from MLCOMP.
 24 | If you have any questions, please ask at http://www.twotoreal.com
 25 | """)
 26 | 
 27 | all_data = sklearn.datasets.fetch_20newsgroups(subset="all")
 28 | print("Number of total posts: %i" % len(all_data.filenames))
 29 | # Number of total posts: 18846
 30 | 
 31 | groups = [
 32 |     'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware',
 33 |     'comp.sys.mac.hardware', 'comp.windows.x', 'sci.space']
 34 | train_data = sklearn.datasets.fetch_20newsgroups(subset="train",
 35 |                                                  categories=groups)
 36 | print("Number of training posts in tech groups:", len(train_data.filenames))
 37 | # Number of training posts in tech groups: 3529
 38 | 
 39 | labels = train_data.target
 40 | num_clusters = 50  # sp.unique(labels).shape[0]
 41 | 
 42 | import nltk.stem
 43 | english_stemmer = nltk.stem.SnowballStemmer('english')
 44 | 
 45 | from sklearn.feature_extraction.text import TfidfVectorizer
 46 | 
 47 | 
 48 | class StemmedTfidfVectorizer(TfidfVectorizer):
 49 | 
 50 |     def build_analyzer(self):
 51 |         analyzer = super(TfidfVectorizer, self).build_analyzer()
 52 |         return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))
 53 | 
 54 | vectorizer = StemmedTfidfVectorizer(min_df=10, max_df=0.5,
 55 |                                     stop_words='english', decode_error='ignore'
 56 |                                     )
 57 | 
 58 | vectorized = vectorizer.fit_transform(train_data.data)
 59 | num_samples, num_features = vectorized.shape
 60 | print("#samples: %d, #features: %d" % (num_samples, num_features))
 61 | # samples: 3529, #features: 4712
 62 | 
 63 | from sklearn.cluster import KMeans
 64 | 
 65 | km = KMeans(n_clusters=num_clusters, n_init=1, verbose=1, random_state=3)
 66 | clustered = km.fit(vectorized)
 67 | 
 68 | print("km.labels_=%s" % km.labels_)
 69 | # km.labels_=[ 6 34 22 ...,  2 21 26]
 70 | 
 71 | print("km.labels_.shape=%s" % km.labels_.shape)
 72 | # km.labels_.shape=3529
 73 | 
 74 | from sklearn import metrics
 75 | print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
 76 | # Homogeneity: 0.400
 77 | print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
 78 | # Completeness: 0.206
 79 | print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
 80 | # V-measure: 0.272
 81 | print("Adjusted Rand Index: %0.3f" %
 82 |       metrics.adjusted_rand_score(labels, km.labels_))
 83 | # Adjusted Rand Index: 0.064
 84 | print("Adjusted Mutual Information: %0.3f" %
 85 |       metrics.adjusted_mutual_info_score(labels, km.labels_))
 86 | # Adjusted Mutual Information: 0.197
 87 | print(("Silhouette Coefficient: %0.3f" %
 88 |        metrics.silhouette_score(vectorized, labels, sample_size=1000)))
 89 | # Silhouette Coefficient: 0.006
 90 | 
 91 | new_post_vec = vectorizer.transform([new_post])
 92 | new_post_label = km.predict(new_post_vec)[0]
 93 | 
 94 | similar_indices = (km.labels_ == new_post_label).nonzero()[0]
 95 | 
 96 | similar = []
 97 | for i in similar_indices:
 98 |     dist = sp.linalg.norm((new_post_vec - vectorized[i]).toarray())
 99 |     similar.append((dist, train_data.data[i]))
100 | 
101 | similar = sorted(similar)
102 | print("Count similar: %i" % len(similar))
103 | 
104 | show_at_1 = similar[0]
105 | show_at_2 = similar[int(len(similar) / 10)]
106 | show_at_3 = similar[int(len(similar) / 2)]
107 | 
108 | print("=== #1 ===")
109 | print(show_at_1)
110 | print()
111 | 
112 | print("=== #2 ===")
113 | print(show_at_2)
114 | print()
115 | 
116 | print("=== #3 ===")
117 | print(show_at_3)
118 | 


--------------------------------------------------------------------------------
/ch03/tfidf.py:
--------------------------------------------------------------------------------
 1 | # This code is supporting material for the book
 2 | # Building Machine Learning Systems with Python
 3 | # by Willi Richert and Luis Pedro Coelho
 4 | # published by PACKT Publishing
 5 | #
 6 | # It is made available under the MIT License
 7 | 
 8 | import scipy as sp
 9 | 
10 | 
11 | def tfidf(t, d, D):
12 |     tf = float(d.count(t)) / sum(d.count(w) for w in set(d))
13 |     idf = sp.log(float(len(D)) / (len([doc for doc in D if t in doc])))
14 |     return tf * idf
15 | 
16 | 
17 | a, abb, abc = ["a"], ["a", "b", "b"], ["a", "b", "c"]
18 | D = [a, abb, abc]
19 | 
20 | print(tfidf("a", a, D))
21 | print(tfidf("b", abb, D))
22 | print(tfidf("a", abc, D))
23 | print(tfidf("b", abc, D))
24 | print(tfidf("c", abc, D))
25 | 


--------------------------------------------------------------------------------
/ch03/utils.py:
--------------------------------------------------------------------------------
 1 | # This code is supporting material for the book
 2 | # Building Machine Learning Systems with Python
 3 | # by Willi Richert and Luis Pedro Coelho
 4 | # published by PACKT Publishing
 5 | #
 6 | # It is made available under the MIT License
 7 | 
 8 | import os
 9 | import sys
10 | 
11 | DATA_DIR = os.path.join(
12 |     os.path.dirname(os.path.realpath(__file__)), "data")
13 | 
14 | if not os.path.exists(DATA_DIR):
15 |     print("Uh, we were expecting a data directory, which contains the toy data")
16 |     sys.exit(1)
17 | 
18 | CHART_DIR = os.path.join(
19 |     os.path.dirname(os.path.realpath(__file__)), "charts")
20 | if not os.path.exists(CHART_DIR):
21 |     os.mkdir(CHART_DIR)
22 | 
23 | 


--------------------------------------------------------------------------------
/ch04/.gitignore:
--------------------------------------------------------------------------------
1 | wiki_lda.pkl
2 | wiki_lda.pkl.state
3 | *.png
4 | *.npy
5 | *.pkl
6 | topics.txt
7 | 


--------------------------------------------------------------------------------
/ch04/README.rst:
--------------------------------------------------------------------------------
 1 | =========
 2 | Chapter 4
 3 | =========
 4 | 
 5 | Support code for *Chapter 4: Topic Modeling*
 6 | 
 7 | 
 8 | AP Data
 9 | -------
10 | 
11 | To download the AP data, use the ``download_ap.sh`` script inside the ``data``
12 | directory::
13 | 
14 |     cd data
15 |     ./download_ap.sh
16 | 
17 | Word cloud creation
18 | -------------------
19 | 
20 | Word cloud creation requires that ``pytagcloud`` be installed (in turn, this
21 | requires ``pygame``). Since this is not an essential part of the chapter, the
22 | code will work even if you have not installed it (naturally, the cloud image
23 | will not be generated and a warning will be printed).
24 | 
25 | 
26 | Wikipedia processing
27 | --------------------
28 | 
29 | You will need **a lot of disk space**. The download of the Wikipedia text is
30 | 11GB and preprocessing it takes another 24GB to save it in the intermediate
31 | format that gensim uses for a total of 34GB!
32 | 
33 | Run the following two commands inside the ``data/`` directory::
34 | 
35 |     ./download_wp.sh
36 |     ./preprocess-wikidata.sh
37 | 
38 | As the filenames indicate, the first step will download the data and the second
39 | one will preprocess it. Preprocessing can take several hours, but it is
40 | feasible to run it on a modern laptop. Once the second step is finished, you
41 | may remove the input file if you want to save disk space
42 | (``data/enwiki-latest-pages-articles.xml.bz2``).
43 | 
44 | To generate the model, you can run the ``wikitopics_create.py`` script, while
45 | the ``wikitopics_plot.py`` script will plot the most heavily discussed topic as
46 | well as the least heavily discussed one. The code is split into steps as the
47 | first one can take a very long time. Then it saves the results so that you can
48 | later explore them at leisure.
49 | 
50 | You should not expect that your results will exactly match the results in the
51 | book, for two reasons:
52 | 
53 | 1. The LDA algorithm is a probabilistic algorithm and can give different
54 |    results every time it is run.
55 | 2. Wikipedia keeps changing. Thus, even your input data will be different.
56 | 
57 | Scripts
58 | -------
59 | 
60 | blei_lda.py
61 |     Computes LDA using the AP Corpus.
62 | wikitopics_create.py
63 |     Create the topic model for Wikipedia using LDA (must download wikipedia database first)
64 | wikitopics_create_hdp.py
65 |     Create the topic model for Wikipedia using HDP (must download wikipedia database first)
66 | 


--------------------------------------------------------------------------------
/ch04/blei_lda.py:
--------------------------------------------------------------------------------
 1 | # This code is supporting material for the book
 2 | # Building Machine Learning Systems with Python
 3 | # by Willi Richert and Luis Pedro Coelho
 4 | # published by PACKT Publishing
 5 | #
 6 | # It is made available under the MIT License
 7 | 
 8 | from __future__ import print_function
 9 | from wordcloud import create_cloud
10 | try:
11 |     from gensim import corpora, models, matutils
12 | except:
13 |     print("import gensim failed.")
14 |     print()
15 |     print("Please install it")
16 |     raise
17 | 
18 | import matplotlib.pyplot as plt
19 | import numpy as np
20 | from os import path
21 | 
22 | NUM_TOPICS = 100
23 | 
24 | # Check that data exists
25 | if not path.exists('./data/ap/ap.dat'):
26 |     print('Error: Expected data to be present at data/ap/')
27 |     print('Please cd into ./data & run ./download_ap.sh')
28 | 
29 | # Load the data
30 | corpus = corpora.BleiCorpus('./data/ap/ap.dat', './data/ap/vocab.txt')
31 | 
32 | # Build the topic model
33 | model = models.ldamodel.LdaModel(
34 |     corpus, num_topics=NUM_TOPICS, id2word=corpus.id2word, alpha=None)
35 | 
36 | # Iterate over all the topics in the model
37 | for ti in range(model.num_topics):
38 |     words = model.show_topic(ti, 64)
39 |     tf = sum(f for _, f in words)
40 |     with open('topics.txt', 'w') as output:
41 |         output.write('\n'.join('{}:{}'.format(w, int(1000. * f / tf)) for w, f in words))
42 |         output.write("\n\n\n")
43 | 
44 | # We first identify the most discussed topic, i.e., the one with the
45 | # highest total weight
46 | 
47 | topics = matutils.corpus2dense(model[corpus], num_terms=model.num_topics)
48 | weight = topics.sum(1)
49 | max_topic = weight.argmax()
50 | 
51 | 
52 | # Get the top 64 words for this topic
53 | # Without the argument, show_topic would return only 10 words
54 | words = model.show_topic(max_topic, 64)
55 | 
56 | # This function will actually check for the presence of pytagcloud and is otherwise a no-op
57 | create_cloud('cloud_blei_lda.png', words)
58 | 
59 | num_topics_used = [len(model[doc]) for doc in corpus]
60 | fig,ax = plt.subplots()
61 | ax.hist(num_topics_used, np.arange(42))
62 | ax.set_ylabel('Nr of documents')
63 | ax.set_xlabel('Nr of topics')
64 | fig.tight_layout()
65 | fig.savefig('Figure_04_01.png')
66 | 
67 | 
68 | # Now, repeat the same exercise using alpha=1.0
69 | # You can edit the constant below to play around with this parameter
70 | ALPHA = 1.0
71 | 
72 | model1 = models.ldamodel.LdaModel(
73 |     corpus, num_topics=NUM_TOPICS, id2word=corpus.id2word, alpha=ALPHA)
74 | num_topics_used1 = [len(model1[doc]) for doc in corpus]
75 | 
76 | fig,ax = plt.subplots()
77 | ax.hist([num_topics_used, num_topics_used1], np.arange(42))
78 | ax.set_ylabel('Nr of documents')
79 | ax.set_xlabel('Nr of topics')
80 | 
81 | # The coordinates below were fit by trial and error to look good
82 | ax.text(9, 223, r'default alpha')
83 | ax.text(26, 156, 'alpha=1.0')
84 | fig.tight_layout()
85 | fig.savefig('Figure_04_02.png')
86 | 
87 | 


--------------------------------------------------------------------------------
/ch04/build_lda.py:
--------------------------------------------------------------------------------
 1 | # This code is supporting material for the book
 2 | # Building Machine Learning Systems with Python
 3 | # by Willi Richert and Luis Pedro Coelho
 4 | # published by PACKT Publishing
 5 | #
 6 | # It is made available under the MIT License
 7 | from __future__ import print_function
 8 | 
 9 | try:
10 |     import nltk.corpus
11 | except ImportError:
12 |     print("nltk not found")
13 |     print("please install it")
14 |     raise
15 | from scipy.spatial import distance
16 | import numpy as np
17 | from gensim import corpora, models
18 | import sklearn.datasets
19 | import nltk.stem
20 | from collections import defaultdict
21 | 
22 | english_stemmer = nltk.stem.SnowballStemmer('english')
23 | stopwords = set(nltk.corpus.stopwords.words('english'))
24 | stopwords.update(['from:', 'subject:', 'writes:', 'writes'])
25 | 
26 | 
27 | class DirectText(corpora.textcorpus.TextCorpus):
28 | 
29 |     def get_texts(self):
30 |         return self.input
31 | 
32 |     def __len__(self):
33 |         return len(self.input)
34 | try:
35 |     dataset = sklearn.datasets.load_mlcomp("20news-18828", "train",
36 |                                        mlcomp_root='./data')
37 | except:
38 |     print("Newsgroup data not found.")
39 |     print("Please download from http://mlcomp.org/datasets/379")
40 |     print("And expand the zip into the subdirectory data/")
41 |     print()
42 |     print()
43 |     raise
44 | 
45 | otexts = dataset.data
46 | texts = dataset.data
47 | 
48 | texts = [t.decode('utf-8', 'ignore') for t in texts]
49 | texts = [t.split() for t in texts]
50 | texts = [map(lambda w: w.lower(), t) for t in texts]
51 | texts = [filter(lambda s: not len(set("+-.?!()>@012345689") & set(s)), t)
52 |          for t in texts]
53 | texts = [filter(lambda s: (len(s) > 3) and (s not in stopwords), t)
54 |          for t in texts]
55 | texts = [map(english_stemmer.stem, t) for t in texts]
56 | usage = defaultdict(int)
57 | for t in texts:
58 |     for w in set(t):
59 |         usage[w] += 1
60 | limit = len(texts) / 10
61 | too_common = [w for w in usage if usage[w] > limit]
62 | too_common = set(too_common)
63 | texts = [filter(lambda s: s not in too_common, t) for t in texts]
64 | 
65 | corpus = DirectText(texts)
66 | dictionary = corpus.dictionary
67 | try:
68 |     dictionary['computer']
69 | except:
70 |     pass
71 | 
72 | model = models.ldamodel.LdaModel(
73 |     corpus, num_topics=100, id2word=dictionary.id2token)
74 | 
75 | thetas = np.zeros((len(texts), 100))
76 | for i, c in enumerate(corpus):
77 |     for ti, v in model[c]:
78 |         thetas[i, ti] += v
79 | 
80 | distances = distance.squareform(distance.pdist(thetas))
81 | large = distances.max() + 1
82 | for i in range(len(distances)):
83 |     distances[i, i] = large
84 | 
85 | print(otexts[1])
86 | print()
87 | print()
88 | print()
89 | print(otexts[distances[1].argmin()])
90 | 


--------------------------------------------------------------------------------
/ch04/data/.gitignore:
--------------------------------------------------------------------------------
 1 | ap.tgz
 2 | ap/
 3 | dataset-379-20news-18828_HJRZF.zip
 4 | 379/
 5 | enwiki-latest-pages-articles.xml.bz2
 6 | wiki_en_output_bow.mm
 7 | wiki_en_output_bow.mm.gz
 8 | wiki_en_output_bow.mm.index
 9 | wiki_en_output_tfidf.mm
10 | wiki_en_output_tfidf.mm.gz
11 | wiki_en_output_tfidf.mm.index
12 | wiki_en_output_wordids.txt.bz2
13 | 


--------------------------------------------------------------------------------
/ch04/data/download_ap.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | wget http://www.cs.columbia.edu/~blei/lda-c/ap.tgz
3 | tar xzf ap.tgz
4 | 


--------------------------------------------------------------------------------
/ch04/data/download_wp.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | wget http://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2
3 | 


--------------------------------------------------------------------------------
/ch04/data/preprocess-wikidata.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | 
3 | python -m gensim.scripts.make_wiki enwiki-latest-pages-articles.xml.bz2 wiki_en_output
4 | 


--------------------------------------------------------------------------------
/ch04/wikitopics_create.py:
--------------------------------------------------------------------------------
 1 | # This code is supporting material for the book
 2 | # Building Machine Learning Systems with Python
 3 | # by Willi Richert and Luis Pedro Coelho
 4 | # published by PACKT Publishing
 5 | #
 6 | # It is made available under the MIT License
 7 | 
 8 | from __future__ import print_function
 9 | import logging
10 | import gensim
11 | import numpy as np
12 | 
13 | NR_OF_TOPICS = 100
14 | 
15 | # Set up logging in order to get progress information as the model is being built:
16 | logging.basicConfig(
17 |     format='%(asctime)s : %(levelname)s : %(message)s',
18 |     level=logging.INFO)
19 | 
20 | # Load the preprocessed corpus (id2word & mm):
21 | id2word = gensim.corpora.Dictionary.load_from_text(
22 |     'data/wiki_en_output_wordids.txt.bz2')
23 | mm = gensim.corpora.MmCorpus('data/wiki_en_output_tfidf.mm')
24 | 
25 | # Calling the constructor is enough to build the model
26 | # This call will take a few hours!
27 | model = gensim.models.ldamodel.LdaModel(
28 |     corpus=mm,
29 |     id2word=id2word,
30 |     num_topics=NR_OF_TOPICS,
31 |     update_every=1,
32 |     chunksize=10000,
33 |     passes=1)
34 | 
35 | # Save the model so we do not need to learn it again.
36 | model.save('wiki_lda.pkl')
37 | 
38 | # Compute the document/topic matrix
39 | topics = np.zeros((len(mm), model.num_topics))
40 | for di,doc in enumerate(mm):
41 |     doc_top = model[doc]
42 |     for ti,tv in doc_top:
43 |         topics[di,ti] += tv
44 | np.save('topics.npy', topics)
45 | 
46 | # Alternatively, we create a sparse matrix and save that. This alternative
47 | # saves disk space, at the cost of slightly more complex code:
48 | 
49 | ## from scipy import sparse, io
50 | ## sp = sparse.csr_matrix(topics)
51 | ## io.savemat('topics.mat', {'topics': sp})
52 | 


--------------------------------------------------------------------------------
/ch04/wikitopics_create_hdp.py:
--------------------------------------------------------------------------------
 1 | # This code is supporting material for the book
 2 | # Building Machine Learning Systems with Python
 3 | # by Willi Richert and Luis Pedro Coelho
 4 | # published by PACKT Publishing
 5 | #
 6 | # It is made available under the MIT License
 7 | 
 8 | from __future__ import print_function
 9 | import logging
10 | import gensim
11 | import numpy as np
12 | 
13 | # Set up logging in order to get progress information as the model is being built:
14 | logging.basicConfig(
15 |     format='%(asctime)s : %(levelname)s : %(message)s',
16 |     level=logging.INFO)
17 | 
18 | # Load the preprocessed corpus (id2word & mm):
19 | id2word = gensim.corpora.Dictionary.load_from_text(
20 |     'data/wiki_en_output_wordids.txt.bz2')
21 | mm = gensim.corpora.MmCorpus('data/wiki_en_output_tfidf.mm')
22 | 
23 | # Calling the constructor is enough to build the model
24 | # This call will take a few hours!
25 | model = gensim.models.hdpmodel.HdpModel(
26 |     corpus=mm,
27 |     id2word=id2word,
28 |     chunksize=10000)
29 | 
30 | # Save the model so we do not need to learn it again.
31 | model.save('wiki_hdp.pkl')
32 | 
33 | # Compute the document/topic matrix
34 | topics = np.zeros((len(mm), model.num_topics))
35 | for di,doc in enumerate(mm):
36 |     doc_top = model[doc]
37 |     for ti,tv in doc_top:
38 |         topics[di,ti] += tv
39 | np.save('topics_hdp.npy', topics)
40 | 


--------------------------------------------------------------------------------
/ch04/wikitopics_plot.py:
--------------------------------------------------------------------------------
 1 | # This code is supporting material for the book
 2 | # Building Machine Learning Systems with Python
 3 | # by Willi Richert and Luis Pedro Coelho
 4 | # published by PACKT Publishing
 5 | #
 6 | # It is made available under the MIT License
 7 | 
 8 | from __future__ import print_function
 9 | import numpy as np
10 | import gensim
11 | from os import path
12 | from wordcloud import create_cloud
13 | 
14 | if not path.exists('wiki_lda.pkl'):
15 |     import sys
16 |     sys.stderr.write('''\
17 | This script must be run after wikitopics_create.py!
18 | 
19 | That script creates and saves the LDA model (this must onlly be done once).
20 | This script is responsible for the analysis.''')
21 |     sys.exit(1)
22 | 
23 | # Load the preprocessed Wikipedia corpus (id2word and mm)
24 | id2word = gensim.corpora.Dictionary.load_from_text(
25 |     'data/wiki_en_output_wordids.txt.bz2')
26 | mm = gensim.corpora.MmCorpus('data/wiki_en_output_tfidf.mm')
27 | 
28 | # Load the precomputed model
29 | model = gensim.models.ldamodel.LdaModel.load('wiki_lda.pkl')
30 | 
31 | topics = np.load('topics.npy', mmap_mode='r')
32 | 
33 | # Compute the number of topics mentioned in each document
34 | lens = (topics > 0).sum(axis=1)
35 | print('Mean number of topics mentioned: {0:.3}'.format(np.mean(lens)))
36 | print('Percentage of articles mentioning less than 10 topics: {0:.1%}'.format(np.mean(lens <= 10)))
37 | 
38 | # Weights will be the total weight of each topic
39 | weights = topics.sum(0)
40 | 
41 | # Retrieve the most heavily used topic and plot it as a word cloud:
42 | words = model.show_topic(weights.argmax(), 64)
43 | 
44 | # The parameter ``maxsize`` often needs some manual tuning to make it look nice.
45 | create_cloud('Wikipedia_most.png', words, maxsize=250, fontname='Cardo')
46 | 
47 | fraction_mention = np.mean(topics[:,weights.argmax()] > 0)
48 | print("The most mentioned topics is mentioned in {:.1%} of documents.".format(fraction_mention))
49 | total_weight = np.mean(topics[:,weights.argmax()])
50 | print("It represents {:.1%} of the total number of words.".format(total_weight))
51 | print()
52 | print()
53 | print()
54 | 
55 | # Retrieve the **least** heavily used topic and plot it as a word cloud:
56 | words = model.show_topic(weights.argmin(), 64)
57 | create_cloud('Wikipedia_least.png', words, maxsize=150, fontname='Cardo')
58 | fraction_mention = np.mean(topics[:,weights.argmin()] > 0)
59 | print("The least mentioned topics is mentioned in {:.1%} of documents.".format(fraction_mention))
60 | total_weight = np.mean(topics[:,weights.argmin()])
61 | print("It represents {:.1%} of the total number of words.".format(total_weight))
62 | print()
63 | print()
64 | print()
65 | 


--------------------------------------------------------------------------------
/ch04/wordcloud.py:
--------------------------------------------------------------------------------
 1 | from __future__ import print_function
 2 | warned_of_error = False
 3 | 
 4 | def create_cloud(oname, words,maxsize=120, fontname='Lobster'):
 5 |     '''Creates a word cloud (when pytagcloud is installed)
 6 | 
 7 |     Parameters
 8 |     ----------
 9 |     oname : output filename
10 |     words : list of (value,str)
11 |     maxsize : int, optional
12 |         Size of maximum word. The best setting for this parameter will often
13 |         require some manual tuning for each input.
14 |     fontname : str, optional
15 |         Font to use.
16 |     '''
17 |     try:
18 |         from pytagcloud import create_tag_image, make_tags
19 |     except ImportError:
20 |         if not warned_of_error:
21 |             print("Could not import pytagcloud. Skipping cloud generation")
22 |         return
23 | 
24 |     # gensim returns a weight between 0 and 1 for each word, while pytagcloud
25 |     # expects an integer word count. So, we multiply by a large number and
26 |     # round. For a visualization this is an adequate approximation.
27 |     words = [(w,int(v*10000)) for w,v in words]
28 |     tags = make_tags(words, maxsize=maxsize)
29 |     create_tag_image(tags, oname, size=(1800, 1200), fontname=fontname)
30 | 


--------------------------------------------------------------------------------
/ch05/README.md:
--------------------------------------------------------------------------------
 1 | Chapter 5 - Classification - Detecting Poor Answers
 2 | ===================================================
 3 | 
 4 | The book chapter is based on StackExchange's data blob from August 2012 for the first edition. 
 5 | 
 6 | After publishing the book, StackExchange released the May 2014 version at
 7 | [https://archive.org/download/stackexchange/stackexchange_archive.torrent](https://archive.org/download/stackexchange/stackexchange_archive.torrent).
 8 | 
 9 | Note that using the latest version, you will get slightly different results.
10 | 
11 | The code is using pyenchant for spell correction. Pyenchent is only used to increase your pleasure of eperimenting with additional features. It is not used later on the chapter. So, if you find out that your platform poses too big problems to install it (e.g. on 64bit Windows), don't bother.
12 | 


--------------------------------------------------------------------------------
/ch05/data.py:
--------------------------------------------------------------------------------
 1 | # This code is supporting material for the book
 2 | # Building Machine Learning Systems with Python
 3 | # by Willi Richert and Luis Pedro Coelho
 4 | # published by PACKT Publishing
 5 | #
 6 | # It is made available under the MIT License
 7 | 
 8 | import os
 9 | 
10 | DATA_DIR = "data"  # put your posts-2012.xml into this directory
11 | CHART_DIR = "charts"
12 | 
13 | filtered = os.path.join(DATA_DIR, "filtered.tsv")
14 | filtered_meta = os.path.join(DATA_DIR, "filtered-meta.json")
15 | 
16 | chosen = os.path.join(DATA_DIR, "chosen.tsv")
17 | chosen_meta = os.path.join(DATA_DIR, "chosen-meta.json")
18 | 


--------------------------------------------------------------------------------
/ch05/log_reg_example.py:
--------------------------------------------------------------------------------
  1 | # This code is supporting material for the book
  2 | # Building Machine Learning Systems with Python
  3 | # by Willi Richert and Luis Pedro Coelho
  4 | # published by PACKT Publishing
  5 | #
  6 | # It is made available under the MIT License
  7 | 
  8 | import os
  9 | from data import CHART_DIR
 10 | 
 11 | import numpy as np
 12 | from scipy.stats import norm
 13 | 
 14 | from matplotlib import pyplot
 15 | np.random.seed(3)
 16 | 
 17 | num_per_class = 40
 18 | X = np.hstack((norm.rvs(2, size=num_per_class, scale=2),
 19 |               norm.rvs(8, size=num_per_class, scale=3)))
 20 | y = np.hstack((np.zeros(num_per_class),
 21 |                np.ones(num_per_class)))
 22 | 
 23 | 
 24 | def lr_model(clf, X):
 25 |     return 1.0 / (1.0 + np.exp(-(clf.intercept_ + clf.coef_ * X)))
 26 | 
 27 | from sklearn.linear_model import LogisticRegression
 28 | logclf = LogisticRegression()
 29 | print(logclf)
 30 | logclf.fit(X.reshape(num_per_class * 2, 1), y)
 31 | print(np.exp(logclf.intercept_), np.exp(logclf.coef_.ravel()))
 32 | print("P(x=-1)=%.2f\tP(x=7)=%.2f" %
 33 |       (lr_model(logclf, -1), lr_model(logclf, 7)))
 34 | X_test = np.arange(-5, 20, 0.1)
 35 | pyplot.figure(figsize=(10, 4))
 36 | pyplot.xlim((-5, 20))
 37 | pyplot.scatter(X, y, c=y)
 38 | pyplot.xlabel("feature value")
 39 | pyplot.ylabel("class")
 40 | pyplot.grid(True, linestyle='-', color='0.75')
 41 | pyplot.savefig(
 42 |     os.path.join(CHART_DIR, "log_reg_example_data.png"), bbox_inches="tight")
 43 | 
 44 | 
 45 | def lin_model(clf, X):
 46 |     return clf.intercept_ + clf.coef_ * X
 47 | 
 48 | from sklearn.linear_model import LinearRegression
 49 | clf = LinearRegression()
 50 | print(clf)
 51 | clf.fit(X.reshape(num_per_class * 2, 1), y)
 52 | X_odds = np.arange(0, 1, 0.001)
 53 | pyplot.figure(figsize=(10, 4))
 54 | pyplot.subplot(1, 2, 1)
 55 | pyplot.scatter(X, y, c=y)
 56 | pyplot.plot(X_test, lin_model(clf, X_test))
 57 | pyplot.xlabel("feature value")
 58 | pyplot.ylabel("class")
 59 | pyplot.title("linear fit on original data")
 60 | pyplot.grid(True, linestyle='-', color='0.75')
 61 | 
 62 | X_ext = np.hstack((X, norm.rvs(20, size=100, scale=5)))
 63 | y_ext = np.hstack((y, np.ones(100)))
 64 | clf = LinearRegression()
 65 | clf.fit(X_ext.reshape(num_per_class * 2 + 100, 1), y_ext)
 66 | pyplot.subplot(1, 2, 2)
 67 | pyplot.scatter(X_ext, y_ext, c=y_ext)
 68 | pyplot.plot(X_ext, lin_model(clf, X_ext))
 69 | pyplot.xlabel("feature value")
 70 | pyplot.ylabel("class")
 71 | pyplot.title("linear fit on additional data")
 72 | pyplot.grid(True, linestyle='-', color='0.75')
 73 | pyplot.savefig(
 74 |     os.path.join(CHART_DIR, "log_reg_log_linear_fit.png"), bbox_inches="tight")
 75 | 
 76 | pyplot.figure(figsize=(10, 4))
 77 | pyplot.xlim((-5, 20))
 78 | pyplot.scatter(X, y, c=y)
 79 | pyplot.plot(X_test, lr_model(logclf, X_test).ravel())
 80 | pyplot.plot(X_test, np.ones(X_test.shape[0]) * 0.5, "--")
 81 | pyplot.xlabel("feature value")
 82 | pyplot.ylabel("class")
 83 | pyplot.grid(True, linestyle='-', color='0.75')
 84 | pyplot.savefig(
 85 |     os.path.join(CHART_DIR, "log_reg_example_fitted.png"), bbox_inches="tight")
 86 | 
 87 | X = np.arange(0, 1, 0.001)
 88 | pyplot.figure(figsize=(10, 4))
 89 | pyplot.subplot(1, 2, 1)
 90 | pyplot.xlim((0, 1))
 91 | pyplot.ylim((0, 10))
 92 | pyplot.plot(X, X / (1 - X))
 93 | pyplot.xlabel("P")
 94 | pyplot.ylabel("odds = P / (1-P)")
 95 | pyplot.grid(True, linestyle='-', color='0.75')
 96 | 
 97 | pyplot.subplot(1, 2, 2)
 98 | pyplot.xlim((0, 1))
 99 | pyplot.plot(X, np.log(X / (1 - X)))
100 | pyplot.xlabel("P")
101 | pyplot.ylabel("log(odds) = log(P / (1-P))")
102 | pyplot.grid(True, linestyle='-', color='0.75')
103 | pyplot.savefig(
104 |     os.path.join(CHART_DIR, "log_reg_log_odds.png"), bbox_inches="tight")
105 | 


--------------------------------------------------------------------------------
/ch06/01_start.py:
--------------------------------------------------------------------------------
  1 | # This code is supporting material for the book
  2 | # Building Machine Learning Systems with Python
  3 | # by Willi Richert and Luis Pedro Coelho
  4 | # published by PACKT Publishing
  5 | #
  6 | # It is made available under the MIT License
  7 | 
  8 | #
  9 | # This script trains multinomial Naive Bayes on the tweet corpus
 10 | # to find two different results:
 11 | # - How well can we distinguis positive from negative tweets?
 12 | # - How well can we detect whether a tweet contains sentiment at all?
 13 | #
 14 | 
 15 | import time
 16 | start_time = time.time()
 17 | 
 18 | import numpy as np
 19 | 
 20 | from sklearn.metrics import precision_recall_curve, roc_curve, auc
 21 | from sklearn.cross_validation import ShuffleSplit
 22 | 
 23 | from utils import plot_pr
 24 | from utils import load_sanders_data
 25 | from utils import tweak_labels
 26 | 
 27 | from sklearn.feature_extraction.text import TfidfVectorizer
 28 | from sklearn.pipeline import Pipeline
 29 | 
 30 | from sklearn.naive_bayes import MultinomialNB
 31 | 
 32 | 
 33 | def create_ngram_model():
 34 |     tfidf_ngrams = TfidfVectorizer(ngram_range=(1, 3),
 35 |                                    analyzer="word", binary=False)
 36 |     clf = MultinomialNB()
 37 |     pipeline = Pipeline([('vect', tfidf_ngrams), ('clf', clf)])
 38 |     return pipeline
 39 | 
 40 | 
 41 | def train_model(clf_factory, X, Y, name="NB ngram", plot=False):
 42 |     cv = ShuffleSplit(
 43 |         n=len(X), n_iter=10, test_size=0.3, random_state=0)
 44 | 
 45 |     train_errors = []
 46 |     test_errors = []
 47 | 
 48 |     scores = []
 49 |     pr_scores = []
 50 |     precisions, recalls, thresholds = [], [], []
 51 | 
 52 |     for train, test in cv:
 53 |         X_train, y_train = X[train], Y[train]
 54 |         X_test, y_test = X[test], Y[test]
 55 | 
 56 |         clf = clf_factory()
 57 |         clf.fit(X_train, y_train)
 58 | 
 59 |         train_score = clf.score(X_train, y_train)
 60 |         test_score = clf.score(X_test, y_test)
 61 | 
 62 |         train_errors.append(1 - train_score)
 63 |         test_errors.append(1 - test_score)
 64 | 
 65 |         scores.append(test_score)
 66 |         proba = clf.predict_proba(X_test)
 67 | 
 68 |         fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, 1])
 69 |         precision, recall, pr_thresholds = precision_recall_curve(
 70 |             y_test, proba[:, 1])
 71 | 
 72 |         pr_scores.append(auc(recall, precision))
 73 |         precisions.append(precision)
 74 |         recalls.append(recall)
 75 |         thresholds.append(pr_thresholds)
 76 | 
 77 |     scores_to_sort = pr_scores
 78 |     median = np.argsort(scores_to_sort)[len(scores_to_sort) / 2]
 79 | 
 80 |     if plot:
 81 |         plot_pr(pr_scores[median], name, "01", precisions[median],
 82 |                 recalls[median], label=name)
 83 | 
 84 |         summary = (np.mean(scores), np.std(scores),
 85 |                    np.mean(pr_scores), np.std(pr_scores))
 86 |         print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary)
 87 | 
 88 |     return np.mean(train_errors), np.mean(test_errors)
 89 | 
 90 | 
 91 | def print_incorrect(clf, X, Y):
 92 |     Y_hat = clf.predict(X)
 93 |     wrong_idx = Y_hat != Y
 94 |     X_wrong = X[wrong_idx]
 95 |     Y_wrong = Y[wrong_idx]
 96 |     Y_hat_wrong = Y_hat[wrong_idx]
 97 |     for idx in range(len(X_wrong)):
 98 |         print("clf.predict('%s')=%i instead of %i" %
 99 |               (X_wrong[idx], Y_hat_wrong[idx], Y_wrong[idx]))
100 | 
101 | 
102 | if __name__ == "__main__":
103 |     X_orig, Y_orig = load_sanders_data()
104 |     classes = np.unique(Y_orig)
105 |     for c in classes:
106 |         print("#%s: %i" % (c, sum(Y_orig == c)))
107 | 
108 |     print("== Pos vs. neg ==")
109 |     pos_neg = np.logical_or(Y_orig == "positive", Y_orig == "negative")
110 |     X = X_orig[pos_neg]
111 |     Y = Y_orig[pos_neg]
112 |     Y = tweak_labels(Y, ["positive"])
113 | 
114 |     train_model(create_ngram_model, X, Y, name="pos vs neg", plot=True)
115 | 
116 |     print("== Pos/neg vs. irrelevant/neutral ==")
117 |     X = X_orig
118 |     Y = tweak_labels(Y_orig, ["positive", "negative"])
119 |     train_model(create_ngram_model, X, Y, name="sent vs rest", plot=True)
120 | 
121 |     print("== Pos vs. rest ==")
122 |     X = X_orig
123 |     Y = tweak_labels(Y_orig, ["positive"])
124 |     train_model(create_ngram_model, X, Y, name="pos vs rest", plot=True)
125 | 
126 |     print("== Neg vs. rest ==")
127 |     X = X_orig
128 |     Y = tweak_labels(Y_orig, ["negative"])
129 |     train_model(create_ngram_model, X, Y, name="neg vs rest", plot=True)
130 | 
131 |     print("time spent:", time.time() - start_time)
132 | 


--------------------------------------------------------------------------------
/ch06/README.md:
--------------------------------------------------------------------------------
 1 | Chapter 6 - Classification II - Sentiment Analysis
 2 | ==================================================
 3 | 
 4 | When doing last code sanity checks for the book, Twitter
 5 | was using the API 1.0, which did not require authentication.
 6 | With its switch to version 1.1, this has now changed.
 7 | 
 8 | If you don't have already created your personal Twitter
 9 | access keys and tokens, you might want to do so at
10 | [https://dev.twitter.com/docs/auth/tokens-devtwittercom](https://dev.twitter.com/docs/auth/tokens-devtwittercom) and paste the keys/secrets into twitterauth.py
11 | 
12 | According to [https://dev.twitter.com/docs/rate-limiting/1](https://dev.twitter.com/docs/rate-limiting/1) Twitter has a rate limit of fetching 350 tweets/h for authorized users.
13 | 
14 | Note that some tweets might be missing when you are running install.py (user got suspended, changed authorization, or tweet deleted) and thus you might get different results. We keep track of those tweet IDs in data/{missing,not_authorized}.tsv, so that they are not fetched when you run install.py.
15 | 


--------------------------------------------------------------------------------
/ch06/twitterauth.py:
--------------------------------------------------------------------------------
 1 | # This code is supporting material for the book
 2 | # Building Machine Learning Systems with Python
 3 | # by Willi Richert and Luis Pedro Coelho
 4 | # published by PACKT Publishing
 5 | #
 6 | # It is made available under the MIT License
 7 | 
 8 | import sys
 9 | 
10 | CONSUMER_KEY = None
11 | CONSUMER_SECRET = None
12 | 
13 | ACCESS_TOKEN_KEY = None
14 | ACCESS_TOKEN_SECRET = None
15 | 
16 | if CONSUMER_KEY is None or CONSUMER_SECRET is None or ACCESS_TOKEN_KEY is None or ACCESS_TOKEN_SECRET is None:
17 |     print("""\
18 | When doing last code sanity checks for the book, Twitter
19 | was using the API 1.0, which did not require authentication.
20 | With its switch to version 1.1, this has now changed.
21 | 
22 | It seems that you don't have already created your personal Twitter
23 | access keys and tokens. Please do so at
24 | https://dev.twitter.com/docs/auth/tokens-devtwittercom
25 | and paste the keys/secrets into twitterauth.py
26 | 
27 | Sorry for the inconvenience,
28 | The authors.""")
29 | 
30 |     sys.exit(1)
31 | 


--------------------------------------------------------------------------------
/ch07/.gitignore:
--------------------------------------------------------------------------------
1 | *.png
2 | 


--------------------------------------------------------------------------------
/ch07/README.rst:
--------------------------------------------------------------------------------
 1 | =========
 2 | Chapter 7
 3 | =========
 4 | 
 5 | Support code for *Chapter 7: Regression* 
 6 | 
 7 | 
 8 | Boston data analysis
 9 | --------------------
10 | 
11 | This dataset is shipped with sklearn. Thus, no extra download is required.
12 | 
13 | 
14 | boston1.py
15 |     Fit a linear regression model to the Boston house price data
16 | boston1numpy.py
17 |     Version of above script using numpy operations for linear regression
18 | boston_cv_penalized.py
19 |     Test different penalized (and OLS) regression schemes on the Boston dataset
20 | figure1_2.py
21 |     Show the regression line for Boston data
22 | figure3.py
23 |     Show the regression line for Boston data with OLS and Lasso
24 | figure4.py
25 |     Scatter plot of predicted-vs-actual for multidimensional regression
26 | 
27 | 10K data analysis
28 | -----------------
29 | 
30 | lr10k.py
31 |     Linear regression on 10K dataset, evaluation by cross-validation
32 | predict10k_en.py
33 |     Elastic nets (including with inner cross-validation for parameter
34 |     settings). Produces scatter plot.
35 | 
36 | 
37 | MovieLens data analysis
38 | -----------------------
39 | 
40 | In this chapter, we only consider a very simple approach, which is implemented
41 | in the ``usermodel.py`` script.
42 | 
43 | 


--------------------------------------------------------------------------------
/ch07/boston1.py:
--------------------------------------------------------------------------------
 1 | # This code is supporting material for the book
 2 | # Building Machine Learning Systems with Python
 3 | # by Willi Richert and Luis Pedro Coelho
 4 | # published by PACKT Publishing
 5 | #
 6 | # It is made available under the MIT License
 7 | 
 8 | # This script shows an example of simple (ordinary) linear regression
 9 | 
10 | # The first edition of the book NumPy functions only for this operation. See
11 | # the file boston1numpy.py for that version.
12 | 
13 | import numpy as np
14 | from sklearn.datasets import load_boston
15 | from sklearn.linear_model import LinearRegression
16 | from matplotlib import pyplot as plt
17 | 
18 | boston = load_boston()
19 | x = boston.data
20 | y = boston.target
21 | 
22 | # Fitting a model is trivial: call the ``fit`` method in LinearRegression:
23 | lr = LinearRegression()
24 | lr.fit(x, y)
25 | 
26 | # The instance member `residues_` contains the sum of the squared residues
27 | rmse = np.sqrt(lr.residues_/len(x))
28 | print('RMSE: {}'.format(rmse))
29 | 
30 | fig, ax = plt.subplots()
31 | # Plot a diagonal (for reference):
32 | ax.plot([0, 50], [0, 50], '-', color=(.9,.3,.3), lw=4)
33 | 
34 | # Plot the prediction versus real:
35 | ax.scatter(lr.predict(x), boston.target)
36 | 
37 | ax.set_xlabel('predicted')
38 | ax.set_ylabel('real')
39 | fig.savefig('Figure_07_08.png')
40 | 


--------------------------------------------------------------------------------
/ch07/boston1numpy.py:
--------------------------------------------------------------------------------
 1 | # This code is supporting material for the book
 2 | # Building Machine Learning Systems with Python
 3 | # by Willi Richert and Luis Pedro Coelho
 4 | # published by PACKT Publishing
 5 | #
 6 | # It is made available under the MIT License
 7 | 
 8 | # This script shows an example of simple (ordinary) linear regression
 9 | 
10 | import numpy as np
11 | from sklearn.datasets import load_boston
12 | import pylab as plt
13 | 
14 | boston = load_boston()
15 | x = np.array([np.concatenate((v, [1])) for v in boston.data])
16 | y = boston.target
17 | 
18 | # np.linal.lstsq implements least-squares linear regression
19 | s, total_error, _, _ = np.linalg.lstsq(x, y)
20 | 
21 | rmse = np.sqrt(total_error[0] / len(x))
22 | print('Residual: {}'.format(rmse))
23 | 
24 | # Plot the prediction versus real:
25 | plt.plot(np.dot(x, s), boston.target, 'ro')
26 | 
27 | # Plot a diagonal (for reference):
28 | plt.plot([0, 50], [0, 50], 'g-')
29 | plt.xlabel('predicted')
30 | plt.ylabel('real')
31 | plt.show()
32 | 


--------------------------------------------------------------------------------
/ch07/boston_cv_penalized.py:
--------------------------------------------------------------------------------
 1 | # This code is supporting material for the book
 2 | # Building Machine Learning Systems with Python
 3 | # by Willi Richert and Luis Pedro Coelho
 4 | # published by PACKT Publishing
 5 | #
 6 | # It is made available under the MIT License
 7 | 
 8 | # This script fits several forms of penalized regression
 9 | 
10 | from __future__ import print_function
11 | import numpy as np
12 | from sklearn.cross_validation import KFold
13 | from sklearn.linear_model import LinearRegression, ElasticNet, Lasso, Ridge
14 | from sklearn.metrics import r2_score
15 | from sklearn.datasets import load_boston
16 | boston = load_boston()
17 | x = boston.data
18 | y = boston.target
19 | 
20 | for name, met in [
21 |         ('linear regression', LinearRegression()),
22 |         ('lasso()', Lasso()),
23 |         ('elastic-net(.5)', ElasticNet(alpha=0.5)),
24 |         ('lasso(.5)', Lasso(alpha=0.5)),
25 |         ('ridge(.5)', Ridge(alpha=0.5)),
26 | ]:
27 |     # Fit on the whole data:
28 |     met.fit(x, y)
29 | 
30 |     # Predict on the whole data:
31 |     p = met.predict(x)
32 |     r2_train = r2_score(y, p)
33 | 
34 |     # Now, we use 10 fold cross-validation to estimate generalization error
35 |     kf = KFold(len(x), n_folds=5)
36 |     p = np.zeros_like(y)
37 |     for train, test in kf:
38 |         met.fit(x[train], y[train])
39 |         p[test] = met.predict(x[test])
40 | 
41 |     r2_cv = r2_score(y, p)
42 |     print('Method: {}'.format(name))
43 |     print('R2 on training: {}'.format(r2_train))
44 |     print('R2 on 5-fold CV: {}'.format(r2_cv))
45 |     print()
46 |     print()
47 | 


--------------------------------------------------------------------------------
/ch07/data/.gitignore:
--------------------------------------------------------------------------------
1 | E2006.train
2 | 


--------------------------------------------------------------------------------
/ch07/data/download.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | curl -O http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression/E2006.train.bz2
3 | bunzip2 E2006.train.bz2
4 | 


--------------------------------------------------------------------------------
/ch07/figure1_2.py:
--------------------------------------------------------------------------------
 1 | # This code is supporting material for the book
 2 | # Building Machine Learning Systems with Python
 3 | # by Willi Richert and Luis Pedro Coelho
 4 | # published by PACKT Publishing
 5 | #
 6 | # It is made available under the MIT License
 7 | 
 8 | import numpy as np
 9 | from sklearn.datasets import load_boston
10 | from sklearn.linear_model import LinearRegression
11 | from sklearn.metrics import mean_squared_error, r2_score
12 | from matplotlib import pyplot as plt
13 | 
14 | boston = load_boston()
15 | 
16 | # Index number five in the number of rooms
17 | fig,ax = plt.subplots()
18 | ax.scatter(boston.data[:, 5], boston.target)
19 | ax.set_xlabel("Average number of rooms (RM)")
20 | ax.set_ylabel("House Price")
21 | 
22 | x = boston.data[:, 5]
23 | # fit (used below) takes a two-dimensional array as input. We use np.atleast_2d
24 | # to convert from one to two dimensional, then transpose to make sure that the
25 | # format matches:
26 | x = np.transpose(np.atleast_2d(x))
27 | 
28 | y = boston.target
29 | 
30 | lr = LinearRegression(fit_intercept=False)
31 | lr.fit(x, y)
32 | 
33 | ax.plot([0, boston.data[:, 5].max() + 1],
34 |          [0, lr.predict(boston.data[:, 5].max() + 1)], '-', lw=4)
35 | fig.savefig('Figure1.png')
36 | 
37 | mse = mean_squared_error(y, lr.predict(x))
38 | rmse = np.sqrt(mse)
39 | print('RMSE (no intercept): {}'.format(rmse))
40 | 
41 | # Repeat, but fitting an intercept this time:
42 | lr = LinearRegression(fit_intercept=True)
43 | 
44 | lr.fit(x, y)
45 | 
46 | fig,ax = plt.subplots()
47 | ax.set_xlabel("Average number of rooms (RM)")
48 | ax.set_ylabel("House Price")
49 | ax.scatter(boston.data[:, 5], boston.target)
50 | xmin = x.min()
51 | xmax = x.max()
52 | ax.plot([xmin, xmax], lr.predict([[xmin], [xmax]]) , '-', lw=4)
53 | fig.savefig('Figure2.png')
54 | 
55 | mse = mean_squared_error(y, lr.predict(x))
56 | print("Mean squared error (of training data): {:.3}".format(mse))
57 | 
58 | rmse = np.sqrt(mse)
59 | print("Root mean squared error (of training data): {:.3}".format(rmse))
60 | 
61 | cod = r2_score(y, lr.predict(x))
62 | print('COD (on training data): {:.2}'.format(cod))
63 | 
64 | 


--------------------------------------------------------------------------------
/ch07/figure3.py:
--------------------------------------------------------------------------------
 1 | # This code is supporting material for the book
 2 | # Building Machine Learning Systems with Python
 3 | # by Willi Richert and Luis Pedro Coelho
 4 | # published by PACKT Publishing
 5 | #
 6 | # It is made available under the MIT License
 7 | 
 8 | from sklearn.linear_model import LinearRegression, Lasso
 9 | import numpy as np
10 | from sklearn.datasets import load_boston
11 | from matplotlib import pyplot as plt
12 | 
13 | boston = load_boston()
14 | fig, ax = plt.subplots()
15 | ax.scatter(boston.data[:, 5], boston.target)
16 | ax.set_xlabel("Number of rooms (RM)")
17 | ax.set_ylabel("House Price")
18 | 
19 | 
20 | x = boston.data[:, 5]
21 | xmin = x.min()
22 | xmax = x.max()
23 | x = np.transpose(np.atleast_2d(x))
24 | y = boston.target
25 | 
26 | lr = LinearRegression()
27 | lr.fit(x, y)
28 | ax.plot([xmin, xmax], lr.predict([[xmin], [xmax]]), ':', lw=4, label='OLS model')
29 | 
30 | las = Lasso()
31 | las.fit(x, y)
32 | ax.plot([xmin, xmax], las.predict([ [xmin], [xmax] ]), '-', lw=4, label='Lasso model')
33 | fig.savefig('Figure3.png')
34 | 


--------------------------------------------------------------------------------
/ch07/figure4.py:
--------------------------------------------------------------------------------
 1 | # This code is supporting material for the book
 2 | # Building Machine Learning Systems with Python
 3 | # by Willi Richert and Luis Pedro Coelho
 4 | # published by PACKT Publishing
 5 | #
 6 | # It is made available under the MIT License
 7 | 
 8 | 
 9 | # This script plots prediction-vs-actual on training set for the Boston dataset
10 | # using OLS regression
11 | import numpy as np
12 | from sklearn.linear_model import LinearRegression
13 | from sklearn.datasets import load_boston
14 | from sklearn.metrics import mean_squared_error
15 | from matplotlib import pyplot as plt
16 | 
17 | boston = load_boston()
18 | 
19 | x = boston.data
20 | y = boston.target
21 | 
22 | lr = LinearRegression()
23 | lr.fit(x, y)
24 | p = lr.predict(x)
25 | print("RMSE: {:.2}.".format(np.sqrt(mean_squared_error(y, p))))
26 | print("R2: {:.2}.".format(lr.score(x, y)))
27 | fig,ax = plt.subplots()
28 | ax.scatter(p, y)
29 | ax.set_xlabel('Predicted price')
30 | ax.set_ylabel('Actual price')
31 | ax.plot([y.min(), y.max()], [y.min(), y.max()], lw=4)
32 | 
33 | fig.savefig('Figure4.png')
34 | 


--------------------------------------------------------------------------------
/ch07/lasso_path_plot.py:
--------------------------------------------------------------------------------
 1 | # This code is supporting material for the book
 2 | # Building Machine Learning Systems with Python
 3 | # by Willi Richert and Luis Pedro Coelho
 4 | # published by PACKT Publishing
 5 | #
 6 | # It is made available under the MIT License
 7 | 
 8 | from sklearn.linear_model import Lasso
 9 | from sklearn.datasets import load_boston
10 | from matplotlib import pyplot as plt
11 | import numpy as np
12 | 
13 | boston = load_boston()
14 | x = boston.data
15 | y = boston.target
16 | 
17 | las = Lasso(normalize=1)
18 | alphas = np.logspace(-5, 2, 1000)
19 | alphas, coefs, _= las.path(x, y, alphas=alphas)
20 | 
21 | fig,ax = plt.subplots()
22 | ax.plot(alphas, coefs.T)
23 | ax.set_xscale('log')
24 | ax.set_xlim(alphas.max(), alphas.min())
25 | ax.set_xlabel('Lasso coefficient path as a function of alpha')
26 | ax.set_xlabel('Alpha')
27 | ax.set_ylabel('Coefficient weight')
28 | fig.savefig('Figure_LassoPath.png')
29 | 
30 | 


--------------------------------------------------------------------------------
/ch07/lr10k.py:
--------------------------------------------------------------------------------
 1 | # This code is supporting material for the book
 2 | # Building Machine Learning Systems with Python
 3 | # by Willi Richert and Luis Pedro Coelho
 4 | # published by PACKT Publishing
 5 | #
 6 | # It is made available under the MIT License
 7 | 
 8 | import numpy as np
 9 | from sklearn.metrics import mean_squared_error, r2_score
10 | from sklearn.datasets import load_svmlight_file
11 | from sklearn.linear_model import LinearRegression
12 | from sklearn.cross_validation import KFold
13 | 
14 | # Whether to use Elastic nets (otherwise, ordinary linear regression is used)
15 | 
16 | # Load data:
17 | data, target = load_svmlight_file('data/E2006.train')
18 | 
19 | lr = LinearRegression()
20 | 
21 | # Compute error on training data to demonstrate that we can obtain near perfect
22 | # scores:
23 | 
24 | lr.fit(data, target)
25 | pred = lr.predict(data)
26 | 
27 | print('RMSE on training, {:.2}'.format(np.sqrt(mean_squared_error(target, pred))))
28 | print('R2 on training, {:.2}'.format(r2_score(target, pred)))
29 | print('')
30 | 
31 | pred = np.zeros_like(target)
32 | kf = KFold(len(target), n_folds=5)
33 | for train, test in kf:
34 |     lr.fit(data[train], target[train])
35 |     pred[test] = lr.predict(data[test])
36 | 
37 | print('RMSE on testing (5 fold), {:.2}'.format(np.sqrt(mean_squared_error(target, pred))))
38 | print('R2 on testing (5 fold), {:.2}'.format(r2_score(target, pred)))
39 | 


--------------------------------------------------------------------------------
/ch07/predict10k_en.py:
--------------------------------------------------------------------------------
 1 | # This code is supporting material for the book
 2 | # Building Machine Learning Systems with Python
 3 | # by Willi Richert and Luis Pedro Coelho
 4 | # published by PACKT Publishing
 5 | #
 6 | # It is made available under the MIT License
 7 | 
 8 | import numpy as np
 9 | from sklearn.datasets import load_svmlight_file
10 | from sklearn.cross_validation import KFold
11 | from sklearn.linear_model import ElasticNetCV, ElasticNet
12 | from sklearn.metrics import mean_squared_error, r2_score
13 | from matplotlib import pyplot as plt
14 | 
15 | data, target = load_svmlight_file('data/E2006.train')
16 | 
17 | # Edit the lines below if you want to switch method:
18 | # from sklearn.linear_model import Lasso
19 | # met = Lasso(alpha=0.1)
20 | met = ElasticNet(alpha=0.1)
21 | 
22 | kf = KFold(len(target), n_folds=5)
23 | pred = np.zeros_like(target)
24 | for train, test in kf:
25 |     met.fit(data[train], target[train])
26 |     pred[test] = met.predict(data[test])
27 | 
28 | print('[EN 0.1] RMSE on testing (5 fold), {:.2}'.format(np.sqrt(mean_squared_error(target, pred))))
29 | print('[EN 0.1] R2 on testing (5 fold), {:.2}'.format(r2_score(target, pred)))
30 | print('')
31 | 
32 | # Construct an ElasticNetCV object (use all available CPUs)
33 | met = ElasticNetCV(n_jobs=-1)
34 | 
35 | kf = KFold(len(target), n_folds=5)
36 | pred = np.zeros_like(target)
37 | for train, test in kf:
38 |     met.fit(data[train], target[train])
39 |     pred[test] = met.predict(data[test])
40 | 
41 | print('[EN CV] RMSE on testing (5 fold), {:.2}'.format(np.sqrt(mean_squared_error(target, pred))))
42 | print('[EN CV] R2 on testing (5 fold), {:.2}'.format(r2_score(target, pred)))
43 | print('')
44 | 
45 | met.fit(data, target)
46 | pred = met.predict(data)
47 | print('[EN CV] RMSE on training, {:.2}'.format(np.sqrt(mean_squared_error(target, pred))))
48 | print('[EN CV] R2 on training, {:.2}'.format(r2_score(target, pred)))
49 | 
50 | 
51 | # Construct an ElasticNetCV object (use all available CPUs)
52 | met = ElasticNetCV(n_jobs=-1, l1_ratio=[.01, .05, .25, .5, .75, .95, .99])
53 | 
54 | kf = KFold(len(target), n_folds=5)
55 | pred = np.zeros_like(target)
56 | for train, test in kf:
57 |     met.fit(data[train], target[train])
58 |     pred[test] = met.predict(data[test])
59 | 
60 | 
61 | print('[EN CV l1_ratio] RMSE on testing (5 fold), {:.2}'.format(np.sqrt(mean_squared_error(target, pred))))
62 | print('[EN CV l1_ratio] R2 on testing (5 fold), {:.2}'.format(r2_score(target, pred)))
63 | print('')
64 | 
65 | 
66 | fig, ax = plt.subplots()
67 | y = target
68 | ax.scatter(y, pred, c='k')
69 | ax.plot([-5,-1], [-5,-1], 'r-', lw=2)
70 | ax.set_xlabel('Actual value')
71 | ax.set_ylabel('Predicted value')
72 | fig.savefig('Figure_10k_scatter_EN_l1_ratio.png')
73 | 
74 | 


--------------------------------------------------------------------------------
/ch08/README.rst:
--------------------------------------------------------------------------------
 1 | =========
 2 | Chapter 8
 3 | =========
 4 | 
 5 | Support code for *Chapter 8: Recommendations*.
 6 | 
 7 | The code refers to the second edition of the book and this code has been
 8 | significantly refactored when compared to the first one.
 9 | 
10 | Ratings Prediction
11 | ------------------
12 | 
13 | Note that since the partition of the data into training and testing is random,
14 | everytime you run the code, the results will be different.
15 | 
16 | 
17 | load_ml100k.py
18 |     Load data & partition into test/train
19 | norm.py
20 |     Normalize the data
21 | corrneighbours.py
22 |     Neighbour models based on ncrroaltoin
23 | regression.py
24 |     Regression models
25 | stacked.py
26 |     Stacked predictions
27 | averaged.py
28 |     Averaging of predictions (mentioned in book, but code is not shown there).
29 | 
30 | Association Rule Mining
31 | -----------------------
32 | 
33 | Check the folder ``apriori/``
34 | 
35 | apriori/histogram.py
36 |     Print a histogram of how many times each product was bought
37 | apriori/apriori.py
38 |     Implementation of Apriori algorithm and association rule building
39 | apriori/apriori_example.py
40 |     Example of Apriori algorithm in retail dataset
41 | 
42 | 


--------------------------------------------------------------------------------
/ch08/all_correlations.py:
--------------------------------------------------------------------------------
 1 | # This code is supporting material for the book
 2 | # Building Machine Learning Systems with Python
 3 | # by Willi Richert and Luis Pedro Coelho
 4 | # published by PACKT Publishing
 5 | #
 6 | # It is made available under the MIT License
 7 | 
 8 | import numpy as np
 9 | 
10 | def all_correlations(y, X):
11 |     from scipy import spatial
12 |     y = np.atleast_2d(y)
13 |     sp = spatial.distance.cdist(X, y, 'correlation')
14 |     # The "correlation distance" is 1 - corr(x,y); so we invert that to obtain the correlation
15 |     return 1 - sp.ravel()
16 | 
17 | # This is the version in the book (1st Edition):
18 | def all_correlations_book_version(bait, target):
19 |     '''
20 |     corrs = all_correlations(bait, target)
21 | 
22 |     corrs[i] is the correlation between bait and target[i]
23 |     '''
24 |     return np.array(
25 |         [np.corrcoef(bait, c)[0, 1]
26 |          for c in target])
27 | 
28 | # This is a faster, but harder to read, implementation:
29 | def all_correlations_fast_no_scipy(y, X):
30 |     '''
31 |     Cs = all_correlations(y, X)
32 | 
33 |     Cs[i] = np.corrcoef(y, X[i])[0,1]
34 |     '''
35 |     X = np.asanyarray(X, float)
36 |     y = np.asanyarray(y, float)
37 |     xy = np.dot(X, y)
38 |     y_ = y.mean()
39 |     ys_ = y.std()
40 |     x_ = X.mean(1)
41 |     xs_ = X.std(1)
42 |     n = float(len(y))
43 |     ys_ += 1e-5  # Handle zeros in ys
44 |     xs_ += 1e-5  # Handle zeros in x
45 | 
46 |     return (xy - x_ * y_ * n) / n / xs_ / ys_
47 | 
48 | 
49 | 


--------------------------------------------------------------------------------
/ch08/apriori/.gitignore:
--------------------------------------------------------------------------------
1 | retail.dat.gz
2 | 


--------------------------------------------------------------------------------
/ch08/apriori/apriori.py:
--------------------------------------------------------------------------------
  1 | # This code is supporting material for the book
  2 | # Building Machine Learning Systems with Python
  3 | # by Willi Richert and Luis Pedro Coelho
  4 | # published by PACKT Publishing
  5 | #
  6 | # It is made available under the MIT License
  7 | 
  8 | from collections import namedtuple
  9 | 
 10 | 
 11 | def apriori(dataset, minsupport, maxsize):
 12 |     '''
 13 |     freqsets, support = apriori(dataset, minsupport, maxsize)
 14 | 
 15 |     Parameters
 16 |     ----------
 17 |     dataset : sequence of sequences
 18 |         input dataset
 19 |     minsupport : int
 20 |         Minimal support for frequent items
 21 |     maxsize : int
 22 |         Maximal size of frequent items to return
 23 | 
 24 |     Returns
 25 |     -------
 26 |     freqsets : sequence of sequences
 27 |     support : dictionary
 28 |         This associates each itemset (represented as a frozenset) with a float
 29 |         (the support of that itemset)
 30 |     '''
 31 |     from collections import defaultdict
 32 | 
 33 |     baskets = defaultdict(list)
 34 |     pointers = defaultdict(list)
 35 | 
 36 |     for i, ds in enumerate(dataset):
 37 |         for ell in ds:
 38 |             pointers[ell].append(i)
 39 |             baskets[frozenset([ell])].append(i)
 40 | 
 41 |     # Convert pointer items to frozensets to speed up operations later
 42 |     new_pointers = dict()
 43 |     for k in pointers:
 44 |         if len(pointers[k]) >= minsupport:
 45 |             new_pointers[k] = frozenset(pointers[k])
 46 |     pointers = new_pointers
 47 |     for k in baskets:
 48 |         baskets[k] = frozenset(baskets[k])
 49 | 
 50 | 
 51 |     # Valid are all elements whose support is >= minsupport
 52 |     valid = set()
 53 |     for el, c in baskets.items():
 54 |         if len(c) >= minsupport:
 55 |             valid.update(el)
 56 | 
 57 |     # Itemsets at first iteration are simply all singleton with valid elements:
 58 |     itemsets = [frozenset([v]) for v in valid]
 59 |     freqsets = []
 60 |     for i in range(maxsize - 1):
 61 |         print("At iteration {}, number of frequent baskets: {}".format(
 62 |             i, len(itemsets)))
 63 |         newsets = []
 64 |         for it in itemsets:
 65 |             ccounts = baskets[it]
 66 | 
 67 |             for v, pv in pointers.items():
 68 |                 if v not in it:
 69 |                     csup = (ccounts & pv)
 70 |                     if len(csup) >= minsupport:
 71 |                         new = frozenset(it | frozenset([v]))
 72 |                         if new not in baskets:
 73 |                             newsets.append(new)
 74 |                             baskets[new] = csup
 75 |         freqsets.extend(itemsets)
 76 |         itemsets = newsets
 77 |         if not len(itemsets):
 78 |             break
 79 |     support = {}
 80 |     for k in baskets:
 81 |         support[k] = float(len(baskets[k]))
 82 |     return freqsets, support
 83 | 
 84 | 
 85 | # A namedtuple to collect all values that may be interesting
 86 | AssociationRule = namedtuple('AssociationRule', ['antecendent', 'consequent', 'base', 'py_x', 'lift'])
 87 | 
 88 | def association_rules(dataset, freqsets, support, minlift):
 89 |     '''
 90 |     for assoc_rule in association_rules(dataset, freqsets, support, minlift):
 91 |         ...
 92 | 
 93 |     This function takes the returns from ``apriori``.
 94 | 
 95 |     Parameters
 96 |     ----------
 97 |     dataset : sequence of sequences
 98 |         input dataset
 99 |     freqsets : sequence of sequences
100 |     support : dictionary
101 |     minlift : int
102 |         minimal lift of yielded rules
103 | 
104 |     Returns
105 |     -------
106 |     assoc_rule : sequence of AssociationRule objects
107 |     '''
108 |     nr_transactions = float(len(dataset))
109 |     freqsets = [f for f in freqsets if len(f) > 1]
110 |     for fset in freqsets:
111 |         for f in fset:
112 |             consequent = frozenset([f])
113 |             antecendent = fset - consequent
114 |             py_x = support[fset] / support[antecendent]
115 |             base = support[consequent] / nr_transactions
116 |             lift = py_x / base
117 |             if lift > minlift:
118 |                 yield AssociationRule(antecendent, consequent, base, py_x, lift)
119 | 
120 | 


--------------------------------------------------------------------------------
/ch08/apriori/apriori_example.py:
--------------------------------------------------------------------------------
 1 | # This code is supporting material for the book
 2 | # Building Machine Learning Systems with Python
 3 | # by Willi Richert and Luis Pedro Coelho
 4 | # published by PACKT Publishing
 5 | #
 6 | # It is made available under the MIT License
 7 | 
 8 | from apriori import apriori, association_rules
 9 | from gzip import GzipFile
10 | 
11 | # Load dataset
12 | dataset = [[int(tok) for tok in line.strip().split()]
13 |            for line in GzipFile('retail.dat.gz')]
14 | 
15 | freqsets, support = apriori(dataset, 80, maxsize=16)
16 | rules = list(association_rules(dataset, freqsets, support, minlift=30.0))
17 | 
18 | rules.sort(key=(lambda ar: -ar.lift))
19 | for ar in rules:
20 |     print('{} -> {} (lift = {:.4})'
21 |           .format(set(ar.antecendent),
22 |                     set(ar.consequent),
23 |                     ar.lift))
24 | 


--------------------------------------------------------------------------------
/ch08/apriori/apriori_naive.py:
--------------------------------------------------------------------------------
 1 | # This code is supporting material for the book
 2 | # Building Machine Learning Systems with Python
 3 | # by Willi Richert and Luis Pedro Coelho
 4 | # published by PACKT Publishing
 5 | #
 6 | # It is made available under the MIT License
 7 | 
 8 | from collections import defaultdict
 9 | from itertools import chain
10 | from gzip import GzipFile
11 | minsupport = 80
12 | 
13 | dataset = [[int(tok) for tok in line.strip().split()]
14 |            for line in GzipFile('retail.dat.gz')]
15 | 
16 | counts = defaultdict(int)
17 | for elem in chain(*dataset):
18 |     counts[elem] += 1
19 | 
20 | # Only elements that have at least minsupport should be considered.
21 | valid = set(el for el, c in counts.items() if (c >= minsupport))
22 | 
23 | # Filter the dataset to contain only valid elements
24 | # (This step is not strictly necessary, but will make the rest of the code
25 | # faster as the itemsets will be smaller):
26 | dataset = [[el for el in ds if (el in valid)] for ds in dataset]
27 | 
28 | # Convert to frozenset for fast processing
29 | dataset = [frozenset(ds) for ds in dataset]
30 | 
31 | itemsets = [frozenset([v]) for v in valid]
32 | freqsets = itemsets[:]
33 | for i in range(16):
34 |     print("At iteration {}, number of frequent baskets: {}".format(
35 |         i, len(itemsets)))
36 |     nextsets = []
37 | 
38 |     tested = set()
39 |     for it in itemsets:
40 |         for v in valid:
41 |             if v not in it:
42 |                 # Create a new candidate set by adding v to it
43 |                 c = (it | frozenset([v]))
44 | 
45 |                 # Check if we have tested it already:
46 |                 if c in tested:
47 |                     continue
48 |                 tested.add(c)
49 | 
50 |                 # Count support by looping over dataset
51 |                 # This step is slow.
52 |                 # Check `apriori.py` for a better implementation.
53 |                 support_c = sum(1 for d in dataset if d.issuperset(c))
54 |                 if support_c > minsupport:
55 |                     nextsets.append(c)
56 |     freqsets.extend(nextsets)
57 |     itemsets = nextsets
58 |     if not len(itemsets):
59 |         break
60 | print("Finished!")
61 | 
62 | 
63 | def rules_from_itemset(itemset, dataset, minlift=1.):
64 |     nr_transactions = float(len(dataset))
65 |     for item in itemset:
66 |         consequent = frozenset([item])
67 |         antecedent = itemset-consequent
68 |         base = 0.0
69 |         # acount: antecedent count
70 |         acount = 0.0
71 | 
72 |         # ccount : consequent count
73 |         ccount = 0.0
74 |         for d in dataset:
75 |           if item in d: base += 1
76 |           if d.issuperset(itemset): ccount += 1
77 |           if d.issuperset(antecedent): acount += 1
78 |         base /= nr_transactions
79 |         p_y_given_x = ccount/acount
80 |         lift = p_y_given_x / base
81 |         if lift > minlift:
82 |             print('Rule {0} ->  {1} has lift {2}'
83 |                   .format(antecedent, consequent,lift))
84 | 
85 | for itemset in freqsets:
86 |     if len(itemset) > 1:
87 |         rules_from_itemset(itemset, dataset, minlift=4.)
88 | 


--------------------------------------------------------------------------------
/ch08/apriori/download.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env sh
2 | wget http://fimi.ua.ac.be/data/retail.dat.gz
3 | 


--------------------------------------------------------------------------------
/ch08/apriori/histogram.py:
--------------------------------------------------------------------------------
 1 | # This code is supporting material for the book
 2 | # Building Machine Learning Systems with Python
 3 | # by Willi Richert and Luis Pedro Coelho
 4 | # published by PACKT Publishing
 5 | #
 6 | # It is made available under the MIT License
 7 | 
 8 | import numpy as np
 9 | from collections import defaultdict
10 | from itertools import chain
11 | from gzip import GzipFile
12 | dataset = [[int(tok) for tok in line.strip().split()]
13 |            for line in GzipFile('retail.dat.gz')]
14 | counts = defaultdict(int)
15 | for elem in chain(*dataset):
16 |     counts[elem] += 1
17 | counts = np.array(list(counts.values()))
18 | bins = [1, 2, 4, 8, 16, 32, 64, 128, 512]
19 | print(' {0:11} | {1:12}'.format('Nr of baskets', 'Nr of products'))
20 | print('--------------------------------')
21 | for i in range(len(bins)):
22 |     bot = bins[i]
23 |     top = (bins[i + 1] if (i + 1) < len(bins) else 100000000000)
24 |     print('  {0:4} - {1:3}   | {2:12}'.format(
25 |         bot, (top if top < 1000 else ''), np.sum((counts >= bot) & (counts < top))))
26 | 


--------------------------------------------------------------------------------
/ch08/averaged.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import load_ml100k
 3 | import regression
 4 | import corrneighbours
 5 | from sklearn import metrics
 6 | import norm
 7 | 
 8 | def predict(train):
 9 |     predicted0 = regression.predict(train)
10 |     predicted1 = regression.predict(train.T).T
11 |     predicted2 = corrneighbours.predict(train)
12 |     predicted3 = corrneighbours.predict(train.T).T
13 |     predicted4 = norm.predict(train)
14 |     predicted5 = norm.predict(train.T).T
15 |     stack = np.array([
16 |         predicted0,
17 |         predicted1,
18 |         predicted2,
19 |         predicted3,
20 |         predicted4,
21 |         predicted5,
22 |         ])
23 |     return stack.mean(0)
24 | 
25 | 
26 | def main():
27 |     train,test = load_ml100k.get_train_test(random_state=12)
28 |     predicted = predict(train)
29 |     r2 = metrics.r2_score(test[test > 0], predicted[test > 0])
30 |     print('R2 averaged: {:.2%}'.format(r2))
31 | 
32 | if __name__ == '__main__':
33 |     main()
34 | 


--------------------------------------------------------------------------------
/ch08/corrneighbours.py:
--------------------------------------------------------------------------------
 1 | # This code is supporting material for the book
 2 | # Building Machine Learning Systems with Python
 3 | # by Willi Richert and Luis Pedro Coelho
 4 | # published by PACKT Publishing
 5 | #
 6 | # It is made available under the MIT License
 7 | 
 8 | from __future__ import print_function
 9 | import numpy as np
10 | from load_ml100k import get_train_test
11 | from scipy.spatial import distance
12 | from sklearn import metrics
13 | 
14 | from norm import NormalizePositive
15 | 
16 | def predict(otrain):
17 |     binary = (otrain > 0)
18 |     norm = NormalizePositive(axis=1)
19 |     train = norm.fit_transform(otrain)
20 | 
21 |     dists = distance.pdist(binary, 'correlation')
22 |     dists = distance.squareform(dists)
23 | 
24 |     neighbors = dists.argsort(axis=1)
25 |     filled = train.copy()
26 |     for u in range(filled.shape[0]):
27 |         # n_u are the neighbors of user
28 |         n_u = neighbors[u, 1:]
29 |         for m in range(filled.shape[1]):
30 |             # This code could be faster using numpy indexing trickery as the
31 |             # cost of readibility (this is left as an exercise to the reader):
32 |             revs = [train[neigh, m]
33 |                     for neigh in n_u
34 |                     if binary[neigh, m]]
35 |             if len(revs):
36 |                 n = len(revs)
37 |                 n //= 2
38 |                 n += 1
39 |                 revs = revs[:n]
40 |                 filled[u,m] = np.mean(revs)
41 | 
42 |     return norm.inverse_transform(filled)
43 | 
44 | def main(transpose_inputs=False):
45 |     train, test = get_train_test(random_state=12)
46 |     if transpose_inputs:
47 |         train = train.T
48 |         test  = test.T
49 | 
50 |     predicted = predict(train)
51 |     r2 = metrics.r2_score(test[test > 0], predicted[test > 0])
52 |     print('R2 score (binary {} neighbours): {:.1%}'.format(
53 |         ('movie' if transpose_inputs else 'user'),
54 |         r2))
55 | 
56 | if __name__ == '__main__':
57 |     main()
58 |     main(transpose_inputs=True)
59 | 


--------------------------------------------------------------------------------
/ch08/data/.gitignore:
--------------------------------------------------------------------------------
1 | retail.dat.gz
2 | ml-100k.zip
3 | /ml-100k/
4 | 


--------------------------------------------------------------------------------
/ch08/data/download.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | curl -L -O http://files.grouplens.org/papers/ml-100k.zip
3 | unzip ml-100k.zip
4 | curl -L -O http://fimi.ua.ac.be/data/retail.dat.gz
5 | 


--------------------------------------------------------------------------------
/ch08/figure3.py:
--------------------------------------------------------------------------------
 1 | # This code is supporting material for the book
 2 | # Building Machine Learning Systems with Python
 3 | # by Willi Richert and Luis Pedro Coelho
 4 | # published by PACKT Publishing
 5 | #
 6 | # It is made available under the MIT License
 7 | 
 8 | from load_ml100k import load
 9 | from matplotlib import pyplot as plt
10 | data = load()
11 | plt.gray()
12 | plt.imshow(data[:200, :200], interpolation='nearest')
13 | plt.xlabel('User ID')
14 | plt.ylabel('Film ID')
15 | plt.savefig('Figure_08_03_DataMatrix.png')
16 | 


--------------------------------------------------------------------------------
/ch08/load_ml100k.py:
--------------------------------------------------------------------------------
 1 | # This code is supporting material for the book
 2 | # Building Machine Learning Systems with Python
 3 | # by Willi Richert and Luis Pedro Coelho
 4 | # published by PACKT Publishing
 5 | #
 6 | # It is made available under the MIT License
 7 | 
 8 | def load():
 9 |     '''Load ML-100k data
10 | 
11 |     Returns the review matrix as a numpy array'''
12 |     import numpy as np
13 |     from scipy import sparse
14 |     from os import path
15 | 
16 |     if not path.exists('data/ml-100k/u.data'):
17 |         raise IOError("Data has not been downloaded.\nTry the following:\n\n\tcd data\n\t./download.sh")
18 | 
19 |     # The input is in the form of a CSC sparse matrix, so it's a natural fit to
20 |     # load the data, but we then convert to a more traditional array before
21 |     # returning
22 |     data = np.loadtxt('data/ml-100k/u.data')
23 |     ij = data[:, :2]
24 |     ij -= 1  # original data is in 1-based system
25 |     values = data[:, 2]
26 |     reviews = sparse.csc_matrix((values, ij.T)).astype(float)
27 |     return reviews.toarray()
28 | 
29 | def get_train_test(reviews=None, random_state=None):
30 |     '''Split data into training & testing
31 | 
32 |     Parameters
33 |     ----------
34 |     reviews : ndarray, optional
35 |         Input data
36 | 
37 |     Returns
38 |     -------
39 |     train : ndarray
40 |         training data
41 |     test : ndarray
42 |         testing data
43 |     '''
44 |     import numpy as np
45 |     import random
46 |     r = random.Random(random_state)
47 | 
48 |     if reviews is None:
49 |         reviews = load()
50 |     U,M = np.where(reviews)
51 |     test_idxs = np.array(r.sample(range(len(U)), len(U)//10))
52 |     train = reviews.copy()
53 |     train[U[test_idxs], M[test_idxs]] = 0
54 | 
55 |     test = np.zeros_like(reviews)
56 |     test[U[test_idxs], M[test_idxs]] = reviews[U[test_idxs], M[test_idxs]]
57 | 
58 |     return train, test
59 | 
60 | 


--------------------------------------------------------------------------------
/ch08/norm.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | 
 3 | class NormalizePositive(object):
 4 | 
 5 |     def __init__(self, axis=0):
 6 |         self.axis = axis
 7 | 
 8 |     def fit(self, features, y=None):
 9 |         # count features that are greater than zero in axis `self.axis`:
10 |         if self.axis == 1:
11 |             features = features.T
12 |         binary = (features > 0)
13 |         count = binary.sum(axis=0)
14 | 
15 |         # to avoid division by zero, set zero counts to one:
16 |         count[count == 0] = 1.
17 | 
18 |         self.mean = features.sum(axis=0)/count
19 | 
20 |         # Compute variance by average squared difference to the mean, but only
21 |         # consider differences where binary is True (i.e., where there was a
22 |         # true rating):
23 |         diff = (features - self.mean) * binary
24 |         diff **= 2
25 |         # regularize the estimate of std by adding 0.1
26 |         self.std = np.sqrt(0.1 + diff.sum(axis=0)/count)
27 |         return self
28 | 
29 |     def transform(self, features):
30 |         if self.axis == 1:
31 |             features = features.T
32 |         binary = (features > 0)
33 |         features = features - self.mean
34 |         features /= self.std
35 |         features *= binary
36 |         if self.axis == 1:
37 |             features = features.T
38 |         return features
39 | 
40 |     def inverse_transform(self, features, copy=True):
41 |         if copy:
42 |             features = features.copy()
43 |         if self.axis == 1:
44 |             features = features.T
45 |         features *= self.std
46 |         features += self.mean
47 |         if self.axis == 1:
48 |             features = features.T
49 |         return features
50 | 
51 |     def fit_transform(self, features):
52 |         return self.fit(features).transform(features)
53 | 
54 | 
55 | def predict(train):
56 |     norm = NormalizePositive()
57 |     train = norm.fit_transform(train)
58 |     return norm.inverse_transform(train * 0.)
59 | 
60 | 
61 | def main(transpose_inputs=False):
62 |     from load_ml100k import get_train_test
63 |     from sklearn import metrics
64 |     train,test = get_train_test(random_state=12)
65 |     if transpose_inputs:
66 |         train = train.T
67 |         test = test.T
68 |     predicted = predict(train)
69 |     r2 = metrics.r2_score(test[test > 0], predicted[test > 0])
70 |     print('R2 score ({} normalization): {:.1%}'.format(
71 |         ('movie' if transpose_inputs else 'user'),
72 |         r2))
73 | if __name__ == '__main__':
74 |     main()
75 |     main(transpose_inputs=True)
76 | 


--------------------------------------------------------------------------------
/ch08/regression.py:
--------------------------------------------------------------------------------
 1 | # This code is supporting material for the book
 2 | # Building Machine Learning Systems with Python
 3 | # by Willi Richert and Luis Pedro Coelho
 4 | # published by PACKT Publishing
 5 | #
 6 | # It is made available under the MIT License
 7 | 
 8 | import numpy as np
 9 | from sklearn.linear_model import ElasticNetCV
10 | from norm import NormalizePositive
11 | from sklearn import metrics
12 | 
13 | 
14 | def predict(train):
15 |     binary = (train > 0)
16 |     reg = ElasticNetCV(fit_intercept=True, alphas=[
17 |                        0.0125, 0.025, 0.05, .125, .25, .5, 1., 2., 4.])
18 |     norm = NormalizePositive()
19 |     train = norm.fit_transform(train)
20 | 
21 |     filled = train.copy()
22 |     # iterate over all users
23 |     for u in range(train.shape[0]):
24 |         # remove the current user for training
25 |         curtrain = np.delete(train, u, axis=0)
26 |         bu = binary[u]
27 |         if np.sum(bu) > 5:
28 |             reg.fit(curtrain[:,bu].T, train[u, bu])
29 | 
30 |             # Fill the values that were not there already
31 |             filled[u, ~bu] = reg.predict(curtrain[:,~bu].T)
32 |     return norm.inverse_transform(filled)
33 | 
34 | 
35 | def main(transpose_inputs=False):
36 |     from load_ml100k import get_train_test
37 |     train,test = get_train_test(random_state=12)
38 |     if transpose_inputs:
39 |         train = train.T
40 |         test = test.T
41 |     filled = predict(train)
42 |     r2 = metrics.r2_score(test[test > 0], filled[test > 0])
43 | 
44 |     print('R2 score ({} regression): {:.1%}'.format(
45 |         ('movie' if transpose_inputs else 'user'),
46 |         r2))
47 | 
48 | if __name__ == '__main__':
49 |     main()
50 |     main(transpose_inputs=True)
51 | 


--------------------------------------------------------------------------------
/ch08/similar_movie.py:
--------------------------------------------------------------------------------
 1 | # This code is supporting material for the book
 2 | # Building Machine Learning Systems with Python
 3 | # by Willi Richert and Luis Pedro Coelho
 4 | # published by PACKT Publishing
 5 | #
 6 | # It is made available under the MIT License
 7 | 
 8 | from __future__ import print_function
 9 | import numpy as np
10 | 
11 | 
12 | def nn_movie(ureviews, reviews, uid, mid, k=1):
13 |     '''Movie neighbor based classifier
14 | 
15 |     Parameters
16 |     ----------
17 |     ureviews : ndarray
18 |     reviews : ndarray
19 |     uid : int
20 |         index of user
21 |     mid : int
22 |         index of movie
23 |     k : int
24 |         index of neighbor to return
25 | 
26 |     Returns
27 |     -------
28 |     pred : float
29 |     '''
30 |     X = ureviews
31 |     y = ureviews[mid].copy()
32 |     y -= y.mean()
33 |     y /= (y.std() + 1e-5)
34 |     corrs = np.dot(X, y)
35 |     likes = corrs.argsort()
36 |     likes = likes[::-1]
37 |     c = 0
38 |     pred = 3.
39 |     for ell in likes:
40 |         if ell == mid:
41 |             continue
42 |         if reviews[uid, ell] > 0:
43 |             pred = reviews[uid, ell]
44 |             if c == k:
45 |                 return pred
46 |             c += 1
47 |     return pred
48 | 
49 | 
50 | def all_estimates(reviews, k=1):
51 |     '''Estimate all review ratings
52 |     '''
53 |     reviews = reviews.astype(float)
54 |     k -= 1
55 |     nusers, nmovies = reviews.shape
56 |     estimates = np.zeros_like(reviews)
57 |     for u in range(nusers):
58 |         ureviews = np.delete(reviews, u, axis=0)
59 |         ureviews -= ureviews.mean(0)
60 |         ureviews /= (ureviews.std(0) + 1e-5)
61 |         ureviews = ureviews.T.copy()
62 |         for m in np.where(reviews[u] > 0)[0]:
63 |             estimates[u, m] = nn_movie(ureviews, reviews, u, m, k)
64 |     return estimates
65 | 
66 | if __name__ == '__main__':
67 |     from load_ml100k import load
68 |     reviews = load()
69 |     estimates = all_estimates(reviews)
70 |     error = (estimates - reviews)
71 |     error **= 2
72 |     error = error[reviews > 0]
73 |     rmse = np.sqrt(error.mean())
74 |     print("RMSE is {0}.".format(rmse))
75 | 


--------------------------------------------------------------------------------
/ch08/stacked.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import load_ml100k
 3 | import regression
 4 | import corrneighbours
 5 | from sklearn import linear_model, metrics
 6 | import norm
 7 | 
 8 | def predict(train):
 9 |     tr_train,tr_test = load_ml100k.get_train_test(train, random_state=34)
10 |     tr_predicted0 = regression.predict(tr_train)
11 |     tr_predicted1 = regression.predict(tr_train.T).T
12 |     tr_predicted2 = corrneighbours.predict(tr_train)
13 |     tr_predicted3 = corrneighbours.predict(tr_train.T).T
14 |     tr_predicted4 = norm.predict(tr_train)
15 |     tr_predicted5 = norm.predict(tr_train.T).T
16 |     stack_tr = np.array([
17 |         tr_predicted0[tr_test > 0],
18 |         tr_predicted1[tr_test > 0],
19 |         tr_predicted2[tr_test > 0],
20 |         tr_predicted3[tr_test > 0],
21 |         tr_predicted4[tr_test > 0],
22 |         tr_predicted5[tr_test > 0],
23 |         ]).T
24 | 
25 |     lr = linear_model.LinearRegression()
26 |     lr.fit(stack_tr, tr_test[tr_test > 0])
27 | 
28 |     stack_te = np.array([
29 |         tr_predicted0.ravel(),
30 |         tr_predicted1.ravel(),
31 |         tr_predicted2.ravel(),
32 |         tr_predicted3.ravel(),
33 |         tr_predicted4.ravel(),
34 |         tr_predicted5.ravel(),
35 |         ]).T
36 | 
37 |     return lr.predict(stack_te).reshape(train.shape)
38 | 
39 | 
40 | def main():
41 |     train,test = load_ml100k.get_train_test(random_state=12)
42 |     predicted = predict(train)
43 |     r2 = metrics.r2_score(test[test > 0], predicted[test > 0])
44 |     print('R2 stacked: {:.2%}'.format(r2))
45 | 
46 | if __name__ == '__main__':
47 |     main()
48 | 


--------------------------------------------------------------------------------
/ch09/01_fft_based_classifier.py:
--------------------------------------------------------------------------------
  1 | # This code is supporting material for the book
  2 | # Building Machine Learning Systems with Python
  3 | # by Willi Richert and Luis Pedro Coelho
  4 | # published by PACKT Publishing
  5 | #
  6 | # It is made available under the MIT License
  7 | 
  8 | import numpy as np
  9 | from collections import defaultdict
 10 | 
 11 | from sklearn.metrics import precision_recall_curve, roc_curve
 12 | from sklearn.metrics import auc
 13 | from sklearn.cross_validation import ShuffleSplit
 14 | 
 15 | from sklearn.metrics import confusion_matrix
 16 | 
 17 | from utils import plot_pr, plot_roc, plot_confusion_matrix, GENRE_LIST
 18 | 
 19 | from fft import read_fft
 20 | 
 21 | genre_list = GENRE_LIST
 22 | 
 23 | 
 24 | def train_model(clf_factory, X, Y, name, plot=False):
 25 |     labels = np.unique(Y)
 26 | 
 27 |     cv = ShuffleSplit(
 28 |         n=len(X), n_iter=1, test_size=0.3, indices=True, random_state=0)
 29 | 
 30 |     train_errors = []
 31 |     test_errors = []
 32 | 
 33 |     scores = []
 34 |     pr_scores = defaultdict(list)
 35 |     precisions, recalls, thresholds = defaultdict(
 36 |         list), defaultdict(list), defaultdict(list)
 37 | 
 38 |     roc_scores = defaultdict(list)
 39 |     tprs = defaultdict(list)
 40 |     fprs = defaultdict(list)
 41 | 
 42 |     clfs = []  # just to later get the median
 43 | 
 44 |     cms = []
 45 | 
 46 |     for train, test in cv:
 47 |         X_train, y_train = X[train], Y[train]
 48 |         X_test, y_test = X[test], Y[test]
 49 | 
 50 |         clf = clf_factory()
 51 |         clf.fit(X_train, y_train)
 52 |         clfs.append(clf)
 53 | 
 54 |         train_score = clf.score(X_train, y_train)
 55 |         test_score = clf.score(X_test, y_test)
 56 |         scores.append(test_score)
 57 | 
 58 |         train_errors.append(1 - train_score)
 59 |         test_errors.append(1 - test_score)
 60 | 
 61 |         y_pred = clf.predict(X_test)
 62 |         cm = confusion_matrix(y_test, y_pred)
 63 |         cms.append(cm)
 64 | 
 65 |         for label in labels:
 66 |             y_label_test = np.asarray(y_test == label, dtype=int)
 67 |             proba = clf.predict_proba(X_test)
 68 |             proba_label = proba[:, label]
 69 | 
 70 |             precision, recall, pr_thresholds = precision_recall_curve(
 71 |                 y_label_test, proba_label)
 72 |             pr_scores[label].append(auc(recall, precision))
 73 |             precisions[label].append(precision)
 74 |             recalls[label].append(recall)
 75 |             thresholds[label].append(pr_thresholds)
 76 | 
 77 |             fpr, tpr, roc_thresholds = roc_curve(y_label_test, proba_label)
 78 |             roc_scores[label].append(auc(fpr, tpr))
 79 |             tprs[label].append(tpr)
 80 |             fprs[label].append(fpr)
 81 | 
 82 |     if plot:
 83 |         for label in labels:
 84 |             print("Plotting %s" % genre_list[label])
 85 |             scores_to_sort = roc_scores[label]
 86 |             median = np.argsort(scores_to_sort)[len(scores_to_sort) / 2]
 87 | 
 88 |             desc = "%s %s" % (name, genre_list[label])
 89 |             plot_pr(pr_scores[label][median], desc, precisions[label][median],
 90 |                     recalls[label][median], label='%s vs rest' % genre_list[label])
 91 |             plot_roc(roc_scores[label][median], desc, tprs[label][median],
 92 |                      fprs[label][median], label='%s vs rest' % genre_list[label])
 93 | 
 94 |     all_pr_scores = np.asarray(pr_scores.values()).flatten()
 95 |     summary = (np.mean(scores), np.std(scores),
 96 |                np.mean(all_pr_scores), np.std(all_pr_scores))
 97 |     print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary)
 98 | 
 99 |     return np.mean(train_errors), np.mean(test_errors), np.asarray(cms)
100 | 
101 | 
102 | def create_model():
103 |     from sklearn.linear_model.logistic import LogisticRegression
104 |     clf = LogisticRegression()
105 | 
106 |     return clf
107 | 
108 | 
109 | if __name__ == "__main__":
110 |     X, y = read_fft(genre_list)
111 | 
112 |     train_avg, test_avg, cms = train_model(
113 |         create_model, X, y, "Log Reg FFT", plot=True)
114 | 
115 |     cm_avg = np.mean(cms, axis=0)
116 |     cm_norm = cm_avg / np.sum(cm_avg, axis=0)
117 | 
118 |     plot_confusion_matrix(cm_norm, genre_list, "fft",
119 |                           "Confusion matrix of an FFT based classifier")
120 | 


--------------------------------------------------------------------------------
/ch09/02_ceps_based_classifier.py:
--------------------------------------------------------------------------------
  1 | # This code is supporting material for the book
  2 | # Building Machine Learning Systems with Python
  3 | # by Willi Richert and Luis Pedro Coelho
  4 | # published by PACKT Publishing
  5 | #
  6 | # It is made available under the MIT License
  7 | 
  8 | import numpy as np
  9 | from collections import defaultdict
 10 | 
 11 | from sklearn.metrics import precision_recall_curve, roc_curve
 12 | from sklearn.metrics import auc
 13 | from sklearn.cross_validation import ShuffleSplit
 14 | 
 15 | from sklearn.metrics import confusion_matrix
 16 | 
 17 | from utils import plot_roc, plot_confusion_matrix, GENRE_LIST
 18 | 
 19 | from ceps import read_ceps
 20 | 
 21 | 
 22 | genre_list = GENRE_LIST
 23 | 
 24 | 
 25 | def train_model(clf_factory, X, Y, name, plot=False):
 26 |     labels = np.unique(Y)
 27 | 
 28 |     cv = ShuffleSplit(
 29 |         n=len(X), n_iter=1, test_size=0.3, indices=True, random_state=0)
 30 | 
 31 |     train_errors = []
 32 |     test_errors = []
 33 | 
 34 |     scores = []
 35 |     pr_scores = defaultdict(list)
 36 |     precisions, recalls, thresholds = defaultdict(
 37 |         list), defaultdict(list), defaultdict(list)
 38 | 
 39 |     roc_scores = defaultdict(list)
 40 |     tprs = defaultdict(list)
 41 |     fprs = defaultdict(list)
 42 | 
 43 |     clfs = []  # just to later get the median
 44 | 
 45 |     cms = []
 46 | 
 47 |     for train, test in cv:
 48 |         X_train, y_train = X[train], Y[train]
 49 |         X_test, y_test = X[test], Y[test]
 50 | 
 51 |         clf = clf_factory()
 52 |         clf.fit(X_train, y_train)
 53 |         clfs.append(clf)
 54 | 
 55 |         train_score = clf.score(X_train, y_train)
 56 |         test_score = clf.score(X_test, y_test)
 57 |         scores.append(test_score)
 58 | 
 59 |         train_errors.append(1 - train_score)
 60 |         test_errors.append(1 - test_score)
 61 | 
 62 |         y_pred = clf.predict(X_test)
 63 |         cm = confusion_matrix(y_test, y_pred)
 64 |         cms.append(cm)
 65 | 
 66 |         for label in labels:
 67 |             y_label_test = np.asarray(y_test == label, dtype=int)
 68 |             proba = clf.predict_proba(X_test)
 69 |             proba_label = proba[:, label]
 70 | 
 71 |             precision, recall, pr_thresholds = precision_recall_curve(
 72 |                 y_label_test, proba_label)
 73 |             pr_scores[label].append(auc(recall, precision))
 74 |             precisions[label].append(precision)
 75 |             recalls[label].append(recall)
 76 |             thresholds[label].append(pr_thresholds)
 77 | 
 78 |             fpr, tpr, roc_thresholds = roc_curve(y_label_test, proba_label)
 79 |             roc_scores[label].append(auc(fpr, tpr))
 80 |             tprs[label].append(tpr)
 81 |             fprs[label].append(fpr)
 82 | 
 83 |     if plot:
 84 |         for label in labels:
 85 |             print("Plotting %s" % genre_list[label])
 86 |             scores_to_sort = roc_scores[label]
 87 |             median = np.argsort(scores_to_sort)[len(scores_to_sort) / 2]
 88 | 
 89 |             desc = "%s %s" % (name, genre_list[label])
 90 |             plot_roc(roc_scores[label][median], desc, tprs[label][median],
 91 |                      fprs[label][median], label='%s vs rest' % genre_list[label])
 92 | 
 93 |     all_pr_scores = np.asarray(pr_scores.values()).flatten()
 94 |     summary = (np.mean(scores), np.std(scores),
 95 |                np.mean(all_pr_scores), np.std(all_pr_scores))
 96 |     print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary)
 97 | 
 98 |     return np.mean(train_errors), np.mean(test_errors), np.asarray(cms)
 99 | 
100 | 
101 | def create_model():
102 |     from sklearn.linear_model.logistic import LogisticRegression
103 |     clf = LogisticRegression()
104 | 
105 |     return clf
106 | 
107 | 
108 | if __name__ == "__main__":
109 |     X, y = read_ceps(genre_list)
110 | 
111 |     train_avg, test_avg, cms = train_model(
112 |         create_model, X, y, "Log Reg CEPS", plot=True)
113 | 
114 |     cm_avg = np.mean(cms, axis=0)
115 |     cm_norm = cm_avg / np.sum(cm_avg, axis=0)
116 | 
117 |     plot_confusion_matrix(cm_norm, genre_list, "ceps",
118 |                           "Confusion matrix of a CEPS based classifier")
119 | 


--------------------------------------------------------------------------------
/ch09/Makefile:
--------------------------------------------------------------------------------
 1 | CHART_DIR = charts
 2 | 
 3 | fft:
 4 | 	python 01_fft_based_classifier.py
 5 | 
 6 | ceps:
 7 | 	python 02_ceps_based_classifier.py
 8 | 
 9 | rocs_fft.png: 
10 | 	convert $(CHART_DIR)/roc_Log_Reg_FFT_classical.png $(CHART_DIR)/roc_Log_Reg_FFT_jazz.png +append row1.png
11 | 	convert $(CHART_DIR)/roc_Log_Reg_FFT_country.png $(CHART_DIR)/roc_Log_Reg_FFT_pop.png +append row2.png
12 | 	convert $(CHART_DIR)/roc_Log_Reg_FFT_rock.png $(CHART_DIR)/roc_Log_Reg_FFT_metal.png +append row3.png
13 | 	convert row1.png row2.png row3.png -append $(CHART_DIR)/rocs_fft.png
14 | 	
15 | rocs_ceps.png: 
16 | 	convert $(CHART_DIR)/roc_Log_Reg_CEPS_classical.png $(CHART_DIR)/roc_Log_Reg_CEPS_jazz.png +append row1.png
17 | 	convert $(CHART_DIR)/roc_Log_Reg_CEPS_country.png $(CHART_DIR)/roc_Log_Reg_CEPS_pop.png +append row2.png
18 | 	convert $(CHART_DIR)/roc_Log_Reg_CEPS_rock.png $(CHART_DIR)/roc_Log_Reg_CEPS_metal.png +append row3.png
19 | 	convert row1.png row2.png row3.png -append $(CHART_DIR)/rocs_ceps.png
20 | 	
21 | roc_pr.png: fft
22 | 	convert $(CHART_DIR)/pr_Log_Reg_FFT_country.png $(CHART_DIR)/roc_Log_Reg_FFT_country.png +append roc_pr.png
23 | 
24 | sox sine_a.wav sine_b.wav sine_mix.wav:
25 | 	sox --null -r 22050 sine_a.wav synth 0.2 sine 400
26 | 	sox --null -r 22050 sine_b.wav synth 0.2 sine 3000
27 | 	sox --combine mix --volume 1 sine_b.wav --volume 0.5 sine_a.wav sine_mix.wav
28 | 
29 | fft_demo: sine_a.wav sine_b.wav sine_mix.wav
30 | 	python fft.py 
31 | 	convert sine_a_wav_fft.png sine_b_wav_fft.png sine_mix_wav_fft.png -append fft_demo.png
32 | 	
33 | 	python fft.py /media/sf_P/pymlbook-data/09-genre-class/genres/jazz/jazz.00012.wav
34 | 	mv jazz.00012_wav_fft.png fft_example.png
35 | 	
36 | 
37 | 


--------------------------------------------------------------------------------
/ch09/ceps.py:
--------------------------------------------------------------------------------
 1 | # This code is supporting material for the book
 2 | # Building Machine Learning Systems with Python
 3 | # by Willi Richert and Luis Pedro Coelho
 4 | # published by PACKT Publishing
 5 | #
 6 | # It is made available under the MIT License
 7 | 
 8 | import os
 9 | import glob
10 | import sys
11 | 
12 | import numpy as np
13 | import scipy
14 | import scipy.io.wavfile
15 | from scikits.talkbox.features import mfcc
16 | 
17 | from utils import GENRE_DIR
18 | 
19 | 
20 | def write_ceps(ceps, fn):
21 |     """
22 |     Write the MFCC to separate files to speed up processing.
23 |     """
24 |     base_fn, ext = os.path.splitext(fn)
25 |     data_fn = base_fn + ".ceps"
26 |     np.save(data_fn, ceps)
27 |     print("Written %s"%data_fn)
28 | 
29 | 
30 | def create_ceps(fn):
31 |     sample_rate, X = scipy.io.wavfile.read(fn)
32 | 
33 |     ceps, mspec, spec = mfcc(X)
34 |     write_ceps(ceps, fn)
35 | 
36 | 
37 | def read_ceps(genre_list, base_dir=GENRE_DIR):
38 |     X = []
39 |     y = []
40 |     for label, genre in enumerate(genre_list):
41 |         for fn in glob.glob(os.path.join(base_dir, genre, "*.ceps.npy")):
42 |             ceps = np.load(fn)
43 |             num_ceps = len(ceps)
44 |             X.append(
45 |                 np.mean(ceps[int(num_ceps / 10):int(num_ceps * 9 / 10)], axis=0))
46 |             y.append(label)
47 | 
48 |     return np.array(X), np.array(y)
49 | 
50 | 
51 | if __name__ == "__main__":
52 |     os.chdir(GENRE_DIR)
53 |     glob_wav = os.path.join(sys.argv[1], "*.wav")
54 |     print(glob_wav)
55 |     for fn in glob.glob(glob_wav):
56 |         create_ceps(fn)
57 | 


--------------------------------------------------------------------------------
/ch09/fft.py:
--------------------------------------------------------------------------------
  1 | # This code is supporting material for the book
  2 | # Building Machine Learning Systems with Python
  3 | # by Willi Richert and Luis Pedro Coelho
  4 | # published by PACKT Publishing
  5 | #
  6 | # It is made available under the MIT License
  7 | 
  8 | import sys
  9 | import os
 10 | import glob
 11 | 
 12 | import numpy as np
 13 | import scipy
 14 | import scipy.io.wavfile
 15 | 
 16 | from utils import GENRE_DIR, CHART_DIR
 17 | 
 18 | import matplotlib.pyplot as plt
 19 | from matplotlib.ticker import EngFormatter
 20 | 
 21 | 
 22 | def write_fft(fft_features, fn):
 23 |     """
 24 |     Write the FFT features to separate files to speed up processing.
 25 |     """
 26 |     base_fn, ext = os.path.splitext(fn)
 27 |     data_fn = base_fn + ".fft"
 28 | 
 29 |     np.save(data_fn, fft_features)
 30 |     print("Written "%data_fn)
 31 | 
 32 | 
 33 | def create_fft(fn):
 34 |     sample_rate, X = scipy.io.wavfile.read(fn)
 35 | 
 36 |     fft_features = abs(scipy.fft(X)[:1000])
 37 |     write_fft(fft_features, fn)
 38 | 
 39 | 
 40 | def read_fft(genre_list, base_dir=GENRE_DIR):
 41 |     X = []
 42 |     y = []
 43 |     for label, genre in enumerate(genre_list):
 44 |         genre_dir = os.path.join(base_dir, genre, "*.fft.npy")
 45 |         file_list = glob.glob(genre_dir)
 46 |         assert(file_list), genre_dir
 47 |         for fn in file_list:
 48 |             fft_features = np.load(fn)
 49 | 
 50 |             X.append(fft_features[:2000])
 51 |             y.append(label)
 52 | 
 53 |     return np.array(X), np.array(y)
 54 | 
 55 | 
 56 | def plot_wav_fft(wav_filename, desc=None):
 57 |     plt.clf()
 58 |     plt.figure(num=None, figsize=(6, 4))
 59 |     sample_rate, X = scipy.io.wavfile.read(wav_filename)
 60 |     spectrum = np.fft.fft(X)
 61 |     freq = np.fft.fftfreq(len(X), 1.0 / sample_rate)
 62 | 
 63 |     plt.subplot(211)
 64 |     num_samples = 200.0
 65 |     plt.xlim(0, num_samples / sample_rate)
 66 |     plt.xlabel("time [s]")
 67 |     plt.title(desc or wav_filename)
 68 |     plt.plot(np.arange(num_samples) / sample_rate, X[:num_samples])
 69 |     plt.grid(True)
 70 | 
 71 |     plt.subplot(212)
 72 |     plt.xlim(0, 5000)
 73 |     plt.xlabel("frequency [Hz]")
 74 |     plt.xticks(np.arange(5) * 1000)
 75 |     if desc:
 76 |         desc = desc.strip()
 77 |         fft_desc = desc[0].lower() + desc[1:]
 78 |     else:
 79 |         fft_desc = wav_filename
 80 |     plt.title("FFT of %s" % fft_desc)
 81 |     plt.plot(freq, abs(spectrum), linewidth=5)
 82 |     plt.grid(True)
 83 | 
 84 |     plt.tight_layout()
 85 | 
 86 |     rel_filename = os.path.split(wav_filename)[1]
 87 |     plt.savefig("%s_wav_fft.png" % os.path.splitext(rel_filename)[0],
 88 |                 bbox_inches='tight')
 89 | 
 90 |     plt.show()
 91 | 
 92 | 
 93 | def plot_wav_fft_demo():
 94 |     plot_wav_fft("sine_a.wav", "400Hz sine wave")
 95 |     plot_wav_fft("sine_b.wav", "3,000Hz sine wave")
 96 |     plot_wav_fft("sine_mix.wav", "Mixed sine wave")
 97 | 
 98 | 
 99 | def plot_specgram(ax, fn):
100 |     sample_rate, X = scipy.io.wavfile.read(fn)
101 |     ax.specgram(X, Fs=sample_rate, xextent=(0, 30))
102 | 
103 | 
104 | def plot_specgrams(base_dir=CHART_DIR):
105 |     """
106 |     Plot a bunch of spectrograms of wav files in different genres
107 |     """
108 |     plt.clf()
109 |     genres = ["classical", "jazz", "country", "pop", "rock", "metal"]
110 |     num_files = 3
111 |     f, axes = plt.subplots(len(genres), num_files)
112 | 
113 |     for genre_idx, genre in enumerate(genres):
114 |         for idx, fn in enumerate(glob.glob(os.path.join(GENRE_DIR, genre, "*.wav"))):
115 |             if idx == num_files:
116 |                 break
117 |             axis = axes[genre_idx, idx]
118 |             axis.yaxis.set_major_formatter(EngFormatter())
119 |             axis.set_title("%s song %i" % (genre, idx + 1))
120 |             plot_specgram(axis, fn)
121 | 
122 |     specgram_file = os.path.join(base_dir, "Spectrogram_Genres.png")
123 |     plt.savefig(specgram_file, bbox_inches="tight")
124 | 
125 |     plt.show()
126 | 
127 | 
128 | if __name__ == "__main__":
129 |     # for fn in glob.glob(os.path.join(sys.argv[1], "*.wav")):
130 |     #    create_fft(fn)
131 | 
132 |     # plot_decomp()
133 | 
134 |     if len(sys.argv) > 1:
135 |         plot_wav_fft(sys.argv[1], desc="some sample song")
136 |     else:
137 |         plot_wav_fft_demo()
138 | 
139 |     plot_specgrams()
140 | 


--------------------------------------------------------------------------------
/ch10/.gitignore:
--------------------------------------------------------------------------------
1 | AnimTransDistr/
2 | 


--------------------------------------------------------------------------------
/ch10/README.rst:
--------------------------------------------------------------------------------
 1 | ==========
 2 | Chapter 10
 3 | ==========
 4 | 
 5 | Support code for *Chapter 10: Pattern Recognition & Computer Vision*
 6 | 
 7 | Data
 8 | ----
 9 | 
10 | This chapter relies on a publicly available dataset (which can be downloaded
11 | using the ``download.sh`` script inside the ``data/`` directory) as well the
12 | dataset that is packaged with the repository at ``../SimpleImageDataset/``.
13 | 
14 | Running ``download.sh`` will retrieve the other dataset into a directory
15 | ``AnimTransDistr/``.
16 | 
17 | Scripts
18 | -------
19 | 
20 | chapter.py
21 |     Code as written in the book.
22 | thresholded_figure.py
23 |     Computes the thresholded figures, including after Gaussian blurring
24 | lena-ring.py
25 |     Lena image with center in focus and blurred edges
26 | figure10.py
27 |     Just paste two images next to each others
28 | features.py
29 |     Contains the color histogram function from the book as well as a simple
30 |     wrapper around ``mahotas.texture.haralick``
31 | simple_classification.py
32 |     Classify SimpleImageDataset with texture features + color histogram features
33 | large_classification.py
34 |     Classify ``AnimTransDistr`` with both texture and SURF features.
35 | neighbors.py
36 |     Computes image neighbors as well as the neighbor figure from the book.
37 | 
38 | 


--------------------------------------------------------------------------------
/ch10/download.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | mkdir -p AnimTransDistr
4 | cd AnimTransDistr
5 | curl -O http://vision.stanford.edu/Datasets/AnimTransDistr.rar
6 | unrar x AnimTransDistr.rar
7 | # The following file is a weird file:
8 | rm Anims/104034.jpg
9 | 


--------------------------------------------------------------------------------
/ch10/features.py:
--------------------------------------------------------------------------------
 1 | # This code is supporting material for the book
 2 | # Building Machine Learning Systems with Python
 3 | # by Willi Richert and Luis Pedro Coelho
 4 | # published by PACKT Publishing
 5 | #
 6 | # It is made available under the MIT License
 7 | 
 8 | import numpy as np
 9 | import mahotas as mh
10 | 
11 | 
12 | def edginess_sobel(image):
13 |     '''Measure the "edginess" of an image
14 | 
15 |     image should be a 2d numpy array (an image)
16 | 
17 |     Returns a floating point value which is higher the "edgier" the image is.
18 | 
19 |     '''
20 |     edges = mh.sobel(image, just_filter=True)
21 |     edges = edges.ravel()
22 |     return np.sqrt(np.dot(edges, edges))
23 | 
24 | def texture(im):
25 |     '''Compute features for an image
26 | 
27 |     Parameters
28 |     ----------
29 |     im : ndarray
30 | 
31 |     Returns
32 |     -------
33 |     fs : ndarray
34 |         1-D array of features
35 |     '''
36 |     im = im.astype(np.uint8)
37 |     return mh.features.haralick(im).ravel()
38 | 
39 | 
40 | def chist(im):
41 |     '''Compute color histogram of input image
42 | 
43 |     Parameters
44 |     ----------
45 |     im : ndarray
46 |         should be an RGB image
47 | 
48 |     Returns
49 |     -------
50 |     c : ndarray
51 |         1-D array of histogram values
52 |     '''
53 | 
54 |     # Downsample pixel values:
55 |     im = im // 64
56 | 
57 |     # We can also implement the following by using np.histogramdd
58 |     # im = im.reshape((-1,3))
59 |     # bins = [np.arange(5), np.arange(5), np.arange(5)]
60 |     # hist = np.histogramdd(im, bins=bins)[0]
61 |     # hist = hist.ravel()
62 | 
63 |     # Separate RGB channels:
64 |     r,g,b = im.transpose((2,0,1))
65 | 
66 |     pixels = 1 * r + 4 * g + 16 * b
67 |     hist = np.bincount(pixels.ravel(), minlength=64)
68 |     hist = hist.astype(float)
69 |     return np.log1p(hist)
70 | 
71 | 


--------------------------------------------------------------------------------
/ch10/figure10.py:
--------------------------------------------------------------------------------
 1 | # This code is supporting material for the book
 2 | # Building Machine Learning Systems with Python
 3 | # by Willi Richert and Luis Pedro Coelho
 4 | # published by PACKT Publishing
 5 | #
 6 | # It is made available under the MIT License
 7 | 
 8 | import numpy as np
 9 | import mahotas as mh
10 | 
11 | # This little script just builds an image with two examples, side-by-side:
12 | 
13 | text = mh.imread("../SimpleImageDataset/text21.jpg")
14 | building = mh.imread("../SimpleImageDataset/building00.jpg")
15 | h, w, _ = text.shape
16 | canvas = np.zeros((h, 2 * w + 128, 3), np.uint8)
17 | canvas[:, -w:] = building
18 | canvas[:, :w] = text
19 | canvas = canvas[::4, ::4]
20 | mh.imsave('figure10.jpg', canvas)
21 | 


--------------------------------------------------------------------------------
/ch10/large_classification.py:
--------------------------------------------------------------------------------
  1 | # This code is supporting material for the book
  2 | # Building Machine Learning Systems with Python
  3 | # by Willi Richert and Luis Pedro Coelho
  4 | # published by PACKT Publishing
  5 | #
  6 | # It is made available under the MIT License
  7 | 
  8 | from __future__ import print_function
  9 | import mahotas as mh
 10 | from glob import glob
 11 | from sklearn import cross_validation
 12 | from sklearn.linear_model import LogisticRegression
 13 | from sklearn.pipeline import Pipeline
 14 | from sklearn.preprocessing import StandardScaler
 15 | from sklearn.grid_search import GridSearchCV
 16 | import numpy as np
 17 | 
 18 | basedir = 'AnimTransDistr'
 19 | print('This script will test classification of the AnimTransDistr dataset')
 20 | 
 21 | C_range = 10.0 ** np.arange(-4, 3)
 22 | grid = GridSearchCV(LogisticRegression(), param_grid={'C' : C_range})
 23 | clf = Pipeline([('preproc', StandardScaler()),
 24 |                 ('classifier', grid)])
 25 | 
 26 | def features_for(im):
 27 |     from features import chist
 28 |     im = mh.imread(im)
 29 |     img = mh.colors.rgb2grey(im).astype(np.uint8)
 30 |     return np.concatenate([mh.features.haralick(img).ravel(),
 31 |                                 chist(im)])
 32 | 
 33 | def images():
 34 |     '''Iterate over all (image,label) pairs
 35 | 
 36 |     This function will return
 37 |     '''
 38 |     for ci, cl in enumerate(classes):
 39 |         images = glob('{}/{}/*.jpg'.format(basedir, cl))
 40 |         for im in sorted(images):
 41 |             yield im, ci
 42 | 
 43 | classes = [
 44 |     'Anims',
 45 |     'Cars',
 46 |     'Distras',
 47 |     'Trans',
 48 | ]
 49 | 
 50 | print('Computing whole-image texture features...')
 51 | ifeatures = []
 52 | labels = []
 53 | for im, ell in images():
 54 |     ifeatures.append(features_for(im))
 55 |     labels.append(ell)
 56 | 
 57 | ifeatures = np.array(ifeatures)
 58 | labels = np.array(labels)
 59 | 
 60 | cv = cross_validation.KFold(len(ifeatures), 5, shuffle=True, random_state=123)
 61 | scores0 = cross_validation.cross_val_score(
 62 |     clf, ifeatures, labels, cv=cv)
 63 | print('Accuracy (5 fold x-val) with Logistic Regression [image features]: {:.1%}'.format(
 64 |     scores0.mean()))
 65 | 
 66 | 
 67 | from sklearn.cluster import KMeans
 68 | from mahotas.features import surf
 69 | 
 70 | 
 71 | print('Computing SURF descriptors...')
 72 | alldescriptors = []
 73 | for im,_ in images():
 74 |     im = mh.imread(im, as_grey=True)
 75 |     im = im.astype(np.uint8)
 76 | 
 77 |     # To use dense sampling, you can try the following line:
 78 |     # alldescriptors.append(surf.dense(im, spacing=16))
 79 |     alldescriptors.append(surf.surf(im, descriptor_only=True))
 80 | 
 81 | print('Descriptor computation complete.')
 82 | k = 256
 83 | km = KMeans(k)
 84 | 
 85 | concatenated = np.concatenate(alldescriptors)
 86 | print('Number of descriptors: {}'.format(
 87 |         len(concatenated)))
 88 | concatenated = concatenated[::64]
 89 | print('Clustering with K-means...')
 90 | km.fit(concatenated)
 91 | sfeatures = []
 92 | for d in alldescriptors:
 93 |     c = km.predict(d)
 94 |     sfeatures.append(np.bincount(c, minlength=k))
 95 | sfeatures = np.array(sfeatures, dtype=float)
 96 | print('predicting...')
 97 | score_SURF = cross_validation.cross_val_score(
 98 |     clf, sfeatures, labels, cv=cv).mean()
 99 | print('Accuracy (5 fold x-val) with Logistic Regression [SURF features]: {:.1%}'.format(
100 |     score_SURF.mean()))
101 | 
102 | 
103 | print('Performing classification with all features combined...')
104 | allfeatures = np.hstack([sfeatures, ifeatures])
105 | score_SURF_global = cross_validation.cross_val_score(
106 |     clf, allfeatures, labels, cv=cv).mean()
107 | print('Accuracy (5 fold x-val) with Logistic Regression [All features]: {:.1%}'.format(
108 |     score_SURF_global.mean()))
109 | 


--------------------------------------------------------------------------------
/ch10/lena-ring.py:
--------------------------------------------------------------------------------
 1 | # This code is supporting material for the book
 2 | # Building Machine Learning Systems with Python
 3 | # by Willi Richert and Luis Pedro Coelho
 4 | # published by PACKT Publishing
 5 | #
 6 | # It is made available under the MIT License
 7 | 
 8 | import mahotas as mh
 9 | import numpy as np
10 | 
11 | # Read in the image
12 | im = mh.demos.load('lena')
13 | 
14 | # This breaks up the image into RGB channels
15 | r, g, b = im.transpose(2, 0, 1)
16 | h, w = r.shape
17 | 
18 | # smooth the image per channel:
19 | r12 = mh.gaussian_filter(r, 12.)
20 | g12 = mh.gaussian_filter(g, 12.)
21 | b12 = mh.gaussian_filter(b, 12.)
22 | 
23 | # build back the RGB image
24 | im12 = mh.as_rgb(r12, g12, b12)
25 | 
26 | X, Y = np.mgrid[:h, :w]
27 | X = X - h / 2.
28 | Y = Y - w / 2.
29 | X /= X.max()
30 | Y /= Y.max()
31 | 
32 | # Array C will have the highest values in the center, fading out to the edges:
33 | 
34 | C = np.exp(-2. * (X ** 2 + Y ** 2))
35 | C -= C.min()
36 | C /= C.ptp()
37 | C = C[:, :, None]
38 | 
39 | # The final result is sharp in the centre and smooths out to the borders:
40 | ring = mh.stretch(im * C + (1 - C) * im12)
41 | mh.imsave('lena-ring.jpg', ring)
42 | 


--------------------------------------------------------------------------------
/ch10/neighbors.py:
--------------------------------------------------------------------------------
 1 | # This code is supporting material for the book
 2 | # Building Machine Learning Systems with Python
 3 | # by Willi Richert and Luis Pedro Coelho
 4 | # published by PACKT Publishing
 5 | 
 6 | import numpy as np
 7 | import mahotas as mh
 8 | from glob import glob
 9 | from features import texture, chist
10 | from matplotlib import pyplot as plt
11 | from sklearn.preprocessing import StandardScaler
12 | from scipy.spatial import distance
13 | 
14 | basedir = '../SimpleImageDataset/'
15 | 
16 | 
17 | haralicks = []
18 | chists = []
19 | 
20 | print('Computing features...')
21 | # Use glob to get all the images
22 | images = glob('{}/*.jpg'.format(basedir))
23 | # We sort the images to ensure that they are always processed in the same order
24 | # Otherwise, this would introduce some variation just based on the random
25 | # ordering that the filesystem uses
26 | images.sort()
27 | 
28 | for fname in images:
29 |     imc = mh.imread(fname)
30 |     imc = imc[200:-200,200:-200]
31 |     haralicks.append(texture(mh.colors.rgb2grey(imc)))
32 |     chists.append(chist(imc))
33 | 
34 | haralicks = np.array(haralicks)
35 | chists = np.array(chists)
36 | features = np.hstack([chists, haralicks])
37 | 
38 | print('Computing neighbors...')
39 | sc = StandardScaler()
40 | features = sc.fit_transform(features)
41 | dists = distance.squareform(distance.pdist(features))
42 | 
43 | print('Plotting...')
44 | fig, axes = plt.subplots(2, 9, figsize=(16,8))
45 | 
46 | # Remove ticks from all subplots
47 | for ax in axes.flat:
48 |     ax.set_xticks([])
49 |     ax.set_yticks([])
50 | 
51 | for ci,i in enumerate(range(0,90,10)):
52 |     left = images[i]
53 |     dists_left = dists[i]
54 |     right = dists_left.argsort()
55 |     # right[0] is the same as left[i], so pick the next closest element
56 |     right = right[1]
57 |     right = images[right]
58 |     left = mh.imread(left)
59 |     right = mh.imread(right)
60 |     axes[0, ci].imshow(left)
61 |     axes[1, ci].imshow(right)
62 | 
63 | fig.tight_layout()
64 | fig.savefig('figure_neighbors.png', dpi=300)
65 | 


--------------------------------------------------------------------------------
/ch10/scene00.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/ch10/scene00.jpg


--------------------------------------------------------------------------------
/ch10/simple_classification.py:
--------------------------------------------------------------------------------
 1 | # This code is supporting material for the book
 2 | # Building Machine Learning Systems with Python
 3 | # by Willi Richert and Luis Pedro Coelho
 4 | # published by PACKT Publishing
 5 | #
 6 | # It is made available under the MIT License
 7 | 
 8 | import mahotas as mh
 9 | import numpy as np
10 | from glob import glob
11 | 
12 | from features import texture, chist
13 | from sklearn.linear_model import LogisticRegression
14 | from sklearn.pipeline import Pipeline
15 | from sklearn.preprocessing import StandardScaler
16 | 
17 | basedir = '../SimpleImageDataset/'
18 | 
19 | 
20 | haralicks = []
21 | labels = []
22 | chists = []
23 | 
24 | print('This script will test (with cross-validation) classification of the simple 3 class dataset')
25 | print('Computing features...')
26 | # Use glob to get all the images
27 | images = glob('{}/*.jpg'.format(basedir))
28 | 
29 | # We sort the images to ensure that they are always processed in the same order
30 | # Otherwise, this would introduce some variation just based on the random
31 | # ordering that the filesystem uses
32 | for fname in sorted(images):
33 |     imc = mh.imread(fname)
34 |     haralicks.append(texture(mh.colors.rgb2grey(imc)))
35 |     chists.append(chist(imc))
36 | 
37 |     # Files are named like building00.jpg, scene23.jpg...
38 |     labels.append(fname[:-len('xx.jpg')])
39 | 
40 | print('Finished computing features.')
41 | 
42 | haralicks = np.array(haralicks)
43 | labels = np.array(labels)
44 | chists = np.array(chists)
45 | 
46 | haralick_plus_chists = np.hstack([chists, haralicks])
47 | 
48 | 
49 | # We use Logistic Regression because it achieves high accuracy on small(ish) datasets
50 | # Feel free to experiment with other classifiers
51 | clf = Pipeline([('preproc', StandardScaler()),
52 |                 ('classifier', LogisticRegression())])
53 | 
54 | from sklearn import cross_validation
55 | cv = cross_validation.LeaveOneOut(len(images))
56 | scores = cross_validation.cross_val_score(
57 |     clf, haralicks, labels, cv=cv)
58 | print('Accuracy (Leave-one-out) with Logistic Regression [haralick features]: {:.1%}'.format(
59 |     scores.mean()))
60 | 
61 | scores = cross_validation.cross_val_score(
62 |     clf, chists, labels, cv=cv)
63 | print('Accuracy (Leave-one-out) with Logistic Regression [color histograms]: {:.1%}'.format(
64 |     scores.mean()))
65 | 
66 | scores = cross_validation.cross_val_score(
67 |     clf, haralick_plus_chists, labels, cv=cv)
68 | print('Accuracy (Leave-one-out) with Logistic Regression [texture features + color histograms]: {:.1%}'.format(
69 |     scores.mean()))
70 | 
71 | 


--------------------------------------------------------------------------------
/ch10/threshold.py:
--------------------------------------------------------------------------------
 1 | # This code is supporting material for the book
 2 | # Building Machine Learning Systems with Python
 3 | # by Willi Richert and Luis Pedro Coelho
 4 | # published by PACKT Publishing
 5 | #
 6 | # It is made available under the MIT License
 7 | 
 8 | import numpy as np
 9 | import mahotas as mh
10 | 
11 | # Load our example image:
12 | image = mh.imread('../SimpleImageDataset/building05.jpg')
13 | 
14 | # Convert to greyscale
15 | image = mh.colors.rgb2gray(image, dtype=np.uint8)
16 | 
17 | # Compute a threshold value:
18 | thresh = mh.thresholding.otsu(image)
19 | print('Otsu threshold is {0}'.format(thresh))
20 | 
21 | # Compute the thresholded image
22 | otsubin = (image > thresh)
23 | print('Saving thresholded image (with Otsu threshold) to otsu-threshold.jpeg')
24 | mh.imsave('otsu-threshold.jpeg', otsubin.astype(np.uint8) * 255)
25 | 
26 | # Execute morphological opening to smooth out the edges
27 | otsubin = mh.open(otsubin, np.ones((15, 15)))
28 | mh.imsave('otsu-closed.jpeg', otsubin.astype(np.uint8) * 255)
29 | 
30 | # An alternative thresholding method:
31 | thresh = mh.thresholding.rc(image)
32 | print('Ridley-Calvard threshold is {0}'.format(thresh))
33 | print('Saving thresholded image (with Ridley-Calvard threshold) to rc-threshold.jpeg')
34 | mh.imsave('rc-threshold.jpeg', (image > thresh).astype(np.uint8) * 255)
35 | 


--------------------------------------------------------------------------------
/ch10/thresholded_figure.py:
--------------------------------------------------------------------------------
 1 | import mahotas as mh
 2 | import numpy as np
 3 | from matplotlib import pyplot as plt
 4 | 
 5 | # Load image & convert to B&W
 6 | image = mh.imread('../SimpleImageDataset/scene00.jpg')
 7 | image = mh.colors.rgb2grey(image, dtype=np.uint8)
 8 | plt.imshow(image)
 9 | plt.gray()
10 | plt.title('original image')
11 | 
12 | thresh = mh.thresholding.otsu(image)
13 | print('Otsu threshold is {}.'.format(thresh))
14 | 
15 | threshed = (image > thresh)
16 | plt.figure()
17 | plt.imshow(threshed)
18 | plt.title('threholded image')
19 | mh.imsave('thresholded.png', threshed.astype(np.uint8)*255)
20 | 
21 | im16 = mh.gaussian_filter(image, 16)
22 | 
23 | # Repeat the thresholding operations with the blurred image 
24 | thresh = mh.thresholding.otsu(im16.astype(np.uint8))
25 | threshed = (im16 > thresh)
26 | plt.figure()
27 | plt.imshow(threshed)
28 | plt.title('threholded image (after blurring)')
29 | print('Otsu threshold after blurring is {}.'.format(thresh))
30 | mh.imsave('thresholded16.png', threshed.astype(np.uint8)*255)
31 | plt.show()
32 | 


--------------------------------------------------------------------------------
/ch11/demo_corr.py:
--------------------------------------------------------------------------------
 1 | # This code is supporting material for the book
 2 | # Building Machine Learning Systems with Python
 3 | # by Willi Richert and Luis Pedro Coelho
 4 | # published by PACKT Publishing
 5 | #
 6 | # It is made available under the MIT License
 7 | 
 8 | import os
 9 | 
10 | from matplotlib import pylab
11 | import numpy as np
12 | import scipy
13 | from scipy.stats import norm, pearsonr
14 | 
15 | from utils import CHART_DIR
16 | 
17 | 
18 | def _plot_correlation_func(x, y):
19 | 
20 |     r, p = pearsonr(x, y)
21 |     title = "Cor($X_1$, $X_2$) = %.3f" % r
22 |     pylab.scatter(x, y)
23 |     pylab.title(title)
24 |     pylab.xlabel("$X_1$")
25 |     pylab.ylabel("$X_2$")
26 | 
27 |     f1 = scipy.poly1d(scipy.polyfit(x, y, 1))
28 |     pylab.plot(x, f1(x), "r--", linewidth=2)
29 |     # pylab.xticks([w*7*24 for w in [0,1,2,3,4]], ['week %i'%(w+1) for w in
30 |     # [0,1,2,3,4]])
31 | 
32 | 
33 | def plot_correlation_demo():
34 |     np.random.seed(0)  # to reproduce the data later on
35 |     pylab.clf()
36 |     pylab.figure(num=None, figsize=(8, 8))
37 | 
38 |     x = np.arange(0, 10, 0.2)
39 | 
40 |     pylab.subplot(221)
41 |     y = 0.5 * x + norm.rvs(1, scale=.01, size=len(x))
42 |     _plot_correlation_func(x, y)
43 | 
44 |     pylab.subplot(222)
45 |     y = 0.5 * x + norm.rvs(1, scale=.1, size=len(x))
46 |     _plot_correlation_func(x, y)
47 | 
48 |     pylab.subplot(223)
49 |     y = 0.5 * x + norm.rvs(1, scale=1, size=len(x))
50 |     _plot_correlation_func(x, y)
51 | 
52 |     pylab.subplot(224)
53 |     y = norm.rvs(1, scale=10, size=len(x))
54 |     _plot_correlation_func(x, y)
55 | 
56 |     pylab.autoscale(tight=True)
57 |     pylab.grid(True)
58 | 
59 |     filename = "corr_demo_1.png"
60 |     pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight")
61 | 
62 |     pylab.clf()
63 |     pylab.figure(num=None, figsize=(8, 8))
64 | 
65 |     x = np.arange(-5, 5, 0.2)
66 | 
67 |     pylab.subplot(221)
68 |     y = 0.5 * x ** 2 + norm.rvs(1, scale=.01, size=len(x))
69 |     _plot_correlation_func(x, y)
70 | 
71 |     pylab.subplot(222)
72 |     y = 0.5 * x ** 2 + norm.rvs(1, scale=.1, size=len(x))
73 |     _plot_correlation_func(x, y)
74 | 
75 |     pylab.subplot(223)
76 |     y = 0.5 * x ** 2 + norm.rvs(1, scale=1, size=len(x))
77 |     _plot_correlation_func(x, y)
78 | 
79 |     pylab.subplot(224)
80 |     y = 0.5 * x ** 2 + norm.rvs(1, scale=10, size=len(x))
81 |     _plot_correlation_func(x, y)
82 | 
83 |     pylab.autoscale(tight=True)
84 |     pylab.grid(True)
85 | 
86 |     filename = "corr_demo_2.png"
87 |     pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight")
88 | 
89 | if __name__ == '__main__':
90 |     plot_correlation_demo()
91 | 


--------------------------------------------------------------------------------
/ch11/demo_mds.py:
--------------------------------------------------------------------------------
  1 | # This code is supporting material for the book
  2 | # Building Machine Learning Systems with Python
  3 | # by Willi Richert and Luis Pedro Coelho
  4 | # published by PACKT Publishing
  5 | #
  6 | # It is made available under the MIT License
  7 | 
  8 | import os
  9 | 
 10 | import numpy as np
 11 | from matplotlib import pylab
 12 | from mpl_toolkits.mplot3d import Axes3D
 13 | 
 14 | from sklearn import linear_model, manifold, decomposition, datasets
 15 | logistic = linear_model.LogisticRegression()
 16 | 
 17 | from utils import CHART_DIR
 18 | 
 19 | np.random.seed(3)
 20 | 
 21 | # all examples will have three classes in this file
 22 | colors = ['r', 'g', 'b']
 23 | markers = ['o', 6, '*']
 24 | 
 25 | 
 26 | def plot_demo_1():
 27 |     X = np.c_[np.ones(5), 2 * np.ones(5), 10 * np.ones(5)].T
 28 |     y = np.array([0, 1, 2])
 29 | 
 30 |     fig = pylab.figure(figsize=(10, 4))
 31 | 
 32 |     ax = fig.add_subplot(121, projection='3d')
 33 |     ax.set_axis_bgcolor('white')
 34 | 
 35 |     mds = manifold.MDS(n_components=3)
 36 |     Xtrans = mds.fit_transform(X)
 37 | 
 38 |     for cl, color, marker in zip(np.unique(y), colors, markers):
 39 |         ax.scatter(
 40 |             Xtrans[y == cl][:, 0], Xtrans[y == cl][:, 1], Xtrans[y == cl][:, 2], c=color, marker=marker, edgecolor='black')
 41 |     pylab.title("MDS on example data set in 3 dimensions")
 42 |     ax.view_init(10, -15)
 43 | 
 44 |     mds = manifold.MDS(n_components=2)
 45 |     Xtrans = mds.fit_transform(X)
 46 | 
 47 |     ax = fig.add_subplot(122)
 48 |     for cl, color, marker in zip(np.unique(y), colors, markers):
 49 |         ax.scatter(
 50 |             Xtrans[y == cl][:, 0], Xtrans[y == cl][:, 1], c=color, marker=marker, edgecolor='black')
 51 |     pylab.title("MDS on example data set in 2 dimensions")
 52 | 
 53 |     filename = "mds_demo_1.png"
 54 |     pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight")
 55 | 
 56 | 
 57 | def plot_iris_mds():
 58 | 
 59 |     iris = datasets.load_iris()
 60 |     X = iris.data
 61 |     y = iris.target
 62 | 
 63 |     # MDS
 64 | 
 65 |     fig = pylab.figure(figsize=(10, 4))
 66 | 
 67 |     ax = fig.add_subplot(121, projection='3d')
 68 |     ax.set_axis_bgcolor('white')
 69 | 
 70 |     mds = manifold.MDS(n_components=3)
 71 |     Xtrans = mds.fit_transform(X)
 72 | 
 73 |     for cl, color, marker in zip(np.unique(y), colors, markers):
 74 |         ax.scatter(
 75 |             Xtrans[y == cl][:, 0], Xtrans[y == cl][:, 1], Xtrans[y == cl][:, 2], c=color, marker=marker, edgecolor='black')
 76 |     pylab.title("MDS on Iris data set in 3 dimensions")
 77 |     ax.view_init(10, -15)
 78 | 
 79 |     mds = manifold.MDS(n_components=2)
 80 |     Xtrans = mds.fit_transform(X)
 81 | 
 82 |     ax = fig.add_subplot(122)
 83 |     for cl, color, marker in zip(np.unique(y), colors, markers):
 84 |         ax.scatter(
 85 |             Xtrans[y == cl][:, 0], Xtrans[y == cl][:, 1], c=color, marker=marker, edgecolor='black')
 86 |     pylab.title("MDS on Iris data set in 2 dimensions")
 87 | 
 88 |     filename = "mds_demo_iris.png"
 89 |     pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight")
 90 | 
 91 |     # PCA
 92 | 
 93 |     fig = pylab.figure(figsize=(10, 4))
 94 | 
 95 |     ax = fig.add_subplot(121, projection='3d')
 96 |     ax.set_axis_bgcolor('white')
 97 | 
 98 |     pca = decomposition.PCA(n_components=3)
 99 |     Xtrans = pca.fit(X).transform(X)
100 | 
101 |     for cl, color, marker in zip(np.unique(y), colors, markers):
102 |         ax.scatter(
103 |             Xtrans[y == cl][:, 0], Xtrans[y == cl][:, 1], Xtrans[y == cl][:, 2], c=color, marker=marker, edgecolor='black')
104 |     pylab.title("PCA on Iris data set in 3 dimensions")
105 |     ax.view_init(50, -35)
106 | 
107 |     pca = decomposition.PCA(n_components=2)
108 |     Xtrans = pca.fit_transform(X)
109 | 
110 |     ax = fig.add_subplot(122)
111 |     for cl, color, marker in zip(np.unique(y), colors, markers):
112 |         ax.scatter(
113 |             Xtrans[y == cl][:, 0], Xtrans[y == cl][:, 1], c=color, marker=marker, edgecolor='black')
114 |     pylab.title("PCA on Iris data set in 2 dimensions")
115 | 
116 |     filename = "pca_demo_iris.png"
117 |     pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight")
118 | 
119 | 
120 | if __name__ == '__main__':
121 |     plot_demo_1()
122 |     plot_iris_mds()
123 | 


--------------------------------------------------------------------------------
/ch11/demo_mi.py:
--------------------------------------------------------------------------------
  1 | # This code is supporting material for the book
  2 | # Building Machine Learning Systems with Python
  3 | # by Willi Richert and Luis Pedro Coelho
  4 | # published by PACKT Publishing
  5 | #
  6 | # It is made available under the MIT License
  7 | 
  8 | import os
  9 | 
 10 | from matplotlib import pylab
 11 | import numpy as np
 12 | from scipy.stats import norm, entropy
 13 | 
 14 | from utils import CHART_DIR
 15 | 
 16 | 
 17 | def mutual_info(x, y, bins=10):
 18 |     counts_xy, bins_x, bins_y = np.histogram2d(x, y, bins=(bins, bins))
 19 |     counts_x, bins = np.histogram(x, bins=bins)
 20 |     counts_y, bins = np.histogram(y, bins=bins)
 21 | 
 22 |     counts_xy += 1
 23 |     counts_x += 1
 24 |     counts_y += 1
 25 |     P_xy = counts_xy / np.sum(counts_xy, dtype=float)
 26 |     P_x = counts_x / np.sum(counts_x, dtype=float)
 27 |     P_y = counts_y / np.sum(counts_y, dtype=float)
 28 | 
 29 |     I_xy = np.sum(P_xy * np.log2(P_xy / (P_x.reshape(-1, 1) * P_y)))
 30 | 
 31 |     return I_xy / (entropy(counts_x) + entropy(counts_y))
 32 | 
 33 | 
 34 | def plot_entropy():
 35 |     pylab.clf()
 36 |     pylab.figure(num=None, figsize=(5, 4))
 37 | 
 38 |     title = "Entropy $H(X)$"
 39 |     pylab.title(title)
 40 |     pylab.xlabel("$P(X=$coin will show heads up$)$")
 41 |     pylab.ylabel("$H(X)$")
 42 | 
 43 |     pylab.xlim(xmin=0, xmax=1.1)
 44 |     x = np.arange(0.001, 1, 0.001)
 45 |     y = -x * np.log2(x) - (1 - x) * np.log2(1 - x)
 46 |     pylab.plot(x, y)
 47 |     # pylab.xticks([w*7*24 for w in [0,1,2,3,4]], ['week %i'%(w+1) for w in
 48 |     # [0,1,2,3,4]])
 49 | 
 50 |     pylab.autoscale(tight=True)
 51 |     pylab.grid(True)
 52 | 
 53 |     filename = "entropy_demo.png"
 54 |     pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight")
 55 | 
 56 | 
 57 | def _plot_mi_func(x, y):
 58 | 
 59 |     mi = mutual_info(x, y)
 60 |     title = "NI($X_1$, $X_2$) = %.3f" % mi
 61 |     pylab.scatter(x, y)
 62 |     pylab.title(title)
 63 |     pylab.xlabel("$X_1$")
 64 |     pylab.ylabel("$X_2$")
 65 | 
 66 | 
 67 | def plot_mi_demo():
 68 |     np.random.seed(0)  # to reproduce the data later on
 69 |     pylab.clf()
 70 |     pylab.figure(num=None, figsize=(8, 8))
 71 | 
 72 |     x = np.arange(0, 10, 0.2)
 73 | 
 74 |     pylab.subplot(221)
 75 |     y = 0.5 * x + norm.rvs(1, scale=.01, size=len(x))
 76 |     _plot_mi_func(x, y)
 77 | 
 78 |     pylab.subplot(222)
 79 |     y = 0.5 * x + norm.rvs(1, scale=.1, size=len(x))
 80 |     _plot_mi_func(x, y)
 81 | 
 82 |     pylab.subplot(223)
 83 |     y = 0.5 * x + norm.rvs(1, scale=1, size=len(x))
 84 |     _plot_mi_func(x, y)
 85 | 
 86 |     pylab.subplot(224)
 87 |     y = norm.rvs(1, scale=10, size=len(x))
 88 |     _plot_mi_func(x, y)
 89 | 
 90 |     pylab.autoscale(tight=True)
 91 |     pylab.grid(True)
 92 | 
 93 |     filename = "mi_demo_1.png"
 94 |     pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight")
 95 | 
 96 |     pylab.clf()
 97 |     pylab.figure(num=None, figsize=(8, 8))
 98 | 
 99 |     x = np.arange(-5, 5, 0.2)
100 | 
101 |     pylab.subplot(221)
102 |     y = 0.5 * x ** 2 + norm.rvs(1, scale=.01, size=len(x))
103 |     _plot_mi_func(x, y)
104 | 
105 |     pylab.subplot(222)
106 |     y = 0.5 * x ** 2 + norm.rvs(1, scale=.1, size=len(x))
107 |     _plot_mi_func(x, y)
108 | 
109 |     pylab.subplot(223)
110 |     y = 0.5 * x ** 2 + norm.rvs(1, scale=1, size=len(x))
111 |     _plot_mi_func(x, y)
112 | 
113 |     pylab.subplot(224)
114 |     y = 0.5 * x ** 2 + norm.rvs(1, scale=10, size=len(x))
115 |     _plot_mi_func(x, y)
116 | 
117 |     pylab.autoscale(tight=True)
118 |     pylab.grid(True)
119 | 
120 |     filename = "mi_demo_2.png"
121 |     pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight")
122 | 
123 | if __name__ == '__main__':
124 |     plot_entropy()
125 |     plot_mi_demo()
126 | 


--------------------------------------------------------------------------------
/ch11/demo_pca.py:
--------------------------------------------------------------------------------
  1 | # This code is supporting material for the book
  2 | # Building Machine Learning Systems with Python
  3 | # by Willi Richert and Luis Pedro Coelho
  4 | # published by PACKT Publishing
  5 | #
  6 | # It is made available under the MIT License
  7 | 
  8 | import os
  9 | 
 10 | from matplotlib import pylab
 11 | import numpy as np
 12 | 
 13 | from sklearn import linear_model, decomposition
 14 | from sklearn import lda
 15 | 
 16 | logistic = linear_model.LogisticRegression()
 17 | 
 18 | 
 19 | from utils import CHART_DIR
 20 | 
 21 | np.random.seed(3)
 22 | 
 23 | x1 = np.arange(0, 10, .2)
 24 | x2 = x1 + np.random.normal(scale=1, size=len(x1))
 25 | 
 26 | 
 27 | def plot_simple_demo_1():
 28 |     pylab.clf()
 29 |     fig = pylab.figure(num=None, figsize=(10, 4))
 30 |     pylab.subplot(121)
 31 | 
 32 |     title = "Original feature space"
 33 |     pylab.title(title)
 34 |     pylab.xlabel("$X_1$")
 35 |     pylab.ylabel("$X_2$")
 36 | 
 37 |     x1 = np.arange(0, 10, .2)
 38 |     x2 = x1 + np.random.normal(scale=1, size=len(x1))
 39 | 
 40 |     good = (x1 > 5) | (x2 > 5)
 41 |     bad = ~good
 42 | 
 43 |     x1g = x1[good]
 44 |     x2g = x2[good]
 45 |     pylab.scatter(x1g, x2g, edgecolor="blue", facecolor="blue")
 46 | 
 47 |     x1b = x1[bad]
 48 |     x2b = x2[bad]
 49 |     pylab.scatter(x1b, x2b, edgecolor="red", facecolor="white")
 50 | 
 51 |     pylab.grid(True)
 52 | 
 53 |     pylab.subplot(122)
 54 | 
 55 |     X = np.c_[(x1, x2)]
 56 | 
 57 |     pca = decomposition.PCA(n_components=1)
 58 |     Xtrans = pca.fit_transform(X)
 59 | 
 60 |     Xg = Xtrans[good]
 61 |     Xb = Xtrans[bad]
 62 | 
 63 |     pylab.scatter(
 64 |         Xg[:, 0], np.zeros(len(Xg)), edgecolor="blue", facecolor="blue")
 65 |     pylab.scatter(
 66 |         Xb[:, 0], np.zeros(len(Xb)), edgecolor="red", facecolor="white")
 67 |     title = "Transformed feature space"
 68 |     pylab.title(title)
 69 |     pylab.xlabel("$X'$")
 70 |     fig.axes[1].get_yaxis().set_visible(False)
 71 | 
 72 |     print(pca.explained_variance_ratio_)
 73 | 
 74 |     pylab.grid(True)
 75 | 
 76 |     pylab.autoscale(tight=True)
 77 |     filename = "pca_demo_1.png"
 78 |     pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight")
 79 | 
 80 | 
 81 | def plot_simple_demo_2():
 82 |     pylab.clf()
 83 |     fig = pylab.figure(num=None, figsize=(10, 4))
 84 |     pylab.subplot(121)
 85 | 
 86 |     title = "Original feature space"
 87 |     pylab.title(title)
 88 |     pylab.xlabel("$X_1$")
 89 |     pylab.ylabel("$X_2$")
 90 | 
 91 |     x1 = np.arange(0, 10, .2)
 92 |     x2 = x1 + np.random.normal(scale=1, size=len(x1))
 93 | 
 94 |     good = x1 > x2
 95 |     bad = ~good
 96 | 
 97 |     x1g = x1[good]
 98 |     x2g = x2[good]
 99 |     pylab.scatter(x1g, x2g, edgecolor="blue", facecolor="blue")
100 | 
101 |     x1b = x1[bad]
102 |     x2b = x2[bad]
103 |     pylab.scatter(x1b, x2b, edgecolor="red", facecolor="white")
104 | 
105 |     pylab.grid(True)
106 | 
107 |     pylab.subplot(122)
108 | 
109 |     X = np.c_[(x1, x2)]
110 | 
111 |     pca = decomposition.PCA(n_components=1)
112 |     Xtrans = pca.fit_transform(X)
113 | 
114 |     Xg = Xtrans[good]
115 |     Xb = Xtrans[bad]
116 | 
117 |     pylab.scatter(
118 |         Xg[:, 0], np.zeros(len(Xg)), edgecolor="blue", facecolor="blue")
119 |     pylab.scatter(
120 |         Xb[:, 0], np.zeros(len(Xb)), edgecolor="red", facecolor="white")
121 |     title = "Transformed feature space"
122 |     pylab.title(title)
123 |     pylab.xlabel("$X'$")
124 |     fig.axes[1].get_yaxis().set_visible(False)
125 | 
126 |     print(pca.explained_variance_ratio_)
127 | 
128 |     pylab.grid(True)
129 | 
130 |     pylab.autoscale(tight=True)
131 |     filename = "pca_demo_2.png"
132 |     pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight")
133 | 
134 | 
135 | def plot_simple_demo_lda():
136 |     pylab.clf()
137 |     fig = pylab.figure(num=None, figsize=(10, 4))
138 |     pylab.subplot(121)
139 | 
140 |     title = "Original feature space"
141 |     pylab.title(title)
142 |     pylab.xlabel("$X_1$")
143 |     pylab.ylabel("$X_2$")
144 | 
145 |     good = x1 > x2
146 |     bad = ~good
147 | 
148 |     x1g = x1[good]
149 |     x2g = x2[good]
150 |     pylab.scatter(x1g, x2g, edgecolor="blue", facecolor="blue")
151 | 
152 |     x1b = x1[bad]
153 |     x2b = x2[bad]
154 |     pylab.scatter(x1b, x2b, edgecolor="red", facecolor="white")
155 | 
156 |     pylab.grid(True)
157 | 
158 |     pylab.subplot(122)
159 | 
160 |     X = np.c_[(x1, x2)]
161 | 
162 |     lda_inst = lda.LDA(n_components=1)
163 |     Xtrans = lda_inst.fit_transform(X, good)
164 | 
165 |     Xg = Xtrans[good]
166 |     Xb = Xtrans[bad]
167 | 
168 |     pylab.scatter(
169 |         Xg[:, 0], np.zeros(len(Xg)), edgecolor="blue", facecolor="blue")
170 |     pylab.scatter(
171 |         Xb[:, 0], np.zeros(len(Xb)), edgecolor="red", facecolor="white")
172 |     title = "Transformed feature space"
173 |     pylab.title(title)
174 |     pylab.xlabel("$X'$")
175 |     fig.axes[1].get_yaxis().set_visible(False)
176 | 
177 |     pylab.grid(True)
178 | 
179 |     pylab.autoscale(tight=True)
180 |     filename = "lda_demo.png"
181 |     pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight")
182 | 
183 | if __name__ == '__main__':
184 |     plot_simple_demo_1()
185 |     plot_simple_demo_2()
186 |     plot_simple_demo_lda()
187 | 


--------------------------------------------------------------------------------
/ch11/demo_rfe.py:
--------------------------------------------------------------------------------
 1 | # This code is supporting material for the book
 2 | # Building Machine Learning Systems with Python
 3 | # by Willi Richert and Luis Pedro Coelho
 4 | # published by PACKT Publishing
 5 | #
 6 | # It is made available under the MIT License
 7 | 
 8 | from sklearn.feature_selection import RFE
 9 | from sklearn.linear_model import LogisticRegression
10 | 
11 | from sklearn.datasets import make_classification
12 | 
13 | X, y = make_classification(
14 |     n_samples=100, n_features=10, n_informative=3, random_state=0)
15 | 
16 | clf = LogisticRegression()
17 | clf.fit(X, y)
18 | 
19 | for i in range(1, 11):
20 |     selector = RFE(clf, i)
21 |     selector = selector.fit(X, y)
22 |     print("%i\t%s\t%s" % (i, selector.support_, selector.ranking_))
23 | 


--------------------------------------------------------------------------------
/ch11/utils.py:
--------------------------------------------------------------------------------
 1 | # This code is supporting material for the book
 2 | # Building Machine Learning Systems with Python
 3 | # by Willi Richert and Luis Pedro Coelho
 4 | # published by PACKT Publishing
 5 | #
 6 | # It is made available under the MIT License
 7 | 
 8 | import os
 9 | 
10 | DATA_DIR = os.path.join(
11 |     os.path.dirname(os.path.realpath(__file__)), "data")
12 | 
13 | CHART_DIR = os.path.join(
14 |     os.path.dirname(os.path.realpath(__file__)), "charts")
15 | 
16 | for d in [DATA_DIR, CHART_DIR]:
17 |     if not os.path.exists(d):
18 |         os.mkdir(d)
19 | 
20 | 


--------------------------------------------------------------------------------
/ch12/.gitignore:
--------------------------------------------------------------------------------
1 | *.jugdata/
2 | output.txt
3 | results.image.txt
4 | 


--------------------------------------------------------------------------------
/ch12/README.rst:
--------------------------------------------------------------------------------
 1 | ==========
 2 | Chapter 12
 3 | ==========
 4 | 
 5 | Support code for *Chapter 12: Big(ger) Data* 
 6 | 
 7 | Data
 8 | ----
 9 | 
10 | This chapter relies only on the image dataset that is packaged with the
11 | repository at ``../SimpleImageDataset/``.
12 | 
13 | Scripts
14 | -------
15 | 
16 | chapter.py
17 |     Code as written in the book
18 | jugfile.py
19 |     Example jugfile
20 | image-classification.py
21 |     Jugfile implementation of image classification from Chapter 10
22 | 
23 | setup-aws.txt
24 |     Commands to setup Amazon WebServices machine
25 | run-jugfile.sh
26 |     Wrapper script to run jug file on jugfile.py
27 | run-image-classification.sh
28 |     Wrapper script to run jug file on image-classification.py
29 | 


--------------------------------------------------------------------------------
/ch12/chapter.py:
--------------------------------------------------------------------------------
 1 | from jug import TaskGenerator
 2 | from glob import glob
 3 | import mahotas as mh
 4 | @TaskGenerator
 5 | def compute_texture(im):   
 6 |     from features import texture
 7 |     imc = mh.imread(im)
 8 |     return texture(mh.colors.rgb2gray(imc))
 9 | 
10 | @TaskGenerator
11 | def chist_file(fname):
12 |     from features import chist
13 |     im = mh.imread(fname)
14 |     return chist(im)
15 | 
16 | import numpy as np
17 | to_array = TaskGenerator(np.array)
18 | hstack = TaskGenerator(np.hstack)
19 | 
20 | haralicks = []
21 | chists = []
22 | labels = []
23 | 
24 | # Change this variable to point to
25 | # the location of the dataset is on disk
26 | basedir = '../SimpleImageDataset/'
27 | # Use glob to get all the images
28 | images = glob('{}/*.jpg'.format(basedir))
29 | 
30 | for fname in sorted(images):
31 |     haralicks.append(compute_texture(fname))
32 |     chists.append(chist_file(fname))
33 |     # The class is encoded in the filename as xxxx00.jpg
34 |     labels.append(fname[:-len('00.jpg')])
35 | 
36 | haralicks = to_array(haralicks)
37 | chists = to_array(chists)
38 | labels = to_array(labels)
39 | 
40 | @TaskGenerator
41 | def accuracy(features, labels):
42 |     from sklearn.linear_model import LogisticRegression
43 |     from sklearn.pipeline import Pipeline
44 |     from sklearn.preprocessing import StandardScaler
45 |     from sklearn import cross_validation
46 |     
47 |     clf = Pipeline([('preproc', StandardScaler()),
48 |                 ('classifier', LogisticRegression())])
49 |     cv = cross_validation.LeaveOneOut(len(features))
50 |     scores = cross_validation.cross_val_score(
51 |         clf, features, labels, cv=cv)
52 |     return scores.mean()
53 | scores_base = accuracy(haralicks, labels)
54 | scores_chist = accuracy(chists, labels)
55 | 
56 | combined = hstack([chists, haralicks])
57 | scores_combined  = accuracy(combined, labels)
58 | 
59 | @TaskGenerator
60 | def print_results(scores):
61 |     with open('results.image.txt', 'w') as output:
62 |         for k,v in scores:
63 |             output.write('Accuracy [{}]: {:.1%}\n'.format(
64 |                 k, v.mean()))
65 | 
66 | print_results([
67 |         ('base', scores_base),
68 |         ('chists', scores_chist),
69 |         ('combined' , scores_combined),
70 |         ])
71 | 
72 | @TaskGenerator
73 | def compute_lbp(fname):
74 |     from mahotas.features import lbp
75 |     imc = mh.imread(fname)
76 |     im = mh.colors.rgb2grey(imc)
77 |     return lbp(im, radius=8, points=6)
78 | 
79 | lbps = []
80 | for fname in sorted(images):
81 |     # the rest of the loop as before
82 |     lbps.append(compute_lbp(fname))
83 | lbps = to_array(lbps)
84 | 
85 | scores_lbps = accuracy(lbps, labels)
86 | combined_all = hstack([chists, haralicks, lbps])
87 | scores_combined_all = accuracy(combined_all, labels)
88 | 
89 | print_results([
90 |         ('base', scores_base),
91 |         ('chists', scores_chist),
92 |         ('lbps', scores_lbps),
93 |         ('combined' , scores_combined),
94 |         ('combined_all' , scores_combined_all),
95 |         ])
96 | 


--------------------------------------------------------------------------------
/ch12/features.py:
--------------------------------------------------------------------------------
1 | ../ch10/features.py


--------------------------------------------------------------------------------
/ch12/image-classification.py:
--------------------------------------------------------------------------------
  1 | # This code is supporting material for the book
  2 | # Building Machine Learning Systems with Python
  3 | # by Willi Richert and Luis Pedro Coelho
  4 | # published by PACKT Publishing
  5 | #
  6 | # It is made available under the MIT License
  7 | 
  8 | import mahotas as mh
  9 | import numpy as np
 10 | from glob import glob
 11 | from jug import TaskGenerator
 12 | 
 13 | # We need to use the `features` module from chapter 10.
 14 | from sys import path
 15 | path.append('../ch10')
 16 | 
 17 | 
 18 | # This is the jug-enabled version of the script ``figure18.py`` in Chapter 10
 19 | 
 20 | basedir = '../SimpleImageDataset/'
 21 | 
 22 | @TaskGenerator
 23 | def compute_texture(im):
 24 |     '''Compute features for an image
 25 | 
 26 |     Parameters
 27 |     ----------
 28 |     im : str
 29 |         filepath for image to process
 30 | 
 31 |     Returns
 32 |     -------
 33 |     fs : ndarray
 34 |         1-D array of features
 35 |     '''
 36 |     from features import texture
 37 |     imc = mh.imread(im)
 38 |     return texture(mh.colors.rgb2grey(imc))
 39 | 
 40 | @TaskGenerator
 41 | def chist(fname):
 42 |     from features import chist as color_histogram
 43 |     im = mh.imread(fname)
 44 |     return color_histogram(im)
 45 | 
 46 | @TaskGenerator
 47 | def compute_lbp(fname):
 48 |     from mahotas.features import lbp
 49 |     imc = mh.imread(fname)
 50 |     im = mh.colors.rgb2grey(imc)
 51 |     return lbp(im, radius=8, points=6)
 52 | 
 53 | 
 54 | @TaskGenerator
 55 | def accuracy(features, labels):
 56 |     from sklearn.linear_model import LogisticRegression
 57 |     from sklearn.pipeline import Pipeline
 58 |     from sklearn.preprocessing import StandardScaler
 59 |     from sklearn import cross_validation
 60 |     # We use logistic regression because it is very fast.
 61 |     # Feel free to experiment with other classifiers
 62 |     clf = Pipeline([('preproc', StandardScaler()),
 63 |                 ('classifier', LogisticRegression())])
 64 |     cv = cross_validation.LeaveOneOut(len(features))
 65 |     scores = cross_validation.cross_val_score(
 66 |         clf, features, labels, cv=cv)
 67 |     return scores.mean()
 68 | 
 69 | 
 70 | @TaskGenerator
 71 | def print_results(scores):
 72 |     with open('results.image.txt', 'w') as output:
 73 |         for k,v in scores:
 74 |             output.write('Accuracy (LOO x-val) with Logistic Regression [{0}]: {1:.1%}\n'.format(
 75 |                 k, v.mean()))
 76 | 
 77 | 
 78 | to_array = TaskGenerator(np.array)
 79 | hstack = TaskGenerator(np.hstack)
 80 | 
 81 | haralicks = []
 82 | chists = []
 83 | lbps = []
 84 | labels = []
 85 | 
 86 | # Use glob to get all the images
 87 | images = glob('{0}/*.jpg'.format(basedir))
 88 | for fname in sorted(images):
 89 |     haralicks.append(compute_texture(fname))
 90 |     chists.append(chist(fname))
 91 |     lbps.append(compute_lbp(fname))
 92 |     labels.append(fname[:-len('00.jpg')]) # The class is encoded in the filename as xxxx00.jpg
 93 | 
 94 | haralicks = to_array(haralicks)
 95 | chists = to_array(chists)
 96 | lbps = to_array(lbps)
 97 | labels = to_array(labels)
 98 | 
 99 | scores_base = accuracy(haralicks, labels)
100 | scores_chist = accuracy(chists, labels)
101 | scores_lbps = accuracy(lbps, labels)
102 | 
103 | combined = hstack([chists, haralicks])
104 | scores_combined = accuracy(combined, labels)
105 | 
106 | combined_all = hstack([chists, haralicks, lbps])
107 | scores_combined_all = accuracy(combined_all, labels)
108 | 
109 | print_results([
110 |         ('base', scores_base),
111 |         ('chists', scores_chist),
112 |         ('lbps', scores_lbps),
113 |         ('combined' , scores_combined),
114 |         ('combined_all' , scores_combined_all),
115 |         ])
116 | 
117 | 


--------------------------------------------------------------------------------
/ch12/jugfile.py:
--------------------------------------------------------------------------------
 1 | # This code is supporting material for the book
 2 | # Building Machine Learning Systems with Python
 3 | # by Willi Richert and Luis Pedro Coelho
 4 | # published by PACKT Publishing
 5 | #
 6 | # It is made available under the MIT License
 7 | 
 8 | from jug import TaskGenerator
 9 | from time import sleep
10 | 
11 | 
12 | @TaskGenerator
13 | def double(x):
14 |     sleep(4)
15 |     return 2 * x
16 | 
17 | 
18 | @TaskGenerator
19 | def add(a, b):
20 |     return a + b
21 | 
22 | 
23 | @TaskGenerator
24 | def print_final_result(oname, value):
25 |     with open(oname, 'w') as output:
26 |         output.write("Final result: {0}\n".format(value))
27 | 
28 | input = 2
29 | y = double(input)
30 | z = double(y)
31 | 
32 | y2 = double(7)
33 | z2 = double(y2)
34 | print_final_result('output.txt', add(z, z2))
35 | 


--------------------------------------------------------------------------------
/ch12/run-image-classification.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | jug execute image-classification.py
4 | 


--------------------------------------------------------------------------------
/ch12/run-jugfile.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | jug execute
4 | 
5 | 


--------------------------------------------------------------------------------
/ch12/setup-aws.txt:
--------------------------------------------------------------------------------
1 | sudo yum update
2 | sudo yum -y install python-devel python-pip numpy scipy python-matplotlib
3 | sudo yum -y install gcc-c++
4 | sudo yum -y install git
5 | sudo pip-python install -U pip
6 | sudo pip install scikit-learn jug mahotas
7 | 
8 | 


--------------------------------------------------------------------------------