├── .gitignore
├── LICENSE
├── README.md
├── SimpleImageDataset
│   ├── building00.jpg
│   ├── building01.jpg
│   ├── building02.jpg
│   ├── building03.jpg
│   ├── building04.jpg
│   ├── building05.jpg
│   ├── building06.jpg
│   ├── building07.jpg
│   ├── building08.jpg
│   ├── building09.jpg
│   ├── building10.jpg
│   ├── building11.jpg
│   ├── building12.jpg
│   ├── building13.jpg
│   ├── building14.jpg
│   ├── building15.jpg
│   ├── building16.jpg
│   ├── building17.jpg
│   ├── building18.jpg
│   ├── building19.jpg
│   ├── building20.jpg
│   ├── building21.jpg
│   ├── building22.jpg
│   ├── building23.jpg
│   ├── building24.jpg
│   ├── building25.jpg
│   ├── building26.jpg
│   ├── building27.jpg
│   ├── building28.jpg
│   ├── building29.jpg
│   ├── scene00.jpg
│   ├── scene01.jpg
│   ├── scene02.jpg
│   ├── scene03.jpg
│   ├── scene04.jpg
│   ├── scene05.jpg
│   ├── scene06.jpg
│   ├── scene07.jpg
│   ├── scene08.jpg
│   ├── scene09.jpg
│   ├── scene10.jpg
│   ├── scene11.jpg
│   ├── scene12.jpg
│   ├── scene13.jpg
│   ├── scene14.jpg
│   ├── scene15.jpg
│   ├── scene16.jpg
│   ├── scene17.jpg
│   ├── scene18.jpg
│   ├── scene19.jpg
│   ├── scene20.jpg
│   ├── scene21.jpg
│   ├── scene22.jpg
│   ├── scene23.jpg
│   ├── scene24.jpg
│   ├── scene25.jpg
│   ├── scene26.jpg
│   ├── scene27.jpg
│   ├── scene28.jpg
│   ├── scene29.jpg
│   ├── text00.jpg
│   ├── text01.jpg
│   ├── text02.jpg
│   ├── text03.jpg
│   ├── text04.jpg
│   ├── text05.jpg
│   ├── text06.jpg
│   ├── text07.jpg
│   ├── text08.jpg
│   ├── text09.jpg
│   ├── text10.jpg
│   ├── text11.jpg
│   ├── text12.jpg
│   ├── text13.jpg
│   ├── text14.jpg
│   ├── text15.jpg
│   ├── text16.jpg
│   ├── text17.jpg
│   ├── text18.jpg
│   ├── text19.jpg
│   ├── text20.jpg
│   ├── text21.jpg
│   ├── text22.jpg
│   ├── text23.jpg
│   ├── text24.jpg
│   ├── text25.jpg
│   ├── text26.jpg
│   ├── text27.jpg
│   ├── text28.jpg
│   └── text29.jpg
├── ch01
│   ├── analyze_webstats.py
│   ├── data
│   │   └── web_traffic.tsv
│   ├── gen_webstats.py
│   ├── performance_test.py
│   └── utils.py
├── ch02
│   ├── README.rst
│   ├── chapter.py
│   ├── data
│   │   └── seeds.tsv
│   ├── extra
│   │   └── create_tsv.py
│   ├── figure1.py
│   ├── figure2.py
│   ├── figure4_5_no_sklearn.py
│   ├── figure4_5_sklearn.py
│   ├── heldout.py
│   ├── knn.py
│   ├── load.py
│   ├── seeds_knn.py
│   ├── seeds_knn_increasing_k.py
│   ├── seeds_knn_sklearn.py
│   ├── seeds_threshold.py
│   ├── simple_threshold.py
│   ├── stump.py
│   ├── tests
│   │   └── test_load.py
│   └── threshold.py
├── ch03
│   ├── README.md
│   ├── data
│   │   └── toy
│   │       ├── 01.txt
│   │       ├── 02.txt
│   │       ├── 03.txt
│   │       ├── 04.txt
│   │       └── 05.txt
│   ├── noise_analysis.py
│   ├── plot_kmeans_example.py
│   ├── rel_post_01.py
│   ├── rel_post_20news.py
│   ├── tfidf.py
│   └── utils.py
├── ch04
│   ├── .gitignore
│   ├── README.rst
│   ├── blei_lda.py
│   ├── build_lda.py
│   ├── data
│   │   ├── .gitignore
│   │   ├── download_ap.sh
│   │   ├── download_wp.sh
│   │   └── preprocess-wikidata.sh
│   ├── wikitopics_create.py
│   ├── wikitopics_create_hdp.py
│   ├── wikitopics_plot.py
│   └── wordcloud.py
├── ch05
│   ├── PosTagFreqVectorizer.py
│   ├── README.md
│   ├── chose_instances.py
│   ├── classify.py
│   ├── data.py
│   ├── log_reg_example.py
│   ├── so_xml_to_tsv.py
│   └── utils.py
├── ch06
│   ├── 01_start.py
│   ├── 02_tuning.py
│   ├── 03_clean.py
│   ├── 04_sent.py
│   ├── README.md
│   ├── data
│   │   ├── corpus.csv
│   │   ├── missing.tsv
│   │   └── not_authorized.tsv
│   ├── install.py
│   ├── twitterauth.py
│   └── utils.py
├── ch07
│   ├── .gitignore
│   ├── README.rst
│   ├── boston1.py
│   ├── boston1numpy.py
│   ├── boston_cv_penalized.py
│   ├── data
│   │   ├── .gitignore
│   │   └── download.sh
│   ├── figure1_2.py
│   ├── figure3.py
│   ├── figure4.py
│   ├── lasso_path_plot.py
│   ├── lr10k.py
│   └── predict10k_en.py
├── ch08
│   ├── README.rst
│   ├── all_correlations.py
│   ├── apriori
│   │   ├── .gitignore
│   │   ├── apriori.py
│   │   ├── apriori_example.py
│   │   ├── apriori_naive.py
│   │   ├── download.sh
│   │   └── histogram.py
│   ├── averaged.py
│   ├── chapter.py
│   ├── corrneighbours.py
│   ├── data
│   │   ├── .gitignore
│   │   └── download.sh
│   ├── figure3.py
│   ├── load_ml100k.py
│   ├── norm.py
│   ├── regression.py
│   ├── similar_movie.py
│   └── stacked.py
├── ch09
│   ├── 01_fft_based_classifier.py
│   ├── 02_ceps_based_classifier.py
│   ├── Makefile
│   ├── ceps.py
│   ├── fft.py
│   └── utils.py
├── ch10
│   ├── .gitignore
│   ├── README.rst
│   ├── chapter.py
│   ├── download.sh
│   ├── features.py
│   ├── figure10.py
│   ├── large_classification.py
│   ├── lena-ring.py
│   ├── neighbors.py
│   ├── scene00.jpg
│   ├── simple_classification.py
│   ├── threshold.py
│   └── thresholded_figure.py
├── ch11
│   ├── demo_corr.py
│   ├── demo_mds.py
│   ├── demo_mi.py
│   ├── demo_pca.py
│   ├── demo_rfe.py
│   └── utils.py
└── ch12
    ├── .gitignore
    ├── README.rst
    ├── chapter.py
    ├── features.py
    ├── image-classification.py
    ├── jugfile.py
    ├── run-image-classification.sh
    ├── run-jugfile.sh
    └── setup-aws.txt

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.pyc

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
The MIT License (MIT)

Copyright (c) 2013 Luis Pedro Coelho

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
Building Machine Learning Systems with Python
=============================================

Source code for the book *Building Machine Learning Systems with Python* by
[Luis Pedro Coelho](http://luispedro.org) and [Willi Richert](http://twotoreal.com).

The book was published in 2013 (second edition in 2015) by Packt Publishing and
is available [from their website](http://www.packtpub.com/building-machine-learning-systems-with-python/book).

The code in this repository corresponds to the second edition. Code for the
first edition is available in the [first\_edition branch](https://github.com/luispedro/BuildingMachineLearningSystemsWithPython/tree/first_edition).
--------------------------------------------------------------------------------
/SimpleImageDataset/:
--------------------------------------------------------------------------------
[Binary image data: 90 JPEG files (building00.jpg-building29.jpg,
scene00.jpg-scene29.jpg, text00.jpg-text29.jpg). Their contents cannot be shown
as text; the files are available in the SimpleImageDataset directory of
https://github.com/luispedro/BuildingMachineLearningSystemsWithPython.]
--------------------------------------------------------------------------------
/ch01/analyze_webstats.py:
--------------------------------------------------------------------------------
# This code is supporting material for the book
# Building Machine Learning Systems with Python
# by Willi Richert and Luis Pedro Coelho
# published by PACKT Publishing
#
# It is made available under the MIT License

import os
from utils import DATA_DIR, CHART_DIR
import scipy as sp
import matplotlib.pyplot as plt

sp.random.seed(3)  # to reproduce the data later on

data = sp.genfromtxt(os.path.join(DATA_DIR, "web_traffic.tsv"), delimiter="\t")
print(data[:10])
print(data.shape)

# all examples will have three classes in this file
colors = ['g', 'k', 'b', 'm', 'r']
linestyles = ['-', '-.', '--', ':', '-']

x = data[:, 0]
y = data[:, 1]
print("Number of invalid entries:", sp.sum(sp.isnan(y)))
x = x[~sp.isnan(y)]
y = y[~sp.isnan(y)]


def plot_models(x, y, models, fname, mx=None, ymax=None, xmin=None):
    ''' plot input data '''

    plt.figure(num=None, figsize=(8, 6))
    plt.clf()
    plt.scatter(x, y, s=10)
    plt.title("Web traffic over the last month")
    plt.xlabel("Time")
    plt.ylabel("Hits/hour")
    plt.xticks(
        [w * 7 * 24 for w in range(10)], ['week %i' % w for w in range(10)])

    if models:
        if mx is None:
            mx = sp.linspace(0, x[-1], 1000)
        for model, style, color in zip(models, linestyles, colors):
            # print "Model:",model
            # print "Coeffs:",model.coeffs
            plt.plot(mx, model(mx), linestyle=style, linewidth=2, c=color)

        plt.legend(["d=%i" % m.order for m in models], loc="upper left")

    plt.autoscale(tight=True)
    plt.ylim(ymin=0)
    if ymax:
        plt.ylim(ymax=ymax)
    if xmin:
        plt.xlim(xmin=xmin)
    plt.grid(True, linestyle='-', color='0.75')
    plt.savefig(fname)

# first look at the data
plot_models(x, y, None, os.path.join(CHART_DIR, "1400_01_01.png"))

# create and plot models
fp1, res1, rank1, sv1, rcond1 = sp.polyfit(x, y, 1, full=True)
print("Model parameters of fp1: %s" % fp1)
print("Error of the model of fp1:", res1)
f1 = sp.poly1d(fp1)

fp2, res2, rank2, sv2, rcond2 = sp.polyfit(x, y, 2, full=True)
print("Model parameters of fp2: %s" % fp2)
print("Error of the model of fp2:", res2)
f2 = sp.poly1d(fp2)
f3 = sp.poly1d(sp.polyfit(x, y, 3))
f10 = sp.poly1d(sp.polyfit(x, y, 10))
f100 = sp.poly1d(sp.polyfit(x, y, 100))

plot_models(x, y, [f1], os.path.join(CHART_DIR, "1400_01_02.png"))
plot_models(x, y, [f1, f2], os.path.join(CHART_DIR, "1400_01_03.png"))
plot_models(
    x, y, [f1, f2, f3, f10, f100], os.path.join(CHART_DIR, "1400_01_04.png"))

# fit and plot a model using the knowledge about inflection point
inflection = int(3.5 * 7 * 24)  # must be an integer to be usable as a slice index
xa = x[:inflection]
ya = y[:inflection]
xb = x[inflection:]
yb = y[inflection:]

fa = sp.poly1d(sp.polyfit(xa, ya, 1))
fb = sp.poly1d(sp.polyfit(xb, yb, 1))

plot_models(x, y, [fa, fb], os.path.join(CHART_DIR, "1400_01_05.png"))


def error(f, x, y):
    return sp.sum((f(x) - y) ** 2)

print("Errors for the complete data set:")
for f in [f1, f2, f3, f10, f100]:
    print("Error d=%i: %f" % (f.order, error(f, x, y)))

print("Errors for only the time after inflection point")
for f in [f1, f2, f3, f10, f100]:
    print("Error d=%i: %f" % (f.order, error(f, xb, yb)))

print("Error inflection=%f" % (error(fa, xa, ya) + error(fb, xb, yb)))


# extrapolating into the future
plot_models(
    x, y, [f1, f2, f3, f10, f100],
    os.path.join(CHART_DIR, "1400_01_06.png"),
    mx=sp.linspace(0 * 7 * 24, 6 * 7 * 24, 100),
    ymax=10000, xmin=0 * 7 * 24)

print("Trained only on data after inflection point")
fb1 = fb
fb2 = sp.poly1d(sp.polyfit(xb, yb, 2))
fb3 = sp.poly1d(sp.polyfit(xb, yb, 3))
fb10 = sp.poly1d(sp.polyfit(xb, yb, 10))
fb100 = sp.poly1d(sp.polyfit(xb, yb, 100))

print("Errors for only the time after inflection point")
for f in [fb1, fb2, fb3, fb10, fb100]:
    print("Error d=%i: %f" % (f.order, error(f, xb, yb)))

plot_models(
    x, y, [fb1, fb2, fb3, fb10, fb100],
    os.path.join(CHART_DIR, "1400_01_07.png"),
    mx=sp.linspace(0 * 7 * 24, 6 * 7 * 24, 100),
    ymax=10000, xmin=0 * 7 * 24)

# separating training from testing data
frac = 0.3
split_idx = int(frac * len(xb))
shuffled = sp.random.permutation(list(range(len(xb))))
test = sorted(shuffled[:split_idx])
train = sorted(shuffled[split_idx:])
fbt1 = sp.poly1d(sp.polyfit(xb[train], yb[train], 1))
fbt2 = sp.poly1d(sp.polyfit(xb[train], yb[train], 2))
print("fbt2(x)= \n%s" % fbt2)
print("fbt2(x)-100,000= \n%s" % (fbt2-100000))
fbt3 = sp.poly1d(sp.polyfit(xb[train], yb[train], 3))
fbt10 = sp.poly1d(sp.polyfit(xb[train], yb[train], 10))
fbt100 = sp.poly1d(sp.polyfit(xb[train], yb[train], 100))

print("Test errors for only the time after inflection point")
for f in [fbt1, fbt2, fbt3, fbt10, fbt100]:
    print("Error d=%i: %f" % (f.order, error(f, xb[test], yb[test])))

plot_models(
    x, y, [fbt1, fbt2, fbt3, fbt10, fbt100],
    os.path.join(CHART_DIR, "1400_01_08.png"),
    mx=sp.linspace(0 * 7 * 24, 6 * 7 * 24, 100),
    ymax=10000, xmin=0 * 7 * 24)

from scipy.optimize import fsolve
print(fbt2)
print(fbt2 - 100000)
reached_max = fsolve(fbt2 - 100000, x0=800) / (7 * 24)
print("100,000 hits/hour expected at week %f" % reached_max[0])
--------------------------------------------------------------------------------
/ch01/gen_webstats.py:
--------------------------------------------------------------------------------
# This code is supporting material for the book
# Building Machine Learning Systems with Python
# by Willi Richert and Luis Pedro Coelho
# published by PACKT Publishing
#
# It is made available under the MIT License

# This script generates web traffic data for our hypothetical
# web startup "MLASS" in chapter 01

import os
import scipy as sp
from scipy.stats import gamma
import matplotlib.pyplot as plt

from utils import DATA_DIR, CHART_DIR

sp.random.seed(3)  # to reproduce the data later on

x = sp.arange(1, 31*24)
y = sp.array(200*(sp.sin(2*sp.pi*x/(7*24))), dtype=int)
y += gamma.rvs(15, loc=0, scale=100, size=len(x))
y += 2 * sp.exp(x/100.0)
y = sp.ma.array(y, mask=[y < 0])
print(sum(y), sum(y < 0))

plt.scatter(x, y)
plt.title("Web traffic over the last month")
plt.xlabel("Time")
plt.ylabel("Hits/hour")
plt.xticks([w*7*24 for w in range(5)],
           ['week %i' % (w+1) for w in range(5)])
plt.autoscale(tight=True)
plt.grid()
plt.savefig(os.path.join(CHART_DIR, "1400_01_01.png"))

sp.savetxt(os.path.join(DATA_DIR, "web_traffic.tsv"),
           list(zip(x, y)), delimiter="\t", fmt="%s")
--------------------------------------------------------------------------------
/ch01/performance_test.py:
--------------------------------------------------------------------------------
# This code is supporting material for the book
# Building Machine Learning Systems with Python
# by Willi Richert and Luis Pedro Coelho
# published by PACKT Publishing
#
# It is made available under the MIT License


import timeit

normal_py_sec = timeit.timeit('sum(x*x for x in range(1000))',
                              number=10000)
naive_np_sec = timeit.timeit('sum(na*na)',
                             setup="import numpy as np; na=np.arange(1000)",
                             number=10000)
good_np_sec = timeit.timeit('na.dot(na)',
                            setup="import numpy as np; na=np.arange(1000)",
                            number=10000)

print("Normal Python: %f sec" % normal_py_sec)
print("Naive NumPy: %f sec" % naive_np_sec)
print("Good NumPy: %f sec" % good_np_sec)

--------------------------------------------------------------------------------
/ch01/utils.py:
--------------------------------------------------------------------------------
# This code is supporting material for the book
# Building Machine Learning Systems with Python
# by Willi Richert and Luis Pedro Coelho
# published by PACKT Publishing
#
# It is made available under the MIT License

import os

DATA_DIR = os.path.join(
    os.path.dirname(os.path.realpath(__file__)), "data")

CHART_DIR = os.path.join(
    os.path.dirname(os.path.realpath(__file__)), "charts")

for d in [DATA_DIR, CHART_DIR]:
    if not os.path.exists(d):
        os.mkdir(d)

--------------------------------------------------------------------------------
/ch02/README.rst:
--------------------------------------------------------------------------------
=========
Chapter 2
=========

Support code for *Chapter 2: Learning How to Classify with Real-world
Examples*. The directory ``data`` contains the seeds dataset, originally
downloaded from https://archive.ics.uci.edu/ml/datasets/seeds

chapter.py
    The code as printed in the book.

figure1.py
    Figure 1 in the book: all 2-by-2 scatter plots

figure2.py
    Figure 2 in the book: threshold & decision area

figure4_5_sklearn.py
    Figures 4 and 5 in the book: KNN decision borders before and after feature
    normalization. This also produces a version of the figure using 11
    neighbors (not in the book), which shows that the result is smoother and
    not as sensitive to the exact position of each datapoint.

figure4_5_no_sklearn.py
    Alternative code for Figures 4 and 5 without using scikit-learn

load.py
    Code to load the seeds data

simple_threshold.py
    Code from the book: finds the first partition, between Setosa and the other classes.

stump.py
    Code from the book: finds the second partition, between Virginica and Versicolor.

threshold.py
    Functional implementation of a threshold classifier (an interface sketch follows this file)

heldout.py
    Evaluate the threshold model on held-out data

seeds_knn_sklearn.py
    Demonstrate cross-validation and feature normalization using scikit-learn

seeds_threshold.py
    Test the thresholding model on the seeds dataset (result mentioned in the book, but no code)

seeds_knn_increasing_k.py
    Test the effect of increasing num_neighbors on accuracy.

knn.py
    Implementation of K-nearest neighbors without using scikit-learn.

seeds_knn.py
    Demonstrate cross-validation (without scikit-learn)
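chapter.py (next file) imports ``fit_model`` and ``predict`` from
ch02/threshold.py, and figure4_5_no_sklearn.py imports the same names from
ch02/knn.py; neither module is reproduced in this excerpt. The following is a
minimal sketch of what the threshold-classifier interface plausibly looks like,
inferred only from how chapter.py calls it. The function bodies and the
(feature index, threshold, reverse) model tuple are assumptions for
illustration, not the actual contents of threshold.py.

    import numpy as np


    def fit_model(features, labels):
        '''Learn a single-feature threshold classifier (sketch).

        `labels` is a boolean array. Returns a (feature_index, threshold,
        reverse) tuple chosen to maximize training accuracy, mirroring the
        exhaustive loop shown in chapter.py.'''
        best_acc = -1.0
        best_model = None
        for fi in range(features.shape[1]):
            feature_i = features[:, fi]
            for t in feature_i:
                pred = (feature_i > t)
                acc = (pred == labels).mean()
                rev_acc = (pred == ~labels).mean()
                reverse = rev_acc > acc
                acc = max(acc, rev_acc)
                if acc > best_acc:
                    best_acc = acc
                    best_model = (fi, t, reverse)
        return best_model


    def predict(model, features):
        '''Apply a threshold model to an array of examples (sketch).'''
        fi, t, reverse = model
        preds = (features[:, fi] > t)
        return ~preds if reverse else preds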
--------------------------------------------------------------------------------
/ch02/chapter.py:
--------------------------------------------------------------------------------
# This code is supporting material for the book
# Building Machine Learning Systems with Python
# by Willi Richert and Luis Pedro Coelho
# published by PACKT Publishing
#
# It is made available under the MIT License


from matplotlib import pyplot as plt
import numpy as np

# We load the data with load_iris from sklearn
from sklearn.datasets import load_iris
data = load_iris()

# load_iris returns an object with several fields
features = data.data
feature_names = data.feature_names
target = data.target
target_names = data.target_names

for t in range(3):
    if t == 0:
        c = 'r'
        marker = '>'
    elif t == 1:
        c = 'g'
        marker = 'o'
    elif t == 2:
        c = 'b'
        marker = 'x'
    plt.scatter(features[target == t, 0],
                features[target == t, 1],
                marker=marker,
                c=c)
# We use NumPy fancy indexing to get an array of strings:
labels = target_names[target]

# The petal length is the feature at position 2
plength = features[:, 2]

# Build an array of booleans:
is_setosa = (labels == 'setosa')

# This is the important step:
max_setosa = plength[is_setosa].max()
min_non_setosa = plength[~is_setosa].min()
print('Maximum of setosa: {0}.'.format(max_setosa))

print('Minimum of others: {0}.'.format(min_non_setosa))

# ~ is the boolean negation operator
features = features[~is_setosa]
labels = labels[~is_setosa]
# Build a new target variable, is_virginica
is_virginica = (labels == 'virginica')

# Initialize best_acc to impossibly low value
best_acc = -1.0
for fi in range(features.shape[1]):
    # We are going to test all possible thresholds
    thresh = features[:, fi]
    for t in thresh:

        # Get the vector for feature `fi`
        feature_i = features[:, fi]
        # apply threshold `t`
        pred = (feature_i > t)
        acc = (pred == is_virginica).mean()
        rev_acc = (pred == ~is_virginica).mean()
        if rev_acc > acc:
            reverse = True
            acc = rev_acc
        else:
            reverse = False

        if acc > best_acc:
            best_acc = acc
            best_fi = fi
            best_t = t
            best_reverse = reverse

print(best_fi, best_t, best_reverse, best_acc)

def is_virginica_test(fi, t, reverse, example):
    'Apply threshold model to a new example'
    test = example[fi] > t
    if reverse:
        test = not test
    return test
from threshold import fit_model, predict

# Training accuracy was 96.0%.
# Testing accuracy was 90.0% (N = 50).
correct = 0.0

for ei in range(len(features)):
    # select all but the one at position `ei`:
    training = np.ones(len(features), bool)
    training[ei] = False
    testing = ~training
    model = fit_model(features[training], is_virginica[training])
    predictions = predict(model, features[testing])
    correct += np.sum(predictions == is_virginica[testing])
acc = correct / float(len(features))
print('Accuracy: {0:.1%}'.format(acc))


###########################################
############## SEEDS DATASET ##############
###########################################

from load import load_dataset

feature_names = [
    'area',
    'perimeter',
    'compactness',
    'length of kernel',
    'width of kernel',
    'asymmetry coefficient',
    'length of kernel groove',
]
features, labels = load_dataset('seeds')



from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=1)
from sklearn.cross_validation import KFold

kf = KFold(len(features), n_folds=5, shuffle=True)
means = []
for training, testing in kf:
    # We learn a model for this fold with `fit` and then apply it to the
    # testing data with `predict`:
    classifier.fit(features[training], labels[training])
    prediction = classifier.predict(features[testing])

    # np.mean on an array of booleans returns fraction
    # of correct decisions for this fold:
    curmean = np.mean(prediction == labels[testing])
    means.append(curmean)
print('Mean accuracy: {:.1%}'.format(np.mean(means)))


from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

classifier = KNeighborsClassifier(n_neighbors=1)
classifier = Pipeline([('norm', StandardScaler()), ('knn', classifier)])

means = []
for training, testing in kf:
    # We learn a model for this fold with `fit` and then apply it to the
    # testing data with `predict`:
    classifier.fit(features[training], labels[training])
    prediction = classifier.predict(features[testing])

    # np.mean on an array of booleans returns fraction
    # of correct decisions for this fold:
    curmean = np.mean(prediction == labels[testing])
    means.append(curmean)
print('Mean accuracy: {:.1%}'.format(np.mean(means)))

--------------------------------------------------------------------------------
/ch02/extra/create_tsv.py:
--------------------------------------------------------------------------------
# This code is supporting material for the book
# Building Machine Learning Systems with Python
# by Willi Richert and Luis Pedro Coelho
# published by PACKT Publishing
#
# It is made available under the MIT License

# Note: this helper uses Python 2 print syntax and the external milksets package.
import milksets.seeds


def save_as_tsv(fname, module):
    features, labels = module.load()
    nlabels = [module.label_names[ell] for ell in labels]
    with open(fname, 'w') as ofile:
        for f, n in zip(features, nlabels):
            print >>ofile, "\t".join(map(str, f) + [n])

save_as_tsv('seeds.tsv', milksets.seeds)
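create_tsv.py writes the seeds data as tab-separated lines with the numeric
features first and the label name in the last column. chapter.py above and the
figure scripts below read that file back through ``load_dataset`` from
ch02/load.py, which is not included in this excerpt. A plausible sketch of such
a loader, assuming that tab-separated format and the data directory layout seen
in the tree; the details are illustrative, not the actual file:

    import os
    import numpy as np


    def load_dataset(dataset_name):
        '''Read data/<dataset_name>.tsv: tab-separated feature columns with the
        class label in the last column. Returns (features, labels).'''
        data_dir = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), 'data')
        features = []
        labels = []
        with open(os.path.join(data_dir, dataset_name + '.tsv')) as ifile:
            for line in ifile:
                tokens = line.strip().split('\t')
                features.append([float(tk) for tk in tokens[:-1]])
                labels.append(tokens[-1])
        return np.array(features), np.array(labels)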
--------------------------------------------------------------------------------
/ch02/figure1.py:
--------------------------------------------------------------------------------
# This code is supporting material for the book
# Building Machine Learning Systems with Python
# by Willi Richert and Luis Pedro Coelho
# published by PACKT Publishing
#
# It is made available under the MIT License

from matplotlib import pyplot as plt

# We load the data with load_iris from sklearn
from sklearn.datasets import load_iris

# load_iris returns an object with several fields
data = load_iris()
features = data.data
feature_names = data.feature_names
target = data.target
target_names = data.target_names

fig, axes = plt.subplots(2, 3)
pairs = [(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)]

# Set up 3 different pairs of (color, marker)
color_markers = [
    ('r', '>'),
    ('g', 'o'),
    ('b', 'x'),
]
for i, (p0, p1) in enumerate(pairs):
    ax = axes.flat[i]

    for t in range(3):
        # Use a different color/marker for each class `t`
        c, marker = color_markers[t]
        ax.scatter(features[target == t, p0], features[
            target == t, p1], marker=marker, c=c)
    ax.set_xlabel(feature_names[p0])
    ax.set_ylabel(feature_names[p1])
    ax.set_xticks([])
    ax.set_yticks([])
fig.tight_layout()
fig.savefig('figure1.png')

--------------------------------------------------------------------------------
/ch02/figure2.py:
--------------------------------------------------------------------------------
# This code is supporting material for the book
# Building Machine Learning Systems with Python
# by Willi Richert and Luis Pedro Coelho
# published by PACKT Publishing
#
# It is made available under the MIT License

COLOUR_FIGURE = False

from matplotlib import pyplot as plt
from sklearn.datasets import load_iris
data = load_iris()
features = data.data
feature_names = data.feature_names
target = data.target
target_names = data.target_names

# We use NumPy fancy indexing to get an array of strings:
labels = target_names[target]

is_setosa = (labels == 'setosa')
features = features[~is_setosa]
labels = labels[~is_setosa]
is_virginica = (labels == 'virginica')

# Hand fixed thresholds:
t = 1.65
t2 = 1.75

# Features to use: 3 & 2
f0, f1 = 3, 2

if COLOUR_FIGURE:
    area1c = (1., .8, .8)
    area2c = (.8, .8, 1.)
else:
    area1c = (1., 1, 1)
    area2c = (.7, .7, .7)

# Plot from 90% of smallest value to 110% of largest value
# (all feature values are positive, otherwise this would not work very well)

x0 = features[:, f0].min() * .9
x1 = features[:, f0].max() * 1.1

y0 = features[:, f1].min() * .9
y1 = features[:, f1].max() * 1.1

fig, ax = plt.subplots()
ax.fill_between([t, x1], [y0, y0], [y1, y1], color=area2c)
ax.fill_between([x0, t], [y0, y0], [y1, y1], color=area1c)
ax.plot([t, t], [y0, y1], 'k--', lw=2)
ax.plot([t2, t2], [y0, y1], 'k:', lw=2)
ax.scatter(features[is_virginica, f0],
           features[is_virginica, f1], c='b', marker='o', s=40)
ax.scatter(features[~is_virginica, f0],
           features[~is_virginica, f1], c='r', marker='x', s=40)
ax.set_ylim(y0, y1)
ax.set_xlim(x0, x1)
ax.set_xlabel(feature_names[f0])
ax.set_ylabel(feature_names[f1])
fig.tight_layout()
fig.savefig('figure2.png')
36 | else: 37 | area1c = (1., 1, 1) 38 | area2c = (.7, .7, .7) 39 | 40 | # Plot from 90% of smallest value to 110% of largest value 41 | # (all feature values are positive, otherwise this would not work very well) 42 | 43 | x0 = features[:, f0].min() * .9 44 | x1 = features[:, f0].max() * 1.1 45 | 46 | y0 = features[:, f1].min() * .9 47 | y1 = features[:, f1].max() * 1.1 48 | 49 | fig,ax = plt.subplots() 50 | ax.fill_between([t, x1], [y0, y0], [y1, y1], color=area2c) 51 | ax.fill_between([x0, t], [y0, y0], [y1, y1], color=area1c) 52 | ax.plot([t, t], [y0, y1], 'k--', lw=2) 53 | ax.plot([t2, t2], [y0, y1], 'k:', lw=2) 54 | ax.scatter(features[is_virginica, f0], 55 | features[is_virginica, f1], c='b', marker='o', s=40) 56 | ax.scatter(features[~is_virginica, f0], 57 | features[~is_virginica, f1], c='r', marker='x', s=40) 58 | ax.set_ylim(y0, y1) 59 | ax.set_xlim(x0, x1) 60 | ax.set_xlabel(feature_names[f0]) 61 | ax.set_ylabel(feature_names[f1]) 62 | fig.tight_layout() 63 | fig.savefig('figure2.png') 64 | -------------------------------------------------------------------------------- /ch02/figure4_5_no_sklearn.py: -------------------------------------------------------------------------------- 1 | # This code is supporting material for the book 2 | # Building Machine Learning Systems with Python 3 | # by Willi Richert and Luis Pedro Coelho 4 | # published by PACKT Publishing 5 | # 6 | # It is made available under the MIT License 7 | 8 | COLOUR_FIGURE = False 9 | 10 | from matplotlib import pyplot as plt 11 | from matplotlib.colors import ListedColormap 12 | from load import load_dataset 13 | import numpy as np 14 | from knn import fit_model, predict 15 | 16 | feature_names = [ 17 | 'area', 18 | 'perimeter', 19 | 'compactness', 20 | 'length of kernel', 21 | 'width of kernel', 22 | 'asymmetry coefficien', 23 | 'length of kernel groove', 24 | ] 25 | 26 | 27 | def plot_decision(features, labels): 28 | '''Plots decision boundary for KNN 29 | 30 | Parameters 31 | ---------- 32 | features : ndarray 33 | labels : sequence 34 | 35 | Returns 36 | ------- 37 | fig : Matplotlib Figure 38 | ax : Matplotlib Axes 39 | ''' 40 | y0, y1 = features[:, 2].min() * .9, features[:, 2].max() * 1.1 41 | x0, x1 = features[:, 0].min() * .9, features[:, 0].max() * 1.1 42 | X = np.linspace(x0, x1, 100) 43 | Y = np.linspace(y0, y1, 100) 44 | X, Y = np.meshgrid(X, Y) 45 | 46 | model = fit_model(1, features[:, (0, 2)], np.array(labels)) 47 | C = predict( 48 | model, np.vstack([X.ravel(), Y.ravel()]).T).reshape(X.shape) 49 | if COLOUR_FIGURE: 50 | cmap = ListedColormap([(1., .6, .6), (.6, 1., .6), (.6, .6, 1.)]) 51 | else: 52 | cmap = ListedColormap([(1., 1., 1.), (.2, .2, .2), (.6, .6, .6)]) 53 | fig,ax = plt.subplots() 54 | ax.set_xlim(x0, x1) 55 | ax.set_ylim(y0, y1) 56 | ax.set_xlabel(feature_names[0]) 57 | ax.set_ylabel(feature_names[2]) 58 | ax.pcolormesh(X, Y, C, cmap=cmap) 59 | if COLOUR_FIGURE: 60 | cmap = ListedColormap([(1., .0, .0), (.0, 1., .0), (.0, .0, 1.)]) 61 | ax.scatter(features[:, 0], features[:, 2], c=labels, cmap=cmap) 62 | else: 63 | for lab, ma in zip(range(3), "Do^"): 64 | ax.plot(features[labels == lab, 0], features[ 65 | labels == lab, 2], ma, c=(1., 1., 1.)) 66 | return fig,ax 67 | 68 | 69 | features, labels = load_dataset('seeds') 70 | names = sorted(set(labels)) 71 | labels = np.array([names.index(ell) for ell in labels]) 72 | 73 | fig,ax = plot_decision(features, labels) 74 | fig.savefig('figure4.png') 75 | 76 | features -= features.mean(0) 77 | features /= features.std(0) 78 | fig,ax = 
plot_decision(features, labels) 79 | fig.savefig('figure5.png') 80 | -------------------------------------------------------------------------------- /ch02/figure4_5_sklearn.py: -------------------------------------------------------------------------------- 1 | # This code is supporting material for the book 2 | # Building Machine Learning Systems with Python 3 | # by Willi Richert and Luis Pedro Coelho 4 | # published by PACKT Publishing 5 | # 6 | # It is made available under the MIT License 7 | 8 | COLOUR_FIGURE = False 9 | 10 | from matplotlib import pyplot as plt 11 | from matplotlib.colors import ListedColormap 12 | from load import load_dataset 13 | import numpy as np 14 | from sklearn.neighbors import KNeighborsClassifier 15 | 16 | feature_names = [ 17 | 'area', 18 | 'perimeter', 19 | 'compactness', 20 | 'length of kernel', 21 | 'width of kernel', 22 | 'asymmetry coefficien', 23 | 'length of kernel groove', 24 | ] 25 | 26 | 27 | def plot_decision(features, labels, num_neighbors=1): 28 | '''Plots decision boundary for KNN 29 | 30 | Parameters 31 | ---------- 32 | features : ndarray 33 | labels : sequence 34 | 35 | Returns 36 | ------- 37 | fig : Matplotlib Figure 38 | ax : Matplotlib Axes 39 | ''' 40 | y0, y1 = features[:, 2].min() * .9, features[:, 2].max() * 1.1 41 | x0, x1 = features[:, 0].min() * .9, features[:, 0].max() * 1.1 42 | X = np.linspace(x0, x1, 1000) 43 | Y = np.linspace(y0, y1, 1000) 44 | X, Y = np.meshgrid(X, Y) 45 | 46 | model = KNeighborsClassifier(num_neighbors) 47 | model.fit(features[:, (0,2)], labels) 48 | C = model.predict(np.vstack([X.ravel(), Y.ravel()]).T).reshape(X.shape) 49 | if COLOUR_FIGURE: 50 | cmap = ListedColormap([(1., .7, .7), (.7, 1., .7), (.7, .7, 1.)]) 51 | else: 52 | cmap = ListedColormap([(1., 1., 1.), (.2, .2, .2), (.6, .6, .6)]) 53 | fig,ax = plt.subplots() 54 | ax.set_xlim(x0, x1) 55 | ax.set_ylim(y0, y1) 56 | ax.set_xlabel(feature_names[0]) 57 | ax.set_ylabel(feature_names[2]) 58 | ax.pcolormesh(X, Y, C, cmap=cmap) 59 | if COLOUR_FIGURE: 60 | cmap = ListedColormap([(1., .0, .0), (.1, .6, .1), (.0, .0, 1.)]) 61 | ax.scatter(features[:, 0], features[:, 2], c=labels, cmap=cmap) 62 | else: 63 | for lab, ma in zip(range(3), "Do^"): 64 | ax.plot(features[labels == lab, 0], features[ 65 | labels == lab, 2], ma, c=(1., 1., 1.), ms=6) 66 | return fig,ax 67 | 68 | 69 | features, labels = load_dataset('seeds') 70 | names = sorted(set(labels)) 71 | labels = np.array([names.index(ell) for ell in labels]) 72 | 73 | fig,ax = plot_decision(features, labels) 74 | fig.tight_layout() 75 | fig.savefig('figure4sklearn.png') 76 | 77 | features -= features.mean(0) 78 | features /= features.std(0) 79 | fig,ax = plot_decision(features, labels) 80 | fig.tight_layout() 81 | fig.savefig('figure5sklearn.png') 82 | 83 | fig,ax = plot_decision(features, labels, 11) 84 | fig.tight_layout() 85 | fig.savefig('figure5sklearn_with_11_neighbors.png') 86 | -------------------------------------------------------------------------------- /ch02/heldout.py: -------------------------------------------------------------------------------- 1 | # This code is supporting material for the book 2 | # Building Machine Learning Systems with Python 3 | # by Willi Richert and Luis Pedro Coelho 4 | # published by PACKT Publishing 5 | # 6 | # It is made available under the MIT License 7 | 8 | # This script demonstrates the difference between the training accuracy and 9 | # testing (held-out) accuracy. 
10 | 11 | import numpy as np 12 | from sklearn.datasets import load_iris 13 | from threshold import fit_model, accuracy 14 | 15 | data = load_iris() 16 | features = data['data'] 17 | labels = data['target_names'][data['target']] 18 | 19 | # We are going to remove the setosa examples as they are too easy: 20 | is_setosa = (labels == 'setosa') 21 | features = features[~is_setosa] 22 | labels = labels[~is_setosa] 23 | 24 | # Now we classify virginica vs non-virginica 25 | is_virginica = (labels == 'virginica') 26 | 27 | # Split the data in two: testing and training 28 | testing = np.tile([True, False], 50) # testing = [True,False,True,False,True,False...] 29 | 30 | # Training is the negation of testing: i.e., datapoints not used for testing, 31 | # will be used for training 32 | training = ~testing 33 | 34 | model = fit_model(features[training], is_virginica[training]) 35 | train_accuracy = accuracy(features[training], is_virginica[training], model) 36 | test_accuracy = accuracy(features[testing], is_virginica[testing], model) 37 | 38 | print('''\ 39 | Training accuracy was {0:.1%}. 40 | Testing accuracy was {1:.1%} (N = {2}). 41 | '''.format(train_accuracy, test_accuracy, testing.sum())) 42 | -------------------------------------------------------------------------------- /ch02/knn.py: -------------------------------------------------------------------------------- 1 | # This code is supporting material for the book 2 | # Building Machine Learning Systems with Python 3 | # by Willi Richert and Luis Pedro Coelho 4 | # published by PACKT Publishing 5 | # 6 | # It is made available under the MIT License 7 | 8 | import numpy as np 9 | 10 | # This function was called ``learn_model`` in the first edition 11 | def fit_model(k, features, labels): 12 | '''Learn a k-nn model''' 13 | # There is no model in k-nn, just a copy of the inputs 14 | return k, features.copy(), labels.copy() 15 | 16 | 17 | def plurality(xs): 18 | '''Find the most common element in a collection''' 19 | from collections import defaultdict 20 | counts = defaultdict(int) 21 | for x in xs: 22 | counts[x] += 1 23 | maxv = max(counts.values()) 24 | for k, v in counts.items(): 25 | if v == maxv: 26 | return k 27 | 28 | # This function was called ``apply_model`` in the first edition 29 | def predict(model, features): 30 | '''Apply k-nn model''' 31 | k, train_feats, labels = model 32 | results = [] 33 | for f in features: 34 | label_dist = [] 35 | # Compute all distances: 36 | for t, ell in zip(train_feats, labels): 37 | label_dist.append((np.linalg.norm(f - t), ell)) 38 | label_dist.sort(key=lambda d_ell: d_ell[0]) 39 | label_dist = label_dist[:k] 40 | results.append(plurality([ell for _, ell in label_dist])) 41 | return np.array(results) 42 | 43 | 44 | def accuracy(features, labels, model): 45 | preds = predict(model, features) 46 | return np.mean(preds == labels) 47 | -------------------------------------------------------------------------------- /ch02/load.py: -------------------------------------------------------------------------------- 1 | # This code is supporting material for the book 2 | # Building Machine Learning Systems with Python 3 | # by Willi Richert and Luis Pedro Coelho 4 | # published by PACKT Publishing 5 | # 6 | # It is made available under the MIT License 7 | 8 | import numpy as np 9 | 10 | 11 | def load_dataset(dataset_name): 12 | ''' 13 | data,labels = load_dataset(dataset_name) 14 | 15 | Load a given dataset 16 | 17 | Returns 18 | ------- 19 | data : numpy ndarray 20 | labels : list of str 21 | ''' 22 | data = 
[] 23 | labels = [] 24 | with open('./data/{0}.tsv'.format(dataset_name)) as ifile: 25 | for line in ifile: 26 | tokens = line.strip().split('\t') 27 | data.append([float(tk) for tk in tokens[:-1]]) 28 | labels.append(tokens[-1]) 29 | data = np.array(data) 30 | labels = np.array(labels) 31 | return data, labels 32 | -------------------------------------------------------------------------------- /ch02/seeds_knn.py: -------------------------------------------------------------------------------- 1 | # This code is supporting material for the book 2 | # Building Machine Learning Systems with Python 3 | # by Willi Richert and Luis Pedro Coelho 4 | # published by PACKT Publishing 5 | # 6 | # It is made available under the MIT License 7 | 8 | from load import load_dataset 9 | import numpy as np 10 | from knn import fit_model, accuracy 11 | 12 | features, labels = load_dataset('seeds') 13 | 14 | 15 | def cross_validate(features, labels): 16 | '''Compute cross-validation errors''' 17 | error = 0.0 18 | for fold in range(10): 19 | training = np.ones(len(features), bool) 20 | training[fold::10] = 0 21 | testing = ~training 22 | model = fit_model(1, features[training], labels[training]) 23 | test_error = accuracy(features[testing], labels[testing], model) 24 | error += test_error 25 | 26 | return error / 10.0 27 | 28 | error = cross_validate(features, labels) 29 | print('Ten fold cross-validated error was {0:.1%}.'.format(error)) 30 | 31 | # Z-score (whiten) the features 32 | features -= features.mean(0) 33 | features /= features.std(0) 34 | error = cross_validate(features, labels) 35 | print( 36 | 'Ten fold cross-validated error after z-scoring was {0:.1%}.'.format(error)) 37 | -------------------------------------------------------------------------------- /ch02/seeds_knn_increasing_k.py: -------------------------------------------------------------------------------- 1 | # This code is supporting material for the book 2 | # Building Machine Learning Systems with Python 3 | # by Willi Richert and Luis Pedro Coelho 4 | # published by PACKT Publishing 5 | # 6 | # It is made available under the MIT License 7 | 8 | # Basic imports 9 | from __future__ import print_function 10 | import numpy as np 11 | from matplotlib import pyplot as plt 12 | from load import load_dataset 13 | 14 | 15 | from sklearn.neighbors import KNeighborsClassifier 16 | 17 | from sklearn.cross_validation import cross_val_score 18 | from sklearn.pipeline import Pipeline 19 | from sklearn.preprocessing import StandardScaler 20 | 21 | 22 | features, labels = load_dataset('seeds') 23 | 24 | # Values of k to consider: all in 1 .. 160 25 | ks = np.arange(1,161) 26 | 27 | # We build a classifier object here with the default number of neighbors 28 | # (It happens to be 5, but it does not matter as we will be changing it below 29 | classifier = KNeighborsClassifier() 30 | classifier = Pipeline([('norm', StandardScaler()), ('knn', classifier)]) 31 | 32 | # accuracies will hold our results 33 | accuracies = [] 34 | for k in ks: 35 | # set the classifier parameter 36 | classifier.set_params(knn__n_neighbors=k) 37 | crossed = cross_val_score(classifier, features, labels) 38 | 39 | # Save only the average 40 | accuracies.append(crossed.mean()) 41 | 42 | accuracies = np.array(accuracies) 43 | 44 | # Scale the accuracies by 100 to plot as a percentage instead of as a fraction 45 | plt.plot(ks, accuracies*100) 46 | plt.xlabel('Value for k (nr. 
of neighbors)') 47 | plt.ylabel('Accuracy (%)') 48 | plt.savefig('figure6.png') 49 | -------------------------------------------------------------------------------- /ch02/seeds_knn_sklearn.py: -------------------------------------------------------------------------------- 1 | # This code is supporting material for the book 2 | # Building Machine Learning Systems with Python 3 | # by Willi Richert and Luis Pedro Coelho 4 | # published by PACKT Publishing 5 | # 6 | # It is made available under the MIT License 7 | 8 | # Basic imports 9 | from __future__ import print_function 10 | import numpy as np 11 | from load import load_dataset 12 | 13 | 14 | # Import sklearn implementation of KNN 15 | from sklearn.neighbors import KNeighborsClassifier 16 | 17 | features, labels = load_dataset('seeds') 18 | classifier = KNeighborsClassifier(n_neighbors=4) 19 | 20 | 21 | n = len(features) 22 | correct = 0.0 23 | for ei in range(n): 24 | training = np.ones(n, bool) 25 | training[ei] = 0 26 | testing = ~training 27 | classifier.fit(features[training], labels[training]) 28 | pred = classifier.predict(features[ei]) 29 | correct += (pred == labels[ei]) 30 | print('Result of leave-one-out: {}'.format(correct/n)) 31 | 32 | # Import KFold object 33 | from sklearn.cross_validation import KFold 34 | 35 | # means will hold the mean for each fold 36 | means = [] 37 | 38 | # kf is a generator of pairs (training,testing) so that each iteration 39 | # implements a separate fold. 40 | kf = KFold(len(features), n_folds=3, shuffle=True) 41 | for training,testing in kf: 42 | # We learn a model for this fold with `fit` and then apply it to the 43 | # testing data with `predict`: 44 | classifier.fit(features[training], labels[training]) 45 | prediction = classifier.predict(features[testing]) 46 | 47 | # np.mean on an array of booleans returns the fraction of correct decisions 48 | # for this fold: 49 | curmean = np.mean(prediction == labels[testing]) 50 | means.append(curmean) 51 | print('Result of cross-validation using KFold: {}'.format(means)) 52 | 53 | # The function cross_val_score does the same thing as the loop above with a 54 | # single function call 55 | 56 | from sklearn.cross_validation import cross_val_score 57 | crossed = cross_val_score(classifier, features, labels) 58 | print('Result of cross-validation using cross_val_score: {}'.format(crossed)) 59 | 60 | # The results above use the features as is, which we learned was not optimal 61 | # except if the features happen to all be in the same scale. 
We can pre-scale 62 | # the features as explained in the main text: 63 | 64 | from sklearn.pipeline import Pipeline 65 | from sklearn.preprocessing import StandardScaler 66 | classifier = Pipeline([('norm', StandardScaler()), ('knn', classifier)]) 67 | crossed = cross_val_score(classifier, features, labels) 68 | print('Result with prescaling: {}'.format(crossed)) 69 | 70 | 71 | # Now, generate & print a cross-validated confusion matrix for the same result 72 | from sklearn.metrics import confusion_matrix 73 | names = list(set(labels)) 74 | labels = np.array([names.index(ell) for ell in labels]) 75 | preds = labels.copy() 76 | preds[:] = -1 77 | for train, test in kf: 78 | classifier.fit(features[train], labels[train]) 79 | preds[test] = classifier.predict(features[test]) 80 | 81 | cmat = confusion_matrix(labels, preds) 82 | print() 83 | print('Confusion matrix: [rows represent true outcome, columns predicted outcome]') 84 | print(cmat) 85 | 86 | # The explicit float() conversion is necessary in Python 2 87 | # (Otherwise, result is rounded to 0) 88 | acc = cmat.trace()/float(cmat.sum()) 89 | print('Accuracy: {0:.1%}'.format(acc)) 90 | 91 | -------------------------------------------------------------------------------- /ch02/seeds_threshold.py: -------------------------------------------------------------------------------- 1 | # This code is supporting material for the book 2 | # Building Machine Learning Systems with Python 3 | # by Willi Richert and Luis Pedro Coelho 4 | # published by PACKT Publishing 5 | # 6 | # It is made available under the MIT License 7 | 8 | from load import load_dataset 9 | import numpy as np 10 | from threshold import fit_model, accuracy 11 | 12 | features, labels = load_dataset('seeds') 13 | 14 | # Turn the labels into a binary array 15 | labels = (labels == 'Canadian') 16 | 17 | error = 0.0 18 | for fold in range(10): 19 | training = np.ones(len(features), bool) 20 | 21 | # numpy magic to make an array with 10% of 0s starting at fold 22 | training[fold::10] = 0 23 | 24 | # whatever is not training is for testing 25 | testing = ~training 26 | 27 | model = fit_model(features[training], labels[training]) 28 | test_error = accuracy(features[testing], labels[testing], model) 29 | error += test_error 30 | 31 | error /= 10.0 32 | 33 | print('Ten fold cross-validated error was {0:.1%}.'.format(error)) 34 | -------------------------------------------------------------------------------- /ch02/simple_threshold.py: -------------------------------------------------------------------------------- 1 | # This code is supporting material for the book 2 | # Building Machine Learning Systems with Python 3 | # by Willi Richert and Luis Pedro Coelho 4 | # published by PACKT Publishing 5 | # 6 | # It is made available under the MIT License 7 | 8 | from sklearn.datasets import load_iris 9 | 10 | data = load_iris() 11 | features = data['data'] 12 | target = data['target'] 13 | target_names = data['target_names'] 14 | labels = target_names[target] 15 | plength = features[:, 2] 16 | 17 | # To use numpy operations to get setosa features, 18 | # we build a boolean array 19 | is_setosa = (labels == 'setosa') 20 | 21 | max_setosa = plength[is_setosa].max() 22 | min_non_setosa = plength[~is_setosa].min() 23 | 24 | print('Maximum of setosa: {0}.'.format(max_setosa)) 25 | print('Minimum of others: {0}.'.format(min_non_setosa)) 26 | -------------------------------------------------------------------------------- /ch02/stump.py: 
-------------------------------------------------------------------------------- 1 | # This code is supporting material for the book 2 | # Building Machine Learning Systems with Python 3 | # by Willi Richert and Luis Pedro Coelho 4 | # published by PACKT Publishing 5 | # 6 | # It is made available under the MIT License 7 | 8 | from sklearn.datasets import load_iris 9 | data = load_iris() 10 | features = data.data 11 | labels = data.target_names[data.target] 12 | 13 | 14 | is_setosa = (labels == 'setosa') 15 | features = features[~is_setosa] 16 | labels = labels[~is_setosa] 17 | is_virginica = (labels == 'virginica') 18 | 19 | 20 | # Initialize to a value that is worse than any possible test 21 | best_acc = -1.0 22 | 23 | # Loop over all the features 24 | for fi in range(features.shape[1]): 25 | # Test every possible threshold value for feature fi 26 | thresh = features[:, fi].copy() 27 | 28 | # Test them in order 29 | thresh.sort() 30 | for t in thresh: 31 | 32 | # Generate predictions using t as a threshold 33 | pred = (features[:, fi] > t) 34 | 35 | # Accuracy is the fraction of predictions that match reality 36 | acc = (pred == is_virginica).mean() 37 | 38 | # We test whether negating the test is a better threshold: 39 | acc_neg = ((~pred) == is_virginica).mean() 40 | if acc_neg > acc: 41 | acc = acc_neg 42 | negated = True 43 | else: 44 | negated = False 45 | 46 | # If this is better than previous best, then this is now the new best: 47 | 48 | if acc > best_acc: 49 | best_acc = acc 50 | best_fi = fi 51 | best_t = t 52 | best_is_negated = negated 53 | 54 | print('Best threshold is {0} on feature {1} (index {2}), which achieves accuracy of {3:.1%}.'.format( 55 | best_t, data.feature_names[best_fi], best_fi, best_acc)) 56 | -------------------------------------------------------------------------------- /ch02/tests/test_load.py: -------------------------------------------------------------------------------- 1 | # This code is supporting material for the book 2 | # Building Machine Learning Systems with Python 3 | # by Willi Richert and Luis Pedro Coelho 4 | # published by PACKT Publishing 5 | # 6 | # It is made available under the MIT License 7 | 8 | from load import load_dataset 9 | 10 | 11 | def test_iris(): 12 | features, labels = load_dataset('iris') 13 | assert len(features[0]) == 4 14 | assert len(features) 15 | assert len(features) == len(labels) 16 | 17 | 18 | def test_seeds(): 19 | features, labels = load_dataset('seeds') 20 | assert len(features[0]) == 7 21 | assert len(features) 22 | assert len(features) == len(labels) 23 | -------------------------------------------------------------------------------- /ch02/threshold.py: -------------------------------------------------------------------------------- 1 | # This code is supporting material for the book 2 | # Building Machine Learning Systems with Python 3 | # by Willi Richert and Luis Pedro Coelho 4 | # published by PACKT Publishing 5 | # 6 | # It is made available under the MIT License 7 | 8 | import numpy as np 9 | 10 | 11 | # This function was called ``learn_model`` in the first edition 12 | def fit_model(features, labels): 13 | '''Learn a simple threshold model''' 14 | best_acc = -1.0 15 | # Loop over all the features: 16 | for fi in range(features.shape[1]): 17 | thresh = features[:, fi].copy() 18 | # test all feature values in order: 19 | thresh.sort() 20 | for t in thresh: 21 | pred = (features[:, fi] > t) 22 | 23 | # Measure the accuracy of this 24 | acc = (pred == labels).mean() 25 | 26 | rev_acc = (pred == 
~labels).mean() 27 | if rev_acc > acc: 28 | acc = rev_acc 29 | reverse = True 30 | else: 31 | reverse = False 32 | if acc > best_acc: 33 | best_acc = acc 34 | best_fi = fi 35 | best_t = t 36 | best_reverse = reverse 37 | 38 | # A model is a threshold and an index 39 | return best_t, best_fi, best_reverse 40 | 41 | 42 | # This function was called ``apply_model`` in the first edition 43 | def predict(model, features): 44 | '''Apply a learned model''' 45 | # A model is a pair as returned by fit_model 46 | t, fi, reverse = model 47 | if reverse: 48 | return features[:, fi] <= t 49 | else: 50 | return features[:, fi] > t 51 | 52 | def accuracy(features, labels, model): 53 | '''Compute the accuracy of the model''' 54 | preds = predict(model, features) 55 | return np.mean(preds == labels) 56 | -------------------------------------------------------------------------------- /ch03/README.md: -------------------------------------------------------------------------------- 1 | Chapter 3 - Clustering - Finding Related Posts 2 | ============================================== 3 | 4 | For this chapter you will need the '20news' dataset from 5 | http://mlcomp.org/datasets/379. To get the data you will need to 6 | register, but registration is free. Once you are logged in, you will 7 | see a ZIP download link. 8 | -------------------------------------------------------------------------------- /ch03/data/toy/01.txt: -------------------------------------------------------------------------------- 1 | This is a toy post about machine learning. Actually, it contains not much interesting stuff. -------------------------------------------------------------------------------- /ch03/data/toy/02.txt: -------------------------------------------------------------------------------- 1 | Imaging databases provide storage capabilities. -------------------------------------------------------------------------------- /ch03/data/toy/03.txt: -------------------------------------------------------------------------------- 1 | Most imaging databases save images permanently. 2 | -------------------------------------------------------------------------------- /ch03/data/toy/04.txt: -------------------------------------------------------------------------------- 1 | Imaging databases store data. -------------------------------------------------------------------------------- /ch03/data/toy/05.txt: -------------------------------------------------------------------------------- 1 | Imaging databases store data. Imaging databases store data. Imaging databases store data.
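The five toy posts above are the corpus that ch03/rel_post_01.py (further down) turns into a bag-of-words matrix. A minimal sketch of that first vectorization step, assuming scikit-learn is installed and ch03 is the working directory (the TOY_DIR path below merely mirrors the data/toy layout shown above):

import os
from sklearn.feature_extraction.text import CountVectorizer

TOY_DIR = os.path.join("data", "toy")  # assumed location of the toy posts shown above
posts = [open(os.path.join(TOY_DIR, f)).read() for f in sorted(os.listdir(TOY_DIR))]

# Count how often each (non stop-word) term occurs in each post
vectorizer = CountVectorizer(min_df=1, stop_words='english')
X = vectorizer.fit_transform(posts)
print("#posts: %d, #terms: %d" % X.shape)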
-------------------------------------------------------------------------------- /ch03/noise_analysis.py: -------------------------------------------------------------------------------- 1 | # This code is supporting material for the book 2 | # Building Machine Learning Systems with Python 3 | # by Willi Richert and Luis Pedro Coelho 4 | # published by PACKT Publishing 5 | # 6 | # It is made available under the MIT License 7 | 8 | import sklearn.datasets 9 | 10 | groups = [ 11 | 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 12 | 'comp.sys.mac.hardware', 'comp.windows.x', 'sci.space'] 13 | train_data = sklearn.datasets.fetch_20newsgroups(subset="train", 14 | categories=groups) 15 | 16 | labels = train_data.target 17 | num_clusters = 50 # sp.unique(labels).shape[0] 18 | 19 | import nltk.stem 20 | english_stemmer = nltk.stem.SnowballStemmer('english') 21 | 22 | from sklearn.feature_extraction.text import TfidfVectorizer 23 | 24 | 25 | class StemmedTfidfVectorizer(TfidfVectorizer): 26 | 27 | def build_analyzer(self): 28 | analyzer = super(TfidfVectorizer, self).build_analyzer() 29 | return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc)) 30 | 31 | vectorizer = StemmedTfidfVectorizer(min_df=10, max_df=0.5, 32 | stop_words='english', decode_error='ignore' 33 | ) 34 | vectorized = vectorizer.fit_transform(train_data.data) 35 | 36 | post_group = zip(train_data.data, train_data.target) 37 | # Create a list of tuples that can be sorted by 38 | # the length of the posts 39 | all = [(len(post[0]), post[0], train_data.target_names[post[1]]) 40 | for post in post_group] 41 | graphics = sorted([post for post in all if post[2] == 'comp.graphics']) 42 | print(graphics[5]) 43 | # (245, 'From: SITUNAYA@IBM3090.BHAM.AC.UK\nSubject: test....(sorry)\nOrganization: 44 | # The University of Birmingham, United Kingdom\nLines: 1\nNNTP-Posting-Host: ibm3090.bham.ac.uk 45 | # \n\n==============================================================================\n', 46 | # 'comp.graphics') 47 | 48 | noise_post = graphics[5][1] 49 | 50 | analyzer = vectorizer.build_analyzer() 51 | print(list(analyzer(noise_post))) 52 | 53 | useful = set(analyzer(noise_post)).intersection(vectorizer.get_feature_names()) 54 | print(sorted(useful)) 55 | # ['ac', 'birmingham', 'host', 'kingdom', 'nntp', 'sorri', 'test', 'uk', 'unit', 'univers'] 56 | 57 | for term in sorted(useful): 58 | print('IDF(%s)=%.2f' % (term, 59 | vectorizer._tfidf.idf_[vectorizer.vocabulary_[term]])) 60 | # IDF(ac)=3.51 61 | # IDF(birmingham)=6.77 62 | # IDF(host)=1.74 63 | # IDF(kingdom)=6.68 64 | # IDF(nntp)=1.77 65 | # IDF(sorri)=4.14 66 | # IDF(test)=3.83 67 | # IDF(uk)=3.70 68 | # IDF(unit)=4.42 69 | # IDF(univers)=1.91 70 | -------------------------------------------------------------------------------- /ch03/plot_kmeans_example.py: -------------------------------------------------------------------------------- 1 | # This code is supporting material for the book 2 | # Building Machine Learning Systems with Python 3 | # by Willi Richert and Luis Pedro Coelho 4 | # published by PACKT Publishing 5 | # 6 | # It is made available under the MIT License 7 | 8 | # inspired by http://scikit- 9 | # learn.org/dev/auto_examples/cluster/plot_kmeans_digits.html#example- 10 | # cluster-plot-kmeans-digits-py 11 | 12 | import os 13 | import scipy as sp 14 | from scipy.stats import norm 15 | from matplotlib import pylab 16 | from sklearn.cluster import KMeans 17 | 18 | from utils import CHART_DIR 19 | 20 | seed = 2 21 | sp.random.seed(seed) # 
to reproduce the data later on 22 | 23 | num_clusters = 3 24 | 25 | 26 | def plot_clustering(x, y, title, mx=None, ymax=None, xmin=None, km=None): 27 | pylab.figure(num=None, figsize=(8, 6)) 28 | if km: 29 | pylab.scatter(x, y, s=50, c=km.predict(list(zip(x, y)))) 30 | else: 31 | pylab.scatter(x, y, s=50) 32 | 33 | pylab.title(title) 34 | pylab.xlabel("Occurrence word 1") 35 | pylab.ylabel("Occurrence word 2") 36 | 37 | pylab.autoscale(tight=True) 38 | pylab.ylim(ymin=0, ymax=1) 39 | pylab.xlim(xmin=0, xmax=1) 40 | pylab.grid(True, linestyle='-', color='0.75') 41 | 42 | return pylab 43 | 44 | 45 | xw1 = norm(loc=0.3, scale=.15).rvs(20) 46 | yw1 = norm(loc=0.3, scale=.15).rvs(20) 47 | 48 | xw2 = norm(loc=0.7, scale=.15).rvs(20) 49 | yw2 = norm(loc=0.7, scale=.15).rvs(20) 50 | 51 | xw3 = norm(loc=0.2, scale=.15).rvs(20) 52 | yw3 = norm(loc=0.8, scale=.15).rvs(20) 53 | 54 | x = sp.append(sp.append(xw1, xw2), xw3) 55 | y = sp.append(sp.append(yw1, yw2), yw3) 56 | 57 | i = 1 58 | plot_clustering(x, y, "Vectors") 59 | pylab.savefig(os.path.join(CHART_DIR, "1400_03_0%i.png" % i)) 60 | pylab.clf() 61 | 62 | i += 1 63 | 64 | # 1 iteration #################### 65 | 66 | mx, my = sp.meshgrid(sp.arange(0, 1, 0.001), sp.arange(0, 1, 0.001)) 67 | 68 | km = KMeans(init='random', n_clusters=num_clusters, verbose=1, 69 | n_init=1, max_iter=1, 70 | random_state=seed) 71 | km.fit(sp.array(list(zip(x, y)))) 72 | 73 | Z = km.predict(sp.c_[mx.ravel(), my.ravel()]).reshape(mx.shape) 74 | 75 | plot_clustering(x, y, "Clustering iteration 1", km=km) 76 | pylab.imshow(Z, interpolation='nearest', 77 | extent=(mx.min(), mx.max(), my.min(), my.max()), 78 | cmap=pylab.cm.Blues, 79 | aspect='auto', origin='lower') 80 | 81 | c1a, c1b, c1c = km.cluster_centers_ 82 | pylab.scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1], 83 | marker='x', linewidth=2, s=100, color='black') 84 | pylab.savefig(os.path.join(CHART_DIR, "1400_03_0%i.png" % i)) 85 | pylab.clf() 86 | 87 | i += 1 88 | 89 | # 2 iterations #################### 90 | km = KMeans(init='random', n_clusters=num_clusters, verbose=1, 91 | n_init=1, max_iter=2, 92 | random_state=seed) 93 | km.fit(sp.array(list(zip(x, y)))) 94 | 95 | Z = km.predict(sp.c_[mx.ravel(), my.ravel()]).reshape(mx.shape) 96 | 97 | plot_clustering(x, y, "Clustering iteration 2", km=km) 98 | pylab.imshow(Z, interpolation='nearest', 99 | extent=(mx.min(), mx.max(), my.min(), my.max()), 100 | cmap=pylab.cm.Blues, 101 | aspect='auto', origin='lower') 102 | 103 | c2a, c2b, c2c = km.cluster_centers_ 104 | pylab.scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1], 105 | marker='x', linewidth=2, s=100, color='black') 106 | 107 | pylab.gca().add_patch( 108 | pylab.Arrow(c1a[0], c1a[1], c2a[0] - c1a[0], c2a[1] - c1a[1], width=0.1)) 109 | pylab.gca().add_patch( 110 | pylab.Arrow(c1b[0], c1b[1], c2b[0] - c1b[0], c2b[1] - c1b[1], width=0.1)) 111 | pylab.gca().add_patch( 112 | pylab.Arrow(c1c[0], c1c[1], c2c[0] - c1c[0], c2c[1] - c1c[1], width=0.1)) 113 | 114 | pylab.savefig(os.path.join(CHART_DIR, "1400_03_0%i.png" % i)) 115 | pylab.clf() 116 | 117 | i += 1 118 | 119 | # 3 iterations #################### 120 | km = KMeans(init='random', n_clusters=num_clusters, verbose=1, 121 | n_init=1, max_iter=10, 122 | random_state=seed) 123 | km.fit(sp.array(list(zip(x, y)))) 124 | 125 | Z = km.predict(sp.c_[mx.ravel(), my.ravel()]).reshape(mx.shape) 126 | 127 | plot_clustering(x, y, "Clustering iteration 10", km=km) 128 | pylab.imshow(Z, interpolation='nearest', 129 | extent=(mx.min(), mx.max(), 
my.min(), my.max()), 130 | cmap=pylab.cm.Blues, 131 | aspect='auto', origin='lower') 132 | 133 | pylab.scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1], 134 | marker='x', linewidth=2, s=100, color='black') 135 | pylab.savefig(os.path.join(CHART_DIR, "1400_03_0%i.png" % i)) 136 | pylab.clf() 137 | 138 | i += 1 139 | -------------------------------------------------------------------------------- /ch03/rel_post_01.py: -------------------------------------------------------------------------------- 1 | # This code is supporting material for the book 2 | # Building Machine Learning Systems with Python 3 | # by Willi Richert and Luis Pedro Coelho 4 | # published by PACKT Publishing 5 | # 6 | # It is made available under the MIT License 7 | 8 | import os 9 | import sys 10 | 11 | import scipy as sp 12 | 13 | from sklearn.feature_extraction.text import CountVectorizer 14 | 15 | from utils import DATA_DIR 16 | 17 | TOY_DIR = os.path.join(DATA_DIR, "toy") 18 | posts = [open(os.path.join(TOY_DIR, f)).read() for f in os.listdir(TOY_DIR)] 19 | 20 | new_post = "imaging databases" 21 | 22 | import nltk.stem 23 | english_stemmer = nltk.stem.SnowballStemmer('english') 24 | 25 | 26 | class StemmedCountVectorizer(CountVectorizer): 27 | 28 | def build_analyzer(self): 29 | analyzer = super(StemmedCountVectorizer, self).build_analyzer() 30 | return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc)) 31 | 32 | # vectorizer = CountVectorizer(min_df=1, stop_words='english', 33 | # preprocessor=stemmer) 34 | vectorizer = StemmedCountVectorizer(min_df=1, stop_words='english') 35 | 36 | from sklearn.feature_extraction.text import TfidfVectorizer 37 | 38 | 39 | class StemmedTfidfVectorizer(TfidfVectorizer): 40 | 41 | def build_analyzer(self): 42 | analyzer = super(StemmedTfidfVectorizer, self).build_analyzer() 43 | return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc)) 44 | 45 | vectorizer = StemmedTfidfVectorizer( 46 | min_df=1, stop_words='english', decode_error='ignore') 47 | 48 | X_train = vectorizer.fit_transform(posts) 49 | 50 | num_samples, num_features = X_train.shape 51 | print("#samples: %d, #features: %d" % (num_samples, num_features)) 52 | 53 | new_post_vec = vectorizer.transform([new_post]) 54 | print(new_post_vec, type(new_post_vec)) 55 | print(new_post_vec.toarray()) 56 | print(vectorizer.get_feature_names()) 57 | 58 | 59 | def dist_raw(v1, v2): 60 | delta = v1 - v2 61 | return sp.linalg.norm(delta.toarray()) 62 | 63 | 64 | def dist_norm(v1, v2): 65 | v1_normalized = v1 / sp.linalg.norm(v1.toarray()) 66 | v2_normalized = v2 / sp.linalg.norm(v2.toarray()) 67 | 68 | delta = v1_normalized - v2_normalized 69 | 70 | return sp.linalg.norm(delta.toarray()) 71 | 72 | dist = dist_norm 73 | 74 | best_dist = sys.maxsize 75 | best_i = None 76 | 77 | for i in range(0, num_samples): 78 | post = posts[i] 79 | if post == new_post: 80 | continue 81 | post_vec = X_train.getrow(i) 82 | d = dist(post_vec, new_post_vec) 83 | 84 | print("=== Post %i with dist=%.2f: %s" % (i, d, post)) 85 | 86 | if d < best_dist: 87 | best_dist = d 88 | best_i = i 89 | 90 | print("Best post is %i with dist=%.2f" % (best_i, best_dist)) 91 | -------------------------------------------------------------------------------- /ch03/rel_post_20news.py: -------------------------------------------------------------------------------- 1 | # This code is supporting material for the book 2 | # Building Machine Learning Systems with Python 3 | # by Willi Richert and Luis Pedro Coelho 4 | # published by PACKT Publishing 5 | 
# 6 | # It is made available under the MIT License 7 | 8 | import sklearn.datasets 9 | import scipy as sp 10 | 11 | new_post = \ 12 | """Disk drive problems. Hi, I have a problem with my hard disk. 13 | After 1 year it is working only sporadically now. 14 | I tried to format it, but now it doesn't boot any more. 15 | Any ideas? Thanks. 16 | """ 17 | 18 | print("""\ 19 | Dear reader of the 1st edition of 'Building Machine Learning Systems with Python'! 20 | For the 2nd edition we introduced a couple of changes that will result into 21 | results that differ from the results in the 1st edition. 22 | E.g. we now fully rely on scikit's fetch_20newsgroups() instead of requiring 23 | you to download the data manually from MLCOMP. 24 | If you have any questions, please ask at http://www.twotoreal.com 25 | """) 26 | 27 | all_data = sklearn.datasets.fetch_20newsgroups(subset="all") 28 | print("Number of total posts: %i" % len(all_data.filenames)) 29 | # Number of total posts: 18846 30 | 31 | groups = [ 32 | 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 33 | 'comp.sys.mac.hardware', 'comp.windows.x', 'sci.space'] 34 | train_data = sklearn.datasets.fetch_20newsgroups(subset="train", 35 | categories=groups) 36 | print("Number of training posts in tech groups:", len(train_data.filenames)) 37 | # Number of training posts in tech groups: 3529 38 | 39 | labels = train_data.target 40 | num_clusters = 50 # sp.unique(labels).shape[0] 41 | 42 | import nltk.stem 43 | english_stemmer = nltk.stem.SnowballStemmer('english') 44 | 45 | from sklearn.feature_extraction.text import TfidfVectorizer 46 | 47 | 48 | class StemmedTfidfVectorizer(TfidfVectorizer): 49 | 50 | def build_analyzer(self): 51 | analyzer = super(TfidfVectorizer, self).build_analyzer() 52 | return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc)) 53 | 54 | vectorizer = StemmedTfidfVectorizer(min_df=10, max_df=0.5, 55 | stop_words='english', decode_error='ignore' 56 | ) 57 | 58 | vectorized = vectorizer.fit_transform(train_data.data) 59 | num_samples, num_features = vectorized.shape 60 | print("#samples: %d, #features: %d" % (num_samples, num_features)) 61 | # samples: 3529, #features: 4712 62 | 63 | from sklearn.cluster import KMeans 64 | 65 | km = KMeans(n_clusters=num_clusters, n_init=1, verbose=1, random_state=3) 66 | clustered = km.fit(vectorized) 67 | 68 | print("km.labels_=%s" % km.labels_) 69 | # km.labels_=[ 6 34 22 ..., 2 21 26] 70 | 71 | print("km.labels_.shape=%s" % km.labels_.shape) 72 | # km.labels_.shape=3529 73 | 74 | from sklearn import metrics 75 | print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_)) 76 | # Homogeneity: 0.400 77 | print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_)) 78 | # Completeness: 0.206 79 | print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_)) 80 | # V-measure: 0.272 81 | print("Adjusted Rand Index: %0.3f" % 82 | metrics.adjusted_rand_score(labels, km.labels_)) 83 | # Adjusted Rand Index: 0.064 84 | print("Adjusted Mutual Information: %0.3f" % 85 | metrics.adjusted_mutual_info_score(labels, km.labels_)) 86 | # Adjusted Mutual Information: 0.197 87 | print(("Silhouette Coefficient: %0.3f" % 88 | metrics.silhouette_score(vectorized, labels, sample_size=1000))) 89 | # Silhouette Coefficient: 0.006 90 | 91 | new_post_vec = vectorizer.transform([new_post]) 92 | new_post_label = km.predict(new_post_vec)[0] 93 | 94 | similar_indices = (km.labels_ == new_post_label).nonzero()[0] 95 | 96 | similar = [] 97 | for i in 
similar_indices: 98 | dist = sp.linalg.norm((new_post_vec - vectorized[i]).toarray()) 99 | similar.append((dist, train_data.data[i])) 100 | 101 | similar = sorted(similar) 102 | print("Count similar: %i" % len(similar)) 103 | 104 | show_at_1 = similar[0] 105 | show_at_2 = similar[int(len(similar) / 10)] 106 | show_at_3 = similar[int(len(similar) / 2)] 107 | 108 | print("=== #1 ===") 109 | print(show_at_1) 110 | print() 111 | 112 | print("=== #2 ===") 113 | print(show_at_2) 114 | print() 115 | 116 | print("=== #3 ===") 117 | print(show_at_3) 118 | -------------------------------------------------------------------------------- /ch03/tfidf.py: -------------------------------------------------------------------------------- 1 | # This code is supporting material for the book 2 | # Building Machine Learning Systems with Python 3 | # by Willi Richert and Luis Pedro Coelho 4 | # published by PACKT Publishing 5 | # 6 | # It is made available under the MIT License 7 | 8 | import scipy as sp 9 | 10 | 11 | def tfidf(t, d, D): 12 | tf = float(d.count(t)) / sum(d.count(w) for w in set(d)) 13 | idf = sp.log(float(len(D)) / (len([doc for doc in D if t in doc]))) 14 | return tf * idf 15 | 16 | 17 | a, abb, abc = ["a"], ["a", "b", "b"], ["a", "b", "c"] 18 | D = [a, abb, abc] 19 | 20 | print(tfidf("a", a, D)) 21 | print(tfidf("b", abb, D)) 22 | print(tfidf("a", abc, D)) 23 | print(tfidf("b", abc, D)) 24 | print(tfidf("c", abc, D)) 25 | -------------------------------------------------------------------------------- /ch03/utils.py: -------------------------------------------------------------------------------- 1 | # This code is supporting material for the book 2 | # Building Machine Learning Systems with Python 3 | # by Willi Richert and Luis Pedro Coelho 4 | # published by PACKT Publishing 5 | # 6 | # It is made available under the MIT License 7 | 8 | import os 9 | import sys 10 | 11 | DATA_DIR = os.path.join( 12 | os.path.dirname(os.path.realpath(__file__)), "data") 13 | 14 | if not os.path.exists(DATA_DIR): 15 | print("Uh, we were expecting a data directory, which contains the toy data") 16 | sys.exit(1) 17 | 18 | CHART_DIR = os.path.join( 19 | os.path.dirname(os.path.realpath(__file__)), "charts") 20 | if not os.path.exists(CHART_DIR): 21 | os.mkdir(CHART_DIR) 22 | 23 | -------------------------------------------------------------------------------- /ch04/.gitignore: -------------------------------------------------------------------------------- 1 | wiki_lda.pkl 2 | wiki_lda.pkl.state 3 | *.png 4 | *.npy 5 | *.pkl 6 | topics.txt 7 | -------------------------------------------------------------------------------- /ch04/README.rst: -------------------------------------------------------------------------------- 1 | ========= 2 | Chapter 4 3 | ========= 4 | 5 | Support code for *Chapter 4: Topic Modeling* 6 | 7 | 8 | AP Data 9 | ------- 10 | 11 | To download the AP data, use the ``download_ap.sh`` script inside the ``data`` 12 | directory:: 13 | 14 | cd data 15 | ./download_ap.sh 16 | 17 | Word cloud creation 18 | ------------------- 19 | 20 | Word cloud creation requires that ``pytagcloud`` be installed (in turn, this 21 | requires ``pygame``). Since this is not an essential part of the chapter, the 22 | code will work even if you have not installed it (naturally, the cloud image 23 | will not be generated and a warning will be printed). 24 | 25 | 26 | Wikipedia processing 27 | -------------------- 28 | 29 | You will need **a lot of disk space**. 
The download of the Wikipedia text is 30 | 11GB and preprocessing it takes another 24GB to save it in the intermediate 31 | format that gensim uses for a total of 34GB! 32 | 33 | Run the following two commands inside the ``data/`` directory:: 34 | 35 | ./download_wp.sh 36 | ./preprocess-wikidata.sh 37 | 38 | As the filenames indicate, the first step will download the data and the second 39 | one will preprocess it. Preprocessing can take several hours, but it is 40 | feasible to run it on a modern laptop. Once the second step is finished, you 41 | may remove the input file if you want to save disk space 42 | (``data/enwiki-latest-pages-articles.xml.bz2``). 43 | 44 | To generate the model, you can run the ``wikitopics_create.py`` script, while 45 | the ``wikitopics_plot.py`` script will plot the most heavily discussed topic as 46 | well as the least heavily discussed one. The code is split into steps as the 47 | first one can take a very long time. Then it saves the results so that you can 48 | later explore them at leisure. 49 | 50 | You should not expect that your results will exactly match the results in the 51 | book, for two reasons: 52 | 53 | 1. The LDA algorithm is a probabilistic algorithm and can give different 54 | results every time it is run. 55 | 2. Wikipedia keeps changing. Thus, even your input data will be different. 56 | 57 | Scripts 58 | ------- 59 | 60 | blei_lda.py 61 | Computes LDA using the AP Corpus. 62 | wikitopics_create.py 63 | Create the topic model for Wikipedia using LDA (must download wikipedia database first) 64 | wikitopics_create_hdp.py 65 | Create the topic model for Wikipedia using HDP (must download wikipedia database first) 66 | -------------------------------------------------------------------------------- /ch04/blei_lda.py: -------------------------------------------------------------------------------- 1 | # This code is supporting material for the book 2 | # Building Machine Learning Systems with Python 3 | # by Willi Richert and Luis Pedro Coelho 4 | # published by PACKT Publishing 5 | # 6 | # It is made available under the MIT License 7 | 8 | from __future__ import print_function 9 | from wordcloud import create_cloud 10 | try: 11 | from gensim import corpora, models, matutils 12 | except: 13 | print("import gensim failed.") 14 | print() 15 | print("Please install it") 16 | raise 17 | 18 | import matplotlib.pyplot as plt 19 | import numpy as np 20 | from os import path 21 | 22 | NUM_TOPICS = 100 23 | 24 | # Check that data exists 25 | if not path.exists('./data/ap/ap.dat'): 26 | print('Error: Expected data to be present at data/ap/') 27 | print('Please cd into ./data & run ./download_ap.sh') 28 | 29 | # Load the data 30 | corpus = corpora.BleiCorpus('./data/ap/ap.dat', './data/ap/vocab.txt') 31 | 32 | # Build the topic model 33 | model = models.ldamodel.LdaModel( 34 | corpus, num_topics=NUM_TOPICS, id2word=corpus.id2word, alpha=None) 35 | 36 | # Iterate over all the topics in the model 37 | for ti in range(model.num_topics): 38 | words = model.show_topic(ti, 64) 39 | tf = sum(f for _, f in words) 40 | with open('topics.txt', 'w') as output: 41 | output.write('\n'.join('{}:{}'.format(w, int(1000. 
* f / tf)) for w, f in words)) 42 | output.write("\n\n\n") 43 | 44 | # We first identify the most discussed topic, i.e., the one with the 45 | # highest total weight 46 | 47 | topics = matutils.corpus2dense(model[corpus], num_terms=model.num_topics) 48 | weight = topics.sum(1) 49 | max_topic = weight.argmax() 50 | 51 | 52 | # Get the top 64 words for this topic 53 | # Without the argument, show_topic would return only 10 words 54 | words = model.show_topic(max_topic, 64) 55 | 56 | # This function will actually check for the presence of pytagcloud and is otherwise a no-op 57 | create_cloud('cloud_blei_lda.png', words) 58 | 59 | num_topics_used = [len(model[doc]) for doc in corpus] 60 | fig,ax = plt.subplots() 61 | ax.hist(num_topics_used, np.arange(42)) 62 | ax.set_ylabel('Nr of documents') 63 | ax.set_xlabel('Nr of topics') 64 | fig.tight_layout() 65 | fig.savefig('Figure_04_01.png') 66 | 67 | 68 | # Now, repeat the same exercise using alpha=1.0 69 | # You can edit the constant below to play around with this parameter 70 | ALPHA = 1.0 71 | 72 | model1 = models.ldamodel.LdaModel( 73 | corpus, num_topics=NUM_TOPICS, id2word=corpus.id2word, alpha=ALPHA) 74 | num_topics_used1 = [len(model1[doc]) for doc in corpus] 75 | 76 | fig,ax = plt.subplots() 77 | ax.hist([num_topics_used, num_topics_used1], np.arange(42)) 78 | ax.set_ylabel('Nr of documents') 79 | ax.set_xlabel('Nr of topics') 80 | 81 | # The coordinates below were fit by trial and error to look good 82 | ax.text(9, 223, r'default alpha') 83 | ax.text(26, 156, 'alpha=1.0') 84 | fig.tight_layout() 85 | fig.savefig('Figure_04_02.png') 86 | 87 | -------------------------------------------------------------------------------- /ch04/build_lda.py: -------------------------------------------------------------------------------- 1 | # This code is supporting material for the book 2 | # Building Machine Learning Systems with Python 3 | # by Willi Richert and Luis Pedro Coelho 4 | # published by PACKT Publishing 5 | # 6 | # It is made available under the MIT License 7 | from __future__ import print_function 8 | 9 | try: 10 | import nltk.corpus 11 | except ImportError: 12 | print("nltk not found") 13 | print("please install it") 14 | raise 15 | from scipy.spatial import distance 16 | import numpy as np 17 | from gensim import corpora, models 18 | import sklearn.datasets 19 | import nltk.stem 20 | from collections import defaultdict 21 | 22 | english_stemmer = nltk.stem.SnowballStemmer('english') 23 | stopwords = set(nltk.corpus.stopwords.words('english')) 24 | stopwords.update(['from:', 'subject:', 'writes:', 'writes']) 25 | 26 | 27 | class DirectText(corpora.textcorpus.TextCorpus): 28 | 29 | def get_texts(self): 30 | return self.input 31 | 32 | def __len__(self): 33 | return len(self.input) 34 | try: 35 | dataset = sklearn.datasets.load_mlcomp("20news-18828", "train", 36 | mlcomp_root='./data') 37 | except: 38 | print("Newsgroup data not found.") 39 | print("Please download from http://mlcomp.org/datasets/379") 40 | print("And expand the zip into the subdirectory data/") 41 | print() 42 | print() 43 | raise 44 | 45 | otexts = dataset.data 46 | texts = dataset.data 47 | 48 | texts = [t.decode('utf-8', 'ignore') for t in texts] 49 | texts = [t.split() for t in texts] 50 | texts = [map(lambda w: w.lower(), t) for t in texts] 51 | texts = [filter(lambda s: not len(set("+-.?!()>@012345689") & set(s)), t) 52 | for t in texts] 53 | texts = [filter(lambda s: (len(s) > 3) and (s not in stopwords), t) 54 | for t in texts] 55 | texts = 
[map(english_stemmer.stem, t) for t in texts] 56 | usage = defaultdict(int) 57 | for t in texts: 58 | for w in set(t): 59 | usage[w] += 1 60 | limit = len(texts) / 10 61 | too_common = [w for w in usage if usage[w] > limit] 62 | too_common = set(too_common) 63 | texts = [filter(lambda s: s not in too_common, t) for t in texts] 64 | 65 | corpus = DirectText(texts) 66 | dictionary = corpus.dictionary 67 | try: 68 | dictionary['computer'] 69 | except: 70 | pass 71 | 72 | model = models.ldamodel.LdaModel( 73 | corpus, num_topics=100, id2word=dictionary.id2token) 74 | 75 | thetas = np.zeros((len(texts), 100)) 76 | for i, c in enumerate(corpus): 77 | for ti, v in model[c]: 78 | thetas[i, ti] += v 79 | 80 | distances = distance.squareform(distance.pdist(thetas)) 81 | large = distances.max() + 1 82 | for i in range(len(distances)): 83 | distances[i, i] = large 84 | 85 | print(otexts[1]) 86 | print() 87 | print() 88 | print() 89 | print(otexts[distances[1].argmin()]) 90 | -------------------------------------------------------------------------------- /ch04/data/.gitignore: -------------------------------------------------------------------------------- 1 | ap.tgz 2 | ap/ 3 | dataset-379-20news-18828_HJRZF.zip 4 | 379/ 5 | enwiki-latest-pages-articles.xml.bz2 6 | wiki_en_output_bow.mm 7 | wiki_en_output_bow.mm.gz 8 | wiki_en_output_bow.mm.index 9 | wiki_en_output_tfidf.mm 10 | wiki_en_output_tfidf.mm.gz 11 | wiki_en_output_tfidf.mm.index 12 | wiki_en_output_wordids.txt.bz2 13 | -------------------------------------------------------------------------------- /ch04/data/download_ap.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | wget http://www.cs.columbia.edu/~blei/lda-c/ap.tgz 3 | tar xzf ap.tgz 4 | -------------------------------------------------------------------------------- /ch04/data/download_wp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | wget http://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2 3 | -------------------------------------------------------------------------------- /ch04/data/preprocess-wikidata.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | python -m gensim.scripts.make_wiki enwiki-latest-pages-articles.xml.bz2 wiki_en_output 4 | -------------------------------------------------------------------------------- /ch04/wikitopics_create.py: -------------------------------------------------------------------------------- 1 | # This code is supporting material for the book 2 | # Building Machine Learning Systems with Python 3 | # by Willi Richert and Luis Pedro Coelho 4 | # published by PACKT Publishing 5 | # 6 | # It is made available under the MIT License 7 | 8 | from __future__ import print_function 9 | import logging 10 | import gensim 11 | import numpy as np 12 | 13 | NR_OF_TOPICS = 100 14 | 15 | # Set up logging in order to get progress information as the model is being built: 16 | logging.basicConfig( 17 | format='%(asctime)s : %(levelname)s : %(message)s', 18 | level=logging.INFO) 19 | 20 | # Load the preprocessed corpus (id2word & mm): 21 | id2word = gensim.corpora.Dictionary.load_from_text( 22 | 'data/wiki_en_output_wordids.txt.bz2') 23 | mm = gensim.corpora.MmCorpus('data/wiki_en_output_tfidf.mm') 24 | 25 | # Calling the constructor is enough to build the model 26 | # This call will take a few hours! 
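# A rough guide to the parameters below, based on the gensim LdaModel API:
# ``chunksize`` is the number of documents fed into each update,
# ``update_every=1`` means the model is updated after each chunk (online learning), and
# ``passes=1`` makes a single sweep over the corpus (more passes can improve
# the model at the cost of proportionally more time).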
27 | model = gensim.models.ldamodel.LdaModel( 28 | corpus=mm, 29 | id2word=id2word, 30 | num_topics=NR_OF_TOPICS, 31 | update_every=1, 32 | chunksize=10000, 33 | passes=1) 34 | 35 | # Save the model so we do not need to learn it again. 36 | model.save('wiki_lda.pkl') 37 | 38 | # Compute the document/topic matrix 39 | topics = np.zeros((len(mm), model.num_topics)) 40 | for di,doc in enumerate(mm): 41 | doc_top = model[doc] 42 | for ti,tv in doc_top: 43 | topics[di,ti] += tv 44 | np.save('topics.npy', topics) 45 | 46 | # Alternatively, we create a sparse matrix and save that. This alternative 47 | # saves disk space, at the cost of slightly more complex code: 48 | 49 | ## from scipy import sparse, io 50 | ## sp = sparse.csr_matrix(topics) 51 | ## io.savemat('topics.mat', {'topics': sp}) 52 | -------------------------------------------------------------------------------- /ch04/wikitopics_create_hdp.py: -------------------------------------------------------------------------------- 1 | # This code is supporting material for the book 2 | # Building Machine Learning Systems with Python 3 | # by Willi Richert and Luis Pedro Coelho 4 | # published by PACKT Publishing 5 | # 6 | # It is made available under the MIT License 7 | 8 | from __future__ import print_function 9 | import logging 10 | import gensim 11 | import numpy as np 12 | 13 | # Set up logging in order to get progress information as the model is being built: 14 | logging.basicConfig( 15 | format='%(asctime)s : %(levelname)s : %(message)s', 16 | level=logging.INFO) 17 | 18 | # Load the preprocessed corpus (id2word & mm): 19 | id2word = gensim.corpora.Dictionary.load_from_text( 20 | 'data/wiki_en_output_wordids.txt.bz2') 21 | mm = gensim.corpora.MmCorpus('data/wiki_en_output_tfidf.mm') 22 | 23 | # Calling the constructor is enough to build the model 24 | # This call will take a few hours! 25 | model = gensim.models.hdpmodel.HdpModel( 26 | corpus=mm, 27 | id2word=id2word, 28 | chunksize=10000) 29 | 30 | # Save the model so we do not need to learn it again. 31 | model.save('wiki_hdp.pkl') 32 | 33 | # Compute the document/topic matrix 34 | topics = np.zeros((len(mm), model.num_topics)) 35 | for di,doc in enumerate(mm): 36 | doc_top = model[doc] 37 | for ti,tv in doc_top: 38 | topics[di,ti] += tv 39 | np.save('topics_hdp.npy', topics) 40 | -------------------------------------------------------------------------------- /ch04/wikitopics_plot.py: -------------------------------------------------------------------------------- 1 | # This code is supporting material for the book 2 | # Building Machine Learning Systems with Python 3 | # by Willi Richert and Luis Pedro Coelho 4 | # published by PACKT Publishing 5 | # 6 | # It is made available under the MIT License 7 | 8 | from __future__ import print_function 9 | import numpy as np 10 | import gensim 11 | from os import path 12 | from wordcloud import create_cloud 13 | 14 | if not path.exists('wiki_lda.pkl'): 15 | import sys 16 | sys.stderr.write('''\ 17 | This script must be run after wikitopics_create.py! 18 | 19 | That script creates and saves the LDA model (this must onlly be done once). 
20 | This script is responsible for the analysis.''') 21 | sys.exit(1) 22 | 23 | # Load the preprocessed Wikipedia corpus (id2word and mm) 24 | id2word = gensim.corpora.Dictionary.load_from_text( 25 | 'data/wiki_en_output_wordids.txt.bz2') 26 | mm = gensim.corpora.MmCorpus('data/wiki_en_output_tfidf.mm') 27 | 28 | # Load the precomputed model 29 | model = gensim.models.ldamodel.LdaModel.load('wiki_lda.pkl') 30 | 31 | topics = np.load('topics.npy', mmap_mode='r') 32 | 33 | # Compute the number of topics mentioned in each document 34 | lens = (topics > 0).sum(axis=1) 35 | print('Mean number of topics mentioned: {0:.3}'.format(np.mean(lens))) 36 | print('Percentage of articles mentioning 10 or fewer topics: {0:.1%}'.format(np.mean(lens <= 10))) 37 | 38 | # Weights will be the total weight of each topic 39 | weights = topics.sum(0) 40 | 41 | # Retrieve the most heavily used topic and plot it as a word cloud: 42 | words = model.show_topic(weights.argmax(), 64) 43 | 44 | # The parameter ``maxsize`` often needs some manual tuning to make it look nice. 45 | create_cloud('Wikipedia_most.png', words, maxsize=250, fontname='Cardo') 46 | 47 | fraction_mention = np.mean(topics[:,weights.argmax()] > 0) 48 | print("The most mentioned topic is mentioned in {:.1%} of documents.".format(fraction_mention)) 49 | total_weight = np.mean(topics[:,weights.argmax()]) 50 | print("It represents {:.1%} of the total number of words.".format(total_weight)) 51 | print() 52 | print() 53 | print() 54 | 55 | # Retrieve the **least** heavily used topic and plot it as a word cloud: 56 | words = model.show_topic(weights.argmin(), 64) 57 | create_cloud('Wikipedia_least.png', words, maxsize=150, fontname='Cardo') 58 | fraction_mention = np.mean(topics[:,weights.argmin()] > 0) 59 | print("The least mentioned topic is mentioned in {:.1%} of documents.".format(fraction_mention)) 60 | total_weight = np.mean(topics[:,weights.argmin()]) 61 | print("It represents {:.1%} of the total number of words.".format(total_weight)) 62 | print() 63 | print() 64 | print() 65 | -------------------------------------------------------------------------------- /ch04/wordcloud.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | warned_of_error = False 3 | 4 | def create_cloud(oname, words,maxsize=120, fontname='Lobster'): 5 | '''Creates a word cloud (when pytagcloud is installed) 6 | 7 | Parameters 8 | ---------- 9 | oname : output filename 10 | words : list of (value,str) 11 | maxsize : int, optional 12 | Size of maximum word. The best setting for this parameter will often 13 | require some manual tuning for each input. 14 | fontname : str, optional 15 | Font to use. 16 | ''' 17 | try: 18 | from pytagcloud import create_tag_image, make_tags 19 | except ImportError: 20 | if not warned_of_error: 21 | print("Could not import pytagcloud. Skipping cloud generation") 22 | return 23 | 24 | # gensim returns a weight between 0 and 1 for each word, while pytagcloud 25 | # expects an integer word count. So, we multiply by a large number and 26 | # round. For a visualization this is an adequate approximation.
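# (For example, a word weighted 0.0123 by the model becomes a pseudo-count of 123.)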
27 | words = [(w,int(v*10000)) for w,v in words] 28 | tags = make_tags(words, maxsize=maxsize) 29 | create_tag_image(tags, oname, size=(1800, 1200), fontname=fontname) 30 | -------------------------------------------------------------------------------- /ch05/README.md: -------------------------------------------------------------------------------- 1 | Chapter 5 - Classification - Detecting Poor Answers 2 | =================================================== 3 | 4 | For the first edition, the book chapter was based on StackExchange's data dump from August 2012. 5 | 6 | After publishing the book, StackExchange released the May 2014 version at 7 | [https://archive.org/download/stackexchange/stackexchange_archive.torrent](https://archive.org/download/stackexchange/stackexchange_archive.torrent). 8 | 9 | Note that when using the latest version, you will get slightly different results. 10 | 11 | The code uses pyenchant for spell correction. Pyenchant is only there to make experimenting with additional features more pleasant; it is not used later in the chapter. So, if installing it on your platform proves too troublesome (e.g. on 64-bit Windows), don't bother. 12 | -------------------------------------------------------------------------------- /ch05/data.py: -------------------------------------------------------------------------------- 1 | # This code is supporting material for the book 2 | # Building Machine Learning Systems with Python 3 | # by Willi Richert and Luis Pedro Coelho 4 | # published by PACKT Publishing 5 | # 6 | # It is made available under the MIT License 7 | 8 | import os 9 | 10 | DATA_DIR = "data" # put your posts-2012.xml into this directory 11 | CHART_DIR = "charts" 12 | 13 | filtered = os.path.join(DATA_DIR, "filtered.tsv") 14 | filtered_meta = os.path.join(DATA_DIR, "filtered-meta.json") 15 | 16 | chosen = os.path.join(DATA_DIR, "chosen.tsv") 17 | chosen_meta = os.path.join(DATA_DIR, "chosen-meta.json") 18 | -------------------------------------------------------------------------------- /ch05/log_reg_example.py: -------------------------------------------------------------------------------- 1 | # This code is supporting material for the book 2 | # Building Machine Learning Systems with Python 3 | # by Willi Richert and Luis Pedro Coelho 4 | # published by PACKT Publishing 5 | # 6 | # It is made available under the MIT License 7 | 8 | import os 9 | from data import CHART_DIR 10 | 11 | import numpy as np 12 | from scipy.stats import norm 13 | 14 | from matplotlib import pyplot 15 | np.random.seed(3) 16 | 17 | num_per_class = 40 18 | X = np.hstack((norm.rvs(2, size=num_per_class, scale=2), 19 | norm.rvs(8, size=num_per_class, scale=3))) 20 | y = np.hstack((np.zeros(num_per_class), 21 | np.ones(num_per_class))) 22 | 23 | 24 | def lr_model(clf, X): 25 | return 1.0 / (1.0 + np.exp(-(clf.intercept_ + clf.coef_ * X))) 26 | 27 | from sklearn.linear_model import LogisticRegression 28 | logclf = LogisticRegression() 29 | print(logclf) 30 | logclf.fit(X.reshape(num_per_class * 2, 1), y) 31 | print(np.exp(logclf.intercept_), np.exp(logclf.coef_.ravel())) 32 | print("P(x=-1)=%.2f\tP(x=7)=%.2f" % 33 | (lr_model(logclf, -1), lr_model(logclf, 7))) 34 | X_test = np.arange(-5, 20, 0.1) 35 | pyplot.figure(figsize=(10, 4)) 36 | pyplot.xlim((-5, 20)) 37 | pyplot.scatter(X, y, c=y) 38 | pyplot.xlabel("feature value") 39 | pyplot.ylabel("class") 40 | pyplot.grid(True, linestyle='-', color='0.75') 41 | pyplot.savefig( 42 | os.path.join(CHART_DIR, "log_reg_example_data.png"),
bbox_inches="tight") 43 | 44 | 45 | def lin_model(clf, X): 46 | return clf.intercept_ + clf.coef_ * X 47 | 48 | from sklearn.linear_model import LinearRegression 49 | clf = LinearRegression() 50 | print(clf) 51 | clf.fit(X.reshape(num_per_class * 2, 1), y) 52 | X_odds = np.arange(0, 1, 0.001) 53 | pyplot.figure(figsize=(10, 4)) 54 | pyplot.subplot(1, 2, 1) 55 | pyplot.scatter(X, y, c=y) 56 | pyplot.plot(X_test, lin_model(clf, X_test)) 57 | pyplot.xlabel("feature value") 58 | pyplot.ylabel("class") 59 | pyplot.title("linear fit on original data") 60 | pyplot.grid(True, linestyle='-', color='0.75') 61 | 62 | X_ext = np.hstack((X, norm.rvs(20, size=100, scale=5))) 63 | y_ext = np.hstack((y, np.ones(100))) 64 | clf = LinearRegression() 65 | clf.fit(X_ext.reshape(num_per_class * 2 + 100, 1), y_ext) 66 | pyplot.subplot(1, 2, 2) 67 | pyplot.scatter(X_ext, y_ext, c=y_ext) 68 | pyplot.plot(X_ext, lin_model(clf, X_ext)) 69 | pyplot.xlabel("feature value") 70 | pyplot.ylabel("class") 71 | pyplot.title("linear fit on additional data") 72 | pyplot.grid(True, linestyle='-', color='0.75') 73 | pyplot.savefig( 74 | os.path.join(CHART_DIR, "log_reg_log_linear_fit.png"), bbox_inches="tight") 75 | 76 | pyplot.figure(figsize=(10, 4)) 77 | pyplot.xlim((-5, 20)) 78 | pyplot.scatter(X, y, c=y) 79 | pyplot.plot(X_test, lr_model(logclf, X_test).ravel()) 80 | pyplot.plot(X_test, np.ones(X_test.shape[0]) * 0.5, "--") 81 | pyplot.xlabel("feature value") 82 | pyplot.ylabel("class") 83 | pyplot.grid(True, linestyle='-', color='0.75') 84 | pyplot.savefig( 85 | os.path.join(CHART_DIR, "log_reg_example_fitted.png"), bbox_inches="tight") 86 | 87 | X = np.arange(0, 1, 0.001) 88 | pyplot.figure(figsize=(10, 4)) 89 | pyplot.subplot(1, 2, 1) 90 | pyplot.xlim((0, 1)) 91 | pyplot.ylim((0, 10)) 92 | pyplot.plot(X, X / (1 - X)) 93 | pyplot.xlabel("P") 94 | pyplot.ylabel("odds = P / (1-P)") 95 | pyplot.grid(True, linestyle='-', color='0.75') 96 | 97 | pyplot.subplot(1, 2, 2) 98 | pyplot.xlim((0, 1)) 99 | pyplot.plot(X, np.log(X / (1 - X))) 100 | pyplot.xlabel("P") 101 | pyplot.ylabel("log(odds) = log(P / (1-P))") 102 | pyplot.grid(True, linestyle='-', color='0.75') 103 | pyplot.savefig( 104 | os.path.join(CHART_DIR, "log_reg_log_odds.png"), bbox_inches="tight") 105 | -------------------------------------------------------------------------------- /ch06/01_start.py: -------------------------------------------------------------------------------- 1 | # This code is supporting material for the book 2 | # Building Machine Learning Systems with Python 3 | # by Willi Richert and Luis Pedro Coelho 4 | # published by PACKT Publishing 5 | # 6 | # It is made available under the MIT License 7 | 8 | # 9 | # This script trains multinomial Naive Bayes on the tweet corpus 10 | # to find two different results: 11 | # - How well can we distinguis positive from negative tweets? 12 | # - How well can we detect whether a tweet contains sentiment at all? 
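# (Both questions are answered below in __main__ by re-labelling the data into a
# binary target with tweak_labels from utils.py and training the same
# TfidfVectorizer + MultinomialNB pipeline on each variant.)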
13 | # 14 | 15 | import time 16 | start_time = time.time() 17 | 18 | import numpy as np 19 | 20 | from sklearn.metrics import precision_recall_curve, roc_curve, auc 21 | from sklearn.cross_validation import ShuffleSplit 22 | 23 | from utils import plot_pr 24 | from utils import load_sanders_data 25 | from utils import tweak_labels 26 | 27 | from sklearn.feature_extraction.text import TfidfVectorizer 28 | from sklearn.pipeline import Pipeline 29 | 30 | from sklearn.naive_bayes import MultinomialNB 31 | 32 | 33 | def create_ngram_model(): 34 | tfidf_ngrams = TfidfVectorizer(ngram_range=(1, 3), 35 | analyzer="word", binary=False) 36 | clf = MultinomialNB() 37 | pipeline = Pipeline([('vect', tfidf_ngrams), ('clf', clf)]) 38 | return pipeline 39 | 40 | 41 | def train_model(clf_factory, X, Y, name="NB ngram", plot=False): 42 | cv = ShuffleSplit( 43 | n=len(X), n_iter=10, test_size=0.3, random_state=0) 44 | 45 | train_errors = [] 46 | test_errors = [] 47 | 48 | scores = [] 49 | pr_scores = [] 50 | precisions, recalls, thresholds = [], [], [] 51 | 52 | for train, test in cv: 53 | X_train, y_train = X[train], Y[train] 54 | X_test, y_test = X[test], Y[test] 55 | 56 | clf = clf_factory() 57 | clf.fit(X_train, y_train) 58 | 59 | train_score = clf.score(X_train, y_train) 60 | test_score = clf.score(X_test, y_test) 61 | 62 | train_errors.append(1 - train_score) 63 | test_errors.append(1 - test_score) 64 | 65 | scores.append(test_score) 66 | proba = clf.predict_proba(X_test) 67 | 68 | fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, 1]) 69 | precision, recall, pr_thresholds = precision_recall_curve( 70 | y_test, proba[:, 1]) 71 | 72 | pr_scores.append(auc(recall, precision)) 73 | precisions.append(precision) 74 | recalls.append(recall) 75 | thresholds.append(pr_thresholds) 76 | 77 | scores_to_sort = pr_scores 78 | median = np.argsort(scores_to_sort)[len(scores_to_sort) / 2] 79 | 80 | if plot: 81 | plot_pr(pr_scores[median], name, "01", precisions[median], 82 | recalls[median], label=name) 83 | 84 | summary = (np.mean(scores), np.std(scores), 85 | np.mean(pr_scores), np.std(pr_scores)) 86 | print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary) 87 | 88 | return np.mean(train_errors), np.mean(test_errors) 89 | 90 | 91 | def print_incorrect(clf, X, Y): 92 | Y_hat = clf.predict(X) 93 | wrong_idx = Y_hat != Y 94 | X_wrong = X[wrong_idx] 95 | Y_wrong = Y[wrong_idx] 96 | Y_hat_wrong = Y_hat[wrong_idx] 97 | for idx in range(len(X_wrong)): 98 | print("clf.predict('%s')=%i instead of %i" % 99 | (X_wrong[idx], Y_hat_wrong[idx], Y_wrong[idx])) 100 | 101 | 102 | if __name__ == "__main__": 103 | X_orig, Y_orig = load_sanders_data() 104 | classes = np.unique(Y_orig) 105 | for c in classes: 106 | print("#%s: %i" % (c, sum(Y_orig == c))) 107 | 108 | print("== Pos vs. neg ==") 109 | pos_neg = np.logical_or(Y_orig == "positive", Y_orig == "negative") 110 | X = X_orig[pos_neg] 111 | Y = Y_orig[pos_neg] 112 | Y = tweak_labels(Y, ["positive"]) 113 | 114 | train_model(create_ngram_model, X, Y, name="pos vs neg", plot=True) 115 | 116 | print("== Pos/neg vs. irrelevant/neutral ==") 117 | X = X_orig 118 | Y = tweak_labels(Y_orig, ["positive", "negative"]) 119 | train_model(create_ngram_model, X, Y, name="sent vs rest", plot=True) 120 | 121 | print("== Pos vs. rest ==") 122 | X = X_orig 123 | Y = tweak_labels(Y_orig, ["positive"]) 124 | train_model(create_ngram_model, X, Y, name="pos vs rest", plot=True) 125 | 126 | print("== Neg vs. 
rest ==") 127 | X = X_orig 128 | Y = tweak_labels(Y_orig, ["negative"]) 129 | train_model(create_ngram_model, X, Y, name="neg vs rest", plot=True) 130 | 131 | print("time spent:", time.time() - start_time) 132 | -------------------------------------------------------------------------------- /ch06/README.md: -------------------------------------------------------------------------------- 1 | Chapter 6 - Classification II - Sentiment Analysis 2 | ================================================== 3 | 4 | When doing last code sanity checks for the book, Twitter 5 | was using the API 1.0, which did not require authentication. 6 | With its switch to version 1.1, this has now changed. 7 | 8 | If you don't have already created your personal Twitter 9 | access keys and tokens, you might want to do so at 10 | [https://dev.twitter.com/docs/auth/tokens-devtwittercom](https://dev.twitter.com/docs/auth/tokens-devtwittercom) and paste the keys/secrets into twitterauth.py 11 | 12 | According to [https://dev.twitter.com/docs/rate-limiting/1](https://dev.twitter.com/docs/rate-limiting/1) Twitter has a rate limit of fetching 350 tweets/h for authorized users. 13 | 14 | Note that some tweets might be missing when you are running install.py (user got suspended, changed authorization, or tweet deleted) and thus you might get different results. We keep track of those tweet IDs in data/{missing,not_authorized}.tsv, so that they are not fetched when you run install.py. 15 | -------------------------------------------------------------------------------- /ch06/twitterauth.py: -------------------------------------------------------------------------------- 1 | # This code is supporting material for the book 2 | # Building Machine Learning Systems with Python 3 | # by Willi Richert and Luis Pedro Coelho 4 | # published by PACKT Publishing 5 | # 6 | # It is made available under the MIT License 7 | 8 | import sys 9 | 10 | CONSUMER_KEY = None 11 | CONSUMER_SECRET = None 12 | 13 | ACCESS_TOKEN_KEY = None 14 | ACCESS_TOKEN_SECRET = None 15 | 16 | if CONSUMER_KEY is None or CONSUMER_SECRET is None or ACCESS_TOKEN_KEY is None or ACCESS_TOKEN_SECRET is None: 17 | print("""\ 18 | When doing last code sanity checks for the book, Twitter 19 | was using the API 1.0, which did not require authentication. 20 | With its switch to version 1.1, this has now changed. 21 | 22 | It seems that you don't have already created your personal Twitter 23 | access keys and tokens. Please do so at 24 | https://dev.twitter.com/docs/auth/tokens-devtwittercom 25 | and paste the keys/secrets into twitterauth.py 26 | 27 | Sorry for the inconvenience, 28 | The authors.""") 29 | 30 | sys.exit(1) 31 | -------------------------------------------------------------------------------- /ch07/.gitignore: -------------------------------------------------------------------------------- 1 | *.png 2 | -------------------------------------------------------------------------------- /ch07/README.rst: -------------------------------------------------------------------------------- 1 | ========= 2 | Chapter 7 3 | ========= 4 | 5 | Support code for *Chapter 7: Regression* 6 | 7 | 8 | Boston data analysis 9 | -------------------- 10 | 11 | This dataset is shipped with sklearn. Thus, no extra download is required. 
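For reference, a minimal sketch of loading it (plain scikit-learn usage, not a
script from this repository)::

    from sklearn.datasets import load_boston

    boston = load_boston()
    print(boston.data.shape)    # (506, 13): 506 samples, 13 features
    print(boston.target.shape)  # (506,): median house values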
12 | 13 | 14 | boston1.py 15 | Fit a linear regression model to the Boston house price data 16 | boston1numpy.py 17 | Version of above script using numpy operations for linear regression 18 | boston_cv_penalized.py 19 | Test different penalized (and OLS) regression schemes on the Boston dataset 20 | figure1_2.py 21 | Show the regression line for Boston data 22 | figure3.py 23 | Show the regression line for Boston data with OLS and Lasso 24 | figure4.py 25 | Scatter plot of predicted-vs-actual for multidimensional regression 26 | 27 | 10K data analysis 28 | ----------------- 29 | 30 | lr10k.py 31 | Linear regression on 10K dataset, evaluation by cross-validation 32 | predict10k_en.py 33 | Elastic nets (including with inner cross-validation for parameter 34 | settings). Produces scatter plot. 35 | 36 | 37 | MovieLens data analysis 38 | ----------------------- 39 | 40 | In this chapter, we only consider a very simple approach, which is implemented 41 | in the ``usermodel.py`` script. 42 | -------------------------------------------------------------------------------- /ch07/boston1.py: -------------------------------------------------------------------------------- 1 | # This code is supporting material for the book 2 | # Building Machine Learning Systems with Python 3 | # by Willi Richert and Luis Pedro Coelho 4 | # published by PACKT Publishing 5 | # 6 | # It is made available under the MIT License 7 | 8 | # This script shows an example of simple (ordinary) linear regression 9 | 10 | # The first edition of the book used NumPy functions only for this operation. See 11 | # the file boston1numpy.py for that version. 12 | 13 | import numpy as np 14 | from sklearn.datasets import load_boston 15 | from sklearn.linear_model import LinearRegression 16 | from matplotlib import pyplot as plt 17 | 18 | boston = load_boston() 19 | x = boston.data 20 | y = boston.target 21 | 22 | # Fitting a model is trivial: call the ``fit`` method in LinearRegression: 23 | lr = LinearRegression() 24 | lr.fit(x, y) 25 | 26 | # The instance member `residues_` contains the sum of the squared residues 27 | rmse = np.sqrt(lr.residues_/len(x)) 28 | print('RMSE: {}'.format(rmse)) 29 | 30 | fig, ax = plt.subplots() 31 | # Plot a diagonal (for reference): 32 | ax.plot([0, 50], [0, 50], '-', color=(.9,.3,.3), lw=4) 33 | 34 | # Plot the prediction versus real: 35 | ax.scatter(lr.predict(x), boston.target) 36 | 37 | ax.set_xlabel('predicted') 38 | ax.set_ylabel('real') 39 | fig.savefig('Figure_07_08.png') 40 | -------------------------------------------------------------------------------- /ch07/boston1numpy.py: -------------------------------------------------------------------------------- 1 | # This code is supporting material for the book 2 | # Building Machine Learning Systems with Python 3 | # by Willi Richert and Luis Pedro Coelho 4 | # published by PACKT Publishing 5 | # 6 | # It is made available under the MIT License 7 | 8 | # This script shows an example of simple (ordinary) linear regression 9 | 10 | import numpy as np 11 | from sklearn.datasets import load_boston 12 | import pylab as plt 13 | 14 | boston = load_boston() 15 | x = np.array([np.concatenate((v, [1])) for v in boston.data]) 16 | y = boston.target 17 | 18 | # np.linalg.lstsq implements least-squares linear regression 19 | s, total_error, _, _ = np.linalg.lstsq(x, y) 20 | 21 | rmse = np.sqrt(total_error[0] / len(x)) 22 | print('Residual: {}'.format(rmse)) 23 | 24 | # Plot the prediction versus real: 25 | plt.plot(np.dot(x, s), boston.target, 'ro') 26 | 27 | 
# Plot a diagonal (for reference): 28 | plt.plot([0, 50], [0, 50], 'g-') 29 | plt.xlabel('predicted') 30 | plt.ylabel('real') 31 | plt.show() 32 | -------------------------------------------------------------------------------- /ch07/boston_cv_penalized.py: -------------------------------------------------------------------------------- 1 | # This code is supporting material for the book 2 | # Building Machine Learning Systems with Python 3 | # by Willi Richert and Luis Pedro Coelho 4 | # published by PACKT Publishing 5 | # 6 | # It is made available under the MIT License 7 | 8 | # This script fits several forms of penalized regression 9 | 10 | from __future__ import print_function 11 | import numpy as np 12 | from sklearn.cross_validation import KFold 13 | from sklearn.linear_model import LinearRegression, ElasticNet, Lasso, Ridge 14 | from sklearn.metrics import r2_score 15 | from sklearn.datasets import load_boston 16 | boston = load_boston() 17 | x = boston.data 18 | y = boston.target 19 | 20 | for name, met in [ 21 | ('linear regression', LinearRegression()), 22 | ('lasso()', Lasso()), 23 | ('elastic-net(.5)', ElasticNet(alpha=0.5)), 24 | ('lasso(.5)', Lasso(alpha=0.5)), 25 | ('ridge(.5)', Ridge(alpha=0.5)), 26 | ]: 27 | # Fit on the whole data: 28 | met.fit(x, y) 29 | 30 | # Predict on the whole data: 31 | p = met.predict(x) 32 | r2_train = r2_score(y, p) 33 | 34 | # Now, we use 10 fold cross-validation to estimate generalization error 35 | kf = KFold(len(x), n_folds=5) 36 | p = np.zeros_like(y) 37 | for train, test in kf: 38 | met.fit(x[train], y[train]) 39 | p[test] = met.predict(x[test]) 40 | 41 | r2_cv = r2_score(y, p) 42 | print('Method: {}'.format(name)) 43 | print('R2 on training: {}'.format(r2_train)) 44 | print('R2 on 5-fold CV: {}'.format(r2_cv)) 45 | print() 46 | print() 47 | -------------------------------------------------------------------------------- /ch07/data/.gitignore: -------------------------------------------------------------------------------- 1 | E2006.train 2 | -------------------------------------------------------------------------------- /ch07/data/download.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | curl -O http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression/E2006.train.bz2 3 | bunzip2 E2006.train.bz2 4 | -------------------------------------------------------------------------------- /ch07/figure1_2.py: -------------------------------------------------------------------------------- 1 | # This code is supporting material for the book 2 | # Building Machine Learning Systems with Python 3 | # by Willi Richert and Luis Pedro Coelho 4 | # published by PACKT Publishing 5 | # 6 | # It is made available under the MIT License 7 | 8 | import numpy as np 9 | from sklearn.datasets import load_boston 10 | from sklearn.linear_model import LinearRegression 11 | from sklearn.metrics import mean_squared_error, r2_score 12 | from matplotlib import pyplot as plt 13 | 14 | boston = load_boston() 15 | 16 | # Index number five in the number of rooms 17 | fig,ax = plt.subplots() 18 | ax.scatter(boston.data[:, 5], boston.target) 19 | ax.set_xlabel("Average number of rooms (RM)") 20 | ax.set_ylabel("House Price") 21 | 22 | x = boston.data[:, 5] 23 | # fit (used below) takes a two-dimensional array as input. 
We use np.atleast_2d 24 | # to convert from one to two dimensional, then transpose to make sure that the 25 | # format matches: 26 | x = np.transpose(np.atleast_2d(x)) 27 | 28 | y = boston.target 29 | 30 | lr = LinearRegression(fit_intercept=False) 31 | lr.fit(x, y) 32 | 33 | ax.plot([0, boston.data[:, 5].max() + 1], 34 | [0, lr.predict(boston.data[:, 5].max() + 1)], '-', lw=4) 35 | fig.savefig('Figure1.png') 36 | 37 | mse = mean_squared_error(y, lr.predict(x)) 38 | rmse = np.sqrt(mse) 39 | print('RMSE (no intercept): {}'.format(rmse)) 40 | 41 | # Repeat, but fitting an intercept this time: 42 | lr = LinearRegression(fit_intercept=True) 43 | 44 | lr.fit(x, y) 45 | 46 | fig,ax = plt.subplots() 47 | ax.set_xlabel("Average number of rooms (RM)") 48 | ax.set_ylabel("House Price") 49 | ax.scatter(boston.data[:, 5], boston.target) 50 | xmin = x.min() 51 | xmax = x.max() 52 | ax.plot([xmin, xmax], lr.predict([[xmin], [xmax]]) , '-', lw=4) 53 | fig.savefig('Figure2.png') 54 | 55 | mse = mean_squared_error(y, lr.predict(x)) 56 | print("Mean squared error (of training data): {:.3}".format(mse)) 57 | 58 | rmse = np.sqrt(mse) 59 | print("Root mean squared error (of training data): {:.3}".format(rmse)) 60 | 61 | cod = r2_score(y, lr.predict(x)) 62 | print('COD (on training data): {:.2}'.format(cod)) 63 | 64 | -------------------------------------------------------------------------------- /ch07/figure3.py: -------------------------------------------------------------------------------- 1 | # This code is supporting material for the book 2 | # Building Machine Learning Systems with Python 3 | # by Willi Richert and Luis Pedro Coelho 4 | # published by PACKT Publishing 5 | # 6 | # It is made available under the MIT License 7 | 8 | from sklearn.linear_model import LinearRegression, Lasso 9 | import numpy as np 10 | from sklearn.datasets import load_boston 11 | from matplotlib import pyplot as plt 12 | 13 | boston = load_boston() 14 | fig, ax = plt.subplots() 15 | ax.scatter(boston.data[:, 5], boston.target) 16 | ax.set_xlabel("Number of rooms (RM)") 17 | ax.set_ylabel("House Price") 18 | 19 | 20 | x = boston.data[:, 5] 21 | xmin = x.min() 22 | xmax = x.max() 23 | x = np.transpose(np.atleast_2d(x)) 24 | y = boston.target 25 | 26 | lr = LinearRegression() 27 | lr.fit(x, y) 28 | ax.plot([xmin, xmax], lr.predict([[xmin], [xmax]]), ':', lw=4, label='OLS model') 29 | 30 | las = Lasso() 31 | las.fit(x, y) 32 | ax.plot([xmin, xmax], las.predict([ [xmin], [xmax] ]), '-', lw=4, label='Lasso model') 33 | fig.savefig('Figure3.png') 34 | -------------------------------------------------------------------------------- /ch07/figure4.py: -------------------------------------------------------------------------------- 1 | # This code is supporting material for the book 2 | # Building Machine Learning Systems with Python 3 | # by Willi Richert and Luis Pedro Coelho 4 | # published by PACKT Publishing 5 | # 6 | # It is made available under the MIT License 7 | 8 | 9 | # This script plots prediction-vs-actual on training set for the Boston dataset 10 | # using OLS regression 11 | import numpy as np 12 | from sklearn.linear_model import LinearRegression 13 | from sklearn.datasets import load_boston 14 | from sklearn.metrics import mean_squared_error 15 | from matplotlib import pyplot as plt 16 | 17 | boston = load_boston() 18 | 19 | x = boston.data 20 | y = boston.target 21 | 22 | lr = LinearRegression() 23 | lr.fit(x, y) 24 | p = lr.predict(x) 25 | print("RMSE: {:.2}.".format(np.sqrt(mean_squared_error(y, p)))) 26 | 
print("R2: {:.2}.".format(lr.score(x, y))) 27 | fig,ax = plt.subplots() 28 | ax.scatter(p, y) 29 | ax.set_xlabel('Predicted price') 30 | ax.set_ylabel('Actual price') 31 | ax.plot([y.min(), y.max()], [y.min(), y.max()], lw=4) 32 | 33 | fig.savefig('Figure4.png') 34 | -------------------------------------------------------------------------------- /ch07/lasso_path_plot.py: -------------------------------------------------------------------------------- 1 | # This code is supporting material for the book 2 | # Building Machine Learning Systems with Python 3 | # by Willi Richert and Luis Pedro Coelho 4 | # published by PACKT Publishing 5 | # 6 | # It is made available under the MIT License 7 | 8 | from sklearn.linear_model import Lasso 9 | from sklearn.datasets import load_boston 10 | from matplotlib import pyplot as plt 11 | import numpy as np 12 | 13 | boston = load_boston() 14 | x = boston.data 15 | y = boston.target 16 | 17 | las = Lasso(normalize=1) 18 | alphas = np.logspace(-5, 2, 1000) 19 | alphas, coefs, _= las.path(x, y, alphas=alphas) 20 | 21 | fig,ax = plt.subplots() 22 | ax.plot(alphas, coefs.T) 23 | ax.set_xscale('log') 24 | ax.set_xlim(alphas.max(), alphas.min()) 25 | ax.set_xlabel('Lasso coefficient path as a function of alpha') 26 | ax.set_xlabel('Alpha') 27 | ax.set_ylabel('Coefficient weight') 28 | fig.savefig('Figure_LassoPath.png') 29 | 30 | -------------------------------------------------------------------------------- /ch07/lr10k.py: -------------------------------------------------------------------------------- 1 | # This code is supporting material for the book 2 | # Building Machine Learning Systems with Python 3 | # by Willi Richert and Luis Pedro Coelho 4 | # published by PACKT Publishing 5 | # 6 | # It is made available under the MIT License 7 | 8 | import numpy as np 9 | from sklearn.metrics import mean_squared_error, r2_score 10 | from sklearn.datasets import load_svmlight_file 11 | from sklearn.linear_model import LinearRegression 12 | from sklearn.cross_validation import KFold 13 | 14 | # Whether to use Elastic nets (otherwise, ordinary linear regression is used) 15 | 16 | # Load data: 17 | data, target = load_svmlight_file('data/E2006.train') 18 | 19 | lr = LinearRegression() 20 | 21 | # Compute error on training data to demonstrate that we can obtain near perfect 22 | # scores: 23 | 24 | lr.fit(data, target) 25 | pred = lr.predict(data) 26 | 27 | print('RMSE on training, {:.2}'.format(np.sqrt(mean_squared_error(target, pred)))) 28 | print('R2 on training, {:.2}'.format(r2_score(target, pred))) 29 | print('') 30 | 31 | pred = np.zeros_like(target) 32 | kf = KFold(len(target), n_folds=5) 33 | for train, test in kf: 34 | lr.fit(data[train], target[train]) 35 | pred[test] = lr.predict(data[test]) 36 | 37 | print('RMSE on testing (5 fold), {:.2}'.format(np.sqrt(mean_squared_error(target, pred)))) 38 | print('R2 on testing (5 fold), {:.2}'.format(r2_score(target, pred))) 39 | -------------------------------------------------------------------------------- /ch07/predict10k_en.py: -------------------------------------------------------------------------------- 1 | # This code is supporting material for the book 2 | # Building Machine Learning Systems with Python 3 | # by Willi Richert and Luis Pedro Coelho 4 | # published by PACKT Publishing 5 | # 6 | # It is made available under the MIT License 7 | 8 | import numpy as np 9 | from sklearn.datasets import load_svmlight_file 10 | from sklearn.cross_validation import KFold 11 | from sklearn.linear_model import 
ElasticNetCV, ElasticNet 12 | from sklearn.metrics import mean_squared_error, r2_score 13 | from matplotlib import pyplot as plt 14 | 15 | data, target = load_svmlight_file('data/E2006.train') 16 | 17 | # Edit the lines below if you want to switch method: 18 | # from sklearn.linear_model import Lasso 19 | # met = Lasso(alpha=0.1) 20 | met = ElasticNet(alpha=0.1) 21 | 22 | kf = KFold(len(target), n_folds=5) 23 | pred = np.zeros_like(target) 24 | for train, test in kf: 25 | met.fit(data[train], target[train]) 26 | pred[test] = met.predict(data[test]) 27 | 28 | print('[EN 0.1] RMSE on testing (5 fold), {:.2}'.format(np.sqrt(mean_squared_error(target, pred)))) 29 | print('[EN 0.1] R2 on testing (5 fold), {:.2}'.format(r2_score(target, pred))) 30 | print('') 31 | 32 | # Construct an ElasticNetCV object (use all available CPUs) 33 | met = ElasticNetCV(n_jobs=-1) 34 | 35 | kf = KFold(len(target), n_folds=5) 36 | pred = np.zeros_like(target) 37 | for train, test in kf: 38 | met.fit(data[train], target[train]) 39 | pred[test] = met.predict(data[test]) 40 | 41 | print('[EN CV] RMSE on testing (5 fold), {:.2}'.format(np.sqrt(mean_squared_error(target, pred)))) 42 | print('[EN CV] R2 on testing (5 fold), {:.2}'.format(r2_score(target, pred))) 43 | print('') 44 | 45 | met.fit(data, target) 46 | pred = met.predict(data) 47 | print('[EN CV] RMSE on training, {:.2}'.format(np.sqrt(mean_squared_error(target, pred)))) 48 | print('[EN CV] R2 on training, {:.2}'.format(r2_score(target, pred))) 49 | 50 | 51 | # Construct an ElasticNetCV object (use all available CPUs) 52 | met = ElasticNetCV(n_jobs=-1, l1_ratio=[.01, .05, .25, .5, .75, .95, .99]) 53 | 54 | kf = KFold(len(target), n_folds=5) 55 | pred = np.zeros_like(target) 56 | for train, test in kf: 57 | met.fit(data[train], target[train]) 58 | pred[test] = met.predict(data[test]) 59 | 60 | 61 | print('[EN CV l1_ratio] RMSE on testing (5 fold), {:.2}'.format(np.sqrt(mean_squared_error(target, pred)))) 62 | print('[EN CV l1_ratio] R2 on testing (5 fold), {:.2}'.format(r2_score(target, pred))) 63 | print('') 64 | 65 | 66 | fig, ax = plt.subplots() 67 | y = target 68 | ax.scatter(y, pred, c='k') 69 | ax.plot([-5,-1], [-5,-1], 'r-', lw=2) 70 | ax.set_xlabel('Actual value') 71 | ax.set_ylabel('Predicted value') 72 | fig.savefig('Figure_10k_scatter_EN_l1_ratio.png') 73 | 74 | -------------------------------------------------------------------------------- /ch08/README.rst: -------------------------------------------------------------------------------- 1 | ========= 2 | Chapter 8 3 | ========= 4 | 5 | Support code for *Chapter 8: Recommendations*. 6 | 7 | The code refers to the second edition of the book and has been 8 | significantly refactored compared to the first edition. 9 | 10 | Ratings Prediction 11 | ------------------ 12 | 13 | Note that since the partition of the data into training and testing is random, 14 | every time you run the code, the results will be different. 15 | 16 | 17 | load_ml100k.py 18 | Load data & partition into test/train 19 | norm.py 20 | Normalize the data 21 | corrneighbours.py 22 | Neighbour models based on correlation 23 | regression.py 24 | Regression models 25 | stacked.py 26 | Stacked predictions 27 | averaged.py 28 | Averaging of predictions (mentioned in book, but code is not shown there).
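All of the above can also be driven from a Python prompt. A minimal sketch
(assuming the MovieLens data has already been fetched with ``data/download.sh``)::

    import load_ml100k
    import stacked
    from sklearn import metrics

    train, test = load_ml100k.get_train_test(random_state=12)
    predicted = stacked.predict(train)
    print('R2 stacked: {:.2%}'.format(metrics.r2_score(
        test[test > 0], predicted[test > 0])))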
29 | 30 | Association Rule Mining 31 | ----------------------- 32 | 33 | Check the folder ``apriori/`` 34 | 35 | apriori/histogram.py 36 | Print a histogram of how many times each product was bought 37 | apriori/apriori.py 38 | Implementation of Apriori algorithm and association rule building 39 | apriori/apriori_example.py 40 | Example of Apriori algorithm in retail dataset 41 | 42 | -------------------------------------------------------------------------------- /ch08/all_correlations.py: -------------------------------------------------------------------------------- 1 | # This code is supporting material for the book 2 | # Building Machine Learning Systems with Python 3 | # by Willi Richert and Luis Pedro Coelho 4 | # published by PACKT Publishing 5 | # 6 | # It is made available under the MIT License 7 | 8 | import numpy as np 9 | 10 | def all_correlations(y, X): 11 | from scipy import spatial 12 | y = np.atleast_2d(y) 13 | sp = spatial.distance.cdist(X, y, 'correlation') 14 | # The "correlation distance" is 1 - corr(x,y); so we invert that to obtain the correlation 15 | return 1 - sp.ravel() 16 | 17 | # This is the version in the book (1st Edition): 18 | def all_correlations_book_version(bait, target): 19 | ''' 20 | corrs = all_correlations(bait, target) 21 | 22 | corrs[i] is the correlation between bait and target[i] 23 | ''' 24 | return np.array( 25 | [np.corrcoef(bait, c)[0, 1] 26 | for c in target]) 27 | 28 | # This is a faster, but harder to read, implementation: 29 | def all_correlations_fast_no_scipy(y, X): 30 | ''' 31 | Cs = all_correlations(y, X) 32 | 33 | Cs[i] = np.corrcoef(y, X[i])[0,1] 34 | ''' 35 | X = np.asanyarray(X, float) 36 | y = np.asanyarray(y, float) 37 | xy = np.dot(X, y) 38 | y_ = y.mean() 39 | ys_ = y.std() 40 | x_ = X.mean(1) 41 | xs_ = X.std(1) 42 | n = float(len(y)) 43 | ys_ += 1e-5 # Handle zeros in ys 44 | xs_ += 1e-5 # Handle zeros in x 45 | 46 | return (xy - x_ * y_ * n) / n / xs_ / ys_ 47 | 48 | 49 | -------------------------------------------------------------------------------- /ch08/apriori/.gitignore: -------------------------------------------------------------------------------- 1 | retail.dat.gz 2 | -------------------------------------------------------------------------------- /ch08/apriori/apriori.py: -------------------------------------------------------------------------------- 1 | # This code is supporting material for the book 2 | # Building Machine Learning Systems with Python 3 | # by Willi Richert and Luis Pedro Coelho 4 | # published by PACKT Publishing 5 | # 6 | # It is made available under the MIT License 7 | 8 | from collections import namedtuple 9 | 10 | 11 | def apriori(dataset, minsupport, maxsize): 12 | ''' 13 | freqsets, support = apriori(dataset, minsupport, maxsize) 14 | 15 | Parameters 16 | ---------- 17 | dataset : sequence of sequences 18 | input dataset 19 | minsupport : int 20 | Minimal support for frequent items 21 | maxsize : int 22 | Maximal size of frequent items to return 23 | 24 | Returns 25 | ------- 26 | freqsets : sequence of sequences 27 | support : dictionary 28 | This associates each itemset (represented as a frozenset) with a float 29 | (the support of that itemset) 30 | ''' 31 | from collections import defaultdict 32 | 33 | baskets = defaultdict(list) 34 | pointers = defaultdict(list) 35 | 36 | for i, ds in enumerate(dataset): 37 | for ell in ds: 38 | pointers[ell].append(i) 39 | baskets[frozenset([ell])].append(i) 40 | 41 | # Convert pointer items to frozensets to speed up operations later 42 | 
new_pointers = dict() 43 | for k in pointers: 44 | if len(pointers[k]) >= minsupport: 45 | new_pointers[k] = frozenset(pointers[k]) 46 | pointers = new_pointers 47 | for k in baskets: 48 | baskets[k] = frozenset(baskets[k]) 49 | 50 | 51 | # Valid are all elements whose support is >= minsupport 52 | valid = set() 53 | for el, c in baskets.items(): 54 | if len(c) >= minsupport: 55 | valid.update(el) 56 | 57 | # Itemsets at first iteration are simply all singleton with valid elements: 58 | itemsets = [frozenset([v]) for v in valid] 59 | freqsets = [] 60 | for i in range(maxsize - 1): 61 | print("At iteration {}, number of frequent baskets: {}".format( 62 | i, len(itemsets))) 63 | newsets = [] 64 | for it in itemsets: 65 | ccounts = baskets[it] 66 | 67 | for v, pv in pointers.items(): 68 | if v not in it: 69 | csup = (ccounts & pv) 70 | if len(csup) >= minsupport: 71 | new = frozenset(it | frozenset([v])) 72 | if new not in baskets: 73 | newsets.append(new) 74 | baskets[new] = csup 75 | freqsets.extend(itemsets) 76 | itemsets = newsets 77 | if not len(itemsets): 78 | break 79 | support = {} 80 | for k in baskets: 81 | support[k] = float(len(baskets[k])) 82 | return freqsets, support 83 | 84 | 85 | # A namedtuple to collect all values that may be interesting 86 | AssociationRule = namedtuple('AssociationRule', ['antecendent', 'consequent', 'base', 'py_x', 'lift']) 87 | 88 | def association_rules(dataset, freqsets, support, minlift): 89 | ''' 90 | for assoc_rule in association_rules(dataset, freqsets, support, minlift): 91 | ... 92 | 93 | This function takes the returns from ``apriori``. 94 | 95 | Parameters 96 | ---------- 97 | dataset : sequence of sequences 98 | input dataset 99 | freqsets : sequence of sequences 100 | support : dictionary 101 | minlift : int 102 | minimal lift of yielded rules 103 | 104 | Returns 105 | ------- 106 | assoc_rule : sequence of AssociationRule objects 107 | ''' 108 | nr_transactions = float(len(dataset)) 109 | freqsets = [f for f in freqsets if len(f) > 1] 110 | for fset in freqsets: 111 | for f in fset: 112 | consequent = frozenset([f]) 113 | antecendent = fset - consequent 114 | py_x = support[fset] / support[antecendent] 115 | base = support[consequent] / nr_transactions 116 | lift = py_x / base 117 | if lift > minlift: 118 | yield AssociationRule(antecendent, consequent, base, py_x, lift) 119 | 120 | -------------------------------------------------------------------------------- /ch08/apriori/apriori_example.py: -------------------------------------------------------------------------------- 1 | # This code is supporting material for the book 2 | # Building Machine Learning Systems with Python 3 | # by Willi Richert and Luis Pedro Coelho 4 | # published by PACKT Publishing 5 | # 6 | # It is made available under the MIT License 7 | 8 | from apriori import apriori, association_rules 9 | from gzip import GzipFile 10 | 11 | # Load dataset 12 | dataset = [[int(tok) for tok in line.strip().split()] 13 | for line in GzipFile('retail.dat.gz')] 14 | 15 | freqsets, support = apriori(dataset, 80, maxsize=16) 16 | rules = list(association_rules(dataset, freqsets, support, minlift=30.0)) 17 | 18 | rules.sort(key=(lambda ar: -ar.lift)) 19 | for ar in rules: 20 | print('{} -> {} (lift = {:.4})' 21 | .format(set(ar.antecendent), 22 | set(ar.consequent), 23 | ar.lift)) 24 | -------------------------------------------------------------------------------- /ch08/apriori/apriori_naive.py: -------------------------------------------------------------------------------- 1 | # 
This code is supporting material for the book 2 | # Building Machine Learning Systems with Python 3 | # by Willi Richert and Luis Pedro Coelho 4 | # published by PACKT Publishing 5 | # 6 | # It is made available under the MIT License 7 | 8 | from collections import defaultdict 9 | from itertools import chain 10 | from gzip import GzipFile 11 | minsupport = 80 12 | 13 | dataset = [[int(tok) for tok in line.strip().split()] 14 | for line in GzipFile('retail.dat.gz')] 15 | 16 | counts = defaultdict(int) 17 | for elem in chain(*dataset): 18 | counts[elem] += 1 19 | 20 | # Only elements that have at least minsupport should be considered. 21 | valid = set(el for el, c in counts.items() if (c >= minsupport)) 22 | 23 | # Filter the dataset to contain only valid elements 24 | # (This step is not strictly necessary, but will make the rest of the code 25 | # faster as the itemsets will be smaller): 26 | dataset = [[el for el in ds if (el in valid)] for ds in dataset] 27 | 28 | # Convert to frozenset for fast processing 29 | dataset = [frozenset(ds) for ds in dataset] 30 | 31 | itemsets = [frozenset([v]) for v in valid] 32 | freqsets = itemsets[:] 33 | for i in range(16): 34 | print("At iteration {}, number of frequent baskets: {}".format( 35 | i, len(itemsets))) 36 | nextsets = [] 37 | 38 | tested = set() 39 | for it in itemsets: 40 | for v in valid: 41 | if v not in it: 42 | # Create a new candidate set by adding v to it 43 | c = (it | frozenset([v])) 44 | 45 | # Check if we have tested it already: 46 | if c in tested: 47 | continue 48 | tested.add(c) 49 | 50 | # Count support by looping over dataset 51 | # This step is slow. 52 | # Check `apriori.py` for a better implementation. 53 | support_c = sum(1 for d in dataset if d.issuperset(c)) 54 | if support_c > minsupport: 55 | nextsets.append(c) 56 | freqsets.extend(nextsets) 57 | itemsets = nextsets 58 | if not len(itemsets): 59 | break 60 | print("Finished!") 61 | 62 | 63 | def rules_from_itemset(itemset, dataset, minlift=1.): 64 | nr_transactions = float(len(dataset)) 65 | for item in itemset: 66 | consequent = frozenset([item]) 67 | antecedent = itemset-consequent 68 | base = 0.0 69 | # acount: antecedent count 70 | acount = 0.0 71 | 72 | # ccount : consequent count 73 | ccount = 0.0 74 | for d in dataset: 75 | if item in d: base += 1 76 | if d.issuperset(itemset): ccount += 1 77 | if d.issuperset(antecedent): acount += 1 78 | base /= nr_transactions 79 | p_y_given_x = ccount/acount 80 | lift = p_y_given_x / base 81 | if lift > minlift: 82 | print('Rule {0} -> {1} has lift {2}' 83 | .format(antecedent, consequent,lift)) 84 | 85 | for itemset in freqsets: 86 | if len(itemset) > 1: 87 | rules_from_itemset(itemset, dataset, minlift=4.) 
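# Note: this naive variant recounts support by scanning the whole dataset for
# every candidate itemset, which is what makes it slow; apriori.py instead keeps,
# for each item, the set of transaction ids containing it and intersects those
# sets to obtain the support counts.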
88 | -------------------------------------------------------------------------------- /ch08/apriori/download.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | wget http://fimi.ua.ac.be/data/retail.dat.gz 3 | -------------------------------------------------------------------------------- /ch08/apriori/histogram.py: -------------------------------------------------------------------------------- 1 | # This code is supporting material for the book 2 | # Building Machine Learning Systems with Python 3 | # by Willi Richert and Luis Pedro Coelho 4 | # published by PACKT Publishing 5 | # 6 | # It is made available under the MIT License 7 | 8 | import numpy as np 9 | from collections import defaultdict 10 | from itertools import chain 11 | from gzip import GzipFile 12 | dataset = [[int(tok) for tok in line.strip().split()] 13 | for line in GzipFile('retail.dat.gz')] 14 | counts = defaultdict(int) 15 | for elem in chain(*dataset): 16 | counts[elem] += 1 17 | counts = np.array(list(counts.values())) 18 | bins = [1, 2, 4, 8, 16, 32, 64, 128, 512] 19 | print(' {0:11} | {1:12}'.format('Nr of baskets', 'Nr of products')) 20 | print('--------------------------------') 21 | for i in range(len(bins)): 22 | bot = bins[i] 23 | top = (bins[i + 1] if (i + 1) < len(bins) else 100000000000) 24 | print(' {0:4} - {1:3} | {2:12}'.format( 25 | bot, (top if top < 1000 else ''), np.sum((counts >= bot) & (counts < top)))) 26 | -------------------------------------------------------------------------------- /ch08/averaged.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import load_ml100k 3 | import regression 4 | import corrneighbours 5 | from sklearn import metrics 6 | import norm 7 | 8 | def predict(train): 9 | predicted0 = regression.predict(train) 10 | predicted1 = regression.predict(train.T).T 11 | predicted2 = corrneighbours.predict(train) 12 | predicted3 = corrneighbours.predict(train.T).T 13 | predicted4 = norm.predict(train) 14 | predicted5 = norm.predict(train.T).T 15 | stack = np.array([ 16 | predicted0, 17 | predicted1, 18 | predicted2, 19 | predicted3, 20 | predicted4, 21 | predicted5, 22 | ]) 23 | return stack.mean(0) 24 | 25 | 26 | def main(): 27 | train,test = load_ml100k.get_train_test(random_state=12) 28 | predicted = predict(train) 29 | r2 = metrics.r2_score(test[test > 0], predicted[test > 0]) 30 | print('R2 averaged: {:.2%}'.format(r2)) 31 | 32 | if __name__ == '__main__': 33 | main() 34 | -------------------------------------------------------------------------------- /ch08/corrneighbours.py: -------------------------------------------------------------------------------- 1 | # This code is supporting material for the book 2 | # Building Machine Learning Systems with Python 3 | # by Willi Richert and Luis Pedro Coelho 4 | # published by PACKT Publishing 5 | # 6 | # It is made available under the MIT License 7 | 8 | from __future__ import print_function 9 | import numpy as np 10 | from load_ml100k import get_train_test 11 | from scipy.spatial import distance 12 | from sklearn import metrics 13 | 14 | from norm import NormalizePositive 15 | 16 | def predict(otrain): 17 | binary = (otrain > 0) 18 | norm = NormalizePositive(axis=1) 19 | train = norm.fit_transform(otrain) 20 | 21 | dists = distance.pdist(binary, 'correlation') 22 | dists = distance.squareform(dists) 23 | 24 | neighbors = dists.argsort(axis=1) 25 | filled = train.copy() 26 | for u in range(filled.shape[0]): 27 | # 
n_u are the neighbors of user 28 | n_u = neighbors[u, 1:] 29 | for m in range(filled.shape[1]): 30 | # This code could be faster using numpy indexing trickery as the 31 | # cost of readibility (this is left as an exercise to the reader): 32 | revs = [train[neigh, m] 33 | for neigh in n_u 34 | if binary[neigh, m]] 35 | if len(revs): 36 | n = len(revs) 37 | n //= 2 38 | n += 1 39 | revs = revs[:n] 40 | filled[u,m] = np.mean(revs) 41 | 42 | return norm.inverse_transform(filled) 43 | 44 | def main(transpose_inputs=False): 45 | train, test = get_train_test(random_state=12) 46 | if transpose_inputs: 47 | train = train.T 48 | test = test.T 49 | 50 | predicted = predict(train) 51 | r2 = metrics.r2_score(test[test > 0], predicted[test > 0]) 52 | print('R2 score (binary {} neighbours): {:.1%}'.format( 53 | ('movie' if transpose_inputs else 'user'), 54 | r2)) 55 | 56 | if __name__ == '__main__': 57 | main() 58 | main(transpose_inputs=True) 59 | -------------------------------------------------------------------------------- /ch08/data/.gitignore: -------------------------------------------------------------------------------- 1 | retail.dat.gz 2 | ml-100k.zip 3 | /ml-100k/ 4 | -------------------------------------------------------------------------------- /ch08/data/download.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | curl -L -O http://files.grouplens.org/papers/ml-100k.zip 3 | unzip ml-100k.zip 4 | curl -L -O http://fimi.ua.ac.be/data/retail.dat.gz 5 | -------------------------------------------------------------------------------- /ch08/figure3.py: -------------------------------------------------------------------------------- 1 | # This code is supporting material for the book 2 | # Building Machine Learning Systems with Python 3 | # by Willi Richert and Luis Pedro Coelho 4 | # published by PACKT Publishing 5 | # 6 | # It is made available under the MIT License 7 | 8 | from load_ml100k import load 9 | from matplotlib import pyplot as plt 10 | data = load() 11 | plt.gray() 12 | plt.imshow(data[:200, :200], interpolation='nearest') 13 | plt.xlabel('User ID') 14 | plt.ylabel('Film ID') 15 | plt.savefig('Figure_08_03_DataMatrix.png') 16 | -------------------------------------------------------------------------------- /ch08/load_ml100k.py: -------------------------------------------------------------------------------- 1 | # This code is supporting material for the book 2 | # Building Machine Learning Systems with Python 3 | # by Willi Richert and Luis Pedro Coelho 4 | # published by PACKT Publishing 5 | # 6 | # It is made available under the MIT License 7 | 8 | def load(): 9 | '''Load ML-100k data 10 | 11 | Returns the review matrix as a numpy array''' 12 | import numpy as np 13 | from scipy import sparse 14 | from os import path 15 | 16 | if not path.exists('data/ml-100k/u.data'): 17 | raise IOError("Data has not been downloaded.\nTry the following:\n\n\tcd data\n\t./download.sh") 18 | 19 | # The input is in the form of a CSC sparse matrix, so it's a natural fit to 20 | # load the data, but we then convert to a more traditional array before 21 | # returning 22 | data = np.loadtxt('data/ml-100k/u.data') 23 | ij = data[:, :2] 24 | ij -= 1 # original data is in 1-based system 25 | values = data[:, 2] 26 | reviews = sparse.csc_matrix((values, ij.T)).astype(float) 27 | return reviews.toarray() 28 | 29 | def get_train_test(reviews=None, random_state=None): 30 | '''Split data into training & testing 31 | 32 | Parameters 33 | ---------- 
34 | reviews : ndarray, optional 35 | Input data 36 | 37 | Returns 38 | ------- 39 | train : ndarray 40 | training data 41 | test : ndarray 42 | testing data 43 | ''' 44 | import numpy as np 45 | import random 46 | r = random.Random(random_state) 47 | 48 | if reviews is None: 49 | reviews = load() 50 | U,M = np.where(reviews) 51 | test_idxs = np.array(r.sample(range(len(U)), len(U)//10)) 52 | train = reviews.copy() 53 | train[U[test_idxs], M[test_idxs]] = 0 54 | 55 | test = np.zeros_like(reviews) 56 | test[U[test_idxs], M[test_idxs]] = reviews[U[test_idxs], M[test_idxs]] 57 | 58 | return train, test 59 | 60 | -------------------------------------------------------------------------------- /ch08/norm.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class NormalizePositive(object): 4 | 5 | def __init__(self, axis=0): 6 | self.axis = axis 7 | 8 | def fit(self, features, y=None): 9 | # count features that are greater than zero in axis `self.axis`: 10 | if self.axis == 1: 11 | features = features.T 12 | binary = (features > 0) 13 | count = binary.sum(axis=0) 14 | 15 | # to avoid division by zero, set zero counts to one: 16 | count[count == 0] = 1. 17 | 18 | self.mean = features.sum(axis=0)/count 19 | 20 | # Compute variance by average squared difference to the mean, but only 21 | # consider differences where binary is True (i.e., where there was a 22 | # true rating): 23 | diff = (features - self.mean) * binary 24 | diff **= 2 25 | # regularize the estimate of std by adding 0.1 26 | self.std = np.sqrt(0.1 + diff.sum(axis=0)/count) 27 | return self 28 | 29 | def transform(self, features): 30 | if self.axis == 1: 31 | features = features.T 32 | binary = (features > 0) 33 | features = features - self.mean 34 | features /= self.std 35 | features *= binary 36 | if self.axis == 1: 37 | features = features.T 38 | return features 39 | 40 | def inverse_transform(self, features, copy=True): 41 | if copy: 42 | features = features.copy() 43 | if self.axis == 1: 44 | features = features.T 45 | features *= self.std 46 | features += self.mean 47 | if self.axis == 1: 48 | features = features.T 49 | return features 50 | 51 | def fit_transform(self, features): 52 | return self.fit(features).transform(features) 53 | 54 | 55 | def predict(train): 56 | norm = NormalizePositive() 57 | train = norm.fit_transform(train) 58 | return norm.inverse_transform(train * 0.) 
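# Note on predict() above: the normalized ratings are multiplied by zero, so
# inverse_transform only re-adds the stored means -- every entry is predicted as
# the mean rating along the normalization axis. This is the simplest baseline,
# also reused by the averaged and stacked predictors.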
59 | 60 | 61 | def main(transpose_inputs=False): 62 | from load_ml100k import get_train_test 63 | from sklearn import metrics 64 | train,test = get_train_test(random_state=12) 65 | if transpose_inputs: 66 | train = train.T 67 | test = test.T 68 | predicted = predict(train) 69 | r2 = metrics.r2_score(test[test > 0], predicted[test > 0]) 70 | print('R2 score ({} normalization): {:.1%}'.format( 71 | ('movie' if transpose_inputs else 'user'), 72 | r2)) 73 | if __name__ == '__main__': 74 | main() 75 | main(transpose_inputs=True) 76 | -------------------------------------------------------------------------------- /ch08/regression.py: -------------------------------------------------------------------------------- 1 | # This code is supporting material for the book 2 | # Building Machine Learning Systems with Python 3 | # by Willi Richert and Luis Pedro Coelho 4 | # published by PACKT Publishing 5 | # 6 | # It is made available under the MIT License 7 | 8 | import numpy as np 9 | from sklearn.linear_model import ElasticNetCV 10 | from norm import NormalizePositive 11 | from sklearn import metrics 12 | 13 | 14 | def predict(train): 15 | binary = (train > 0) 16 | reg = ElasticNetCV(fit_intercept=True, alphas=[ 17 | 0.0125, 0.025, 0.05, .125, .25, .5, 1., 2., 4.]) 18 | norm = NormalizePositive() 19 | train = norm.fit_transform(train) 20 | 21 | filled = train.copy() 22 | # iterate over all users 23 | for u in range(train.shape[0]): 24 | # remove the current user for training 25 | curtrain = np.delete(train, u, axis=0) 26 | bu = binary[u] 27 | if np.sum(bu) > 5: 28 | reg.fit(curtrain[:,bu].T, train[u, bu]) 29 | 30 | # Fill the values that were not there already 31 | filled[u, ~bu] = reg.predict(curtrain[:,~bu].T) 32 | return norm.inverse_transform(filled) 33 | 34 | 35 | def main(transpose_inputs=False): 36 | from load_ml100k import get_train_test 37 | train,test = get_train_test(random_state=12) 38 | if transpose_inputs: 39 | train = train.T 40 | test = test.T 41 | filled = predict(train) 42 | r2 = metrics.r2_score(test[test > 0], filled[test > 0]) 43 | 44 | print('R2 score ({} regression): {:.1%}'.format( 45 | ('movie' if transpose_inputs else 'user'), 46 | r2)) 47 | 48 | if __name__ == '__main__': 49 | main() 50 | main(transpose_inputs=True) 51 | -------------------------------------------------------------------------------- /ch08/similar_movie.py: -------------------------------------------------------------------------------- 1 | # This code is supporting material for the book 2 | # Building Machine Learning Systems with Python 3 | # by Willi Richert and Luis Pedro Coelho 4 | # published by PACKT Publishing 5 | # 6 | # It is made available under the MIT License 7 | 8 | from __future__ import print_function 9 | import numpy as np 10 | 11 | 12 | def nn_movie(ureviews, reviews, uid, mid, k=1): 13 | '''Movie neighbor based classifier 14 | 15 | Parameters 16 | ---------- 17 | ureviews : ndarray 18 | reviews : ndarray 19 | uid : int 20 | index of user 21 | mid : int 22 | index of movie 23 | k : int 24 | index of neighbor to return 25 | 26 | Returns 27 | ------- 28 | pred : float 29 | ''' 30 | X = ureviews 31 | y = ureviews[mid].copy() 32 | y -= y.mean() 33 | y /= (y.std() + 1e-5) 34 | corrs = np.dot(X, y) 35 | likes = corrs.argsort() 36 | likes = likes[::-1] 37 | c = 0 38 | pred = 3. 
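# (3.0 is the fallback prediction -- the midpoint of the 1-5 rating scale --
# returned if no rated neighbour is found in the loop below.)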
39 | for ell in likes: 40 | if ell == mid: 41 | continue 42 | if reviews[uid, ell] > 0: 43 | pred = reviews[uid, ell] 44 | if c == k: 45 | return pred 46 | c += 1 47 | return pred 48 | 49 | 50 | def all_estimates(reviews, k=1): 51 | '''Estimate all review ratings 52 | ''' 53 | reviews = reviews.astype(float) 54 | k -= 1 55 | nusers, nmovies = reviews.shape 56 | estimates = np.zeros_like(reviews) 57 | for u in range(nusers): 58 | ureviews = np.delete(reviews, u, axis=0) 59 | ureviews -= ureviews.mean(0) 60 | ureviews /= (ureviews.std(0) + 1e-5) 61 | ureviews = ureviews.T.copy() 62 | for m in np.where(reviews[u] > 0)[0]: 63 | estimates[u, m] = nn_movie(ureviews, reviews, u, m, k) 64 | return estimates 65 | 66 | if __name__ == '__main__': 67 | from load_ml100k import load 68 | reviews = load() 69 | estimates = all_estimates(reviews) 70 | error = (estimates - reviews) 71 | error **= 2 72 | error = error[reviews > 0] 73 | rmse = np.sqrt(error.mean()) 74 | print("RMSE is {0}.".format(rmse)) 75 | -------------------------------------------------------------------------------- /ch08/stacked.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import load_ml100k 3 | import regression 4 | import corrneighbours 5 | from sklearn import linear_model, metrics 6 | import norm 7 | 8 | def predict(train): 9 | tr_train,tr_test = load_ml100k.get_train_test(train, random_state=34) 10 | tr_predicted0 = regression.predict(tr_train) 11 | tr_predicted1 = regression.predict(tr_train.T).T 12 | tr_predicted2 = corrneighbours.predict(tr_train) 13 | tr_predicted3 = corrneighbours.predict(tr_train.T).T 14 | tr_predicted4 = norm.predict(tr_train) 15 | tr_predicted5 = norm.predict(tr_train.T).T 16 | stack_tr = np.array([ 17 | tr_predicted0[tr_test > 0], 18 | tr_predicted1[tr_test > 0], 19 | tr_predicted2[tr_test > 0], 20 | tr_predicted3[tr_test > 0], 21 | tr_predicted4[tr_test > 0], 22 | tr_predicted5[tr_test > 0], 23 | ]).T 24 | 25 | lr = linear_model.LinearRegression() 26 | lr.fit(stack_tr, tr_test[tr_test > 0]) 27 | 28 | stack_te = np.array([ 29 | tr_predicted0.ravel(), 30 | tr_predicted1.ravel(), 31 | tr_predicted2.ravel(), 32 | tr_predicted3.ravel(), 33 | tr_predicted4.ravel(), 34 | tr_predicted5.ravel(), 35 | ]).T 36 | 37 | return lr.predict(stack_te).reshape(train.shape) 38 | 39 | 40 | def main(): 41 | train,test = load_ml100k.get_train_test(random_state=12) 42 | predicted = predict(train) 43 | r2 = metrics.r2_score(test[test > 0], predicted[test > 0]) 44 | print('R2 stacked: {:.2%}'.format(r2)) 45 | 46 | if __name__ == '__main__': 47 | main() 48 | -------------------------------------------------------------------------------- /ch09/01_fft_based_classifier.py: -------------------------------------------------------------------------------- 1 | # This code is supporting material for the book 2 | # Building Machine Learning Systems with Python 3 | # by Willi Richert and Luis Pedro Coelho 4 | # published by PACKT Publishing 5 | # 6 | # It is made available under the MIT License 7 | 8 | import numpy as np 9 | from collections import defaultdict 10 | 11 | from sklearn.metrics import precision_recall_curve, roc_curve 12 | from sklearn.metrics import auc 13 | from sklearn.cross_validation import ShuffleSplit 14 | 15 | from sklearn.metrics import confusion_matrix 16 | 17 | from utils import plot_pr, plot_roc, plot_confusion_matrix, GENRE_LIST 18 | 19 | from fft import read_fft 20 | 21 | genre_list = GENRE_LIST 22 | 23 | 24 | def train_model(clf_factory, X, Y, 
name, plot=False): 25 | labels = np.unique(Y) 26 | 27 | cv = ShuffleSplit( 28 | n=len(X), n_iter=1, test_size=0.3, indices=True, random_state=0) 29 | 30 | train_errors = [] 31 | test_errors = [] 32 | 33 | scores = [] 34 | pr_scores = defaultdict(list) 35 | precisions, recalls, thresholds = defaultdict( 36 | list), defaultdict(list), defaultdict(list) 37 | 38 | roc_scores = defaultdict(list) 39 | tprs = defaultdict(list) 40 | fprs = defaultdict(list) 41 | 42 | clfs = [] # just to later get the median 43 | 44 | cms = [] 45 | 46 | for train, test in cv: 47 | X_train, y_train = X[train], Y[train] 48 | X_test, y_test = X[test], Y[test] 49 | 50 | clf = clf_factory() 51 | clf.fit(X_train, y_train) 52 | clfs.append(clf) 53 | 54 | train_score = clf.score(X_train, y_train) 55 | test_score = clf.score(X_test, y_test) 56 | scores.append(test_score) 57 | 58 | train_errors.append(1 - train_score) 59 | test_errors.append(1 - test_score) 60 | 61 | y_pred = clf.predict(X_test) 62 | cm = confusion_matrix(y_test, y_pred) 63 | cms.append(cm) 64 | 65 | for label in labels: 66 | y_label_test = np.asarray(y_test == label, dtype=int) 67 | proba = clf.predict_proba(X_test) 68 | proba_label = proba[:, label] 69 | 70 | precision, recall, pr_thresholds = precision_recall_curve( 71 | y_label_test, proba_label) 72 | pr_scores[label].append(auc(recall, precision)) 73 | precisions[label].append(precision) 74 | recalls[label].append(recall) 75 | thresholds[label].append(pr_thresholds) 76 | 77 | fpr, tpr, roc_thresholds = roc_curve(y_label_test, proba_label) 78 | roc_scores[label].append(auc(fpr, tpr)) 79 | tprs[label].append(tpr) 80 | fprs[label].append(fpr) 81 | 82 | if plot: 83 | for label in labels: 84 | print("Plotting %s" % genre_list[label]) 85 | scores_to_sort = roc_scores[label] 86 | median = np.argsort(scores_to_sort)[len(scores_to_sort) / 2] 87 | 88 | desc = "%s %s" % (name, genre_list[label]) 89 | plot_pr(pr_scores[label][median], desc, precisions[label][median], 90 | recalls[label][median], label='%s vs rest' % genre_list[label]) 91 | plot_roc(roc_scores[label][median], desc, tprs[label][median], 92 | fprs[label][median], label='%s vs rest' % genre_list[label]) 93 | 94 | all_pr_scores = np.asarray(pr_scores.values()).flatten() 95 | summary = (np.mean(scores), np.std(scores), 96 | np.mean(all_pr_scores), np.std(all_pr_scores)) 97 | print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary) 98 | 99 | return np.mean(train_errors), np.mean(test_errors), np.asarray(cms) 100 | 101 | 102 | def create_model(): 103 | from sklearn.linear_model.logistic import LogisticRegression 104 | clf = LogisticRegression() 105 | 106 | return clf 107 | 108 | 109 | if __name__ == "__main__": 110 | X, y = read_fft(genre_list) 111 | 112 | train_avg, test_avg, cms = train_model( 113 | create_model, X, y, "Log Reg FFT", plot=True) 114 | 115 | cm_avg = np.mean(cms, axis=0) 116 | cm_norm = cm_avg / np.sum(cm_avg, axis=0) 117 | 118 | plot_confusion_matrix(cm_norm, genre_list, "fft", 119 | "Confusion matrix of an FFT based classifier") 120 | -------------------------------------------------------------------------------- /ch09/02_ceps_based_classifier.py: -------------------------------------------------------------------------------- 1 | # This code is supporting material for the book 2 | # Building Machine Learning Systems with Python 3 | # by Willi Richert and Luis Pedro Coelho 4 | # published by PACKT Publishing 5 | # 6 | # It is made available under the MIT License 7 | 8 | import numpy as np 9 | from collections import defaultdict 10 | 11 | from 
sklearn.metrics import precision_recall_curve, roc_curve 12 | from sklearn.metrics import auc 13 | from sklearn.cross_validation import ShuffleSplit 14 | 15 | from sklearn.metrics import confusion_matrix 16 | 17 | from utils import plot_roc, plot_confusion_matrix, GENRE_LIST 18 | 19 | from ceps import read_ceps 20 | 21 | 22 | genre_list = GENRE_LIST 23 | 24 | 25 | def train_model(clf_factory, X, Y, name, plot=False): 26 | labels = np.unique(Y) 27 | 28 | cv = ShuffleSplit( 29 | n=len(X), n_iter=1, test_size=0.3, indices=True, random_state=0) 30 | 31 | train_errors = [] 32 | test_errors = [] 33 | 34 | scores = [] 35 | pr_scores = defaultdict(list) 36 | precisions, recalls, thresholds = defaultdict( 37 | list), defaultdict(list), defaultdict(list) 38 | 39 | roc_scores = defaultdict(list) 40 | tprs = defaultdict(list) 41 | fprs = defaultdict(list) 42 | 43 | clfs = [] # just to later get the median 44 | 45 | cms = [] 46 | 47 | for train, test in cv: 48 | X_train, y_train = X[train], Y[train] 49 | X_test, y_test = X[test], Y[test] 50 | 51 | clf = clf_factory() 52 | clf.fit(X_train, y_train) 53 | clfs.append(clf) 54 | 55 | train_score = clf.score(X_train, y_train) 56 | test_score = clf.score(X_test, y_test) 57 | scores.append(test_score) 58 | 59 | train_errors.append(1 - train_score) 60 | test_errors.append(1 - test_score) 61 | 62 | y_pred = clf.predict(X_test) 63 | cm = confusion_matrix(y_test, y_pred) 64 | cms.append(cm) 65 | 66 | for label in labels: 67 | y_label_test = np.asarray(y_test == label, dtype=int) 68 | proba = clf.predict_proba(X_test) 69 | proba_label = proba[:, label] 70 | 71 | precision, recall, pr_thresholds = precision_recall_curve( 72 | y_label_test, proba_label) 73 | pr_scores[label].append(auc(recall, precision)) 74 | precisions[label].append(precision) 75 | recalls[label].append(recall) 76 | thresholds[label].append(pr_thresholds) 77 | 78 | fpr, tpr, roc_thresholds = roc_curve(y_label_test, proba_label) 79 | roc_scores[label].append(auc(fpr, tpr)) 80 | tprs[label].append(tpr) 81 | fprs[label].append(fpr) 82 | 83 | if plot: 84 | for label in labels: 85 | print("Plotting %s" % genre_list[label]) 86 | scores_to_sort = roc_scores[label] 87 | median = np.argsort(scores_to_sort)[len(scores_to_sort) / 2] 88 | 89 | desc = "%s %s" % (name, genre_list[label]) 90 | plot_roc(roc_scores[label][median], desc, tprs[label][median], 91 | fprs[label][median], label='%s vs rest' % genre_list[label]) 92 | 93 | all_pr_scores = np.asarray(pr_scores.values()).flatten() 94 | summary = (np.mean(scores), np.std(scores), 95 | np.mean(all_pr_scores), np.std(all_pr_scores)) 96 | print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary) 97 | 98 | return np.mean(train_errors), np.mean(test_errors), np.asarray(cms) 99 | 100 | 101 | def create_model(): 102 | from sklearn.linear_model.logistic import LogisticRegression 103 | clf = LogisticRegression() 104 | 105 | return clf 106 | 107 | 108 | if __name__ == "__main__": 109 | X, y = read_ceps(genre_list) 110 | 111 | train_avg, test_avg, cms = train_model( 112 | create_model, X, y, "Log Reg CEPS", plot=True) 113 | 114 | cm_avg = np.mean(cms, axis=0) 115 | cm_norm = cm_avg / np.sum(cm_avg, axis=0) 116 | 117 | plot_confusion_matrix(cm_norm, genre_list, "ceps", 118 | "Confusion matrix of a CEPS based classifier") 119 | -------------------------------------------------------------------------------- /ch09/Makefile: -------------------------------------------------------------------------------- 1 | CHART_DIR = charts 2 | 3 | fft: 4 | python 01_fft_based_classifier.py 5 | 6 | 
ceps: 7 | python 02_ceps_based_classifier.py 8 | 9 | rocs_fft.png: 10 | convert $(CHART_DIR)/roc_Log_Reg_FFT_classical.png $(CHART_DIR)/roc_Log_Reg_FFT_jazz.png +append row1.png 11 | convert $(CHART_DIR)/roc_Log_Reg_FFT_country.png $(CHART_DIR)/roc_Log_Reg_FFT_pop.png +append row2.png 12 | convert $(CHART_DIR)/roc_Log_Reg_FFT_rock.png $(CHART_DIR)/roc_Log_Reg_FFT_metal.png +append row3.png 13 | convert row1.png row2.png row3.png -append $(CHART_DIR)/rocs_fft.png 14 | 15 | rocs_ceps.png: 16 | convert $(CHART_DIR)/roc_Log_Reg_CEPS_classical.png $(CHART_DIR)/roc_Log_Reg_CEPS_jazz.png +append row1.png 17 | convert $(CHART_DIR)/roc_Log_Reg_CEPS_country.png $(CHART_DIR)/roc_Log_Reg_CEPS_pop.png +append row2.png 18 | convert $(CHART_DIR)/roc_Log_Reg_CEPS_rock.png $(CHART_DIR)/roc_Log_Reg_CEPS_metal.png +append row3.png 19 | convert row1.png row2.png row3.png -append $(CHART_DIR)/rocs_ceps.png 20 | 21 | roc_pr.png: fft 22 | convert $(CHART_DIR)/pr_Log_Reg_FFT_country.png $(CHART_DIR)/roc_Log_Reg_FFT_country.png +append roc_pr.png 23 | 24 | sox sine_a.wav sine_b.wav sine_mix.wav: 25 | sox --null -r 22050 sine_a.wav synth 0.2 sine 400 26 | sox --null -r 22050 sine_b.wav synth 0.2 sine 3000 27 | sox --combine mix --volume 1 sine_b.wav --volume 0.5 sine_a.wav sine_mix.wav 28 | 29 | fft_demo: sine_a.wav sine_b.wav sine_mix.wav 30 | python fft.py 31 | convert sine_a_wav_fft.png sine_b_wav_fft.png sine_mix_wav_fft.png -append fft_demo.png 32 | 33 | python fft.py /media/sf_P/pymlbook-data/09-genre-class/genres/jazz/jazz.00012.wav 34 | mv jazz.00012_wav_fft.png fft_example.png 35 | 36 | 37 | -------------------------------------------------------------------------------- /ch09/ceps.py: -------------------------------------------------------------------------------- 1 | # This code is supporting material for the book 2 | # Building Machine Learning Systems with Python 3 | # by Willi Richert and Luis Pedro Coelho 4 | # published by PACKT Publishing 5 | # 6 | # It is made available under the MIT License 7 | 8 | import os 9 | import glob 10 | import sys 11 | 12 | import numpy as np 13 | import scipy 14 | import scipy.io.wavfile 15 | from scikits.talkbox.features import mfcc 16 | 17 | from utils import GENRE_DIR 18 | 19 | 20 | def write_ceps(ceps, fn): 21 | """ 22 | Write the MFCC to separate files to speed up processing. 
23 | """ 24 | base_fn, ext = os.path.splitext(fn) 25 | data_fn = base_fn + ".ceps" 26 | np.save(data_fn, ceps) 27 | print("Written %s"%data_fn) 28 | 29 | 30 | def create_ceps(fn): 31 | sample_rate, X = scipy.io.wavfile.read(fn) 32 | 33 | ceps, mspec, spec = mfcc(X) 34 | write_ceps(ceps, fn) 35 | 36 | 37 | def read_ceps(genre_list, base_dir=GENRE_DIR): 38 | X = [] 39 | y = [] 40 | for label, genre in enumerate(genre_list): 41 | for fn in glob.glob(os.path.join(base_dir, genre, "*.ceps.npy")): 42 | ceps = np.load(fn) 43 | num_ceps = len(ceps) 44 | X.append( 45 | np.mean(ceps[int(num_ceps / 10):int(num_ceps * 9 / 10)], axis=0)) 46 | y.append(label) 47 | 48 | return np.array(X), np.array(y) 49 | 50 | 51 | if __name__ == "__main__": 52 | os.chdir(GENRE_DIR) 53 | glob_wav = os.path.join(sys.argv[1], "*.wav") 54 | print(glob_wav) 55 | for fn in glob.glob(glob_wav): 56 | create_ceps(fn) 57 | -------------------------------------------------------------------------------- /ch09/fft.py: -------------------------------------------------------------------------------- 1 | # This code is supporting material for the book 2 | # Building Machine Learning Systems with Python 3 | # by Willi Richert and Luis Pedro Coelho 4 | # published by PACKT Publishing 5 | # 6 | # It is made available under the MIT License 7 | 8 | import sys 9 | import os 10 | import glob 11 | 12 | import numpy as np 13 | import scipy 14 | import scipy.io.wavfile 15 | 16 | from utils import GENRE_DIR, CHART_DIR 17 | 18 | import matplotlib.pyplot as plt 19 | from matplotlib.ticker import EngFormatter 20 | 21 | 22 | def write_fft(fft_features, fn): 23 | """ 24 | Write the FFT features to separate files to speed up processing. 25 | """ 26 | base_fn, ext = os.path.splitext(fn) 27 | data_fn = base_fn + ".fft" 28 | 29 | np.save(data_fn, fft_features) 30 | print("Written "%data_fn) 31 | 32 | 33 | def create_fft(fn): 34 | sample_rate, X = scipy.io.wavfile.read(fn) 35 | 36 | fft_features = abs(scipy.fft(X)[:1000]) 37 | write_fft(fft_features, fn) 38 | 39 | 40 | def read_fft(genre_list, base_dir=GENRE_DIR): 41 | X = [] 42 | y = [] 43 | for label, genre in enumerate(genre_list): 44 | genre_dir = os.path.join(base_dir, genre, "*.fft.npy") 45 | file_list = glob.glob(genre_dir) 46 | assert(file_list), genre_dir 47 | for fn in file_list: 48 | fft_features = np.load(fn) 49 | 50 | X.append(fft_features[:2000]) 51 | y.append(label) 52 | 53 | return np.array(X), np.array(y) 54 | 55 | 56 | def plot_wav_fft(wav_filename, desc=None): 57 | plt.clf() 58 | plt.figure(num=None, figsize=(6, 4)) 59 | sample_rate, X = scipy.io.wavfile.read(wav_filename) 60 | spectrum = np.fft.fft(X) 61 | freq = np.fft.fftfreq(len(X), 1.0 / sample_rate) 62 | 63 | plt.subplot(211) 64 | num_samples = 200.0 65 | plt.xlim(0, num_samples / sample_rate) 66 | plt.xlabel("time [s]") 67 | plt.title(desc or wav_filename) 68 | plt.plot(np.arange(num_samples) / sample_rate, X[:num_samples]) 69 | plt.grid(True) 70 | 71 | plt.subplot(212) 72 | plt.xlim(0, 5000) 73 | plt.xlabel("frequency [Hz]") 74 | plt.xticks(np.arange(5) * 1000) 75 | if desc: 76 | desc = desc.strip() 77 | fft_desc = desc[0].lower() + desc[1:] 78 | else: 79 | fft_desc = wav_filename 80 | plt.title("FFT of %s" % fft_desc) 81 | plt.plot(freq, abs(spectrum), linewidth=5) 82 | plt.grid(True) 83 | 84 | plt.tight_layout() 85 | 86 | rel_filename = os.path.split(wav_filename)[1] 87 | plt.savefig("%s_wav_fft.png" % os.path.splitext(rel_filename)[0], 88 | bbox_inches='tight') 89 | 90 | plt.show() 91 | 92 | 93 | def plot_wav_fft_demo(): 94 
| plot_wav_fft("sine_a.wav", "400Hz sine wave") 95 | plot_wav_fft("sine_b.wav", "3,000Hz sine wave") 96 | plot_wav_fft("sine_mix.wav", "Mixed sine wave") 97 | 98 | 99 | def plot_specgram(ax, fn): 100 | sample_rate, X = scipy.io.wavfile.read(fn) 101 | ax.specgram(X, Fs=sample_rate, xextent=(0, 30)) 102 | 103 | 104 | def plot_specgrams(base_dir=CHART_DIR): 105 | """ 106 | Plot a bunch of spectrograms of wav files in different genres 107 | """ 108 | plt.clf() 109 | genres = ["classical", "jazz", "country", "pop", "rock", "metal"] 110 | num_files = 3 111 | f, axes = plt.subplots(len(genres), num_files) 112 | 113 | for genre_idx, genre in enumerate(genres): 114 | for idx, fn in enumerate(glob.glob(os.path.join(GENRE_DIR, genre, "*.wav"))): 115 | if idx == num_files: 116 | break 117 | axis = axes[genre_idx, idx] 118 | axis.yaxis.set_major_formatter(EngFormatter()) 119 | axis.set_title("%s song %i" % (genre, idx + 1)) 120 | plot_specgram(axis, fn) 121 | 122 | specgram_file = os.path.join(base_dir, "Spectrogram_Genres.png") 123 | plt.savefig(specgram_file, bbox_inches="tight") 124 | 125 | plt.show() 126 | 127 | 128 | if __name__ == "__main__": 129 | # for fn in glob.glob(os.path.join(sys.argv[1], "*.wav")): 130 | # create_fft(fn) 131 | 132 | # plot_decomp() 133 | 134 | if len(sys.argv) > 1: 135 | plot_wav_fft(sys.argv[1], desc="some sample song") 136 | else: 137 | plot_wav_fft_demo() 138 | 139 | plot_specgrams() 140 | -------------------------------------------------------------------------------- /ch10/.gitignore: -------------------------------------------------------------------------------- 1 | AnimTransDistr/ 2 | -------------------------------------------------------------------------------- /ch10/README.rst: -------------------------------------------------------------------------------- 1 | ========== 2 | Chapter 10 3 | ========== 4 | 5 | Support code for *Chapter 10: Pattern Recognition & Computer Vision* 6 | 7 | Data 8 | ---- 9 | 10 | This chapter relies on a publicly available dataset (which can be downloaded 11 | using the ``download.sh`` script inside the ``data/`` directory) as well the 12 | dataset that is packaged with the repository at ``../SimpleImageDataset/``. 13 | 14 | Running ``download.sh`` will retrieve the other dataset into a directory 15 | ``AnimTransDistr/``. 16 | 17 | Scripts 18 | ------- 19 | 20 | chapter.py 21 | Code as written in the book. 22 | thresholded_figure.py 23 | Computes the thresholded figures, including after Gaussian blurring 24 | lena-ring.py 25 | Lena image with center in focus and blurred edges 26 | figure10.py 27 | Just paste two images next to each others 28 | features.py 29 | Contains the color histogram function from the book as well as a simple 30 | wrapper around ``mahotas.texture.haralick`` 31 | simple_classification.py 32 | Classify SimpleImageDataset with texture features + color histogram features 33 | large_classification.py 34 | Classify ``AnimTransDistr`` with both texture and SURF features. 35 | neighbors.py 36 | Computes image neighbors as well as the neighbor figure from the book. 
37 | 38 | -------------------------------------------------------------------------------- /ch10/download.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | mkdir -p AnimTransDistr 4 | cd AnimTransDistr 5 | curl -O http://vision.stanford.edu/Datasets/AnimTransDistr.rar 6 | unrar x AnimTransDistr.rar 7 | # The following file is a weird file: 8 | rm Anims/104034.jpg 9 | -------------------------------------------------------------------------------- /ch10/features.py: -------------------------------------------------------------------------------- 1 | # This code is supporting material for the book 2 | # Building Machine Learning Systems with Python 3 | # by Willi Richert and Luis Pedro Coelho 4 | # published by PACKT Publishing 5 | # 6 | # It is made available under the MIT License 7 | 8 | import numpy as np 9 | import mahotas as mh 10 | 11 | 12 | def edginess_sobel(image): 13 | '''Measure the "edginess" of an image 14 | 15 | image should be a 2d numpy array (an image) 16 | 17 | Returns a floating point value which is higher the "edgier" the image is. 18 | 19 | ''' 20 | edges = mh.sobel(image, just_filter=True) 21 | edges = edges.ravel() 22 | return np.sqrt(np.dot(edges, edges)) 23 | 24 | def texture(im): 25 | '''Compute features for an image 26 | 27 | Parameters 28 | ---------- 29 | im : ndarray 30 | 31 | Returns 32 | ------- 33 | fs : ndarray 34 | 1-D array of features 35 | ''' 36 | im = im.astype(np.uint8) 37 | return mh.features.haralick(im).ravel() 38 | 39 | 40 | def chist(im): 41 | '''Compute color histogram of input image 42 | 43 | Parameters 44 | ---------- 45 | im : ndarray 46 | should be an RGB image 47 | 48 | Returns 49 | ------- 50 | c : ndarray 51 | 1-D array of histogram values 52 | ''' 53 | 54 | # Downsample pixel values: 55 | im = im // 64 56 | 57 | # We can also implement the following by using np.histogramdd 58 | # im = im.reshape((-1,3)) 59 | # bins = [np.arange(5), np.arange(5), np.arange(5)] 60 | # hist = np.histogramdd(im, bins=bins)[0] 61 | # hist = hist.ravel() 62 | 63 | # Separate RGB channels: 64 | r,g,b = im.transpose((2,0,1)) 65 | 66 | pixels = 1 * r + 4 * g + 16 * b 67 | hist = np.bincount(pixels.ravel(), minlength=64) 68 | hist = hist.astype(float) 69 | return np.log1p(hist) 70 | 71 | -------------------------------------------------------------------------------- /ch10/figure10.py: -------------------------------------------------------------------------------- 1 | # This code is supporting material for the book 2 | # Building Machine Learning Systems with Python 3 | # by Willi Richert and Luis Pedro Coelho 4 | # published by PACKT Publishing 5 | # 6 | # It is made available under the MIT License 7 | 8 | import numpy as np 9 | import mahotas as mh 10 | 11 | # This little script just builds an image with two examples, side-by-side: 12 | 13 | text = mh.imread("../SimpleImageDataset/text21.jpg") 14 | building = mh.imread("../SimpleImageDataset/building00.jpg") 15 | h, w, _ = text.shape 16 | canvas = np.zeros((h, 2 * w + 128, 3), np.uint8) 17 | canvas[:, -w:] = building 18 | canvas[:, :w] = text 19 | canvas = canvas[::4, ::4] 20 | mh.imsave('figure10.jpg', canvas) 21 | -------------------------------------------------------------------------------- /ch10/large_classification.py: -------------------------------------------------------------------------------- 1 | # This code is supporting material for the book 2 | # Building Machine Learning Systems with Python 3 | # by Willi Richert and Luis Pedro 
Coelho 4 | # published by PACKT Publishing 5 | # 6 | # It is made available under the MIT License 7 | 8 | from __future__ import print_function 9 | import mahotas as mh 10 | from glob import glob 11 | from sklearn import cross_validation 12 | from sklearn.linear_model import LogisticRegression 13 | from sklearn.pipeline import Pipeline 14 | from sklearn.preprocessing import StandardScaler 15 | from sklearn.grid_search import GridSearchCV 16 | import numpy as np 17 | 18 | basedir = 'AnimTransDistr' 19 | print('This script will test classification of the AnimTransDistr dataset') 20 | 21 | C_range = 10.0 ** np.arange(-4, 3) 22 | grid = GridSearchCV(LogisticRegression(), param_grid={'C' : C_range}) 23 | clf = Pipeline([('preproc', StandardScaler()), 24 | ('classifier', grid)]) 25 | 26 | def features_for(im): 27 | from features import chist 28 | im = mh.imread(im) 29 | img = mh.colors.rgb2grey(im).astype(np.uint8) 30 | return np.concatenate([mh.features.haralick(img).ravel(), 31 | chist(im)]) 32 | 33 | def images(): 34 | '''Iterate over all (image,label) pairs 35 | 36 | This function will return 37 | ''' 38 | for ci, cl in enumerate(classes): 39 | images = glob('{}/{}/*.jpg'.format(basedir, cl)) 40 | for im in sorted(images): 41 | yield im, ci 42 | 43 | classes = [ 44 | 'Anims', 45 | 'Cars', 46 | 'Distras', 47 | 'Trans', 48 | ] 49 | 50 | print('Computing whole-image texture features...') 51 | ifeatures = [] 52 | labels = [] 53 | for im, ell in images(): 54 | ifeatures.append(features_for(im)) 55 | labels.append(ell) 56 | 57 | ifeatures = np.array(ifeatures) 58 | labels = np.array(labels) 59 | 60 | cv = cross_validation.KFold(len(ifeatures), 5, shuffle=True, random_state=123) 61 | scores0 = cross_validation.cross_val_score( 62 | clf, ifeatures, labels, cv=cv) 63 | print('Accuracy (5 fold x-val) with Logistic Regression [image features]: {:.1%}'.format( 64 | scores0.mean())) 65 | 66 | 67 | from sklearn.cluster import KMeans 68 | from mahotas.features import surf 69 | 70 | 71 | print('Computing SURF descriptors...') 72 | alldescriptors = [] 73 | for im,_ in images(): 74 | im = mh.imread(im, as_grey=True) 75 | im = im.astype(np.uint8) 76 | 77 | # To use dense sampling, you can try the following line: 78 | # alldescriptors.append(surf.dense(im, spacing=16)) 79 | alldescriptors.append(surf.surf(im, descriptor_only=True)) 80 | 81 | print('Descriptor computation complete.') 82 | k = 256 83 | km = KMeans(k) 84 | 85 | concatenated = np.concatenate(alldescriptors) 86 | print('Number of descriptors: {}'.format( 87 | len(concatenated))) 88 | concatenated = concatenated[::64] 89 | print('Clustering with K-means...') 90 | km.fit(concatenated) 91 | sfeatures = [] 92 | for d in alldescriptors: 93 | c = km.predict(d) 94 | sfeatures.append(np.bincount(c, minlength=k)) 95 | sfeatures = np.array(sfeatures, dtype=float) 96 | print('predicting...') 97 | score_SURF = cross_validation.cross_val_score( 98 | clf, sfeatures, labels, cv=cv).mean() 99 | print('Accuracy (5 fold x-val) with Logistic Regression [SURF features]: {:.1%}'.format( 100 | score_SURF.mean())) 101 | 102 | 103 | print('Performing classification with all features combined...') 104 | allfeatures = np.hstack([sfeatures, ifeatures]) 105 | score_SURF_global = cross_validation.cross_val_score( 106 | clf, allfeatures, labels, cv=cv).mean() 107 | print('Accuracy (5 fold x-val) with Logistic Regression [All features]: {:.1%}'.format( 108 | score_SURF_global.mean())) 109 | -------------------------------------------------------------------------------- 
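The heart of large_classification.py above is the bag-of-visual-words step: k-means is fit on all SURF descriptors, and each image is then summarised as a histogram of its descriptors' cluster assignments. A minimal sketch of just that step on synthetic data (the array shapes and the small k are made up purely for illustration):

import numpy as np
from sklearn.cluster import KMeans

rng = np.random.RandomState(0)
# Pretend per-image descriptor arrays (n_i descriptors of length 64 each)
alldescriptors = [rng.rand(rng.randint(50, 200), 64) for _ in range(10)]

k = 16  # number of "visual words"; the script above uses 256
km = KMeans(n_clusters=k, random_state=0)
km.fit(np.concatenate(alldescriptors))

# One fixed-length feature vector per image: counts of each visual word
sfeatures = np.array([np.bincount(km.predict(d), minlength=k)
                      for d in alldescriptors], dtype=float)
print(sfeatures.shape)  # (10, 16)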
/ch10/lena-ring.py: -------------------------------------------------------------------------------- 1 | # This code is supporting material for the book 2 | # Building Machine Learning Systems with Python 3 | # by Willi Richert and Luis Pedro Coelho 4 | # published by PACKT Publishing 5 | # 6 | # It is made available under the MIT License 7 | 8 | import mahotas as mh 9 | import numpy as np 10 | 11 | # Read in the image 12 | im = mh.demos.load('lena') 13 | 14 | # This breaks up the image into RGB channels 15 | r, g, b = im.transpose(2, 0, 1) 16 | h, w = r.shape 17 | 18 | # smooth the image per channel: 19 | r12 = mh.gaussian_filter(r, 12.) 20 | g12 = mh.gaussian_filter(g, 12.) 21 | b12 = mh.gaussian_filter(b, 12.) 22 | 23 | # build back the RGB image 24 | im12 = mh.as_rgb(r12, g12, b12) 25 | 26 | X, Y = np.mgrid[:h, :w] 27 | X = X - h / 2. 28 | Y = Y - w / 2. 29 | X /= X.max() 30 | Y /= Y.max() 31 | 32 | # Array C will have the highest values in the center, fading out to the edges: 33 | 34 | C = np.exp(-2. * (X ** 2 + Y ** 2)) 35 | C -= C.min() 36 | C /= C.ptp() 37 | C = C[:, :, None] 38 | 39 | # The final result is sharp in the centre and smooths out to the borders: 40 | ring = mh.stretch(im * C + (1 - C) * im12) 41 | mh.imsave('lena-ring.jpg', ring) 42 | -------------------------------------------------------------------------------- /ch10/neighbors.py: -------------------------------------------------------------------------------- 1 | # This code is supporting material for the book 2 | # Building Machine Learning Systems with Python 3 | # by Willi Richert and Luis Pedro Coelho 4 | # published by PACKT Publishing 5 | 6 | import numpy as np 7 | import mahotas as mh 8 | from glob import glob 9 | from features import texture, chist 10 | from matplotlib import pyplot as plt 11 | from sklearn.preprocessing import StandardScaler 12 | from scipy.spatial import distance 13 | 14 | basedir = '../SimpleImageDataset/' 15 | 16 | 17 | haralicks = [] 18 | chists = [] 19 | 20 | print('Computing features...') 21 | # Use glob to get all the images 22 | images = glob('{}/*.jpg'.format(basedir)) 23 | # We sort the images to ensure that they are always processed in the same order 24 | # Otherwise, this would introduce some variation just based on the random 25 | # ordering that the filesystem uses 26 | images.sort() 27 | 28 | for fname in images: 29 | imc = mh.imread(fname) 30 | imc = imc[200:-200,200:-200] 31 | haralicks.append(texture(mh.colors.rgb2grey(imc))) 32 | chists.append(chist(imc)) 33 | 34 | haralicks = np.array(haralicks) 35 | chists = np.array(chists) 36 | features = np.hstack([chists, haralicks]) 37 | 38 | print('Computing neighbors...') 39 | sc = StandardScaler() 40 | features = sc.fit_transform(features) 41 | dists = distance.squareform(distance.pdist(features)) 42 | 43 | print('Plotting...') 44 | fig, axes = plt.subplots(2, 9, figsize=(16,8)) 45 | 46 | # Remove ticks from all subplots 47 | for ax in axes.flat: 48 | ax.set_xticks([]) 49 | ax.set_yticks([]) 50 | 51 | for ci,i in enumerate(range(0,90,10)): 52 | left = images[i] 53 | dists_left = dists[i] 54 | right = dists_left.argsort() 55 | # right[0] is the same as left[i], so pick the next closest element 56 | right = right[1] 57 | right = images[right] 58 | left = mh.imread(left) 59 | right = mh.imread(right) 60 | axes[0, ci].imshow(left) 61 | axes[1, ci].imshow(right) 62 | 63 | fig.tight_layout() 64 | fig.savefig('figure_neighbors.png', dpi=300) 65 | -------------------------------------------------------------------------------- 
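neighbors.py above relies on SciPy's two distance-matrix conventions: pdist returns the condensed (upper-triangle) vector of pairwise distances, and squareform expands it into the full symmetric matrix whose rows are then argsorted to find each image's nearest neighbour. A tiny sketch of just that mechanism:

import numpy as np
from scipy.spatial import distance

X = np.array([[0., 0.], [3., 4.], [6., 8.]])
condensed = distance.pdist(X)          # length n*(n-1)/2 vector of pairwise distances
D = distance.squareform(condensed)     # full symmetric n x n matrix, zeros on the diagonal
nearest_to_first = D[0].argsort()[1]   # index 0 is the point itself, so take the next one
print(D[0], nearest_to_first)          # [ 0.  5. 10.] 1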
/ch10/scene00.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luispedro/BuildingMachineLearningSystemsWithPython/52891e6bac00213bf94ab1a3b1f2d8d5ed04a774/ch10/scene00.jpg -------------------------------------------------------------------------------- /ch10/simple_classification.py: -------------------------------------------------------------------------------- 1 | # This code is supporting material for the book 2 | # Building Machine Learning Systems with Python 3 | # by Willi Richert and Luis Pedro Coelho 4 | # published by PACKT Publishing 5 | # 6 | # It is made available under the MIT License 7 | 8 | import mahotas as mh 9 | import numpy as np 10 | from glob import glob 11 | 12 | from features import texture, chist 13 | from sklearn.linear_model import LogisticRegression 14 | from sklearn.pipeline import Pipeline 15 | from sklearn.preprocessing import StandardScaler 16 | 17 | basedir = '../SimpleImageDataset/' 18 | 19 | 20 | haralicks = [] 21 | labels = [] 22 | chists = [] 23 | 24 | print('This script will test (with cross-validation) classification of the simple 3 class dataset') 25 | print('Computing features...') 26 | # Use glob to get all the images 27 | images = glob('{}/*.jpg'.format(basedir)) 28 | 29 | # We sort the images to ensure that they are always processed in the same order 30 | # Otherwise, this would introduce some variation just based on the random 31 | # ordering that the filesystem uses 32 | for fname in sorted(images): 33 | imc = mh.imread(fname) 34 | haralicks.append(texture(mh.colors.rgb2grey(imc))) 35 | chists.append(chist(imc)) 36 | 37 | # Files are named like building00.jpg, scene23.jpg... 38 | labels.append(fname[:-len('xx.jpg')]) 39 | 40 | print('Finished computing features.') 41 | 42 | haralicks = np.array(haralicks) 43 | labels = np.array(labels) 44 | chists = np.array(chists) 45 | 46 | haralick_plus_chists = np.hstack([chists, haralicks]) 47 | 48 | 49 | # We use Logistic Regression because it achieves high accuracy on small(ish) datasets 50 | # Feel free to experiment with other classifiers 51 | clf = Pipeline([('preproc', StandardScaler()), 52 | ('classifier', LogisticRegression())]) 53 | 54 | from sklearn import cross_validation 55 | cv = cross_validation.LeaveOneOut(len(images)) 56 | scores = cross_validation.cross_val_score( 57 | clf, haralicks, labels, cv=cv) 58 | print('Accuracy (Leave-one-out) with Logistic Regression [haralick features]: {:.1%}'.format( 59 | scores.mean())) 60 | 61 | scores = cross_validation.cross_val_score( 62 | clf, chists, labels, cv=cv) 63 | print('Accuracy (Leave-one-out) with Logistic Regression [color histograms]: {:.1%}'.format( 64 | scores.mean())) 65 | 66 | scores = cross_validation.cross_val_score( 67 | clf, haralick_plus_chists, labels, cv=cv) 68 | print('Accuracy (Leave-one-out) with Logistic Regression [texture features + color histograms]: {:.1%}'.format( 69 | scores.mean())) 70 | 71 | -------------------------------------------------------------------------------- /ch10/threshold.py: -------------------------------------------------------------------------------- 1 | # This code is supporting material for the book 2 | # Building Machine Learning Systems with Python 3 | # by Willi Richert and Luis Pedro Coelho 4 | # published by PACKT Publishing 5 | # 6 | # It is made available under the MIT License 7 | 8 | import numpy as np 9 | import mahotas as mh 10 | 11 | # Load our example image: 12 | image = 
mh.imread('../SimpleImageDataset/building05.jpg') 13 | 14 | # Convert to greyscale 15 | image = mh.colors.rgb2gray(image, dtype=np.uint8) 16 | 17 | # Compute a threshold value: 18 | thresh = mh.thresholding.otsu(image) 19 | print('Otsu threshold is {0}'.format(thresh)) 20 | 21 | # Compute the thresholded image 22 | otsubin = (image > thresh) 23 | print('Saving thresholded image (with Otsu threshold) to otsu-threshold.jpeg') 24 | mh.imsave('otsu-threshold.jpeg', otsubin.astype(np.uint8) * 255) 25 | 26 | # Execute morphological opening to smooth out the edges 27 | otsubin = mh.open(otsubin, np.ones((15, 15))) 28 | mh.imsave('otsu-closed.jpeg', otsubin.astype(np.uint8) * 255) 29 | 30 | # An alternative thresholding method: 31 | thresh = mh.thresholding.rc(image) 32 | print('Ridley-Calvard threshold is {0}'.format(thresh)) 33 | print('Saving thresholded image (with Ridley-Calvard threshold) to rc-threshold.jpeg') 34 | mh.imsave('rc-threshold.jpeg', (image > thresh).astype(np.uint8) * 255) 35 | -------------------------------------------------------------------------------- /ch10/thresholded_figure.py: -------------------------------------------------------------------------------- 1 | import mahotas as mh 2 | import numpy as np 3 | from matplotlib import pyplot as plt 4 | 5 | # Load image & convert to B&W 6 | image = mh.imread('../SimpleImageDataset/scene00.jpg') 7 | image = mh.colors.rgb2grey(image, dtype=np.uint8) 8 | plt.imshow(image) 9 | plt.gray() 10 | plt.title('original image') 11 | 12 | thresh = mh.thresholding.otsu(image) 13 | print('Otsu threshold is {}.'.format(thresh)) 14 | 15 | threshed = (image > thresh) 16 | plt.figure() 17 | plt.imshow(threshed) 18 | plt.title('threholded image') 19 | mh.imsave('thresholded.png', threshed.astype(np.uint8)*255) 20 | 21 | im16 = mh.gaussian_filter(image, 16) 22 | 23 | # Repeat the thresholding operations with the blurred image 24 | thresh = mh.thresholding.otsu(im16.astype(np.uint8)) 25 | threshed = (im16 > thresh) 26 | plt.figure() 27 | plt.imshow(threshed) 28 | plt.title('threholded image (after blurring)') 29 | print('Otsu threshold after blurring is {}.'.format(thresh)) 30 | mh.imsave('thresholded16.png', threshed.astype(np.uint8)*255) 31 | plt.show() 32 | -------------------------------------------------------------------------------- /ch11/demo_corr.py: -------------------------------------------------------------------------------- 1 | # This code is supporting material for the book 2 | # Building Machine Learning Systems with Python 3 | # by Willi Richert and Luis Pedro Coelho 4 | # published by PACKT Publishing 5 | # 6 | # It is made available under the MIT License 7 | 8 | import os 9 | 10 | from matplotlib import pylab 11 | import numpy as np 12 | import scipy 13 | from scipy.stats import norm, pearsonr 14 | 15 | from utils import CHART_DIR 16 | 17 | 18 | def _plot_correlation_func(x, y): 19 | 20 | r, p = pearsonr(x, y) 21 | title = "Cor($X_1$, $X_2$) = %.3f" % r 22 | pylab.scatter(x, y) 23 | pylab.title(title) 24 | pylab.xlabel("$X_1$") 25 | pylab.ylabel("$X_2$") 26 | 27 | f1 = scipy.poly1d(scipy.polyfit(x, y, 1)) 28 | pylab.plot(x, f1(x), "r--", linewidth=2) 29 | # pylab.xticks([w*7*24 for w in [0,1,2,3,4]], ['week %i'%(w+1) for w in 30 | # [0,1,2,3,4]]) 31 | 32 | 33 | def plot_correlation_demo(): 34 | np.random.seed(0) # to reproduce the data later on 35 | pylab.clf() 36 | pylab.figure(num=None, figsize=(8, 8)) 37 | 38 | x = np.arange(0, 10, 0.2) 39 | 40 | pylab.subplot(221) 41 | y = 0.5 * x + norm.rvs(1, scale=.01, size=len(x)) 42 | 
_plot_correlation_func(x, y) 43 | 44 | pylab.subplot(222) 45 | y = 0.5 * x + norm.rvs(1, scale=.1, size=len(x)) 46 | _plot_correlation_func(x, y) 47 | 48 | pylab.subplot(223) 49 | y = 0.5 * x + norm.rvs(1, scale=1, size=len(x)) 50 | _plot_correlation_func(x, y) 51 | 52 | pylab.subplot(224) 53 | y = norm.rvs(1, scale=10, size=len(x)) 54 | _plot_correlation_func(x, y) 55 | 56 | pylab.autoscale(tight=True) 57 | pylab.grid(True) 58 | 59 | filename = "corr_demo_1.png" 60 | pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight") 61 | 62 | pylab.clf() 63 | pylab.figure(num=None, figsize=(8, 8)) 64 | 65 | x = np.arange(-5, 5, 0.2) 66 | 67 | pylab.subplot(221) 68 | y = 0.5 * x ** 2 + norm.rvs(1, scale=.01, size=len(x)) 69 | _plot_correlation_func(x, y) 70 | 71 | pylab.subplot(222) 72 | y = 0.5 * x ** 2 + norm.rvs(1, scale=.1, size=len(x)) 73 | _plot_correlation_func(x, y) 74 | 75 | pylab.subplot(223) 76 | y = 0.5 * x ** 2 + norm.rvs(1, scale=1, size=len(x)) 77 | _plot_correlation_func(x, y) 78 | 79 | pylab.subplot(224) 80 | y = 0.5 * x ** 2 + norm.rvs(1, scale=10, size=len(x)) 81 | _plot_correlation_func(x, y) 82 | 83 | pylab.autoscale(tight=True) 84 | pylab.grid(True) 85 | 86 | filename = "corr_demo_2.png" 87 | pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight") 88 | 89 | if __name__ == '__main__': 90 | plot_correlation_demo() 91 | -------------------------------------------------------------------------------- /ch11/demo_mds.py: -------------------------------------------------------------------------------- 1 | # This code is supporting material for the book 2 | # Building Machine Learning Systems with Python 3 | # by Willi Richert and Luis Pedro Coelho 4 | # published by PACKT Publishing 5 | # 6 | # It is made available under the MIT License 7 | 8 | import os 9 | 10 | import numpy as np 11 | from matplotlib import pylab 12 | from mpl_toolkits.mplot3d import Axes3D 13 | 14 | from sklearn import linear_model, manifold, decomposition, datasets 15 | logistic = linear_model.LogisticRegression() 16 | 17 | from utils import CHART_DIR 18 | 19 | np.random.seed(3) 20 | 21 | # all examples will have three classes in this file 22 | colors = ['r', 'g', 'b'] 23 | markers = ['o', 6, '*'] 24 | 25 | 26 | def plot_demo_1(): 27 | X = np.c_[np.ones(5), 2 * np.ones(5), 10 * np.ones(5)].T 28 | y = np.array([0, 1, 2]) 29 | 30 | fig = pylab.figure(figsize=(10, 4)) 31 | 32 | ax = fig.add_subplot(121, projection='3d') 33 | ax.set_axis_bgcolor('white') 34 | 35 | mds = manifold.MDS(n_components=3) 36 | Xtrans = mds.fit_transform(X) 37 | 38 | for cl, color, marker in zip(np.unique(y), colors, markers): 39 | ax.scatter( 40 | Xtrans[y == cl][:, 0], Xtrans[y == cl][:, 1], Xtrans[y == cl][:, 2], c=color, marker=marker, edgecolor='black') 41 | pylab.title("MDS on example data set in 3 dimensions") 42 | ax.view_init(10, -15) 43 | 44 | mds = manifold.MDS(n_components=2) 45 | Xtrans = mds.fit_transform(X) 46 | 47 | ax = fig.add_subplot(122) 48 | for cl, color, marker in zip(np.unique(y), colors, markers): 49 | ax.scatter( 50 | Xtrans[y == cl][:, 0], Xtrans[y == cl][:, 1], c=color, marker=marker, edgecolor='black') 51 | pylab.title("MDS on example data set in 2 dimensions") 52 | 53 | filename = "mds_demo_1.png" 54 | pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight") 55 | 56 | 57 | def plot_iris_mds(): 58 | 59 | iris = datasets.load_iris() 60 | X = iris.data 61 | y = iris.target 62 | 63 | # MDS 64 | 65 | fig = pylab.figure(figsize=(10, 4)) 66 | 67 | ax = fig.add_subplot(121, 
projection='3d') 68 | ax.set_axis_bgcolor('white') 69 | 70 | mds = manifold.MDS(n_components=3) 71 | Xtrans = mds.fit_transform(X) 72 | 73 | for cl, color, marker in zip(np.unique(y), colors, markers): 74 | ax.scatter( 75 | Xtrans[y == cl][:, 0], Xtrans[y == cl][:, 1], Xtrans[y == cl][:, 2], c=color, marker=marker, edgecolor='black') 76 | pylab.title("MDS on Iris data set in 3 dimensions") 77 | ax.view_init(10, -15) 78 | 79 | mds = manifold.MDS(n_components=2) 80 | Xtrans = mds.fit_transform(X) 81 | 82 | ax = fig.add_subplot(122) 83 | for cl, color, marker in zip(np.unique(y), colors, markers): 84 | ax.scatter( 85 | Xtrans[y == cl][:, 0], Xtrans[y == cl][:, 1], c=color, marker=marker, edgecolor='black') 86 | pylab.title("MDS on Iris data set in 2 dimensions") 87 | 88 | filename = "mds_demo_iris.png" 89 | pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight") 90 | 91 | # PCA 92 | 93 | fig = pylab.figure(figsize=(10, 4)) 94 | 95 | ax = fig.add_subplot(121, projection='3d') 96 | ax.set_axis_bgcolor('white') 97 | 98 | pca = decomposition.PCA(n_components=3) 99 | Xtrans = pca.fit(X).transform(X) 100 | 101 | for cl, color, marker in zip(np.unique(y), colors, markers): 102 | ax.scatter( 103 | Xtrans[y == cl][:, 0], Xtrans[y == cl][:, 1], Xtrans[y == cl][:, 2], c=color, marker=marker, edgecolor='black') 104 | pylab.title("PCA on Iris data set in 3 dimensions") 105 | ax.view_init(50, -35) 106 | 107 | pca = decomposition.PCA(n_components=2) 108 | Xtrans = pca.fit_transform(X) 109 | 110 | ax = fig.add_subplot(122) 111 | for cl, color, marker in zip(np.unique(y), colors, markers): 112 | ax.scatter( 113 | Xtrans[y == cl][:, 0], Xtrans[y == cl][:, 1], c=color, marker=marker, edgecolor='black') 114 | pylab.title("PCA on Iris data set in 2 dimensions") 115 | 116 | filename = "pca_demo_iris.png" 117 | pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight") 118 | 119 | 120 | if __name__ == '__main__': 121 | plot_demo_1() 122 | plot_iris_mds() 123 | -------------------------------------------------------------------------------- /ch11/demo_mi.py: -------------------------------------------------------------------------------- 1 | # This code is supporting material for the book 2 | # Building Machine Learning Systems with Python 3 | # by Willi Richert and Luis Pedro Coelho 4 | # published by PACKT Publishing 5 | # 6 | # It is made available under the MIT License 7 | 8 | import os 9 | 10 | from matplotlib import pylab 11 | import numpy as np 12 | from scipy.stats import norm, entropy 13 | 14 | from utils import CHART_DIR 15 | 16 | 17 | def mutual_info(x, y, bins=10): 18 | counts_xy, bins_x, bins_y = np.histogram2d(x, y, bins=(bins, bins)) 19 | counts_x, bins = np.histogram(x, bins=bins) 20 | counts_y, bins = np.histogram(y, bins=bins) 21 | 22 | counts_xy += 1 23 | counts_x += 1 24 | counts_y += 1 25 | P_xy = counts_xy / np.sum(counts_xy, dtype=float) 26 | P_x = counts_x / np.sum(counts_x, dtype=float) 27 | P_y = counts_y / np.sum(counts_y, dtype=float) 28 | 29 | I_xy = np.sum(P_xy * np.log2(P_xy / (P_x.reshape(-1, 1) * P_y))) 30 | 31 | return I_xy / (entropy(counts_x) + entropy(counts_y)) 32 | 33 | 34 | def plot_entropy(): 35 | pylab.clf() 36 | pylab.figure(num=None, figsize=(5, 4)) 37 | 38 | title = "Entropy $H(X)$" 39 | pylab.title(title) 40 | pylab.xlabel("$P(X=$coin will show heads up$)$") 41 | pylab.ylabel("$H(X)$") 42 | 43 | pylab.xlim(xmin=0, xmax=1.1) 44 | x = np.arange(0.001, 1, 0.001) 45 | y = -x * np.log2(x) - (1 - x) * np.log2(1 - x) 46 | pylab.plot(x, y) 47 | # 
pylab.xticks([w*7*24 for w in [0,1,2,3,4]], ['week %i'%(w+1) for w in 48 | # [0,1,2,3,4]]) 49 | 50 | pylab.autoscale(tight=True) 51 | pylab.grid(True) 52 | 53 | filename = "entropy_demo.png" 54 | pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight") 55 | 56 | 57 | def _plot_mi_func(x, y): 58 | 59 | mi = mutual_info(x, y) 60 | title = "NI($X_1$, $X_2$) = %.3f" % mi 61 | pylab.scatter(x, y) 62 | pylab.title(title) 63 | pylab.xlabel("$X_1$") 64 | pylab.ylabel("$X_2$") 65 | 66 | 67 | def plot_mi_demo(): 68 | np.random.seed(0) # to reproduce the data later on 69 | pylab.clf() 70 | pylab.figure(num=None, figsize=(8, 8)) 71 | 72 | x = np.arange(0, 10, 0.2) 73 | 74 | pylab.subplot(221) 75 | y = 0.5 * x + norm.rvs(1, scale=.01, size=len(x)) 76 | _plot_mi_func(x, y) 77 | 78 | pylab.subplot(222) 79 | y = 0.5 * x + norm.rvs(1, scale=.1, size=len(x)) 80 | _plot_mi_func(x, y) 81 | 82 | pylab.subplot(223) 83 | y = 0.5 * x + norm.rvs(1, scale=1, size=len(x)) 84 | _plot_mi_func(x, y) 85 | 86 | pylab.subplot(224) 87 | y = norm.rvs(1, scale=10, size=len(x)) 88 | _plot_mi_func(x, y) 89 | 90 | pylab.autoscale(tight=True) 91 | pylab.grid(True) 92 | 93 | filename = "mi_demo_1.png" 94 | pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight") 95 | 96 | pylab.clf() 97 | pylab.figure(num=None, figsize=(8, 8)) 98 | 99 | x = np.arange(-5, 5, 0.2) 100 | 101 | pylab.subplot(221) 102 | y = 0.5 * x ** 2 + norm.rvs(1, scale=.01, size=len(x)) 103 | _plot_mi_func(x, y) 104 | 105 | pylab.subplot(222) 106 | y = 0.5 * x ** 2 + norm.rvs(1, scale=.1, size=len(x)) 107 | _plot_mi_func(x, y) 108 | 109 | pylab.subplot(223) 110 | y = 0.5 * x ** 2 + norm.rvs(1, scale=1, size=len(x)) 111 | _plot_mi_func(x, y) 112 | 113 | pylab.subplot(224) 114 | y = 0.5 * x ** 2 + norm.rvs(1, scale=10, size=len(x)) 115 | _plot_mi_func(x, y) 116 | 117 | pylab.autoscale(tight=True) 118 | pylab.grid(True) 119 | 120 | filename = "mi_demo_2.png" 121 | pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight") 122 | 123 | if __name__ == '__main__': 124 | plot_entropy() 125 | plot_mi_demo() 126 | -------------------------------------------------------------------------------- /ch11/demo_pca.py: -------------------------------------------------------------------------------- 1 | # This code is supporting material for the book 2 | # Building Machine Learning Systems with Python 3 | # by Willi Richert and Luis Pedro Coelho 4 | # published by PACKT Publishing 5 | # 6 | # It is made available under the MIT License 7 | 8 | import os 9 | 10 | from matplotlib import pylab 11 | import numpy as np 12 | 13 | from sklearn import linear_model, decomposition 14 | from sklearn import lda 15 | 16 | logistic = linear_model.LogisticRegression() 17 | 18 | 19 | from utils import CHART_DIR 20 | 21 | np.random.seed(3) 22 | 23 | x1 = np.arange(0, 10, .2) 24 | x2 = x1 + np.random.normal(scale=1, size=len(x1)) 25 | 26 | 27 | def plot_simple_demo_1(): 28 | pylab.clf() 29 | fig = pylab.figure(num=None, figsize=(10, 4)) 30 | pylab.subplot(121) 31 | 32 | title = "Original feature space" 33 | pylab.title(title) 34 | pylab.xlabel("$X_1$") 35 | pylab.ylabel("$X_2$") 36 | 37 | x1 = np.arange(0, 10, .2) 38 | x2 = x1 + np.random.normal(scale=1, size=len(x1)) 39 | 40 | good = (x1 > 5) | (x2 > 5) 41 | bad = ~good 42 | 43 | x1g = x1[good] 44 | x2g = x2[good] 45 | pylab.scatter(x1g, x2g, edgecolor="blue", facecolor="blue") 46 | 47 | x1b = x1[bad] 48 | x2b = x2[bad] 49 | pylab.scatter(x1b, x2b, edgecolor="red", facecolor="white") 50 | 51 | pylab.grid(True) 52 
| 53 | pylab.subplot(122) 54 | 55 | X = np.c_[(x1, x2)] 56 | 57 | pca = decomposition.PCA(n_components=1) 58 | Xtrans = pca.fit_transform(X) 59 | 60 | Xg = Xtrans[good] 61 | Xb = Xtrans[bad] 62 | 63 | pylab.scatter( 64 | Xg[:, 0], np.zeros(len(Xg)), edgecolor="blue", facecolor="blue") 65 | pylab.scatter( 66 | Xb[:, 0], np.zeros(len(Xb)), edgecolor="red", facecolor="white") 67 | title = "Transformed feature space" 68 | pylab.title(title) 69 | pylab.xlabel("$X'$") 70 | fig.axes[1].get_yaxis().set_visible(False) 71 | 72 | print(pca.explained_variance_ratio_) 73 | 74 | pylab.grid(True) 75 | 76 | pylab.autoscale(tight=True) 77 | filename = "pca_demo_1.png" 78 | pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight") 79 | 80 | 81 | def plot_simple_demo_2(): 82 | pylab.clf() 83 | fig = pylab.figure(num=None, figsize=(10, 4)) 84 | pylab.subplot(121) 85 | 86 | title = "Original feature space" 87 | pylab.title(title) 88 | pylab.xlabel("$X_1$") 89 | pylab.ylabel("$X_2$") 90 | 91 | x1 = np.arange(0, 10, .2) 92 | x2 = x1 + np.random.normal(scale=1, size=len(x1)) 93 | 94 | good = x1 > x2 95 | bad = ~good 96 | 97 | x1g = x1[good] 98 | x2g = x2[good] 99 | pylab.scatter(x1g, x2g, edgecolor="blue", facecolor="blue") 100 | 101 | x1b = x1[bad] 102 | x2b = x2[bad] 103 | pylab.scatter(x1b, x2b, edgecolor="red", facecolor="white") 104 | 105 | pylab.grid(True) 106 | 107 | pylab.subplot(122) 108 | 109 | X = np.c_[(x1, x2)] 110 | 111 | pca = decomposition.PCA(n_components=1) 112 | Xtrans = pca.fit_transform(X) 113 | 114 | Xg = Xtrans[good] 115 | Xb = Xtrans[bad] 116 | 117 | pylab.scatter( 118 | Xg[:, 0], np.zeros(len(Xg)), edgecolor="blue", facecolor="blue") 119 | pylab.scatter( 120 | Xb[:, 0], np.zeros(len(Xb)), edgecolor="red", facecolor="white") 121 | title = "Transformed feature space" 122 | pylab.title(title) 123 | pylab.xlabel("$X'$") 124 | fig.axes[1].get_yaxis().set_visible(False) 125 | 126 | print(pca.explained_variance_ratio_) 127 | 128 | pylab.grid(True) 129 | 130 | pylab.autoscale(tight=True) 131 | filename = "pca_demo_2.png" 132 | pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight") 133 | 134 | 135 | def plot_simple_demo_lda(): 136 | pylab.clf() 137 | fig = pylab.figure(num=None, figsize=(10, 4)) 138 | pylab.subplot(121) 139 | 140 | title = "Original feature space" 141 | pylab.title(title) 142 | pylab.xlabel("$X_1$") 143 | pylab.ylabel("$X_2$") 144 | 145 | good = x1 > x2 146 | bad = ~good 147 | 148 | x1g = x1[good] 149 | x2g = x2[good] 150 | pylab.scatter(x1g, x2g, edgecolor="blue", facecolor="blue") 151 | 152 | x1b = x1[bad] 153 | x2b = x2[bad] 154 | pylab.scatter(x1b, x2b, edgecolor="red", facecolor="white") 155 | 156 | pylab.grid(True) 157 | 158 | pylab.subplot(122) 159 | 160 | X = np.c_[(x1, x2)] 161 | 162 | lda_inst = lda.LDA(n_components=1) 163 | Xtrans = lda_inst.fit_transform(X, good) 164 | 165 | Xg = Xtrans[good] 166 | Xb = Xtrans[bad] 167 | 168 | pylab.scatter( 169 | Xg[:, 0], np.zeros(len(Xg)), edgecolor="blue", facecolor="blue") 170 | pylab.scatter( 171 | Xb[:, 0], np.zeros(len(Xb)), edgecolor="red", facecolor="white") 172 | title = "Transformed feature space" 173 | pylab.title(title) 174 | pylab.xlabel("$X'$") 175 | fig.axes[1].get_yaxis().set_visible(False) 176 | 177 | pylab.grid(True) 178 | 179 | pylab.autoscale(tight=True) 180 | filename = "lda_demo.png" 181 | pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight") 182 | 183 | if __name__ == '__main__': 184 | plot_simple_demo_1() 185 | plot_simple_demo_2() 186 | plot_simple_demo_lda() 187 | 
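demo_pca.py above imports the LDA estimator from sklearn.lda, which matches the scikit-learn releases the book was written against. In newer scikit-learn releases that module was removed and the same estimator lives in sklearn.discriminant_analysis, so a version-tolerant import (a sketch, only needed when running the demo on a recent installation) could look like:

try:
    from sklearn.lda import LDA                      # older scikit-learn, as used above
except ImportError:
    # newer scikit-learn: the estimator was moved and renamed
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

lda_inst = LDA(n_components=1)                       # drop-in for lda.LDA(n_components=1)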
-------------------------------------------------------------------------------- /ch11/demo_rfe.py: -------------------------------------------------------------------------------- 1 | # This code is supporting material for the book 2 | # Building Machine Learning Systems with Python 3 | # by Willi Richert and Luis Pedro Coelho 4 | # published by PACKT Publishing 5 | # 6 | # It is made available under the MIT License 7 | 8 | from sklearn.feature_selection import RFE 9 | from sklearn.linear_model import LogisticRegression 10 | 11 | from sklearn.datasets import make_classification 12 | 13 | X, y = make_classification( 14 | n_samples=100, n_features=10, n_informative=3, random_state=0) 15 | 16 | clf = LogisticRegression() 17 | clf.fit(X, y) 18 | 19 | for i in range(1, 11): 20 | selector = RFE(clf, i) 21 | selector = selector.fit(X, y) 22 | print("%i\t%s\t%s" % (i, selector.support_, selector.ranking_)) 23 | -------------------------------------------------------------------------------- /ch11/utils.py: -------------------------------------------------------------------------------- 1 | # This code is supporting material for the book 2 | # Building Machine Learning Systems with Python 3 | # by Willi Richert and Luis Pedro Coelho 4 | # published by PACKT Publishing 5 | # 6 | # It is made available under the MIT License 7 | 8 | import os 9 | 10 | DATA_DIR = os.path.join( 11 | os.path.dirname(os.path.realpath(__file__)), "data") 12 | 13 | CHART_DIR = os.path.join( 14 | os.path.dirname(os.path.realpath(__file__)), "charts") 15 | 16 | for d in [DATA_DIR, CHART_DIR]: 17 | if not os.path.exists(d): 18 | os.mkdir(d) 19 | 20 | -------------------------------------------------------------------------------- /ch12/.gitignore: -------------------------------------------------------------------------------- 1 | *.jugdata/ 2 | output.txt 3 | results.image.txt 4 | -------------------------------------------------------------------------------- /ch12/README.rst: -------------------------------------------------------------------------------- 1 | ========== 2 | Chapter 12 3 | ========== 4 | 5 | Support code for *Chapter 12: Big(ger) Data* 6 | 7 | Data 8 | ---- 9 | 10 | This chapter relies only on the image dataset that is packaged with the 11 | repository at ``../SimpleImageDataset/``. 
12 | 13 | Scripts 14 | ------- 15 | 16 | chapter.py 17 | Code as written in the book 18 | jugfile.py 19 | Example jugfile 20 | image-classification.py 21 | Jugfile implementation of image classification from Chapter 10 22 | 23 | setup-aws.txt 24 | Commands to setup Amazon WebServices machine 25 | run-jugfile.sh 26 | Wrapper script to run jug file on jugfile.py 27 | run-image-classification.sh 28 | Wrapper script to run jug file on image-classification.py 29 | -------------------------------------------------------------------------------- /ch12/chapter.py: -------------------------------------------------------------------------------- 1 | from jug import TaskGenerator 2 | from glob import glob 3 | import mahotas as mh 4 | @TaskGenerator 5 | def compute_texture(im): 6 | from features import texture 7 | imc = mh.imread(im) 8 | return texture(mh.colors.rgb2gray(imc)) 9 | 10 | @TaskGenerator 11 | def chist_file(fname): 12 | from features import chist 13 | im = mh.imread(fname) 14 | return chist(im) 15 | 16 | import numpy as np 17 | to_array = TaskGenerator(np.array) 18 | hstack = TaskGenerator(np.hstack) 19 | 20 | haralicks = [] 21 | chists = [] 22 | labels = [] 23 | 24 | # Change this variable to point to 25 | # the location of the dataset is on disk 26 | basedir = '../SimpleImageDataset/' 27 | # Use glob to get all the images 28 | images = glob('{}/*.jpg'.format(basedir)) 29 | 30 | for fname in sorted(images): 31 | haralicks.append(compute_texture(fname)) 32 | chists.append(chist_file(fname)) 33 | # The class is encoded in the filename as xxxx00.jpg 34 | labels.append(fname[:-len('00.jpg')]) 35 | 36 | haralicks = to_array(haralicks) 37 | chists = to_array(chists) 38 | labels = to_array(labels) 39 | 40 | @TaskGenerator 41 | def accuracy(features, labels): 42 | from sklearn.linear_model import LogisticRegression 43 | from sklearn.pipeline import Pipeline 44 | from sklearn.preprocessing import StandardScaler 45 | from sklearn import cross_validation 46 | 47 | clf = Pipeline([('preproc', StandardScaler()), 48 | ('classifier', LogisticRegression())]) 49 | cv = cross_validation.LeaveOneOut(len(features)) 50 | scores = cross_validation.cross_val_score( 51 | clf, features, labels, cv=cv) 52 | return scores.mean() 53 | scores_base = accuracy(haralicks, labels) 54 | scores_chist = accuracy(chists, labels) 55 | 56 | combined = hstack([chists, haralicks]) 57 | scores_combined = accuracy(combined, labels) 58 | 59 | @TaskGenerator 60 | def print_results(scores): 61 | with open('results.image.txt', 'w') as output: 62 | for k,v in scores: 63 | output.write('Accuracy [{}]: {:.1%}\n'.format( 64 | k, v.mean())) 65 | 66 | print_results([ 67 | ('base', scores_base), 68 | ('chists', scores_chist), 69 | ('combined' , scores_combined), 70 | ]) 71 | 72 | @TaskGenerator 73 | def compute_lbp(fname): 74 | from mahotas.features import lbp 75 | imc = mh.imread(fname) 76 | im = mh.colors.rgb2grey(imc) 77 | return lbp(im, radius=8, points=6) 78 | 79 | lbps = [] 80 | for fname in sorted(images): 81 | # the rest of the loop as before 82 | lbps.append(compute_lbp(fname)) 83 | lbps = to_array(lbps) 84 | 85 | scores_lbps = accuracy(lbps, labels) 86 | combined_all = hstack([chists, haralicks, lbps]) 87 | scores_combined_all = accuracy(combined_all, labels) 88 | 89 | print_results([ 90 | ('base', scores_base), 91 | ('chists', scores_chist), 92 | ('lbps', scores_lbps), 93 | ('combined' , scores_combined), 94 | ('combined_all' , scores_combined_all), 95 | ]) 96 | 
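Everything chapter.py defines above is lazy: calling a TaskGenerator-wrapped function returns a jug Task object rather than a value, and nothing is actually computed until the file is run through jug's command line (run-jugfile.sh later in this chapter does exactly that with ``jug execute``). A minimal sketch of the same pattern, independent of the image data:

from jug import TaskGenerator
import numpy as np

# Wrapping an existing function makes every call a lazily-executed Task
to_array = TaskGenerator(np.array)

@TaskGenerator
def mean_of(values):
    return float(np.mean(values))

arr = to_array([1.0, 2.0, 3.0])   # a Task, not an ndarray, until `jug execute` runs it
result = mean_of(arr)             # Tasks can be passed straight into other Tasks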
-------------------------------------------------------------------------------- /ch12/features.py: -------------------------------------------------------------------------------- 1 | ../ch10/features.py -------------------------------------------------------------------------------- /ch12/image-classification.py: -------------------------------------------------------------------------------- 1 | # This code is supporting material for the book 2 | # Building Machine Learning Systems with Python 3 | # by Willi Richert and Luis Pedro Coelho 4 | # published by PACKT Publishing 5 | # 6 | # It is made available under the MIT License 7 | 8 | import mahotas as mh 9 | import numpy as np 10 | from glob import glob 11 | from jug import TaskGenerator 12 | 13 | # We need to use the `features` module from chapter 10. 14 | from sys import path 15 | path.append('../ch10') 16 | 17 | 18 | # This is the jug-enabled version of the script ``figure18.py`` in Chapter 10 19 | 20 | basedir = '../SimpleImageDataset/' 21 | 22 | @TaskGenerator 23 | def compute_texture(im): 24 | '''Compute features for an image 25 | 26 | Parameters 27 | ---------- 28 | im : str 29 | filepath for image to process 30 | 31 | Returns 32 | ------- 33 | fs : ndarray 34 | 1-D array of features 35 | ''' 36 | from features import texture 37 | imc = mh.imread(im) 38 | return texture(mh.colors.rgb2grey(imc)) 39 | 40 | @TaskGenerator 41 | def chist(fname): 42 | from features import chist as color_histogram 43 | im = mh.imread(fname) 44 | return color_histogram(im) 45 | 46 | @TaskGenerator 47 | def compute_lbp(fname): 48 | from mahotas.features import lbp 49 | imc = mh.imread(fname) 50 | im = mh.colors.rgb2grey(imc) 51 | return lbp(im, radius=8, points=6) 52 | 53 | 54 | @TaskGenerator 55 | def accuracy(features, labels): 56 | from sklearn.linear_model import LogisticRegression 57 | from sklearn.pipeline import Pipeline 58 | from sklearn.preprocessing import StandardScaler 59 | from sklearn import cross_validation 60 | # We use logistic regression because it is very fast. 
61 | # Feel free to experiment with other classifiers 62 | clf = Pipeline([('preproc', StandardScaler()), 63 | ('classifier', LogisticRegression())]) 64 | cv = cross_validation.LeaveOneOut(len(features)) 65 | scores = cross_validation.cross_val_score( 66 | clf, features, labels, cv=cv) 67 | return scores.mean() 68 | 69 | 70 | @TaskGenerator 71 | def print_results(scores): 72 | with open('results.image.txt', 'w') as output: 73 | for k,v in scores: 74 | output.write('Accuracy (LOO x-val) with Logistic Regression [{0}]: {1:.1%}\n'.format( 75 | k, v.mean())) 76 | 77 | 78 | to_array = TaskGenerator(np.array) 79 | hstack = TaskGenerator(np.hstack) 80 | 81 | haralicks = [] 82 | chists = [] 83 | lbps = [] 84 | labels = [] 85 | 86 | # Use glob to get all the images 87 | images = glob('{0}/*.jpg'.format(basedir)) 88 | for fname in sorted(images): 89 | haralicks.append(compute_texture(fname)) 90 | chists.append(chist(fname)) 91 | lbps.append(compute_lbp(fname)) 92 | labels.append(fname[:-len('00.jpg')]) # The class is encoded in the filename as xxxx00.jpg 93 | 94 | haralicks = to_array(haralicks) 95 | chists = to_array(chists) 96 | lbps = to_array(lbps) 97 | labels = to_array(labels) 98 | 99 | scores_base = accuracy(haralicks, labels) 100 | scores_chist = accuracy(chists, labels) 101 | scores_lbps = accuracy(lbps, labels) 102 | 103 | combined = hstack([chists, haralicks]) 104 | scores_combined = accuracy(combined, labels) 105 | 106 | combined_all = hstack([chists, haralicks, lbps]) 107 | scores_combined_all = accuracy(combined_all, labels) 108 | 109 | print_results([ 110 | ('base', scores_base), 111 | ('chists', scores_chist), 112 | ('lbps', scores_lbps), 113 | ('combined' , scores_combined), 114 | ('combined_all' , scores_combined_all), 115 | ]) 116 | 117 | -------------------------------------------------------------------------------- /ch12/jugfile.py: -------------------------------------------------------------------------------- 1 | # This code is supporting material for the book 2 | # Building Machine Learning Systems with Python 3 | # by Willi Richert and Luis Pedro Coelho 4 | # published by PACKT Publishing 5 | # 6 | # It is made available under the MIT License 7 | 8 | from jug import TaskGenerator 9 | from time import sleep 10 | 11 | 12 | @TaskGenerator 13 | def double(x): 14 | sleep(4) 15 | return 2 * x 16 | 17 | 18 | @TaskGenerator 19 | def add(a, b): 20 | return a + b 21 | 22 | 23 | @TaskGenerator 24 | def print_final_result(oname, value): 25 | with open(oname, 'w') as output: 26 | output.write("Final result: {0}\n".format(value)) 27 | 28 | input = 2 29 | y = double(input) 30 | z = double(y) 31 | 32 | y2 = double(7) 33 | z2 = double(y2) 34 | print_final_result('output.txt', add(z, z2)) 35 | -------------------------------------------------------------------------------- /ch12/run-image-classification.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | jug execute image-classification.py 4 | -------------------------------------------------------------------------------- /ch12/run-jugfile.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | jug execute 4 | 5 | -------------------------------------------------------------------------------- /ch12/setup-aws.txt: -------------------------------------------------------------------------------- 1 | sudo yum update 2 | sudo yum -y install python-devel python-pip numpy scipy python-matplotlib 3 | sudo yum -y install 
gcc-c++ 4 | sudo yum -y install git 5 | sudo pip-python install -U pip 6 | sudo pip install scikit-learn jug mahotas 7 | 8 | --------------------------------------------------------------------------------