├── .gitignore ├── README.md ├── data ├── test.zip └── train.zip ├── exercises ├── __init__.py ├── test_exercise1.py ├── test_exercise2.py ├── test_exercise3.py └── test_exercise4.py ├── notebooks └── tutorial.ipynb ├── requirements.txt ├── run.py ├── src ├── __init__.py ├── model.py └── transformers.py └── tests ├── __init__.py ├── test_country_transformer_pytest.py ├── test_country_transformer_unittest.py ├── test_transformers.py ├── test_transformers_hypothesis.py ├── test_transformers_mocking.py └── test_transformers_parameterised.py
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Ignore local virtualenvs
lib/
bin/
include/
.Python/
*.pickle
*.joblib
.ipynb_checkpoints/
*.pyc
*.pyo
*.swp
*.class
*.orig
*~
.hypothesis/

# autogenerated
src/_pytest/_version.py
# setuptools
.eggs/

doc/*/_build
doc/*/.doctrees
build/
dist/
*.egg-info
issue/
env/
.env/
3rdparty/
.tox
.cache
.pytest_cache
.coverage
.coverage.*
coverage.xml
.ropeproject
.idea
.hypothesis
.pydevproject
.project
.settings
.vscode

# generated by pip
pip-wheel-metadata/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Testing for Data Scientists

## Packages to install
```
pip install -r requirements.txt
```

## Commands available
```
python run.py train
python run.py test
python run.py unittest
python run.py coverage
python run.py hypothesis
python run.py exercises
```
--------------------------------------------------------------------------------
/data/test.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cambridgespark/pydata-testing-for-data-science/5669d582659e54c6b54463b94fb1feb7a6b301aa/data/test.zip -------------------------------------------------------------------------------- /data/train.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cambridgespark/pydata-testing-for-data-science/5669d582659e54c6b54463b94fb1feb7a6b301aa/data/train.zip -------------------------------------------------------------------------------- /exercises/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cambridgespark/pydata-testing-for-data-science/5669d582659e54c6b54463b94fb1feb7a6b301aa/exercises/__init__.py -------------------------------------------------------------------------------- /exercises/test_exercise1.py: -------------------------------------------------------------------------------- 1 | """ 2 | This exercise is about writing a unit test using py test 3 | """ 4 | 5 | from src.transformers import CategoriesExtractor 6 | 7 | def test_extract_categories(): 8 | """ 9 | Write a unit test for CategoriesExtractor.extract_categories(json_string, False) 10 | :return: 11 | """ 12 | pass -------------------------------------------------------------------------------- /exercises/test_exercise2.py: -------------------------------------------------------------------------------- 1 | """ 2 | This exercise is about writing a parameterised unit test using pytest 3 | """ 4 | 5 | import pytest 6 | from src.transformers import TimeTransformer 7 | 8 | def test_time_transformer(sample_df, expected_df): 9 | """ 10 | Write a parameterised unit test for TimeTransformer 11 | :param sample_df: sample df to test with three columns: deadline, created_at, launched_at 12 | :param expected_df: result with two columns: launched_to_deadline, created_to_launched 13 | :return: 14 | """ 15 | pass 
-------------------------------------------------------------------------------- /exercises/test_exercise3.py: -------------------------------------------------------------------------------- 1 | """ 2 | This exercise is about refactoring a unit test to improve it's readability and maintenance 3 | """ 4 | import pandas as pd 5 | from src.transformers import CountryTransformer 6 | 7 | import pytest 8 | def test_correct_country_returned_with_simple_df(): 9 | """ 10 | Refactor this unit test to apply the Given/When/Then pattern 11 | :return: 12 | """ 13 | df = pd.DataFrame({'country': ["CA", "GB"]}) 14 | country_transformer = CountryTransformer() 15 | assert len(country_transformer.transform(df).index) == 2 16 | assert country_transformer.transform(df)["country"][0] == "Canada" 17 | assert country_transformer.transform(df)["country"][1] == "UK & Ireland" -------------------------------------------------------------------------------- /exercises/test_exercise4.py: -------------------------------------------------------------------------------- 1 | """ 2 | This exercise is about writing a property-based unit test using hypothesis 3 | """ 4 | 5 | from hypothesis import given 6 | from src.transformers import CategoriesExtractor 7 | 8 | import pytest 9 | def test_extract_categories(json_string): 10 | """ 11 | Use hypothesis to generate test cases for CategoriesExtractor.extract_categories. 12 | Think about an appropriate property to test against. 
13 | You should be able to find a bug and fix the implementation accordingly 14 | :param json_string: 15 | :return: 16 | """ 17 | 18 | pass 19 | 20 | -------------------------------------------------------------------------------- /notebooks/tutorial.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 41, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "pd.set_option('display.max_columns', 100)" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 42, 16 | "metadata": {}, 17 | "outputs": [ 18 | { 19 | "name": "stderr", 20 | "output_type": "stream", 21 | "text": [ 22 | "/Users/raoul/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2698: DtypeWarning: Columns (21,22,23,24) have mixed types. Specify dtype option on import or set low_memory=False.\n", 23 | " interactivity=interactivity, compiler=compiler, result=result)\n" 24 | ] 25 | } 26 | ], 27 | "source": [ 28 | "PATH = \"../data/train.zip\"\n", 29 | "df = pd.read_csv(PATH)" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 43, 35 | "metadata": { 36 | "scrolled": true 37 | }, 38 | "outputs": [ 39 | { 40 | "data": { 41 | "text/html": [ 42 | "
\n", 43 | "\n", 56 | "\n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 
212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | "
idphotonameblurbgoalslugdisable_communicationcountrycurrencycurrency_symbolcurrency_trailing_codedeadlinecreated_atlaunched_atstatic_usd_ratecreatorlocationcategoryprofileurlssource_urlfriendsis_starredis_backingpermissionsstate
0663816109{\"small\":\"https://ksr-ugc.imgix.net/assets/012...Angular - Where Modern Art meets CardsAngular is a minimalist card design for simpli...17380.0angular-where-modern-art-meets-cardsFalseUSUSD$True1459283229145584536314566948291.000000{\"urls\":{\"web\":{\"user\":\"https://www.kickstarte...{\"country\":\"US\",\"urls\":{\"web\":{\"discover\":\"htt...{\"urls\":{\"web\":{\"discover\":\"http://www.kicksta...{\"background_image_opacity\":0.8,\"should_show_f...{\"web\":{\"project\":\"https://www.kickstarter.com...https://www.kickstarter.com/discover/categorie...NaNNaNNaNNaNfailed
11462931821{\"small\":\"https://ksr-ugc.imgix.net/assets/014...Ladybeard is KAWAII-COREOriginal songs and music videos to jump start ...24000.0ladybeard-is-kawaii-coreFalseUSUSD$True1484110800147556886814809464541.000000{\"urls\":{\"web\":{\"user\":\"https://www.kickstarte...{\"country\":\"JP\",\"urls\":{\"web\":{\"discover\":\"htt...{\"urls\":{\"web\":{\"discover\":\"http://www.kicksta...{\"background_image_opacity\":0.8,\"should_show_f...{\"web\":{\"project\":\"https://www.kickstarter.com...https://www.kickstarter.com/discover/categorie...NaNNaNNaNNaNfailed
21724358498{\"small\":\"https://ksr-ugc.imgix.net/assets/011...Vegan Cafe Delivery Service in Vancouver BCOur project is to launch a vegan lunch deliver...40000.0vegancafecaFalseCACAD$True1408549628140521888314059576280.926746{\"urls\":{\"web\":{\"user\":\"https://www.kickstarte...{\"country\":\"CA\",\"urls\":{\"web\":{\"discover\":\"htt...{\"urls\":{\"web\":{\"discover\":\"http://www.kicksta...{\"background_image_opacity\":0.8,\"should_show_f...{\"web\":{\"project\":\"https://www.kickstarter.com...https://www.kickstarter.com/discover/categorie...NaNNaNNaNNaNfailed
3314918941{\"small\":\"https://ksr-ugc.imgix.net/assets/011...Photoetched Rail Yard ExpositionI have developed a process of my own which tra...1000.0photoetched-rail-yard-expositionFalseUSUSD$True1364084914136062777813614965141.000000{\"urls\":{\"web\":{\"user\":\"https://www.kickstarte...{\"country\":\"US\",\"urls\":{\"web\":{\"discover\":\"htt...{\"urls\":{\"web\":{\"discover\":\"http://www.kicksta...{\"background_image_opacity\":0.8,\"should_show_f...{\"web\":{\"project\":\"https://www.kickstarter.com...https://www.kickstarter.com/discover/categorie...NaNNaNNaNNaNsuccessful
41766165140{\"small\":\"https://ksr-ugc.imgix.net/assets/011...Cinnamon Fletcher needs to be brought to life!Need to pay an illustrator to bring my childre...700.0cinnamon-fletcher-needs-to-be-brought-to-lifeFalseGBGBP£False1382600001137970450213800080011.602384{\"urls\":{\"web\":{\"user\":\"https://www.kickstarte...{\"country\":\"GB\",\"urls\":{\"web\":{\"discover\":\"htt...{\"urls\":{\"web\":{\"discover\":\"http://www.kicksta...{\"background_image_opacity\":0.8,\"should_show_f...{\"web\":{\"project\":\"https://www.kickstarter.com...https://www.kickstarter.com/discover/categorie...NaNNaNNaNNaNfailed
\n", 236 | "
" 237 | ], 238 | "text/plain": [ 239 | " id photo \\\n", 240 | "0 663816109 {\"small\":\"https://ksr-ugc.imgix.net/assets/012... \n", 241 | "1 1462931821 {\"small\":\"https://ksr-ugc.imgix.net/assets/014... \n", 242 | "2 1724358498 {\"small\":\"https://ksr-ugc.imgix.net/assets/011... \n", 243 | "3 314918941 {\"small\":\"https://ksr-ugc.imgix.net/assets/011... \n", 244 | "4 1766165140 {\"small\":\"https://ksr-ugc.imgix.net/assets/011... \n", 245 | "\n", 246 | " name \\\n", 247 | "0 Angular - Where Modern Art meets Cards \n", 248 | "1 Ladybeard is KAWAII-CORE \n", 249 | "2 Vegan Cafe Delivery Service in Vancouver BC \n", 250 | "3 Photoetched Rail Yard Exposition \n", 251 | "4 Cinnamon Fletcher needs to be brought to life! \n", 252 | "\n", 253 | " blurb goal \\\n", 254 | "0 Angular is a minimalist card design for simpli... 17380.0 \n", 255 | "1 Original songs and music videos to jump start ... 24000.0 \n", 256 | "2 Our project is to launch a vegan lunch deliver... 40000.0 \n", 257 | "3 I have developed a process of my own which tra... 1000.0 \n", 258 | "4 Need to pay an illustrator to bring my childre... 
700.0 \n", 259 | "\n", 260 | " slug disable_communication \\\n", 261 | "0 angular-where-modern-art-meets-cards False \n", 262 | "1 ladybeard-is-kawaii-core False \n", 263 | "2 vegancafeca False \n", 264 | "3 photoetched-rail-yard-exposition False \n", 265 | "4 cinnamon-fletcher-needs-to-be-brought-to-life False \n", 266 | "\n", 267 | " country currency currency_symbol currency_trailing_code deadline \\\n", 268 | "0 US USD $ True 1459283229 \n", 269 | "1 US USD $ True 1484110800 \n", 270 | "2 CA CAD $ True 1408549628 \n", 271 | "3 US USD $ True 1364084914 \n", 272 | "4 GB GBP £ False 1382600001 \n", 273 | "\n", 274 | " created_at launched_at static_usd_rate \\\n", 275 | "0 1455845363 1456694829 1.000000 \n", 276 | "1 1475568868 1480946454 1.000000 \n", 277 | "2 1405218883 1405957628 0.926746 \n", 278 | "3 1360627778 1361496514 1.000000 \n", 279 | "4 1379704502 1380008001 1.602384 \n", 280 | "\n", 281 | " creator \\\n", 282 | "0 {\"urls\":{\"web\":{\"user\":\"https://www.kickstarte... \n", 283 | "1 {\"urls\":{\"web\":{\"user\":\"https://www.kickstarte... \n", 284 | "2 {\"urls\":{\"web\":{\"user\":\"https://www.kickstarte... \n", 285 | "3 {\"urls\":{\"web\":{\"user\":\"https://www.kickstarte... \n", 286 | "4 {\"urls\":{\"web\":{\"user\":\"https://www.kickstarte... \n", 287 | "\n", 288 | " location \\\n", 289 | "0 {\"country\":\"US\",\"urls\":{\"web\":{\"discover\":\"htt... \n", 290 | "1 {\"country\":\"JP\",\"urls\":{\"web\":{\"discover\":\"htt... \n", 291 | "2 {\"country\":\"CA\",\"urls\":{\"web\":{\"discover\":\"htt... \n", 292 | "3 {\"country\":\"US\",\"urls\":{\"web\":{\"discover\":\"htt... \n", 293 | "4 {\"country\":\"GB\",\"urls\":{\"web\":{\"discover\":\"htt... \n", 294 | "\n", 295 | " category \\\n", 296 | "0 {\"urls\":{\"web\":{\"discover\":\"http://www.kicksta... \n", 297 | "1 {\"urls\":{\"web\":{\"discover\":\"http://www.kicksta... \n", 298 | "2 {\"urls\":{\"web\":{\"discover\":\"http://www.kicksta... 
\n", 299 | "3 {\"urls\":{\"web\":{\"discover\":\"http://www.kicksta... \n", 300 | "4 {\"urls\":{\"web\":{\"discover\":\"http://www.kicksta... \n", 301 | "\n", 302 | " profile \\\n", 303 | "0 {\"background_image_opacity\":0.8,\"should_show_f... \n", 304 | "1 {\"background_image_opacity\":0.8,\"should_show_f... \n", 305 | "2 {\"background_image_opacity\":0.8,\"should_show_f... \n", 306 | "3 {\"background_image_opacity\":0.8,\"should_show_f... \n", 307 | "4 {\"background_image_opacity\":0.8,\"should_show_f... \n", 308 | "\n", 309 | " urls \\\n", 310 | "0 {\"web\":{\"project\":\"https://www.kickstarter.com... \n", 311 | "1 {\"web\":{\"project\":\"https://www.kickstarter.com... \n", 312 | "2 {\"web\":{\"project\":\"https://www.kickstarter.com... \n", 313 | "3 {\"web\":{\"project\":\"https://www.kickstarter.com... \n", 314 | "4 {\"web\":{\"project\":\"https://www.kickstarter.com... \n", 315 | "\n", 316 | " source_url friends is_starred \\\n", 317 | "0 https://www.kickstarter.com/discover/categorie... NaN NaN \n", 318 | "1 https://www.kickstarter.com/discover/categorie... NaN NaN \n", 319 | "2 https://www.kickstarter.com/discover/categorie... NaN NaN \n", 320 | "3 https://www.kickstarter.com/discover/categorie... NaN NaN \n", 321 | "4 https://www.kickstarter.com/discover/categorie... 
NaN NaN \n", 322 | "\n", 323 | " is_backing permissions state \n", 324 | "0 NaN NaN failed \n", 325 | "1 NaN NaN failed \n", 326 | "2 NaN NaN failed \n", 327 | "3 NaN NaN successful \n", 328 | "4 NaN NaN failed " 329 | ] 330 | }, 331 | "execution_count": 43, 332 | "metadata": {}, 333 | "output_type": "execute_result" 334 | } 335 | ], 336 | "source": [ 337 | "df.head()" 338 | ] 339 | }, 340 | { 341 | "cell_type": "markdown", 342 | "metadata": {}, 343 | "source": [ 344 | "# Pytruth: friendlier assertions" 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": 44, 350 | "metadata": {}, 351 | "outputs": [], 352 | "source": [ 353 | "from truth.truth import AssertThat" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": 45, 359 | "metadata": {}, 360 | "outputs": [], 361 | "source": [ 362 | "data = df.iloc[1]" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": 46, 368 | "metadata": {}, 369 | "outputs": [ 370 | { 371 | "data": { 372 | "text/plain": [ 373 | "id 1462931821\n", 374 | "photo {\"small\":\"https://ksr-ugc.imgix.net/assets/014...\n", 375 | "name Ladybeard is KAWAII-CORE\n", 376 | "blurb Original songs and music videos to jump start ...\n", 377 | "goal 24000\n", 378 | "slug ladybeard-is-kawaii-core\n", 379 | "disable_communication False\n", 380 | "country US\n", 381 | "currency USD\n", 382 | "currency_symbol $\n", 383 | "currency_trailing_code True\n", 384 | "deadline 1484110800\n", 385 | "created_at 1475568868\n", 386 | "launched_at 1480946454\n", 387 | "static_usd_rate 1\n", 388 | "creator {\"urls\":{\"web\":{\"user\":\"https://www.kickstarte...\n", 389 | "location {\"country\":\"JP\",\"urls\":{\"web\":{\"discover\":\"htt...\n", 390 | "category {\"urls\":{\"web\":{\"discover\":\"http://www.kicksta...\n", 391 | "profile {\"background_image_opacity\":0.8,\"should_show_f...\n", 392 | "urls {\"web\":{\"project\":\"https://www.kickstarter.com...\n", 393 | "source_url 
https://www.kickstarter.com/discover/categorie...\n", 394 | "friends NaN\n", 395 | "is_starred NaN\n", 396 | "is_backing NaN\n", 397 | "permissions NaN\n", 398 | "state failed\n", 399 | "Name: 1, dtype: object" 400 | ] 401 | }, 402 | "execution_count": 46, 403 | "metadata": {}, 404 | "output_type": "execute_result" 405 | } 406 | ], 407 | "source": [ 408 | "data" 409 | ] 410 | }, 411 | { 412 | "cell_type": "code", 413 | "execution_count": 48, 414 | "metadata": {}, 415 | "outputs": [], 416 | "source": [ 417 | "AssertThat(data[\"deadline\"]).IsNonZero()" 418 | ] 419 | }, 420 | { 421 | "cell_type": "code", 422 | "execution_count": 56, 423 | "metadata": { 424 | "scrolled": true 425 | }, 426 | "outputs": [], 427 | "source": [ 428 | "AssertThat(data[\"blurb\"]).Contains(\"songs\")" 429 | ] 430 | }, 431 | { 432 | "cell_type": "code", 433 | "execution_count": null, 434 | "metadata": {}, 435 | "outputs": [], 436 | "source": [] 437 | }, 438 | { 439 | "cell_type": "code", 440 | "execution_count": null, 441 | "metadata": {}, 442 | "outputs": [], 443 | "source": [] 444 | }, 445 | { 446 | "cell_type": "markdown", 447 | "metadata": {}, 448 | "source": [ 449 | "# Hypothesis" 450 | ] 451 | }, 452 | { 453 | "cell_type": "code", 454 | "execution_count": 57, 455 | "metadata": {}, 456 | "outputs": [], 457 | "source": [ 458 | "from hypothesis.strategies import text, lists" 459 | ] 460 | }, 461 | { 462 | "cell_type": "code", 463 | "execution_count": 75, 464 | "metadata": {}, 465 | "outputs": [ 466 | { 467 | "name": "stdout", 468 | "output_type": "stream", 469 | "text": [ 470 | "[]\n", 471 | "[]\n", 472 | "[]\n", 473 | "[]\n", 474 | "[]\n", 475 | "[]\n", 476 | "['\\x1c', '', '']\n", 477 | "['\\x14']\n", 478 | "[]\n", 479 | "[]\n" 480 | ] 481 | } 482 | ], 483 | "source": [ 484 | "for i in range(0, 10):\n", 485 | " print(lists(text()).example())" 486 | ] 487 | }, 488 | { 489 | "cell_type": "code", 490 | "execution_count": 59, 491 | "metadata": {}, 492 | "outputs": [], 493 | "source": [ 494 | 
"from hypothesis.extra.pandas import column, data_frames" 495 | ] 496 | }, 497 | { 498 | "cell_type": "code", 499 | "execution_count": 68, 500 | "metadata": {}, 501 | "outputs": [ 502 | { 503 | "data": { 504 | "text/html": [ 505 | "
\n", 506 | "\n", 519 | "\n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | "
goalstatic_usd_rate
0NaNinf
1NaNinf
2NaNinf
3NaNinf
4NaNinf
5NaNinf
6NaNinf
\n", 565 | "
" 566 | ], 567 | "text/plain": [ 568 | " goal static_usd_rate\n", 569 | "0 NaN inf\n", 570 | "1 NaN inf\n", 571 | "2 NaN inf\n", 572 | "3 NaN inf\n", 573 | "4 NaN inf\n", 574 | "5 NaN inf\n", 575 | "6 NaN inf" 576 | ] 577 | }, 578 | "execution_count": 68, 579 | "metadata": {}, 580 | "output_type": "execute_result" 581 | } 582 | ], 583 | "source": [ 584 | "data_frames([column('goal', dtype=float), column('static_usd_rate', dtype=float)]).example()" 585 | ] 586 | }, 587 | { 588 | "cell_type": "code", 589 | "execution_count": 61, 590 | "metadata": {}, 591 | "outputs": [], 592 | "source": [ 593 | "from hypothesis.strategies import fixed_dictionaries, text" 594 | ] 595 | }, 596 | { 597 | "cell_type": "code", 598 | "execution_count": 74, 599 | "metadata": {}, 600 | "outputs": [ 601 | { 602 | "name": "stdout", 603 | "output_type": "stream", 604 | "text": [ 605 | "{'slug': '\\U0005d73b&'}\n", 606 | "{'slug': ''}\n", 607 | "{'slug': ''}\n", 608 | "{'slug': ''}\n", 609 | "{'slug': ''}\n", 610 | "{'slug': ''}\n", 611 | "{'slug': \"\\x0e'\\U0006903c\"}\n", 612 | "{'slug': ''}\n", 613 | "{'slug': ''}\n", 614 | "{'slug': '.\\x00'}\n" 615 | ] 616 | } 617 | ], 618 | "source": [ 619 | "for i in range(0, 10): print(fixed_dictionaries({'slug':text()}).example())" 620 | ] 621 | }, 622 | { 623 | "cell_type": "code", 624 | "execution_count": 63, 625 | "metadata": {}, 626 | "outputs": [], 627 | "source": [ 628 | "from hypothesis.strategies import fixed_dictionaries, from_regex" 629 | ] 630 | }, 631 | { 632 | "cell_type": "code", 633 | "execution_count": 64, 634 | "metadata": {}, 635 | "outputs": [ 636 | { 637 | "data": { 638 | "text/plain": [ 639 | "'aba\\n'" 640 | ] 641 | }, 642 | "execution_count": 64, 643 | "metadata": {}, 644 | "output_type": "execute_result" 645 | } 646 | ], 647 | "source": [ 648 | "from_regex(\"^[abc]{3}$\").example()" 649 | ] 650 | }, 651 | { 652 | "cell_type": "code", 653 | "execution_count": 65, 654 | "metadata": {}, 655 | "outputs": [ 656 | { 657 | "data": { 
658 | "text/plain": [ 659 | "'{\"data\": \"acc\\\\n\"}'" 660 | ] 661 | }, 662 | "execution_count": 65, 663 | "metadata": {}, 664 | "output_type": "execute_result" 665 | } 666 | ], 667 | "source": [ 668 | "from hypothesis.strategies import fixed_dictionaries, from_regex\n", 669 | "fixed_dictionaries({'data': from_regex(\"^[abc]{3}$\")}).map(json.dumps).example()" 670 | ] 671 | }, 672 | { 673 | "cell_type": "code", 674 | "execution_count": null, 675 | "metadata": {}, 676 | "outputs": [], 677 | "source": [] 678 | } 679 | ], 680 | "metadata": { 681 | "kernelspec": { 682 | "display_name": "Python 3", 683 | "language": "python", 684 | "name": "python3" 685 | }, 686 | "language_info": { 687 | "codemirror_mode": { 688 | "name": "ipython", 689 | "version": 3 690 | }, 691 | "file_extension": ".py", 692 | "mimetype": "text/x-python", 693 | "name": "python", 694 | "nbconvert_exporter": "python", 695 | "pygments_lexer": "ipython3", 696 | "version": "3.6.8" 697 | } 698 | }, 699 | "nbformat": 4, 700 | "nbformat_minor": 2 701 | } 702 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | scikit-learn==0.21.2 2 | pandas==0.24.2 3 | numpy==1.16.4 4 | pytest 5 | pytruth 6 | pytest-cov 7 | hypothesis[pandas] 8 | requests 9 | -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import joblib 4 | import urllib.request 5 | 6 | import pandas as pd 7 | import sys 8 | 9 | import pytest 10 | 11 | from src.model import KickstarterModel as Model 12 | 13 | TRAIN_NAME = "train.zip" 14 | TEST_NAME = "test.zip" 15 | 16 | DATA_DIR = "data" 17 | JOBLIB_NAME = 'model.joblib' 18 | 19 | 20 | def train_model(): 21 | df = pd.read_csv(os.sep.join([DATA_DIR, TRAIN_NAME])) 22 | 23 | my_model = Model() 24 | X_train, y_train = 
my_model.preprocess_training_data(df) 25 | my_model.fit(X_train, y_train) 26 | 27 | # Save JOB 28 | joblib.dump(my_model, JOBLIB_NAME) 29 | 30 | 31 | def test_model(): 32 | df = pd.read_csv(os.sep.join([DATA_DIR, TEST_NAME])) 33 | 34 | # Load JOB 35 | my_model = joblib.load(JOBLIB_NAME) 36 | 37 | X_test = my_model.preprocess_unseen_data(df) 38 | preds = my_model.predict(X_test) 39 | print("### Your predictions ###") 40 | print(preds) 41 | 42 | 43 | def main(): 44 | parser = argparse.ArgumentParser( 45 | description="A command line-tool to manage the project.") 46 | parser.add_argument( 47 | 'stage', 48 | metavar='stage', 49 | type=str, 50 | choices=['train', 'test', 'unittest', 'coverage', 'hypothesis', 'exercises'], 51 | help="Stage to run. Either train, test, unittest, coverage, hypothesis or exercises") 52 | 53 | if len(sys.argv[1:]) == 0: 54 | parser.print_help() 55 | parser.exit() 56 | 57 | stage = parser.parse_args().stage 58 | 59 | if stage == "train": 60 | print("Training model...") 61 | train_model() 62 | 63 | elif stage == "test": 64 | print("Testing model...") 65 | test_model() 66 | 67 | elif stage == "unittest": 68 | print("Unittesting model...") 69 | pytest.main(['-v', 'tests']) 70 | 71 | elif stage == "coverage": 72 | print("Running coverage...") 73 | pytest.main(['--cov-report', 'term-missing', '--cov=src/', 'tests/']) 74 | 75 | elif stage == "hypothesis": 76 | print("Running hypothesis...") 77 | pytest.main(['-v', '--hypothesis-show-statistics', 'tests/test_transformers_hypothesis.py']) 78 | 79 | elif stage == "exercises": 80 | print("Running the exercises...") 81 | pytest.main(['-v', 'exercises']) 82 | 83 | if __name__ == "__main__": 84 | main() 85 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/cambridgespark/pydata-testing-for-data-science/5669d582659e54c6b54463b94fb1feb7a6b301aa/src/__init__.py -------------------------------------------------------------------------------- /src/model.py: -------------------------------------------------------------------------------- 1 | from sklearn.compose import ColumnTransformer 2 | from sklearn.linear_model import LogisticRegression 3 | from sklearn.pipeline import Pipeline 4 | from sklearn.preprocessing import OneHotEncoder, StandardScaler 5 | 6 | from src.transformers import CategoriesExtractor, CountryTransformer, GoalAdjustor, TimeTransformer 7 | 8 | 9 | class KickstarterModel: 10 | 11 | # Update parameters here after re-tuning the model 12 | params = {"penalty": "l1", "C": 1.7, "solver": "liblinear"} 13 | 14 | def __init__(self): 15 | 16 | self.model = None 17 | self.preprocessor = None 18 | 19 | def preprocess_training_data(self, df): 20 | # Processor for categories with one-hot encoding 21 | cat_processor = Pipeline([("extractor", CategoriesExtractor()), 22 | ("one_hot", 23 | OneHotEncoder(sparse=False, 24 | handle_unknown="ignore"))]) 25 | 26 | # Processor for countries with one-hot encoding 27 | country_processor = Pipeline([("transfomer", CountryTransformer()), 28 | ("one_hot", 29 | OneHotEncoder(sparse=False, 30 | handle_unknown="ignore"))]) 31 | 32 | # First level of column specific transformations 33 | col_transformer = ColumnTransformer([ 34 | ("goal", GoalAdjustor(), ["goal", "static_usd_rate"]), 35 | ("categories", cat_processor, ["category"]), 36 | ("disable_communication", "passthrough", ["disable_communication"]), 37 | ("time", TimeTransformer(), 38 | ["deadline", "created_at", "launched_at"]), 39 | ("countries", country_processor, ["country"]) 40 | ]) 41 | 42 | # Add a scaling stage 43 | self.preprocessor = Pipeline([("col_transformer", col_transformer), 44 | ("scaler", StandardScaler())]) 45 | 46 | # Return X_train and y_train 47 | X_train = 
self.preprocessor.fit_transform(df.drop("state", axis=1)) 48 | y_train = df.state.map({"failed": 0, "successful": 1}) 49 | 50 | return X_train, y_train 51 | 52 | def fit(self, X, y): 53 | self.model = LogisticRegression(**self.params) 54 | self.model.fit(X, y) 55 | 56 | def preprocess_unseen_data(self, df): 57 | X_test = self.preprocessor.transform(df) 58 | return X_test 59 | 60 | def predict(self, X): 61 | 62 | return self.model.predict(X) 63 | -------------------------------------------------------------------------------- /src/transformers.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pandas as pd 3 | import requests 4 | from sklearn.base import BaseEstimator, TransformerMixin 5 | 6 | class CategoriesExtractor(BaseEstimator, TransformerMixin): 7 | """Extract Categories from json string. 8 | 9 | By default it will only keep the hardcoded categories defined below 10 | to avoid having too many dummies.""" 11 | 12 | misc = "misc" 13 | gen_cats = ["music", "film & video", "publishing", "art", "games"] 14 | precise_cats = [ 15 | "rock", "fiction", "webseries", "indie rock", "children's books", 16 | "shorts", "documentary", "video games" 17 | ] 18 | 19 | @classmethod 20 | def extract_categories(cls, json_string, validate=True): 21 | categories = json.loads(json_string).get("slug", "/").split("/") 22 | 23 | # Validate categories to keep only 24 | # the most common ones 25 | if validate: 26 | if categories[0] not in cls.gen_cats: 27 | categories[0] = cls.misc 28 | if categories[1] not in cls.precise_cats: 29 | categories[1] = cls.misc 30 | 31 | return categories 32 | 33 | def fit(self, X, y=None): 34 | return self 35 | 36 | def transform(self, X): 37 | categories = X["category"] 38 | return pd.DataFrame({ 39 | "gen_cat": categories.apply(lambda x: self.extract_categories(x)[0]), 40 | "precise_cat": categories.apply(lambda x: self.extract_categories(x)[1]) 41 | }) 42 | 43 | 44 | class 
class CountryFullTransformer(BaseEstimator, TransformerMixin):
    """Transform countries into larger groups to avoid having
    too many dummies.

    Unlike CountryTransformer's static mapping, this looks regions up
    live via the REST Countries web API (one HTTP request per value)."""

    def getRegionFromCode(self, country):
        """Look up *country* via the REST Countries API and return its region.

        :param country: country name understood by the API's /name/ endpoint.
        :return: the "region" string reported by the API (e.g. "Europe").
        :raises requests.HTTPError: if the HTTP lookup fails.
        """
        url = f"https://restcountries.eu/rest/v2/name/{country}"

        # Bug fix: requests.get returns a Response object, not a JSON
        # string, so passing it to json.loads raised TypeError. Use the
        # Response.json() helper and fail loudly on HTTP errors.
        response = requests.get(url)
        response.raise_for_status()
        result = response.json()

        # The /name/ endpoint returns a list of candidate matches —
        # take the first (best) match before reading its region.
        if isinstance(result, list):
            result = result[0]
        return result["region"]

    def fit(self, X, y=None):
        """No-op: this transformer keeps no fitted state."""
        return self

    def transform(self, X):
        """Map every entry of X.country to its API-reported region."""
        return pd.DataFrame({"country": X.country.map(self.getRegionFromCode)})
def test_time_transformer():
    """TimeTransformer turns raw epoch timestamps into day-count features."""
    # Realistic Kickstarter-style epoch timestamps (seconds).
    timestamps = {
        "deadline": [1459283229],
        "created_at": [1455845363],
        "launched_at": [1456694829],
    }
    sample_df = pd.DataFrame(timestamps)
    expected_df = pd.DataFrame({
        "launched_to_deadline": [29],
        "created_to_launched": [9],
    })

    result_df = TimeTransformer().transform(sample_df)

    assert_frame_equal(result_df, expected_df)
@given(data_frames([column('goal', dtype=float), column('static_usd_rate', dtype=float)]))
def test_goal_adjustor(sample_df):
    """Property: adjusting goals never changes the number of rows."""
    result_df = GoalAdjustor().transform(sample_df)

    assert len(sample_df.index) == len(result_df.index)
# Each tuple pairs an input frame with the frame GoalAdjustor should return.
test_goal_transformer_testdata = [
    (pd.DataFrame({'goal': [5], 'static_usd_rate': [2]}), pd.DataFrame({'adjusted_goal': [10]})),
    (pd.DataFrame({'goal': [0], 'static_usd_rate': [1]}), pd.DataFrame({'adjusted_goal': [0]})),
    # The third case was an exact duplicate of the zero-goal case above;
    # exercise a fractional exchange rate instead for real extra coverage.
    (pd.DataFrame({'goal': [10], 'static_usd_rate': [0.5]}), pd.DataFrame({'adjusted_goal': [5.0]})),
]

@pytest.mark.parametrize("sample_df, expected_df", test_goal_transformer_testdata)
def test_goal_adjustor(sample_df, expected_df):
    """GoalAdjustor multiplies goal by static_usd_rate into adjusted_goal."""
    adjustor = GoalAdjustor()

    result_df = adjustor.transform(sample_df)
    assert_frame_equal(result_df, expected_df)