├── .gitignore ├── README.md ├── data ├── test.zip └── train.zip ├── exercises ├── __init__.py ├── test_exercise1.py ├── test_exercise2.py ├── test_exercise3.py └── test_exercise4.py ├── notebooks └── tutorial.ipynb ├── requirements.txt ├── run.py ├── src ├── __init__.py ├── model.py └── transformers.py └── tests ├── __init__.py ├── test_country_transformer_pytest.py ├── test_country_transformer_unittest.py ├── test_transformers.py ├── test_transformers_hypothesis.py ├── test_transformers_mocking.py └── test_transformers_parameterised.py
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Ignore local virtualenvs
lib/
bin/
include/
.Python/
*.pickle
*.joblib
.ipynb_checkpoints/
*.pyc
*.pyo
*.swp
*.class
*.orig
*~
.hypothesis/

# autogenerated
src/_pytest/_version.py
# setuptools
.eggs/

doc/*/_build
doc/*/.doctrees
build/
dist/
*.egg-info
issue/
env/
.env/
3rdparty/
.tox
.cache
.pytest_cache
.coverage
.coverage.*
coverage.xml
.ropeproject
.idea
.hypothesis
.pydevproject
.project
.settings
.vscode

# generated by pip
pip-wheel-metadata/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Testing for Data Scientists

## Packages to install
```
pip install -r requirements.txt
```

## Commands available
```
python run.py train
python run.py test
python run.py unittest
python run.py coverage
python run.py hypothesis
python run.py exercises
```
--------------------------------------------------------------------------------
/data/test.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cambridgespark/pydata-testing-for-data-science/5669d582659e54c6b54463b94fb1feb7a6b301aa/data/test.zip -------------------------------------------------------------------------------- /data/train.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cambridgespark/pydata-testing-for-data-science/5669d582659e54c6b54463b94fb1feb7a6b301aa/data/train.zip -------------------------------------------------------------------------------- /exercises/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cambridgespark/pydata-testing-for-data-science/5669d582659e54c6b54463b94fb1feb7a6b301aa/exercises/__init__.py -------------------------------------------------------------------------------- /exercises/test_exercise1.py: -------------------------------------------------------------------------------- 1 | """ 2 | This exercise is about writing a unit test using py test 3 | """ 4 | 5 | from src.transformers import CategoriesExtractor 6 | 7 | def test_extract_categories(): 8 | """ 9 | Write a unit test for CategoriesExtractor.extract_categories(json_string, False) 10 | :return: 11 | """ 12 | pass -------------------------------------------------------------------------------- /exercises/test_exercise2.py: -------------------------------------------------------------------------------- 1 | """ 2 | This exercise is about writing a parameterised unit test using pytest 3 | """ 4 | 5 | import pytest 6 | from src.transformers import TimeTransformer 7 | 8 | def test_time_transformer(sample_df, expected_df): 9 | """ 10 | Write a parameterised unit test for TimeTransformer 11 | :param sample_df: sample df to test with three columns: deadline, created_at, launched_at 12 | :param expected_df: result with two columns: launched_to_deadline, created_to_launched 13 | :return: 14 | """ 15 | pass 
-------------------------------------------------------------------------------- /exercises/test_exercise3.py: -------------------------------------------------------------------------------- 1 | """ 2 | This exercise is about refactoring a unit test to improve it's readability and maintenance 3 | """ 4 | import pandas as pd 5 | from src.transformers import CountryTransformer 6 | 7 | import pytest 8 | def test_correct_country_returned_with_simple_df(): 9 | """ 10 | Refactor this unit test to apply the Given/When/Then pattern 11 | :return: 12 | """ 13 | df = pd.DataFrame({'country': ["CA", "GB"]}) 14 | country_transformer = CountryTransformer() 15 | assert len(country_transformer.transform(df).index) == 2 16 | assert country_transformer.transform(df)["country"][0] == "Canada" 17 | assert country_transformer.transform(df)["country"][1] == "UK & Ireland" -------------------------------------------------------------------------------- /exercises/test_exercise4.py: -------------------------------------------------------------------------------- 1 | """ 2 | This exercise is about writing a property-based unit test using hypothesis 3 | """ 4 | 5 | from hypothesis import given 6 | from src.transformers import CategoriesExtractor 7 | 8 | import pytest 9 | def test_extract_categories(json_string): 10 | """ 11 | Use hypothesis to generate test cases for CategoriesExtractor.extract_categories. 12 | Think about an appropriate property to test against. 
13 | You should be able to find a bug and fix the implementation accordingly 14 | :param json_string: 15 | :return: 16 | """ 17 | 18 | pass 19 | 20 | -------------------------------------------------------------------------------- /notebooks/tutorial.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 41, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "pd.set_option('display.max_columns', 100)" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 42, 16 | "metadata": {}, 17 | "outputs": [ 18 | { 19 | "name": "stderr", 20 | "output_type": "stream", 21 | "text": [ 22 | "/Users/raoul/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2698: DtypeWarning: Columns (21,22,23,24) have mixed types. Specify dtype option on import or set low_memory=False.\n", 23 | " interactivity=interactivity, compiler=compiler, result=result)\n" 24 | ] 25 | } 26 | ], 27 | "source": [ 28 | "PATH = \"../data/train.zip\"\n", 29 | "df = pd.read_csv(PATH)" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 43, 35 | "metadata": { 36 | "scrolled": true 37 | }, 38 | "outputs": [ 39 | { 40 | "data": { 41 | "text/html": [ 42 | "
\n", 43 | "\n", 56 | "\n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 
212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | "
idphotonameblurbgoalslugdisable_communicationcountrycurrencycurrency_symbolcurrency_trailing_codedeadlinecreated_atlaunched_atstatic_usd_ratecreatorlocationcategoryprofileurlssource_urlfriendsis_starredis_backingpermissionsstate
0663816109{\"small\":\"https://ksr-ugc.imgix.net/assets/012...Angular - Where Modern Art meets CardsAngular is a minimalist card design for simpli...17380.0angular-where-modern-art-meets-cardsFalseUSUSD$True1459283229145584536314566948291.000000{\"urls\":{\"web\":{\"user\":\"https://www.kickstarte...{\"country\":\"US\",\"urls\":{\"web\":{\"discover\":\"htt...{\"urls\":{\"web\":{\"discover\":\"http://www.kicksta...{\"background_image_opacity\":0.8,\"should_show_f...{\"web\":{\"project\":\"https://www.kickstarter.com...https://www.kickstarter.com/discover/categorie...NaNNaNNaNNaNfailed
11462931821{\"small\":\"https://ksr-ugc.imgix.net/assets/014...Ladybeard is KAWAII-COREOriginal songs and music videos to jump start ...24000.0ladybeard-is-kawaii-coreFalseUSUSD$True1484110800147556886814809464541.000000{\"urls\":{\"web\":{\"user\":\"https://www.kickstarte...{\"country\":\"JP\",\"urls\":{\"web\":{\"discover\":\"htt...{\"urls\":{\"web\":{\"discover\":\"http://www.kicksta...{\"background_image_opacity\":0.8,\"should_show_f...{\"web\":{\"project\":\"https://www.kickstarter.com...https://www.kickstarter.com/discover/categorie...NaNNaNNaNNaNfailed
21724358498{\"small\":\"https://ksr-ugc.imgix.net/assets/011...Vegan Cafe Delivery Service in Vancouver BCOur project is to launch a vegan lunch deliver...40000.0vegancafecaFalseCACAD$True1408549628140521888314059576280.926746{\"urls\":{\"web\":{\"user\":\"https://www.kickstarte...{\"country\":\"CA\",\"urls\":{\"web\":{\"discover\":\"htt...{\"urls\":{\"web\":{\"discover\":\"http://www.kicksta...{\"background_image_opacity\":0.8,\"should_show_f...{\"web\":{\"project\":\"https://www.kickstarter.com...https://www.kickstarter.com/discover/categorie...NaNNaNNaNNaNfailed
3314918941{\"small\":\"https://ksr-ugc.imgix.net/assets/011...Photoetched Rail Yard ExpositionI have developed a process of my own which tra...1000.0photoetched-rail-yard-expositionFalseUSUSD$True1364084914136062777813614965141.000000{\"urls\":{\"web\":{\"user\":\"https://www.kickstarte...{\"country\":\"US\",\"urls\":{\"web\":{\"discover\":\"htt...{\"urls\":{\"web\":{\"discover\":\"http://www.kicksta...{\"background_image_opacity\":0.8,\"should_show_f...{\"web\":{\"project\":\"https://www.kickstarter.com...https://www.kickstarter.com/discover/categorie...NaNNaNNaNNaNsuccessful
41766165140{\"small\":\"https://ksr-ugc.imgix.net/assets/011...Cinnamon Fletcher needs to be brought to life!Need to pay an illustrator to bring my childre...700.0cinnamon-fletcher-needs-to-be-brought-to-lifeFalseGBGBP£False1382600001137970450213800080011.602384{\"urls\":{\"web\":{\"user\":\"https://www.kickstarte...{\"country\":\"GB\",\"urls\":{\"web\":{\"discover\":\"htt...{\"urls\":{\"web\":{\"discover\":\"http://www.kicksta...{\"background_image_opacity\":0.8,\"should_show_f...{\"web\":{\"project\":\"https://www.kickstarter.com...https://www.kickstarter.com/discover/categorie...NaNNaNNaNNaNfailed
\n", 236 | "
" 237 | ], 238 | "text/plain": [ 239 | " id photo \\\n", 240 | "0 663816109 {\"small\":\"https://ksr-ugc.imgix.net/assets/012... \n", 241 | "1 1462931821 {\"small\":\"https://ksr-ugc.imgix.net/assets/014... \n", 242 | "2 1724358498 {\"small\":\"https://ksr-ugc.imgix.net/assets/011... \n", 243 | "3 314918941 {\"small\":\"https://ksr-ugc.imgix.net/assets/011... \n", 244 | "4 1766165140 {\"small\":\"https://ksr-ugc.imgix.net/assets/011... \n", 245 | "\n", 246 | " name \\\n", 247 | "0 Angular - Where Modern Art meets Cards \n", 248 | "1 Ladybeard is KAWAII-CORE \n", 249 | "2 Vegan Cafe Delivery Service in Vancouver BC \n", 250 | "3 Photoetched Rail Yard Exposition \n", 251 | "4 Cinnamon Fletcher needs to be brought to life! \n", 252 | "\n", 253 | " blurb goal \\\n", 254 | "0 Angular is a minimalist card design for simpli... 17380.0 \n", 255 | "1 Original songs and music videos to jump start ... 24000.0 \n", 256 | "2 Our project is to launch a vegan lunch deliver... 40000.0 \n", 257 | "3 I have developed a process of my own which tra... 1000.0 \n", 258 | "4 Need to pay an illustrator to bring my childre... 
700.0 \n", 259 | "\n", 260 | " slug disable_communication \\\n", 261 | "0 angular-where-modern-art-meets-cards False \n", 262 | "1 ladybeard-is-kawaii-core False \n", 263 | "2 vegancafeca False \n", 264 | "3 photoetched-rail-yard-exposition False \n", 265 | "4 cinnamon-fletcher-needs-to-be-brought-to-life False \n", 266 | "\n", 267 | " country currency currency_symbol currency_trailing_code deadline \\\n", 268 | "0 US USD $ True 1459283229 \n", 269 | "1 US USD $ True 1484110800 \n", 270 | "2 CA CAD $ True 1408549628 \n", 271 | "3 US USD $ True 1364084914 \n", 272 | "4 GB GBP £ False 1382600001 \n", 273 | "\n", 274 | " created_at launched_at static_usd_rate \\\n", 275 | "0 1455845363 1456694829 1.000000 \n", 276 | "1 1475568868 1480946454 1.000000 \n", 277 | "2 1405218883 1405957628 0.926746 \n", 278 | "3 1360627778 1361496514 1.000000 \n", 279 | "4 1379704502 1380008001 1.602384 \n", 280 | "\n", 281 | " creator \\\n", 282 | "0 {\"urls\":{\"web\":{\"user\":\"https://www.kickstarte... \n", 283 | "1 {\"urls\":{\"web\":{\"user\":\"https://www.kickstarte... \n", 284 | "2 {\"urls\":{\"web\":{\"user\":\"https://www.kickstarte... \n", 285 | "3 {\"urls\":{\"web\":{\"user\":\"https://www.kickstarte... \n", 286 | "4 {\"urls\":{\"web\":{\"user\":\"https://www.kickstarte... \n", 287 | "\n", 288 | " location \\\n", 289 | "0 {\"country\":\"US\",\"urls\":{\"web\":{\"discover\":\"htt... \n", 290 | "1 {\"country\":\"JP\",\"urls\":{\"web\":{\"discover\":\"htt... \n", 291 | "2 {\"country\":\"CA\",\"urls\":{\"web\":{\"discover\":\"htt... \n", 292 | "3 {\"country\":\"US\",\"urls\":{\"web\":{\"discover\":\"htt... \n", 293 | "4 {\"country\":\"GB\",\"urls\":{\"web\":{\"discover\":\"htt... \n", 294 | "\n", 295 | " category \\\n", 296 | "0 {\"urls\":{\"web\":{\"discover\":\"http://www.kicksta... \n", 297 | "1 {\"urls\":{\"web\":{\"discover\":\"http://www.kicksta... \n", 298 | "2 {\"urls\":{\"web\":{\"discover\":\"http://www.kicksta... 
\n", 299 | "3 {\"urls\":{\"web\":{\"discover\":\"http://www.kicksta... \n", 300 | "4 {\"urls\":{\"web\":{\"discover\":\"http://www.kicksta... \n", 301 | "\n", 302 | " profile \\\n", 303 | "0 {\"background_image_opacity\":0.8,\"should_show_f... \n", 304 | "1 {\"background_image_opacity\":0.8,\"should_show_f... \n", 305 | "2 {\"background_image_opacity\":0.8,\"should_show_f... \n", 306 | "3 {\"background_image_opacity\":0.8,\"should_show_f... \n", 307 | "4 {\"background_image_opacity\":0.8,\"should_show_f... \n", 308 | "\n", 309 | " urls \\\n", 310 | "0 {\"web\":{\"project\":\"https://www.kickstarter.com... \n", 311 | "1 {\"web\":{\"project\":\"https://www.kickstarter.com... \n", 312 | "2 {\"web\":{\"project\":\"https://www.kickstarter.com... \n", 313 | "3 {\"web\":{\"project\":\"https://www.kickstarter.com... \n", 314 | "4 {\"web\":{\"project\":\"https://www.kickstarter.com... \n", 315 | "\n", 316 | " source_url friends is_starred \\\n", 317 | "0 https://www.kickstarter.com/discover/categorie... NaN NaN \n", 318 | "1 https://www.kickstarter.com/discover/categorie... NaN NaN \n", 319 | "2 https://www.kickstarter.com/discover/categorie... NaN NaN \n", 320 | "3 https://www.kickstarter.com/discover/categorie... NaN NaN \n", 321 | "4 https://www.kickstarter.com/discover/categorie... 
NaN NaN \n", 322 | "\n", 323 | " is_backing permissions state \n", 324 | "0 NaN NaN failed \n", 325 | "1 NaN NaN failed \n", 326 | "2 NaN NaN failed \n", 327 | "3 NaN NaN successful \n", 328 | "4 NaN NaN failed " 329 | ] 330 | }, 331 | "execution_count": 43, 332 | "metadata": {}, 333 | "output_type": "execute_result" 334 | } 335 | ], 336 | "source": [ 337 | "df.head()" 338 | ] 339 | }, 340 | { 341 | "cell_type": "markdown", 342 | "metadata": {}, 343 | "source": [ 344 | "# Pytruth: friendlier assertions" 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": 44, 350 | "metadata": {}, 351 | "outputs": [], 352 | "source": [ 353 | "from truth.truth import AssertThat" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": 45, 359 | "metadata": {}, 360 | "outputs": [], 361 | "source": [ 362 | "data = df.iloc[1]" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": 46, 368 | "metadata": {}, 369 | "outputs": [ 370 | { 371 | "data": { 372 | "text/plain": [ 373 | "id 1462931821\n", 374 | "photo {\"small\":\"https://ksr-ugc.imgix.net/assets/014...\n", 375 | "name Ladybeard is KAWAII-CORE\n", 376 | "blurb Original songs and music videos to jump start ...\n", 377 | "goal 24000\n", 378 | "slug ladybeard-is-kawaii-core\n", 379 | "disable_communication False\n", 380 | "country US\n", 381 | "currency USD\n", 382 | "currency_symbol $\n", 383 | "currency_trailing_code True\n", 384 | "deadline 1484110800\n", 385 | "created_at 1475568868\n", 386 | "launched_at 1480946454\n", 387 | "static_usd_rate 1\n", 388 | "creator {\"urls\":{\"web\":{\"user\":\"https://www.kickstarte...\n", 389 | "location {\"country\":\"JP\",\"urls\":{\"web\":{\"discover\":\"htt...\n", 390 | "category {\"urls\":{\"web\":{\"discover\":\"http://www.kicksta...\n", 391 | "profile {\"background_image_opacity\":0.8,\"should_show_f...\n", 392 | "urls {\"web\":{\"project\":\"https://www.kickstarter.com...\n", 393 | "source_url 
https://www.kickstarter.com/discover/categorie...\n", 394 | "friends NaN\n", 395 | "is_starred NaN\n", 396 | "is_backing NaN\n", 397 | "permissions NaN\n", 398 | "state failed\n", 399 | "Name: 1, dtype: object" 400 | ] 401 | }, 402 | "execution_count": 46, 403 | "metadata": {}, 404 | "output_type": "execute_result" 405 | } 406 | ], 407 | "source": [ 408 | "data" 409 | ] 410 | }, 411 | { 412 | "cell_type": "code", 413 | "execution_count": 48, 414 | "metadata": {}, 415 | "outputs": [], 416 | "source": [ 417 | "AssertThat(data[\"deadline\"]).IsNonZero()" 418 | ] 419 | }, 420 | { 421 | "cell_type": "code", 422 | "execution_count": 56, 423 | "metadata": { 424 | "scrolled": true 425 | }, 426 | "outputs": [], 427 | "source": [ 428 | "AssertThat(data[\"blurb\"]).Contains(\"songs\")" 429 | ] 430 | }, 431 | { 432 | "cell_type": "code", 433 | "execution_count": null, 434 | "metadata": {}, 435 | "outputs": [], 436 | "source": [] 437 | }, 438 | { 439 | "cell_type": "code", 440 | "execution_count": null, 441 | "metadata": {}, 442 | "outputs": [], 443 | "source": [] 444 | }, 445 | { 446 | "cell_type": "markdown", 447 | "metadata": {}, 448 | "source": [ 449 | "# Hypothesis" 450 | ] 451 | }, 452 | { 453 | "cell_type": "code", 454 | "execution_count": 57, 455 | "metadata": {}, 456 | "outputs": [], 457 | "source": [ 458 | "from hypothesis.strategies import text, lists" 459 | ] 460 | }, 461 | { 462 | "cell_type": "code", 463 | "execution_count": 75, 464 | "metadata": {}, 465 | "outputs": [ 466 | { 467 | "name": "stdout", 468 | "output_type": "stream", 469 | "text": [ 470 | "[]\n", 471 | "[]\n", 472 | "[]\n", 473 | "[]\n", 474 | "[]\n", 475 | "[]\n", 476 | "['\\x1c', '', '']\n", 477 | "['\\x14']\n", 478 | "[]\n", 479 | "[]\n" 480 | ] 481 | } 482 | ], 483 | "source": [ 484 | "for i in range(0, 10):\n", 485 | " print(lists(text()).example())" 486 | ] 487 | }, 488 | { 489 | "cell_type": "code", 490 | "execution_count": 59, 491 | "metadata": {}, 492 | "outputs": [], 493 | "source": [ 494 | 
"from hypothesis.extra.pandas import column, data_frames" 495 | ] 496 | }, 497 | { 498 | "cell_type": "code", 499 | "execution_count": 68, 500 | "metadata": {}, 501 | "outputs": [ 502 | { 503 | "data": { 504 | "text/html": [ 505 | "
\n", 506 | "\n", 519 | "\n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | "
goalstatic_usd_rate
0NaNinf
1NaNinf
2NaNinf
3NaNinf
4NaNinf
5NaNinf
6NaNinf
\n", 565 | "
" 566 | ], 567 | "text/plain": [ 568 | " goal static_usd_rate\n", 569 | "0 NaN inf\n", 570 | "1 NaN inf\n", 571 | "2 NaN inf\n", 572 | "3 NaN inf\n", 573 | "4 NaN inf\n", 574 | "5 NaN inf\n", 575 | "6 NaN inf" 576 | ] 577 | }, 578 | "execution_count": 68, 579 | "metadata": {}, 580 | "output_type": "execute_result" 581 | } 582 | ], 583 | "source": [ 584 | "data_frames([column('goal', dtype=float), column('static_usd_rate', dtype=float)]).example()" 585 | ] 586 | }, 587 | { 588 | "cell_type": "code", 589 | "execution_count": 61, 590 | "metadata": {}, 591 | "outputs": [], 592 | "source": [ 593 | "from hypothesis.strategies import fixed_dictionaries, text" 594 | ] 595 | }, 596 | { 597 | "cell_type": "code", 598 | "execution_count": 74, 599 | "metadata": {}, 600 | "outputs": [ 601 | { 602 | "name": "stdout", 603 | "output_type": "stream", 604 | "text": [ 605 | "{'slug': '\\U0005d73b&'}\n", 606 | "{'slug': ''}\n", 607 | "{'slug': ''}\n", 608 | "{'slug': ''}\n", 609 | "{'slug': ''}\n", 610 | "{'slug': ''}\n", 611 | "{'slug': \"\\x0e'\\U0006903c\"}\n", 612 | "{'slug': ''}\n", 613 | "{'slug': ''}\n", 614 | "{'slug': '.\\x00'}\n" 615 | ] 616 | } 617 | ], 618 | "source": [ 619 | "for i in range(0, 10): print(fixed_dictionaries({'slug':text()}).example())" 620 | ] 621 | }, 622 | { 623 | "cell_type": "code", 624 | "execution_count": 63, 625 | "metadata": {}, 626 | "outputs": [], 627 | "source": [ 628 | "from hypothesis.strategies import fixed_dictionaries, from_regex" 629 | ] 630 | }, 631 | { 632 | "cell_type": "code", 633 | "execution_count": 64, 634 | "metadata": {}, 635 | "outputs": [ 636 | { 637 | "data": { 638 | "text/plain": [ 639 | "'aba\\n'" 640 | ] 641 | }, 642 | "execution_count": 64, 643 | "metadata": {}, 644 | "output_type": "execute_result" 645 | } 646 | ], 647 | "source": [ 648 | "from_regex(\"^[abc]{3}$\").example()" 649 | ] 650 | }, 651 | { 652 | "cell_type": "code", 653 | "execution_count": 65, 654 | "metadata": {}, 655 | "outputs": [ 656 | { 657 | "data": { 
658 | "text/plain": [ 659 | "'{\"data\": \"acc\\\\n\"}'" 660 | ] 661 | }, 662 | "execution_count": 65, 663 | "metadata": {}, 664 | "output_type": "execute_result" 665 | } 666 | ], 667 | "source": [ 668 | "from hypothesis.strategies import fixed_dictionaries, from_regex\n", 669 | "fixed_dictionaries({'data': from_regex(\"^[abc]{3}$\")}).map(json.dumps).example()" 670 | ] 671 | }, 672 | { 673 | "cell_type": "code", 674 | "execution_count": null, 675 | "metadata": {}, 676 | "outputs": [], 677 | "source": [] 678 | } 679 | ], 680 | "metadata": { 681 | "kernelspec": { 682 | "display_name": "Python 3", 683 | "language": "python", 684 | "name": "python3" 685 | }, 686 | "language_info": { 687 | "codemirror_mode": { 688 | "name": "ipython", 689 | "version": 3 690 | }, 691 | "file_extension": ".py", 692 | "mimetype": "text/x-python", 693 | "name": "python", 694 | "nbconvert_exporter": "python", 695 | "pygments_lexer": "ipython3", 696 | "version": "3.6.8" 697 | } 698 | }, 699 | "nbformat": 4, 700 | "nbformat_minor": 2 701 | } 702 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | scikit-learn==0.21.2 2 | pandas==0.24.2 3 | numpy==1.16.4 4 | pytest 5 | pytruth 6 | pytest-cov 7 | hypothesis[pandas] 8 | requests 9 | -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import joblib 4 | import urllib.request 5 | 6 | import pandas as pd 7 | import sys 8 | 9 | import pytest 10 | 11 | from src.model import KickstarterModel as Model 12 | 13 | TRAIN_NAME = "train.zip" 14 | TEST_NAME = "test.zip" 15 | 16 | DATA_DIR = "data" 17 | JOBLIB_NAME = 'model.joblib' 18 | 19 | 20 | def train_model(): 21 | df = pd.read_csv(os.sep.join([DATA_DIR, TRAIN_NAME])) 22 | 23 | my_model = Model() 24 | X_train, y_train = 
my_model.preprocess_training_data(df) 25 | my_model.fit(X_train, y_train) 26 | 27 | # Save JOB 28 | joblib.dump(my_model, JOBLIB_NAME) 29 | 30 | 31 | def test_model(): 32 | df = pd.read_csv(os.sep.join([DATA_DIR, TEST_NAME])) 33 | 34 | # Load JOB 35 | my_model = joblib.load(JOBLIB_NAME) 36 | 37 | X_test = my_model.preprocess_unseen_data(df) 38 | preds = my_model.predict(X_test) 39 | print("### Your predictions ###") 40 | print(preds) 41 | 42 | 43 | def main(): 44 | parser = argparse.ArgumentParser( 45 | description="A command line-tool to manage the project.") 46 | parser.add_argument( 47 | 'stage', 48 | metavar='stage', 49 | type=str, 50 | choices=['train', 'test', 'unittest', 'coverage', 'hypothesis', 'exercises'], 51 | help="Stage to run. Either train, test, unittest, coverage, hypothesis or exercises") 52 | 53 | if len(sys.argv[1:]) == 0: 54 | parser.print_help() 55 | parser.exit() 56 | 57 | stage = parser.parse_args().stage 58 | 59 | if stage == "train": 60 | print("Training model...") 61 | train_model() 62 | 63 | elif stage == "test": 64 | print("Testing model...") 65 | test_model() 66 | 67 | elif stage == "unittest": 68 | print("Unittesting model...") 69 | pytest.main(['-v', 'tests']) 70 | 71 | elif stage == "coverage": 72 | print("Running coverage...") 73 | pytest.main(['--cov-report', 'term-missing', '--cov=src/', 'tests/']) 74 | 75 | elif stage == "hypothesis": 76 | print("Running hypothesis...") 77 | pytest.main(['-v', '--hypothesis-show-statistics', 'tests/test_transformers_hypothesis.py']) 78 | 79 | elif stage == "exercises": 80 | print("Running the exercises...") 81 | pytest.main(['-v', 'exercises']) 82 | 83 | if __name__ == "__main__": 84 | main() 85 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/cambridgespark/pydata-testing-for-data-science/5669d582659e54c6b54463b94fb1feb7a6b301aa/src/__init__.py -------------------------------------------------------------------------------- /src/model.py: -------------------------------------------------------------------------------- 1 | from sklearn.compose import ColumnTransformer 2 | from sklearn.linear_model import LogisticRegression 3 | from sklearn.pipeline import Pipeline 4 | from sklearn.preprocessing import OneHotEncoder, StandardScaler 5 | 6 | from src.transformers import CategoriesExtractor, CountryTransformer, GoalAdjustor, TimeTransformer 7 | 8 | 9 | class KickstarterModel: 10 | 11 | # Update parameters here after re-tuning the model 12 | params = {"penalty": "l1", "C": 1.7, "solver": "liblinear"} 13 | 14 | def __init__(self): 15 | 16 | self.model = None 17 | self.preprocessor = None 18 | 19 | def preprocess_training_data(self, df): 20 | # Processor for categories with one-hot encoding 21 | cat_processor = Pipeline([("extractor", CategoriesExtractor()), 22 | ("one_hot", 23 | OneHotEncoder(sparse=False, 24 | handle_unknown="ignore"))]) 25 | 26 | # Processor for countries with one-hot encoding 27 | country_processor = Pipeline([("transfomer", CountryTransformer()), 28 | ("one_hot", 29 | OneHotEncoder(sparse=False, 30 | handle_unknown="ignore"))]) 31 | 32 | # First level of column specific transformations 33 | col_transformer = ColumnTransformer([ 34 | ("goal", GoalAdjustor(), ["goal", "static_usd_rate"]), 35 | ("categories", cat_processor, ["category"]), 36 | ("disable_communication", "passthrough", ["disable_communication"]), 37 | ("time", TimeTransformer(), 38 | ["deadline", "created_at", "launched_at"]), 39 | ("countries", country_processor, ["country"]) 40 | ]) 41 | 42 | # Add a scaling stage 43 | self.preprocessor = Pipeline([("col_transformer", col_transformer), 44 | ("scaler", StandardScaler())]) 45 | 46 | # Return X_train and y_train 47 | X_train = 
self.preprocessor.fit_transform(df.drop("state", axis=1)) 48 | y_train = df.state.map({"failed": 0, "successful": 1}) 49 | 50 | return X_train, y_train 51 | 52 | def fit(self, X, y): 53 | self.model = LogisticRegression(**self.params) 54 | self.model.fit(X, y) 55 | 56 | def preprocess_unseen_data(self, df): 57 | X_test = self.preprocessor.transform(df) 58 | return X_test 59 | 60 | def predict(self, X): 61 | 62 | return self.model.predict(X) 63 | -------------------------------------------------------------------------------- /src/transformers.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pandas as pd 3 | import requests 4 | from sklearn.base import BaseEstimator, TransformerMixin 5 | 6 | class CategoriesExtractor(BaseEstimator, TransformerMixin): 7 | """Extract Categories from json string. 8 | 9 | By default it will only keep the hardcoded categories defined below 10 | to avoid having too many dummies.""" 11 | 12 | misc = "misc" 13 | gen_cats = ["music", "film & video", "publishing", "art", "games"] 14 | precise_cats = [ 15 | "rock", "fiction", "webseries", "indie rock", "children's books", 16 | "shorts", "documentary", "video games" 17 | ] 18 | 19 | @classmethod 20 | def extract_categories(cls, json_string, validate=True): 21 | categories = json.loads(json_string).get("slug", "/").split("/") 22 | 23 | # Validate categories to keep only 24 | # the most common ones 25 | if validate: 26 | if categories[0] not in cls.gen_cats: 27 | categories[0] = cls.misc 28 | if categories[1] not in cls.precise_cats: 29 | categories[1] = cls.misc 30 | 31 | return categories 32 | 33 | def fit(self, X, y=None): 34 | return self 35 | 36 | def transform(self, X): 37 | categories = X["category"] 38 | return pd.DataFrame({ 39 | "gen_cat": categories.apply(lambda x: self.extract_categories(x)[0]), 40 | "precise_cat": categories.apply(lambda x: self.extract_categories(x)[1]) 41 | }) 42 | 43 | 44 | class 
class CountryFullTransformer(BaseEstimator, TransformerMixin):
    """Transform countries into larger groups to avoid having
    too many dummies.

    Unlike CountryTransformer's static mapping, this looks regions up
    live via the REST Countries web API (one HTTP request per value)."""

    def getRegionFromCode(self, country):
        """Look up *country* via the REST Countries API and return its region.

        :param country: country name understood by the API's /name/ endpoint.
        :return: the "region" string reported by the API (e.g. "Europe").
        :raises requests.HTTPError: if the HTTP lookup fails.
        """
        url = f"https://restcountries.eu/rest/v2/name/{country}"

        # Bug fix: requests.get returns a Response object, not a JSON
        # string, so passing it to json.loads raised TypeError. Use the
        # Response.json() helper and fail loudly on HTTP errors.
        response = requests.get(url)
        response.raise_for_status()
        result = response.json()

        # The /name/ endpoint returns a list of candidate matches —
        # take the first (best) match before reading its region.
        if isinstance(result, list):
            result = result[0]
        return result["region"]

    def fit(self, X, y=None):
        """No-op: this transformer keeps no fitted state."""
        return self

    def transform(self, X):
        """Map every entry of X.country to its API-reported region."""
        return pd.DataFrame({"country": X.country.map(self.getRegionFromCode)})
def test_time_transformer():
    """TimeTransformer turns raw epoch timestamps into day-count features."""
    # Realistic Kickstarter-style epoch timestamps (seconds).
    timestamps = {
        "deadline": [1459283229],
        "created_at": [1455845363],
        "launched_at": [1456694829],
    }
    sample_df = pd.DataFrame(timestamps)
    expected_df = pd.DataFrame({
        "launched_to_deadline": [29],
        "created_to_launched": [9],
    })

    result_df = TimeTransformer().transform(sample_df)

    assert_frame_equal(result_df, expected_df)
@given(data_frames([column('goal', dtype=float), column('static_usd_rate', dtype=float)]))
def test_goal_adjustor(sample_df):
    """Property: adjusting goals never changes the number of rows."""
    result_df = GoalAdjustor().transform(sample_df)

    assert len(sample_df.index) == len(result_df.index)
# Each tuple pairs an input frame with the frame GoalAdjustor should return.
test_goal_transformer_testdata = [
    (pd.DataFrame({'goal': [5], 'static_usd_rate': [2]}), pd.DataFrame({'adjusted_goal': [10]})),
    (pd.DataFrame({'goal': [0], 'static_usd_rate': [1]}), pd.DataFrame({'adjusted_goal': [0]})),
    # The third case was an exact duplicate of the zero-goal case above;
    # exercise a fractional exchange rate instead for real extra coverage.
    (pd.DataFrame({'goal': [10], 'static_usd_rate': [0.5]}), pd.DataFrame({'adjusted_goal': [5.0]})),
]

@pytest.mark.parametrize("sample_df, expected_df", test_goal_transformer_testdata)
def test_goal_adjustor(sample_df, expected_df):
    """GoalAdjustor multiplies goal by static_usd_rate into adjusted_goal."""
    adjustor = GoalAdjustor()

    result_df = adjustor.transform(sample_df)
    assert_frame_equal(result_df, expected_df)