├── .github
│   └── workflows
│       └── python-publish.yml
├── .gitignore
├── Examples.ipynb
├── LICENSE.txt
├── README.md
├── banner.png
├── ecommercetools
│   ├── __init__.py
│   ├── advertising
│   │   ├── __init__.py
│   │   └── advertising.py
│   ├── customers
│   │   ├── __init__.py
│   │   └── customers.py
│   ├── marketing
│   │   ├── __init__.py
│   │   └── marketing.py
│   ├── nlp
│   │   ├── __init__.py
│   │   └── nlp.py
│   ├── operations
│   │   ├── __init__.py
│   │   └── operations.py
│   ├── products
│   │   ├── __init__.py
│   │   └── products.py
│   ├── reports
│   │   ├── __init__.py
│   │   └── reports.py
│   ├── seo
│   │   ├── __init__.py
│   │   ├── google_autocomplete.py
│   │   ├── google_knowledge_graph.py
│   │   ├── google_pagespeed_insights.py
│   │   ├── google_search.py
│   │   ├── google_search_console.py
│   │   ├── robots.py
│   │   ├── scraping.py
│   │   ├── sitemaps.py
│   │   └── testing.py
│   ├── transactions
│   │   ├── __init__.py
│   │   └── transactions.py
│   └── utilities
│       ├── __init__.py
│       ├── metrics.py
│       └── tools.py
├── example.py
├── requirements.txt
├── scraper_example.py
├── setup.cfg
└── setup.py

--------------------------------------------------------------------------------
/.github/workflows/python-publish.yml:
--------------------------------------------------------------------------------
# This workflow will upload a Python Package using Twine when a release is created
# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries

name: Upload Python Package

# Trigger on release creation, as described in the comment above
# (triggering on every push would publish unreleased code to PyPI)
on:
  release:
    types: [created]

jobs:
  deploy:

    runs-on: ubuntu-latest

    steps:
    - uses: actions/checkout@v2
    - name: Set up Python
      uses: actions/setup-python@v2
      with:
        python-version: '3.x'
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install setuptools wheel twine
    - name: Build and publish
      env:
        TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
        TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
      run: |
        python setup.py sdist bdist_wheel
        twine upload dist/*

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.idea/
build/
dist/
venv/
data/
ecommercetools.egg-info
google-search-console.json
pds-client-secrets.json
example-test.py

--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2021 Matt Clarke, Practical Data Science

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# EcommerceTools

![EcommerceTools](https://github.com/practical-data-science/ecommercetools/blob/master/banner.png?raw=true)

EcommerceTools is a data science toolkit for those working in technical ecommerce, marketing science, and technical SEO. It includes a wide range of features to aid analysis and model building. The package is written in Python, is designed to be used with Pandas, and works within a Jupyter notebook environment or in standalone Python projects.

#### Installation

You can install EcommerceTools and its dependencies from PyPI by entering `pip3 install ecommercetools` in your terminal, or `!pip3 install ecommercetools` within a Jupyter notebook cell.

---

### Modules

- [Transactions](#Transactions)
- [Products](#Products)
- [Customers](#Customers)
- [Advertising](#Advertising)
- [Operations](#Operations)
- [Marketing](#Marketing)
- [NLP](#NLP)
- [SEO](#SEO)
- [Reports](#Reports)

---

### Transactions

1. #### Load sample transaction items data

If you want to get started with the transactions, products, and customers features, you can use the `load_sample_data()` function to load a set of real-world data. This imports the transaction items from the widely-used Online Retail dataset and reformats it ready for use by EcommerceTools.

```python
from ecommercetools import utilities

transaction_items = utilities.load_sample_data()
transaction_items.head()
```
|   | order_id | sku | description | quantity | order_date | unit_price | customer_id | country | line_price |
|---|----------|-----|-------------|----------|------------|------------|-------------|---------|------------|
| 0 | 536365 | 85123A | WHITE HANGING HEART T-LIGHT HOLDER | 6 | 2010-12-01 08:26:00 | 2.55 | 17850.0 | United Kingdom | 15.30 |
| 1 | 536365 | 71053 | WHITE METAL LANTERN | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom | 20.34 |
| 2 | 536365 | 84406B | CREAM CUPID HEARTS COAT HANGER | 8 | 2010-12-01 08:26:00 | 2.75 | 17850.0 | United Kingdom | 22.00 |
| 3 | 536365 | 84029G | KNITTED UNION FLAG HOT WATER BOTTLE | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom | 20.34 |
| 4 | 536365 | 84029E | RED WOOLLY HOTTIE WHITE HEART. | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom | 20.34 |
2. #### Create a transaction items dataframe

The `utilities` module includes a range of tools that allow you to format data, so it can be used within other EcommerceTools functions. The `load_transaction_items()` function is used to create a Pandas dataframe of formatted transactional item data. When loading your transaction items data, all you need to do is define the column mappings, and the function will reformat the dataframe accordingly.

```python
import pandas as pd
from ecommercetools import utilities

transaction_items = utilities.load_transaction_items('transaction_items_non_standard_names.csv',
                                                     date_column='InvoiceDate',
                                                     order_id_column='InvoiceNo',
                                                     customer_id_column='CustomerID',
                                                     sku_column='StockCode',
                                                     quantity_column='Quantity',
                                                     unit_price_column='UnitPrice')
transaction_items.to_csv('transaction_items.csv', index=False)
print(transaction_items.head())
```
|   | order_id | sku | description | quantity | order_date | unit_price | customer_id | country | line_price |
|---|----------|-----|-------------|----------|------------|------------|-------------|---------|------------|
| 0 | 536365 | 85123A | WHITE HANGING HEART T-LIGHT HOLDER | 6 | 2010-12-01 08:26:00 | 2.55 | 17850.0 | United Kingdom | 15.30 |
| 1 | 536365 | 71053 | WHITE METAL LANTERN | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom | 20.34 |
| 2 | 536365 | 84406B | CREAM CUPID HEARTS COAT HANGER | 8 | 2010-12-01 08:26:00 | 2.75 | 17850.0 | United Kingdom | 22.00 |
| 3 | 536365 | 84029G | KNITTED UNION FLAG HOT WATER BOTTLE | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom | 20.34 |
| 4 | 536365 | 84029E | RED WOOLLY HOTTIE WHITE HEART. | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom | 20.34 |
3. #### Create a transactions dataframe

The `get_transactions()` function takes the formatted Pandas dataframe of transaction items and returns a Pandas dataframe of aggregated transaction data, which includes features identifying the order number.

```python
import pandas as pd
from ecommercetools import transactions

transaction_items = pd.read_csv('transaction_items.csv')
transactions_df = transactions.get_transactions(transaction_items)
transactions_df.to_csv('transactions.csv', index=False)
print(transactions_df.head())
```
|   | order_id | order_date | customer_id | skus | items | revenue | replacement | order_number |
|---|----------|------------|-------------|------|-------|---------|-------------|--------------|
| 0 | 536365 | 2010-12-01 08:26:00 | 17850.0 | 7 | 40 | 139.12 | 0 | 1 |
| 1 | 536366 | 2010-12-01 08:28:00 | 17850.0 | 2 | 12 | 22.20 | 0 | 2 |
| 2 | 536367 | 2010-12-01 08:34:00 | 13047.0 | 12 | 83 | 278.73 | 0 | 1 |
| 3 | 536368 | 2010-12-01 08:34:00 | 13047.0 | 4 | 15 | 70.05 | 0 | 2 |
| 4 | 536369 | 2010-12-01 08:35:00 | 13047.0 | 1 | 3 | 17.85 | 0 | 3 |
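Once you have the aggregated transactions, it's easy to roll them up into period-level summaries with plain Pandas. This is a quick hypothetical follow-up (not an EcommerceTools function), using only the columns shown above, to calculate monthly orders and revenue:

```python
import pandas as pd

# Load the transactions created above and parse the order date
transactions_df = pd.read_csv('transactions.csv', parse_dates=['order_date'])

# Resample to calendar months and aggregate orders and revenue
monthly = transactions_df.set_index('order_date') \
    .resample('M') \
    .agg({'order_id': 'nunique', 'revenue': 'sum'}) \
    .rename(columns={'order_id': 'orders'})
print(monthly.head())
```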
---

### Products

#### 1. Get product data from transaction items

```python
from ecommercetools import products

products_df = products.get_products(transaction_items)
products_df.head()
```
|   | sku | first_order_date | last_order_date | customers | orders | items | revenue | avg_unit_price | avg_quantity | avg_revenue | avg_orders | product_tenure | product_recency |
|---|-----|------------------|-----------------|-----------|--------|-------|---------|----------------|--------------|-------------|------------|----------------|-----------------|
| 0 | 10002 | 2010-12-01 08:45:00 | 2011-04-28 15:05:00 | 40 | 73 | 1037 | 759.89 | 1.056849 | 14.205479 | 10.409452 | 1.82 | 3749 | 3600 |
| 1 | 10080 | 2011-02-27 13:47:00 | 2011-11-21 17:04:00 | 19 | 24 | 495 | 119.09 | 0.376667 | 20.625000 | 4.962083 | 1.26 | 3660 | 3393 |
| 2 | 10120 | 2010-12-03 11:19:00 | 2011-12-04 13:15:00 | 25 | 29 | 193 | 40.53 | 0.210000 | 6.433333 | 1.351000 | 1.16 | 3746 | 3380 |
| 3 | 10123C | 2010-12-03 11:19:00 | 2011-07-15 15:05:00 | 3 | 4 | -13 | 3.25 | 0.487500 | -3.250000 | 0.812500 | 1.33 | 3746 | 3522 |
| 4 | 10123G | 2011-04-08 11:13:00 | 2011-04-08 11:13:00 | 0 | 1 | -38 | 0.00 | 0.000000 | -38.000000 | 0.000000 | inf | 3620 | 3620 |
#### 2. Calculate product consumption and repurchase rate

```python
repurchase_rates = products.get_repurchase_rates(transaction_items)
repurchase_rates.head(3).T
```
|   | 0 | 1 | 2 |
|---|---|---|---|
| sku | 10002 | 10080 | 10120 |
| revenue | 759.89 | 119.09 | 40.53 |
| items | 1037 | 495 | 193 |
| orders | 73 | 24 | 29 |
| customers | 40 | 19 | 25 |
| avg_unit_price | 1.05685 | 0.376667 | 0.21 |
| avg_line_price | 10.4095 | 4.96208 | 1.351 |
| avg_items_per_order | 14.2055 | 20.625 | 6.65517 |
| avg_items_per_customer | 25.925 | 26.0526 | 7.72 |
| purchased_individually | 0 | 0 | 9 |
| purchased_once | 34 | 17 | 22 |
| bulk_purchases | 73 | 24 | 20 |
| bulk_purchase_rate | 1 | 1 | 0.689655 |
| repurchases | 39 | 7 | 7 |
| repurchase_rate | 0.534247 | 0.291667 | 0.241379 |
| repurchase_rate_label | Moderate repurchase | Low repurchase | Low repurchase |
| bulk_purchase_rate_label | Very high bulk | Very high bulk | High bulk |
| bulk_and_repurchase_label | Moderate repurchase_Very high bulk | Low repurchase_Very high bulk | Low repurchase_High bulk |
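As a hypothetical follow-up using only the columns shown above, you could filter the untransposed dataframe to find products that are rarely bought again, which may be candidates for replenishment marketing:

```python
# Products rarely bought again - candidates for replenishment emails or bundles
low_repurchase = repurchase_rates[repurchase_rates['repurchase_rate_label'] == 'Low repurchase']
print(low_repurchase[['sku', 'orders', 'customers', 'repurchase_rate']].head())
```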
---

### Customers

#### 1. Create a customers dataset

```python
from ecommercetools import customers

customers_df = customers.get_customers(transaction_items)
customers_df.head()
```
|   | customer_id | revenue | orders | skus | items | first_order_date | last_order_date | avg_items | avg_order_value | tenure | recency | cohort |
|---|-------------|---------|--------|------|-------|------------------|-----------------|-----------|-----------------|--------|---------|--------|
| 0 | 12346.0 | 0.00 | 2 | 1 | 0 | 2011-01-18 10:01:00 | 2011-01-18 10:17:00 | 0.00 | 0.00 | 3701 | 3700 | 20111 |
| 1 | 12347.0 | 4310.00 | 7 | 7 | 2458 | 2010-12-07 14:57:00 | 2011-12-07 15:52:00 | 351.14 | 615.71 | 3742 | 3377 | 20104 |
| 2 | 12348.0 | 1797.24 | 4 | 4 | 2341 | 2010-12-16 19:09:00 | 2011-09-25 13:13:00 | 585.25 | 449.31 | 3733 | 3450 | 20104 |
| 3 | 12349.0 | 1757.55 | 1 | 1 | 631 | 2011-11-21 09:51:00 | 2011-11-21 09:51:00 | 631.00 | 1757.55 | 3394 | 3394 | 20114 |
| 4 | 12350.0 | 334.40 | 1 | 1 | 197 | 2011-02-02 16:01:00 | 2011-02-02 16:01:00 | 197.00 | 334.40 | 3685 | 3685 | 20111 |
#### 2. Create a customer cohort analysis dataset

```python
from ecommercetools import customers

cohorts_df = customers.get_cohorts(transaction_items, period='M')
cohorts_df.head()
```
|    | customer_id | order_id | order_date | acquisition_cohort | order_cohort |
|----|-------------|----------|------------|--------------------|--------------|
| 0  | 17850.0 | 536365 | 2010-12-01 08:26:00 | 2010-12 | 2010-12 |
| 7  | 17850.0 | 536366 | 2010-12-01 08:28:00 | 2010-12 | 2010-12 |
| 9  | 13047.0 | 536367 | 2010-12-01 08:34:00 | 2010-12 | 2010-12 |
| 21 | 13047.0 | 536368 | 2010-12-01 08:34:00 | 2010-12 | 2010-12 |
| 25 | 13047.0 | 536369 | 2010-12-01 08:35:00 | 2010-12 | 2010-12 |
#### 3. Create a customer cohort analysis matrix

```python
from ecommercetools import customers

cohort_matrix_df = customers.get_cohort_matrix(transaction_items, period='M', percentage=True)
cohort_matrix_df.head()
```
| acquisition_cohort \ periods | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 |
|------------------------------|---|---|---|---|---|---|---|---|---|---|----|----|----|
| 2010-12 | 1.0 | 0.381857 | 0.334388 | 0.387131 | 0.359705 | 0.396624 | 0.379747 | 0.354430 | 0.354430 | 0.394515 | 0.373418 | 0.500000 | 0.274262 |
| 2011-01 | 1.0 | 0.239905 | 0.282660 | 0.242280 | 0.327791 | 0.299287 | 0.261283 | 0.256532 | 0.311164 | 0.346793 | 0.368171 | 0.149644 | NaN |
| 2011-02 | 1.0 | 0.247368 | 0.192105 | 0.278947 | 0.268421 | 0.247368 | 0.255263 | 0.281579 | 0.257895 | 0.313158 | 0.092105 | NaN | NaN |
| 2011-03 | 1.0 | 0.190909 | 0.254545 | 0.218182 | 0.231818 | 0.177273 | 0.263636 | 0.238636 | 0.288636 | 0.088636 | NaN | NaN | NaN |
| 2011-04 | 1.0 | 0.227425 | 0.220736 | 0.210702 | 0.207358 | 0.237458 | 0.230769 | 0.260870 | 0.083612 | NaN | NaN | NaN | NaN |
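The percentage matrix is easy to visualise as a heatmap. Seaborn and Matplotlib are not EcommerceTools dependencies, so the sketch below is a hypothetical follow-up that assumes you have them installed:

```python
import matplotlib.pyplot as plt
import seaborn as sns

# Plot the percentage retention matrix as a heatmap
fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(cohort_matrix_df, annot=True, fmt='.0%', cmap='Blues', ax=ax)
ax.set_title('Retention by acquisition cohort')
plt.show()
```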
```python
from ecommercetools import customers

cohort_matrix_df = customers.get_cohort_matrix(transaction_items, period='M', percentage=False)
cohort_matrix_df.head()
```
| acquisition_cohort \ periods | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 |
|------------------------------|---|---|---|---|---|---|---|---|---|---|----|----|----|
| 2010-12 | 948.0 | 362.0 | 317.0 | 367.0 | 341.0 | 376.0 | 360.0 | 336.0 | 336.0 | 374.0 | 354.0 | 474.0 | 260.0 |
| 2011-01 | 421.0 | 101.0 | 119.0 | 102.0 | 138.0 | 126.0 | 110.0 | 108.0 | 131.0 | 146.0 | 155.0 | 63.0 | NaN |
| 2011-02 | 380.0 | 94.0 | 73.0 | 106.0 | 102.0 | 94.0 | 97.0 | 107.0 | 98.0 | 119.0 | 35.0 | NaN | NaN |
| 2011-03 | 440.0 | 84.0 | 112.0 | 96.0 | 102.0 | 78.0 | 116.0 | 105.0 | 127.0 | 39.0 | NaN | NaN | NaN |
| 2011-04 | 299.0 | 68.0 | 66.0 | 63.0 | 62.0 | 71.0 | 69.0 | 78.0 | 25.0 | NaN | NaN | NaN | NaN |
#### 4. Create a customer "retention" dataset

```python
from ecommercetools import customers

retention_df = customers.get_retention(transactions_df)
retention_df.head()
```
|   | acquisition_cohort | order_cohort | customers | periods |
|---|--------------------|--------------|-----------|---------|
| 0 | 2010-12 | 2010-12 | 948 | 0 |
| 1 | 2010-12 | 2011-01 | 362 | 1 |
| 2 | 2010-12 | 2011-02 | 317 | 2 |
| 3 | 2010-12 | 2011-03 | 367 | 3 |
| 4 | 2010-12 | 2011-04 | 341 | 4 |
#### 5. Create an RFM (H) dataset

This is an extension of the regular Recency, Frequency, Monetary value (RFM) model that includes an additional parameter, "H", for heterogeneity, which shows the number of unique SKUs purchased by each customer. While it's not typically used for targeting, this value can be very useful in identifying which customers should probably be buying a broader mix of products than they currently are, as well as spotting those who may have stopped buying certain items.

```python
from ecommercetools import customers

rfm_df = customers.get_rfm_segments(customers_df)
rfm_df.head()
```
|   | customer_id | acquisition_date | recency_date | recency | frequency | monetary | heterogeneity | tenure | r | f | m | h | rfm | rfm_score | rfm_segment_name |
|---|-------------|------------------|--------------|---------|-----------|----------|---------------|--------|---|---|---|---|-----|-----------|------------------|
| 0 | 12346.0 | 2011-01-18 10:01:00 | 2011-01-18 10:17:00 | 3700 | 2 | 0.00 | 1 | 3701 | 1 | 1 | 1 | 1 | 111 | 3 | Risky |
| 1 | 12350.0 | 2011-02-02 16:01:00 | 2011-02-02 16:01:00 | 3685 | 1 | 334.40 | 1 | 3685 | 1 | 1 | 1 | 1 | 111 | 3 | Risky |
| 2 | 12365.0 | 2011-02-21 13:51:00 | 2011-02-21 14:04:00 | 3666 | 3 | 320.69 | 2 | 3666 | 1 | 1 | 1 | 1 | 111 | 3 | Risky |
| 3 | 12373.0 | 2011-02-01 13:10:00 | 2011-02-01 13:10:00 | 3686 | 1 | 364.60 | 1 | 3686 | 1 | 1 | 1 | 1 | 111 | 3 | Risky |
| 4 | 12377.0 | 2010-12-20 09:37:00 | 2011-01-28 15:45:00 | 3690 | 2 | 1628.12 | 2 | 3730 | 1 | 1 | 1 | 1 | 111 | 3 | Risky |
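The segment labels make it straightforward to pull out an audience. For example, as a hypothetical follow-up using the dataframe above, you could export the "Risky" customers for a win-back campaign:

```python
# Select customers in the "Risky" segment and save them for a win-back campaign
risky = rfm_df[rfm_df['rfm_segment_name'] == 'Risky']
risky[['customer_id', 'recency', 'frequency', 'monetary']].to_csv('risky_customers.csv', index=False)
```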
#### 6. Create a purchase latency dataset

```python
from ecommercetools import customers

latency_df = customers.get_latency(transactions_df)
latency_df.head()
```
|   | customer_id | frequency | recency_date | recency | avg_latency | min_latency | max_latency | std_latency | cv | days_to_next_order | label |
|---|-------------|-----------|--------------|---------|-------------|-------------|-------------|-------------|----|--------------------|-------|
| 0 | 12680.0 | 4 | 2011-12-09 12:50:00 | 3388 | 28 | 16 | 73 | 30.859898 | 1.102139 | -3329.0 | Order overdue |
| 1 | 13113.0 | 24 | 2011-12-09 12:49:00 | 3388 | 15 | 0 | 52 | 12.060126 | 0.804008 | -3361.0 | Order overdue |
| 2 | 15804.0 | 13 | 2011-12-09 12:31:00 | 3388 | 15 | 1 | 39 | 11.008261 | 0.733884 | -3362.0 | Order overdue |
| 3 | 13777.0 | 33 | 2011-12-09 12:25:00 | 3388 | 11 | 0 | 48 | 12.055274 | 1.095934 | -3365.0 | Order overdue |
| 4 | 17581.0 | 25 | 2011-12-09 12:21:00 | 3388 | 14 | 0 | 67 | 21.974293 | 1.569592 | -3352.0 | Order overdue |
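As a hypothetical follow-up, the `label` column can be used to target customers whose next order is predicted to be due, before they become overdue:

```python
# Customers predicted to be due to order soon, sorted by how close they are
due_soon = latency_df[latency_df['label'] == 'Order due soon'] \
    .sort_values(by='days_to_next_order')
print(due_soon[['customer_id', 'avg_latency', 'days_to_next_order']].head())
```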
#### 7. Customer ABC segmentation

```python
from ecommercetools import customers

abc_df = customers.get_abc_segments(customers_df, months=12, abc_class_name='abc_class_12m', abc_rank_name='abc_rank_12m')
abc_df.head()
```
|   | customer_id | abc_class_12m | abc_rank_12m |
|---|-------------|---------------|--------------|
| 0 | 12346.0 | D | 1.0 |
| 1 | 12347.0 | D | 1.0 |
| 2 | 12348.0 | D | 1.0 |
| 3 | 12349.0 | D | 1.0 |
| 4 | 12350.0 | D | 1.0 |
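Since `get_abc_segments()` returns one row per customer, the classes can be joined back onto the customers dataframe for analysis. This is a hypothetical follow-up, not a package function:

```python
# Attach the ABC class to each customer record and compare revenue by class
customers_abc = customers_df.merge(abc_df, on='customer_id', how='left')
print(customers_abc.groupby('abc_class_12m')['revenue'].sum())
```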
#### 8. Predict customer AOV, CLV, and orders

EcommerceTools allows you to predict the AOV, Customer Lifetime Value (CLV), and expected number of orders via the Gamma-Gamma and BG/NBD models from the excellent Lifetimes package. By passing the dataframe of transactions from `get_transactions()` to the `get_customer_predictions()` function, EcommerceTools will fit the BG/NBD and Gamma-Gamma models and predict the AOV, order quantity, and CLV for each customer in the defined number of future days after the end of the observation period.

```python
from ecommercetools import customers

customer_predictions = customers.get_customer_predictions(transactions_df,
                                                          observation_period_end='2011-12-09',
                                                          days=90)
customer_predictions.head(10)
```
|   | customer_id | predicted_purchases | aov | clv |
|---|-------------|---------------------|-----|-----|
| 0 | 12346.0 | 0.188830 | NaN | NaN |
| 1 | 12347.0 | 1.408736 | 569.978836 | 836.846896 |
| 2 | 12348.0 | 0.805907 | 333.784235 | 308.247354 |
| 3 | 12349.0 | 0.855607 | NaN | NaN |
| 4 | 12350.0 | 0.196304 | NaN | NaN |
| 5 | 12352.0 | 1.682277 | 376.175359 | 647.826169 |
| 6 | 12353.0 | 0.272541 | NaN | NaN |
| 7 | 12354.0 | 0.247183 | NaN | NaN |
| 8 | 12355.0 | 0.262909 | NaN | NaN |
| 9 | 12356.0 | 0.645368 | 324.039419 | 256.855226 |
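As a hypothetical follow-up, dropping the customers the Gamma-Gamma model couldn't score (the `NaN` rows) and sorting by predicted CLV surfaces the customers likely to be most valuable over the next 90 days:

```python
# Rank scoreable customers by predicted 90-day customer lifetime value
top_clv = customer_predictions.dropna(subset=['clv']) \
    .sort_values(by='clv', ascending=False)
print(top_clv.head(10))
```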
---

### Advertising

#### 1. Create paid search keywords

```python
from ecommercetools import advertising

product_names = ['fly rods', 'fly reels']
keywords_prepend = ['buy', 'best', 'cheap', 'reduced']
keywords_append = ['for sale', 'price', 'promotion', 'promo', 'coupon', 'voucher', 'shop', 'suppliers']
campaign_name = 'fly_fishing'

keywords = advertising.generate_ad_keywords(product_names, keywords_prepend, keywords_append, campaign_name)
keywords.head()
```
|   | product | keywords | match_type | campaign_name |
|---|---------|----------|------------|---------------|
| 0 | fly rods | [fly rods] | Exact | fly_fishing |
| 1 | fly rods | [buy fly rods] | Exact | fly_fishing |
| 2 | fly rods | [best fly rods] | Exact | fly_fishing |
| 3 | fly rods | [cheap fly rods] | Exact | fly_fishing |
| 4 | fly rods | [reduced fly rods] | Exact | fly_fishing |
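The generated dataframe can be exported to CSV for bulk upload into your paid search platform. This is a hypothetical follow-up and the filenames are illustrative:

```python
# Save all generated keywords, plus a separate file of exact match keywords
keywords.to_csv('fly_fishing_keywords.csv', index=False)
exact_only = keywords[keywords['match_type'] == 'Exact']
exact_only.to_csv('fly_fishing_exact_keywords.csv', index=False)
```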
#### 2. Create paid search ad copy using Spintax

```python
from ecommercetools import advertising

text = "Fly Reels from {Orvis|Loop|Sage|Airflo|Nautilus} for {trout|salmon|grayling|pike}"
spin = advertising.generate_spintax(text, single=False)

spin
```

    ['Fly Reels from Orvis for trout',
     'Fly Reels from Orvis for salmon',
     'Fly Reels from Orvis for grayling',
     'Fly Reels from Orvis for pike',
     'Fly Reels from Loop for trout',
     'Fly Reels from Loop for salmon',
     'Fly Reels from Loop for grayling',
     'Fly Reels from Loop for pike',
     'Fly Reels from Sage for trout',
     'Fly Reels from Sage for salmon',
     'Fly Reels from Sage for grayling',
     'Fly Reels from Sage for pike',
     'Fly Reels from Airflo for trout',
     'Fly Reels from Airflo for salmon',
     'Fly Reels from Airflo for grayling',
     'Fly Reels from Airflo for pike',
     'Fly Reels from Nautilus for trout',
     'Fly Reels from Nautilus for salmon',
     'Fly Reels from Nautilus for grayling',
     'Fly Reels from Nautilus for pike']

---

### Operations

#### 1. Create an ABC inventory classification

```python
from ecommercetools import operations

inventory_classification = operations.get_inventory_classification(transaction_items)
inventory_classification.head()
```
|   | sku | abc_class | abc_rank |
|---|-----|-----------|----------|
| 0 | 10002 | A | 1 |
| 1 | 10080 | A | 2 |
| 2 | 10120 | A | 3 |
| 3 | 10123C | A | 4 |
| 4 | 10123G | A | 4 |
---

### Marketing

#### 1. Get ecommerce trading calendar

```python
from ecommercetools import marketing

trading_calendar_df = marketing.get_trading_calendar('2021-01-01', days=365)
trading_calendar_df.head()
```
|   | date | event |
|---|------|-------|
| 0 | 2021-01-01 | January sale |
| 1 | 2021-01-02 | |
| 2 | 2021-01-03 | |
| 3 | 2021-01-04 | |
| 4 | 2021-01-05 | |
#### 2. Get ecommerce trading events

```python
from ecommercetools import marketing

trading_events_df = marketing.get_trading_events('2021-01-01', days=365)
trading_events_df.head()
```
|   | date | event |
|---|------|-------|
| 0 | 2021-01-01 | January sale |
| 1 | 2021-01-29 | January Pay Day |
| 2 | 2021-02-11 | Valentine's Day [last order date] |
| 3 | 2021-02-14 | Valentine's Day |
| 4 | 2021-02-26 | February Pay Day |
---

### NLP

#### 1. Generate text summaries

The `get_summaries()` function of the `nlp` module takes a Pandas dataframe containing text and returns a machine-generated summary of the content using a Huggingface Transformers pipeline via PyTorch. To use this feature, first load your Pandas dataframe and import the `nlp` module from `ecommercetools`.

```python
import pandas as pd
from ecommercetools import nlp

pd.set_option('max_colwidth', 1000)
df = pd.read_csv('text.csv')
df.head()
```

Specify the name of the Pandas dataframe, the column containing the text you wish to summarise (i.e. `product_description`), and specify a column name in which to store the machine-generated summary. The `min_length` and `max_length` arguments control the number of words generated, while the `do_sample` argument controls whether the generated text is completely unique (`do_sample=False`) or extracted from the text (`do_sample=True`).

```python
df = nlp.get_summaries(df, 'product_description', 'sampled_summary', min_length=50, max_length=100, do_sample=True)
df = nlp.get_summaries(df, 'product_description', 'unsampled_summary', min_length=50, max_length=100, do_sample=False)
df = nlp.get_summaries(df, 'product_description', 'unsampled_summary_20_to_30', min_length=20, max_length=30, do_sample=False)
```

Since the model used for text summarisation is very large (1.2 GB plus), this function will take some time to complete. Once loaded, summaries are generated within a second or two per piece of text, so it is advisable to try smaller volumes of data initially.

### SEO

#### 1. Discover XML sitemap locations

The `get_sitemaps()` function takes the location of a `robots.txt` file (always stored at the root of a domain) and returns the URLs of any XML sitemaps listed within it.

```python
from ecommercetools import seo

sitemaps = seo.get_sitemaps("http://www.flyandlure.org/robots.txt")
print(sitemaps)
```

#### 2. Get an XML sitemap

The `get_sitemap()` function allows you to download the URLs in an XML sitemap to a Pandas dataframe. If the sitemap contains child sitemaps, each of these will be retrieved. You can save the Pandas dataframe to CSV in the usual way.

```python
from ecommercetools import seo

df = seo.get_sitemap("http://flyandlure.org/sitemap.xml")
print(df.head())
```
|   | loc | changefreq | priority | domain | sitemap_name |
|---|-----|------------|----------|--------|--------------|
| 0 | http://flyandlure.org/ | hourly | 1.0 | flyandlure.org | http://www.flyandlure.org/sitemap.xml |
| 1 | http://flyandlure.org/about | monthly | 1.0 | flyandlure.org | http://www.flyandlure.org/sitemap.xml |
| 2 | http://flyandlure.org/terms | monthly | 1.0 | flyandlure.org | http://www.flyandlure.org/sitemap.xml |
| 3 | http://flyandlure.org/privacy | monthly | 1.0 | flyandlure.org | http://www.flyandlure.org/sitemap.xml |
| 4 | http://flyandlure.org/copyright | monthly | 1.0 | flyandlure.org | http://www.flyandlure.org/sitemap.xml |
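For example, to save the crawled sitemap URLs to CSV and check how many were found (a quick hypothetical follow-up using standard Pandas):

```python
# Persist the sitemap URLs and report the number of unique URLs retrieved
df.to_csv('sitemap_urls.csv', index=False)
print(df['loc'].nunique(), 'unique URLs found')
```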
#### 3. Get Core Web Vitals from PageSpeed Insights

The `get_core_web_vitals()` function retrieves the Core Web Vitals metrics for a list of sites from the Google PageSpeed Insights API and returns the results in a Pandas dataframe. The function requires a Google PageSpeed Insights API key.

```python
from ecommercetools import seo

pagespeed_insights_key = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
urls = ['https://www.bbc.co.uk', 'https://www.bbc.co.uk/iplayer']
df = seo.get_core_web_vitals(pagespeed_insights_key, urls)
print(df.head())
```

#### 4. Get Google Knowledge Graph data

The `get_knowledge_graph()` function returns the Google Knowledge Graph data for a given search term. This requires the use of a Google Knowledge Graph API key. By default, the function returns output in a Pandas dataframe, but you can pass the `output="json"` argument if you wish to receive the JSON data back.

```python
from ecommercetools import seo

knowledge_graph_key = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
knowledge_graph = seo.get_knowledge_graph(knowledge_graph_key, "tesla", output="dataframe")
print(knowledge_graph)
```

#### 5. Get Google Search Console API data

The `query_google_search_console()` function runs a search query on the Google Search Console API and returns data in a Pandas dataframe. This function requires a JSON client secrets key with access to the Google Search Console API.

```python
from ecommercetools import seo

key = "google-search-console.json"
site_url = "http://flyandlure.org"
payload = {
    'startDate': "2019-01-01",
    'endDate': "2019-12-31",
    'dimensions': ["page", "device", "query"],
    'rowLimit': 100,
    'startRow': 0
}

df = seo.query_google_search_console(key, site_url, payload)
print(df.head())
```
|   | page | device | query | clicks | impressions | ctr | position |
|---|------|--------|-------|--------|-------------|-----|----------|
| 0 | http://flyandlure.org/articles/fly_fishing_gea... | MOBILE | simms freestone waders review | 56 | 217 | 25.81 | 3.12 |
| 1 | http://flyandlure.org/ | MOBILE | fly and lure | 37 | 159 | 23.27 | 3.81 |
| 2 | http://flyandlure.org/articles/fly_fishing_gea... | DESKTOP | orvis encounter waders review | 35 | 134 | 26.12 | 4.04 |
| 3 | http://flyandlure.org/articles/fly_fishing_gea... | DESKTOP | simms freestone waders review | 35 | 200 | 17.50 | 3.50 |
| 4 | http://flyandlure.org/ | DESKTOP | fly and lure | 32 | 170 | 18.82 | 3.09 |
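As a hypothetical follow-up using the columns shown above, aggregating the returned rows by device gives a quick view of where your clicks come from:

```python
# Summarise clicks and impressions by device and recompute CTR
by_device = df.groupby('device').agg(clicks=('clicks', 'sum'),
                                     impressions=('impressions', 'sum'))
by_device['ctr'] = (by_device['clicks'] / by_device['impressions'] * 100).round(2)
print(by_device)
```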
##### Fetching all results from Google Search Console

To fetch all results, set `fetch_all` to `True`. This will automatically paginate through your Google Search Console data and return all results. Be aware that if you do this you may hit Google's quota limit if you run a query over an extended period, or have a busy site with lots of `page` or `query` dimensions.

```python
from ecommercetools import seo

key = "google-search-console.json"
site_url = "http://flyandlure.org"
payload = {
    'startDate': "2019-01-01",
    'endDate': "2019-12-31",
    'dimensions': ["page", "device", "query"],
    'rowLimit': 25000,
    'startRow': 0
}

df = seo.query_google_search_console(key, site_url, payload, fetch_all=True)
print(df.head())
```

##### Comparing two time periods in Google Search Console

```python
payload_before = {
    'startDate': "2021-08-11",
    'endDate': "2021-08-31",
    'dimensions': ["page", "query"],
}

payload_after = {
    'startDate': "2021-07-21",
    'endDate': "2021-08-10",
    'dimensions': ["page", "query"],
}

df = seo.query_google_search_console_compare(key, site_url, payload_before, payload_after, fetch_all=False)
df.sort_values(by='clicks_change', ascending=False).head()
```

#### 6. Get the number of "indexed" pages

The `get_indexed_pages()` function uses the "site:" prefix to search Google for the number of pages "indexed". This is very approximate and may not be a perfect representation, but it's usually a good guide of site "size" in the absence of other data.

```python
from ecommercetools import seo

urls = ['https://www.bbc.co.uk', 'https://www.bbc.co.uk/iplayer', 'http://flyandlure.org']
df = seo.get_indexed_pages(urls)
print(df.head())
```
|   | url | indexed_pages |
|---|-----|---------------|
| 2 | http://flyandlure.org | 2090 |
| 1 | https://www.bbc.co.uk/iplayer | 215000 |
| 0 | https://www.bbc.co.uk | 12700000 |
#### 7. Get keyword suggestions from Google Autocomplete

The `google_autocomplete()` function returns a set of keyword suggestions from Google Autocomplete. The `include_expanded=True` argument allows you to expand the number of suggestions shown by appending prefixes and suffixes to the search terms.

```python
from ecommercetools import seo

suggestions = seo.google_autocomplete("data science", include_expanded=False)
print(suggestions)

suggestions = seo.google_autocomplete("data science", include_expanded=True)
print(suggestions)
```
|   | term | relevance |
|---|------|-----------|
| 0 | data science jobs | 650 |
| 1 | data science jobs chester | 601 |
| 2 | data science course | 600 |
| 3 | data science masters | 554 |
| 4 | data science salary | 553 |
| 5 | data science internship | 552 |
| 6 | data science jobs london | 551 |
| 7 | data science graduate scheme | 550 |
#### 8. Retrieve robots.txt content

The `get_robots()` function returns the contents of a robots.txt file in a Pandas dataframe so it can be parsed and analysed.

```python
from ecommercetools import seo

robots = seo.get_robots("http://www.flyandlure.org/robots.txt")
print(robots)
```
|    | directive | parameter |
|----|-----------|-----------|
| 0  | User-agent | * |
| 1  | Disallow | /signin |
| 2  | Disallow | /signup |
| 3  | Disallow | /users |
| 4  | Disallow | /contact |
| 5  | Disallow | /activate |
| 6  | Disallow | /*/page |
| 7  | Disallow | /articles/search |
| 8  | Disallow | /search.php |
| 9  | Disallow | *q=* |
| 10 | Disallow | *category_slug=* |
| 11 | Disallow | *country_slug=* |
| 12 | Disallow | *county_slug=* |
| 13 | Disallow | *features=* |
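With the directives in a dataframe, you can filter them like any other data. As a hypothetical follow-up, here's how you might list just the `Disallow` rules:

```python
# Extract only the Disallow rules from the parsed robots.txt
disallowed = robots[robots['directive'] == 'Disallow']
print(disallowed['parameter'].tolist())
```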
#### 9. Get Google SERPs

The `get_serps()` function returns a Pandas dataframe containing the Google search engine results for a given search term. Note that this function is not suitable for large-scale scraping and currently includes no features to prevent it from being blocked.

```python
from ecommercetools import seo

serps = seo.get_serps("data science blog")
print(serps)
```
|   | title | link | text |
|---|-------|------|------|
| 0 | 10 of the best data science blogs to follow - ... | https://www.tableau.com/learn/articles/data-sc... | 10 of the best data science blogs to follow. T... |
| 1 | Best Data Science Blogs to Follow in 2020 \| by... | https://towardsdatascience.com/best-data-scien... | 14 Jul 2020 — 1. Towards Data Science · Joined... |
| 2 | Top 20 Data Science Blogs And Websites For Dat... | https://medium.com/@exastax/top-20-data-scienc... | Top 20 Data Science Blogs And Websites For Dat... |
| 3 | Data Science Blog – Dataquest | https://www.dataquest.io/blog/ | Browse our data science blog to get helpful ti... |
| 4 | 51 Awesome Data Science Blogs You Need To Chec... | https://365datascience.com/trending/51-data-sc... | Blog name: DataKind · datakind data science bl... |
| 5 | Blogs on AI, Analytics, Data Science, Machine ... | https://www.kdnuggets.com/websites/blogs.html | Individual/small group blogs · Ai4 blog, featu... |
| 6 | Data Science Blog – Applied Data Science | https://data-science-blog.com/ | ... an Bedeutung – DevOps for Data Science. De... |
| 7 | Top 10 Data Science and AI Blogs in 2020 - Liv... | https://livecodestream.dev/post/top-data-scien... | Some of the best data science and AI blogs for... |
| 8 | Data Science Blogs: 17 Must-Read Blogs for Dat... | https://www.thinkful.com/blog/data-science-blogs/ | Data scientists could be considered the magici... |
| 9 | rushter/data-science-blogs: A curated list of ... | https://github.com/rushter/data-science-blogs | A curated list of data science blogs. Contribu... |
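Since the results arrive in ranked order, you can check whether and where a given domain appears. This hypothetical follow-up assumes the default integer index reflects result position:

```python
# Find the positions at which a domain appears in the results
serps['position'] = serps.index + 1
matches = serps[serps['link'].str.contains('towardsdatascience.com', na=False)]
print(matches[['position', 'title', 'link']])
```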
#### Create an ABCD classification of Google Search Console data

The `classify_pages()` function returns an ABCD classification of Google Search Console data. This calculates the cumulative sum of clicks and then categorises pages using the ABC algorithm (the first 80% are classed A, the next 10% are classed B, and the final 10% are classed C, with the zero-click pages classed D).

```python
from ecommercetools import seo

key = "client_secrets.json"
site_url = "example-domain.co.uk"
start_date = '2022-10-01'
end_date = '2022-10-31'

df_classes = seo.classify_pages(key, site_url, start_date, end_date, output='classes')
print(df_classes.head())

df_summary = seo.classify_pages(key, site_url, start_date, end_date, output='summary')
print(df_summary)
```

```
                                                page  clicks  impressions    ctr  position  clicks_cumsum  clicks_running_pc  pc_share class  class_rank
0  https://practicaldatascience.co.uk/machine-lea...    3890        36577  10.64     12.64           3890           8.382898  8.382898     A           1
1  https://practicaldatascience.co.uk/data-scienc...    2414        16618  14.53     14.30           6304          13.585036  5.202138     A           2
2  https://practicaldatascience.co.uk/data-scienc...    2378        71496   3.33     16.39           8682          18.709594  5.124558     A           3
3  https://practicaldatascience.co.uk/data-scienc...    1942        14274  13.61     15.02          10624          22.894578  4.184984     A           4
4  https://practicaldatascience.co.uk/data-scienc...    1738        23979   7.25     11.80          12362          26.639945  3.745367     A           5
```

```
  class  pages  impressions  clicks   avg_ctr  avg_position  share_of_clicks  share_of_impressions
0     A     63       747643   36980  5.126349     22.706825             79.7                  43.7
1     B     46       639329    4726  3.228043     31.897826             10.2                  37.4
2     C    190       323385    4698  2.393632     38.259368             10.1                  18.9
3     D     36         1327       0  0.000000     25.804722              0.0                   0.1
```

---

### Reports

The Reports module creates weekly, monthly, quarterly, or yearly reports for customers and orders and calculates a range of common ecommerce metrics to show business performance.

#### 1. Customers report

The `customers_report()` function takes a formatted dataframe of transaction items (see above) and a desired frequency (D for daily, W for weekly, M for monthly, Q for quarterly) and calculates aggregate metrics for each period.

The function returns the number of orders, the number of customers, the number of new customers, the number of returning customers, and the acquisition rate (or proportion of new customers). For monthly reporting, I would recommend a 13-month period so you can compare the last month with the same month the previous year.

```python
from ecommercetools import reports

df_customers_report = reports.customers_report(transaction_items, frequency='M')
print(df_customers_report.head(13))
```

#### 2. Transactions report

The `transactions_report()` function takes a formatted dataframe of transaction items (see above) and a desired frequency (D for daily, W for weekly, M for monthly, Q for quarterly) and calculates aggregate metrics for each period.

The metrics returned are: customers, orders, revenue, SKUs, units, average order value, average SKUs per order, average units per order, and average revenue per customer.
2274 | 2275 | ```python 2276 | from ecommercetools import reports 2277 | 2278 | df_orders_report = reports.transactions_report(transaction_items, frequency='M') 2279 | print(df_orders_report.head(13)) 2280 | ``` 2281 | 2282 | -------------------------------------------------------------------------------- /banner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/practical-data-science/ecommercetools/d5d5e9a4a6c7fafd0b6931c13d9cf3865a154b76/banner.png -------------------------------------------------------------------------------- /ecommercetools/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.38" 2 | __author__ = "Matt Clarke" 3 | -------------------------------------------------------------------------------- /ecommercetools/advertising/__init__.py: -------------------------------------------------------------------------------- 1 | from ecommercetools.advertising.advertising import generate_ad_keywords 2 | from ecommercetools.advertising.advertising import generate_spintax 3 | -------------------------------------------------------------------------------- /ecommercetools/advertising/advertising.py: -------------------------------------------------------------------------------- 1 | import re 2 | import random 3 | import itertools 4 | import pandas as pd 5 | 6 | 7 | def _match_type_exact(keywords): 8 | exact = [] 9 | for keyword in keywords: 10 | exact.append([keyword[0], '[' + keyword[1] + ']']) 11 | 12 | df = pd.DataFrame.from_records(exact, columns=['product', 'keywords']) 13 | df['match_type'] = 'Exact' 14 | 15 | return df 16 | 17 | 18 | def _match_type_phrase(keywords): 19 | phrase = [] 20 | for keyword in keywords: 21 | phrase.append([keyword[0], '"' + keyword[1] + '"']) 22 | 23 | df = pd.DataFrame.from_records(phrase, columns=['product', 'keywords']) 24 | df['match_type'] = 'Phrase' 25 | 26 | return df 27 | 28 | 29 | def _match_type_broad(keywords): 30 | broad = [] 31 | for keyword in keywords: 32 | broad.append([keyword[0], keyword[1]]) 33 | 34 | df = pd.DataFrame.from_records(broad, columns=['product', 'keywords']) 35 | df['match_type'] = 'Broad' 36 | 37 | return df 38 | 39 | 40 | def _match_type_broad_modified(keywords): 41 | broad_modified = [] 42 | for keyword in keywords: 43 | bmm = ['+' + keyword[1].replace(' ', ' +')] 44 | broad_modified.append([keyword[0], bmm]) 45 | 46 | df = pd.DataFrame.from_records(broad_modified, columns=['product', 'keywords']) 47 | df['match_type'] = 'Modified' 48 | 49 | return df 50 | 51 | 52 | def _generate_combinations(products, 53 | keywords_prepend, 54 | keywords_append): 55 | """Return a list of all prepended and appended keywords combinations. 56 | 57 | Args: 58 | products (list): List of product names. 59 | keywords_prepend (list): List of keywords to prepend to product names. 60 | keywords_append (list): List of keywords to append to product names. 61 | 62 | Returns: 63 | keywords (list): List of lists containing the product name and keyword combination. 
64 | 65 | Example: 66 | [['fly rods', 'fly rods'], 67 | ['fly rods', 'buy fly rods'], 68 | ['fly rods', 'best fly rods']] 69 | """ 70 | 71 | keywords = [] 72 | 73 | for product in products: 74 | keywords.append([product, product]) 75 | 76 | for keyword_prepend in keywords_prepend: 77 | keywords.append([product, keyword_prepend + ' ' + product]) 78 | 79 | for keyword_append in keywords_append: 80 | keywords.append([product, product + ' ' + keyword_append]) 81 | 82 | return keywords 83 | 84 | 85 | def generate_ad_keywords(products, 86 | keywords_prepend, 87 | keywords_append, 88 | campaign_name): 89 | """Return a Pandas dataframe of keywords data for use in Google Adwords. 90 | 91 | Args: 92 | products (list): List of product names. 93 | keywords_prepend (list): List of keywords to prepend to product names. 94 | keywords_append (list): List of keywords to append to product names. 95 | campaign_name (str): Name of paid search campaign. 96 | 97 | Returns: 98 | df (object): Pandas dataframe containing generated data. 99 | """ 100 | 101 | keywords = _generate_combinations(products, keywords_prepend, keywords_append) 102 | 103 | exact = _match_type_exact(keywords) 104 | phrase = _match_type_phrase(keywords) 105 | broad = _match_type_broad(keywords) 106 | broad_modified = _match_type_broad_modified(keywords) 107 | 108 | df = pd.concat([exact, phrase, broad, broad_modified]) 109 | df['campaign_name'] = campaign_name 110 | return df 111 | 112 | 113 | def generate_spintax(text, single=True): 114 | """Return a list of unique spins of a Spintax text string. 115 | 116 | Args: 117 | text (string): Spintax text (i.e. I am the {President|King|Ambassador} of Nigeria.) 118 | single (bool, optional): Optional boolean to return a list or a single spin. 119 | 120 | Returns: 121 | spins (string, list): Single spin or list of spins depending on single. 
122 | """ 123 | 124 | pattern = re.compile('({[^}]+}|[^{}]*)') 125 | chunks = pattern.split(text) 126 | 127 | def options(s): 128 | if len(s) > 0 and s[0] == '{': 129 | return [opt for opt in s[1:-1].split('|')] 130 | return [s] 131 | 132 | parts_list = [options(chunk) for chunk in chunks] 133 | 134 | spins = [] 135 | 136 | for spin in itertools.product(*parts_list): 137 | spins.append(''.join(spin)) 138 | 139 | if single: 140 | return spins[random.randint(0, len(spins) - 1)] 141 | else: 142 | return spins 143 | 144 | -------------------------------------------------------------------------------- /ecommercetools/customers/__init__.py: -------------------------------------------------------------------------------- 1 | from ecommercetools.customers.customers import get_customers 2 | from ecommercetools.customers.customers import get_rfm_segments 3 | from ecommercetools.customers.customers import get_abc_segments 4 | from ecommercetools.customers.customers import get_cohorts 5 | from ecommercetools.customers.customers import get_cohort_matrix 6 | from ecommercetools.customers.customers import get_retention 7 | from ecommercetools.customers.customers import get_latency 8 | from ecommercetools.customers.customers import get_customer_predictions 9 | -------------------------------------------------------------------------------- /ecommercetools/customers/customers.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import operator as op 4 | from ecommercetools.transactions import transactions 5 | from ecommercetools import utilities 6 | from sklearn.cluster import KMeans 7 | from sklearn.preprocessing import StandardScaler 8 | from lifetimes import GammaGammaFitter 9 | from lifetimes.utils import summary_data_from_transaction_data 10 | from lifetimes import BetaGeoFitter 11 | 12 | 13 | def get_customers(transaction_items): 14 | """Return a Pandas DataFrame of customers from a Pandas DataFrame of transaction items. 15 | 16 | Args: 17 | transaction_items (object): DataFrame containing order_id, sku, quantity, unit_price, customer_id, order_date 18 | 19 | Returns: 20 | customers: Pandas DataFrame containing customers 21 | """ 22 | 23 | customers = transaction_items.groupby('customer_id').agg( 24 | revenue=('line_price', 'sum'), 25 | orders=('order_id', 'nunique'), 26 | skus=('sku', 'nunique'), 27 | items=('quantity', 'sum'), 28 | first_order_date=('order_date', 'min'), 29 | last_order_date=('order_date', 'max') 30 | ).reset_index() 31 | customers['avg_items'] = round((customers['items'] / customers['orders']), 2) 32 | customers['avg_order_value'] = round((customers['revenue'] / customers['orders']), 2) 33 | customers['tenure'] = (pd.to_datetime('today') - customers['first_order_date']).dt.days 34 | customers['recency'] = (pd.to_datetime('today') - customers['last_order_date']).dt.days 35 | customers['cohort'] = customers['first_order_date'].dt.year.astype(str) + \ 36 | customers['first_order_date'].dt.quarter.astype(str) 37 | return customers 38 | 39 | 40 | def _sorted_kmeans(df, 41 | metric_column, 42 | cluster_name, 43 | ascending=True): 44 | """Runs a K-means clustering algorithm on a specific metric column in a Pandas dataframe. 45 | 46 | Sorts the data in a specified direction; and reassigns cluster numbers to match the data distribution, 47 | so they are appropriate for RFM segmentation. You may need to log transform heavily skewed data. 
48 | 49 | Args: 50 | df (object): Pandas dataframe 51 | metric_column (str): Name of metric column 52 | ascending (bool, optional): Set to False to sort in descending order 53 | cluster_name (str): Name of cluster 54 | 55 | Returns: 56 | Original Pandas DataFrame with additional column 57 | """ 58 | 59 | # Fit the model 60 | kmeans = KMeans(n_clusters=5) 61 | kmeans.fit(df[[metric_column]]) 62 | 63 | # Assign the initial unsorted cluster 64 | initial_cluster = 'unsorted_' + cluster_name 65 | df[initial_cluster] = kmeans.predict(df[[metric_column]]) + 1 66 | df[cluster_name] = df[initial_cluster] 67 | 68 | # Group the clusters and re-rank to determine the correct order 69 | df_sorted = df.groupby(initial_cluster)[metric_column].mean().round(2).reset_index() 70 | df_sorted = df_sorted.sort_values(by=metric_column, ascending=ascending).reset_index(drop=True) 71 | df_sorted[cluster_name] = df_sorted[metric_column].rank(method='max', ascending=ascending).astype(int) 72 | 73 | # Merge data and drop redundant columns 74 | df = df.merge(df_sorted[[cluster_name, initial_cluster]], on=[initial_cluster]) 75 | df = df.drop(initial_cluster, axis=1) 76 | df = df.drop(cluster_name + '_x', axis=1) 77 | df = df.rename(columns={cluster_name + '_y': cluster_name}) 78 | 79 | return df 80 | 81 | 82 | def _label_rfm_segments(rfm): 83 | """Return a label for a customer based on their RFM score 84 | 85 | Args: 86 | rfm (int): Full three-digit RFM score, i.e. 555 or 111 87 | 88 | Returns: 89 | label (str): Descriptive RFM score label, i.e. Risky 90 | """ 91 | 92 | rfm = int(rfm) 93 | 94 | if (rfm >= 111) & (rfm <= 155): 95 | return 'Risky' 96 | 97 | elif (rfm >= 211) & (rfm <= 255): 98 | return 'Hold and improve' 99 | 100 | elif (rfm >= 311) & (rfm <= 353): 101 | return 'Potential loyal' 102 | 103 | elif ((rfm >= 354) & (rfm <= 454)) or ((rfm >= 511) & (rfm <= 535)) or (rfm == 541): 104 | return 'Loyal' 105 | 106 | elif (rfm == 455) or (rfm >= 542) & (rfm <= 555): 107 | return 'Star' 108 | 109 | else: 110 | return 'Other' 111 | 112 | 113 | def get_rfm_segments(customers): 114 | """Return a Pandas DataFrame of customer RFM segments from a Pandas DataFrame of customers. 115 | 116 | The DataFrame returned by get_customers() already contains the raw data required, but 117 | this function will rename it accordingly and use it to assign the customer to a range 118 | of different segments that can be used for marketing and analysis. 
119 | 120 | Args: 121 | customers: Pandas DataFrame from get_customers() 122 | 123 | Returns: 124 | segments: Pandas DataFrame 125 | 126 | """ 127 | 128 | # Rename the raw data columns 129 | segments = customers[['customer_id']] 130 | segments = segments.assign(acquisition_date=customers['first_order_date']) 131 | segments = segments.assign(recency_date=customers['last_order_date']) 132 | segments = segments.assign(recency=customers['recency']) 133 | segments = segments.assign(frequency=customers['orders']) 134 | segments = segments.assign(monetary=customers['revenue']) 135 | segments = segments.assign(heterogeneity=customers['skus']) 136 | segments = segments.assign(tenure=customers['tenure']) 137 | 138 | # Use K-means to create RFMH scores 139 | segments = _sorted_kmeans(segments, 'recency', 'r', ascending=False) 140 | segments = _sorted_kmeans(segments, 'frequency', 'f', ascending=True) 141 | segments = _sorted_kmeans(segments, 'monetary', 'm', ascending=True) 142 | segments = _sorted_kmeans(segments, 'heterogeneity', 'h', ascending=True) 143 | 144 | # Create scores 145 | segments = segments.assign(rfm=segments['r'].astype(str) + \ 146 | segments['f'].astype(str) + \ 147 | segments['m'].astype(str)) 148 | 149 | segments = segments.assign(rfm_score=segments['r'].astype(int) + \ 150 | segments['f'].astype(int) + \ 151 | segments['m'].astype(int)) 152 | 153 | # Create labels 154 | segments['rfm_segment_name'] = segments.apply(lambda x: _label_rfm_segments(x.rfm), axis=1) 155 | 156 | return segments 157 | 158 | 159 | def _abc_classify_customer(percentage): 160 | """Apply an ABC classification to each customer based on its ranked percentage revenue contribution. 161 | 162 | Args: 163 | percentage (float): Cumulative percentage of ranked revenue 164 | 165 | Returns: 166 | segments: Pandas DataFrame 167 | """ 168 | 169 | if 0 < percentage <= 80: 170 | return 'A' 171 | elif 80 < percentage <= 90: 172 | return 'B' 173 | else: 174 | return 'C' 175 | 176 | 177 | def get_abc_segments(customers, 178 | months=12, 179 | abc_class_name='abc_class_12m', 180 | abc_rank_name='abc_rank_12m'): 181 | """Return a dataframe containing the ABC class and rank for each customer. 182 | 183 | Apply an ABC classification to each customer based on its ranked percentage revenue contribution. 184 | This automatically uses a 12 month period by default, but can be modified for other periods to suit. 
185 | 
186 |     Args:
187 | 
188 |         customers (object): Pandas DataFrame from get_customers()
189 |         months (int, optional): Number of months to use for ABC analysis (12 by default)
190 |         abc_class_name (str, optional): Name to assign to the ABC class column (abc_class_12m by default)
191 |         abc_rank_name (str, optional): Name to assign to the ABC rank column (abc_rank_12m by default)
192 | 
193 |     Returns:
194 |         abc: Pandas DataFrame
195 |     """
196 | 
197 |     # Calculate data for customers who purchased within the specified period
198 |     purchased = customers[customers['recency'] <= (months * 30)].copy()
199 |     purchased = purchased.sort_values(by='revenue', ascending=False)
200 |     purchased['revenue_cumsum'] = purchased['revenue'].cumsum()
201 |     purchased['revenue_total'] = purchased['revenue'].sum()
202 |     purchased['revenue_running_percentage'] = (purchased['revenue_cumsum'] / purchased['revenue_total']) * 100
203 |     purchased[abc_class_name] = purchased['revenue_running_percentage'].apply(_abc_classify_customer)
204 |     purchased[abc_rank_name] = purchased['revenue_running_percentage'].rank().astype(int)
205 |     purchased.drop(['revenue_cumsum', 'revenue_total', 'revenue_running_percentage'], axis=1, inplace=True)
206 | 
207 |     # Assign lapsed customers to class D
208 |     lapsed = customers[customers['recency'] > (months * 30)]
209 | 
210 |     # Return ABC segments (DataFrame.append was removed in pandas 2.0, so use concat)
211 |     abc = pd.concat([purchased, lapsed])
212 |     abc[abc_class_name] = abc[abc_class_name].fillna('D')
213 |     abc[abc_rank_name] = abc[abc_rank_name].fillna(len(purchased) + 1)
214 |     abc = abc[['customer_id', abc_class_name, abc_rank_name]]
215 |     return abc
216 | 
217 | 
218 | def get_cohorts(df, period='M'):
219 |     """Return a customer cohort matrix from a dataframe of transactional items.
220 | 
221 |     Given a Pandas DataFrame of transactional items, this function returns
222 |     a Pandas DataFrame containing the acquisition cohort and order cohort which
223 |     can be used for customer analysis or the creation of a cohort analysis matrix.
224 | 
225 |     Args:
226 |         df (object): Pandas DataFrame. Required columns: order_id, customer_id, order_date.
227 |         period (str, optional): Period value - M, Q, or Y. Create cohorts using month, quarter, or year of acquisition.
228 | 
229 |     Returns:
230 |         df (object): Pandas DataFrame
231 |     """
232 | 
233 |     df = df[['customer_id', 'order_id', 'order_date']].drop_duplicates()
234 |     df = df.assign(acquisition_cohort=df.groupby('customer_id') \
235 |                    ['order_date'].transform('min').dt.to_period(period))
236 |     df = df.assign(order_cohort=df['order_date'].dt.to_period(period))
237 |     return df
238 | 
239 | 
240 | def get_retention(df, period='M'):
241 |     """Calculate the retention of customers in each month after their acquisition.
242 | 
243 |     Args:
244 |         df (object): Pandas DataFrame. Required columns: order_id, customer_id, order_date.
245 |         period (str, optional): Period value - M, Q, or Y. Create cohorts using month, quarter, or year of acquisition.
246 | 
247 |     Returns:
248 |         df (object): Pandas DataFrame
249 | 
250 |     """
251 | 
252 |     df = get_cohorts(df, period).groupby(['acquisition_cohort', 'order_cohort']) \
253 |         .agg(customers=('customer_id', 'nunique')) \
254 |         .reset_index(drop=False)
255 |     df['periods'] = (df.order_cohort - df.acquisition_cohort) \
256 |         .apply(op.attrgetter('n'))
257 | 
258 |     return df
259 | 
260 | 
261 | def get_cohort_matrix(df, period='M', percentage=False):
262 |     """Return a cohort matrix showing the number of customers who purchased in each period after their acquisition.
263 | 
264 |     Args:
265 |         df (object): Pandas DataFrame.
Required columns: order_id, customer_id, order_date. 266 | period (str, optional): Period value - M, Q, or Y. Create cohorts using month, quarter, or year of acquisition. 267 | percentage (bool, optional): True or False. Return raw numbers or a percentage retention. 268 | 269 | Returns: 270 | df (object): Pandas DataFrame 271 | """ 272 | 273 | df = get_retention(df, period).pivot_table(index='acquisition_cohort', 274 | columns='periods', 275 | values='customers') 276 | 277 | if percentage: 278 | df = df.divide(df.iloc[:, 0], axis=0) 279 | 280 | return df 281 | 282 | 283 | def _days_to_next_order(avg_latency, std_latency, recency): 284 | """Estimate the number of days to a customer's next order using latency. 285 | 286 | Args: 287 | avg_latency (float): Average latency in days 288 | std_latency (float): Standard deviation of latency in days 289 | recency (float): Recency in days 290 | Returns: 291 | Approximate number of days until the next order. 292 | """ 293 | 294 | return avg_latency - (recency - std_latency) 295 | 296 | 297 | def _latency_label_customers(avg_latency, std_latency, recency): 298 | """Add a label to describe a customer's latency metric. 299 | 300 | Args: 301 | avg_latency (float): Average latency in days 302 | std_latency (float): Standard deviation of latency in days 303 | recency (float): Recency in days 304 | Returns: 305 | Label describing the latency metric in relation to the customer. 306 | """ 307 | 308 | days_to_next_order_upper = avg_latency - (recency - std_latency) 309 | days_to_next_order_lower = avg_latency - (recency + std_latency) 310 | 311 | if recency < days_to_next_order_lower: 312 | return 'Order not due' 313 | 314 | elif (recency <= days_to_next_order_lower) or (recency <= days_to_next_order_upper): 315 | return 'Order due soon' 316 | 317 | elif recency > days_to_next_order_upper: 318 | return 'Order overdue' 319 | 320 | else: 321 | return 'Not sure' 322 | 323 | 324 | def get_latency(df_transactions): 325 | """Return a Pandas dataframe containing latency metrics for each customer. 326 | 327 | Args: 328 | df_transactions: Pandas dataframe from get_transactions(). 329 | 330 | Returns: 331 | Pandas dataframe of customer purchase latency metrics. 
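
    Example (hypothetical usage, assuming a transactions dataframe from
    get_transactions()):

        >>> df_latency = get_latency(transactions_df)
        >>> df_latency[['customer_id', 'avg_latency', 'days_to_next_order', 'label']].head()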
332 | """ 333 | 334 | # Create latency dataframe and calculate granular metrics 335 | df_latency = df_transactions[['order_id', 'customer_id', 'order_date', 'revenue']] 336 | df_latency = df_latency[df_latency['revenue'] > 0] 337 | df_latency = df_latency.sort_values(by=['order_date'], ascending=False) 338 | df_latency['prev_order_date'] = utilities.get_previous_value(df_latency, 'customer_id', 'order_date') 339 | df_latency['days_since_prev_order'] = utilities.get_days_since_date(df_latency, 'prev_order_date', 'order_date') 340 | df_latency['order_number'] = utilities.get_cumulative_count(df_latency, 'customer_id', 'order_id', 'order_date') 341 | 342 | # Create customer dataframe and calculate aggregate metrics 343 | df_customers = pd.DataFrame(df_latency['customer_id'].unique()) 344 | df_customers.columns = ['customer_id'] 345 | 346 | # Calculate frequency 347 | df_frequency = df_latency.groupby('customer_id')['order_id'].nunique().reset_index() 348 | df_frequency.columns = ['customer_id', 'frequency'] 349 | df_customers = df_customers.merge(df_frequency, on='customer_id') 350 | 351 | # Calculate recency 352 | df_recency = df_latency.groupby('customer_id')['order_date'].max().reset_index() 353 | df_recency.columns = ['customer_id', 'recency_date'] 354 | df_customers = df_customers.merge(df_recency, on='customer_id') 355 | df_customers['recency'] = round((pd.to_datetime('today') - df_customers['recency_date']) \ 356 | / np.timedelta64(1, 'D')).astype(int) 357 | 358 | # Calculate average latency 359 | df_avg_latency = df_latency.groupby('customer_id')['days_since_prev_order'].mean().astype(int).reset_index() 360 | df_avg_latency.columns = ['customer_id', 'avg_latency'] 361 | df_customers = df_customers.merge(df_avg_latency, on='customer_id') 362 | 363 | # Calculate standard deviation of latency for returning customers 364 | df_latency_returning = df_latency[df_latency['order_number'] > 0] 365 | 366 | # Min latency 367 | df_min = df_latency_returning.groupby('customer_id')['days_since_prev_order'].min().astype(int).reset_index() 368 | df_min.columns = ['customer_id', 'min_latency'] 369 | df_customers = df_customers.merge(df_min, on='customer_id') 370 | 371 | # Max latency 372 | df_max = df_latency_returning.groupby('customer_id')['days_since_prev_order'].max().astype(int).reset_index() 373 | df_max.columns = ['customer_id', 'max_latency'] 374 | df_customers = df_customers.merge(df_max, on='customer_id') 375 | 376 | # STD latency 377 | df_std = df_latency_returning.groupby('customer_id')['days_since_prev_order'].std().reset_index() 378 | df_std.columns = ['customer_id', 'std_latency'] 379 | df_customers = df_customers.merge(df_std, on='customer_id') 380 | 381 | # Coefficient of Variation of latency 382 | df_customers['cv'] = df_customers['std_latency'] / df_customers['avg_latency'] 383 | 384 | # Calculate approximate days to next order 385 | df_customers['days_to_next_order'] = df_customers.apply( 386 | lambda x: _days_to_next_order(x['avg_latency'], x['std_latency'], x['recency']), axis=1).round() 387 | 388 | # Label latency 389 | df_customers['label'] = df_customers.apply( 390 | lambda x: _latency_label_customers(x['avg_latency'], x['std_latency'], x['recency']), axis=1) 391 | 392 | return df_customers 393 | 394 | 395 | def _get_lifetimes_rfmt(df_transactions, observation_period_end): 396 | """Return the RFMT data from the Lifetimes model. 
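
    In the Lifetimes RFMT summary, frequency counts repeat purchases (a
    one-time buyer has frequency 0), recency is the time between a customer's
    first and last purchase, and T is the time between their first purchase
    and the end of the observation period (illustrative column check):

        >>> _get_lifetimes_rfmt(transactions_df, '2011-12-09').columns.tolist()
        ['frequency', 'recency', 'T', 'monetary_value']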
397 | 
398 |     Args:
399 |         df_transactions (df): Pandas dataframe of transactions from get_transactions()
400 |         observation_period_end (string): Date string in YYYY-MM-DD format representing end of observation period.
401 | 
402 |     Returns:
403 |         df: Pandas dataframe containing frequency, recency, T, monetary_value per customer.
404 |     """
405 | 
406 |     df_transactions = df_transactions[df_transactions['replacement'] == 0]
407 | 
408 |     df = summary_data_from_transaction_data(df_transactions,
409 |                                             'customer_id',
410 |                                             'order_date',
411 |                                             'revenue',
412 |                                             observation_period_end=observation_period_end)
413 |     return df
414 | 
415 | 
416 | def _get_predicted_purchases(df_transactions,
417 |                              observation_period_end,
418 |                              days=90):
419 |     """Return the number of predicted purchases per customer from the Lifetimes BG/NBD model.
420 | 
421 |     Args:
422 |         df_transactions (df): Pandas dataframe of transactions from get_transactions()
423 |         observation_period_end (string): Date string in YYYY-MM-DD format representing end of observation period.
424 |         days (int, optional): Number of days in the purchase prediction window (90 by default).
425 |     Returns:
426 |         df: Pandas dataframe containing frequency, recency, T, monetary_value per customer, and predicted purchases.
427 |     """
428 | 
429 |     df = _get_lifetimes_rfmt(df_transactions, observation_period_end)
430 |     bgf = BetaGeoFitter(penalizer_coef=0)
431 |     bgf.fit(df['frequency'], df['recency'], df['T'])
432 |     df['predicted_purchases'] = bgf.conditional_expected_number_of_purchases_up_to_time(days,
433 |                                                                                         df['frequency'],
434 |                                                                                         df['recency'],
435 |                                                                                         df['T'])
436 |     return df
437 | 
438 | 
439 | def _get_predicted_aov(df_transactions,
440 |                        observation_period_end,
441 |                        ggf_penalizer_coef=0):
442 |     """Return the predicted AOV for each customer via the Gamma-Gamma model.
443 |     This function uses models from the Lifetimes package.
444 | 
445 |     Args:
446 |         df_transactions (df): Pandas dataframe of transactions from get_transactions()
447 |         observation_period_end (string): Date string in YYYY-MM-DD format for end of observation period.
448 |         ggf_penalizer_coef (float, optional): Penalizer coefficient for Gamma-Gamma model. See Lifetimes.
449 | 
450 |     Returns:
451 |         Predicted AOV for each customer.
452 |     """
453 | 
454 |     df_rfmt = _get_lifetimes_rfmt(df_transactions, observation_period_end)
455 | 
456 |     df_returning = df_rfmt[df_rfmt['frequency'] > 0]
457 |     df_returning = df_returning[df_returning['monetary_value'] > 0]  # filter the returning subset, not df_rfmt
458 | 
459 |     ggf = GammaGammaFitter(penalizer_coef=ggf_penalizer_coef)
460 |     ggf.fit(df_returning['frequency'],
461 |             df_returning['monetary_value'])
462 | 
463 |     predicted_monetary = ggf.conditional_expected_average_profit(
464 |         df_returning['frequency'],
465 |         df_returning['monetary_value']
466 |     )
467 | 
468 |     aov_df = predicted_monetary.to_frame(name='aov')  # name the column explicitly
469 | 
470 |     return aov_df
471 | 
472 | 
473 | def _get_predicted_clv(df_transactions,
474 |                        observation_period_end,
475 |                        months=12,
476 |                        discount_rate=0.01,
477 |                        ggf_penalizer_coef=0,
478 |                        bgf_penalizer_coef=0):
479 |     """Return the predicted CLV for each customer using the Gamma-Gamma and BG/NBD models.
480 |     This function uses models from the Lifetimes package.
481 | 
482 |     Args:
483 |         df_transactions (df): Pandas dataframe of transactions from get_transactions()
484 |         observation_period_end (string): Date string in YYYY-MM-DD format for end of observation period.
485 |         months (int, optional): Optional number of months in CLV prediction window.
486 |         discount_rate (float, optional): Discount rate. See Lifetimes.
487 |         ggf_penalizer_coef (float, optional): Penalizer coefficient for Gamma-Gamma model. See Lifetimes.
488 |         bgf_penalizer_coef (float, optional): Penalizer coefficient for BG/NBD model. See Lifetimes.
489 | 
490 |     Returns:
491 |         Predicted CLV for each customer.
492 |     """
493 | 
494 |     df_rfmt = _get_lifetimes_rfmt(df_transactions, observation_period_end)
495 |     df_returning = df_rfmt[df_rfmt['frequency'] > 0]
496 |     df_returning = df_returning[df_returning['monetary_value'] > 0]  # filter the returning subset, not df_rfmt
497 | 
498 |     ggf = GammaGammaFitter(penalizer_coef=ggf_penalizer_coef)
499 |     ggf.fit(df_returning['frequency'],
500 |             df_returning['monetary_value'])
501 | 
502 |     bgf = BetaGeoFitter(penalizer_coef=bgf_penalizer_coef)
503 |     bgf.fit(df_returning['frequency'],
504 |             df_returning['recency'],
505 |             df_returning['T'])
506 | 
507 |     preds = ggf.customer_lifetime_value(
508 |         bgf,
509 |         df_returning['frequency'],
510 |         df_returning['recency'],
511 |         df_returning['T'],
512 |         df_returning['monetary_value'],
513 |         time=months,
514 |         discount_rate=discount_rate
515 |     ).to_frame().reset_index()
516 | 
517 |     return preds
518 | 
519 | 
520 | def get_customer_predictions(df_transactions,
521 |                              observation_period_end,
522 |                              days=90,
523 |                              months=3,
524 |                              discount_rate=0.01,
525 |                              ggf_penalizer_coef=0,
526 |                              bgf_penalizer_coef=0):
527 |     """Get predicted customer purchases, AOV, and CLV for the defined period.
528 | 
529 |     This uses the Lifetimes package to run the Gamma-Gamma and BG/NBD models
530 |     and predict the AOV, CLV, and number of purchases each customer will make.
531 |     These models measure RFMT differently to the other functions in
532 |     EcommerceTools, so the underlying values are not directly comparable and
533 |     have been removed from the output.
534 | 
535 |     Args:
536 |         df_transactions (df): Pandas dataframe of transactions from get_transactions()
537 |         observation_period_end (string): Date string in YYYY-MM-DD format for end of observation period.
538 |         days (int, optional): Optional number of days in purchase prediction window.
539 |         months (int, optional): Optional number of months in CLV prediction window.
540 |         discount_rate (float, optional): Discount rate. See Lifetimes.
541 |         ggf_penalizer_coef (float, optional): Penalizer coefficient for Gamma-Gamma model. See Lifetimes.
542 |         bgf_penalizer_coef (float, optional): Penalizer coefficient for BG/NBD model. See Lifetimes.
543 | 
544 |     Returns:
545 |         df_predictions: Pandas dataframe containing predictions from Gamma-Gamma and BG/NBD models.
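
    Example (hypothetical usage; the observation period end date is an
    assumption based on the sample Online Retail dataset):

        >>> predictions = get_customer_predictions(transactions_df,
        ...                                        observation_period_end='2011-12-09',
        ...                                        days=90)
        >>> predictions.head()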
546 |     """
547 | 
548 |     df_predicted_purchases = _get_predicted_purchases(df_transactions,
549 |                                                       observation_period_end,
550 |                                                       days=days)
551 |     df_aov = _get_predicted_aov(df_transactions,
552 |                                 observation_period_end)
553 | 
554 |     df_clv = _get_predicted_clv(df_transactions,
555 |                                 observation_period_end,
556 |                                 months=months,
557 |                                 discount_rate=discount_rate,
558 |                                 bgf_penalizer_coef=bgf_penalizer_coef,
559 |                                 ggf_penalizer_coef=ggf_penalizer_coef
560 |                                 )
561 | 
562 |     df_predictions = df_predicted_purchases.merge(df_aov, on='customer_id', how='left')
563 |     df_predictions = df_predictions.merge(df_clv, on='customer_id', how='left')
564 | 
565 |     return df_predictions[['customer_id', 'predicted_purchases', 'aov', 'clv']]
566 | 
567 | 
--------------------------------------------------------------------------------
/ecommercetools/marketing/__init__.py:
--------------------------------------------------------------------------------
1 | from ecommercetools.marketing.marketing import get_trading_events
2 | from ecommercetools.marketing.marketing import get_trading_calendar
3 | 
4 | 
--------------------------------------------------------------------------------
/ecommercetools/marketing/marketing.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from pandas.tseries.offsets import BDay
3 | from pandas.tseries.holiday import (
4 |     AbstractHolidayCalendar, Holiday, DateOffset, SU, MO, TU, WE, TH, FR, SA, next_monday,
5 |     nearest_workday, sunday_to_monday, EasterMonday, GoodFriday, Easter
6 | )
7 | 
8 | 
9 | class UKEcommerceTradingCalendar(AbstractHolidayCalendar):
10 |     rules = [
11 | 
12 |         # Pay days (approximated as the last working day of each month)
13 |         Holiday('January Pay Day', month=1, day=31, offset=BDay(-1)),
14 |         Holiday('February Pay Day', month=2, day=28, offset=BDay(-1)),
15 |         Holiday('March Pay Day', month=3, day=31, offset=BDay(-1)),
16 |         Holiday('April Pay Day', month=4, day=30, offset=BDay(-1)),
17 |         Holiday('May Pay Day', month=5, day=31, offset=BDay(-1)),
18 |         Holiday('June Pay Day', month=6, day=30, offset=BDay(-1)),
19 |         Holiday('July Pay Day', month=7, day=31, offset=BDay(-1)),
20 |         Holiday('August Pay Day', month=8, day=31, offset=BDay(-1)),
21 |         Holiday('September Pay Day', month=9, day=30, offset=BDay(-1)),
22 |         Holiday('October Pay Day', month=10, day=31, offset=BDay(-1)),
23 |         Holiday('November Pay Day', month=11, day=30, offset=BDay(-1)),
24 |         Holiday('December Pay Day', month=12, day=31, offset=BDay(-1)),
25 | 
26 |         # Seasonal trading events
27 |         Holiday('January sale', month=1, day=1),
28 |         Holiday('Valentine\'s Day [last order date]', month=2, day=14, offset=BDay(-2)),
29 |         Holiday('Valentine\'s Day', month=2, day=14),
30 |         Holiday('Mother\'s Day [last order date]', month=5, day=1, offset=[pd.DateOffset(weekday=SU(2)), BDay(-2)]),
31 |         Holiday('Mother\'s Day', month=5, day=1, offset=pd.DateOffset(weekday=SU(2))),
32 |         Holiday('Father\'s Day [last order date]', month=6, day=1, offset=[pd.DateOffset(weekday=SU(3)), BDay(-2)]),
33 |         Holiday('Father\'s Day', month=6, day=1, offset=pd.DateOffset(weekday=SU(3))),
34 |         Holiday("Black Friday [sale starts]", month=11, day=1, offset=[pd.DateOffset(weekday=SA(4)), BDay(-5)]),
35 |         Holiday('Black Friday', month=11, day=1, offset=pd.DateOffset(weekday=FR(4))),
36 |         Holiday("Cyber Monday", month=11, day=1, offset=[pd.DateOffset(weekday=SA(4)), pd.DateOffset(2)]),
37 |         Holiday('Christmas Day [last order date]', month=12, day=25, offset=BDay(-2)),
38 |         Holiday('Boxing Day sale', month=12, day=26),
39 |     ]
40 | 
41 | 
42 | def _get_dates(start_date, days=365):
43 |     """Get all dates from a start date to a given end date X days ahead.
44 | 
45 |     Args:
46 |         start_date (YYYY-MM-DD): Start date, i.e. 2021-01-01
47 |         days (int, optional): Number of days ahead to include (365 by default)
48 | 
49 |     Returns:
50 |         Dataframe of dates X days ahead of the start date
51 |     """
52 | 
53 |     period = pd.date_range(start_date, periods=days, freq='D')
54 |     df = pd.DataFrame({'date': period})
55 |     return df
56 | 
57 | 
58 | def get_trading_events(start_date, days=365):
59 |     """Calculate and return all trading events from the UK ecommerce trading calendar.
60 | 
61 |     Args:
62 |         start_date (YYYY-MM-DD): Start date, i.e. 2021-01-01
63 |         days (int, optional): Number of days ahead to include (365 by default)
64 | 
65 |     Returns:
66 |         Dataframe of the name and date of each ecommerce trading event.
67 |     """
68 | 
69 |     dates = _get_dates(start_date, days)
70 | 
71 |     calendar = UKEcommerceTradingCalendar()
72 |     start = dates.date.min()
73 |     end = dates.date.max()
74 | 
75 |     events = calendar.holidays(start=start, end=end, return_name=True)
76 |     events = events.reset_index(name='event').rename(columns={'index': 'date'})
77 | 
78 |     return events
79 | 
80 | 
81 | def get_trading_calendar(start_date, days=365):
82 |     """Return a full ecommerce trading calendar for the specified period.
83 | 
84 |     Args:
85 |         start_date (YYYY-MM-DD): Start date, i.e. 2021-01-01
86 |         days (int, optional): Number of days ahead to include (365 by default)
87 | 
88 |     Returns:
89 |         Pandas dataframe containing full calendar of ecommerce trading events.
90 |     """
91 | 
92 |     dates = _get_dates(start_date, days)
93 |     events = get_trading_events(start_date, days)
94 | 
95 |     calendar = dates.merge(events, on='date', how='left').fillna('')
96 |     return calendar
97 | 
--------------------------------------------------------------------------------
/ecommercetools/nlp/__init__.py:
--------------------------------------------------------------------------------
1 | from ecommercetools.nlp.nlp import get_summaries
2 | 
--------------------------------------------------------------------------------
/ecommercetools/nlp/nlp.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from transformers import pipeline
3 | 
4 | 
5 | def get_summary(text,
6 |                 min_length=50,
7 |                 max_length=100,
8 |                 do_sample=False):
9 |     """Return a summary from a piece of text using a transformer model.
10 | 
11 |     Args:
12 |         text (string): String of text to summarize. Will be truncated to the first 1024 characters.
13 |         min_length (int): Minimum length to return.
14 |         max_length (int): Maximum length to return.
15 |         do_sample (boolean, optional): Set to False to generate unique text or True to extract excerpts.
16 | 
17 |     Returns:
18 |         string: Summarized text.
19 |     """
20 | 
21 |     summarizer = pipeline("summarization")
22 |     summary = summarizer(text[:1024],
23 |                          min_length=min_length,
24 |                          max_length=max_length,
25 |                          do_sample=do_sample)
26 |     summary_text = summary[0]['summary_text'].strip().replace(' .', '.')
27 | 
28 |     return summary_text
29 | 
30 | 
31 | def get_summaries(df,
32 |                   text_column,
33 |                   summary_column_name='summary',
34 |                   min_length=50,
35 |                   max_length=100,
36 |                   do_sample=False):
37 |     """Return a summary of each row of a specified dataframe column using a transformer model.
38 | 
39 |     Args:
40 |         df (dataframe): Pandas dataframe containing the text to summarize.
41 |         text_column (string): Name of text column to summarize. Will be truncated to the first 1024 characters.
42 |         summary_column_name (string, optional): Name of summary column.
43 |         min_length (int, optional): Minimum length to return.
44 |         max_length (int, optional): Maximum length to return.
45 | do_sample (boolean, optional): Set to False to generate unique text or True to extract excerpts. 46 | 47 | Returns: 48 | df['summary']: Original dataframe with additional column containing summaries. 49 | """ 50 | 51 | df[summary_column_name] = df.apply(lambda x: get_summary(x[text_column], 52 | min_length=min_length, 53 | max_length=max_length, 54 | do_sample=do_sample), axis=1) 55 | return df 56 | -------------------------------------------------------------------------------- /ecommercetools/operations/__init__.py: -------------------------------------------------------------------------------- 1 | from ecommercetools.operations.operations import get_inventory_classification 2 | -------------------------------------------------------------------------------- /ecommercetools/operations/operations.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from ecommercetools.products import products 3 | 4 | 5 | def _abc_classify_product(percentage): 6 | """Return an ABC classification for a product based on its ranked percentage revenue contribution. 7 | 8 | Args: 9 | percentage (float): Running percentage of revenue contributed by each SKU over a time period. 10 | 11 | Returns: 12 | class (string): ABC class string 13 | """ 14 | 15 | if 0 < percentage <= 80: 16 | return 'A' 17 | elif 80 < percentage <= 90: 18 | return 'B' 19 | else: 20 | return 'C' 21 | 22 | 23 | def get_inventory_classification(transaction_items, days=None, verbose=False): 24 | """Return a Pandas DataFrame of product inventory classification from the transaction items dataframe. 25 | 26 | Args: 27 | transaction_items (object): Pandas DataFrame of transaction items. 28 | days (int, optional): Return data only for products sold in the past X days. 29 | verbose (bool, optional): Displays additional columns of workings when set to True. 30 | 31 | Returns: 32 | products (object): Pandas DataFrame. 
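
    Example (hypothetical usage, assuming a transaction items dataframe in
    the format returned by load_sample_data()):

        >>> from ecommercetools import operations
        >>> inventory_classes = operations.get_inventory_classification(transaction_items)
        >>> inventory_classes.head()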
33 |     """
34 | 
35 |     # Filter to the last X days
36 |     if days:
37 |         products_data = products.get_products(transaction_items, days)
38 |     else:
39 |         products_data = products.get_products(transaction_items)
40 | 
41 |     # Sort the data
42 |     products_data['revenue_total'] = products_data['revenue'].sum()
43 |     products_data = products_data.sort_values(by='revenue', ascending=False)
44 | 
45 |     # ABC inventory classification
46 |     products_data['revenue_cumsum'] = products_data['revenue'].cumsum()
47 |     products_data['revenue_running_percentage'] = (products_data['revenue_cumsum'] / products_data['revenue_total']) * 100
48 |     products_data['abc_class'] = products_data['revenue_running_percentage'].apply(_abc_classify_product)
49 |     products_data['abc_rank'] = products_data['revenue_running_percentage'].rank().astype(int)
50 | 
51 |     if verbose:
52 |         products_data = products_data[['sku', 'abc_class', 'abc_rank', 'revenue',
53 |                                        'revenue_cumsum', 'revenue_total', 'revenue_running_percentage']]
54 |     else:
55 |         products_data = products_data[['sku', 'abc_class', 'abc_rank']]
56 | 
57 |     return products_data
58 | 
--------------------------------------------------------------------------------
/ecommercetools/products/__init__.py:
--------------------------------------------------------------------------------
1 | from ecommercetools.products.products import get_products
2 | from ecommercetools.products.products import get_repurchase_rates
3 | 
4 | 
--------------------------------------------------------------------------------
/ecommercetools/products/products.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from ecommercetools.utilities import tools
3 | 
4 | 
5 | def get_products(transaction_items, days=None):
6 |     """Return a Pandas DataFrame of products from a Pandas DataFrame of transaction items.
7 | 
8 |     Args:
9 |         transaction_items (object): Pandas DataFrame.
10 |         days (int, optional): Select only products sold in the last X days.
11 | 
12 |     Returns:
13 |         products (object): Pandas DataFrame
14 |     """
15 | 
16 |     if days:
17 |         transaction_items = tools.select_last_x_days(transaction_items, 'order_date', days)
18 | 
19 |     transaction_items = transaction_items.assign(line_price=transaction_items['quantity'] * transaction_items['unit_price'])
20 | 
21 |     products = transaction_items.groupby('sku').agg(
22 |         first_order_date=('order_date', 'min'),
23 |         last_order_date=('order_date', 'max'),
24 |         customers=('customer_id', 'nunique'),
25 |         orders=('order_id', 'nunique'),
26 |         items=('quantity', 'sum'),
27 |         revenue=('line_price', 'sum'),
28 |         avg_unit_price=('unit_price', 'mean'),
29 |         avg_quantity=('quantity', 'mean'),
30 |         avg_revenue=('line_price', 'mean')
31 |     ).reset_index()
32 | 
33 |     products['avg_orders'] = round(products['orders'] / products['customers'], 2)
34 |     products['product_tenure'] = (pd.to_datetime('today') - products['first_order_date']).dt.days
35 |     products['product_recency'] = (pd.to_datetime('today') - products['last_order_date']).dt.days
36 |     return products
37 | 
38 | 
39 | def get_repurchase_rate_label(df):
40 |     """Add a label describing the repurchase rate bin.
41 | 
42 |     Args:
43 |         df (object): Pandas DataFrame containing repurchase_rate.
44 | 
45 |     Returns:
46 |     -------
47 |         df (object): Pandas DataFrame with repurchase_rate_label added.
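
    Example (illustrative values):

        >>> df = pd.DataFrame({'repurchase_rate': [0.05, 0.25, 0.45, 0.65, 0.85]})
        >>> get_repurchase_rate_label(df)['repurchase_rate_label'].tolist()
        ['Very low repurchase', 'Low repurchase', 'Moderate repurchase', 'High repurchase', 'Very high repurchase']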
48 | """ 49 | 50 | labels = ['Very low repurchase', 51 | 'Low repurchase', 52 | 'Moderate repurchase', 53 | 'High repurchase', 54 | 'Very high repurchase'] 55 | df['repurchase_rate_label'] = pd.cut(df['repurchase_rate'], 56 | bins=5, 57 | labels=labels) 58 | return df 59 | 60 | 61 | def get_bulk_purchase_rate_label(df): 62 | """Add a label describing the bulk purchase rate bin. 63 | 64 | Args: 65 | df (object): Pandas DataFrame containing bulk_purchase_rate. 66 | 67 | Returns: 68 | ------- 69 | df (object): Pandas DataFrame with bulk_purchase_rate_label added. 70 | """ 71 | 72 | labels = ['Very low bulk', 73 | 'Low bulk', 74 | 'Moderate bulk', 75 | 'High bulk', 76 | 'Very high bulk'] 77 | df['bulk_purchase_rate_label'] = pd.cut(df['bulk_purchase_rate'], 78 | bins=5, 79 | labels=labels) 80 | return df 81 | 82 | 83 | def get_repurchase_rates(df): 84 | """Return repurchase rates and purchase behaviour for each SKU from transaction items data. 85 | 86 | Given a Pandas DataFrame of transactional items, this function returns a Pandas DataFrame 87 | containing the purchase behaviour and repurchase behaviour for each SKU. 88 | 89 | Args: 90 | df (object): Pandas DataFrame. Required columns: sku, order_id, customer_id, quantity, unit_price. 91 | 92 | Returns: 93 | ------- 94 | df (object): Pandas DataFrame. 95 | """ 96 | 97 | # Count the number of times each customer purchased each SKU 98 | df['times_purchased'] = df.groupby(['sku', 'customer_id'])['order_id'].transform('count') 99 | 100 | # Count the number of times the SKU was purchased individually within orders 101 | df['purchased_individually'] = df[df['quantity'] == 1]. \ 102 | groupby('sku')['order_id'].transform('count') 103 | df['purchased_individually'] = df['purchased_individually'].fillna(0) 104 | 105 | # Count the number of times the SKU was purchased once only by customers 106 | df['purchased_once'] = df[df['times_purchased'] == 1]. 
\
107 |         groupby('sku')['order_id'].transform('count')
108 |     df['purchased_once'] = df['purchased_once'].fillna(0)
109 | 
110 |     # Calculate line price
111 |     df['line_price'] = df['unit_price'] * df['quantity']
112 | 
113 |     # Get unique SKUs and count total items, orders, and customers
114 |     df_skus = df.groupby('sku').agg(
115 |         revenue=('line_price', 'sum'),
116 |         items=('quantity', 'sum'),
117 |         orders=('order_id', 'nunique'),
118 |         customers=('customer_id', 'nunique'),
119 |         avg_unit_price=('unit_price', 'mean'),
120 |         avg_line_price=('line_price', 'mean')
121 |     )
122 | 
123 |     # Calculate the average number of units per order
124 |     df_skus = df_skus.assign(avg_items_per_order=(df_skus['items'] / df_skus['orders']))
125 | 
126 |     # Calculate the average number of items per customer
127 |     df_skus = df_skus.assign(avg_items_per_customer=(df_skus['items'] / df_skus['customers']))
128 | 
129 |     # Merge the dataframes (reassign rather than using inplace on a slice)
130 |     df_subset = df[['sku', 'purchased_individually', 'purchased_once']].fillna(0)
131 |     df_subset = df_subset.drop_duplicates('sku', keep='first')
132 |     df_skus = df_skus.merge(df_subset, on='sku', how='left')
133 | 
134 |     # Calculate bulk purchase rates
135 |     df_skus = df_skus.assign(bulk_purchases=(df_skus['orders'] - df_skus['purchased_individually']))
136 |     df_skus = df_skus.assign(bulk_purchase_rate=(df_skus['bulk_purchases'] / df_skus['orders']))
137 | 
138 |     # Calculate repurchase rates
139 |     df_skus = df_skus.assign(repurchases=(df_skus['orders'] - df_skus['purchased_once']))
140 |     df_skus = df_skus.assign(repurchase_rate=(df_skus['repurchases'] / df_skus['orders']))
141 | 
142 |     # Add labels
143 |     df_skus = get_repurchase_rate_label(df_skus)
144 |     df_skus = get_bulk_purchase_rate_label(df_skus)
145 | 
146 |     df_skus['bulk_and_repurchase_label'] = df_skus['repurchase_rate_label'].astype(str) + \
147 |                                            '_' + df_skus['bulk_purchase_rate_label'].astype(str)
148 | 
149 |     return df_skus
150 | 
--------------------------------------------------------------------------------
/ecommercetools/reports/__init__.py:
--------------------------------------------------------------------------------
1 | from ecommercetools.reports.reports import customers_report
2 | from ecommercetools.reports.reports import transactions_report
3 | 
--------------------------------------------------------------------------------
/ecommercetools/reports/reports.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | from ecommercetools import transactions
4 | 
5 | 
6 | def transactions_report(df, frequency='M'):
7 |     """Create a transactions report based on a specified reporting frequency.
8 | 
9 |     Args:
10 |         df (dataframe): Pandas dataframe of transaction items.
11 |         frequency (optional, string, default 'M'): Optional frequency indicator (Y, Q, M, W, D)
12 | 
13 |     Returns:
14 |         df (dataframe): Pandas dataframe of aggregated data for the specified frequency.
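
    Example (hypothetical usage, assuming a transaction items dataframe with
    order_date and line_price columns):

        >>> from ecommercetools import reports
        >>> df_report = reports.transactions_report(transaction_items, frequency='M')
        >>> df_report.head()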
15 |     """
16 | 
17 |     df['year'] = df['order_date'].dt.year
18 |     df['quarter'] = df['order_date'].dt.quarter
19 |     df['year_quarter'] = df['year'].astype(str) + '-' + df['quarter'].astype(str)
20 |     df['month'] = df['order_date'].dt.month
21 |     df['year_month'] = df['order_date'].dt.strftime('%Y-%m')
22 |     df['week'] = df['order_date'].dt.strftime('%W')
23 |     df['year_week'] = df['order_date'].dt.strftime('%Y-%W')
24 |     df['day'] = df['order_date'].dt.strftime('%j')
25 |     df['year_day'] = df['order_date'].dt.strftime('%Y-%j')
26 | 
27 |     if frequency == 'Y':
28 |         group = 'year'
29 |     elif frequency == 'Q':
30 |         group = 'year_quarter'
31 |     elif frequency == 'W':
32 |         group = 'year_week'
33 |     elif frequency == 'D':
34 |         group = 'year_day'
35 |     else:
36 |         group = 'year_month'
37 | 
38 |     df_agg = df.groupby(group).agg(
39 |         customers=('customer_id', 'nunique'),
40 |         orders=('order_id', 'nunique'),
41 |         revenue=('line_price', 'sum'),
42 |         skus=('sku', 'count'),
43 |         units=('quantity', 'sum')
44 |     ).reset_index()
45 | 
46 |     df_agg['avg_order_value'] = round(df_agg['revenue'] / df_agg['orders'], 2)
47 |     df_agg['avg_skus_per_order'] = round(df_agg['skus'] / df_agg['orders'], 2)
48 |     df_agg['avg_units_per_order'] = round(df_agg['units'] / df_agg['orders'], 2)
49 |     df_agg['avg_revenue_per_customer'] = round(df_agg['revenue'] / df_agg['customers'], 2)
50 | 
51 |     return df_agg
52 | 
53 | 
54 | def customers_report(transaction_items_df, frequency='M'):
55 |     """Create a customers report based on a specified reporting frequency.
56 | 
57 |     Args:
58 |         transaction_items_df (dataframe): Pandas dataframe of transaction items.
59 |         frequency (optional, string, default 'M'): Optional frequency indicator (Y, Q, M, W, D)
60 | 
61 |     Returns:
62 |         df (dataframe): Pandas dataframe of aggregated data for the specified frequency.
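
    Example (hypothetical usage, assuming a transaction items dataframe):

        >>> from ecommercetools import reports
        >>> df_report = reports.customers_report(transaction_items, frequency='M')
        >>> df_report.head()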
63 |     """
64 | 
65 |     df = transactions.get_transactions(transaction_items_df)
66 | 
67 |     df['period'] = df['order_date'].dt.strftime('%B, %Y')
68 |     df['year'] = df['order_date'].dt.year
69 |     df['quarter'] = df['order_date'].dt.quarter
70 |     df['year_quarter'] = df['year'].astype(str) + '-' + df['quarter'].astype(str)
71 |     df['month'] = df['order_date'].dt.month
72 |     df['year_month'] = df['order_date'].dt.strftime('%Y-%m')
73 |     df['week'] = df['order_date'].dt.strftime('%W')
74 |     df['year_week'] = df['order_date'].dt.strftime('%Y-%W')
75 |     df['day'] = df['order_date'].dt.strftime('%j')
76 |     df['year_day'] = df['order_date'].dt.strftime('%Y-%j')
77 | 
78 |     if frequency == 'Y':
79 |         group = 'year'
80 |     elif frequency == 'Q':
81 |         group = 'year_quarter'
82 |     elif frequency == 'W':
83 |         group = 'year_week'
84 |     elif frequency == 'D':
85 |         group = 'year_day'
86 |     else:
87 |         group = 'year_month'
88 | 
89 |     df['new_customers'] = np.where(df['order_number'] == 1, 1, 0)
90 | 
91 |     df_agg = df.groupby(group).agg(
92 |         orders=('order_id', 'nunique'),
93 |         customers=('customer_id', 'nunique'),
94 |         new_customers=('new_customers', 'sum'),
95 |     ).reset_index()
96 | 
97 |     df_agg['returning_customers'] = df_agg['customers'] - df_agg['new_customers']
98 |     df_agg['acquisition_rate'] = round((df_agg['new_customers'] / df_agg['customers']) * 100, 2)
99 | 
100 |     return df_agg
101 | 
--------------------------------------------------------------------------------
/ecommercetools/seo/__init__.py:
--------------------------------------------------------------------------------
1 | from ecommercetools.seo.robots import get_sitemaps
2 | from ecommercetools.seo.robots import get_robots
3 | from ecommercetools.seo.sitemaps import get_sitemap
4 | from ecommercetools.seo.google_pagespeed_insights import get_core_web_vitals
5 | from ecommercetools.seo.google_knowledge_graph import get_knowledge_graph
6 | from ecommercetools.seo.google_search_console import query_google_search_console
7 | from ecommercetools.seo.google_search_console import query_google_search_console_compare
8 | from ecommercetools.seo.google_search_console import classify_pages
9 | from ecommercetools.seo.google_autocomplete import google_autocomplete
10 | from ecommercetools.seo.google_search import get_indexed_pages
11 | from ecommercetools.seo.google_search import get_serps
12 | from ecommercetools.seo.scraping import scrape_site
13 | from ecommercetools.seo.testing import seo_test
--------------------------------------------------------------------------------
/ecommercetools/seo/google_autocomplete.py:
--------------------------------------------------------------------------------
1 | """
2 | Get keyword suggestions for a term using Google Autocomplete or Google Suggest.
3 | """
4 | 
5 | import requests
6 | import urllib.parse
7 | import json
8 | import pandas as pd
9 | from requests_html import HTMLSession
10 | 
11 | 
12 | def _get_source(url: str):
13 |     """Return the source code for the provided URL.
14 | 
15 |     Args:
16 |         url (string): URL of the page to scrape.
17 | 
18 |     Returns:
19 |         response (object): HTTP response object from requests_html.
20 |     """
21 | 
22 |     try:
23 |         session = HTMLSession()
24 |         response = session.get(url)
25 |         return response
26 |     except requests.exceptions.RequestException as e:
27 |         print(e)
28 | 
29 | 
30 | def _get_results(query: str):
31 |     """Get the JSON data from a Google Autocomplete query.
32 | 
33 |     Args:
34 |         query (string): Query term, i.e. data science
35 | 
36 |     Returns:
37 |         results (dict): JSON results.
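
    The payload follows the Chrome-style suggest format consumed by
    _format_results() below (values illustrative):

        >>> results = _get_results('data science')
        >>> results[1][:2]  # suggested terms
        ['data science', 'data science course']
        >>> results[4]['google:suggestrelevance'][:2]  # relevance scores
        [1250, 650]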
38 | """ 39 | 40 | query = urllib.parse.quote_plus(query) 41 | response = _get_source("https://suggestqueries.google.com/complete/search?output=chrome&hl=en&q=" + query) 42 | results = json.loads(response.text) 43 | return results 44 | 45 | 46 | def _format_results(results: dict): 47 | """Return formatted dictionary containing term and relevance. 48 | 49 | Args: 50 | results (dict): JSON dictionary of Google Autocomplete results. 51 | 52 | Returns: 53 | suggestions (dict): Formatted dictionary containing term and relevance. 54 | """ 55 | 56 | if results: 57 | suggestions = [] 58 | for index, value in enumerate(results[1]): 59 | suggestion = {'term': value, 'relevance': results[4]['google:suggestrelevance'][index]} 60 | suggestions.append(suggestion) 61 | return suggestions 62 | 63 | 64 | def _get_suggestions(query: str): 65 | """Return results sorted by relevance. 66 | 67 | Args: 68 | query (string): Search term, i.e. data science 69 | 70 | Returns: 71 | results (dict): Sorted dictionary containing term and relevance. 72 | """ 73 | 74 | results = _get_results(query) 75 | results = _format_results(results) 76 | results = sorted(results, key=lambda k: k['relevance'], reverse=True) 77 | return results 78 | 79 | 80 | def _get_expanded_term_suffixes(): 81 | """Return a list of query suffixes to extend Google Autocomplete results. 82 | 83 | Returns: 84 | expanded_term_suffixes (list) 85 | """ 86 | 87 | expanded_term_suffixes = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 88 | 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'] 89 | return expanded_term_suffixes 90 | 91 | 92 | def _get_expanded_term_prefixes(): 93 | """Return a list of query prefixes to extend Google Autocomplete results. 94 | 95 | Returns: 96 | expanded_term_prefixes (list) 97 | """ 98 | 99 | expanded_term_prefixes = ['who is *', 'what is *', 'where is *', 'when can *', 'why is *', 100 | 'how to *', 'best', 'cheap', 'worst', 'is', 'what', 'when', 'why', 101 | 'how', 'who'] 102 | return expanded_term_prefixes 103 | 104 | 105 | def _get_expanded_terms(query: str): 106 | """Return a list of expanded terms, comprising the original query, and the prefixed and suffixed queries. 107 | 108 | Args: 109 | query (string): Query term, i.e. data science 110 | 111 | Returns: 112 | terms (list): List of query terms with suffixes and prefixes. 113 | """ 114 | 115 | expanded_term_prefixes = _get_expanded_term_prefixes() 116 | expanded_term_suffixes = _get_expanded_term_suffixes() 117 | 118 | terms = [query] 119 | 120 | for term in expanded_term_prefixes: 121 | terms.append(term + ' ' + query) 122 | 123 | for term in expanded_term_suffixes: 124 | terms.append(query + ' ' + term) 125 | 126 | return terms 127 | 128 | 129 | def _get_expanded_suggestions(query: str): 130 | """Return the Google Autocomplete suggestions for a query and its prefixed and suffixed versions. 131 | 132 | Args: 133 | query (string): Query term, i.e. data science 134 | 135 | Returns: 136 | all_results (dict): Sorted formatted dictionary of results for each search term. 
137 | """ 138 | 139 | all_results = [] 140 | 141 | expanded_terms = _get_expanded_terms(query) 142 | for term in expanded_terms: 143 | results = _get_results(term) 144 | results = _format_results(results) 145 | all_results = all_results + results 146 | all_results = sorted(all_results, key=lambda k: k['relevance'], reverse=True) 147 | return all_results 148 | 149 | 150 | def google_autocomplete(query: str, include_expanded=True): 151 | """Run a Google Autocomplete / Google Suggest search with optional query expansion. 152 | 153 | Args: 154 | query (string): Query term, i.e. data science 155 | include_expanded (bool, optional): Optional boolean flag. Set to true to add prefixes/suffixes. 156 | 157 | Returns: 158 | df (dataframe): Pandas dataframe containing results. 159 | """ 160 | 161 | if include_expanded: 162 | results = _get_expanded_suggestions(query) 163 | 164 | else: 165 | results = _get_suggestions(query) 166 | 167 | df = pd.DataFrame.from_records(results) 168 | return df 169 | 170 | -------------------------------------------------------------------------------- /ecommercetools/seo/google_knowledge_graph.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import urllib.parse 3 | import json 4 | import pandas as pd 5 | from requests_html import HTMLSession 6 | 7 | 8 | def _get_source(url: str): 9 | """Return the source code for the provided URL. 10 | 11 | Args: 12 | url (string): URL of the page to scrape. 13 | 14 | Returns: 15 | response (object): HTTP response object from requests_html. 16 | """ 17 | 18 | try: 19 | session = HTMLSession() 20 | response = session.get(url) 21 | return response 22 | 23 | except requests.exceptions.RequestException as e: 24 | print(e) 25 | 26 | 27 | def get_knowledge_graph(api_key: str, 28 | query: str, 29 | output="dataframe"): 30 | """Return a Google Knowledge Graph for a given query. 31 | 32 | Args: 33 | api_key (string): Google Knowledge Graph API key. 34 | query (string): Term to search for. 35 | output (string, optional): Output format (dataframe, or json). 36 | 37 | Returns: 38 | response (object): Knowledge Graph response object in JSON format. 39 | """ 40 | 41 | endpoint = 'https://kgsearch.googleapis.com/v1/entities:search' 42 | params = { 43 | 'query': query, 44 | 'limit': 10, 45 | 'indent': True, 46 | 'key': api_key, 47 | } 48 | 49 | url = endpoint + '?' + urllib.parse.urlencode(params) 50 | response = _get_source(url) 51 | 52 | if output == "json": 53 | return json.loads(response.text) 54 | else: 55 | return pd.json_normalize(json.loads(response.text), record_path='itemListElement') 56 | -------------------------------------------------------------------------------- /ecommercetools/seo/google_pagespeed_insights.py: -------------------------------------------------------------------------------- 1 | """ 2 | Fetch Core Web Vitals from the Google PageSpeed Insights API. 3 | """ 4 | 5 | import sys 6 | import json 7 | import urllib.request 8 | import pandas as pd 9 | 10 | 11 | def query_core_web_vitals(key: str, 12 | url: str, 13 | strategy: str = "desktop"): 14 | """Run a Google Page Speed API query to fetch the Core Web Vitals for a URL. 15 | 16 | Args: 17 | key (str): API key for Google Page Speed API. 18 | url (str): URL of the page you wish to check. 19 | strategy (str, optional): Optional strategy (desktop or mobile). 20 | 21 | Returns: 22 | data (json): API response in JSON format. 
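
    Example (hypothetical usage; assumes a valid PageSpeed Insights API key):

        >>> report = query_core_web_vitals(key, 'https://example.com', strategy='mobile')
        >>> report['lighthouseResult']['categories']['performance']['score']
        0.95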
23 | """ 24 | 25 | try: 26 | endpoint = "https://www.googleapis.com/pagespeedonline/v5/runPagespeed" \ 27 | + "?strategy=" + strategy \ 28 | + "&url={}" \ 29 | + "&key=" + key 30 | 31 | response = urllib.request.urlopen(endpoint.format(url)).read().decode('UTF-8') 32 | data = json.loads(response) 33 | return data 34 | except Exception as e: 35 | print("Error: ", e) 36 | sys.exit(1) 37 | 38 | 39 | def save_core_web_vitals(report: dict, 40 | filename: str): 41 | """Save the Core Web Vitals JSON report to file. 42 | 43 | Args: 44 | report (dict): JSON object containing report data. 45 | filename (str): Filename to use for report. 46 | 47 | Returns: 48 | JSON Core Web Vitals report file. 49 | """ 50 | 51 | with open(filename, 'w') as outfile: 52 | json.dump(report, outfile) 53 | 54 | 55 | def parse_core_web_vitals(report: dict): 56 | """Return a dictionary containing the Core Web Vitals from the report. 57 | 58 | Args: 59 | report (dict): JSON dictionary containing report data. 60 | 61 | Return: 62 | data (dict): Dictionary containing the key data. 63 | 64 | """ 65 | 66 | final_url = report['lighthouseResult']['finalUrl'] 67 | fetch_time = report['lighthouseResult']['fetchTime'] 68 | form_factor = report['lighthouseResult']['configSettings']['formFactor'] 69 | overall_score = report["lighthouseResult"]["categories"]["performance"]["score"] * 100 70 | speed_index = report["lighthouseResult"]["audits"]["speed-index"]["score"] * 100 71 | first_meaningful_paint = report["lighthouseResult"]["audits"]["first-meaningful-paint"]["score"] * 100 72 | first_contentful_paint = report["lighthouseResult"]["audits"]["first-contentful-paint"]["score"] * 100 73 | time_to_interactive = report["lighthouseResult"]["audits"]["interactive"]["score"] * 100 74 | total_blocking_time = report["lighthouseResult"]["audits"]["total-blocking-time"]["score"] * 100 75 | cumulative_layout_shift = report["lighthouseResult"]["audits"]["cumulative-layout-shift"]["score"] * 100 76 | 77 | data = { 78 | 'final_url': final_url, 79 | 'fetch_time': fetch_time, 80 | 'form_factor': form_factor, 81 | 'overall_score': overall_score, 82 | 'speed_index': speed_index, 83 | 'first_meaningful_paint': first_meaningful_paint, 84 | 'first_contentful_paint': first_contentful_paint, 85 | 'time_to_interactive': time_to_interactive, 86 | 'total_blocking_time': total_blocking_time, 87 | 'cumulative_layout_shift': cumulative_layout_shift, 88 | } 89 | 90 | return data 91 | 92 | 93 | def get_core_web_vitals(key: str, 94 | urls: list, 95 | strategy: str = "both"): 96 | """Return a Pandas dataframe containing Core Web Vitals for the provided URLs and optional strategy. 97 | 98 | Args: 99 | key (str): API key for Google Page Speed API. 100 | urls (list): URL of the page you wish to check. 101 | strategy (str, optional): Optional strategy (desktop or mobile) or both (default). 102 | 103 | Returns: 104 | df (dataframe): Pandas dataframe containing core web vitals for URL and strategy. 
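
    Example (hypothetical usage; assumes a valid PageSpeed Insights API key):

        >>> urls = ['https://example.com', 'https://example.com/about']
        >>> df = get_core_web_vitals(key, urls, strategy='both')
        >>> df[['final_url', 'form_factor', 'overall_score']].head()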
105 |     """
106 | 
107 |     df = pd.DataFrame(columns=['final_url', 'fetch_time', 'form_factor', 'overall_score',
108 |                                'speed_index', 'first_meaningful_paint', 'first_contentful_paint',
109 |                                'time_to_interactive', 'total_blocking_time', 'cumulative_layout_shift'])
110 | 
111 |     if strategy == "both":
112 | 
113 |         for url in urls:
114 |             report = query_core_web_vitals(key, url, strategy="mobile")
115 |             if report:
116 |                 data = parse_core_web_vitals(report)
117 |                 df = pd.concat([df, pd.DataFrame([data])], ignore_index=True)  # DataFrame.append was removed in pandas 2.0
118 | 
119 |         for url in urls:
120 |             report = query_core_web_vitals(key, url, strategy="desktop")
121 |             if report:
122 |                 data = parse_core_web_vitals(report)
123 |                 df = pd.concat([df, pd.DataFrame([data])], ignore_index=True)
124 | 
125 |     else:
126 |         for url in urls:
127 |             report = query_core_web_vitals(key, url, strategy=strategy)
128 |             if report:
129 |                 data = parse_core_web_vitals(report)
130 |                 df = pd.concat([df, pd.DataFrame([data])], ignore_index=True)
131 | 
132 |     df = df.sort_values(by='final_url')
133 |     return df
134 | 
--------------------------------------------------------------------------------
/ecommercetools/seo/google_search.py:
--------------------------------------------------------------------------------
1 | """
2 | General functions for scraping data from Google search engine results pages.
3 | """
4 | 
5 | import re
6 | import requests
7 | import urllib.parse
8 | import pandas as pd
9 | import numpy as np
10 | from requests_html import HTMLSession
11 | 
12 | 
13 | def _get_source(url: str):
14 |     """Return the source code for the provided URL.
15 | 
16 |     Args:
17 |         url (string): URL of the page to scrape.
18 | 
19 |     Returns:
20 |         response (object): HTTP response object from requests_html.
21 |     """
22 | 
23 |     try:
24 |         session = HTMLSession()
25 |         response = session.get(url)
26 | 
27 |         if response.status_code == 200:
28 |             return response
29 |         elif response.status_code == 429:
30 |             print('Error: Too many requests. Google has temporarily blocked you. Try again later.')
31 |             exit()
32 |         else:
33 |             print('Error: ' + str(response.status_code))
34 |             exit()
35 |     except requests.exceptions.RequestException as e:
36 |         print(e)
37 | 
38 | 
39 | def _get_site_results(url: str):
40 |     """Return the source of a site:url search.
41 | 
42 |     Args:
43 |         url: URL of page to append to site: query
44 | 
45 |     Returns:
46 |         response (str): HTML of page.
47 |     """
48 | 
49 |     try:
50 |         query = urllib.parse.quote_plus(url)
51 |         response = _get_source("https://www.google.co.uk/search?q=site%3A" + query + "&num=100")
52 | 
53 |         return response
54 |     except requests.exceptions.RequestException as e:
55 |         print(e)
56 | 
57 | 
58 | def _parse_site_results(response: str):
59 |     """Parse the HTML of a site:url query and return the number of pages "indexed".
60 | 
61 |     Args:
62 |         response: HTML of site:url query.
63 | 
64 |     Returns:
65 |         indexed (int): Number of pages "indexed".
66 |     """
67 | 
68 |     try:
69 |         if response.html.find("#result-stats", first=True):
70 | 
71 |             string = response.html.find("#result-stats", first=True).text
72 |             if string:
73 |                 # Remove values in parentheses, i.e. (0.31 seconds)
74 |                 string = re.sub(r'\([^)]*\)', '', string)
75 | 
76 |                 # Remove non-numeric characters
77 |                 string = re.sub('[^0-9]', '', string)
78 | 
79 |                 return int(string)  # cast to int so results sort numerically
80 |             else:
81 |                 return 0
82 |     except requests.exceptions.RequestException as e:
83 |         print(e)
84 | 
85 | 
86 | def _count_indexed_pages(url: str):
87 |     """Get the site:url data, parse the response, and return the number of "indexed" pages.
88 | 
89 |     Args:
90 |         url: URL to use in site:url search.
91 | 
92 |     Returns:
93 |         results (int): Number of pages "indexed".
94 |     """
95 | 
96 |     response = _get_site_results(url)
97 |     return _parse_site_results(response)
98 | 
99 | 
100 | def get_indexed_pages(urls: list):
101 |     """Loop through a series of URLs and run site:url searches, then return the number of "indexed" pages.
102 | 
103 |     Args:
104 |         urls (list): List of URLs.
105 | 
106 |     Returns:
107 |         df (dataframe): Pandas dataframe containing URL and number of "indexed" pages.
108 |     """
109 | 
110 |     data = []
111 |     for site in urls:
112 |         site_data = {'url': site, 'indexed_pages': _count_indexed_pages(site)}
113 |         data.append(site_data)
114 |     df = pd.DataFrame.from_records(data)
115 |     df = df.sort_values(by='indexed_pages')
116 |     return df
117 | 
118 | 
119 | def _get_results(query: str):
120 |     """Return the source of a search.
121 | 
122 |     Args:
123 |         query: Search query term.
124 | 
125 |     Returns:
126 |         response (str): HTML of page.
127 |     """
128 | 
129 |     query = urllib.parse.quote_plus(query)
130 |     response = _get_source("https://www.google.co.uk/search?q=" + query + "&num=100")
131 | 
132 |     return response
133 | 
134 | 
135 | def _get_next_page(response, domain="google.co.uk"):
136 |     """Get the URL for the next page of results."""
137 | 
138 |     css_identifier_next = "#pnnext"
139 |     next_page_url = response.html.find(css_identifier_next, first=True).attrs['href']
140 |     next_page = "https://www." + domain + next_page_url
141 | 
142 |     return next_page
143 | 
144 | 
145 | def _parse_search_results(response):
146 |     """Parses the Google Search engine results and returns a list of results.
147 | 
148 |     Note: This function is obviously dependent upon the source code in the Google results.
149 |     Google obfuscates the source of the page to make it more difficult to extract information.
150 |     Extraction classes change from time to time, so there is always a likelihood that this
151 |     function will need to be adjusted with the new class or identifier details.
152 |     In the event of the function failing, please raise a GitHub issue.
153 | 
154 |     Args:
155 |         response: Response object containing the page source code.
156 | 
157 |     Returns:
158 |         list: List of Google search results.
159 |     """
160 | 
161 |     css_identifier_result = ".tF2Cxc"  # The class of the div containing each result, i.e. <div class="tF2Cxc">
162 | css_identifier_title = "h3" # The element containing the title, i.e.