├── .github
│   └── workflows
│       └── python-publish.yml
├── .gitignore
├── Examples.ipynb
├── LICENSE.txt
├── README.md
├── banner.png
├── ecommercetools
│   ├── __init__.py
│   ├── advertising
│   │   ├── __init__.py
│   │   └── advertising.py
│   ├── customers
│   │   ├── __init__.py
│   │   └── customers.py
│   ├── marketing
│   │   ├── __init__.py
│   │   └── marketing.py
│   ├── nlp
│   │   ├── __init__.py
│   │   └── nlp.py
│   ├── operations
│   │   ├── __init__.py
│   │   └── operations.py
│   ├── products
│   │   ├── __init__.py
│   │   └── products.py
│   ├── reports
│   │   ├── __init__.py
│   │   └── reports.py
│   ├── seo
│   │   ├── __init__.py
│   │   ├── google_autocomplete.py
│   │   ├── google_knowledge_graph.py
│   │   ├── google_pagespeed_insights.py
│   │   ├── google_search.py
│   │   ├── google_search_console.py
│   │   ├── robots.py
│   │   ├── scraping.py
│   │   ├── sitemaps.py
│   │   └── testing.py
│   ├── transactions
│   │   ├── __init__.py
│   │   └── transactions.py
│   └── utilities
│       ├── __init__.py
│       ├── metrics.py
│       └── tools.py
├── example.py
├── requirements.txt
├── scraper_example.py
├── setup.cfg
└── setup.py

--------------------------------------------------------------------------------
/.github/workflows/python-publish.yml:
--------------------------------------------------------------------------------
# This workflow will upload a Python Package using Twine when a release is created
# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries

name: Upload Python Package

# Trigger on release creation, as described in the comment above
# (triggering on every push would publish unreleased code to PyPI)
on:
  release:
    types: [created]

jobs:
  deploy:

    runs-on: ubuntu-latest

    steps:
    - uses: actions/checkout@v2
    - name: Set up Python
      uses: actions/setup-python@v2
      with:
        python-version: '3.x'
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install setuptools wheel twine
    - name: Build and publish
      env:
        TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
        TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
      run: |
        python setup.py sdist bdist_wheel
        twine upload dist/*

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.idea/
build/
dist/
venv/
data/
ecommercetools.egg-info
google-search-console.json
pds-client-secrets.json
example-test.py

--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2021 Matt Clarke, Practical Data Science

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# EcommerceTools

![EcommerceTools](https://github.com/practical-data-science/ecommercetools/blob/master/banner.png?raw=true)

EcommerceTools is a data science toolkit for those working in technical ecommerce, marketing science, and technical SEO. It includes a wide range of features to aid analysis and model building. The package is written in Python, is designed to be used with Pandas, and works within a Jupyter notebook environment or in standalone Python projects.

#### Installation

You can install EcommerceTools and its dependencies from PyPI by entering `pip3 install ecommercetools` in your terminal, or `!pip3 install ecommercetools` within a Jupyter notebook cell.

---

### Modules

- [Transactions](#Transactions)
- [Products](#Products)
- [Customers](#Customers)
- [Advertising](#Advertising)
- [Operations](#Operations)
- [Marketing](#Marketing)
- [NLP](#NLP)
- [SEO](#SEO)
- [Reports](#Reports)

---

### Transactions

1. #### Load sample transaction items data

If you want to get started with the transactions, products, and customers features, you can use the `load_sample_data()` function to load a set of real-world data. This imports the transaction items from the widely-used Online Retail dataset and reformats it ready for use by EcommerceTools.

```python
from ecommercetools import utilities

transaction_items = utilities.load_sample_data()
transaction_items.head()
```
|   | order_id | sku | description | quantity | order_date | unit_price | customer_id | country | line_price |
|---|----------|-----|-------------|----------|------------|------------|-------------|---------|------------|
| 0 | 536365 | 85123A | WHITE HANGING HEART T-LIGHT HOLDER | 6 | 2010-12-01 08:26:00 | 2.55 | 17850.0 | United Kingdom | 15.30 |
| 1 | 536365 | 71053 | WHITE METAL LANTERN | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom | 20.34 |
| 2 | 536365 | 84406B | CREAM CUPID HEARTS COAT HANGER | 8 | 2010-12-01 08:26:00 | 2.75 | 17850.0 | United Kingdom | 22.00 |
| 3 | 536365 | 84029G | KNITTED UNION FLAG HOT WATER BOTTLE | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom | 20.34 |
| 4 | 536365 | 84029E | RED WOOLLY HOTTIE WHITE HEART. | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom | 20.34 |
2. #### Create a transaction items dataframe

The `utilities` module includes a range of tools that allow you to format data, so it can be used within other EcommerceTools functions. The `load_transaction_items()` function is used to create a Pandas dataframe of formatted transactional item data. When loading your transaction items data, all you need to do is define the column mappings, and the function will reformat the dataframe accordingly.

```python
import pandas as pd
from ecommercetools import utilities

transaction_items = utilities.load_transaction_items('transaction_items_non_standard_names.csv',
                                                     date_column='InvoiceDate',
                                                     order_id_column='InvoiceNo',
                                                     customer_id_column='CustomerID',
                                                     sku_column='StockCode',
                                                     quantity_column='Quantity',
                                                     unit_price_column='UnitPrice')
transaction_items.to_csv('transaction_items.csv', index=False)
print(transaction_items.head())
```
|   | order_id | sku | description | quantity | order_date | unit_price | customer_id | country | line_price |
|---|----------|-----|-------------|----------|------------|------------|-------------|---------|------------|
| 0 | 536365 | 85123A | WHITE HANGING HEART T-LIGHT HOLDER | 6 | 2010-12-01 08:26:00 | 2.55 | 17850.0 | United Kingdom | 15.30 |
| 1 | 536365 | 71053 | WHITE METAL LANTERN | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom | 20.34 |
| 2 | 536365 | 84406B | CREAM CUPID HEARTS COAT HANGER | 8 | 2010-12-01 08:26:00 | 2.75 | 17850.0 | United Kingdom | 22.00 |
| 3 | 536365 | 84029G | KNITTED UNION FLAG HOT WATER BOTTLE | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom | 20.34 |
| 4 | 536365 | 84029E | RED WOOLLY HOTTIE WHITE HEART. | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom | 20.34 |
3. #### Create a transactions dataframe

The `get_transactions()` function takes the formatted Pandas dataframe of transaction items and returns a Pandas dataframe of aggregated transaction data, which includes features identifying the order number.

```python
import pandas as pd
from ecommercetools import transactions

transaction_items = pd.read_csv('transaction_items.csv')
transactions_df = transactions.get_transactions(transaction_items)
transactions_df.to_csv('transactions.csv', index=False)
print(transactions_df.head())
```
|   | order_id | order_date | customer_id | skus | items | revenue | replacement | order_number |
|---|----------|------------|-------------|------|-------|---------|-------------|--------------|
| 0 | 536365 | 2010-12-01 08:26:00 | 17850.0 | 7 | 40 | 139.12 | 0 | 1 |
| 1 | 536366 | 2010-12-01 08:28:00 | 17850.0 | 2 | 12 | 22.20 | 0 | 2 |
| 2 | 536367 | 2010-12-01 08:34:00 | 13047.0 | 12 | 83 | 278.73 | 0 | 1 |
| 3 | 536368 | 2010-12-01 08:34:00 | 13047.0 | 4 | 15 | 70.05 | 0 | 2 |
| 4 | 536369 | 2010-12-01 08:35:00 | 13047.0 | 1 | 3 | 17.85 | 0 | 3 |
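Once you have the aggregated transactions, it's easy to roll them up into period-level summaries with plain Pandas. This is a quick hypothetical follow-up (not an EcommerceTools function), using only the columns shown above, to calculate monthly orders and revenue:

```python
import pandas as pd

# Load the transactions created above and parse the order date
transactions_df = pd.read_csv('transactions.csv', parse_dates=['order_date'])

# Resample to calendar months and aggregate orders and revenue
monthly = transactions_df.set_index('order_date') \
    .resample('M') \
    .agg({'order_id': 'nunique', 'revenue': 'sum'}) \
    .rename(columns={'order_id': 'orders'})
print(monthly.head())
```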
---

### Products

#### 1. Get product data from transaction items

```python
from ecommercetools import products

products_df = products.get_products(transaction_items)
products_df.head()
```
|   | sku | first_order_date | last_order_date | customers | orders | items | revenue | avg_unit_price | avg_quantity | avg_revenue | avg_orders | product_tenure | product_recency |
|---|-----|------------------|-----------------|-----------|--------|-------|---------|----------------|--------------|-------------|------------|----------------|-----------------|
| 0 | 10002 | 2010-12-01 08:45:00 | 2011-04-28 15:05:00 | 40 | 73 | 1037 | 759.89 | 1.056849 | 14.205479 | 10.409452 | 1.82 | 3749 | 3600 |
| 1 | 10080 | 2011-02-27 13:47:00 | 2011-11-21 17:04:00 | 19 | 24 | 495 | 119.09 | 0.376667 | 20.625000 | 4.962083 | 1.26 | 3660 | 3393 |
| 2 | 10120 | 2010-12-03 11:19:00 | 2011-12-04 13:15:00 | 25 | 29 | 193 | 40.53 | 0.210000 | 6.433333 | 1.351000 | 1.16 | 3746 | 3380 |
| 3 | 10123C | 2010-12-03 11:19:00 | 2011-07-15 15:05:00 | 3 | 4 | -13 | 3.25 | 0.487500 | -3.250000 | 0.812500 | 1.33 | 3746 | 3522 |
| 4 | 10123G | 2011-04-08 11:13:00 | 2011-04-08 11:13:00 | 0 | 1 | -38 | 0.00 | 0.000000 | -38.000000 | 0.000000 | inf | 3620 | 3620 |
#### 2. Calculate product consumption and repurchase rate

```python
repurchase_rates = products.get_repurchase_rates(transaction_items)
repurchase_rates.head(3).T
```
|   | 0 | 1 | 2 |
|---|---|---|---|
| sku | 10002 | 10080 | 10120 |
| revenue | 759.89 | 119.09 | 40.53 |
| items | 1037 | 495 | 193 |
| orders | 73 | 24 | 29 |
| customers | 40 | 19 | 25 |
| avg_unit_price | 1.05685 | 0.376667 | 0.21 |
| avg_line_price | 10.4095 | 4.96208 | 1.351 |
| avg_items_per_order | 14.2055 | 20.625 | 6.65517 |
| avg_items_per_customer | 25.925 | 26.0526 | 7.72 |
| purchased_individually | 0 | 0 | 9 |
| purchased_once | 34 | 17 | 22 |
| bulk_purchases | 73 | 24 | 20 |
| bulk_purchase_rate | 1 | 1 | 0.689655 |
| repurchases | 39 | 7 | 7 |
| repurchase_rate | 0.534247 | 0.291667 | 0.241379 |
| repurchase_rate_label | Moderate repurchase | Low repurchase | Low repurchase |
| bulk_purchase_rate_label | Very high bulk | Very high bulk | High bulk |
| bulk_and_repurchase_label | Moderate repurchase_Very high bulk | Low repurchase_Very high bulk | Low repurchase_High bulk |
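As a hypothetical follow-up using only the columns shown above, you could filter the untransposed dataframe to find products that are rarely bought again, which may be candidates for replenishment marketing:

```python
# Products rarely bought again - candidates for replenishment emails or bundles
low_repurchase = repurchase_rates[repurchase_rates['repurchase_rate_label'] == 'Low repurchase']
print(low_repurchase[['sku', 'orders', 'customers', 'repurchase_rate']].head())
```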
---

### Customers

#### 1. Create a customers dataset

```python
from ecommercetools import customers

customers_df = customers.get_customers(transaction_items)
customers_df.head()
```
|   | customer_id | revenue | orders | skus | items | first_order_date | last_order_date | avg_items | avg_order_value | tenure | recency | cohort |
|---|-------------|---------|--------|------|-------|------------------|-----------------|-----------|-----------------|--------|---------|--------|
| 0 | 12346.0 | 0.00 | 2 | 1 | 0 | 2011-01-18 10:01:00 | 2011-01-18 10:17:00 | 0.00 | 0.00 | 3701 | 3700 | 20111 |
| 1 | 12347.0 | 4310.00 | 7 | 7 | 2458 | 2010-12-07 14:57:00 | 2011-12-07 15:52:00 | 351.14 | 615.71 | 3742 | 3377 | 20104 |
| 2 | 12348.0 | 1797.24 | 4 | 4 | 2341 | 2010-12-16 19:09:00 | 2011-09-25 13:13:00 | 585.25 | 449.31 | 3733 | 3450 | 20104 |
| 3 | 12349.0 | 1757.55 | 1 | 1 | 631 | 2011-11-21 09:51:00 | 2011-11-21 09:51:00 | 631.00 | 1757.55 | 3394 | 3394 | 20114 |
| 4 | 12350.0 | 334.40 | 1 | 1 | 197 | 2011-02-02 16:01:00 | 2011-02-02 16:01:00 | 197.00 | 334.40 | 3685 | 3685 | 20111 |
#### 2. Create a customer cohort analysis dataset

```python
from ecommercetools import customers

cohorts_df = customers.get_cohorts(transaction_items, period='M')
cohorts_df.head()
```
|    | customer_id | order_id | order_date | acquisition_cohort | order_cohort |
|----|-------------|----------|------------|--------------------|--------------|
| 0  | 17850.0 | 536365 | 2010-12-01 08:26:00 | 2010-12 | 2010-12 |
| 7  | 17850.0 | 536366 | 2010-12-01 08:28:00 | 2010-12 | 2010-12 |
| 9  | 13047.0 | 536367 | 2010-12-01 08:34:00 | 2010-12 | 2010-12 |
| 21 | 13047.0 | 536368 | 2010-12-01 08:34:00 | 2010-12 | 2010-12 |
| 25 | 13047.0 | 536369 | 2010-12-01 08:35:00 | 2010-12 | 2010-12 |
#### 3. Create a customer cohort analysis matrix

```python
from ecommercetools import customers

cohort_matrix_df = customers.get_cohort_matrix(transaction_items, period='M', percentage=True)
cohort_matrix_df.head()
```
| acquisition_cohort \ periods | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 |
|------------------------------|---|---|---|---|---|---|---|---|---|---|----|----|----|
| 2010-12 | 1.0 | 0.381857 | 0.334388 | 0.387131 | 0.359705 | 0.396624 | 0.379747 | 0.354430 | 0.354430 | 0.394515 | 0.373418 | 0.500000 | 0.274262 |
| 2011-01 | 1.0 | 0.239905 | 0.282660 | 0.242280 | 0.327791 | 0.299287 | 0.261283 | 0.256532 | 0.311164 | 0.346793 | 0.368171 | 0.149644 | NaN |
| 2011-02 | 1.0 | 0.247368 | 0.192105 | 0.278947 | 0.268421 | 0.247368 | 0.255263 | 0.281579 | 0.257895 | 0.313158 | 0.092105 | NaN | NaN |
| 2011-03 | 1.0 | 0.190909 | 0.254545 | 0.218182 | 0.231818 | 0.177273 | 0.263636 | 0.238636 | 0.288636 | 0.088636 | NaN | NaN | NaN |
| 2011-04 | 1.0 | 0.227425 | 0.220736 | 0.210702 | 0.207358 | 0.237458 | 0.230769 | 0.260870 | 0.083612 | NaN | NaN | NaN | NaN |
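The percentage matrix is easy to visualise as a heatmap. Seaborn and Matplotlib are not EcommerceTools dependencies, so the sketch below is a hypothetical follow-up that assumes you have them installed:

```python
import matplotlib.pyplot as plt
import seaborn as sns

# Plot the percentage retention matrix as a heatmap
fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(cohort_matrix_df, annot=True, fmt='.0%', cmap='Blues', ax=ax)
ax.set_title('Retention by acquisition cohort')
plt.show()
```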
```python
from ecommercetools import customers

cohort_matrix_df = customers.get_cohort_matrix(transaction_items, period='M', percentage=False)
cohort_matrix_df.head()
```
| acquisition_cohort \ periods | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 |
|------------------------------|---|---|---|---|---|---|---|---|---|---|----|----|----|
| 2010-12 | 948.0 | 362.0 | 317.0 | 367.0 | 341.0 | 376.0 | 360.0 | 336.0 | 336.0 | 374.0 | 354.0 | 474.0 | 260.0 |
| 2011-01 | 421.0 | 101.0 | 119.0 | 102.0 | 138.0 | 126.0 | 110.0 | 108.0 | 131.0 | 146.0 | 155.0 | 63.0 | NaN |
| 2011-02 | 380.0 | 94.0 | 73.0 | 106.0 | 102.0 | 94.0 | 97.0 | 107.0 | 98.0 | 119.0 | 35.0 | NaN | NaN |
| 2011-03 | 440.0 | 84.0 | 112.0 | 96.0 | 102.0 | 78.0 | 116.0 | 105.0 | 127.0 | 39.0 | NaN | NaN | NaN |
| 2011-04 | 299.0 | 68.0 | 66.0 | 63.0 | 62.0 | 71.0 | 69.0 | 78.0 | 25.0 | NaN | NaN | NaN | NaN |
#### 4. Create a customer "retention" dataset

```python
from ecommercetools import customers

retention_df = customers.get_retention(transactions_df)
retention_df.head()
```
|   | acquisition_cohort | order_cohort | customers | periods |
|---|--------------------|--------------|-----------|---------|
| 0 | 2010-12 | 2010-12 | 948 | 0 |
| 1 | 2010-12 | 2011-01 | 362 | 1 |
| 2 | 2010-12 | 2011-02 | 317 | 2 |
| 3 | 2010-12 | 2011-03 | 367 | 3 |
| 4 | 2010-12 | 2011-04 | 341 | 4 |
#### 5. Create an RFM (H) dataset

This is an extension of the regular Recency, Frequency, Monetary value (RFM) model that includes an additional parameter, "H", for heterogeneity, which shows the number of unique SKUs purchased by each customer. While it's not typically used for targeting, this value can be very useful in identifying which customers should probably be buying a broader mix of products than they currently are, as well as spotting those who may have stopped buying certain items.

```python
from ecommercetools import customers

rfm_df = customers.get_rfm_segments(customers_df)
rfm_df.head()
```
|   | customer_id | acquisition_date | recency_date | recency | frequency | monetary | heterogeneity | tenure | r | f | m | h | rfm | rfm_score | rfm_segment_name |
|---|-------------|------------------|--------------|---------|-----------|----------|---------------|--------|---|---|---|---|-----|-----------|------------------|
| 0 | 12346.0 | 2011-01-18 10:01:00 | 2011-01-18 10:17:00 | 3700 | 2 | 0.00 | 1 | 3701 | 1 | 1 | 1 | 1 | 111 | 3 | Risky |
| 1 | 12350.0 | 2011-02-02 16:01:00 | 2011-02-02 16:01:00 | 3685 | 1 | 334.40 | 1 | 3685 | 1 | 1 | 1 | 1 | 111 | 3 | Risky |
| 2 | 12365.0 | 2011-02-21 13:51:00 | 2011-02-21 14:04:00 | 3666 | 3 | 320.69 | 2 | 3666 | 1 | 1 | 1 | 1 | 111 | 3 | Risky |
| 3 | 12373.0 | 2011-02-01 13:10:00 | 2011-02-01 13:10:00 | 3686 | 1 | 364.60 | 1 | 3686 | 1 | 1 | 1 | 1 | 111 | 3 | Risky |
| 4 | 12377.0 | 2010-12-20 09:37:00 | 2011-01-28 15:45:00 | 3690 | 2 | 1628.12 | 2 | 3730 | 1 | 1 | 1 | 1 | 111 | 3 | Risky |
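The segment labels make it straightforward to pull out an audience. For example, as a hypothetical follow-up using the dataframe above, you could export the "Risky" customers for a win-back campaign:

```python
# Select customers in the "Risky" segment and save them for a win-back campaign
risky = rfm_df[rfm_df['rfm_segment_name'] == 'Risky']
risky[['customer_id', 'recency', 'frequency', 'monetary']].to_csv('risky_customers.csv', index=False)
```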
#### 6. Create a purchase latency dataset

```python
from ecommercetools import customers

latency_df = customers.get_latency(transactions_df)
latency_df.head()
```
|   | customer_id | frequency | recency_date | recency | avg_latency | min_latency | max_latency | std_latency | cv | days_to_next_order | label |
|---|-------------|-----------|--------------|---------|-------------|-------------|-------------|-------------|----|--------------------|-------|
| 0 | 12680.0 | 4 | 2011-12-09 12:50:00 | 3388 | 28 | 16 | 73 | 30.859898 | 1.102139 | -3329.0 | Order overdue |
| 1 | 13113.0 | 24 | 2011-12-09 12:49:00 | 3388 | 15 | 0 | 52 | 12.060126 | 0.804008 | -3361.0 | Order overdue |
| 2 | 15804.0 | 13 | 2011-12-09 12:31:00 | 3388 | 15 | 1 | 39 | 11.008261 | 0.733884 | -3362.0 | Order overdue |
| 3 | 13777.0 | 33 | 2011-12-09 12:25:00 | 3388 | 11 | 0 | 48 | 12.055274 | 1.095934 | -3365.0 | Order overdue |
| 4 | 17581.0 | 25 | 2011-12-09 12:21:00 | 3388 | 14 | 0 | 67 | 21.974293 | 1.569592 | -3352.0 | Order overdue |
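As a hypothetical follow-up, the `label` column can be used to target customers whose next order is predicted to be due, before they become overdue:

```python
# Customers predicted to be due to order soon, sorted by how close they are
due_soon = latency_df[latency_df['label'] == 'Order due soon'] \
    .sort_values(by='days_to_next_order')
print(due_soon[['customer_id', 'avg_latency', 'days_to_next_order']].head())
```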
#### 7. Customer ABC segmentation

```python
from ecommercetools import customers

abc_df = customers.get_abc_segments(customers_df, months=12, abc_class_name='abc_class_12m', abc_rank_name='abc_rank_12m')
abc_df.head()
```
|   | customer_id | abc_class_12m | abc_rank_12m |
|---|-------------|---------------|--------------|
| 0 | 12346.0 | D | 1.0 |
| 1 | 12347.0 | D | 1.0 |
| 2 | 12348.0 | D | 1.0 |
| 3 | 12349.0 | D | 1.0 |
| 4 | 12350.0 | D | 1.0 |
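Since `get_abc_segments()` returns one row per customer, the classes can be joined back onto the customers dataframe for analysis. This is a hypothetical follow-up, not a package function:

```python
# Attach the ABC class to each customer record and compare revenue by class
customers_abc = customers_df.merge(abc_df, on='customer_id', how='left')
print(customers_abc.groupby('abc_class_12m')['revenue'].sum())
```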
#### 8. Predict customer AOV, CLV, and orders

EcommerceTools allows you to predict the AOV, Customer Lifetime Value (CLV), and expected number of orders via the Gamma-Gamma and BG/NBD models from the excellent Lifetimes package. By passing the dataframe of transactions from `get_transactions()` to the `get_customer_predictions()` function, EcommerceTools will fit the BG/NBD and Gamma-Gamma models and predict the AOV, order quantity, and CLV for each customer in the defined number of future days after the end of the observation period.

```python
from ecommercetools import customers

customer_predictions = customers.get_customer_predictions(transactions_df,
                                                          observation_period_end='2011-12-09',
                                                          days=90)
customer_predictions.head(10)
```
|   | customer_id | predicted_purchases | aov | clv |
|---|-------------|---------------------|-----|-----|
| 0 | 12346.0 | 0.188830 | NaN | NaN |
| 1 | 12347.0 | 1.408736 | 569.978836 | 836.846896 |
| 2 | 12348.0 | 0.805907 | 333.784235 | 308.247354 |
| 3 | 12349.0 | 0.855607 | NaN | NaN |
| 4 | 12350.0 | 0.196304 | NaN | NaN |
| 5 | 12352.0 | 1.682277 | 376.175359 | 647.826169 |
| 6 | 12353.0 | 0.272541 | NaN | NaN |
| 7 | 12354.0 | 0.247183 | NaN | NaN |
| 8 | 12355.0 | 0.262909 | NaN | NaN |
| 9 | 12356.0 | 0.645368 | 324.039419 | 256.855226 |
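As a hypothetical follow-up, dropping the customers the Gamma-Gamma model couldn't score (the `NaN` rows) and sorting by predicted CLV surfaces the customers likely to be most valuable over the next 90 days:

```python
# Rank scoreable customers by predicted 90-day customer lifetime value
top_clv = customer_predictions.dropna(subset=['clv']) \
    .sort_values(by='clv', ascending=False)
print(top_clv.head(10))
```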
---

### Advertising

#### 1. Create paid search keywords

```python
from ecommercetools import advertising

product_names = ['fly rods', 'fly reels']
keywords_prepend = ['buy', 'best', 'cheap', 'reduced']
keywords_append = ['for sale', 'price', 'promotion', 'promo', 'coupon', 'voucher', 'shop', 'suppliers']
campaign_name = 'fly_fishing'

keywords = advertising.generate_ad_keywords(product_names, keywords_prepend, keywords_append, campaign_name)
keywords.head()
```
|   | product | keywords | match_type | campaign_name |
|---|---------|----------|------------|---------------|
| 0 | fly rods | [fly rods] | Exact | fly_fishing |
| 1 | fly rods | [buy fly rods] | Exact | fly_fishing |
| 2 | fly rods | [best fly rods] | Exact | fly_fishing |
| 3 | fly rods | [cheap fly rods] | Exact | fly_fishing |
| 4 | fly rods | [reduced fly rods] | Exact | fly_fishing |
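The generated dataframe can be exported to CSV for bulk upload into your paid search platform. This is a hypothetical follow-up and the filenames are illustrative:

```python
# Save all generated keywords, plus a separate file of exact match keywords
keywords.to_csv('fly_fishing_keywords.csv', index=False)
exact_only = keywords[keywords['match_type'] == 'Exact']
exact_only.to_csv('fly_fishing_exact_keywords.csv', index=False)
```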
#### 2. Create paid search ad copy using Spintax

```python
from ecommercetools import advertising

text = "Fly Reels from {Orvis|Loop|Sage|Airflo|Nautilus} for {trout|salmon|grayling|pike}"
spin = advertising.generate_spintax(text, single=False)

spin
```

    ['Fly Reels from Orvis for trout',
     'Fly Reels from Orvis for salmon',
     'Fly Reels from Orvis for grayling',
     'Fly Reels from Orvis for pike',
     'Fly Reels from Loop for trout',
     'Fly Reels from Loop for salmon',
     'Fly Reels from Loop for grayling',
     'Fly Reels from Loop for pike',
     'Fly Reels from Sage for trout',
     'Fly Reels from Sage for salmon',
     'Fly Reels from Sage for grayling',
     'Fly Reels from Sage for pike',
     'Fly Reels from Airflo for trout',
     'Fly Reels from Airflo for salmon',
     'Fly Reels from Airflo for grayling',
     'Fly Reels from Airflo for pike',
     'Fly Reels from Nautilus for trout',
     'Fly Reels from Nautilus for salmon',
     'Fly Reels from Nautilus for grayling',
     'Fly Reels from Nautilus for pike']

---

### Operations

#### 1. Create an ABC inventory classification

```python
from ecommercetools import operations

inventory_classification = operations.get_inventory_classification(transaction_items)
inventory_classification.head()
```
|   | sku | abc_class | abc_rank |
|---|-----|-----------|----------|
| 0 | 10002 | A | 1 |
| 1 | 10080 | A | 2 |
| 2 | 10120 | A | 3 |
| 3 | 10123C | A | 4 |
| 4 | 10123G | A | 4 |
---

### Marketing

#### 1. Get ecommerce trading calendar

```python
from ecommercetools import marketing

trading_calendar_df = marketing.get_trading_calendar('2021-01-01', days=365)
trading_calendar_df.head()
```
|   | date | event |
|---|------|-------|
| 0 | 2021-01-01 | January sale |
| 1 | 2021-01-02 | |
| 2 | 2021-01-03 | |
| 3 | 2021-01-04 | |
| 4 | 2021-01-05 | |
#### 2. Get ecommerce trading events

```python
from ecommercetools import marketing

trading_events_df = marketing.get_trading_events('2021-01-01', days=365)
trading_events_df.head()
```
|   | date | event |
|---|------|-------|
| 0 | 2021-01-01 | January sale |
| 1 | 2021-01-29 | January Pay Day |
| 2 | 2021-02-11 | Valentine's Day [last order date] |
| 3 | 2021-02-14 | Valentine's Day |
| 4 | 2021-02-26 | February Pay Day |
---

### NLP

#### 1. Generate text summaries

The `get_summaries()` function of the `nlp` module takes a Pandas dataframe containing text and returns a machine-generated summary of the content using a Huggingface Transformers pipeline via PyTorch. To use this feature, first load your Pandas dataframe and import the `nlp` module from `ecommercetools`.

```python
import pandas as pd
from ecommercetools import nlp

pd.set_option('max_colwidth', 1000)
df = pd.read_csv('text.csv')
df.head()
```

Specify the name of the Pandas dataframe, the column containing the text you wish to summarise (i.e. `product_description`), and specify a column name in which to store the machine-generated summary. The `min_length` and `max_length` arguments control the number of words generated, while the `do_sample` argument controls whether the generated text is completely unique (`do_sample=False`) or extracted from the text (`do_sample=True`).

```python
df = nlp.get_summaries(df, 'product_description', 'sampled_summary', min_length=50, max_length=100, do_sample=True)
df = nlp.get_summaries(df, 'product_description', 'unsampled_summary', min_length=50, max_length=100, do_sample=False)
df = nlp.get_summaries(df, 'product_description', 'unsampled_summary_20_to_30', min_length=20, max_length=30, do_sample=False)
```

Since the model used for text summarisation is very large (1.2 GB plus), this function will take some time to complete. Once loaded, summaries are generated within a second or two per piece of text, so it is advisable to try smaller volumes of data initially.

### SEO

#### 1. Discover XML sitemap locations

The `get_sitemaps()` function takes the location of a `robots.txt` file (always stored at the root of a domain) and returns the URLs of any XML sitemaps listed within it.

```python
from ecommercetools import seo

sitemaps = seo.get_sitemaps("http://www.flyandlure.org/robots.txt")
print(sitemaps)
```

#### 2. Get an XML sitemap

The `get_sitemap()` function allows you to download the URLs in an XML sitemap to a Pandas dataframe. If the sitemap contains child sitemaps, each of these will be retrieved. You can save the Pandas dataframe to CSV in the usual way.

```python
from ecommercetools import seo

df = seo.get_sitemap("http://flyandlure.org/sitemap.xml")
print(df.head())
```
|   | loc | changefreq | priority | domain | sitemap_name |
|---|-----|------------|----------|--------|--------------|
| 0 | http://flyandlure.org/ | hourly | 1.0 | flyandlure.org | http://www.flyandlure.org/sitemap.xml |
| 1 | http://flyandlure.org/about | monthly | 1.0 | flyandlure.org | http://www.flyandlure.org/sitemap.xml |
| 2 | http://flyandlure.org/terms | monthly | 1.0 | flyandlure.org | http://www.flyandlure.org/sitemap.xml |
| 3 | http://flyandlure.org/privacy | monthly | 1.0 | flyandlure.org | http://www.flyandlure.org/sitemap.xml |
| 4 | http://flyandlure.org/copyright | monthly | 1.0 | flyandlure.org | http://www.flyandlure.org/sitemap.xml |
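For example, to save the crawled sitemap URLs to CSV and check how many were found (a quick hypothetical follow-up using standard Pandas):

```python
# Persist the sitemap URLs and report the number of unique URLs retrieved
df.to_csv('sitemap_urls.csv', index=False)
print(df['loc'].nunique(), 'unique URLs found')
```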
#### 3. Get Core Web Vitals from PageSpeed Insights

The `get_core_web_vitals()` function retrieves the Core Web Vitals metrics for a list of sites from the Google PageSpeed Insights API and returns the results in a Pandas dataframe. The function requires a Google PageSpeed Insights API key.

```python
from ecommercetools import seo

pagespeed_insights_key = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
urls = ['https://www.bbc.co.uk', 'https://www.bbc.co.uk/iplayer']
df = seo.get_core_web_vitals(pagespeed_insights_key, urls)
print(df.head())
```

#### 4. Get Google Knowledge Graph data

The `get_knowledge_graph()` function returns the Google Knowledge Graph data for a given search term. This requires the use of a Google Knowledge Graph API key. By default, the function returns output in a Pandas dataframe, but you can pass the `output="json"` argument if you wish to receive the JSON data back.

```python
from ecommercetools import seo

knowledge_graph_key = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
knowledge_graph = seo.get_knowledge_graph(knowledge_graph_key, "tesla", output="dataframe")
print(knowledge_graph)
```

#### 5. Get Google Search Console API data

The `query_google_search_console()` function runs a search query on the Google Search Console API and returns data in a Pandas dataframe. This function requires a JSON client secrets key with access to the Google Search Console API.

```python
from ecommercetools import seo

key = "google-search-console.json"
site_url = "http://flyandlure.org"
payload = {
    'startDate': "2019-01-01",
    'endDate': "2019-12-31",
    'dimensions': ["page", "device", "query"],
    'rowLimit': 100,
    'startRow': 0
}

df = seo.query_google_search_console(key, site_url, payload)
print(df.head())
```
|   | page | device | query | clicks | impressions | ctr | position |
|---|------|--------|-------|--------|-------------|-----|----------|
| 0 | http://flyandlure.org/articles/fly_fishing_gea... | MOBILE | simms freestone waders review | 56 | 217 | 25.81 | 3.12 |
| 1 | http://flyandlure.org/ | MOBILE | fly and lure | 37 | 159 | 23.27 | 3.81 |
| 2 | http://flyandlure.org/articles/fly_fishing_gea... | DESKTOP | orvis encounter waders review | 35 | 134 | 26.12 | 4.04 |
| 3 | http://flyandlure.org/articles/fly_fishing_gea... | DESKTOP | simms freestone waders review | 35 | 200 | 17.50 | 3.50 |
| 4 | http://flyandlure.org/ | DESKTOP | fly and lure | 32 | 170 | 18.82 | 3.09 |
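As a hypothetical follow-up using the columns shown above, aggregating the returned rows by device gives a quick view of where your clicks come from:

```python
# Summarise clicks and impressions by device and recompute CTR
by_device = df.groupby('device').agg(clicks=('clicks', 'sum'),
                                     impressions=('impressions', 'sum'))
by_device['ctr'] = (by_device['clicks'] / by_device['impressions'] * 100).round(2)
print(by_device)
```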
##### Fetching all results from Google Search Console

To fetch all results, set `fetch_all` to `True`. This will automatically paginate through your Google Search Console data and return all results. Be aware that if you do this you may hit Google's quota limit if you run a query over an extended period, or have a busy site with lots of `page` or `query` dimensions.

```python
from ecommercetools import seo

key = "google-search-console.json"
site_url = "http://flyandlure.org"
payload = {
    'startDate': "2019-01-01",
    'endDate': "2019-12-31",
    'dimensions': ["page", "device", "query"],
    'rowLimit': 25000,
    'startRow': 0
}

df = seo.query_google_search_console(key, site_url, payload, fetch_all=True)
print(df.head())
```

##### Comparing two time periods in Google Search Console

```python
payload_before = {
    'startDate': "2021-08-11",
    'endDate': "2021-08-31",
    'dimensions': ["page", "query"],
}

payload_after = {
    'startDate': "2021-07-21",
    'endDate': "2021-08-10",
    'dimensions': ["page", "query"],
}

df = seo.query_google_search_console_compare(key, site_url, payload_before, payload_after, fetch_all=False)
df.sort_values(by='clicks_change', ascending=False).head()
```

#### 6. Get the number of "indexed" pages

The `get_indexed_pages()` function uses the "site:" prefix to search Google for the number of pages "indexed". This is very approximate and may not be a perfect representation, but it's usually a good guide of site "size" in the absence of other data.

```python
from ecommercetools import seo

urls = ['https://www.bbc.co.uk', 'https://www.bbc.co.uk/iplayer', 'http://flyandlure.org']
df = seo.get_indexed_pages(urls)
print(df.head())
```
|   | url | indexed_pages |
|---|-----|---------------|
| 2 | http://flyandlure.org | 2090 |
| 1 | https://www.bbc.co.uk/iplayer | 215000 |
| 0 | https://www.bbc.co.uk | 12700000 |
#### 7. Get keyword suggestions from Google Autocomplete

The `google_autocomplete()` function returns a set of keyword suggestions from Google Autocomplete. The `include_expanded=True` argument allows you to expand the number of suggestions shown by appending prefixes and suffixes to the search terms.

```python
from ecommercetools import seo

suggestions = seo.google_autocomplete("data science", include_expanded=False)
print(suggestions)

suggestions = seo.google_autocomplete("data science", include_expanded=True)
print(suggestions)
```
|   | term | relevance |
|---|------|-----------|
| 0 | data science jobs | 650 |
| 1 | data science jobs chester | 601 |
| 2 | data science course | 600 |
| 3 | data science masters | 554 |
| 4 | data science salary | 553 |
| 5 | data science internship | 552 |
| 6 | data science jobs london | 551 |
| 7 | data science graduate scheme | 550 |
#### 8. Retrieve robots.txt content

The `get_robots()` function returns the contents of a robots.txt file in a Pandas dataframe so it can be parsed and analysed.

```python
from ecommercetools import seo

robots = seo.get_robots("http://www.flyandlure.org/robots.txt")
print(robots)
```
|    | directive | parameter |
|----|-----------|-----------|
| 0  | User-agent | * |
| 1  | Disallow | /signin |
| 2  | Disallow | /signup |
| 3  | Disallow | /users |
| 4  | Disallow | /contact |
| 5  | Disallow | /activate |
| 6  | Disallow | /*/page |
| 7  | Disallow | /articles/search |
| 8  | Disallow | /search.php |
| 9  | Disallow | *q=* |
| 10 | Disallow | *category_slug=* |
| 11 | Disallow | *country_slug=* |
| 12 | Disallow | *county_slug=* |
| 13 | Disallow | *features=* |
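With the directives in a dataframe, you can filter them like any other data. As a hypothetical follow-up, here's how you might list just the `Disallow` rules:

```python
# Extract only the Disallow rules from the parsed robots.txt
disallowed = robots[robots['directive'] == 'Disallow']
print(disallowed['parameter'].tolist())
```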
#### 9. Get Google SERPs

The `get_serps()` function returns a Pandas dataframe containing the Google search engine results for a given search term. Note that this function is not suitable for large-scale scraping and currently includes no features to prevent it from being blocked.

```python
from ecommercetools import seo

serps = seo.get_serps("data science blog")
print(serps)
```
|   | title | link | text |
|---|-------|------|------|
| 0 | 10 of the best data science blogs to follow - ... | https://www.tableau.com/learn/articles/data-sc... | 10 of the best data science blogs to follow. T... |
| 1 | Best Data Science Blogs to Follow in 2020 \| by... | https://towardsdatascience.com/best-data-scien... | 14 Jul 2020 — 1. Towards Data Science · Joined... |
| 2 | Top 20 Data Science Blogs And Websites For Dat... | https://medium.com/@exastax/top-20-data-scienc... | Top 20 Data Science Blogs And Websites For Dat... |
| 3 | Data Science Blog – Dataquest | https://www.dataquest.io/blog/ | Browse our data science blog to get helpful ti... |
| 4 | 51 Awesome Data Science Blogs You Need To Chec... | https://365datascience.com/trending/51-data-sc... | Blog name: DataKind · datakind data science bl... |
| 5 | Blogs on AI, Analytics, Data Science, Machine ... | https://www.kdnuggets.com/websites/blogs.html | Individual/small group blogs · Ai4 blog, featu... |
| 6 | Data Science Blog – Applied Data Science | https://data-science-blog.com/ | ... an Bedeutung – DevOps for Data Science. De... |
| 7 | Top 10 Data Science and AI Blogs in 2020 - Liv... | https://livecodestream.dev/post/top-data-scien... | Some of the best data science and AI blogs for... |
| 8 | Data Science Blogs: 17 Must-Read Blogs for Dat... | https://www.thinkful.com/blog/data-science-blogs/ | Data scientists could be considered the magici... |
| 9 | rushter/data-science-blogs: A curated list of ... | https://github.com/rushter/data-science-blogs | A curated list of data science blogs. Contribu... |
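Since the results arrive in ranked order, you can check whether and where a given domain appears. This hypothetical follow-up assumes the default integer index reflects result position:

```python
# Find the positions at which a domain appears in the results
serps['position'] = serps.index + 1
matches = serps[serps['link'].str.contains('towardsdatascience.com', na=False)]
print(matches[['position', 'title', 'link']])
```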
#### Create an ABCD classification of Google Search Console data

The `classify_pages()` function returns an ABCD classification of Google Search Console data. This calculates the cumulative sum of clicks and then categorises pages using the ABC algorithm (the first 80% are classed A, the next 10% are classed B, and the final 10% are classed C, with the zero-click pages classed D).

```python
from ecommercetools import seo

key = "client_secrets.json"
site_url = "example-domain.co.uk"
start_date = '2022-10-01'
end_date = '2022-10-31'

df_classes = seo.classify_pages(key, site_url, start_date, end_date, output='classes')
print(df_classes.head())

df_summary = seo.classify_pages(key, site_url, start_date, end_date, output='summary')
print(df_summary)
```

```
                                                page  clicks  impressions    ctr  position  clicks_cumsum  clicks_running_pc  pc_share class  class_rank
0  https://practicaldatascience.co.uk/machine-lea...    3890        36577  10.64     12.64           3890           8.382898  8.382898     A           1
1  https://practicaldatascience.co.uk/data-scienc...    2414        16618  14.53     14.30           6304          13.585036  5.202138     A           2
2  https://practicaldatascience.co.uk/data-scienc...    2378        71496   3.33     16.39           8682          18.709594  5.124558     A           3
3  https://practicaldatascience.co.uk/data-scienc...    1942        14274  13.61     15.02          10624          22.894578  4.184984     A           4
4  https://practicaldatascience.co.uk/data-scienc...    1738        23979   7.25     11.80          12362          26.639945  3.745367     A           5
```

```
  class  pages  impressions  clicks   avg_ctr  avg_position  share_of_clicks  share_of_impressions
0     A     63       747643   36980  5.126349     22.706825             79.7                  43.7
1     B     46       639329    4726  3.228043     31.897826             10.2                  37.4
2     C    190       323385    4698  2.393632     38.259368             10.1                  18.9
3     D     36         1327       0  0.000000     25.804722              0.0                   0.1
```

---

### Reports

The Reports module creates weekly, monthly, quarterly, or yearly reports for customers and orders and calculates a range of common ecommerce metrics to show business performance.

#### 1. Customers report

The `customers_report()` function takes a formatted dataframe of transaction items (see above) and a desired frequency (D for daily, W for weekly, M for monthly, Q for quarterly) and calculates aggregate metrics for each period.

The function returns the number of orders, the number of customers, the number of new customers, the number of returning customers, and the acquisition rate (or proportion of new customers). For monthly reporting, I would recommend a 13-month period so you can compare the last month with the same month the previous year.

```python
from ecommercetools import reports

df_customers_report = reports.customers_report(transaction_items, frequency='M')
print(df_customers_report.head(13))
```

#### 2. Transactions report

The `transactions_report()` function takes a formatted dataframe of transaction items (see above) and a desired frequency (D for daily, W for weekly, M for monthly, Q for quarterly) and calculates aggregate metrics for each period.

The metrics returned are: customers, orders, revenue, SKUs, units, average order value, average SKUs per order, average units per order, and average revenue per customer.
2274 | 2275 | ```python 2276 | from ecommercetools import reports 2277 | 2278 | df_orders_report = reports.transactions_report(transaction_items, frequency='M') 2279 | print(df_orders_report.head(13)) 2280 | ``` 2281 | 2282 | -------------------------------------------------------------------------------- /banner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/practical-data-science/ecommercetools/d5d5e9a4a6c7fafd0b6931c13d9cf3865a154b76/banner.png -------------------------------------------------------------------------------- /ecommercetools/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.38" 2 | __author__ = "Matt Clarke" 3 | -------------------------------------------------------------------------------- /ecommercetools/advertising/__init__.py: -------------------------------------------------------------------------------- 1 | from ecommercetools.advertising.advertising import generate_ad_keywords 2 | from ecommercetools.advertising.advertising import generate_spintax 3 | -------------------------------------------------------------------------------- /ecommercetools/advertising/advertising.py: -------------------------------------------------------------------------------- 1 | import re 2 | import random 3 | import itertools 4 | import pandas as pd 5 | 6 | 7 | def _match_type_exact(keywords): 8 | exact = [] 9 | for keyword in keywords: 10 | exact.append([keyword[0], '[' + keyword[1] + ']']) 11 | 12 | df = pd.DataFrame.from_records(exact, columns=['product', 'keywords']) 13 | df['match_type'] = 'Exact' 14 | 15 | return df 16 | 17 | 18 | def _match_type_phrase(keywords): 19 | phrase = [] 20 | for keyword in keywords: 21 | phrase.append([keyword[0], '"' + keyword[1] + '"']) 22 | 23 | df = pd.DataFrame.from_records(phrase, columns=['product', 'keywords']) 24 | df['match_type'] = 'Phrase' 25 | 26 | return df 27 | 28 | 29 | def _match_type_broad(keywords): 30 | broad = [] 31 | for keyword in keywords: 32 | broad.append([keyword[0], keyword[1]]) 33 | 34 | df = pd.DataFrame.from_records(broad, columns=['product', 'keywords']) 35 | df['match_type'] = 'Broad' 36 | 37 | return df 38 | 39 | 40 | def _match_type_broad_modified(keywords): 41 | broad_modified = [] 42 | for keyword in keywords: 43 | bmm = ['+' + keyword[1].replace(' ', ' +')] 44 | broad_modified.append([keyword[0], bmm]) 45 | 46 | df = pd.DataFrame.from_records(broad_modified, columns=['product', 'keywords']) 47 | df['match_type'] = 'Modified' 48 | 49 | return df 50 | 51 | 52 | def _generate_combinations(products, 53 | keywords_prepend, 54 | keywords_append): 55 | """Return a list of all prepended and appended keywords combinations. 56 | 57 | Args: 58 | products (list): List of product names. 59 | keywords_prepend (list): List of keywords to prepend to product names. 60 | keywords_append (list): List of keywords to append to product names. 61 | 62 | Returns: 63 | keywords (list): List of lists containing the product name and keyword combination. 
64 | 65 | Example: 66 | [['fly rods', 'fly rods'], 67 | ['fly rods', 'buy fly rods'], 68 | ['fly rods', 'best fly rods']] 69 | """ 70 | 71 | keywords = [] 72 | 73 | for product in products: 74 | keywords.append([product, product]) 75 | 76 | for keyword_prepend in keywords_prepend: 77 | keywords.append([product, keyword_prepend + ' ' + product]) 78 | 79 | for keyword_append in keywords_append: 80 | keywords.append([product, product + ' ' + keyword_append]) 81 | 82 | return keywords 83 | 84 | 85 | def generate_ad_keywords(products, 86 | keywords_prepend, 87 | keywords_append, 88 | campaign_name): 89 | """Return a Pandas dataframe of keywords data for use in Google Adwords. 90 | 91 | Args: 92 | products (list): List of product names. 93 | keywords_prepend (list): List of keywords to prepend to product names. 94 | keywords_append (list): List of keywords to append to product names. 95 | campaign_name (str): Name of paid search campaign. 96 | 97 | Returns: 98 | df (object): Pandas dataframe containing generated data. 99 | """ 100 | 101 | keywords = _generate_combinations(products, keywords_prepend, keywords_append) 102 | 103 | exact = _match_type_exact(keywords) 104 | phrase = _match_type_phrase(keywords) 105 | broad = _match_type_broad(keywords) 106 | broad_modified = _match_type_broad_modified(keywords) 107 | 108 | df = pd.concat([exact, phrase, broad, broad_modified]) 109 | df['campaign_name'] = campaign_name 110 | return df 111 | 112 | 113 | def generate_spintax(text, single=True): 114 | """Return a list of unique spins of a Spintax text string. 115 | 116 | Args: 117 | text (string): Spintax text (i.e. I am the {President|King|Ambassador} of Nigeria.) 118 | single (bool, optional): Optional boolean to return a list or a single spin. 119 | 120 | Returns: 121 | spins (string, list): Single spin or list of spins depending on single. 
122 | """ 123 | 124 | pattern = re.compile('({[^}]+}|[^{}]*)') 125 | chunks = pattern.split(text) 126 | 127 | def options(s): 128 | if len(s) > 0 and s[0] == '{': 129 | return [opt for opt in s[1:-1].split('|')] 130 | return [s] 131 | 132 | parts_list = [options(chunk) for chunk in chunks] 133 | 134 | spins = [] 135 | 136 | for spin in itertools.product(*parts_list): 137 | spins.append(''.join(spin)) 138 | 139 | if single: 140 | return spins[random.randint(0, len(spins) - 1)] 141 | else: 142 | return spins 143 | 144 | -------------------------------------------------------------------------------- /ecommercetools/customers/__init__.py: -------------------------------------------------------------------------------- 1 | from ecommercetools.customers.customers import get_customers 2 | from ecommercetools.customers.customers import get_rfm_segments 3 | from ecommercetools.customers.customers import get_abc_segments 4 | from ecommercetools.customers.customers import get_cohorts 5 | from ecommercetools.customers.customers import get_cohort_matrix 6 | from ecommercetools.customers.customers import get_retention 7 | from ecommercetools.customers.customers import get_latency 8 | from ecommercetools.customers.customers import get_customer_predictions 9 | -------------------------------------------------------------------------------- /ecommercetools/customers/customers.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import operator as op 4 | from ecommercetools.transactions import transactions 5 | from ecommercetools import utilities 6 | from sklearn.cluster import KMeans 7 | from sklearn.preprocessing import StandardScaler 8 | from lifetimes import GammaGammaFitter 9 | from lifetimes.utils import summary_data_from_transaction_data 10 | from lifetimes import BetaGeoFitter 11 | 12 | 13 | def get_customers(transaction_items): 14 | """Return a Pandas DataFrame of customers from a Pandas DataFrame of transaction items. 15 | 16 | Args: 17 | transaction_items (object): DataFrame containing order_id, sku, quantity, unit_price, customer_id, order_date 18 | 19 | Returns: 20 | customers: Pandas DataFrame containing customers 21 | """ 22 | 23 | customers = transaction_items.groupby('customer_id').agg( 24 | revenue=('line_price', 'sum'), 25 | orders=('order_id', 'nunique'), 26 | skus=('sku', 'nunique'), 27 | items=('quantity', 'sum'), 28 | first_order_date=('order_date', 'min'), 29 | last_order_date=('order_date', 'max') 30 | ).reset_index() 31 | customers['avg_items'] = round((customers['items'] / customers['orders']), 2) 32 | customers['avg_order_value'] = round((customers['revenue'] / customers['orders']), 2) 33 | customers['tenure'] = (pd.to_datetime('today') - customers['first_order_date']).dt.days 34 | customers['recency'] = (pd.to_datetime('today') - customers['last_order_date']).dt.days 35 | customers['cohort'] = customers['first_order_date'].dt.year.astype(str) + \ 36 | customers['first_order_date'].dt.quarter.astype(str) 37 | return customers 38 | 39 | 40 | def _sorted_kmeans(df, 41 | metric_column, 42 | cluster_name, 43 | ascending=True): 44 | """Runs a K-means clustering algorithm on a specific metric column in a Pandas dataframe. 45 | 46 | Sorts the data in a specified direction; and reassigns cluster numbers to match the data distribution, 47 | so they are appropriate for RFM segmentation. You may need to log transform heavily skewed data. 
48 | 49 | Args: 50 | df (object): Pandas dataframe 51 | metric_column (str): Name of metric column 52 | ascending (bool, optional): Set to False to sort in descending order 53 | cluster_name (str): Name of cluster 54 | 55 | Returns: 56 | Original Pandas DataFrame with additional column 57 | """ 58 | 59 | # Fit the model 60 | kmeans = KMeans(n_clusters=5) 61 | kmeans.fit(df[[metric_column]]) 62 | 63 | # Assign the initial unsorted cluster 64 | initial_cluster = 'unsorted_' + cluster_name 65 | df[initial_cluster] = kmeans.predict(df[[metric_column]]) + 1 66 | df[cluster_name] = df[initial_cluster] 67 | 68 | # Group the clusters and re-rank to determine the correct order 69 | df_sorted = df.groupby(initial_cluster)[metric_column].mean().round(2).reset_index() 70 | df_sorted = df_sorted.sort_values(by=metric_column, ascending=ascending).reset_index(drop=True) 71 | df_sorted[cluster_name] = df_sorted[metric_column].rank(method='max', ascending=ascending).astype(int) 72 | 73 | # Merge data and drop redundant columns 74 | df = df.merge(df_sorted[[cluster_name, initial_cluster]], on=[initial_cluster]) 75 | df = df.drop(initial_cluster, axis=1) 76 | df = df.drop(cluster_name + '_x', axis=1) 77 | df = df.rename(columns={cluster_name + '_y': cluster_name}) 78 | 79 | return df 80 | 81 | 82 | def _label_rfm_segments(rfm): 83 | """Return a label for a customer based on their RFM score 84 | 85 | Args: 86 | rfm (int): Full three-digit RFM score, i.e. 555 or 111 87 | 88 | Returns: 89 | label (str): Descriptive RFM score label, i.e. Risky 90 | """ 91 | 92 | rfm = int(rfm) 93 | 94 | if (rfm >= 111) & (rfm <= 155): 95 | return 'Risky' 96 | 97 | elif (rfm >= 211) & (rfm <= 255): 98 | return 'Hold and improve' 99 | 100 | elif (rfm >= 311) & (rfm <= 353): 101 | return 'Potential loyal' 102 | 103 | elif ((rfm >= 354) & (rfm <= 454)) or ((rfm >= 511) & (rfm <= 535)) or (rfm == 541): 104 | return 'Loyal' 105 | 106 | elif (rfm == 455) or (rfm >= 542) & (rfm <= 555): 107 | return 'Star' 108 | 109 | else: 110 | return 'Other' 111 | 112 | 113 | def get_rfm_segments(customers): 114 | """Return a Pandas DataFrame of customer RFM segments from a Pandas DataFrame of customers. 115 | 116 | The DataFrame returned by get_customers() already contains the raw data required, but 117 | this function will rename it accordingly and use it to assign the customer to a range 118 | of different segments that can be used for marketing and analysis. 
119 | 120 | Args: 121 | customers: Pandas DataFrame from get_customers() 122 | 123 | Returns: 124 | segments: Pandas DataFrame 125 | 126 | """ 127 | 128 | # Rename the raw data columns 129 | segments = customers[['customer_id']] 130 | segments = segments.assign(acquisition_date=customers['first_order_date']) 131 | segments = segments.assign(recency_date=customers['last_order_date']) 132 | segments = segments.assign(recency=customers['recency']) 133 | segments = segments.assign(frequency=customers['orders']) 134 | segments = segments.assign(monetary=customers['revenue']) 135 | segments = segments.assign(heterogeneity=customers['skus']) 136 | segments = segments.assign(tenure=customers['tenure']) 137 | 138 | # Use K-means to create RFMH scores 139 | segments = _sorted_kmeans(segments, 'recency', 'r', ascending=False) 140 | segments = _sorted_kmeans(segments, 'frequency', 'f', ascending=True) 141 | segments = _sorted_kmeans(segments, 'monetary', 'm', ascending=True) 142 | segments = _sorted_kmeans(segments, 'heterogeneity', 'h', ascending=True) 143 | 144 | # Create scores 145 | segments = segments.assign(rfm=segments['r'].astype(str) + \ 146 | segments['f'].astype(str) + \ 147 | segments['m'].astype(str)) 148 | 149 | segments = segments.assign(rfm_score=segments['r'].astype(int) + \ 150 | segments['f'].astype(int) + \ 151 | segments['m'].astype(int)) 152 | 153 | # Create labels 154 | segments['rfm_segment_name'] = segments.apply(lambda x: _label_rfm_segments(x.rfm), axis=1) 155 | 156 | return segments 157 | 158 | 159 | def _abc_classify_customer(percentage): 160 | """Apply an ABC classification to each customer based on its ranked percentage revenue contribution. 161 | 162 | Args: 163 | percentage (float): Cumulative percentage of ranked revenue 164 | 165 | Returns: 166 | segments: Pandas DataFrame 167 | """ 168 | 169 | if 0 < percentage <= 80: 170 | return 'A' 171 | elif 80 < percentage <= 90: 172 | return 'B' 173 | else: 174 | return 'C' 175 | 176 | 177 | def get_abc_segments(customers, 178 | months=12, 179 | abc_class_name='abc_class_12m', 180 | abc_rank_name='abc_rank_12m'): 181 | """Return a dataframe containing the ABC class and rank for each customer. 182 | 183 | Apply an ABC classification to each customer based on its ranked percentage revenue contribution. 184 | This automatically uses a 12 month period by default, but can be modified for other periods to suit. 
185 | 
186 |     Args:
187 | 
188 |         customers (object): Pandas DataFrame from get_customers()
189 |         months (int, optional): Number of months to use for ABC analysis (12 by default)
190 |         abc_class_name (str, optional): Name to assign to the ABC class column (abc_class_12m by default)
191 |         abc_rank_name (str, optional): Name to assign to the ABC rank column (abc_rank_12m by default)
192 | 
193 |     Returns:
194 |         abc: Pandas DataFrame
195 |     """
196 | 
197 |     # Calculate data for customers who purchased within the specified period
198 |     purchased = customers[customers['recency'] <= (months * 30)].copy()
199 |     purchased = purchased.sort_values(by='revenue', ascending=False)
200 |     purchased['revenue_cumsum'] = purchased['revenue'].cumsum()
201 |     purchased['revenue_total'] = purchased['revenue'].sum()
202 |     purchased['revenue_running_percentage'] = (purchased['revenue_cumsum'] / purchased['revenue_total']) * 100
203 |     purchased[abc_class_name] = purchased['revenue_running_percentage'].apply(_abc_classify_customer)
204 |     purchased[abc_rank_name] = purchased['revenue_running_percentage'].rank().astype(int)
205 |     purchased.drop(['revenue_cumsum', 'revenue_total', 'revenue_running_percentage'], axis=1, inplace=True)
206 | 
207 |     # Assign lapsed customers to class D
208 |     lapsed = customers[customers['recency'] > (months * 30)]
209 | 
210 |     # Return ABC segments (DataFrame.append was removed in pandas 2.0, so use concat)
211 |     abc = pd.concat([purchased, lapsed])
212 |     abc[abc_class_name] = abc[abc_class_name].fillna('D')
213 |     abc[abc_rank_name] = abc[abc_rank_name].fillna(len(purchased) + 1)
214 |     abc = abc[['customer_id', abc_class_name, abc_rank_name]]
215 |     return abc
216 | 
217 | 
218 | def get_cohorts(df, period='M'):
219 |     """Return a customer cohort matrix from a dataframe of transactional items.
220 | 
221 |     Given a Pandas DataFrame of transactional items, this function returns
222 |     a Pandas DataFrame containing the acquisition cohort and order cohort which
223 |     can be used for customer analysis or the creation of a cohort analysis matrix.
224 | 
225 |     Args:
226 |         df (object): Pandas DataFrame. Required columns: order_id, customer_id, order_date.
227 |         period (str, optional): Period value - M, Q, or Y. Create cohorts using month, quarter, or year of acquisition.
228 | 
229 |     Returns:
230 |         df (object): Pandas DataFrame
231 |     """
232 | 
233 |     df = df[['customer_id', 'order_id', 'order_date']].drop_duplicates()
234 |     df = df.assign(acquisition_cohort=df.groupby('customer_id') \
235 |                    ['order_date'].transform('min').dt.to_period(period))
236 |     df = df.assign(order_cohort=df['order_date'].dt.to_period(period))
237 |     return df
238 | 
239 | 
240 | def get_retention(df, period='M'):
241 |     """Calculate the retention of customers in each month after their acquisition.
242 | 
243 |     Args:
244 |         df (object): Pandas DataFrame. Required columns: order_id, customer_id, order_date.
245 |         period (str, optional): Period value - M, Q, or Y. Create cohorts using month, quarter, or year of acquisition.
246 | 
247 |     Returns:
248 |         df (object): Pandas DataFrame
249 | 
250 |     """
251 | 
252 |     df = get_cohorts(df, period).groupby(['acquisition_cohort', 'order_cohort']) \
253 |         .agg(customers=('customer_id', 'nunique')) \
254 |         .reset_index(drop=False)
255 |     df['periods'] = (df.order_cohort - df.acquisition_cohort) \
256 |         .apply(op.attrgetter('n'))
257 | 
258 |     return df
259 | 
260 | 
261 | def get_cohort_matrix(df, period='M', percentage=False):
262 |     """Return a cohort matrix showing the number of customers who purchased in each period after their acquisition.
263 | 
264 |     Args:
265 |         df (object): Pandas DataFrame.
Required columns: order_id, customer_id, order_date. 266 | period (str, optional): Period value - M, Q, or Y. Create cohorts using month, quarter, or year of acquisition. 267 | percentage (bool, optional): True or False. Return raw numbers or a percentage retention. 268 | 269 | Returns: 270 | df (object): Pandas DataFrame 271 | """ 272 | 273 | df = get_retention(df, period).pivot_table(index='acquisition_cohort', 274 | columns='periods', 275 | values='customers') 276 | 277 | if percentage: 278 | df = df.divide(df.iloc[:, 0], axis=0) 279 | 280 | return df 281 | 282 | 283 | def _days_to_next_order(avg_latency, std_latency, recency): 284 | """Estimate the number of days to a customer's next order using latency. 285 | 286 | Args: 287 | avg_latency (float): Average latency in days 288 | std_latency (float): Standard deviation of latency in days 289 | recency (float): Recency in days 290 | Returns: 291 | Approximate number of days until the next order. 292 | """ 293 | 294 | return avg_latency - (recency - std_latency) 295 | 296 | 297 | def _latency_label_customers(avg_latency, std_latency, recency): 298 | """Add a label to describe a customer's latency metric. 299 | 300 | Args: 301 | avg_latency (float): Average latency in days 302 | std_latency (float): Standard deviation of latency in days 303 | recency (float): Recency in days 304 | Returns: 305 | Label describing the latency metric in relation to the customer. 306 | """ 307 | 308 | days_to_next_order_upper = avg_latency - (recency - std_latency) 309 | days_to_next_order_lower = avg_latency - (recency + std_latency) 310 | 311 | if recency < days_to_next_order_lower: 312 | return 'Order not due' 313 | 314 | elif (recency <= days_to_next_order_lower) or (recency <= days_to_next_order_upper): 315 | return 'Order due soon' 316 | 317 | elif recency > days_to_next_order_upper: 318 | return 'Order overdue' 319 | 320 | else: 321 | return 'Not sure' 322 | 323 | 324 | def get_latency(df_transactions): 325 | """Return a Pandas dataframe containing latency metrics for each customer. 326 | 327 | Args: 328 | df_transactions: Pandas dataframe from get_transactions(). 329 | 330 | Returns: 331 | Pandas dataframe of customer purchase latency metrics. 
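
    Example (hypothetical usage, assuming a transactions dataframe from
    get_transactions()):

        >>> df_latency = get_latency(transactions_df)
        >>> df_latency[['customer_id', 'avg_latency', 'days_to_next_order', 'label']].head()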
332 | """ 333 | 334 | # Create latency dataframe and calculate granular metrics 335 | df_latency = df_transactions[['order_id', 'customer_id', 'order_date', 'revenue']] 336 | df_latency = df_latency[df_latency['revenue'] > 0] 337 | df_latency = df_latency.sort_values(by=['order_date'], ascending=False) 338 | df_latency['prev_order_date'] = utilities.get_previous_value(df_latency, 'customer_id', 'order_date') 339 | df_latency['days_since_prev_order'] = utilities.get_days_since_date(df_latency, 'prev_order_date', 'order_date') 340 | df_latency['order_number'] = utilities.get_cumulative_count(df_latency, 'customer_id', 'order_id', 'order_date') 341 | 342 | # Create customer dataframe and calculate aggregate metrics 343 | df_customers = pd.DataFrame(df_latency['customer_id'].unique()) 344 | df_customers.columns = ['customer_id'] 345 | 346 | # Calculate frequency 347 | df_frequency = df_latency.groupby('customer_id')['order_id'].nunique().reset_index() 348 | df_frequency.columns = ['customer_id', 'frequency'] 349 | df_customers = df_customers.merge(df_frequency, on='customer_id') 350 | 351 | # Calculate recency 352 | df_recency = df_latency.groupby('customer_id')['order_date'].max().reset_index() 353 | df_recency.columns = ['customer_id', 'recency_date'] 354 | df_customers = df_customers.merge(df_recency, on='customer_id') 355 | df_customers['recency'] = round((pd.to_datetime('today') - df_customers['recency_date']) \ 356 | / np.timedelta64(1, 'D')).astype(int) 357 | 358 | # Calculate average latency 359 | df_avg_latency = df_latency.groupby('customer_id')['days_since_prev_order'].mean().astype(int).reset_index() 360 | df_avg_latency.columns = ['customer_id', 'avg_latency'] 361 | df_customers = df_customers.merge(df_avg_latency, on='customer_id') 362 | 363 | # Calculate standard deviation of latency for returning customers 364 | df_latency_returning = df_latency[df_latency['order_number'] > 0] 365 | 366 | # Min latency 367 | df_min = df_latency_returning.groupby('customer_id')['days_since_prev_order'].min().astype(int).reset_index() 368 | df_min.columns = ['customer_id', 'min_latency'] 369 | df_customers = df_customers.merge(df_min, on='customer_id') 370 | 371 | # Max latency 372 | df_max = df_latency_returning.groupby('customer_id')['days_since_prev_order'].max().astype(int).reset_index() 373 | df_max.columns = ['customer_id', 'max_latency'] 374 | df_customers = df_customers.merge(df_max, on='customer_id') 375 | 376 | # STD latency 377 | df_std = df_latency_returning.groupby('customer_id')['days_since_prev_order'].std().reset_index() 378 | df_std.columns = ['customer_id', 'std_latency'] 379 | df_customers = df_customers.merge(df_std, on='customer_id') 380 | 381 | # Coefficient of Variation of latency 382 | df_customers['cv'] = df_customers['std_latency'] / df_customers['avg_latency'] 383 | 384 | # Calculate approximate days to next order 385 | df_customers['days_to_next_order'] = df_customers.apply( 386 | lambda x: _days_to_next_order(x['avg_latency'], x['std_latency'], x['recency']), axis=1).round() 387 | 388 | # Label latency 389 | df_customers['label'] = df_customers.apply( 390 | lambda x: _latency_label_customers(x['avg_latency'], x['std_latency'], x['recency']), axis=1) 391 | 392 | return df_customers 393 | 394 | 395 | def _get_lifetimes_rfmt(df_transactions, observation_period_end): 396 | """Return the RFMT data from the Lifetimes model. 
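
    In the Lifetimes RFMT summary, frequency counts repeat purchases (a
    one-time buyer has frequency 0), recency is the time between a customer's
    first and last purchase, and T is the time between their first purchase
    and the end of the observation period (illustrative column check):

        >>> _get_lifetimes_rfmt(transactions_df, '2011-12-09').columns.tolist()
        ['frequency', 'recency', 'T', 'monetary_value']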
397 | 
398 |     Args:
399 |         df_transactions (df): Pandas dataframe of transactions from get_transactions()
400 |         observation_period_end (string): Date string in YYYY-MM-DD format representing end of observation period.
401 | 
402 |     Returns:
403 |         df: Pandas dataframe containing frequency, recency, T, monetary_value per customer.
404 |     """
405 | 
406 |     df_transactions = df_transactions[df_transactions['replacement'] == 0]
407 | 
408 |     df = summary_data_from_transaction_data(df_transactions,
409 |                                             'customer_id',
410 |                                             'order_date',
411 |                                             'revenue',
412 |                                             observation_period_end=observation_period_end)
413 |     return df
414 | 
415 | 
416 | def _get_predicted_purchases(df_transactions,
417 |                              observation_period_end,
418 |                              days=90):
419 |     """Return the number of predicted purchases per customer from the Lifetimes BG/NBD model.
420 | 
421 |     Args:
422 |         df_transactions (df): Pandas dataframe of transactions from get_transactions()
423 |         observation_period_end (string): Date string in YYYY-MM-DD format representing end of observation period.
424 |         days (int, optional): Number of days in the purchase prediction window (90 by default).
425 |     Returns:
426 |         df: Pandas dataframe containing frequency, recency, T, monetary_value per customer, and predicted purchases.
427 |     """
428 | 
429 |     df = _get_lifetimes_rfmt(df_transactions, observation_period_end)
430 |     bgf = BetaGeoFitter(penalizer_coef=0)
431 |     bgf.fit(df['frequency'], df['recency'], df['T'])
432 |     df['predicted_purchases'] = bgf.conditional_expected_number_of_purchases_up_to_time(days,
433 |                                                                                         df['frequency'],
434 |                                                                                         df['recency'],
435 |                                                                                         df['T'])
436 |     return df
437 | 
438 | 
439 | def _get_predicted_aov(df_transactions,
440 |                        observation_period_end,
441 |                        ggf_penalizer_coef=0):
442 |     """Return the predicted AOV for each customer via the Gamma-Gamma model.
443 |     This function uses models from the Lifetimes package.
444 | 
445 |     Args:
446 |         df_transactions (df): Pandas dataframe of transactions from get_transactions()
447 |         observation_period_end (string): Date string in YYYY-MM-DD format for end of observation period.
448 |         ggf_penalizer_coef (float, optional): Penalizer coefficient for Gamma-Gamma model. See Lifetimes.
449 | 
450 |     Returns:
451 |         Predicted AOV for each customer.
452 |     """
453 | 
454 |     df_rfmt = _get_lifetimes_rfmt(df_transactions, observation_period_end)
455 | 
456 |     df_returning = df_rfmt[df_rfmt['frequency'] > 0]
457 |     df_returning = df_returning[df_returning['monetary_value'] > 0]  # filter the returning subset, not df_rfmt
458 | 
459 |     ggf = GammaGammaFitter(penalizer_coef=ggf_penalizer_coef)
460 |     ggf.fit(df_returning['frequency'],
461 |             df_returning['monetary_value'])
462 | 
463 |     predicted_monetary = ggf.conditional_expected_average_profit(
464 |         df_returning['frequency'],
465 |         df_returning['monetary_value']
466 |     )
467 | 
468 |     aov_df = predicted_monetary.to_frame(name='aov')  # name the column explicitly
469 | 
470 |     return aov_df
471 | 
472 | 
473 | def _get_predicted_clv(df_transactions,
474 |                        observation_period_end,
475 |                        months=12,
476 |                        discount_rate=0.01,
477 |                        ggf_penalizer_coef=0,
478 |                        bgf_penalizer_coef=0):
479 |     """Return the predicted CLV for each customer using the Gamma-Gamma and BG/NBD models.
480 |     This function uses models from the Lifetimes package.
481 | 
482 |     Args:
483 |         df_transactions (df): Pandas dataframe of transactions from get_transactions()
484 |         observation_period_end (string): Date string in YYYY-MM-DD format for end of observation period.
485 |         months (int, optional): Optional number of months in CLV prediction window.
486 |         discount_rate (float, optional): Discount rate. See Lifetimes.
487 |         ggf_penalizer_coef (float, optional): Penalizer coefficient for Gamma-Gamma model. See Lifetimes.
488 |         bgf_penalizer_coef (float, optional): Penalizer coefficient for BG/NBD model. See Lifetimes.
489 | 
490 |     Returns:
491 |         Predicted CLV for each customer.
492 |     """
493 | 
494 |     df_rfmt = _get_lifetimes_rfmt(df_transactions, observation_period_end)
495 |     df_returning = df_rfmt[df_rfmt['frequency'] > 0]
496 |     df_returning = df_returning[df_returning['monetary_value'] > 0]  # filter the returning subset, not df_rfmt
497 | 
498 |     ggf = GammaGammaFitter(penalizer_coef=ggf_penalizer_coef)
499 |     ggf.fit(df_returning['frequency'],
500 |             df_returning['monetary_value'])
501 | 
502 |     bgf = BetaGeoFitter(penalizer_coef=bgf_penalizer_coef)
503 |     bgf.fit(df_returning['frequency'],
504 |             df_returning['recency'],
505 |             df_returning['T'])
506 | 
507 |     preds = ggf.customer_lifetime_value(
508 |         bgf,
509 |         df_returning['frequency'],
510 |         df_returning['recency'],
511 |         df_returning['T'],
512 |         df_returning['monetary_value'],
513 |         time=months,
514 |         discount_rate=discount_rate
515 |     ).to_frame().reset_index()
516 | 
517 |     return preds
518 | 
519 | 
520 | def get_customer_predictions(df_transactions,
521 |                              observation_period_end,
522 |                              days=90,
523 |                              months=3,
524 |                              discount_rate=0.01,
525 |                              ggf_penalizer_coef=0,
526 |                              bgf_penalizer_coef=0):
527 |     """Get predicted customer purchases, AOV, and CLV for the defined period.
528 | 
529 |     This uses the Lifetimes package to run the Gamma-Gamma and BG/NBD models
530 |     and predict the AOV, CLV, and number of purchases each customer will make.
531 |     These models measure RFMT differently to the other functions in
532 |     EcommerceTools, so the underlying values are not directly comparable and
533 |     have been removed from the output.
534 | 
535 |     Args:
536 |         df_transactions (df): Pandas dataframe of transactions from get_transactions()
537 |         observation_period_end (string): Date string in YYYY-MM-DD format for end of observation period.
538 |         days (int, optional): Optional number of days in purchase prediction window.
539 |         months (int, optional): Optional number of months in CLV prediction window.
540 |         discount_rate (float, optional): Discount rate. See Lifetimes.
541 |         ggf_penalizer_coef (float, optional): Penalizer coefficient for Gamma-Gamma model. See Lifetimes.
542 |         bgf_penalizer_coef (float, optional): Penalizer coefficient for BG/NBD model. See Lifetimes.
543 | 
544 |     Returns:
545 |         df_predictions: Pandas dataframe containing predictions from Gamma-Gamma and BG/NBD models.
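
    Example (hypothetical usage; the observation period end date is an
    assumption based on the sample Online Retail dataset):

        >>> predictions = get_customer_predictions(transactions_df,
        ...                                        observation_period_end='2011-12-09',
        ...                                        days=90)
        >>> predictions.head()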
546 |     """
547 | 
548 |     df_predicted_purchases = _get_predicted_purchases(df_transactions,
549 |                                                       observation_period_end,
550 |                                                       days=days)
551 |     df_aov = _get_predicted_aov(df_transactions,
552 |                                 observation_period_end)
553 | 
554 |     df_clv = _get_predicted_clv(df_transactions,
555 |                                 observation_period_end,
556 |                                 months=months,
557 |                                 discount_rate=discount_rate,
558 |                                 bgf_penalizer_coef=bgf_penalizer_coef,
559 |                                 ggf_penalizer_coef=ggf_penalizer_coef
560 |                                 )
561 | 
562 |     df_predictions = df_predicted_purchases.merge(df_aov, on='customer_id', how='left')
563 |     df_predictions = df_predictions.merge(df_clv, on='customer_id', how='left')
564 | 
565 |     return df_predictions[['customer_id', 'predicted_purchases', 'aov', 'clv']]
566 | 
567 | 
--------------------------------------------------------------------------------
/ecommercetools/marketing/__init__.py:
--------------------------------------------------------------------------------
1 | from ecommercetools.marketing.marketing import get_trading_events
2 | from ecommercetools.marketing.marketing import get_trading_calendar
3 | 
4 | 
--------------------------------------------------------------------------------
/ecommercetools/marketing/marketing.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from pandas.tseries.offsets import BDay
3 | from pandas.tseries.holiday import (
4 |     AbstractHolidayCalendar, Holiday, DateOffset, SU, MO, TU, WE, TH, FR, SA, next_monday,
5 |     nearest_workday, sunday_to_monday, EasterMonday, GoodFriday, Easter
6 | )
7 | 
8 | 
9 | class UKEcommerceTradingCalendar(AbstractHolidayCalendar):
10 |     rules = [
11 | 
12 |         # Pay days (approximated as the last working day of each month)
13 |         Holiday('January Pay Day', month=1, day=31, offset=BDay(-1)),
14 |         Holiday('February Pay Day', month=2, day=28, offset=BDay(-1)),
15 |         Holiday('March Pay Day', month=3, day=31, offset=BDay(-1)),
16 |         Holiday('April Pay Day', month=4, day=30, offset=BDay(-1)),
17 |         Holiday('May Pay Day', month=5, day=31, offset=BDay(-1)),
18 |         Holiday('June Pay Day', month=6, day=30, offset=BDay(-1)),
19 |         Holiday('July Pay Day', month=7, day=31, offset=BDay(-1)),
20 |         Holiday('August Pay Day', month=8, day=31, offset=BDay(-1)),
21 |         Holiday('September Pay Day', month=9, day=30, offset=BDay(-1)),
22 |         Holiday('October Pay Day', month=10, day=31, offset=BDay(-1)),
23 |         Holiday('November Pay Day', month=11, day=30, offset=BDay(-1)),
24 |         Holiday('December Pay Day', month=12, day=31, offset=BDay(-1)),
25 | 
26 |         # Seasonal trading events
27 |         Holiday('January sale', month=1, day=1),
28 |         Holiday('Valentine\'s Day [last order date]', month=2, day=14, offset=BDay(-2)),
29 |         Holiday('Valentine\'s Day', month=2, day=14),
30 |         Holiday('Mother\'s Day [last order date]', month=5, day=1, offset=[pd.DateOffset(weekday=SU(2)), BDay(-2)]),
31 |         Holiday('Mother\'s Day', month=5, day=1, offset=pd.DateOffset(weekday=SU(2))),
32 |         Holiday('Father\'s Day [last order date]', month=6, day=1, offset=[pd.DateOffset(weekday=SU(3)), BDay(-2)]),
33 |         Holiday('Father\'s Day', month=6, day=1, offset=pd.DateOffset(weekday=SU(3))),
34 |         Holiday("Black Friday [sale starts]", month=11, day=1, offset=[pd.DateOffset(weekday=SA(4)), BDay(-5)]),
35 |         Holiday('Black Friday', month=11, day=1, offset=pd.DateOffset(weekday=FR(4))),
36 |         Holiday("Cyber Monday", month=11, day=1, offset=[pd.DateOffset(weekday=SA(4)), pd.DateOffset(2)]),
37 |         Holiday('Christmas Day [last order date]', month=12, day=25, offset=BDay(-2)),
38 |         Holiday('Boxing Day sale', month=12, day=26),
39 |     ]
40 | 
41 | 
42 | def _get_dates(start_date, days=365):
43 |     """Get all dates from a start date to a given end date X days ahead.
44 | 
45 |     Args:
46 |         start_date (YYYY-MM-DD): Start date, i.e. 2021-01-01
47 |         days (int, optional): Number of days ahead to include (365 by default)
48 | 
49 |     Returns:
50 |         Dataframe of dates X days ahead of the start date
51 |     """
52 | 
53 |     period = pd.date_range(start_date, periods=days, freq='D')
54 |     df = pd.DataFrame({'date': period})
55 |     return df
56 | 
57 | 
58 | def get_trading_events(start_date, days=365):
59 |     """Calculate and return all trading events from the UK ecommerce trading calendar.
60 | 
61 |     Args:
62 |         start_date (YYYY-MM-DD): Start date, i.e. 2021-01-01
63 |         days (int, optional): Number of days ahead to include (365 by default)
64 | 
65 |     Returns:
66 |         Dataframe of the name and date of each ecommerce trading event.
67 |     """
68 | 
69 |     dates = _get_dates(start_date, days)
70 | 
71 |     calendar = UKEcommerceTradingCalendar()
72 |     start = dates.date.min()
73 |     end = dates.date.max()
74 | 
75 |     events = calendar.holidays(start=start, end=end, return_name=True)
76 |     events = events.reset_index(name='event').rename(columns={'index': 'date'})
77 | 
78 |     return events
79 | 
80 | 
81 | def get_trading_calendar(start_date, days=365):
82 |     """Return a full ecommerce trading calendar for the specified period.
83 | 
84 |     Args:
85 |         start_date (YYYY-MM-DD): Start date, i.e. 2021-01-01
86 |         days (int, optional): Number of days ahead to include (365 by default)
87 | 
88 |     Returns:
89 |         Pandas dataframe containing full calendar of ecommerce trading events.
90 |     """
91 | 
92 |     dates = _get_dates(start_date, days)
93 |     events = get_trading_events(start_date, days)
94 | 
95 |     calendar = dates.merge(events, on='date', how='left').fillna('')
96 |     return calendar
97 | 
--------------------------------------------------------------------------------
/ecommercetools/nlp/__init__.py:
--------------------------------------------------------------------------------
1 | from ecommercetools.nlp.nlp import get_summaries
2 | 
--------------------------------------------------------------------------------
/ecommercetools/nlp/nlp.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from transformers import pipeline
3 | 
4 | 
5 | def get_summary(text,
6 |                 min_length=50,
7 |                 max_length=100,
8 |                 do_sample=False):
9 |     """Return a summary from a piece of text using a transformer model.
10 | 
11 |     Args:
12 |         text (string): String of text to summarize. Will be truncated to the first 1024 characters.
13 |         min_length (int): Minimum length to return.
14 |         max_length (int): Maximum length to return.
15 |         do_sample (boolean, optional): Set to False to generate unique text or True to extract excerpts.
16 | 
17 |     Returns:
18 |         string: Summarized text.
19 |     """
20 | 
21 |     summarizer = pipeline("summarization")
22 |     summary = summarizer(text[:1024],
23 |                          min_length=min_length,
24 |                          max_length=max_length,
25 |                          do_sample=do_sample)
26 |     summary_text = summary[0]['summary_text'].strip().replace(' .', '.')
27 | 
28 |     return summary_text
29 | 
30 | 
31 | def get_summaries(df,
32 |                   text_column,
33 |                   summary_column_name='summary',
34 |                   min_length=50,
35 |                   max_length=100,
36 |                   do_sample=False):
37 |     """Return a summary of each row of a specified dataframe column using a transformer model.
38 | 
39 |     Args:
40 |         df (dataframe): Pandas dataframe containing the text to summarize.
41 |         text_column (string): Name of text column to summarize. Will be truncated to the first 1024 characters.
42 |         summary_column_name (string, optional): Name of summary column.
43 |         min_length (int, optional): Minimum length to return.
44 |         max_length (int, optional): Maximum length to return.
45 | do_sample (boolean, optional): Set to False to generate unique text or True to extract excerpts. 46 | 47 | Returns: 48 | df['summary']: Original dataframe with additional column containing summaries. 49 | """ 50 | 51 | df[summary_column_name] = df.apply(lambda x: get_summary(x[text_column], 52 | min_length=min_length, 53 | max_length=max_length, 54 | do_sample=do_sample), axis=1) 55 | return df 56 | -------------------------------------------------------------------------------- /ecommercetools/operations/__init__.py: -------------------------------------------------------------------------------- 1 | from ecommercetools.operations.operations import get_inventory_classification 2 | -------------------------------------------------------------------------------- /ecommercetools/operations/operations.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from ecommercetools.products import products 3 | 4 | 5 | def _abc_classify_product(percentage): 6 | """Return an ABC classification for a product based on its ranked percentage revenue contribution. 7 | 8 | Args: 9 | percentage (float): Running percentage of revenue contributed by each SKU over a time period. 10 | 11 | Returns: 12 | class (string): ABC class string 13 | """ 14 | 15 | if 0 < percentage <= 80: 16 | return 'A' 17 | elif 80 < percentage <= 90: 18 | return 'B' 19 | else: 20 | return 'C' 21 | 22 | 23 | def get_inventory_classification(transaction_items, days=None, verbose=False): 24 | """Return a Pandas DataFrame of product inventory classification from the transaction items dataframe. 25 | 26 | Args: 27 | transaction_items (object): Pandas DataFrame of transaction items. 28 | days (int, optional): Return data only for products sold in the past X days. 29 | verbose (bool, optional): Displays additional columns of workings when set to True. 30 | 31 | Returns: 32 | products (object): Pandas DataFrame. 
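
    Example (hypothetical usage, assuming a transaction items dataframe in
    the format returned by load_sample_data()):

        >>> from ecommercetools import operations
        >>> inventory_classes = operations.get_inventory_classification(transaction_items)
        >>> inventory_classes.head()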
33 |     """
34 | 
35 |     # Filter to the last X days
36 |     if days:
37 |         products_data = products.get_products(transaction_items, days)
38 |     else:
39 |         products_data = products.get_products(transaction_items)
40 | 
41 |     # Sort the data
42 |     products_data['revenue_total'] = products_data['revenue'].sum()
43 |     products_data = products_data.sort_values(by='revenue', ascending=False)
44 | 
45 |     # ABC inventory classification
46 |     products_data['revenue_cumsum'] = products_data['revenue'].cumsum()
47 |     products_data['revenue_running_percentage'] = (products_data['revenue_cumsum'] / products_data['revenue_total']) * 100
48 |     products_data['abc_class'] = products_data['revenue_running_percentage'].apply(_abc_classify_product)
49 |     products_data['abc_rank'] = products_data['revenue_running_percentage'].rank().astype(int)
50 | 
51 |     if verbose:
52 |         products_data = products_data[['sku', 'abc_class', 'abc_rank', 'revenue',
53 |                                        'revenue_cumsum', 'revenue_total', 'revenue_running_percentage']]
54 |     else:
55 |         products_data = products_data[['sku', 'abc_class', 'abc_rank']]
56 | 
57 |     return products_data
58 | 
--------------------------------------------------------------------------------
/ecommercetools/products/__init__.py:
--------------------------------------------------------------------------------
1 | from ecommercetools.products.products import get_products
2 | from ecommercetools.products.products import get_repurchase_rates
3 | 
4 | 
--------------------------------------------------------------------------------
/ecommercetools/products/products.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from ecommercetools.utilities import tools
3 | 
4 | 
5 | def get_products(transaction_items, days=None):
6 |     """Return a Pandas DataFrame of products from a Pandas DataFrame of transaction items.
7 | 
8 |     Args:
9 |         transaction_items (object): Pandas DataFrame.
10 |         days (int, optional): Select only products sold in the last X days.
11 | 
12 |     Returns:
13 |         products (object): Pandas DataFrame
14 |     """
15 | 
16 |     if days:
17 |         transaction_items = tools.select_last_x_days(transaction_items, 'order_date', days)
18 | 
19 |     transaction_items = transaction_items.assign(line_price=transaction_items['quantity'] * transaction_items['unit_price'])
20 | 
21 |     products = transaction_items.groupby('sku').agg(
22 |         first_order_date=('order_date', 'min'),
23 |         last_order_date=('order_date', 'max'),
24 |         customers=('customer_id', 'nunique'),
25 |         orders=('order_id', 'nunique'),
26 |         items=('quantity', 'sum'),
27 |         revenue=('line_price', 'sum'),
28 |         avg_unit_price=('unit_price', 'mean'),
29 |         avg_quantity=('quantity', 'mean'),
30 |         avg_revenue=('line_price', 'mean')
31 |     ).reset_index()
32 | 
33 |     products['avg_orders'] = round(products['orders'] / products['customers'], 2)
34 |     products['product_tenure'] = (pd.to_datetime('today') - products['first_order_date']).dt.days
35 |     products['product_recency'] = (pd.to_datetime('today') - products['last_order_date']).dt.days
36 |     return products
37 | 
38 | 
39 | def get_repurchase_rate_label(df):
40 |     """Add a label describing the repurchase rate bin.
41 | 
42 |     Args:
43 |         df (object): Pandas DataFrame containing repurchase_rate.
44 | 
45 |     Returns:
46 |     -------
47 |         df (object): Pandas DataFrame with repurchase_rate_label added.
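
    Example (illustrative values):

        >>> df = pd.DataFrame({'repurchase_rate': [0.05, 0.25, 0.45, 0.65, 0.85]})
        >>> get_repurchase_rate_label(df)['repurchase_rate_label'].tolist()
        ['Very low repurchase', 'Low repurchase', 'Moderate repurchase', 'High repurchase', 'Very high repurchase']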
48 | """ 49 | 50 | labels = ['Very low repurchase', 51 | 'Low repurchase', 52 | 'Moderate repurchase', 53 | 'High repurchase', 54 | 'Very high repurchase'] 55 | df['repurchase_rate_label'] = pd.cut(df['repurchase_rate'], 56 | bins=5, 57 | labels=labels) 58 | return df 59 | 60 | 61 | def get_bulk_purchase_rate_label(df): 62 | """Add a label describing the bulk purchase rate bin. 63 | 64 | Args: 65 | df (object): Pandas DataFrame containing bulk_purchase_rate. 66 | 67 | Returns: 68 | ------- 69 | df (object): Pandas DataFrame with bulk_purchase_rate_label added. 70 | """ 71 | 72 | labels = ['Very low bulk', 73 | 'Low bulk', 74 | 'Moderate bulk', 75 | 'High bulk', 76 | 'Very high bulk'] 77 | df['bulk_purchase_rate_label'] = pd.cut(df['bulk_purchase_rate'], 78 | bins=5, 79 | labels=labels) 80 | return df 81 | 82 | 83 | def get_repurchase_rates(df): 84 | """Return repurchase rates and purchase behaviour for each SKU from transaction items data. 85 | 86 | Given a Pandas DataFrame of transactional items, this function returns a Pandas DataFrame 87 | containing the purchase behaviour and repurchase behaviour for each SKU. 88 | 89 | Args: 90 | df (object): Pandas DataFrame. Required columns: sku, order_id, customer_id, quantity, unit_price. 91 | 92 | Returns: 93 | ------- 94 | df (object): Pandas DataFrame. 95 | """ 96 | 97 | # Count the number of times each customer purchased each SKU 98 | df['times_purchased'] = df.groupby(['sku', 'customer_id'])['order_id'].transform('count') 99 | 100 | # Count the number of times the SKU was purchased individually within orders 101 | df['purchased_individually'] = df[df['quantity'] == 1]. \ 102 | groupby('sku')['order_id'].transform('count') 103 | df['purchased_individually'] = df['purchased_individually'].fillna(0) 104 | 105 | # Count the number of times the SKU was purchased once only by customers 106 | df['purchased_once'] = df[df['times_purchased'] == 1]. 
\
107 |         groupby('sku')['order_id'].transform('count')
108 |     df['purchased_once'] = df['purchased_once'].fillna(0)
109 | 
110 |     # Calculate line price
111 |     df['line_price'] = df['unit_price'] * df['quantity']
112 | 
113 |     # Get unique SKUs and count total items, orders, and customers
114 |     df_skus = df.groupby('sku').agg(
115 |         revenue=('line_price', 'sum'),
116 |         items=('quantity', 'sum'),
117 |         orders=('order_id', 'nunique'),
118 |         customers=('customer_id', 'nunique'),
119 |         avg_unit_price=('unit_price', 'mean'),
120 |         avg_line_price=('line_price', 'mean')
121 |     )
122 | 
123 |     # Calculate the average number of units per order
124 |     df_skus = df_skus.assign(avg_items_per_order=(df_skus['items'] / df_skus['orders']))
125 | 
126 |     # Calculate the average number of items per customer
127 |     df_skus = df_skus.assign(avg_items_per_customer=(df_skus['items'] / df_skus['customers']))
128 | 
129 |     # Merge the dataframes (reassign rather than using inplace on a slice)
130 |     df_subset = df[['sku', 'purchased_individually', 'purchased_once']].fillna(0)
131 |     df_subset = df_subset.drop_duplicates('sku', keep='first')
132 |     df_skus = df_skus.merge(df_subset, on='sku', how='left')
133 | 
134 |     # Calculate bulk purchase rates
135 |     df_skus = df_skus.assign(bulk_purchases=(df_skus['orders'] - df_skus['purchased_individually']))
136 |     df_skus = df_skus.assign(bulk_purchase_rate=(df_skus['bulk_purchases'] / df_skus['orders']))
137 | 
138 |     # Calculate repurchase rates
139 |     df_skus = df_skus.assign(repurchases=(df_skus['orders'] - df_skus['purchased_once']))
140 |     df_skus = df_skus.assign(repurchase_rate=(df_skus['repurchases'] / df_skus['orders']))
141 | 
142 |     # Add labels
143 |     df_skus = get_repurchase_rate_label(df_skus)
144 |     df_skus = get_bulk_purchase_rate_label(df_skus)
145 | 
146 |     df_skus['bulk_and_repurchase_label'] = df_skus['repurchase_rate_label'].astype(str) + \
147 |                                            '_' + df_skus['bulk_purchase_rate_label'].astype(str)
148 | 
149 |     return df_skus
150 | 
--------------------------------------------------------------------------------
/ecommercetools/reports/__init__.py:
--------------------------------------------------------------------------------
1 | from ecommercetools.reports.reports import customers_report
2 | from ecommercetools.reports.reports import transactions_report
3 | 
--------------------------------------------------------------------------------
/ecommercetools/reports/reports.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | from ecommercetools import transactions
4 | 
5 | 
6 | def transactions_report(df, frequency='M'):
7 |     """Create a transactions report based on a specified reporting frequency.
8 | 
9 |     Args:
10 |         df (dataframe): Pandas dataframe of transaction items.
11 |         frequency (optional, string, default 'M'): Optional frequency indicator (Y, Q, M, W, D)
12 | 
13 |     Returns:
14 |         df (dataframe): Pandas dataframe of aggregated data for the specified frequency.
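
    Example (hypothetical usage, assuming a transaction items dataframe with
    order_date and line_price columns):

        >>> from ecommercetools import reports
        >>> df_report = reports.transactions_report(transaction_items, frequency='M')
        >>> df_report.head()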
15 |     """
16 | 
17 |     df['year'] = df['order_date'].dt.year
18 |     df['quarter'] = df['order_date'].dt.quarter
19 |     df['year_quarter'] = df['year'].astype(str) + '-' + df['quarter'].astype(str)
20 |     df['month'] = df['order_date'].dt.month
21 |     df['year_month'] = df['order_date'].dt.strftime('%Y-%m')
22 |     df['week'] = df['order_date'].dt.strftime('%W')
23 |     df['year_week'] = df['order_date'].dt.strftime('%Y-%W')
24 |     df['day'] = df['order_date'].dt.strftime('%j')
25 |     df['year_day'] = df['order_date'].dt.strftime('%Y-%j')
26 | 
27 |     if frequency == 'Y':
28 |         group = 'year'
29 |     elif frequency == 'Q':
30 |         group = 'year_quarter'
31 |     elif frequency == 'W':
32 |         group = 'year_week'
33 |     elif frequency == 'D':
34 |         group = 'year_day'
35 |     else:
36 |         group = 'year_month'
37 | 
38 |     df_agg = df.groupby(group).agg(
39 |         customers=('customer_id', 'nunique'),
40 |         orders=('order_id', 'nunique'),
41 |         revenue=('line_price', 'sum'),
42 |         skus=('sku', 'count'),
43 |         units=('quantity', 'sum')
44 |     ).reset_index()
45 | 
46 |     df_agg['avg_order_value'] = round(df_agg['revenue'] / df_agg['orders'], 2)
47 |     df_agg['avg_skus_per_order'] = round(df_agg['skus'] / df_agg['orders'], 2)
48 |     df_agg['avg_units_per_order'] = round(df_agg['units'] / df_agg['orders'], 2)
49 |     df_agg['avg_revenue_per_customer'] = round(df_agg['revenue'] / df_agg['customers'], 2)
50 | 
51 |     return df_agg
52 | 
53 | 
54 | def customers_report(transaction_items_df, frequency='M'):
55 |     """Create a customers report based on a specified reporting frequency.
56 | 
57 |     Args:
58 |         transaction_items_df (dataframe): Pandas dataframe of transaction items.
59 |         frequency (optional, string, default 'M'): Optional frequency indicator (Y, Q, M, W, D)
60 | 
61 |     Returns:
62 |         df (dataframe): Pandas dataframe of aggregated data for the specified frequency.
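
    Example (hypothetical usage, assuming a transaction items dataframe):

        >>> from ecommercetools import reports
        >>> df_report = reports.customers_report(transaction_items, frequency='M')
        >>> df_report.head()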
63 |     """
64 | 
65 |     df = transactions.get_transactions(transaction_items_df)
66 | 
67 |     df['period'] = df['order_date'].dt.strftime('%B, %Y')
68 |     df['year'] = df['order_date'].dt.year
69 |     df['quarter'] = df['order_date'].dt.quarter
70 |     df['year_quarter'] = df['year'].astype(str) + '-' + df['quarter'].astype(str)
71 |     df['month'] = df['order_date'].dt.month
72 |     df['year_month'] = df['order_date'].dt.strftime('%Y-%m')
73 |     df['week'] = df['order_date'].dt.strftime('%W')
74 |     df['year_week'] = df['order_date'].dt.strftime('%Y-%W')
75 |     df['day'] = df['order_date'].dt.strftime('%j')
76 |     df['year_day'] = df['order_date'].dt.strftime('%Y-%j')
77 | 
78 |     if frequency == 'Y':
79 |         group = 'year'
80 |     elif frequency == 'Q':
81 |         group = 'year_quarter'
82 |     elif frequency == 'W':
83 |         group = 'year_week'
84 |     elif frequency == 'D':
85 |         group = 'year_day'
86 |     else:
87 |         group = 'year_month'
88 | 
89 |     df['new_customers'] = np.where(df['order_number'] == 1, 1, 0)
90 | 
91 |     df_agg = df.groupby(group).agg(
92 |         orders=('order_id', 'nunique'),
93 |         customers=('customer_id', 'nunique'),
94 |         new_customers=('new_customers', 'sum'),
95 |     ).reset_index()
96 | 
97 |     df_agg['returning_customers'] = df_agg['customers'] - df_agg['new_customers']
98 |     df_agg['acquisition_rate'] = round((df_agg['new_customers'] / df_agg['customers']) * 100, 2)
99 | 
100 |     return df_agg
101 | 
--------------------------------------------------------------------------------
/ecommercetools/seo/__init__.py:
--------------------------------------------------------------------------------
1 | from ecommercetools.seo.robots import get_sitemaps
2 | from ecommercetools.seo.robots import get_robots
3 | from ecommercetools.seo.sitemaps import get_sitemap
4 | from ecommercetools.seo.google_pagespeed_insights import get_core_web_vitals
5 | from ecommercetools.seo.google_knowledge_graph import get_knowledge_graph
6 | from ecommercetools.seo.google_search_console import query_google_search_console
7 | from ecommercetools.seo.google_search_console import query_google_search_console_compare
8 | from ecommercetools.seo.google_search_console import classify_pages
9 | from ecommercetools.seo.google_autocomplete import google_autocomplete
10 | from ecommercetools.seo.google_search import get_indexed_pages
11 | from ecommercetools.seo.google_search import get_serps
12 | from ecommercetools.seo.scraping import scrape_site
13 | from ecommercetools.seo.testing import seo_test
--------------------------------------------------------------------------------
/ecommercetools/seo/google_autocomplete.py:
--------------------------------------------------------------------------------
1 | """
2 | Get keyword suggestions for a term using Google Autocomplete or Google Suggest.
3 | """
4 | 
5 | import requests
6 | import urllib.parse
7 | import json
8 | import pandas as pd
9 | from requests_html import HTMLSession
10 | 
11 | 
12 | def _get_source(url: str):
13 |     """Return the source code for the provided URL.
14 | 
15 |     Args:
16 |         url (string): URL of the page to scrape.
17 | 
18 |     Returns:
19 |         response (object): HTTP response object from requests_html.
20 |     """
21 | 
22 |     try:
23 |         session = HTMLSession()
24 |         response = session.get(url)
25 |         return response
26 |     except requests.exceptions.RequestException as e:
27 |         print(e)
28 | 
29 | 
30 | def _get_results(query: str):
31 |     """Get the JSON data from a Google Autocomplete query.
32 | 
33 |     Args:
34 |         query (string): Query term, i.e. data science
35 | 
36 |     Returns:
37 |         results (dict): JSON results.
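
    The payload follows the Chrome-style suggest format consumed by
    _format_results() below (values illustrative):

        >>> results = _get_results('data science')
        >>> results[1][:2]  # suggested terms
        ['data science', 'data science course']
        >>> results[4]['google:suggestrelevance'][:2]  # relevance scores
        [1250, 650]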
38 | """ 39 | 40 | query = urllib.parse.quote_plus(query) 41 | response = _get_source("https://suggestqueries.google.com/complete/search?output=chrome&hl=en&q=" + query) 42 | results = json.loads(response.text) 43 | return results 44 | 45 | 46 | def _format_results(results: dict): 47 | """Return formatted dictionary containing term and relevance. 48 | 49 | Args: 50 | results (dict): JSON dictionary of Google Autocomplete results. 51 | 52 | Returns: 53 | suggestions (dict): Formatted dictionary containing term and relevance. 54 | """ 55 | 56 | if results: 57 | suggestions = [] 58 | for index, value in enumerate(results[1]): 59 | suggestion = {'term': value, 'relevance': results[4]['google:suggestrelevance'][index]} 60 | suggestions.append(suggestion) 61 | return suggestions 62 | 63 | 64 | def _get_suggestions(query: str): 65 | """Return results sorted by relevance. 66 | 67 | Args: 68 | query (string): Search term, i.e. data science 69 | 70 | Returns: 71 | results (dict): Sorted dictionary containing term and relevance. 72 | """ 73 | 74 | results = _get_results(query) 75 | results = _format_results(results) 76 | results = sorted(results, key=lambda k: k['relevance'], reverse=True) 77 | return results 78 | 79 | 80 | def _get_expanded_term_suffixes(): 81 | """Return a list of query suffixes to extend Google Autocomplete results. 82 | 83 | Returns: 84 | expanded_term_suffixes (list) 85 | """ 86 | 87 | expanded_term_suffixes = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 88 | 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'] 89 | return expanded_term_suffixes 90 | 91 | 92 | def _get_expanded_term_prefixes(): 93 | """Return a list of query prefixes to extend Google Autocomplete results. 94 | 95 | Returns: 96 | expanded_term_prefixes (list) 97 | """ 98 | 99 | expanded_term_prefixes = ['who is *', 'what is *', 'where is *', 'when can *', 'why is *', 100 | 'how to *', 'best', 'cheap', 'worst', 'is', 'what', 'when', 'why', 101 | 'how', 'who'] 102 | return expanded_term_prefixes 103 | 104 | 105 | def _get_expanded_terms(query: str): 106 | """Return a list of expanded terms, comprising the original query, and the prefixed and suffixed queries. 107 | 108 | Args: 109 | query (string): Query term, i.e. data science 110 | 111 | Returns: 112 | terms (list): List of query terms with suffixes and prefixes. 113 | """ 114 | 115 | expanded_term_prefixes = _get_expanded_term_prefixes() 116 | expanded_term_suffixes = _get_expanded_term_suffixes() 117 | 118 | terms = [query] 119 | 120 | for term in expanded_term_prefixes: 121 | terms.append(term + ' ' + query) 122 | 123 | for term in expanded_term_suffixes: 124 | terms.append(query + ' ' + term) 125 | 126 | return terms 127 | 128 | 129 | def _get_expanded_suggestions(query: str): 130 | """Return the Google Autocomplete suggestions for a query and its prefixed and suffixed versions. 131 | 132 | Args: 133 | query (string): Query term, i.e. data science 134 | 135 | Returns: 136 | all_results (dict): Sorted formatted dictionary of results for each search term. 
137 | """ 138 | 139 | all_results = [] 140 | 141 | expanded_terms = _get_expanded_terms(query) 142 | for term in expanded_terms: 143 | results = _get_results(term) 144 | results = _format_results(results) 145 | all_results = all_results + results 146 | all_results = sorted(all_results, key=lambda k: k['relevance'], reverse=True) 147 | return all_results 148 | 149 | 150 | def google_autocomplete(query: str, include_expanded=True): 151 | """Run a Google Autocomplete / Google Suggest search with optional query expansion. 152 | 153 | Args: 154 | query (string): Query term, i.e. data science 155 | include_expanded (bool, optional): Optional boolean flag. Set to true to add prefixes/suffixes. 156 | 157 | Returns: 158 | df (dataframe): Pandas dataframe containing results. 159 | """ 160 | 161 | if include_expanded: 162 | results = _get_expanded_suggestions(query) 163 | 164 | else: 165 | results = _get_suggestions(query) 166 | 167 | df = pd.DataFrame.from_records(results) 168 | return df 169 | 170 | -------------------------------------------------------------------------------- /ecommercetools/seo/google_knowledge_graph.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import urllib.parse 3 | import json 4 | import pandas as pd 5 | from requests_html import HTMLSession 6 | 7 | 8 | def _get_source(url: str): 9 | """Return the source code for the provided URL. 10 | 11 | Args: 12 | url (string): URL of the page to scrape. 13 | 14 | Returns: 15 | response (object): HTTP response object from requests_html. 16 | """ 17 | 18 | try: 19 | session = HTMLSession() 20 | response = session.get(url) 21 | return response 22 | 23 | except requests.exceptions.RequestException as e: 24 | print(e) 25 | 26 | 27 | def get_knowledge_graph(api_key: str, 28 | query: str, 29 | output="dataframe"): 30 | """Return a Google Knowledge Graph for a given query. 31 | 32 | Args: 33 | api_key (string): Google Knowledge Graph API key. 34 | query (string): Term to search for. 35 | output (string, optional): Output format (dataframe, or json). 36 | 37 | Returns: 38 | response (object): Knowledge Graph response object in JSON format. 39 | """ 40 | 41 | endpoint = 'https://kgsearch.googleapis.com/v1/entities:search' 42 | params = { 43 | 'query': query, 44 | 'limit': 10, 45 | 'indent': True, 46 | 'key': api_key, 47 | } 48 | 49 | url = endpoint + '?' + urllib.parse.urlencode(params) 50 | response = _get_source(url) 51 | 52 | if output == "json": 53 | return json.loads(response.text) 54 | else: 55 | return pd.json_normalize(json.loads(response.text), record_path='itemListElement') 56 | -------------------------------------------------------------------------------- /ecommercetools/seo/google_pagespeed_insights.py: -------------------------------------------------------------------------------- 1 | """ 2 | Fetch Core Web Vitals from the Google PageSpeed Insights API. 3 | """ 4 | 5 | import sys 6 | import json 7 | import urllib.request 8 | import pandas as pd 9 | 10 | 11 | def query_core_web_vitals(key: str, 12 | url: str, 13 | strategy: str = "desktop"): 14 | """Run a Google Page Speed API query to fetch the Core Web Vitals for a URL. 15 | 16 | Args: 17 | key (str): API key for Google Page Speed API. 18 | url (str): URL of the page you wish to check. 19 | strategy (str, optional): Optional strategy (desktop or mobile). 20 | 21 | Returns: 22 | data (json): API response in JSON format. 
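
    Example (hypothetical usage; assumes a valid PageSpeed Insights API key):

        >>> report = query_core_web_vitals(key, 'https://example.com', strategy='mobile')
        >>> report['lighthouseResult']['categories']['performance']['score']
        0.95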
23 | """ 24 | 25 | try: 26 | endpoint = "https://www.googleapis.com/pagespeedonline/v5/runPagespeed" \ 27 | + "?strategy=" + strategy \ 28 | + "&url={}" \ 29 | + "&key=" + key 30 | 31 | response = urllib.request.urlopen(endpoint.format(url)).read().decode('UTF-8') 32 | data = json.loads(response) 33 | return data 34 | except Exception as e: 35 | print("Error: ", e) 36 | sys.exit(1) 37 | 38 | 39 | def save_core_web_vitals(report: dict, 40 | filename: str): 41 | """Save the Core Web Vitals JSON report to file. 42 | 43 | Args: 44 | report (dict): JSON object containing report data. 45 | filename (str): Filename to use for report. 46 | 47 | Returns: 48 | JSON Core Web Vitals report file. 49 | """ 50 | 51 | with open(filename, 'w') as outfile: 52 | json.dump(report, outfile) 53 | 54 | 55 | def parse_core_web_vitals(report: dict): 56 | """Return a dictionary containing the Core Web Vitals from the report. 57 | 58 | Args: 59 | report (dict): JSON dictionary containing report data. 60 | 61 | Return: 62 | data (dict): Dictionary containing the key data. 63 | 64 | """ 65 | 66 | final_url = report['lighthouseResult']['finalUrl'] 67 | fetch_time = report['lighthouseResult']['fetchTime'] 68 | form_factor = report['lighthouseResult']['configSettings']['formFactor'] 69 | overall_score = report["lighthouseResult"]["categories"]["performance"]["score"] * 100 70 | speed_index = report["lighthouseResult"]["audits"]["speed-index"]["score"] * 100 71 | first_meaningful_paint = report["lighthouseResult"]["audits"]["first-meaningful-paint"]["score"] * 100 72 | first_contentful_paint = report["lighthouseResult"]["audits"]["first-contentful-paint"]["score"] * 100 73 | time_to_interactive = report["lighthouseResult"]["audits"]["interactive"]["score"] * 100 74 | total_blocking_time = report["lighthouseResult"]["audits"]["total-blocking-time"]["score"] * 100 75 | cumulative_layout_shift = report["lighthouseResult"]["audits"]["cumulative-layout-shift"]["score"] * 100 76 | 77 | data = { 78 | 'final_url': final_url, 79 | 'fetch_time': fetch_time, 80 | 'form_factor': form_factor, 81 | 'overall_score': overall_score, 82 | 'speed_index': speed_index, 83 | 'first_meaningful_paint': first_meaningful_paint, 84 | 'first_contentful_paint': first_contentful_paint, 85 | 'time_to_interactive': time_to_interactive, 86 | 'total_blocking_time': total_blocking_time, 87 | 'cumulative_layout_shift': cumulative_layout_shift, 88 | } 89 | 90 | return data 91 | 92 | 93 | def get_core_web_vitals(key: str, 94 | urls: list, 95 | strategy: str = "both"): 96 | """Return a Pandas dataframe containing Core Web Vitals for the provided URLs and optional strategy. 97 | 98 | Args: 99 | key (str): API key for Google Page Speed API. 100 | urls (list): URL of the page you wish to check. 101 | strategy (str, optional): Optional strategy (desktop or mobile) or both (default). 102 | 103 | Returns: 104 | df (dataframe): Pandas dataframe containing core web vitals for URL and strategy. 
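
    Example (hypothetical usage; assumes a valid PageSpeed Insights API key):

        >>> urls = ['https://example.com', 'https://example.com/about']
        >>> df = get_core_web_vitals(key, urls, strategy='both')
        >>> df[['final_url', 'form_factor', 'overall_score']].head()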
105 |     """
106 | 
107 |     df = pd.DataFrame(columns=['final_url', 'fetch_time', 'form_factor', 'overall_score',
108 |                                'speed_index', 'first_meaningful_paint', 'first_contentful_paint',
109 |                                'time_to_interactive', 'total_blocking_time', 'cumulative_layout_shift'])
110 | 
111 |     if strategy == "both":
112 | 
113 |         for url in urls:
114 |             report = query_core_web_vitals(key, url, strategy="mobile")
115 |             if report:
116 |                 data = parse_core_web_vitals(report)
117 |                 df = pd.concat([df, pd.DataFrame([data])], ignore_index=True)  # DataFrame.append was removed in pandas 2.0
118 | 
119 |         for url in urls:
120 |             report = query_core_web_vitals(key, url, strategy="desktop")
121 |             if report:
122 |                 data = parse_core_web_vitals(report)
123 |                 df = pd.concat([df, pd.DataFrame([data])], ignore_index=True)
124 | 
125 |     else:
126 |         for url in urls:
127 |             report = query_core_web_vitals(key, url, strategy=strategy)
128 |             if report:
129 |                 data = parse_core_web_vitals(report)
130 |                 df = pd.concat([df, pd.DataFrame([data])], ignore_index=True)
131 | 
132 |     df = df.sort_values(by='final_url')
133 |     return df
134 | 
--------------------------------------------------------------------------------
/ecommercetools/seo/google_search.py:
--------------------------------------------------------------------------------
1 | """
2 | General functions for scraping data from Google search engine results pages.
3 | """
4 | 
5 | import re
6 | import requests
7 | import urllib.parse
8 | import pandas as pd
9 | import numpy as np
10 | from requests_html import HTMLSession
11 | 
12 | 
13 | def _get_source(url: str):
14 |     """Return the source code for the provided URL.
15 | 
16 |     Args:
17 |         url (string): URL of the page to scrape.
18 | 
19 |     Returns:
20 |         response (object): HTTP response object from requests_html.
21 |     """
22 | 
23 |     try:
24 |         session = HTMLSession()
25 |         response = session.get(url)
26 | 
27 |         if response.status_code == 200:
28 |             return response
29 |         elif response.status_code == 429:
30 |             print('Error: Too many requests. Google has temporarily blocked you. Try again later.')
31 |             exit()
32 |         else:
33 |             print('Error: ' + str(response.status_code))
34 |             exit()
35 |     except requests.exceptions.RequestException as e:
36 |         print(e)
37 | 
38 | 
39 | def _get_site_results(url: str):
40 |     """Return the source of a site:url search.
41 | 
42 |     Args:
43 |         url: URL of page to append to site: query
44 | 
45 |     Returns:
46 |         response (str): HTML of page.
47 |     """
48 | 
49 |     try:
50 |         query = urllib.parse.quote_plus(url)
51 |         response = _get_source("https://www.google.co.uk/search?q=site%3A" + query + "&num=100")
52 | 
53 |         return response
54 |     except requests.exceptions.RequestException as e:
55 |         print(e)
56 | 
57 | 
58 | def _parse_site_results(response: str):
59 |     """Parse the HTML of a site:url query and return the number of pages "indexed".
60 | 
61 |     Args:
62 |         response: HTML of site:url query.
63 | 
64 |     Returns:
65 |         indexed (int): Number of pages "indexed".
66 |     """
67 | 
68 |     try:
69 |         if response.html.find("#result-stats", first=True):
70 | 
71 |             string = response.html.find("#result-stats", first=True).text
72 |             if string:
73 |                 # Remove values in parentheses, i.e. (0.31 seconds)
74 |                 string = re.sub(r'\([^)]*\)', '', string)
75 | 
76 |                 # Remove non-numeric characters
77 |                 string = re.sub('[^0-9]', '', string)
78 | 
79 |                 return int(string)  # cast to int so results sort numerically
80 |             else:
81 |                 return 0
82 |     except requests.exceptions.RequestException as e:
83 |         print(e)
84 | 
85 | 
86 | def _count_indexed_pages(url: str):
87 |     """Get the site:url data, parse the response, and return the number of "indexed" pages.
88 | 
89 |     Args:
90 |         url: URL to use in site:url search.
91 | 
92 |     Returns:
93 |         results (int): Number of pages "indexed".
94 |     """
95 | 
96 |     response = _get_site_results(url)
97 |     return _parse_site_results(response)
98 | 
99 | 
100 | def get_indexed_pages(urls: list):
101 |     """Loop through a series of URLs and run site:url searches, then return the number of "indexed" pages.
102 | 
103 |     Args:
104 |         urls (list): List of URLs.
105 | 
106 |     Returns:
107 |         df (dataframe): Pandas dataframe containing URL and number of "indexed" pages.
108 |     """
109 | 
110 |     data = []
111 |     for site in urls:
112 |         site_data = {'url': site, 'indexed_pages': _count_indexed_pages(site)}
113 |         data.append(site_data)
114 |     df = pd.DataFrame.from_records(data)
115 |     df = df.sort_values(by='indexed_pages')
116 |     return df
117 | 
118 | 
119 | def _get_results(query: str):
120 |     """Return the source of a search.
121 | 
122 |     Args:
123 |         query: Search query term.
124 | 
125 |     Returns:
126 |         response (str): HTML of page.
127 |     """
128 | 
129 |     query = urllib.parse.quote_plus(query)
130 |     response = _get_source("https://www.google.co.uk/search?q=" + query + "&num=100")
131 | 
132 |     return response
133 | 
134 | 
135 | def _get_next_page(response, domain="google.co.uk"):
136 |     """Get the URL for the next page of results."""
137 | 
138 |     css_identifier_next = "#pnnext"
139 |     next_page_url = response.html.find(css_identifier_next, first=True).attrs['href']
140 |     next_page = "https://www." + domain + next_page_url
141 | 
142 |     return next_page
143 | 
144 | 
145 | def _parse_search_results(response):
146 |     """Parses the Google Search engine results and returns a list of results.
147 | 
148 |     Note: This function is obviously dependent upon the source code in the Google results.
149 |     Google obfuscates the source of the page to make it more difficult to extract information.
150 |     Extraction classes change from time to time, so there is always a likelihood that this
151 |     function will need to be adjusted with the new class or identifier details.
152 |     In the event of the function failing, please raise a GitHub issue.
153 | 
154 |     Args:
155 |         response: Response object containing the page source code.
156 | 
157 |     Returns:
158 |         list: List of Google search results.
159 |     """
160 | 
161 |     css_identifier_result = ".tF2Cxc"  # The class of the div containing each result, i.e. <div class="tF2Cxc">
162 | css_identifier_title = "h3" # The element containing the title, i.e.