├── Customer_Segmentation.py └── RFM_User_Friendly_Function.py /Customer_Segmentation.py: -------------------------------------------------------------------------------- 1 | # Customer Segmentation using RFM Analytics 2 | 3 | # The data set consists of information obtained from the past shopping behavior of customers who made their last purchases 4 | # via OmniChannel (both online and offline shopping) in 2020 - 2021. 5 | # 12 variables, 19,945 observations 6 | 7 | # master_id: Unique customer id 8 | # order_channel: Channel of the shopping platform where the shopping was made (Android, iOS, Desktop, Mobile) 9 | # last_order_channel: Channel where the last shopping was made. 10 | # first_order_date: The date of the customer's initial purchase. 11 | # last_order_date: The date of the customer's most recent purchase. 12 | # last_order_date_online: The date of the customer's most recent online purchase. 13 | # last_order_date_offline: The date of the customer's most recent offline purchase. 14 | # order_num_total_ever_online: The total number of purchases the customer has made on online platforms. 15 | # order_num_total_ever_offline: The total number of purchases the customer has made on offline platforms. 16 | # customer_value_total_ever_offline: The total amount the customer has spent on offline purchases. 17 | # customer_value_total_ever_online: The total amount the customer has spent on online purchases. 18 | # interested_in_categories_12: The list of categories in which the customer has made purchases in the last 12 months. 

####################################################
### Task 1: Understanding and Preparing the Data ###
####################################################

# Step 1: Load the libraries, configure the pandas display and read the dataset.

import datetime as dt
import pandas as pd


def _three_decimals(value):
    """Format a float with three digits after the decimal point for display."""
    return "%.3f" % value


pd.set_option("display.max_columns", 14)
pd.set_option("display.width", 99)
pd.set_option("display.float_format", _three_decimals)

# Keep the frame exactly as read from disk and do all the work on a copy.
_df_ = pd.read_csv("data_20k.csv")
df = _df_.copy()

# Step 2: First look at the dataset (meant to be run interactively).

df.head()
df.shape
df.dtypes
df.columns
df.describe().T
df.isnull().sum()

# Step 3: Build omnichannel variables by adding the online and offline figures,
# so that each customer's total spending and total order count can be evaluated together.

df["omni_customer_value"] = df["customer_value_total_ever_online"].add(df["customer_value_total_ever_offline"])
df["omni_total_order"] = df["order_num_total_ever_online"].add(df["order_num_total_ever_offline"])
df.head()


# Step 4: Inspect the variable types and convert every column whose name contains "date" to datetime.

df.dtypes

for date_column in [name for name in df.columns if "date" in name]:
    df[date_column] = pd.to_datetime(df[date_column])

df.dtypes


# Step 5.1: Evaluating the distribution of customer count, total product purchased, and total spending across online shopping channels.

# Unique customers, number of online orders and online spending per order channel.
df.groupby("order_channel").agg({"master_id": "nunique",
                                 "order_num_total_ever_online": "sum",
                                 "customer_value_total_ever_online": "sum"})


# Step 5.2: Evaluating the distribution of customer count, total product purchased, and total spending across all (online + offline) shopping channels.

df.groupby("order_channel").agg({"master_id": "nunique",
                                 "omni_total_order": "sum",
                                 "omni_customer_value": "sum"})


# Step 6: Listing the top 10 customers with the highest revenue.

df_top_ten_value = df.sort_values(by="omni_customer_value", ascending=False).head(10)


# Step 7: List the top 10 customers with the most orders.

df_top_ten_order = df.sort_values(by="omni_total_order", ascending=False).head(10)


# Step 8: Functionize the data preparation process.

def data_preparation(dataframe):
    """Add the omnichannel totals to ``dataframe`` and parse its date columns.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the ``customer_value_total_ever_online/offline`` and
        ``order_num_total_ever_online/offline`` columns. Every column whose
        name contains "date" is converted with ``pd.to_datetime``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; it is modified in place.
    """
    dataframe["omni_customer_value"] = (dataframe["customer_value_total_ever_online"]
                                        + dataframe["customer_value_total_ever_offline"])
    dataframe["omni_total_order"] = (dataframe["order_num_total_ever_online"]
                                     + dataframe["order_num_total_ever_offline"])
    # Collect the names first so the frame is not modified while iterating its columns.
    date_columns = [col for col in dataframe.columns if "date" in col]
    for col in date_columns:
        # Write to the frame that was passed in, not to the module-level ``df``.
        dataframe[col] = pd.to_datetime(dataframe[col])
    return dataframe


data_preparation(df)  # to call it.


#######################################
### Task 2: Calculating RFM Metrics ###
#######################################


# Step 1: Recency, Frequency, and Monetary definitions.

# Recency: Represents the time difference between the customer's last purchase date and the analysis date.
# Frequency: Indicates the total number of purchases made by the customer, representing the frequency of purchases.
# Monetary: The total revenue generated by the customer for the company.


# Step 2: Calculate the Recency, Frequency, and Monetary metrics on a customer-specific basis.
# A list comprehension collects the columns that represent dates into "date_var".
# "df[date_var].max()" gives the latest value of each date column and "df[date_var].max().max()"
# gives the latest of those, i.e. the most recent date anywhere in the data. It is kept in
# "latest_date_in_data" for reference when choosing the analysis date.

date_var = [col for col in df.columns if 'date' in col]
latest_date_in_data = df[date_var].max().max()
today_date = dt.datetime(2021, 6, 1)

# Every aggregation reduces a customer's rows to one scalar: days since the last order,
# total number of orders and total spending. "sum" gives the same numbers as before when a
# master_id has a single row, and still works if a customer appears on several rows.
rfm = df.groupby("master_id").agg({"last_order_date": lambda last_order_date: (today_date - last_order_date.max()).days,
                                   "omni_total_order": "sum",
                                   "omni_customer_value": "sum"})


# Step 3: Changing the names of the metrics we've created to recency, frequency, and monetary.

rfm.columns = ["recency", "frequency", "monetary"]
rfm.head()


########################################
### Task 3: Calculating the RF Score ###
########################################

# rank(method="first"): ties in frequency receive distinct ranks in order of appearance, so that qcut() gets unique bin edges.
# We are dividing the recency and frequency variables into 5 segments and labeling them from 1 to 5.
# For recency, smaller values are better, and since the qcut() function sorts values from lower to higher, we label the group with the lowest recency as 5,
# and the second-lowest group as 4, down to 1 for the highest recency.
# For frequency, higher values are better, and due to how the qcut() function works, we label the group with the lowest frequency as 1,
# and the second-lowest as 2, up to 5 for the highest frequency group.


# Recency labels run 5..1 (a low recency is good); frequency labels run 1..5 (a high frequency is good).
recency_labels = list(range(5, 0, -1))
frequency_labels = list(range(1, 6))

rfm["recency_score"] = pd.qcut(rfm["recency"], 5, labels=recency_labels)
frequency_rank = rfm["frequency"].rank(method="first")
rfm["frequency_score"] = pd.qcut(frequency_rank, 5, labels=frequency_labels)

# The RF score is the recency digit followed by the frequency digit, e.g. "54".
recency_digit = rfm["recency_score"].astype(str)
frequency_digit = rfm["frequency_score"].astype(str)
rfm["RF_score"] = recency_digit + frequency_digit
rfm.reset_index(inplace=True)
rfm.head()


########################################################
### Task 4: Defining the RF Score as Segments        ###
########################################################
# "REGEX" stands for "regular expression". Inside square brackets, "-" denotes a range:
# "[1-2]" matches the digit 1 or 2, so r"[1-2][3-4]" matches the scores 13, 14, 23 and 24.
# REGEX and RFM Naming

seg_map = {
    r"[1-2][1-2]": "hibernating",
    r"[1-2][3-4]": "at_risk",
    r"[1-2]5": "cant_loose",
    r"3[1-2]": "about_to_sleep",
    r"33": "need_attention",
    r"[3-4][4-5]": "loyal_customers",
    r"41": "promising",
    r"51": "new_customers",
    r"[4-5][2-3]": "potential_loyalists",
    r"5[4-5]": "champions",
}

rfm["Segment"] = rfm["RF_score"].replace(to_replace=seg_map, regex=True)
rfm.head()


###################################
### Task 5: RFM Use In Practice ###
###################################

# Step 1: Examine and evaluate the averages of recency, frequency, and monetary for the segments.

rfm.groupby("Segment")[["recency", "frequency", "monetary"]].agg(["mean", "count"])


# Step 2: Using RFM analysis, find the relevant profiled customers for the following 2 cases and save their customer IDs as a CSV.

# a. FLO is introducing a new women's shoe brand. The prices of this brand's products are above the general customer preferences.
# Therefore, they want to establish special communication with customers who are interested in promoting the brand and sales.
# The customers for special communication will include loyal customers (champions, loyal_customers) and those within the shopping category,
# including the women's category. Save the customer IDs of these customers to a CSV file.

# Customers in the loyal segments ...
loyal_segment_mask = rfm["Segment"].str.contains("champions|loyal_customers")
rfm1 = rfm.loc[loyal_segment_mask, ["master_id", "Segment"]]

# ... who also shopped in the women's category (rows with a missing category list are excluded).
women_category_mask = df["interested_in_categories_12"].str.contains("KADIN", na=False)
df1 = df.loc[women_category_mask, ["master_id", "interested_in_categories_12"]]

cus_for_new_brand = pd.merge(rfm1, df1, on="master_id", how="inner")
cus_for_new_brand.head()
cus_for_new_brand.to_csv("cus_for_new_brand")


# b. A discount of nearly 40% is planned for men's and children's products.
# Customers who have shown interest in these categories in the past but haven't shopped for a long time,
# as well as customers who are considered 'at risk' (sleeping), and new customers,
# are being targeted for this discount. Save the customer IDs of the appropriate profiled customers to a CSV file.

discount_segment_mask = rfm["Segment"].str.contains("cant_loose|hibernating|new_customers")
rfm2 = rfm.loc[discount_segment_mask, ["master_id", "Segment"]]

men_children_mask = df["interested_in_categories_12"].str.contains("ERKEK|COCUK", na=False)
df2 = df.loc[men_children_mask, ["master_id", "interested_in_categories_12"]]

cus_for_new_brand2 = pd.merge(rfm2, df2, on="master_id", how="inner")
cus_for_new_brand2.head()
cus_for_new_brand2.to_csv("cus_for_new_brand2")

# --------------------------------------------------------------------------------
# /RFM_User_Friendly_Function.py:
# --------------------------------------------------------------------------------
# This function presents a user-friendly, answer-based use for entire "Customer Segmentation Using RFM Analytics".
# Only works on current dataset structure.
import datetime as dt

import pandas as pd

pd.set_option("display.max_columns", 14)
pd.set_option("display.width", 500)

_df_ = pd.read_csv("data_20k.csv")
df = _df_.copy()

# Default analysis date used to compute recency.
today_date = dt.datetime(2021, 6, 1)

# Regex on the two-digit RF score (recency digit, then frequency digit) -> segment name.
_SEG_MAP = {
    r"[1-2][1-2]": "hibernating",
    r"[1-2][3-4]": "at_risk",
    r"[1-2]5": "cant_loose",
    r"3[1-2]": "about_to_sleep",
    r"33": "need_attention",
    r"[3-4][4-5]": "loyal_customers",
    r"41": "promising",
    r"51": "new_customers",
    r"[4-5][2-3]": "potential_loyalists",
    r"5[4-5]": "champions",
}


def _prep_data(dataframe):
    """Add the omnichannel totals and convert the date columns, in place."""
    dataframe["omni_total_order"] = (dataframe["order_num_total_ever_online"]
                                     + dataframe["order_num_total_ever_offline"])
    dataframe["omni_customer_value"] = (dataframe["customer_value_total_ever_offline"]
                                        + dataframe["customer_value_total_ever_online"])
    for col in [name for name in dataframe.columns if "date" in name]:
        dataframe[col] = pd.to_datetime(dataframe[col])
    return dataframe


def _rfm_creating(dataframe, analysis_date):
    """Return one row per master_id with recency (days), frequency and monetary."""
    rfm = dataframe.groupby("master_id").agg(
        {"last_order_date": lambda dates: (analysis_date - dates.max()).days,
         # "sum" reduces each group to a scalar; an identity lambda only works
         # while every master_id has exactly one row.
         "omni_total_order": "sum",
         "omni_customer_value": "sum"})
    rfm.columns = ["recency", "frequency", "monetary"]
    return rfm


def _rfm_segment(rfm):
    """Score recency and frequency from 1 to 5 and attach the segment name."""
    rfm["recency_score"] = pd.qcut(rfm["recency"], 5, labels=[5, 4, 3, 2, 1])
    # rank(method="first") breaks ties so that qcut() receives unique bin edges.
    rfm["frequency_score"] = pd.qcut(rfm["frequency"].rank(method="first"), 5, labels=[1, 2, 3, 4, 5])
    rfm["RF_score"] = rfm["recency_score"].astype(str) + rfm["frequency_score"].astype(str)
    rfm.reset_index(inplace=True)
    rfm["Segment"] = rfm["RF_score"].replace(_SEG_MAP, regex=True)
    return rfm


def _rfm_output(rfm, dataframe):
    """Ask for segments, categories and a file name, then write the matching customers to CSV."""
    segments = input("Segment name (eg. cant_loose or cant_loose|hibernating|new_customers)")
    categories = input("Category name (eg. KADIN or COCUK|ERKEK)")
    # na=False: customers without a category list are treated as "no match"
    # instead of producing a NaN mask that cannot be used for filtering.
    category_mask = dataframe["interested_in_categories_12"].str.contains(categories, na=False)
    dataframe_1 = dataframe[category_mask][["master_id", "interested_in_categories_12"]]
    segment_mask = rfm["Segment"].str.contains(segments, na=False)
    rfm_1 = rfm[segment_mask][["master_id", "Segment"]]
    new_customers = dataframe_1.merge(rfm_1, on="master_id", how="inner")
    file_name = input("Under what name should the file be saved?")
    new_customers.to_csv(f"{file_name}")
    print(f"Your file named {file_name} has been created and saved.")


def main_func(dataframe, analysis_date=None):
    """Run the whole RFM segmentation and export the customers the user asks for.

    The function prepares ``dataframe``, computes the RFM metrics and segments,
    and then asks three questions on standard input:

    1. Segment name
       (For a single segment, enter the segment name as is. For multiple
       segments, list the segment names separated by '|' (pipe character).)
    2. Category name
       (For a single category, enter the category name as is. For multiple
       categories, list the category names separated by '|' (pipe character).)
    3. CSV file name

    The customers that belong to the given segments and have shopped in the
    given categories are written to a CSV file with the given name.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Customer data in the structure of the current dataset. It is modified
        in place: the ``omni_total_order`` and ``omni_customer_value`` columns
        are added and the date columns are converted to datetime.
    analysis_date : datetime.datetime, optional
        Date from which recency is measured. Defaults to the module-level
        ``today_date``.

    Returns
    -------
    None
        The result is the CSV file written to disk.
    """
    if analysis_date is None:
        analysis_date = today_date
    prepared = _prep_data(dataframe)
    rfm = _rfm_segment(_rfm_creating(prepared, analysis_date))
    return _rfm_output(rfm, prepared)


# --------------------------------------------------------------------------------