├── Customer_Segmentation.py
└── RFM_User_Friendly_Function.py


/Customer_Segmentation.py:
--------------------------------------------------------------------------------
  1 | # Customer Segmentation using RFM Analytics
  2 | 
  3 | # The data set consists of information obtained from the past shopping behavior of customers who made their last purchases
  4 | # via OmniChannel (both online and offline shopping) in 2020 - 2021. #
  5 | # 12 variables, 19.945 observations
  6 | 
  7 | # master_id: Unique customer id
  8 | # order_channel: Channel of the shopping platform where the shopping was made (Android, iOS, Desktop, Mobile)
  9 | # last_order_channel: Channel where the last shopping was made.
 10 | # first_order_date: The date of the customer's initial purchase.
 11 | # last_order_date: The date of the customer's most recent purchase.
 12 | # last_order_date_online: The date of the customer's most recent online purchase.
 13 | # last_order_date_offline: The date of the customer's most recent offline purchase.
 14 | # order_num_total_ever_online: The total number of purchases the customer has made on online platforms.
 15 | # order_num_total_ever_offline: The total number of purchases the customer has made on offline platforms.
 16 | # customer_value_total_ever_offline: The total amount the customer has spent on offline purchases.
 17 | # customer_value_total_ever_online: The total amount the customer has spent on online purchases.
 18 | # interested_in_categories_12: The list of categories in which the customer has made purchases in the last 12 months.
 19 | 
 20 | ####################################################
 21 | ### Task 1: Understanding and Preparing the Data ###
 22 | ####################################################
 23 | 
 24 | # Step 1: Reading the dataset, necessary libraries and options
 25 | 
 26 | import datetime as dt
 27 | import pandas as pd
 28 | pd.set_option("display.max_columns", 14)
 29 | pd.set_option('display.width', 99)
 30 | pd.set_option("display.float_format", lambda x: "%.3f" % x)
 31 | 
 32 | _df_ = pd.read_csv("data_20k.csv")
 33 | df = _df_.copy()
 34 | 
 35 | # Step 2: Understanding the dataset
 36 | 
 37 | df.head()
 38 | df.shape
 39 | df.dtypes
 40 | df.columns
 41 | df.describe().T
 42 | df.isnull().sum()
 43 | 
 44 | # Step 3: Creating new variables in the dataset for using Omnichannel for shopping. This way we will be able
 45 | # to sum and evalute the customers both online and offline recency and frequency values together.#
 46 | 
 47 | df["omni_customer_value"] = df["customer_value_total_ever_online"] + df["customer_value_total_ever_offline"]
 48 | df["omni_total_order"] = df["order_num_total_ever_online"] + df["order_num_total_ever_offline"]
 49 | df.head()
 50 | 
 51 | 
 52 | # Step 4: Examination of variable types and converting variables that represent dates to the type datetime.
 53 | 
 54 | df.dtypes
 55 | 
 56 | for col in df.columns:
 57 |     if "date" in col:
 58 |         df[col] = pd.to_datetime(df[col])
 59 | 
 60 | df.dtypes
 61 | 
 62 | 
 63 | # Step 5.1: Evaluating the distribution of customer count, total product purchased, and total spending across online shopping channels.
 64 | 
 65 | df.groupby("order_channel").agg({"master_id": lambda master_id: master_id.nunique(),
 66 |                                  "order_num_total_ever_online": lambda order_num_total_ever_online: order_num_total_ever_online.sum(),
 67 |                                  "customer_value_total_ever_online": lambda customer_value_total_ever_online: customer_value_total_ever_online.sum()})
 68 | 
 69 | 
 70 | # Step 5.2: Evaluating the distribution of customer count, total product purchased, and total spending across all (online + offline) shopping channels.
 71 | 
 72 | df.groupby("order_channel").agg({"master_id": lambda master_id: master_id.nunique(),
 73 |                                  "omni_total_order": lambda omni_total_order: omni_total_order.sum(),
 74 |                                  "omni_customer_value": lambda omni_customer_value: omni_customer_value.sum()})
 75 | 
 76 | 
 77 | # Step 6: Listing the top 10 customers with the highest revenue.
 78 | 
 79 | df_top_ten_value = df.sort_values(by='omni_customer_value', ascending=False).head(10)
 80 | 
 81 | 
 82 | # Step 7: List the top 10 customers with the most orders.
 83 | 
 84 | df_top_ten_order = df.sort_values(by="omni_total_order", ascending=False).head(10)
 85 | 
 86 | 
 87 | # Step 8: Functionize the data preparation process.
 88 | 
 89 | def data_preparation(dataframe):
 90 |     dataframe["omni_customer_value"] = dataframe["customer_value_total_ever_online"] + dataframe["customer_value_total_ever_offline"]
 91 |     dataframe["omni_total_order"] = dataframe["order_num_total_ever_online"] + dataframe["order_num_total_ever_offline"]
 92 |     for col in dataframe.columns:
 93 |         if "date" in col:
 94 |             df[col] = pd.to_datetime(dataframe[col])
 95 |     return dataframe
 96 | 
 97 | data_preparation(df) # to call it.
 98 | 
 99 | 
100 | #######################################
101 | ### Task 2: Calculating RFM Metrics ###
102 | #######################################
103 | 
104 | 
105 | # Step 1: Recency, Frequency, and Monetary definitions.
106 | 
107 | # Recency: Represents the time difference between the customer's last purchase date and the analysis date.
108 | # Frequency: Indicates the total number of purchases made by the customer, representing the frequency of purchases.
109 | # Monetary: The total revenue generated by the customer for the company.
110 | 
111 | 
112 | # Step 2: Calculate the Recency, Frequency, and Monetary metrics on a customer-specific basis.
113 | # We use list comprehension for columns that represent date and define it as "date_var".
114 | # "df[date_var].max()" will result as max values of these variables and "df[date_var].max().max()" will result as max value of these max values.#
115 | 
116 | date_var = [col for col in df.columns if 'date' in col]
117 | df[date_var].max().max()
118 | today_date = dt.datetime(2021, 6, 1)
119 | 
120 | rfm = df.groupby("master_id").agg({"last_order_date": lambda last_order_date: (today_date - last_order_date.max()).days,
121 |                              "omni_total_order": lambda omni_total_order: omni_total_order,
122 |                              "omni_customer_value": lambda omni_customer_value: omni_customer_value})
123 | 
124 | 
125 | # Step 3: Changing the names of the metrics we've created to recency, frequency, and monetary.
126 | 
127 | rfm.columns = ["recency", "frequency", "monetary"]
128 | rfm.head()
129 | 
130 | 
131 | ########################################
132 | ### Task 3: Calculating the RF Score ###
133 | ########################################
134 | 
135 | # rank(method="first"): The "method="first"" expression is used to capture the first value in frequency.
136 | # We are dividing the recency and frequency variables into 5 segments and labeling them from 1 to 5.
137 | # For recency, smaller values are better, and since the qcut() function sorts values from lower to higher, we label the group with the lowest recency as 5,
138 | # and the second-lowest group as 4, down to 1 for the highest recency.
139 | # For frequency, higher values are better, and due to how the qcut() function works, we label the group with the lowest frequency as 1,
140 | # and the second-lowest as 2, up to 5 for the highest frequency group.
141 | 
142 | 
143 | rfm["recency_score"] = pd.qcut(rfm["recency"], 5, labels=[5, 4, 3, 2, 1])
144 | rfm["frequency_score"] = pd.qcut(rfm["frequency"].rank(method="first"), 5, labels=[1, 2, 3, 4, 5])
145 | rfm["RF_score"] = (rfm["recency_score"].astype(str) + rfm["frequency_score"].astype(str))
146 | rfm.reset_index(inplace=True)
147 | rfm.head()
148 | 
149 | 
150 | ########################################################
151 | ### Task 4: Defining the RF Score as Segments ###
152 | ########################################################
153 | # "REGEX" stands for "Regular expression",  The "-" within "[-]" represents the "or" expression.
154 | # REGEX and RFM Naming
155 | 
156 | seg_map = {
157 |     r"[1-2][1-2]": "hibernating",
158 |     r"[1-2][3-4]": "at_risk",
159 |     r"[1-2]5": "cant_loose",
160 |     r"3[1-2]": "about_to_sleep",
161 |     r"33": "need_attention",
162 |     r"[3-4][4-5]": "loyal_customers",
163 |     r"41": "promising",
164 |     r"51": "new_customers",
165 |     r"[4-5][2-3]": "potential_loyalists",
166 |     r"5[4-5]": "champions"
167 | }
168 | 
169 | rfm["Segment"] = rfm["RF_score"].replace(seg_map, regex=True)
170 | rfm.head()
171 | 
172 | 
173 | ###################################
174 | ### Task 5: RFM Use In Practice ###
175 | ###################################
176 | 
177 | # Step 1: Examine and evaluate the averages of recency, frequency, and monetary for the segments.
178 | 
179 | rfm[["Segment", "recency", "frequency", "monetary"]].groupby("Segment").agg(["mean", "count"])
180 | 
181 | 
182 | # Step 2: Using RFM analysis, find the relevant profiled customers for the following 2 cases and save their customer IDs as a CSV.
183 | 
184 | # a. FLO is introducing a new women's shoe brand. The prices of this brand's products are above the general customer preferences.
185 | # Therefore, they want to establish special communication with customers who are interested in promoting the brand and sales.
186 | # The customers for special communication will include loyal customers (champions, loyal_customers) and those within the shopping category,
187 | # including the women's category. Save the customer IDs of these customers to a CSV file.
188 | 
189 | rfm1 = rfm[rfm["Segment"].str.contains("champions|loyal_customers")][["master_id", "Segment"]]
190 | df1 = df[df["interested_in_categories_12"].str.contains("KADIN", na=False)][["master_id", "interested_in_categories_12"]]
191 | cus_for_new_brand = rfm1.merge(df1, on="master_id", how="inner")
192 | cus_for_new_brand.head()
193 | cus_for_new_brand.to_csv("cus_for_new_brand")
194 | 
195 | 
196 | # "b. A discount of nearly 40% is planned for men's and children's products.
197 | # Customers who have shown interest in these categories in the past but haven't shopped for a long time,
198 | # as well as customers who are considered 'at risk' (sleeping), and new customers,
199 | # are being targeted for this discount. Save the customer IDs of the appropriate profiled customers to a CSV file. #
200 | 
201 | rfm2 = rfm[rfm["Segment"].str.contains("cant_loose|hibernating|new_customers")][["master_id", "Segment"]]
202 | df2 = df[(df["interested_in_categories_12"].str.contains("ERKEK|COCUK", na=False))][["master_id", "interested_in_categories_12"]]
203 | cus_for_new_brand2 = rfm2.merge(df2, on="master_id", how="inner")
204 | cus_for_new_brand2.head()
205 | cus_for_new_brand2.to_csv("cus_for_new_brand2")


--------------------------------------------------------------------------------
/RFM_User_Friendly_Function.py:
--------------------------------------------------------------------------------
 1 | # This function presents a user-friendly, answer-based use for entire "Customer Segmentation Using RFM Analytics".
 2 | # Only works on current dataset structure.
 3 | import pandas as pd
 4 | import datetime as dt
 5 | pd.set_option("display.max_columns", 14)
 6 | pd.set_option("display.width", 500)
 7 | 
 8 | _df_ = pd.read_csv("data_20k.csv")
 9 | df = _df_.copy()
10 | 
11 | today_date = dt.datetime(2021, 6, 1)
12 | 
13 | 
14 | def main_func(dataframe):
15 |     """
16 | 
17 |     Parameters
18 |     ----------
19 |     dataframe: specify relevant dataframe.
20 | 
21 |     Returns
22 |     Returns as three questions:
23 |         1. Segment name
24 |             (For a single segment, enter the segment name as is. For multiple segments, list the segment names separated by '|' (pipe character).)
25 |         2. Category name
26 |             (For a single category, enter the category name as is. For multiple categories, list the category names separated by '|' (pipe character).)
27 |         3. CSV file name
28 |     After answering these question, function will return as a .csv file that created by the answers.
29 |     -------
30 | 
31 |     """
32 |     def prep_data(dataframe):
33 |         dataframe["omni_total_order"] = dataframe["order_num_total_ever_online"] + dataframe["order_num_total_ever_offline"]
34 |         dataframe["omni_customer_value"] = dataframe["customer_value_total_ever_offline"] + dataframe[
35 |             "customer_value_total_ever_online"]
36 |         for col in dataframe.columns:
37 |             if "date" in col:
38 |                 dataframe[col] = pd.to_datetime(dataframe[col])
39 |         return rfm_creating(dataframe)
40 |     def rfm_creating(dataframe):
41 |         rfm = dataframe.groupby("master_id").agg(
42 |             {"last_order_date": lambda date: (today_date - date.max()).days,
43 |              "omni_total_order": lambda order: order,
44 |              "omni_customer_value": lambda value: value})
45 |         rfm.columns = ["recency", "frequency", "monetary"]
46 |         return rfm_segment(rfm)
47 |     def rfm_segment(rfm):
48 |         rfm["recency_score"] = pd.qcut(rfm["recency"], 5, labels=[5, 4, 3, 2, 1])
49 |         rfm["frequency_score"] = pd.qcut(rfm["frequency"].rank(method="first"), 5, labels=[1, 2, 3, 4, 5])
50 |         rfm["RF_score"] = (rfm["recency_score"].astype(str) + rfm["frequency_score"].astype(str))
51 |         rfm.reset_index(inplace=True)
52 | 
53 |         seg_map = {
54 |             r"[1-2][1-2]": "hibernating",
55 |             r"[1-2][3-4]": "at_risk",
56 |             r"[1-2]5": "cant_loose",
57 |             r"3[1-2]": "about_to_sleep",
58 |             r"33": "need_attention",
59 |             r"[3-4][4-5]": "loyal_customers",
60 |             r"41": "promising",
61 |             r"51": "new_customers",
62 |             r"[4-5][2-3]": "potential_loyalists",
63 |             r"5[4-5]": "champions"
64 |         }
65 |         rfm["Segment"] = rfm["RF_score"].replace(seg_map, regex=True)
66 |         return RFM_output(rfm, dataframe)
67 |     def RFM_output(rfm, dataframe):
68 |         segments = input("Segment name (eg. cant_loose or cant_loose|hibernating|new_customers)")
69 |         categories = input("Category name (eg. KADIN or COCUK|ERKEK)")
70 |         dataframe_1 = dataframe[dataframe["interested_in_categories_12"].str.contains(f"{categories}")][
71 |             ["master_id", "interested_in_categories_12"]]
72 |         rfm_1 = rfm[rfm["Segment"].str.contains(f"{segments}")][["master_id", "Segment"]]
73 |         new_customers = dataframe_1.merge(rfm_1, on="master_id", how="inner")
74 |         file_name = input("Under what name should the file be saved?")
75 |         new_customers.to_csv(f"{file_name}")
76 |         return print(f"Your file named {file_name} has been created and saved.")
77 |     return prep_data(dataframe)
78 | 
79 | 
80 | 
81 | 


--------------------------------------------------------------------------------