├── cloud_function
├── requirements.txt
├── create_cloud_function.sh
└── main.py
├── readme.md
└── notebooks
├── update_timestamps.ipynb
├── txn_and_features_gen.ipynb
└── Validating_Online_Features_While_Detecting_Fraud.ipynb
/cloud_function/requirements.txt:
--------------------------------------------------------------------------------
1 | pandas
2 | pandas-gbq
--------------------------------------------------------------------------------
/cloud_function/create_cloud_function.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Deploys the timestamp-refresh Cloud Function and creates the Cloud Scheduler
# job that triggers it once a day.
#
# No --source flag is passed to the deploy command, so run this script from
# the directory that holds main.py and requirements.txt.

# Stop at the first failing command (and on unset variables) so the scheduler
# job is never created for a function that failed to deploy.
set -eu

# Pub/Sub topic shared by the function trigger and the scheduler job; keeping
# it in one place guarantees both commands reference the same topic.
TOPIC="feature-timestamp-schedule"

# Deploy `main` from main.py as a function fired by messages on the topic.
gcloud functions deploy feast-update-timestamps \
    --entry-point main \
    --runtime python37 \
    --trigger-resource "${TOPIC}" \
    --trigger-event google.pubsub.topic.publish \
    --timeout 540s

# Publish a message to the topic on the cron schedule "0 22 * * *" (daily at
# 22:00 in the scheduler's default time zone, since --time-zone is not set).
gcloud scheduler jobs create pubsub feast-update-timestamp-job \
    --schedule "0 22 * * *" \
    --topic "${TOPIC}" \
    --message-body "."
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
1 | # Feast Fraud Tutorial
2 |
3 | This repo contains resources backing the Feast Fraud Tutorial. The `notebooks` directory contains the notebooks used to generate the raw data in BigQuery.
4 | `cloud_function` contains the source code for a Cloud Function that updates the timestamps of the tables in BigQuery.
5 |
6 | ### Prerequisites
7 |
8 | To successfully run the primary tutorial in the `notebooks` directory, `Fraud Detection Tutorial`, you
9 | must meet the following requirements:
10 | * A Google Cloud Platform (GCP) account
11 | * Ability to create, modify or delete GCP artifacts:
12 |   * Project ID
13 |   * Bucket name and location on GCP
14 |   * BigQuery Dataset name
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/cloud_function/main.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from datetime import datetime, timedelta
3 |
def update_transactions():
    """Shift all transaction timestamps so the newest one lands on "now".

    Downloads the whole ``feast-oss.fraud_tutorial.transactions`` table, adds
    the gap between the current UTC time and the latest ``timestamp`` value to
    every row, and overwrites the table with the shifted data. The spacing
    between individual transactions is preserved.

    Returns without writing anything when the table has no non-null
    timestamps, because there is no reference point to shift from.
    """
    sql = """
        SELECT *
        FROM `feast-oss.fraud_tutorial.transactions`
    """
    transactions = pd.read_gbq(sql, dialect='standard')

    latest_time = transactions['timestamp'].max()
    if pd.isna(latest_time):
        # Empty table or all-null column: the shift would be NaT and the
        # replace below would overwrite the table with unusable timestamps.
        return

    # The timezone is stripped from latest_time below, so "now" must be naive
    # UTC as well. A naive local-time datetime.now() would skew the shift by
    # the host's UTC offset whenever this runs on a non-UTC machine.
    now_utc = pd.Timestamp.now(tz='UTC').tz_localize(None)
    datediff = now_utc - latest_time.replace(tzinfo=None)

    transactions['timestamp'] = transactions['timestamp'] + datediff
    transactions.to_gbq(
        destination_table="fraud_tutorial.transactions",
        project_id="feast-oss",
        if_exists='replace',
    )
14 |
def update_user_features():
    """Re-stamp every user account feature row as observed seven days ago.

    Downloads the whole ``feast-oss.fraud_tutorial.user_account_features``
    table, sets ``feature_timestamp`` on every row to the current UTC time
    minus seven days, and overwrites the table with the result.
    """
    sql = """
        SELECT *
        FROM `feast-oss.fraud_tutorial.user_account_features`
    """
    user_features = pd.read_gbq(sql, dialect='standard')

    # Naive UTC rather than naive local time: the value is uploaded without an
    # offset, so a local-time datetime.now() would be off by the host's UTC
    # offset on any non-UTC machine.
    now_utc = pd.Timestamp.now(tz='UTC').tz_localize(None)
    user_features['feature_timestamp'] = now_utc - timedelta(days=7)

    user_features.to_gbq(
        destination_table="fraud_tutorial.user_account_features",
        project_id="feast-oss",
        if_exists='replace',
    )
23 |
def update_user_fraud_features():
    """Shift all fraud-feature timestamps so the newest one lands on "now".

    Downloads the whole
    ``feast-oss.fraud_tutorial.user_has_fraudulent_transactions`` table, adds
    the gap between the current UTC time and the latest ``feature_timestamp``
    value to every row, and overwrites the table with the shifted data. The
    spacing between feature snapshots is preserved.

    Returns without writing anything when the table has no non-null
    timestamps, because there is no reference point to shift from.
    """
    sql = """
        SELECT *
        FROM `feast-oss.fraud_tutorial.user_has_fraudulent_transactions`
    """
    user_has_fraud = pd.read_gbq(sql, dialect='standard')

    latest_time = user_has_fraud['feature_timestamp'].max()
    if pd.isna(latest_time):
        # Empty table or all-null column: the shift would be NaT and the
        # replace below would overwrite the table with unusable timestamps.
        return

    # The timezone is stripped from latest_time below, so "now" must be naive
    # UTC as well. A naive local-time datetime.now() would skew the shift by
    # the host's UTC offset whenever this runs on a non-UTC machine.
    now_utc = pd.Timestamp.now(tz='UTC').tz_localize(None)
    datediff = now_utc - latest_time.replace(tzinfo=None)

    user_has_fraud['feature_timestamp'] = (
        user_has_fraud['feature_timestamp'] + datediff
    )
    user_has_fraud.to_gbq(
        destination_table="fraud_tutorial.user_has_fraudulent_transactions",
        project_id="feast-oss",
        if_exists='replace',
    )
34 |
def main(data, context):
    """Entry point: refresh the timestamps of all three tutorial tables.

    Neither ``data`` nor ``context`` is read; they exist only so the function
    matches the two-argument call it receives. The tables are refreshed in a
    fixed order: transactions, user account features, user fraud features.
    """
    refresh_steps = (
        update_transactions,
        update_user_features,
        update_user_fraud_features,
    )
    for refresh in refresh_steps:
        refresh()
39 |
# Run the refresh only when this file is executed directly as a script.
# An unguarded call here would rewrite all three tables every time the module
# is merely imported (for example when a hosting runtime loads it to look up
# the `main` entry point), in addition to the triggered invocation.
if __name__ == "__main__":
    main(1, 1)
--------------------------------------------------------------------------------
/notebooks/update_timestamps.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "metadata": {
3 | "language_info": {
4 | "codemirror_mode": {
5 | "name": "ipython",
6 | "version": 3
7 | },
8 | "file_extension": ".py",
9 | "mimetype": "text/x-python",
10 | "name": "python",
11 | "nbconvert_exporter": "python",
12 | "pygments_lexer": "ipython3",
13 | "version": "3.7.9"
14 | },
15 | "orig_nbformat": 2,
16 | "kernelspec": {
17 | "name": "python3",
18 | "display_name": "Python 3.7.9 64-bit"
19 | },
20 | "interpreter": {
21 | "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49"
22 | }
23 | },
24 | "nbformat": 4,
25 | "nbformat_minor": 2,
26 | "cells": [
27 | {
28 | "cell_type": "code",
29 | "execution_count": 1,
30 | "metadata": {},
31 | "outputs": [
32 | {
33 | "output_type": "stream",
34 | "name": "stderr",
35 | "text": [
36 | "Downloading: 100%|██████████| 100000/100000 [00:05<00:00, 19643.91rows/s]\n"
37 | ]
38 | }
39 | ],
40 | "source": [
41 | "import pandas as pd\n",
42 | "\n",
43 | "sql = \"\"\"\n",
44 | " SELECT *\n",
45 | " FROM `feast-oss.fraud_tutorial.transactions`\n",
46 | "\"\"\"\n",
47 | "\n",
48 | "# Run a Standard SQL query using the environment's default project\n",
49 | "transactions = pd.read_gbq(sql, dialect='standard')\n",
50 | "\n",
51 | "latest_time = transactions['timestamp'].max()"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": 2,
57 | "metadata": {},
58 | "outputs": [
59 | {
60 | "output_type": "execute_result",
61 | "data": {
62 | "text/plain": [
63 | " src_account amount dest_account is_fraud \\\n",
64 | "0 0001mg 3012.44 ydnwlr 0 \n",
65 | "1 0001mg 4431.82 oijv7z 0 \n",
66 | "2 0001mg 3037.60 a6mrvu 0 \n",
67 | "3 0001mg 6322.63 bmihen 0 \n",
68 | "4 0001mg 9981.82 tk53lu 0 \n",
69 | "... ... ... ... ... \n",
70 | "99995 zyvtf8 3609.00 u5s54p 1 \n",
71 | "99996 zz0sgh 6060.71 c97pdy 1 \n",
72 | "99997 zz0sgh 5543.38 dt60g4 1 \n",
73 | "99998 zzrx9o 5031.12 9vo8j7 1 \n",
74 | "99999 zzx65l 9031.58 p6w6un 1 \n",
75 | "\n",
76 | " timestamp \n",
77 | "0 2021-06-16 12:52:25.074517+00:00 \n",
78 | "1 2021-06-13 11:47:21.535700+00:00 \n",
79 | "2 2021-06-11 20:51:21.873945+00:00 \n",
80 | "3 2021-06-11 13:46:35.364700+00:00 \n",
81 | "4 2021-06-08 23:31:54.140277+00:00 \n",
82 | "... ... \n",
83 | "99995 2021-06-12 21:09:53.775954+00:00 \n",
84 | "99996 2021-06-04 11:50:31.591834+00:00 \n",
85 | "99997 2021-06-03 21:48:26.560339+00:00 \n",
86 | "99998 2021-06-14 12:00:42.439961+00:00 \n",
87 | "99999 2021-06-11 13:13:24.071963+00:00 \n",
88 | "\n",
89 | "[100000 rows x 5 columns]"
90 | ],
91 | "text/html": "
\n\n
\n \n \n | \n src_account | \n amount | \n dest_account | \n is_fraud | \n timestamp | \n
\n \n \n \n | 0 | \n 0001mg | \n 3012.44 | \n ydnwlr | \n 0 | \n 2021-06-16 12:52:25.074517+00:00 | \n
\n \n | 1 | \n 0001mg | \n 4431.82 | \n oijv7z | \n 0 | \n 2021-06-13 11:47:21.535700+00:00 | \n
\n \n | 2 | \n 0001mg | \n 3037.60 | \n a6mrvu | \n 0 | \n 2021-06-11 20:51:21.873945+00:00 | \n
\n \n | 3 | \n 0001mg | \n 6322.63 | \n bmihen | \n 0 | \n 2021-06-11 13:46:35.364700+00:00 | \n
\n \n | 4 | \n 0001mg | \n 9981.82 | \n tk53lu | \n 0 | \n 2021-06-08 23:31:54.140277+00:00 | \n
\n \n | ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n
\n \n | 99995 | \n zyvtf8 | \n 3609.00 | \n u5s54p | \n 1 | \n 2021-06-12 21:09:53.775954+00:00 | \n
\n \n | 99996 | \n zz0sgh | \n 6060.71 | \n c97pdy | \n 1 | \n 2021-06-04 11:50:31.591834+00:00 | \n
\n \n | 99997 | \n zz0sgh | \n 5543.38 | \n dt60g4 | \n 1 | \n 2021-06-03 21:48:26.560339+00:00 | \n
\n \n | 99998 | \n zzrx9o | \n 5031.12 | \n 9vo8j7 | \n 1 | \n 2021-06-14 12:00:42.439961+00:00 | \n
\n \n | 99999 | \n zzx65l | \n 9031.58 | \n p6w6un | \n 1 | \n 2021-06-11 13:13:24.071963+00:00 | \n
\n \n
\n
100000 rows × 5 columns
\n
"
92 | },
93 | "metadata": {},
94 | "execution_count": 2
95 | }
96 | ],
97 | "source": [
98 | "from datetime import datetime\n",
99 | "datediff = datetime.now() - latest_time.replace(tzinfo=None)\n",
100 | "\n",
101 | "transactions['timestamp'] = transactions['timestamp'] + datediff\n"
102 | ]
103 | },
104 | {
105 | "cell_type": "code",
106 | "execution_count": 3,
107 | "metadata": {},
108 | "outputs": [
109 | {
110 | "output_type": "stream",
111 | "name": "stderr",
112 | "text": [
113 | "1it [00:11, 11.66s/it]\n"
114 | ]
115 | }
116 | ],
117 | "source": [
118 | "transactions.to_gbq(destination_table=\"fraud_tutorial.transactions\", project_id=\"feast-oss\", if_exists='replace')"
119 | ]
120 | },
121 | {
122 | "cell_type": "code",
123 | "execution_count": 4,
124 | "metadata": {},
125 | "outputs": [
126 | {
127 | "output_type": "stream",
128 | "name": "stderr",
129 | "text": [
130 | "Downloading: 100%|██████████| 9944/9944 [00:00<00:00, 19073.20rows/s]\n"
131 | ]
132 | },
133 | {
134 | "output_type": "execute_result",
135 | "data": {
136 | "text/plain": [
137 | " user_id credit_score account_age_days user_has_2fa_installed \\\n",
138 | "0 41sozr 512 700 0 \n",
139 | "1 h8nr8u 512 157 0 \n",
140 | "2 shid6v 512 509 0 \n",
141 | "3 rbcoqw 512 742 0 \n",
142 | "4 hew545 512 327 0 \n",
143 | "... ... ... ... ... \n",
144 | "9939 nsgtkp 767 891 1 \n",
145 | "9940 4dlidj 767 855 1 \n",
146 | "9941 1z87hk 767 271 1 \n",
147 | "9942 ffqerm 767 829 1 \n",
148 | "9943 elz674 767 783 1 \n",
149 | "\n",
150 | " feature_timestamp \n",
151 | "0 2021-06-09 19:13:46.199693 \n",
152 | "1 2021-06-09 19:13:46.199693 \n",
153 | "2 2021-06-09 19:13:46.199693 \n",
154 | "3 2021-06-09 19:13:46.199693 \n",
155 | "4 2021-06-09 19:13:46.199693 \n",
156 | "... ... \n",
157 | "9939 2021-06-09 19:13:46.199693 \n",
158 | "9940 2021-06-09 19:13:46.199693 \n",
159 | "9941 2021-06-09 19:13:46.199693 \n",
160 | "9942 2021-06-09 19:13:46.199693 \n",
161 | "9943 2021-06-09 19:13:46.199693 \n",
162 | "\n",
163 | "[9944 rows x 5 columns]"
164 | ],
165 | "text/html": "\n\n
\n \n \n | \n user_id | \n credit_score | \n account_age_days | \n user_has_2fa_installed | \n feature_timestamp | \n
\n \n \n \n | 0 | \n 41sozr | \n 512 | \n 700 | \n 0 | \n 2021-06-09 19:13:46.199693 | \n
\n \n | 1 | \n h8nr8u | \n 512 | \n 157 | \n 0 | \n 2021-06-09 19:13:46.199693 | \n
\n \n | 2 | \n shid6v | \n 512 | \n 509 | \n 0 | \n 2021-06-09 19:13:46.199693 | \n
\n \n | 3 | \n rbcoqw | \n 512 | \n 742 | \n 0 | \n 2021-06-09 19:13:46.199693 | \n
\n \n | 4 | \n hew545 | \n 512 | \n 327 | \n 0 | \n 2021-06-09 19:13:46.199693 | \n
\n \n | ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n
\n \n | 9939 | \n nsgtkp | \n 767 | \n 891 | \n 1 | \n 2021-06-09 19:13:46.199693 | \n
\n \n | 9940 | \n 4dlidj | \n 767 | \n 855 | \n 1 | \n 2021-06-09 19:13:46.199693 | \n
\n \n | 9941 | \n 1z87hk | \n 767 | \n 271 | \n 1 | \n 2021-06-09 19:13:46.199693 | \n
\n \n | 9942 | \n ffqerm | \n 767 | \n 829 | \n 1 | \n 2021-06-09 19:13:46.199693 | \n
\n \n | 9943 | \n elz674 | \n 767 | \n 783 | \n 1 | \n 2021-06-09 19:13:46.199693 | \n
\n \n
\n
9944 rows × 5 columns
\n
"
166 | },
167 | "metadata": {},
168 | "execution_count": 4
169 | }
170 | ],
171 | "source": [
172 | "import pandas as pd\n",
173 | "from datetime import datetime, timedelta\n",
174 | "\n",
175 | "sql = \"\"\"\n",
176 | " SELECT *\n",
177 | " FROM `feast-oss.fraud_tutorial.user_account_features`\n",
178 | "\"\"\"\n",
179 | "\n",
180 | "user_features = pd.read_gbq(sql, dialect='standard')\n",
181 | "\n",
182 | "user_features['feature_timestamp'] = datetime.now() - timedelta(days=7)\n",
183 | "\n",
184 | "user_features"
185 | ]
186 | },
187 | {
188 | "cell_type": "code",
189 | "execution_count": 5,
190 | "metadata": {},
191 | "outputs": [
192 | {
193 | "output_type": "stream",
194 | "name": "stderr",
195 | "text": [
196 | "1it [00:02, 2.60s/it]\n"
197 | ]
198 | }
199 | ],
200 | "source": [
201 | "user_features.to_gbq(destination_table=\"fraud_tutorial.user_account_features\", project_id=\"feast-oss\", if_exists='replace')"
202 | ]
203 | },
204 | {
205 | "cell_type": "code",
206 | "execution_count": 7,
207 | "metadata": {},
208 | "outputs": [
209 | {
210 | "output_type": "stream",
211 | "name": "stderr",
212 | "text": [
213 | "Downloading: 100%|██████████| 69608/69608 [00:02<00:00, 29059.20rows/s]\n"
214 | ]
215 | },
216 | {
217 | "output_type": "execute_result",
218 | "data": {
219 | "text/plain": [
220 | " user_id user_has_fraudulent_transactions_7d \\\n",
221 | "0 0001mg 0.0 \n",
222 | "1 00c8mc 0.0 \n",
223 | "2 00gmwi 0.0 \n",
224 | "3 00mbm9 0.0 \n",
225 | "4 00wjqi 0.0 \n",
226 | "... ... ... \n",
227 | "69603 54r2jp 1.0 \n",
228 | "69604 phvjnv 1.0 \n",
229 | "69605 vr9qpk 1.0 \n",
230 | "69606 wija9d 1.0 \n",
231 | "69607 yvkh8e 1.0 \n",
232 | "\n",
233 | " feature_timestamp \n",
234 | "0 2021-06-11 12:56:59.739937+00:00 \n",
235 | "1 2021-06-11 12:56:59.739937+00:00 \n",
236 | "2 2021-06-11 12:56:59.739937+00:00 \n",
237 | "3 2021-06-11 12:56:59.739937+00:00 \n",
238 | "4 2021-06-11 12:56:59.739937+00:00 \n",
239 | "... ... \n",
240 | "69603 2021-06-17 12:56:59.739937+00:00 \n",
241 | "69604 2021-06-17 12:56:59.739937+00:00 \n",
242 | "69605 2021-06-17 12:56:59.739937+00:00 \n",
243 | "69606 2021-06-17 12:56:59.739937+00:00 \n",
244 | "69607 2021-06-17 12:56:59.739937+00:00 \n",
245 | "\n",
246 | "[69608 rows x 3 columns]"
247 | ],
248 | "text/html": "\n\n
\n \n \n | \n user_id | \n user_has_fraudulent_transactions_7d | \n feature_timestamp | \n
\n \n \n \n | 0 | \n 0001mg | \n 0.0 | \n 2021-06-11 12:56:59.739937+00:00 | \n
\n \n | 1 | \n 00c8mc | \n 0.0 | \n 2021-06-11 12:56:59.739937+00:00 | \n
\n \n | 2 | \n 00gmwi | \n 0.0 | \n 2021-06-11 12:56:59.739937+00:00 | \n
\n \n | 3 | \n 00mbm9 | \n 0.0 | \n 2021-06-11 12:56:59.739937+00:00 | \n
\n \n | 4 | \n 00wjqi | \n 0.0 | \n 2021-06-11 12:56:59.739937+00:00 | \n
\n \n | ... | \n ... | \n ... | \n ... | \n
\n \n | 69603 | \n 54r2jp | \n 1.0 | \n 2021-06-17 12:56:59.739937+00:00 | \n
\n \n | 69604 | \n phvjnv | \n 1.0 | \n 2021-06-17 12:56:59.739937+00:00 | \n
\n \n | 69605 | \n vr9qpk | \n 1.0 | \n 2021-06-17 12:56:59.739937+00:00 | \n
\n \n | 69606 | \n wija9d | \n 1.0 | \n 2021-06-17 12:56:59.739937+00:00 | \n
\n \n | 69607 | \n yvkh8e | \n 1.0 | \n 2021-06-17 12:56:59.739937+00:00 | \n
\n \n
\n
69608 rows × 3 columns
\n
"
249 | },
250 | "metadata": {},
251 | "execution_count": 7
252 | }
253 | ],
254 | "source": [
255 | "import pandas as pd\n",
256 | "\n",
257 | "sql = \"\"\"\n",
258 | " SELECT *\n",
259 | " FROM `feast-oss.fraud_tutorial.user_has_fraudulent_transactions`\n",
260 | "\"\"\"\n",
261 | "\n",
262 | "# Run a Standard SQL query using the environment's default project\n",
263 | "user_has_fraud = pd.read_gbq(sql, dialect='standard')\n",
264 | "\n",
265 | "latest_time = user_has_fraud['feature_timestamp'].max()\n",
266 | "\n",
267 | "datediff = datetime.now() - latest_time.replace(tzinfo=None)\n",
268 | "\n",
269 | "user_has_fraud['feature_timestamp'] = user_has_fraud['feature_timestamp'] + datediff"
270 | ]
271 | },
272 | {
273 | "cell_type": "code",
274 | "execution_count": 8,
275 | "metadata": {},
276 | "outputs": [
277 | {
278 | "output_type": "stream",
279 | "name": "stderr",
280 | "text": [
281 | "1it [00:05, 5.72s/it]\n"
282 | ]
283 | }
284 | ],
285 | "source": [
286 | "user_has_fraud.to_gbq(destination_table=\"fraud_tutorial.user_has_fraudulent_transactions\", project_id=\"feast-oss\", if_exists='replace')"
287 | ]
288 | },
289 | {
290 | "cell_type": "code",
291 | "execution_count": null,
292 | "metadata": {},
293 | "outputs": [],
294 | "source": []
295 | }
296 | ]
297 | }
--------------------------------------------------------------------------------
/notebooks/txn_and_features_gen.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import string\n",
10 | "import random"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 61,
16 | "metadata": {},
17 | "outputs": [],
18 | "source": [
19 | "from datetime import datetime, timedelta\n",
20 | "import numpy as np\n",
21 | "\n",
22 | "src_accounts = [user_ids[abs(int(np.random.normal(5000, 2500)))% 10000] for _ in range(100000)]\n",
23 | "amounts = [ round(np.random.uniform (100, 10000), 2) for _ in range(100000)]\n",
24 | "dest_accounts = [''.join(random.choices(string.digits + string.ascii_lowercase, k=6)) for _ in range(100000)]\n",
25 | "is_frauds = [np.random.binomial(1, 0.05) for _ in range(100000)]\n",
26 | "\n",
27 | "timestamps = []\n",
28 | "ts = datetime.now() - timedelta(days=14)\n",
29 | "for i in range(100000):\n",
30 | " timestamps.append(ts)\n",
31 | " ts += timedelta(seconds=np.random.uniform(0, 24))"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": 67,
37 | "metadata": {},
38 | "outputs": [
39 | {
40 | "data": {
41 | "text/html": [
42 | "\n",
43 | "\n",
56 | "
\n",
57 | " \n",
58 | " \n",
59 | " | \n",
60 | " src_account | \n",
61 | " amount | \n",
62 | " dest_account | \n",
63 | " is_fraud | \n",
64 | " timestamp | \n",
65 | "
\n",
66 | " \n",
67 | " \n",
68 | " \n",
69 | " | 0 | \n",
70 | " 7859ge | \n",
71 | " 7078.83 | \n",
72 | " waqmx5 | \n",
73 | " 0 | \n",
74 | " 2021-05-21 08:10:12.039737 | \n",
75 | "
\n",
76 | " \n",
77 | " | 1 | \n",
78 | " yzziue | \n",
79 | " 7851.83 | \n",
80 | " tgx086 | \n",
81 | " 0 | \n",
82 | " 2021-05-21 08:10:24.464622 | \n",
83 | "
\n",
84 | " \n",
85 | " | 2 | \n",
86 | " bgf8nl | \n",
87 | " 6016.44 | \n",
88 | " q0ltxc | \n",
89 | " 0 | \n",
90 | " 2021-05-21 08:10:36.208894 | \n",
91 | "
\n",
92 | " \n",
93 | " | 3 | \n",
94 | " jiaxoq | \n",
95 | " 2573.43 | \n",
96 | " ct01il | \n",
97 | " 0 | \n",
98 | " 2021-05-21 08:10:54.177049 | \n",
99 | "
\n",
100 | " \n",
101 | " | 4 | \n",
102 | " u49qmt | \n",
103 | " 6743.81 | \n",
104 | " 397mqf | \n",
105 | " 0 | \n",
106 | " 2021-05-21 08:11:16.870868 | \n",
107 | "
\n",
108 | " \n",
109 | " | ... | \n",
110 | " ... | \n",
111 | " ... | \n",
112 | " ... | \n",
113 | " ... | \n",
114 | " ... | \n",
115 | "
\n",
116 | " \n",
117 | " | 99995 | \n",
118 | " xqjsd3 | \n",
119 | " 6856.42 | \n",
120 | " 2z3w39 | \n",
121 | " 0 | \n",
122 | " 2021-06-04 05:21:23.834089 | \n",
123 | "
\n",
124 | " \n",
125 | " | 99996 | \n",
126 | " 1tyh8p | \n",
127 | " 8527.89 | \n",
128 | " h5jgwy | \n",
129 | " 0 | \n",
130 | " 2021-06-04 05:21:37.946295 | \n",
131 | "
\n",
132 | " \n",
133 | " | 99997 | \n",
134 | " mfj3xt | \n",
135 | " 4651.57 | \n",
136 | " xvjv67 | \n",
137 | " 0 | \n",
138 | " 2021-06-04 05:21:39.853131 | \n",
139 | "
\n",
140 | " \n",
141 | " | 99998 | \n",
142 | " l0e31n | \n",
143 | " 9771.14 | \n",
144 | " savnzy | \n",
145 | " 0 | \n",
146 | " 2021-06-04 05:22:03.112553 | \n",
147 | "
\n",
148 | " \n",
149 | " | 99999 | \n",
150 | " 782k16 | \n",
151 | " 9749.99 | \n",
152 | " 24bhqv | \n",
153 | " 0 | \n",
154 | " 2021-06-04 05:22:08.142090 | \n",
155 | "
\n",
156 | " \n",
157 | "
\n",
158 | "
100000 rows × 5 columns
\n",
159 | "
"
160 | ],
161 | "text/plain": [
162 | " src_account amount dest_account is_fraud timestamp\n",
163 | "0 7859ge 7078.83 waqmx5 0 2021-05-21 08:10:12.039737\n",
164 | "1 yzziue 7851.83 tgx086 0 2021-05-21 08:10:24.464622\n",
165 | "2 bgf8nl 6016.44 q0ltxc 0 2021-05-21 08:10:36.208894\n",
166 | "3 jiaxoq 2573.43 ct01il 0 2021-05-21 08:10:54.177049\n",
167 | "4 u49qmt 6743.81 397mqf 0 2021-05-21 08:11:16.870868\n",
168 | "... ... ... ... ... ...\n",
169 | "99995 xqjsd3 6856.42 2z3w39 0 2021-06-04 05:21:23.834089\n",
170 | "99996 1tyh8p 8527.89 h5jgwy 0 2021-06-04 05:21:37.946295\n",
171 | "99997 mfj3xt 4651.57 xvjv67 0 2021-06-04 05:21:39.853131\n",
172 | "99998 l0e31n 9771.14 savnzy 0 2021-06-04 05:22:03.112553\n",
173 | "99999 782k16 9749.99 24bhqv 0 2021-06-04 05:22:08.142090\n",
174 | "\n",
175 | "[100000 rows x 5 columns]"
176 | ]
177 | },
178 | "execution_count": 67,
179 | "metadata": {},
180 | "output_type": "execute_result"
181 | }
182 | ],
183 | "source": [
184 | "transactions = pd.DataFrame.from_dict({\n",
185 | " \"src_account\": src_accounts,\n",
186 | " \"amount\": amounts,\n",
187 | " \"dest_account\": dest_accounts,\n",
188 | " \"is_fraud\": is_frauds,\n",
189 | " \"timestamp\": timestamps\n",
190 | "})\n",
191 | "\n",
192 | "transactions = transactions.sort_values(by='timestamp', ascending=False)\n",
193 | "\n",
194 | "transactions"
195 | ]
196 | },
197 | {
198 | "cell_type": "code",
199 | "execution_count": 138,
200 | "metadata": {},
201 | "outputs": [
202 | {
203 | "name": "stderr",
204 | "output_type": "stream",
205 | "text": [
206 | "1it [00:18, 18.60s/it]\n"
207 | ]
208 | }
209 | ],
210 | "source": [
211 | "# For some reason, this loads into BQ sorted by src_account\n",
212 | "# Needed to sort in BQ after this\n",
213 | "transactions.to_gbq(destination_table=\"fraud_tutorial.transactions\", project_id=\"feast-oss\")"
214 | ]
215 | },
216 | {
217 | "cell_type": "code",
218 | "execution_count": 4,
219 | "metadata": {},
220 | "outputs": [
221 | {
222 | "name": "stderr",
223 | "output_type": "stream",
224 | "text": [
225 | "Downloading: 100%|██████████| 100000/100000 [00:04<00:00, 23842.20rows/s]\n"
226 | ]
227 | }
228 | ],
229 | "source": [
230 | "import pandas as pd\n",
231 | "\n",
232 | "sql = \"\"\"\n",
233 | " SELECT *\n",
234 | " FROM `feast-oss.fraud_tutorial.transactions`\n",
235 | "\"\"\"\n",
236 | "\n",
237 | "# Run a Standard SQL query using the environment's default project\n",
238 | "transactions = pd.read_gbq(sql, dialect='standard')"
239 | ]
240 | },
241 | {
242 | "cell_type": "code",
243 | "execution_count": 6,
244 | "metadata": {},
245 | "outputs": [
246 | {
247 | "data": {
248 | "text/html": [
249 | "\n",
250 | "\n",
263 | "
\n",
264 | " \n",
265 | " \n",
266 | " | \n",
267 | " src_account | \n",
268 | " amount | \n",
269 | " dest_account | \n",
270 | " is_fraud | \n",
271 | " timestamp | \n",
272 | "
\n",
273 | " \n",
274 | " \n",
275 | " \n",
276 | " | 0 | \n",
277 | " 782k16 | \n",
278 | " 9749.99 | \n",
279 | " 24bhqv | \n",
280 | " 0 | \n",
281 | " 2021-06-04 05:22:08.142090+00:00 | \n",
282 | "
\n",
283 | " \n",
284 | " | 1 | \n",
285 | " l0e31n | \n",
286 | " 9771.14 | \n",
287 | " savnzy | \n",
288 | " 0 | \n",
289 | " 2021-06-04 05:22:03.112553+00:00 | \n",
290 | "
\n",
291 | " \n",
292 | " | 2 | \n",
293 | " mfj3xt | \n",
294 | " 4651.57 | \n",
295 | " xvjv67 | \n",
296 | " 0 | \n",
297 | " 2021-06-04 05:21:39.853131+00:00 | \n",
298 | "
\n",
299 | " \n",
300 | " | 3 | \n",
301 | " 1tyh8p | \n",
302 | " 8527.89 | \n",
303 | " h5jgwy | \n",
304 | " 0 | \n",
305 | " 2021-06-04 05:21:37.946295+00:00 | \n",
306 | "
\n",
307 | " \n",
308 | " | 4 | \n",
309 | " xqjsd3 | \n",
310 | " 6856.42 | \n",
311 | " 2z3w39 | \n",
312 | " 0 | \n",
313 | " 2021-06-04 05:21:23.834089+00:00 | \n",
314 | "
\n",
315 | " \n",
316 | " | ... | \n",
317 | " ... | \n",
318 | " ... | \n",
319 | " ... | \n",
320 | " ... | \n",
321 | " ... | \n",
322 | "
\n",
323 | " \n",
324 | " | 99995 | \n",
325 | " u49qmt | \n",
326 | " 6743.81 | \n",
327 | " 397mqf | \n",
328 | " 0 | \n",
329 | " 2021-05-21 08:11:16.870868+00:00 | \n",
330 | "
\n",
331 | " \n",
332 | " | 99996 | \n",
333 | " jiaxoq | \n",
334 | " 2573.43 | \n",
335 | " ct01il | \n",
336 | " 0 | \n",
337 | " 2021-05-21 08:10:54.177049+00:00 | \n",
338 | "
\n",
339 | " \n",
340 | " | 99997 | \n",
341 | " bgf8nl | \n",
342 | " 6016.44 | \n",
343 | " q0ltxc | \n",
344 | " 0 | \n",
345 | " 2021-05-21 08:10:36.208894+00:00 | \n",
346 | "
\n",
347 | " \n",
348 | " | 99998 | \n",
349 | " yzziue | \n",
350 | " 7851.83 | \n",
351 | " tgx086 | \n",
352 | " 0 | \n",
353 | " 2021-05-21 08:10:24.464622+00:00 | \n",
354 | "
\n",
355 | " \n",
356 | " | 99999 | \n",
357 | " 7859ge | \n",
358 | " 7078.83 | \n",
359 | " waqmx5 | \n",
360 | " 0 | \n",
361 | " 2021-05-21 08:10:12.039737+00:00 | \n",
362 | "
\n",
363 | " \n",
364 | "
\n",
365 | "
100000 rows × 5 columns
\n",
366 | "
"
367 | ],
368 | "text/plain": [
369 | " src_account amount dest_account is_fraud \\\n",
370 | "0 782k16 9749.99 24bhqv 0 \n",
371 | "1 l0e31n 9771.14 savnzy 0 \n",
372 | "2 mfj3xt 4651.57 xvjv67 0 \n",
373 | "3 1tyh8p 8527.89 h5jgwy 0 \n",
374 | "4 xqjsd3 6856.42 2z3w39 0 \n",
375 | "... ... ... ... ... \n",
376 | "99995 u49qmt 6743.81 397mqf 0 \n",
377 | "99996 jiaxoq 2573.43 ct01il 0 \n",
378 | "99997 bgf8nl 6016.44 q0ltxc 0 \n",
379 | "99998 yzziue 7851.83 tgx086 0 \n",
380 | "99999 7859ge 7078.83 waqmx5 0 \n",
381 | "\n",
382 | " timestamp \n",
383 | "0 2021-06-04 05:22:08.142090+00:00 \n",
384 | "1 2021-06-04 05:22:03.112553+00:00 \n",
385 | "2 2021-06-04 05:21:39.853131+00:00 \n",
386 | "3 2021-06-04 05:21:37.946295+00:00 \n",
387 | "4 2021-06-04 05:21:23.834089+00:00 \n",
388 | "... ... \n",
389 | "99995 2021-05-21 08:11:16.870868+00:00 \n",
390 | "99996 2021-05-21 08:10:54.177049+00:00 \n",
391 | "99997 2021-05-21 08:10:36.208894+00:00 \n",
392 | "99998 2021-05-21 08:10:24.464622+00:00 \n",
393 | "99999 2021-05-21 08:10:12.039737+00:00 \n",
394 | "\n",
395 | "[100000 rows x 5 columns]"
396 | ]
397 | },
398 | "execution_count": 6,
399 | "metadata": {},
400 | "output_type": "execute_result"
401 | }
402 | ],
403 | "source": [
404 | "transactions"
405 | ]
406 | },
407 | {
408 | "cell_type": "code",
409 | "execution_count": 10,
410 | "metadata": {},
411 | "outputs": [],
412 | "source": [
413 | "from datetime import datetime, timedelta\n",
414 | "import pytz\n",
415 | "\n",
416 | "t = transactions\n",
417 | "\n",
418 | "aggregation_end = datetime.now(tz=pytz.UTC)\n",
419 | "\n",
420 | "# last week's transactions\n",
421 | "lwt = t[(t['timestamp'] < aggregation_end ) & (t['timestamp'] >= aggregation_end - timedelta(days=7))]\n",
422 | "\n",
423 | "# users with fraudulent transactions\n",
424 | "fu = lwt[lwt['is_fraud'] == 1]\\\n",
425 | " .drop_duplicates(subset='src_account')[['src_account', 'is_fraud']]\\\n",
426 | " .rename(columns={'is_fraud': 'user_has_fraudulent_transactions_7d'}) \n"
427 | ]
428 | },
429 | {
430 | "cell_type": "code",
431 | "execution_count": 11,
432 | "metadata": {},
433 | "outputs": [
434 | {
435 | "data": {
436 | "text/html": [
437 | "\n",
438 | "\n",
451 | "
\n",
452 | " \n",
453 | " \n",
454 | " | \n",
455 | " src_account | \n",
456 | " user_has_fraudulent_transactions_7d | \n",
457 | " feature_timestamp | \n",
458 | "
\n",
459 | " \n",
460 | " \n",
461 | " \n",
462 | " | 27 | \n",
463 | " zqvbs4 | \n",
464 | " 1 | \n",
465 | " 2021-06-07 12:45:10.214026 | \n",
466 | "
\n",
467 | " \n",
468 | " | 37 | \n",
469 | " a9l0te | \n",
470 | " 1 | \n",
471 | " 2021-06-07 12:45:10.214026 | \n",
472 | "
\n",
473 | " \n",
474 | " | 62 | \n",
475 | " z2lnqe | \n",
476 | " 1 | \n",
477 | " 2021-06-07 12:45:10.214026 | \n",
478 | "
\n",
479 | " \n",
480 | " | 82 | \n",
481 | " xv1ul5 | \n",
482 | " 1 | \n",
483 | " 2021-06-07 12:45:10.214026 | \n",
484 | "
\n",
485 | " \n",
486 | " | 112 | \n",
487 | " 6ua5v6 | \n",
488 | " 1 | \n",
489 | " 2021-06-07 12:45:10.214026 | \n",
490 | "
\n",
491 | " \n",
492 | " | ... | \n",
493 | " ... | \n",
494 | " ... | \n",
495 | " ... | \n",
496 | "
\n",
497 | " \n",
498 | " | 25188 | \n",
499 | " kr123d | \n",
500 | " 1 | \n",
501 | " 2021-06-07 12:45:10.214026 | \n",
502 | "
\n",
503 | " \n",
504 | " | 25243 | \n",
505 | " y7dobz | \n",
506 | " 1 | \n",
507 | " 2021-06-07 12:45:10.214026 | \n",
508 | "
\n",
509 | " \n",
510 | " | 25303 | \n",
511 | " wija9d | \n",
512 | " 1 | \n",
513 | " 2021-06-07 12:45:10.214026 | \n",
514 | "
\n",
515 | " \n",
516 | " | 25375 | \n",
517 | " u269is | \n",
518 | " 1 | \n",
519 | " 2021-06-07 12:45:10.214026 | \n",
520 | "
\n",
521 | " \n",
522 | " | 25378 | \n",
523 | " 8058vz | \n",
524 | " 1 | \n",
525 | " 2021-06-07 12:45:10.214026 | \n",
526 | "
\n",
527 | " \n",
528 | "
\n",
529 | "
1175 rows × 3 columns
\n",
530 | "
"
531 | ],
532 | "text/plain": [
533 | " src_account user_has_fraudulent_transactions_7d \\\n",
534 | "27 zqvbs4 1 \n",
535 | "37 a9l0te 1 \n",
536 | "62 z2lnqe 1 \n",
537 | "82 xv1ul5 1 \n",
538 | "112 6ua5v6 1 \n",
539 | "... ... ... \n",
540 | "25188 kr123d 1 \n",
541 | "25243 y7dobz 1 \n",
542 | "25303 wija9d 1 \n",
543 | "25375 u269is 1 \n",
544 | "25378 8058vz 1 \n",
545 | "\n",
546 | " feature_timestamp \n",
547 | "27 2021-06-07 12:45:10.214026 \n",
548 | "37 2021-06-07 12:45:10.214026 \n",
549 | "62 2021-06-07 12:45:10.214026 \n",
550 | "82 2021-06-07 12:45:10.214026 \n",
551 | "112 2021-06-07 12:45:10.214026 \n",
552 | "... ... \n",
553 | "25188 2021-06-07 12:45:10.214026 \n",
554 | "25243 2021-06-07 12:45:10.214026 \n",
555 | "25303 2021-06-07 12:45:10.214026 \n",
556 | "25375 2021-06-07 12:45:10.214026 \n",
557 | "25378 2021-06-07 12:45:10.214026 \n",
558 | "\n",
559 | "[1175 rows x 3 columns]"
560 | ]
561 | },
562 | "execution_count": 11,
563 | "metadata": {},
564 | "output_type": "execute_result"
565 | }
566 | ],
567 | "source": [
568 | "fu"
569 | ]
570 | },
571 | {
572 | "cell_type": "code",
573 | "execution_count": 23,
574 | "metadata": {},
575 | "outputs": [
576 | {
577 | "data": {
578 | "text/html": [
579 | "\n",
580 | "\n",
593 | "
\n",
594 | " \n",
595 | " \n",
596 | " | \n",
597 | " user_id | \n",
598 | " user_has_fraudulent_transactions_7d | \n",
599 | " feature_timestamp | \n",
600 | "
\n",
601 | " \n",
602 | " \n",
603 | " \n",
604 | " | 0 | \n",
605 | " 782k16 | \n",
606 | " 0.0 | \n",
607 | " 2021-06-07 12:58:28.318652 | \n",
608 | "
\n",
609 | " \n",
610 | " | 1 | \n",
611 | " l0e31n | \n",
612 | " 0.0 | \n",
613 | " 2021-06-07 12:58:28.318652 | \n",
614 | "
\n",
615 | " \n",
616 | " | 2 | \n",
617 | " mfj3xt | \n",
618 | " 1.0 | \n",
619 | " 2021-06-07 12:58:28.318652 | \n",
620 | "
\n",
621 | " \n",
622 | " | 3 | \n",
623 | " 1tyh8p | \n",
624 | " 0.0 | \n",
625 | " 2021-06-07 12:58:28.318652 | \n",
626 | "
\n",
627 | " \n",
628 | " | 4 | \n",
629 | " xqjsd3 | \n",
630 | " 0.0 | \n",
631 | " 2021-06-07 12:58:28.318652 | \n",
632 | "
\n",
633 | " \n",
634 | " | ... | \n",
635 | " ... | \n",
636 | " ... | \n",
637 | " ... | \n",
638 | "
\n",
639 | " \n",
640 | " | 99995 | \n",
641 | " u49qmt | \n",
642 | " 0.0 | \n",
643 | " 2021-06-07 12:58:28.318652 | \n",
644 | "
\n",
645 | " \n",
646 | " | 99996 | \n",
647 | " jiaxoq | \n",
648 | " 0.0 | \n",
649 | " 2021-06-07 12:58:28.318652 | \n",
650 | "
\n",
651 | " \n",
652 | " | 99997 | \n",
653 | " bgf8nl | \n",
654 | " 0.0 | \n",
655 | " 2021-06-07 12:58:28.318652 | \n",
656 | "
\n",
657 | " \n",
658 | " | 99998 | \n",
659 | " yzziue | \n",
660 | " 0.0 | \n",
661 | " 2021-06-07 12:58:28.318652 | \n",
662 | "
\n",
663 | " \n",
664 | " | 99999 | \n",
665 | " 7859ge | \n",
666 | " 0.0 | \n",
667 | " 2021-06-07 12:58:28.318652 | \n",
668 | "
\n",
669 | " \n",
670 | "
\n",
671 | "
100000 rows × 3 columns
\n",
672 | "
"
673 | ],
674 | "text/plain": [
675 | " user_id user_has_fraudulent_transactions_7d feature_timestamp\n",
676 | "0 782k16 0.0 2021-06-07 12:58:28.318652\n",
677 | "1 l0e31n 0.0 2021-06-07 12:58:28.318652\n",
678 | "2 mfj3xt 1.0 2021-06-07 12:58:28.318652\n",
679 | "3 1tyh8p 0.0 2021-06-07 12:58:28.318652\n",
680 | "4 xqjsd3 0.0 2021-06-07 12:58:28.318652\n",
681 | "... ... ... ...\n",
682 | "99995 u49qmt 0.0 2021-06-07 12:58:28.318652\n",
683 | "99996 jiaxoq 0.0 2021-06-07 12:58:28.318652\n",
684 | "99997 bgf8nl 0.0 2021-06-07 12:58:28.318652\n",
685 | "99998 yzziue 0.0 2021-06-07 12:58:28.318652\n",
686 | "99999 7859ge 0.0 2021-06-07 12:58:28.318652\n",
687 | "\n",
688 | "[100000 rows x 3 columns]"
689 | ]
690 | },
691 | "execution_count": 23,
692 | "metadata": {},
693 | "output_type": "execute_result"
694 | }
695 | ],
696 | "source": [
697 | "# feature table\n",
698 | "ff = pd.merge(t, fu, on='src_account',how='left')\\\n",
699 | " .fillna(0)\\\n",
700 | " [['src_account', 'user_has_fraudulent_transactions_7d']].rename(columns={\"src_account\": \"user_id\"})\n",
701 | "\n",
702 | "ff['feature_timestamp'] = datetime.now()\n",
703 | "\n",
704 | "ff"
705 | ]
706 | },
707 | {
708 | "cell_type": "code",
709 | "execution_count": 20,
710 | "metadata": {},
711 | "outputs": [
712 | {
713 | "output_type": "stream",
714 | "name": "stderr",
715 | "text": [
716 | "\n",
717 | "Downloading: 0%| | 0/100000 [00:00, ?rows/s]\u001b[A\n",
718 | "Downloading: 100%|██████████| 100000/100000 [00:03<00:00, 25626.05rows/s]\n",
719 | "generating features as of 2021-06-03 16:42:14.250521+00:00\n",
720 | "\n",
721 | "1it [00:03, 3.78s/it]\n",
722 | "generating features as of 2021-06-04 16:42:14.250521+00:00\n",
723 | "\n",
724 | "1it [00:02, 2.77s/it]\n",
725 | "generating features as of 2021-06-05 16:42:14.250521+00:00\n",
726 | "\n",
727 | "1it [00:02, 2.72s/it]\n",
728 | "generating features as of 2021-06-06 16:42:14.250521+00:00\n",
729 | "\n",
730 | "1it [00:05, 5.13s/it]\n",
731 | "generating features as of 2021-06-07 16:42:14.250521+00:00\n",
732 | "\n",
733 | "1it [00:02, 2.27s/it]\n",
734 | "generating features as of 2021-06-08 16:42:14.250521+00:00\n",
735 | "\n",
736 | "1it [00:02, 2.61s/it]\n",
737 | "\n",
738 | "0it [00:00, ?it/s]\u001b[Agenerating features as of 2021-06-09 16:42:14.250521+00:00\n",
739 | "1it [00:05, 5.17s/it]\n"
740 | ]
741 | }
742 | ],
743 | "source": [
744 | "from datetime import datetime, timedelta\n",
745 | "import pytz\n",
746 | "\n",
747 | "sql = \"\"\"\n",
748 | "SELECT *\n",
749 | "FROM `feast-oss.fraud_tutorial.transactions`\n",
750 | "\"\"\"\n",
751 | "\n",
752 | "t = pd.read_gbq(sql, dialect='standard')\n",
753 | "\n",
754 | "def generate_fraud_features(aggregation_end):\n",
755 | "\n",
756 | " user_ids = t['src_account'].drop_duplicates()\n",
757 | "\n",
758 | " # last week's transactions\n",
759 | " lwt = t[(t['timestamp'] < aggregation_end ) & (t['timestamp'] >= aggregation_end - timedelta(days=7))]\n",
760 | "\n",
761 | " # users with fraudulent transactions\n",
762 | " fu = lwt[lwt['is_fraud'] == 1]\\\n",
763 | " .drop_duplicates(subset='src_account')[['src_account', 'is_fraud']]\\\n",
764 | " .rename(columns={'is_fraud': 'user_has_fraudulent_transactions_7d'}) \\\n",
765 | " # feature table\n",
766 | " ff = pd.merge(user_ids, fu, on='src_account',how='left')\\\n",
767 | " .fillna(0)\\\n",
768 | " [['src_account', 'user_has_fraudulent_transactions_7d']].rename(columns={\"src_account\": \"user_id\"})\n",
769 | "\n",
770 | " ff['feature_timestamp'] = aggregation_end\n",
771 | " ff.to_gbq(destination_table=\"fraud_tutorial.user_has_fraudulent_transactions\", project_id=\"feast-oss\", if_exists='append')\n",
772 | "\n",
773 | "def backfill_features(start_date, interval, iters):\n",
774 | " aggregation_end_date = start_date\n",
775 | " for _ in range(iters):\n",
776 | " print(f\"generating features as of {aggregation_end_date}\")\n",
777 | " generate_fraud_features(aggregation_end=aggregation_end_date)\n",
778 | " aggregation_end_date += interval\n",
779 | " \n",
780 | "ff = backfill_features(\n",
781 | " start_date=datetime.now(tz=pytz.UTC) - timedelta(days=7), \n",
782 | " interval=timedelta(days=1), \n",
783 | " iters=7\n",
784 | ")"
785 | ]
786 | },
787 | {
788 | "cell_type": "code",
789 | "execution_count": 24,
790 | "metadata": {},
791 | "outputs": [
792 | {
793 | "name": "stderr",
794 | "output_type": "stream",
795 | "text": [
796 | "1it [00:06, 6.93s/it]\n"
797 | ]
798 | }
799 | ],
800 | "source": [
801 | "ff.to_gbq(destination_table=\"fraud_tutorial.user_has_fraudulent_transactions\", project_id=\"feast-oss\", if_exists='replace')"
802 | ]
803 | },
804 | {
805 | "cell_type": "code",
806 | "execution_count": 25,
807 | "metadata": {},
808 | "outputs": [
809 | {
810 | "data": {
811 | "text/html": [
812 | "\n",
813 | "\n",
826 | "
\n",
827 | " \n",
828 | " \n",
829 | " | \n",
830 | " user_id | \n",
831 | " credit_score | \n",
832 | " account_age_days | \n",
833 | " user_has_2fa_installed | \n",
834 | " feature_timestamp | \n",
835 | "
\n",
836 | " \n",
837 | " \n",
838 | " \n",
839 | " | 0 | \n",
840 | " 782k16 | \n",
841 | " 626 | \n",
842 | " 799 | \n",
843 | " 1 | \n",
844 | " 2021-06-07 12:59:14.813413 | \n",
845 | "
\n",
846 | " \n",
847 | " | 1 | \n",
848 | " l0e31n | \n",
849 | " 648 | \n",
850 | " 889 | \n",
851 | " 1 | \n",
852 | " 2021-06-07 12:59:14.813418 | \n",
853 | "
\n",
854 | " \n",
855 | " | 2 | \n",
856 | " mfj3xt | \n",
857 | " 603 | \n",
858 | " 383 | \n",
859 | " 1 | \n",
860 | " 2021-06-07 12:59:14.813419 | \n",
861 | "
\n",
862 | " \n",
863 | " | 3 | \n",
864 | " 1tyh8p | \n",
865 | " 808 | \n",
866 | " 701 | \n",
867 | " 0 | \n",
868 | " 2021-06-07 12:59:14.813419 | \n",
869 | "
\n",
870 | " \n",
871 | " | 4 | \n",
872 | " xqjsd3 | \n",
873 | " 351 | \n",
874 | " 428 | \n",
875 | " 0 | \n",
876 | " 2021-06-07 12:59:14.813420 | \n",
877 | "
\n",
878 | " \n",
879 | " | ... | \n",
880 | " ... | \n",
881 | " ... | \n",
882 | " ... | \n",
883 | " ... | \n",
884 | " ... | \n",
885 | "
\n",
886 | " \n",
887 | " | 97279 | \n",
888 | " h1p7lk | \n",
889 | " 518 | \n",
890 | " 407 | \n",
891 | " 1 | \n",
892 | " 2021-06-07 12:59:14.818469 | \n",
893 | "
\n",
894 | " \n",
895 | " | 97325 | \n",
896 | " n120dt | \n",
897 | " 595 | \n",
898 | " 927 | \n",
899 | " 1 | \n",
900 | " 2021-06-07 12:59:14.818470 | \n",
901 | "
\n",
902 | " \n",
903 | " | 97818 | \n",
904 | " txk4ui | \n",
905 | " 583 | \n",
906 | " 872 | \n",
907 | " 1 | \n",
908 | " 2021-06-07 12:59:14.818470 | \n",
909 | "
\n",
910 | " \n",
911 | " | 98870 | \n",
912 | " j72zdi | \n",
913 | " 685 | \n",
914 | " 114 | \n",
915 | " 0 | \n",
916 | " 2021-06-07 12:59:14.818471 | \n",
917 | "
\n",
918 | " \n",
919 | " | 99563 | \n",
920 | " wi10zj | \n",
921 | " 404 | \n",
922 | " 627 | \n",
923 | " 1 | \n",
924 | " 2021-06-07 12:59:14.818471 | \n",
925 | "
\n",
926 | " \n",
927 | "
\n",
928 | "
9944 rows × 5 columns
\n",
929 | "
"
930 | ],
931 | "text/plain": [
932 | " user_id credit_score account_age_days user_has_2fa_installed \\\n",
933 | "0 782k16 626 799 1 \n",
934 | "1 l0e31n 648 889 1 \n",
935 | "2 mfj3xt 603 383 1 \n",
936 | "3 1tyh8p 808 701 0 \n",
937 | "4 xqjsd3 351 428 0 \n",
938 | "... ... ... ... ... \n",
939 | "97279 h1p7lk 518 407 1 \n",
940 | "97325 n120dt 595 927 1 \n",
941 | "97818 txk4ui 583 872 1 \n",
942 | "98870 j72zdi 685 114 0 \n",
943 | "99563 wi10zj 404 627 1 \n",
944 | "\n",
945 | " feature_timestamp \n",
946 | "0 2021-06-07 12:59:14.813413 \n",
947 | "1 2021-06-07 12:59:14.813418 \n",
948 | "2 2021-06-07 12:59:14.813419 \n",
949 | "3 2021-06-07 12:59:14.813419 \n",
950 | "4 2021-06-07 12:59:14.813420 \n",
951 | "... ... \n",
952 | "97279 2021-06-07 12:59:14.818469 \n",
953 | "97325 2021-06-07 12:59:14.818470 \n",
954 | "97818 2021-06-07 12:59:14.818470 \n",
955 | "98870 2021-06-07 12:59:14.818471 \n",
956 | "99563 2021-06-07 12:59:14.818471 \n",
957 | "\n",
958 | "[9944 rows x 5 columns]"
959 | ]
960 | },
961 | "execution_count": 25,
962 | "metadata": {},
963 | "output_type": "execute_result"
964 | }
965 | ],
966 | "source": [
967 | "import numpy as np\n",
968 | "\n",
969 | "user_ids = t['src_account'].drop_duplicates()\n",
970 | "\n",
971 | "user_features = pd.DataFrame.from_dict({\n",
972 | " \"user_id\": user_ids,\n",
973 | " \"credit_score\": [int(np.random.uniform(350, 850)) % 850 for _ in range(9944)],\n",
974 | " \"account_age_days\": [int(np.random.uniform(25, 960)) for _ in range(9944)],\n",
975 | " \"user_has_2fa_installed\": [int(np.random.uniform(0, 2)) for _ in range(9944)],\n",
976 | " \"feature_timestamp\": [datetime.now() for _ in range(9944)]\n",
977 | "})\n",
978 | "\n",
979 | "user_features"
980 | ]
981 | },
982 | {
983 | "cell_type": "code",
984 | "execution_count": 26,
985 | "metadata": {},
986 | "outputs": [
987 | {
988 | "name": "stderr",
989 | "output_type": "stream",
990 | "text": [
991 | "1it [00:05, 5.71s/it]\n"
992 | ]
993 | }
994 | ],
995 | "source": [
996 | "user_features.to_gbq(destination_table=\"fraud_tutorial.user_account_features\", project_id=\"feast-oss\", if_exists='replace')"
997 | ]
998 | },
999 | {
1000 | "cell_type": "code",
1001 | "execution_count": 125,
1002 | "metadata": {},
1003 | "outputs": [
1004 | {
1005 | "name": "stdout",
1006 | "output_type": "stream",
1007 | "text": [
1008 | "1.7273118602433915\n"
1009 | ]
1010 | },
1011 | {
1012 | "data": {
1013 | "text/plain": [
1014 | "1"
1015 | ]
1016 | },
1017 | "execution_count": 125,
1018 | "metadata": {},
1019 | "output_type": "execute_result"
1020 | }
1021 | ],
1022 | "source": [
1023 | "x = np.rando2021-06-04 12:56:23.851782 UTC\n",
1024 | "m.uniform(0, 2)\n",
1025 | "print(x)\n",
1026 | "int(x)"
1027 | ]
1028 | },
1029 | {
1030 | "cell_type": "code",
1031 | "execution_count": 2,
1032 | "metadata": {},
1033 | "outputs": [
1034 | {
1035 | "output_type": "stream",
1036 | "name": "stderr",
1037 | "text": [
1038 | "Downloading: 100%|██████████| 100000/100000 [00:04<00:00, 20677.91rows/s]\n"
1039 | ]
1040 | }
1041 | ],
1042 | "source": [
1043 | "import pandas as pd\n",
1044 | "\n",
1045 | "sql = \"\"\"\n",
1046 | " SELECT *\n",
1047 | " FROM `feast-oss.fraud_tutorial.transactions`\n",
1048 | "\"\"\"\n",
1049 | "\n",
1050 | "# Run a Standard SQL query using the environment's default project\n",
1051 | "transactions = pd.read_gbq(sql, dialect='standard')\n",
1052 | "\n",
1053 | "latest_time = transactions['timestamp'].max()"
1054 | ]
1055 | },
1056 | {
1057 | "cell_type": "code",
1058 | "execution_count": 7,
1059 | "metadata": {},
1060 | "outputs": [
1061 | {
1062 | "output_type": "execute_result",
1063 | "data": {
1064 | "text/plain": [
1065 | " src_account amount dest_account is_fraud \\\n",
1066 | "0 782k16 9749.99 24bhqv 0 \n",
1067 | "1 l0e31n 9771.14 savnzy 0 \n",
1068 | "2 mfj3xt 4651.57 xvjv67 0 \n",
1069 | "3 1tyh8p 8527.89 h5jgwy 0 \n",
1070 | "4 xqjsd3 6856.42 2z3w39 0 \n",
1071 | "... ... ... ... ... \n",
1072 | "99995 u49qmt 6743.81 397mqf 0 \n",
1073 | "99996 jiaxoq 2573.43 ct01il 0 \n",
1074 | "99997 bgf8nl 6016.44 q0ltxc 0 \n",
1075 | "99998 yzziue 7851.83 tgx086 0 \n",
1076 | "99999 7859ge 7078.83 waqmx5 0 \n",
1077 | "\n",
1078 | " timestamp \n",
1079 | "0 2021-06-10 11:53:31.513514+00:00 \n",
1080 | "1 2021-06-10 11:53:26.483977+00:00 \n",
1081 | "2 2021-06-10 11:53:03.224555+00:00 \n",
1082 | "3 2021-06-10 11:53:01.317719+00:00 \n",
1083 | "4 2021-06-10 11:52:47.205513+00:00 \n",
1084 | "... ... \n",
1085 | "99995 2021-05-27 14:42:40.242292+00:00 \n",
1086 | "99996 2021-05-27 14:42:17.548473+00:00 \n",
1087 | "99997 2021-05-27 14:41:59.580318+00:00 \n",
1088 | "99998 2021-05-27 14:41:47.836046+00:00 \n",
1089 | "99999 2021-05-27 14:41:35.411161+00:00 \n",
1090 | "\n",
1091 | "[100000 rows x 5 columns]"
1092 | ],
1093 | "text/html": "\n\n
\n \n \n | \n src_account | \n amount | \n dest_account | \n is_fraud | \n timestamp | \n
\n \n \n \n | 0 | \n 782k16 | \n 9749.99 | \n 24bhqv | \n 0 | \n 2021-06-10 11:53:31.513514+00:00 | \n
\n \n | 1 | \n l0e31n | \n 9771.14 | \n savnzy | \n 0 | \n 2021-06-10 11:53:26.483977+00:00 | \n
\n \n | 2 | \n mfj3xt | \n 4651.57 | \n xvjv67 | \n 0 | \n 2021-06-10 11:53:03.224555+00:00 | \n
\n \n | 3 | \n 1tyh8p | \n 8527.89 | \n h5jgwy | \n 0 | \n 2021-06-10 11:53:01.317719+00:00 | \n
\n \n | 4 | \n xqjsd3 | \n 6856.42 | \n 2z3w39 | \n 0 | \n 2021-06-10 11:52:47.205513+00:00 | \n
\n \n | ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n
\n \n | 99995 | \n u49qmt | \n 6743.81 | \n 397mqf | \n 0 | \n 2021-05-27 14:42:40.242292+00:00 | \n
\n \n | 99996 | \n jiaxoq | \n 2573.43 | \n ct01il | \n 0 | \n 2021-05-27 14:42:17.548473+00:00 | \n
\n \n | 99997 | \n bgf8nl | \n 6016.44 | \n q0ltxc | \n 0 | \n 2021-05-27 14:41:59.580318+00:00 | \n
\n \n | 99998 | \n yzziue | \n 7851.83 | \n tgx086 | \n 0 | \n 2021-05-27 14:41:47.836046+00:00 | \n
\n \n | 99999 | \n 7859ge | \n 7078.83 | \n waqmx5 | \n 0 | \n 2021-05-27 14:41:35.411161+00:00 | \n
\n \n
\n
100000 rows × 5 columns
\n
"
1094 | },
1095 | "metadata": {},
1096 | "execution_count": 7
1097 | }
1098 | ],
1099 | "source": [
1100 | "from datetime import datetime\n",
1101 | "datediff = datetime.now() - latest_time.replace(tzinfo=None)\n",
1102 | "\n",
1103 | "transactions['timestamp'] = transactions['timestamp'] + datediff\n",
1104 | "\n",
1105 | "transactions"
1106 | ]
1107 | },
1108 | {
1109 | "cell_type": "code",
1110 | "execution_count": 9,
1111 | "metadata": {},
1112 | "outputs": [
1113 | {
1114 | "output_type": "stream",
1115 | "name": "stderr",
1116 | "text": [
1117 | "1it [00:09, 9.47s/it]\n"
1118 | ]
1119 | }
1120 | ],
1121 | "source": [
1122 | "transactions.to_gbq(destination_table=\"fraud_tutorial.transactions\", project_id=\"feast-oss\", if_exists='replace')"
1123 | ]
1124 | },
1125 | {
1126 | "cell_type": "code",
1127 | "execution_count": 5,
1128 | "metadata": {},
1129 | "outputs": [
1130 | {
1131 | "output_type": "stream",
1132 | "name": "stderr",
1133 | "text": [
1134 | "Downloading: 100%|██████████| 9944/9944 [00:00<00:00, 12317.33rows/s]\n"
1135 | ]
1136 | },
1137 | {
1138 | "output_type": "execute_result",
1139 | "data": {
1140 | "text/plain": [
1141 | " user_id credit_score account_age_days user_has_2fa_installed \\\n",
1142 | "0 41sozr 512 700 0 \n",
1143 | "1 h8nr8u 512 157 0 \n",
1144 | "2 shid6v 512 509 0 \n",
1145 | "3 rbcoqw 512 742 0 \n",
1146 | "4 hew545 512 327 0 \n",
1147 | "... ... ... ... ... \n",
1148 | "9939 nsgtkp 767 891 1 \n",
1149 | "9940 4dlidj 767 855 1 \n",
1150 | "9941 1z87hk 767 271 1 \n",
1151 | "9942 ffqerm 767 829 1 \n",
1152 | "9943 elz674 767 783 1 \n",
1153 | "\n",
1154 | " feature_timestamp \n",
1155 | "0 2021-06-03 12:11:13.032174 \n",
1156 | "1 2021-06-03 12:11:13.032174 \n",
1157 | "2 2021-06-03 12:11:13.032174 \n",
1158 | "3 2021-06-03 12:11:13.032174 \n",
1159 | "4 2021-06-03 12:11:13.032174 \n",
1160 | "... ... \n",
1161 | "9939 2021-06-03 12:11:13.032174 \n",
1162 | "9940 2021-06-03 12:11:13.032174 \n",
1163 | "9941 2021-06-03 12:11:13.032174 \n",
1164 | "9942 2021-06-03 12:11:13.032174 \n",
1165 | "9943 2021-06-03 12:11:13.032174 \n",
1166 | "\n",
1167 | "[9944 rows x 5 columns]"
1168 | ],
1169 | "text/html": "\n\n
\n \n \n | \n user_id | \n credit_score | \n account_age_days | \n user_has_2fa_installed | \n feature_timestamp | \n
\n \n \n \n | 0 | \n 41sozr | \n 512 | \n 700 | \n 0 | \n 2021-06-03 12:11:13.032174 | \n
\n \n | 1 | \n h8nr8u | \n 512 | \n 157 | \n 0 | \n 2021-06-03 12:11:13.032174 | \n
\n \n | 2 | \n shid6v | \n 512 | \n 509 | \n 0 | \n 2021-06-03 12:11:13.032174 | \n
\n \n | 3 | \n rbcoqw | \n 512 | \n 742 | \n 0 | \n 2021-06-03 12:11:13.032174 | \n
\n \n | 4 | \n hew545 | \n 512 | \n 327 | \n 0 | \n 2021-06-03 12:11:13.032174 | \n
\n \n | ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n
\n \n | 9939 | \n nsgtkp | \n 767 | \n 891 | \n 1 | \n 2021-06-03 12:11:13.032174 | \n
\n \n | 9940 | \n 4dlidj | \n 767 | \n 855 | \n 1 | \n 2021-06-03 12:11:13.032174 | \n
\n \n | 9941 | \n 1z87hk | \n 767 | \n 271 | \n 1 | \n 2021-06-03 12:11:13.032174 | \n
\n \n | 9942 | \n ffqerm | \n 767 | \n 829 | \n 1 | \n 2021-06-03 12:11:13.032174 | \n
\n \n | 9943 | \n elz674 | \n 767 | \n 783 | \n 1 | \n 2021-06-03 12:11:13.032174 | \n
\n \n
\n
9944 rows × 5 columns
\n
"
1170 | },
1171 | "metadata": {},
1172 | "execution_count": 5
1173 | }
1174 | ],
1175 | "source": [
1176 | "import pandas as pd\n",
1177 | "from datetime import datetime, timedelta\n",
1178 | "\n",
1179 | "sql = \"\"\"\n",
1180 | " SELECT *\n",
1181 | " FROM `feast-oss.fraud_tutorial.user_account_features`\n",
1182 | "\"\"\"\n",
1183 | "\n",
1184 | "user_features = pd.read_gbq(sql, dialect='standard')\n",
1185 | "\n",
1186 | "user_features['feature_timestamp'] = datetime.now() - timedelta(days=7)\n",
1187 | "\n",
1188 | "user_features"
1189 | ]
1190 | },
1191 | {
1192 | "cell_type": "code",
1193 | "execution_count": 6,
1194 | "metadata": {},
1195 | "outputs": [
1196 | {
1197 | "output_type": "stream",
1198 | "name": "stderr",
1199 | "text": [
1200 | "1it [00:04, 4.62s/it]\n"
1201 | ]
1202 | }
1203 | ],
1204 | "source": [
1205 | "user_features.to_gbq(destination_table=\"fraud_tutorial.user_account_features\", project_id=\"feast-oss\", if_exists='replace')"
1206 | ]
1207 | },
1208 | {
1209 | "cell_type": "code",
1210 | "execution_count": null,
1211 | "metadata": {},
1212 | "outputs": [],
1213 | "source": []
1214 | }
1215 | ],
1216 | "metadata": {
1217 | "kernelspec": {
1218 | "name": "python379jvsc74a57bd0aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49",
1219 | "display_name": "Python 3.7.9 64-bit"
1220 | },
1221 | "language_info": {
1222 | "codemirror_mode": {
1223 | "name": "ipython",
1224 | "version": 3
1225 | },
1226 | "file_extension": ".py",
1227 | "mimetype": "text/x-python",
1228 | "name": "python",
1229 | "nbconvert_exporter": "python",
1230 | "pygments_lexer": "ipython3",
1231 | "version": "3.7.9"
1232 | },
1233 | "metadata": {
1234 | "interpreter": {
1235 | "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49"
1236 | }
1237 | }
1238 | },
1239 | "nbformat": 4,
1240 | "nbformat_minor": 2
1241 | }
--------------------------------------------------------------------------------
/notebooks/Validating_Online_Features_While_Detecting_Fraud.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "id": "jirdTjhETQW0"
7 | },
8 | "source": [
9 | "# Introduction\n",
10 | "\n",
11 | "In this tutorial we will extend the previously developed fraud prediction system by adding Data Quality Monitoring for online features.\n",
12 | "\n",
13 | "If you haven't already please check out these two tutorials first:\n",
14 | "1. [Fraud Detection (with BigQuery and Datastore)](https://github.com/feast-dev/feast-gcp-fraud-tutorial/blob/main/notebooks/Fraud_Detection_Tutorial.ipynb)\n",
15 | "2. [Validation of historical features with Great Expectations](https://docs.feast.dev/tutorials/validating-historical-features)\n",
16 | "\n",
17 | "Throughout this tutorial, we’ll briefly revisit the setup of the feature store for the fraud detection system, then walk through the creation of validation expectations and the configuration of online feature logging, and finally see how to apply validation in production.\n",
18 | "\n",
19 | "*The need to revisit the system described in the [previous tutorial](https://github.com/feast-dev/feast-gcp-fraud-tutorial/blob/main/notebooks/Fraud_Detection_Tutorial.ipynb) is caused by the fact that the Go feature server, which can produce the feature logs used in validation, currently supports only the Redis online store, whereas the previous tutorial was using Datastore.*\n",
20 | "\n",
21 | "Here's a high-level diagram describing the data flow in the DQM pipeline:\n",
22 | "\n",
23 | "\n"
24 | ]
25 | },
26 | {
27 | "cell_type": "markdown",
28 | "metadata": {
29 | "id": "2qipWwfSnrjK"
30 | },
31 | "source": [
32 | ""
33 | ]
34 | },
35 | {
36 | "cell_type": "markdown",
37 | "metadata": {},
38 | "source": [
39 | "It all starts with generating a training dataset (by pulling historical features from an offline store). This training dataset serves as a validation reference. When a data scientist develops a model, a set of implicit expectations about the training dataset emerges. Those expectations should first be met by the training dataset itself; only then can we apply them to the online features in production.\n",
40 | "\n",
41 | "Hence, in the second step the data scientist explores the dataset and develops or formalizes those expectations with the help of the [Great Expectations library](https://docs.greatexpectations.io/docs/). Those expectations can be checked right away against the training dataset, and only those that pass on it will be added to a reference profile. A reference profile is a set of expectations that can be serialized and later checked against a tested dataset without the need to load the training dataset again.\n",
42 | "\n",
43 | "At the evaluation stage, the tested dataset is loaded from storage and validated against the reference profile."
44 | ]
45 | },
46 | {
47 | "cell_type": "markdown",
48 | "metadata": {},
49 | "source": [
50 | "What you'll need for this tutorial:\n",
51 | "1. GCP Account with access to BigQuery\n",
52 | "2. Redis server (accessible locally)\n",
53 | "3. (for Windows / Mac M1 users) an installed Go compiler (>= 1.17) to build the parts of Feast written in Go "
54 | ]
55 | },
56 | {
57 | "cell_type": "markdown",
58 | "metadata": {},
59 | "source": [
60 | "# Part I (feature store for fraud detection system on Redis)"
61 | ]
62 | },
63 | {
64 | "cell_type": "markdown",
65 | "metadata": {},
66 | "source": [
67 | "So let's first recall the basics of the feature store creation."
68 | ]
69 | },
70 | {
71 | "cell_type": "markdown",
72 | "metadata": {
73 | "id": "WX6daujBVgZ5"
74 | },
75 | "source": [
76 | "## Installation and set up\n",
77 | "\n",
78 | "### Install Feast\n",
79 | "\n",
80 | "Feast can be installed using pip. This installation includes a Python package as well as a CLI.\n",
81 | "\n",
82 | "Feast contains some packages which conflict with the default versions installed in Colab. **After running this cell, restart the runtime to continue** (Runtime > Restart runtime.)\n"
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": null,
88 | "metadata": {
89 | "colab": {
90 | "base_uri": "https://localhost:8080/"
91 | },
92 | "id": "S51NR-oPVsjg",
93 | "outputId": "81fa5d76-1641-4c2f-a7f7-a27988b686f8"
94 | },
95 | "outputs": [],
96 | "source": [
97 | "%env COMPILE_GO=True\n",
98 | "%env FEAST_USAGE=False\n",
99 | "\n",
100 | "! pip install 'feast[gcp,redis,ge,go]'\n",
101 | "! feast version"
102 | ]
103 | },
104 | {
105 | "cell_type": "markdown",
106 | "metadata": {
107 | "id": "6eKs1547MiFA"
108 | },
109 | "source": [
110 | "## Set configurations\n",
111 | "\n",
112 | "Set the following configuration, which we'll be using throughout the tutorial:\n",
113 | "\n",
114 | "- PROJECT_ID: Your project.\n",
115 | "- BIGQUERY_DATASET_NAME: The name of a dataset which will be used to create tables containing features and store the logs of the feature server."
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": null,
121 | "metadata": {
122 | "colab": {
123 | "base_uri": "https://localhost:8080/"
124 | },
125 | "id": "NKPT2GJ_Jb2h",
126 | "outputId": "06a9514b-b0fc-4aff-dd1c-6a9a62c4adae"
127 | },
128 | "outputs": [],
129 | "source": [
130 | "PROJECT_ID = \"\"\n",
131 | "BIGQUERY_DATASET_NAME = \"\""
132 | ]
133 | },
134 | {
135 | "cell_type": "markdown",
136 | "metadata": {
137 | "id": "hrjZkcFmZlSM"
138 | },
139 | "source": [
140 | "## Create a BigQuery dataset\n",
141 | "**Only if your dataset doesn't already exist**: Run the following cell to create your BigQuery dataset."
142 | ]
143 | },
144 | {
145 | "cell_type": "code",
146 | "execution_count": null,
147 | "metadata": {
148 | "colab": {
149 | "base_uri": "https://localhost:8080/"
150 | },
151 | "id": "_73sXuvjZzoz",
152 | "outputId": "31f70b1e-8eae-4099-efc1-b067eaaadf07"
153 | },
154 | "outputs": [],
155 | "source": [
156 | "! bq mk $BIGQUERY_DATASET_NAME"
157 | ]
158 | },
159 | {
160 | "cell_type": "markdown",
161 | "metadata": {
162 | "id": "3W_OsJMWkipk"
163 | },
164 | "source": [
165 | "## Initialize the feature repository"
166 | ]
167 | },
168 | {
169 | "cell_type": "markdown",
170 | "metadata": {
171 | "id": "Sk0fdKESD3j-"
172 | },
173 | "source": [
174 | "In Feast, you define your features using configuration stored in a repository. To start, initialize a feature repository."
175 | ]
176 | },
177 | {
178 | "cell_type": "code",
179 | "execution_count": null,
180 | "metadata": {
181 | "colab": {
182 | "base_uri": "https://localhost:8080/"
183 | },
184 | "id": "ASAv4kB3kkz_",
185 | "outputId": "d64888e0-f9ce-4b5a-a00d-a6b5c300b412"
186 | },
187 | "outputs": [],
188 | "source": [
189 | "! feast init fraud_tutorial\n",
190 | "%cd fraud_tutorial/\n",
191 | "! ls"
192 | ]
193 | },
194 | {
195 | "cell_type": "markdown",
196 | "metadata": {
197 | "id": "XDfgOvshswm5"
198 | },
199 | "source": [
200 | "Next, we'll edit the `feature_store.yaml` file to specify offline and online stores. Note that the `project` field in this file refers to the Feast concept of a project, not a GCP project."
201 | ]
202 | },
203 | {
204 | "cell_type": "code",
205 | "execution_count": null,
206 | "metadata": {
207 | "colab": {
208 | "base_uri": "https://localhost:8080/"
209 | },
210 | "id": "mK727g_Ik6zs",
211 | "outputId": "2bb6628d-4dd8-4a3c-89a3-a64da05c43d8"
212 | },
213 | "outputs": [],
214 | "source": [
215 | "feature_store = \\\n",
216 | "f\"\"\"project: fraud_tutorial\n",
217 | "registry: data/registry.db\n",
218 | "provider: local\n",
219 | "offline_store:\n",
220 | " type: bigquery\n",
221 | " dataset: {BIGQUERY_DATASET_NAME}\n",
222 | "online_store:\n",
223 | " type: redis\n",
224 | " connection_string: \"localhost:6379\"\n",
225 | "go_feature_retrieval: True\n",
226 | "\"\"\"\n",
227 | "\n",
228 | "with open('feature_store.yaml', \"w\") as feature_store_file:\n",
229 | " feature_store_file.write(feature_store)\n",
230 | "\n",
231 | "# Print our feature_store.yaml\n",
232 | "! cat feature_store.yaml"
233 | ]
234 | },
235 | {
236 | "cell_type": "markdown",
237 | "metadata": {
238 | "id": "Q0dMrw4rESL7"
239 | },
240 | "source": [
241 | "Then, we can apply our feature repository:"
242 | ]
243 | },
244 | {
245 | "cell_type": "code",
246 | "execution_count": null,
247 | "metadata": {
248 | "colab": {
249 | "base_uri": "https://localhost:8080/"
250 | },
251 | "id": "f6QD4-lVrbdt",
252 | "outputId": "460f5f04-ea1c-4580-a1fe-0f755b71243e"
253 | },
254 | "outputs": [],
255 | "source": [
256 | "! feast apply"
257 | ]
258 | },
259 | {
260 | "cell_type": "markdown",
261 | "metadata": {
262 | "id": "ewiAq45-Efp0"
263 | },
264 | "source": [
265 | "## Creating features\n",
266 | "\n",
267 | "Next, let's make a new feature and register it to the store.\n",
268 | "\n",
269 | "This involves two steps.\n",
270 | "\n",
271 | "- **Using BigQuery**, we generate new feature values using SQL. Feast is not used to generate features; that is done in Python/SQL.\n",
272 | "- **Using Feast**, we register our new features in Feast by creating a FeatureView:\n",
273 | "\n"
274 | ]
275 | },
276 | {
277 | "cell_type": "markdown",
278 | "metadata": {
279 | "id": "0V0qK1knwjg-"
280 | },
281 | "source": [
282 | "## Preview the raw data"
283 | ]
284 | },
285 | {
286 | "cell_type": "code",
287 | "execution_count": null,
288 | "metadata": {},
289 | "outputs": [],
290 | "source": [
291 | "from google.cloud import bigquery\n",
292 | "bq_client = bigquery.Client(project=PROJECT_ID)"
293 | ]
294 | },
295 | {
296 | "cell_type": "code",
297 | "execution_count": null,
298 | "metadata": {
299 | "colab": {
300 | "base_uri": "https://localhost:8080/",
301 | "height": 459
302 | },
303 | "id": "p90F3kxgv9UO",
304 | "outputId": "5f540f22-3d93-4999-928f-b27292e8b62c"
305 | },
306 | "outputs": [],
307 | "source": [
308 | "j = bq_client.query(\"select * from feast-oss.fraud_tutorial.transactions limit 1000\")\n",
309 | "j.to_dataframe()"
310 | ]
311 | },
312 | {
313 | "cell_type": "markdown",
314 | "metadata": {
315 | "id": "3i0KOCg8wn5n"
316 | },
317 | "source": [
318 | "## Create a feature table using SQL"
319 | ]
320 | },
321 | {
322 | "cell_type": "markdown",
323 | "metadata": {
324 | "id": "1EGCETWpFV95"
325 | },
326 | "source": [
327 | "Then, run the following cell to generate features. This cell contains two functions:\n",
328 | "\n",
329 | "- `generate_user_count_features` runs a SQL query that counts the number of transactions users have made as of a given point in time.\n",
330 | "\n",
331 | "- `backfill_features` runs this query multiple times over an interval to backfill features.\n",
332 | "\n"
333 | ]
334 | },
335 | {
336 | "cell_type": "code",
337 | "execution_count": null,
338 | "metadata": {
339 | "colab": {
340 | "base_uri": "https://localhost:8080/"
341 | },
342 | "id": "ZZ_z84Y1xMia",
343 | "outputId": "b52dcd50-3372-4d31-e52f-073854d3b21c"
344 | },
345 | "outputs": [],
346 | "source": [
347 | "from datetime import datetime, timedelta\n",
348 | "import time\n",
349 | "\n",
350 | "def generate_user_count_features(aggregation_end_date):\n",
351 | " table_id = f\"{PROJECT_ID}.{BIGQUERY_DATASET_NAME}.user_count_transactions_7d\"\n",
352 | "\n",
353 | " client = bigquery.Client(project=PROJECT_ID)\n",
354 | " job_config = bigquery.QueryJobConfig(destination=table_id, write_disposition='WRITE_APPEND')\n",
355 | "\n",
356 | " aggregation_start_date = datetime.now() - timedelta(days=7)\n",
357 | "\n",
358 | " sql = f\"\"\"\n",
359 | " SELECT\n",
360 | " src_account AS user_id,\n",
361 | " COUNT(*) AS transaction_count_7d,\n",
362 | " timestamp'{aggregation_end_date.isoformat()}' AS feature_timestamp\n",
363 | " FROM\n",
364 | " feast-oss.fraud_tutorial.transactions\n",
365 | " WHERE\n",
366 | " timestamp BETWEEN TIMESTAMP('{aggregation_start_date.isoformat()}')\n",
367 | " AND TIMESTAMP('{aggregation_end_date.isoformat()}')\n",
368 | " GROUP BY\n",
369 | " user_id\n",
370 | " \"\"\"\n",
371 | "\n",
372 | " query_job = client.query(sql, job_config=job_config)\n",
373 | " query_job.result()\n",
374 | " print(f\"Generated features as of {aggregation_end_date.isoformat()}\")\n",
375 | "\n",
376 | "\n",
377 | "def backfill_features(earliest_aggregation_end_date, interval, num_iterations):\n",
378 | " aggregation_end_date = earliest_aggregation_end_date\n",
379 | " for _ in range(num_iterations):\n",
380 | " generate_user_count_features(aggregation_end_date=aggregation_end_date)\n",
381 | " time.sleep(1)\n",
382 | " aggregation_end_date += interval\n",
383 | "\n",
384 | "if __name__ == '__main__':\n",
385 | " backfill_features(\n",
386 | " earliest_aggregation_end_date=datetime.now() - timedelta(days=7),\n",
387 | " interval=timedelta(days=1),\n",
388 | " num_iterations=8\n",
389 | " )\n"
390 | ]
391 | },
392 | {
393 | "cell_type": "markdown",
394 | "metadata": {
395 | "id": "JYbQTSWiGCWu"
396 | },
397 | "source": [
398 | "Then, we can preview our new feature:"
399 | ]
400 | },
401 | {
402 | "cell_type": "code",
403 | "execution_count": null,
404 | "metadata": {
405 | "colab": {
406 | "base_uri": "https://localhost:8080/"
407 | },
408 | "id": "c3WpOXbUxs6d",
409 | "outputId": "696fa460-d89f-4286-e8c7-a53272d930e6"
410 | },
411 | "outputs": [],
412 | "source": [
413 | "j = bq_client.query(f\"select * from {BIGQUERY_DATASET_NAME}.user_count_transactions_7d limit 1000\")\n",
414 | "j.to_dataframe()"
415 | ]
416 | },
417 | {
418 | "cell_type": "markdown",
419 | "metadata": {
420 | "id": "RXLgVUuNGPTj"
421 | },
422 | "source": [
423 | "## Create a new FeatureView\n",
424 | "\n",
425 | "Create two new files: `fraud_features.py`, which contains our new feature definitions, and `fraud_services.py`, which contains the feature service definition."
426 | ]
427 | },
428 | {
429 | "cell_type": "code",
430 | "execution_count": null,
431 | "metadata": {
432 | "id": "F6B7Wo67yDV7"
433 | },
434 | "outputs": [],
435 | "source": [
436 | "fraud_features = \\\n",
437 | "f\"\"\"\n",
438 | "from datetime import timedelta\n",
439 | "from feast import BigQuerySource, FeatureView, Entity, ValueType\n",
440 | "\n",
441 | "# Add an entity for users\n",
442 | "user_entity = Entity(\n",
443 | " name=\"user_id\",\n",
444 | " description=\"A user that has executed a transaction or received a transaction\",\n",
445 | ")\n",
446 | "\n",
447 | "# Add a FeatureView based on our new table\n",
448 | "driver_stats_fv = FeatureView(\n",
449 | " name=\"user_transaction_count_7d\",\n",
450 | " entities=[user_entity],\n",
451 | " ttl=timedelta(weeks=1),\n",
452 | " batch_source=BigQuerySource(\n",
453 | " table=f\"{PROJECT_ID}.{BIGQUERY_DATASET_NAME}.user_count_transactions_7d\",\n",
454 | " timestamp_field=\"feature_timestamp\"))\n",
455 | "\n",
456 | "# Add two FeatureViews based on existing tables in BigQuery\n",
457 | "user_account_fv = FeatureView(\n",
458 | " name=\"user_account_features\",\n",
459 | " entities=[user_entity],\n",
460 | " ttl=timedelta(weeks=52),\n",
461 | " batch_source=BigQuerySource(\n",
462 | " table=f\"feast-oss.fraud_tutorial.user_account_features\",\n",
463 | " timestamp_field=\"feature_timestamp\"))\n",
464 | "\n",
465 | "user_has_fraudulent_transactions_fv = FeatureView(\n",
466 | " name=\"user_has_fraudulent_transactions\",\n",
467 | " entities=[user_entity],\n",
468 | " ttl=timedelta(weeks=52),\n",
469 | " batch_source=BigQuerySource(\n",
470 | " table=f\"feast-oss.fraud_tutorial.user_has_fraudulent_transactions\",\n",
471 | " timestamp_field=\"feature_timestamp\"))\n",
472 | "\"\"\"\n",
473 | "\n",
474 | "fraud_services = f\"\"\"\n",
475 | "from feast import FeatureService\n",
476 | "\n",
477 | "from fraud_features import driver_stats_fv, user_account_fv, user_has_fraudulent_transactions_fv\n",
478 | "\n",
479 | "fs = FeatureService(\n",
480 | " name=\"user_features\",\n",
481 | " features=[\n",
482 | " driver_stats_fv[[\"user_transaction_count_7d\"]],\n",
483 | " user_account_fv[[\"credit_score\", \"account_age_days\", \"user_has_2fa_installed\"]],\n",
484 | " user_has_fraudulent_transactions_fv[[\"user_has_fraudulent_transactions_7d\"]],\n",
485 | " ],\n",
486 | ")\"\"\"\n",
487 | "\n",
488 | "with open('fraud_features.py', \"w\") as fraud_features_file:\n",
489 | " fraud_features_file.write(fraud_features)\n",
490 | " \n",
491 | "with open('fraud_services.py', \"w\") as fraud_services_file:\n",
492 | " fraud_services_file.write(fraud_services)"
493 | ]
494 | },
495 | {
496 | "cell_type": "code",
497 | "execution_count": null,
498 | "metadata": {
499 | "colab": {
500 | "base_uri": "https://localhost:8080/"
501 | },
502 | "id": "6rSndNQ30ASP",
503 | "outputId": "84e0d430-d7a0-4b87-aa15-86d0cdccaf9e"
504 | },
505 | "outputs": [],
506 | "source": [
507 | "# Remove example features\n",
508 | "!rm example.py\n",
509 | "# Apply our changes\n",
510 | "!feast apply"
511 | ]
512 | },
513 | {
514 | "cell_type": "markdown",
515 | "metadata": {
516 | "id": "6cJFAJiuGxM3"
517 | },
518 | "source": [
519 | "## Fetching training data\n",
520 | "\n",
521 | "Now that our feature is registered in Feast, we can use Feast to generate a training dataset. To do this, we need an entity dataframe, alongside the list of features we want:"
522 | ]
523 | },
524 | {
525 | "cell_type": "code",
526 | "execution_count": null,
527 | "metadata": {
528 | "colab": {
529 | "base_uri": "https://localhost:8080/",
530 | "height": 330
531 | },
532 | "id": "BqgiEP2Oz42q",
533 | "outputId": "40552317-644b-4ee6-d735-0ee5e48e79dd"
534 | },
535 | "outputs": [],
536 | "source": [
537 | "from datetime import datetime, timedelta\n",
538 | "from feast import FeatureStore\n",
539 | "\n",
540 | "# Initialize a FeatureStore with our current repository's configurations\n",
541 | "store = FeatureStore(repo_path=\".\")\n",
542 | "\n",
543 | "# Get training data\n",
544 | "now = datetime.now()\n",
545 | "two_days_ago = datetime.now() - timedelta(days=2)\n",
546 | "\n",
547 | "feature_service = store.get_feature_service(\"user_features\")\n",
548 | "\n",
549 | "training_data_job = store.get_historical_features(\n",
550 | " entity_df=f\"\"\"\n",
551 | " select \n",
552 | " src_account as user_id,\n",
553 | " timestamp as event_timestamp,\n",
554 | " is_fraud\n",
555 | " from\n",
556 | " feast-oss.fraud_tutorial.transactions\n",
557 | " where\n",
558 | " timestamp between timestamp('{two_days_ago.isoformat()}') \n",
559 | " and timestamp('{now.isoformat()}')\"\"\",\n",
560 | " features=feature_service,\n",
561 | " full_feature_names=True\n",
562 | ")\n",
563 | "\n",
564 | "training_data = training_data_job.to_df()\n",
565 | "training_data.head()\n"
566 | ]
567 | },
568 | {
569 | "cell_type": "markdown",
570 | "metadata": {
571 | "id": "3izkr_3sG1hX"
572 | },
573 | "source": [
574 | "## Training a model\n",
575 | "\n",
576 | "Now, we can use our features to train a model:"
577 | ]
578 | },
579 | {
580 | "cell_type": "code",
581 | "execution_count": null,
582 | "metadata": {},
583 | "outputs": [],
584 | "source": [
585 | "!pip install sklearn"
586 | ]
587 | },
588 | {
589 | "cell_type": "code",
590 | "execution_count": null,
591 | "metadata": {
592 | "colab": {
593 | "base_uri": "https://localhost:8080/"
594 | },
595 | "id": "YMgFuFHR4pIu",
596 | "outputId": "7f248543-3016-482c-9d8e-b0819cf3609d"
597 | },
598 | "outputs": [],
599 | "source": [
600 | "from sklearn.linear_model import LinearRegression\n",
601 | "\n",
602 | "# Drop stray nulls\n",
603 | "training_data.dropna(inplace=True)\n",
604 | "\n",
605 | "# Select training matrices\n",
606 | "X = training_data[[\n",
607 | " \"user_transaction_count_7d__transaction_count_7d\", \n",
608 | " \"user_account_features__credit_score\",\n",
609 | " \"user_account_features__account_age_days\",\n",
610 | " \"user_account_features__user_has_2fa_installed\",\n",
611 | " \"user_has_fraudulent_transactions__user_has_fraudulent_transactions_7d\"\n",
612 | "]]\n",
613 | "y = training_data[\"is_fraud\"]\n",
614 | "\n",
615 | "# Train a simple linear regression model\n",
616 | "model = LinearRegression()\n",
617 | "model.fit(X, y)"
618 | ]
619 | },
620 | {
621 | "cell_type": "code",
622 | "execution_count": null,
623 | "metadata": {
624 | "colab": {
625 | "base_uri": "https://localhost:8080/"
626 | },
627 | "id": "PobCpVWu4pdz",
628 | "outputId": "60a39792-d332-4edc-c59c-e4e20a87ea5e"
629 | },
630 | "outputs": [],
631 | "source": [
632 | "# Get first two rows of training data\n",
633 | "samples = X.iloc[:2]\n",
634 | "\n",
635 | "# Make a test prediction\n",
636 | "model.predict(samples)"
637 | ]
638 | },
639 | {
640 | "cell_type": "code",
641 | "execution_count": null,
642 | "metadata": {},
643 | "outputs": [],
644 | "source": [
645 | "import joblib\n",
646 | "joblib.dump(model, \"model.bin\")"
647 | ]
648 | },
649 | {
650 | "cell_type": "markdown",
651 | "metadata": {
652 | "id": "HWQLT0gTHi1h"
653 | },
654 | "source": [
655 | "## Materializing features\n",
656 | "\n",
657 | "To enable real time feature inference, Feast loads your features into a key-value store so they're available at low latency. We use Redis as this key-value store."
658 | ]
659 | },
660 | {
661 | "cell_type": "code",
662 | "execution_count": null,
663 | "metadata": {
664 | "id": "REBqJPcZ99Lj"
665 | },
666 | "outputs": [],
667 | "source": [
668 | "!feast materialize-incremental $(date -u +\"%Y-%m-%dT%H:%M:%S\")"
669 | ]
670 | },
671 | {
672 | "cell_type": "markdown",
673 | "metadata": {
674 | "id": "FNVxJpw1HohD"
675 | },
676 | "source": [
677 | "## Low latency inference\n",
678 | "\n",
679 | "To make a prediction in real-time, we need to do the following:\n",
680 | "\n",
681 | "1. Start a feature server (as a subprocess) that exposes a gRPC API\n",
682 | "2. Create a gRPC client using the precompiled Feast Serving proto interface\n",
683 | "3. Call GetOnlineFeatures on this gRPC client\n",
684 | "4. Pass these features to the model\n",
685 | "5. Return these predictions to the user"
686 | ]
687 | },
688 | {
689 | "cell_type": "code",
690 | "execution_count": null,
691 | "metadata": {},
692 | "outputs": [],
693 | "source": [
694 | "import subprocess\n",
695 | "\n",
696 | "server = subprocess.Popen([\"feast\", \"serve\", \"-t\", \"grpc\"])"
697 | ]
698 | },
699 | {
700 | "cell_type": "code",
701 | "execution_count": null,
702 | "metadata": {},
703 | "outputs": [],
704 | "source": [
705 | "import grpc\n",
706 | "\n",
707 | "from feast.protos.feast.serving.ServingService_pb2 import GetOnlineFeaturesRequest\n",
708 | "from feast.protos.feast.serving.ServingService_pb2_grpc import ServingServiceStub\n",
709 | "\n",
710 | "from feast.protos.feast.types.Value_pb2 import RepeatedValue\n",
711 | "\n",
712 | "from feast.type_map import python_values_to_proto_values\n",
713 | "from feast.online_response import OnlineResponse\n",
714 | "\n",
715 | "chan = grpc.insecure_channel(\"localhost:6566\")\n",
716 | "grpc_client = ServingServiceStub(chan)\n",
717 | "\n",
718 | "def get_online_features_remote(user_ids):\n",
719 | " resp = grpc_client.GetOnlineFeatures(\n",
720 | " GetOnlineFeaturesRequest(\n",
721 | " feature_service=\"user_features\",\n",
722 | " entities={\n",
723 | " \"user_id\": RepeatedValue(\n",
724 | " val=python_values_to_proto_values(user_ids)\n",
725 | " )\n",
726 | " }\n",
727 | " )\n",
728 | " )\n",
729 | " return OnlineResponse(resp).to_dict()"
730 | ]
731 | },
732 | {
733 | "cell_type": "code",
734 | "execution_count": null,
735 | "metadata": {
736 | "colab": {
737 | "base_uri": "https://localhost:8080/"
738 | },
739 | "id": "JyME-zoy-4d_",
740 | "outputId": "b2e9ba19-4d0b-4cac-9c69-90a06c28fe25"
741 | },
742 | "outputs": [],
743 | "source": [
744 | "import joblib\n",
745 | "model = joblib.load('model.bin')\n",
746 | "\n",
747 | "def predict(user_ids):\n",
748 | " feature_vector = get_online_features_remote(user_ids)\n",
749 | " \n",
750 | " # Delete entity keys\n",
751 | " del feature_vector[\"user_id\"]\n",
752 | "\n",
753 | " # Flatten response from Feast\n",
754 | " instances = [\n",
755 | " [feature_values[i] or 0 for feature_values in feature_vector.values()]\n",
756 | " for i in range(len(user_ids))\n",
757 | " ]\n",
758 | "\n",
759 | " response = model.predict(instances)\n",
760 | " return response\n",
761 | "\n",
762 | "predict([\"v5zlw0\"])"
763 | ]
764 | },
765 | {
766 | "cell_type": "markdown",
767 | "metadata": {},
768 | "source": [
769 | " "
770 | ]
771 | },
772 | {
773 | "cell_type": "markdown",
774 | "metadata": {},
775 | "source": [
776 | " "
777 | ]
778 | },
779 | {
780 | "cell_type": "markdown",
781 | "metadata": {},
782 | "source": [
783 | "# Part II (online features logging and validation)"
784 | ]
785 | },
786 | {
787 | "cell_type": "markdown",
788 | "metadata": {},
789 | "source": [
790 | "In this part we will extend our feature store project with Data Quality Monitoring. Specifically, we are going to validate online features (features served by the feature server) against a reference dataset created from the training features, by applying expectations that we are going to develop ourselves. We will do this in 3 steps:\n",
791 | "1. Configuring feature logging in the feature server and setting a logging destination for the specific feature service object.\n",
792 | "2. Defining expectations using [Great Expectations](https://greatexpectations.io/).\n",
793 | "3. Triggering validation using SDK or CLI API."
794 | ]
795 | },
796 | {
797 | "cell_type": "markdown",
798 | "metadata": {},
799 | "source": [
800 | "### Updating configuration to enable logging"
801 | ]
802 | },
803 | {
804 | "cell_type": "markdown",
805 | "metadata": {},
806 | "source": [
807 | "First, let's edit our `feature_store.yaml` and add the `feature_logging` parameter inside `feature_server`."
808 | ]
809 | },
810 | {
811 | "cell_type": "code",
812 | "execution_count": null,
813 | "metadata": {},
814 | "outputs": [],
815 | "source": [
816 | "feature_store = \\\n",
817 | "f\"\"\"project: fraud_tutorial\n",
818 | "registry: data/registry.db\n",
819 | "provider: local\n",
820 | "offline_store:\n",
821 | " type: bigquery\n",
822 | " dataset: {BIGQUERY_DATASET_NAME}\n",
823 | "online_store:\n",
824 | " type: redis\n",
825 | " connection_string: \"localhost:6379\"\n",
826 | "feature_server:\n",
827 | " enabled: True\n",
828 | " feature_logging:\n",
829 | " enabled: True\n",
830 | " flush_interval_secs: 60\n",
831 | " write_to_disk_interval_secs: 10\n",
832 | " \n",
833 | "go_feature_retrieval: True\n",
834 | "\"\"\"\n",
835 | "\n",
836 | "with open('feature_store.yaml', \"w\") as feature_store_file:\n",
837 | " feature_store_file.write(feature_store)"
838 | ]
839 | },
840 | {
841 | "cell_type": "markdown",
842 | "metadata": {},
843 | "source": [
844 | "Next, we need to update our feature service definition in `fraud_services.py` with a `logging_config`. The logging config defines the sample rate and the logging destination. The sample rate sets the fraction of feature rows that will be logged out of all served features and can take any value from 0 to 1 (inclusive on both ends). The destination must be a table or a path in the offline store declared globally in `feature_store.yaml`."
845 | ]
846 | },
847 | {
848 | "cell_type": "code",
849 | "execution_count": null,
850 | "metadata": {},
851 | "outputs": [],
852 | "source": [
853 | "fraud_services = f\"\"\"\n",
854 | "from feast import FeatureService\n",
855 | "from feast.feature_logging import LoggingConfig\n",
856 | "from feast.infra.offline_stores.bigquery_source import BigQueryLoggingDestination\n",
857 | "\n",
858 | "from fraud_features import driver_stats_fv, user_account_fv, user_has_fraudulent_transactions_fv\n",
859 | "\n",
860 | "fs = FeatureService(\n",
861 | " name=\"user_features\",\n",
862 | " features=[\n",
863 | " driver_stats_fv[[\"user_transaction_count_7d\"]],\n",
864 | " user_account_fv[[\"credit_score\", \"account_age_days\", \"user_has_2fa_installed\"]],\n",
865 | " user_has_fraudulent_transactions_fv[[\"user_has_fraudulent_transactions_7d\"]],\n",
866 | " ],\n",
867 | " logging_config=LoggingConfig(\n",
868 | " sample_rate=1.0,\n",
869 | " destination=BigQueryLoggingDestination(\n",
870 | " table_ref=\"{PROJECT_ID}.{BIGQUERY_DATASET_NAME}.user_features_online_logs\"\n",
871 | " )\n",
872 | " )\n",
873 | ")\"\"\"\n",
874 | " \n",
875 | "with open('fraud_services.py', \"w\") as fraud_services_file:\n",
876 | " fraud_services_file.write(fraud_services)"
877 | ]
878 | },
879 | {
880 | "cell_type": "markdown",
881 | "metadata": {},
882 | "source": [
883 | "Now, let's apply these changes:"
884 | ]
885 | },
886 | {
887 | "cell_type": "code",
888 | "execution_count": null,
889 | "metadata": {},
890 | "outputs": [],
891 | "source": [
892 | "! feast apply"
893 | ]
894 | },
895 | {
896 | "cell_type": "markdown",
897 | "metadata": {},
898 | "source": [
899 | "and restart the feature server:"
900 | ]
901 | },
902 | {
903 | "cell_type": "code",
904 | "execution_count": null,
905 | "metadata": {},
906 | "outputs": [],
907 | "source": [
908 | "server.terminate()\n",
909 | "server = subprocess.Popen([\"feast\", \"serve\", \"-t\", \"grpc\"])"
910 | ]
911 | },
912 | {
913 | "cell_type": "markdown",
914 | "metadata": {},
915 | "source": [
916 | "### Creating reference dataset from training features"
917 | ]
918 | },
919 | {
920 | "cell_type": "code",
921 | "execution_count": null,
922 | "metadata": {},
923 | "outputs": [],
924 | "source": [
925 | "from feast.infra.offline_stores.bigquery_source import SavedDatasetBigQueryStorage\n",
926 | "\n",
927 | "reference_dataset = store.create_saved_dataset(\n",
928 | " from_=training_data_job,\n",
929 | " name=\"reference_dataset\",\n",
930 | " storage=SavedDatasetBigQueryStorage(table=f\"{PROJECT_ID}.{BIGQUERY_DATASET_NAME}.reference_dataset\"))"
931 | ]
932 | },
933 | {
934 | "cell_type": "markdown",
935 | "metadata": {},
936 | "source": [
937 | "### Creating & testing validation profiler"
938 | ]
939 | },
940 | {
941 | "cell_type": "code",
942 | "execution_count": null,
943 | "metadata": {},
944 | "outputs": [],
945 | "source": [
946 | "from feast.dqm.profilers.ge_profiler import ge_profiler\n",
947 | "from great_expectations.dataset import PandasDataset\n",
948 | "from great_expectations.core.expectation_suite import ExpectationSuite"
949 | ]
950 | },
951 | {
952 | "cell_type": "markdown",
953 | "metadata": {},
954 | "source": [
955 | "A profiler is defined as a function that takes a dataset (a Pandas DataFrame wrapped in GE's `PandasDataset` class) and returns an `ExpectationSuite`, i.e. a set of expectations:"
956 | ]
957 | },
958 | {
959 | "cell_type": "code",
960 | "execution_count": null,
961 | "metadata": {},
962 | "outputs": [],
963 | "source": [
964 | "@ge_profiler\n",
965 | "def user_features_profiler(ds: PandasDataset) -> ExpectationSuite:\n",
966 | " ds.expect_column_values_to_be_between(\"user_account_features__credit_score\", 300, 850)\n",
967 | " ds.expect_column_values_to_be_between(\"user_transaction_count_7d__transaction_count_7d\", min_value=0)\n",
968 | " return ds.get_expectation_suite()"
969 | ]
970 | },
971 | {
972 | "cell_type": "markdown",
973 | "metadata": {},
974 | "source": [
975 | "To learn more about the expectation functions that can be used in a profiler definition, please refer to the [Great Expectations documentation](https://docs.greatexpectations.io/docs/)."
976 | ]
977 | },
978 | {
979 | "cell_type": "markdown",
980 | "metadata": {},
981 | "source": [
982 | "Profiler can be tested using the saved dataset object created above:"
983 | ]
984 | },
985 | {
986 | "cell_type": "code",
987 | "execution_count": null,
988 | "metadata": {},
989 | "outputs": [],
990 | "source": [
991 | "reference_dataset.get_profile(profiler=user_features_profiler)"
992 | ]
993 | },
994 | {
995 | "cell_type": "markdown",
996 | "metadata": {},
997 | "source": [
998 | "The profiler function, along with the reference dataset, must be stored in the Feast registry before calling the validation API:"
999 | ]
1000 | },
1001 | {
1002 | "cell_type": "code",
1003 | "execution_count": null,
1004 | "metadata": {},
1005 | "outputs": [],
1006 | "source": [
1007 | "from feast.saved_dataset import ValidationReference\n",
1008 | "\n",
1009 | "ref = ValidationReference(\n",
1010 | " name='user_features_training_ref',\n",
1011 | " dataset_name=\"reference_dataset\",\n",
1012 | " profiler=user_features_profiler,\n",
1013 | ")\n",
1014 | "store.apply(ref)"
1015 | ]
1016 | },
1017 | {
1018 | "cell_type": "markdown",
1019 | "metadata": {},
1020 | "source": [
1021 | "## Validation"
1022 | ]
1023 | },
1024 | {
1025 | "cell_type": "markdown",
1026 | "metadata": {},
1027 | "source": [
1028 | "Let's now run a few predictions to log some data points:"
1029 | ]
1030 | },
1031 | {
1032 | "cell_type": "code",
1033 | "execution_count": null,
1034 | "metadata": {},
1035 | "outputs": [],
1036 | "source": [
1037 | "user_ids = list(training_data.user_id.sample(10))\n",
1038 | "predict(user_ids)"
1039 | ]
1040 | },
1041 | {
1042 | "cell_type": "markdown",
1043 | "metadata": {},
1044 | "source": [
1045 | "After some time has passed (depending on the value of `flush_interval_secs` defined in `feature_store.yaml`), we can trigger a validation:"
1046 | ]
1047 | },
1048 | {
1049 | "cell_type": "code",
1050 | "execution_count": null,
1051 | "metadata": {},
1052 | "outputs": [],
1053 | "source": [
1054 | "end_ts = datetime.now()\n",
1055 | "start_ts = end_ts - timedelta(minutes=10)\n",
1056 | "\n",
1057 | "! feast validate --feature-service user_features \\\n",
1058 | " --reference user_features_training_ref {start_ts.isoformat()} {end_ts.isoformat()}"
1059 | ]
1060 | },
1061 | {
1062 | "cell_type": "markdown",
1063 | "metadata": {},
1064 | "source": [
1065 | "### Making validation fail"
1066 | ]
1067 | },
1068 | {
1069 | "cell_type": "markdown",
1070 | "metadata": {},
1071 | "source": [
1072 | "Now, if some invalid data that doesn't meet our expectations is ingested into the online store and then retrieved via the feature server, we should observe the validation failing."
1073 | ]
1074 | },
1075 | {
1076 | "cell_type": "code",
1077 | "execution_count": null,
1078 | "metadata": {},
1079 | "outputs": [],
1080 | "source": [
1081 | "import pandas as pd\n",
1082 | "insert_df = pd.DataFrame({\n",
1083 | " \"user_id\": [\"pwvabf\"],\n",
1084 | " \"transaction_count_7d\": [-1],\n",
1085 | " \"feature_timestamp\": [datetime.now()],\n",
1086 | "})\n",
1087 | "store.write_to_online_store(\"user_transaction_count_7d\", insert_df)"
1088 | ]
1089 | },
1090 | {
1091 | "cell_type": "code",
1092 | "execution_count": null,
1093 | "metadata": {},
1094 | "outputs": [],
1095 | "source": [
1096 | "predict([\"pwvabf\"])"
1097 | ]
1098 | },
1099 | {
1100 | "cell_type": "markdown",
1101 | "metadata": {},
1102 | "source": [
1103 | "*Remember that it takes some time to write logs to BigQuery*"
1104 | ]
1105 | },
1106 | {
1107 | "cell_type": "code",
1108 | "execution_count": null,
1109 | "metadata": {},
1110 | "outputs": [],
1111 | "source": [
1112 | "end_ts = datetime.now()\n",
1113 | "start_ts = end_ts - timedelta(minutes=10)\n",
1114 | "\n",
1115 | "! feast validate --feature-service user_features \\\n",
1116 | " --reference user_features_training_ref {start_ts.isoformat()} {end_ts.isoformat()}"
1117 | ]
1118 | },
1119 | {
1120 | "cell_type": "markdown",
1121 | "metadata": {},
1122 | "source": [
1123 | "### Alternative example with validating feature presence"
1124 | ]
1125 | },
1126 | {
1127 | "cell_type": "markdown",
1128 | "metadata": {},
1129 | "source": [
1130 | "In this example we create an expectation that the feature will have a not-null value in 99% of the cases:"
1131 | ]
1132 | },
1133 | {
1134 | "cell_type": "code",
1135 | "execution_count": null,
1136 | "metadata": {},
1137 | "outputs": [],
1138 | "source": [
1139 | "@ge_profiler\n",
1140 | "def user_features_profiler_v2(ds: PandasDataset) -> ExpectationSuite:\n",
1141 | " ds.expect_column_values_to_not_be_null(\"user_account_features__account_age_days\", mostly=0.99)\n",
1142 | " return ds.get_expectation_suite()"
1143 | ]
1144 | },
1145 | {
1146 | "cell_type": "markdown",
1147 | "metadata": {},
1148 | "source": [
1149 | "testing on the reference dataset:"
1150 | ]
1151 | },
1152 | {
1153 | "cell_type": "code",
1154 | "execution_count": null,
1155 | "metadata": {},
1156 | "outputs": [],
1157 | "source": [
1158 | "reference_dataset.get_profile(profiler=user_features_profiler_v2)"
1159 | ]
1160 | },
1161 | {
1162 | "cell_type": "markdown",
1163 | "metadata": {},
1164 | "source": [
1165 | "... and storing the new validation reference in the registry:"
1166 | ]
1167 | },
1168 | {
1169 | "cell_type": "code",
1170 | "execution_count": null,
1171 | "metadata": {},
1172 | "outputs": [],
1173 | "source": [
1174 | "store.apply(\n",
1175 | " ValidationReference(\n",
1176 | " name='user_features_training_ref_v2',\n",
1177 | " dataset_name=\"reference_dataset\",\n",
1178 | " profiler=user_features_profiler_v2,\n",
1179 | " )\n",
1180 | ")"
1181 | ]
1182 | },
1183 | {
1184 | "cell_type": "markdown",
1185 | "metadata": {},
1186 | "source": [
1187 | "Next, we retrieve some entity rows that do not exist in the online store (thus, the returned feature statuses will be NOT FOUND):"
1188 | ]
1189 | },
1190 | {
1191 | "cell_type": "code",
1192 | "execution_count": null,
1193 | "metadata": {},
1194 | "outputs": [],
1195 | "source": [
1196 | "predict([\"invalid\"] * 5)"
1197 | ]
1198 | },
1199 | {
1200 | "cell_type": "markdown",
1201 | "metadata": {},
1202 | "source": [
1203 | "Now validation should fail:"
1204 | ]
1205 | },
1206 | {
1207 | "cell_type": "code",
1208 | "execution_count": null,
1209 | "metadata": {},
1210 | "outputs": [],
1211 | "source": [
1212 | "end_ts = datetime.now()\n",
1213 | "start_ts = end_ts - timedelta(hours=1)\n",
1214 | "\n",
1215 | "! feast validate --feature-service user_features \\\n",
1216 | " --reference user_features_training_ref_v2 {start_ts.isoformat()} {end_ts.isoformat()}"
1217 | ]
1218 | },
1219 | {
1220 | "cell_type": "markdown",
1221 | "metadata": {
1222 | "id": "m4Pu2m4KUrbp"
1223 | },
1224 | "source": [
1225 | "# Cleanup\n",
1226 | "\n",
1227 | "If you want to clean up the resources created during this tutorial, run the following cells:\n"
1228 | ]
1229 | },
1230 | {
1231 | "cell_type": "code",
1232 | "execution_count": null,
1233 | "metadata": {
1234 | "colab": {
1235 | "base_uri": "https://localhost:8080/"
1236 | },
1237 | "id": "9RK_Kxj2VFQu",
1238 | "outputId": "c6947143-d41b-4ce8-9f14-36c530234eb4"
1239 | },
1240 | "outputs": [],
1241 | "source": [
1242 | "!bq rm -t -f ${BIGQUERY_DATASET_NAME}.user_count_transactions_7d\n",
1243 | "!bq rm -t -f ${BIGQUERY_DATASET_NAME}.user_features_online_logs\n",
1244 | "!bq rm -r -f -d ${BIGQUERY_DATASET_NAME}"
1245 | ]
1246 | },
1247 | {
1248 | "cell_type": "code",
1249 | "execution_count": null,
1250 | "metadata": {
1251 | "id": "5EGuKSupu5jN"
1252 | },
1253 | "outputs": [],
1254 | "source": [
1255 | "server.terminate()"
1256 | ]
1257 | },
1258 | {
1259 | "cell_type": "code",
1260 | "execution_count": null,
1261 | "metadata": {},
1262 | "outputs": [],
1263 | "source": []
1264 | }
1265 | ],
1266 | "metadata": {
1267 | "colab": {
1268 | "collapsed_sections": [],
1269 | "name": "Fraud_Detection_Tutorial.ipynb",
1270 | "provenance": [],
1271 | "toc_visible": true
1272 | },
1273 | "kernelspec": {
1274 | "display_name": "Python 3 (ipykernel)",
1275 | "language": "python",
1276 | "name": "python3"
1277 | },
1278 | "language_info": {
1279 | "codemirror_mode": {
1280 | "name": "ipython",
1281 | "version": 3
1282 | },
1283 | "file_extension": ".py",
1284 | "mimetype": "text/x-python",
1285 | "name": "python",
1286 | "nbconvert_exporter": "python",
1287 | "pygments_lexer": "ipython3",
1288 | "version": "3.9.12"
1289 | }
1290 | },
1291 | "nbformat": 4,
1292 | "nbformat_minor": 1
1293 | }
1294 |
--------------------------------------------------------------------------------