"
496 | ]
497 | },
498 | "execution_count": 4,
499 | "metadata": {},
500 | "output_type": "execute_result"
501 | }
502 | ],
503 | "source": [
504 | "es.plot()"
505 | ]
506 | },
507 | {
508 | "cell_type": "markdown",
509 | "metadata": {
510 | "collapsed": true
511 | },
512 | "source": [
513 |     "\n",
514 |     "\n",
515 |     "\n",
516 |     "\n",
517 | "Featuretools was created by the developers at [Feature Labs](https://www.featurelabs.com/). If building impactful data science pipelines is important to you or your business, please [get in touch](https://www.featurelabs.com/contact/)."
518 | ]
519 | },
520 | {
521 | "cell_type": "code",
522 | "execution_count": null,
523 | "metadata": {},
524 | "outputs": [],
525 | "source": []
526 | }
527 | ],
528 | "metadata": {
529 | "kernelspec": {
530 | "display_name": "Python 3",
531 | "language": "python",
532 | "name": "python3"
533 | },
534 | "language_info": {
535 | "codemirror_mode": {
536 | "name": "ipython",
537 | "version": 3
538 | },
539 | "file_extension": ".py",
540 | "mimetype": "text/x-python",
541 | "name": "python",
542 | "nbconvert_exporter": "python",
543 | "pygments_lexer": "ipython3",
544 | "version": "3.7.2"
545 | }
546 | },
547 | "nbformat": 4,
548 | "nbformat_minor": 2
549 | }
550 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | bokeh>=1.0.2
2 | featuretools>=0.16.0
3 | jupyter>=1.0.0
4 | scikit-learn>=0.20.2
5 | graphviz>=0.10.1
6 |
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
1 | import featuretools.variable_types as vtypes
2 | import pandas as pd
3 | import featuretools as ft
4 | from featuretools.primitives import Sum, Mean, Hour
5 | from featuretools.selection import remove_low_information_features
6 | from sklearn.ensemble import RandomForestClassifier
7 | from sklearn.metrics import roc_auc_score
8 |
9 | from bokeh.plotting import figure
10 | from bokeh.models import ColumnDataSource, HoverTool
11 | from bokeh.io import show
12 |
13 |
14 | def datashop_to_entityset(filename):
15 | # Make an EntitySet called Dataset with the following structure
16 | #
17 | # schools students problems
18 | # \ | /
19 | # classes sessions problem steps
20 | # \ | /
21 | # transactions -- attempts
22 | #
23 |
24 | # Convert the csv into a dataframe using pandas
25 |     data = pd.read_csv(filename, sep='\t', parse_dates=True)
26 |
27 | # Make the Transaction Id the index column of the dataframe and clean other columns
28 | data.index = data['Transaction Id']
29 | data = data.drop(['Row'], axis=1)
30 | data['Outcome'] = data['Outcome'].map({'INCORRECT': 0, 'CORRECT': 1})
31 |
32 | # Make a new 'End Time' column which is start_time + duration
33 | # This is /super useful/ because you shouldn't be using outcome data at
34 | # any point before the student has attempted the problem.
35 |     data['End Time'] = pd.to_datetime(
36 |         data['Time']) + pd.to_timedelta(pd.to_numeric(data['Duration (sec)']), unit='s')
37 |
38 | # Make a list of all the KC and CF columns present
39 | kc_and_cf_cols = [x for x in data.columns if (
40 | x.startswith('KC ') or x.startswith('CF '))]
41 |
42 |     # Now we start making an entityset, with 'End Time' as a secondary time index for 'Outcome':
43 |     # the row's primary time index is 'Time', but outcomes are only known at 'End Time', preventing label leakage.
44 | es = ft.EntitySet('Dataset')
45 | es.entity_from_dataframe(entity_id='transactions',
46 | index='Transaction Id',
47 | dataframe=data,
48 | variable_types={'Outcome': vtypes.Boolean, 'Attempt At Step': vtypes.Categorical},
49 | time_index='Time',
50 | secondary_time_index={'End Time': [
51 | 'Outcome', 'Is Last Attempt', 'Duration (sec)']}
52 | )
53 |
54 | # Every transaction has a `problem_step` which is associated to a problem
55 | es.normalize_entity(base_entity_id='transactions',
56 | new_entity_id='problem_steps',
57 | index='Step Name',
58 | additional_variables=['Problem Name'] + kc_and_cf_cols,
59 | make_time_index=True)
60 |
61 | es.normalize_entity(base_entity_id='problem_steps',
62 | new_entity_id='problems',
63 | index='Problem Name',
64 | make_time_index=True)
65 |
66 | # Every transaction has a `session` associated to a student
67 | es.normalize_entity(base_entity_id='transactions',
68 | new_entity_id='sessions',
69 | index='Session Id',
70 | additional_variables=['Anon Student Id'],
71 | make_time_index=True)
72 |
73 | es.normalize_entity(base_entity_id='sessions',
74 | new_entity_id='students',
75 | index='Anon Student Id',
76 | make_time_index=True)
77 |
78 | # Every transaction has a `class` associated to a school
79 | es.normalize_entity(base_entity_id='transactions',
80 | new_entity_id='classes',
81 | index='Class',
82 | additional_variables=['School'],
83 | make_time_index=False)
84 |
85 | es.normalize_entity(base_entity_id='classes',
86 | new_entity_id='schools',
87 | index='School',
88 | make_time_index=False)
89 |
90 | # And because we might be interested in creating features grouped
91 | # by attempts we normalize by those as well.
92 | # es.normalize_entity(base_entity_id='transactions',
93 | # new_entity_id='attempts',
94 | # index='Attempt At Step',
95 | # additional_variables=[],
96 | # make_time_index=False)
97 | return es
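
# Usage sketch (the export filename here is hypothetical):
#
#   es = datashop_to_entityset('ds_tx_export.txt')
#   es.plot()  # draws the school/student/problem diagram above; needs graphviz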
98 |
99 |
100 | def create_features(es, label='Outcome', custom_agg=()):
101 |     cutoff_times = es['transactions'].df[['Transaction Id', 'End Time', label]]
102 |     fm, features = ft.dfs(entityset=es,
103 |                           target_entity='transactions',
104 |                           agg_primitives=[Sum, Mean] + list(custom_agg),
105 |                           trans_primitives=[Hour],
106 |                           max_depth=3,
107 |                           approximate='2m',
108 |                           cutoff_time=cutoff_times,
109 |                           verbose=True)
110 |     fm_enc, _ = ft.encode_features(fm, features)
111 |     fm_enc = fm_enc.fillna(0).drop(columns=[label], errors='ignore')  # keep the label out of the features
112 |     fm_enc = remove_low_information_features(fm_enc)
113 |     labels = fm.pop(label)
114 |     return fm_enc, labels
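
# Usage sketch, assuming `es` was built by datashop_to_entityset above:
#
#   fm_enc, labels = create_features(es)
#   # fm_enc has one encoded feature row per transaction; labels is the popped 'Outcome' column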
115 |
116 |
117 | def estimate_score(fm_enc, label, splitter):
118 |     # enumerate keeps track of which time split we are on
119 |     for k, (train_index, test_index) in enumerate(splitter.split(fm_enc)):
120 |         clf = RandomForestClassifier()
121 |         X_train, X_test = fm_enc.iloc[train_index], fm_enc.iloc[test_index]
122 |         y_train, y_test = label[train_index], label[test_index]
123 |         clf.fit(X_train, y_train)
124 |         preds = clf.predict_proba(X_test)[:, 1]  # class-1 probabilities for AUC
125 |         score = round(roc_auc_score(y_test, preds), 2)  # (y_true, y_score)
126 |         print("AUC score on time split {} is {}".format(k, score))
127 |
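# Usage sketch: any object with a .split(X) method works as the splitter; for
# time-ordered rows, sklearn's TimeSeriesSplit keeps train folds before test folds.
#
#   from sklearn.model_selection import TimeSeriesSplit
#   estimate_score(fm_enc, labels.values, TimeSeriesSplit(n_splits=3))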
128 |
129 | def feature_importances(fm_enc, clf, feats=5):
130 |     feature_imps = [(imp, fm_enc.columns[i])
131 |                     for i, imp in enumerate(clf.feature_importances_)]
132 |     # sort from most to least important
133 |     feature_imps.sort(reverse=True)
134 |     print('Feature Importances:')
135 |     for i, f in enumerate(feature_imps[:feats]):
136 |         print('{}: {}'.format(i + 1, f[1]))
137 |     print("-----\n")
138 |     return [f[1] for f in feature_imps[:feats]]
139 |
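# Usage sketch, assuming a classifier already fit on the encoded feature matrix:
#
#   clf = RandomForestClassifier().fit(fm_enc, labels)
#   top = feature_importances(fm_enc, clf, feats=10)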
140 |
141 | def datashop_plot(fm, col1='', col2='', label=None, names=('', '', '')):
142 |     colorlist = ['#3A3A3A', '#1072B9', '#B22222']
143 |     colormap = {name: colorlist[name] for name in set(label)}  # one entry per class
144 |     colors = [colormap[x] for x in label]
145 |     labelmap = {0: 'INCORRECT', 1: 'CORRECT'}
146 |     desc = [labelmap[x] for x in label]
147 | source = ColumnDataSource(dict(
148 | x=fm[col1],
149 | y=fm[col2],
150 | desc=desc,
151 | color=colors,
152 | index=fm.index,
153 | problem_step=fm['Step Name'],
154 | problem=fm['problem_steps.Problem Name'],
155 | attempt=fm['Attempt At Step']
156 | ))
157 | hover = HoverTool(tooltips=[
158 | ("(x,y)", "(@x, @y)"),
159 | ("problem", "@problem"),
160 | ("problem step", "@problem_step"),
161 | ])
162 |
163 | p = figure(title=names[0],
164 | tools=['box_zoom', hover, 'reset'], width=800)
165 | p.scatter(x='x',
166 | y='y',
167 | color='color',
168 | legend_group='desc',
169 | source=source,
170 | alpha=.6)
171 |
172 | p.xaxis.axis_label = names[1]
173 | p.yaxis.axis_label = names[2]
174 | return p
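
# Usage sketch: fm must keep the 'Step Name', 'problem_steps.Problem Name' and
# 'Attempt At Step' columns used in the hover tooltips; the column names below
# are placeholders.
#
#   p = datashop_plot(fm, col1='<feature x>', col2='<feature y>', label=labels,
#                     names=['Attempts', 'feature x', 'feature y'])
#   show(p)  # bokeh.io.show, imported at the top of this module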
175 |
176 | from sklearn.preprocessing import LabelEncoder
177 |
178 | def inplace_encoder(X):
179 |     for col in X:  # LabelEncoder expects a 1-D array of values
180 |         X[col] = LabelEncoder().fit_transform(X[col].astype(str))
181 |     return X
182 |
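# Usage sketch: integer-encode every column of a feature frame before fitting a
# scikit-learn model. Note that, as the name says, X is mutated in place:
#
#   X_train = inplace_encoder(X_train)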
--------------------------------------------------------------------------------