--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Data-Analysis
2 | Python Practice of Data Analysis and Mining
3 |
--------------------------------------------------------------------------------
/data_exploratory/abnormal_check.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # 3-1
3 | '''
4 | @ author: Amos
5 | '''
6 |
7 | import pandas as pd
8 | import matplotlib.pyplot as plt
9 |
10 | catering_sale = './data/catering_sale.xls'
11 | data = pd.read_excel(catering_sale, index_col = u'日期')
12 |
13 | plt.rcParams['font.sans-serif'] = ['SimHei'] #display Chinese labels correctly
14 | plt.rcParams['axes.unicode_minus'] = False #display minus signs correctly
15 |
16 | #create the figure
17 | plt.figure()
18 | p = data.boxplot(return_type = 'dict')
19 | x = p['fliers'][0].get_xdata() #'fliers' holds the outlier points
20 | y = p['fliers'][0].get_ydata()
21 |
22 | y.sort()
23 |
24 | #label each outlier with annotate
25 | for i in range(len(x)):
26 | if i>0:
27 | plt.annotate(y[i], xy = (x[i], y[i]), xytext=(x[i]+0.05-0.8/(y[i]-y[i-1]),y[i]))
28 | else:
29 | plt.annotate(y[i], xy = (x[i],y[i]), xytext=(x[i]+0.08,y[i]))
30 |
31 | plt.show()
32 |
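33 | # A numeric cross-check (sketch): the boxplot flags points outside the
34 | # 1.5*IQR whiskers, which could also be listed directly:
35 | #q1, q3 = data[u'销量'].quantile([0.25, 0.75])
36 | #iqr = q3 - q1
37 | #print(data[(data[u'销量'] < q1 - 1.5*iqr) | (data[u'销量'] > q3 + 1.5*iqr)])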
--------------------------------------------------------------------------------
/data_exploratory/correlation_analysis.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import pandas as pd
4 |
5 | catering_sale = './data/catering_sale_all.xls' #catering data with one column per dish
6 | data = pd.read_excel(catering_sale, index_col = u'日期') #read the data, using the '日期' (date) column as index
7 |
8 | #print(data.corr())
9 | print(data.corr()[u'百合酱蒸凤爪']) #show only the correlations between '百合酱蒸凤爪' and the other dishes
10 | print('\n')
11 | print(data[u'百合酱蒸凤爪'].corr(data[u'翡翠蒸香茜饺'])) #correlation between '百合酱蒸凤爪' and '翡翠蒸香茜饺'
12 |
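13 | # Note: corr() computes Pearson correlations by default; method='spearman'
14 | # or method='kendall' give rank-based alternatives that are robust to outliers.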
--------------------------------------------------------------------------------
/data_exploratory/data/catering_dish_profit.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huiyuexu/data-analysis/25ed230f338a29ea5e91bb1a06b14000b6cc790c/data_exploratory/data/catering_dish_profit.xls
--------------------------------------------------------------------------------
/data_exploratory/data/catering_fish_congee.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huiyuexu/data-analysis/25ed230f338a29ea5e91bb1a06b14000b6cc790c/data_exploratory/data/catering_fish_congee.xls
--------------------------------------------------------------------------------
/data_exploratory/data/catering_sale.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huiyuexu/data-analysis/25ed230f338a29ea5e91bb1a06b14000b6cc790c/data_exploratory/data/catering_sale.csv
--------------------------------------------------------------------------------
/data_exploratory/data/catering_sale.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huiyuexu/data-analysis/25ed230f338a29ea5e91bb1a06b14000b6cc790c/data_exploratory/data/catering_sale.xls
--------------------------------------------------------------------------------
/data_exploratory/data/catering_sale_all.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huiyuexu/data-analysis/25ed230f338a29ea5e91bb1a06b14000b6cc790c/data_exploratory/data/catering_sale_all.xls
--------------------------------------------------------------------------------
/data_exploratory/dish_pareto.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | #Pareto analysis
4 | import pandas as pd
5 | import matplotlib.pyplot as plt
6 | plt.rcParams['font.sans-serif'] = ['SimHei'] #display Chinese labels correctly
7 | plt.rcParams['axes.unicode_minus'] = False #display minus signs correctly
8 |
9 | dish_profit = './data/catering_dish_profit.xls'
10 | data = pd.read_excel(dish_profit, index_col = u'菜品名')
11 |
12 | data = data[u'盈利'].copy()
13 | data = data.sort_values(ascending = False) #sort profits in descending order
14 |
15 | plt.figure()
16 | data.plot(kind='bar') #bar chart
17 | plt.ylabel(u'盈利(元)')
18 |
19 | p = 1.0*data.cumsum()/data.sum() #cumulative share of total profit
20 | p.plot(color = 'r', secondary_y = True, style = '-o',linewidth = 2) #cumulative-share line
21 | #annotate the point where the cumulative share passes 85%, specifying an arrow style
22 | plt.annotate(format(p[6], '.4%'), \
23 | xy = (6, p[6]), \
24 | xytext=(6*0.9, p[6]*0.9), \
25 | arrowprops=dict(arrowstyle="->", connectionstyle="arc3,rad=.2"))
26 | plt.ylabel(u'盈利(比例)')
27 |
28 | plt.show()
29 |
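30 | # Sketch: the smallest number of dishes covering 85% of total profit can also
31 | # be computed directly, e.g. int((p < 0.85).sum()) + 1.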
--------------------------------------------------------------------------------
/data_exploratory/statistic_analysis.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import pandas as pd
4 |
5 | catering_sale = './data/catering_sale.xls'
6 | data = pd.read_excel(catering_sale, index_col = u'日期')
7 |
8 | print(data.describe(),'\n')
9 | print('total: ',len(data))
10 |
11 | data = data[(data[u'销量']>400) & (data[u'销量']<5000)] #drop abnormal sales values
12 | statistics = data.describe()
13 |
14 | s = statistics
15 | s.loc['range'] = s.loc['max'] - s.loc['min'] #range
16 | s.loc['var'] = s.loc['std'] / s.loc['mean'] #coefficient of variation (std/mean), not the variance
17 | s.loc['dis'] = s.loc['75%'] - s.loc['25%'] #interquartile range
18 |
19 | print(statistics)
20 |
--------------------------------------------------------------------------------
/data_modeling/cm_plot.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # cm_plot.py: confusion-matrix visualization helper.
4 | # Place it somewhere on the Python path (e.g. site-packages) so it can be imported,
5 | # for example: ~/anaconda2/lib/python2.7/site-packages
6 |
7 | def cm_plot(y, yp):
8 |     from sklearn.metrics import confusion_matrix #confusion-matrix function
9 |     cm = confusion_matrix(y, yp) #compute the confusion matrix
10 |     import matplotlib.pyplot as plt #plotting library
11 |     #draw the matrix; the colormap is cm.Greens, see the matplotlib docs for more
12 |     plt.matshow(cm, cmap=plt.cm.Greens)
13 |     plt.colorbar()
14 |     for i in range(len(cm)): #cell value labels; matshow draws cm[row, col] at x=col, y=row
15 |         for j in range(len(cm)):
16 |             plt.annotate(cm[j, i], xy=(i, j), horizontalalignment='center', verticalalignment='center')
17 |     plt.ylabel('True label') #axis labels
18 |     plt.xlabel('Predicted label') #axis labels
19 | return plt
20 |
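21 | # Usage sketch (assuming y_true and y_pred are label arrays):
22 | #   from cm_plot import cm_plot
23 | #   cm_plot(y_true, y_pred).show()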
--------------------------------------------------------------------------------
/data_modeling/data/arima_data.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huiyuexu/data-analysis/25ed230f338a29ea5e91bb1a06b14000b6cc790c/data_modeling/data/arima_data.xls
--------------------------------------------------------------------------------
/data_modeling/data/bankloan.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huiyuexu/data-analysis/25ed230f338a29ea5e91bb1a06b14000b6cc790c/data_modeling/data/bankloan.xls
--------------------------------------------------------------------------------
/data_modeling/data/consumption_data.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huiyuexu/data-analysis/25ed230f338a29ea5e91bb1a06b14000b6cc790c/data_modeling/data/consumption_data.xls
--------------------------------------------------------------------------------
/data_modeling/data/menu_orders.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huiyuexu/data-analysis/25ed230f338a29ea5e91bb1a06b14000b6cc790c/data_modeling/data/menu_orders.xls
--------------------------------------------------------------------------------
/data_modeling/data/neural_network.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huiyuexu/data-analysis/25ed230f338a29ea5e91bb1a06b14000b6cc790c/data_modeling/data/neural_network.png
--------------------------------------------------------------------------------
/data_modeling/data/sales_data.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huiyuexu/data-analysis/25ed230f338a29ea5e91bb1a06b14000b6cc790c/data_modeling/data/sales_data.xls
--------------------------------------------------------------------------------
/data_modeling/decision_tree.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Mon Jan 29 22:34:51 2018
4 |
5 | @author: Amos
6 | """
7 | import pandas as pd
8 | from sklearn.tree import DecisionTreeClassifier as DTC
9 | from sklearn.tree import export_graphviz as to_graphviz
10 | #from sklearn.externals.six import StringIO #unused; sklearn.externals.six was removed in scikit-learn 0.23
11 |
12 | filename = "./data/sales_data.xls"
13 | data = pd.read_excel(filename, index_col=u'序号')
14 |
15 | #map the categorical values to numeric labels
16 | data[data == u'高'] = 1
17 | data[data == u'是'] = 1
18 | data[data == u'好'] = 1
19 | data[data != 1] = -1
20 | x = data.iloc[:,:3].astype(int)
21 | y = data.iloc[:,3].astype(int)
22 |
23 | #build and train a decision tree based on information entropy
24 | dtc = DTC(criterion='entropy')
25 | dtc.fit(x, y)
26 |
27 | with open("./tmp/tree.dot", 'w') as f:
28 | f = to_graphviz(dtc, feature_names = x.columns, out_file= f)
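29 | # The exported tree can then be rendered with Graphviz, e.g.:
30 | #   dot -Tpng ./tmp/tree.dot -o ./tmp/tree.png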
29 |
--------------------------------------------------------------------------------
/data_modeling/kmeans.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Cluster consumption-behavior features with K-Means
4 | import pandas as pd
5 | from sklearn.cluster import KMeans
6 |
7 | k = 3 #number of clusters
8 | iteration = 5 #maximum number of iterations
9 |
10 | inputfile = './data/consumption_data.xls'
11 | outputfile = './tmp/out_consumption_data.xls'
12 | #read the data and standardize it (z-scores)
13 | data = pd.read_excel(inputfile)
14 | data_zs = 1.0*(data - data.mean())/data.std()
15 |
16 | #split into k clusters; n_jobs sets the parallelism
17 | model = KMeans(
18 | n_clusters=k, n_jobs=1, max_iter = iteration)
19 | #run the clustering
20 | model.fit(data_zs)
21 |
22 | #clustering results
23 | r1 = pd.Series(model.labels_).value_counts() #member count of each cluster
24 | r2 = pd.DataFrame(model.cluster_centers_) #the cluster centers
25 | #concatenate horizontally (axis=0 would be vertical) to pair each center with its member count
26 | r = pd.concat([r2, r1], axis=1)
27 | r.columns = list(data.columns) + [u'类别数目']
28 | #print(r)
29 |
30 | #attach the cluster label of each record to the original data
31 | r_detail = pd.concat(
32 | [data, pd.Series(model.labels_, index=data.index)], axis=1)
33 | r_detail.columns = list(data.columns) + [u'聚类类别']
34 | #print(r_detail)
35 |
36 | '''
37 | #custom density-plot helper
38 | def density_plot(data):
39 | import matplotlib.pyplot as plt
40 | #display Chinese labels and minus signs correctly
41 | plt.rcParams['font.sans-serif'] = ['SimHei']
42 | plt.rcParams['axes.unicode_minus'] = False
43 | p = data.plot(
44 | kind='kde', linewidth = 2,
45 | subplots = True, sharex = False)
46 | [p[i].set_ylabel(u'密度') for i in range(k)]
47 | plt.legend()
48 | return plt
49 |
50 | #plot the per-cluster probability densities
51 | fig_output = './tmp/kmeans_pd_'
52 | for i in range(k):
53 | data_r = data[r_detail[u'聚类类别'] == i].iloc[:, 1:]
54 | density_plot(data_r).savefig(u'%s%s'%(fig_output, i))
55 | '''
56 |
57 | #visualize the k-means result
58 | from sklearn.manifold import TSNE
59 |
60 | tsne = TSNE()
61 | #embed the data in two dimensions
62 | tsne.fit_transform(data_zs)
63 | tsne = pd.DataFrame(tsne.embedding_, index = data_zs.index)
64 |
65 | import matplotlib.pyplot as plt
66 | plt.rcParams['font.sans-serif'] = ['SimHei'] #display Chinese labels correctly
67 | plt.rcParams['axes.unicode_minus'] = False #display minus signs correctly
68 |
69 | #plot each cluster with its own color and marker
70 | d = tsne[r_detail[u'聚类类别'] == 0]
71 | plt.plot(d[0], d[1], 'r.')
72 | d = tsne[r_detail[u'聚类类别'] == 1]
73 | plt.plot(d[0], d[1], 'go')
74 | d = tsne[r_detail[u'聚类类别'] == 2]
75 | plt.plot(d[0], d[1], 'b*')
76 | plt.show()
77 |
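78 | # Note: both KMeans and TSNE are stochastic; passing random_state to each
79 | # would make the cluster labels and the 2-D embedding reproducible.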
--------------------------------------------------------------------------------
/data_modeling/logistic_regression.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Mon Jan 22 19:36:10 2018
4 |
5 | @author: Amos
6 | """
7 | #regression modeling on a bank's loan-default data
8 | #logistic regression with automatic feature selection
9 | import pandas as pd
10 | import numpy
11 |
12 | #parameter initialization
13 | filename = './data/bankloan.xls'
14 | data = pd.read_excel(filename)
15 |
16 | x = data.iloc[:, :8].as_matrix()
17 | y = data.iloc[:, 8].as_matrix()
18 |
19 | from sklearn.linear_model import LogisticRegression as LR
20 | from sklearn.linear_model import RandomizedLogisticRegression as RLR
21 |
22 |
23 | lr = LR() #build a logistic regression model
24 | lr.fit(x, y) #train it on the full feature set
25 | print(u'逻辑回归模型训练结束。')
26 | print(u'未经过筛选特性模型的平均正确率为:%s' % lr.score(x, y))
27 |
28 | #build a randomized logistic regression model for feature selection
29 | rlr = RLR() #select features (stability selection)
30 | rlr.fit(x, y)
31 | #rlr.get_support() #the selection mask; per-feature scores are in rlr.scores_
32 | selected_col = numpy.append(rlr.get_support(),[False]) #append False so the mask matches data.columns and drops the target column
33 | print(u"通过随机逻辑回归模型筛选特征结束")
34 | print(u"有效特征为:%s" % ",".join(data.columns[selected_col]))
35 | x = data[data.columns[selected_col]].as_matrix() #keep only the selected features
36 |
37 | lr = LR() #rebuild the logistic regression model
38 | lr.fit(x, y) #train it on the selected features
39 | print(u'逻辑回归模型训练结束。')
40 | print(u'模型的平均正确率为:%s' % lr.score(x, y)) #average accuracy; about 81.4% on this data
41 |
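42 | # Note: RandomizedLogisticRegression was removed in scikit-learn 0.21 and
43 | # DataFrame.as_matrix() in pandas 1.0. On current versions, a sketch of an
44 | # equivalent L1-based feature-selection step:
45 | #   from sklearn.feature_selection import SelectFromModel
46 | #   selector = SelectFromModel(LR(penalty='l1', solver='liblinear')).fit(x, y)
47 | #   mask = selector.get_support() #boolean mask over the 8 feature columns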
--------------------------------------------------------------------------------
/data_modeling/neural_network.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #predict high/low sales with a neural network
3 |
4 | import pandas as pd
5 |
6 | inputfile = './data/sales_data.xls'
7 | data = pd.read_excel(inputfile, index_col = u'序号')
8 |
9 | data[data == u'高'] = 1
10 | data[data == u'是'] = 1
11 | data[data == u'好'] = 1
12 | data[data != 1] = -1
13 | x = data.iloc[:,:3].astype(int)
14 | y = data.iloc[:,3].astype(int)
15 |
16 | from keras.models import Sequential
17 | from keras.layers.core import Dense, Activation
18 |
19 | #build the model
20 | model = Sequential()
21 | model.add(Dense(input_dim = 3, output_dim = 10))
22 | model.add(Activation('relu')) #the relu activation greatly improves accuracy here
23 | model.add(Dense(input_dim = 10, output_dim = 1))
24 | model.add(Activation('sigmoid')) #sigmoid suits the 0/1 output
25 |
26 | #compile the model:
27 | #this is binary classification, so the loss is binary_crossentropy
28 | #other common losses include mean_squared_error and categorical_crossentropy; see the docs
29 | #adam is the optimizer here; sgd, rmsprop, etc. are alternatives
30 | model.compile(loss = 'binary_crossentropy',
31 | optimizer = 'adam')
32 |
33 | model.fit(x, y, epochs = 100, batch_size = 10) #train the model for 100 epochs
34 | yp = model.predict_classes(x).reshape(len(y)) #class predictions
35 |
36 | from cm_plot import *
37 | cm_plot(y, yp).savefig('./data/neural_network.png')
38 |
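39 | # Note: on current Keras the layers would be written Dense(10, input_dim=3)
40 | # and Dense(1), and predict_classes() has been removed; an equivalent sketch:
41 | #   yp = (model.predict(x) > 0.5).astype(int).reshape(len(y))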
--------------------------------------------------------------------------------
/data_modeling/tmp/data_type.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huiyuexu/data-analysis/25ed230f338a29ea5e91bb1a06b14000b6cc790c/data_modeling/tmp/data_type.xls
--------------------------------------------------------------------------------
/data_modeling/tmp/kmeans_pd_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huiyuexu/data-analysis/25ed230f338a29ea5e91bb1a06b14000b6cc790c/data_modeling/tmp/kmeans_pd_0.png
--------------------------------------------------------------------------------
/data_modeling/tmp/kmeans_pd_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huiyuexu/data-analysis/25ed230f338a29ea5e91bb1a06b14000b6cc790c/data_modeling/tmp/kmeans_pd_1.png
--------------------------------------------------------------------------------
/data_modeling/tmp/kmeans_pd_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huiyuexu/data-analysis/25ed230f338a29ea5e91bb1a06b14000b6cc790c/data_modeling/tmp/kmeans_pd_2.png
--------------------------------------------------------------------------------
/data_modeling/tmp/pd_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huiyuexu/data-analysis/25ed230f338a29ea5e91bb1a06b14000b6cc790c/data_modeling/tmp/pd_0.png
--------------------------------------------------------------------------------
/data_modeling/tmp/pd_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huiyuexu/data-analysis/25ed230f338a29ea5e91bb1a06b14000b6cc790c/data_modeling/tmp/pd_1.png
--------------------------------------------------------------------------------
/data_modeling/tmp/pd_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huiyuexu/data-analysis/25ed230f338a29ea5e91bb1a06b14000b6cc790c/data_modeling/tmp/pd_2.png
--------------------------------------------------------------------------------
/data_modeling/tmp/tree.dot:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huiyuexu/data-analysis/25ed230f338a29ea5e91bb1a06b14000b6cc790c/data_modeling/tmp/tree.dot
--------------------------------------------------------------------------------
/data_preprocess/attr_construct.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Sun Jan 21 18:54:48 2018
4 |
5 | @author: Amos
6 | """
7 | import pandas as pd
8 |
9 | #construct the line-loss-rate attribute
10 |
11 | inputfile= './data/electricity_data.xls' #electricity supplied/delivered data
12 | outputfile = './tmp/electricity_data.xls' #output file with the constructed attribute
13 |
14 | data = pd.read_excel(inputfile)
15 | data[u'线损率'] = (data[u'供入电量'] - data[u'供出电量'])/data[u'供入电量']
16 | data.to_excel(outputfile)
17 |
--------------------------------------------------------------------------------
/data_preprocess/data/catering_sale.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huiyuexu/data-analysis/25ed230f338a29ea5e91bb1a06b14000b6cc790c/data_preprocess/data/catering_sale.xls
--------------------------------------------------------------------------------
/data_preprocess/data/discretization_data.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huiyuexu/data-analysis/25ed230f338a29ea5e91bb1a06b14000b6cc790c/data_preprocess/data/discretization_data.xls
--------------------------------------------------------------------------------
/data_preprocess/data/electricity_data.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huiyuexu/data-analysis/25ed230f338a29ea5e91bb1a06b14000b6cc790c/data_preprocess/data/electricity_data.xls
--------------------------------------------------------------------------------
/data_preprocess/data/leleccum.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huiyuexu/data-analysis/25ed230f338a29ea5e91bb1a06b14000b6cc790c/data_preprocess/data/leleccum.mat
--------------------------------------------------------------------------------
/data_preprocess/data/normalization_data.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huiyuexu/data-analysis/25ed230f338a29ea5e91bb1a06b14000b6cc790c/data_preprocess/data/normalization_data.xls
--------------------------------------------------------------------------------
/data_preprocess/data/principal_component.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huiyuexu/data-analysis/25ed230f338a29ea5e91bb1a06b14000b6cc790c/data_preprocess/data/principal_component.xls
--------------------------------------------------------------------------------
/data_preprocess/data_discretization.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Thu Jan 21 15:35:38 2018
4 | @author: Amos
5 | """
6 | #data discretization
7 | import pandas as pd
8 |
9 | DATA_FILE = './data/discretization_data.xls' #parameter initialization
10 | DATA = pd.read_excel(DATA_FILE) #read the data
11 | DATA = DATA.loc[:, u'肝气郁结证型系数']
12 | k = 4
13 |
14 | #equal-width binning
15 | d1 = pd.cut(DATA, k, labels=range(k))
16 |
17 | #equal-frequency binning
18 | w = [1.0*i/k for i in range(k+1)]
19 | #m = DATA.describe()
20 | #n = DATA.describe(percentiles=w)
21 | w = DATA.describe(percentiles=w)[4:(4+k+1)] #take the k+1 quantile rows from describe()
22 | w[0] = w[0]*(1-1e-10) #nudge the lowest edge down so the minimum lands inside the first bin
23 | d2 = pd.cut(DATA, w, labels=range(k))
24 |
25 | from sklearn.cluster import KMeans #import KMeans
26 | #discretization by one-dimensional clustering
27 | kmodel = KMeans(n_clusters=k, n_jobs=2) #build the model
28 | #kmodel.fit(DATA.values.reshape((len(DATA), 1))) #train the model
29 | #c = pd.DataFrame(kmodel.cluster_centers_).sort_values(0) #cluster centers, sorted
30 | #w = c.rolling(2).mean().iloc[1:] #midpoints of adjacent centers become the bin edges
31 | #w = [0] + list(w[0]) + [DATA.max()] #add the first and last edges
32 | #d3 = pd.cut(DATA, w, labels = range(k))
33 |
34 | def cluster_plot(d, k): #plot helper to display a binning result
35 |     import matplotlib.pyplot as plt
36 |     plt.rcParams['font.sans-serif'] = ['SimHei'] #display Chinese labels correctly
37 |     plt.rcParams['axes.unicode_minus'] = False #display minus signs correctly
38 |
39 | plt.figure(figsize=(8, 3))
40 | for j in range(0, k):
41 | plt.plot(DATA[d==j], [j for i in d[d==j]], 'o')
42 | plt.ylim(-0.5, k-0.5)
43 | return plt
44 |
45 | cluster_plot(d1, k).show()
46 | cluster_plot(d2, k).show()
47 | #cluster_plot(d3, k).show()
48 |
--------------------------------------------------------------------------------
/data_preprocess/data_lagrange_interplate.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Thu Jan 18 11:25:38 2018
4 | @author: Amos
5 | """
6 |
7 | import pandas as pd
8 | from scipy.interpolate import lagrange
9 |
10 | inputfile = './data/catering_sale.xls' #path of the sales data
11 | outputfile = './tmp/sales.xls' #output path
12 |
13 | data = pd.read_excel(inputfile)
14 | #data[u'日期'].to_excel('./tmp/sales0.xls')
15 | #filter out abnormal values by setting them to NaN
16 | #null_raw = list((data['销量']<400) | (data['销量']>5000))
17 | #data.loc[:, '销量'][(data['销量']<400) | (data['销量']>5000)] = None
18 | data.loc[(data['销量']<400) | (data['销量']>5000), '销量'] = None
19 | #data.to_excel('./tmp/sales1.xls')
20 |
21 | #custom column-wise interpolation function
22 | def polyinterp_column(s, n, k=5): #use the k values on each side of position n
23 |     y = s[list(range(n-k, n)) + list(range(n+1, n+1+k))]
24 |     y = y[y.notnull()] #drop missing values
25 |     return lagrange(y.index, list(y))(n) #fit a Lagrange polynomial and evaluate it at n
26 |
27 | for i in data.columns:
28 | for j in range(len(data)):
29 | if (data[i].isnull())[j]:
30 | #data[i][j] = polyinterp_column(data[i], j)
31 | data.loc[j, [i]] = polyinterp_column(data[i], j)
32 |
33 | data.to_excel(outputfile)
34 |
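35 | # Note: high-degree Lagrange polynomials oscillate badly, so the default of
36 | # k=5 points on each side (degree <= 9) keeps the fit local and stable.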
--------------------------------------------------------------------------------
/data_preprocess/data_normalization.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Thu Jan 18 16:14:38 2018
4 |
5 | @author: Amos
6 |
7 | Normalization: rescaling the data to a common range
8 | """
9 |
10 | import pandas as pd
11 | import numpy as np
12 |
13 | datafile = './data/normalization_data.xls'
14 | data = pd.read_excel(datafile, header = None)
15 |
16 | data_n1 = (data - data.min())/(data.max() - data.min()) #min-max scaling
17 | data_n2 = (data - data.mean())/data.std() #zero-mean (z-score) standardization
18 | data_n3 = data/10**np.ceil(np.log10(data.abs().max())) #decimal scaling
19 |
20 | print(data_n1)
21 | print(data_n2)
22 | print(data_n3)
23 |
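24 | # Note: 10**ceil(log10(max|x|)) is the smallest power of ten that maps every
25 | # value into [-1, 1], which is exactly what decimal scaling requires.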
--------------------------------------------------------------------------------
/data_preprocess/principal_component_analyze.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Sun Jan 21 19:19:29 2018
4 |
5 | @author: Amos
6 | """
7 |
8 | #principal component analysis for dimensionality reduction
9 | import pandas as pd
10 |
11 | #parameter initialization
12 | inputfile = './data/principal_component.xls'
13 | outputfile = './tmp/dimention_reducted.xls' #the dimension-reduced data
14 |
15 | data = pd.read_excel(inputfile, header = None) #read the data
16 |
17 | from sklearn.decomposition import PCA
18 |
19 | pca = PCA(3)
20 | pca.fit(data)
21 | #pca.components_ #the principal axes (eigenvectors)
22 | #pca.explained_variance_ratio_ #share of variance explained by each component
23 |
24 | low_d = pca.transform(data) #project onto the 3 components
25 | pd.DataFrame(low_d).to_excel(outputfile)
26 | pca.inverse_transform(low_d) #map back to the original space
27 |
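28 | # Note: pca.explained_variance_ratio_.sum() shows how much of the variance the
29 | # 3 retained components preserve; the inverse transform is the corresponding
30 | # least-squares rank-3 reconstruction of the original data.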
--------------------------------------------------------------------------------
/data_preprocess/tmp/dimention_reducted.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huiyuexu/data-analysis/25ed230f338a29ea5e91bb1a06b14000b6cc790c/data_preprocess/tmp/dimention_reducted.xls
--------------------------------------------------------------------------------
/data_preprocess/tmp/electricity_data.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huiyuexu/data-analysis/25ed230f338a29ea5e91bb1a06b14000b6cc790c/data_preprocess/tmp/electricity_data.xls
--------------------------------------------------------------------------------
/data_preprocess/tmp/sales.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huiyuexu/data-analysis/25ed230f338a29ea5e91bb1a06b14000b6cc790c/data_preprocess/tmp/sales.xls
--------------------------------------------------------------------------------
/data_preprocess/wave_analysis.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Thu Jan 18 17:36:54 2018
4 | @author: Amos
5 | """
6 |
7 | #signal analysis with the wavelet transform
8 |
9 | from scipy.io import loadmat
10 |
11 | inputfile= './data/leleccum.mat' #signal file exported from Matlab
12 | mat = loadmat(inputfile)
13 | signal = mat['leleccum'][0]
14 |
15 | #import PyWavelets
16 | import pywt
17 | coeffs = pywt.wavedec(signal, 'bior3.7', level=5)
18 | #wavedec returns level+1 arrays: first the approximation coefficients, then the detail coefficients from coarsest to finest
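19 | # e.g. for level=5: cA5, cD5, cD4, cD3, cD2, cD1 = coeffs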
19 |
--------------------------------------------------------------------------------
/tools/hello.py:
--------------------------------------------------------------------------------
1 | #-*- coding: utf-8 -*-
2 |
3 | print("hello world")
4 |
5 | import matplotlib as mpl
6 | print(mpl.get_cachedir())
--------------------------------------------------------------------------------
/tools/matplotlib_test.py:
--------------------------------------------------------------------------------
1 | #-*- coding: utf-8 -*-
2 |
3 | import numpy as np
4 | import matplotlib.pyplot as plt
5 | x = np.linspace(0, 10 , 1000)
6 | y = np.sin(x) + 1
7 | z = np.cos(x**2) + 1
8 |
9 | #plt.figure(figsize = (8, 4)) #uncomment to set the figure size
10 | plt.plot(x, y, label = r'$\sin x+1$', color = 'red', linewidth = 2)
11 | plt.plot(x, z, 'b--', label = r'$\cos x^2+1$')
12 |
13 | plt.xlabel('Time(s)')
14 | plt.ylabel('Volt')
15 | plt.title('A Simple Eg.')
16 |
17 | plt.ylim(0, 2.2) #y-axis range
18 |
19 | plt.legend() #show the legend
20 |
21 | plt.show()
22 |
--------------------------------------------------------------------------------
/tools/numpy_test.py:
--------------------------------------------------------------------------------
1 | #-*- coding: utf-8 -*-
2 |
3 | import numpy as np
4 |
5 | a = np.array([2,0,1,5])
6 | print(a)
7 | print(a[:2])
8 | print(a.min())
9 | a.sort()
10 | print(a)
11 |
--------------------------------------------------------------------------------
/tools/pandas_notes.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# pandas主要功能"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "在了解pandas数据结构的基础上,了解其常用功能。"
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {},
20 | "source": [
21 | "## 1.重新索引(Reindexing)"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": 1,
27 | "metadata": {
28 | "collapsed": true
29 | },
30 | "outputs": [],
31 | "source": [
32 | "import pandas as pd"
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": 2,
38 | "metadata": {},
39 | "outputs": [
40 | {
41 | "data": {
42 | "text/plain": [
43 | "d 4.5\n",
44 | "b 7.2\n",
45 | "a -5.3\n",
46 | "c 3.6\n",
47 | "dtype: float64"
48 | ]
49 | },
50 | "execution_count": 2,
51 | "metadata": {},
52 | "output_type": "execute_result"
53 | }
54 | ],
55 | "source": [
56 | "obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])\n",
57 | "obj"
58 | ]
59 | },
60 | {
61 | "cell_type": "markdown",
62 | "metadata": {},
63 | "source": [
64 | "更改index需要调用reindex,如果没有对应index会引入缺失值"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": 3,
70 | "metadata": {},
71 | "outputs": [
72 | {
73 | "data": {
74 | "text/plain": [
75 | "a -5.3\n",
76 | "b 7.2\n",
77 | "c 3.6\n",
78 | "d 4.5\n",
79 | "e NaN\n",
80 | "dtype: float64"
81 | ]
82 | },
83 | "execution_count": 3,
84 | "metadata": {},
85 | "output_type": "execute_result"
86 | }
87 | ],
88 | "source": [
89 | "obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])\n",
90 | "obj2"
91 | ]
92 | },
93 | {
94 | "cell_type": "markdown",
95 | "metadata": {},
96 | "source": [
97 | "对于DataFrame,reindex能更改row index,或column index。\n",
98 | "reindex the rows:"
99 | ]
100 | },
101 | {
102 | "cell_type": "code",
103 | "execution_count": 4,
104 | "metadata": {
105 | "collapsed": true
106 | },
107 | "outputs": [],
108 | "source": [
109 | "import numpy as np"
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": 5,
115 | "metadata": {},
116 | "outputs": [],
117 | "source": [
118 | "frame = pd.DataFrame(np.arange(9).reshape(3, 3),\n",
119 | " index=['a', 'c', 'd'],\n",
120 | " columns=['Ohio', 'Texas', 'California'])"
121 | ]
122 | },
123 | {
124 | "cell_type": "code",
125 | "execution_count": 6,
126 | "metadata": {
127 | "scrolled": true
128 | },
129 | "outputs": [
130 | {
131 | "data": {
132 | "text/html": [
133 | "\n",
134 | "\n",
147 | "
\n",
148 | " \n",
149 | " \n",
150 | " | \n",
151 | " Ohio | \n",
152 | " Texas | \n",
153 | " California | \n",
154 | "
\n",
155 | " \n",
156 | " \n",
157 | " \n",
158 | " a | \n",
159 | " 0 | \n",
160 | " 1 | \n",
161 | " 2 | \n",
162 | "
\n",
163 | " \n",
164 | " c | \n",
165 | " 3 | \n",
166 | " 4 | \n",
167 | " 5 | \n",
168 | "
\n",
169 | " \n",
170 | " d | \n",
171 | " 6 | \n",
172 | " 7 | \n",
173 | " 8 | \n",
174 | "
\n",
175 | " \n",
176 | "
\n",
177 | "
"
178 | ],
179 | "text/plain": [
180 | " Ohio Texas California\n",
181 | "a 0 1 2\n",
182 | "c 3 4 5\n",
183 | "d 6 7 8"
184 | ]
185 | },
186 | "execution_count": 6,
187 | "metadata": {},
188 | "output_type": "execute_result"
189 | }
190 | ],
191 | "source": [
192 | "frame"
193 | ]
194 | },
195 | {
196 | "cell_type": "code",
197 | "execution_count": 7,
198 | "metadata": {},
199 | "outputs": [
200 | {
201 | "data": {
202 | "text/html": [
203 | "\n",
204 | "\n",
217 | "
\n",
218 | " \n",
219 | " \n",
220 | " | \n",
221 | " Ohio | \n",
222 | " Texas | \n",
223 | " California | \n",
224 | "
\n",
225 | " \n",
226 | " \n",
227 | " \n",
228 | " a | \n",
229 | " 0.0 | \n",
230 | " 1.0 | \n",
231 | " 2.0 | \n",
232 | "
\n",
233 | " \n",
234 | " b | \n",
235 | " NaN | \n",
236 | " NaN | \n",
237 | " NaN | \n",
238 | "
\n",
239 | " \n",
240 | " c | \n",
241 | " 3.0 | \n",
242 | " 4.0 | \n",
243 | " 5.0 | \n",
244 | "
\n",
245 | " \n",
246 | " d | \n",
247 | " 6.0 | \n",
248 | " 7.0 | \n",
249 | " 8.0 | \n",
250 | "
\n",
251 | " \n",
252 | "
\n",
253 | "
"
254 | ],
255 | "text/plain": [
256 | " Ohio Texas California\n",
257 | "a 0.0 1.0 2.0\n",
258 | "b NaN NaN NaN\n",
259 | "c 3.0 4.0 5.0\n",
260 | "d 6.0 7.0 8.0"
261 | ]
262 | },
263 | "execution_count": 7,
264 | "metadata": {},
265 | "output_type": "execute_result"
266 | }
267 | ],
268 | "source": [
269 | "frame2 = frame.reindex(['a','b','c','d'])\n",
270 | "frame2"
271 | ]
272 | },
273 | {
274 | "cell_type": "markdown",
275 | "metadata": {},
276 | "source": [
277 | "reindex the columns:"
278 | ]
279 | },
280 | {
281 | "cell_type": "code",
282 | "execution_count": 8,
283 | "metadata": {
284 | "collapsed": true
285 | },
286 | "outputs": [],
287 | "source": [
288 | "states = ['Texes', 'Utah', 'California']"
289 | ]
290 | },
291 | {
292 | "cell_type": "code",
293 | "execution_count": 9,
294 | "metadata": {
295 | "scrolled": true
296 | },
297 | "outputs": [
298 | {
299 | "data": {
300 | "text/html": [
301 | "\n",
302 | "\n",
315 | "
\n",
316 | " \n",
317 | " \n",
318 | " | \n",
319 | " Texes | \n",
320 | " Utah | \n",
321 | " California | \n",
322 | "
\n",
323 | " \n",
324 | " \n",
325 | " \n",
326 | " a | \n",
327 | " NaN | \n",
328 | " NaN | \n",
329 | " 2 | \n",
330 | "
\n",
331 | " \n",
332 | " c | \n",
333 | " NaN | \n",
334 | " NaN | \n",
335 | " 5 | \n",
336 | "
\n",
337 | " \n",
338 | " d | \n",
339 | " NaN | \n",
340 | " NaN | \n",
341 | " 8 | \n",
342 | "
\n",
343 | " \n",
344 | "
\n",
345 | "
"
346 | ],
347 | "text/plain": [
348 | " Texes Utah California\n",
349 | "a NaN NaN 2\n",
350 | "c NaN NaN 5\n",
351 | "d NaN NaN 8"
352 | ]
353 | },
354 | "execution_count": 9,
355 | "metadata": {},
356 | "output_type": "execute_result"
357 | }
358 | ],
359 | "source": [
360 | "frame.reindex(columns=states)"
361 | ]
362 | },
363 | {
364 | "cell_type": "markdown",
365 | "metadata": {},
366 | "source": [
367 | "reinsex参数: "
368 | ]
369 | },
370 | {
371 | "cell_type": "markdown",
372 | "metadata": {},
373 | "source": [
374 | ""
375 | ]
376 | },
377 | {
378 | "cell_type": "code",
379 | "execution_count": 10,
380 | "metadata": {},
381 | "outputs": [
382 | {
383 | "data": {
384 | "text/html": [
385 | "\n",
386 | "\n",
399 | "
\n",
400 | " \n",
401 | " \n",
402 | " | \n",
403 | " Texes | \n",
404 | " Utah | \n",
405 | " California | \n",
406 | "
\n",
407 | " \n",
408 | " \n",
409 | " \n",
410 | " a | \n",
411 | " NaN | \n",
412 | " NaN | \n",
413 | " 2.0 | \n",
414 | "
\n",
415 | " \n",
416 | " b | \n",
417 | " NaN | \n",
418 | " NaN | \n",
419 | " NaN | \n",
420 | "
\n",
421 | " \n",
422 | " c | \n",
423 | " NaN | \n",
424 | " NaN | \n",
425 | " 5.0 | \n",
426 | "
\n",
427 | " \n",
428 | " d | \n",
429 | " NaN | \n",
430 | " NaN | \n",
431 | " 8.0 | \n",
432 | "
\n",
433 | " \n",
434 | "
\n",
435 | "
"
436 | ],
437 | "text/plain": [
438 | " Texes Utah California\n",
439 | "a NaN NaN 2.0\n",
440 | "b NaN NaN NaN\n",
441 | "c NaN NaN 5.0\n",
442 | "d NaN NaN 8.0"
443 | ]
444 | },
445 | "execution_count": 10,
446 | "metadata": {},
447 | "output_type": "execute_result"
448 | }
449 | ],
450 | "source": [
451 | "frame.loc[['a','b','c','d'], states]"
452 | ]
453 | },
454 | {
455 | "cell_type": "markdown",
456 | "metadata": {},
457 | "source": [
458 | "## 2.按轴删除记录(Dropping Entries from an Axis)"
459 | ]
460 | },
461 | {
462 | "cell_type": "markdown",
463 | "metadata": {},
464 | "source": [
465 | "对于DataFrame,index能按行或列的axis来删除:"
466 | ]
467 | },
468 | {
469 | "cell_type": "code",
470 | "execution_count": 11,
471 | "metadata": {},
472 | "outputs": [
473 | {
474 | "data": {
475 | "text/html": [
476 | "\n",
477 | "\n",
490 | "
\n",
491 | " \n",
492 | " \n",
493 | " | \n",
494 | " one | \n",
495 | " two | \n",
496 | " three | \n",
497 | " four | \n",
498 | "
\n",
499 | " \n",
500 | " \n",
501 | " \n",
502 | " Ohio | \n",
503 | " 0 | \n",
504 | " 1 | \n",
505 | " 2 | \n",
506 | " 3 | \n",
507 | "
\n",
508 | " \n",
509 | " Colorado | \n",
510 | " 4 | \n",
511 | " 5 | \n",
512 | " 6 | \n",
513 | " 7 | \n",
514 | "
\n",
515 | " \n",
516 | " Utah | \n",
517 | " 8 | \n",
518 | " 9 | \n",
519 | " 10 | \n",
520 | " 11 | \n",
521 | "
\n",
522 | " \n",
523 | " New York | \n",
524 | " 12 | \n",
525 | " 13 | \n",
526 | " 14 | \n",
527 | " 15 | \n",
528 | "
\n",
529 | " \n",
530 | "
\n",
531 | "
"
532 | ],
533 | "text/plain": [
534 | " one two three four\n",
535 | "Ohio 0 1 2 3\n",
536 | "Colorado 4 5 6 7\n",
537 | "Utah 8 9 10 11\n",
538 | "New York 12 13 14 15"
539 | ]
540 | },
541 | "execution_count": 11,
542 | "metadata": {},
543 | "output_type": "execute_result"
544 | }
545 | ],
546 | "source": [
547 | "data = pd.DataFrame(np.arange(16).reshape(4, 4),\n",
548 | " index=['Ohio', 'Colorado', 'Utah', 'New York'],\n",
549 | " columns=['one', 'two', 'three', 'four'])\n",
550 | "data"
551 | ]
552 | },
553 | {
554 | "cell_type": "markdown",
555 | "metadata": {},
556 | "source": [
557 | "行处理:(axis 0)"
558 | ]
559 | },
560 | {
561 | "cell_type": "code",
562 | "execution_count": 12,
563 | "metadata": {},
564 | "outputs": [
565 | {
566 | "data": {
567 | "text/html": [
568 | "\n",
569 | "\n",
582 | "
\n",
583 | " \n",
584 | " \n",
585 | " | \n",
586 | " one | \n",
587 | " two | \n",
588 | " three | \n",
589 | " four | \n",
590 | "
\n",
591 | " \n",
592 | " \n",
593 | " \n",
594 | " Colorado | \n",
595 | " 4 | \n",
596 | " 5 | \n",
597 | " 6 | \n",
598 | " 7 | \n",
599 | "
\n",
600 | " \n",
601 | " Utah | \n",
602 | " 8 | \n",
603 | " 9 | \n",
604 | " 10 | \n",
605 | " 11 | \n",
606 | "
\n",
607 | " \n",
608 | " New York | \n",
609 | " 12 | \n",
610 | " 13 | \n",
611 | " 14 | \n",
612 | " 15 | \n",
613 | "
\n",
614 | " \n",
615 | "
\n",
616 | "
"
617 | ],
618 | "text/plain": [
619 | " one two three four\n",
620 | "Colorado 4 5 6 7\n",
621 | "Utah 8 9 10 11\n",
622 | "New York 12 13 14 15"
623 | ]
624 | },
625 | "execution_count": 12,
626 | "metadata": {},
627 | "output_type": "execute_result"
628 | }
629 | ],
630 | "source": [
631 | "data.drop(['Ohio'])"
632 | ]
633 | },
634 | {
635 | "cell_type": "markdown",
636 | "metadata": {},
637 | "source": [
638 | "列处理:(axis 1)"
639 | ]
640 | },
641 | {
642 | "cell_type": "code",
643 | "execution_count": 13,
644 | "metadata": {},
645 | "outputs": [
646 | {
647 | "data": {
648 | "text/html": [
649 | "\n",
650 | "\n",
663 | "
\n",
664 | " \n",
665 | " \n",
666 | " | \n",
667 | " one | \n",
668 | " three | \n",
669 | " four | \n",
670 | "
\n",
671 | " \n",
672 | " \n",
673 | " \n",
674 | " Ohio | \n",
675 | " 0 | \n",
676 | " 2 | \n",
677 | " 3 | \n",
678 | "
\n",
679 | " \n",
680 | " Colorado | \n",
681 | " 4 | \n",
682 | " 6 | \n",
683 | " 7 | \n",
684 | "
\n",
685 | " \n",
686 | " Utah | \n",
687 | " 8 | \n",
688 | " 10 | \n",
689 | " 11 | \n",
690 | "
\n",
691 | " \n",
692 | " New York | \n",
693 | " 12 | \n",
694 | " 14 | \n",
695 | " 15 | \n",
696 | "
\n",
697 | " \n",
698 | "
\n",
699 | "
"
700 | ],
701 | "text/plain": [
702 | " one three four\n",
703 | "Ohio 0 2 3\n",
704 | "Colorado 4 6 7\n",
705 | "Utah 8 10 11\n",
706 | "New York 12 14 15"
707 | ]
708 | },
709 | "execution_count": 13,
710 | "metadata": {},
711 | "output_type": "execute_result"
712 | }
713 | ],
714 | "source": [
715 | "data.drop('two', axis=1)"
716 | ]
717 | },
718 | {
719 | "cell_type": "markdown",
720 | "metadata": {},
721 | "source": [
722 | "## 2.索引,选择,过滤(indexing, selection, filtering)"
723 | ]
724 | },
725 | {
726 | "cell_type": "markdown",
727 | "metadata": {},
728 | "source": [
729 | "Series索引\n",
730 | "\n",
731 | "相当于numpy的Array索引,而且还可以使用label索引。注意使用label切片会包括尾节点。"
732 | ]
733 | },
734 | {
735 | "cell_type": "markdown",
736 | "metadata": {},
737 | "source": [
738 | "DataFrame 索引\n",
739 | "\n",
740 | "#### 值或序列索引:"
741 | ]
742 | },
743 | {
744 | "cell_type": "code",
745 | "execution_count": 14,
746 | "metadata": {},
747 | "outputs": [
748 | {
749 | "data": {
750 | "text/plain": [
751 | "Ohio 0\n",
752 | "Colorado 4\n",
753 | "Utah 8\n",
754 | "New York 12\n",
755 | "Name: one, dtype: int32"
756 | ]
757 | },
758 | "execution_count": 14,
759 | "metadata": {},
760 | "output_type": "execute_result"
761 | }
762 | ],
763 | "source": [
764 | "data['one']"
765 | ]
766 | },
767 | {
768 | "cell_type": "code",
769 | "execution_count": 15,
770 | "metadata": {},
771 | "outputs": [
772 | {
773 | "data": {
774 | "text/html": [
775 | "\n",
776 | "\n",
789 | "
\n",
790 | " \n",
791 | " \n",
792 | " | \n",
793 | " one | \n",
794 | " two | \n",
795 | "
\n",
796 | " \n",
797 | " \n",
798 | " \n",
799 | " Ohio | \n",
800 | " 0 | \n",
801 | " 1 | \n",
802 | "
\n",
803 | " \n",
804 | " Colorado | \n",
805 | " 4 | \n",
806 | " 5 | \n",
807 | "
\n",
808 | " \n",
809 | " Utah | \n",
810 | " 8 | \n",
811 | " 9 | \n",
812 | "
\n",
813 | " \n",
814 | " New York | \n",
815 | " 12 | \n",
816 | " 13 | \n",
817 | "
\n",
818 | " \n",
819 | "
\n",
820 | "
"
821 | ],
822 | "text/plain": [
823 | " one two\n",
824 | "Ohio 0 1\n",
825 | "Colorado 4 5\n",
826 | "Utah 8 9\n",
827 | "New York 12 13"
828 | ]
829 | },
830 | "execution_count": 15,
831 | "metadata": {},
832 | "output_type": "execute_result"
833 | }
834 | ],
835 | "source": [
836 | "data[['one', 'two']]"
837 | ]
838 | },
839 | {
840 | "cell_type": "markdown",
841 | "metadata": {},
842 | "source": [
843 | "#### 布尔数组索引:"
844 | ]
845 | },
846 | {
847 | "cell_type": "code",
848 | "execution_count": 16,
849 | "metadata": {},
850 | "outputs": [
851 | {
852 | "data": {
853 | "text/html": [
854 | "\n",
855 | "\n",
868 | "
\n",
869 | " \n",
870 | " \n",
871 | " | \n",
872 | " one | \n",
873 | " two | \n",
874 | " three | \n",
875 | " four | \n",
876 | "
\n",
877 | " \n",
878 | " \n",
879 | " \n",
880 | " Ohio | \n",
881 | " 0 | \n",
882 | " 1 | \n",
883 | " 2 | \n",
884 | " 3 | \n",
885 | "
\n",
886 | " \n",
887 | " Colorado | \n",
888 | " 4 | \n",
889 | " 5 | \n",
890 | " 6 | \n",
891 | " 7 | \n",
892 | "
\n",
893 | " \n",
894 | "
\n",
895 | "
"
896 | ],
897 | "text/plain": [
898 | " one two three four\n",
899 | "Ohio 0 1 2 3\n",
900 | "Colorado 4 5 6 7"
901 | ]
902 | },
903 | "execution_count": 16,
904 | "metadata": {},
905 | "output_type": "execute_result"
906 | }
907 | ],
908 | "source": [
909 | "data[:2]"
910 | ]
911 | },
912 | {
913 | "cell_type": "code",
914 | "execution_count": 17,
915 | "metadata": {},
916 | "outputs": [
917 | {
918 | "data": {
919 | "text/html": [
920 | "\n",
921 | "\n",
934 | "
\n",
935 | " \n",
936 | " \n",
937 | " | \n",
938 | " one | \n",
939 | " two | \n",
940 | " three | \n",
941 | " four | \n",
942 | "
\n",
943 | " \n",
944 | " \n",
945 | " \n",
946 | " Colorado | \n",
947 | " 4 | \n",
948 | " 5 | \n",
949 | " 6 | \n",
950 | " 7 | \n",
951 | "
\n",
952 | " \n",
953 | " Utah | \n",
954 | " 8 | \n",
955 | " 9 | \n",
956 | " 10 | \n",
957 | " 11 | \n",
958 | "
\n",
959 | " \n",
960 | " New York | \n",
961 | " 12 | \n",
962 | " 13 | \n",
963 | " 14 | \n",
964 | " 15 | \n",
965 | "
\n",
966 | " \n",
967 | "
\n",
968 | "
"
969 | ],
970 | "text/plain": [
971 | " one two three four\n",
972 | "Colorado 4 5 6 7\n",
973 | "Utah 8 9 10 11\n",
974 | "New York 12 13 14 15"
975 | ]
976 | },
977 | "execution_count": 17,
978 | "metadata": {},
979 | "output_type": "execute_result"
980 | }
981 | ],
982 | "source": [
983 | "data[data['three']>5]"
984 | ]
985 | },
986 | {
987 | "cell_type": "code",
988 | "execution_count": 18,
989 | "metadata": {},
990 | "outputs": [
991 | {
992 | "data": {
993 | "text/html": [
994 | "\n",
995 | "\n",
1008 | "
\n",
1009 | " \n",
1010 | " \n",
1011 | " | \n",
1012 | " one | \n",
1013 | " two | \n",
1014 | " three | \n",
1015 | " four | \n",
1016 | "
\n",
1017 | " \n",
1018 | " \n",
1019 | " \n",
1020 | " Ohio | \n",
1021 | " 0 | \n",
1022 | " 1 | \n",
1023 | " 2 | \n",
1024 | " 3 | \n",
1025 | "
\n",
1026 | " \n",
1027 | " Colorado | \n",
1028 | " 4 | \n",
1029 | " 5 | \n",
1030 | " 6 | \n",
1031 | " 7 | \n",
1032 | "
\n",
1033 | " \n",
1034 | " Utah | \n",
1035 | " 8 | \n",
1036 | " 9 | \n",
1037 | " 10 | \n",
1038 | " 11 | \n",
1039 | "
\n",
1040 | " \n",
1041 | " New York | \n",
1042 | " 12 | \n",
1043 | " 13 | \n",
1044 | " 14 | \n",
1045 | " 0 | \n",
1046 | "
\n",
1047 | " \n",
1048 | "
\n",
1049 | "
"
1050 | ],
1051 | "text/plain": [
1052 | " one two three four\n",
1053 | "Ohio 0 1 2 3\n",
1054 | "Colorado 4 5 6 7\n",
1055 | "Utah 8 9 10 11\n",
1056 | "New York 12 13 14 0"
1057 | ]
1058 | },
1059 | "execution_count": 18,
1060 | "metadata": {},
1061 | "output_type": "execute_result"
1062 | }
1063 | ],
1064 | "source": [
1065 | "data[data>14] = 0\n",
1066 | "data"
1067 | ]
1068 | },
1069 | {
1070 | "cell_type": "markdown",
1071 | "metadata": {},
1072 | "source": [
1073 | "#### 标签和位置索引:\n",
1074 | "\n",
1075 | "对于label-indexing on rows:loc(for labels标签索引)、iloc(for integers位置索引)"
1076 | ]
1077 | },
1078 | {
1079 | "cell_type": "code",
1080 | "execution_count": 19,
1081 | "metadata": {},
1082 | "outputs": [
1083 | {
1084 | "data": {
1085 | "text/html": [
1086 | "\n",
1087 | "\n",
1100 | "
\n",
1101 | " \n",
1102 | " \n",
1103 | " | \n",
1104 | " one | \n",
1105 | " two | \n",
1106 | " three | \n",
1107 | " four | \n",
1108 | "
\n",
1109 | " \n",
1110 | " \n",
1111 | " \n",
1112 | " Ohio | \n",
1113 | " 0 | \n",
1114 | " 1 | \n",
1115 | " 2 | \n",
1116 | " 3 | \n",
1117 | "
\n",
1118 | " \n",
1119 | " Colorado | \n",
1120 | " 4 | \n",
1121 | " 5 | \n",
1122 | " 6 | \n",
1123 | " 7 | \n",
1124 | "
\n",
1125 | " \n",
1126 | " Utah | \n",
1127 | " 8 | \n",
1128 | " 9 | \n",
1129 | " 10 | \n",
1130 | " 11 | \n",
1131 | "
\n",
1132 | " \n",
1133 | " New York | \n",
1134 | " 12 | \n",
1135 | " 13 | \n",
1136 | " 14 | \n",
1137 | " 0 | \n",
1138 | "
\n",
1139 | " \n",
1140 | "
\n",
1141 | "
"
1142 | ],
1143 | "text/plain": [
1144 | " one two three four\n",
1145 | "Ohio 0 1 2 3\n",
1146 | "Colorado 4 5 6 7\n",
1147 | "Utah 8 9 10 11\n",
1148 | "New York 12 13 14 0"
1149 | ]
1150 | },
1151 | "execution_count": 19,
1152 | "metadata": {},
1153 | "output_type": "execute_result"
1154 | }
1155 | ],
1156 | "source": [
1157 | "data"
1158 | ]
1159 | },
1160 | {
1161 | "cell_type": "code",
1162 | "execution_count": 20,
1163 | "metadata": {},
1164 | "outputs": [
1165 | {
1166 | "data": {
1167 | "text/plain": [
1168 | "one 0\n",
1169 | "two 1\n",
1170 | "Name: Ohio, dtype: int32"
1171 | ]
1172 | },
1173 | "execution_count": 20,
1174 | "metadata": {},
1175 | "output_type": "execute_result"
1176 | }
1177 | ],
1178 | "source": [
1179 | "data.loc['Ohio', ['one', 'two']]"
1180 | ]
1181 | },
1182 | {
1183 | "cell_type": "code",
1184 | "execution_count": 21,
1185 | "metadata": {},
1186 | "outputs": [
1187 | {
1188 | "data": {
1189 | "text/plain": [
1190 | "one 0\n",
1191 | "two 1\n",
1192 | "Name: Ohio, dtype: int32"
1193 | ]
1194 | },
1195 | "execution_count": 21,
1196 | "metadata": {},
1197 | "output_type": "execute_result"
1198 | }
1199 | ],
1200 | "source": [
1201 | "data.iloc[0, [0, 1]]"
1202 | ]
1203 | },
1204 | {
1205 | "cell_type": "code",
1206 | "execution_count": 22,
1207 | "metadata": {},
1208 | "outputs": [
1209 | {
1210 | "data": {
1211 | "text/plain": [
1212 | "Ohio 1\n",
1213 | "Colorado 5\n",
1214 | "Utah 9\n",
1215 | "Name: two, dtype: int32"
1216 | ]
1217 | },
1218 | "execution_count": 22,
1219 | "metadata": {},
1220 | "output_type": "execute_result"
1221 | }
1222 | ],
1223 | "source": [
1224 | "data.loc[:'Utah', 'two']"
1225 | ]
1226 | },
1227 | {
1228 | "cell_type": "code",
1229 | "execution_count": 23,
1230 | "metadata": {},
1231 | "outputs": [
1232 | {
1233 | "data": {
1234 | "text/html": [
1235 | "\n",
1236 | "\n",
1249 | "
\n",
1250 | " \n",
1251 | " \n",
1252 | " | \n",
1253 | " one | \n",
1254 | " two | \n",
1255 | " three | \n",
1256 | "
\n",
1257 | " \n",
1258 | " \n",
1259 | " \n",
1260 | " Colorado | \n",
1261 | " 4 | \n",
1262 | " 5 | \n",
1263 | " 6 | \n",
1264 | "
\n",
1265 | " \n",
1266 | " Utah | \n",
1267 | " 8 | \n",
1268 | " 9 | \n",
1269 | " 10 | \n",
1270 | "
\n",
1271 | " \n",
1272 | " New York | \n",
1273 | " 12 | \n",
1274 | " 13 | \n",
1275 | " 14 | \n",
1276 | "
\n",
1277 | " \n",
1278 | "
\n",
1279 | "
"
1280 | ],
1281 | "text/plain": [
1282 | " one two three\n",
1283 | "Colorado 4 5 6\n",
1284 | "Utah 8 9 10\n",
1285 | "New York 12 13 14"
1286 | ]
1287 | },
1288 | "execution_count": 23,
1289 | "metadata": {},
1290 | "output_type": "execute_result"
1291 | }
1292 | ],
1293 | "source": [
1294 | "data.iloc[:, :3][data.three>5]"
1295 | ]
1296 | },
1297 | {
1298 | "cell_type": "markdown",
1299 | "metadata": {},
1300 | "source": [
1301 | "选择数据方法:\n",
1302 | "\n",
1303 | "\n",
1304 | "\n",
1305 | ""
1306 | ]
1307 | },
1308 | {
1309 | "cell_type": "markdown",
1310 | "metadata": {},
1311 | "source": [
1312 | "## 3.算数和数据对齐(Arithmetic and Data Alignment)"
1313 | ]
1314 | },
1315 | {
1316 | "cell_type": "code",
1317 | "execution_count": 24,
1318 | "metadata": {},
1319 | "outputs": [
1320 | {
1321 | "data": {
1322 | "text/html": [
1323 | "\n",
1324 | "\n",
1337 | "
\n",
1338 | " \n",
1339 | " \n",
1340 | " | \n",
1341 | " b | \n",
1342 | " c | \n",
1343 | " d | \n",
1344 | "
\n",
1345 | " \n",
1346 | " \n",
1347 | " \n",
1348 | " Colorado | \n",
1349 | " 0.0 | \n",
1350 | " 1.0 | \n",
1351 | " 2.0 | \n",
1352 | "
\n",
1353 | " \n",
1354 | " Texas | \n",
1355 | " 3.0 | \n",
1356 | " 4.0 | \n",
1357 | " 5.0 | \n",
1358 | "
\n",
1359 | " \n",
1360 | " Ohio | \n",
1361 | " 6.0 | \n",
1362 | " 7.0 | \n",
1363 | " 8.0 | \n",
1364 | "
\n",
1365 | " \n",
1366 | "
\n",
1367 | "
"
1368 | ],
1369 | "text/plain": [
1370 | " b c d\n",
1371 | "Colorado 0.0 1.0 2.0\n",
1372 | "Texas 3.0 4.0 5.0\n",
1373 | "Ohio 6.0 7.0 8.0"
1374 | ]
1375 | },
1376 | "execution_count": 24,
1377 | "metadata": {},
1378 | "output_type": "execute_result"
1379 | }
1380 | ],
1381 | "source": [
1382 | "df1 = pd.DataFrame(np.arange(9.).reshape((3,3)), columns=list('bcd'),\n",
1383 | " index={'Ohio', 'Texas', 'Colorado'})\n",
1384 | "df1"
1385 | ]
1386 | },
1387 | {
1388 | "cell_type": "code",
1389 | "execution_count": 25,
1390 | "metadata": {},
1391 | "outputs": [
1392 | {
1393 | "data": {
1394 | "text/html": [
1395 | "\n",
1396 | "\n",
1409 | "
\n",
1410 | " \n",
1411 | " \n",
1412 | " | \n",
1413 | " b | \n",
1414 | " d | \n",
1415 | " e | \n",
1416 | "
\n",
1417 | " \n",
1418 | " \n",
1419 | " \n",
1420 | " Utah | \n",
1421 | " 0.0 | \n",
1422 | " 1.0 | \n",
1423 | " 2.0 | \n",
1424 | "
\n",
1425 | " \n",
1426 | " Ohio | \n",
1427 | " 3.0 | \n",
1428 | " 4.0 | \n",
1429 | " 5.0 | \n",
1430 | "
\n",
1431 | " \n",
1432 | " Texas | \n",
1433 | " 6.0 | \n",
1434 | " 7.0 | \n",
1435 | " 8.0 | \n",
1436 | "
\n",
1437 | " \n",
1438 | " Oregon | \n",
1439 | " 9.0 | \n",
1440 | " 10.0 | \n",
1441 | " 11.0 | \n",
1442 | "
\n",
1443 | " \n",
1444 | "
\n",
1445 | "
"
1446 | ],
1447 | "text/plain": [
1448 | " b d e\n",
1449 | "Utah 0.0 1.0 2.0\n",
1450 | "Ohio 3.0 4.0 5.0\n",
1451 | "Texas 6.0 7.0 8.0\n",
1452 | "Oregon 9.0 10.0 11.0"
1453 | ]
1454 | },
1455 | "execution_count": 25,
1456 | "metadata": {},
1457 | "output_type": "execute_result"
1458 | }
1459 | ],
1460 | "source": [
1461 | "df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),\n",
1462 | " index=['Utah', 'Ohio', 'Texas', 'Oregon'])\n",
1463 | "df2"
1464 | ]
1465 | },
1466 | {
1467 | "cell_type": "code",
1468 | "execution_count": 26,
1469 | "metadata": {},
1470 | "outputs": [
1471 | {
1472 | "data": {
1473 | "text/html": [
1474 | "\n",
1475 | "\n",
1488 | "
\n",
1489 | " \n",
1490 | " \n",
1491 | " | \n",
1492 | " b | \n",
1493 | " c | \n",
1494 | " d | \n",
1495 | " e | \n",
1496 | "
\n",
1497 | " \n",
1498 | " \n",
1499 | " \n",
1500 | " Colorado | \n",
1501 | " NaN | \n",
1502 | " NaN | \n",
1503 | " NaN | \n",
1504 | " NaN | \n",
1505 | "
\n",
1506 | " \n",
1507 | " Ohio | \n",
1508 | " 9.0 | \n",
1509 | " NaN | \n",
1510 | " 12.0 | \n",
1511 | " NaN | \n",
1512 | "
\n",
1513 | " \n",
1514 | " Oregon | \n",
1515 | " NaN | \n",
1516 | " NaN | \n",
1517 | " NaN | \n",
1518 | " NaN | \n",
1519 | "
\n",
1520 | " \n",
1521 | " Texas | \n",
1522 | " 9.0 | \n",
1523 | " NaN | \n",
1524 | " 12.0 | \n",
1525 | " NaN | \n",
1526 | "
\n",
1527 | " \n",
1528 | " Utah | \n",
1529 | " NaN | \n",
1530 | " NaN | \n",
1531 | " NaN | \n",
1532 | " NaN | \n",
1533 | "
\n",
1534 | " \n",
1535 | "
\n",
1536 | "
"
1537 | ],
1538 | "text/plain": [
1539 | " b c d e\n",
1540 | "Colorado NaN NaN NaN NaN\n",
1541 | "Ohio 9.0 NaN 12.0 NaN\n",
1542 | "Oregon NaN NaN NaN NaN\n",
1543 | "Texas 9.0 NaN 12.0 NaN\n",
1544 | "Utah NaN NaN NaN NaN"
1545 | ]
1546 | },
1547 | "execution_count": 26,
1548 | "metadata": {},
1549 | "output_type": "execute_result"
1550 | }
1551 | ],
1552 | "source": [
1553 | "df1 + df2"
1554 | ]
1555 | },
1556 | {
1557 | "cell_type": "markdown",
1558 | "metadata": {},
1559 | "source": [
1560 | "因为'c'和'e'列都不在两个DataFrame里,所有全是缺失值。对于行,即使有相同的,但列不一样的话也会是缺失值。"
1561 | ]
1562 | },
1563 | {
1564 | "cell_type": "markdown",
1565 | "metadata": {},
1566 | "source": [
1567 | "使用带填充值得方法:"
1568 | ]
1569 | },
1570 | {
1571 | "cell_type": "code",
1572 | "execution_count": 27,
1573 | "metadata": {},
1574 | "outputs": [
1575 | {
1576 | "data": {
1577 | "text/html": [
1578 | "\n",
1579 | "\n",
1592 | "
\n",
1593 | " \n",
1594 | " \n",
1595 | " | \n",
1596 | " a | \n",
1597 | " b | \n",
1598 | " c | \n",
1599 | " d | \n",
1600 | " e | \n",
1601 | "
\n",
1602 | " \n",
1603 | " \n",
1604 | " \n",
1605 | " 0 | \n",
1606 | " 0.0 | \n",
1607 | " 2.0 | \n",
1608 | " 4.0 | \n",
1609 | " 6.0 | \n",
1610 | " 4.0 | \n",
1611 | "
\n",
1612 | " \n",
1613 | " 1 | \n",
1614 | " 9.0 | \n",
1615 | " 5.0 | \n",
1616 | " 13.0 | \n",
1617 | " 15.0 | \n",
1618 | " 9.0 | \n",
1619 | "
\n",
1620 | " \n",
1621 | " 2 | \n",
1622 | " 18.0 | \n",
1623 | " 20.0 | \n",
1624 | " 22.0 | \n",
1625 | " 24.0 | \n",
1626 | " 14.0 | \n",
1627 | "
\n",
1628 | " \n",
1629 | " 3 | \n",
1630 | " 15.0 | \n",
1631 | " 16.0 | \n",
1632 | " 17.0 | \n",
1633 | " 18.0 | \n",
1634 | " 19.0 | \n",
1635 | "
\n",
1636 | " \n",
1637 | "
\n",
1638 | "
"
1639 | ],
1640 | "text/plain": [
1641 | " a b c d e\n",
1642 | "0 0.0 2.0 4.0 6.0 4.0\n",
1643 | "1 9.0 5.0 13.0 15.0 9.0\n",
1644 | "2 18.0 20.0 22.0 24.0 14.0\n",
1645 | "3 15.0 16.0 17.0 18.0 19.0"
1646 | ]
1647 | },
1648 | "execution_count": 27,
1649 | "metadata": {},
1650 | "output_type": "execute_result"
1651 | }
1652 | ],
1653 | "source": [
1654 | "df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)), \n",
1655 | " columns=list('abcd'))\n",
1656 | "\n",
1657 | "df2 = pd.DataFrame(np.arange(20.).reshape((4, 5)), \n",
1658 | " columns=list('abcde'))\n",
1659 | "df2.loc[1, 'b'] = np.nan\n",
1660 | "df1.add(df2, fill_value=0)"
1661 | ]
1662 | },
1663 | {
1664 | "cell_type": "markdown",
1665 | "metadata": {},
1666 | "source": [
1667 | "下表是这样的灵活算数方法:"
1668 | ]
1669 | },
1670 | {
1671 | "cell_type": "markdown",
1672 | "metadata": {},
1673 | "source": [
1674 | ""
1675 | ]
1676 | },
1677 | {
1678 | "cell_type": "markdown",
1679 | "metadata": {},
1680 | "source": [
1681 | "每一个都有一个配对的,以r开头,意思是反转。"
1682 | ]
1683 | },
1684 | {
1685 | "cell_type": "code",
1686 | "execution_count": 28,
1687 | "metadata": {},
1688 | "outputs": [
1689 | {
1690 | "data": {
1691 | "text/html": [
1692 | "\n",
1693 | "\n",
1706 | "
\n",
1707 | " \n",
1708 | " \n",
1709 | " | \n",
1710 | " a | \n",
1711 | " b | \n",
1712 | " c | \n",
1713 | " d | \n",
1714 | "
\n",
1715 | " \n",
1716 | " \n",
1717 | " \n",
1718 | " 0 | \n",
1719 | " inf | \n",
1720 | " 1.000000 | \n",
1721 | " 0.500000 | \n",
1722 | " 0.333333 | \n",
1723 | "
\n",
1724 | " \n",
1725 | " 1 | \n",
1726 | " 0.250000 | \n",
1727 | " 0.200000 | \n",
1728 | " 0.166667 | \n",
1729 | " 0.142857 | \n",
1730 | "
\n",
1731 | " \n",
1732 | " 2 | \n",
1733 | " 0.125000 | \n",
1734 | " 0.111111 | \n",
1735 | " 0.100000 | \n",
1736 | " 0.090909 | \n",
1737 | "
\n",
1738 | " \n",
1739 | "
\n",
1740 | "
"
1741 | ],
1742 | "text/plain": [
1743 | " a b c d\n",
1744 | "0 inf 1.000000 0.500000 0.333333\n",
1745 | "1 0.250000 0.200000 0.166667 0.142857\n",
1746 | "2 0.125000 0.111111 0.100000 0.090909"
1747 | ]
1748 | },
1749 | "execution_count": 28,
1750 | "metadata": {},
1751 | "output_type": "execute_result"
1752 | }
1753 | ],
1754 | "source": [
1755 | "1/df1"
1756 | ]
1757 | },
1758 | {
1759 | "cell_type": "code",
1760 | "execution_count": 29,
1761 | "metadata": {},
1762 | "outputs": [
1763 | {
1764 | "data": {
1765 | "text/html": [
1766 | "\n",
1767 | "\n",
1780 | "
\n",
1781 | " \n",
1782 | " \n",
1783 | " | \n",
1784 | " a | \n",
1785 | " b | \n",
1786 | " c | \n",
1787 | " d | \n",
1788 | "
\n",
1789 | " \n",
1790 | " \n",
1791 | " \n",
1792 | " 0 | \n",
1793 | " inf | \n",
1794 | " 1.000000 | \n",
1795 | " 0.500000 | \n",
1796 | " 0.333333 | \n",
1797 | "
\n",
1798 | " \n",
1799 | " 1 | \n",
1800 | " 0.250000 | \n",
1801 | " 0.200000 | \n",
1802 | " 0.166667 | \n",
1803 | " 0.142857 | \n",
1804 | "
\n",
1805 | " \n",
1806 | " 2 | \n",
1807 | " 0.125000 | \n",
1808 | " 0.111111 | \n",
1809 | " 0.100000 | \n",
1810 | " 0.090909 | \n",
1811 | "
\n",
1812 | " \n",
1813 | "
\n",
1814 | "
"
1815 | ],
1816 | "text/plain": [
1817 | " a b c d\n",
1818 | "0 inf 1.000000 0.500000 0.333333\n",
1819 | "1 0.250000 0.200000 0.166667 0.142857\n",
1820 | "2 0.125000 0.111111 0.100000 0.090909"
1821 | ]
1822 | },
1823 | "execution_count": 29,
1824 | "metadata": {},
1825 | "output_type": "execute_result"
1826 | }
1827 | ],
1828 | "source": [
1829 | "df1.rdiv(1)"
1830 | ]
1831 | },
1832 | {
1833 | "cell_type": "markdown",
1834 | "metadata": {},
1835 | "source": [
1836 | "在reindexing(重建索引)时,也可以使用fill_value"
1837 | ]
1838 | },
1839 | {
1840 | "cell_type": "code",
1841 | "execution_count": 30,
1842 | "metadata": {},
1843 | "outputs": [
1844 | {
1845 | "data": {
1846 | "text/html": [
1847 | "\n",
1848 | "\n",
1861 | "
\n",
1862 | " \n",
1863 | " \n",
1864 | " | \n",
1865 | " a | \n",
1866 | " b | \n",
1867 | " c | \n",
1868 | " d | \n",
1869 | " e | \n",
1870 | "
\n",
1871 | " \n",
1872 | " \n",
1873 | " \n",
1874 | " 0 | \n",
1875 | " 0.0 | \n",
1876 | " 1.0 | \n",
1877 | " 2.0 | \n",
1878 | " 3.0 | \n",
1879 | " 0 | \n",
1880 | "
\n",
1881 | " \n",
1882 | " 1 | \n",
1883 | " 4.0 | \n",
1884 | " 5.0 | \n",
1885 | " 6.0 | \n",
1886 | " 7.0 | \n",
1887 | " 0 | \n",
1888 | "
\n",
1889 | " \n",
1890 | " 2 | \n",
1891 | " 8.0 | \n",
1892 | " 9.0 | \n",
1893 | " 10.0 | \n",
1894 | " 11.0 | \n",
1895 | " 0 | \n",
1896 | "
\n",
1897 | " \n",
1898 | "
\n",
1899 | "
"
1900 | ],
1901 | "text/plain": [
1902 | " a b c d e\n",
1903 | "0 0.0 1.0 2.0 3.0 0\n",
1904 | "1 4.0 5.0 6.0 7.0 0\n",
1905 | "2 8.0 9.0 10.0 11.0 0"
1906 | ]
1907 | },
1908 | "execution_count": 30,
1909 | "metadata": {},
1910 | "output_type": "execute_result"
1911 | }
1912 | ],
1913 | "source": [
1914 | "df1.reindex(columns=df2.columns, fill_value=0)"
1915 | ]
1916 | },
1917 | {
1918 | "cell_type": "markdown",
1919 | "metadata": {},
1920 | "source": [
1921 | "#### DataFrame和Series之间的操作:"
1922 | ]
1923 | },
1924 | {
1925 | "cell_type": "markdown",
1926 | "metadata": {},
1927 | "source": [
1928 | "举一个numpy的例子:"
1929 | ]
1930 | },
1931 | {
1932 | "cell_type": "code",
1933 | "execution_count": 31,
1934 | "metadata": {
1935 | "collapsed": true
1936 | },
1937 | "outputs": [],
1938 | "source": [
1939 | "arr = np.arange(12.).reshape((3, 4))"
1940 | ]
1941 | },
1942 | {
1943 | "cell_type": "code",
1944 | "execution_count": 32,
1945 | "metadata": {},
1946 | "outputs": [
1947 | {
1948 | "data": {
1949 | "text/plain": [
1950 | "array([[ 0., 0., 0., 0.],\n",
1951 | " [ 4., 4., 4., 4.],\n",
1952 | " [ 8., 8., 8., 8.]])"
1953 | ]
1954 | },
1955 | "execution_count": 32,
1956 | "metadata": {},
1957 | "output_type": "execute_result"
1958 | }
1959 | ],
1960 | "source": [
1961 | "arr - arr[0]"
1962 | ]
1963 | },
1964 | {
1965 | "cell_type": "markdown",
1966 | "metadata": {},
1967 | "source": [
1968 | "减法用在了每一行上,这种操作叫做broadcating(广播)。"
1969 | ]
1970 | },
1971 | {
1972 | "cell_type": "code",
1973 | "execution_count": 33,
1974 | "metadata": {
1975 | "collapsed": true
1976 | },
1977 | "outputs": [],
1978 | "source": [
1979 | "frame = pd.DataFrame(np.arange(12.).reshape((4, 3)),\n",
1980 | " columns=list('bde'),\n",
1981 | " index=['Utah', 'Ohio', 'Texas', 'Oregon'])\n",
1982 | "series = frame.iloc[0]"
1983 | ]
1984 | },
1985 | {
1986 | "cell_type": "code",
1987 | "execution_count": 34,
1988 | "metadata": {},
1989 | "outputs": [
1990 | {
1991 | "data": {
1992 | "text/html": [
1993 | "\n",
1994 | "\n",
2007 | "
\n",
2008 | " \n",
2009 | " \n",
2010 | " | \n",
2011 | " b | \n",
2012 | " d | \n",
2013 | " e | \n",
2014 | "
\n",
2015 | " \n",
2016 | " \n",
2017 | " \n",
2018 | " Utah | \n",
2019 | " 0.0 | \n",
2020 | " 1.0 | \n",
2021 | " 2.0 | \n",
2022 | "
\n",
2023 | " \n",
2024 | " Ohio | \n",
2025 | " 3.0 | \n",
2026 | " 4.0 | \n",
2027 | " 5.0 | \n",
2028 | "
\n",
2029 | " \n",
2030 | " Texas | \n",
2031 | " 6.0 | \n",
2032 | " 7.0 | \n",
2033 | " 8.0 | \n",
2034 | "
\n",
2035 | " \n",
2036 | " Oregon | \n",
2037 | " 9.0 | \n",
2038 | " 10.0 | \n",
2039 | " 11.0 | \n",
2040 | "
\n",
2041 | " \n",
2042 | "
\n",
2043 | "
"
2044 | ],
2045 | "text/plain": [
2046 | " b d e\n",
2047 | "Utah 0.0 1.0 2.0\n",
2048 | "Ohio 3.0 4.0 5.0\n",
2049 | "Texas 6.0 7.0 8.0\n",
2050 | "Oregon 9.0 10.0 11.0"
2051 | ]
2052 | },
2053 | "execution_count": 34,
2054 | "metadata": {},
2055 | "output_type": "execute_result"
2056 | }
2057 | ],
2058 | "source": [
2059 | "frame"
2060 | ]
2061 | },
2062 | {
2063 | "cell_type": "code",
2064 | "execution_count": 35,
2065 | "metadata": {},
2066 | "outputs": [
2067 | {
2068 | "data": {
2069 | "text/plain": [
2070 | "b 0.0\n",
2071 | "d 1.0\n",
2072 | "e 2.0\n",
2073 | "Name: Utah, dtype: float64"
2074 | ]
2075 | },
2076 | "execution_count": 35,
2077 | "metadata": {},
2078 | "output_type": "execute_result"
2079 | }
2080 | ],
2081 | "source": [
2082 | "series"
2083 | ]
2084 | },
2085 | {
2086 | "cell_type": "markdown",
2087 | "metadata": {},
2088 | "source": [
2089 | "可以理解为Series和DataFrame的列匹配。\n",
2090 | "\n",
2091 | "Broadcasting down the rows(向下按行广播)"
2092 | ]
2093 | },
2094 | {
2095 | "cell_type": "code",
2096 | "execution_count": 36,
2097 | "metadata": {},
2098 | "outputs": [
2099 | {
2100 | "data": {
2101 | "text/html": [
2102 | "\n",
2103 | "\n",
2116 | "
\n",
2117 | " \n",
2118 | " \n",
2119 | " | \n",
2120 | " b | \n",
2121 | " d | \n",
2122 | " e | \n",
2123 | "
\n",
2124 | " \n",
2125 | " \n",
2126 | " \n",
2127 | " Utah | \n",
2128 | " 0.0 | \n",
2129 | " 0.0 | \n",
2130 | " 0.0 | \n",
2131 | "
\n",
2132 | " \n",
2133 | " Ohio | \n",
2134 | " 3.0 | \n",
2135 | " 3.0 | \n",
2136 | " 3.0 | \n",
2137 | "
\n",
2138 | " \n",
2139 | " Texas | \n",
2140 | " 6.0 | \n",
2141 | " 6.0 | \n",
2142 | " 6.0 | \n",
2143 | "
\n",
2144 | " \n",
2145 | " Oregon | \n",
2146 | " 9.0 | \n",
2147 | " 9.0 | \n",
2148 | " 9.0 | \n",
2149 | "
\n",
2150 | " \n",
2151 | "
\n",
2152 | "
"
2153 | ],
2154 | "text/plain": [
2155 | " b d e\n",
2156 | "Utah 0.0 0.0 0.0\n",
2157 | "Ohio 3.0 3.0 3.0\n",
2158 | "Texas 6.0 6.0 6.0\n",
2159 | "Oregon 9.0 9.0 9.0"
2160 | ]
2161 | },
2162 | "execution_count": 36,
2163 | "metadata": {},
2164 | "output_type": "execute_result"
2165 | }
2166 | ],
2167 | "source": [
2168 | "frame - series"
2169 | ]
2170 | },
2171 | {
2172 | "cell_type": "markdown",
2173 | "metadata": {},
2174 | "source": [
2175 | "如果Series和DataFrame有不同的index,那么相加结果也是合集:"
2176 | ]
2177 | },
2178 | {
2179 | "cell_type": "code",
2180 | "execution_count": 37,
2181 | "metadata": {},
2182 | "outputs": [
2183 | {
2184 | "data": {
2185 | "text/html": [
2186 | "\n",
2187 | "\n",
2200 | "
\n",
2201 | " \n",
2202 | " \n",
2203 | " | \n",
2204 | " b | \n",
2205 | " d | \n",
2206 | " e | \n",
2207 | " f | \n",
2208 | "
\n",
2209 | " \n",
2210 | " \n",
2211 | " \n",
2212 | " Utah | \n",
2213 | " 0.0 | \n",
2214 | " NaN | \n",
2215 | " 3.0 | \n",
2216 | " NaN | \n",
2217 | "
\n",
2218 | " \n",
2219 | " Ohio | \n",
2220 | " 3.0 | \n",
2221 | " NaN | \n",
2222 | " 6.0 | \n",
2223 | " NaN | \n",
2224 | "
\n",
2225 | " \n",
2226 | " Texas | \n",
2227 | " 6.0 | \n",
2228 | " NaN | \n",
2229 | " 9.0 | \n",
2230 | " NaN | \n",
2231 | "
\n",
2232 | " \n",
2233 | " Oregon | \n",
2234 | " 9.0 | \n",
2235 | " NaN | \n",
2236 | " 12.0 | \n",
2237 | " NaN | \n",
2238 | "
\n",
2239 | " \n",
2240 | "
\n",
2241 | "
"
2242 | ],
2243 | "text/plain": [
2244 | " b d e f\n",
2245 | "Utah 0.0 NaN 3.0 NaN\n",
2246 | "Ohio 3.0 NaN 6.0 NaN\n",
2247 | "Texas 6.0 NaN 9.0 NaN\n",
2248 | "Oregon 9.0 NaN 12.0 NaN"
2249 | ]
2250 | },
2251 | "execution_count": 37,
2252 | "metadata": {},
2253 | "output_type": "execute_result"
2254 | }
2255 | ],
2256 | "source": [
2257 | "series2 = pd.Series(range(3), index=['b', 'e', 'f'])\n",
2258 | "frame + series2"
2259 | ]
2260 | },
2261 | {
2262 | "cell_type": "markdown",
2263 | "metadata": {},
2264 | "source": [
2265 | "如果想要广播列,去匹配行,必须要用到算数方法:"
2266 | ]
2267 | },
2268 | {
2269 | "cell_type": "code",
2270 | "execution_count": 38,
2271 | "metadata": {
2272 | "collapsed": true
2273 | },
2274 | "outputs": [],
2275 | "source": [
2276 | "series = frame['d']"
2277 | ]
2278 | },
2279 | {
2280 | "cell_type": "code",
2281 | "execution_count": 39,
2282 | "metadata": {},
2283 | "outputs": [
2284 | {
2285 | "data": {
2286 | "text/html": [
2287 | "\n",
2288 | "\n",
2301 | "
\n",
2302 | " \n",
2303 | " \n",
2304 | " | \n",
2305 | " b | \n",
2306 | " d | \n",
2307 | " e | \n",
2308 | "
\n",
2309 | " \n",
2310 | " \n",
2311 | " \n",
2312 | " Utah | \n",
2313 | " -1.0 | \n",
2314 | " 0.0 | \n",
2315 | " 1.0 | \n",
2316 | "
\n",
2317 | " \n",
2318 | " Ohio | \n",
2319 | " -1.0 | \n",
2320 | " 0.0 | \n",
2321 | " 1.0 | \n",
2322 | "
\n",
2323 | " \n",
2324 | " Texas | \n",
2325 | " -1.0 | \n",
2326 | " 0.0 | \n",
2327 | " 1.0 | \n",
2328 | "
\n",
2329 | " \n",
2330 | " Oregon | \n",
2331 | " -1.0 | \n",
2332 | " 0.0 | \n",
2333 | " 1.0 | \n",
2334 | "
\n",
2335 | " \n",
2336 | "
\n",
2337 | "
"
2338 | ],
2339 | "text/plain": [
2340 | " b d e\n",
2341 | "Utah -1.0 0.0 1.0\n",
2342 | "Ohio -1.0 0.0 1.0\n",
2343 | "Texas -1.0 0.0 1.0\n",
2344 | "Oregon -1.0 0.0 1.0"
2345 | ]
2346 | },
2347 | "execution_count": 39,
2348 | "metadata": {},
2349 | "output_type": "execute_result"
2350 | }
2351 | ],
2352 | "source": [
2353 | "frame.sub(series, axis='index')"
2354 | ]
2355 | },
2356 | {
2357 | "cell_type": "markdown",
2358 | "metadata": {},
2359 | "source": [
2360 | "axis参数就是用来匹配轴的。在这个例子里是匹配dataframe的row index(axis='index' or axis=0),然后再广播。"
2361 | ]
2362 | },
2363 | {
2364 | "cell_type": "markdown",
2365 | "metadata": {},
2366 | "source": [
2367 | "## 4.函数应用和映射(Fuction Application and Mappong)"
2368 | ]
2369 | },
2370 | {
2371 | "cell_type": "markdown",
2372 | "metadata": {},
2373 | "source": [
2374 | "numpy的ufuncs(element-wise数组方法)也能用在pandas的object上:"
2375 | ]
2376 | },
2377 | {
2378 | "cell_type": "code",
2379 | "execution_count": 40,
2380 | "metadata": {},
2381 | "outputs": [
2382 | {
2383 | "data": {
2384 | "text/html": [
2385 | "\n",
2386 | "\n",
2399 | "
\n",
2400 | " \n",
2401 | " \n",
2402 | " | \n",
2403 | " b | \n",
2404 | " d | \n",
2405 | " e | \n",
2406 | "
\n",
2407 | " \n",
2408 | " \n",
2409 | " \n",
2410 | " Utah | \n",
2411 | " -1.326382 | \n",
2412 | " -0.690920 | \n",
2413 | " 0.121802 | \n",
2414 | "
\n",
2415 | " \n",
2416 | " Ohio | \n",
2417 | " 1.255100 | \n",
2418 | " 0.496809 | \n",
2419 | " 1.017018 | \n",
2420 | "
\n",
2421 | " \n",
2422 | " Texas | \n",
2423 | " 0.752331 | \n",
2424 | " -0.148764 | \n",
2425 | " -1.549744 | \n",
2426 | "
\n",
2427 | " \n",
2428 | " Oregon | \n",
2429 | " 1.063863 | \n",
2430 | " 0.208184 | \n",
2431 | " -1.328060 | \n",
2432 | "
\n",
2433 | " \n",
2434 | "
\n",
2435 | "
"
2436 | ],
2437 | "text/plain": [
2438 | " b d e\n",
2439 | "Utah -1.326382 -0.690920 0.121802\n",
2440 | "Ohio 1.255100 0.496809 1.017018\n",
2441 | "Texas 0.752331 -0.148764 -1.549744\n",
2442 | "Oregon 1.063863 0.208184 -1.328060"
2443 | ]
2444 | },
2445 | "execution_count": 40,
2446 | "metadata": {},
2447 | "output_type": "execute_result"
2448 | }
2449 | ],
2450 | "source": [
2451 | "frame = pd.DataFrame(np.random.randn(4, 3), columns=list('bde'), \n",
2452 | " index=['Utah', 'Ohio', 'Texas', 'Oregon'])\n",
2453 | "frame"
2454 | ]
2455 | },
2456 | {
2457 | "cell_type": "code",
2458 | "execution_count": 41,
2459 | "metadata": {},
2460 | "outputs": [
2461 | {
2462 | "data": {
2463 | "text/html": [
2464 | "\n",
2465 | "\n",
2478 | "
\n",
2479 | " \n",
2480 | " \n",
2481 | " | \n",
2482 | " b | \n",
2483 | " d | \n",
2484 | " e | \n",
2485 | "
\n",
2486 | " \n",
2487 | " \n",
2488 | " \n",
2489 | " Utah | \n",
2490 | " 1.326382 | \n",
2491 | " 0.690920 | \n",
2492 | " 0.121802 | \n",
2493 | "
\n",
2494 | " \n",
2495 | " Ohio | \n",
2496 | " 1.255100 | \n",
2497 | " 0.496809 | \n",
2498 | " 1.017018 | \n",
2499 | "
\n",
2500 | " \n",
2501 | " Texas | \n",
2502 | " 0.752331 | \n",
2503 | " 0.148764 | \n",
2504 | " 1.549744 | \n",
2505 | "
\n",
2506 | " \n",
2507 | " Oregon | \n",
2508 | " 1.063863 | \n",
2509 | " 0.208184 | \n",
2510 | " 1.328060 | \n",
2511 | "
\n",
2512 | " \n",
2513 | "
\n",
2514 | "
"
2515 | ],
2516 | "text/plain": [
2517 | " b d e\n",
2518 | "Utah 1.326382 0.690920 0.121802\n",
2519 | "Ohio 1.255100 0.496809 1.017018\n",
2520 | "Texas 0.752331 0.148764 1.549744\n",
2521 | "Oregon 1.063863 0.208184 1.328060"
2522 | ]
2523 | },
2524 | "execution_count": 41,
2525 | "metadata": {},
2526 | "output_type": "execute_result"
2527 | }
2528 | ],
2529 | "source": [
2530 | "np.abs(frame)"
2531 | ]
2532 | },
2533 | {
2534 | "cell_type": "markdown",
2535 | "metadata": {},
2536 | "source": [
2537 | "此外,可以把一个用在一维数组上的函数应用在一行或者一列上。\n",
2538 | "\n",
2539 | "用到DataFrame的apply函数:"
2540 | ]
2541 | },
2542 | {
2543 | "cell_type": "code",
2544 | "execution_count": 42,
2545 | "metadata": {},
2546 | "outputs": [
2547 | {
2548 | "data": {
2549 | "text/plain": [
2550 | "b 2.581482\n",
2551 | "d 1.187729\n",
2552 | "e 2.566762\n",
2553 | "dtype: float64"
2554 | ]
2555 | },
2556 | "execution_count": 42,
2557 | "metadata": {},
2558 | "output_type": "execute_result"
2559 | }
2560 | ],
2561 | "source": [
2562 | "f = lambda x: x.max()-x.min()\n",
2563 | "frame.apply(f)"
2564 | ]
2565 | },
2566 | {
2567 | "cell_type": "markdown",
2568 | "metadata": {},
2569 | "source": [
2570 | "这里函数f,计算的是一个series中最大值和最小值的差,在frame中的每一列,这个函数被调用一次。作为结果的Series,它的index就是frame的column。\n",
2571 | "\n",
2572 | "如果你传入axis='column'用于apply,那么函数会被用在每一行。\n",
2573 | "\n",
2574 | "apply不会返回标量,只会返回一个含有多个值的Series:"
2575 | ]
2576 | },
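  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A minimal sketch of the row-wise case (not executed here; it reuses the lambda f defined above):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Matching on the columns axis invokes f once per row;\n",
    "# the result is a Series indexed by frame's row labels.\n",
    "frame.apply(f, axis='columns')"
   ]
  },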
2577 | {
2578 | "cell_type": "code",
2579 | "execution_count": 43,
2580 | "metadata": {
2581 | "collapsed": true
2582 | },
2583 | "outputs": [],
2584 | "source": [
2585 | "def f(x):\n",
2586 | " return pd.Series([x.min, x.max], index=['min','max'])"
2587 | ]
2588 | },
2589 | {
2590 | "cell_type": "code",
2591 | "execution_count": 44,
2592 | "metadata": {},
2593 | "outputs": [
2594 | {
2595 | "data": {
2596 | "text/html": [
2597 | "\n",
2598 | "\n",
2611 | "
\n",
2612 | " \n",
2613 | " \n",
2614 | " | \n",
2615 | " b | \n",
2616 | " d | \n",
2617 | " e | \n",
2618 | "
\n",
2619 | " \n",
2620 | " \n",
2621 | " \n",
2622 | " min | \n",
2623 | " <bound method Series.min of Utah -1.326382... | \n",
2624 | " <bound method Series.min of Utah -0.690920... | \n",
2625 | " <bound method Series.min of Utah 0.121802... | \n",
2626 | "
\n",
2627 | " \n",
2628 | " max | \n",
2629 | " <bound method Series.max of Utah -1.326382... | \n",
2630 | " <bound method Series.max of Utah -0.690920... | \n",
2631 | " <bound method Series.max of Utah 0.121802... | \n",
2632 | "
\n",
2633 | " \n",
2634 | "
\n",
2635 | "
"
2636 | ],
2637 | "text/plain": [
2638 | " b \\\n",
2639 | "min \n",
2676 | "\n",
2689 | "\n",
2690 | " \n",
2691 | " \n",
2692 | " | \n",
2693 | " b | \n",
2694 | " d | \n",
2695 | " e | \n",
2696 | "
\n",
2697 | " \n",
2698 | " \n",
2699 | " \n",
2700 | " Utah | \n",
2701 | " -1.326382 | \n",
2702 | " -0.690920 | \n",
2703 | " 0.121802 | \n",
2704 | "
\n",
2705 | " \n",
2706 | " Ohio | \n",
2707 | " 1.255100 | \n",
2708 | " 0.496809 | \n",
2709 | " 1.017018 | \n",
2710 | "
\n",
2711 | " \n",
2712 | " Texas | \n",
2713 | " 0.752331 | \n",
2714 | " -0.148764 | \n",
2715 | " -1.549744 | \n",
2716 | "
\n",
2717 | " \n",
2718 | " Oregon | \n",
2719 | " 1.063863 | \n",
2720 | " 0.208184 | \n",
2721 | " -1.328060 | \n",
2722 | "
\n",
2723 | " \n",
2724 | "
\n",
2725 | ""
2726 | ],
2727 | "text/plain": [
2728 | " b d e\n",
2729 | "Utah -1.326382 -0.690920 0.121802\n",
2730 | "Ohio 1.255100 0.496809 1.017018\n",
2731 | "Texas 0.752331 -0.148764 -1.549744\n",
2732 | "Oregon 1.063863 0.208184 -1.328060"
2733 | ]
2734 | },
2735 | "execution_count": 45,
2736 | "metadata": {},
2737 | "output_type": "execute_result"
2738 | }
2739 | ],
2740 | "source": [
2741 | "format = lambda x:'%2f'%x\n",
2742 | "frame.applymap(format)"
2743 | ]
2744 | },
2745 | {
2746 | "cell_type": "markdown",
2747 | "metadata": {},
2748 | "source": [
2749 | "applymap的做法是,Series有一个map函数,用来实现element-wise函数:"
2750 | ]
2751 | },
2752 | {
2753 | "cell_type": "code",
2754 | "execution_count": 46,
2755 | "metadata": {},
2756 | "outputs": [
2757 | {
2758 | "data": {
2759 | "text/plain": [
2760 | "Utah 0.121802\n",
2761 | "Ohio 1.017018\n",
2762 | "Texas -1.549744\n",
2763 | "Oregon -1.328060\n",
2764 | "Name: e, dtype: object"
2765 | ]
2766 | },
2767 | "execution_count": 46,
2768 | "metadata": {},
2769 | "output_type": "execute_result"
2770 | }
2771 | ],
2772 | "source": [
2773 | "frame['e'].map(format)"
2774 | ]
2775 | },
2776 | {
2777 | "cell_type": "markdown",
2778 | "metadata": {},
2779 | "source": [
2780 | "## 5.排序(Sorting and Ranking)"
2781 | ]
2782 | },
2783 | {
2784 | "cell_type": "markdown",
2785 | "metadata": {},
2786 | "source": [
2787 | "按row或column index来排序的话,可以用sort_index方法,按照某个axis来排序,并且会返回一个新的object:"
2788 | ]
2789 | },
2790 | {
2791 | "cell_type": "code",
2792 | "execution_count": 47,
2793 | "metadata": {},
2794 | "outputs": [
2795 | {
2796 | "data": {
2797 | "text/html": [
2798 | "\n",
2799 | "\n",
2812 | "
\n",
2813 | " \n",
2814 | " \n",
2815 | " | \n",
2816 | " d | \n",
2817 | " a | \n",
2818 | " b | \n",
2819 | " c | \n",
2820 | "
\n",
2821 | " \n",
2822 | " \n",
2823 | " \n",
2824 | " three | \n",
2825 | " 0 | \n",
2826 | " 1 | \n",
2827 | " 2 | \n",
2828 | " 3 | \n",
2829 | "
\n",
2830 | " \n",
2831 | " one | \n",
2832 | " 4 | \n",
2833 | " 5 | \n",
2834 | " 6 | \n",
2835 | " 7 | \n",
2836 | "
\n",
2837 | " \n",
2838 | "
\n",
2839 | "
"
2840 | ],
2841 | "text/plain": [
2842 | " d a b c\n",
2843 | "three 0 1 2 3\n",
2844 | "one 4 5 6 7"
2845 | ]
2846 | },
2847 | "execution_count": 47,
2848 | "metadata": {},
2849 | "output_type": "execute_result"
2850 | }
2851 | ],
2852 | "source": [
2853 | "frame = pd.DataFrame(np.arange(8).reshape((2, 4)),\n",
2854 | " index=['three', 'one'],\n",
2855 | " columns=['d', 'a', 'b', 'c'])\n",
2856 | "frame"
2857 | ]
2858 | },
2859 | {
2860 | "cell_type": "code",
2861 | "execution_count": 48,
2862 | "metadata": {},
2863 | "outputs": [
2864 | {
2865 | "data": {
2866 | "text/html": [
2867 | "\n",
2868 | "\n",
2881 | "
\n",
2882 | " \n",
2883 | " \n",
2884 | " | \n",
2885 | " d | \n",
2886 | " a | \n",
2887 | " b | \n",
2888 | " c | \n",
2889 | "
\n",
2890 | " \n",
2891 | " \n",
2892 | " \n",
2893 | " one | \n",
2894 | " 4 | \n",
2895 | " 5 | \n",
2896 | " 6 | \n",
2897 | " 7 | \n",
2898 | "
\n",
2899 | " \n",
2900 | " three | \n",
2901 | " 0 | \n",
2902 | " 1 | \n",
2903 | " 2 | \n",
2904 | " 3 | \n",
2905 | "
\n",
2906 | " \n",
2907 | "
\n",
2908 | "
"
2909 | ],
2910 | "text/plain": [
2911 | " d a b c\n",
2912 | "one 4 5 6 7\n",
2913 | "three 0 1 2 3"
2914 | ]
2915 | },
2916 | "execution_count": 48,
2917 | "metadata": {},
2918 | "output_type": "execute_result"
2919 | }
2920 | ],
2921 | "source": [
2922 | "frame.sort_index()"
2923 | ]
2924 | },
2925 | {
2926 | "cell_type": "code",
2927 | "execution_count": 49,
2928 | "metadata": {},
2929 | "outputs": [
2930 | {
2931 | "data": {
2932 | "text/html": [
2933 | "\n",
2934 | "\n",
2947 | "
\n",
2948 | " \n",
2949 | " \n",
2950 | " | \n",
2951 | " a | \n",
2952 | " b | \n",
2953 | " c | \n",
2954 | " d | \n",
2955 | "
\n",
2956 | " \n",
2957 | " \n",
2958 | " \n",
2959 | " three | \n",
2960 | " 1 | \n",
2961 | " 2 | \n",
2962 | " 3 | \n",
2963 | " 0 | \n",
2964 | "
\n",
2965 | " \n",
2966 | " one | \n",
2967 | " 5 | \n",
2968 | " 6 | \n",
2969 | " 7 | \n",
2970 | " 4 | \n",
2971 | "
\n",
2972 | " \n",
2973 | "
\n",
2974 | "
"
2975 | ],
2976 | "text/plain": [
2977 | " a b c d\n",
2978 | "three 1 2 3 0\n",
2979 | "one 5 6 7 4"
2980 | ]
2981 | },
2982 | "execution_count": 49,
2983 | "metadata": {},
2984 | "output_type": "execute_result"
2985 | }
2986 | ],
2987 | "source": [
2988 | "frame.sort_index(axis=1)"
2989 | ]
2990 | },
2991 | {
2992 | "cell_type": "code",
2993 | "execution_count": 50,
2994 | "metadata": {},
2995 | "outputs": [
2996 | {
2997 | "data": {
2998 | "text/html": [
2999 | "\n",
3000 | "\n",
3013 | "
\n",
3014 | " \n",
3015 | " \n",
3016 | " | \n",
3017 | " d | \n",
3018 | " a | \n",
3019 | " b | \n",
3020 | " c | \n",
3021 | "
\n",
3022 | " \n",
3023 | " \n",
3024 | " \n",
3025 | " three | \n",
3026 | " 0 | \n",
3027 | " 1 | \n",
3028 | " 2 | \n",
3029 | " 3 | \n",
3030 | "
\n",
3031 | " \n",
3032 | " one | \n",
3033 | " 4 | \n",
3034 | " 5 | \n",
3035 | " 6 | \n",
3036 | " 7 | \n",
3037 | "
\n",
3038 | " \n",
3039 | "
\n",
3040 | "
"
3041 | ],
3042 | "text/plain": [
3043 | " d a b c\n",
3044 | "three 0 1 2 3\n",
3045 | "one 4 5 6 7"
3046 | ]
3047 | },
3048 | "execution_count": 50,
3049 | "metadata": {},
3050 | "output_type": "execute_result"
3051 | }
3052 | ],
3053 | "source": [
3054 | "frame.sort_index(axis=0, ascending=False)"
3055 | ]
3056 | },
3057 | {
3058 | "cell_type": "markdown",
3059 | "metadata": {},
3060 | "source": [
3061 | "通过值来排序,使用sort_values方法:(缺失值会被排在最后)"
3062 | ]
3063 | },
3064 | {
3065 | "cell_type": "code",
3066 | "execution_count": 51,
3067 | "metadata": {},
3068 | "outputs": [
3069 | {
3070 | "data": {
3071 | "text/plain": [
3072 | "2 -3.0\n",
3073 | "3 2.0\n",
3074 | "0 4.0\n",
3075 | "1 NaN\n",
3076 | "dtype: float64"
3077 | ]
3078 | },
3079 | "execution_count": 51,
3080 | "metadata": {},
3081 | "output_type": "execute_result"
3082 | }
3083 | ],
3084 | "source": [
3085 | "obj = pd.Series([4, np.nan, -3, 2])\n",
3086 | "obj.sort_values()"
3087 | ]
3088 | },
3089 | {
3090 | "cell_type": "code",
3091 | "execution_count": 52,
3092 | "metadata": {},
3093 | "outputs": [
3094 | {
3095 | "data": {
3096 | "text/html": [
3097 | "\n",
3098 | "\n",
3111 | "
\n",
3112 | " \n",
3113 | " \n",
3114 | " | \n",
3115 | " d | \n",
3116 | " a | \n",
3117 | " b | \n",
3118 | " c | \n",
3119 | "
\n",
3120 | " \n",
3121 | " \n",
3122 | " \n",
3123 | " three | \n",
3124 | " 0 | \n",
3125 | " 1 | \n",
3126 | " 2 | \n",
3127 | " 3 | \n",
3128 | "
\n",
3129 | " \n",
3130 | " one | \n",
3131 | " 4 | \n",
3132 | " 5 | \n",
3133 | " 6 | \n",
3134 | " 7 | \n",
3135 | "
\n",
3136 | " \n",
3137 | "
\n",
3138 | "
"
3139 | ],
3140 | "text/plain": [
3141 | " d a b c\n",
3142 | "three 0 1 2 3\n",
3143 | "one 4 5 6 7"
3144 | ]
3145 | },
3146 | "execution_count": 52,
3147 | "metadata": {},
3148 | "output_type": "execute_result"
3149 | }
3150 | ],
3151 | "source": [
3152 | "frame.sort_values(by=['a', 'b'])"
3153 | ]
3154 | },
3155 | {
3156 | "cell_type": "markdown",
3157 | "metadata": {},
3158 | "source": [
3159 | "rank(略)"
3160 | ]
3161 | },
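  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A minimal sketch of rank (not executed here; obj2 and its values are made up for illustration):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# rank assigns ranks 1 through n; by default ties receive the mean of their ranks.\n",
    "obj2 = pd.Series([7, -5, 7, 4, 2, 0, 4])\n",
    "print(obj2.rank())                # the two 7s share rank (6 + 7) / 2 = 6.5\n",
    "print(obj2.rank(method='first'))  # ties broken by order of appearance instead"
   ]
  },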
3162 | {
3163 | "cell_type": "markdown",
3164 | "metadata": {},
3165 | "source": [
3166 | "## 6.有重复label的轴索引(Axis Indexes with Duplicate Labels)"
3167 | ]
3168 | },
3169 | {
3170 | "cell_type": "markdown",
3171 | "metadata": {},
3172 | "source": [
3173 | "有一些有重复索引:"
3174 | ]
3175 | },
3176 | {
3177 | "cell_type": "code",
3178 | "execution_count": 53,
3179 | "metadata": {},
3180 | "outputs": [
3181 | {
3182 | "data": {
3183 | "text/plain": [
3184 | "a 0\n",
3185 | "a 1\n",
3186 | "b 2\n",
3187 | "b 3\n",
3188 | "c 4\n",
3189 | "dtype: int32"
3190 | ]
3191 | },
3192 | "execution_count": 53,
3193 | "metadata": {},
3194 | "output_type": "execute_result"
3195 | }
3196 | ],
3197 | "source": [
3198 | "obj = pd.Series(range(5), index=['a', 'a', 'b', 'b', 'c'])\n",
3199 | "obj"
3200 | ]
3201 | },
3202 | {
3203 | "cell_type": "code",
3204 | "execution_count": 54,
3205 | "metadata": {},
3206 | "outputs": [
3207 | {
3208 | "data": {
3209 | "text/plain": [
3210 | "False"
3211 | ]
3212 | },
3213 | "execution_count": 54,
3214 | "metadata": {},
3215 | "output_type": "execute_result"
3216 | }
3217 | ],
3218 | "source": [
3219 | "obj.index.is_unique"
3220 | ]
3221 | },
3222 | {
3223 | "cell_type": "markdown",
3224 | "metadata": {},
3225 | "source": [
3226 | "数据选择时,对于Series,如果一个label有多个值,返回一个Series,反之返回一个标量。\n",
3227 | " 对于DataFrame,如果一个label有多行/列,返回一个DataFrame。"
3228 | ]
3229 | },
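  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A sketch of the DataFrame case (not executed here; df is a made-up example):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The label 'b' matches two rows, so selection returns a DataFrame rather than a Series.\n",
    "df = pd.DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b'])\n",
    "df.loc['b']"
   ]
  },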
3230 | {
3231 | "cell_type": "code",
3232 | "execution_count": 55,
3233 | "metadata": {},
3234 | "outputs": [
3235 | {
3236 | "data": {
3237 | "text/plain": [
3238 | "a 0\n",
3239 | "a 1\n",
3240 | "dtype: int32"
3241 | ]
3242 | },
3243 | "execution_count": 55,
3244 | "metadata": {},
3245 | "output_type": "execute_result"
3246 | }
3247 | ],
3248 | "source": [
3249 | "obj['a']"
3250 | ]
3251 | }
3252 | ],
3253 | "metadata": {
3254 | "kernelspec": {
3255 | "display_name": "Python 3",
3256 | "language": "python",
3257 | "name": "python3"
3258 | },
3259 | "language_info": {
3260 | "codemirror_mode": {
3261 | "name": "ipython",
3262 | "version": 3
3263 | },
3264 | "file_extension": ".py",
3265 | "mimetype": "text/x-python",
3266 | "name": "python",
3267 | "nbconvert_exporter": "python",
3268 | "pygments_lexer": "ipython3",
3269 | "version": "3.6.3"
3270 | },
3271 | "widgets": {
3272 | "application/vnd.jupyter.widget-state+json": {
3273 | "state": {},
3274 | "version_major": 2,
3275 | "version_minor": 0
3276 | }
3277 | }
3278 | },
3279 | "nbformat": 4,
3280 | "nbformat_minor": 2
3281 | }
3282 |
--------------------------------------------------------------------------------
/tools/pandas_test.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import pandas as pd
4 |
5 | s = pd.Series([1, 2, 3], index=['a', 'b', 'c'])  # labeled 1-D array
6 | d = pd.DataFrame([[1, 2, 3], [3, 4, 5]], columns=['a', 'b', 'c'])  # 2-D labeled table
7 | ds = pd.DataFrame(s)  # a Series promoted to a one-column DataFrame
8 |
9 | print(d.head())
10 | print(d.describe())
11 |
12 | print(s)
13 |
--------------------------------------------------------------------------------