├── _config.yml ├── 25&26-ConditionalFormatting ├── .gitignore ├── Students.xlsx ├── ConditionalFormatting02.py ├── ConditionalFormatting01.py ├── ConditionalFormatting02.ipynb └── ConditionalFormatting01.ipynb ├── 7-Sequence ├── List.xlsx └── Sequence.py ├── 2-ReadExcel ├── People.xlsx └── ReadExcel.py ├── 23-GroupBy ├── Orders.xlsx └── GroupBy.py ├── 9-Histogram ├── Figure_1.png ├── Students.xlsx └── Histogram.py ├── 1-CreateExcel ├── output.xlsx └── CreateExcel.py ├── 12-PieChart ├── Students.xlsx └── PieChart.py ├── 16-Join ├── Student_Score.xlsx └── Join.py ├── 6-InputFunction ├── Books.xlsx └── InputFunction.py ├── 19-Statistics ├── Students.xlsx └── Statistics.py ├── 21-RotateDataSet ├── Videos.xlsx └── RotateDataSet.py ├── 24-DataPrediction ├── Sales.xlsx └── DataPrediction.py ├── 27-RowOperation ├── Students.xlsx └── RowOperation.py ├── 28-ColOperation ├── Students.xlsx └── ColOperation.py ├── 8-DataFiltering ├── Students.xlsx └── DataFiltering.py ├── 17-DataValidation ├── Students.xlsx └── DataValidation.py ├── 18-DataSegmentation ├── Employees.xlsx └── DataSegmentation.py ├── 4&5-ReadData&BaseInput ├── Books.xlsx ├── Books_output.xlsx └── ReadData&BaseInput.py ├── 20-DuplicateData ├── Students_Duplicates.xlsx └── DuplicateData.py ├── 30-WritingComplexEquations ├── Rectangles.xlsx └── WritingComplexEquations.py ├── 13-PolylineTrendChart&OverlayAreaMap ├── Orders.xlsx └── PolylineTrendChart&OverlayAreaMap.py ├── 14&15-ScatterPlot&Histogram&DensityMap ├── home_data.xlsx └── ScatterPlot&Histogram&DensityMap.py ├── 10-GroupedHistogran&DepthOptimizationChart ├── Students.xlsx └── GroupedHistogran&DepthOptimizationChart.py ├── 11-SuperimposedHistogram&HorizontalHistogram ├── Users.xlsx └── SuperimposedHistogram&HorizontalHistogram.py ├── .editorconfig ├── .idea ├── vcs.xml ├── misc.xml ├── modules.xml ├── PandasVersusExcel.iml └── workspace.xml ├── 22-ReadData ├── Students.csv ├── Students.tsv ├── Students.txt └── ReadData.py ├── 3-Rows&Clumns&Cell └── 
Rows&Clumns&Cell.py ├── 29-ReadDataBase └── ReadDataBase.py ├── .github └── workflows │ └── blank.yml └── README.md /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-cayman -------------------------------------------------------------------------------- /25&26-ConditionalFormatting/.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints 2 | .ipynb_checkpoints\* -------------------------------------------------------------------------------- /7-Sequence/List.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChanMenglin/PandasVersusExcel/HEAD/7-Sequence/List.xlsx -------------------------------------------------------------------------------- /2-ReadExcel/People.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChanMenglin/PandasVersusExcel/HEAD/2-ReadExcel/People.xlsx -------------------------------------------------------------------------------- /23-GroupBy/Orders.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChanMenglin/PandasVersusExcel/HEAD/23-GroupBy/Orders.xlsx -------------------------------------------------------------------------------- /9-Histogram/Figure_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChanMenglin/PandasVersusExcel/HEAD/9-Histogram/Figure_1.png -------------------------------------------------------------------------------- /1-CreateExcel/output.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChanMenglin/PandasVersusExcel/HEAD/1-CreateExcel/output.xlsx -------------------------------------------------------------------------------- 
/12-PieChart/Students.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChanMenglin/PandasVersusExcel/HEAD/12-PieChart/Students.xlsx -------------------------------------------------------------------------------- /16-Join/Student_Score.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChanMenglin/PandasVersusExcel/HEAD/16-Join/Student_Score.xlsx -------------------------------------------------------------------------------- /6-InputFunction/Books.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChanMenglin/PandasVersusExcel/HEAD/6-InputFunction/Books.xlsx -------------------------------------------------------------------------------- /9-Histogram/Students.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChanMenglin/PandasVersusExcel/HEAD/9-Histogram/Students.xlsx -------------------------------------------------------------------------------- /19-Statistics/Students.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChanMenglin/PandasVersusExcel/HEAD/19-Statistics/Students.xlsx -------------------------------------------------------------------------------- /21-RotateDataSet/Videos.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChanMenglin/PandasVersusExcel/HEAD/21-RotateDataSet/Videos.xlsx -------------------------------------------------------------------------------- /24-DataPrediction/Sales.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChanMenglin/PandasVersusExcel/HEAD/24-DataPrediction/Sales.xlsx 
-------------------------------------------------------------------------------- /27-RowOperation/Students.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChanMenglin/PandasVersusExcel/HEAD/27-RowOperation/Students.xlsx -------------------------------------------------------------------------------- /28-ColOperation/Students.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChanMenglin/PandasVersusExcel/HEAD/28-ColOperation/Students.xlsx -------------------------------------------------------------------------------- /8-DataFiltering/Students.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChanMenglin/PandasVersusExcel/HEAD/8-DataFiltering/Students.xlsx -------------------------------------------------------------------------------- /17-DataValidation/Students.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChanMenglin/PandasVersusExcel/HEAD/17-DataValidation/Students.xlsx -------------------------------------------------------------------------------- /18-DataSegmentation/Employees.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChanMenglin/PandasVersusExcel/HEAD/18-DataSegmentation/Employees.xlsx -------------------------------------------------------------------------------- /4&5-ReadData&BaseInput/Books.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChanMenglin/PandasVersusExcel/HEAD/4&5-ReadData&BaseInput/Books.xlsx -------------------------------------------------------------------------------- /20-DuplicateData/Students_Duplicates.xlsx: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ChanMenglin/PandasVersusExcel/HEAD/20-DuplicateData/Students_Duplicates.xlsx -------------------------------------------------------------------------------- /25&26-ConditionalFormatting/Students.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChanMenglin/PandasVersusExcel/HEAD/25&26-ConditionalFormatting/Students.xlsx -------------------------------------------------------------------------------- /4&5-ReadData&BaseInput/Books_output.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChanMenglin/PandasVersusExcel/HEAD/4&5-ReadData&BaseInput/Books_output.xlsx -------------------------------------------------------------------------------- /30-WritingComplexEquations/Rectangles.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChanMenglin/PandasVersusExcel/HEAD/30-WritingComplexEquations/Rectangles.xlsx -------------------------------------------------------------------------------- /13-PolylineTrendChart&OverlayAreaMap/Orders.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChanMenglin/PandasVersusExcel/HEAD/13-PolylineTrendChart&OverlayAreaMap/Orders.xlsx -------------------------------------------------------------------------------- /14&15-ScatterPlot&Histogram&DensityMap/home_data.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChanMenglin/PandasVersusExcel/HEAD/14&15-ScatterPlot&Histogram&DensityMap/home_data.xlsx -------------------------------------------------------------------------------- /10-GroupedHistogran&DepthOptimizationChart/Students.xlsx: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ChanMenglin/PandasVersusExcel/HEAD/10-GroupedHistogran&DepthOptimizationChart/Students.xlsx -------------------------------------------------------------------------------- /11-SuperimposedHistogram&HorizontalHistogram/Users.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChanMenglin/PandasVersusExcel/HEAD/11-SuperimposedHistogram&HorizontalHistogram/Users.xlsx -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | indent_style = space 5 | indent_size = 4 6 | end_of_line = lf 7 | charset = utf-8 8 | trim_trailing_whitespace = false 9 | insert_final_newline = false -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /21-RotateDataSet/RotateDataSet.py: -------------------------------------------------------------------------------- 1 | # pandasVersusExcel 2 | # http://sa.mentorx.net/course/89/tasks 3 | # 第二十一课 定位、旋转数据表(行/列转换) 4 | # 2018-10-25 5 | 6 | import pandas as pd 7 | 8 | # 设置最大显示列数为20 9 | pd.options.display.max_columns=20 10 | video = pd.read_excel('./Videos.xlsx',index_col='Month') 11 | print('----原始数据----') 12 | print(video) 13 | print(video.columns) 14 | 15 | table = 
video.transpose() 16 | print('\n----行列转换的结果----') 17 | print(table) -------------------------------------------------------------------------------- /1-CreateExcel/CreateExcel.py: -------------------------------------------------------------------------------- 1 | # pandasVersusExcel 2 | # http://sa.mentorx.net/course/89/tasks 3 | # 第一课 创建文件 4 | # 2018-10-17 5 | 6 | import pandas as pd 7 | 8 | df = pd.DataFrame({'ID':[0,1,2],'Name':['Mark','Tomi','Jack']}) 9 | 10 | # 此处设置数据表的索引,如未设置索引会自动在最前方添加一列作为索引 11 | df = df.set_index('ID') # 会生成新的 DataFrame 12 | # df.set_index('ID',inplace=True) # 在原来的 DataFrame 上进行修改 13 | 14 | # 将数据报错到 output.xlsx 15 | df.to_excel('./output.xlsx') 16 | 17 | print(df) 18 | print('Done') -------------------------------------------------------------------------------- /22-ReadData/Students.csv: -------------------------------------------------------------------------------- 1 | ID,Name,Age 2 | 1,Student_001,21 3 | 2,Student_002,22 4 | 3,Student_003,23 5 | 4,Student_004,24 6 | 5,Student_005,25 7 | 6,Student_006,26 8 | 7,Student_007,27 9 | 8,Student_008,28 10 | 9,Student_009,29 11 | 10,Student_010,30 12 | 11,Student_011,31 13 | 12,Student_012,32 14 | 13,Student_013,33 15 | 14,Student_014,34 16 | 15,Student_015,35 17 | 16,Student_016,36 18 | 17,Student_017,37 19 | 18,Student_018,38 20 | 19,Student_019,39 21 | 20,Student_020,40 22 | -------------------------------------------------------------------------------- /22-ReadData/Students.tsv: -------------------------------------------------------------------------------- 1 | ID Name Age 2 | 1 Student_001 21 3 | 2 Student_002 22 4 | 3 Student_003 23 5 | 4 Student_004 24 6 | 5 Student_005 25 7 | 6 Student_006 26 8 | 7 Student_007 27 9 | 8 Student_008 28 10 | 9 Student_009 29 11 | 10 Student_010 30 12 | 11 Student_011 31 13 | 12 Student_012 32 14 | 13 Student_013 33 15 | 14 Student_014 34 16 | 15 Student_015 35 17 | 16 Student_016 36 18 | 17 Student_017 37 19 | 18 Student_018 38 20 | 19 Student_019 39 21 
| 20 Student_020 40 22 | -------------------------------------------------------------------------------- /22-ReadData/Students.txt: -------------------------------------------------------------------------------- 1 | ID|Name|Age 2 | 1|Student_001|21 3 | 2|Student_002|22 4 | 3|Student_003|23 5 | 4|Student_004|24 6 | 5|Student_005|25 7 | 6|Student_006|26 8 | 7|Student_007|27 9 | 8|Student_008|28 10 | 9|Student_009|29 11 | 10|Student_010|30 12 | 11|Student_011|31 13 | 12|Student_012|32 14 | 13|Student_013|33 15 | 14|Student_014|34 16 | 15|Student_015|35 17 | 16|Student_016|36 18 | 17|Student_017|37 19 | 18|Student_018|38 20 | 19|Student_019|39 21 | 20|Student_020|40 22 | -------------------------------------------------------------------------------- /.idea/PandasVersusExcel.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 11 | -------------------------------------------------------------------------------- /12-PieChart/PieChart.py: -------------------------------------------------------------------------------- 1 | # pandasVersusExcel 2 | # http://sa.mentorx.net/course/89/tasks 3 | # 第十二课 绘制饼图 4 | # 2018-10-19 5 | 6 | import pandas as pd 7 | import matplotlib.pyplot as plt 8 | 9 | students = pd.read_excel('./Students.xlsx',index_col="From") 10 | print('----原始数据----') 11 | print(students) 12 | 13 | # counterclock: True(默认值): 逆时针,False: 顺时针 14 | students['2017'].plot.pie(fontsize=8,counterclock=False) 15 | plt.title('Source of International Students',fontsize=16,fontweight='bold') 16 | plt.ylabel('2017',fontsize=12,fontweight='bold') 17 | plt.show() -------------------------------------------------------------------------------- /22-ReadData/ReadData.py: -------------------------------------------------------------------------------- 1 | # pandasVersusExcel 2 | # http://sa.mentorx.net/course/89/tasks 3 | # 第二十二课 读取CSV、TSV、TXT文件中的数据 4 | # 2018-10-25 5 | 6 | import pandas as pd 7 | 8 | student_csv 
= pd.read_csv('./Students.csv',index_col='ID') 9 | print('----读取 csv 数据----') 10 | print(student_csv) 11 | 12 | # sep 指定分隔符(读取csv文件时可省略,默认为 ',') 13 | student_tsv = pd.read_csv('./Students.tsv',sep='\t',index_col='ID') 14 | print('\n----读取 tsv 数据----') 15 | print(student_tsv) 16 | 17 | student_txt = pd.read_csv('./Students.txt',sep='|',index_col='ID') 18 | print('\n----读取 txt 数据----') 19 | print(student_txt) -------------------------------------------------------------------------------- /20-DuplicateData/DuplicateData.py: -------------------------------------------------------------------------------- 1 | # pandasVersusExcel 2 | # http://sa.mentorx.net/course/89/tasks 3 | # 第二十课 定位、消除重复数据 4 | # 2018-10-24 5 | 6 | import pandas as pd 7 | 8 | students = pd.read_excel('./Students_Duplicates.xlsx') 9 | print('----原始数据----') 10 | print(students) 11 | print(students.columns) 12 | 13 | dupe = students.duplicated(subset='Name') 14 | print('\n----检查重复数据(True为重复)----') 15 | print(dupe) 16 | 17 | dupe = dupe[dupe] # 获取重复的行,等同于dupe = dupe[dupe==True] 18 | print('\n----查看重复数据----') 19 | print(students.iloc[dupe.index]) 20 | 21 | students.drop_duplicates(subset='Name',inplace=True) 22 | print('\n----消除重复数据后的数据----') 23 | print(students) 24 | -------------------------------------------------------------------------------- /3-Rows&Clumns&Cell/Rows&Clumns&Cell.py: -------------------------------------------------------------------------------- 1 | # pandasVersusExcel 2 | # http://sa.mentorx.net/course/89/tasks 3 | # 第三课 行、列、单元格 4 | # 2018-10-17 5 | 6 | import pandas as pd 7 | 8 | # --创建Series-- 9 | 10 | # 方法一 11 | d = {'X':100,'Y':200,'Z':300} 12 | s1 = pd.Series(d) # 序列对象 13 | s1.name 14 | s1.index 15 | print(s1.index) 16 | 17 | # 方法二 18 | L1 = [100,200,300] 19 | L2 = ['X','Y','Z'] 20 | s2 = pd.Series(L1,index=L2) 21 | print(s2.index) 22 | 23 | # --操作Excel-- 24 | 25 | s1 = pd.Series([1,2,3],index=[1,2,3],name='A') 26 | s2 = pd.Series([10,20,30],index=[1,2,3],name='B') 27 | s3 = 
pd.Series([100,200,300],index=[1,2,3],name='C') 28 | 29 | df = pd.DataFrame({s1.name:s1,s2.name:s2,s3.name:s3}) 30 | print(df) -------------------------------------------------------------------------------- /11-SuperimposedHistogram&HorizontalHistogram/SuperimposedHistogram&HorizontalHistogram.py: -------------------------------------------------------------------------------- 1 | # pandasVersusExcel 2 | # http://sa.mentorx.net/course/89/tasks 3 | # Lesson 11: stacked bar chart, horizontal bar chart 4 | # 2018-10-19 5 | 6 | import pandas as pd 7 | import matplotlib.pyplot as plt 8 | 9 | users = pd.read_excel('./Users.xlsx',index_col='ID') 10 | users['Total'] = users['Oct'] + users['Nov'] + users['Dec'] 11 | users.sort_values(by='Total',inplace=True,ascending=True) 12 | print(users) 13 | 14 | # stacked: 叠加(默认为False) 15 | users.plot.barh(x='Name',y=['Oct','Nov','Dec'],stacked=True,title='User Behavior') 16 | 17 | plt.tight_layout() 18 | plt.show() 19 | 20 | # 补充说明 21 | # users.plot.bar(...) 表示制作竖直柱状图 22 | # users.plot.barh(...) 
表示制作水平柱状图 23 | # 24 | # 25 | # -------------------------------------------------------------------------------- /18-DataSegmentation/DataSegmentation.py: -------------------------------------------------------------------------------- 1 | # pandasVersusExcel 2 | # http://sa.mentorx.net/course/89/tasks 3 | # 第十八课 把一列数据分割成两列 4 | # 2018-10-24 5 | 6 | import pandas as pd 7 | 8 | employees = pd.read_excel('./Employees.xlsx',index_col='ID') 9 | df = employees['Full Name'].str.split(expand=True) 10 | print('----原始数据----') 11 | print(employees) 12 | print(employees.columns) 13 | print(df) 14 | 15 | employees['First Name'] = df[0] 16 | employees['Last Name'] = df[0] 17 | print('\n----分割后的结果----') 18 | print(employees) 19 | 20 | # 补充 21 | # split() 方法: 22 | # split(' ',n=0,expand=True) 23 | # split 的第一个参数: 表示分隔符默认为空格或tab 24 | # split 的第二个参数 n: 表示最多分割的个数(0或-1 表示分割成尽可能多的个数) 25 | # split 的第二个参数 expand: 默认为 False (False:分割后生成数组,占一列;True: 分割成单独的列) -------------------------------------------------------------------------------- /7-Sequence/Sequence.py: -------------------------------------------------------------------------------- 1 | # pandasVersusExcel 2 | # http://sa.mentorx.net/course/89/tasks 3 | # 第七课 排序,多重排序 4 | # 2018-10-18 5 | 6 | import pandas as pd 7 | 8 | products = pd.read_excel('./List.xlsx',index_col='ID') 9 | 10 | print('----原始数据----') 11 | print(products) 12 | 13 | # sort_values 排序方法 14 | # by:根据什么排序 15 | # inplace:在原数据集中排序,而不是生成新的数据集 16 | # ascending:排序的顺序(True:默认,顺序|False:倒序) 17 | 18 | # 以 Price 按 倒序 排序 19 | products.sort_values(by='Price',inplace=True,ascending=False) 20 | print('\n----以 Price 按 倒序 排序----') 21 | print(products) 22 | 23 | # 先以 Worthy 按 顺序 排序,再以 Price 按倒序排序 24 | products.sort_values(by=['Worthy','Price'],inplace=True,ascending=[True,False]) 25 | print('\n----先以 Worthy 按 顺序 排序,再以 Price 按倒序排序----') 26 | print(products) -------------------------------------------------------------------------------- /23-GroupBy/GroupBy.py: 
-------------------------------------------------------------------------------- 1 | # pandasVersusExcel 2 | # http://sa.mentorx.net/course/89/tasks 3 | # 第二十三课 透视表,分组,聚合(group by) 4 | # 2018-10-25 5 | 6 | import pandas as pd 7 | import numpy as np 8 | 9 | # 设置最大显示列数为20 10 | pd.options.display.max_columns=20 11 | orders = pd.read_excel('./Orders.xlsx') 12 | print('----原始数据----') 13 | print(orders.head()) 14 | print(orders.columns) 15 | 16 | orders['Year'] = pd.DatetimeIndex(orders['Date']).year 17 | 18 | # 方法一 19 | tt1 = orders.pivot_table(index='Category',columns='Year',values='Total',aggfunc=np.sum) 20 | print('\n----方法一----') 21 | print(tt1) 22 | 23 | group = orders.groupby(['Category','Year']) 24 | s = group['Total'].sum() 25 | c = group['ID'].count() 26 | 27 | tt2 = pd.DataFrame({'Sum':s,'Count':c}) 28 | print('\n----方法二----') 29 | print(tt2) -------------------------------------------------------------------------------- /30-WritingComplexEquations/WritingComplexEquations.py: -------------------------------------------------------------------------------- 1 | # pandasVersusExcel 2 | # http://sa.mentorx.net/course/89/tasks 3 | # 第三十课 编写复杂方程 4 | # 建议在第八讲之后查看 5 | # 2018-10-30 6 | 7 | import pandas as pd 8 | import numpy as np 9 | 10 | # 计算外接圆的面积 11 | def get_Circumscribedcircle_area(lengh,height): 12 | r = np.sqrt(lengh**2 + height**2) / 2 13 | return r**2*np.pi 14 | 15 | def wrapper(row): 16 | return get_Circumscribedcircle_area(row['Length'],row['Height']) 17 | 18 | 19 | rectangles = pd.read_excel('./Rectangles.xlsx',index_col='ID') 20 | print('----原始数据----') 21 | print(rectangles) 22 | 23 | rectangles['CA'] = rectangles.apply(lambda row: get_Circumscribedcircle_area(row['Length'],row['Height']),axis=1) 24 | # rectangles['CA'] = rectangles.apply(wrapper,axis=1) 25 | print(rectangles) 26 | -------------------------------------------------------------------------------- /25&26-ConditionalFormatting/ConditionalFormatting02.py: 
-------------------------------------------------------------------------------- 1 | # pandasVersusExcel 2 | # http://sa.mentorx.net/course/89/tasks 3 | # 第二十六课 条件格式化(下) 4 | # 2018-10-26 5 | 6 | import pandas as pd 7 | import seaborn as sns 8 | 9 | students = pd.read_excel('./Students.xlsx') 10 | print('----原始数据----') 11 | print(students) 12 | print(students.columns) 13 | 14 | # 以下两种效果不要同时使用,会被覆盖 15 | 16 | # 根据数据的大小显示不同深度的颜色 17 | col_map = sns.light_palette('green', as_cmap=True) 18 | # students.style.background_gradient(col_map, subset=['Test_1', 'Test_2', 'Test_3']) # 需要引入 seaborn 19 | 20 | # 根据数据的大小显示不同长度的色条 21 | students.style.bar(color='orange', subset=['Test_1', 'Test_2', 'Test_3']) # 不需要引入 seaborn 22 | 23 | # 说明 24 | # 由于编辑器的支持问题,此代码的效果可能无法展现 25 | # 请使用 Anaconda 中的 jupyter notebook 中打开 'ConditionalFormatting02.ipynb' 查看运行效果 26 | -------------------------------------------------------------------------------- /29-ReadDataBase/ReadDataBase.py: -------------------------------------------------------------------------------- 1 | # pandasVersusExcel 2 | # http://sa.mentorx.net/course/89/tasks 3 | # 第二十九课 读取数据库 4 | # 建议在第八讲之后查看 5 | # 2018-10-30 6 | 7 | import pyodbc 8 | import sqlalchemy 9 | import pandas as pd 10 | # sqlalchemy 和 pandas 均可链接数据库,选择其一即可 11 | 12 | # pandas 链接字符串 13 | connection = pyodbc.connect('DRIVER={SQL Server}; SERVER=(local); DATABASE=AdventureWorks;USER=sa;PASSWORD=123456') 14 | # sqlalchemy 链接字符串 15 | engine = sqlalchemy.create_engine('mssql+pyodbc://sa:123456@(local)/AdventureWorks?driver=SQL+Server') 16 | 17 | # 由于数据库中使用单引号,此处使用双引号引用 SQL 语句 18 | query = "SELECT FirstName, LastName FROM Person.Person" 19 | df1 = pd.read_sql_query(query, connection) 20 | df2 = pd.read_sql_query(query, engine) 21 | 22 | pd.options.display.max_columns = 999 23 | print(df1.head()) 24 | print(df2.head()) -------------------------------------------------------------------------------- /14&15-ScatterPlot&Histogram&DensityMap/ScatterPlot&Histogram&DensityMap.py: 
-------------------------------------------------------------------------------- 1 | # pandasVersusExcel 2 | # http://sa.mentorx.net/course/89/tasks 3 | # 第十四课 散点图,直方图,密度图 4 | # 第十五课 密度图,数据相关性 5 | # 2018-10-19 6 | 7 | import pandas as pd 8 | import matplotlib.pyplot as plt 9 | 10 | pd.options.display.max_columns = 20 11 | homes = pd.read_excel('./home_data.xlsx',index_col='id') 12 | print('----原始数据----') 13 | print(homes.head()) 14 | print(homes.columns) 15 | 16 | # 散点图 17 | # homes.plot.scatter(x='sqft_living',y='price') 18 | 19 | # 直方图 bins: 分配粒度 20 | # homes.sqft_living.plot.hist(bins=100) 21 | # plt.xticks(range(0,max(homes.sqft_living),500),fontsize=8,rotation=90) 22 | 23 | # 密度图 24 | homes.sqft_living.plot.kde() 25 | plt.xticks(range(0,max(homes.sqft_living),500),fontsize=8,rotation=90) 26 | plt.show() 27 | 28 | 29 | -------------------------------------------------------------------------------- /13-PolylineTrendChart&OverlayAreaMap/PolylineTrendChart&OverlayAreaMap.py: -------------------------------------------------------------------------------- 1 | # pandasVersusExcel 2 | # http://sa.mentorx.net/course/89/tasks 3 | # 第十三课 绘制折线趋势图、叠加区域图 4 | # 2018-10-19 5 | 6 | import pandas as pd 7 | import matplotlib.pyplot as plt 8 | 9 | weeks = pd.read_excel('./Orders.xlsx',index_col='Week') 10 | print('----原始数据----') 11 | print(weeks) 12 | 13 | # 叠加区域图 14 | weeks.plot.area(y=['Accessories','Bikes','Clothing','Components']) 15 | # 叠加柱状图 16 | # weeks.plot.bar(y=['Accessories','Bikes','Clothing','Components'],stacked=True) 17 | plt.title('Sales Weekly Trend',fontsize=16,fontweight='bold') 18 | plt.ylabel('Total',fontsize=12,fontweight='bold') 19 | plt.xticks(weeks.index,fontsize=8) 20 | plt.show() 21 | 22 | # 补充说明 23 | # weeks.plot(...) 绘制折线图 24 | # weeks.plot.area(...) 绘制叠加区域图 25 | # weeks.plot.bar(...) 
绘制叠加柱状腿 26 | # -------------------------------------------------------------------------------- /17-DataValidation/DataValidation.py: -------------------------------------------------------------------------------- 1 | # pandasVersusExcel 2 | # http://sa.mentorx.net/course/89/tasks 3 | # 第十七课 数据校验,轴的概念 4 | # 2018-10-24 5 | 6 | import pandas as pd 7 | 8 | # 方法一 9 | def score_validation(row): 10 | try: 11 | assert 0 <= row.Score <= 100 12 | except: 13 | print(f'#{row.ID}\tstudent {row.Name} has an invalid score {row.Score}.') 14 | 15 | # 方法二 16 | def score_validation2(row): 17 | if not 0 <= row.Score <= 100: 18 | print(f'#{row.ID}\tstudent {row.Name} has an invalid score {row.Score}.') 19 | 20 | # 在进行数据校验时不要设置 index_col ,这样有助于保证所有数据都进行校验 21 | students = pd.read_excel('./Students.xlsx') 22 | print('----原始数据----') 23 | print(students) 24 | print(students.columns) 25 | 26 | print('\n----校验结果----') 27 | students.apply(score_validation,axis=1) 28 | 29 | # axis = 1: 横向 30 | # axis = 0: 纵向(默认) -------------------------------------------------------------------------------- /6-InputFunction/InputFunction.py: -------------------------------------------------------------------------------- 1 | # pandasVersusExcel 2 | # http://sa.mentorx.net/course/89/tasks 3 | # 第六课 函数填充 4 | # 2018-10-18 5 | 6 | import pandas as pd 7 | 8 | books = pd.read_excel('./Books.xlsx',index_col='ID') 9 | print('----计算前----') 10 | print(books) 11 | 12 | # 方法一 13 | 14 | # books['Price'] = books['ListPrice'] * books['Discount'] 15 | # print('----方法一----') 16 | # print(books) 17 | 18 | # 方法二(此方法可以对计算的行的范围进行精确控制) 19 | 20 | for i in range(5,16): # books.index: 21 | books['Price'].at[i] = books['ListPrice'].at[i] * books['Discount'].at[i] 22 | print('----方法二----') 23 | print(books) 24 | 25 | # 方法一 26 | books['ListPrice'] += 2 27 | 28 | # 方法二 29 | def add_2(x): 30 | return x + 2 31 | books['ListPrice'] = books['ListPrice'].apply(add_2) 32 | 33 | # 方法三 34 | books['ListPrice'] = books['ListPrice'].apply(lambda 
x:x+2) 35 | print(books) -------------------------------------------------------------------------------- /9-Histogram/Histogram.py: -------------------------------------------------------------------------------- 1 | # pandasVersusExcel 2 | # http://sa.mentorx.net/course/89/tasks 3 | # 第九课 柱状图 4 | # 2018-10-18 5 | 6 | import pandas as pd 7 | import matplotlib.pyplot as plt 8 | 9 | students = pd.read_excel('./Students.xlsx') 10 | 11 | print('----原始数据----') 12 | print(students) 13 | 14 | students.sort_values(by='Number',inplace=True,ascending=False) 15 | 16 | # 使用 pandas 绘图(需要使用 matplotlib 展示图表) 17 | # students.plot.bar(x="Field",y='Number',color='orange',title='International Students by Field') 18 | 19 | # 使用 matplotlib 绘图 20 | plt.bar(students.Field,students.Number,color='orange') 21 | plt.xticks(students.Field,rotation='90') # 将 Field 旋转 90 度 22 | plt.xlabel('Field') # 设置 x轴 标题 23 | plt.ylabel('Number') # 设置 y轴 标题 24 | plt.title('International Students by Field',fontsize=16) # 设置标题 25 | 26 | # 展示图表 27 | plt.tight_layout() # 紧凑型布局 28 | plt.show() -------------------------------------------------------------------------------- /.github/workflows/blank.yml: -------------------------------------------------------------------------------- 1 | name: Deploy to Github Pages 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | 8 | jobs: 9 | build-and-deploy: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: Checkout 13 | uses: actions/checkout@master 14 | 15 | - name: Build and Deploy 16 | uses: JacksonMaxfield/github-pages-deploy-action-python@master 17 | env: 18 | ACCESS_TOKEN: ${{ secrets.ACCESS_TOKEN }} 19 | BASE_BRANCH: master # The branch the action should deploy from. 20 | BRANCH: gh-pages # The branch the action should deploy to. 21 | FOLDER: docs/_build/html # The folder the action should deploy. 
This example folder is generated by Sphinx 22 | BUILD_SCRIPT: pip install .[all] && make docs-build && touch docs/_build/html/.nojekyll # The build script the action should run prior to deploying. 23 | -------------------------------------------------------------------------------- /10-GroupedHistogran&DepthOptimizationChart/GroupedHistogran&DepthOptimizationChart.py: -------------------------------------------------------------------------------- 1 | # pandasVersusExcel 2 | # http://sa.mentorx.net/course/89/tasks 3 | # 第十课 绘制分组柱图,深度优化图表 4 | # 2018-10-18 5 | 6 | import pandas as pd 7 | import matplotlib.pyplot as plt 8 | 9 | students = pd.read_excel('./Students.xlsx') 10 | 11 | print('----原始数据----') 12 | print(students) 13 | 14 | students.sort_values(by='2017',inplace=True,ascending=False) 15 | students.plot.bar(x='Field',y=['2016','2017'],color=['orange','red']) 16 | 17 | plt.title('International Students by Field',fontsize=16,fontweight='bold') 18 | plt.xlabel('Field',fontweight='bold') 19 | plt.ylabel('Numbers',fontweight='bold') 20 | ax = plt.gca() # 获取图表的轴 21 | ax.set_xticklabels(students['Field'],rotation=45,ha='right') 22 | f = plt.gcf() # 获取图表的图形 23 | f.subplots_adjust(left=0.2,bottom=0.42) 24 | # plt.tight_layout() 25 | plt.show() -------------------------------------------------------------------------------- /2-ReadExcel/ReadExcel.py: -------------------------------------------------------------------------------- 1 | # pandasVersusExcel 2 | # http://sa.mentorx.net/course/89/tasks 3 | # 第二课 读取文件 4 | # 2018-10-17 5 | 6 | import pandas as pd 7 | 8 | # --基本数据的读取-- 9 | 10 | # 读取文件 11 | # head:默认0,表示开始读取的行(默认会跳过顶部的空行) 12 | # index_col: 指定数据的索引列 13 | people = pd.read_excel('./People.xlsx',head=1,index_col='ID') 14 | # 读取文件的行数和列数 15 | shape = people.shape 16 | # 读取文件的行,不会显示索引列 17 | columns = people.columns 18 | # 读取文件的前几行(默认为5,可传入指定行数) 19 | head = people.head() 20 | # 读取文件的末尾几行(默认为5,可传入指定行数) 21 | tail = people.tail() 22 | 23 | # --当数据文件存在坏数据时可按以下方式处理-- 24 | # 
(以下内容根据实际情况使用,同时使用会造成数据混乱) 25 | 26 | # 当标题行上有坏数据时可使用 head 参数 27 | people1 = pd.read_excel('./People.xlsx',head=1) 28 | 29 | # 当数据表中没有标题行时可将 head 的值设为 None 表示无标题 30 | people2 = pd.read_excel('./People.xlsx',head=None) 31 | people2.columns = ['ID', 'Type', 'Title', 'FirstName', 'MiddleName', 'LastName'] 32 | 33 | print(columns) -------------------------------------------------------------------------------- /25&26-ConditionalFormatting/ConditionalFormatting01.py: -------------------------------------------------------------------------------- 1 | # pandasVersusExcel 2 | # http://sa.mentorx.net/course/89/tasks 3 | # 第二十五课 条件格式化(上) 4 | # 2018-10-26 5 | 6 | import pandas as pd 7 | import seaborn as sns 8 | 9 | 10 | def low_score_red(s): 11 | color = 'red' if s < 60 else 'black' 12 | return f'color:{color}' 13 | 14 | 15 | def highest_score_green(col): 16 | return ['background-color:lime' if s == col.max() else 'background-color:white' for s in col] 17 | 18 | 19 | students = pd.read_excel('./Students.xlsx') 20 | print('----原始数据----') 21 | print(students) 22 | print(students.columns) 23 | 24 | students.style.applymap(low_score_red, subset=['Test_1', 'Test_2', 'Test_3']) \ 25 | .apply(highest_score_green, subset=['Test_1', 'Test_2', 'Test_3']) 26 | 27 | # 说明 28 | # 由于编辑器的支持问题,此代码的效果可能无法展现 29 | # 请使用 Anaconda 中的 jupyter notebook 中打开 'ConditionalFormatting01.ipynb' 查看运行效果 30 | 31 | 32 | -------------------------------------------------------------------------------- /19-Statistics/Statistics.py: -------------------------------------------------------------------------------- 1 | # pandasVersusExcel 2 | # http://sa.mentorx.net/course/89/tasks 3 | # 第十九课 求和,求平均,统计导引 4 | # 2018-10-24 5 | 6 | import pandas as pd 7 | 8 | students = pd.read_excel('./Students.xlsx',index_col='ID') 9 | print('----原始数据----') 10 | print(students) 11 | print(students.columns) 12 | 13 | temp = students[['Test_1','Test_2','Test_3']] 14 | print('\n----需要计算的元数据----') 15 | print(temp) 16 | 17 | # 和 18 | 
raw_sum = temp.sum(axis=1) 19 | print('\n----求和----') 20 | print(raw_sum) 21 | 22 | # 平均值 23 | raw_mean = temp.mean(axis=1) 24 | print('\n----求平均值----') 25 | print(raw_mean) 26 | 27 | students['Total'] = raw_sum 28 | students['Average'] = raw_mean 29 | print('\n----整合结果----') 30 | print(students) 31 | 32 | col_mean = students[['Test_1','Test_2','Test_3','Total','Average']].mean() 33 | col_mean['Name'] = 'Summary' 34 | students = students.append(col_mean,ignore_index=True) 35 | print('\n----最终结果----') 36 | print(students) 37 | 38 | # axis = 1: 横向 39 | # axis = 0: 纵向(默认) -------------------------------------------------------------------------------- /24-DataPrediction/DataPrediction.py: -------------------------------------------------------------------------------- 1 | # pandasVersusExcel 2 | # http://sa.mentorx.net/course/89/tasks 3 | # 第二十四课 线性回归,数据预测 4 | # 2018-10-25 5 | 6 | import pandas as pd 7 | import matplotlib.pyplot as plt 8 | from scipy.stats import linregress 9 | 10 | sales = pd.read_excel('./Sales.xlsx',dtype={'Month':str}) 11 | print('----原始数据----') 12 | print(sales.head()) 13 | print(sales.columns) 14 | 15 | # https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.linregress.html 16 | # linregress 计算两组测量的线性最小二乘回归。 17 | # 共传递两个参数,这两个参数应为两个数组,并且两个数组的值应该一一对应 18 | # 返回值: 19 | # slope:回归线的斜率 20 | # intercept:回归线的截距 21 | # r:相关系数 22 | # p:假设检验的双侧p值,其零假设是斜率为零,使用Wald检验,检验统计量的t分布 23 | # std_err:估计梯度的标准误差。 24 | slope,intercept,r,p,std_err = linregress(sales.index,sales.Revenue) 25 | 26 | # 期望值 27 | exp = sales.index * slope + intercept 28 | # 线性回归方程 29 | # y = slope * x + intercept 30 | 31 | plt.scatter(sales.index,sales.Revenue) 32 | plt.plot(sales.index,exp,color='orange') 33 | plt.title('Sales') 34 | plt.xticks(sales.index,sales.Month,rotation=90) 35 | plt.tight_layout() 36 | plt.show() 37 | 38 | -------------------------------------------------------------------------------- /8-DataFiltering/DataFiltering.py: 
-------------------------------------------------------------------------------- 1 | # pandasVersusExcel 2 | # http://sa.mentorx.net/course/89/tasks 3 | # 第八课 数据筛选、过滤 4 | # 2018-10-18 5 | 6 | import pandas as pd 7 | 8 | Students = pd.read_excel('./Students.xlsx',index_col='ID') 9 | 10 | print('----原始数据----') 11 | print(Students) 12 | 13 | # Age 大于等于 18 小于 30 14 | def age_18_to_30(age): 15 | return 18 <= age < 30 16 | 17 | # 成绩在 85 到 100 之间 18 | def level_a(score): 19 | return 85 <= score <= 100 20 | 21 | # 筛选 Age 大于等于 18 小于 30 的学生 22 | Students = Students.loc[Students['Age'].apply(lambda age:18 <= age < 30)] 23 | print('\n----筛选 Age 大于等于 18 小于 30 的学生----') 24 | print(Students) 25 | 26 | # 筛选 Age 大于等于 18 小于 30 成绩在 85 到 100 之间 的学生 27 | Students = Students.loc[Students.Age.apply(age_18_to_30)] \ 28 | .loc[Students.Score.apply(level_a)] 29 | print('\n----筛选 Age 大于等于 18 小于 30 成绩在 85 到 100 之间 的学生----') 30 | print(Students) 31 | 32 | # 补充知识点 33 | # 1. Students['Age'] 的写法可以简写为 Students.Age 34 | # 2. age_18_to_30 函数可以用 lambda 表达式代替,因此 35 | # .apply(age_18_to_30) 可以简写为 .apply(lambda age:18 <= age < 30) 36 | # 3. 
Python 中 如遇表达式过长可以使用 ' \'(空格加反斜杠加回车)的方式换行 37 | -------------------------------------------------------------------------------- /4&5-ReadData&BaseInput/ReadData&BaseInput.py: -------------------------------------------------------------------------------- 1 | # pandasVersusExcel 2 | # http://sa.mentorx.net/course/89/tasks 3 | # 第四课 数据区域的读取,填充整数、文字 4 | # 第五课 填充日期序列 5 | # 2018-10-17 6 | 7 | import pandas as pd 8 | from datetime import date,timedelta 9 | 10 | # --数据区域的读取-- 11 | 12 | # skiprows: 从序号为3的行开始读取(类似于 header) 13 | # usecols: 读取列的范围 14 | # dtype: 设置每一列的数据类型 15 | books = pd.read_excel('./Books.xlsx',skiprows=3,usecols='C:F',dtype={'ID':str,'Name':str,'InStore':str,'Date':str}) 16 | print(books) 17 | 18 | 19 | print('-----------------分隔符-----------------') 20 | 21 | # --填充整数、文字、日期-- 22 | 23 | # 日期加月份 24 | # d:起始日期,type:date 25 | # month_delta: 要添加的月数,type:int 26 | # 返回添加后的结果,type:date 27 | def add_month(d, month_delta): 28 | year_delta = month_delta // 12 29 | month = d.month + month_delta % 12 30 | if month != 12: 31 | year_delta += month // 12 32 | month = month % 12 33 | return date(d.year + year_delta, month, d.day) 34 | 35 | start = date(2018,10,17) 36 | 37 | for i in books.index: 38 | books['ID'].at[i] = i + 1 39 | books['InStore'].at[i] = 'Yes' if i % 2 == 0 else 'No' 40 | books['Date'].at[i] = start + timedelta(days=i) 41 | print(books) 42 | 43 | books.set_index('ID') 44 | books.to_excel('./Books_output.xlsx') 45 | print('-----------------Done-----------------') 46 | -------------------------------------------------------------------------------- /16-Join/Join.py: -------------------------------------------------------------------------------- 1 | # pandasVersusExcel 2 | # http://sa.mentorx.net/course/89/tasks 3 | # 第十六课 多表联合(Join) 4 | # 2018-10-24 5 | 6 | import pandas as pd 7 | 8 | students = pd.read_excel('./Student_Score.xlsx',sheet_name='Students',index_col='ID') 9 | scores = pd.read_excel('./Student_Score.xlsx',sheet_name='Scores',index_col='ID') 
10 | print('----原始数据----') 11 | print('\n----Students----') 12 | print(students) 13 | print(students.columns) 14 | print('\n----Scores----') 15 | print(scores) 16 | print(scores.columns) 17 | 18 | # 联合查询 19 | 20 | # 方法一 21 | # how: 链接方式 22 | # inner(默认)-inner join 23 | # left-左链接 24 | # right-右链接 25 | # on: 链接字段(如果省略此属性,merge会自动以相同的列名作为链接的依据,但不会比较 index_col) 26 | # left_on/right_on: 分别指定两张表的链接依据 27 | # fillna(0): 将'NaN'替换为0 28 | table1 = students.merge(scores,how='left',on='ID').fillna(0) 29 | table1.Score = table1.Score.astype(int) # 将Score中的小数转换为整数 30 | print('\n----联合查询 方法一(inner join)----') 31 | print(table1) 32 | 33 | # 方法二 34 | # how: 链接方式 35 | # inner(默认)-inner join 36 | # left-左链接 37 | # right-右链接 38 | # on: 链接字段(设置了 index_col 时如果省略此属性,join会自动以 index_col 作为链接的依据) 39 | # fillna(0): 将'NaN'替换为0 40 | table2 = students.join(scores,how='left',on='ID').fillna(0) 41 | table2.Score = table2.Score.astype(int) # 将Score中的小数转换为整数 42 | print('\n----联合查询 方法二(inner join)----') 43 | print(table2) 44 | 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /28-ColOperation/ColOperation.py: -------------------------------------------------------------------------------- 1 | # pandasVersusExcel 2 | # http://sa.mentorx.net/course/89/tasks 3 | # 第二十八课 列操作集锦 4 | # 建议在第八讲之后查看 5 | # 2018-10-30 6 | 7 | import pandas as pd 8 | import numpy as np 9 | 10 | students_001 = pd.read_excel('./Students.xlsx',sheet_name='Page_001') 11 | students_002 = pd.read_excel('./Students.xlsx',sheet_name='Page_002') 12 | print('----原始数据----') 13 | print('\n----Page_001----') 14 | print(students_001) 15 | print('\n----Page_002----') 16 | print(students_002) 17 | 18 | # 追加数据集 19 | students_add_dates = pd.concat([students_001,students_002],axis=1) 20 | print('\n----追加数据集(极少使用)----') 21 | print(students_add_dates) 22 | 23 | students = pd.concat([students_001,students_002]).reset_index(drop=True) 24 | print('\n----将要使用的数据----') 25 | print(students) 26 | 27 | # 
追加数据列 28 | # students['Age'] = 25 # 等同于 np.repeat(25,len(students)) 29 | students['Age'] = np.arange(0,len(students)) 30 | print('\n----追加数据列----') 31 | print(students) 32 | 33 | # 删除列 34 | students.drop(columns='Age',inplace=True) 35 | print('\n----删除列----') 36 | print(students) 37 | 38 | # 插入列 39 | students.insert(1,column='Foo',value=np.repeat('foo',len(students))) 40 | print('\n----插入列----') 41 | print(students) 42 | 43 | # 修改列名 44 | students.rename(columns={'Foo':'FOO','Name':'NAME'},inplace=True) 45 | print('\n----修改列名----') 46 | print(students) 47 | 48 | # 删除含空值的行 49 | # 制造空值 50 | students['ID'] = students['ID'].astype(float) 51 | for i in range(3,5): 52 | students['ID'].at[i] = np.nan 53 | 54 | students.dropna(inplace=True) 55 | print('\n----删除含空值的行----') 56 | print(students) 57 | 58 | 59 | -------------------------------------------------------------------------------- /27-RowOperation/RowOperation.py: -------------------------------------------------------------------------------- 1 | # pandasVersusExcel 2 | # http://sa.mentorx.net/course/89/tasks 3 | # 第二十七课 行操作集锦 4 | # 建议在第八讲之后查看 5 | # 2018-10-30 6 | 7 | import pandas as pd 8 | 9 | students_001 = pd.read_excel('./Students.xlsx',sheet_name='Page_001',index_col='ID') 10 | students_002 = pd.read_excel('./Students.xlsx',sheet_name='Page_002',index_col='ID') 11 | print('----原始数据----') 12 | print('\n----Page_001----') 13 | print(students_001) 14 | print('\n----Page_002----') 15 | print(students_002) 16 | 17 | # 追加数据集 18 | students_add_dates = students_001.append(students_002) 19 | print('\n----追加数据集----') 20 | print(students_add_dates) 21 | 22 | # 追加数据行 23 | stu_col1 = pd.Series({'Name':'Abel','Score':99}) 24 | students_add_col = students_add_dates.append(stu_col1,ignore_index=True) 25 | print('\n----追加数据行----') 26 | print(students_add_col) 27 | 28 | # 更改数据 29 | students_001.at[1,'Name'] = 'Jack' 30 | students_001.at[1,'Score'] = 100 31 | print('\n----更改数据 方法一----') 32 | print(students_001) 33 | 34 | stu_col2 
= pd.Series({'ID':1,"Name":'Chen','Score':110}) 35 | students_001.iloc[0] = stu_col2 # iloc 的参数为行数 ,从0开始 36 | print('\n----更改数据 方法二----') 37 | print(students_001) 38 | 39 | # 在数据中插入一行 40 | stu_col3 = pd.Series({"Name":'Scort','Score':110}) 41 | part1 = students_001[:15] 42 | part2 = students_001[15:] 43 | students_001 = part1.append(stu_col3,ignore_index=True).append(part2,ignore_index=True) 44 | print('\n----在数据中插入一行----') 45 | print(students_001) 46 | 47 | # 删除数据行 48 | students_drop_col = students_001.drop(index=[15]) 49 | print('\n----删除数据行----') 50 | print(students_drop_col) 51 | 52 | # 带条件的删除 53 | # 设置空值 54 | for i in range(5, 15): 55 | students_001['Name'].at[i] = '' 56 | 57 | # 去掉空值 58 | missing = students_001.loc[students_001['Name'] == ''] 59 | students_001.drop(missing.index, inplace=True) 60 | print('\n----带条件的删除----') 61 | print(students_001) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Learn pandas vs Excel(Python) 2 | 3 | 4 | 本笔记为 [Pandas玩转Excel - Python数据分析轻松学](http://sa.mentorx.net/course/89/tasks) 这门课程的学习笔记 5 | 6 | ## Contents 7 | * [1 - 创建文件](1-CreateExcel/CreateExcel.py) 8 | * [2 - 读取文件](2-ReadExcel/ReadExcel.py) 9 | * [3 - 行、列、单元格](3-Rows&Clumns&Cell/Rows&Clumns&Cell.py) 10 | * [4&5 - 数据区域的读取,填充整数、文字,填充日期序列](4%265-ReadData&BaseInput/ReadData&BaseInput.py) 11 | * [6 - 函数填充](6-InputFunction/InputFunction.py) 12 | * [7 - 排序,多重排序](7-Sequence/Sequence.py) 13 | * [8 - 数据筛选、过滤](8-DataFiltering/DataFiltering.py) 14 | * [9 - 柱状图](9-Histogram/Histogram.py) 15 | * [10 - 绘制分组柱图,深度优化图表](10-GroupedHistogran&DepthOptimizationChart/GroupedHistogran&DepthOptimizationChart.py) 16 | * [11 - 叠加柱状图,水平柱状图](11-SuperimposedHistogram&HorizontalHistogram/SuperimposedHistogram&HorizontalHistogram.py) 17 | * [12 - 绘制饼图](12-PieChart/PieChart.py) 18 | * [13 - 
绘制折线趋势图、叠加区域图](13-PolylineTrendChart&OverlayAreaMap/PolylineTrendChart&OverlayAreaMap.py) 19 | * [14&15 - 散点图,直方图,密度图,数据相关性](14%2615-ScatterPlot&Histogram&DensityMap/ScatterPlot&Histogram&DensityMap.py) 20 | * [16 - 多表联合(Join)](16-Join/Join.py) 21 | * [17 - 数据校验,轴的概念](17-DataValidation/DataValidation.py) 22 | * [18 - 把一列数据分割成两列](18-DataSegmentation/DataSegmentation.py) 23 | * [19 - 求和,求平均,统计导引](19-Statistics/Statistics.py) 24 | * [20 - 定位、消除重复数据](20-DuplicateData/DuplicateData.py) 25 | * [21 - 定位、旋转数据表(行/列转换)](21-RotateDataSet/RotateDataSet.py) 26 | * [22 - 读取CSV、TSV、TXT文件中的数据](22-ReadData/ReadData.py) 27 | * [23 - 透视表,分组,聚合(group by)](23-GroupBy/GroupBy.py) 28 | * [24 - 线性回归,数据预测](24-DataPrediction/DataPrediction.py) 29 | * [25 - 条件格式化(上)](25%2626-ConditionalFormatting/ConditionalFormatting01.py) 30 | * [26 - 条件格式化(下)](25%2626-ConditionalFormatting/ConditionalFormatting02.py) 31 | * [27 - 行操作集锦](27-RowOperation/RowOperation.py) 32 | * [28 - 列操作集锦](28-ColOperation/ColOperation.py) 33 | * [29 - 读取数据库](29-ReadDataBase/ReadDataBase.py) 34 | * [30 - 编写复杂方程](30-WritingComplexEquations/WritingComplexEquations.py) 35 | 36 | 37 | --- 38 | 鸣谢: 39 | [Pandas玩转Excel - Python数据分析轻松学](http://sa.mentorx.net/course/89/notes) 讲师:[Timothy](http://sa.mentorx.net/user/25) -------------------------------------------------------------------------------- /25&26-ConditionalFormatting/ConditionalFormatting02.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 7, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "----原始数据----\n", 13 | " ID Name Test_1 Test_2 Test_3\n", 14 | "0 1 Student_001 41 31 54\n", 15 | "1 2 Student_002 86 22 59\n", 16 | "2 3 Student_003 54 25 68\n", 17 | "3 4 Student_004 82 36 92\n", 18 | "4 5 Student_005 55 99 93\n", 19 | "5 6 Student_006 86 12 50\n", 20 | "6 7 Student_007 91 11 54\n", 21 
| "7 8 Student_008 20 98 52\n", 22 | "8 9 Student_009 74 85 63\n", 23 | "9 10 Student_010 28 98 99\n", 24 | "10 11 Student_011 35 83 85\n", 25 | "11 12 Student_012 23 48 67\n", 26 | "12 13 Student_013 45 62 90\n", 27 | "13 14 Student_014 63 26 56\n", 28 | "14 15 Student_015 50 64 70\n", 29 | "15 16 Student_016 69 31 96\n", 30 | "16 17 Student_017 98 78 55\n", 31 | "17 18 Student_018 65 74 95\n", 32 | "18 19 Student_019 95 51 61\n", 33 | "19 20 Student_020 83 72 82\n", 34 | "Index(['ID', 'Name', 'Test_1', 'Test_2', 'Test_3'], dtype='object')\n" 35 | ] 36 | }, 37 | { 38 | "data": { 39 | "text/html": [ 40 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 
306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | "
IDNameTest_1Test_2Test_3
01Student_001413154
12Student_002862259
23Student_003542568
34Student_004823692
45Student_005559993
56Student_006861250
67Student_007911154
78Student_008209852
89Student_009748563
910Student_010289899
1011Student_011358385
1112Student_012234867
1213Student_013456290
1314Student_014632656
1415Student_015506470
1516Student_016693196
1617Student_017987855
1718Student_018657495
1819Student_019955161
1920Student_020837282
" 351 | ], 352 | "text/plain": [ 353 | "" 354 | ] 355 | }, 356 | "execution_count": 7, 357 | "metadata": {}, 358 | "output_type": "execute_result" 359 | } 360 | ], 361 | "source": [ 362 | "# pandasVersusExcel\n", 363 | "# http://sa.mentorx.net/course/89/tasks\n", 364 | "# 第二十六课 条件格式化(下)\n", 365 | "# 2018-10-26\n", 366 | "\n", 367 | "import pandas as pd\n", 368 | "import seaborn as sns\n", 369 | "\n", 370 | "students = pd.read_excel('./Students.xlsx')\n", 371 | "print('----原始数据----')\n", 372 | "print(students)\n", 373 | "print(students.columns)\n", 374 | "\n", 375 | "# 以下两种效果不要同时使用,会被覆盖\n", 376 | "\n", 377 | "# 根据数据的大小显示不同深度的颜色\n", 378 | "col_map = sns.light_palette('green', as_cmap=True)\n", 379 | "# students.style.background_gradient(col_map, subset=['Test_1', 'Test_2', 'Test_3']) # 需要引入 seaborn\n", 380 | "\n", 381 | "# 根据数据的大小显示不同长度的色条\n", 382 | "students.style.bar(color='orange', subset=['Test_1', 'Test_2', 'Test_3']) # 不需要引入 seaborn\n" 383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "execution_count": null, 388 | "metadata": {}, 389 | "outputs": [], 390 | "source": [] 391 | } 392 | ], 393 | "metadata": { 394 | "kernelspec": { 395 | "display_name": "Python 3", 396 | "language": "python", 397 | "name": "python3" 398 | }, 399 | "language_info": { 400 | "codemirror_mode": { 401 | "name": "ipython", 402 | "version": 3 403 | }, 404 | "file_extension": ".py", 405 | "mimetype": "text/x-python", 406 | "name": "python", 407 | "nbconvert_exporter": "python", 408 | "pygments_lexer": "ipython3", 409 | "version": "3.6.6" 410 | } 411 | }, 412 | "nbformat": 4, 413 | "nbformat_minor": 2 414 | } 415 | -------------------------------------------------------------------------------- /25&26-ConditionalFormatting/ConditionalFormatting01.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 9, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | 
"output_type": "stream", 11 | "text": [ 12 | "----原始数据----\n", 13 | " ID Name Test_1 Test_2 Test_3\n", 14 | "0 1 Student_001 41 31 54\n", 15 | "1 2 Student_002 86 22 59\n", 16 | "2 3 Student_003 54 25 68\n", 17 | "3 4 Student_004 82 36 92\n", 18 | "4 5 Student_005 55 99 93\n", 19 | "5 6 Student_006 86 12 50\n", 20 | "6 7 Student_007 91 11 54\n", 21 | "7 8 Student_008 20 98 52\n", 22 | "8 9 Student_009 74 85 63\n", 23 | "9 10 Student_010 28 98 99\n", 24 | "10 11 Student_011 35 83 85\n", 25 | "11 12 Student_012 23 48 67\n", 26 | "12 13 Student_013 45 62 90\n", 27 | "13 14 Student_014 63 26 56\n", 28 | "14 15 Student_015 50 64 70\n", 29 | "15 16 Student_016 69 31 96\n", 30 | "16 17 Student_017 98 78 55\n", 31 | "17 18 Student_018 65 74 95\n", 32 | "18 19 Student_019 95 51 61\n", 33 | "19 20 Student_020 83 72 82\n", 34 | "Index(['ID', 'Name', 'Test_1', 'Test_2', 'Test_3'], dtype='object')\n" 35 | ] 36 | }, 37 | { 38 | "data": { 39 | "text/html": [ 40 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 
301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | "
IDNameTest_1Test_2Test_3
01Student_001413154
12Student_002862259
23Student_003542568
34Student_004823692
45Student_005559993
56Student_006861250
67Student_007911154
78Student_008209852
89Student_009748563
910Student_010289899
1011Student_011358385
1112Student_012234867
1213Student_013456290
1314Student_014632656
1415Student_015506470
1516Student_016693196
1617Student_017987855
1718Student_018657495
1819Student_019955161
1920Student_020837282
" 373 | ], 374 | "text/plain": [ 375 | "" 376 | ] 377 | }, 378 | "execution_count": 9, 379 | "metadata": {}, 380 | "output_type": "execute_result" 381 | } 382 | ], 383 | "source": [ 384 | "# pandasVersusExcel\n", 385 | "# http://sa.mentorx.net/course/89/tasks\n", 386 | "# 第二十五课 条件格式化(上)\n", 387 | "# 2018-10-26\n", 388 | "\n", 389 | "import pandas as pd \n", 390 | "\n", 391 | "def low_score_red(s):\n", 392 | " color = 'red' if s < 60 else 'black'\n", 393 | " return f'color:{color}'\n", 394 | "\n", 395 | "def highest_score_green(col):\n", 396 | " return ['background-color:lime' if s == col.max() else 'background-color:white' for s in col]\n", 397 | "\n", 398 | "students = pd.read_excel('./Students.xlsx')\n", 399 | "print('----原始数据----')\n", 400 | "print(students)\n", 401 | "print(students.columns)\n", 402 | " \n", 403 | "students.style.applymap(low_score_red, subset=['Test_1', 'Test_2', 'Test_3']) \\\n", 404 | ".apply(highest_score_green, subset=['Test_1', 'Test_2', 'Test_3'])\n", 405 | "\n" 406 | ] 407 | }, 408 | { 409 | "cell_type": "code", 410 | "execution_count": null, 411 | "metadata": {}, 412 | "outputs": [], 413 | "source": [] 414 | }, 415 | { 416 | "cell_type": "code", 417 | "execution_count": null, 418 | "metadata": {}, 419 | "outputs": [], 420 | "source": [] 421 | } 422 | ], 423 | "metadata": { 424 | "kernelspec": { 425 | "display_name": "Python 3", 426 | "language": "python", 427 | "name": "python3" 428 | }, 429 | "language_info": { 430 | "codemirror_mode": { 431 | "name": "ipython", 432 | "version": 3 433 | }, 434 | "file_extension": ".py", 435 | "mimetype": "text/x-python", 436 | "name": "python", 437 | "nbconvert_exporter": "python", 438 | "pygments_lexer": "ipython3", 439 | "version": "3.6.6" 440 | } 441 | }, 442 | "nbformat": 4, 443 | "nbformat_minor": 2 444 | } 445 | -------------------------------------------------------------------------------- /.idea/workspace.xml: -------------------------------------------------------------------------------- 1 
| 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 98 | 99 | 100 | 102 | 103 | 150 | 151 | 152 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 |