├── .editorconfig
├── .github
└── workflows
│ └── blank.yml
├── .idea
├── PandasVersusExcel.iml
├── misc.xml
├── modules.xml
├── vcs.xml
└── workspace.xml
├── 1-CreateExcel
├── CreateExcel.py
└── output.xlsx
├── 10-GroupedHistogran&DepthOptimizationChart
├── GroupedHistogran&DepthOptimizationChart.py
└── Students.xlsx
├── 11-SuperimposedHistogram&HorizontalHistogram
├── SuperimposedHistogram&HorizontalHistogram.py
└── Users.xlsx
├── 12-PieChart
├── PieChart.py
└── Students.xlsx
├── 13-PolylineTrendChart&OverlayAreaMap
├── Orders.xlsx
└── PolylineTrendChart&OverlayAreaMap.py
├── 14&15-ScatterPlot&Histogram&DensityMap
├── ScatterPlot&Histogram&DensityMap.py
└── home_data.xlsx
├── 16-Join
├── Join.py
└── Student_Score.xlsx
├── 17-DataValidation
├── DataValidation.py
└── Students.xlsx
├── 18-DataSegmentation
├── DataSegmentation.py
└── Employees.xlsx
├── 19-Statistics
├── Statistics.py
└── Students.xlsx
├── 2-ReadExcel
├── People.xlsx
└── ReadExcel.py
├── 20-DuplicateData
├── DuplicateData.py
└── Students_Duplicates.xlsx
├── 21-RotateDataSet
├── RotateDataSet.py
└── Videos.xlsx
├── 22-ReadData
├── ReadData.py
├── Students.csv
├── Students.tsv
└── Students.txt
├── 23-GroupBy
├── GroupBy.py
└── Orders.xlsx
├── 24-DataPrediction
├── DataPrediction.py
└── Sales.xlsx
├── 25&26-ConditionalFormatting
├── .gitignore
├── ConditionalFormatting01.ipynb
├── ConditionalFormatting01.py
├── ConditionalFormatting02.ipynb
├── ConditionalFormatting02.py
└── Students.xlsx
├── 27-RowOperation
├── RowOperation.py
└── Students.xlsx
├── 28-ColOperation
├── ColOperation.py
└── Students.xlsx
├── 29-ReadDataBase
└── ReadDataBase.py
├── 3-Rows&Clumns&Cell
└── Rows&Clumns&Cell.py
├── 30-WritingComplexEquations
├── Rectangles.xlsx
└── WritingComplexEquations.py
├── 4&5-ReadData&BaseInput
├── Books.xlsx
├── Books_output.xlsx
└── ReadData&BaseInput.py
├── 6-InputFunction
├── Books.xlsx
└── InputFunction.py
├── 7-Sequence
├── List.xlsx
└── Sequence.py
├── 8-DataFiltering
├── DataFiltering.py
└── Students.xlsx
├── 9-Histogram
├── Figure_1.png
├── Histogram.py
└── Students.xlsx
├── README.md
└── _config.yml
/.editorconfig:
--------------------------------------------------------------------------------
1 | root = true
2 |
3 | [*]
4 | indent_style = space
5 | indent_size = 4
6 | end_of_line = lf
7 | charset = utf-8
8 | trim_trailing_whitespace = false
9 | insert_final_newline = false
--------------------------------------------------------------------------------
/.github/workflows/blank.yml:
--------------------------------------------------------------------------------
1 | name: Deploy to Github Pages
2 |
3 | on:
4 | push:
5 | branches:
6 | - master
7 |
8 | jobs:
9 | build-and-deploy:
10 | runs-on: ubuntu-latest
11 | steps:
12 | - name: Checkout
13 | uses: actions/checkout@master
14 |
15 | - name: Build and Deploy
16 | uses: JacksonMaxfield/github-pages-deploy-action-python@master
17 | env:
18 | ACCESS_TOKEN: ${{ secrets.ACCESS_TOKEN }}
19 | BASE_BRANCH: master # The branch the action should deploy from.
20 | BRANCH: gh-pages # The branch the action should deploy to.
21 | FOLDER: docs/_build/html # The folder the action should deploy. This example folder is generated by Sphinx
22 | BUILD_SCRIPT: pip install .[all] && make docs-build && touch docs/_build/html/.nojekyll # The build script the action should run prior to deploying.
23 |
--------------------------------------------------------------------------------
/.idea/PandasVersusExcel.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/workspace.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
89 |
90 |
91 |
92 |
93 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
111 |
112 |
113 |
114 |
115 |
116 |
117 |
118 |
119 |
120 |
121 |
122 |
123 |
124 |
125 |
126 |
127 |
128 |
129 |
130 |
131 |
132 |
133 |
134 |
135 |
136 |
137 |
138 |
139 |
140 |
141 |
142 |
143 |
144 |
145 |
146 |
147 |
148 |
149 |
150 |
151 |
152 |
153 |
154 |
155 |
156 |
157 |
158 |
159 |
160 |
161 |
162 |
163 |
164 |
165 |
166 |
167 |
168 |
169 |
170 |
171 |
172 |
173 |
174 |
175 |
176 |
177 |
178 |
179 |
180 |
181 |
182 |
183 |
184 |
185 |
186 |
187 |
188 |
189 |
190 |
191 |
192 |
193 |
194 |
195 |
196 |
197 |
198 |
199 |
200 |
201 |
202 |
203 |
204 |
205 |
206 |
207 |
208 |
209 |
210 |
211 |
212 |
213 |
214 |
215 |
216 |
217 |
218 |
219 |
220 |
221 |
222 |
223 |
224 |
225 |
226 |
227 |
228 |
229 |
230 |
231 |
232 |
233 |
234 |
235 |
236 |
237 |
238 |
239 |
240 |
241 |
242 |
243 |
244 |
245 |
246 |
247 |
248 |
249 |
250 |
251 |
252 |
253 |
254 |
255 |
256 |
257 |
258 |
259 |
260 |
261 |
262 |
263 |
264 |
265 |
266 |
267 |
268 |
269 |
270 |
271 |
272 |
273 |
274 |
275 |
276 |
277 |
278 |
279 |
280 |
281 |
282 |
283 |
284 |
285 |
286 |
287 |
288 |
289 |
290 |
291 |
292 |
293 |
294 |
295 |
296 |
297 |
298 |
299 |
300 |
301 |
302 |
303 |
304 |
305 |
306 |
307 |
308 |
309 |
310 |
311 |
312 |
313 |
314 |
315 |
316 |
317 |
318 |
319 |
320 |
321 |
322 |
323 |
324 |
325 |
326 |
327 |
328 |
329 |
330 |
331 |
332 |
333 |
334 |
335 |
336 |
337 |
338 |
339 |
340 |
341 | 1539737266150
342 |
343 |
344 | 1539737266150
345 |
346 |
347 |
348 |
349 |
350 |
351 |
352 |
353 |
354 |
355 |
356 |
357 |
358 |
359 |
360 |
361 |
362 |
363 |
364 |
365 |
366 |
367 |
368 |
369 |
370 |
371 |
372 |
373 |
374 |
375 |
376 |
377 |
378 |
379 |
380 |
381 |
382 |
383 |
384 |
385 |
386 |
387 |
388 |
389 |
390 |
391 |
392 |
393 |
394 |
395 |
396 |
397 |
398 |
399 |
400 |
401 |
402 |
403 |
404 |
405 |
406 |
407 |
408 |
409 |
410 |
411 |
412 |
413 |
414 |
415 |
416 |
417 |
418 |
419 |
420 |
421 |
422 |
423 |
424 |
425 |
426 |
427 |
428 |
429 |
430 |
431 |
432 |
433 |
434 |
435 |
436 |
437 |
438 |
439 |
440 |
441 |
442 |
443 |
444 |
445 |
446 |
447 |
448 |
449 |
450 |
451 |
452 |
453 |
454 |
455 |
456 |
457 |
458 |
459 |
460 |
461 |
462 |
463 |
464 |
465 |
466 |
467 |
468 |
469 |
470 |
471 |
472 |
473 |
474 |
475 |
476 |
477 |
478 |
479 |
480 |
481 |
482 |
483 |
484 |
485 |
486 |
487 |
488 |
489 |
490 |
491 |
492 |
493 |
494 |
495 |
496 |
497 |
498 |
499 |
500 |
501 |
502 |
503 |
504 |
505 |
506 |
507 |
508 |
509 |
510 |
511 |
512 |
513 |
514 |
515 |
516 |
517 |
518 |
519 |
520 |
521 |
522 |
523 |
524 |
525 |
526 |
527 |
528 |
529 |
530 |
531 |
532 |
533 |
534 |
535 |
536 |
537 |
538 |
539 |
540 |
541 |
542 |
543 |
544 |
545 |
546 |
547 |
548 |
549 |
550 |
551 |
552 |
553 |
554 |
555 |
556 |
557 |
558 |
559 |
560 |
561 |
562 |
563 |
564 |
565 |
566 |
567 |
568 |
569 |
570 |
571 |
572 |
573 |
574 |
575 |
576 |
577 |
578 |
579 |
580 |
581 |
582 |
583 |
584 |
585 |
586 |
587 |
588 |
589 |
590 |
591 |
592 |
593 |
594 |
595 |
596 |
597 |
598 |
599 |
600 |
601 |
602 |
603 |
604 |
605 |
606 |
607 |
608 |
609 |
610 |
611 |
612 |
613 |
614 |
615 |
616 |
617 |
618 |
619 |
620 |
621 |
622 |
623 |
624 |
625 |
626 |
627 |
628 |
629 |
630 |
631 |
632 |
633 |
634 |
635 |
636 |
637 |
638 |
639 |
640 |
641 |
642 |
643 |
644 |
645 |
646 |
647 |
648 |
649 |
650 |
651 |
652 |
653 |
654 |
655 |
656 |
657 |
658 |
659 |
660 |
661 |
662 |
663 |
664 |
665 |
666 |
667 |
668 |
669 |
670 |
671 |
672 |
673 |
674 |
675 |
676 |
677 |
678 |
679 |
680 |
681 |
682 |
683 |
684 |
685 |
686 |
687 |
688 |
689 |
690 |
691 |
692 |
693 |
694 |
695 |
696 |
697 |
698 |
699 |
700 |
701 |
702 |
703 |
704 |
705 |
706 |
707 |
708 |
709 |
710 |
711 |
712 |
713 |
714 |
715 |
716 |
717 |
718 |
719 |
720 |
721 |
722 |
723 |
724 |
725 |
726 |
727 |
728 |
729 |
730 |
731 |
732 |
733 |
734 |
735 |
736 |
737 |
738 |
739 |
740 |
741 |
742 |
743 |
744 |
745 |
746 |
747 |
748 |
749 |
750 |
751 |
752 |
753 |
754 |
755 |
756 |
757 |
758 |
759 |
760 |
761 |
--------------------------------------------------------------------------------
/1-CreateExcel/CreateExcel.py:
--------------------------------------------------------------------------------
1 | # pandasVersusExcel
2 | # http://sa.mentorx.net/course/89/tasks
3 | # 第一课 创建文件
4 | # 2018-10-17
5 |
6 | import pandas as pd
7 |
8 | df = pd.DataFrame({'ID':[0,1,2],'Name':['Mark','Tomi','Jack']})
9 |
10 | # 此处设置数据表的索引,如未设置索引会自动在最前方添加一列作为索引
11 | df = df.set_index('ID') # 会生成新的 DataFrame
12 | # df.set_index('ID',inplace=True) # 在原来的 DataFrame 上进行修改
13 |
14 | # 将数据报错到 output.xlsx
15 | df.to_excel('./output.xlsx')
16 |
17 | print(df)
18 | print('Done')
--------------------------------------------------------------------------------
/1-CreateExcel/output.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChanMenglin/PandasVersusExcel/34043a7ee941ad8a6f402a8c8b8798652ccf484d/1-CreateExcel/output.xlsx
--------------------------------------------------------------------------------
/10-GroupedHistogran&DepthOptimizationChart/GroupedHistogran&DepthOptimizationChart.py:
--------------------------------------------------------------------------------
1 | # pandasVersusExcel
2 | # http://sa.mentorx.net/course/89/tasks
3 | # 第十课 绘制分组柱图,深度优化图表
4 | # 2018-10-18
5 |
6 | import pandas as pd
7 | import matplotlib.pyplot as plt
8 |
9 | students = pd.read_excel('./Students.xlsx')
10 |
11 | print('----原始数据----')
12 | print(students)
13 |
14 | students.sort_values(by='2017',inplace=True,ascending=False)
15 | students.plot.bar(x='Field',y=['2016','2017'],color=['orange','red'])
16 |
17 | plt.title('International Students by Field',fontsize=16,fontweight='bold')
18 | plt.xlabel('Field',fontweight='bold')
19 | plt.ylabel('Numbers',fontweight='bold')
20 | ax = plt.gca() # 获取图表的轴
21 | ax.set_xticklabels(students['Field'],rotation=45,ha='right')
22 | f = plt.gcf() # 获取图表的图形
23 | f.subplots_adjust(left=0.2,bottom=0.42)
24 | # plt.tight_layout()
25 | plt.show()
--------------------------------------------------------------------------------
/10-GroupedHistogran&DepthOptimizationChart/Students.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChanMenglin/PandasVersusExcel/34043a7ee941ad8a6f402a8c8b8798652ccf484d/10-GroupedHistogran&DepthOptimizationChart/Students.xlsx
--------------------------------------------------------------------------------
/11-SuperimposedHistogram&HorizontalHistogram/SuperimposedHistogram&HorizontalHistogram.py:
--------------------------------------------------------------------------------
1 | # pandasVersusExcel
2 | # http://sa.mentorx.net/course/89/tasks
3 | # 第十一课 绘制叠加柱状图,水平柱状图
4 | # 2018-10-19
5 |
6 | import pandas as pd
7 | import matplotlib.pyplot as plt
8 |
9 | users = pd.read_excel('./Users.xlsx',index_col='ID')
10 | users['Total'] = users['Oct'] + users['Nov'] + users['Dec']
11 | users.sort_values(by='Total',inplace=True,ascending=True)
12 | print(users)
13 |
14 | # stacked: 叠加(默认为False)
15 | users.plot.barh(x='Name',y=['Oct','Nov','Dec'],stacked=True,title='User Behavior')
16 |
17 | plt.tight_layout()
18 | plt.show()
19 |
20 | # 补充说明
21 | # users.plot.bar(...) 表示制作竖直柱状图
22 | # users.plot.barh(...) 表示制作水平柱状图
23 | #
24 | #
25 | #
--------------------------------------------------------------------------------
/11-SuperimposedHistogram&HorizontalHistogram/Users.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChanMenglin/PandasVersusExcel/34043a7ee941ad8a6f402a8c8b8798652ccf484d/11-SuperimposedHistogram&HorizontalHistogram/Users.xlsx
--------------------------------------------------------------------------------
/12-PieChart/PieChart.py:
--------------------------------------------------------------------------------
1 | # pandasVersusExcel
2 | # http://sa.mentorx.net/course/89/tasks
3 | # 第十二课 绘制饼图
4 | # 2018-10-19
5 |
6 | import pandas as pd
7 | import matplotlib.pyplot as plt
8 |
9 | students = pd.read_excel('./Students.xlsx',index_col="From")
10 | print('----原始数据----')
11 | print(students)
12 |
13 | # counterclock: True(默认值): 逆时针,False: 顺时针
14 | students['2017'].plot.pie(fontsize=8,counterclock=False)
15 | plt.title('Source of International Students',fontsize=16,fontweight='bold')
16 | plt.ylabel('2017',fontsize=12,fontweight='bold')
17 | plt.show()
--------------------------------------------------------------------------------
/12-PieChart/Students.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChanMenglin/PandasVersusExcel/34043a7ee941ad8a6f402a8c8b8798652ccf484d/12-PieChart/Students.xlsx
--------------------------------------------------------------------------------
/13-PolylineTrendChart&OverlayAreaMap/Orders.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChanMenglin/PandasVersusExcel/34043a7ee941ad8a6f402a8c8b8798652ccf484d/13-PolylineTrendChart&OverlayAreaMap/Orders.xlsx
--------------------------------------------------------------------------------
/13-PolylineTrendChart&OverlayAreaMap/PolylineTrendChart&OverlayAreaMap.py:
--------------------------------------------------------------------------------
1 | # pandasVersusExcel
2 | # http://sa.mentorx.net/course/89/tasks
3 | # 第十三课 绘制折线趋势图、叠加区域图
4 | # 2018-10-19
5 |
6 | import pandas as pd
7 | import matplotlib.pyplot as plt
8 |
9 | weeks = pd.read_excel('./Orders.xlsx',index_col='Week')
10 | print('----原始数据----')
11 | print(weeks)
12 |
13 | # 叠加区域图
14 | weeks.plot.area(y=['Accessories','Bikes','Clothing','Components'])
15 | # 叠加柱状图
16 | # weeks.plot.bar(y=['Accessories','Bikes','Clothing','Components'],stacked=True)
17 | plt.title('Sales Weekly Trend',fontsize=16,fontweight='bold')
18 | plt.ylabel('Total',fontsize=12,fontweight='bold')
19 | plt.xticks(weeks.index,fontsize=8)
20 | plt.show()
21 |
22 | # 补充说明
23 | # weeks.plot(...) 绘制折线图
24 | # weeks.plot.area(...) 绘制叠加区域图
25 | # weeks.plot.bar(...) 绘制叠加柱状腿
26 | #
--------------------------------------------------------------------------------
/14&15-ScatterPlot&Histogram&DensityMap/ScatterPlot&Histogram&DensityMap.py:
--------------------------------------------------------------------------------
1 | # pandasVersusExcel
2 | # http://sa.mentorx.net/course/89/tasks
3 | # 第十四课 散点图,直方图,密度图
4 | # 第十五课 密度图,数据相关性
5 | # 2018-10-19
6 |
7 | import pandas as pd
8 | import matplotlib.pyplot as plt
9 |
10 | pd.options.display.max_columns = 20
11 | homes = pd.read_excel('./home_data.xlsx',index_col='id')
12 | print('----原始数据----')
13 | print(homes.head())
14 | print(homes.columns)
15 |
16 | # 散点图
17 | # homes.plot.scatter(x='sqft_living',y='price')
18 |
19 | # 直方图 bins: 分配粒度
20 | # homes.sqft_living.plot.hist(bins=100)
21 | # plt.xticks(range(0,max(homes.sqft_living),500),fontsize=8,rotation=90)
22 |
23 | # 密度图
24 | homes.sqft_living.plot.kde()
25 | plt.xticks(range(0,max(homes.sqft_living),500),fontsize=8,rotation=90)
26 | plt.show()
27 |
28 |
29 |
--------------------------------------------------------------------------------
/14&15-ScatterPlot&Histogram&DensityMap/home_data.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChanMenglin/PandasVersusExcel/34043a7ee941ad8a6f402a8c8b8798652ccf484d/14&15-ScatterPlot&Histogram&DensityMap/home_data.xlsx
--------------------------------------------------------------------------------
/16-Join/Join.py:
--------------------------------------------------------------------------------
1 | # pandasVersusExcel
2 | # http://sa.mentorx.net/course/89/tasks
3 | # 第十六课 多表联合(Join)
4 | # 2018-10-24
5 |
6 | import pandas as pd
7 |
8 | students = pd.read_excel('./Student_Score.xlsx',sheet_name='Students',index_col='ID')
9 | scores = pd.read_excel('./Student_Score.xlsx',sheet_name='Scores',index_col='ID')
10 | print('----原始数据----')
11 | print('\n----Students----')
12 | print(students)
13 | print(students.columns)
14 | print('\n----Scores----')
15 | print(scores)
16 | print(scores.columns)
17 |
18 | # 联合查询
19 |
20 | # 方法一
21 | # how: 链接方式
22 | # inner(默认)-inner join
23 | # left-左链接
24 | # right-右链接
25 | # on: 链接字段(如果省略此属性,merge会自动以相同的列名作为链接的依据,但不会比较 index_col)
26 | # left_on/right_on: 分别指定两张表的链接依据
27 | # fillna(0): 将'NaN'替换为0
28 | table1 = students.merge(scores,how='left',on='ID').fillna(0)
29 | table1.Score = table1.Score.astype(int) # 将Score中的小数转换为整数
30 | print('\n----联合查询 方法一(inner join)----')
31 | print(table1)
32 |
33 | # 方法二
34 | # how: 链接方式
35 | # inner(默认)-inner join
36 | # left-左链接
37 | # right-右链接
38 | # on: 链接字段(设置了 index_col 时如果省略此属性,join会自动以 index_col 作为链接的依据)
39 | # fillna(0): 将'NaN'替换为0
40 | table2 = students.join(scores,how='left',on='ID').fillna(0)
41 | table2.Score = table2.Score.astype(int) # 将Score中的小数转换为整数
42 | print('\n----联合查询 方法二(inner join)----')
43 | print(table2)
44 |
45 |
46 |
47 |
48 |
--------------------------------------------------------------------------------
/16-Join/Student_Score.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChanMenglin/PandasVersusExcel/34043a7ee941ad8a6f402a8c8b8798652ccf484d/16-Join/Student_Score.xlsx
--------------------------------------------------------------------------------
/17-DataValidation/DataValidation.py:
--------------------------------------------------------------------------------
1 | # pandasVersusExcel
2 | # http://sa.mentorx.net/course/89/tasks
3 | # 第十七课 数据校验,轴的概念
4 | # 2018-10-24
5 |
6 | import pandas as pd
7 |
8 | # 方法一
9 | def score_validation(row):
10 | try:
11 | assert 0 <= row.Score <= 100
12 | except:
13 | print(f'#{row.ID}\tstudent {row.Name} has an invalid score {row.Score}.')
14 |
15 | # 方法二
16 | def score_validation2(row):
17 | if not 0 <= row.Score <= 100:
18 | print(f'#{row.ID}\tstudent {row.Name} has an invalid score {row.Score}.')
19 |
20 | # 在进行数据校验时不要设置 index_col ,这样有助于保证所有数据都进行校验
21 | students = pd.read_excel('./Students.xlsx')
22 | print('----原始数据----')
23 | print(students)
24 | print(students.columns)
25 |
26 | print('\n----校验结果----')
27 | students.apply(score_validation,axis=1)
28 |
29 | # axis = 1: 横向
30 | # axis = 0: 纵向(默认)
--------------------------------------------------------------------------------
/17-DataValidation/Students.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChanMenglin/PandasVersusExcel/34043a7ee941ad8a6f402a8c8b8798652ccf484d/17-DataValidation/Students.xlsx
--------------------------------------------------------------------------------
/18-DataSegmentation/DataSegmentation.py:
--------------------------------------------------------------------------------
1 | # pandasVersusExcel
2 | # http://sa.mentorx.net/course/89/tasks
3 | # 第十八课 把一列数据分割成两列
4 | # 2018-10-24
5 |
6 | import pandas as pd
7 |
8 | employees = pd.read_excel('./Employees.xlsx',index_col='ID')
9 | df = employees['Full Name'].str.split(expand=True)
10 | print('----原始数据----')
11 | print(employees)
12 | print(employees.columns)
13 | print(df)
14 |
15 | employees['First Name'] = df[0]
16 | employees['Last Name'] = df[0]
17 | print('\n----分割后的结果----')
18 | print(employees)
19 |
20 | # 补充
21 | # split() 方法:
22 | # split(' ',n=0,expand=True)
23 | # split 的第一个参数: 表示分隔符默认为空格或tab
24 | # split 的第二个参数 n: 表示最多分割的个数(0或-1 表示分割成尽可能多的个数)
25 | # split 的第二个参数 expand: 默认为 False (False:分割后生成数组,占一列;True: 分割成单独的列)
--------------------------------------------------------------------------------
/18-DataSegmentation/Employees.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChanMenglin/PandasVersusExcel/34043a7ee941ad8a6f402a8c8b8798652ccf484d/18-DataSegmentation/Employees.xlsx
--------------------------------------------------------------------------------
/19-Statistics/Statistics.py:
--------------------------------------------------------------------------------
1 | # pandasVersusExcel
2 | # http://sa.mentorx.net/course/89/tasks
3 | # 第十九课 求和,求平均,统计导引
4 | # 2018-10-24
5 |
6 | import pandas as pd
7 |
8 | students = pd.read_excel('./Students.xlsx',index_col='ID')
9 | print('----原始数据----')
10 | print(students)
11 | print(students.columns)
12 |
13 | temp = students[['Test_1','Test_2','Test_3']]
14 | print('\n----需要计算的元数据----')
15 | print(temp)
16 |
17 | # 和
18 | raw_sum = temp.sum(axis=1)
19 | print('\n----求和----')
20 | print(raw_sum)
21 |
22 | # 平均值
23 | raw_mean = temp.mean(axis=1)
24 | print('\n----求平均值----')
25 | print(raw_mean)
26 |
27 | students['Total'] = raw_sum
28 | students['Average'] = raw_mean
29 | print('\n----整合结果----')
30 | print(students)
31 |
32 | col_mean = students[['Test_1','Test_2','Test_3','Total','Average']].mean()
33 | col_mean['Name'] = 'Summary'
34 | students = students.append(col_mean,ignore_index=True)
35 | print('\n----最终结果----')
36 | print(students)
37 |
38 | # axis = 1: 横向
39 | # axis = 0: 纵向(默认)
--------------------------------------------------------------------------------
/19-Statistics/Students.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChanMenglin/PandasVersusExcel/34043a7ee941ad8a6f402a8c8b8798652ccf484d/19-Statistics/Students.xlsx
--------------------------------------------------------------------------------
/2-ReadExcel/People.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChanMenglin/PandasVersusExcel/34043a7ee941ad8a6f402a8c8b8798652ccf484d/2-ReadExcel/People.xlsx
--------------------------------------------------------------------------------
/2-ReadExcel/ReadExcel.py:
--------------------------------------------------------------------------------
1 | # pandasVersusExcel
2 | # http://sa.mentorx.net/course/89/tasks
3 | # 第二课 读取文件
4 | # 2018-10-17
5 |
6 | import pandas as pd
7 |
8 | # --基本数据的读取--
9 |
10 | # 读取文件
11 | # head:默认0,表示开始读取的行(默认会跳过顶部的空行)
12 | # index_col: 指定数据的索引列
13 | people = pd.read_excel('./People.xlsx',head=1,index_col='ID')
14 | # 读取文件的行数和列数
15 | shape = people.shape
16 | # 读取文件的行,不会显示索引列
17 | columns = people.columns
18 | # 读取文件的前几行(默认为5,可传入指定行数)
19 | head = people.head()
20 | # 读取文件的末尾几行(默认为5,可传入指定行数)
21 | tail = people.tail()
22 |
23 | # --当数据文件存在坏数据时可按以下方式处理--
24 | # (以下内容根据实际情况使用,同时使用会造成数据混乱)
25 |
26 | # 当标题行上有坏数据时可使用 head 参数
27 | people1 = pd.read_excel('./People.xlsx',head=1)
28 |
29 | # 当数据表中没有标题行时可将 head 的值设为 None 表示无标题
30 | people2 = pd.read_excel('./People.xlsx',head=None)
31 | people2.columns = ['ID', 'Type', 'Title', 'FirstName', 'MiddleName', 'LastName']
32 |
33 | print(columns)
--------------------------------------------------------------------------------
/20-DuplicateData/DuplicateData.py:
--------------------------------------------------------------------------------
1 | # pandasVersusExcel
2 | # http://sa.mentorx.net/course/89/tasks
3 | # 第二十课 定位、消除重复数据
4 | # 2018-10-24
5 |
6 | import pandas as pd
7 |
8 | students = pd.read_excel('./Students_Duplicates.xlsx')
9 | print('----原始数据----')
10 | print(students)
11 | print(students.columns)
12 |
13 | dupe = students.duplicated(subset='Name')
14 | print('\n----检查重复数据(True为重复)----')
15 | print(dupe)
16 |
17 | dupe = dupe[dupe] # 获取重复的行,等同于dupe = dupe[dupe==True]
18 | print('\n----查看重复数据----')
19 | print(students.iloc[dupe.index])
20 |
21 | students.drop_duplicates(subset='Name',inplace=True)
22 | print('\n----消除重复数据后的数据----')
23 | print(students)
24 |
--------------------------------------------------------------------------------
/20-DuplicateData/Students_Duplicates.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChanMenglin/PandasVersusExcel/34043a7ee941ad8a6f402a8c8b8798652ccf484d/20-DuplicateData/Students_Duplicates.xlsx
--------------------------------------------------------------------------------
/21-RotateDataSet/RotateDataSet.py:
--------------------------------------------------------------------------------
1 | # pandasVersusExcel
2 | # http://sa.mentorx.net/course/89/tasks
3 | # 第二十一课 旋转数据表(行/列转换)
4 | # 2018-10-25
5 |
6 | import pandas as pd
7 |
8 | # 设置最大显示列数为20
9 | pd.options.display.max_columns=20
10 | video = pd.read_excel('./Videos.xlsx',index_col='Month')
11 | print('----原始数据----')
12 | print(video)
13 | print(video.columns)
14 |
15 | table = video.transpose()
16 | print('\n----行列转换的结果----')
17 | print(table)
--------------------------------------------------------------------------------
/21-RotateDataSet/Videos.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChanMenglin/PandasVersusExcel/34043a7ee941ad8a6f402a8c8b8798652ccf484d/21-RotateDataSet/Videos.xlsx
--------------------------------------------------------------------------------
/22-ReadData/ReadData.py:
--------------------------------------------------------------------------------
1 | # pandasVersusExcel
2 | # http://sa.mentorx.net/course/89/tasks
3 | # 第二十二课 读取CSV、TSV、TXT文件中的数据
4 | # 2018-10-25
5 |
6 | import pandas as pd
7 |
8 | student_csv = pd.read_csv('./Students.csv',index_col='ID')
9 | print('----读取 csv 数据----')
10 | print(student_csv)
11 |
12 | # sep 指定分隔符(读取csv文件时可省略,默认为 ',')
13 | student_tsv = pd.read_csv('./Students.tsv',sep='\t',index_col='ID')
14 | print('\n----读取 tsv 数据----')
15 | print(student_tsv)
16 |
17 | student_txt = pd.read_csv('./Students.txt',sep='|',index_col='ID')
18 | print('\n----读取 txt 数据----')
19 | print(student_txt)
--------------------------------------------------------------------------------
/22-ReadData/Students.csv:
--------------------------------------------------------------------------------
1 | ID,Name,Age
2 | 1,Student_001,21
3 | 2,Student_002,22
4 | 3,Student_003,23
5 | 4,Student_004,24
6 | 5,Student_005,25
7 | 6,Student_006,26
8 | 7,Student_007,27
9 | 8,Student_008,28
10 | 9,Student_009,29
11 | 10,Student_010,30
12 | 11,Student_011,31
13 | 12,Student_012,32
14 | 13,Student_013,33
15 | 14,Student_014,34
16 | 15,Student_015,35
17 | 16,Student_016,36
18 | 17,Student_017,37
19 | 18,Student_018,38
20 | 19,Student_019,39
21 | 20,Student_020,40
22 |
--------------------------------------------------------------------------------
/22-ReadData/Students.tsv:
--------------------------------------------------------------------------------
1 | ID Name Age
2 | 1 Student_001 21
3 | 2 Student_002 22
4 | 3 Student_003 23
5 | 4 Student_004 24
6 | 5 Student_005 25
7 | 6 Student_006 26
8 | 7 Student_007 27
9 | 8 Student_008 28
10 | 9 Student_009 29
11 | 10 Student_010 30
12 | 11 Student_011 31
13 | 12 Student_012 32
14 | 13 Student_013 33
15 | 14 Student_014 34
16 | 15 Student_015 35
17 | 16 Student_016 36
18 | 17 Student_017 37
19 | 18 Student_018 38
20 | 19 Student_019 39
21 | 20 Student_020 40
22 |
--------------------------------------------------------------------------------
/22-ReadData/Students.txt:
--------------------------------------------------------------------------------
1 | ID|Name|Age
2 | 1|Student_001|21
3 | 2|Student_002|22
4 | 3|Student_003|23
5 | 4|Student_004|24
6 | 5|Student_005|25
7 | 6|Student_006|26
8 | 7|Student_007|27
9 | 8|Student_008|28
10 | 9|Student_009|29
11 | 10|Student_010|30
12 | 11|Student_011|31
13 | 12|Student_012|32
14 | 13|Student_013|33
15 | 14|Student_014|34
16 | 15|Student_015|35
17 | 16|Student_016|36
18 | 17|Student_017|37
19 | 18|Student_018|38
20 | 19|Student_019|39
21 | 20|Student_020|40
22 |
--------------------------------------------------------------------------------
/23-GroupBy/GroupBy.py:
--------------------------------------------------------------------------------
1 | # pandasVersusExcel
2 | # http://sa.mentorx.net/course/89/tasks
3 | # 第二十三课 透视表,分组,聚合(group by)
4 | # 2018-10-25
5 |
6 | import pandas as pd
7 | import numpy as np
8 |
9 | # 设置最大显示列数为20
10 | pd.options.display.max_columns=20
11 | orders = pd.read_excel('./Orders.xlsx')
12 | print('----原始数据----')
13 | print(orders.head())
14 | print(orders.columns)
15 |
16 | orders['Year'] = pd.DatetimeIndex(orders['Date']).year
17 |
18 | # 方法一
19 | tt1 = orders.pivot_table(index='Category',columns='Year',values='Total',aggfunc=np.sum)
20 | print('\n----方法一----')
21 | print(tt1)
22 |
23 | group = orders.groupby(['Category','Year'])
24 | s = group['Total'].sum()
25 | c = group['ID'].count()
26 |
27 | tt2 = pd.DataFrame({'Sum':s,'Count':c})
28 | print('\n----方法二----')
29 | print(tt2)
--------------------------------------------------------------------------------
/23-GroupBy/Orders.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChanMenglin/PandasVersusExcel/34043a7ee941ad8a6f402a8c8b8798652ccf484d/23-GroupBy/Orders.xlsx
--------------------------------------------------------------------------------
/24-DataPrediction/DataPrediction.py:
--------------------------------------------------------------------------------
1 | # pandasVersusExcel
2 | # http://sa.mentorx.net/course/89/tasks
3 | # 第二十四课 线性回归,数据预测
4 | # 2018-10-25
5 |
6 | import pandas as pd
7 | import matplotlib.pyplot as plt
8 | from scipy.stats import linregress
9 |
10 | sales = pd.read_excel('./Sales.xlsx',dtype={'Month':str})
11 | print('----原始数据----')
12 | print(sales.head())
13 | print(sales.columns)
14 |
15 | # https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.linregress.html
16 | # linregress 计算两组测量的线性最小二乘回归。
17 | # 共传递两个参数,这连个参数应为连个数组,并且两个素组的值应该一一对应
18 | # 返回值:
19 | # slope:回归线的斜率
20 | # intercept:回归线的截距
21 | # r:相关系数
22 | # p:假设检验的双侧p值,其零假设是斜率为零,使用Wald检验,检验统计量的t分布
23 | # std_err:估计梯度的标准误差。
24 | slope,intercept,r,p,std_err = linregress(sales.index,sales.Revenue)
25 |
26 | # 期望值
27 | exp = sales.index * slope + intercept
28 | # 线性回归方程回归方程
29 | # y = slope * x + intercept
30 |
31 | plt.scatter(sales.index,sales.Revenue)
32 | plt.plot(sales.index,exp,color='orange')
33 | plt.title('Sales')
34 | plt.xticks(sales.index,sales.Month,rotation=90)
35 | plt.tight_layout()
36 | plt.show()
37 |
38 |
--------------------------------------------------------------------------------
/24-DataPrediction/Sales.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChanMenglin/PandasVersusExcel/34043a7ee941ad8a6f402a8c8b8798652ccf484d/24-DataPrediction/Sales.xlsx
--------------------------------------------------------------------------------
/25&26-ConditionalFormatting/.gitignore:
--------------------------------------------------------------------------------
1 | .ipynb_checkpoints
2 | .ipynb_checkpoints\*
--------------------------------------------------------------------------------
/25&26-ConditionalFormatting/ConditionalFormatting01.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 9,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "name": "stdout",
10 | "output_type": "stream",
11 | "text": [
12 | "----原始数据----\n",
13 | " ID Name Test_1 Test_2 Test_3\n",
14 | "0 1 Student_001 41 31 54\n",
15 | "1 2 Student_002 86 22 59\n",
16 | "2 3 Student_003 54 25 68\n",
17 | "3 4 Student_004 82 36 92\n",
18 | "4 5 Student_005 55 99 93\n",
19 | "5 6 Student_006 86 12 50\n",
20 | "6 7 Student_007 91 11 54\n",
21 | "7 8 Student_008 20 98 52\n",
22 | "8 9 Student_009 74 85 63\n",
23 | "9 10 Student_010 28 98 99\n",
24 | "10 11 Student_011 35 83 85\n",
25 | "11 12 Student_012 23 48 67\n",
26 | "12 13 Student_013 45 62 90\n",
27 | "13 14 Student_014 63 26 56\n",
28 | "14 15 Student_015 50 64 70\n",
29 | "15 16 Student_016 69 31 96\n",
30 | "16 17 Student_017 98 78 55\n",
31 | "17 18 Student_018 65 74 95\n",
32 | "18 19 Student_019 95 51 61\n",
33 | "19 20 Student_020 83 72 82\n",
34 | "Index(['ID', 'Name', 'Test_1', 'Test_2', 'Test_3'], dtype='object')\n"
35 | ]
36 | },
37 | {
38 | "data": {
39 | "text/html": [
40 | " \n",
222 | "
\n",
223 | " \n",
224 | " | \n",
225 | " ID | \n",
226 | " Name | \n",
227 | " Test_1 | \n",
228 | " Test_2 | \n",
229 | " Test_3 | \n",
230 | "
\n",
231 | " \n",
232 | " 0 | \n",
233 | " 1 | \n",
234 | " Student_001 | \n",
235 | " 41 | \n",
236 | " 31 | \n",
237 | " 54 | \n",
238 | "
\n",
239 | " 1 | \n",
240 | " 2 | \n",
241 | " Student_002 | \n",
242 | " 86 | \n",
243 | " 22 | \n",
244 | " 59 | \n",
245 | "
\n",
246 | " 2 | \n",
247 | " 3 | \n",
248 | " Student_003 | \n",
249 | " 54 | \n",
250 | " 25 | \n",
251 | " 68 | \n",
252 | "
\n",
253 | " 3 | \n",
254 | " 4 | \n",
255 | " Student_004 | \n",
256 | " 82 | \n",
257 | " 36 | \n",
258 | " 92 | \n",
259 | "
\n",
260 | " 4 | \n",
261 | " 5 | \n",
262 | " Student_005 | \n",
263 | " 55 | \n",
264 | " 99 | \n",
265 | " 93 | \n",
266 | "
\n",
267 | " 5 | \n",
268 | " 6 | \n",
269 | " Student_006 | \n",
270 | " 86 | \n",
271 | " 12 | \n",
272 | " 50 | \n",
273 | "
\n",
274 | " 6 | \n",
275 | " 7 | \n",
276 | " Student_007 | \n",
277 | " 91 | \n",
278 | " 11 | \n",
279 | " 54 | \n",
280 | "
\n",
281 | " 7 | \n",
282 | " 8 | \n",
283 | " Student_008 | \n",
284 | " 20 | \n",
285 | " 98 | \n",
286 | " 52 | \n",
287 | "
\n",
288 | " 8 | \n",
289 | " 9 | \n",
290 | " Student_009 | \n",
291 | " 74 | \n",
292 | " 85 | \n",
293 | " 63 | \n",
294 | "
\n",
295 | " 9 | \n",
296 | " 10 | \n",
297 | " Student_010 | \n",
298 | " 28 | \n",
299 | " 98 | \n",
300 | " 99 | \n",
301 | "
\n",
302 | " 10 | \n",
303 | " 11 | \n",
304 | " Student_011 | \n",
305 | " 35 | \n",
306 | " 83 | \n",
307 | " 85 | \n",
308 | "
\n",
309 | " 11 | \n",
310 | " 12 | \n",
311 | " Student_012 | \n",
312 | " 23 | \n",
313 | " 48 | \n",
314 | " 67 | \n",
315 | "
\n",
316 | " 12 | \n",
317 | " 13 | \n",
318 | " Student_013 | \n",
319 | " 45 | \n",
320 | " 62 | \n",
321 | " 90 | \n",
322 | "
\n",
323 | " 13 | \n",
324 | " 14 | \n",
325 | " Student_014 | \n",
326 | " 63 | \n",
327 | " 26 | \n",
328 | " 56 | \n",
329 | "
\n",
330 | " 14 | \n",
331 | " 15 | \n",
332 | " Student_015 | \n",
333 | " 50 | \n",
334 | " 64 | \n",
335 | " 70 | \n",
336 | "
\n",
337 | " 15 | \n",
338 | " 16 | \n",
339 | " Student_016 | \n",
340 | " 69 | \n",
341 | " 31 | \n",
342 | " 96 | \n",
343 | "
\n",
344 | " 16 | \n",
345 | " 17 | \n",
346 | " Student_017 | \n",
347 | " 98 | \n",
348 | " 78 | \n",
349 | " 55 | \n",
350 | "
\n",
351 | " 17 | \n",
352 | " 18 | \n",
353 | " Student_018 | \n",
354 | " 65 | \n",
355 | " 74 | \n",
356 | " 95 | \n",
357 | "
\n",
358 | " 18 | \n",
359 | " 19 | \n",
360 | " Student_019 | \n",
361 | " 95 | \n",
362 | " 51 | \n",
363 | " 61 | \n",
364 | "
\n",
365 | " 19 | \n",
366 | " 20 | \n",
367 | " Student_020 | \n",
368 | " 83 | \n",
369 | " 72 | \n",
370 | " 82 | \n",
371 | "
\n",
372 | "
"
373 | ],
374 | "text/plain": [
375 | ""
376 | ]
377 | },
378 | "execution_count": 9,
379 | "metadata": {},
380 | "output_type": "execute_result"
381 | }
382 | ],
383 | "source": [
384 | "# pandasVersusExcel\n",
385 | "# http://sa.mentorx.net/course/89/tasks\n",
386 | "# 第二十五课 条件格式化(上)\n",
387 | "# 2018-10-26\n",
388 | "\n",
389 | "import pandas as pd \n",
390 | "\n",
391 | "def low_score_red(s):\n",
392 | " color = 'red' if s < 60 else 'black'\n",
393 | " return f'color:{color}'\n",
394 | "\n",
395 | "def highest_score_green(col):\n",
396 | " return ['background-color:lime' if s == col.max() else 'background-color:white' for s in col]\n",
397 | "\n",
398 | "students = pd.read_excel('./Students.xlsx')\n",
399 | "print('----原始数据----')\n",
400 | "print(students)\n",
401 | "print(students.columns)\n",
402 | " \n",
403 | "students.style.applymap(low_score_red, subset=['Test_1', 'Test_2', 'Test_3']) \\\n",
404 | ".apply(highest_score_green, subset=['Test_1', 'Test_2', 'Test_3'])\n",
405 | "\n"
406 | ]
407 | },
408 | {
409 | "cell_type": "code",
410 | "execution_count": null,
411 | "metadata": {},
412 | "outputs": [],
413 | "source": []
414 | },
415 | {
416 | "cell_type": "code",
417 | "execution_count": null,
418 | "metadata": {},
419 | "outputs": [],
420 | "source": []
421 | }
422 | ],
423 | "metadata": {
424 | "kernelspec": {
425 | "display_name": "Python 3",
426 | "language": "python",
427 | "name": "python3"
428 | },
429 | "language_info": {
430 | "codemirror_mode": {
431 | "name": "ipython",
432 | "version": 3
433 | },
434 | "file_extension": ".py",
435 | "mimetype": "text/x-python",
436 | "name": "python",
437 | "nbconvert_exporter": "python",
438 | "pygments_lexer": "ipython3",
439 | "version": "3.6.6"
440 | }
441 | },
442 | "nbformat": 4,
443 | "nbformat_minor": 2
444 | }
445 |
--------------------------------------------------------------------------------
/25&26-ConditionalFormatting/ConditionalFormatting01.py:
--------------------------------------------------------------------------------
1 | # pandasVersusExcel
2 | # http://sa.mentorx.net/course/89/tasks
3 | # 第二十五课 条件格式化(上)
4 | # 2018-10-26
5 |
6 | import pandas as pd
7 | import seaborn as sns
8 |
9 |
def low_score_red(s):
    """Return the CSS text-colour rule for a single score.

    Scores below 60 produce ``'color:red'``; every other score produces
    ``'color:black'``.
    """
    if s < 60:
        return 'color:red'
    return 'color:black'
13 |
14 |
def highest_score_green(col):
    """Return one CSS background rule per element of ``col``.

    Elements equal to the column maximum get ``'background-color:lime'``;
    all others get ``'background-color:white'``.

    :param col: a column of scores exposing ``.max()`` and iteration
        (the script passes it to ``Styler.apply``).
    :return: list of CSS strings, one per element, in column order.
    """
    # Compute the maximum once instead of once per element.
    top = col.max()
    return ['background-color:lime' if s == top else 'background-color:white' for s in col]
17 |
18 |
19 | students = pd.read_excel('./Students.xlsx')
20 | print('----原始数据----')
21 | print(students)
22 | print(students.columns)
23 |
24 | students.style.applymap(low_score_red, subset=['Test_1', 'Test_2', 'Test_3']) \
25 | .apply(highest_score_green, subset=['Test_1', 'Test_2', 'Test_3'])
26 |
27 | # 说明
28 | # 由于编辑器的支持问题,此代码的效果可能无法展现
29 | # 请在 Anaconda 的 jupyter notebook 中打开 'ConditionalFormatting01.ipynb' 查看运行效果
30 |
31 |
32 |
--------------------------------------------------------------------------------
/25&26-ConditionalFormatting/ConditionalFormatting02.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 7,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "name": "stdout",
10 | "output_type": "stream",
11 | "text": [
12 | "----原始数据----\n",
13 | " ID Name Test_1 Test_2 Test_3\n",
14 | "0 1 Student_001 41 31 54\n",
15 | "1 2 Student_002 86 22 59\n",
16 | "2 3 Student_003 54 25 68\n",
17 | "3 4 Student_004 82 36 92\n",
18 | "4 5 Student_005 55 99 93\n",
19 | "5 6 Student_006 86 12 50\n",
20 | "6 7 Student_007 91 11 54\n",
21 | "7 8 Student_008 20 98 52\n",
22 | "8 9 Student_009 74 85 63\n",
23 | "9 10 Student_010 28 98 99\n",
24 | "10 11 Student_011 35 83 85\n",
25 | "11 12 Student_012 23 48 67\n",
26 | "12 13 Student_013 45 62 90\n",
27 | "13 14 Student_014 63 26 56\n",
28 | "14 15 Student_015 50 64 70\n",
29 | "15 16 Student_016 69 31 96\n",
30 | "16 17 Student_017 98 78 55\n",
31 | "17 18 Student_018 65 74 95\n",
32 | "18 19 Student_019 95 51 61\n",
33 | "19 20 Student_020 83 72 82\n",
34 | "Index(['ID', 'Name', 'Test_1', 'Test_2', 'Test_3'], dtype='object')\n"
35 | ]
36 | },
37 | {
38 | "data": {
39 | "text/html": [
40 | " \n",
200 | " \n",
201 | " \n",
202 | " | \n",
203 | " ID | \n",
204 | " Name | \n",
205 | " Test_1 | \n",
206 | " Test_2 | \n",
207 | " Test_3 | \n",
208 | "
\n",
209 | " \n",
210 | " 0 | \n",
211 | " 1 | \n",
212 | " Student_001 | \n",
213 | " 41 | \n",
214 | " 31 | \n",
215 | " 54 | \n",
216 | "
\n",
217 | " 1 | \n",
218 | " 2 | \n",
219 | " Student_002 | \n",
220 | " 86 | \n",
221 | " 22 | \n",
222 | " 59 | \n",
223 | "
\n",
224 | " 2 | \n",
225 | " 3 | \n",
226 | " Student_003 | \n",
227 | " 54 | \n",
228 | " 25 | \n",
229 | " 68 | \n",
230 | "
\n",
231 | " 3 | \n",
232 | " 4 | \n",
233 | " Student_004 | \n",
234 | " 82 | \n",
235 | " 36 | \n",
236 | " 92 | \n",
237 | "
\n",
238 | " 4 | \n",
239 | " 5 | \n",
240 | " Student_005 | \n",
241 | " 55 | \n",
242 | " 99 | \n",
243 | " 93 | \n",
244 | "
\n",
245 | " 5 | \n",
246 | " 6 | \n",
247 | " Student_006 | \n",
248 | " 86 | \n",
249 | " 12 | \n",
250 | " 50 | \n",
251 | "
\n",
252 | " 6 | \n",
253 | " 7 | \n",
254 | " Student_007 | \n",
255 | " 91 | \n",
256 | " 11 | \n",
257 | " 54 | \n",
258 | "
\n",
259 | " 7 | \n",
260 | " 8 | \n",
261 | " Student_008 | \n",
262 | " 20 | \n",
263 | " 98 | \n",
264 | " 52 | \n",
265 | "
\n",
266 | " 8 | \n",
267 | " 9 | \n",
268 | " Student_009 | \n",
269 | " 74 | \n",
270 | " 85 | \n",
271 | " 63 | \n",
272 | "
\n",
273 | " 9 | \n",
274 | " 10 | \n",
275 | " Student_010 | \n",
276 | " 28 | \n",
277 | " 98 | \n",
278 | " 99 | \n",
279 | "
\n",
280 | " 10 | \n",
281 | " 11 | \n",
282 | " Student_011 | \n",
283 | " 35 | \n",
284 | " 83 | \n",
285 | " 85 | \n",
286 | "
\n",
287 | " 11 | \n",
288 | " 12 | \n",
289 | " Student_012 | \n",
290 | " 23 | \n",
291 | " 48 | \n",
292 | " 67 | \n",
293 | "
\n",
294 | " 12 | \n",
295 | " 13 | \n",
296 | " Student_013 | \n",
297 | " 45 | \n",
298 | " 62 | \n",
299 | " 90 | \n",
300 | "
\n",
301 | " 13 | \n",
302 | " 14 | \n",
303 | " Student_014 | \n",
304 | " 63 | \n",
305 | " 26 | \n",
306 | " 56 | \n",
307 | "
\n",
308 | " 14 | \n",
309 | " 15 | \n",
310 | " Student_015 | \n",
311 | " 50 | \n",
312 | " 64 | \n",
313 | " 70 | \n",
314 | "
\n",
315 | " 15 | \n",
316 | " 16 | \n",
317 | " Student_016 | \n",
318 | " 69 | \n",
319 | " 31 | \n",
320 | " 96 | \n",
321 | "
\n",
322 | " 16 | \n",
323 | " 17 | \n",
324 | " Student_017 | \n",
325 | " 98 | \n",
326 | " 78 | \n",
327 | " 55 | \n",
328 | "
\n",
329 | " 17 | \n",
330 | " 18 | \n",
331 | " Student_018 | \n",
332 | " 65 | \n",
333 | " 74 | \n",
334 | " 95 | \n",
335 | "
\n",
336 | " 18 | \n",
337 | " 19 | \n",
338 | " Student_019 | \n",
339 | " 95 | \n",
340 | " 51 | \n",
341 | " 61 | \n",
342 | "
\n",
343 | " 19 | \n",
344 | " 20 | \n",
345 | " Student_020 | \n",
346 | " 83 | \n",
347 | " 72 | \n",
348 | " 82 | \n",
349 | "
\n",
350 | "
"
351 | ],
352 | "text/plain": [
353 | ""
354 | ]
355 | },
356 | "execution_count": 7,
357 | "metadata": {},
358 | "output_type": "execute_result"
359 | }
360 | ],
361 | "source": [
362 | "# pandasVersusExcel\n",
363 | "# http://sa.mentorx.net/course/89/tasks\n",
364 | "# 第二十六课 条件格式化(下)\n",
365 | "# 2018-10-26\n",
366 | "\n",
367 | "import pandas as pd\n",
368 | "import seaborn as sns\n",
369 | "\n",
370 | "students = pd.read_excel('./Students.xlsx')\n",
371 | "print('----原始数据----')\n",
372 | "print(students)\n",
373 | "print(students.columns)\n",
374 | "\n",
375 | "# 以下两种效果不要同时使用,会被覆盖\n",
376 | "\n",
377 | "# 根据数据的大小显示不同深度的颜色\n",
378 | "col_map = sns.light_palette('green', as_cmap=True)\n",
379 | "# students.style.background_gradient(col_map, subset=['Test_1', 'Test_2', 'Test_3']) # 需要引入 seaborn\n",
380 | "\n",
381 | "# 根据数据的大小显示不同长度的色条\n",
382 | "students.style.bar(color='orange', subset=['Test_1', 'Test_2', 'Test_3']) # 不需要引入 seaborn\n"
383 | ]
384 | },
385 | {
386 | "cell_type": "code",
387 | "execution_count": null,
388 | "metadata": {},
389 | "outputs": [],
390 | "source": []
391 | }
392 | ],
393 | "metadata": {
394 | "kernelspec": {
395 | "display_name": "Python 3",
396 | "language": "python",
397 | "name": "python3"
398 | },
399 | "language_info": {
400 | "codemirror_mode": {
401 | "name": "ipython",
402 | "version": 3
403 | },
404 | "file_extension": ".py",
405 | "mimetype": "text/x-python",
406 | "name": "python",
407 | "nbconvert_exporter": "python",
408 | "pygments_lexer": "ipython3",
409 | "version": "3.6.6"
410 | }
411 | },
412 | "nbformat": 4,
413 | "nbformat_minor": 2
414 | }
415 |
--------------------------------------------------------------------------------
/25&26-ConditionalFormatting/ConditionalFormatting02.py:
--------------------------------------------------------------------------------
1 | # pandasVersusExcel
2 | # http://sa.mentorx.net/course/89/tasks
3 | # 第二十六课 条件格式化(下)
4 | # 2018-10-26
5 |
6 | import pandas as pd
7 | import seaborn as sns
8 |
9 | students = pd.read_excel('./Students.xlsx')
10 | print('----原始数据----')
11 | print(students)
12 | print(students.columns)
13 |
14 | # 以下两种效果不要同时使用,会被覆盖
15 |
16 | # 根据数据的大小显示不同深度的颜色
17 | col_map = sns.light_palette('green', as_cmap=True)
18 | # students.style.background_gradient(col_map, subset=['Test_1', 'Test_2', 'Test_3']) # 需要引入 seaborn
19 |
20 | # 根据数据的大小显示不同长度的色条
21 | students.style.bar(color='orange', subset=['Test_1', 'Test_2', 'Test_3']) # 不需要引入 seaborn
22 |
23 | # 说明
24 | # 由于编辑器的支持问题,此代码的效果可能无法展现
25 | # 请在 Anaconda 的 jupyter notebook 中打开 'ConditionalFormatting02.ipynb' 查看运行效果
26 |
--------------------------------------------------------------------------------
/25&26-ConditionalFormatting/Students.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChanMenglin/PandasVersusExcel/34043a7ee941ad8a6f402a8c8b8798652ccf484d/25&26-ConditionalFormatting/Students.xlsx
--------------------------------------------------------------------------------
/27-RowOperation/RowOperation.py:
--------------------------------------------------------------------------------
1 | # pandasVersusExcel
2 | # http://sa.mentorx.net/course/89/tasks
3 | # 第二十七课 行操作集锦
4 | # 建议在第八讲之后查看
5 | # 2018-10-30
6 |
7 | import pandas as pd
8 |
# Load the two worksheets, using the ID column as the row index.
students_001 = pd.read_excel('./Students.xlsx', sheet_name='Page_001', index_col='ID')
students_002 = pd.read_excel('./Students.xlsx', sheet_name='Page_002', index_col='ID')
print('----原始数据----')
print('\n----Page_001----')
print(students_001)
print('\n----Page_002----')
print(students_002)

# Append one data set to the other.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement.
students_add_dates = pd.concat([students_001, students_002])
print('\n----追加数据集----')
print(students_add_dates)

# Append a single row (the Series is wrapped in a one-row DataFrame for concat).
stu_col1 = pd.Series({'Name': 'Abel', 'Score': 99})
students_add_col = pd.concat([students_add_dates, pd.DataFrame([stu_col1])], ignore_index=True)
print('\n----追加数据行----')
print(students_add_col)

# Modify data, method one: address a single cell by (index label, column).
students_001.at[1, 'Name'] = 'Jack'
students_001.at[1, 'Score'] = 100
print('\n----更改数据 方法一----')
print(students_001)

# Modify data, method two: replace a whole row by position.
stu_col2 = pd.Series({'ID': 1, "Name": 'Chen', 'Score': 110})
students_001.iloc[0] = stu_col2  # iloc takes the positional row number, starting at 0
print('\n----更改数据 方法二----')
print(students_001)

# Insert a row in the middle: split at position 15 and stitch the parts back together.
stu_col3 = pd.Series({"Name": 'Scort', 'Score': 110})
part1 = students_001[:15]
part2 = students_001[15:]
students_001 = pd.concat([part1, pd.DataFrame([stu_col3]), part2], ignore_index=True)
print('\n----在数据中插入一行----')
print(students_001)

# Delete a row by index label.
students_drop_col = students_001.drop(index=[15])
print('\n----删除数据行----')
print(students_drop_col)

# Conditional deletion.
# First blank out some names. Write through the frame itself: the chained form
# students_001['Name'].at[i] = '' may modify a temporary instead of the frame.
for i in range(5, 15):
    students_001.at[i, 'Name'] = ''

# Then drop every row whose Name is empty.
missing = students_001.loc[students_001['Name'] == '']
students_001.drop(missing.index, inplace=True)
print('\n----带条件的删除----')
print(students_001)
--------------------------------------------------------------------------------
/27-RowOperation/Students.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChanMenglin/PandasVersusExcel/34043a7ee941ad8a6f402a8c8b8798652ccf484d/27-RowOperation/Students.xlsx
--------------------------------------------------------------------------------
/28-ColOperation/ColOperation.py:
--------------------------------------------------------------------------------
1 | # pandasVersusExcel
2 | # http://sa.mentorx.net/course/89/tasks
3 | # 第二十八课 列操作集锦
4 | # 建议在第八讲之后查看
5 | # 2018-10-30
6 |
7 | import pandas as pd
8 | import numpy as np
9 |
10 | students_001 = pd.read_excel('./Students.xlsx',sheet_name='Page_001')
11 | students_002 = pd.read_excel('./Students.xlsx',sheet_name='Page_002')
12 | print('----原始数据----')
13 | print('\n----Page_001----')
14 | print(students_001)
15 | print('\n----Page_002----')
16 | print(students_002)
17 |
18 | # 追加数据集
19 | students_add_dates = pd.concat([students_001,students_002],axis=1)
20 | print('\n----追加数据集(极少使用)----')
21 | print(students_add_dates)
22 |
23 | students = pd.concat([students_001,students_002]).reset_index(drop=True)
24 | print('\n----将要使用的数据----')
25 | print(students)
26 |
27 | # 追加数据列
28 | # students['Age'] = 25 # 等同于 np.repeat(25,len(students))
29 | students['Age'] = np.arange(0,len(students))
30 | print('\n----追加数据列----')
31 | print(students)
32 |
33 | # 删除列
34 | students.drop(columns='Age',inplace=True)
35 | print('\n----删除列----')
36 | print(students)
37 |
38 | # 插入列
39 | students.insert(1,column='Foo',value=np.repeat('foo',len(students)))
40 | print('\n----插入列----')
41 | print(students)
42 |
43 | # 修改列名
44 | students.rename(columns={'Foo':'FOO','Name':'NAME'},inplace=True)
45 | print('\n----修改列名----')
46 | print(students)
47 |
# Drop rows that contain missing values.
# Create some missing values first (ID is cast to float so it can hold NaN).
students['ID'] = students['ID'].astype(float)
for i in range(3, 5):
    # Write through the frame itself; the chained form students['ID'].at[i] = ...
    # may modify a temporary copy and leave the frame unchanged.
    students.at[i, 'ID'] = np.nan

students.dropna(inplace=True)
print('\n----删除含空值的行----')
print(students)
57 |
58 |
59 |
--------------------------------------------------------------------------------
/28-ColOperation/Students.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChanMenglin/PandasVersusExcel/34043a7ee941ad8a6f402a8c8b8798652ccf484d/28-ColOperation/Students.xlsx
--------------------------------------------------------------------------------
/29-ReadDataBase/ReadDataBase.py:
--------------------------------------------------------------------------------
1 | # pandasVersusExcel
2 | # http://sa.mentorx.net/course/89/tasks
3 | # 第二十九课 读取数据库
4 | # 建议在第八讲之后查看
5 | # 2018-10-30
6 |
7 | import pyodbc
8 | import sqlalchemy
9 | import pandas as pd
10 | # sqlalchemy 和 pandas 均可链接数据库,选择其一即可
11 |
# NOTE(review): credentials are hard-coded for the tutorial; do not reuse this
# pattern outside a local demo database.

# pyodbc connection string. The SQL Server ODBC driver's documented
# credential keywords are UID and PWD.
connection = pyodbc.connect('DRIVER={SQL Server}; SERVER=(local); DATABASE=AdventureWorks;UID=sa;PWD=123456')
# SQLAlchemy connection URL
engine = sqlalchemy.create_engine('mssql+pyodbc://sa:123456@(local)/AdventureWorks?driver=SQL+Server')

# The SQL statement is wrapped in double quotes because SQL itself uses single quotes.
query = "SELECT FirstName, LastName FROM Person.Person"
try:
    df1 = pd.read_sql_query(query, connection)
    df2 = pd.read_sql_query(query, engine)
finally:
    # Release the database resources even if a query fails.
    connection.close()
    engine.dispose()

pd.options.display.max_columns = 999
print(df1.head())
print(df2.head())
--------------------------------------------------------------------------------
/3-Rows&Clumns&Cell/Rows&Clumns&Cell.py:
--------------------------------------------------------------------------------
1 | # pandasVersusExcel
2 | # http://sa.mentorx.net/course/89/tasks
3 | # 第三课 行、列、单元格
4 | # 2018-10-17
5 |
6 | import pandas as pd
7 |
8 | # --创建Series--
9 |
10 | # 方法一
11 | d = {'X':100,'Y':200,'Z':300}
12 | s1 = pd.Series(d) # 序列对象
13 | s1.name
14 | s1.index
15 | print(s1.index)
16 |
17 | # 方法二
18 | L1 = [100,200,300]
19 | L2 = ['X','Y','Z']
20 | s2 = pd.Series(L1,index=L2)
21 | print(s2.index)
22 |
23 | # --操作Excel--
24 |
25 | s1 = pd.Series([1,2,3],index=[1,2,3],name='A')
26 | s2 = pd.Series([10,20,30],index=[1,2,3],name='B')
27 | s3 = pd.Series([100,200,300],index=[1,2,3],name='C')
28 |
29 | df = pd.DataFrame({s1.name:s1,s2.name:s2,s3.name:s3})
30 | print(df)
--------------------------------------------------------------------------------
/30-WritingComplexEquations/Rectangles.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChanMenglin/PandasVersusExcel/34043a7ee941ad8a6f402a8c8b8798652ccf484d/30-WritingComplexEquations/Rectangles.xlsx
--------------------------------------------------------------------------------
/30-WritingComplexEquations/WritingComplexEquations.py:
--------------------------------------------------------------------------------
1 | # pandasVersusExcel
2 | # http://sa.mentorx.net/course/89/tasks
3 | # 第三十课 编写复杂方程
4 | # 建议在第八讲之后查看
5 | # 2018-10-30
6 |
7 | import pandas as pd
8 | import numpy as np
9 |
10 | # 计算外接圆的面积
def get_Circumscribedcircle_area(lengh, height):
    """Return the area of the circle circumscribed around a rectangle.

    The radius is half of the rectangle's diagonal, computed from the two
    side lengths ``lengh`` and ``height``.
    """
    diagonal = np.sqrt(lengh**2 + height**2)
    radius = diagonal / 2
    return radius**2 * np.pi
14 |
def wrapper(row):
    """Row adapter: read 'Length' and 'Height' from ``row`` and return the
    result of ``get_Circumscribedcircle_area`` for them."""
    length, height = row['Length'], row['Height']
    return get_Circumscribedcircle_area(length, height)
17 |
18 |
19 | rectangles = pd.read_excel('./Rectangles.xlsx',index_col='ID')
20 | print('----原始数据----')
21 | print(rectangles)
22 |
23 | rectangles['CA'] = rectangles.apply(lambda row: get_Circumscribedcircle_area(row['Length'],row['Height']),axis=1)
24 | # rectangles['CA'] = rectangles.apply(wrapper,axis=1)
25 | print(rectangles)
26 |
--------------------------------------------------------------------------------
/4&5-ReadData&BaseInput/Books.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChanMenglin/PandasVersusExcel/34043a7ee941ad8a6f402a8c8b8798652ccf484d/4&5-ReadData&BaseInput/Books.xlsx
--------------------------------------------------------------------------------
/4&5-ReadData&BaseInput/Books_output.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChanMenglin/PandasVersusExcel/34043a7ee941ad8a6f402a8c8b8798652ccf484d/4&5-ReadData&BaseInput/Books_output.xlsx
--------------------------------------------------------------------------------
/4&5-ReadData&BaseInput/ReadData&BaseInput.py:
--------------------------------------------------------------------------------
1 | # pandasVersusExcel
2 | # http://sa.mentorx.net/course/89/tasks
3 | # 第四课 数据区域的读取,填充整数、文字
4 | # 第五课 填充日期序列
5 | # 2018-10-17
6 |
7 | import pandas as pd
8 | from datetime import date,timedelta
9 |
10 | # --数据区域的读取--
11 |
12 | # skiprows: 从序号为3的行开始读取(类似于 header)
13 | # usecols: 读取列的范围
14 | # dtype: 设置每一列的数据类型
15 | books = pd.read_excel('./Books.xlsx',skiprows=3,usecols='C:F',dtype={'ID':str,'Name':str,'InStore':str,'Date':str})
16 | print(books)
17 |
18 |
19 | print('-----------------分隔符-----------------')
20 |
21 | # --填充整数、文字、日期--
22 |
23 | # 日期加月份
24 | # d:起始日期,type:date
25 | # month_delta: 要添加的月数,type:int
26 | # 返回添加后的结果,type:date
def add_month(d, month_delta):
    """Return the date ``month_delta`` calendar months after ``d``.

    :param d: starting date (``datetime.date``).
    :param month_delta: number of months to add (int, may be negative).
    :return: the shifted ``datetime.date``. When the starting day does not
        exist in the target month (e.g. Jan 31 + 1 month), the day is
        clamped to that month's last day instead of raising ``ValueError``.
    """
    import calendar  # local import so the file's import header need not change

    # Use a zero-based month count so divmod gives the year carry and the
    # month in one step (this also handles negative deltas).
    years, month_index = divmod(d.month - 1 + month_delta, 12)
    year = d.year + years
    month = month_index + 1
    last_day = calendar.monthrange(year, month)[1]
    return date(year, month, min(d.day, last_day))
34 |
start = date(2018, 10, 17)

# Fill ID, InStore and Date row by row.
# Write through the frame itself: the chained form books['ID'].at[i] = ...
# may modify a temporary copy and leave the frame unchanged.
for i in books.index:
    books.at[i, 'ID'] = i + 1
    books.at[i, 'InStore'] = 'Yes' if i % 2 == 0 else 'No'
    books.at[i, 'Date'] = start + timedelta(days=i)
print(books)

# set_index returns a new frame unless inplace=True; without it the ID column
# never becomes the index of the exported sheet.
books.set_index('ID', inplace=True)
books.to_excel('./Books_output.xlsx')
print('-----------------Done-----------------')
46 |
--------------------------------------------------------------------------------
/6-InputFunction/Books.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChanMenglin/PandasVersusExcel/34043a7ee941ad8a6f402a8c8b8798652ccf484d/6-InputFunction/Books.xlsx
--------------------------------------------------------------------------------
/6-InputFunction/InputFunction.py:
--------------------------------------------------------------------------------
1 | # pandasVersusExcel
2 | # http://sa.mentorx.net/course/89/tasks
3 | # 第六课 函数填充
4 | # 2018-10-18
5 |
6 | import pandas as pd
7 |
8 | books = pd.read_excel('./Books.xlsx',index_col='ID')
9 | print('----计算前----')
10 | print(books)
11 |
12 | # 方法一
13 |
14 | # books['Price'] = books['ListPrice'] * books['Discount']
15 | # print('----方法一----')
16 | # print(books)
17 |
18 | # 方法二(此方法可以对计算的行的范围进行精确控制)
19 |
# Only the rows with index labels 5..15 are computed; use books.index to cover every row.
for i in range(5, 16):  # books.index:
    # Write through the frame itself; the chained form books['Price'].at[i] = ...
    # may modify a temporary copy and leave the frame unchanged.
    books.at[i, 'Price'] = books.at[i, 'ListPrice'] * books.at[i, 'Discount']
print('----方法二----')
print(books)
24 |
25 | # 方法一
26 | books['ListPrice'] += 2
27 |
28 | # 方法二
def add_2(x):
    """Return ``x`` increased by 2."""
    increased = x + 2
    return increased
31 | books['ListPrice'] = books['ListPrice'].apply(add_2)
32 |
33 | # 方法三
34 | books['ListPrice'] = books['ListPrice'].apply(lambda x:x+2)
35 | print(books)
--------------------------------------------------------------------------------
/7-Sequence/List.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChanMenglin/PandasVersusExcel/34043a7ee941ad8a6f402a8c8b8798652ccf484d/7-Sequence/List.xlsx
--------------------------------------------------------------------------------
/7-Sequence/Sequence.py:
--------------------------------------------------------------------------------
1 | # pandasVersusExcel
2 | # http://sa.mentorx.net/course/89/tasks
3 | # 第七课 排序,多重排序
4 | # 2018-10-18
5 |
6 | import pandas as pd
7 |
8 | products = pd.read_excel('./List.xlsx',index_col='ID')
9 |
10 | print('----原始数据----')
11 | print(products)
12 |
13 | # sort_values 排序方法
14 | # by:根据什么排序
15 | # inplace:在原数据集中排序,而不是生成新的数据集
16 | # ascending:排序的顺序(True:默认,顺序|False:倒序)
17 |
18 | # 以 Price 按 倒序 排序
19 | products.sort_values(by='Price',inplace=True,ascending=False)
20 | print('\n----以 Price 按 倒序 排序----')
21 | print(products)
22 |
23 | # 先以 Worthy 按 顺序 排序,再以 Price 按倒序排序
24 | products.sort_values(by=['Worthy','Price'],inplace=True,ascending=[True,False])
25 | print('\n----先以 Worthy 按 顺序 排序,再以 Price 按倒序排序----')
26 | print(products)
--------------------------------------------------------------------------------
/8-DataFiltering/DataFiltering.py:
--------------------------------------------------------------------------------
1 | # pandasVersusExcel
2 | # http://sa.mentorx.net/course/89/tasks
3 | # 第八课 数据筛选、过滤
4 | # 2018-10-18
5 |
6 | import pandas as pd
7 |
8 | Students = pd.read_excel('./Students.xlsx',index_col='ID')
9 |
10 | print('----原始数据----')
11 | print(Students)
12 |
13 | # Age 大于等于 18 小于 30
def age_18_to_30(age):
    """Return True when ``age`` is at least 18 and strictly below 30."""
    return age >= 18 and age < 30
16 |
17 | # 成绩在 85 到 100 之间
def level_a(score):
    """Return True when ``score`` lies in the closed range 85..100."""
    return score >= 85 and score <= 100
20 |
21 | # 筛选 Age 大于等于 18 小于 30 的学生
22 | Students = Students.loc[Students['Age'].apply(lambda age:18 <= age < 30)]
23 | print('\n----筛选 Age 大于等于 18 小于 30 的学生----')
24 | print(Students)
25 |
26 | # 筛选 Age 大于等于 18 小于 30 成绩在 85 到 100 之间 的学生
27 | Students = Students.loc[Students.Age.apply(age_18_to_30)] \
28 | .loc[Students.Score.apply(level_a)]
29 | print('\n----筛选 Age 大于等于 18 小于 30 成绩在 85 到 100 之间 的学生----')
30 | print(Students)
31 |
32 | # 补充知识点
33 | # 1. Students['Age'] 的写法可以简写为 Students.Age
34 | # 2. age_18_to_30 函数可以用 lambda 表达式代替,因此
35 | # .apply(age_18_to_30) 可以简写为 .apply(lambda age:18 <= age < 30)
36 | # 3. Python 中 如遇表达式过长可以使用 ' \'(空格加反斜杠加回车)的方式换行
37 |
--------------------------------------------------------------------------------
/8-DataFiltering/Students.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChanMenglin/PandasVersusExcel/34043a7ee941ad8a6f402a8c8b8798652ccf484d/8-DataFiltering/Students.xlsx
--------------------------------------------------------------------------------
/9-Histogram/Figure_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChanMenglin/PandasVersusExcel/34043a7ee941ad8a6f402a8c8b8798652ccf484d/9-Histogram/Figure_1.png
--------------------------------------------------------------------------------
/9-Histogram/Histogram.py:
--------------------------------------------------------------------------------
1 | # pandasVersusExcel
2 | # http://sa.mentorx.net/course/89/tasks
3 | # 第九课 柱状图
4 | # 2018-10-18
5 |
6 | import pandas as pd
7 | import matplotlib.pyplot as plt
8 |
9 | students = pd.read_excel('./Students.xlsx')
10 |
11 | print('----原始数据----')
12 | print(students)
13 |
14 | students.sort_values(by='Number',inplace=True,ascending=False)
15 |
16 | # 使用 pandas 绘图(需要使用 matplotlib 展示图表)
17 | # students.plot.bar(x="Field",y='Number',color='orange',title='International Students by Field')
18 |
19 | # 使用 matplotlib 绘图
# Draw the bar chart directly with matplotlib.
plt.bar(students.Field, students.Number, color='orange')
# rotation must be a number (or 'vertical'/'horizontal'); the string '90' is
# rejected by current matplotlib versions.
plt.xticks(students.Field, rotation=90)  # rotate the Field labels by 90 degrees
plt.xlabel('Field')  # x-axis label
plt.ylabel('Number')  # y-axis label
plt.title('International Students by Field', fontsize=16)  # chart title

# Show the chart
plt.tight_layout()  # compact layout
plt.show()
--------------------------------------------------------------------------------
/9-Histogram/Students.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChanMenglin/PandasVersusExcel/34043a7ee941ad8a6f402a8c8b8798652ccf484d/9-Histogram/Students.xlsx
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Learn pandas vs Excel(Python)
2 |
3 |
4 | 本笔记为 [Pandas玩转Excel - Python数据分析轻松学](http://sa.mentorx.net/course/89/tasks) 这门课程的学习笔记
5 |
6 | ## Contents
7 | * [1 - 创建文件](1-CreateExcel/CreateExcel.py)
8 | * [2 - 读取文件](2-ReadExcel/ReadExcel.py)
9 | * [3 - 行、列、单元格](3-Rows&Clumns&Cell/Rows&Clumns&Cell.py)
10 | * [4&5 - 数据区域的读取,填充整数、文字,填充日期序列](4%265-ReadData&BaseInput/ReadData&BaseInput.py)
11 | * [6 - 函数填充](6-InputFunction/InputFunction.py)
12 | * [7 - 排序,多重排序](7-Sequence/Sequence.py)
13 | * [8 - 数据筛选、过滤](8-DataFiltering/DataFiltering.py)
14 | * [9 - 柱状图](9-Histogram/Histogram.py)
15 | * [10 - 绘制分组柱图,深度优化图表](10-GroupedHistogran&DepthOptimizationChart/GroupedHistogran&DepthOptimizationChart.py)
16 | * [11 - 绘制叠加柱状图,水平柱状图](11-SuperimposedHistogram&HorizontalHistogram/SuperimposedHistogram&HorizontalHistogram.py)
17 | * [12 - 绘制饼图](12-PieChart/PieChart.py)
18 | * [13 - 绘制折线趋势图、叠加区域图](13-PolylineTrendChart&OverlayAreaMap/PolylineTrendChart&OverlayAreaMap.py)
19 | * [14&15 - 散点图,直方图,密度图,数据相关性](14%2615-ScatterPlot&Histogram&DensityMap/ScatterPlot&Histogram&DensityMap.py)
20 | * [16 - 多表联合(Join)](16-Join/Join.py)
21 | * [17 - 数据校验,轴的概念](17-DataValidation/DataValidation.py)
22 | * [18 - 把一列数据分割成两列](18-DataSegmentation/DataSegmentation.py)
23 | * [19 - 求和,求平均,统计导引](19-Statistics/Statistics.py)
24 | * [20 - 定位、消除重复数据](20-DuplicateData/DuplicateData.py)
25 | * [21 - 定位、旋转数据表(行/列转换)](21-RotateDataSet/RotateDataSet.py)
26 | * [22 - 读取CSV、TSV、TXT文件中的数据](22-ReadData/ReadData.py)
27 | * [23 - 透视表,分组,聚合(group by)](23-GroupBy/GroupBy.py)
28 | * [24 - 线性回归,数据预测](24-DataPrediction/DataPrediction.py)
29 | * [25 - 条件格式化(上)](25%2626-ConditionalFormatting/ConditionalFormatting01.py)
30 | * [26 - 条件格式化(下)](25%2626-ConditionalFormatting/ConditionalFormatting02.py)
31 | * [27 - 行操作集锦](27-RowOperation/RowOperation.py)
32 | * [28 - 列操作集锦](28-ColOperation/ColOperation.py)
33 | * [29 - 读取数据库](29-ReadDataBase/ReadDataBase.py)
34 | * [30 - 编写复杂方程](30-WritingComplexEquations/WritingComplexEquations.py)
35 |
36 |
37 | ---
38 | 鸣谢:
39 | [Pandas玩转Excel - Python数据分析轻松学](http://sa.mentorx.net/course/89/notes) 讲师:[Timothy](http://sa.mentorx.net/user/25)
--------------------------------------------------------------------------------
/_config.yml:
--------------------------------------------------------------------------------
1 | theme: jekyll-theme-cayman
--------------------------------------------------------------------------------