├── IntroductionCourse ├── 32_poem.txt ├── TestCases │ ├── my_adder_34.py │ ├── test_my_adder_34.py │ ├── test_shopping_list_35.py │ └── shopping_list_35.py ├── 17_logical_operators.py ├── 11_comment.py ├── 9_naming.py ├── 8_variables.py ├── 10_math.py ├── 13_Read Eval Print Loop.py ├── 15_conditions.py ├── 14_input.py ├── 30_file_path.py ├── 21_while_loop.py ├── 12_types.py ├── 31_data.txt ├── 6&7_print.py ├── 27_class.py ├── 16_more_conditions.py ├── 18_list.py ├── 32_file_write.py ├── 20_for_loop.py ├── 22_格式化字符串.py ├── 23&24_functions.py ├── 28_class2.py ├── 33_error_fixing.py ├── 36_Higher_order_functions&anonymous_functions.py ├── 25_module.py ├── 31_file_read.py ├── 29_class_inheritance.py ├── 19_dictionary.py ├── 34&35_test.py └── 26_object_oriented_programming.py ├── DataAnalysis ├── Practices │ ├── Iris │ │ ├── 假设检验项目实战-zyf.pdf │ │ ├── Iris.csv │ │ └── Iris_cleaned.csv │ └── .ipynb_checkpoints │ │ └── 02 项目实战 _ 分析鸢尾花种类数据(空白版)-checkpoint.ipynb ├── height.csv ├── height2.csv ├── .ipynb_checkpoints │ ├── index-checkpoint.ipynb │ ├── 7.81_hypothetical_test-checkpoint.ipynb │ └── TERMINOLOGY-checkpoint.ipynb └── 7.81_hypothetical_test.ipynb ├── DataVisualization ├── practices │ └── penguins │ │ ├── 可视化帕默群岛企鹅数据-zyf.pdf │ │ └── penguins.csv ├── .ipynb_checkpoints │ ├── 6.73_statistics_basics_describe_numerical_data-checkpoint.ipynb │ ├── 05 项目实战 _ 可视化帕默群岛企鹅数据(空白版)-checkpoint.ipynb │ ├── 6.75_data_visualization_chart-checkpoint.ipynb │ ├── 6.72_Statistics_basics-checkpoint.ipynb │ └── 6.76_data_visualization_chart_extended-checkpoint.ipynb ├── temperature.csv ├── 6.73_statistics_basics_describe_numerical_data.ipynb ├── 6.75_data_visualization_chart.ipynb ├── 6.72_Statistics_basics.ipynb ├── 6.76_data_visualization_chart_extended.ipynb └── penguins.csv ├── README.md ├── DataAnalysisPreparation ├── 2.2_install_jupyter_notebook.py ├── 2.5_numpy_array.py ├── 2.4_Markdown&LaTex.py ├── 2.3_use_jupyter_notebook.py ├── 2.4_Markdown和LaTex入门.ipynb ├── 2.7_numpy_array_extended.ipynb └── 2.11_pandas_series_extended.ipynb ├── EvaluateAndCleanData ├── 4.31_more_dataset_for_data_analysis.ipynb ├── 4.33_upload_files_to_github.ipynb ├── 4.24_evaluate_data_criteria.ipynb ├── 4.32_evaluate_and_clean_data_manual.ipynb └── 4.26_clean_data.ipynb ├── DataSorting └── Practices │ └── 04 项目实战 _ 整理Netflix电影演员评分数据(空白版).ipynb └── DataFormatAndReadData ├── 3.22_dataformat_csv.ipynb ├── 3.19_retrieve_data.ipynb └── 3.20_dataformat_json.ipynb /IntroductionCourse/32_poem.txt: -------------------------------------------------------------------------------- 1 | 我欲乘风归去, 2 | 又恐琼楼玉宇, 3 | 高处不胜寒。 4 | 起舞弄清影, 5 | 何似在人间。 -------------------------------------------------------------------------------- /IntroductionCourse/TestCases/my_adder_34.py: -------------------------------------------------------------------------------- 1 | # 实现代码 2 | # 一个做加法的函数 3 | def my_adder(x, y): 4 | return x + y 5 | -------------------------------------------------------------------------------- /IntroductionCourse/17_logical_operators.py: -------------------------------------------------------------------------------- 1 | # python的逻辑运算符号只有三个,分别是and,or,not 2 | # 逻辑运算符可以混用,优先级not > and > or,也可以用括号改变运算顺序 3 | -------------------------------------------------------------------------------- /DataAnalysis/Practices/Iris/假设检验项目实战-zyf.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NaturalCutie/Python-Data-Analysis-Notes/HEAD/DataAnalysis/Practices/Iris/假设检验项目实战-zyf.pdf -------------------------------------------------------------------------------- /IntroductionCourse/11_comment.py: -------------------------------------------------------------------------------- 1 | # 井号后面可以加注释 2 | # 井号也可以用来保留不想执行又不想删掉的代码 3 | # "ctrl" + "/" 可用于多行添加或删除井号 4 | 5 | "单纯用双引号包裹的字符串也可以用作注释" 6 | """或者用三个双引号 7 | 可以用作多行注释""" -------------------------------------------------------------------------------- /DataVisualization/practices/penguins/可视化帕默群岛企鹅数据-zyf.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NaturalCutie/Python-Data-Analysis-Notes/HEAD/DataVisualization/practices/penguins/可视化帕默群岛企鹅数据-zyf.pdf -------------------------------------------------------------------------------- /DataVisualization/.ipynb_checkpoints/6.73_statistics_basics_describe_numerical_data-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 5 6 | } 7 | -------------------------------------------------------------------------------- /IntroductionCourse/9_naming.py: -------------------------------------------------------------------------------- 1 | # 变量命名的硬性规则,只能由文字,数字,下划线组成,且不能由数字开头 2 | # 变量命名约定俗成用下划线命名法:1.字母全部小写,2.不同单词用下划线分隔。 3 | # *变量名是大小写敏感的。变量名不要占用Python的关键字,Python所有关键字 @E9 2:40,Python关键字在Pycharm中会以彩色标注 4 | -------------------------------------------------------------------------------- /DataAnalysis/height.csv: -------------------------------------------------------------------------------- 1 | 身高,地区 2 | 165,A 3 | 167,A 4 | 172,A 5 | 176,A 6 | 178,A 7 | 180,A 8 | 182,A 9 | 183,A 10 | 185,A 11 | 188,A 12 | 155,B 13 | 158,B 14 | 160,B 15 | 162,B 16 | 165,B 17 | 168,B 18 | 172,B 19 | 176,B 20 | 179,B 21 | 182,B 22 | -------------------------------------------------------------------------------- /IntroductionCourse/8_variables.py: -------------------------------------------------------------------------------- 1 | # 变量是用来储存或指代值的 2 | # *python是从上到下执行的,必须在前面先对变量进行赋值,才能使用变量 3 | Mylove = '雪容融' 4 | 5 | # 如果要更改变量指代的值,可以通过赋值操作,给变量赋成另外一个值 6 | # 可以通过变量给变量赋值 7 | # !赋值操作里,等号左边的会被视为变量名,等号右边的会先被求值,然后把值分配给等号左边的变量 8 | Myfirstlove = Mylove 9 | Mylove = '冰墩墩' 10 | print(Mylove + Myfirstlove) 11 | -------------------------------------------------------------------------------- /IntroductionCourse/10_math.py: -------------------------------------------------------------------------------- 1 | # 通过官方文档了解标准库里包含哪些函数 2 | import math # 引入标准库中的math模块 3 | 4 | a = 1 5 | b = 9 6 | c = 20 7 | delta = b ** 2 - 4 * a * c 8 | x1 = (-b + math.sqrt(delta)) / (2 * a) # 使用模块里的函数或变量时,用 模块名.函数名/模块名.变量名 来使用。 9 | x2 = (-b - math.sqrt(delta)) / (2 * a) # (其实这里也可以写作delta ** (1/2)) 10 | print(x1) 11 | print(x2) 12 | -------------------------------------------------------------------------------- /IntroductionCourse/TestCases/test_my_adder_34.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from TestCases.my_adder_34 import my_adder 3 | 4 | 5 | class TestMyAdder(unittest.TestCase): 6 | def test_positive_with_positive(self): 7 | self.assertEqual(my_adder(5, 3), 8) 8 | # 函数调用也可以作为参数 9 | 10 | def test_negative_with_positive(self): 11 | self.assertEqual(my_adder(-5, 3), -2) 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Python-Data-Analysis-Notes 2 | 基于B站 @林粒粒呀 老师Python数据分析课程的笔记,包括Python基础知识,以及数据读取、评估、清洗、分析、可视化等内容 3 | 4 | ## Notice 5 | 6 | 自己的笔记,以如下方式进行记录: 7 | 8 | - 笔记逻辑非常贴合粒粒老师原视频逻辑,代码绝大部分是原视频课程中的例子。并加上标题,让笔记结构清晰;加上注释,作为笔记知识内容。以下预览图,作为笔记示范。 9 | 10 | - 此外,"第二章 NumPy入门|探索数据更多玩法"之前笔记,在PyCharm中编写,是.py文件格式;之后笔记,在Jupyter Notebook中编写,是Notebook文件。 11 | 12 | - 关于文件格式,一般以章节开头,以粒粒老师视频标题的英文版作为文件名。 13 | 14 | - 目前,笔记更新至第七章的假设检验部分,后续笔记会随着学习进度,不断更新。 15 | -------------------------------------------------------------------------------- /IntroductionCourse/13_Read Eval Print Loop.py: -------------------------------------------------------------------------------- 1 | # 13_Python执行模式包括命令行模式和交互模式 2 | # 一、命令行模式 3 | # 写好命令后,保存并运行整个文件。运行时,Python解释器一行一行对文件进行解析和执行。 4 | 5 | # 二、交互模式 6 | # 输入一行后,Python立刻执行,并展示运行结果。 7 | 8 | # 1.交互模式的优点 9 | # 1)不需要创建新的文件,只需要进入交互式环境。 10 | # a.进入交互式环境:Pycharm下方的Python控制台,或是在windows的CMD输入python3 11 | # b.退出交互式环境,键入quit(),或是按ctrl + d 12 | # 2)不需要print语句,就能直接看到执行输出的结果。如直接输入变量或算式,返回结果。 13 | 14 | # 2.交互模式的缺点 15 | # 1)交互模式的命令都不会被保存 16 | -------------------------------------------------------------------------------- /DataVisualization/temperature.csv: -------------------------------------------------------------------------------- 1 | 姓名,体温 2 | Tom,37.8 3 | Jerry,36.8 4 | Lucy,37.0 5 | Emma,37.2 6 | John,36.6 7 | Alice,37.0 8 | Bob,37.0 9 | David,36.1 10 | Sam,37.5 11 | Alex,37.3 12 | Lisa,36.7 13 | Frank,36.9 14 | Grace,37.3 15 | Mary,36.9 16 | Ben,36.9 17 | Kate,36.3 18 | Oliver,37.3 19 | Sophie,37.1 20 | Ella,37.1 21 | Hannah,36.2 22 | James,37.8 23 | Leo,37.1 24 | Luke,36.8 25 | Max,38.0 26 | Mia,37.0 27 | Noah,36.3 28 | Peter,36.8 29 | Sarah,39.2 30 | Tim,38.6 31 | -------------------------------------------------------------------------------- /IntroductionCourse/TestCases/test_shopping_list_35.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from TestCases.shopping_list_35 import ShoppingList 3 | 4 | 5 | class TestShoppingList(unittest.TestCase): 6 | def setUp(self): 7 | self.shopping_list = ShoppingList({"牛奶": 3, "面包": 2, "苹果": 1}) 8 | 9 | def test_get_item_count(self): 10 | self.assertEqual(self.shopping_list.get_item_count(), 3) 11 | 12 | def test_get_total_price(self): 13 | self.assertEqual(self.shopping_list.get_total_price(), 6) 14 | -------------------------------------------------------------------------------- /IntroductionCourse/15_conditions.py: -------------------------------------------------------------------------------- 1 | """ 2 | 条件语句结构: 3 | if [条件]: 4 | [执行语句] # 执行语句前要有缩进,建议缩进为4个空格。有缩进的内容会判断为条件为真时要执行的内容,无缩进会认为if语句结束。 5 | [执行语句] # 执行语句可以为多个 # 假如条件为假时什么都不做,可以省略else及后面的内容 6 | else: 7 | [执行语句] 8 | [执行语句] 9 | """ 10 | 11 | # "条件"需求值为布尔值,包括: 12 | # 1.类型是布尔值的变量。 13 | # 2.结合比较运算符的算式,包括: 14 | # 1) == 2) != 3) > 4) < 5) >= 6) <= 15 | 16 | mood_index = int(input("宝宝今天的心情指数:")) 17 | if mood_index >= 60: 18 | print("可以睡个安稳觉了") 19 | print("(*^▽^*)") 20 | else: 21 | print("准备好战斗吧!") 22 | -------------------------------------------------------------------------------- /IntroductionCourse/14_input.py: -------------------------------------------------------------------------------- 1 | # input函数 2 | # input(),里面放字符串,作为给用户的提示信息; 3 | # 需要用一个变量去获取input函数返回的值。 4 | 5 | # *input函数返回的值为str类型, 6 | # 特别是用户输入的是数字,且后续需要做条件判断的时候:需要将输入信息转换成int类型或float类型,才能做条件判断 7 | 8 | # int函数、float函数、str函数 9 | # 以int函数举例,可以将str类型或float类型转换成int类型,*函数里的参数必须确实能被转化为整数 10 | # str函数在print时很有用,将int类型或float类型转化成字符串后,和其他字符串一起打印 11 | 12 | # BMI = 体重 / 身高 ** 2 13 | user_weight = float(input('请输入您的体重(单位:kg):')) 14 | user_height = float(input('请输入您的身高(单位:m):')) 15 | user_BMI = user_weight / user_height ** 2 16 | print("您的BMI值为:" + str(user_BMI)) 17 | -------------------------------------------------------------------------------- /IntroductionCourse/TestCases/shopping_list_35.py: -------------------------------------------------------------------------------- 1 | class ShoppingList: 2 | """初始化购物清单,shopping_list是字典类型,包含商品名和对应价格 3 | 例子:{"牙刷": 5, "沐浴露": 15, "电池": 7}""" 4 | def __init__(self, shopping_list): 5 | self.shopping_list = shopping_list 6 | 7 | """返回购物清单上有多少商品""" 8 | def get_item_count(self): 9 | return len(self.shopping_list) 10 | 11 | """返回购物清单商品价格总额数字""" 12 | def get_total_price(self): 13 | total_price = 0 14 | for price in self.shopping_list.values(): 15 | total_price += price 16 | return total_price 17 | -------------------------------------------------------------------------------- /IntroductionCourse/30_file_path.py: -------------------------------------------------------------------------------- 1 | # 30_文件路径 2 | # 一、目录结构 3 | # 1.类Unix操作系统(Linux, macOS等) 4 | # 有根目录,用“/”表示 5 | # 2.Windows操作系统 6 | # 每个磁盘分区有自己的根目录,用“分区名:\”表示 7 | 8 | # 二、定位文件位置 9 | # 1.绝对路径 10 | # 从根目录出发的路径 11 | # 1)类Unix操作系统(Linux, macOS等) 12 | # 绝对路径就是以斜杠开头,路径中的每个目录之间,用斜杠进行分隔,最后以目标文件或目标目录结尾 13 | # 2)Windows操作系统 14 | # 绝对路径以“分区名:\”开头,路径中的每个目录之间,用反斜杠进行分隔,最后以目标文件或目标目录结尾 15 | # 2.相对路径 16 | # 1)从一个参考位置出发,表示从一个参考位置出发,其它文件处于什么路径 17 | # 2)用“.”来表示参照文件当前所在的目录,用“..”表示更上一层的父目录,用“../..”或"..\.."表示父目录的父目录 18 | # 往更下层走的话,同样用"/"或"\"来分隔路径中的目录 19 | # *"./"或".\"是可以省略的,所以同一目录下的文件,想互相用相对路径找到彼此的话,可以直接使用文件名 20 | # 另:很多编辑器都能帮你复制文件的路径,比如:在PyCharm,右键点击文件,选择“复制路径/引用” 21 | # 就可以获得那个文件的绝对路径,以及以最顶层项目目录为参考的相对路径 22 | -------------------------------------------------------------------------------- /DataAnalysis/height2.csv: -------------------------------------------------------------------------------- 1 | 身高,地区 2 | 175,A 3 | 169,A 4 | 176,A 5 | 185,A 6 | 168,A 7 | 168,A 8 | 186,A 9 | 178,A 10 | 165,A 11 | 175,A 12 | 165,A 13 | 165,A 14 | 172,A 15 | 151,A 16 | 153,A 17 | 164,A 18 | 160,A 19 | 173,A 20 | 161,A 21 | 156,A 22 | 185,A 23 | 168,A 24 | 171,A 25 | 156,A 26 | 165,A 27 | 171,A 28 | 158,A 29 | 174,A 30 | 164,A 31 | 167,A 32 | 164,A 33 | 189,A 34 | 170,A 35 | 164,B 36 | 183,B 37 | 163,B 38 | 177,B 39 | 155,B 40 | 162,B 41 | 177,B 42 | 182,B 43 | 177,B 44 | 174,B 45 | 172,B 46 | 160,B 47 | 168,B 48 | 170,B 49 | 186,B 50 | 178,B 51 | 157,B 52 | 178,B 53 | 171,B 54 | 168,B 55 | 181,B 56 | 185,B 57 | 184,B 58 | 167,B 59 | 172,B 60 | 178,B 61 | 185,B 62 | 170,B 63 | 173,B 64 | 164,B 65 | 163,B 66 | 183,B 67 | 189,B 68 | -------------------------------------------------------------------------------- /IntroductionCourse/21_while_loop.py: -------------------------------------------------------------------------------- 1 | # 21_ 2 | """ 3 | while 条件A: 4 | 行动B 5 | # 计算机会判断条件A是否为真,如果为真,执行行动B;然后再次判断条件是否为真,再次执行行动B;如此循环,直到条件为假,退出循环 6 | # *如果while后面的条件A,在第一次判断的时候就为False,那么行动B一次也不会被执行 7 | """ 8 | # for循环与while循环 9 | # for循环一般有明确循环对象或次数;while循环一般循环次数未知 10 | 11 | # 一个对用户输入数字求平均值的计算器,这个计算器的特点是用户可以输入任意数量的数字 12 | print("你好,我可以帮你求平均值!") 13 | num = input("请输入数字(完成所有输入后,请输入q终止程序):") 14 | total = 0 15 | i = 0 16 | while num != 'q': 17 | total += float(num) # total += float(num) 相当于 total = total + float(num) 18 | i += 1 19 | num = input("请输入数字(完成所有输入后,请输入q终止程序):") 20 | if total == 0: 21 | result = 0 22 | else: 23 | result = total / i 24 | # 预防用户第一次就输入q,使得result = 0 / 0,导致报错 25 | print("平均值为:" + str(result)) 26 | -------------------------------------------------------------------------------- /IntroductionCourse/12_types.py: -------------------------------------------------------------------------------- 1 | # 一、Python数据类型包括字符串、整数、浮点数、布尔类型、空值类型、列表、字典等 2 | # 数据类型之所以重要,因为它决定了你能在该类型的对象上运用哪些函数,函数是用来执行功能的 3 | # 1.字符串str 4 | # 表示文本内容,特点是会被单引号或双引号包裹。 5 | # 1)用len函数对字符串求长度。 6 | # 在字符串长度计算中,空格、数字和符号都会占据一个长度。 7 | # *完整的转义符才占1个长度 8 | s = "Hello World!" 9 | print(len(s)) 10 | 11 | # 2)通过索引获取字符串中单个字符。 12 | # 字符串后跟上方括号,方括号里放索引,就能提取出该索引位置的字符 13 | # *索引是从0开始 14 | print(s[11]) 15 | print(s[len(s)-1]) 16 | 17 | # 2.整数int,浮点数float 18 | 19 | # 3.布尔类型bool 20 | # 只包含两种值,True和False。*True和False首字母大写 21 | b1 = True 22 | b2 = False 23 | 24 | # 4.空值类型NoneType 25 | # 表示完全没有值。(如果你知道一个变量,但还不知道变量的值,可以先定义为none)*None首字母大写 26 | n = None 27 | 28 | # 二、type函数 29 | # 会返回该对象的类型 30 | print(type(s)) 31 | print(type(b1)) 32 | print(type(n)) 33 | print(type(1.5)) 34 | -------------------------------------------------------------------------------- /DataAnalysisPreparation/2.2_install_jupyter_notebook.py: -------------------------------------------------------------------------------- 1 | # 2.2 安装Jupyter Notebook 2 | # 1.定义 3 | # Jupyter Notebook是一个基于网页的交互式计算环境,是数据分析、数据科学,甚至机器学习领域里非常流行的一款工具。 4 | # 可以用来编写代码、运行代码、查看输出、可视化数据,并分享输出的报告文档 5 | # 2.优点 6 | # 1)Jupyter Notebook可以按单元格运行代码 7 | # 对于搞数据的人来说,不是所有时候都想从头运行到结尾, 8 | # 比如数据量特别大的时候,假如读取数据要等几秒,清洗数据要等几秒, 9 | # 那在我们每次修改分析公式,想反复运行看效果的时候,不希望前面没有改动的步骤,比如说读取数据,还要反复被运行,因为这会浪费很多等待时间。 10 | # 用Jupyter Notebook就很简单了,我们可以把不同步骤放在不同单元格里,每次运行一个单元格的代码, 11 | # 这样我们可以只读取一遍数据,当反复修改和运行分析代码时,读数据的代码就不会再被运行了。 12 | # 2)可展示的信息格式更丰富 13 | # 我们用常规编辑器时,注释和代码一样都是纯文本, 14 | # 但分析数据时,有时需要记录和解释更多东西,比如数据的北京、使用的公式、分析思路等等, 15 | # 用Jupyter Notebook可以用Markdown标记语言,让注释更加清晰、有层次,还可以用LaTex插入公式 16 | # 当你把Jupyter Notebook上的内容,以HTML等格式分享给其他人的时候,这些效果丰富的文字,也会原封不动地展示给对方, 17 | # 帮助对方更好地理解你思考和分析的过程,也节约了你解答疑问的时间 18 | # 3)交互式运行环境 19 | # 交互模式相比命令行模式的好处是,当我们想查看输出的时候,不需要加上打印语句就能看到, 20 | # 那我们就可以很方便地查看变量的值,输出中间结果,有利于快速探索数据,试验不同分析方法。 21 | -------------------------------------------------------------------------------- /IntroductionCourse/31_data.txt: -------------------------------------------------------------------------------- 1 | Do not go gentle into that good night, 2 | Old age should burn and rave at close of day; 3 | Rage, rage against the dying of the light. 4 | 5 | Though wise men at their end know dark is right, 6 | Because their words had forked no lightning they 7 | Do not go gentle into that good night. 8 | 9 | Good men, the last wave by, crying how bright 10 | Their frail deeds might have danced in a green bay, 11 | Rage, rage against the dying of the light. 12 | 13 | Wild men who caught and sang the sun in flight, 14 | And learn, too late, they grieved it on its way, 15 | Do not go gentle into that good night. 16 | 17 | Grave men, near death, who see with blinding sight 18 | Blind eyes could blaze like meteors and be gay, 19 | Rage, rage against the dying of the light. 20 | 21 | And you, my father, there on the sad height, 22 | Curse, bless, me now with your fierce tears, I pray. 23 | Do not go gentle into that good night. 24 | Rage, rage against the dying of the light. -------------------------------------------------------------------------------- /IntroductionCourse/6&7_print.py: -------------------------------------------------------------------------------- 1 | # 一个print函数中只能包涵一种数据类型,print函数中可以用","将不同数据分隔开 2 | 3 | # 6_ 4 | print('Dad') 5 | 6 | # 7_1.字符串连接 7 | # 把几个字符串可以通过"+"连接成一个更长的,再打印出来 8 | # *引号包裹的字符串有空格,打印出来才有空格 9 | print('D'+'a'+'d') 10 | 11 | # 7_2.单双引号转义 12 | # 单引号会和最近的单引号配对,双引号会和最近的双引号配对。 13 | # 内容的引号是单,外面包裹字符串的引号得用双;内容的引号是双,外面包裹字符串的引号得是单; 14 | # 或者用"\" + 单引号或者双引号,表明后面的引号是单纯的引号符号。 15 | # (反斜杠是转义字符,表明后面的字符是纯字符,python会把反斜杠和后面的字符一起读) 16 | print('"Hey,girl"') 17 | print("I said \"Let\'s do sth!\"") 18 | print("I said \"Let's do sth!\"") # 包裹字符串的是双引号,内容的单引号也可以前面不用加转义字符 19 | 20 | # 7_3.换行 21 | # "\n" 表示换行 22 | print('What\'s now? \nI don\'t know') 23 | 24 | # 7_4.三引号跨行字符串 25 | # 三个连续的单引号或者双引号,用它包裹住文字,python就会把新的一行当成内容的换行,而不是代码语句的结束。 26 | print(''' 27 | 春江潮水连海平,海上明月共潮生。 28 | 滟滟随波千万里,何处春江无月明! 29 | 江流宛转绕芳甸,月照花林皆似霰。 30 | 空里流霜不觉飞,汀上白沙看不见。 31 | 江天一色无纤尘,皎皎空中孤月轮。 32 | 江畔何人初见月?江月何年初照人? 33 | 人生代代无穷已,江月年年望相似。 34 | 不知江月待何人,但见长江送流水。 35 | 白云一片去悠悠,青枫浦上不胜愁。 36 | 谁家今夜扁舟子?何处相思明月楼? 37 | 可怜楼上月裴回,应照离人妆镜台。 38 | 玉户帘中卷不去,捣衣砧上拂还来。 39 | 此时相望不相闻,愿逐月华流照君。 40 | 鸿雁长飞光不度,鱼龙潜跃水成文。 41 | 昨夜闲潭梦落花,可怜春半不还家。 42 | 江水流春去欲尽,江潭落月复西斜。 43 | 斜月沉沉藏海雾,碣石潇湘无限路。 44 | 不知乘月几人归,落月摇情满江树。 45 | ''') 46 | -------------------------------------------------------------------------------- /IntroductionCourse/27_class.py: -------------------------------------------------------------------------------- 1 | # 27_创建类 2 | # *类和函数都是:定义的时候,里面的代码不会被执行;只有在调用的时候,才是里面的代码被实际执行的时候。 3 | 4 | # 一、创建类 5 | # 1.类有一个特殊的方法叫构造函数,主要作用是定义实例对象的属性,必须要被命名为"__init__" 6 | # 2.括号里可以放任意数量的参数, 7 | # 但第一个参数永远是被占用的,得用于表示对象自身,约定俗成叫"self",它能帮你把属性的值绑定在实例对象上 8 | # *和定义普通变量时的下划线命名法不同,Python在定义类名的时候,用的是Pascal命名法,特点是用首字母大写来分隔单词 9 | class CuteDog: 10 | def __init__(self): 11 | self.name = "Lambton" 12 | # 若每个对象的属性都有一样的初始值,可以用这种方式定义类的属性并赋值 13 | name = "Lambton" 14 | # Python认为只是在给普通的name变量赋值,且是局部变量 15 | 16 | 17 | # 二、创建对象 18 | # 1.创建对象是用类名,括号,里面放入参数,这样__init__方法就会被调用,并返回一个对象 19 | # !"self"参数是不需要手动传入的 20 | # 2.获取对象的属性用"对象.属性名"来获取 21 | dog1 = CuteDog() 22 | print(dog1.name) 23 | # 返回dog1对象所绑定的name属性的值,为Lambton 24 | 25 | 26 | # 另:给__init__更加灵活的属性赋值,比如从参数获取属性的值 27 | # 那创建对象时,就需要在括号里面传入属性的值,此处name属性的值被赋值为传入的cat_name参数的值 28 | class CuteCat: 29 | def __init__(self, cat_name, age, color): 30 | self.name = cat_name 31 | # 属性名和参数名不一定一致 32 | self.age = age 33 | # self.age是绑定到对象身上的属性,而age是普通的变量,它的值是通过参数传进来的 34 | self.color = color 35 | 36 | 37 | 38 | cat1 = CuteCat("jojo", 2, "橙色") 39 | print(f"小猫{cat1.name}的年龄是{cat1.age}岁,花色是{cat1.color}") 40 | -------------------------------------------------------------------------------- /IntroductionCourse/16_more_conditions.py: -------------------------------------------------------------------------------- 1 | """ 2 | 嵌套条件语句结构: 3 | if [条件一]: 4 | if [条件二]: 5 | [语句A] 6 | else: 7 | [语句B] 8 | else: 9 | [语句C] 10 | """ 11 | 12 | """ 13 | 多个条件判断结构: 14 | if [条件一]: 15 | [语句A] 16 | elif [条件二]: # 如果条件二和条件三同时满足,python只会执行语句B。因为一旦执行那个分支,就不会再看同一层级下的其它条件判断了 17 | [语句B] 18 | elif [条件三]: 19 | [语句C] 20 | else: 21 | [语句D] 22 | """ 23 | 24 | # BMI = 体重 / 身高 ** 2 25 | user_gender = input("请输入您的性别(男/女):") 26 | user_weight = float(input('请输入您的体重(单位:kg):')) 27 | user_height = float(input('请输入您的身高(单位:m):')) 28 | user_BMI = user_weight / user_height**2 29 | print("您的BMI值为:" + str(user_BMI)) 30 | 31 | if user_gender == "男": 32 | if user_BMI <= 18.5: 33 | print("先生您好,此BMI属于偏瘦范围。") 34 | elif user_BMI <= 25: 35 | # 这里不用写成 "18.5 < BMI <= 25:",因为BMI <= 18.5 的情况已经囊括在第一个分支里了 36 | print("先生您好,此BMI属于正常范围。") 37 | elif user_BMI <= 30: 38 | print("先生您好,此BMI属于偏胖范围。") 39 | else: 40 | print("先生您好,此BMI属于肥胖范围。") 41 | else: 42 | if user_BMI <= 18.5: 43 | print("女士您好,此BMI属于偏瘦范围。") 44 | elif user_BMI <= 25: 45 | print("女士您好,此BMI属于正常范围。") 46 | elif user_BMI <= 30: 47 | print("女士您好,此BMI属于偏胖范围。") 48 | else: 49 | print("女士您好,此BMI属于肥胖范围。") 50 | -------------------------------------------------------------------------------- /IntroductionCourse/18_list.py: -------------------------------------------------------------------------------- 1 | # 18_列表 2 | # 列表是一种数据结构,用于把相关联的数据整合在一起 3 | # list = ["a", "b"] 4 | 5 | # 1)针对列表的方法 6 | # 方法与函数的区别: 7 | # 对象.方法名(...);函数(参数) 8 | # a.添加元素,只能通过方法添加一个元素 9 | # 如:list.append("c") 10 | # b.删除元素,只能通过方法删除一个元素 11 | # 如:list.remove("a") 12 | 13 | # 2)针对列表的python内置函数 14 | # a.max(num_list) 返回列表里的最大值 15 | # b.min(num_list) 返回列表里的最小值 16 | # c.sorted(num_list) 返回由小到大排序好的新列表,同时不改变原先的列表 17 | 18 | # 3)列表中可以放不同类型的数据 19 | # a.列表可以通过len函数求长度 20 | # b.列表可以通过索引 21 | # I.返回某个位置的元素,如list[0] 22 | # II.赋值,修改列表里的某个元素,如list[0] = "z" 23 | 24 | # 4)列表等与其他数据类型的区别 25 | # 列表等是可变的,字符串、整数、浮点数、布尔类型等不是 26 | # 即其他数据类型只能被重新赋值;而列表可以通过append等方法,直接改变原列表。*列表也可以重新被赋值 27 | 28 | 29 | # 例题1,运用1)和3)的知识 30 | shopping_list = ['键盘', "音响"] 31 | shopping_list.append('键帽') 32 | shopping_list.remove('键帽') 33 | shopping_list.append('音响') 34 | shopping_list.append('电竞椅') 35 | 36 | shopping_list[1] = '硬盘' # 列表可以通过索引赋值,更改一个元素 37 | print(shopping_list) 38 | print(len(shopping_list)) # 返回列表中元素数量 39 | print(shopping_list[2]) # 通过索引查找列表中第N个元素 40 | print() 41 | 42 | # 例题2,运用2)的知识 43 | price = [799, 1024, 200, 800] 44 | max_price = max(price) 45 | min_price = min(price) 46 | sorted_price = sorted(price) 47 | print(max_price, min_price) 48 | print(sorted_price) 49 | price = [1, 2, 3] # 列表也可以重新被赋值 50 | -------------------------------------------------------------------------------- /EvaluateAndCleanData/4.31_more_dataset_for_data_analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "fb1985c7-bb60-41c0-913b-d51bce86d8b1", 6 | "metadata": {}, 7 | "source": [ 8 | "4.31_more_dataset_for_data_analysis" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "5b838f53-b66c-4027-990d-051d3ca2d35d", 14 | "metadata": {}, 15 | "source": [ 16 | "可以配合更多数据集二次实战,获得更多练习机会\n", 17 | "\n", 18 | "1. 课程配套的更多数据集,在'练习数据集'文件夹下面。这些数据集的相关信息,都在'数据集简介.larkdocx'文件里面\n", 19 | "\n", 20 | "2. 提供更多数据集的数据社区,在'更多数据集的获取平台.larkdocx'文件里面" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "id": "5f6ef059-d4da-480c-8cd4-a5d1498eee98", 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [] 30 | } 31 | ], 32 | "metadata": { 33 | "kernelspec": { 34 | "display_name": "Python 3 (ipykernel)", 35 | "language": "python", 36 | "name": "python3" 37 | }, 38 | "language_info": { 39 | "codemirror_mode": { 40 | "name": "ipython", 41 | "version": 3 42 | }, 43 | "file_extension": ".py", 44 | "mimetype": "text/x-python", 45 | "name": "python", 46 | "nbconvert_exporter": "python", 47 | "pygments_lexer": "ipython3", 48 | "version": "3.11.1" 49 | } 50 | }, 51 | "nbformat": 4, 52 | "nbformat_minor": 5 53 | } 54 | -------------------------------------------------------------------------------- /IntroductionCourse/32_file_write.py: -------------------------------------------------------------------------------- 1 | # 32_文件操作|写文件 2 | # 把程序输出结果储存为文件,就可以不用反复运行程序,也可以把结果轻松共享给他人 3 | # 写文件和读文件有非常多相似之处,都需要打开文件,并且在操作完成后关闭文件 4 | 5 | # 一、打开文件 6 | # 1.第一个参数里放文件路径 7 | # 2.第二个参数要传入模式 8 | # 1)写入模式用“w”表示 9 | # *写入模式下,如果文件路径不存在,程序就会自动创建,传入文件名的那个文件 10 | # *如果文件路径已经存在,就会先把原本的文件内容清空 11 | # 无法读文件,用read方法,程序会报错 12 | 13 | # 2)附加模式,用"a"表示 14 | # *附加模式下,如果文件路径不存在,程序就会自动创建,传入文件名的那个文件 15 | # *如果文件路径已经存在,不把原本文件内容清空,可以在后面追加内容 16 | # 无法读文件,用read方法,程序会报错 17 | 18 | # 3)同时支持读写文件,用"r+"表示 19 | # *在r+模式下,文件路径不存在的话,就会报一个叫FileNotFoundError的错误,提示文件不存在 20 | # *此r+模式下,如果文件路径已经存在,从开始位置读取或者写入 21 | # *读取或写入都是从当前位置开始 22 | # 即先调用read方法,再调用write方法,从文件结尾附加写入 23 | # 先调用write方法,再调用read方法。会先从文件开头位置边写入边覆盖已有内容,再从写入内容后,读取未被覆盖内容 24 | # 先调用readline方法,再调用write方法,会先读一行文件,再从第二行开始边写入边覆盖 25 | 26 | # 4)同时支持读写文件,用"a+"表示 27 | # *a+模式下,如果文件路径不存在,程序就会自动创建,传入文件名的那个文件 28 | # *此a+模式下,如果文件路径已经存在,从最后位置读取或者写入 29 | # *读取或写入都是从当前位置开始 30 | 31 | # 3.我们可以传入可选参数encoding,来确保写入的文件编码也是UTF-8 32 | 33 | # 二、写文件 34 | # 1.write方法 35 | 36 | 37 | # 任务1:在一个新的名字为"poem.txt"的文件里,写入以下内容: 38 | # 我欲乘风归去, 39 | # 又恐琼楼玉宇 40 | # 高处不胜寒。 41 | with open("32_poem.txt", "w", encoding="utf-8") as f: 42 | f.write("我欲乘风归去,\n又恐琼楼玉宇,\n高处不胜寒。") 43 | 44 | # 任务2:在上面的“poem.txt”的文件结尾处,添加以下两句: 45 | # 起舞弄清影, 46 | # 何似在人间。 47 | with open("32_poem.txt", "a", encoding="utf-8") as f: 48 | f.write("\n起舞弄清影,\n何似在人间。") 49 | -------------------------------------------------------------------------------- /IntroductionCourse/20_for_loop.py: -------------------------------------------------------------------------------- 1 | # 20_ 2 | # for 变量名 in 可迭代对象(list, dict, str, etc) 3 | # 对列表进行迭代,就是按顺序对里面的各个元素操作;对字典进行迭代,就是按顺序对立面的各个键或值操作;对字符串进行迭代,就是按顺序对里面的各个字符操作。 4 | # 自己取一个变量名来代表可迭代对象里面的东西,这个变量会被依次被赋值为如列表里的每一个元素。针对每个元素的操作,写在for的下面一行。 5 | # 所有前面带缩进的都会被视为这个for循环里面的语句 6 | 7 | # range()函数 8 | # range用来表示整数序列。*range里第二个参数不在序列范围内 9 | # 例1.range(100) 默认0是列表第一个元素,99为列表最后一个元素 10 | # 例2.range(1, 100) 默认间隔1,列表里第一个元素为1,最后一个元素为99 11 | # 例3.range(1, 100, 2) 列表里第一个元素为1,最后一个元素为99,间隔2 12 | 13 | # 检查体温数字,找出不正常的低温 14 | temperature_list = [36.4, 36.6, 37.5, 38.1, 39.3] 15 | for temperature in temperature_list: 16 | if temperature >= 37.3: 17 | print(temperature) 18 | print("完球了") 19 | 20 | # 利用字典,筛选出发烧的人 21 | temperature_dict = {"111": 36.4, "112": 36.6, "113": 37.5, "114": 38.1, "115": 39.3} 22 | 23 | for temperature_tuple in temperature_dict.items(): 24 | staff_id = temperature_tuple[0] 25 | temperature = temperature_tuple[1] 26 | if temperature >= 37.3: 27 | print(staff_id) 28 | # 字典名.items 在for循环时,变量会被赋值为键和值组成的元组。 29 | 30 | # 另一种写法 31 | # for staff_id, temperature in temperature_dict.items(): 32 | # if temperature >= 37.3: 33 | # print(staff_id) 34 | # 这里for循环后面跟了两个变量,这种写法相当于把元组的第一个元素赋值给第一个变量,第二个元素赋值给第二个变量。 35 | 36 | # for循环结合range 37 | total = 0 38 | for i in range(1, 101): 39 | total = total + i 40 | print(total) 41 | 42 | 43 | -------------------------------------------------------------------------------- /IntroductionCourse/22_格式化字符串.py: -------------------------------------------------------------------------------- 1 | # 22_格式化字符串 2 | # 花括号表示会被替换的位置 3 | 4 | # 一、format方法 5 | # 1.根据参数位置替换 6 | # 花括号里面用数字表示参数位置,表示会用format里面的第几个参数进行替换 *0表示第一个参数 7 | # 2.根据关键词替换 8 | # 花括号里面用关键词来指定替换的对象。 *此时format里面参数位置就无所谓了 9 | 10 | # 二、f-字符串 11 | # 在字符串前加前缀“f”,花括号里面放入变量,花括号里的内容会被直接求值,添加到字符串内 12 | # f-字符串方法,花括号内可以放入算式,进行计算求值后添加到字符串内 13 | 14 | # !用format方法或f-字符串自动将,花括号中其他数据类型转化成str 15 | 16 | # 根据参数位置 17 | year = "虎" 18 | name = '老郑' 19 | message_content1 = """ 20 | 律回春渐,新元肇启。 21 | 新岁甫至,福气东来。 22 | 金{0}贺岁,欢乐祥瑞。 23 | 金{0}敲门,五福临门。 24 | 给{1}及家人拜年啦! 25 | 新春快乐,{0}年大吉! 26 | """.format(year, name) 27 | print(message_content1) 28 | 29 | # 根据关键词 30 | message_content2 = """ 31 | 律回春渐,新元肇启。 32 | 新岁甫至,福气东来。 33 | 金{current_year}贺岁,欢乐祥瑞。 34 | 金{current_year}敲门,五福临门。 35 | 给{receiver_name}及家人拜年啦! 36 | 新春快乐,{current_year}年大吉! 37 | """.format(current_year=year, 38 | receiver_name=name) # format函数中current_year是关键词;year是format函数的参数,也是变量 39 | # """.format(year=year, name=name) # 也可以如此表示,等号前面是关键词,等号后面是参数 40 | print(message_content2) 41 | 42 | # f-字符串 43 | message_content = f""" 44 | 律回春渐,新元肇启。 45 | 新岁甫至,福气东来。 46 | 金{year}贺岁,欢乐祥瑞。 47 | 金{year}敲门,五福临门。 48 | 给{name}及家人拜年啦! 49 | 新春快乐,{year}年大吉! 50 | """ 51 | 52 | # 对数字格式化 53 | gpa_dict = {'小明': 3.251, '小花': 3.869, '小李': 2.683, '小张': 3.685} 54 | for name, gpa in gpa_dict.items(): 55 | print("{0}你好,你的当前绩点为:{1:.2f}".format(name, gpa)) 56 | # print(f"{name}你好,你的当前绩点为:{gpa:.2f}") # “:.Nf” 来指定浮点数在格式化时保留几位小数 57 | -------------------------------------------------------------------------------- /IntroductionCourse/23&24_functions.py: -------------------------------------------------------------------------------- 1 | # 23_定义函数 2 | """ 3 | def 函数名(参数1, 参数2): 4 | # 定义函数的代码 5 | # ... 6 | # !定义函数的时候,里面的代码都不会被执行;只有在调用函数的时候,才是里面的代码被实际执行的时候 7 | """ 8 | 9 | 10 | # 定义一个计算扇形面积的函数 11 | def calculate_sector(central_angle, radius): 12 | sector_area = central_angle / 360 * 3.14 * radius ** 2 13 | print(f"此扇形面积为:{sector_area}") 14 | 15 | 16 | calculate_sector(180, 1) # 调用函数时,central_angle和radius两个参数会被赋值为传入的值 17 | 18 | 19 | # 24_ 20 | # 1.作用域 21 | # 在函数里定义的变量是局部变量,在函数内部可以访问到,在外部无法访问到。 22 | # *通过调用函数来运行某段代码,和直接运行某段代码,并不是完全一样的。 23 | # 2.return语句 24 | # return a, b, c 25 | # !return后跟上两个及以上变量的时候,返回这些变量的值组成的元组 26 | # 1)有return语句,函数在执行时,不仅会逐行运行里面的语句,还会在完成调用后,返回变量的值 27 | # 2)在没写return语句时,python函数的返回值会默认为None。 28 | # !print, append等实质上都是返回值为None的函数;len, sum等都是带返回值的函数,所以一般会把这些函数的调用结果赋值给其他变量。 29 | # 如:result = print("Hi!"),result值为None; result = len("Hi!"),result值为3 30 | 31 | # 写一个BMI计算器函数,可以计算出任意体重和身高的BMI值,并返回出计算的BMI值 32 | def calculate_BMI(weight, height): 33 | BMI = weight / height ** 2 34 | if BMI <= 18.5: 35 | category = '偏瘦' 36 | elif BMI <= 25: 37 | category = '正常' 38 | elif BMI <= 30: 39 | category = '偏胖' 40 | else: 41 | category = '肥胖' 42 | print(f"您的BMI分类为:{category}") 43 | return BMI 44 | 45 | 46 | weight = float(input('请输入您的体重(单位:kg):')) 47 | height = float(input('请输入您的身高(单位:m):')) 48 | BMI = calculate_BMI(weight, height) # 执行函数,同时将返回值赋值给BMI 49 | # calculate_BMI(weight,height) # 只执行函数 50 | print(f'您的BMI为{BMI:.2f}') 51 | -------------------------------------------------------------------------------- /IntroductionCourse/28_class2.py: -------------------------------------------------------------------------------- 1 | # 28_class2 2 | # 一、创建方法 3 | # 1.定义方法 4 | # 1)写在class里面,前面有缩进 5 | # 2)与__init__一样,第一个参数是"self"。 6 | # !一个作用是可以让我们在方法里面,去获取或修改和对象绑定的属性。 7 | # 这样就实现了,方法调用结果,根据属性的不同而改变 8 | # *方法的命名采用下划线命名法 9 | # 2.调用类方法, 10 | # 就用对象.方法名,括号里面放上参数进行调用 11 | # !"self"参数在这里也不需要手动传入的 12 | class CuteCat: 13 | def __init__(self, cat_name, cat_age, cat_color): 14 | self.name = cat_name 15 | self.age = cat_age 16 | self.color = cat_color 17 | 18 | def speak(self): 19 | print("喵" * self.age) 20 | # 字符串乘数字,表示把字符串重复这么多次 21 | 22 | def think(self, content): 23 | print(f"小猫{self.name}在思考{content}...\n") 24 | 25 | 26 | cat1 = CuteCat("Jojo", 3, "橙色") 27 | cat1.speak() 28 | cat1.think("现在去抓沙发还是去撕纸箱") 29 | 30 | 31 | # 定义一个学生类 32 | # 要求 33 | # 1.属性包括学生姓名、学号,以及语数英三科的成绩 34 | # 2.能够设置学生某科目成绩 35 | # 3.能够打印出学生的所有科目成绩 36 | class Student: 37 | def __init__(self, name, student_id): 38 | self.name = name 39 | self.id = student_id 40 | self.grades = {"语文": 0, "数学": 0, "英语": 0} 41 | 42 | def set_grade(self, course, grade): 43 | if course in self.grades: 44 | self.grades[course] = grade 45 | # 字典放入if语句或for语句时,默认返回键 46 | 47 | def print_grades(self): 48 | print(f"同学{self.name}(学号:{self.id})的成绩为:") 49 | for course in self.grades: 50 | print(f"{course}:{self.grades[course]}分") 51 | 52 | 53 | chen = Student("小陈", "100618") 54 | zeng = Student("小曾", "100622") 55 | zeng.set_grade("数学", 95) 56 | zeng.set_grade("语文", 91) 57 | zeng.print_grades() 58 | -------------------------------------------------------------------------------- /IntroductionCourse/33_error_fixing.py: -------------------------------------------------------------------------------- 1 | # 33_异常处理 2 | # 一、异常类型 3 | # 1.用长度之外的索引,对列表取值时,会产生IndexError,即索引错误 4 | # number_list = [56, 23,-5, 96] 5 | # number_list[4] 6 | # 2.用数字除以0时,会产生ZeroDivisionError,即除零错误 7 | # print(56 / (12 - 15 +3)) 8 | # 3.打开的文件不存在时,会产生FileNotFoundError,即找不到文件错误 9 | # f = open("./hi.txt", "r") 10 | # 4.让两个字符串做乘法,会产生TypeError,即类型错误 11 | # "yoo" * "hi" 12 | # .... 13 | 14 | # 二、捕捉异常|try/except语句 15 | # 1.try 16 | # try语句换行后的代码块里,放上你觉得可能会产生报错的代码 17 | # 2.except 18 | # except后面,跟上你想捕捉的错误名字,以及冒号 19 | # 换行后缩进的代码块里,放上那类错误发生后,你想相应执行的操作 20 | # !由于无法预判所有错误类型,如果你希望无论出现什么类型的错误,程序都不要炸的话 21 | # 可以直接写个"except:",这个语句会捕捉所有的错误类型 22 | # *try/except语句,在捕捉错误时,从上往下运行。 23 | # 如果第一个except语句就捕捉到了对应的错误,那后面的except语句都不会执行了 24 | # 和if、elif的逻辑很像,只有第一个符合条件的分支会运行 25 | 26 | # 3.else 27 | # else语句换行后缩进的代码块里,放上当try里面的语句,没有产生任何错误时,要执行的语句 28 | # 4.finally 29 | # finally语句换行后缩进的代码块里,放上无论错误发生与否,最终都会被执行的语句 30 | # *finally语句厉害的地方在于,无论是错误被某个except语句捕捉,还是没有任何错误产生, 31 | # 还是出现了你没捕捉到的错误,把程序给炸了。finally里面的代码,最终都会被执行 32 | 33 | # 以BMI指数程序为例,去捕捉异常 34 | # user_weight = float(input("请输入您的体重(单位:kg):")) 35 | # user_height = float(input("请输入您的身高(单位:m):")) 36 | # user_BMI = user_weight / user_height ** 2 37 | # print("您的BMI值为:" + str(user_BMI)) 38 | 39 | try: 40 | user_weight = float(input("请输入您的体重(单位:kg):")) 41 | user_height = float(input("请输入您的身高(单位:m):")) 42 | user_BMI = user_weight / user_height ** 2 43 | except ValueError: 44 | print("输入不为合理数字,请重新运行程序,并输入正确的数字。") 45 | except ZeroDivisionError: 46 | print("身高不能为零,请重新运行程序,并输入正确的数字。") 47 | except: 48 | print("发生了未知错误,请重新运行程序。") 49 | else: 50 | print("您的BMI值为:" + str(user_BMI)) 51 | finally: 52 | print("程序结束运行。") 53 | -------------------------------------------------------------------------------- /IntroductionCourse/36_Higher_order_functions&anonymous_functions.py: -------------------------------------------------------------------------------- 1 | # 36_高阶函数和匿名函数 2 | # 定义一个函数,能打印某数字平方和三次方的结果,还能打印某数字加10后的结果 3 | # 假如把计算和打印的过程逻辑都保留在这个函数里,则无法优雅直观地定义函数 4 | # 把计算平方、三次方、加10的函数独立出来,再把做计算的函数直接作为参数传入,逻辑更清晰直观 5 | def calculate_square(num): 6 | return num * num 7 | 8 | 9 | def calculate_cube(num): 10 | return num * num * num 11 | 12 | 13 | def calculate_plus_10(num): 14 | return num + 10 15 | 16 | 17 | def calculate_and_print(num, calculate): 18 | result = calculate(num) 19 | print(f""" 20 | |数字参数|{num}| 21 | |计算结果|{result}|""") 22 | 23 | 24 | calculate_and_print(3, calculate_square) 25 | calculate_and_print(7, calculate_plus_10) 26 | 27 | 28 | # 一、高阶函数 29 | # 1.定义:把函数作为参数的函数叫做高阶函数 30 | # *作为参数的函数,是直接用函数名进行传入,代表函数本身,后面不要带括号和参数 31 | # *用函数名作为参数,代表函数本身;而用函数名带上括号作为参数,这个函数就被调用了,传入的是函数调用后返回的结果 32 | # 2.用处:高阶函数给程序提供了更多灵活性:高阶函数负责核心功能,作为参数的函数负责实现多样化的功能 33 | 34 | # 二、匿名函数 lambda表达式被用于创建匿名函数 35 | # 1.应用场景 36 | # 适合函数作为高阶函数的参数,只需要一次性调用的场景 37 | # 2.由于匿名函数的定义,是由lambda这个关键字开始的,所以可以用lambda指代匿名函数 38 | # 3.用法 39 | # 在高阶函数的括号里,放上关键字lambda,跟上变量名,然后是冒号,然后是返回的结果 40 | # 冒号前面的变量名,表示的是传给匿名函数的参数 41 | # 如果要给匿名函数增加参数,只需要在lambda关键字后面,把参数用不同逗号分隔开 42 | calculate_and_print(7, lambda num: num * 5) 43 | # 4.特点 44 | # 匿名函数能减少代码行数,不用起名字,不用换行缩进,特别适合只需要一次性调用的场景 45 | # 匿名函数: 46 | # lambda num1, num2: num1 + num2 47 | # 普通函数: 48 | # def calculate_sum(num1, num2): 49 | # return num1 + num2 50 | # 5.匿名函数除了作为高阶函数的参数,也可以定义好后被直接调用 51 | # 调用方式和普通函数类似,都是函数后面括号,括号里传入参数。唯一的区别是,前面的匿名函数也要被括住,表示这是一个整体 52 | (lambda num1, num2: num1 + num2)(2, 3) 53 | # 6.局限性 54 | # 普通函数可以有多个语句/表达式,而匿名函数冒号后面只能有一个语句/表达式, 55 | # 只适用于简单的场景,多步骤的复杂逻辑,或者是涉及循环递归等无法用匿名函数写出来 56 | # 或者即使有些逻辑能用匿名函数写出来,可读性也会很差 57 | -------------------------------------------------------------------------------- /DataAnalysis/Practices/.ipynb_checkpoints/02 项目实战 _ 分析鸢尾花种类数据(空白版)-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 项目:分析鸢尾花种类数据" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## 分析目标" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "此数据分析报告的目的是基于鸢尾花的属性数据,分析两种鸢尾花萼片、花瓣的长度和宽度平均值,是否存在显著性差异,让我们可以对不同种类鸢尾花的属性特征进行推断。" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "## 简介" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "原始数据`Iris.csv`包括两种鸢尾花,每种有 50 个样本,以及每个样本的一些属性,包括萼片的长度和宽度、花瓣的长度和宽度。\n", 36 | "\n", 37 | "`Iris.csv`每列的含义如下:\n", 38 | "- Id:样本的ID。\n", 39 | "- SepalLengthCm:萼片的长度(单位为厘米)。\n", 40 | "- SepalWidthCm:萼片的宽度(单位为厘米)。\n", 41 | "- PetalLengthCm:花瓣的长度(单位为厘米)。\n", 42 | "- PetalWidthCm:花瓣的宽度(单位为厘米)。\n", 43 | "- Species:鸢尾花种类。" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [] 52 | } 53 | ], 54 | "metadata": { 55 | "kernelspec": { 56 | "display_name": "Python 3", 57 | "language": "python", 58 | "name": "python3" 59 | }, 60 | "language_info": { 61 | "codemirror_mode": { 62 | "name": "ipython", 63 | "version": 3 64 | }, 65 | "file_extension": ".py", 66 | "mimetype": "text/x-python", 67 | "name": "python", 68 | "nbconvert_exporter": "python", 69 | "pygments_lexer": "ipython3", 70 | "version": "3.8.1" 71 | } 72 | }, 73 | "nbformat": 4, 74 | "nbformat_minor": 4 75 | } 76 | -------------------------------------------------------------------------------- /IntroductionCourse/25_module.py: -------------------------------------------------------------------------------- 1 | # *在本文件中,模块指代标准库中的模块,模块、内置模块、标准库中的模块、标准库中的内置模块混用; 2 | # *第三方库中的模块、第三方模块、第三方库混用。 3 | 4 | # 内置函数、内置模块和第三方模块区别 5 | # 1.内置函数、内置类型等,无需引入。 6 | # 2.标准库中的内置模块,需要引入。 7 | # 3.第三方模块,需要安装+引入。 8 | 9 | 10 | # 一、调用内置函数 11 | # python所有内置函数 https://docs.python.org/zh-cn/3/library/functions.html 12 | print(sum([69, 124, -32, 27, 217])) 13 | 14 | 15 | # 二、引入标准库中的模块 16 | # 模块就是一个python程序。引入模块后,里面的函数和变量都可以为你所用。 17 | 18 | # (一)、引入模块的方式 19 | # 1.import语句 20 | """ 21 | import 模块名 22 | 模块名.函数名/模块名.变量名 来调用 23 | """ 24 | # 2. from...import...语句 25 | """ 26 | from 模块名 import 函数名/变量名 # 多个函数或变量用逗号进行分隔 27 | 函数名/变量名 直接使用 28 | """ 29 | # 3.from...import * 语句 30 | """ 31 | from 模块名 import * # 会把模块里所有内容都进行引入 32 | 函数名/变量名 直接使用 33 | * 不推荐使用,因为假如引入的模块A和B中都有abc函数,调用时不知是哪个模块的函数 34 | """ 35 | 36 | # (二)、标准库中的模块的使用 37 | # python标准库里的所有模块 https://docs.python.org/zh-cn/3/library/index.html 38 | # 点进模块就可以看到里面包含的函数和变量的用法和功能的介绍 39 | # 在Pycharm,ctrl+左键点击函数名,查看定义函数的源代码 40 | 41 | import statistics 42 | print(statistics.median([69, 124, -32, 27, 217])) 43 | 44 | 45 | # 三、第三方库的模块 46 | # (一)、安装和引入第三方模块 47 | # 1.安装 48 | """ 49 | 进入终端 50 | pip install 库名 51 | """ 52 | # 2.引入 53 | # 引入第三方库的模块的语法和前面一样 54 | 55 | # (二)、第三方库中的模块的使用 56 | # 搜索第三方库的网站 pypi.org 57 | # 搜索后,点进模块可以查看介绍和用法 58 | 59 | import akshare 60 | print(akshare.get_cffex_daily("20220222")) # 获取2022.2.22的中国金融期货交易所交易数据 61 | 62 | 63 | # 另:statistics.median()源代码 64 | def median1(data): 65 | data = sorted(data) 66 | n = len(data) 67 | if n == 0: 68 | raise statistics.StatisticsError("no median for empty data") 69 | if n % 2 == 1: 70 | return data[n // 2] 71 | else: 72 | i = n // 2 73 | return (data[i - 1] + data[i]) / 2 74 | # “%” 是取余的意思 75 | # “//” 是向下取整的意思,如:3/2的结果为1.5,而3//2的结果为1 76 | -------------------------------------------------------------------------------- /DataVisualization/.ipynb_checkpoints/05 项目实战 _ 可视化帕默群岛企鹅数据(空白版)-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 项目:可视化帕默群岛企鹅数据" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## 分析目标" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "此数据分析报告的目的是对帕默群岛上企鹅样本的相关变量进行可视化,从而探索和分析种类、性别、所在岛屿等因素,与企鹅的身体属性,包括体重、嘴峰长度和深度、鳍的长度,之间的关系。" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "## 简介" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "原始数据`Penguins.csv`包括334个收集自南极洲帕尔默群岛的3个岛屿上的企鹅样本,以及企鹅相关属性数据,包括种类名、所在岛、嘴峰长度、嘴峰深度、鳍长度、体重、性别。\n", 36 | "\n", 37 | "`Penguins.csv`每列的含义如下:\n", 38 | "- species:企鹅的种类\n", 39 | "- island:企鹅所在岛\n", 40 | "- culmen_length_mm:企鹅嘴峰的长度(单位为毫米)\n", 41 | "- culmen_depth_mm:企鹅嘴峰的深度(单位为毫米)\n", 42 | "- flipper_length_mm:企鹅鳍的长度(单位为毫米)\n", 43 | "- body_mass_g:企鹅体重(单位为克)\n", 44 | "- sex:企鹅性别" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [] 53 | } 54 | ], 55 | "metadata": { 56 | "kernelspec": { 57 | "display_name": "Python 3", 58 | "language": "python", 59 | "name": "python3" 60 | }, 61 | "language_info": { 62 | "codemirror_mode": { 63 | "name": "ipython", 64 | "version": 3 65 | }, 66 | "file_extension": ".py", 67 | "mimetype": "text/x-python", 68 | "name": "python", 69 | "nbconvert_exporter": "python", 70 | "pygments_lexer": "ipython3", 71 | "version": "3.8.1" 72 | } 73 | }, 74 | "nbformat": 4, 75 | "nbformat_minor": 4 76 | } 77 | -------------------------------------------------------------------------------- /DataAnalysisPreparation/2.5_numpy_array.py: -------------------------------------------------------------------------------- 1 | # 2.5_数组 2 | # 用Python做数据分析离不开两个最常用的库,NumPy和Pandas 3 | # NumPy的全程叫Numerical Python, 是专门针对计算用的,NumPy是很多数据或科学相关Python包的基础,包括马上要学到的pandas 4 | 5 | import numpy as np 6 | # "as np"的意思是,给这个库一个别名叫np,这样使用这个库的任何东西时,都可以用np,而不是numpy来调用 7 | # 目的是打字更少、更简洁 8 | 9 | # 一、N维数组(ND array) 10 | # NumPy里最核心的数据结构叫ND array, 11 | # (一)、NumPy的数组与Python的内置列表 12 | # 1.相似之处 13 | arr = np.array([5, 17, 3, 26, 31]) 14 | # 1)都可以通过索引去获得某个元素 15 | print(arr[0]) 16 | # 2)都可以通过切片获得某范围的多个元素 17 | # 打印第1个到第4个之前元素 18 | print(arr[0:3]) 19 | # 3)可以去迭代各个元素 20 | for element in arr: 21 | print(element) 22 | # 2.不同之处 23 | # NumPy数组里的数据类型需要统一,而列表里的数据类型不需要统一 24 | lst = [5, "a", True, 12.2, "!"] 25 | # 3.优点 26 | # 1)在对NumPy数组进行大规模数学运算或其它操作时,执行速度远高于Python内置列表, 27 | # 因此效率是数据处理方面选择NumPy的首要原因 28 | # 2)NumPy提供了很多专门做运算的函数,为操 作数据提供了很多便利 29 | 30 | # 二、安装和使用NumPy 31 | # 1.安装NumPy 32 | # 在CMD,输入"pip install numpy",进入JN 33 | # 2.导入NumPy 34 | # "import numpy as np" 35 | 36 | # 三、创建数组 37 | # (一)、array方法 38 | # 1.创建一个数组,最直观的方法就是通过NumPy的array方法,把列表转换成数组 39 | # 2.如果传入的是"[1, 2, 3]"这样一个简单的列表,它会被转换成一维数组; 40 | # 如果传入的是"[[1, 2, 3], [4, 5, 6]]"这样一个嵌套列表,也就是一个列表里面有另一个列表的情况,它会被转换成二维数组 41 | # 一个简单的判断方法是,最左边有几个方括号,它就会被转换成几维的数组 42 | # 3.这里也可以试一下传入不同类型的元素,虽然不报错,但array方法会强制把它们转换成同一类型 43 | # (二)、其他方法 44 | # 1.zeros方法 45 | # 给zeros方法传入一个数字,会返回一个全部是0的,长度为那个数字的数组 46 | np.zeros(3) 47 | # 会返回array([0., 0., 0.,])。由于数字类型是浮点数,所以每个数字后面有个小数点 48 | # 2.ones方法 49 | # 给ones方法传入一个数字,会返回一个全部是1的,长度为那个数字的数组 50 | np.ones(3) 51 | # 会返回array([1., 1., 1.,]) 52 | # 3.arange方法 表示是针对array的range方法 53 | # 里面传入的参数和range是一样的,第一个表示起始值,第二个表示结束值,第三个表示步长 54 | # *和range方法一样,结束值不会被包括在范围内 55 | np.arange(5, 11, 2) 56 | # 会返回array([5, 7, 9]) 57 | 58 | arr2 = np.array([[1, 2, 3], [4, 5, 6]]) 59 | 60 | # 四、数组的属性 61 | # 1.ndim会返回给我们数组的维度 62 | print(arr.ndim) 63 | # 2.shape会返回一个元组,表示各个维度的元素的个数 64 | print(arr.shape) 65 | # 因为arr是一维数组,第一个维度有5个元素,所以会打印"(5,)" 66 | # *如果元组里面只有一个元素,它会用元素后面的逗号,来强调这是由一个元素组成的元组 67 | print(arr2.shape) # 因为arr2是二维数组,第一个维度有2个元素,第二个维度有3个元素,所以会打印"(2, 3)"的元组 68 | # 3.size会返回数组里面元素的总个数 69 | print(arr.size) 70 | # 4.dtype会返回数组元素的类型,dtype表达的是data type的意思 71 | print(arr.dtype) 72 | # 会打印"int32",int开头说明类型是整数,32表示的是比特长度 73 | -------------------------------------------------------------------------------- /DataAnalysisPreparation/2.4_Markdown&LaTex.py: -------------------------------------------------------------------------------- 1 | # 2.4 2 | # 一、Markdown 3 | # (一)、优势 4 | # 1.格式比纯文本更丰富 5 | # 代码中的注释没法添加任何格式,或添加丰富的信息 6 | # Markdown支持标题、粗体、引用、列表、代码块等常用格式 7 | # 2.体积比富文本更轻量 8 | # Word或者一些网站支持的富文本编辑器,可以让我们更改内容样式或排版。但它的功能过于强大多样,让产出文件更加臃肿 9 | # 不支持自定义字体、颜色等操作,所有样式都是通过简单的符号来添加的。因此Markdown文件非常轻量,和纯文本差不了多少 10 | 11 | # 代码包的README,文件后缀一般都是.md,说明是一个Markdown文件 12 | 13 | # (二)、语法 14 | # 1.标题 15 | # 通过在前面添加1~6个#和1个空格,可以把文字设置成一至六级标题 16 | # 2.加粗、斜体、删除样式 17 | # 用两个*把文字包住,把文字变成粗体 18 | # 用一个*把文字包住,把文字变成斜体 19 | # 用两个~把文字包住,把文字用删除线划掉。*注意是英文输入法下的小波浪 20 | # 3.普通文字 21 | # 不把文字用任何符号包围的话,那就默认是普通的段落文字 22 | # 特点:加的换行,只会在文字之间出现一个空格 23 | # 如果想让文字分隔在不同行,一个方法是多打一次换行,另一个方法是在第一行后面额外加两个空格 24 | # 4.列表 25 | # 1)无序列表 26 | # 在每个列表元素前面,加上短横杠、空格"- " 27 | # 2)有序列表 28 | # 在每个列表元素前面,加上数字、英文句号、空格"1. " 29 | # 5.链接 *完整的链接是要带协议名的,比如前面的https:// 30 | # 1)展示链接 31 | # 把链接直接像普通文字那样放进去,如果Markdown识别出来这是个链接,就会把它变成可跳转的 32 | # 2)展示链接标题 更直观地告知读者链接指向的内容 33 | # 方括号把链接包围起来,在后面紧跟着的括号里面,放上链接。 34 | # [必应](https://cn.bing.com/) 35 | # 6.图片 36 | # 方括号里放文字,圆括号里面放图片链接,同时在方括号前面加上一个英文感叹号 37 | # ![城市景观](https://img0.baidu.com/it/u=25183460,870873689&fm=253) 38 | # *插入图片,方括号里放文字的意义是,如果图片加载不出来的话,就会显示那个文字内容作为替代 39 | # 7.引用 40 | # 插入一个引用段落,用右书名号、空格,后面紧跟着引用内容 41 | # 引用段落里的文字,和普通段落里的文字一样,不会因为你在内容里加了换行,展示效果里就有换行 42 | """ 43 | >蒹葭苍苍 44 | 白露为霜 45 | """ 46 | # 8.代码 47 | # 1)要在文字里插入代码,就用反引号包裹住代码,`import math` 48 | # 2)如果我们要插入独占一段的代码段落,就用三个反引号包裹住代码段落 49 | # *在开头的三个反引号后面,还可以跟上代码语言的名字,这样Markdown就会展示针对那个语言的语法高亮 50 | """ 51 | ```python 52 | import math 53 | print("Hello World!") 54 | print(math.pi) 55 | ``` 56 | """ 57 | # 9.公式 58 | # 1)在行内插入公式,就用1个$,包裹住那个公式 59 | # 2)要插入一个独占一行的公式,就用2个$,包裹住那个公式 60 | # 3)复杂的公式,可以用LaTex语法来表示 61 | 62 | # 二、LaTex 63 | # LaTex是一个排版系统,可以负责定义书籍、简历、论文等格式和布局,不局限于数学公式 64 | # 在JN的使用场景里,用LaTex就是为了在Markdown里插入公式,所以只需学会公式相关语法即可 65 | # 1.加减乘除 66 | # 表示加减的符号,就是键盘上的加减 67 | # 乘号和除号是没有的,用\times表示乘号,\div表示除号 68 | """ 69 | $$x + y$$ 70 | $$x - y$$ 71 | $$x \times y$$ 72 | $$x \div y$$ 73 | """ 74 | # 2.上标下标 75 | # 要加上标,用插入符^,跟上作为上标的内容;要加下标,用下划线_,后面跟上作为下标的内容 76 | # *LaTex默认上标或下标只包含1位字符,如果想把多个字符作为上标或下标,用花括号把它们组合起来,就可以了 77 | """ 78 | $$x^3$$ 79 | $$H_2O$$ 80 | $$S_{input}$$ 81 | """ 82 | # 3.求和求根 83 | # 求根符号,\sqrt,后面可以跟上方括号,里面的数字表示求几次方根 84 | # *LaTex默认求根符号的横线只拉到第1个字符,如果对一个长公式求根,可以在方括号后面跟上花括号,把要求根的内容全部包围起来 85 | """ 86 | $$\sum(x^2 + y^2)$$ 87 | $$\sqrt[3]x$$ 88 | $$\sqrt[3]{a^2m^2}$$ 89 | """ 90 | # 4.分数线 91 | # 用\frac表示,后面跟着两个花括号,在第一个花括号里面,放分数线上面的内容;在第二个花括号里,放分数线下面的内容 92 | """ 93 | $$\frac{x+y}{x-y}$$ 94 | """ 95 | # 5.其他 96 | # 除上面之外,LaTex还能搞定其它无数的公式符号, 97 | # 其他公式符号语法:https://oeis.org/wiki/List_of_LaTeX_mathematical_symbols 98 | -------------------------------------------------------------------------------- /DataAnalysis/.ipynb_checkpoints/index-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "bc53b743", 6 | "metadata": { 7 | "origin_pos": 0 8 | }, 9 | "source": [ 10 | "《动手学深度学习》\n", 11 | "========================\n", 12 | "\n", 13 | "```eval_rst\n", 14 | ".. raw:: html\n", 15 | " :file: frontpage.html\n", 16 | "```\n", 17 | "\n", 18 | ":begin_tab:toc\n", 19 | " - [chapter_preface/index](chapter_preface/index.ipynb)\n", 20 | " - [chapter_installation/index](chapter_installation/index.ipynb)\n", 21 | " - [chapter_notation/index](chapter_notation/index.ipynb)\n", 22 | ":end_tab:\n", 23 | "\n", 24 | ":begin_tab:toc\n", 25 | " - [chapter_introduction/index](chapter_introduction/index.ipynb)\n", 26 | " - [chapter_preliminaries/index](chapter_preliminaries/index.ipynb)\n", 27 | " - [chapter_linear-networks/index](chapter_linear-networks/index.ipynb)\n", 28 | " - [chapter_multilayer-perceptrons/index](chapter_multilayer-perceptrons/index.ipynb)\n", 29 | " - [chapter_deep-learning-computation/index](chapter_deep-learning-computation/index.ipynb)\n", 30 | " - [chapter_convolutional-neural-networks/index](chapter_convolutional-neural-networks/index.ipynb)\n", 31 | " - [chapter_convolutional-modern/index](chapter_convolutional-modern/index.ipynb)\n", 32 | " - [chapter_recurrent-neural-networks/index](chapter_recurrent-neural-networks/index.ipynb)\n", 33 | " - [chapter_recurrent-modern/index](chapter_recurrent-modern/index.ipynb)\n", 34 | " - [chapter_attention-mechanisms/index](chapter_attention-mechanisms/index.ipynb)\n", 35 | " - [chapter_optimization/index](chapter_optimization/index.ipynb)\n", 36 | " - [chapter_computational-performance/index](chapter_computational-performance/index.ipynb)\n", 37 | " - [chapter_computer-vision/index](chapter_computer-vision/index.ipynb)\n", 38 | " - [chapter_natural-language-processing-pretraining/index](chapter_natural-language-processing-pretraining/index.ipynb)\n", 39 | " - [chapter_natural-language-processing-applications/index](chapter_natural-language-processing-applications/index.ipynb)\n", 40 | " - [chapter_appendix-tools-for-deep-learning/index](chapter_appendix-tools-for-deep-learning/index.ipynb)\n", 41 | ":end_tab:\n", 42 | "\n", 43 | ":begin_tab:toc\n", 44 | " - [chapter_references/zreferences](chapter_references/zreferences.ipynb)\n", 45 | ":end_tab:\n" 46 | ] 47 | } 48 | ], 49 | "metadata": { 50 | "language_info": { 51 | "name": "python" 52 | }, 53 | "required_libs": [] 54 | }, 55 | "nbformat": 4, 56 | "nbformat_minor": 5 57 | } -------------------------------------------------------------------------------- /DataSorting/Practices/04 项目实战 _ 整理Netflix电影演员评分数据(空白版).ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 项目:整理Netflix电影演员评分数据" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## 分析目标" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "此数据分析的目的是,整理不同流派影视作品,比如喜剧片、动作片、科幻片中,各演员出演作品的平均IMDB评分,从而挖掘出各个流派中的高评分作品演员。\n", 22 | "\n", 23 | "本实战项目的目的在于练习整理数据,从而得到可供下一步分析的数据。" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "## 简介" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "原始数据集记录了截止至2022年7月美国地区可观看的所有Netflix电视剧及电影数据。数据集包含两个数据表:`titles.csv`和`credits.csv`。\n", 38 | "\n", 39 | "`titles.csv`包含电影及电视剧相关信息,包括影视作品ID、标题、类型、描述、流派、IMDB(一个国外的在线评分网站)评分,等等。`credits.csv`包含超过7万名出现在Netflix影视作品的导演及演员信息,包括名字、影视作品ID、人物名、演职员类型(导演/演员)等。" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "`titles.csv`每列的含义如下:\n", 47 | "- id:影视作品ID。\n", 48 | "- title:影视作品标题。\n", 49 | "- show_type:作品类型,电视节目或电影。\n", 50 | "- description:简短描述。\n", 51 | "- release_year:发布年份。\n", 52 | "- age_certification:适龄认证。\n", 53 | "- runtime:每集电视剧或电影的长度。\n", 54 | "- genres:流派类型列表。\n", 55 | "- production_countries:出品国家列表。\n", 56 | "- seasons:如果是电视剧,则是季数。\n", 57 | "- imdb_id:IMDB的ID。\n", 58 | "- imdb_score:IMDB的评分。\n", 59 | "- imdb_votes:IMDB的投票数。\n", 60 | "- tmdb_popularity:TMDB的流行度。\n", 61 | "- tmdb_score:TMDB的评分。\n", 62 | "\n", 63 | "`credits.csv`每列的含义如下:\n", 64 | "- person_ID:演职员ID。\n", 65 | "- id:参与的影视作品ID。\n", 66 | "- name:姓名。\n", 67 | "- character_name:角色姓名。\n", 68 | "- role:演职员类型,演员或导演。" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [] 77 | } 78 | ], 79 | "metadata": { 80 | "kernelspec": { 81 | "display_name": "Python 3 (ipykernel)", 82 | "language": "python", 83 | "name": "python3" 84 | }, 85 | "language_info": { 86 | "codemirror_mode": { 87 | "name": "ipython", 88 | "version": 3 89 | }, 90 | "file_extension": ".py", 91 | "mimetype": "text/x-python", 92 | "name": "python", 93 | "nbconvert_exporter": "python", 94 | "pygments_lexer": "ipython3", 95 | "version": "3.11.1" 96 | } 97 | }, 98 | "nbformat": 4, 99 | "nbformat_minor": 4 100 | } 101 | -------------------------------------------------------------------------------- /IntroductionCourse/31_file_read.py: -------------------------------------------------------------------------------- 1 | # 31_文件操作|读文件 2 | # -如果需要一个程序来帮忙计算销售数据,一种方法是根据数据文件,把数据一条条复制到代码里面,然后通过执行代码来获得结果 3 | # -如果学会了文件操作,就可以让同一段代码,读取不同的市场数据文件,只需要对文件名进行改动即可 4 | 5 | # 用Python读文件的第一步操作,得是先打开目标文件 6 | # 一、打开文件 7 | # 1.用open函数,里面放上文件路径, 8 | # 2.第二个参数是模式,模式是一个字符串 9 | # 常见的模式包括: 10 | # 1)"r",表示读取模式 11 | # 2)"w",表示写入模式 12 | # *第二个参数不写,默认为读取模式 13 | # *在读取模式下,文件路径不存在的话,就会报一个叫FileNotFoundError的错误,提示文件不存在 14 | # 3.open函数还有一个可选参数叫encoding,表示编码方式 15 | # 文件一般编码方式是UTF-8 16 | # open函数默认的encoding是跟随系统的 17 | # 这个参数传入UTF-8的值,就会用这个编码来读取文件 18 | # 4.open函数执行成功,会返回一个文件对象,可以后续对它进行读取或写入操作 19 | # f = open("相对路径或绝对路径", "r", encoding="utf-8") 20 | # *相对路径是相对于当前这个代码文件而言,我们要读的文件和它处于什么位置 21 | 22 | # 二、读文件 23 | # 用方法读文件,是因为open函数返回的是对象 24 | # (一)、read方法 25 | # 1.调用后,会一次性读取文件里面所有内容,并以字符串形式进行返回 26 | # 2.由于程序其实会记录那个文件读到哪个位置了,再次调用read方法,会发现返回的结果为空 27 | # *一般用代码操作的文件格式,都是简单直接的, 28 | # 以文本格式为例:代码喜欢txt这种纯文本,可以直接把里面内容转换为字符串,而不是word这种还有各种格式,很有可以读出问题 29 | # *在文件特别大的情况下,最好不要用read,因为读出来的内容会占用内存 30 | f = open("31_data.txt", "r", encoding="utf-8") 31 | print(f.read()) # 会读全部的文件内容 32 | print(f.read()) # 会读空字符串 33 | 34 | # 3.如果不想一次性读完整个文件,可以给read传一个数字,表示读多少字节,下一次调用read时,会从那个位置继续往下读 35 | f = open("31_data.txt", "r", encoding="utf-8") 36 | print(f.read(10)) # 会读第1-10个字节的文件内容 37 | print(f.read(10)) # 会读第11-20个字节的文件内容 38 | 39 | # (二)、readline方法 40 | # 1.调用后,只会读取文件一行的内容,并以字符串形式进行返回;下一次调用,就读下一行 41 | # 会根据换行符来判断什么时候算本行结尾,而且换行符也会被当成读到的内容的一部分 42 | f = open("31_data.txt", "r", encoding="utf-8") 43 | print(f.readline()) # 会读一行文件内容 44 | print(f.readline()) # 会读一行文件内容 45 | # 这里运行结果会有额外的空行,原因是readline会把每行的换行符给读到,同时print本身默认结尾换行 46 | 47 | # 2.由于一般文件行数未知,且读到结尾,readline方法和read方法一样会返回空字符串 48 | # 所以一般用while循环,判断只要返回不是空字符串,就继续读下一行 49 | f = open("31_data.txt", "r", encoding="utf-8") 50 | line = f.readline() # 读第一行 51 | while line != "": # 判断当前行是否为空 52 | print(line) # 不为空则打印当前行 53 | line = f.readline() # 读取下一行 54 | 55 | # (三)、readlines方法 56 | # readlines会读取全部文件内容,并返回由每行组成的字符串列表,所以一般和for循环结合 57 | f = open("31_data.txt", "r", encoding="utf-8") 58 | print(f.readlines()) 59 | 60 | f = open("31_data.txt", "r", encoding="utf-8") 61 | lines = f.readlines() # 把每行内容储存到列表里 62 | for line in lines: # 遍历每行内容 63 | print(line) # 打印当前行 64 | # *readlines方法也可以直接放在for循环中 65 | 66 | # 三、关闭文件 67 | # (一)、close方法 68 | # 调用后该文件对象就会释放系统资源,所以每次完成文件读写操作后,都应该关闭文件 69 | f = open("31_data.txt", "r", encoding="utf-8") 70 | print(f.read()) 71 | f.close() # 关闭文件,释放资源 72 | 73 | # (二)、with关键字 74 | # 1.with open函数 as 文件对象名: 75 | # 2.缩进的代码块里,放上对该文件对象的操作;在缩进的内容执行完毕后,文件就会被自动关闭 76 | # 3.这种写法能让代码更加剪接,也不需要单独调用close方法了 77 | with open("31_data.txt") as f: 78 | print(f.read()) 79 | 80 | # 四、创建文件 81 | # 1.用Windows系统自带的记事本或macOS自带的文本编辑 82 | # 2.在代码编辑器里,新建文件,后缀用.txt,表示是纯文本 83 | -------------------------------------------------------------------------------- /IntroductionCourse/29_class_inheritance.py: -------------------------------------------------------------------------------- 1 | # 29_类继承 2 | # 继承是在说:面向对象编程允许创建有层次的类,即类可以有子类和父类,来表示从属关系 3 | # 这样做的好处是父类的属性、方法都可以被继承,不需要反复定义,减少代码的冗余 4 | 5 | # 1.创建一个父类,把共享的属性和方法全部挪进去 6 | # 在子类名后面加上括号,里面写上父类的名字 7 | # 2.调用方法时,优先看所属的类有没有该方法,没有的话,往上找父类的同名方法用 8 | # *同时有的话,只执行子类的方法 9 | class Mammal: 10 | def __init__(self, name, sex): 11 | self.name = name 12 | self.sex = sex 13 | self.num_eyes = 2 14 | 15 | def breathe(self): 16 | print(self.name + "在呼吸...") 17 | 18 | def poop(self): 19 | print(self.name + "在拉屎...") 20 | 21 | 22 | # *子类不用缩进 23 | class Human(Mammal): 24 | def __init__(self, name, sex): 25 | super().__init__(name, sex) 26 | # 当子类之间方法有部分不同时,特别是部分属性值不同时,最好用super() 27 | # 可以省略重复的属性和方法代码行 28 | # super()会返回当前类的父类,如:super().__init__()会调用父类的构造函数 29 | # 子类方法中,缩进 + super().方法名(参数(省略(self)) + 方法不同的部分 30 | self.has_tail = False 31 | 32 | def read(self): 33 | print(self.name + "在阅读...") 34 | 35 | 36 | class Cat(Mammal): 37 | def __init__(self, name, sex): 38 | super().__init__(name, sex) 39 | self.has_tail = True 40 | 41 | def scratch_sofa(self): 42 | print(self.name + "在抓沙发...") 43 | 44 | 45 | cat1 = Cat("Jojo", "男") 46 | print(cat1.name) 47 | cat1.poop() 48 | 49 | 50 | # 类继承练习:人力系统 51 | # -员工分为两类:全职员工 FullTimeEmployee、兼职员工 PartTimeEmployee 52 | # -全职和兼职都有"姓名 name"、"工号 id"属性 53 | # 都具备"打印信息 print_info" (打印姓名、工号)方法 54 | # -全职有"月薪 monthly_salary"属性 55 | # 兼职有"日薪 daily_salary"属性、"每月工作天数 work_days"属性 56 | # -全职和兼职都有"计算月薪 calculate_monthly_pay"的方法,但具体计算过程不一样 57 | class Employee: 58 | def __init__(self, name, id): 59 | self.name = name 60 | self.id = id 61 | 62 | def print_info(self): 63 | print(f"员工{self.name},工号:{self.id}") 64 | 65 | 66 | class FullTimeEmployee(Employee): 67 | def __init__(self, name, id, monthly_salary): 68 | self.monthly_salary = monthly_salary 69 | super().__init__(name, id) 70 | 71 | def calculate_monthly_pay(self): 72 | print(f"{self.name}的月薪为:{self.monthly_salary}") 73 | 74 | 75 | class PartTimeEmployee(Employee): 76 | def __init__(self, name, id, daily_salary, work_days): 77 | super().__init__(name, id) 78 | self.daily_salary = daily_salary 79 | self.work_days = work_days 80 | 81 | def calculate_monthly_pay(self): 82 | monthly_salary = self.work_days * self.daily_salary 83 | print(f"{self.name}的月薪为:{monthly_salary}") 84 | 85 | 86 | zheng = FullTimeEmployee("小郑", "2978", 50000) 87 | zeng = PartTimeEmployee("小曾", "2979", 500, 30) 88 | zheng.print_info() 89 | zheng.calculate_monthly_pay() 90 | zeng.calculate_monthly_pay() 91 | -------------------------------------------------------------------------------- /DataAnalysisPreparation/2.3_use_jupyter_notebook.py: -------------------------------------------------------------------------------- 1 | # 2.3 Jupyter Notebook使用 2 | # Jupyter Notebook会是我们数据分析中使用的核心工具 3 | 4 | # 一、启动Jupyter Notebook 5 | # 1.1)Windows系统,在菜单栏搜索CMD,点击命令提示符 6 | # 2)macOS系统,点击顶部菜单栏的放大镜,输入"终端"或"terminal",回车进入 7 | # 出现大黑窗口后,输入Jupyter Notebook的启动命令:"Jupyter-Notebook" 8 | # 这时默认浏览器会自动打开一个网页,展示Notebook的主面板。 9 | # 2.如果不小心关闭了JN的网页,地址可以再CMD或终端里找到, 10 | # 其中某一行:"The Jupyter Notebook is running at...",后面跟着的就是页面地址,复制到浏览器即可 11 | # *3.接下来的时间里,记得不要关闭这个输入了启动命令的CMD或终端,否则JN会被终止 12 | 13 | # 二、创建Jupyter Notebook文件 14 | # 1.创建文件 15 | # 希望文件在什么位置,就点进那个文件夹,然后点击New,Notebook, 16 | # 一个新的编辑界面就会被打开,而且在桌面上也能看到一个全新的文件出现了 17 | # 2.重命名文件 18 | # 在编辑界面,点下标题,输入想要的名字 19 | 20 | # 三、Jupyter Notebook编辑界面 21 | # 标题下面分别是菜单栏、工具条以及单元格, 22 | # 工具条就是把菜单栏里一些最常用的操作摆出来,所以大部分时候我们只需要通过工具条和单元格打交道 23 | # 单元格主要用来写Python代码和文字, 24 | 25 | # (一)、编辑模式和命令模式 26 | # 1.编辑模式 27 | # 在我们点击单元格里面后,外框会变成绿色,表示当前是编辑模式, 28 | # 2.命令模式 29 | # 完成输入后,点Esc键,或者鼠标点下其它地方,外框会变成蓝色,表示当前是命令模式 30 | 31 | # (二)、工具栏 32 | # 1.第一个按钮,表示保存文件内容 33 | # 2.第二个加号按钮,表示在当前选中的单元格下面,新建一个单元格, 34 | # 3.接下来三个按钮,分别表示剪切选中的单元格、复制选中的单元格,以及粘贴选中的单元格 35 | # 还可以按住Shift键选中多个单元格,然后同一进行操作, 36 | # 4.上箭头表示把选中的单元格往上移动一格,下箭头表示往下移动一格,来更改单元格顺序 37 | # 5.运行按钮 38 | # 会执行这个单元格里面所有Python代码, 39 | # 1)执行时,左边方括号会展示星号,表示正在运行, 40 | # 2)执行完毕后,方括号里面会变成数字, 41 | # 3)数字表示的是执行顺序,比如运行完第一个单元格后,旁边数字显示1;继续运行下一个单元格,旁边数字就会显示2。 42 | # 4)JN很灵活的一点是,你可以用任意顺序运行单元格, 43 | # 比如可以运行第三格后,回到第一格再执行一遍;也可以多次反复运行同一个单元格。旁边的数字,会帮忙记录和告知执行过的顺序 44 | # 5)顺序是很关键的, 45 | # 比如你分别在第二和第三个单元格里,写了读取和查看数据的代码,想要修改读取的文件,需要修改和再次运行读取数据的代码 46 | # 这时,第二个单元格的数字大于第三个单元格,就能侧面提醒我们,第二个单元格里,查看数据来输出的代码还没有被更新,查看的还是之前的数据文件。 47 | # 所以应该把第三个单元格也运行一次 48 | # 另:代码单元格里的代码是通过交互模式运行的,也就是说可以不需要print语句,就能直接看到执行输出的结果, 49 | # *但是如果单元格里有多条输出语句,只有最后一项的输出会被展示,我们还是要借助print,才能同时展示多项输出结果, 50 | # 6.终止执行按钮 51 | # 执行单元格里代码的过程中,想要中断的话,就可以点击它 52 | # 7.重启按钮 53 | # 这会帮我们清空所有定义过的变量,而且单元格旁边的数字也会重新从1开始,表明重启过 54 | # 举个例子,假如第一格定义了一个变量,第二格输出这个变量的值,那运行第一格后变量的值就已经被储存到内存里了,每次输出第二格就会输出对应的值; 55 | # 但重启后,再运行第二格,就会提醒我们变量不存在了。 56 | # 8,重启并重新运行所有单元格的按钮 57 | # 非常使用,如果你想看自己写的所有代码,从上往下完整执行一遍的输出,就用这个操作 58 | # 它可以帮我们检查单元格顺序是否有问题 59 | # 9.下拉框,可以让我们切换单元格里的内容。最常用的就是代码和Markdown,单元格并不限于写代码,也可以写文字 60 | # 1)Markdown 61 | # 是一种帮助我们为内容增加样式的标记语言,语法简单 62 | # 通过在前面添加1~6个#和1个空格,可以把文字设置成一至六级标题 63 | # 2)公式 64 | # a.在行内插入公式,就用1个美元符号,包裹住那个公式 65 | # b.要插入一个独占一行的公式,就用2个美元符号,包裹住那个公式 66 | # c.复杂的公式,可以用LaTex语法来表示 67 | # 10.键盘按钮,是快捷键配置, 68 | # 掌握快捷键的使用,可以大大提升我们使用JN的效率,*在命令模式下使用 69 | # 1)A键,可以在当前单元格上方插入一个新的单元格 70 | # 2)B键,可以在当前单元格下方插入 71 | # 3)连按两次D,可以删除当前选中的单元格 72 | # 4)Shift+Enter,运行当前单元格,并跳到下一个单元格 73 | 74 | # 四、分享Jupyter Notebook 75 | # 1.可以自行编辑和运行 76 | # 如果对方使用JN,可以把这个以.ipynb为后缀的文件,直接发给对方 77 | # 2.只读 78 | # 点击File,选择Save and Export Notebook as,有很多选项 79 | # 比如HTML,这是针对网页的标记语言,所以对方可以直接用浏览器打开,所有代码以及Markdown文字都会原封不动得展示出来 80 | 81 | # 五、打开之前创建过的Jupyter Notebook 82 | # 启动JN,进入存放notebook的目录,点击.ipynb的文件 83 | -------------------------------------------------------------------------------- /EvaluateAndCleanData/4.33_upload_files_to_github.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "bdee74ea-74a6-4255-9949-77287ea18275", 6 | "metadata": {}, 7 | "source": [ 8 | "4.33_upload_files_to_github" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "4edcbebf-9c90-40ea-9c54-3523c4d9581d", 14 | "metadata": {}, 15 | "source": [ 16 | "Notebook文件格式不方便直接分享\n", 17 | "\n", 18 | "用文本编辑器打开,文本内容并不直观,也不可读。\n", 19 | "\n", 20 | "notebook文件需要用Jupyter Notebook或代码编辑器打开;或者把notebook上传到GitHub,然后分享项目链接,因为GitHub能直接渲染Notebook的内容并进行展示。" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "id": "072c6ce4-2596-4484-9e80-ab7e3d8232e2", 26 | "metadata": {}, 27 | "source": [ 28 | "### GitHub" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "id": "c03f70fe-73b8-4b0d-a451-c68cc9d754c5", 34 | "metadata": {}, 35 | "source": [ 36 | "GitHub是世界上最大的代码托管网站,和开源社区。可以在上面管理代码和追踪代码历史记录,也可以在上面搜索和查看无数其它开发者的项目代码" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "id": "40b46fba-2321-4393-95d7-f0a76113f9bc", 42 | "metadata": {}, 43 | "source": [ 44 | "#### (一)、创建代码仓库" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "id": "f0a4eedd-645a-47a5-bf8c-19be80cfc421", 50 | "metadata": {}, 51 | "source": [ 52 | "1. license\n", 53 | "\n", 54 | " 也就是软件许可证,是用来规定和限制用户,使用这个仓库里面的代码的权利的。\n", 55 | "\n", 56 | " 不同许可证的含义:https://choosealicense.com/" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "id": "c0beb078-f856-48be-bf81-358d6b266333", 62 | "metadata": {}, 63 | "source": [ 64 | "#### (二)、Git" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "id": "b42072d0-9ae3-4ecd-b259-3b2197511279", 70 | "metadata": {}, 71 | "source": [ 72 | "1. Git是GitHub背后的版本控制系统,git可以记录项目每次做了什么改动,是由谁改动的,也可以随时切换到之前某个版本的状态\n", 73 | "\n", 74 | "2. 学习Git命令:https://www.runoob.com/git/git-tutorial.html" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "id": "0ebe3915-4424-433b-8d10-38c4819d7864", 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [] 84 | } 85 | ], 86 | "metadata": { 87 | "kernelspec": { 88 | "display_name": "Python 3 (ipykernel)", 89 | "language": "python", 90 | "name": "python3" 91 | }, 92 | "language_info": { 93 | "codemirror_mode": { 94 | "name": "ipython", 95 | "version": 3 96 | }, 97 | "file_extension": ".py", 98 | "mimetype": "text/x-python", 99 | "name": "python", 100 | "nbconvert_exporter": "python", 101 | "pygments_lexer": "ipython3", 102 | "version": "3.11.1" 103 | } 104 | }, 105 | "nbformat": 4, 106 | "nbformat_minor": 5 107 | } 108 | -------------------------------------------------------------------------------- /IntroductionCourse/19_dictionary.py: -------------------------------------------------------------------------------- 1 | # 元组tuple 2 | # example_tuple = ("张伟", "33") 3 | # 1)是python中一种不可变的数据类型。*由于元组不可变,添加、删除元素等统统不能操作。 4 | # 2)元组里面可以放多个元素。*与列表的区别:列表用方括号,元组用圆括号。 5 | # *元组和列表都可以通过索引,返回某个位置的元素 6 | a = (1, 2, 3) 7 | print(a[1]) 8 | b = [1, 2, 3] 9 | print(b[1]) 10 | 11 | # 19_ 12 | # 字典用于储存键值对:"键:值" / "key:value"。键有对应的值,键是用来查找值的。 13 | example_dictionary = {"小明": "13700000000", 14 | "小花": "13700000001"} 15 | # !键的类型必须是不可变的。 16 | # !值的类型可以是可变的也可以是不可变的,所以值的类型可以是列表或者字典。 17 | 18 | # 一、针对字典的方法 19 | # 1 .在字典名后面跟方括号,获取某个键的值 20 | xiaoming_tel = example_dictionary["小明"] 21 | # zhangwei_tel = example_dictionary2[("张伟", 33)] 22 | 23 | # 2.添加键值对/更新值 *取决于这个键是否已存在于字典 24 | example_dictionary["小刚"] = "18600000000" 25 | example_dictionary["小花"] = "18600000001" 26 | 27 | # 3.删除键值对 *键本身不存在会报错 28 | del example_dictionary["小明"] 29 | 30 | # 4.获取某个键是否已经存在于字典中,“键 in 字典”会返回一个布尔值 31 | print("小明" in example_dictionary) 32 | 33 | # 5.dict.items()返回键值对(键值对组成元组),dict.keys()返回键,dict.values()返回值 34 | # *字典放入if语句或for语句时,默认返回键 35 | # !for key, value in dict.items(): 36 | # 可以将字典里的键赋值给key,字典里的值赋值给value 37 | 38 | # 6.字典可以通过len函数,得到字典里有多少键值对 39 | len(example_dictionary) 40 | 41 | # 二、一些尝试 42 | # 1.字典也可以重新被赋值 43 | # slang_dict = {1: 1, 2: 2} 44 | # 2.可以打印列表,字典 45 | # print(slang_dict) 46 | # 3.sorted(字典名) 会对字典中的键排序并返回键。sorted()一般按照ASCII排序,对中文排序按照Unicode,所以看起来顺序杂乱。 47 | # print(sorted(slang_dict)) 48 | # 4.键值对中键的类型必须是不可变的; 而值的类型是可变的,所以值的类型除上述外还可以是list, dictionary 49 | # slang_dict[1] = {3: 3} 50 | # print(type(slang_dict[1])) 51 | 52 | 53 | # 结合input、字典、if判断,做一个查询流行语含义的电子词典程序 54 | slang_dict = {'觉醒年代': '《觉醒年代》首次以电视剧的形式回溯中国共产党的孕育和创立过程,生动再现中国近代历史的大变局,' 55 | '深刻讲述中国人民是怎样选择了中国共产党。该剧播出后广受好评,成为党史学习教育的生动教材。', 56 | 'YYDS': '“永远的神”的拼音缩写,用于表达对某人的高度敬佩和崇拜。2021年东京奥运会期间,不管是杨倩夺得首金,' 57 | '还是全红婵一场决赛跳出三个满分,或是“苏神”站上百米决赛跑道,全网齐喊“YYDS”,奥运期间一度刷屏。'} 58 | slang_dict[ 59 | '双减'] = '指进一步减轻义务教育阶段学生作业负担和校外培训负担。其目标是使学校教育教学质量和服务水平进一步提升,作业布置更加科学合理,' \ 60 | '学校课后服务基本满足学生需要,学生学习更好回归校园,校外培训机构培训行为全面规范。' 61 | # 此处在值中间换行由于没有花括号,需加\。 62 | # *以上两个语句,反映了PEP8代码风格,即当括号内参数很多,需要换行编写的时候。 63 | # 要么第一行放入参数,换行的参数与上一行的括号对齐;要么第一行不放入参数,即第一行最后字符是(,换行后有一个缩进 64 | slang_dict['破防'] = '原指在游戏中突破了对方的防御,使对方失去防御能力。现指因遇到一些事或看到一些信息后情感上受到很大冲击,内心深处被触动,心理防线被突破。' 65 | slang_dict['元宇宙'] = '源于小说《雪崩》的科幻概念,现指在XR(扩展现实)、数字孪生、区块链和AI(人工智能)等技术 '\ 66 | '推动下形成的虚实相融的互联网应用和社会生活形态。现阶段,元宇宙仍是一个不断演变、不断发展的概念。' \ 67 | 'Facebook(脸书)对外公布更名为“Meta”,该词即来源于“Metaverse”(元宇宙)。' 68 | slang_dict['绝绝子'] = '该词流行于某网络节目,节目中一些粉丝用“绝绝子”为选手加油。多用于赞美,表示“太绝了、太好了”。这个词引发了网友对网络语言的关注和讨论。' 69 | slang_dict['躺平'] = '该词指人在面对压力时,内心再无波澜,主动放弃,不做任何反抗。' \ 70 | '“躺平”更像是年轻人的一种解压和调整方式,是改变不了环境便改变心态的自我解脱。短暂“躺平”是为了积聚能量,更好地重新出发。' 71 | slang_dict['伤害性不高,侮辱性极强'] = '一段网络视频中,两名男子相互夹菜,而同桌的另一名女子则显得很孤单。于是有网友调侃“伤害性不高,侮辱性极强”。' \ 72 | '后被网友用来调侃某事虽然没有实质性危害,但是却令人很难堪。' 73 | slang_dict['我看不懂,但我大受震撼'] = '源自导演李安在纪录片《打扰伯格曼》(2013)里评价一部影视作品的话。现多用于表示自己对某件事情的不解、震惊。' 74 | slang_dict['强国有我'] = '源自建党百年天安门广场庆典上青年学子的庄严宣誓。“请党放心,强国有我”是青年一代对党和人民许下的庄重誓言,' \ 75 | '彰显着新时代中国青年的志气、骨气、底气。' 76 | 77 | query = input("请输入您想查询的流行语:") 78 | if query in slang_dict: 79 | print("您查询的" + query + "含义如下") 80 | print(slang_dict[query]) # 列表通过排序作为索引查找元素,而字典可以通过键查找值 81 | else: 82 | print("您查询的流行语暂未收录") 83 | print("当前本词典收录的词条数为:" + str(len(slang_dict)) + "条。") # 返回dict中键值对数量 84 | 85 | print(slang_dict[['YYDS', '觉醒年代']]) 86 | -------------------------------------------------------------------------------- /DataVisualization/6.73_statistics_basics_describe_numerical_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "4fcee408-d223-4ef9-a221-7b51e9ab3481", 6 | "metadata": {}, 7 | "source": [ 8 | "# **6.73_statistics_basics_describe_numerical_data**" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "1d061f6d-6d40-4122-85b3-3945c4f5dea2", 14 | "metadata": {}, 15 | "source": [ 16 | "### 分析维度 " 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "id": "e2558d97-21bd-421a-9ec1-96cbe86d4230", 22 | "metadata": {}, 23 | "source": [ 24 | "### 一、集中趋势指标" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "id": "87929fa6-644b-42f6-861b-f333a99cdebb", 30 | "metadata": {}, 31 | "source": [ 32 | "1. 平均数\n", 33 | "\n", 34 | " 表示数据相对集中较多的中心位置,平均数很容易受到极端值的影响\n", 35 | "\n", 36 | "2. 中位数\n", 37 | "\n", 38 | " 表示数据的中间位置,中位数不容易受到极端值的影响,有时会和平均数一起纳入统计\n", 39 | "\n", 40 | "3. 众数\n", 41 | "\n", 42 | " 表示数据最普遍的倾向" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "id": "5f6837c3-ad04-4b5a-be35-5e60431a44df", 48 | "metadata": {}, 49 | "source": [ 50 | "### 二、离散趋势指标" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "id": "082676bc-6943-45d7-bd22-f37b6edbde9d", 56 | "metadata": {}, 57 | "source": [ 58 | "1. 极差\n", 59 | "\n", 60 | " 表示数据波动的范围\n", 61 | "\n", 62 | "2. 方差/标准差\n", 63 | "\n", 64 | " 方差和标准差可以互相转换\n", 65 | "\n", 66 | " 平均数是用来描述集中趋势的,所以数值和平均数的差距,就是在描述偏离中心值的离散趋势\n", 67 | "\n", 68 | " 之所以要对各个数据和平均数的差进行平方,是因为我们只在乎值和平均数的距离,并不在乎这个距离是正距离还是负距离\n", 69 | "\n", 70 | "3. 四分位距\n", 71 | "\n", 72 | " 四分位距=第三四分位数-第一四分位数\n", 73 | "\n", 74 | " 表示中间一半数值的离散程度,越大说明数据越分散" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "id": "b4a3aa0d-881f-476f-b3f3-8791be9af497", 80 | "metadata": {}, 81 | "source": [ 82 | "### 三、分布形状指标" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "id": "c8366ffb-a0d6-44e5-8ca2-576ca030634d", 88 | "metadata": {}, 89 | "source": [ 90 | "1. 直方图\n", 91 | "\n", 92 | " 直方图常被用于发现数据的分布情况,包括偏态、峰度、异常值等信息\n", 93 | "\n", 94 | " 将数据按照一定的区间范围进行分组,计算每个区间内的数据数量,然后用矩形条表示每个区间内数据的数量\n", 95 | "\n", 96 | "2. 正态分布\n", 97 | "\n", 98 | "3. 偏态分布\n", 99 | "\n", 100 | " - 正偏态/右偏态\n", 101 | "\n", 102 | " 长尾巴在右边,说明大部分数值比平均值更低,比如:平均月薪,被高工资拉高了平均值\n", 103 | "\n", 104 | " - 负偏态/左偏态\n", 105 | " \n", 106 | " 长尾巴在左边,说明大部分数值比平均值更高。" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "id": "dc6cef85-c48e-4e96-9dd9-c243ed8c3d45", 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [] 116 | } 117 | ], 118 | "metadata": { 119 | "kernelspec": { 120 | "display_name": "Python 3 (ipykernel)", 121 | "language": "python", 122 | "name": "python3" 123 | }, 124 | "language_info": { 125 | "codemirror_mode": { 126 | "name": "ipython", 127 | "version": 3 128 | }, 129 | "file_extension": ".py", 130 | "mimetype": "text/x-python", 131 | "name": "python", 132 | "nbconvert_exporter": "python", 133 | "pygments_lexer": "ipython3", 134 | "version": "3.11.1" 135 | } 136 | }, 137 | "nbformat": 4, 138 | "nbformat_minor": 5 139 | } 140 | -------------------------------------------------------------------------------- /DataFormatAndReadData/3.22_dataformat_csv.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "c6289597-ef62-410e-a093-2c8e45523900", 6 | "metadata": {}, 7 | "source": [ 8 | "# **3.22_dataformat_csv**" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "23c40658-27a3-4fee-a3df-4c677bd51b24", 14 | "metadata": {}, 15 | "source": [ 16 | "数据分析师最喜欢的数据格式" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "id": "a645aba4-72cb-4cff-82f9-a3e7f52bdb78", 22 | "metadata": {}, 23 | "source": [ 24 | "与JSON一样,CSV也是纯文本文件,也就是说文字内容不存在粗体、下划线、字号、颜色等特征\n", 25 | "\n", 26 | "如果把逗号对齐,CSV的结构基本上就是一个表格" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "id": "d4df4290-d9c2-4646-9174-a267591a1f1a", 32 | "metadata": {}, 33 | "source": [ 34 | "### CSV" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "id": "be8b5bed-1b3d-449a-9006-6f011412be46", 40 | "metadata": {}, 41 | "source": [ 42 | "1. 概念\n", 43 | "\n", 44 | " CSV,全称是Comma-Separated Values,表示逗号分隔值,\n", 45 | "\n", 46 | "2. 结构\n", 47 | "\n", 48 | " 1)表头\n", 49 | "\n", 50 | " a.有表头\n", 51 | "\n", 52 | " CSV文件第一行,也就是展示列名那行,大部分情况下是表格头,包含了许多属性名\n", 53 | "\n", 54 | " b.无表头\n", 55 | "\n", 56 | " 也可以无表头,以数据直接开始\n", 57 | "\n", 58 | " 2)数据\n", 59 | " \n", 60 | " a.在表头下面,每一条数据都是独占一行的,因此当我们把CSV文件转换成DataFrame后,CSV的行与DataFrame的行之间能够直接对应上,\n", 61 | "\n", 62 | " b.每行数据里所包含的值的数量应该是相同的,逗号分隔符的数量也必须相同,但凡哪行多了或少了,说明那就不是一个合格有效的CSV\n", 63 | "\n", 64 | "3. 特殊情况\n", 65 | "\n", 66 | " 1)当数据值里面包含英文逗号,可以用引号围完整的值。里面的逗号,就不会被当成分隔符的逗号了\n", 67 | "\n", 68 | " 2)当某个值为空缺,可以让两个分隔逗号相邻,表示中间那个值不存在\n", 69 | "\n", 70 | "4. CSV与JSON对比\n", 71 | "\n", 72 | " CSV本身是一个非常规整的二维结构,能一眼就知道它所对应的表格长什么样子\n", 73 | "\n", 74 | " JSON则不同,它的结构能非常灵活,也可以层层嵌套,很难直观看出对应表格长什么样\n", 75 | "\n", 76 | " 因此,JSON是通用编程时受欢迎的数据结构,而CSV是数据分析时受欢迎的数据结构\n", 77 | "\n", 78 | "5. 编写CSV\n", 79 | "\n", 80 | " 打开代码编辑器或文本编辑器,按照CSV格式的规则写好后,把文件保存为以.csv结尾的文件即可\n", 81 | "\n", 82 | "6. **会编程的人用代码分析数据时,尽量会让源数据文件,以CSV、JSON等纯文本格式,或者sqlite等数据库文件格式**\n", 83 | "\n", 84 | "7. 优点\n", 85 | "\n", 86 | " 体积小,结构工整,容易让人理解,能非常直接地转换成表格。\n", 87 | " \n", 88 | " 可以用Excel软件去读取、修改或导出CSV。\n", 89 | "\n", 90 | "8. 读取\n", 91 | "\n", 92 | " 在实际的数据分析中,由于一般数据量比较大,动辄1G以上的CSV数据集是很常见的,\n", 93 | "\n", 94 | " 1)Excel\n", 95 | "\n", 96 | " 可能会卡住。因为Excel不止要展示数据,还要试图展示格式(虽然文件里可能就没有额外格式),还得加载一系列功能等\n", 97 | "\n", 98 | " 2)代码编辑器或者纯文本编辑器\n", 99 | "\n", 100 | " 3)更好的方法\n", 101 | "\n", 102 | " 用代码读取,转换成DataFrame,然后用Pandas库里的方法,想看几行看几行,而不用等海量的数据全部加载出来" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "id": "ac272783-ddb7-4c7d-9bc2-488e25a7257b", 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [] 112 | } 113 | ], 114 | "metadata": { 115 | "kernelspec": { 116 | "display_name": "Python 3 (ipykernel)", 117 | "language": "python", 118 | "name": "python3" 119 | }, 120 | "language_info": { 121 | "codemirror_mode": { 122 | "name": "ipython", 123 | "version": 3 124 | }, 125 | "file_extension": ".py", 126 | "mimetype": "text/x-python", 127 | "name": "python", 128 | "nbconvert_exporter": "python", 129 | "pygments_lexer": "ipython3", 130 | "version": "3.11.1" 131 | } 132 | }, 133 | "nbformat": 4, 134 | "nbformat_minor": 5 135 | } 136 | -------------------------------------------------------------------------------- /EvaluateAndCleanData/4.24_evaluate_data_criteria.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "aa04a785-c042-41ca-bcdb-41818e22f59f", 6 | "metadata": {}, 7 | "source": [ 8 | "# **4.24_evaluate_data_criteria**" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "253b81e8-5cd3-40e8-974c-91b917949044", 14 | "metadata": {}, 15 | "source": [ 16 | "评估数据时,我们主要看两方面:结构和内容" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "id": "dc9fe348-157e-4bc2-8425-692d3b2ab5f9", 22 | "metadata": {}, 23 | "source": [ 24 | "### 一、结构" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "id": "194a3554-7981-40fe-b464-b0de38a77b77", 30 | "metadata": {}, 31 | "source": [ 32 | "1. 结构方面需要清理的数据叫做乱数据;\n", 33 | " \n", 34 | " 结构方面不需要清理的数据叫做整洁数据。\n", 35 | "\n", 36 | "2. 整洁数据,根据埃德加科德的第三范式,包括以下三个特点:\n", 37 | "\n", 38 | " 1)每列是一个变量\n", 39 | "\n", 40 | " 2)每行是一个观察值\n", 41 | "\n", 42 | " 3)每个单元格是一个值\n", 43 | "\n", 44 | " 任何不符合以上三个特点的数据都是乱数据。\n", 45 | "\n", 46 | " 不整洁的数据存在多种多样的例子:比如存在合并单元格的表格或者每列存在多个变量。\n", 47 | "\n", 48 | " 很多方便人类理解的数据,并不是整洁数据;而整洁数据有时不太直观。因为整洁数据并不是方便人类理解,而是让代码更加高效处理和达成分析目的的" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "id": "16f005da-dc74-4e92-8584-126b510cc5a9", 54 | "metadata": {}, 55 | "source": [ 56 | "### 二、内容" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "id": "389bc017-9f20-45de-b82e-e8debfb88c81", 62 | "metadata": {}, 63 | "source": [ 64 | "#### (一)、概念" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "id": "3194d0a7-a375-4968-b688-8de05f1570dd", 70 | "metadata": {}, 71 | "source": [ 72 | "内容方面需要清理的数据叫脏数据\n", 73 | "\n", 74 | "内容方面不需要清理的数据叫干净数据。" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "id": "98310fee-8b5b-47c0-b291-42f7658a608d", 80 | "metadata": {}, 81 | "source": [ 82 | "#### (二)、脏数据可能存在问题" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "id": "e6c744fb-8392-442a-9f63-36f485a7ee8a", 88 | "metadata": {}, 89 | "source": [ 90 | "1. 丢失数据\n", 91 | "\n", 92 | " 有些值为空缺。\n", 93 | "\n", 94 | " 空缺值的影响大小取决于具体情况,有些时候允许某些列的数据不全。\n", 95 | "\n", 96 | " 但我们仍然要进行评估,否则可能导致错误的分析,比如:如果我们没有考虑到有同学缺考,存在成绩缺失,直接用总分数除以总人头数求平均,就会导致计算结果被缺失值拉低。\n", 97 | "\n", 98 | "2. 重复数据\n", 99 | "\n", 100 | " 即数据中有些观察值重复出现。\n", 101 | "\n", 102 | " 有些值的重复不是问题,比如说班级里学生的性别。\n", 103 | "\n", 104 | " 但假如数据中作为唯一表示符的属性存在重复,或者数据中存在有数据实例所有属性值均相同的情况,就是有问题\n", 105 | "\n", 106 | "3. 不一致数据\n", 107 | "\n", 108 | " 即不同数据值实际含义相同,或不同数据值实际指代统一目标\n", 109 | "\n", 110 | " 比如:数字单位不一致、全称和简写混用、中文数字和阿拉伯数字混用等等\n", 111 | "\n", 112 | "4. 无效或错误数据\n", 113 | "\n", 114 | " 脱离现实规则的数据都是无效数据,比如:负数的身高\n", 115 | "\n", 116 | " 虽然符合规则但并不准确的数据是错误数据,比如:一个成年人的体重是3公斤" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "id": "f5091953-4ddf-40db-8ff6-6e3d62770ae3", 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [] 126 | } 127 | ], 128 | "metadata": { 129 | "kernelspec": { 130 | "display_name": "Python 3 (ipykernel)", 131 | "language": "python", 132 | "name": "python3" 133 | }, 134 | "language_info": { 135 | "codemirror_mode": { 136 | "name": "ipython", 137 | "version": 3 138 | }, 139 | "file_extension": ".py", 140 | "mimetype": "text/x-python", 141 | "name": "python", 142 | "nbconvert_exporter": "python", 143 | "pygments_lexer": "ipython3", 144 | "version": "3.11.1" 145 | } 146 | }, 147 | "nbformat": 4, 148 | "nbformat_minor": 5 149 | } 150 | -------------------------------------------------------------------------------- /DataAnalysis/Practices/Iris/Iris.csv: -------------------------------------------------------------------------------- 1 | Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species 2 | 1,5.1,3.5,1.4,0.2,Iris-setosa 3 | 2,4.9,3.0,1.4,0.2,Iris-setosa 4 | 3,4.7,3.2,1.3,0.2,Iris-setosa 5 | 4,4.6,3.1,1.5,0.2,Iris-setosa 6 | 5,5.0,3.6,1.4,0.2,Iris-setosa 7 | 6,5.4,3.9,1.7,0.4,Iris-setosa 8 | 7,4.6,3.4,1.4,0.3,Iris-setosa 9 | 8,5.0,3.4,1.5,0.2,Iris-setosa 10 | 9,4.4,2.9,1.4,0.2,Iris-setosa 11 | 10,4.9,3.1,1.5,0.1,Iris-setosa 12 | 11,5.4,3.7,1.5,0.2,Iris-setosa 13 | 12,4.8,3.4,1.6,0.2,Iris-setosa 14 | 13,4.8,3.0,1.4,0.1,Iris-setosa 15 | 14,4.3,3.0,1.1,0.1,Iris-setosa 16 | 15,5.8,4.0,1.2,0.2,Iris-setosa 17 | 16,5.7,4.4,1.5,0.4,Iris-setosa 18 | 17,5.4,3.9,1.3,0.4,Iris-setosa 19 | 18,5.1,3.5,1.4,0.3,Iris-setosa 20 | 19,5.7,3.8,1.7,0.3,Iris-setosa 21 | 20,5.1,3.8,1.5,0.3,Iris-setosa 22 | 21,5.4,3.4,1.7,0.2,Iris-setosa 23 | 22,5.1,3.7,1.5,0.4,Iris-setosa 24 | 23,4.6,3.6,1.0,0.2,Iris-setosa 25 | 24,5.1,3.3,1.7,0.5,Iris-setosa 26 | 25,4.8,3.4,1.9,0.2,Iris-setosa 27 | 26,5.0,3.0,1.6,0.2,Iris-setosa 28 | 27,5.0,3.4,1.6,0.4,Iris-setosa 29 | 28,5.2,3.5,1.5,0.2,Iris-setosa 30 | 29,5.2,3.4,1.4,0.2,Iris-setosa 31 | 30,4.7,3.2,1.6,0.2,Iris-setosa 32 | 31,4.8,3.1,1.6,0.2,Iris-setosa 33 | 32,5.4,3.4,1.5,0.4,Iris-setosa 34 | 33,5.2,4.1,1.5,0.1,Iris-setosa 35 | 34,5.5,4.2,1.4,0.2,Iris-setosa 36 | 35,4.9,3.1,1.5,0.1,Iris-setosa 37 | 36,5.0,3.2,1.2,0.2,Iris-setosa 38 | 37,5.5,3.5,1.3,0.2,Iris-setosa 39 | 38,4.9,3.1,1.5,0.1,Iris-setosa 40 | 39,4.4,3.0,1.3,0.2,Iris-setosa 41 | 40,5.1,3.4,1.5,0.2,Iris-setosa 42 | 41,5.0,3.5,1.3,0.3,Iris-setosa 43 | 42,4.5,2.3,1.3,0.3,Iris-setosa 44 | 43,4.4,3.2,1.3,0.2,Iris-setosa 45 | 44,5.0,3.5,1.6,0.6,Iris-setosa 46 | 45,5.1,3.8,1.9,0.4,Iris-setosa 47 | 46,4.8,3.0,1.4,0.3,Iris-setosa 48 | 47,5.1,3.8,1.6,0.2,Iris-setosa 49 | 48,4.6,3.2,1.4,0.2,Iris-setosa 50 | 49,5.3,3.7,1.5,0.2,Iris-setosa 51 | 50,5.0,3.3,1.4,0.2,Iris-setosa 52 | 51,7.0,3.2,4.7,1.4,Iris-versicolor 53 | 52,6.4,3.2,4.5,1.5,Iris-versicolor 54 | 53,6.9,3.1,4.9,1.5,Iris-versicolor 55 | 54,5.5,2.3,4.0,1.3,Iris-versicolor 56 | 55,6.5,2.8,4.6,1.5,Iris-versicolor 57 | 56,5.7,2.8,4.5,1.3,Iris-versicolor 58 | 57,6.3,3.3,4.7,1.6,Iris-versicolor 59 | 58,4.9,2.4,3.3,1.0,Iris-versicolor 60 | 59,6.6,2.9,4.6,1.3,Iris-versicolor 61 | 60,5.2,2.7,3.9,1.4,Iris-versicolor 62 | 61,5.0,2.0,3.5,1.0,Iris-versicolor 63 | 62,5.9,3.0,4.2,1.5,Iris-versicolor 64 | 63,6.0,2.2,4.0,1.0,Iris-versicolor 65 | 64,6.1,2.9,4.7,1.4,Iris-versicolor 66 | 65,5.6,2.9,3.6,1.3,Iris-versicolor 67 | 66,6.7,3.1,4.4,1.4,Iris-versicolor 68 | 67,5.6,3.0,4.5,1.5,Iris-versicolor 69 | 68,5.8,2.7,4.1,1.0,Iris-versicolor 70 | 69,6.2,2.2,4.5,1.5,Iris-versicolor 71 | 70,5.6,2.5,3.9,1.1,Iris-versicolor 72 | 71,5.9,3.2,4.8,1.8,Iris-versicolor 73 | 72,6.1,2.8,4.0,1.3,Iris-versicolor 74 | 73,6.3,2.5,4.9,1.5,Iris-versicolor 75 | 74,6.1,2.8,4.7,1.2,Iris-versicolor 76 | 75,6.4,2.9,4.3,1.3,Iris-versicolor 77 | 76,6.6,3.0,4.4,1.4,Iris-versicolor 78 | 77,6.8,2.8,4.8,1.4,Iris-versicolor 79 | 78,6.7,3.0,5.0,1.7,Iris-versicolor 80 | 79,6.0,2.9,4.5,1.5,Iris-versicolor 81 | 80,5.7,2.6,3.5,1.0,Iris-versicolor 82 | 81,5.5,2.4,3.8,1.1,Iris-versicolor 83 | 82,5.5,2.4,3.7,1.0,Iris-versicolor 84 | 83,5.8,2.7,3.9,1.2,Iris-versicolor 85 | 84,6.0,2.7,5.1,1.6,Iris-versicolor 86 | 85,5.4,3.0,4.5,1.5,Iris-versicolor 87 | 86,6.0,3.4,4.5,1.6,Iris-versicolor 88 | 87,6.7,3.1,4.7,1.5,Iris-versicolor 89 | 88,6.3,2.3,4.4,1.3,Iris-versicolor 90 | 89,5.6,3.0,4.1,1.3,Iris-versicolor 91 | 90,5.5,2.5,4.0,1.3,Iris-versicolor 92 | 91,5.5,2.6,4.4,1.2,Iris-versicolor 93 | 92,6.1,3.0,4.6,1.4,Iris-versicolor 94 | 93,5.8,2.6,4.0,1.2,Iris-versicolor 95 | 94,5.0,2.3,3.3,1.0,Iris-versicolor 96 | 95,5.6,2.7,4.2,1.3,Iris-versicolor 97 | 96,5.7,3.0,4.2,1.2,Iris-versicolor 98 | 97,5.7,2.9,4.2,1.3,Iris-versicolor 99 | 98,6.2,2.9,4.3,1.3,Iris-versicolor 100 | 99,5.1,2.5,3.0,1.1,Iris-versicolor 101 | 100,5.7,2.8,4.1,1.3,Iris-versicolor 102 | -------------------------------------------------------------------------------- /DataVisualization/.ipynb_checkpoints/6.75_data_visualization_chart-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "215fa9a7-694e-47cb-97d3-3d8094d698e5", 6 | "metadata": {}, 7 | "source": [ 8 | "# **6.75_data_visualization_chart**" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "4e6bff90-9c03-4971-8745-76d15a177bca", 14 | "metadata": {}, 15 | "source": [ 16 | "可视化的作用不仅在于最后步骤。在数据分析前,图表能在我们自行探索数据的时候,帮助发现隐藏的关系、趋势、影响,高效找到下一步的分析方向。甚至在数据评估与清理步骤,可视化也能帮助我们直观发现异常数据。因此,数据可视化是一个万金油,它不仅是数据分析全流程中的一个步骤,更是一种方式,可以任何步骤里使用。" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "id": "e286ac92-2667-4709-ab84-db8bc947190d", 22 | "metadata": {}, 23 | "source": [ 24 | "如果我们的数据包含一个数值变量,即一维数据,可以绘制直方图、密度图、箱型图、小提琴图" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "id": "e699b0ad-0b25-4a8d-ad83-2bf4734c26a4", 30 | "metadata": {}, 31 | "source": [ 32 | "### 一、直方图" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "id": "8eb58329-c2b2-4f1e-9ec1-13a9d62e67c9", 38 | "metadata": {}, 39 | "source": [ 40 | "1. 特点\n", 41 | "\n", 42 | " 展示数据的频率分布\n", 43 | "\n", 44 | "2. 表示\n", 45 | "\n", 46 | " 横轴表示某数据范围,而纵轴表示个数" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "id": "d0b89c87-c55f-44e5-b39c-f45fc258a62a", 52 | "metadata": {}, 53 | "source": [ 54 | "### 二、密度图 " 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "id": "932e8357-b375-461c-9999-db3fdf596aa2", 60 | "metadata": {}, 61 | "source": [ 62 | "1. 特点\n", 63 | "\n", 64 | " 同样用来表示数据的分布,更容易看出分布形状\n", 65 | "\n", 66 | "2. 不同\n", 67 | "\n", 68 | " 不同于直方图用一个个条柱表示频率,密度图会用一条平滑的曲线\n", 69 | "\n", 70 | " 纵轴表示的是概率密度,纵轴的最大值不会超过1" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "id": "8fdb6866-4cab-48bf-98ed-9e1c026ac78f", 76 | "metadata": {}, 77 | "source": [ 78 | "### 三、箱型图" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "id": "b295cd77-022a-4224-b347-1ce29f8fd80a", 84 | "metadata": {}, 85 | "source": [ 86 | "1. 特点\n", 87 | "\n", 88 | " 看出中位数、上下四分位数、四分位距大小、分布是否对称、是否紧密,以及有没有异常值\n", 89 | "\n", 90 | "2. 表示\n", 91 | "\n", 92 | " 纵轴表示数据值;箱子的边界分别是第一和第三四分位数,所以箱子的长度就是四分位距;而箱子中间那条线是中位数;箱子上下的横线分别是上界和下界;上界和下界外的点是异常值。\n", 93 | "\n", 94 | "3. 上界&下界\n", 95 | "\n", 96 | " 上界 = (数据最大值 or 第三四分位数 + 1.5倍四分位距),其中的最小值\n", 97 | "\n", 98 | " 下界 = (数据最小值 or 第一四分位数 - 1.5倍四分位距),其中的最大值\n", 99 | "\n", 100 | " 如果有数据的值,(大于第三四分位数 + 1.5倍四分位距 or 小于第一四分位数 - 1.5倍四分位距),会被用单独的点表示出来,被算作异常值" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "id": "f093db28-7003-484c-a4b4-cd5c94baa3ce", 106 | "metadata": {}, 107 | "source": [ 108 | "### 四、小提琴图" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "id": "91f0da6d-4738-4b5a-9e41-642fd582bb2e", 114 | "metadata": {}, 115 | "source": [ 116 | "1. 特点\n", 117 | "\n", 118 | " 能同时展现箱型图和密度图的信息\n", 119 | "\n", 120 | "2. 表示\n", 121 | "\n", 122 | " 中间的小圆点表示中位数;黑色条的边界和箱型图的箱子一样,分别是第一和第三四分位数;而小提琴的长度,表示95%置信区间;小提琴的体型,对应了密度图,越胖的地方那个值频率越高。" 123 | ] 124 | } 125 | ], 126 | "metadata": { 127 | "kernelspec": { 128 | "display_name": "Python 3 (ipykernel)", 129 | "language": "python", 130 | "name": "python3" 131 | }, 132 | "language_info": { 133 | "codemirror_mode": { 134 | "name": "ipython", 135 | "version": 3 136 | }, 137 | "file_extension": ".py", 138 | "mimetype": "text/x-python", 139 | "name": "python", 140 | "nbconvert_exporter": "python", 141 | "pygments_lexer": "ipython3", 142 | "version": "3.11.1" 143 | } 144 | }, 145 | "nbformat": 4, 146 | "nbformat_minor": 5 147 | } 148 | -------------------------------------------------------------------------------- /DataAnalysis/Practices/Iris/Iris_cleaned.csv: -------------------------------------------------------------------------------- 1 | Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species 2 | 1,5.1,3.5,1.4,0.2,Iris-setosa 3 | 2,4.9,3.0,1.4,0.2,Iris-setosa 4 | 3,4.7,3.2,1.3,0.2,Iris-setosa 5 | 4,4.6,3.1,1.5,0.2,Iris-setosa 6 | 5,5.0,3.6,1.4,0.2,Iris-setosa 7 | 6,5.4,3.9,1.7,0.4,Iris-setosa 8 | 7,4.6,3.4,1.4,0.3,Iris-setosa 9 | 8,5.0,3.4,1.5,0.2,Iris-setosa 10 | 9,4.4,2.9,1.4,0.2,Iris-setosa 11 | 10,4.9,3.1,1.5,0.1,Iris-setosa 12 | 11,5.4,3.7,1.5,0.2,Iris-setosa 13 | 12,4.8,3.4,1.6,0.2,Iris-setosa 14 | 13,4.8,3.0,1.4,0.1,Iris-setosa 15 | 14,4.3,3.0,1.1,0.1,Iris-setosa 16 | 15,5.8,4.0,1.2,0.2,Iris-setosa 17 | 16,5.7,4.4,1.5,0.4,Iris-setosa 18 | 17,5.4,3.9,1.3,0.4,Iris-setosa 19 | 18,5.1,3.5,1.4,0.3,Iris-setosa 20 | 19,5.7,3.8,1.7,0.3,Iris-setosa 21 | 20,5.1,3.8,1.5,0.3,Iris-setosa 22 | 21,5.4,3.4,1.7,0.2,Iris-setosa 23 | 22,5.1,3.7,1.5,0.4,Iris-setosa 24 | 23,4.6,3.6,1.0,0.2,Iris-setosa 25 | 24,5.1,3.3,1.7,0.5,Iris-setosa 26 | 25,4.8,3.4,1.9,0.2,Iris-setosa 27 | 26,5.0,3.0,1.6,0.2,Iris-setosa 28 | 27,5.0,3.4,1.6,0.4,Iris-setosa 29 | 28,5.2,3.5,1.5,0.2,Iris-setosa 30 | 29,5.2,3.4,1.4,0.2,Iris-setosa 31 | 30,4.7,3.2,1.6,0.2,Iris-setosa 32 | 31,4.8,3.1,1.6,0.2,Iris-setosa 33 | 32,5.4,3.4,1.5,0.4,Iris-setosa 34 | 33,5.2,4.1,1.5,0.1,Iris-setosa 35 | 34,5.5,4.2,1.4,0.2,Iris-setosa 36 | 35,4.9,3.1,1.5,0.1,Iris-setosa 37 | 36,5.0,3.2,1.2,0.2,Iris-setosa 38 | 37,5.5,3.5,1.3,0.2,Iris-setosa 39 | 38,4.9,3.1,1.5,0.1,Iris-setosa 40 | 39,4.4,3.0,1.3,0.2,Iris-setosa 41 | 40,5.1,3.4,1.5,0.2,Iris-setosa 42 | 41,5.0,3.5,1.3,0.3,Iris-setosa 43 | 42,4.5,2.3,1.3,0.3,Iris-setosa 44 | 43,4.4,3.2,1.3,0.2,Iris-setosa 45 | 44,5.0,3.5,1.6,0.6,Iris-setosa 46 | 45,5.1,3.8,1.9,0.4,Iris-setosa 47 | 46,4.8,3.0,1.4,0.3,Iris-setosa 48 | 47,5.1,3.8,1.6,0.2,Iris-setosa 49 | 48,4.6,3.2,1.4,0.2,Iris-setosa 50 | 49,5.3,3.7,1.5,0.2,Iris-setosa 51 | 50,5.0,3.3,1.4,0.2,Iris-setosa 52 | 51,7.0,3.2,4.7,1.4,Iris-versicolor 53 | 52,6.4,3.2,4.5,1.5,Iris-versicolor 54 | 53,6.9,3.1,4.9,1.5,Iris-versicolor 55 | 54,5.5,2.3,4.0,1.3,Iris-versicolor 56 | 55,6.5,2.8,4.6,1.5,Iris-versicolor 57 | 56,5.7,2.8,4.5,1.3,Iris-versicolor 58 | 57,6.3,3.3,4.7,1.6,Iris-versicolor 59 | 58,4.9,2.4,3.3,1.0,Iris-versicolor 60 | 59,6.6,2.9,4.6,1.3,Iris-versicolor 61 | 60,5.2,2.7,3.9,1.4,Iris-versicolor 62 | 61,5.0,2.0,3.5,1.0,Iris-versicolor 63 | 62,5.9,3.0,4.2,1.5,Iris-versicolor 64 | 63,6.0,2.2,4.0,1.0,Iris-versicolor 65 | 64,6.1,2.9,4.7,1.4,Iris-versicolor 66 | 65,5.6,2.9,3.6,1.3,Iris-versicolor 67 | 66,6.7,3.1,4.4,1.4,Iris-versicolor 68 | 67,5.6,3.0,4.5,1.5,Iris-versicolor 69 | 68,5.8,2.7,4.1,1.0,Iris-versicolor 70 | 69,6.2,2.2,4.5,1.5,Iris-versicolor 71 | 70,5.6,2.5,3.9,1.1,Iris-versicolor 72 | 71,5.9,3.2,4.8,1.8,Iris-versicolor 73 | 72,6.1,2.8,4.0,1.3,Iris-versicolor 74 | 73,6.3,2.5,4.9,1.5,Iris-versicolor 75 | 74,6.1,2.8,4.7,1.2,Iris-versicolor 76 | 75,6.4,2.9,4.3,1.3,Iris-versicolor 77 | 76,6.6,3.0,4.4,1.4,Iris-versicolor 78 | 77,6.8,2.8,4.8,1.4,Iris-versicolor 79 | 78,6.7,3.0,5.0,1.7,Iris-versicolor 80 | 79,6.0,2.9,4.5,1.5,Iris-versicolor 81 | 80,5.7,2.6,3.5,1.0,Iris-versicolor 82 | 81,5.5,2.4,3.8,1.1,Iris-versicolor 83 | 82,5.5,2.4,3.7,1.0,Iris-versicolor 84 | 83,5.8,2.7,3.9,1.2,Iris-versicolor 85 | 84,6.0,2.7,5.1,1.6,Iris-versicolor 86 | 85,5.4,3.0,4.5,1.5,Iris-versicolor 87 | 86,6.0,3.4,4.5,1.6,Iris-versicolor 88 | 87,6.7,3.1,4.7,1.5,Iris-versicolor 89 | 88,6.3,2.3,4.4,1.3,Iris-versicolor 90 | 89,5.6,3.0,4.1,1.3,Iris-versicolor 91 | 90,5.5,2.5,4.0,1.3,Iris-versicolor 92 | 91,5.5,2.6,4.4,1.2,Iris-versicolor 93 | 92,6.1,3.0,4.6,1.4,Iris-versicolor 94 | 93,5.8,2.6,4.0,1.2,Iris-versicolor 95 | 94,5.0,2.3,3.3,1.0,Iris-versicolor 96 | 95,5.6,2.7,4.2,1.3,Iris-versicolor 97 | 96,5.7,3.0,4.2,1.2,Iris-versicolor 98 | 97,5.7,2.9,4.2,1.3,Iris-versicolor 99 | 98,6.2,2.9,4.3,1.3,Iris-versicolor 100 | 99,5.1,2.5,3.0,1.1,Iris-versicolor 101 | 100,5.7,2.8,4.1,1.3,Iris-versicolor 102 | -------------------------------------------------------------------------------- /DataVisualization/6.75_data_visualization_chart.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "215fa9a7-694e-47cb-97d3-3d8094d698e5", 6 | "metadata": {}, 7 | "source": [ 8 | "# **6.75_data_visualization_chart**" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "4e6bff90-9c03-4971-8745-76d15a177bca", 14 | "metadata": {}, 15 | "source": [ 16 | "可视化的作用不仅在于最后步骤。在数据分析前,图表能在我们自行探索数据的时候,帮助发现隐藏的关系、趋势、影响,高效找到下一步的分析方向。甚至在数据评估与清理步骤,可视化也能帮助我们直观发现异常数据。因此,数据可视化是一个万金油,它不仅是数据分析全流程中的一个步骤,更是一种方式,可以任何步骤里使用。" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "id": "e286ac92-2667-4709-ab84-db8bc947190d", 22 | "metadata": {}, 23 | "source": [ 24 | "如果我们的数据包含一个数值变量,可以绘制直方图、密度图、箱型图、小提琴图" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "id": "e699b0ad-0b25-4a8d-ad83-2bf4734c26a4", 30 | "metadata": {}, 31 | "source": [ 32 | "### 一、直方图" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "id": "8eb58329-c2b2-4f1e-9ec1-13a9d62e67c9", 38 | "metadata": {}, 39 | "source": [ 40 | "1. 特点\n", 41 | "\n", 42 | " 展示数据的频率分布\n", 43 | "\n", 44 | "2. 表示\n", 45 | "\n", 46 | " 横轴表示某数据范围,而纵轴表示个数" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "id": "d0b89c87-c55f-44e5-b39c-f45fc258a62a", 52 | "metadata": {}, 53 | "source": [ 54 | "### 二、密度图(Kernel Density)" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "id": "932e8357-b375-461c-9999-db3fdf596aa2", 60 | "metadata": {}, 61 | "source": [ 62 | "1. 特点\n", 63 | "\n", 64 | " 同样用来表示数据的分布,更容易看出分布形状\n", 65 | "\n", 66 | "2. 不同\n", 67 | "\n", 68 | " 不同于直方图用一个个条柱表示频率,密度图会用一条平滑的曲线\n", 69 | "\n", 70 | " 纵轴表示的是概率密度,纵轴的最大值不会超过1" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "id": "8fdb6866-4cab-48bf-98ed-9e1c026ac78f", 76 | "metadata": {}, 77 | "source": [ 78 | "### 三、箱型图" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "id": "b295cd77-022a-4224-b347-1ce29f8fd80a", 84 | "metadata": {}, 85 | "source": [ 86 | "1. 特点\n", 87 | "\n", 88 | " 看出中位数、上下四分位数、四分位距大小、分布是否对称、是否紧密,以及有没有异常值\n", 89 | "\n", 90 | "2. 表示\n", 91 | "\n", 92 | " 纵轴表示数据值;箱子的边界分别是第一和第三四分位数,所以箱子的长度就是四分位距;而箱子中间那条线是中位数;箱子上下的横线分别是上界和下界;上界和下界外的点是异常值。\n", 93 | "\n", 94 | "3. 上界&下界\n", 95 | "\n", 96 | " 上界 = (数据最大值 or 第三四分位数 + 1.5倍四分位距),其中的最小值\n", 97 | "\n", 98 | " 下界 = (数据最小值 or 第一四分位数 - 1.5倍四分位距),其中的最大值\n", 99 | "\n", 100 | " 如果有数据的值,(大于第三四分位数 + 1.5倍四分位距 or 小于第一四分位数 - 1.5倍四分位距),会被用单独的点表示出来,被算作异常值" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "id": "f093db28-7003-484c-a4b4-cd5c94baa3ce", 106 | "metadata": {}, 107 | "source": [ 108 | "### 四、小提琴图" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "id": "91f0da6d-4738-4b5a-9e41-642fd582bb2e", 114 | "metadata": {}, 115 | "source": [ 116 | "1. 特点\n", 117 | "\n", 118 | " 能同时展现箱型图和密度图的信息\n", 119 | "\n", 120 | "2. 表示\n", 121 | "\n", 122 | " 中间的小圆点表示中位数;黑色条的边界和箱型图的箱子一样,分别是第一和第三四分位数;而小提琴的长度,表示95%置信区间;小提琴的体型,对应了密度图,越胖的地方那个值频率越高。" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "id": "c9729313-ef27-4637-93c3-88ede9aa347c", 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [] 132 | } 133 | ], 134 | "metadata": { 135 | "kernelspec": { 136 | "display_name": "Python 3 (ipykernel)", 137 | "language": "python", 138 | "name": "python3" 139 | }, 140 | "language_info": { 141 | "codemirror_mode": { 142 | "name": "ipython", 143 | "version": 3 144 | }, 145 | "file_extension": ".py", 146 | "mimetype": "text/x-python", 147 | "name": "python", 148 | "nbconvert_exporter": "python", 149 | "pygments_lexer": "ipython3", 150 | "version": "3.11.1" 151 | } 152 | }, 153 | "nbformat": 4, 154 | "nbformat_minor": 5 155 | } 156 | -------------------------------------------------------------------------------- /DataFormatAndReadData/3.19_retrieve_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "228a0610-2bd6-4e0b-b234-a767b0875162", 6 | "metadata": {}, 7 | "source": [ 8 | "# **3.19_retrieve_data**" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "b5d6d99c-eafe-4082-b871-219a1a1eeb34", 14 | "metadata": {}, 15 | "source": [ 16 | "***数据分析流程:获取数据、读取数据、评估数据、清洗数据、整理数据、分析数据、可视化数据***" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "id": "cbd33fc8-c1a5-4d3f-a4a4-aa1cef8893fb", 22 | "metadata": {}, 23 | "source": [ 24 | "### 一、获取私密数据" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "id": "2fc614e6-5a30-40fe-9776-8e8459c689c0", 30 | "metadata": {}, 31 | "source": [ 32 | "获取私密数据没有通用方法,方法取决于具体情况 " 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "id": "d85547ae-1a47-4611-90b1-973b0050ddd0", 38 | "metadata": {}, 39 | "source": [ 40 | "### 二、获取公开数据" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "id": "f40af378-7800-4211-9911-ea8c6a4573c9", 46 | "metadata": {}, 47 | "source": [ 48 | "#### (一)、下载" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "id": "4eae05e9-a5bd-4868-b1ca-c0367a21682d", 54 | "metadata": {}, 55 | "source": [ 56 | "1. 公开数据集" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "id": "ca1a775c-d246-4af1-b99e-7663caaab80f", 62 | "metadata": {}, 63 | "source": [ 64 | " 1)网络上有一些提供公开数据集的网站,可以在后续掌握分析技巧后,探索和下载感兴趣的数据集。\n", 65 | "\n", 66 | " 飞桨(百度旗下深度学习平台)数据集:https://aistudio.baidu.com/aistudio/datasetoverview\n", 67 | "\n", 68 | " 天池(阿里云旗下开发者竞赛平台) :https://tianchi.aliyun.com/dataset/\n", 69 | "\n", 70 | " 和鲸社区(数据科学开源社区)数据集:https://www.heywhale.com/home/dataset\n", 71 | "\n", 72 | " 2)课程配套资料里也整理好了很多公开数据集" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "id": "fa169a62-d56b-488f-8939-0fbf83f1e3ce", 78 | "metadata": {}, 79 | "source": [ 80 | "#### (二)、爬虫" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "id": "ac69d491-5dae-408c-b509-eec0829a8699", 86 | "metadata": {}, 87 | "source": [ 88 | "获取数据的过程\n", 89 | "\n", 90 | "1. 获取网页内容\n", 91 | "\n", 92 | " 通过发送请求,获取网页源代码。\n", 93 | "\n", 94 | "2. 解析网页内容\n", 95 | "\n", 96 | " 解析源代码内容,提取出想要的内容。这些数据就可以作为后续分析的原料了..." 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "id": "c66cd5fb-0401-4f77-a1d5-5f830fa7608c", 102 | "metadata": {}, 103 | "source": [ 104 | "爬虫的红线\n", 105 | "\n", 106 | "1. 不要爬取公民隐私数据\n", 107 | "\n", 108 | "2. 不要爬取受著作权保护的内容\n", 109 | "\n", 110 | "3. 不要爬取国家事务、国防建设、尖端科学技术领域的计算机系统等" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "id": "17bc7aaf-ab0e-45e7-adaa-4ef95e8d6210", 116 | "metadata": {}, 117 | "source": [ 118 | "#### (三)、API" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "id": "73bb4518-3872-4639-9ab2-ad4e014c62f3", 124 | "metadata": {}, 125 | "source": [ 126 | "1. 概念\n", 127 | " \n", 128 | " API, 全称是Application Programming Interface, 表示应用程序编程接口。\n", 129 | "\n", 130 | " API定义了两个程序之间的服务合约,即双方是如何使用请求和响应来进行通讯的\n", 131 | "\n", 132 | "2. 爬虫和API获取数据的区别\n", 133 | "\n", 134 | " 当我们爬虫时,发送请求后,获取的是网页源码,但网页源码本身是用来给浏览器渲染的,而不是直接的信息。要从中寻找特定数据,还要解析网页源码\n", 135 | "\n", 136 | " 如果网站直接提供给我们API,我们就能按照对方规定好的服务合约,根据规则发送请求,然后直接获得想要的数据,不需要经过解析源码这一步骤\n", 137 | "\n", 138 | " 因此,当一个网站提供了公开API时,通过API而不是爬虫去获取数据,肯定是更高效的方法" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "id": "ba518bc6-4c64-416a-a483-418d4927178b", 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [] 148 | } 149 | ], 150 | "metadata": { 151 | "kernelspec": { 152 | "display_name": "Python 3 (ipykernel)", 153 | "language": "python", 154 | "name": "python3" 155 | }, 156 | "language_info": { 157 | "codemirror_mode": { 158 | "name": "ipython", 159 | "version": 3 160 | }, 161 | "file_extension": ".py", 162 | "mimetype": "text/x-python", 163 | "name": "python", 164 | "nbconvert_exporter": "python", 165 | "pygments_lexer": "ipython3", 166 | "version": "3.11.1" 167 | } 168 | }, 169 | "nbformat": 4, 170 | "nbformat_minor": 5 171 | } 172 | -------------------------------------------------------------------------------- /DataFormatAndReadData/3.20_dataformat_json.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "683c2d6c-9d56-4440-9e3c-c73577a17c54", 6 | "metadata": {}, 7 | "source": [ 8 | "# **3.20_dataformat_json**" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "7d204c59-2076-45ac-804d-ebdc5e81458f", 14 | "metadata": {}, 15 | "source": [ 16 | "程序员非常喜欢的数据格式" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "id": "108254ed-91cf-4735-90f6-0df99b524688", 22 | "metadata": {}, 23 | "source": [ 24 | "### 一、数据格式" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "id": "683f808c-2be0-4c3b-a0ed-168de4b3824d", 30 | "metadata": {}, 31 | "source": [ 32 | "不同文件格式有不同的读取方法,一般通过文件后缀来分辨文件格式\n", 33 | "\n", 34 | "**文件名后缀只是文件名的一部分,更改后缀并不改变文件里面的内容,因此更改后缀不影响实际的文件格式,**\n", 35 | "\n", 36 | "**文件名后缀会影响电脑选择用什么软件去打开它**\n", 37 | "\n", 38 | "当我们聊数据格式时,不仅聊的是这个文件以什么后缀结尾,更重要的是它里面的内容遵循怎样的格式规则" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "id": "a90b43e2-ef53-4354-b4d9-b4cc7d595ded", 44 | "metadata": {}, 45 | "source": [ 46 | "### 二、JSON" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "id": "8f5659b4-10fa-49db-92cb-333fd5dee9a5", 52 | "metadata": {}, 53 | "source": [ 54 | "1. 概念\n", 55 | "\n", 56 | " JSON,全称为JaveScript Object Notation,意思是JavaScript的对象表示法,可见这种数据格式和JavaScript的语法是有些关联的\n", 57 | "\n", 58 | " JSON,与Python的字典或列表,有非常类似之处;并且被无数主流变成语言支持\n", 59 | "\n", 60 | "2. 优点\n", 61 | "\n", 62 | " 1)用API获得的数据,很多时候数据都是一JSON格式进行返回的\n", 63 | "\n", 64 | " 2)占用体积小\n", 65 | "\n", 66 | " 3)非常容易被转换成程序语言自己的结构\n", 67 | "\n", 68 | "3. 数据结构\n", 69 | "\n", 70 | " JSON对象和JSON数组,可以分别被转换成Python字典和列表\n", 71 | "\n", 72 | " 1)JSON对象\n", 73 | "\n", 74 | " JSON对象,以大括号开头和结尾,然后里面都是键值对,每个键值对之间用逗号进行分隔。\n", 75 | "\n", 76 | "```JSON\n", 77 | "{\n", 78 | " \"id\": \"1\", \n", 79 | " \"type\": \"article\", \n", 80 | " \"title\": \"working with JSON data\", \n", 81 | " \"created\": \"2099-12-18T14:56:29.000Z\"\n", 82 | "}\n", 83 | "```\n", 84 | "\n", 85 | " 2)JSON数组\n", 86 | "\n", 87 | " JSON数组,以中括号开头和结尾,然后里面是一个个值,每个值之间用逗号进行分隔。\n", 88 | "\n", 89 | "```JSON\n", 90 | "[\n", 91 | " {\n", 92 | " \"title\": \"A Light in the Attic\"\n", 93 | " \"price\": \"£51.77\"\n", 94 | " },\n", 95 | " {\n", 96 | " \"title\": \"Tipping the Velvet\"\n", 97 | " \"price\": \"£53.74\"\n", 98 | " }\n", 99 | "]\n", 100 | "```\n", 101 | "\n", 102 | "4. JSON值的类型,需要属于以下几种:\n", 103 | "\n", 104 | " 1)字符串 \"star\"\n", 105 | "\n", 106 | " 2)数字 31\n", 107 | "\n", 108 | " 3)布尔值 true\n", 109 | "\n", 110 | " 4)数组 [\"hi\", 7] \n", 111 | "\n", 112 | " 5)对象 {\"age\": 25}\n", 113 | "\n", 114 | " 6)空值 null\n", 115 | "\n", 116 | " 支持嵌套, 数组里面的值可以是对象,对象里的值可以是数组\n", 117 | "\n", 118 | "5. 与Python语法的区别\n", 119 | "\n", 120 | " 1)Python字典,可以用整数等不可变数据类型作为键;但在JSON对象里,只能是字符串作为键,不能是其他类型。\n", 121 | "\n", 122 | " 2)JSON对象的键,不能存在重复,因为值要靠键提取;Python字典里,键重复不会报错,但在有两个键相同的情况下,后来赋给键的值将成为键的真实值\n", 123 | "\n", 124 | " 3)JSON里,字符串必须被双引号包围,不能用单引号\n", 125 | "\n", 126 | " 4)Python的布尔值,都是以大写开头;因为本质是JavaScript对象,而JavaScript的布尔值是以小写开头,JSON里布尔值也是以小写开头\n", 127 | "\n", 128 | " 5)JSON里空值是null\n", 129 | "\n", 130 | "6. 数据转换\n", 131 | "\n", 132 | " 因为JSON和Python的字典或列表还是存在差别的,因此获取JSON数据后,要进行解析+转换,才能去分析数据" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "id": "f21ca04e-3062-417e-8346-3b28979b5da4", 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [] 142 | } 143 | ], 144 | "metadata": { 145 | "kernelspec": { 146 | "display_name": "Python 3 (ipykernel)", 147 | "language": "python", 148 | "name": "python3" 149 | }, 150 | "language_info": { 151 | "codemirror_mode": { 152 | "name": "ipython", 153 | "version": 3 154 | }, 155 | "file_extension": ".py", 156 | "mimetype": "text/x-python", 157 | "name": "python", 158 | "nbconvert_exporter": "python", 159 | "pygments_lexer": "ipython3", 160 | "version": "3.11.1" 161 | } 162 | }, 163 | "nbformat": 4, 164 | "nbformat_minor": 5 165 | } 166 | -------------------------------------------------------------------------------- /DataVisualization/6.72_Statistics_basics.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "83a62396-c020-419b-882c-dade02c2b35c", 6 | "metadata": {}, 7 | "source": [ 8 | "# **6.72_Statistics_basics**" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "e6d56090-5757-48fc-8943-8ee45bf51842", 14 | "metadata": {}, 15 | "source": [ 16 | "我们用Python做数据分析时,要用代码编写统计分析方法,运用在数据上。\n", 17 | "\n", 18 | "数据分析和统计学是无法分隔的。\n", 19 | "\n", 20 | "统计学的本质是对数据进行描述和推断,这一节主要涉及描述统计学知识。\n", 21 | "\n", 22 | "描述,指的是对数据提供特征的概述。" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "id": "616ffa59-7938-4518-824c-846fa63745c7", 28 | "metadata": {}, 29 | "source": [ 30 | "### 一、数据的分类" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "id": "b89a6e7a-8057-4489-918d-ce4f1be148d9", 36 | "metadata": {}, 37 | "source": [ 38 | "#### (一)、分类数据" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "id": "61b0a71e-e16b-4241-b209-e6444b428306", 44 | "metadata": {}, 45 | "source": [ 46 | "分类数据指的是包含有限数量的不同类别的数据。" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "id": "5cc599d3-71c2-4620-b5d9-882fe50ad0a5", 52 | "metadata": {}, 53 | "source": [ 54 | "1. 定序数据\n", 55 | "\n", 56 | " 表示数据是可以有顺序的,比如:金银铜可以按顺序排\n", 57 | "\n", 58 | "2. 定类数据\n", 59 | "\n", 60 | " 表示数据没有顺序,比如:狗子的种类" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "id": "f38a0d7d-114b-4160-a083-801d53ad188f", 66 | "metadata": {}, 67 | "source": [ 68 | "#### (二)、数值数据" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "id": "4731d181-8fa2-4c29-83b0-5b0b6a92e845", 74 | "metadata": {}, 75 | "source": [ 76 | "数值数据,指的是测量出的观测值,是个具体的数值,对它进行求和或求平均值等数学运算是有意义的。" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "id": "23446ba7-cfc2-47c6-bb1a-627eb6f2f1d5", 82 | "metadata": {}, 83 | "source": [ 84 | "1. 连续数据\n", 85 | "\n", 86 | " 表示数据没有最小的表示单位,两个数值之间可以取无数不同的值\n", 87 | "\n", 88 | "2. 离散数据\n", 89 | "\n", 90 | " 表示数据只能以整数或自然数为单位" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "id": "f4eb4a30-4cc4-498d-a074-d1400fe0fad6", 96 | "metadata": {}, 97 | "source": [ 98 | "### 二、分析维度" 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "id": "d470619e-78d8-4901-a65b-b306cbffc2bb", 104 | "metadata": {}, 105 | "source": [ 106 | "数值数据,通常是分析的重点,可以有三个分析维度。" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "id": "77c70253-9000-407c-9faf-930c4cb80428", 112 | "metadata": {}, 113 | "source": [ 114 | "#### (一)、集中趋势" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "id": "5db46a64-e3eb-4442-9c58-c1d772c85207", 120 | "metadata": {}, 121 | "source": [ 122 | "看的是数据集中在何处" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "id": "a76fed2e-1b23-4255-8c0a-f558d602f2cd", 128 | "metadata": {}, 129 | "source": [ 130 | "#### (二)、离散趋势" 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "id": "42f5aa56-81a6-4bd0-885f-86439c4082a3", 136 | "metadata": {}, 137 | "source": [ 138 | "看的是数据偏离中心的散布情况" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "id": "05201be9-10e5-48b8-beff-767dbe12f6ec", 144 | "metadata": {}, 145 | "source": [ 146 | "#### (三)、分布形状" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "id": "54e84985-7c6e-4f49-8912-6d4b9f74e0ab", 152 | "metadata": {}, 153 | "source": [ 154 | "看的是分布的对称程度,峰值高低等等情况" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "id": "77aad5a2-ad27-45ce-bc62-3725672fcd7c", 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [] 164 | } 165 | ], 166 | "metadata": { 167 | "kernelspec": { 168 | "display_name": "Python 3 (ipykernel)", 169 | "language": "python", 170 | "name": "python3" 171 | }, 172 | "language_info": { 173 | "codemirror_mode": { 174 | "name": "ipython", 175 | "version": 3 176 | }, 177 | "file_extension": ".py", 178 | "mimetype": "text/x-python", 179 | "name": "python", 180 | "nbconvert_exporter": "python", 181 | "pygments_lexer": "ipython3", 182 | "version": "3.11.1" 183 | } 184 | }, 185 | "nbformat": 4, 186 | "nbformat_minor": 5 187 | } 188 | -------------------------------------------------------------------------------- /DataVisualization/.ipynb_checkpoints/6.72_Statistics_basics-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "83a62396-c020-419b-882c-dade02c2b35c", 6 | "metadata": {}, 7 | "source": [ 8 | "6.72_Statistics_basics" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "e6d56090-5757-48fc-8943-8ee45bf51842", 14 | "metadata": {}, 15 | "source": [ 16 | "我们用Python做数据分析时,要用代码编写统计分析方法,运用在数据上。\n", 17 | "\n", 18 | "数据分析和统计学是无法分隔的。\n", 19 | "\n", 20 | "统计学的本质是对数据进行描述和推断,这一节主要涉及描述统计学知识。\n", 21 | "\n", 22 | "描述,指的是对数据提供特征的概述。" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "id": "616ffa59-7938-4518-824c-846fa63745c7", 28 | "metadata": {}, 29 | "source": [ 30 | "### 一、数据的分类" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "id": "b89a6e7a-8057-4489-918d-ce4f1be148d9", 36 | "metadata": {}, 37 | "source": [ 38 | "#### (一)、分类数据" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "id": "61b0a71e-e16b-4241-b209-e6444b428306", 44 | "metadata": {}, 45 | "source": [ 46 | "分类数据指的是包含有限数量的不同类别的数据。" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "id": "5cc599d3-71c2-4620-b5d9-882fe50ad0a5", 52 | "metadata": {}, 53 | "source": [ 54 | "1. 定序数据\n", 55 | "\n", 56 | " 表示数据是可以有顺序的,比如:金银铜可以按顺序排\n", 57 | "\n", 58 | "2. 定类数据\n", 59 | "\n", 60 | " 表示数据没有顺序,比如:狗子的种类" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "id": "f38a0d7d-114b-4160-a083-801d53ad188f", 66 | "metadata": {}, 67 | "source": [ 68 | "#### (二)、数值数据" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "id": "4731d181-8fa2-4c29-83b0-5b0b6a92e845", 74 | "metadata": {}, 75 | "source": [ 76 | "数值数据,指的是测量出的观测值,是个具体的数值,对它进行求和或求平均值等数学运算是有意义的。" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "id": "23446ba7-cfc2-47c6-bb1a-627eb6f2f1d5", 82 | "metadata": {}, 83 | "source": [ 84 | "1. 连续数据\n", 85 | "\n", 86 | " 表示数据没有最小的表示单位,两个数值之间可以取无数不同的值\n", 87 | "\n", 88 | "2. 离散数据\n", 89 | "\n", 90 | " 表示数据只能以整数或自然数为单位" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "id": "f4eb4a30-4cc4-498d-a074-d1400fe0fad6", 96 | "metadata": {}, 97 | "source": [ 98 | "### 二、分析维度" 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "id": "d470619e-78d8-4901-a65b-b306cbffc2bb", 104 | "metadata": {}, 105 | "source": [ 106 | "数值数据,通常是分析的重点,可以有三个分析维度。" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "id": "77c70253-9000-407c-9faf-930c4cb80428", 112 | "metadata": {}, 113 | "source": [ 114 | "#### (一)、集中趋势" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "id": "5db46a64-e3eb-4442-9c58-c1d772c85207", 120 | "metadata": {}, 121 | "source": [ 122 | "看的是数据集中在何处" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "id": "a76fed2e-1b23-4255-8c0a-f558d602f2cd", 128 | "metadata": {}, 129 | "source": [ 130 | "#### (二)、离散趋势" 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "id": "42f5aa56-81a6-4bd0-885f-86439c4082a3", 136 | "metadata": {}, 137 | "source": [ 138 | "看的是数据偏离中心的散布情况" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "id": "05201be9-10e5-48b8-beff-767dbe12f6ec", 144 | "metadata": {}, 145 | "source": [ 146 | "#### (三)、分布形状" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "id": "54e84985-7c6e-4f49-8912-6d4b9f74e0ab", 152 | "metadata": {}, 153 | "source": [ 154 | "看的是分布的对称程度,峰值高低等等情况" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "id": "77aad5a2-ad27-45ce-bc62-3725672fcd7c", 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [] 164 | } 165 | ], 166 | "metadata": { 167 | "kernelspec": { 168 | "display_name": "Python 3 (ipykernel)", 169 | "language": "python", 170 | "name": "python3" 171 | }, 172 | "language_info": { 173 | "codemirror_mode": { 174 | "name": "ipython", 175 | "version": 3 176 | }, 177 | "file_extension": ".py", 178 | "mimetype": "text/x-python", 179 | "name": "python", 180 | "nbconvert_exporter": "python", 181 | "pygments_lexer": "ipython3", 182 | "version": "3.11.1" 183 | } 184 | }, 185 | "nbformat": 4, 186 | "nbformat_minor": 5 187 | } 188 | -------------------------------------------------------------------------------- /IntroductionCourse/34&35_test.py: -------------------------------------------------------------------------------- 1 | # 34_测试 2 | # 测试的目的是帮我们确认,程序的行为是否和预期相同。 3 | # 而且测试除了能帮我们验证新代码是否正确之外,还能验证在改动老代码之后,不该受影响的地方,也仍然会按预期进行 4 | 5 | # 一、assert语句 6 | # 1.assert后面可以跟上任何布尔表达式,也就是值为True或False的表达式。 7 | # *测试时我们会在assert后面跟上,我们认为应该为True的表达式 8 | # 2.如果assert后面的表达式,最终求值出来的结果为True,那么无事发生,继续运行后面的代码 9 | # 但如果求值出来为False,就会产生"AssertionError",即断言错误。相当于提醒运行程序的人,这里不符合预期 10 | # 3.用assert的问题是,一旦出现AssertionError,程序就中止了。 11 | # 后面的代码并不会运行,我们并不知道剩下的代码里有哪些其他问题. 12 | assert len("Hi") == 2 13 | # assert "a" in ["b", "c"] 14 | # assert 1 + 2 > 6 15 | 16 | # 所以一般我们测试会使用专门做测试的库,它们能一次性跑多个测试用例(TestCase), 17 | # 并且能更直观地展现哪些测试用例通过了,哪些没有。 18 | # 另:测试用例(testcase)是独立的,即一个testcase实例的代码必须是完全自含的(不用调用此类的其它测试用例) 19 | 20 | # 二、unittest库 21 | # 1.unittest是一个很常用的Python单元测试库,是Python标准库 22 | # 2.单元测试的意思是,对软件中的最小可测试单元进行验证,比如验证某函数某方面表现是否符合预期 23 | # !3.我们一般会划分测试代码和实现代码到独立文件里,而不是和要实现的功能混在一起。 24 | # !4.为了能调用要测试的功能,还要在测试文件里,把要测试的函数或类也引入进来, 25 | # 如果测试文件和被测试文件位于同一文件夹下,用 from 文件名 import 函数名 或 from 文件名 import 类名 26 | # *文件名不能以数字开头 27 | 28 | # (一)、写测试 29 | # 1.创建一个类,名字可以以Test开头,表示这是一个用来测试的类. 30 | # 它要当unittest.TestCase的子类,这样就能使用那些,继承自unittest.TestCase的各种测试功能 31 | # 2.在这个类下面可以定义不同的测试用例,每一个测试用例都是类下面的一个方法 32 | # !命名必须以"test_"开头,因为unittest这个库,会自动搜寻"test_"开头的方法,而且只把"test_"开头的当成测试用例 33 | # 3.由于用assert,出现AssertionError,程序就直接中断了; 34 | # 所以我们可以用unittest库里TestCase类的assertEqual方法 35 | # 咱们的测试类已经继承自那个类,所以可以直接通过self调用父类方法 36 | # 另:assertEqual方法用法:传入的第一个参数和第二个参数如果相等,显示测试通过;如果不相等,显示测试不通过,但程序也不会炸 37 | # 4.写好测试用例后,我们只需要在编辑器的终端,输入"python -m unittest",表示运行unittest 38 | # 这个库就会自动搜索所有,继承了unittest库里TestCase类的子类,运行它们所有以"test_"开头的方法,然后展示测试结果 39 | # (终端可以输入,如:python -m unittest test_module,TestClass.test_method,运行文件、类和测试用例的测试 40 | # 测试模块也可以通过文件路径指定: ) 41 | # 5.结果会告诉你共运行了几个测试, 42 | # 然后上面的点点,每个点代表一个测试通过;如果有一个测试没有通过,其中一个点会变成F;如果一个测试出现异常,其中一个点会变成E 43 | # unittest还会详细告诉你是哪个文件下的哪个方法造成了测试失败,以及为什么失败 44 | 45 | # 例题在TestCases文件夹里 46 | 47 | 48 | # 35_测试 49 | 50 | # (二)、unittest.TestCase类的常见测试方法 51 | # 1.assertEqual(A,B) 类似于 assert A==B 52 | # 2.assertTrue(A) 类似于 assert A is True 53 | # 3.assertIn(A,B) 类似于 assert A in B 54 | # 4.assertNotEqual(A,B) 类似于 assert A!=B 55 | # 5.assertFalse(A) 类似于 assert A is False 56 | # 6.assertNotIn(A,B) 类似于 assert A not in B 57 | 58 | # *本质上assertTrue 可以代替这些所有方法, 59 | # 比如以下两个测试:都是在验证2是否不存在于[1, 2]这个列表里,通过与否的结果也是一样的 60 | # *但还是推荐更具针对性的方法 61 | # 比如想测试元素不在列表里,用assertNotIn这个专门针对元素和列表的方法,而不是assertTrue这个万能方法, 62 | # 原因是在测试未通过的时候,更针对性的方法,会给出更详细的失败原因 63 | 64 | # assertTrue(2 not in [1, 3 - 1]) 65 | # AssertionError: False is not true 66 | # assertNotIn(2, [3 - 1]) 67 | # AssertionError: 2 unexpectedly found in [1, 2] 68 | 69 | # (三)、其他方法 70 | # 有些时候我们还可以通过额外方法,进一步提高测试效率 71 | 72 | # setUp方法 *第一个字母s小写 73 | # *假如我们要测试一个类, 74 | # *为了能调用各种类方法,我们需要创建实例对象,由于不同测试用例之间是独立的,测试不同方法的时候,我们要不停创建新对象。 75 | # 为了减少不必要的重复代码,我们可以利用TestCase类里的SetUp方法: 76 | # 在运行各个测试方法,也就是"test_"开头的方法前,SetUp方法都会先被运行一次。 77 | # 我们只需要在SetUp方法里,把测试对象创建好,*作为当前测试类的一个属性 78 | # 然后在方法里,就可以通过属性,获取那个已创建好的对象,去写测试语句 79 | 80 | 81 | # 实现代码sentence.py 82 | class Sentence: 83 | def __init__(self, sentence): 84 | self.sentence = sentence 85 | 86 | """返回句子字母数量""" 87 | def str_count(self): 88 | return len(self.sentence) 89 | 90 | """返回句子单词数量""" 91 | def word_count(self): 92 | return len(self.sentence.split(" ")) 93 | 94 | """返回所有字母大写后的句子""" 95 | def upper(self): 96 | return self.sentence.upper() 97 | 98 | 99 | # 原测试代码 100 | import unittest 101 | # from sentence import Sentence 102 | 103 | 104 | class TestSentence1(unittest.TestCase): 105 | def test_str_count(self): 106 | sentence = Sentence("Hello World") 107 | # !可以在其他方法内创建对象 108 | self.assertEqual(sentence.str_count(), 12) 109 | # !方法调用也可以作为参数 110 | 111 | def test_word_count(self): 112 | sentence = Sentence("Hello World") 113 | self.assertEqual(sentence.word_count(), 2) 114 | 115 | def test_upper(self): 116 | sentence = Sentence("Hello World") 117 | self.assertEqual(sentence.upper(), "HELLO WORLD!") 118 | 119 | 120 | # 运用setUp方法的测试代码 121 | # import unittest 122 | # from sentence import Sentence 123 | 124 | 125 | class TestSentence2(unittest.TestCase): 126 | def setUp(self): 127 | self.sentence = Sentence("Hello World!") 128 | 129 | def test_str_count(self): 130 | self.assertEqual(self.sentence.str_count(), 12) 131 | 132 | def test_word_count(self): 133 | self.assertEqual(self.sentence.word_count(), 2) 134 | 135 | def test_upper(self): 136 | self.assertEqual(self.sentence.upper(), "HELLO WORLD!") 137 | 138 | # 例题在TestCases文件夹里 139 | -------------------------------------------------------------------------------- /IntroductionCourse/26_object_oriented_programming.py: -------------------------------------------------------------------------------- 1 | # 26_面向对象编程 2 | # 一、面向过程编程vs面向对象编程 3 | # (一)、面向过程编程 4 | # 1.过程是负责完成某个具体任务的代码,基本可以理解为函数。 5 | # 2.面向过程编程的核心,就是把要实现的事情拆分成一个个步骤,依次完成。 6 | """ 7 | 例1:写个ATM的程序,要往里面依次存入50块和取出100块, 8 | # 那就按照顺序写出这些步骤,并配合定义出存钱和取钱的函数 9 | *假如要记录步骤中更多性质,将性质作为参数补充 10 | def 存钱(面值): 11 | ... 12 | def 取钱(面值): 13 | ... 14 | 15 | 16 | 存钱(50) 17 | 取钱(100) 18 | ------------------------------------------------------------------------------------------------------ 19 | 例2:写个ATM的程序,要往里面依次存入50块和取出100块。但要记录ATM编号,银行,纸币编号,支行,发行年份等性质。 20 | def 存钱(面值, ATM编号, 银行, 纸币编号, 支行, 发行年份): 21 | ... 22 | def 取钱(面值, ATM编号, 银行, 纸币编号, 支行, 发行年份): 23 | ... 24 | 25 | 26 | 存钱(50, "001", "招商银行", "AA00000000", "南园支行", "2015") 27 | 取钱(100, "002", "中国银行", "AA00000001", "北园支行", "2020") 28 | *可以看出对这个例题的情况,面向过程编程,一方面会增加函数参数的数量; 29 | 另一方面,ATM有编号,纸币也有编号,这些数据在传参过程中混在一起,不利于理解, 30 | 随着程序长度和逻辑复杂度的增加,代码的清晰度可能由此降低。 31 | """ 32 | 33 | # (二)、面向对象编程及其基础语法 34 | # 1.以对象为核心,先考虑各个对象有什么性质、能做什么事情 35 | # 比如:每个ATM机都有自己的性质,包括编号、银行、支行。那么我们可以提取出这些性质,定义ATM类,然后用类创建对象。 36 | # 2.类与对象的关系 37 | # 类是创建对象的模板,对象是类的实例。 38 | # 3,属性 39 | # 如编号、银行、支行是不同ATM对象的属性 40 | """ 41 | # 定义ATM类 42 | class ATM: 43 | def __init__(self, 编号, 银行, 支行): 44 | self.编号 = 编号 45 | self.银行 = 银行 46 | self.支行 = 支行 47 | 48 | 49 | # 创建两个ATM对象 50 | atm1 = ATM("001", "招商银行", "南园支行") 51 | atm2 = ATM("002", "中国银行", "北园支行") 52 | 53 | 54 | # 定义纸币类 55 | class 纸币: 56 | def __init__(self, 编号, 面值, 发行年份): 57 | self.编号 = 编号 58 | self.面值 = 面值 59 | self.发行年份 = 发行年份 60 | 61 | 62 | # 创建两个纸币对象 63 | 纸币1 = 纸币("AA00000000", 50, "2015") 64 | 纸币2 = 纸币("AA00000001", 100, "2020") 65 | """ 66 | 67 | # 4.1)对象可以直接作为参数,传入函数中,减少函数参数的数量 68 | """ 69 | def 存钱(ATM对象, 纸币对象): 70 | ... 71 | def 取钱(ATM对象, 纸币对象): 72 | ... 73 | 74 | 75 | 存钱(atm1, 纸币1) 76 | 取钱(atm2, 纸币2) 77 | """ 78 | # 4.2)用对象把相关属性绑定在一起,让程序逻辑更加清晰,让人更清楚性质所属的对象是什么 79 | """ 80 | # 比如虽然ATM和纸币都有编号,但我们用"atm1.编号",获取属于atm1的编号;用"纸币1.编号",获取属于纸币1的编号。 81 | print(atm1.编号) 82 | print(纸币1,编号) 83 | """ 84 | 85 | # 5.除属性外,另外能和对象绑定的是方法。 86 | # 属性对应对象拥有的性质,而方法对应对象能做些什么。 87 | # 所谓属性就是放在类里面的变量,所谓方法就是放在类里面的函数。 88 | """ 89 | # *执行洗衣服的任务 90 | # 面向过程编程 91 | def 放(被放的物品, 放入的物品): 92 | ... 93 | def 开机(机器): 94 | ... 95 | def 清洗(需清洗的物品): 96 | ... 97 | def 烘干(需烘干的物品): 98 | ... 99 | 100 | 101 | 放("衣服", "洗衣机") 102 | 放("洗衣粉", "洗衣机") 103 | 开机("洗衣机") 104 | 清洗("衣服") 105 | 烘干("衣服") 106 | --------------------------------------------------------------------- 107 | # 面向对象编程 108 | # 人、洗衣机是执行事务的对象。 109 | # 人能放东西和开机,洗衣机能清洗和烘干,这些可以作为类的方法被定义 110 | class 人: 111 | def 放(self, 被放的物品, 放入的物品): 112 | ... 113 | def 开机(self, 机器): 114 | ... 115 | class 洗衣机: 116 | def 清洗(self, 需清洗的物品): 117 | ... 118 | def 烘干(self, 需烘干的物品): 119 | ... 120 | 121 | 122 | # 定义好类之后,就可以通过类来创建对象 123 | # *因为我这个对象没有性质,所以括号是空的 124 | 我 = 人() 125 | 我的洗衣机 = 洗衣机() 126 | 127 | # 用对象去执行方法 128 | 我.放("衣服", 我的洗衣机) 129 | 我.放("洗衣粉", 我的洗衣机) 130 | 我.开机(我的洗衣机) 131 | 我的洗衣机.清洗("衣服") 132 | 我的洗衣机.烘干("衣服") 133 | """ 134 | 135 | # 6.方法中用到自身属性,这个信息不需要作为参数被传入,而是可以直接被获取到 136 | """ 137 | # 比如洗衣机的方法需要用到容量这个信息 138 | class 洗衣机: 139 | def __init__(self, 容量): 140 | self.容量 = 容量 141 | def 清洗(self, 需清洗的物品): 142 | 洗衣机容量 = self.容量 143 | ... 144 | def 烘干(self, 需烘干的物品): 145 | 洗衣机容量 = self.容量 146 | ... 147 | """ 148 | 149 | # 7.面向过程是编年体,面向对象是纪传体 150 | 151 | # 三、面向对象编程的特性 152 | # (一)、封装 153 | # 封装表示,写类的人,将内部实现细节隐藏起来,使用类的人,只通过外部接口访问和使用 154 | # 比如有人写好了洗衣机这个类,其实你只需要知道它有什么方法,方法有什么作用,具体怎么用就足够了, 155 | # 不需要知道方法里面具体是怎么写的 156 | """ 157 | class 洗衣机: 158 | def __init__(self, 容量): 159 | *** 160 | def 清洗(self, 需清洗的物品): 161 | *** 162 | def 烘干(self, 需烘干的物品): 163 | *** 164 | """ 165 | 166 | # (二)、继承 167 | # 继承是在说:面向对象编程允许创建有层次的类,即类可以有子类和父类,来表示从属关系 168 | # 这样做的好处是父类的属性、方法都可以被继承,不需要反复定义,减少代码的冗余 169 | # 比如小学生、大学生都是学生,都应该有学号、年级的属性,都要去学校 170 | # 可以创建出一个叫学生的父类,然后让小学生和大学生去继承这个类 171 | 172 | # (三)、多态 173 | # 多态是指,同样的接口,由于对象具体类的不同,而有不同的表现 174 | # 比如虽然小学生和大学生都要写作业,但内容肯定不一样 175 | # 所以写作业的方法,就不能直接定义在父类里面,而是要分别定义在子类里 176 | # 调用写作业方法时,统一调用同一名称的方法,而他们会由于所属类不同,执行不同的写作业方法 177 | """ 178 | class 学生: 179 | def __init__(self, 学号, 年级): 180 | self.学号 = 学号 181 | self.年级 = 年级 182 | def 去学校(self, 学校): 183 | ... 184 | class 小学生(学生): 185 | def 写作业(self): 186 | # 写简单作业 187 | ... 188 | class 大学生(学生): 189 | def 写作业(self): 190 | # 写困难作业 191 | ... 192 | 193 | 194 | 小崽 = 小学生("0331", "小六") 195 | 大崽 = 大学生("1007", "大一") 196 | 我家崽们 = [大崽, 小崽] 197 | for 崽 in 我家崽们: 198 | 崽.写作业() 199 | """ 200 | 201 | # 综上,不是说面向对象就一定优于面向过程,选择哪个,还是取决于具体场景、具体需求。 202 | -------------------------------------------------------------------------------- /DataAnalysisPreparation/2.4_Markdown和LaTex入门.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "756b7cc8-acf0-4fd5-8a1d-bd1ee3b62a41", 6 | "metadata": {}, 7 | "source": [ 8 | "### 三级标题" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "09499706-d1ef-452e-b14a-8d74f0496b6f", 14 | "metadata": {}, 15 | "source": [ 16 | "**粗体字**" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "id": "b34b43ab-c9e0-4a85-a454-3a1a8a7d6ed7", 22 | "metadata": {}, 23 | "source": [ 24 | "# 一级标题\n", 25 | "#### 四级标题" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "id": "dc1f68ea-44f5-4381-b12b-7f16c7f37f81", 31 | "metadata": {}, 32 | "source": [ 33 | "**加粗** \n", 34 | "*斜体* \n", 35 | "~~删除线~~" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "id": "03050d9c-2230-4f9b-b16d-48ec9e7014fc", 41 | "metadata": {}, 42 | "source": [ 43 | "- 无序列表\n", 44 | "- 无序列表\n", 45 | "- 无序列表" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "id": "c67e558c-fd05-434f-94db-e896cefb17f7", 51 | "metadata": {}, 52 | "source": [ 53 | "1. 有序列表1\n", 54 | "2. 有序列表2\n", 55 | "3. 有序列表3" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "id": "9c1e6a32-b3ee-474f-8cf5-c0bd394c3ae3", 61 | "metadata": {}, 62 | "source": [ 63 | "https://cn.bing.com/ \n", 64 | "[必应](https://cn.bing.com/)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "id": "0b64ec24-43c0-4173-a7dd-d26f796bfe98", 70 | "metadata": {}, 71 | "source": [ 72 | "![城市景观](https://img0.baidu.com/it/u=25183460,870873689&fm=253)" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "id": "9962bb84-375e-40d1-bb3f-06c76ac8ad99", 78 | "metadata": {}, 79 | "source": [ 80 | "> 蒹葭苍苍 \n", 81 | "白露为霜" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "id": "3a7476e1-fa24-462a-aff3-fbbc74ab33f9", 87 | "metadata": {}, 88 | "source": [ 89 | "插入代码 `import math`" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "id": "b4c1b80c-e45c-4241-b78d-71be800a43cc", 95 | "metadata": {}, 96 | "source": [ 97 | "```python\r\n", 98 | "import math\r\n", 99 | "print(\"Hello World!\")\r\n", 100 | "print(math.pi)\r\n", 101 | "```" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "id": "8e4e98e8-94b3-4db2-a383-36aa3f86990d", 107 | "metadata": {}, 108 | "source": [ 109 | "$$x + y$$\r\n", 110 | "$$x - y$$\r\n", 111 | "$$x \\times y$$\r\n", 112 | "$$x \\div y$$" 113 | ] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "id": "65776528-5f8e-41c1-b071-4435ea02a997", 118 | "metadata": {}, 119 | "source": [ 120 | "$$x^3$$\r\n", 121 | "$$H_2O$$\r\n", 122 | "$$S_{input}$$" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "id": "dd4ac9a1-1ab0-4e40-a6fc-07d658786012", 128 | "metadata": {}, 129 | "source": [ 130 | "$$\\sum(x^2 + y^2)$$\r\n", 131 | "$$\\sqrt[3]x$$\r\n", 132 | "$$\\sqrt[3]{a^2m^2}$$" 133 | ] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "id": "dd734be3-bf33-4a64-a6be-9215cb4aaafc", 138 | "metadata": {}, 139 | "source": [ 140 | "$$\\frac{x+y}{x-y}$$" 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "id": "a7816dbe-6a60-471f-8375-e3830c40da76", 146 | "metadata": {}, 147 | "source": [ 148 | "$$\\alpha\\beta\\gamma$$" 149 | ] 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "id": "d9f9d8b2-0d11-4240-8671-19d0fcf570bc", 154 | "metadata": {}, 155 | "source": [] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "id": "3814fcfd-2c50-403c-8ed5-09b9377669b5", 160 | "metadata": {}, 161 | "source": [] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "id": "bcf1d475-9c5b-4f70-b24c-46db363ea0e9", 166 | "metadata": {}, 167 | "source": [] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "id": "d957f787-51a3-4eb7-8472-254f2938e5e0", 172 | "metadata": {}, 173 | "source": [] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "id": "714a1b73-ea66-4483-bc81-e2b8694dc96c", 178 | "metadata": {}, 179 | "source": [] 180 | } 181 | ], 182 | "metadata": { 183 | "kernelspec": { 184 | "display_name": "Python 3 (ipykernel)", 185 | "language": "python", 186 | "name": "python3" 187 | }, 188 | "language_info": { 189 | "codemirror_mode": { 190 | "name": "ipython", 191 | "version": 3 192 | }, 193 | "file_extension": ".py", 194 | "mimetype": "text/x-python", 195 | "name": "python", 196 | "nbconvert_exporter": "python", 197 | "pygments_lexer": "ipython3", 198 | "version": "3.11.1" 199 | } 200 | }, 201 | "nbformat": 4, 202 | "nbformat_minor": 5 203 | } 204 | -------------------------------------------------------------------------------- /EvaluateAndCleanData/4.32_evaluate_and_clean_data_manual.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "af1e6e10-dda1-4d2d-bef5-2ec2451367d0", 6 | "metadata": {}, 7 | "source": [ 8 | "## **4.32_evaluate_and_clean_data_manual**" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "2d7f3e94-8880-46af-a6ef-0ff502a6c58f", 14 | "metadata": {}, 15 | "source": [ 16 | "**数据评估和清洗整体流程**\n", 17 | "\n", 18 | "**下载数据;读取数据;评估数据;根据对数据的评估,指定清洗的步骤;清洗数据;每清洗一步,就再次查看清洗后的数据,来确保问题已经解决;保存清洗后的数据**" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "id": "22172899-90e8-4015-88b9-659266e77aa7", 24 | "metadata": {}, 25 | "source": [ 26 | "在Jupyter Notebook里进行数据清晰,并以报告的形式呈现内容。\n", 27 | "\n", 28 | "运用标题,使内容结构清晰;运用Markdown对代码进行注释,方便读者理解;并且运用Markdown,总结评估结论,使内容逻辑清晰。" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "id": "89962c84-8ec3-48fc-ab33-c3c7289c15ed", 34 | "metadata": {}, 35 | "source": [ 36 | "### 下载数据" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "id": "8f022572-cf71-468b-b845-873cf10698f4", 42 | "metadata": {}, 43 | "source": [ 44 | "1. 每次清洗数据项目开始前,都新建一个新的文件夹,将数据文件和Jupyter Notebook放入。\n", 45 | "\n", 46 | "这样可以保持文件结构的整洁;数据文件和Jupyter Notebook放入同一文件夹,也方便读取" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "id": "f7404a26-5be5-476b-8a46-b5a186a6aa3e", 52 | "metadata": {}, 53 | "source": [ 54 | "2. 对于公开数据集,我们并不了解它的背景,在正式分析前,最好了解一下它的介绍,和数据每列的含义。" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "id": "52f043d3-0d17-4e2d-946c-10b4dfda6b26", 60 | "metadata": {}, 61 | "source": [ 62 | "### 读取数据" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "id": "58e21a0d-94ad-48cf-97e4-913139a901aa", 68 | "metadata": {}, 69 | "source": [ 70 | "### 评估数据 " 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "id": "7abc72b1-b4ab-4852-ab31-34b9214f6223", 76 | "metadata": {}, 77 | "source": [ 78 | "#### 缺失数据" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "id": "84216bbc-e311-4b87-aeff-b0f31999c078", 84 | "metadata": {}, 85 | "source": [ 86 | "发现缺失数据后,需要深入探索缺失数据:应把该变量存在缺失的观察值,筛选出来,进一步观察数据特征,(提出猜测,验证猜测)" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "id": "9f3b31dc-5e98-456a-b669-394a1641bb8b", 92 | "metadata": {}, 93 | "source": [ 94 | "#### 重复数据" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "id": "cf191806-15ab-4467-b600-079ae2d89e03", 100 | "metadata": {}, 101 | "source": [ 102 | "首先判断哪些变量不应该重复,再去评估数据。\n", 103 | "\n", 104 | "**唯一标识符不一定不能重复,具体数据具体分析**" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "id": "7909658a-f063-4f6d-9c50-645b24d0d697", 110 | "metadata": {}, 111 | "source": [ 112 | "#### 不一致数据" 113 | ] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "id": "95304790-aeb9-4b8f-87b0-0fc1dc84fe52", 118 | "metadata": {}, 119 | "source": [ 120 | "先观察哪个变量可以判断是否存在不一致数据,再去评估数据。" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "id": "9a125a24-7c19-484b-a40f-54f5f421a4cb", 126 | "metadata": {}, 127 | "source": [ 128 | "#### 无效/错误数据" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "id": "5dfeeca3-0cbc-45f3-b1cf-8dff8bbe0eef", 134 | "metadata": {}, 135 | "source": [ 136 | "先观察哪个变量可以判断是否存在不一致数据。\n", 137 | "\n", 138 | "然后通过某些方法,结合常识,评估无效/错误数据。\n", 139 | "\n", 140 | "最后,与缺失数据一样,发现无效/错误数据后,需要深入探索无效/错误数据:应把该变量无效/错误的观察值,筛选出来,进一步观察数据特征,(提出猜测,验证猜测)" 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "id": "897f5dbc-ecb7-4298-99ad-fcc6e0db1a2c", 146 | "metadata": {}, 147 | "source": [ 148 | "## 根据对数据的评估,指定清洗的步骤" 149 | ] 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "id": "7cac8fe0-db8f-467c-ab01-cb2bfdab5a9f", 154 | "metadata": {}, 155 | "source": [ 156 | "### 清洗数据" 157 | ] 158 | }, 159 | { 160 | "cell_type": "markdown", 161 | "id": "45d9c2ae-2499-4228-b685-ed305ec96246", 162 | "metadata": {}, 163 | "source": [ 164 | "1. 在清洗前,先创建一个新变量,用于储存清洗过程中的数据。\n", 165 | "\n", 166 | " 这样原始的数据和经过清理的数据,分别储存在两个变量中" 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "id": "7fcfddf4-0207-43e9-ae16-0aa46f6d32fb", 172 | "metadata": {}, 173 | "source": [ 174 | "2. 清洗数据过程中,遇到现学处理方法无法很好清理数据的情况,可以查询官方文档或查询搜索引擎。" 175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "id": "49eaad51-36b0-4eee-8459-ce1dc409cdb1", 180 | "metadata": {}, 181 | "source": [ 182 | "### 每清洗一步,就再次查看清洗后的数据,来确保问题已经解决" 183 | ] 184 | }, 185 | { 186 | "cell_type": "markdown", 187 | "id": "18ce1f2e-0c01-4246-ba8a-c3846ebf86aa", 188 | "metadata": {}, 189 | "source": [ 190 | "### 保存清洗后的数据" 191 | ] 192 | }, 193 | { 194 | "cell_type": "markdown", 195 | "id": "35e746a3-31b7-476e-badf-e4e30e019a0f", 196 | "metadata": {}, 197 | "source": [ 198 | "新的CSV文件名,可以是在原始CSV文件名后面,添加'_cleaned'。" 199 | ] 200 | } 201 | ], 202 | "metadata": { 203 | "kernelspec": { 204 | "display_name": "Python 3 (ipykernel)", 205 | "language": "python", 206 | "name": "python3" 207 | }, 208 | "language_info": { 209 | "codemirror_mode": { 210 | "name": "ipython", 211 | "version": 3 212 | }, 213 | "file_extension": ".py", 214 | "mimetype": "text/x-python", 215 | "name": "python", 216 | "nbconvert_exporter": "python", 217 | "pygments_lexer": "ipython3", 218 | "version": "3.11.1" 219 | } 220 | }, 221 | "nbformat": 4, 222 | "nbformat_minor": 5 223 | } 224 | -------------------------------------------------------------------------------- /DataAnalysis/.ipynb_checkpoints/7.81_hypothetical_test-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "8ba2c081-94e8-43d8-9d1d-095837b49eac", 6 | "metadata": {}, 7 | "source": [ 8 | "7.81_hypothetical_test" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "747e72fa-5135-4ffb-9859-6200fcdc17b0", 14 | "metadata": {}, 15 | "source": [ 16 | "### 一、有关假设检验的一些概念" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "id": "d4cf7743-a280-45a1-ac96-33f20cf0cc82", 22 | "metadata": {}, 23 | "source": [ 24 | "1. 描述统计学与推断统计学\n", 25 | "\n", 26 | "- 描述统计学:对数据进行描述和总结\n", 27 | "\n", 28 | "- 推断统计学: 通过样本做出关于总体的推断和预测\n", 29 | "\n", 30 | "2. 对象和整体\n", 31 | " \n", 32 | "- 对象:我们想要观测的具体事物叫做对象\n", 33 | "\n", 34 | "- 整体:我们想观测的整个对象的集合\n", 35 | "\n", 36 | "3. 样本和整体\n", 37 | "\n", 38 | "- 样本是我们收集数据的对象\n", 39 | "\n", 40 | "- 总体是我们想要得到结论的群体\n", 41 | "\n", 42 | "4. 统计量和参数\n", 43 | "\n", 44 | "- 统计量:描述样本特征的数值\n", 45 | "\n", 46 | "- 参数:描述总体特征的数值\n", 47 | "\n", 48 | "- 在统计推断中,我们会基于样本的统计量,对总体的参数进行推断,从而得到对总体的结论" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "id": "77abcd2b-bf0e-40e9-b7ee-087fa12e2bc8", 54 | "metadata": {}, 55 | "source": [ 56 | "### 二、独立双样本t检验步骤" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "id": "4217a0db-7f0d-4b5e-b6ed-63e710d50dee", 62 | "metadata": {}, 63 | "source": [ 64 | "1. 概念\n", 65 | "\n", 66 | "- 独立:说明样本来自不同的总体,彼此没有关联\n", 67 | "\n", 68 | "- 双样本:比较两个不同样本的数据\n", 69 | "\n", 70 | "- t检验:一种统计方法,用于确定样本的平均值之间是否存在统计显著的差异\n", 71 | "\n", 72 | "2. 前提条件\n", 73 | "\n", 74 | "- 随机抽样\n", 75 | "\n", 76 | "- 总体大致呈正态分布 " 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "id": "e8a0921a-387d-43f8-ae28-525f6ad0d5bd", 82 | "metadata": {}, 83 | "source": [ 84 | "#### (一)、建立假设" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "id": "2a33f8cb-288d-443b-9402-4535e834614c", 90 | "metadata": {}, 91 | "source": [ 92 | "1. 原假设(H0)\n", 93 | "\n", 94 | "2. 备择假设(H1)\n", 95 | "\n", 96 | "一般我们进行假设检验的时候,是想反驳原假设,以及支持备择假设" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "id": "95f4041d-1471-4c37-9b89-a4095e41156a", 102 | "metadata": {}, 103 | "source": [ 104 | "#### (二)、选择单尾或双尾" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "id": "38b4098a-5db8-4551-b562-b5be8dc54c45", 110 | "metadata": {}, 111 | "source": [ 112 | "1. 双尾只推断总体之间是否有差异,不在意是正差还是负差\n", 113 | "\n", 114 | "2. 单尾推断只看是否存在正差异,或者只看是否存在负差异" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "id": "46ec60fa-4a97-4ced-a1e5-a66357be0754", 120 | "metadata": {}, 121 | "source": [ 122 | "#### (三)、确定显著水平" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "id": "48c6d02e-eb27-4530-8eec-c96d871422ab", 128 | "metadata": {}, 129 | "source": [ 130 | "显著水平反应了检验的严格程度。\n", 131 | "\n", 132 | "样本抽样存在随机性,检验结果没有可能保证100%符合现实,只能通过显著水平来调整严格程度\n", 133 | "\n", 134 | "常见的双尾检验显著水平是0.05,也就是说如果检验结果是拒绝原假设,原假设实际为真的概率是5%;再换句话说,如果检验结果是拒绝原假设,结论95%概率是对的。\n", 135 | "\n", 136 | "常见的单位检验显著水平是0.025。" 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "id": "b1d7705a-ada8-455b-b5cf-05a13f2f188a", 142 | "metadata": {}, 143 | "source": [ 144 | "#### (四)、计算t值" 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "id": "b4b58533-4f12-48d9-90fe-49a21b67c408", 150 | "metadata": {}, 151 | "source": [ 152 | "#### (五)、计算自由度" 153 | ] 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "id": "89ef8380-8c8b-45c3-a79d-c658800c2ecd", 158 | "metadata": {}, 159 | "source": [ 160 | "自由度 = 样本1的数量 + 样本2的数量 -2" 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "id": "122d9642-2f20-4a91-a2ed-f5bd9aeeff6b", 166 | "metadata": {}, 167 | "source": [ 168 | "#### (六)、查看t值临界值表" 169 | ] 170 | }, 171 | { 172 | "cell_type": "markdown", 173 | "id": "5256df88-576b-414b-bcde-4f9054bfad59", 174 | "metadata": {}, 175 | "source": [ 176 | "根据单双尾、自由度和显著水平,去查t值临界值表" 177 | ] 178 | }, 179 | { 180 | "cell_type": "markdown", 181 | "id": "cd171f86-90a6-45f2-b2f6-97f59d74f7b3", 182 | "metadata": {}, 183 | "source": [ 184 | "#### (七)、比较临界值和t值" 185 | ] 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "id": "3fce23de-3f54-43c9-94d5-07a58bae58d2", 190 | "metadata": {}, 191 | "source": [ 192 | "t值 >= 临界值,拒绝原假设,说明存在显著差异\n", 193 | "\n", 194 | "t值 < 临界值,接受原假设,说明不存在显著差异" 195 | ] 196 | }, 197 | { 198 | "cell_type": "markdown", 199 | "id": "1a6a7176-e3c0-44ed-beb0-724e13ec656e", 200 | "metadata": {}, 201 | "source": [ 202 | "### 三、Z检验" 203 | ] 204 | }, 205 | { 206 | "cell_type": "markdown", 207 | "id": "47ece0f8-0dd7-40c0-be0d-6841e8b0548e", 208 | "metadata": {}, 209 | "source": [ 210 | "1. 前提条件的区别\n", 211 | "\n", 212 | "- 增加了要求:总体方差已知/样本容量大于30\n", 213 | "\n", 214 | "2. 步骤的区别\n", 215 | "\n", 216 | "- 不需要计算自由度\n", 217 | "\n", 218 | "- z值计算公式,相比于t值计算公式,把样本方差换成了总体方差\n", 219 | "\n", 220 | "- 查看z值临界值表\n", 221 | "\n", 222 | "3. 应用场景\n", 223 | "\n", 224 | "Z检验适用于已知总体方差,或者样本的数量大于30的情况\n", 225 | "\n", 226 | "t检验适用于总体方差位置,且样本的数量小于等于30的情况。\n", 227 | "\n", 228 | "在实际应用中,t检验比Z检验更加常见\n", 229 | "\n", 230 | "4. 检验结果差异\n", 231 | "\n", 232 | "Z检验可以提供更高的准确性和敏感性" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "id": "3320a27a-7a82-4f9a-8695-a9a4695d6c64", 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [] 242 | } 243 | ], 244 | "metadata": { 245 | "kernelspec": { 246 | "display_name": "Python 3 (ipykernel)", 247 | "language": "python", 248 | "name": "python3" 249 | }, 250 | "language_info": { 251 | "codemirror_mode": { 252 | "name": "ipython", 253 | "version": 3 254 | }, 255 | "file_extension": ".py", 256 | "mimetype": "text/x-python", 257 | "name": "python", 258 | "nbconvert_exporter": "python", 259 | "pygments_lexer": "ipython3", 260 | "version": "3.11.1" 261 | } 262 | }, 263 | "nbformat": 4, 264 | "nbformat_minor": 5 265 | } 266 | -------------------------------------------------------------------------------- /DataVisualization/6.76_data_visualization_chart_extended.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "ba058e3f-14c8-473d-86d2-5c8a2fbbdf2d", 6 | "metadata": {}, 7 | "source": [ 8 | "# **6.76_data_visualization_chart_extended**" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "bc3284aa-4da6-48f9-b36f-f8fd1da27131", 14 | "metadata": {}, 15 | "source": [ 16 | "### 一、数据包含两个变量" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "id": "1dce401b-0de5-4dd4-9fdf-6c0e10b8a1d6", 22 | "metadata": {}, 23 | "source": [ 24 | "如果我们的数据包含两个变量,比如两列DataFrame的话,可以绘制散点图、折线图、条形图、饼图。其中散点图和折线图,主要针对两个数值变量;条形图和饼图主要针对一个分类变量加一个数值变量。" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "id": "5e60f428-3b2a-46ce-b655-130aa9b9b98c", 30 | "metadata": {}, 31 | "source": [ 32 | "#### (一)、散点图" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "id": "2d170642-eaf4-4f49-a490-c70a5c0de81f", 38 | "metadata": {}, 39 | "source": [ 40 | "1. 特点\n", 41 | "\n", 42 | " 可以从散点图,看出变量之间的相关性,比如是否相关、呈正比还是呈反比、线性还是非线性等等,也可以帮我们发现异常值的存在\n", 43 | "\n", 44 | "2. 表示\n", 45 | "\n", 46 | " X轴表示一个变量的值,Y轴表示另一个变量的值" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "id": "67f13465-b7ad-4bf7-9571-6f87be629d0c", 52 | "metadata": {}, 53 | "source": [ 54 | "#### (二)、折线图" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "id": "d5c727b6-460e-42e9-b708-3612d38dc3a8", 60 | "metadata": {}, 61 | "source": [ 62 | "1. 特点\n", 63 | "\n", 64 | " 用于展示连续间隔或时间跨度上数值的变化,从而展示趋势变化" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "id": "4b221e5f-c2b4-4cea-9f3c-f3e0b4ebf988", 70 | "metadata": {}, 71 | "source": [ 72 | "#### (三)、条形图" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "id": "08870a3d-e2b2-4577-ae86-eaba6be6678a", 78 | "metadata": {}, 79 | "source": [ 80 | "1. 特点\n", 81 | "\n", 82 | " 用来展示一个分类变量所对应的数值变量\n", 83 | "\n", 84 | "2. 条形图与直方图的不同\n", 85 | "\n", 86 | " - 直方图只针对一个数值变量,而条形图则是针对一个分类变量和一个数值变量\n", 87 | " \n", 88 | " - 直方图的各个条柱分隔除了不同的数字区间,而条形图的各个条柱分隔这不同的分类变量" 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "id": "3e6c907c-fccc-4261-baa9-e467175477ae", 94 | "metadata": {}, 95 | "source": [ 96 | "#### (四)、饼图" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "id": "cb3eeb1e-8407-480f-be41-b518938c8887", 102 | "metadata": {}, 103 | "source": [ 104 | "1. 特点\n", 105 | "\n", 106 | " 用来展示各个分类对应的数值之间的比例,可以直观了解不同类别在整体中的占比\n", 107 | "\n", 108 | "2. 表示\n", 109 | "\n", 110 | " 每个圆弧的长度/面积,代表每个分类所占的百分比,全圆则表示所有分类占比的总和,也就是100%" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "id": "05ad1789-6dd9-453f-9b4e-60e8e315089c", 116 | "metadata": {}, 117 | "source": [ 118 | "### 二、数据包含多个变量" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "id": "1feca603-b96c-4f27-863c-56f82d89e9a3", 124 | "metadata": {}, 125 | "source": [ 126 | "#### 通过添加颜色或尺寸,在图表上表示新的变量" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "id": "0cb02b52-16ee-42a1-8874-a965ff179c0f", 132 | "metadata": {}, 133 | "source": [ 134 | "##### (一)、散点图" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "id": "be72743a-05bd-4646-9e4c-d12dc554052d", 140 | "metadata": {}, 141 | "source": [ 142 | "1. 2个数值变量,1个分类变量\n", 143 | "\n", 144 | " 如果要引入新的分类变量,可以把点绘制成不同的颜色,让颜色表示不同分类\n", 145 | "\n", 146 | "2. 气泡图,3个数值变量\n", 147 | "\n", 148 | " 如果要引入新的数值变量,可以把点绘制成不同的大小,让面积去表示不同数值\n", 149 | "\n", 150 | "3. 气泡图,3个数值变量,1个分类变量\n", 151 | "\n", 152 | " 把气泡绘制成不同的颜色" 153 | ] 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "id": "fa67b117-a47b-438c-8765-aa8627fcb397", 158 | "metadata": {}, 159 | "source": [ 160 | "##### (二)、折线图" 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "id": "5de24e98-6d7f-4403-b0bf-b3e8b63b20ad", 166 | "metadata": {}, 167 | "source": [ 168 | "1. 2个数值变量,1个分类变量\n", 169 | "\n", 170 | " 可以绘制多条折线,不同颜色的折线代表不同分类" 171 | ] 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "id": "e75711fb-7cc0-4137-ba55-ba55223544fa", 176 | "metadata": {}, 177 | "source": [ 178 | "##### (三)、条形图" 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "id": "f527cb4e-8ca7-4d58-ac12-596e5417825a", 184 | "metadata": {}, 185 | "source": [ 186 | "1. 复式条形图,2个分类变量,1个数值变量\n", 187 | "\n", 188 | " 绘制多个条柱,不同颜色的条柱代表不同分类" 189 | ] 190 | }, 191 | { 192 | "cell_type": "markdown", 193 | "id": "4287a6e0-6ad0-4b77-875e-a4ed2d30d6b3", 194 | "metadata": {}, 195 | "source": [ 196 | "##### (四)、热力图" 197 | ] 198 | }, 199 | { 200 | "cell_type": "markdown", 201 | "id": "eda19b54-8c80-48be-9d48-7a5b1a180a63", 202 | "metadata": {}, 203 | "source": [ 204 | "2个分类变量,1个数值变量\n", 205 | "\n", 206 | "1. 特点\n", 207 | "\n", 208 | " 通过颜色来展示不同变量之间的数值差异\n", 209 | "\n", 210 | "2. 表示\n", 211 | "\n", 212 | " 横轴是分类变量,纵轴也是分类变量,每行或每列都表示一个分类种类,通过热力图里单元格的颜色或数值,表示数值变量数值的大小" 213 | ] 214 | }, 215 | { 216 | "cell_type": "markdown", 217 | "id": "48793a37-e957-4ec6-8654-be55dca1ba78", 218 | "metadata": {}, 219 | "source": [ 220 | "#### 通过把多个图放在一起,互相对比来挖掘信息" 221 | ] 222 | }, 223 | { 224 | "cell_type": "markdown", 225 | "id": "2994b881-34be-4f9d-a6d3-3eb196baa9a8", 226 | "metadata": {}, 227 | "source": [ 228 | "1. 通过把两个直方图叠在一张图上,可以直观的看出,它们集中位置的差异、分散程度的差异等等\n", 229 | "\n", 230 | "2. 通过把多个小提琴图并排放,可以直观的比较它们的四分位距、密度概率等等" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "id": "b6a89d0e-6f53-47d8-9cf7-8d8ceb8a327f", 237 | "metadata": {}, 238 | "outputs": [], 239 | "source": [] 240 | } 241 | ], 242 | "metadata": { 243 | "kernelspec": { 244 | "display_name": "Python 3 (ipykernel)", 245 | "language": "python", 246 | "name": "python3" 247 | }, 248 | "language_info": { 249 | "codemirror_mode": { 250 | "name": "ipython", 251 | "version": 3 252 | }, 253 | "file_extension": ".py", 254 | "mimetype": "text/x-python", 255 | "name": "python", 256 | "nbconvert_exporter": "python", 257 | "pygments_lexer": "ipython3", 258 | "version": "3.11.1" 259 | } 260 | }, 261 | "nbformat": 4, 262 | "nbformat_minor": 5 263 | } 264 | -------------------------------------------------------------------------------- /DataVisualization/.ipynb_checkpoints/6.76_data_visualization_chart_extended-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "ba058e3f-14c8-473d-86d2-5c8a2fbbdf2d", 6 | "metadata": {}, 7 | "source": [ 8 | "# **6.76_data_visualization_chart_extended**" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "bc3284aa-4da6-48f9-b36f-f8fd1da27131", 14 | "metadata": {}, 15 | "source": [ 16 | "### 一、数据包含两个变量" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "id": "1dce401b-0de5-4dd4-9fdf-6c0e10b8a1d6", 22 | "metadata": {}, 23 | "source": [ 24 | "如果我们的数据包含两个变量,比如两列DataFrame的话,可以绘制散点图、折线图、条形图、饼图。其中散点图和折线图,主要针对两个数值变量;条形图和饼图主要针对一个分类变量加一个数值变量。" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "id": "5e60f428-3b2a-46ce-b655-130aa9b9b98c", 30 | "metadata": {}, 31 | "source": [ 32 | "#### (一)、散点图" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "id": "2d170642-eaf4-4f49-a490-c70a5c0de81f", 38 | "metadata": {}, 39 | "source": [ 40 | "1. 特点\n", 41 | "\n", 42 | " 可以从散点图,看出变量之间的相关性,比如是否相关、呈正比还是呈反比、线性还是非线性等等,也可以帮我们发现异常值的存在\n", 43 | "\n", 44 | "2. 表示\n", 45 | "\n", 46 | " X轴表示一个变量的值,Y轴表示另一个变量的值" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "id": "67f13465-b7ad-4bf7-9571-6f87be629d0c", 52 | "metadata": {}, 53 | "source": [ 54 | "#### (二)、折线图" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "id": "d5c727b6-460e-42e9-b708-3612d38dc3a8", 60 | "metadata": {}, 61 | "source": [ 62 | "1. 特点\n", 63 | "\n", 64 | " 用于展示连续间隔或时间跨度上数值的变化,从而展示趋势变化" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "id": "4b221e5f-c2b4-4cea-9f3c-f3e0b4ebf988", 70 | "metadata": {}, 71 | "source": [ 72 | "#### (三)、条形图" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "id": "08870a3d-e2b2-4577-ae86-eaba6be6678a", 78 | "metadata": {}, 79 | "source": [ 80 | "1. 特点\n", 81 | "\n", 82 | " 用来展示一个分类变量所对应的数值变量\n", 83 | "\n", 84 | "2. 条形图与直方图的不同\n", 85 | "\n", 86 | " - 直方图只针对一个数值变量,而条形图则是针对一个分类变量和一个数值变量\n", 87 | " \n", 88 | " - 直方图的各个条柱分隔除了不同的数字区间,而条形图的各个条柱分隔这不同的分类变量" 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "id": "3e6c907c-fccc-4261-baa9-e467175477ae", 94 | "metadata": {}, 95 | "source": [ 96 | "#### (四)、饼图" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "id": "cb3eeb1e-8407-480f-be41-b518938c8887", 102 | "metadata": {}, 103 | "source": [ 104 | "1. 特点\n", 105 | "\n", 106 | " 用来展示各个分类对应的数值之间的比例,可以直观了解不同类别在整体中的占比\n", 107 | "\n", 108 | "2. 表示\n", 109 | "\n", 110 | " 每个圆弧的长度/面积,代表每个分类所占的百分比,全圆则表示所有分类占比的总和,也就是100%" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "id": "05ad1789-6dd9-453f-9b4e-60e8e315089c", 116 | "metadata": {}, 117 | "source": [ 118 | "### 二、数据包含多个变量" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "id": "1feca603-b96c-4f27-863c-56f82d89e9a3", 124 | "metadata": {}, 125 | "source": [ 126 | "#### 通过添加颜色或尺寸,在图表上表示新的变量" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "id": "0cb02b52-16ee-42a1-8874-a965ff179c0f", 132 | "metadata": {}, 133 | "source": [ 134 | "##### (一)、散点图" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "id": "be72743a-05bd-4646-9e4c-d12dc554052d", 140 | "metadata": {}, 141 | "source": [ 142 | "1. 2个数值变量,1个分类变量\n", 143 | "\n", 144 | " 如果要引入新的分类变量,可以把点绘制成不同的颜色,让颜色表示不同分类\n", 145 | "\n", 146 | "2. 气泡图,3个数值变量\n", 147 | "\n", 148 | " 如果要引入新的数值变量,可以把点绘制成不同的大小,让面积去表示不同数值\n", 149 | "\n", 150 | "3. 气泡图,3个数值变量,1个分类变量\n", 151 | "\n", 152 | " 把气泡绘制成不同的颜色" 153 | ] 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "id": "fa67b117-a47b-438c-8765-aa8627fcb397", 158 | "metadata": {}, 159 | "source": [ 160 | "##### (二)、折线图" 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "id": "5de24e98-6d7f-4403-b0bf-b3e8b63b20ad", 166 | "metadata": {}, 167 | "source": [ 168 | "1. 2个数值变量,1个分类变量\n", 169 | "\n", 170 | " 可以绘制多条折线,不同颜色的折线代表不同分类" 171 | ] 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "id": "e75711fb-7cc0-4137-ba55-ba55223544fa", 176 | "metadata": {}, 177 | "source": [ 178 | "##### (三)、条形图" 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "id": "f527cb4e-8ca7-4d58-ac12-596e5417825a", 184 | "metadata": {}, 185 | "source": [ 186 | "1. 复式条形图,2个分类变量,1个数值变量\n", 187 | "\n", 188 | " 绘制多个条柱,不同颜色的条柱代表不同分类" 189 | ] 190 | }, 191 | { 192 | "cell_type": "markdown", 193 | "id": "4287a6e0-6ad0-4b77-875e-a4ed2d30d6b3", 194 | "metadata": {}, 195 | "source": [ 196 | "##### (四)、热力图" 197 | ] 198 | }, 199 | { 200 | "cell_type": "markdown", 201 | "id": "eda19b54-8c80-48be-9d48-7a5b1a180a63", 202 | "metadata": {}, 203 | "source": [ 204 | "2个分类变量,1个数值变量\n", 205 | "\n", 206 | "1. 特点\n", 207 | "\n", 208 | " 通过颜色来战士不同变量之间的数值差异\n", 209 | "\n", 210 | "2. 表示\n", 211 | "\n", 212 | " 横轴是分类变量,纵轴也是分类变量,每行或每列都表示一个分类种类,通过热力图里单元格的颜色或数值,表示数值变量数值的大小" 213 | ] 214 | }, 215 | { 216 | "cell_type": "markdown", 217 | "id": "48793a37-e957-4ec6-8654-be55dca1ba78", 218 | "metadata": {}, 219 | "source": [ 220 | "#### 通过把多个图放在一起,互相对比来挖掘信息" 221 | ] 222 | }, 223 | { 224 | "cell_type": "markdown", 225 | "id": "2994b881-34be-4f9d-a6d3-3eb196baa9a8", 226 | "metadata": {}, 227 | "source": [ 228 | "1. 通过把两个直方图叠在一张图上,可以直观的看出,它们集中位置的差异、分散程度的差异等等\n", 229 | "\n", 230 | "2. 通过把多个小提琴图并排放,课直观的比较它们的四分位距、密度概率等等" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "id": "b6a89d0e-6f53-47d8-9cf7-8d8ceb8a327f", 237 | "metadata": {}, 238 | "outputs": [], 239 | "source": [] 240 | } 241 | ], 242 | "metadata": { 243 | "kernelspec": { 244 | "display_name": "Python 3 (ipykernel)", 245 | "language": "python", 246 | "name": "python3" 247 | }, 248 | "language_info": { 249 | "codemirror_mode": { 250 | "name": "ipython", 251 | "version": 3 252 | }, 253 | "file_extension": ".py", 254 | "mimetype": "text/x-python", 255 | "name": "python", 256 | "nbconvert_exporter": "python", 257 | "pygments_lexer": "ipython3", 258 | "version": "3.11.1" 259 | } 260 | }, 261 | "nbformat": 4, 262 | "nbformat_minor": 5 263 | } 264 | -------------------------------------------------------------------------------- /DataAnalysis/7.81_hypothetical_test.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "8ba2c081-94e8-43d8-9d1d-095837b49eac", 6 | "metadata": {}, 7 | "source": [ 8 | "# **7.81_hypothetical_test**" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "747e72fa-5135-4ffb-9859-6200fcdc17b0", 14 | "metadata": {}, 15 | "source": [ 16 | "### 一、有关假设检验的一些概念" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "id": "d4cf7743-a280-45a1-ac96-33f20cf0cc82", 22 | "metadata": {}, 23 | "source": [ 24 | "1. 描述统计学与推断统计学\n", 25 | "\n", 26 | "- 描述统计学:对数据进行描述和总结\n", 27 | "\n", 28 | "- 推断统计学: 通过样本做出关于总体的推断和预测\n", 29 | "\n", 30 | "2. 对象和整体\n", 31 | " \n", 32 | "- 对象:我们想要观测的具体事物叫做对象\n", 33 | "\n", 34 | "- 整体:我们想观测的整个对象的集合\n", 35 | "\n", 36 | "3. 样本和整体\n", 37 | "\n", 38 | "- 样本是我们收集数据的对象\n", 39 | "\n", 40 | "- 总体是我们想要得到结论的群体\n", 41 | "\n", 42 | "4. 统计量和参数\n", 43 | "\n", 44 | "- 统计量:描述样本特征的数值\n", 45 | "\n", 46 | "- 参数:描述总体特征的数值\n", 47 | "\n", 48 | "- 在统计推断中,我们会基于样本的统计量,对总体的参数进行推断,从而得到对总体的结论" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "id": "77abcd2b-bf0e-40e9-b7ee-087fa12e2bc8", 54 | "metadata": {}, 55 | "source": [ 56 | "### 二、独立双样本t检验步骤" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "id": "4217a0db-7f0d-4b5e-b6ed-63e710d50dee", 62 | "metadata": {}, 63 | "source": [ 64 | "1. 概念\n", 65 | "\n", 66 | "- 独立:说明样本来自不同的总体,彼此没有关联\n", 67 | "\n", 68 | "- 双样本:比较两个不同样本的数据\n", 69 | "\n", 70 | "- t检验:一种统计方法,用于确定样本的平均值之间是否存在统计显著的差异\n", 71 | "\n", 72 | "2. 前提条件\n", 73 | "\n", 74 | "- 随机抽样\n", 75 | "\n", 76 | "- 总体大致呈正态分布 " 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "id": "e8a0921a-387d-43f8-ae28-525f6ad0d5bd", 82 | "metadata": {}, 83 | "source": [ 84 | "#### (一)、建立假设" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "id": "2a33f8cb-288d-443b-9402-4535e834614c", 90 | "metadata": {}, 91 | "source": [ 92 | "1. 原假设(H0)\n", 93 | "\n", 94 | "2. 备择假设(H1)\n", 95 | "\n", 96 | "一般我们进行假设检验的时候,是想反驳原假设,以及支持备择假设" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "id": "95f4041d-1471-4c37-9b89-a4095e41156a", 102 | "metadata": {}, 103 | "source": [ 104 | "#### (二)、选择单尾或双尾" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "id": "38b4098a-5db8-4551-b562-b5be8dc54c45", 110 | "metadata": {}, 111 | "source": [ 112 | "1. 双尾只推断总体之间是否有差异,不在意是正差还是负差\n", 113 | "\n", 114 | "2. 单尾推断只看是否存在正差异,或者只看是否存在负差异" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "id": "46ec60fa-4a97-4ced-a1e5-a66357be0754", 120 | "metadata": {}, 121 | "source": [ 122 | "#### (三)、确定显著水平" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "id": "48c6d02e-eb27-4530-8eec-c96d871422ab", 128 | "metadata": {}, 129 | "source": [ 130 | "显著水平反应了检验的严格程度。\n", 131 | "\n", 132 | "样本抽样存在随机性,检验结果没有可能保证100%符合现实,只能通过显著水平来调整严格程度\n", 133 | "\n", 134 | "常见的双尾检验显著水平是0.05,也就是说如果检验结果是拒绝原假设,原假设实际为真的概率是5%;再换句话说,如果检验结果是拒绝原假设,结论95%概率是对的。\n", 135 | "\n", 136 | "常见的单尾检验显著水平是0.025。\n", 137 | "\n", 138 | "**之所以单尾检验显著水平定为双尾检验显著水平的一半,可以想象两样本数据的正态分布图,一样本的平均值落在另一样本的一边区间中,说明差异显著性,说明观察正差异或负差异,说明是单尾假设检验;一样本的平均值落在另一样本的两边区间中,说明差异显著性,说明观察正差异和负差异,说明是双尾假设检验**" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "id": "b1d7705a-ada8-455b-b5cf-05a13f2f188a", 144 | "metadata": {}, 145 | "source": [ 146 | "#### (四)、计算t值" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "id": "25e93603-6c88-4e59-8e82-ef8fa02e055c", 152 | "metadata": {}, 153 | "source": [ 154 | "1. t值:表示两个样本之间均值差异的大小,越大说明差异约显著" 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "id": "b4b58533-4f12-48d9-90fe-49a21b67c408", 160 | "metadata": {}, 161 | "source": [ 162 | "#### (五)、计算自由度" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "id": "89ef8380-8c8b-45c3-a79d-c658800c2ecd", 168 | "metadata": {}, 169 | "source": [ 170 | "自由度 = 样本1的数量 + 样本2的数量 -2" 171 | ] 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "id": "122d9642-2f20-4a91-a2ed-f5bd9aeeff6b", 176 | "metadata": {}, 177 | "source": [ 178 | "#### (六)、查看t值临界值表" 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "id": "5256df88-576b-414b-bcde-4f9054bfad59", 184 | "metadata": {}, 185 | "source": [ 186 | "根据单双尾、自由度和显著水平,去查t值临界值表" 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "id": "cd171f86-90a6-45f2-b2f6-97f59d74f7b3", 192 | "metadata": {}, 193 | "source": [ 194 | "#### (七)、比较临界值和t值" 195 | ] 196 | }, 197 | { 198 | "cell_type": "markdown", 199 | "id": "3fce23de-3f54-43c9-94d5-07a58bae58d2", 200 | "metadata": {}, 201 | "source": [ 202 | "t值 >= 临界值,拒绝原假设,说明存在显著差异\n", 203 | "\n", 204 | "t值 < 临界值,接受原假设,说明不存在显著差异" 205 | ] 206 | }, 207 | { 208 | "cell_type": "markdown", 209 | "id": "1a6a7176-e3c0-44ed-beb0-724e13ec656e", 210 | "metadata": {}, 211 | "source": [ 212 | "### 三、Z检验" 213 | ] 214 | }, 215 | { 216 | "cell_type": "markdown", 217 | "id": "47ece0f8-0dd7-40c0-be0d-6841e8b0548e", 218 | "metadata": {}, 219 | "source": [ 220 | "1. 前提条件的区别\n", 221 | "\n", 222 | "- 增加了要求:总体方差已知/样本容量大于30\n", 223 | "\n", 224 | "2. 步骤的区别\n", 225 | "\n", 226 | "- 不需要计算自由度\n", 227 | "\n", 228 | "- z值计算公式,相比于t值计算公式,把样本方差换成了总体方差\n", 229 | "\n", 230 | "- 查看z值临界值表\n", 231 | "\n", 232 | "3. 应用场景\n", 233 | "\n", 234 | "Z检验适用于已知总体方差,或者样本的数量大于30的情况\n", 235 | "\n", 236 | "t检验适用于总体方差未知,且样本的数量小于等于30的情况。\n", 237 | "\n", 238 | "在实际应用中,t检验比Z检验更加常见\n", 239 | "\n", 240 | "4. 检验结果差异\n", 241 | "\n", 242 | "Z检验可以提供更高的准确性和敏感性" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": null, 248 | "id": "3320a27a-7a82-4f9a-8695-a9a4695d6c64", 249 | "metadata": {}, 250 | "outputs": [], 251 | "source": [] 252 | } 253 | ], 254 | "metadata": { 255 | "kernelspec": { 256 | "display_name": "Python 3 (ipykernel)", 257 | "language": "python", 258 | "name": "python3" 259 | }, 260 | "language_info": { 261 | "codemirror_mode": { 262 | "name": "ipython", 263 | "version": 3 264 | }, 265 | "file_extension": ".py", 266 | "mimetype": "text/x-python", 267 | "name": "python", 268 | "nbconvert_exporter": "python", 269 | "pygments_lexer": "ipython3", 270 | "version": "3.11.1" 271 | } 272 | }, 273 | "nbformat": 4, 274 | "nbformat_minor": 5 275 | } 276 | -------------------------------------------------------------------------------- /DataAnalysis/.ipynb_checkpoints/TERMINOLOGY-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "8e7fb728", 6 | "metadata": { 7 | "origin_pos": 0 8 | }, 9 | "source": [ 10 | "## 英汉术语对照\n", 11 | "\n", 12 | "鞍点,saddle point\n", 13 | "\n", 14 | "变换,transform\n", 15 | "\n", 16 | "编码器,encoder\n", 17 | "\n", 18 | "标签,label\n", 19 | "\n", 20 | "步幅,stride\n", 21 | "\n", 22 | "参数,parameter\n", 23 | "\n", 24 | "长短期记忆网络,long short-term memory (LSTM)\n", 25 | "\n", 26 | "超参数,hyperparameter\n", 27 | "\n", 28 | "层序softmax,hierarchical softmax\n", 29 | "\n", 30 | "查准率,precision\n", 31 | "\n", 32 | "成本,cost\n", 33 | "\n", 34 | "词表,vocabulary\n", 35 | "\n", 36 | "词嵌入,word embedding\n", 37 | "\n", 38 | "词向量,word vector\n", 39 | "\n", 40 | "词元,token\n", 41 | "\n", 42 | "词元分析器,tokenizer\n", 43 | "\n", 44 | "词元化,tokenize\n", 45 | "\n", 46 | "汇聚层,pooling layer\n", 47 | "\n", 48 | "稠密,dense\n", 49 | "\n", 50 | "大小,size\n", 51 | "\n", 52 | "导入,import\n", 53 | "\n", 54 | "轮,epoch\n", 55 | "\n", 56 | "暂退法,dropout\n", 57 | "\n", 58 | "动量法,momentum (method)\n", 59 | "\n", 60 | "独立同分布,independent and identically distributed (i.i.d.)\n", 61 | "\n", 62 | "端到端,end-to-end\n", 63 | "\n", 64 | "多层感知机,multilayer perceptron\n", 65 | "\n", 66 | "多头注意力,multi-head attention\n", 67 | "\n", 68 | "二元分类,binary classification\n", 69 | "\n", 70 | "二元,bigram\n", 71 | "\n", 72 | "子采样,subsample\n", 73 | "\n", 74 | "发散,diverge\n", 75 | "\n", 76 | "泛化,generalization\n", 77 | "\n", 78 | "泛化误差,generalization error\n", 79 | "\n", 80 | "方差,variance\n", 81 | "\n", 82 | "分类,classification\n", 83 | "\n", 84 | "分类器,classifier\n", 85 | "\n", 86 | "负采样,negative sampling\n", 87 | "\n", 88 | "感受野,receptive field\n", 89 | "\n", 90 | "格拉姆矩阵,Gram matrix\n", 91 | "\n", 92 | "共现,co-occurrence\n", 93 | "\n", 94 | "广播,broadcast\n", 95 | "\n", 96 | "规范化,normalization\n", 97 | "\n", 98 | "过拟合,overfitting\n", 99 | "\n", 100 | "核回归,kernel regression\n", 101 | "\n", 102 | "恒等映射,identity mapping\n", 103 | "\n", 104 | "假设,hypothesis\n", 105 | "\n", 106 | "基准,baseline\n", 107 | "\n", 108 | "激活函数,activation function\n", 109 | "\n", 110 | "解码器,decoder\n", 111 | "\n", 112 | "近似法,approximate method\n", 113 | "\n", 114 | "经验风险最小化,empirical risk minimization\n", 115 | "\n", 116 | "局部最小值,local minimum\n", 117 | "\n", 118 | "卷积核,convolutional kernel\n", 119 | "\n", 120 | "卷积神经网络,convolutional neural network\n", 121 | "\n", 122 | "决策边界,decision boundary\n", 123 | "\n", 124 | "均值,mean\n", 125 | "\n", 126 | "均方误差,mean squared error\n", 127 | "\n", 128 | "均匀采样,uniform sampling\n", 129 | "\n", 130 | "块,block\n", 131 | "\n", 132 | "困惑度,perplexity\n", 133 | "\n", 134 | "拉普拉斯平滑,Laplace smoothing\n", 135 | "\n", 136 | "连结,concatenate\n", 137 | "\n", 138 | "类,class\n", 139 | "\n", 140 | "交叉熵,cross-entropy\n", 141 | "\n", 142 | "连续词袋,continous bag-of-words (CBOW)\n", 143 | "\n", 144 | "零张量,zero tensor\n", 145 | "\n", 146 | "流水线,pipeline\n", 147 | "\n", 148 | "滤波器,filter\n", 149 | "\n", 150 | "门控循环单元,gated recurrent units (GRU)\n", 151 | "\n", 152 | "目标检测,object detection\n", 153 | "\n", 154 | "偏置,bias\n", 155 | "\n", 156 | "偏导数,partial derivative\n", 157 | "\n", 158 | "偏移量,offset\n", 159 | "\n", 160 | "批量,batch\n", 161 | "\n", 162 | "齐普夫定律,Zipf's law\n", 163 | "\n", 164 | "欠拟合,underfitting\n", 165 | "\n", 166 | "情感分析,sentiment analysis\n", 167 | "\n", 168 | "全连接层,fully-connected layer\n", 169 | "\n", 170 | "权重,weight\n", 171 | "\n", 172 | "三元,trigram\n", 173 | "\n", 174 | "上采样,upsample\n", 175 | "\n", 176 | "上下文变量,context variable\n", 177 | "\n", 178 | "上下文窗口,context window\n", 179 | "\n", 180 | "上下文词,context word\n", 181 | "\n", 182 | "上下文向量,context vector\n", 183 | "\n", 184 | "实例/示例,instance\n", 185 | "\n", 186 | "收敛,converge\n", 187 | "\n", 188 | "属性,property\n", 189 | "\n", 190 | "数值方法,numerical method\n", 191 | "\n", 192 | "数据集,dataset\n", 193 | "\n", 194 | "数据示例,data instance\n", 195 | "\n", 196 | "数据样例,data example\n", 197 | "\n", 198 | "顺序分区,sequential partitioning\n", 199 | "\n", 200 | "softmax回归,softmax regression\n", 201 | "\n", 202 | "随机采样,random sampling\n", 203 | "\n", 204 | "损失函数,loss function\n", 205 | "\n", 206 | "双向循环神经网络,bidirectional recurrent neural network\n", 207 | "\n", 208 | "特征,feature\n", 209 | "\n", 210 | "特征图,feature map\n", 211 | "\n", 212 | "特征值,eigenvalue\n", 213 | "\n", 214 | "梯度,gradient\n", 215 | "\n", 216 | "梯度裁剪,gradient clipping\n", 217 | "\n", 218 | "梯度消失,vanishing gradients\n", 219 | "\n", 220 | "填充,padding\n", 221 | "\n", 222 | "跳元模型,skip-gram model\n", 223 | "\n", 224 | "调参,tune hyperparameter\n", 225 | "\n", 226 | "停用词,stop words\n", 227 | "\n", 228 | "通道,channel\n", 229 | "\n", 230 | "凸优化,convex optimization\n", 231 | "\n", 232 | "图像,image\n", 233 | "\n", 234 | "未知词元,unknown token\n", 235 | "\n", 236 | "无偏估计,unbiased estimate\n", 237 | "\n", 238 | "误差,error\n", 239 | "\n", 240 | "小批量,minibatch\n", 241 | "\n", 242 | "小批量梯度,minibatch gradient\n", 243 | "\n", 244 | "线性模型,linear model\n", 245 | "\n", 246 | "线性回归,linear regression\n", 247 | "\n", 248 | "协同过滤,collaborative filtering\n", 249 | "\n", 250 | "学习率,learning rate\n", 251 | "\n", 252 | "训练误差,training error\n", 253 | "\n", 254 | "循环神经网络,recurrent neural network (RNN)\n", 255 | "\n", 256 | "样例,example\n", 257 | "\n", 258 | "一维梯度下降,gradient descent in one-dimensional space\n", 259 | "\n", 260 | "一元,unigram\n", 261 | "\n", 262 | "隐藏变量,hidden variable\n", 263 | "\n", 264 | "隐藏层,hidden layer\n", 265 | "\n", 266 | "优化器,optimizer\n", 267 | "\n", 268 | "语料库,corpus\n", 269 | "\n", 270 | "运算符,operator\n", 271 | "\n", 272 | "自注意力,self-attention\n", 273 | "\n", 274 | "真实值,ground truth\n", 275 | "\n", 276 | "指标,metric\n", 277 | "\n", 278 | "支持向量机,support vector machine\n", 279 | "\n", 280 | "注意力机制,attention mechanism\n", 281 | "\n", 282 | "注意力模型,attention model\n", 283 | "\n", 284 | "注意力提示,attention cue\n", 285 | "\n", 286 | "准确率/精度,accuracy\n" 287 | ] 288 | } 289 | ], 290 | "metadata": { 291 | "language_info": { 292 | "name": "python" 293 | }, 294 | "required_libs": [] 295 | }, 296 | "nbformat": 4, 297 | "nbformat_minor": 5 298 | } -------------------------------------------------------------------------------- /EvaluateAndCleanData/4.26_clean_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "6a11515c-659c-4607-86fc-879e0151b53f", 6 | "metadata": {}, 7 | "source": [ 8 | "# **4.26_clean_data**" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "c43efa44-32e6-4fb9-92f5-608767243d15", 14 | "metadata": {}, 15 | "source": [ 16 | "在评估之后,下一步是根据评估结果,对数据进行清洗。\n", 17 | "\n", 18 | "这节先会对处理方式进行一个大致的了解。" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "id": "610c4ef3-a31a-47be-bbc0-bc522544ba4b", 24 | "metadata": {}, 25 | "source": [ 26 | "### 一、清洗数据之前" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "id": "f7f0bc61-2e37-402c-add7-1d2cd7fe2388", 32 | "metadata": {}, 33 | "source": [ 34 | "在清洗数据之前,我们要先看看索引或列名是否有意义。\n", 35 | "\n", 36 | "如果索引或列名都是乱七八糟的,应该对它们进行重命名,或重新排序,以便我们理解数据。" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "id": "ccdfd9d1-e308-4ca3-99db-d2a048f53e9f", 42 | "metadata": {}, 43 | "source": [ 44 | "### 二、结构性问题" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "id": "f0410bea-2c45-4b1b-b344-520a77a13344", 50 | "metadata": {}, 51 | "source": [ 52 | "清洗数据,我们一般会先解决结构性问题,再处理内容性问题。" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "id": "a8047e72-180a-427d-8723-00ae8189080b", 58 | "metadata": {}, 59 | "source": [ 60 | "1. 整洁数据,根据埃德加科德的第三范式,包括以下三个特点:\n", 61 | "\n", 62 | " 1)每列是一个变量\n", 63 | "\n", 64 | " 2)每行是一个观察值\n", 65 | "\n", 66 | " 3)每个单元格是一个值\n", 67 | "\n", 68 | " 任何不符合以上三个特点的数据都是乱数据。" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "id": "5b3e96fd-1ed0-4e50-9dd3-797147aadcd2", 74 | "metadata": {}, 75 | "source": [ 76 | "#### (一)、每列是观察值,每行是变量" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "id": "3240f577-065d-42e3-934b-e6c69192034c", 82 | "metadata": {}, 83 | "source": [ 84 | "对行和列进行转置" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "id": "bd8c8a96-3061-4c1a-b140-4363ccd819b5", 90 | "metadata": {}, 91 | "source": [ 92 | "#### (二)、每列包含多个变量" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "id": "c8b2c680-5dbb-474e-ada6-a62e362ae5dd", 98 | "metadata": {}, 99 | "source": [ 100 | "1. 对列进行拆分,把多的变量分到其它列去\n", 101 | "\n", 102 | "2. 有的时候光拆分还不够,还要进行重塑,确保每列只包含一种变量\n", 103 | "\n", 104 | " 比如:许多列同时包含两个或多个变量的时候" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "id": "e45cf832-ab0f-4364-8d5c-924bf1e4a8c5", 110 | "metadata": {}, 111 | "source": [ 112 | "#### (三)、每行包含多个观察值" 113 | ] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "id": "b66ba578-c94a-458f-b38a-e6ca16808dc5", 118 | "metadata": {}, 119 | "source": [ 120 | "1. 对行进行拆分,让每个观察值为独立的一行\n", 121 | "\n", 122 | "2. 有的时候光拆分还不够,还要进行重塑,确保每列只包含一种观察值\n", 123 | "\n", 124 | " 比如:许多行同时包含两个或多个观察值的时候" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "id": "b2f51b9c-0222-4490-8271-0b5f89143fdd", 130 | "metadata": {}, 131 | "source": [ 132 | "很多时候,清理前的数据是宽数据,清理后的数据是长数据。\n", 133 | "\n", 134 | "我们清理的目的,是为了后续能更高效地用程序处理数据,而不是更方便地让人类理解,所以清理前的宽数据更直观易懂也是正常的。" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "id": "2faff3cd-21d0-436c-bf73-830991d57298", 140 | "metadata": {}, 141 | "source": [ 142 | "### 三、内容性问题" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "id": "16025cea-4f0b-4fc9-ad14-8fc3876acb1f", 148 | "metadata": {}, 149 | "source": [ 150 | "在确保结构不存在问题后,我们再去深入到内容,处理脏数据。" 151 | ] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "id": "7d6a160b-f68e-43f4-bde3-d641c9f5766a", 156 | "metadata": {}, 157 | "source": [ 158 | "#### (一)、缺失数据" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "id": "2ec5d25e-05e2-41ac-92aa-922675262a0f", 164 | "metadata": {}, 165 | "source": [ 166 | "针对缺失数据,处理方式需要具体情况具体分析。\n", 167 | "\n", 168 | "1. 如果恰好知道空缺值的实际值,可以更新表格数据,人工把那个值填进去。\n", 169 | "\n", 170 | "2. 如果我们不知道空缺值的实际值,而缺失值并不影响此次分析,最直接的办法是不处理缺失值。\n", 171 | " \n", 172 | " Pandas在计算的时候,会自动忽略缺失值,所以很多时候放着不管不会造成什么问题。\n", 173 | "\n", 174 | "3. 如果是关键变量缺失,我们可以把变量为空缺的行删掉,只留下对分析结果有意义的数据。 \n", 175 | "\n", 176 | "4. 如果是关键变量缺失,我们也可以用填充值的方式去处理,比如说把平均数、中位数、众数等填充进去,来代替空缺值 " 177 | ] 178 | }, 179 | { 180 | "cell_type": "markdown", 181 | "id": "ff438dea-8adf-46a4-97c3-7293c796be80", 182 | "metadata": {}, 183 | "source": [ 184 | "#### (二)、重复数据" 185 | ] 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "id": "469e5a08-1a1c-4fd5-9d63-9a30318e7b5b", 190 | "metadata": {}, 191 | "source": [ 192 | "针对重复数据,我们的处理方式就很简单了。\n", 193 | "\n", 194 | "找到后删除即可,不删除的话,重复数据可能影响分析结论" 195 | ] 196 | }, 197 | { 198 | "cell_type": "markdown", 199 | "id": "85de65b0-04f4-45a7-ae6d-a7fb8595c782", 200 | "metadata": {}, 201 | "source": [ 202 | "#### (三)、不一致数据" 203 | ] 204 | }, 205 | { 206 | "cell_type": "markdown", 207 | "id": "7d3a928b-2231-4bf4-8a52-4d4eb3076fe8", 208 | "metadata": {}, 209 | "source": [ 210 | "针对不一致数据,我们的目标是对它们进行统一。针对同一含义,只保留一种表达方式,把其余的都进行替换" 211 | ] 212 | }, 213 | { 214 | "cell_type": "markdown", 215 | "id": "18338f98-0c6a-4f29-b5a9-574bd7d9f068", 216 | "metadata": {}, 217 | "source": [ 218 | "#### (四)、无效/错误数据" 219 | ] 220 | }, 221 | { 222 | "cell_type": "markdown", 223 | "id": "e01a5473-4c8e-443c-934c-b46654158bce", 224 | "metadata": {}, 225 | "source": [ 226 | "针对无效/错误数据,也有不同的清洗途径。\n", 227 | "\n", 228 | "比如删除/替换,否则留下无效/错误数据,也可能影响分析结论。比如说一个负数的身高记录值,可以严重拉低平均值的分析。\n", 229 | "\n", 230 | "1. 把那条记录值进行删除。\n", 231 | " \n", 232 | " 因为Pandas会自动忽略空缺值,所以NaN值反而不影响平均值计算\n", 233 | "\n", 234 | "2. 替换成其他值。比如说平均数。 " 235 | ] 236 | }, 237 | { 238 | "cell_type": "markdown", 239 | "id": "7246e2cd-06b8-48f5-bf16-4d3f23ea4da7", 240 | "metadata": {}, 241 | "source": [ 242 | "### 四、其他问题" 243 | ] 244 | }, 245 | { 246 | "cell_type": "markdown", 247 | "id": "1ca2e2b4-1050-4f43-aed1-0e5d03e6aae8", 248 | "metadata": {}, 249 | "source": [ 250 | "除了数据本身的问题以外,我们清理数据时,有时候还要针对编程语言或库,做一些其他的处理,包括对数据类型进行转换。\n", 251 | "\n", 252 | "比如:把手机号从数字类型转换成字符串类型;把'是'和'否'转换成布尔值True和False,能让我们之后针对这个变量的分析更加方便,包括能更简洁地进行逻辑判断" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": null, 258 | "id": "359afe8c-3a47-4831-ab89-d104130b8637", 259 | "metadata": {}, 260 | "outputs": [], 261 | "source": [] 262 | } 263 | ], 264 | "metadata": { 265 | "kernelspec": { 266 | "display_name": "Python 3 (ipykernel)", 267 | "language": "python", 268 | "name": "python3" 269 | }, 270 | "language_info": { 271 | "codemirror_mode": { 272 | "name": "ipython", 273 | "version": 3 274 | }, 275 | "file_extension": ".py", 276 | "mimetype": "text/x-python", 277 | "name": "python", 278 | "nbconvert_exporter": "python", 279 | "pygments_lexer": "ipython3", 280 | "version": "3.11.1" 281 | } 282 | }, 283 | "nbformat": 4, 284 | "nbformat_minor": 5 285 | } 286 | -------------------------------------------------------------------------------- /DataAnalysisPreparation/2.7_numpy_array_extended.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "725c8757-87ac-48f7-a351-ab01c6d1577f", 6 | "metadata": {}, 7 | "source": [ 8 | "# **2.7_更多数组**" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "8a66b7c0-17e0-4e37-b361-ed9035bd4a58", 14 | "metadata": {}, 15 | "source": [ 16 | "### 一、针对NumPy数组的常用操作" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "id": "7a739918-738d-4920-95e3-7bc0a30ad213", 22 | "metadata": {}, 23 | "source": [ 24 | "#### (一)、用concatenate函数连接数组" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "id": "500727eb-931f-4f9f-8763-c26cc26b00be", 30 | "metadata": {}, 31 | "source": [ 32 | "1. concatenate函数接收的参数是列表,所以可以把两个数组,用中括号包围起来,作为一个列表传进去\n", 33 | "\n", 34 | "2. concatenate函数输出的结果,是两个数组里所有元素拼接起来后组成的新数组" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 2, 40 | "id": "fcb7196d-ff20-4d85-8541-41d6ef0b6475", 41 | "metadata": {}, 42 | "outputs": [ 43 | { 44 | "data": { 45 | "text/plain": [ 46 | "array([ 5., 17., 3., 26., 31., 0., 0.])" 47 | ] 48 | }, 49 | "execution_count": 2, 50 | "metadata": {}, 51 | "output_type": "execute_result" 52 | } 53 | ], 54 | "source": [ 55 | "import numpy as np\n", 56 | "\n", 57 | "arr1 = np.array([5, 17, 3, 26, 31])\n", 58 | "arr2 = np.zeros(2)\n", 59 | "np.concatenate([arr1, arr2])" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "id": "5025e681-4f28-4367-95e6-d5f3b453f425", 65 | "metadata": {}, 66 | "source": [ 67 | "**可以看到数组里的数字后面都有个小数点,这是因为zeroes方法产生的是浮点数类型的数组,然后数组里数据类型又必须统一,所以拼接后的结果也是浮点数数组**" 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "id": "c5f623f3-1e47-4974-96d5-4e68ba169ca8", 73 | "metadata": {}, 74 | "source": [ 75 | "3. 由于传入列表的长度是不限的,所以也可以一次性拼接多个数组\n", 76 | " ```python\n", 77 | " np.concatenate([arr1, arr2, arr3])\n", 78 | " ```" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "id": "a0e02b9c-8770-429d-88f6-9418c012df18", 84 | "metadata": {}, 85 | "source": [ 86 | "#### (二)、对内容进行排序" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "id": "d42b5c83-e427-4971-8d5b-d48d0be22018", 92 | "metadata": {}, 93 | "source": [ 94 | "1. 针对列表来说\n", 95 | " \n", 96 | " 1)sorted函数\n", 97 | " \n", 98 | " 传入列表后,就会返回一个新的排序好的列表\n", 99 | " \n", 100 | " **只返回排序好的新列表,不改变原始列表**\n", 101 | "\n", 102 | " **也可以传入Series作为参数**\n", 103 | " \n", 104 | " 2)sort方法\n", 105 | " \n", 106 | " 调用后,该列表里面的元素就都会被排序好\n", 107 | " \n", 108 | " **什么都不返回,但原始列表会被排序好**" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 8, 114 | "id": "45905294-b9d7-4d41-a05b-dc68fc92e61c", 115 | "metadata": {}, 116 | "outputs": [ 117 | { 118 | "name": "stdout", 119 | "output_type": "stream", 120 | "text": [ 121 | "[3, 5, 17, 26, 31]\n", 122 | "[5, 17, 3, 26, 31]\n", 123 | "None\n", 124 | "[3, 5, 17, 26, 31]\n" 125 | ] 126 | } 127 | ], 128 | "source": [ 129 | "list1 = [5, 17, 3, 26, 31]\n", 130 | "sorted_list1 = sorted(list1)\n", 131 | "print(sorted_list1)\n", 132 | "print(list1)\n", 133 | "\n", 134 | "print(list1.sort())\n", 135 | "print(list1)" 136 | ] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "id": "961dfd6c-9716-4f5c-aff5-55c5389cd4f2", 141 | "metadata": {}, 142 | "source": [ 143 | "2. 针对NumPy数组来说\n", 144 | "\n", 145 | " 1)NumPy的sort函数\n", 146 | "\n", 147 | " np.sort会返回排序好的新的数组,但是传入的原始数组不会被改变\n", 148 | "\n", 149 | " **也可以传入Series作为参数**\n", 150 | "\n", 151 | " 2)NumPy的sort方法\n", 152 | "\n", 153 | " 而数组的sort方法,会直接在该数组上进行改动,把元素排序好\n", 154 | " \n", 155 | " **你可以根据是否要直接更改数组,决定使用**" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 9, 161 | "id": "a4375284-c61b-43c7-a8df-7bbfcd45c67b", 162 | "metadata": {}, 163 | "outputs": [ 164 | { 165 | "name": "stdout", 166 | "output_type": "stream", 167 | "text": [ 168 | "[ 3 5 17 26 31]\n", 169 | "[ 5 17 3 26 31]\n", 170 | "None\n", 171 | "[ 3 5 17 26 31]\n" 172 | ] 173 | } 174 | ], 175 | "source": [ 176 | "arr1 = np.array([5, 17, 3, 26, 31])\n", 177 | "print(np.sort(arr1))\n", 178 | "print(arr1)\n", 179 | "\n", 180 | "print(arr1.sort())\n", 181 | "print(arr1)" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "id": "79e2e797-5558-4309-bc81-47bdfbe6df43", 187 | "metadata": {}, 188 | "source": [ 189 | "#### (三)、用索引获得元素" 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "id": "426ed270-02fc-44c4-a95a-d8d40b91673a", 195 | "metadata": {}, 196 | "source": [ 197 | "1. 获得某个元素\n", 198 | "\n", 199 | " 1)正着数\n", 200 | "\n", 201 | " 第一个元素的索引为0,后面依次+1\n", 202 | "\n", 203 | " 2)倒着数\n", 204 | "\n", 205 | " 最后一个元素的索引是-1,倒数第二个的索引是-2,以此类推\n", 206 | "\n", 207 | "2. 获得某范围的多个元素\n", 208 | "\n", 209 | " 切片会返回开头索引到结束索引前一个的所有元素" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": 11, 215 | "id": "9e14c69f-95b3-411c-aad4-98a2093212e1", 216 | "metadata": {}, 217 | "outputs": [ 218 | { 219 | "data": { 220 | "text/plain": [ 221 | "array([ 5, 17, 26])" 222 | ] 223 | }, 224 | "execution_count": 11, 225 | "metadata": {}, 226 | "output_type": "execute_result" 227 | } 228 | ], 229 | "source": [ 230 | "arr1[1: 4]" 231 | ] 232 | }, 233 | { 234 | "cell_type": "markdown", 235 | "id": "03692b01-987e-4538-b654-281ab8600a5d", 236 | "metadata": {}, 237 | "source": [ 238 | "#### (四)、数组和数组进行运算" 239 | ] 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "id": "068519f7-ce69-4b74-aba9-ec83a822c0c0", 244 | "metadata": {}, 245 | "source": [ 246 | "**NumPy数组的强项之一是运算**\n", 247 | "\n", 248 | "1. 加减乘除\n", 249 | "\n", 250 | " 1)数组与数组之间\n", 251 | "\n", 252 | " 如果把形状相同的两个一维数组进行运算,会返回一个相同位置元素加减乘除后得到的数组。而列表做不到这么便捷了" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": 15, 258 | "id": "b2b93a3b-273e-476e-a4c7-aa95a15530e4", 259 | "metadata": {}, 260 | "outputs": [ 261 | { 262 | "name": "stdout", 263 | "output_type": "stream", 264 | "text": [ 265 | "[ 8 9 10 11 12]\n", 266 | "[4 5 6 7 8]\n", 267 | "[12 14 16 18 20]\n", 268 | "[3. 3.5 4. 4.5 5. ]\n" 269 | ] 270 | } 271 | ], 272 | "source": [ 273 | "arr4 = np.array([6, 7, 8, 9, 10])\n", 274 | "arr5 = np.array([2, 2, 2, 2, 2])\n", 275 | "print(arr4 + arr5)\n", 276 | "print(arr4 - arr5)\n", 277 | "print(arr4 * arr5)\n", 278 | "print(arr4 / arr5)" 279 | ] 280 | }, 281 | { 282 | "cell_type": "markdown", 283 | "id": "c6e418a2-6589-42d6-b059-29ecac1c2c93", 284 | "metadata": {}, 285 | "source": [ 286 | " 2)数组和单个数字之间\n", 287 | "\n", 288 | " 如果学过线性代数,可以看成是向量和标量之间的运算" 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": 29, 294 | "id": "94381964-d1e1-45e0-9abb-41fb410af739", 295 | "metadata": {}, 296 | "outputs": [ 297 | { 298 | "data": { 299 | "text/plain": [ 300 | "array([18, 21, 24, 27, 30])" 301 | ] 302 | }, 303 | "execution_count": 29, 304 | "metadata": {}, 305 | "output_type": "execute_result" 306 | } 307 | ], 308 | "source": [ 309 | "arr4 * 3" 310 | ] 311 | }, 312 | { 313 | "cell_type": "markdown", 314 | "id": "3359c069-2c85-4df5-9008-18a6522ad45e", 315 | "metadata": {}, 316 | "source": [ 317 | " 这个例子里的乘3会运用在数组的每一个数字上,这种操作机制,叫做广播机制" 318 | ] 319 | }, 320 | { 321 | "cell_type": "markdown", 322 | "id": "255ab2e0-cf54-43b0-ba6b-fb7c90c56e69", 323 | "metadata": {}, 324 | "source": [ 325 | " 3)聚合运算\n", 326 | "\n", 327 | " a.聚合运算是指,通过一组值,来得到一个值\n", 328 | "\n", 329 | " b.包括max求最大值,min求最小值,sum求和,mean求平均值等待\n", 330 | "\n", 331 | " c.数组.操作名(),返回相应的运算结果" 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "execution_count": 17, 337 | "id": "9591a5bd-d9eb-4994-95a2-6de3a693fc69", 338 | "metadata": {}, 339 | "outputs": [ 340 | { 341 | "name": "stdout", 342 | "output_type": "stream", 343 | "text": [ 344 | "10\n", 345 | "6\n", 346 | "40\n", 347 | "8.0\n" 348 | ] 349 | } 350 | ], 351 | "source": [ 352 | "print(arr4.max())\n", 353 | "print(arr4.min())\n", 354 | "print(arr4.sum())\n", 355 | "print(arr4.mean())" 356 | ] 357 | }, 358 | { 359 | "cell_type": "markdown", 360 | "id": "2b90b0e9-fbce-40ba-9217-82570da81f2b", 361 | "metadata": {}, 362 | "source": [ 363 | "#### (五)、根据条件筛选数组元素" 364 | ] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "execution_count": 27, 369 | "id": "afe054fb-2bb4-48f6-9ee0-0c75a739c23e", 370 | "metadata": {}, 371 | "outputs": [ 372 | { 373 | "data": { 374 | "text/plain": [ 375 | "array([False, False, True, True, True, True])" 376 | ] 377 | }, 378 | "execution_count": 27, 379 | "metadata": {}, 380 | "output_type": "execute_result" 381 | } 382 | ], 383 | "source": [ 384 | "arr = np.array([-22, 3, 65, 9, 11, 7])\n", 385 | "arr > 6" 386 | ] 387 | }, 388 | { 389 | "cell_type": "markdown", 390 | "id": "195d3b0c-4e38-4133-a1ea-870c18ef9bee", 391 | "metadata": {}, 392 | "source": [ 393 | "1. \n", 394 | "\n", 395 | "一个数组和一个数字之间的操作,根据广播机制,> 6会被运用到每一个数字上,那产生的结果就是由True和False组成的布尔值数组,每个布尔值都代表了这个数字是否大于6" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": 28, 401 | "id": "b214b2f6-630f-4296-a002-6a73974565c6", 402 | "metadata": {}, 403 | "outputs": [ 404 | { 405 | "data": { 406 | "text/plain": [ 407 | "array([65, 9, 11, 7])" 408 | ] 409 | }, 410 | "execution_count": 28, 411 | "metadata": {}, 412 | "output_type": "execute_result" 413 | } 414 | ], 415 | "source": [ 416 | "arr[arr > 6]" 417 | ] 418 | }, 419 | { 420 | "cell_type": "markdown", 421 | "id": "98b029c5-563c-4e27-a4d8-068ddab1c659", 422 | "metadata": {}, 423 | "source": [ 424 | "布尔值数组可以用来对形状相同的数组进行索引,在方括号里放上这个布尔值数组,相应位置为True的元素就会被筛选出来,那作为结果的数组里就会有那个元素" 425 | ] 426 | }, 427 | { 428 | "cell_type": "markdown", 429 | "id": "57a3b7b2-30ac-4ad7-b9d3-d4011603fc4b", 430 | "metadata": {}, 431 | "source": [ 432 | "2. 结合逻辑运算,让筛选逻辑更加复杂\n", 433 | "\n", 434 | "与:Python里用and,在数组上用&\n", 435 | "\n", 436 | "或:Python里用or,在数组上用|\n", 437 | "\n", 438 | "非:Python里用not,在数组上用~" 439 | ] 440 | }, 441 | { 442 | "cell_type": "code", 443 | "execution_count": 22, 444 | "id": "a1e0005b-6d82-401c-8ef8-75473db12dad", 445 | "metadata": {}, 446 | "outputs": [ 447 | { 448 | "data": { 449 | "text/plain": [ 450 | "array([9, 7])" 451 | ] 452 | }, 453 | "execution_count": 22, 454 | "metadata": {}, 455 | "output_type": "execute_result" 456 | } 457 | ], 458 | "source": [ 459 | "arr[(arr > 6) & (arr < 10)]" 460 | ] 461 | }, 462 | { 463 | "cell_type": "markdown", 464 | "id": "845ad4b9-0c77-47e5-a027-a3a5729bd140", 465 | "metadata": {}, 466 | "source": [ 467 | "比如要筛选出所有大于6且小于10的数字,就把 > 6和 < 10这两个条件用括号括住,表示这两个条件要先于&计算" 468 | ] 469 | }, 470 | { 471 | "cell_type": "markdown", 472 | "id": "2c027013-63a4-475f-99dc-6069fba4082f", 473 | "metadata": {}, 474 | "source": [ 475 | "### 二、其他" 476 | ] 477 | }, 478 | { 479 | "cell_type": "markdown", 480 | "id": "82b0407e-c91b-4c27-80fb-9ed308bb5885", 481 | "metadata": {}, 482 | "source": [ 483 | "1. np.nan表示缺失值,将np.nan赋值给其他变量,可以使变量为NaN值" 484 | ] 485 | }, 486 | { 487 | "cell_type": "code", 488 | "execution_count": 5, 489 | "id": "ff0e27f4-9d87-4f5e-90ff-dfd6c1d89eb2", 490 | "metadata": {}, 491 | "outputs": [ 492 | { 493 | "data": { 494 | "text/plain": [ 495 | "nan" 496 | ] 497 | }, 498 | "execution_count": 5, 499 | "metadata": {}, 500 | "output_type": "execute_result" 501 | } 502 | ], 503 | "source": [ 504 | "np.nan" 505 | ] 506 | }, 507 | { 508 | "cell_type": "code", 509 | "execution_count": null, 510 | "id": "b8d8902c-81d6-4244-a442-044bf927b5b8", 511 | "metadata": {}, 512 | "outputs": [], 513 | "source": [] 514 | } 515 | ], 516 | "metadata": { 517 | "kernelspec": { 518 | "display_name": "Python 3 (ipykernel)", 519 | "language": "python", 520 | "name": "python3" 521 | }, 522 | "language_info": { 523 | "codemirror_mode": { 524 | "name": "ipython", 525 | "version": 3 526 | }, 527 | "file_extension": ".py", 528 | "mimetype": "text/x-python", 529 | "name": "python", 530 | "nbconvert_exporter": "python", 531 | "pygments_lexer": "ipython3", 532 | "version": "3.11.1" 533 | } 534 | }, 535 | "nbformat": 4, 536 | "nbformat_minor": 5 537 | } 538 | -------------------------------------------------------------------------------- /DataVisualization/penguins.csv: -------------------------------------------------------------------------------- 1 | species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex 2 | Adelie,Torgersen,39.1,18.7,181,3750,MALE 3 | Adelie,Torgersen,39.5,17.4,186,3800,FEMALE 4 | Adelie,Torgersen,40.3,18,195,3250,FEMALE 5 | Adelie,Torgersen,NA,NA,NA,NA,NA 6 | Adelie,Torgersen,36.7,19.3,193,3450,FEMALE 7 | Adelie,Torgersen,39.3,20.6,190,3650,MALE 8 | Adelie,Torgersen,38.9,17.8,181,3625,FEMALE 9 | Adelie,Torgersen,39.2,19.6,195,4675,MALE 10 | Adelie,Torgersen,34.1,18.1,193,3475,NA 11 | Adelie,Torgersen,42,20.2,190,4250,NA 12 | Adelie,Torgersen,37.8,17.1,186,3300,NA 13 | Adelie,Torgersen,37.8,17.3,180,3700,NA 14 | Adelie,Torgersen,41.1,17.6,182,3200,FEMALE 15 | Adelie,Torgersen,38.6,21.2,191,3800,MALE 16 | Adelie,Torgersen,34.6,21.1,198,4400,MALE 17 | Adelie,Torgersen,36.6,17.8,185,3700,FEMALE 18 | Adelie,Torgersen,38.7,19,195,3450,FEMALE 19 | Adelie,Torgersen,42.5,20.7,197,4500,MALE 20 | Adelie,Torgersen,34.4,18.4,184,3325,FEMALE 21 | Adelie,Torgersen,46,21.5,194,4200,MALE 22 | Adelie,Biscoe,37.8,18.3,174,3400,FEMALE 23 | Adelie,Biscoe,37.7,18.7,180,3600,MALE 24 | Adelie,Biscoe,35.9,19.2,189,3800,FEMALE 25 | Adelie,Biscoe,38.2,18.1,185,3950,MALE 26 | Adelie,Biscoe,38.8,17.2,180,3800,MALE 27 | Adelie,Biscoe,35.3,18.9,187,3800,FEMALE 28 | Adelie,Biscoe,40.6,18.6,183,3550,MALE 29 | Adelie,Biscoe,40.5,17.9,187,3200,FEMALE 30 | Adelie,Biscoe,37.9,18.6,172,3150,FEMALE 31 | Adelie,Biscoe,40.5,18.9,180,3950,MALE 32 | Adelie,Dream,39.5,16.7,178,3250,FEMALE 33 | Adelie,Dream,37.2,18.1,178,3900,MALE 34 | Adelie,Dream,39.5,17.8,188,3300,FEMALE 35 | Adelie,Dream,40.9,18.9,184,3900,MALE 36 | Adelie,Dream,36.4,17,195,3325,FEMALE 37 | Adelie,Dream,39.2,21.1,196,4150,MALE 38 | Adelie,Dream,38.8,20,190,3950,MALE 39 | Adelie,Dream,42.2,18.5,180,3550,FEMALE 40 | Adelie,Dream,37.6,19.3,181,3300,FEMALE 41 | Adelie,Dream,39.8,19.1,184,4650,MALE 42 | Adelie,Dream,36.5,18,182,3150,FEMALE 43 | Adelie,Dream,40.8,18.4,195,3900,MALE 44 | Adelie,Dream,36,18.5,186,3100,FEMALE 45 | Adelie,Dream,44.1,19.7,196,4400,MALE 46 | Adelie,Dream,37,16.9,185,3000,FEMALE 47 | Adelie,Dream,39.6,18.8,190,4600,MALE 48 | Adelie,Dream,41.1,19,182,3425,MALE 49 | Adelie,Dream,37.5,18.9,179,2975,NA 50 | Adelie,Dream,36,17.9,190,3450,FEMALE 51 | Adelie,Dream,42.3,21.2,191,4150,MALE 52 | Adelie,Biscoe,39.6,17.7,186,3500,FEMALE 53 | Adelie,Biscoe,40.1,18.9,188,4300,MALE 54 | Adelie,Biscoe,35,17.9,190,3450,FEMALE 55 | Adelie,Biscoe,42,19.5,200,4050,MALE 56 | Adelie,Biscoe,34.5,18.1,187,2900,FEMALE 57 | Adelie,Biscoe,41.4,18.6,191,3700,MALE 58 | Adelie,Biscoe,39,17.5,186,3550,FEMALE 59 | Adelie,Biscoe,40.6,18.8,193,3800,MALE 60 | Adelie,Biscoe,36.5,16.6,181,2850,FEMALE 61 | Adelie,Biscoe,37.6,19.1,194,3750,MALE 62 | Adelie,Biscoe,35.7,16.9,185,3150,FEMALE 63 | Adelie,Biscoe,41.3,21.1,195,4400,MALE 64 | Adelie,Biscoe,37.6,17,185,3600,FEMALE 65 | Adelie,Biscoe,41.1,18.2,192,4050,MALE 66 | Adelie,Biscoe,36.4,17.1,184,2850,FEMALE 67 | Adelie,Biscoe,41.6,18,192,3950,MALE 68 | Adelie,Biscoe,35.5,16.2,195,3350,FEMALE 69 | Adelie,Biscoe,41.1,19.1,188,4100,MALE 70 | Adelie,Torgersen,35.9,16.6,190,3050,FEMALE 71 | Adelie,Torgersen,41.8,19.4,198,4450,MALE 72 | Adelie,Torgersen,33.5,19,190,3600,FEMALE 73 | Adelie,Torgersen,39.7,18.4,190,3900,MALE 74 | Adelie,Torgersen,39.6,17.2,196,3550,FEMALE 75 | Adelie,Torgersen,45.8,18.9,197,4150,MALE 76 | Adelie,Torgersen,35.5,17.5,190,3700,FEMALE 77 | Adelie,Torgersen,42.8,18.5,195,4250,MALE 78 | Adelie,Torgersen,40.9,16.8,191,3700,FEMALE 79 | Adelie,Torgersen,37.2,19.4,184,3900,MALE 80 | Adelie,Torgersen,36.2,16.1,187,3550,FEMALE 81 | Adelie,Torgersen,42.1,19.1,195,4000,MALE 82 | Adelie,Torgersen,34.6,17.2,189,3200,FEMALE 83 | Adelie,Torgersen,42.9,17.6,196,4700,MALE 84 | Adelie,Torgersen,36.7,18.8,187,3800,FEMALE 85 | Adelie,Torgersen,35.1,19.4,193,4200,MALE 86 | Adelie,Dream,37.3,17.8,191,3350,FEMALE 87 | Adelie,Dream,41.3,20.3,194,3550,MALE 88 | Adelie,Dream,36.3,19.5,190,3800,MALE 89 | Adelie,Dream,36.9,18.6,189,3500,FEMALE 90 | Adelie,Dream,38.3,19.2,189,3950,MALE 91 | Adelie,Dream,38.9,18.8,190,3600,FEMALE 92 | Adelie,Dream,35.7,18,202,3550,FEMALE 93 | Adelie,Dream,41.1,18.1,205,4300,MALE 94 | Adelie,Dream,34,17.1,185,3400,FEMALE 95 | Adelie,Dream,39.6,18.1,186,4450,MALE 96 | Adelie,Dream,36.2,17.3,187,3300,FEMALE 97 | Adelie,Dream,40.8,18.9,208,4300,MALE 98 | Adelie,Dream,38.1,18.6,190,3700,FEMALE 99 | Adelie,Dream,40.3,18.5,196,4350,MALE 100 | Adelie,Dream,33.1,16.1,178,2900,FEMALE 101 | Adelie,Dream,43.2,18.5,192,4100,MALE 102 | Adelie,Biscoe,35,17.9,192,3725,FEMALE 103 | Adelie,Biscoe,41,20,203,4725,MALE 104 | Adelie,Biscoe,37.7,16,183,3075,FEMALE 105 | Adelie,Biscoe,37.8,20,190,4250,MALE 106 | Adelie,Biscoe,37.9,18.6,193,2925,FEMALE 107 | Adelie,Biscoe,39.7,18.9,184,3550,MALE 108 | Adelie,Biscoe,38.6,17.2,199,3750,FEMALE 109 | Adelie,Biscoe,38.2,20,190,3900,MALE 110 | Adelie,Biscoe,38.1,17,181,3175,FEMALE 111 | Adelie,Biscoe,43.2,19,197,4775,MALE 112 | Adelie,Biscoe,38.1,16.5,198,3825,FEMALE 113 | Adelie,Biscoe,45.6,20.3,191,4600,MALE 114 | Adelie,Biscoe,39.7,17.7,193,3200,FEMALE 115 | Adelie,Biscoe,42.2,19.5,197,4275,MALE 116 | Adelie,Biscoe,39.6,20.7,191,3900,FEMALE 117 | Adelie,Biscoe,42.7,18.3,196,4075,MALE 118 | Adelie,Torgersen,38.6,17,188,2900,FEMALE 119 | Adelie,Torgersen,37.3,20.5,199,3775,MALE 120 | Adelie,Torgersen,35.7,17,189,3350,FEMALE 121 | Adelie,Torgersen,41.1,18.6,189,3325,MALE 122 | Adelie,Torgersen,36.2,17.2,187,3150,FEMALE 123 | Adelie,Torgersen,37.7,19.8,198,3500,MALE 124 | Adelie,Torgersen,40.2,17,176,3450,FEMALE 125 | Adelie,Torgersen,41.4,18.5,202,3875,MALE 126 | Adelie,Torgersen,35.2,15.9,186,3050,FEMALE 127 | Adelie,Torgersen,40.6,19,199,4000,MALE 128 | Adelie,Torgersen,38.8,17.6,191,3275,FEMALE 129 | Adelie,Torgersen,41.5,18.3,195,4300,MALE 130 | Adelie,Torgersen,39,17.1,191,3050,FEMALE 131 | Adelie,Torgersen,44.1,18,210,4000,MALE 132 | Adelie,Torgersen,38.5,17.9,190,3325,FEMALE 133 | Adelie,Torgersen,43.1,19.2,197,3500,MALE 134 | Adelie,Dream,36.8,18.5,193,3500,FEMALE 135 | Adelie,Dream,37.5,18.5,199,4475,MALE 136 | Adelie,Dream,38.1,17.6,187,3425,FEMALE 137 | Adelie,Dream,41.1,17.5,190,3900,MALE 138 | Adelie,Dream,35.6,17.5,191,3175,FEMALE 139 | Adelie,Dream,40.2,20.1,200,3975,MALE 140 | Adelie,Dream,37,16.5,185,3400,FEMALE 141 | Adelie,Dream,39.7,17.9,193,4250,MALE 142 | Adelie,Dream,40.2,17.1,193,3400,FEMALE 143 | Adelie,Dream,40.6,17.2,187,3475,MALE 144 | Adelie,Dream,32.1,15.5,188,3050,FEMALE 145 | Adelie,Dream,40.7,17,190,3725,MALE 146 | Adelie,Dream,37.3,16.8,192,3000,FEMALE 147 | Adelie,Dream,39,18.7,185,3650,MALE 148 | Adelie,Dream,39.2,18.6,190,4250,MALE 149 | Adelie,Dream,36.6,18.4,184,3475,FEMALE 150 | Adelie,Dream,36,17.8,195,3450,FEMALE 151 | Adelie,Dream,37.8,18.1,193,3750,MALE 152 | Adelie,Dream,36,17.1,187,3700,FEMALE 153 | Adelie,Dream,41.5,18.5,201,4000,MALE 154 | Chinstrap,Dream,46.5,17.9,192,3500,FEMALE 155 | Chinstrap,Dream,50,19.5,196,3900,MALE 156 | Chinstrap,Dream,51.3,19.2,193,3650,MALE 157 | Chinstrap,Dream,45.4,18.7,188,3525,FEMALE 158 | Chinstrap,Dream,52.7,19.8,197,3725,MALE 159 | Chinstrap,Dream,45.2,17.8,198,3950,FEMALE 160 | Chinstrap,Dream,46.1,18.2,178,3250,FEMALE 161 | Chinstrap,Dream,51.3,18.2,197,3750,MALE 162 | Chinstrap,Dream,46,18.9,195,4150,FEMALE 163 | Chinstrap,Dream,51.3,19.9,198,3700,MALE 164 | Chinstrap,Dream,46.6,17.8,193,3800,FEMALE 165 | Chinstrap,Dream,51.7,20.3,194,3775,MALE 166 | Chinstrap,Dream,47,17.3,185,3700,FEMALE 167 | Chinstrap,Dream,52,18.1,201,4050,MALE 168 | Chinstrap,Dream,45.9,17.1,190,3575,FEMALE 169 | Chinstrap,Dream,50.5,19.6,201,4050,MALE 170 | Chinstrap,Dream,50.3,20,197,3300,MALE 171 | Chinstrap,Dream,58,17.8,181,3700,FEMALE 172 | Chinstrap,Dream,46.4,18.6,190,3450,FEMALE 173 | Chinstrap,Dream,49.2,18.2,195,4400,MALE 174 | Chinstrap,Dream,42.4,17.3,181,3600,FEMALE 175 | Chinstrap,Dream,48.5,17.5,191,3400,MALE 176 | Chinstrap,Dream,43.2,16.6,187,2900,FEMALE 177 | Chinstrap,Dream,50.6,19.4,193,3800,MALE 178 | Chinstrap,Dream,46.7,17.9,195,3300,FEMALE 179 | Chinstrap,Dream,52,19,197,4150,MALE 180 | Chinstrap,Dream,50.5,18.4,200,3400,FEMALE 181 | Chinstrap,Dream,49.5,19,200,3800,MALE 182 | Chinstrap,Dream,46.4,17.8,191,3700,FEMALE 183 | Chinstrap,Dream,52.8,20,205,4550,MALE 184 | Chinstrap,Dream,40.9,16.6,187,3200,FEMALE 185 | Chinstrap,Dream,54.2,20.8,201,4300,MALE 186 | Chinstrap,Dream,42.5,16.7,187,3350,FEMALE 187 | Chinstrap,Dream,51,18.8,203,4100,MALE 188 | Chinstrap,Dream,49.7,18.6,195,3600,MALE 189 | Chinstrap,Dream,47.5,16.8,199,3900,FEMALE 190 | Chinstrap,Dream,47.6,18.3,195,3850,FEMALE 191 | Chinstrap,Dream,52,20.7,210,4800,MALE 192 | Chinstrap,Dream,46.9,16.6,192,2700,FEMALE 193 | Chinstrap,Dream,53.5,19.9,205,4500,MALE 194 | Chinstrap,Dream,49,19.5,210,3950,MALE 195 | Chinstrap,Dream,46.2,17.5,187,3650,FEMALE 196 | Chinstrap,Dream,50.9,19.1,196,3550,MALE 197 | Chinstrap,Dream,45.5,17,196,3500,FEMALE 198 | Chinstrap,Dream,50.9,17.9,196,3675,FEMALE 199 | Chinstrap,Dream,50.8,18.5,201,4450,MALE 200 | Chinstrap,Dream,50.1,17.9,190,3400,FEMALE 201 | Chinstrap,Dream,49,19.6,212,4300,MALE 202 | Chinstrap,Dream,51.5,18.7,187,3250,MALE 203 | Chinstrap,Dream,49.8,17.3,198,3675,FEMALE 204 | Chinstrap,Dream,48.1,16.4,199,3325,FEMALE 205 | Chinstrap,Dream,51.4,19,201,3950,MALE 206 | Chinstrap,Dream,45.7,17.3,193,3600,FEMALE 207 | Chinstrap,Dream,50.7,19.7,203,4050,MALE 208 | Chinstrap,Dream,42.5,17.3,187,3350,FEMALE 209 | Chinstrap,Dream,52.2,18.8,197,3450,MALE 210 | Chinstrap,Dream,45.2,16.6,191,3250,FEMALE 211 | Chinstrap,Dream,49.3,19.9,203,4050,MALE 212 | Chinstrap,Dream,50.2,18.8,202,3800,MALE 213 | Chinstrap,Dream,45.6,19.4,194,3525,FEMALE 214 | Chinstrap,Dream,51.9,19.5,206,3950,MALE 215 | Chinstrap,Dream,46.8,16.5,189,3650,FEMALE 216 | Chinstrap,Dream,45.7,17,195,3650,FEMALE 217 | Chinstrap,Dream,55.8,19.8,207,4000,MALE 218 | Chinstrap,Dream,43.5,18.1,202,3400,FEMALE 219 | Chinstrap,Dream,49.6,18.2,193,3775,MALE 220 | Chinstrap,Dream,50.8,19,210,4100,MALE 221 | Chinstrap,Dream,50.2,18.7,198,3775,FEMALE 222 | Gentoo,Biscoe,46.1,13.2,211,4500,FEMALE 223 | Gentoo,Biscoe,50,16.3,230,5700,MALE 224 | Gentoo,Biscoe,48.7,14.1,210,4450,FEMALE 225 | Gentoo,Biscoe,50,15.2,218,5700,MALE 226 | Gentoo,Biscoe,47.6,14.5,215,5400,MALE 227 | Gentoo,Biscoe,46.5,13.5,210,4550,FEMALE 228 | Gentoo,Biscoe,45.4,14.6,211,4800,FEMALE 229 | Gentoo,Biscoe,46.7,15.3,219,5200,MALE 230 | Gentoo,Biscoe,43.3,13.4,209,4400,FEMALE 231 | Gentoo,Biscoe,46.8,15.4,215,5150,MALE 232 | Gentoo,Biscoe,40.9,13.7,214,4650,FEMALE 233 | Gentoo,Biscoe,49,16.1,216,5550,MALE 234 | Gentoo,Biscoe,45.5,13.7,214,4650,FEMALE 235 | Gentoo,Biscoe,48.4,14.6,213,5850,MALE 236 | Gentoo,Biscoe,45.8,14.6,210,4200,FEMALE 237 | Gentoo,Biscoe,49.3,15.7,217,5850,MALE 238 | Gentoo,Biscoe,42,13.5,210,4150,FEMALE 239 | Gentoo,Biscoe,49.2,15.2,221,6300,MALE 240 | Gentoo,Biscoe,46.2,14.5,209,4800,FEMALE 241 | Gentoo,Biscoe,48.7,15.1,222,5350,MALE 242 | Gentoo,Biscoe,50.2,14.3,218,5700,MALE 243 | Gentoo,Biscoe,45.1,14.5,215,5000,FEMALE 244 | Gentoo,Biscoe,46.5,14.5,213,4400,FEMALE 245 | Gentoo,Biscoe,46.3,15.8,215,5050,MALE 246 | Gentoo,Biscoe,42.9,13.1,215,5000,FEMALE 247 | Gentoo,Biscoe,46.1,15.1,215,5100,MALE 248 | Gentoo,Biscoe,44.5,14.3,216,4100,NA 249 | Gentoo,Biscoe,47.8,15,215,5650,MALE 250 | Gentoo,Biscoe,48.2,14.3,210,4600,FEMALE 251 | Gentoo,Biscoe,50,15.3,220,5550,MALE 252 | Gentoo,Biscoe,47.3,15.3,222,5250,MALE 253 | Gentoo,Biscoe,42.8,14.2,209,4700,FEMALE 254 | Gentoo,Biscoe,45.1,14.5,207,5050,FEMALE 255 | Gentoo,Biscoe,59.6,17,230,6050,MALE 256 | Gentoo,Biscoe,49.1,14.8,220,5150,FEMALE 257 | Gentoo,Biscoe,48.4,16.3,220,5400,MALE 258 | Gentoo,Biscoe,42.6,13.7,213,4950,FEMALE 259 | Gentoo,Biscoe,44.4,17.3,219,5250,MALE 260 | Gentoo,Biscoe,44,13.6,208,4350,FEMALE 261 | Gentoo,Biscoe,48.7,15.7,208,5350,MALE 262 | Gentoo,Biscoe,42.7,13.7,208,3950,FEMALE 263 | Gentoo,Biscoe,49.6,16,225,5700,MALE 264 | Gentoo,Biscoe,45.3,13.7,210,4300,FEMALE 265 | Gentoo,Biscoe,49.6,15,216,4750,MALE 266 | Gentoo,Biscoe,50.5,15.9,222,5550,MALE 267 | Gentoo,Biscoe,43.6,13.9,217,4900,FEMALE 268 | Gentoo,Biscoe,45.5,13.9,210,4200,FEMALE 269 | Gentoo,Biscoe,50.5,15.9,225,5400,MALE 270 | Gentoo,Biscoe,44.9,13.3,213,5100,FEMALE 271 | Gentoo,Biscoe,45.2,15.8,215,5300,MALE 272 | Gentoo,Biscoe,46.6,14.2,210,4850,FEMALE 273 | Gentoo,Biscoe,48.5,14.1,220,5300,MALE 274 | Gentoo,Biscoe,45.1,14.4,210,4400,FEMALE 275 | Gentoo,Biscoe,50.1,15,225,5000,MALE 276 | Gentoo,Biscoe,46.5,14.4,217,4900,FEMALE 277 | Gentoo,Biscoe,45,15.4,220,5050,MALE 278 | Gentoo,Biscoe,43.8,13.9,208,4300,FEMALE 279 | Gentoo,Biscoe,45.5,15,220,5000,MALE 280 | Gentoo,Biscoe,43.2,14.5,208,4450,FEMALE 281 | Gentoo,Biscoe,50.4,15.3,224,5550,MALE 282 | Gentoo,Biscoe,45.3,13.8,208,4200,FEMALE 283 | Gentoo,Biscoe,46.2,14.9,221,5300,MALE 284 | Gentoo,Biscoe,45.7,13.9,214,4400,FEMALE 285 | Gentoo,Biscoe,54.3,15.7,231,5650,MALE 286 | Gentoo,Biscoe,45.8,14.2,219,4700,FEMALE 287 | Gentoo,Biscoe,49.8,16.8,230,5700,MALE 288 | Gentoo,Biscoe,46.2,14.4,214,4650,NA 289 | Gentoo,Biscoe,49.5,16.2,229,5800,MALE 290 | Gentoo,Biscoe,43.5,14.2,220,4700,FEMALE 291 | Gentoo,Biscoe,50.7,15,223,5550,MALE 292 | Gentoo,Biscoe,47.7,15,216,4750,FEMALE 293 | Gentoo,Biscoe,46.4,15.6,221,5000,MALE 294 | Gentoo,Biscoe,48.2,15.6,221,5100,MALE 295 | Gentoo,Biscoe,46.5,14.8,217,5200,FEMALE 296 | Gentoo,Biscoe,46.4,15,216,4700,FEMALE 297 | Gentoo,Biscoe,48.6,16,230,5800,MALE 298 | Gentoo,Biscoe,47.5,14.2,209,4600,FEMALE 299 | Gentoo,Biscoe,51.1,16.3,220,6000,MALE 300 | Gentoo,Biscoe,45.2,13.8,215,4750,FEMALE 301 | Gentoo,Biscoe,45.2,16.4,223,5950,MALE 302 | Gentoo,Biscoe,49.1,14.5,212,4625,FEMALE 303 | Gentoo,Biscoe,52.5,15.6,221,5450,MALE 304 | Gentoo,Biscoe,47.4,14.6,212,4725,FEMALE 305 | Gentoo,Biscoe,50,15.9,224,5350,MALE 306 | Gentoo,Biscoe,44.9,13.8,212,4750,FEMALE 307 | Gentoo,Biscoe,50.8,17.3,228,5600,MALE 308 | Gentoo,Biscoe,43.4,14.4,218,4600,FEMALE 309 | Gentoo,Biscoe,51.3,14.2,218,5300,MALE 310 | Gentoo,Biscoe,47.5,14,212,4875,FEMALE 311 | Gentoo,Biscoe,52.1,17,230,5550,MALE 312 | Gentoo,Biscoe,47.5,15,218,4950,FEMALE 313 | Gentoo,Biscoe,52.2,17.1,228,5400,MALE 314 | Gentoo,Biscoe,45.5,14.5,212,4750,FEMALE 315 | Gentoo,Biscoe,49.5,16.1,224,5650,MALE 316 | Gentoo,Biscoe,44.5,14.7,214,4850,FEMALE 317 | Gentoo,Biscoe,50.8,15.7,226,5200,MALE 318 | Gentoo,Biscoe,49.4,15.8,216,4925,MALE 319 | Gentoo,Biscoe,46.9,14.6,222,4875,FEMALE 320 | Gentoo,Biscoe,48.4,14.4,203,4625,FEMALE 321 | Gentoo,Biscoe,51.1,16.5,225,5250,MALE 322 | Gentoo,Biscoe,48.5,15,219,4850,FEMALE 323 | Gentoo,Biscoe,55.9,17,228,5600,MALE 324 | Gentoo,Biscoe,47.2,15.5,215,4975,FEMALE 325 | Gentoo,Biscoe,49.1,15,228,5500,MALE 326 | Gentoo,Biscoe,47.3,13.8,216,4725,NA 327 | Gentoo,Biscoe,46.8,16.1,215,5500,MALE 328 | Gentoo,Biscoe,41.7,14.7,210,4700,FEMALE 329 | Gentoo,Biscoe,53.4,15.8,219,5500,MALE 330 | Gentoo,Biscoe,43.3,14,208,4575,FEMALE 331 | Gentoo,Biscoe,48.1,15.1,209,5500,MALE 332 | Gentoo,Biscoe,50.5,15.2,216,5000,FEMALE 333 | Gentoo,Biscoe,49.8,15.9,229,5950,MALE 334 | Gentoo,Biscoe,43.5,15.2,213,4650,FEMALE 335 | Gentoo,Biscoe,51.5,16.3,230,5500,MALE 336 | Gentoo,Biscoe,46.2,14.1,217,4375,FEMALE 337 | Gentoo,Biscoe,55.1,16,230,5850,MALE 338 | Gentoo,Biscoe,44.5,15.7,217,4875,. 339 | Gentoo,Biscoe,48.8,16.2,222,6000,MALE 340 | Gentoo,Biscoe,47.2,13.7,214,4925,FEMALE 341 | Gentoo,Biscoe,NA,NA,NA,NA,NA 342 | Gentoo,Biscoe,46.8,14.3,215,4850,FEMALE 343 | Gentoo,Biscoe,50.4,15.7,222,5750,MALE 344 | Gentoo,Biscoe,45.2,14.8,212,5200,FEMALE 345 | Gentoo,Biscoe,49.9,16.1,213,5400,MALE 346 | -------------------------------------------------------------------------------- /DataVisualization/practices/penguins/penguins.csv: -------------------------------------------------------------------------------- 1 | species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex 2 | Adelie,Torgersen,39.1,18.7,181,3750,MALE 3 | Adelie,Torgersen,39.5,17.4,186,3800,FEMALE 4 | Adelie,Torgersen,40.3,18,195,3250,FEMALE 5 | Adelie,Torgersen,NA,NA,NA,NA,NA 6 | Adelie,Torgersen,36.7,19.3,193,3450,FEMALE 7 | Adelie,Torgersen,39.3,20.6,190,3650,MALE 8 | Adelie,Torgersen,38.9,17.8,181,3625,FEMALE 9 | Adelie,Torgersen,39.2,19.6,195,4675,MALE 10 | Adelie,Torgersen,34.1,18.1,193,3475,NA 11 | Adelie,Torgersen,42,20.2,190,4250,NA 12 | Adelie,Torgersen,37.8,17.1,186,3300,NA 13 | Adelie,Torgersen,37.8,17.3,180,3700,NA 14 | Adelie,Torgersen,41.1,17.6,182,3200,FEMALE 15 | Adelie,Torgersen,38.6,21.2,191,3800,MALE 16 | Adelie,Torgersen,34.6,21.1,198,4400,MALE 17 | Adelie,Torgersen,36.6,17.8,185,3700,FEMALE 18 | Adelie,Torgersen,38.7,19,195,3450,FEMALE 19 | Adelie,Torgersen,42.5,20.7,197,4500,MALE 20 | Adelie,Torgersen,34.4,18.4,184,3325,FEMALE 21 | Adelie,Torgersen,46,21.5,194,4200,MALE 22 | Adelie,Biscoe,37.8,18.3,174,3400,FEMALE 23 | Adelie,Biscoe,37.7,18.7,180,3600,MALE 24 | Adelie,Biscoe,35.9,19.2,189,3800,FEMALE 25 | Adelie,Biscoe,38.2,18.1,185,3950,MALE 26 | Adelie,Biscoe,38.8,17.2,180,3800,MALE 27 | Adelie,Biscoe,35.3,18.9,187,3800,FEMALE 28 | Adelie,Biscoe,40.6,18.6,183,3550,MALE 29 | Adelie,Biscoe,40.5,17.9,187,3200,FEMALE 30 | Adelie,Biscoe,37.9,18.6,172,3150,FEMALE 31 | Adelie,Biscoe,40.5,18.9,180,3950,MALE 32 | Adelie,Dream,39.5,16.7,178,3250,FEMALE 33 | Adelie,Dream,37.2,18.1,178,3900,MALE 34 | Adelie,Dream,39.5,17.8,188,3300,FEMALE 35 | Adelie,Dream,40.9,18.9,184,3900,MALE 36 | Adelie,Dream,36.4,17,195,3325,FEMALE 37 | Adelie,Dream,39.2,21.1,196,4150,MALE 38 | Adelie,Dream,38.8,20,190,3950,MALE 39 | Adelie,Dream,42.2,18.5,180,3550,FEMALE 40 | Adelie,Dream,37.6,19.3,181,3300,FEMALE 41 | Adelie,Dream,39.8,19.1,184,4650,MALE 42 | Adelie,Dream,36.5,18,182,3150,FEMALE 43 | Adelie,Dream,40.8,18.4,195,3900,MALE 44 | Adelie,Dream,36,18.5,186,3100,FEMALE 45 | Adelie,Dream,44.1,19.7,196,4400,MALE 46 | Adelie,Dream,37,16.9,185,3000,FEMALE 47 | Adelie,Dream,39.6,18.8,190,4600,MALE 48 | Adelie,Dream,41.1,19,182,3425,MALE 49 | Adelie,Dream,37.5,18.9,179,2975,NA 50 | Adelie,Dream,36,17.9,190,3450,FEMALE 51 | Adelie,Dream,42.3,21.2,191,4150,MALE 52 | Adelie,Biscoe,39.6,17.7,186,3500,FEMALE 53 | Adelie,Biscoe,40.1,18.9,188,4300,MALE 54 | Adelie,Biscoe,35,17.9,190,3450,FEMALE 55 | Adelie,Biscoe,42,19.5,200,4050,MALE 56 | Adelie,Biscoe,34.5,18.1,187,2900,FEMALE 57 | Adelie,Biscoe,41.4,18.6,191,3700,MALE 58 | Adelie,Biscoe,39,17.5,186,3550,FEMALE 59 | Adelie,Biscoe,40.6,18.8,193,3800,MALE 60 | Adelie,Biscoe,36.5,16.6,181,2850,FEMALE 61 | Adelie,Biscoe,37.6,19.1,194,3750,MALE 62 | Adelie,Biscoe,35.7,16.9,185,3150,FEMALE 63 | Adelie,Biscoe,41.3,21.1,195,4400,MALE 64 | Adelie,Biscoe,37.6,17,185,3600,FEMALE 65 | Adelie,Biscoe,41.1,18.2,192,4050,MALE 66 | Adelie,Biscoe,36.4,17.1,184,2850,FEMALE 67 | Adelie,Biscoe,41.6,18,192,3950,MALE 68 | Adelie,Biscoe,35.5,16.2,195,3350,FEMALE 69 | Adelie,Biscoe,41.1,19.1,188,4100,MALE 70 | Adelie,Torgersen,35.9,16.6,190,3050,FEMALE 71 | Adelie,Torgersen,41.8,19.4,198,4450,MALE 72 | Adelie,Torgersen,33.5,19,190,3600,FEMALE 73 | Adelie,Torgersen,39.7,18.4,190,3900,MALE 74 | Adelie,Torgersen,39.6,17.2,196,3550,FEMALE 75 | Adelie,Torgersen,45.8,18.9,197,4150,MALE 76 | Adelie,Torgersen,35.5,17.5,190,3700,FEMALE 77 | Adelie,Torgersen,42.8,18.5,195,4250,MALE 78 | Adelie,Torgersen,40.9,16.8,191,3700,FEMALE 79 | Adelie,Torgersen,37.2,19.4,184,3900,MALE 80 | Adelie,Torgersen,36.2,16.1,187,3550,FEMALE 81 | Adelie,Torgersen,42.1,19.1,195,4000,MALE 82 | Adelie,Torgersen,34.6,17.2,189,3200,FEMALE 83 | Adelie,Torgersen,42.9,17.6,196,4700,MALE 84 | Adelie,Torgersen,36.7,18.8,187,3800,FEMALE 85 | Adelie,Torgersen,35.1,19.4,193,4200,MALE 86 | Adelie,Dream,37.3,17.8,191,3350,FEMALE 87 | Adelie,Dream,41.3,20.3,194,3550,MALE 88 | Adelie,Dream,36.3,19.5,190,3800,MALE 89 | Adelie,Dream,36.9,18.6,189,3500,FEMALE 90 | Adelie,Dream,38.3,19.2,189,3950,MALE 91 | Adelie,Dream,38.9,18.8,190,3600,FEMALE 92 | Adelie,Dream,35.7,18,202,3550,FEMALE 93 | Adelie,Dream,41.1,18.1,205,4300,MALE 94 | Adelie,Dream,34,17.1,185,3400,FEMALE 95 | Adelie,Dream,39.6,18.1,186,4450,MALE 96 | Adelie,Dream,36.2,17.3,187,3300,FEMALE 97 | Adelie,Dream,40.8,18.9,208,4300,MALE 98 | Adelie,Dream,38.1,18.6,190,3700,FEMALE 99 | Adelie,Dream,40.3,18.5,196,4350,MALE 100 | Adelie,Dream,33.1,16.1,178,2900,FEMALE 101 | Adelie,Dream,43.2,18.5,192,4100,MALE 102 | Adelie,Biscoe,35,17.9,192,3725,FEMALE 103 | Adelie,Biscoe,41,20,203,4725,MALE 104 | Adelie,Biscoe,37.7,16,183,3075,FEMALE 105 | Adelie,Biscoe,37.8,20,190,4250,MALE 106 | Adelie,Biscoe,37.9,18.6,193,2925,FEMALE 107 | Adelie,Biscoe,39.7,18.9,184,3550,MALE 108 | Adelie,Biscoe,38.6,17.2,199,3750,FEMALE 109 | Adelie,Biscoe,38.2,20,190,3900,MALE 110 | Adelie,Biscoe,38.1,17,181,3175,FEMALE 111 | Adelie,Biscoe,43.2,19,197,4775,MALE 112 | Adelie,Biscoe,38.1,16.5,198,3825,FEMALE 113 | Adelie,Biscoe,45.6,20.3,191,4600,MALE 114 | Adelie,Biscoe,39.7,17.7,193,3200,FEMALE 115 | Adelie,Biscoe,42.2,19.5,197,4275,MALE 116 | Adelie,Biscoe,39.6,20.7,191,3900,FEMALE 117 | Adelie,Biscoe,42.7,18.3,196,4075,MALE 118 | Adelie,Torgersen,38.6,17,188,2900,FEMALE 119 | Adelie,Torgersen,37.3,20.5,199,3775,MALE 120 | Adelie,Torgersen,35.7,17,189,3350,FEMALE 121 | Adelie,Torgersen,41.1,18.6,189,3325,MALE 122 | Adelie,Torgersen,36.2,17.2,187,3150,FEMALE 123 | Adelie,Torgersen,37.7,19.8,198,3500,MALE 124 | Adelie,Torgersen,40.2,17,176,3450,FEMALE 125 | Adelie,Torgersen,41.4,18.5,202,3875,MALE 126 | Adelie,Torgersen,35.2,15.9,186,3050,FEMALE 127 | Adelie,Torgersen,40.6,19,199,4000,MALE 128 | Adelie,Torgersen,38.8,17.6,191,3275,FEMALE 129 | Adelie,Torgersen,41.5,18.3,195,4300,MALE 130 | Adelie,Torgersen,39,17.1,191,3050,FEMALE 131 | Adelie,Torgersen,44.1,18,210,4000,MALE 132 | Adelie,Torgersen,38.5,17.9,190,3325,FEMALE 133 | Adelie,Torgersen,43.1,19.2,197,3500,MALE 134 | Adelie,Dream,36.8,18.5,193,3500,FEMALE 135 | Adelie,Dream,37.5,18.5,199,4475,MALE 136 | Adelie,Dream,38.1,17.6,187,3425,FEMALE 137 | Adelie,Dream,41.1,17.5,190,3900,MALE 138 | Adelie,Dream,35.6,17.5,191,3175,FEMALE 139 | Adelie,Dream,40.2,20.1,200,3975,MALE 140 | Adelie,Dream,37,16.5,185,3400,FEMALE 141 | Adelie,Dream,39.7,17.9,193,4250,MALE 142 | Adelie,Dream,40.2,17.1,193,3400,FEMALE 143 | Adelie,Dream,40.6,17.2,187,3475,MALE 144 | Adelie,Dream,32.1,15.5,188,3050,FEMALE 145 | Adelie,Dream,40.7,17,190,3725,MALE 146 | Adelie,Dream,37.3,16.8,192,3000,FEMALE 147 | Adelie,Dream,39,18.7,185,3650,MALE 148 | Adelie,Dream,39.2,18.6,190,4250,MALE 149 | Adelie,Dream,36.6,18.4,184,3475,FEMALE 150 | Adelie,Dream,36,17.8,195,3450,FEMALE 151 | Adelie,Dream,37.8,18.1,193,3750,MALE 152 | Adelie,Dream,36,17.1,187,3700,FEMALE 153 | Adelie,Dream,41.5,18.5,201,4000,MALE 154 | Chinstrap,Dream,46.5,17.9,192,3500,FEMALE 155 | Chinstrap,Dream,50,19.5,196,3900,MALE 156 | Chinstrap,Dream,51.3,19.2,193,3650,MALE 157 | Chinstrap,Dream,45.4,18.7,188,3525,FEMALE 158 | Chinstrap,Dream,52.7,19.8,197,3725,MALE 159 | Chinstrap,Dream,45.2,17.8,198,3950,FEMALE 160 | Chinstrap,Dream,46.1,18.2,178,3250,FEMALE 161 | Chinstrap,Dream,51.3,18.2,197,3750,MALE 162 | Chinstrap,Dream,46,18.9,195,4150,FEMALE 163 | Chinstrap,Dream,51.3,19.9,198,3700,MALE 164 | Chinstrap,Dream,46.6,17.8,193,3800,FEMALE 165 | Chinstrap,Dream,51.7,20.3,194,3775,MALE 166 | Chinstrap,Dream,47,17.3,185,3700,FEMALE 167 | Chinstrap,Dream,52,18.1,201,4050,MALE 168 | Chinstrap,Dream,45.9,17.1,190,3575,FEMALE 169 | Chinstrap,Dream,50.5,19.6,201,4050,MALE 170 | Chinstrap,Dream,50.3,20,197,3300,MALE 171 | Chinstrap,Dream,58,17.8,181,3700,FEMALE 172 | Chinstrap,Dream,46.4,18.6,190,3450,FEMALE 173 | Chinstrap,Dream,49.2,18.2,195,4400,MALE 174 | Chinstrap,Dream,42.4,17.3,181,3600,FEMALE 175 | Chinstrap,Dream,48.5,17.5,191,3400,MALE 176 | Chinstrap,Dream,43.2,16.6,187,2900,FEMALE 177 | Chinstrap,Dream,50.6,19.4,193,3800,MALE 178 | Chinstrap,Dream,46.7,17.9,195,3300,FEMALE 179 | Chinstrap,Dream,52,19,197,4150,MALE 180 | Chinstrap,Dream,50.5,18.4,200,3400,FEMALE 181 | Chinstrap,Dream,49.5,19,200,3800,MALE 182 | Chinstrap,Dream,46.4,17.8,191,3700,FEMALE 183 | Chinstrap,Dream,52.8,20,205,4550,MALE 184 | Chinstrap,Dream,40.9,16.6,187,3200,FEMALE 185 | Chinstrap,Dream,54.2,20.8,201,4300,MALE 186 | Chinstrap,Dream,42.5,16.7,187,3350,FEMALE 187 | Chinstrap,Dream,51,18.8,203,4100,MALE 188 | Chinstrap,Dream,49.7,18.6,195,3600,MALE 189 | Chinstrap,Dream,47.5,16.8,199,3900,FEMALE 190 | Chinstrap,Dream,47.6,18.3,195,3850,FEMALE 191 | Chinstrap,Dream,52,20.7,210,4800,MALE 192 | Chinstrap,Dream,46.9,16.6,192,2700,FEMALE 193 | Chinstrap,Dream,53.5,19.9,205,4500,MALE 194 | Chinstrap,Dream,49,19.5,210,3950,MALE 195 | Chinstrap,Dream,46.2,17.5,187,3650,FEMALE 196 | Chinstrap,Dream,50.9,19.1,196,3550,MALE 197 | Chinstrap,Dream,45.5,17,196,3500,FEMALE 198 | Chinstrap,Dream,50.9,17.9,196,3675,FEMALE 199 | Chinstrap,Dream,50.8,18.5,201,4450,MALE 200 | Chinstrap,Dream,50.1,17.9,190,3400,FEMALE 201 | Chinstrap,Dream,49,19.6,212,4300,MALE 202 | Chinstrap,Dream,51.5,18.7,187,3250,MALE 203 | Chinstrap,Dream,49.8,17.3,198,3675,FEMALE 204 | Chinstrap,Dream,48.1,16.4,199,3325,FEMALE 205 | Chinstrap,Dream,51.4,19,201,3950,MALE 206 | Chinstrap,Dream,45.7,17.3,193,3600,FEMALE 207 | Chinstrap,Dream,50.7,19.7,203,4050,MALE 208 | Chinstrap,Dream,42.5,17.3,187,3350,FEMALE 209 | Chinstrap,Dream,52.2,18.8,197,3450,MALE 210 | Chinstrap,Dream,45.2,16.6,191,3250,FEMALE 211 | Chinstrap,Dream,49.3,19.9,203,4050,MALE 212 | Chinstrap,Dream,50.2,18.8,202,3800,MALE 213 | Chinstrap,Dream,45.6,19.4,194,3525,FEMALE 214 | Chinstrap,Dream,51.9,19.5,206,3950,MALE 215 | Chinstrap,Dream,46.8,16.5,189,3650,FEMALE 216 | Chinstrap,Dream,45.7,17,195,3650,FEMALE 217 | Chinstrap,Dream,55.8,19.8,207,4000,MALE 218 | Chinstrap,Dream,43.5,18.1,202,3400,FEMALE 219 | Chinstrap,Dream,49.6,18.2,193,3775,MALE 220 | Chinstrap,Dream,50.8,19,210,4100,MALE 221 | Chinstrap,Dream,50.2,18.7,198,3775,FEMALE 222 | Gentoo,Biscoe,46.1,13.2,211,4500,FEMALE 223 | Gentoo,Biscoe,50,16.3,230,5700,MALE 224 | Gentoo,Biscoe,48.7,14.1,210,4450,FEMALE 225 | Gentoo,Biscoe,50,15.2,218,5700,MALE 226 | Gentoo,Biscoe,47.6,14.5,215,5400,MALE 227 | Gentoo,Biscoe,46.5,13.5,210,4550,FEMALE 228 | Gentoo,Biscoe,45.4,14.6,211,4800,FEMALE 229 | Gentoo,Biscoe,46.7,15.3,219,5200,MALE 230 | Gentoo,Biscoe,43.3,13.4,209,4400,FEMALE 231 | Gentoo,Biscoe,46.8,15.4,215,5150,MALE 232 | Gentoo,Biscoe,40.9,13.7,214,4650,FEMALE 233 | Gentoo,Biscoe,49,16.1,216,5550,MALE 234 | Gentoo,Biscoe,45.5,13.7,214,4650,FEMALE 235 | Gentoo,Biscoe,48.4,14.6,213,5850,MALE 236 | Gentoo,Biscoe,45.8,14.6,210,4200,FEMALE 237 | Gentoo,Biscoe,49.3,15.7,217,5850,MALE 238 | Gentoo,Biscoe,42,13.5,210,4150,FEMALE 239 | Gentoo,Biscoe,49.2,15.2,221,6300,MALE 240 | Gentoo,Biscoe,46.2,14.5,209,4800,FEMALE 241 | Gentoo,Biscoe,48.7,15.1,222,5350,MALE 242 | Gentoo,Biscoe,50.2,14.3,218,5700,MALE 243 | Gentoo,Biscoe,45.1,14.5,215,5000,FEMALE 244 | Gentoo,Biscoe,46.5,14.5,213,4400,FEMALE 245 | Gentoo,Biscoe,46.3,15.8,215,5050,MALE 246 | Gentoo,Biscoe,42.9,13.1,215,5000,FEMALE 247 | Gentoo,Biscoe,46.1,15.1,215,5100,MALE 248 | Gentoo,Biscoe,44.5,14.3,216,4100,NA 249 | Gentoo,Biscoe,47.8,15,215,5650,MALE 250 | Gentoo,Biscoe,48.2,14.3,210,4600,FEMALE 251 | Gentoo,Biscoe,50,15.3,220,5550,MALE 252 | Gentoo,Biscoe,47.3,15.3,222,5250,MALE 253 | Gentoo,Biscoe,42.8,14.2,209,4700,FEMALE 254 | Gentoo,Biscoe,45.1,14.5,207,5050,FEMALE 255 | Gentoo,Biscoe,59.6,17,230,6050,MALE 256 | Gentoo,Biscoe,49.1,14.8,220,5150,FEMALE 257 | Gentoo,Biscoe,48.4,16.3,220,5400,MALE 258 | Gentoo,Biscoe,42.6,13.7,213,4950,FEMALE 259 | Gentoo,Biscoe,44.4,17.3,219,5250,MALE 260 | Gentoo,Biscoe,44,13.6,208,4350,FEMALE 261 | Gentoo,Biscoe,48.7,15.7,208,5350,MALE 262 | Gentoo,Biscoe,42.7,13.7,208,3950,FEMALE 263 | Gentoo,Biscoe,49.6,16,225,5700,MALE 264 | Gentoo,Biscoe,45.3,13.7,210,4300,FEMALE 265 | Gentoo,Biscoe,49.6,15,216,4750,MALE 266 | Gentoo,Biscoe,50.5,15.9,222,5550,MALE 267 | Gentoo,Biscoe,43.6,13.9,217,4900,FEMALE 268 | Gentoo,Biscoe,45.5,13.9,210,4200,FEMALE 269 | Gentoo,Biscoe,50.5,15.9,225,5400,MALE 270 | Gentoo,Biscoe,44.9,13.3,213,5100,FEMALE 271 | Gentoo,Biscoe,45.2,15.8,215,5300,MALE 272 | Gentoo,Biscoe,46.6,14.2,210,4850,FEMALE 273 | Gentoo,Biscoe,48.5,14.1,220,5300,MALE 274 | Gentoo,Biscoe,45.1,14.4,210,4400,FEMALE 275 | Gentoo,Biscoe,50.1,15,225,5000,MALE 276 | Gentoo,Biscoe,46.5,14.4,217,4900,FEMALE 277 | Gentoo,Biscoe,45,15.4,220,5050,MALE 278 | Gentoo,Biscoe,43.8,13.9,208,4300,FEMALE 279 | Gentoo,Biscoe,45.5,15,220,5000,MALE 280 | Gentoo,Biscoe,43.2,14.5,208,4450,FEMALE 281 | Gentoo,Biscoe,50.4,15.3,224,5550,MALE 282 | Gentoo,Biscoe,45.3,13.8,208,4200,FEMALE 283 | Gentoo,Biscoe,46.2,14.9,221,5300,MALE 284 | Gentoo,Biscoe,45.7,13.9,214,4400,FEMALE 285 | Gentoo,Biscoe,54.3,15.7,231,5650,MALE 286 | Gentoo,Biscoe,45.8,14.2,219,4700,FEMALE 287 | Gentoo,Biscoe,49.8,16.8,230,5700,MALE 288 | Gentoo,Biscoe,46.2,14.4,214,4650,NA 289 | Gentoo,Biscoe,49.5,16.2,229,5800,MALE 290 | Gentoo,Biscoe,43.5,14.2,220,4700,FEMALE 291 | Gentoo,Biscoe,50.7,15,223,5550,MALE 292 | Gentoo,Biscoe,47.7,15,216,4750,FEMALE 293 | Gentoo,Biscoe,46.4,15.6,221,5000,MALE 294 | Gentoo,Biscoe,48.2,15.6,221,5100,MALE 295 | Gentoo,Biscoe,46.5,14.8,217,5200,FEMALE 296 | Gentoo,Biscoe,46.4,15,216,4700,FEMALE 297 | Gentoo,Biscoe,48.6,16,230,5800,MALE 298 | Gentoo,Biscoe,47.5,14.2,209,4600,FEMALE 299 | Gentoo,Biscoe,51.1,16.3,220,6000,MALE 300 | Gentoo,Biscoe,45.2,13.8,215,4750,FEMALE 301 | Gentoo,Biscoe,45.2,16.4,223,5950,MALE 302 | Gentoo,Biscoe,49.1,14.5,212,4625,FEMALE 303 | Gentoo,Biscoe,52.5,15.6,221,5450,MALE 304 | Gentoo,Biscoe,47.4,14.6,212,4725,FEMALE 305 | Gentoo,Biscoe,50,15.9,224,5350,MALE 306 | Gentoo,Biscoe,44.9,13.8,212,4750,FEMALE 307 | Gentoo,Biscoe,50.8,17.3,228,5600,MALE 308 | Gentoo,Biscoe,43.4,14.4,218,4600,FEMALE 309 | Gentoo,Biscoe,51.3,14.2,218,5300,MALE 310 | Gentoo,Biscoe,47.5,14,212,4875,FEMALE 311 | Gentoo,Biscoe,52.1,17,230,5550,MALE 312 | Gentoo,Biscoe,47.5,15,218,4950,FEMALE 313 | Gentoo,Biscoe,52.2,17.1,228,5400,MALE 314 | Gentoo,Biscoe,45.5,14.5,212,4750,FEMALE 315 | Gentoo,Biscoe,49.5,16.1,224,5650,MALE 316 | Gentoo,Biscoe,44.5,14.7,214,4850,FEMALE 317 | Gentoo,Biscoe,50.8,15.7,226,5200,MALE 318 | Gentoo,Biscoe,49.4,15.8,216,4925,MALE 319 | Gentoo,Biscoe,46.9,14.6,222,4875,FEMALE 320 | Gentoo,Biscoe,48.4,14.4,203,4625,FEMALE 321 | Gentoo,Biscoe,51.1,16.5,225,5250,MALE 322 | Gentoo,Biscoe,48.5,15,219,4850,FEMALE 323 | Gentoo,Biscoe,55.9,17,228,5600,MALE 324 | Gentoo,Biscoe,47.2,15.5,215,4975,FEMALE 325 | Gentoo,Biscoe,49.1,15,228,5500,MALE 326 | Gentoo,Biscoe,47.3,13.8,216,4725,NA 327 | Gentoo,Biscoe,46.8,16.1,215,5500,MALE 328 | Gentoo,Biscoe,41.7,14.7,210,4700,FEMALE 329 | Gentoo,Biscoe,53.4,15.8,219,5500,MALE 330 | Gentoo,Biscoe,43.3,14,208,4575,FEMALE 331 | Gentoo,Biscoe,48.1,15.1,209,5500,MALE 332 | Gentoo,Biscoe,50.5,15.2,216,5000,FEMALE 333 | Gentoo,Biscoe,49.8,15.9,229,5950,MALE 334 | Gentoo,Biscoe,43.5,15.2,213,4650,FEMALE 335 | Gentoo,Biscoe,51.5,16.3,230,5500,MALE 336 | Gentoo,Biscoe,46.2,14.1,217,4375,FEMALE 337 | Gentoo,Biscoe,55.1,16,230,5850,MALE 338 | Gentoo,Biscoe,44.5,15.7,217,4875,. 339 | Gentoo,Biscoe,48.8,16.2,222,6000,MALE 340 | Gentoo,Biscoe,47.2,13.7,214,4925,FEMALE 341 | Gentoo,Biscoe,NA,NA,NA,NA,NA 342 | Gentoo,Biscoe,46.8,14.3,215,4850,FEMALE 343 | Gentoo,Biscoe,50.4,15.7,222,5750,MALE 344 | Gentoo,Biscoe,45.2,14.8,212,5200,FEMALE 345 | Gentoo,Biscoe,49.9,16.1,213,5400,MALE 346 | -------------------------------------------------------------------------------- /DataAnalysisPreparation/2.11_pandas_series_extended.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "778e6335-be65-4ded-bb2e-e07463fa10ca", 6 | "metadata": {}, 7 | "source": [ 8 | "# **2.11_series_extended**" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "5b11d661-6ffa-4165-b024-8ea080830187", 14 | "metadata": {}, 15 | "source": [ 16 | "### 一、Series的运算" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "id": "5502ceee-d79f-4b92-bad7-51c721c36a28", 22 | "metadata": { 23 | "jp-MarkdownHeadingCollapsed": true 24 | }, 25 | "source": [ 26 | "#### (一)、Series和Series之间" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "id": "2bd744e1-f144-43e3-85e0-0d18c58922f0", 32 | "metadata": {}, 33 | "source": [ 34 | "在Series和Series之间,可以做加减乘除等各种运算,Pandas会自动根据索引去排序并对齐" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "id": "f84cfc30-4315-443e-9249-b3e6eb8fcc11", 40 | "metadata": {}, 41 | "source": [ 42 | "1. 如果某个索引只在其中一个Series出现的话,结果就会是NaN,表示not a number,说明无法得到计算值\n", 43 | "\n", 44 | " 也就是说,由于Series之间的计算会自动进行索引对齐,只有当某个索引同时出现在两个Series里时,结果里才会有对应的值\n", 45 | "\n", 46 | " ***按照什么进行排序? 数字按照大小,英文按照字母顺序,数字英文中文可以混在一起排序对齐,数字在英文前,英文在中文前。中文排序方式或许是ASCII***" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 1, 52 | "id": "240b7a6c-00ec-4248-bbd8-4176ca2f32c2", 53 | "metadata": {}, 54 | "outputs": [ 55 | { 56 | "data": { 57 | "text/plain": [ 58 | "1 9.0\n", 59 | "2 NaN\n", 60 | "3 11.0\n", 61 | "5 5.0\n", 62 | "7 NaN\n", 63 | "9 NaN\n", 64 | "10 NaN\n", 65 | "dtype: float64" 66 | ] 67 | }, 68 | "execution_count": 1, 69 | "metadata": {}, 70 | "output_type": "execute_result" 71 | } 72 | ], 73 | "source": [ 74 | "import pandas as pd\n", 75 | "s1 = pd.Series([1, 4, 2, 3, 5], index=[1, 3, 5, 7, 9])\n", 76 | "s2 = pd.Series([8, 1, 7 ,3 ,9], index=[1, 2, 3, 5, 10])\n", 77 | "s1 + s2" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 2, 83 | "id": "6d36a56f-1975-4478-8ad8-5616a12c0bfd", 84 | "metadata": {}, 85 | "outputs": [ 86 | { 87 | "data": { 88 | "text/plain": [ 89 | "1 2.0\n", 90 | "3 5.0\n", 91 | "5 NaN\n", 92 | "7 NaN\n", 93 | "9 NaN\n", 94 | "num NaN\n", 95 | "住 NaN\n", 96 | "只 NaN\n", 97 | "dtype: float64" 98 | ] 99 | }, 100 | "execution_count": 2, 101 | "metadata": {}, 102 | "output_type": "execute_result" 103 | } 104 | ], 105 | "source": [ 106 | "s0 = pd.Series({1: 1, \"只\": 1, \"住\": 1, \"num\": 1, 3: 1})\n", 107 | "s0 + s1" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "id": "f19e8c39-e9ac-4ce5-b20e-5d33810a5da1", 113 | "metadata": {}, 114 | "source": [ 115 | "2. 如果你希望给缺失的值一个默认值的话,可以用方法而不是运算符号进行运算,然后给fill_value这个参数传入一个值。\n", 116 | "\n", 117 | " 用符号的话我们没法额外传参,但用方法的话就可以" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 3, 123 | "id": "2efd4a73-7723-4f1b-98ea-eaae766d1478", 124 | "metadata": {}, 125 | "outputs": [ 126 | { 127 | "data": { 128 | "text/plain": [ 129 | "1 9.0\n", 130 | "2 1.0\n", 131 | "3 11.0\n", 132 | "5 5.0\n", 133 | "7 3.0\n", 134 | "9 5.0\n", 135 | "10 9.0\n", 136 | "dtype: float64" 137 | ] 138 | }, 139 | "execution_count": 3, 140 | "metadata": {}, 141 | "output_type": "execute_result" 142 | } 143 | ], 144 | "source": [ 145 | "s1.add(s2, fill_value=0)" 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "id": "356cc1db-43cf-40da-882c-37bbbab00cef", 151 | "metadata": {}, 152 | "source": [ 153 | " 等同于s1 + s2,并同时给两边缺失的值一个默认值0" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 4, 159 | "id": "64735d3d-3da1-42b3-8a0f-9fd5f2122c37", 160 | "metadata": {}, 161 | "outputs": [ 162 | { 163 | "data": { 164 | "text/plain": [ 165 | "1 -7.0\n", 166 | "2 -1.0\n", 167 | "3 -3.0\n", 168 | "5 -1.0\n", 169 | "7 3.0\n", 170 | "9 5.0\n", 171 | "10 -9.0\n", 172 | "dtype: float64" 173 | ] 174 | }, 175 | "execution_count": 4, 176 | "metadata": {}, 177 | "output_type": "execute_result" 178 | } 179 | ], 180 | "source": [ 181 | "s1.sub(s2, fill_value=0)" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 5, 187 | "id": "34182474-7058-48ce-bf38-3b3b24b9fb4e", 188 | "metadata": {}, 189 | "outputs": [ 190 | { 191 | "data": { 192 | "text/plain": [ 193 | "1 8.0\n", 194 | "2 0.0\n", 195 | "3 28.0\n", 196 | "5 6.0\n", 197 | "7 0.0\n", 198 | "9 0.0\n", 199 | "10 0.0\n", 200 | "dtype: float64" 201 | ] 202 | }, 203 | "execution_count": 5, 204 | "metadata": {}, 205 | "output_type": "execute_result" 206 | } 207 | ], 208 | "source": [ 209 | "s1.mul(s2, fill_value=0)" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": 6, 215 | "id": "3bb5e9ca-3c8f-4b5a-a193-d8c4a4644409", 216 | "metadata": {}, 217 | "outputs": [ 218 | { 219 | "data": { 220 | "text/plain": [ 221 | "1 0.125000\n", 222 | "2 0.000000\n", 223 | "3 0.571429\n", 224 | "5 0.666667\n", 225 | "7 inf\n", 226 | "9 inf\n", 227 | "10 0.000000\n", 228 | "dtype: float64" 229 | ] 230 | }, 231 | "execution_count": 6, 232 | "metadata": {}, 233 | "output_type": "execute_result" 234 | } 235 | ], 236 | "source": [ 237 | "s1.div(s2, fill_value=0)" 238 | ] 239 | }, 240 | { 241 | "cell_type": "markdown", 242 | "id": "ea33eae2-8896-4b98-bf5e-cf014b9b47dd", 243 | "metadata": {}, 244 | "source": [ 245 | "3. 优势\n", 246 | "\n", 247 | " Series之间的操作会根据索引自动对齐的好处是,由于一般我们会利用标签索引表示不同对象的数据,那即使不同Series里数据顺序不一样,计算时也会根据索引自动对齐" 248 | ] 249 | }, 250 | { 251 | "cell_type": "markdown", 252 | "id": "cd749132-2f33-451b-9ee8-22ebd5d143c2", 253 | "metadata": {}, 254 | "source": [ 255 | "#### (二)、聚合运算" 256 | ] 257 | }, 258 | { 259 | "cell_type": "markdown", 260 | "id": "932c1f66-ca6b-4e62-9953-7157b5407b70", 261 | "metadata": {}, 262 | "source": [ 263 | "1. NumPy数组的统计方法,包括max, min, sum, mean,Pandas的Series对象也有相同名字的方法" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": 7, 269 | "id": "7ca49452-6484-4748-958c-9fd1ae9efa49", 270 | "metadata": {}, 271 | "outputs": [ 272 | { 273 | "name": "stdout", 274 | "output_type": "stream", 275 | "text": [ 276 | "5\n", 277 | "1\n", 278 | "15\n", 279 | "3.0\n" 280 | ] 281 | } 282 | ], 283 | "source": [ 284 | "print(s1.max())\n", 285 | "print(s1.min())\n", 286 | "print(s1.sum())\n", 287 | "print(s1.mean())" 288 | ] 289 | }, 290 | { 291 | "cell_type": "markdown", 292 | "id": "88745ec3-7e8a-44ef-896b-3668f6ad993f", 293 | "metadata": {}, 294 | "source": [ 295 | "2. describe方法,是Series特有的一个强大的方法,describe方法能直接告诉我们很多这个Series的统计信息,\n", 296 | "\n", 297 | " 包括:元素个数、平均数、标准差、最小值、四分位数、最大值" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": 8, 303 | "id": "f0d91b85-27cc-47c1-9e13-255e28a1637c", 304 | "metadata": {}, 305 | "outputs": [ 306 | { 307 | "data": { 308 | "text/plain": [ 309 | "count 5.000000\n", 310 | "mean 3.000000\n", 311 | "std 1.581139\n", 312 | "min 1.000000\n", 313 | "25% 2.000000\n", 314 | "50% 3.000000\n", 315 | "75% 4.000000\n", 316 | "max 5.000000\n", 317 | "dtype: float64" 318 | ] 319 | }, 320 | "execution_count": 8, 321 | "metadata": {}, 322 | "output_type": "execute_result" 323 | } 324 | ], 325 | "source": [ 326 | "s1.describe()" 327 | ] 328 | }, 329 | { 330 | "cell_type": "markdown", 331 | "id": "d225e906-cf47-4229-ba15-bf7314e7b2cd", 332 | "metadata": {}, 333 | "source": [ 334 | "#### (三)、Series和单个数字之间" 335 | ] 336 | }, 337 | { 338 | "cell_type": "markdown", 339 | "id": "8b5017f2-3fcd-498e-98ec-ddd7d89e3395", 340 | "metadata": {}, 341 | "source": [ 342 | "与NumPy数组的广播机制一样,在Pandas Series里,单个数字和Series之间进行操作的时候,操作会被自动运用到Series里每个元素上" 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": 9, 348 | "id": "71bc70cd-f4f2-413e-8d2a-92f0e7d3a883", 349 | "metadata": {}, 350 | "outputs": [ 351 | { 352 | "data": { 353 | "text/plain": [ 354 | "1 3\n", 355 | "3 12\n", 356 | "5 6\n", 357 | "7 9\n", 358 | "9 15\n", 359 | "dtype: int64" 360 | ] 361 | }, 362 | "execution_count": 9, 363 | "metadata": {}, 364 | "output_type": "execute_result" 365 | } 366 | ], 367 | "source": [ 368 | "s1 * 3" 369 | ] 370 | }, 371 | { 372 | "cell_type": "markdown", 373 | "id": "ab6318a4-ab58-44ee-bb60-db01fa35ee9a", 374 | "metadata": {}, 375 | "source": [ 376 | "### 二、对元素分别执行相同操作" 377 | ] 378 | }, 379 | { 380 | "cell_type": "markdown", 381 | "id": "df2f0d80-3558-4ce9-856c-e8cf37c42d2e", 382 | "metadata": {}, 383 | "source": [ 384 | "1. apply方法,接收函数作为参数,然后调用时把Series里各个元素,分别作为那个函数的参数,返回的Series里的元素,就是那个函数对原始Series里各个元素调用后的结果\n", 385 | "\n", 386 | " **apply方法不改变原始Series,而是会返回一个新的Series**" 387 | ] 388 | }, 389 | { 390 | "cell_type": "markdown", 391 | "id": "33d9c511-85cf-4e5f-a2b5-2010a22adc64", 392 | "metadata": {}, 393 | "source": [ 394 | "***apply相当于是高阶函数***\n", 395 | " \n", 396 | "***注意传入的函数后面不要跟括号,因为不是要把函数调用后的结果,去作为apply的参数,而是把函数本身给apply***\n", 397 | " \n", 398 | "***apply的定义语句里肯定包括:***\n", 399 | " \n", 400 | "***a.让每一个Series里面的元素作为参数,调用函数***\n", 401 | " \n", 402 | "***b.将每一个元素调用函数得到的结果,组成的新Series。***" 403 | ] 404 | }, 405 | { 406 | "cell_type": "markdown", 407 | "id": "8b8d5360-4706-4f1e-9b58-e78403fa078c", 408 | "metadata": {}, 409 | "source": [ 410 | " 优势:apply方法大大增加了我们操作Series的灵活性,能定义出来的函数,我们都可以作用在Series的各个元素上,帮我们得到新的Series" 411 | ] 412 | }, 413 | { 414 | "cell_type": "markdown", 415 | "id": "2a5f7a65-81fc-41db-bcb3-852d7e5ccfcb", 416 | "metadata": {}, 417 | "source": [ 418 | " 应用场景:当前有5名学生的成绩所组成的Series,索引为学生名字,我们希望能得到每个成绩对应的等级:90及以上是A,80到90是B,70到80是C,70以下是D。我们知道怎么根据分数数字得到对应等级,只需要get_grade_from_score函数即可。现在问题在于,如何对Series里每个元素,都运用这个函数,得到对应结果组成的新Series。新方法,apply方法可以实现这一步。" 419 | ] 420 | }, 421 | { 422 | "cell_type": "code", 423 | "execution_count": 10, 424 | "id": "5ec4ac5e-9373-4730-a7fb-a738e3418cde", 425 | "metadata": {}, 426 | "outputs": [ 427 | { 428 | "data": { 429 | "text/plain": [ 430 | "小明 A\n", 431 | "小红 D\n", 432 | "小杰 C\n", 433 | "小丽 B\n", 434 | "小华 C\n", 435 | "dtype: object" 436 | ] 437 | }, 438 | "execution_count": 10, 439 | "metadata": {}, 440 | "output_type": "execute_result" 441 | } 442 | ], 443 | "source": [ 444 | "scores = pd.Series({\"小明\": 92, \"小红\": 67, \"小杰\": 70, \"小丽\": 88, \"小华\": 76})\n", 445 | "def get_grade_from_score(score):\n", 446 | " if score >= 90:\n", 447 | " return \"A\"\n", 448 | " elif score >= 80:\n", 449 | " return \"B\"\n", 450 | " elif score >= 70:\n", 451 | " return \"C\"\n", 452 | " else:\n", 453 | " return \"D\"\n", 454 | "\n", 455 | "\n", 456 | "grades = scores.apply(get_grade_from_score)\n", 457 | "grades" 458 | ] 459 | }, 460 | { 461 | "cell_type": "markdown", 462 | "id": "faf18885-24f5-4d74-8450-0f36d5ccfb61", 463 | "metadata": {}, 464 | "source": [ 465 | "2. 除了传入定义好的函数名,在函数逻辑比较简单的时候,匿名函数也可以应用在这里" 466 | ] 467 | }, 468 | { 469 | "cell_type": "code", 470 | "execution_count": 11, 471 | "id": "217add00-f8d3-46d7-bf12-a196600761bb", 472 | "metadata": {}, 473 | "outputs": [ 474 | { 475 | "data": { 476 | "text/plain": [ 477 | "小明 46.0\n", 478 | "小红 33.5\n", 479 | "小杰 35.0\n", 480 | "小丽 44.0\n", 481 | "小华 38.0\n", 482 | "dtype: float64" 483 | ] 484 | }, 485 | "execution_count": 11, 486 | "metadata": {}, 487 | "output_type": "execute_result" 488 | } 489 | ], 490 | "source": [ 491 | "half_scores = scores.apply(lambda x: 0.5*x)\n", 492 | "half_scores" 493 | ] 494 | }, 495 | { 496 | "cell_type": "markdown", 497 | "id": "48b373b2-352e-44d1-bf40-8ce213513b47", 498 | "metadata": {}, 499 | "source": [ 500 | "### 三、转换数据类型 " 501 | ] 502 | }, 503 | { 504 | "cell_type": "markdown", 505 | "id": "7a4aad1c-cc13-46cb-834a-5205f4c7afcf", 506 | "metadata": {}, 507 | "source": [ 508 | "astype方法:转换Series的数据类型" 509 | ] 510 | }, 511 | { 512 | "cell_type": "code", 513 | "execution_count": 15, 514 | "id": "86def539-1164-4578-bd32-367379bdc9de", 515 | "metadata": {}, 516 | "outputs": [ 517 | { 518 | "data": { 519 | "text/plain": [ 520 | "小明 92\n", 521 | "小红 67\n", 522 | "小杰 70\n", 523 | "小丽 88\n", 524 | "小华 76\n", 525 | "dtype: object" 526 | ] 527 | }, 528 | "execution_count": 15, 529 | "metadata": {}, 530 | "output_type": "execute_result" 531 | } 532 | ], 533 | "source": [ 534 | "scores = scores.astype(str)\n", 535 | "scores" 536 | ] 537 | }, 538 | { 539 | "cell_type": "markdown", 540 | "id": "3d803fc9-73a0-4add-ae1c-fd079bb4ede7", 541 | "metadata": {}, 542 | "source": [ 543 | "### 四、针对字符串Series,保留Series每个元素的某一部分" 544 | ] 545 | }, 546 | { 547 | "cell_type": "markdown", 548 | "id": "deaacae9-96ba-4876-9826-081562d79a2f", 549 | "metadata": {}, 550 | "source": [ 551 | "str.slice方法\n", 552 | "\n", 553 | "str是Series类自带的一个属性,会返回一个包含了很多字符串相关操作方法的,StringMethods类的实例(返回实例才可以调用方法),对这个StringMethods实例调用slice方法,就会分别保留Series里每个元素选定的部分\n", 554 | "\n", 555 | "第一个参数传入,要保留的起始位置的索引;第二个参数传入,要保留的结束位置的下一索引" 556 | ] 557 | }, 558 | { 559 | "cell_type": "code", 560 | "execution_count": 16, 561 | "id": "2d0bba6e-fdcb-45cb-a154-909f8a04a47f", 562 | "metadata": {}, 563 | "outputs": [ 564 | { 565 | "data": { 566 | "text/plain": [ 567 | "小明 9\n", 568 | "小红 6\n", 569 | "小杰 7\n", 570 | "小丽 8\n", 571 | "小华 7\n", 572 | "dtype: object" 573 | ] 574 | }, 575 | "execution_count": 16, 576 | "metadata": {}, 577 | "output_type": "execute_result" 578 | } 579 | ], 580 | "source": [ 581 | "scores.str.slice(0, 1)" 582 | ] 583 | }, 584 | { 585 | "cell_type": "code", 586 | "execution_count": null, 587 | "id": "03f2cd32-49c6-419e-87f5-9000794815cd", 588 | "metadata": {}, 589 | "outputs": [], 590 | "source": [] 591 | } 592 | ], 593 | "metadata": { 594 | "kernelspec": { 595 | "display_name": "Python 3 (ipykernel)", 596 | "language": "python", 597 | "name": "python3" 598 | }, 599 | "language_info": { 600 | "codemirror_mode": { 601 | "name": "ipython", 602 | "version": 3 603 | }, 604 | "file_extension": ".py", 605 | "mimetype": "text/x-python", 606 | "name": "python", 607 | "nbconvert_exporter": "python", 608 | "pygments_lexer": "ipython3", 609 | "version": "3.11.1" 610 | } 611 | }, 612 | "nbformat": 4, 613 | "nbformat_minor": 5 614 | } 615 | --------------------------------------------------------------------------------