├── .idea
├── dictionaries
│ └── wangs.xml
├── encodings.xml
├── inspectionProfiles
│ └── Project_Default.xml
├── libraries
│ └── R_User_Library.xml
├── misc.xml
├── modules.xml
├── other.xml
└── vcs.xml
├── 1.Python编程[从入门到实战]
├── 1.起步
│ ├── README.md
│ ├── hello_world.py
│ ├── linux_setup.md
│ ├── osx_setup.md
│ └── windows_setup.md
├── 2.变量和简单的数据类型
│ ├── 1.hello_world.py
│ ├── 2.name.py
│ ├── 3.apostrophe.py
│ ├── 4.birthday.py
│ └── 5.comment.py
├── 3.列表简介
│ ├── 1.bicycles.py
│ ├── 2.motorcycles.py
│ └── 3.cars.py
├── 4.操作列表
│ ├── 1.magicians.py
│ ├── 2.numbers.py
│ ├── 3.even_numbers.py
│ ├── 4.squares.py
│ ├── 5.players.py
│ ├── 6.foods.py
│ └── 7.dimensions.py
├── 5.if语句
│ ├── 1.cars.py
│ ├── 2.toppings.py
│ ├── 3.magic_number.py
│ ├── 4.banned_users.py
│ ├── 5.voting.py
│ └── 6.amusement_park.py
├── 6.字典
│ ├── 1.alien.py
│ ├── 2.favorite_languages.py
│ ├── 3.aliens.py
│ ├── 4.pizza.py
│ ├── 5.many_users.py
│ └── 6.user.py
└── 7.用户输入和while循环
│ ├── 1.parrot.py
│ ├── 2.greeter.py
│ ├── 3.rollercoaster.py
│ ├── 4.even_or_odd.py
│ ├── 5.counting.py
│ ├── 6.cities.py
│ ├── 7.confirmed_users.py
│ ├── 8.pets.py
│ └── 9.mountain_poll.py
├── 2.Python数据分析
├── week01
│ └── Python3.6基础.py
├── week02
│ ├── Python函数.py
│ └── Python条件语句.py
├── week03
│ └── numpy.py
├── week04
│ ├── data.csv
│ ├── numpy常用函数.py
│ ├── numpy股价分析实践.py
│ └── weeksummary.csv
├── week05
│ ├── Pandes_Dataframe.py
│ ├── data
│ │ ├── csv_mindex.csv
│ │ ├── ex1.csv
│ │ ├── ex2.csv
│ │ ├── ex3.csv
│ │ ├── ex3.txt
│ │ ├── ex4.csv
│ │ ├── ex5.csv
│ │ ├── ex6.csv
│ │ ├── ex7.csv
│ │ ├── frame_pickle
│ │ ├── out.csv
│ │ ├── test_file.csv
│ │ ├── tseries.csv
│ │ └── workbook.xls
│ ├── mydata.csv
│ ├── tseries.csv
│ └── 数据读取.py
├── week06
│ ├── data
│ │ ├── catering_sale.xls
│ │ ├── electricity_data.xls
│ │ ├── foods-2011-10-03.json
│ │ ├── macrodata.csv
│ │ ├── movies.dat
│ │ ├── normalization_data.xls
│ │ ├── olivier.txt
│ │ ├── principal_component.xls
│ │ └── sales.xls
│ └── 数据处理.py
├── week07
│ ├── data
│ │ ├── Haiti.csv
│ │ ├── macrodata.csv
│ │ ├── spx.csv
│ │ └── tips.csv
│ ├── figpath.png
│ ├── figpath.svg
│ └── 数据可视化.py
├── week08
│ ├── data
│ │ ├── stock_px.csv
│ │ └── tips.csv
│ └── 数据分组.py
├── week09
│ ├── Amtrak.xls
│ └── 统计基础.py
├── week10
│ ├── Advertising.csv
│ └── 线性回归分析.py
├── week11
│ ├── Logistic回归.py
│ ├── bankloan.xls
│ ├── data1.txt
│ └── data2.txt
├── week12
│ ├── arima_data.xls
│ ├── stock_px.csv
│ └── 时间序列分析法.py
├── week13
│ └── 分类算法.py
├── week14
│ ├── ex14.csv
│ └── 聚类算法.py
├── week15
│ ├── ex15.txt
│ └── 矩阵基础.py
└── 案例分析
│ ├── business_circle.xls
│ ├── standardized.xls
│ ├── 基于基站定位数据的商圈.py
│ ├── 电信客户流失分析.py
│ └── 股票指数构建.py
├── 3.Python网络爬虫[从入门到实战]
├── 2.编写第一个网络爬虫
│ ├── Python使用入门.py
│ ├── Test_Python基础练习.py
│ ├── title_test.txt
│ └── 简易爬虫.py
├── 3.静态网页抓取
│ ├── 1.request.py
│ ├── 2.request+header+POST.py
│ └── Test_TOP250电影数据.py
├── 4.动态网页抓取
│ ├── 1.解析json评论数据.py
│ ├── 2.selenium爬取评论数据.py
│ └── Test_RentData.py
└── 5.解析网页
│ ├── 1.re正则表达式.py
│ ├── 2.BeautifulSoup.py
│ ├── Cha 5 -解析网页.ipynb
│ └── Cha 5 _章末实战.ipynb
├── 4.算法图解
├── 1.二分查找
│ └── binary_sort.py
├── 2.选择排序
│ └── selection_sort.py
├── 3.递归
│ ├── 1.countdown.py
│ ├── 2.greet.py
│ └── 3.factorial.py
├── 4.快速排序
│ └── quick_sort.py
├── 5.散列表
│ ├── 1.dict.py
│ └── 2.check_voter.py
├── 6.广度优先搜索
│ └── breadth-first_search.py
├── 7.狄杰斯特拉算法
│ └── dijkstras_algorithm.py
├── 8.贪婪算法
│ └── set_covering.py
└── 9.动态规划
│ └── longest_common_subsequence.py
├── PythonCourses.iml
└── README.md
/.idea/encodings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/Project_Default.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
13 |
14 |
15 |
--------------------------------------------------------------------------------
/.idea/libraries/R_User_Library.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/.idea/other.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/1.Python编程[从入门到实战]/1.起步/README.md:
--------------------------------------------------------------------------------
1 | ## Chapter 1 起步
2 |
3 | ### 三种操作系统环境下Python安装流程
4 | - [Linux](linux_setup.md)
5 | - [OS X](osx_setup.md)
6 | - [Windows](windows_setup.md)
7 |
8 |
--------------------------------------------------------------------------------
/1.Python编程[从入门到实战]/1.起步/hello_world.py:
--------------------------------------------------------------------------------
1 | print("Hello Python world!")  # First program: print a fixed greeting.
2 | # Output: Hello Python world!
3 |
--------------------------------------------------------------------------------
/1.Python编程[从入门到实战]/1.起步/linux_setup.md:
--------------------------------------------------------------------------------
1 | Setup Instructions: Linux
2 | ===
3 |
4 | - [Checking your current version of Python](#current_version)
5 | - [Installing Python 3.5](#python3.5)
6 | - [Installing Geany](#installing_geany)
7 | - [Configuring Geany](#configuring_geany)
8 |
9 | Checking your current version of Python
10 | ---
11 |
12 | Python is probably already installed on your system. Find out which version is your default by issuing the command `python --version`:
13 |
14 | $ python --version
15 | Python 2.7.6
16 |
17 | If you see something like this, Python 2.7 is your default version. You should also see if you have Python 3 installed:
18 |
19 | $ python3 --version
20 | Python 3.4.0
21 |
22 | If you have Python 3.4 or later, it's fine to start out by using the installed version. If you have Python 3.3 or earlier, it's probably worth installing Python 3.5.
23 |
24 | [top](#)
25 |
26 | Installing Python 3.5
27 | ---
28 |
29 | The following instructions should work on Ubuntu, and most Debian-based systems that use the apt package manager.
30 |
31 | Add the *deadsnakes* package, and then install Python 3.5:
32 |
33 | $ sudo add-apt-repository ppa:fkrull/deadsnakes
34 | $ sudo apt-get update
35 | $ sudo apt-get install python3.5
36 |
37 | You can confirm that the installation was successful:
38 |
39 | $ python3.5 --version
40 | Python 3.5.0
41 |
42 | Now to start a Python terminal session, you'll use the command `python3.5`:
43 |
44 | $ python3.5
45 | Python 3.5.0 (default, Sep 17 2015, 00:00:00)
46 | [GCC 4.8.4] on linux
47 | Type "help", "copyright", "credits" or "license" for more information.
48 | >>>
49 |
50 | You'll use this command when you configure your text editor, and when you run programs from the terminal.
51 |
52 | [top](#)
53 |
54 | Installing Geany
55 | ---
56 |
57 | On Ubuntu and other systems that use the apt package manager, you can install Geany in one line:
58 |
59 | $ sudo apt-get install geany
60 |
61 | If this doesn't work, you can see the instructions at [http://geany.org/Download/ThirdPartyPackages/](http://geany.org/Download/ThirdPartyPackages/).
62 |
63 | [top](#)
64 |
65 |
66 | ### Configuring Geany
67 |
68 | If you use the simple command `python` to start a terminal session on your system, you shouldn't have to configure Geany at all. But if you use a command like `python3` or `python3.5`, you'll have to modify Geany slightly so it uses the correct version of Python to run your programs.
69 |
70 | Open an empty file and save it as *hello_world.py*. The file should have one line in it:
71 |
72 | print("Hello Python world!")
73 |
74 | Go to **Build>Set Build Commands**. You should see the word *Compile*, and a command next to the word *Compile*. Change this to
75 |
76 | python3 -m py_compile "%f"
77 |
78 | If you use a command like `python3.5`, make sure you use that command instead.
79 |
80 | Next to the word *Execute*, enter the following command:
81 |
82 | python3 "%f"
83 |
84 | Again, if you use a command like `python3.5`, make sure you use that command.
85 |
86 | Now you can run programs by selecting **Build>Execute**, clicking the Execute icon with a set of gears on it, or by pressing **F5**.
87 |
88 | [top](#)
89 |
90 |
91 |
92 |
--------------------------------------------------------------------------------
/1.Python编程[从入门到实战]/1.起步/osx_setup.md:
--------------------------------------------------------------------------------
1 | Setup Instructions: OS X
2 | ===
3 |
4 | - [Checking your current version of Python](#current_version)
5 | - [Installing Python 3.5](#python3.5)
6 | - [Installing Sublime Text](#installing_st)
7 | - [Configuring Sublime Text](#configuring_st)
8 |
9 | Checking your current version of Python
10 | ---
11 |
12 | Python is probably already installed on your system. To check if it's installed, go to **Applications>Utilities** and click on **Terminal**. (You can also press command-spacebar, type *terminal*, and then press Enter.)
13 |
14 | Find out which version of Python is installed by issuing the command `python --version`:
15 |
16 | $ python --version
17 | Python 2.7.5
18 |
19 | If you see something like this, Python 2.7 is your default version. You should also see if you have Python 3 installed:
20 |
21 | $ python3 --version
22 | Python 3.4.0
23 |
24 | If you have Python 3.4 or later, it's fine to start out by using the installed version. If you have Python 3.3 or earlier, it's probably worth installing Python 3.5.
25 |
26 | [top](#)
27 |
28 | Installing Python 3.5
29 | ---
30 |
31 | Install [Homebrew](http://brew.sh/), which makes it easy to install the most recent version of Python. Start out by installing Apple's Xcode command line tools:
32 |
33 | $ xcode-select --install
34 |
35 | The installation may take a while, depending on the speed of your connection. Next, install Homebrew:
36 |
37 | $ ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)"
38 |
39 | If you run the command **brew doctor**, you can verify that the installation was successful:
40 |
41 | $ brew doctor
42 | Your system is ready to brew.
43 |
44 | Now you can install Python packages through Homebrew. To install Python 3, enter the following command:
45 |
46 | $ brew install python3
47 |
48 | You can verify that Python 3 was installed correctly:
49 |
50 | $ python3 --version
51 | Python 3.5.0
52 |
53 | You'll use the **python3** command when you configure your text editor, when you start a Python terminal session, and when you run programs from the terminal.
54 |
55 | [top](#)
56 |
57 | Installing Sublime Text
58 | ---
59 |
60 | You can download an installer for Sublime Text by clicking on the OS X link at [http://www.sublimetext.com/3](http://www.sublimetext.com/3). Sublime Text has a liberal licensing policy; it's free as long as you want to use it, but the author requests that you purchase a license if you like the program and want to continue using it.
61 |
62 | After you've downloaded the installer, open it and then drag the Sublime Text icon into your *Applications* folder.
63 |
64 | [top](#)
65 |
66 |
67 | ### Configuring Sublime Text for Python 3
68 |
69 | If you use the simple command `python` to start a terminal session on your system, you shouldn't have to configure Sublime Text at all. But if you use a command like `python3` or `python3.5`, you'll have to modify Sublime Text slightly so it uses the correct version of Python to run your programs.
70 |
71 | Find the path to your Python interpreter:
72 |
73 | $ type -a python3
74 | python3 is /usr/local/bin/python3
75 |
76 | Open an empty file in Sublime Text and save it as *hello_world.py*. The file should have one line in it:
77 |
78 | print("Hello Python world!")
79 |
80 | Go to **Tools>Build System>New Build System**, which will open a new configuration file. Delete what you see, and enter the following:
81 |
82 | {
83 | "cmd": ["/usr/local/bin/python3", "-u", "$file"],
84 | }
85 |
86 | This tells Sublime Text to use your system's **python3** command when running programs. Make sure you use the path you found when running **type -a python3**, not necessarily the path you see here. Save the file as *Python3.sublime-build* in the directory that Sublime Text opens when you choose Save.
87 |
88 | Now you can run programs by selecting **Tools>Build System>Python3** and then choosing **Tools>Build**, or by pressing **Command-B**.
89 |
90 | [top](#)
91 |
92 |
93 |
94 |
--------------------------------------------------------------------------------
/1.Python编程[从入门到实战]/1.起步/windows_setup.md:
--------------------------------------------------------------------------------
1 | Setup Instructions: Windows
2 | ===
3 |
4 | - [Checking your current version of Python](#current_version)
5 | - [Installing Python 3.5](#python3.5)
6 | - [Adding Python to Your Path Variable](#path_variable)
7 | - [Installing Geany](#installing_geany)
8 | - [Configuring Geany](#configuring_geany)
9 |
10 | Checking your current version of Python
11 | ---
12 |
13 | Python may already be installed on your system. Open a command window by right-clicking on the Desktop while holding the shift key, and then select "Open Command Window Here". You can also search for "command" in the task bar. Find out which version is your default by issuing the command `python --version`:
14 |
15 | > python --version
16 | Python 2.7.6
17 |
18 | If you see something like this, Python 2.7 is your default version. You should also see if you have Python 3 installed:
19 |
20 | > python3 --version
21 | Python 3.4.0
22 |
23 | If you have Python 3.4 or later, it's fine to start out by using the installed version. If you have Python 3.3 or earlier, it's probably worth installing Python 3.5.
24 |
25 | If you get an error message for both of these commands, Python is not installed on your system, and you should install Python 3.5.
26 |
27 | [top](#)
28 |
29 | Installing Python 3.5
30 | ---
31 |
32 | Go to [https://www.python.org/downloads/](https://www.python.org/downloads/) and click the button labeled "Download Python 3.5". Download the installer, and when you run it make sure to check the *Add Python to PATH* option:
33 |
34 | 
35 |
36 | Checking this button ensures that you'll be able to use the simple command **python**. If you missed this step, see [Adding Python to Your Path Variable](#path_variable).
37 |
38 | You can confirm that the installation was successful:
39 |
40 | > python --version
41 | Python 3.5.0
42 |
43 | Now to start a Python terminal session, you'll use the command `python`:
44 |
45 | > python
46 | Python 3.5.0 (v3.5.0:374f501f4567, Sep 13 2015, 02:16:59) [MSC v.1900 32 bit (Intel)] on win32
47 | Type "help", "copyright", "credits" or "license" for more information.
48 | >>>
49 |
50 | You'll use this command when you configure your text editor, and when you run programs from the terminal.
51 |
52 | [top](#)
53 |
54 | Adding Python to Your Path Variable
55 | ---
56 | If you checked *Add Python to PATH* when you installed Python and the command **python** works, you can skip this step.
57 |
58 | To find the path to Python on your system, open Windows Explorer and look in your C:\ drive. Look for a folder starting with *Python*; you might need to enter *python* in the Windows Explorer search bar to find the right folder. Open the folder, and look for a file with the lowercase name *python*. Right-click this file and choose **Properties**; you'll then see the path to this file under the heading Location.
59 |
60 | In a terminal window, use the path to confirm the version you just installed:
61 |
62 | > C:\Python35\python --version
63 | Python 3.5.0
64 |
65 | Open your system's **Control Panel**, choose **System and Security**, and then choose **System**. Click **Advanced System Settings**, and in the window that pops up click **Environment Variables**.
66 |
67 | In the box labeled *System variables*, look for a variable called `Path`. Click **Edit**. In the box that pops up, click in the box labeled *Variable Value* and use the right arrow key to scroll all the way to the right. Be careful not to write over the existing variable; if you do, click Cancel and try again. Add a semicolon and the path to your *python.exe* file to the existing variable:
68 |
69 | %SystemRoot%\system32\...\System32\WindowsPowerShell\v1.0\;C:\Python35
70 |
71 | Close any existing terminal windows, and open a new one. Now when you enter **python --version**, you should see the version of Python you just set in your `Path` variable. You can now start a Python terminal session by just entering **python** at a command prompt.
72 |
73 | Installing Geany
74 | ---
75 |
76 | You can download a Windows installer for Geany from [http://www.geany.org/Download/Releases](http://www.geany.org/Download/Releases). Download and run the installer called *geany-1.25_setup.exe*, accepting all the defaults.
77 |
78 | [top](#)
79 |
80 |
81 | ### Configuring Geany
82 |
83 | If you use the simple command `python` to start a terminal session on your system, you shouldn't have to configure Geany at all. But if you use a command like `python3` or a full path like `C:\Python35\python` to start a terminal session, you'll have to modify Geany slightly so it uses the correct version of Python to run your programs.
84 |
85 | Open an empty file and save it as *hello_world.py*. The file should have one line in it:
86 |
87 | print("Hello Python world!")
88 |
89 | Go to **Build>Set Build Commands**. You should see the word *Compile*, and a command next to the word *Compile*. Change this to
90 |
91 | python3 -m py_compile "%f"
92 |
93 | You can also use a full path in this setting, such as `C:\Python35\python -m py_compile "%f"`.
94 |
95 | Next to the word *Execute*, enter the following command:
96 |
97 | python3 "%f"
98 |
99 | Again, you can use a full path, such as `C:\Python35\python "%f"`.
100 |
101 | Now you can run programs by selecting **Build>Execute**, clicking the Execute icon with a set of gears on it, or by pressing **F5**.
102 |
103 | [top](#)
104 |
105 |
106 |
107 |
--------------------------------------------------------------------------------
/1.Python编程[从入门到实战]/2.变量和简单的数据类型/1.hello_world.py:
--------------------------------------------------------------------------------
1 | message = "Hello Python world!"  # Store the greeting in a variable.
2 | print(message)
3 | # Hello Python world!
4 |
5 | message = "Hello Python Crash Course world!"  # Rebind the same name to a new string.
6 | print(message)
7 | # Hello Python Crash Course world!
8 |
--------------------------------------------------------------------------------
/1.Python编程[从入门到实战]/2.变量和简单的数据类型/2.name.py:
--------------------------------------------------------------------------------
1 | # Concatenate strings with the + operator
2 | first_name = "ada"
3 | last_name = "lovelace"
4 | full_name = first_name + " " + last_name
5 |
6 | # Change string case: title() capitalizes the first letter of each word
7 | message = "Hello, " + full_name.title() + "!"
8 | print(message)
9 | # Hello, Ada Lovelace!
10 |
11 | # Add whitespace: print once without it, then once with a leading tab (\t)
12 | print('python')
13 | # python
14 |
15 | print('\tpython')
16 | # python (preceded by a tab character)
17 |
18 | # Strip whitespace: rstrip() returns a copy without trailing whitespace
19 | language = 'python '
20 | print(language.rstrip())
21 | # python
22 |
--------------------------------------------------------------------------------
/1.Python编程[从入门到实战]/2.变量和简单的数据类型/3.apostrophe.py:
--------------------------------------------------------------------------------
1 | # Use quotes correctly: a double-quoted string may contain an apostrophe
2 | message = "One of Python's strengths is its diverse community."
3 | print(message)
4 | # One of Python's strengths is its diverse community.
5 |
--------------------------------------------------------------------------------
/1.Python编程[从入门到实战]/2.变量和简单的数据类型/4.birthday.py:
--------------------------------------------------------------------------------
1 | # Use str() to avoid a type error when concatenating an int with strings
2 | age = 23
3 | message = "Happy " + str(age) + "rd Birthday!"
4 |
5 | print(message)
6 | # Happy 23rd Birthday!
7 |
--------------------------------------------------------------------------------
/1.Python编程[从入门到实战]/2.变量和简单的数据类型/5.comment.py:
--------------------------------------------------------------------------------
1 | # Say hello to everyone. (Example of writing a comment.)
2 | print("Hello Python people!")
3 |
--------------------------------------------------------------------------------
/1.Python编程[从入门到实战]/3.列表简介/1.bicycles.py:
--------------------------------------------------------------------------------
1 | # Access list elements by index
2 | bicycles = ['trek', 'cannondale', 'redline', 'specialized']
3 |
4 | message = "My first bicycle was a " + bicycles[0].title() + "."
5 |
6 | print(message)
7 | # My first bicycle was a Trek.
8 |
9 | print(bicycles[1])
10 | # cannondale
11 |
12 | print(bicycles[3])
13 | # specialized
14 |
15 | print(bicycles[-1])  # Index -1 refers to the last element.
16 | # specialized
17 |
--------------------------------------------------------------------------------
/1.Python编程[从入门到实战]/3.列表简介/2.motorcycles.py:
--------------------------------------------------------------------------------
1 | # Modify, add, and remove list elements
2 | motorcycles = ['honda', 'yamaha', 'suzuki', 'ducati']
3 | print(motorcycles)
4 | # ['honda', 'yamaha', 'suzuki', 'ducati']
5 |
6 | too_expensive = 'ducati'
7 | motorcycles.remove(too_expensive)
8 | print(motorcycles)
9 | # ['honda', 'yamaha', 'suzuki']
10 |
11 | print("\nA " + too_expensive.title() + " is too expensive for me.")
12 | # A Ducati is too expensive for me.
13 |
14 | # Append an element to the end of the list
15 | motorcycles.append('ducati')
16 | print(motorcycles)
17 | # ['honda', 'yamaha', 'suzuki', 'ducati']
18 |
19 | # Insert an element at index 0
20 | motorcycles.insert(0, 'bmw')
21 | print(motorcycles)
22 | # ['bmw', 'honda', 'yamaha', 'suzuki', 'ducati']
23 |
24 | # Delete the element at index 0 with del
25 | del motorcycles[0]
26 | print(motorcycles)
27 | # ['honda', 'yamaha', 'suzuki', 'ducati']
28 |
29 | # Remove the last element with pop()
30 | motorcycles.pop()
31 | print(motorcycles)
32 | # ['honda', 'yamaha', 'suzuki']
33 |
34 | # Remove an element by value with remove()
35 | motorcycles.remove('honda')
36 | print(motorcycles)
37 | # ['yamaha', 'suzuki']
38 |
--------------------------------------------------------------------------------
/1.Python编程[从入门到实战]/3.列表简介/3.cars.py:
--------------------------------------------------------------------------------
1 | # Organize a list: sorted() returns a new list and leaves the original unchanged
2 | cars = ['bmw', 'audi', 'toyota', 'subaru']
3 |
4 | print("Here is the original list:")
5 | print(cars)
6 | # ['bmw', 'audi', 'toyota', 'subaru']
7 |
8 | print("\nHere is the sorted list:")
9 | print(sorted(cars))
10 | # ['audi', 'bmw', 'subaru', 'toyota']
11 |
12 | print("\nHere is the reverse alphabetical list:")
13 | print(sorted(cars, reverse=True))
14 | # ['toyota', 'subaru', 'bmw', 'audi']
15 |
16 | print("\nHere is the original list again:")
17 | print(cars)
18 | # ['bmw', 'audi', 'toyota', 'subaru']
19 |
20 | # Find the length of the list
21 | print(len(cars))
22 | # 4
23 |
--------------------------------------------------------------------------------
/1.Python编程[从入门到实战]/4.操作列表/1.magicians.py:
--------------------------------------------------------------------------------
1 | # Loop through a list and print two messages for each magician
2 | magicians = ['alice', 'david', 'carolina']
3 |
4 | for magician in magicians:
5 | print(magician.title() + ", that was a great trick!")
6 | print("I can't wait to see your next trick, " + magician.title() + ".\n")
7 | '''
8 | Alice, that was a great trick!
9 | I can't wait to see your next trick, Alice.
10 |
11 | David, that was a great trick!
12 | I can't wait to see your next trick, David.
13 |
14 | Carolina, that was a great trick!
15 | I can't wait to see your next trick, Carolina.
16 | '''
17 |
18 | print("Thank you everyone, that was a great magic show!")
19 |
--------------------------------------------------------------------------------
/1.Python编程[从入门到实战]/4.操作列表/2.numbers.py:
--------------------------------------------------------------------------------
1 | # Create a list of numbers; range(1, 6) stops before 6
2 | numbers = list(range(1, 6))
3 | print(numbers)
4 | # [1, 2, 3, 4, 5]
5 |
--------------------------------------------------------------------------------
/1.Python编程[从入门到实战]/4.操作列表/3.even_numbers.py:
--------------------------------------------------------------------------------
1 | # Use range() with a step of 2 to build a list of even numbers
2 | even_numbers = list(range(2, 11, 2))
3 | print(even_numbers)
4 | # [2, 4, 6, 8, 10]
5 |
--------------------------------------------------------------------------------
/1.Python编程[从入门到实战]/4.操作列表/4.squares.py:
--------------------------------------------------------------------------------
1 | # Build a list of squares of 1..10 with an explicit loop
2 | squares = []
3 | for value in range(1, 11):
4 | square = value ** 2
5 | squares.append(square)
6 |
7 | print(squares)
8 | # [1, 4, 9, 16, 25, 36, 49, 64, 81, 100]
9 |
10 | # The same list written as a list comprehension
11 | new_squares = [value ** 2 for value in range(1, 11)]
12 | print(new_squares)
13 | # [1, 4, 9, 16, 25, 36, 49, 64, 81, 100]
14 |
--------------------------------------------------------------------------------
/1.Python编程[从入门到实战]/4.操作列表/5.players.py:
--------------------------------------------------------------------------------
1 | # Slice a list: [:3] takes the first three elements
2 | players = ['charles', 'martina', 'michael', 'florence', 'eli']
3 |
4 | print("Here are the first three players on my team:")
5 | for player in players[:3]:
6 | print(player.title())
7 | '''
8 | Charles
9 | Martina
10 | Michael
11 | '''
12 |
--------------------------------------------------------------------------------
/1.Python编程[从入门到实战]/4.操作列表/6.foods.py:
--------------------------------------------------------------------------------
1 | # Copy a list: the [:] slice creates a separate list, so each append below affects only one list
2 | my_foods = ['pizza', 'falafel', 'carrot cake']
3 | friend_foods = my_foods[:]
4 |
5 | my_foods.append('cannoli')
6 | friend_foods.append('ice cream')
7 |
8 | print("My favorite foods are:")
9 | print(my_foods)
10 | # ['pizza', 'falafel', 'carrot cake', 'cannoli']
11 |
12 | print("\nMy friend's favorite foods are:")
13 | print(friend_foods)
14 | # ['pizza', 'falafel', 'carrot cake', 'ice cream']
15 |
--------------------------------------------------------------------------------
/1.Python编程[从入门到实战]/4.操作列表/7.dimensions.py:
--------------------------------------------------------------------------------
1 | # Tuples: access elements by index
2 | dimensions = (200, 50)
3 | print(dimensions[0]) # 200
4 | print(dimensions[1]) # 50
5 |
6 | # Rebind the variable to a new tuple (the original values are printed first)
7 | print("Original dimensions:")
8 | for dimension in dimensions:
9 | print(dimension)
10 | '''
11 | Original dimensions:
12 | 200
13 | 50
14 | '''
15 |
16 | dimensions = (400, 100)
17 | print("\nModified dimensions:")
18 | for dimension in dimensions:
19 | print(dimension)
20 | '''
21 | Modified dimensions:
22 | 400
23 | 100
24 | '''
25 |
--------------------------------------------------------------------------------
/1.Python编程[从入门到实战]/5.if语句/1.cars.py:
--------------------------------------------------------------------------------
1 | # A simple if example: print 'bmw' in upper case and every other name in title case
2 | cars = ['audi', 'bmw', 'subaru', 'toyota']
3 |
4 | for car in cars:
5 | if car == 'bmw':
6 | print(car.upper())
7 | else:
8 | print(car.title())
9 | '''
10 | Audi
11 | BMW
12 | Subaru
13 | Toyota
14 | '''
--------------------------------------------------------------------------------
/1.Python编程[从入门到实战]/5.if语句/2.toppings.py:
--------------------------------------------------------------------------------
1 | # Check whether each requested topping is in the list of available toppings
2 | available_toppings = ['mushrooms', 'olives', 'green peppers', 'pepperoni', 'pineapple', 'extra cheese']
3 |
4 | requested_toppings = ['mushrooms', 'french fries', 'extra cheese']
5 |
6 | for requested_topping in requested_toppings:
7 | if requested_topping in available_toppings:
8 | print("Adding " + requested_topping + ".")
9 | else:
10 | print("Sorry, we don't have " + requested_topping + ".")
11 | '''
12 | Adding mushrooms.
13 | Sorry, we don't have french fries.
14 | Adding extra cheese.
15 | '''
16 |
17 | print("\nFinished making your pizza!")
18 | # Finished making your pizza!
19 |
--------------------------------------------------------------------------------
/1.Python编程[从入门到实战]/5.if语句/3.magic_number.py:
--------------------------------------------------------------------------------
1 | # Compare numbers with the inequality operator !=
2 | answer = 17
3 |
4 | if answer != 42:
5 | print("That is not the correct answer. Please try again!")
6 | # That is not the correct answer. Please try again!
7 |
--------------------------------------------------------------------------------
/1.Python编程[从入门到实战]/5.if语句/4.banned_users.py:
--------------------------------------------------------------------------------
1 | # Check that a specific value is not in a list
2 | banned_users = ['andrew', 'carolina', 'david']
3 | user = 'marie'
4 |
5 | if user not in banned_users:
6 | print(user.title() + ", you can post a response if you wish.")
7 | # Marie, you can post a response if you wish.
8 |
--------------------------------------------------------------------------------
/1.Python编程[从入门到实战]/5.if语句/5.voting.py:
--------------------------------------------------------------------------------
1 | # if-else statement: choose between two message pairs based on age
2 | age = 17
3 | if age >= 18:
4 | print("You are old enough to vote!")
5 | print("Have you registered to vote yet?")
6 | else:
7 | print("Sorry, you are too young to vote.")
8 | print("Please register to vote as soon as you turn 18!")
9 | '''
10 | Sorry, you are too young to vote.
11 | Please register to vote as soon as you turn 18!
12 | '''
13 |
--------------------------------------------------------------------------------
/1.Python编程[从入门到实战]/5.if语句/6.amusement_park.py:
--------------------------------------------------------------------------------
1 | # if-elif chain (no final else: the last branch is an explicit elif test)
2 | age = 12
3 |
4 | if age < 4:
5 | price = 0
6 | elif age < 18:
7 | price = 5
8 | elif age < 65:
9 | price = 10
10 | elif age >= 65:
11 | price = 5
12 |
13 | print("Your admission cost is $" + str(price) + ".")
14 | # Your admission cost is $5.
15 |
--------------------------------------------------------------------------------
/1.Python编程[从入门到实战]/6.字典/1.alien.py:
--------------------------------------------------------------------------------
1 | # Basic dictionary usage: read a value, update it based on another value
2 | alien_0 = {'x_position': 0, 'y_position': 25, 'speed': 'medium'}
3 | print("Original position: " + str(alien_0['x_position']))
4 | # Original position: 0
5 |
6 | if alien_0['speed'] == 'slow':
7 | x_increment = 1
8 | elif alien_0['speed'] == 'medium':
9 | x_increment = 2
10 | else:
11 | x_increment = 3
12 |
13 | alien_0['x_position'] = alien_0['x_position'] + x_increment
14 |
15 | print("New position: " + str(alien_0['x_position']))
16 | # New position: 2
17 |
18 | # Delete a key-value pair with del
19 | del alien_0['speed']
20 | print(alien_0)
21 | # {'x_position': 2, 'y_position': 25}
22 |
--------------------------------------------------------------------------------
/1.Python编程[从入门到实战]/6.字典/2.favorite_languages.py:
--------------------------------------------------------------------------------
1 | # Loop through the key-value pairs of a dictionary with items()
2 | favorite_languages = {
3 | 'jen': 'python',
4 | 'sarah': 'c',
5 | 'edward': 'ruby',
6 | 'phil': 'python',
7 | }
8 |
9 | for name, language in favorite_languages.items():
10 | print(name.title() + "'s favorite language is " + language.title() + ".")
11 | '''
12 | Jen's favorite language is Python.
13 | Sarah's favorite language is C.
14 | Edward's favorite language is Ruby.
15 | Phil's favorite language is Python.
16 | '''
17 |
18 | # Loop through the keys with keys()
19 | for key in favorite_languages.keys():
20 | print(key)
21 | '''
22 | jen
23 | sarah
24 | edward
25 | phil
26 | '''
27 |
28 | # Loop through the values with values()
29 | for value in favorite_languages.values():
30 | print(value)
31 | '''
32 | python
33 | c
34 | ruby
35 | python
36 | '''
--------------------------------------------------------------------------------
/1.Python编程[从入门到实战]/6.字典/3.aliens.py:
--------------------------------------------------------------------------------
# A list of dictionaries.
# Build 30 independent green aliens.
aliens = [{'color': 'green', 'points': 5, 'speed': 'slow'} for _ in range(30)]

# What an alien of a given color turns into; colors not listed stay unchanged.
upgrades = {
    'green': {'color': 'yellow', 'speed': 'medium', 'points': 10},
    'yellow': {'color': 'red', 'speed': 'fast', 'points': 15},
}

# Upgrade only the first three aliens.
for alien in aliens[:3]:
    upgrade = upgrades.get(alien['color'])
    if upgrade is not None:
        alien.update(upgrade)

# Show the first five aliens.
for alien in aliens[:5]:
    print(alien)
print("...")
# Output:
#   {'color': 'yellow', 'points': 10, 'speed': 'medium'}
#   {'color': 'yellow', 'points': 10, 'speed': 'medium'}
#   {'color': 'yellow', 'points': 10, 'speed': 'medium'}
#   {'color': 'green', 'points': 5, 'speed': 'slow'}
#   {'color': 'green', 'points': 5, 'speed': 'slow'}
#   ...
--------------------------------------------------------------------------------
/1.Python编程[从入门到实战]/6.字典/4.pizza.py:
--------------------------------------------------------------------------------
# Storing a list inside a dictionary.

pizza = dict(crust='thick', toppings=['mushrooms', 'extra cheese'])

print("You ordered a {}-crust pizza with the following toppings:".format(pizza['crust']))

# One tab-indented line per topping.
for topping in pizza['toppings']:
    print("\t{}".format(topping))

# Output:
#   You ordered a thick-crust pizza with the following toppings:
#       mushrooms
#       extra cheese
--------------------------------------------------------------------------------
/1.Python编程[从入门到实战]/6.字典/5.many_users.py:
--------------------------------------------------------------------------------
# Storing dictionaries inside a dictionary.

users = {
    'aeinstein': dict(first='albert', last='einstein', location='princeton'),
    'mcurie': dict(first='marie', last='curie', location='paris'),
}

for username, user_info in users.items():
    # Derive the display values first, then print the record.
    full_name = "{first} {last}".format(**user_info)
    location = user_info['location']

    print("\nUsername: " + username)
    print("\tFull name: " + full_name.title())
    print("\tLocation: " + location.title())

# Output:
#   Username: aeinstein
#       Full name: Albert Einstein
#       Location: Princeton
#
#   Username: mcurie
#       Full name: Marie Curie
#       Location: Paris
--------------------------------------------------------------------------------
/1.Python编程[从入门到实战]/6.字典/6.user.py:
--------------------------------------------------------------------------------
# Looping through a dictionary's key-value pairs.

user_0 = dict(username='efermi', first='enrico', last='fermi')

for key, value in user_0.items():
    # A single print produces the blank line, the key line and the value line.
    print("\nKey: {}\nValue: {}".format(key, value))

# Output:
#   Key: username
#   Value: efermi
#
#   Key: first
#   Value: enrico
#
#   Key: last
#   Value: fermi
22 |
--------------------------------------------------------------------------------
/1.Python编程[从入门到实战]/7.用户输入和while循环/1.parrot.py:
--------------------------------------------------------------------------------
# The input() function: echo whatever the user types until they enter 'quit'.
prompt = (
    "\nTell me something, and I will repeat it back to you:"
    "\nEnter 'quit' to end the program. "
)

active = True
while active:
    message = input(prompt)

    # The flag drops to False exactly when the user asks to quit.
    active = message != 'quit'
    if active:
        print(message)

# Sample session:
#   Tell me something, and I will repeat it back to you:
#   Enter 'quit' to end the program. a
#   a
#
#   Tell me something, and I will repeat it back to you:
#   Enter 'quit' to end the program. quit
#
#   Process finished with exit code 0
--------------------------------------------------------------------------------
/1.Python编程[从入门到实战]/7.用户输入和while循环/2.greeter.py:
--------------------------------------------------------------------------------
# Writing a clear, multi-line prompt.
prompt = "\n".join([
    "If you tell us who you are, we can personalize the messages you see.",
    "What is your first name? ",
])

name = input(prompt)
print("\nHello, {}!".format(name))

# Sample session:
#   If you tell us who you are, we can personalize the messages you see.
#   What is your first name? Wang
#
#   Hello, Wang!
#
#   Process finished with exit code 0
--------------------------------------------------------------------------------
/1.Python编程[从入门到实战]/7.用户输入和while循环/3.rollercoaster.py:
--------------------------------------------------------------------------------
# Converting user input to a number.
# input() returns text, so convert it before the numeric comparison.
height = int(input("How tall are you, in inches? "))

print(
    "\nYou're tall enough to ride!"
    if height >= 36
    else "\nYou'll be able to ride when you're a little older."
)

# Sample session:
#   How tall are you, in inches? 32
#
#   You'll be able to ride when you're a little older.
#
#   Process finished with exit code 0
--------------------------------------------------------------------------------
/1.Python编程[从入门到实战]/7.用户输入和while循环/4.even_or_odd.py:
--------------------------------------------------------------------------------
# Decide whether the entered number is even or odd.
number = int(input("Enter a number, and I'll tell you if it's even or odd: "))

# A non-zero remainder after dividing by 2 means the number is odd.
parity = "odd" if number % 2 else "even"
print("\nThe number {} is {}.".format(number, parity))

# Sample session:
#   Enter a number, and I'll tell you if it's even or odd: 11
#
#   The number 11 is odd.
--------------------------------------------------------------------------------
/1.Python编程[从入门到实战]/7.用户输入和while循环/5.counting.py:
--------------------------------------------------------------------------------
# The while loop: count from 1 to 5.
current_number = 1

while not current_number > 5:
    print(current_number)
    current_number = current_number + 1
# Output:
#   1
#   2
#   3
#   4
#   5
--------------------------------------------------------------------------------
/1.Python编程[从入门到实战]/7.用户输入和while循环/6.cities.py:
--------------------------------------------------------------------------------
# Leaving a loop with break.
prompt = (
    "\nPlease tell me a city you have visited:"
    "\n(Enter 'quit' when you are finished.) "
)

while True:
    city = input(prompt)

    if city != 'quit':
        print("I'd love to go to {}!".format(city.title()))
        continue
    # The user typed 'quit': stop asking.
    break

# Sample session:
#   Please tell me a city you have visited:
#   (Enter 'quit' when you are finished.) New York
#   I'd love to go to New York!
#
#   Please tell me a city you have visited:
#   (Enter 'quit' when you are finished.) quit
#
#   Process finished with exit code 0
--------------------------------------------------------------------------------
/1.Python编程[从入门到实战]/7.用户输入和while循环/7.confirmed_users.py:
--------------------------------------------------------------------------------
# Using a while loop to move items from one list to another.
unconfirmed_users = ['alice', 'brian', 'candace']
confirmed_users = []

# Keep verifying until nobody is left; users are taken from the end of the list.
while len(unconfirmed_users) > 0:
    current_user = unconfirmed_users.pop(-1)

    print("Verifying user: {}".format(current_user.title()))
    confirmed_users.append(current_user)
# Output:
#   Verifying user: Candace
#   Verifying user: Brian
#   Verifying user: Alice

print("\nThe following users have been confirmed:")
for confirmed_user in confirmed_users:
    print(confirmed_user.title())
# Output:
#   The following users have been confirmed:
#   Candace
#   Brian
#   Alice
--------------------------------------------------------------------------------
/1.Python编程[从入门到实战]/7.用户输入和while循环/8.pets.py:
--------------------------------------------------------------------------------
# Removing every occurrence of a value from a list.
pets = ['dog', 'cat', 'dog', 'goldfish', 'cat', 'rabbit', 'cat']
print(pets)
# ['dog', 'cat', 'dog', 'goldfish', 'cat', 'rabbit', 'cat']

# remove() deletes only the first match, so call it once per occurrence.
for _ in range(pets.count('cat')):
    pets.remove('cat')

print(pets)
# ['dog', 'dog', 'goldfish', 'rabbit']
11 |
--------------------------------------------------------------------------------
/1.Python编程[从入门到实战]/7.用户输入和while循环/9.mountain_poll.py:
--------------------------------------------------------------------------------
# Filling a dictionary with user input.
responses = {}

polling_active = True

while polling_active:
    # Ask for the participant's name and answer, then record them.
    name = input("\nWhat is your name? ")
    response = input("Which mountain would you like to climb someday? ")
    responses[name] = response

    # Only the exact answer 'no' ends the poll.
    repeat = input("Would you like to let another person respond? (yes/ no) ")
    polling_active = repeat != 'no'

print("\n--- Poll Results ---")
for name, response in responses.items():
    print("{} would like to climb {}.".format(name, response))

# Sample session:
#   What is your name? Wang
#   Which mountain would you like to climb someday? Tai
#   Would you like to let another person respond? (yes/ no) no
#
#   --- Poll Results ---
#   Wang would like to climb Tai.
28 |
--------------------------------------------------------------------------------
/2.Python数据分析/week01/Python3.6基础.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | print("Hello, Python!");
4 |
5 | # 行和缩进
6 | if True:
7 | print("True")
8 | else:
9 | print("False")
10 |
11 | if True:
12 | print("Answer")
13 | print("True")
14 |
15 | else:
16 | print("Answer")
17 | # 没有严格缩进,在执行时保持
18 | print("False")
19 |
20 | # 多行语句
21 | item_one = 1
22 | item_two = 2
23 | item_three = 3
24 | total = item_one + \
25 | item_two + \
26 | item_three
27 | print(total)
28 |
29 | days = ['Monday', 'Tuesday', 'Wednesday',
30 | 'Thursday', 'Friday']
31 | print(days)
32 |
33 | # 引号
34 | word = 'word'
35 | sentence = "这是一个句子。"
36 | paragraph = """这是一个段落。
37 | 包含了多个语句"""
38 | print(word)
39 |
40 | print(sentence)
41 |
42 | print(paragraph)
43 |
44 | # 注释
45 | # 第一个注释
46 | print("Hello, Python!")
47 | # 第二个注释
48 | name = "Madisetti" # 这是一个注释
49 |
50 | '''
51 | 这是多行注释,使用单引号。
52 | 这是多行注释,使用单引号。
53 | 这是多行注释,使用单引号。
54 | '''
55 |
56 | """
57 | 这是多行注释,使用双引号。
58 | 这是多行注释,使用双引号。
59 | 这是多行注释,使用双引号。
60 | """
61 |
62 | # 空行
63 | str = input("\n\nPress the enter key to exit.")
64 |
65 | import sys
66 |
67 | x = 'foo'
68 | sys.stdout.write(x + '\n')
69 |
70 | # 码组
71 | '''
72 | if expression :
73 | suite
74 | elif expression :
75 | suite
76 | else :
77 | suite
78 | '''
79 |
80 | # 帮助
81 | help(sys.stdout.write)
82 |
83 | # 变量赋值
84 | counter = 100 # 赋值整型变量
85 | miles = 1000.0 # 浮点型
86 | name = "John" # 字符串
87 |
88 | print(counter)
89 |
90 | print(miles)
91 |
92 | print(name)
93 |
94 | a = b = c = 1
95 | print(a, b, c)
96 |
97 | a, b, c = 1, 2, "john"
98 | print(a, b, c)
99 |
100 | # 数字
101 | var1 = 1
102 | var2 = 10
103 |
104 | # del var1[,var2[,var3[....,varN]]]]
105 | var = 5896419821
106 | var_a = 0.22
107 | var_b = 3e2
108 | del var
109 | del var_a, var_b
110 |
111 | # 字符串
112 | # s="a1a2•••an"(n>=0)
113 |
114 | s = 'ilovepython'
115 | print(s[1:5])
116 |
117 | print(s[5:-1])
118 |
119 | str = 'Hello World!'
120 | print(str) # 输出完整字符串
121 |
122 | print(str[0]) # 输出字符串中的第一个字符
123 |
124 | print(str[2:5]) # 输出字符串中第三个至第五个之间的字符串
125 |
126 | print(str[2:]) # 输出从第三个字符开始的字符串
127 |
128 | print(str * 2) # 输出字符串两次
129 |
130 | print(str + "TEST") # 输出连接的字符串
131 |
132 | # 列表
133 | list = ['abcd', 786, 2.23, 'john', 70.2]
134 | tinylist = [123, 'john']
135 |
136 | print(list) # 输出完整列表
137 |
138 | print(list[0]) # 输出列表的第一个元素
139 |
140 | print(list[1:3]) # 输出第二个至第三个的元素
141 |
142 | print(list[2:]) # 输出从第三个开始至列表末尾的所有元素
143 |
144 | print(tinylist * 2) # 输出列表两次
145 |
146 | print(list + tinylist) # 打印组合的列表
147 |
148 | # 元组
149 | tuple = ('abcd', 786, 2.23, 'john', 70.2)
150 | tinytuple = (123, 'john')
151 |
152 | print(tuple) # 输出完整元组
153 |
154 | print(tuple[0]) # 输出元组的第一个元素
155 |
156 | print(tuple[1:3]) # 输出第二个至第三个的元素
157 |
158 | print(tuple[2:]) # 输出从第三个开始至列表末尾的所有元素
159 |
160 | print(tinytuple * 2) # 输出元组两次
161 |
162 | print(tuple + tinytuple) # 打印组合的元组
163 |
164 | tuple = ('abcd', 786, 2.23, 'john', 70.2)
165 | list = ['abcd', 786, 2.23, 'john', 70.2]
166 | # tuple[2] = 1000 # 元组中是非法应用
167 | list[2] = 1000 # 列表中是合法应用
168 |
169 | # 元字典
170 | dict = {}
171 | dict['one'] = "This is one"
172 | dict[2] = "This is two"
173 |
174 | tinydict = {'name': 'john', 'code': 6734, 'dept': 'sales'}
175 |
176 | print(dict['one']) # 输出键为'one' 的值
177 |
178 | print(dict[2]) # 输出键为 2 的值
179 |
180 | print(tinydict) # 输出完整的字典
181 |
182 | print(tinydict.keys()) # 输出所有键
183 |
184 | print(tinydict.values()) # 输出所有值
185 |
# Arithmetic operators
a, b, c = 21, 10, 0
label = "Line {} - Value of c is "

c = a + b  # addition
print(label.format(1), c)

c = a - b  # subtraction
print(label.format(2), c)

c = a * b  # multiplication
print(label.format(3), c)

c = a / b  # true division
print(label.format(4), c)

c = a % b  # remainder
print(label.format(5), c)

a, b = 2, 3
c = a ** b  # exponentiation
print(label.format(6), c)

a, b = 10, 5
c = a // b  # floor division
print(label.format(7), c)
215 |
# Comparison operators
a = 21
b = 10
c = 0

# Equality
if a == b:
    print("Line 1 - a is equal to b")
else:
    print("Line 1 - a is not equal to b")

# Inequality
if a != b:
    print("Line 2 - a is not equal to b")
else:
    # print must be called with the message; a bare `print` followed by a
    # string expression outputs nothing in Python 3.
    print("Line 2 - a is equal to b")

if a != b:
    print("Line 3 - a is not equal to b")
else:
    print("Line 3 - a is equal to b")

# Less than
if a < b:
    print("Line 4 - a is less than b")
else:
    print("Line 4 - a is not less than b")

# Greater than
if a > b:
    print("Line 5 - a is greater than b")
else:
    print("Line 5 - a is not greater than b")

a = 5
b = 20

# Less than or equal
if a <= b:
    print("Line 6 - a is either less than or equal to b")
else:
    print("Line 6 - a is neither less than nor equal to b")

# Greater than or equal: the comparison is b against a, so the message names a.
if b >= a:
    print("Line 7 - b is either greater than or equal to a")
else:
    print("Line 7 - b is neither greater than nor equal to a")
265 |
# Assignment operators, each written out in its long form.
a, b, c = 21, 10, 0

c = a + b
print("Line 1 - Value of c is ", c)

c = c + a  # same as c += a
print("Line 2 - Value of c is ", c)

c = c * a  # same as c *= a
print("Line 3 - Value of c is ", c)

c = c / a  # same as c /= a
print("Line 4 - Value of c is ", c)

c = 2
c = c % a  # same as c %= a
print("Line 5 - Value of c is ", c)

c = c ** a  # same as c **= a
print("Line 6 - Value of c is ", c)

c = c // a  # same as c //= a
print("Line 7 - Value of c is ", c)
292 |
# Bitwise operators
a = 0b00111100  # 60
b = 0b00001101  # 13
c = 0
label = "Line %d - Value of c is "

c = a & b  # AND:  12 = 0000 1100
print(label % 1, c)

c = a | b  # OR:   61 = 0011 1101
print(label % 2, c)

c = a ^ b  # XOR:  49 = 0011 0001
print(label % 3, c)

c = ~a  # NOT: -61
print(label % 4, c)

c = a << 2  # left shift:  240 = 1111 0000
print(label % 5, c)

c = a >> 2  # right shift:  15 = 0000 1111
print(label % 6, c)
315 |
316 | # 逻辑运算符
317 | a = 10
318 | b = 20
319 | c = 0
320 |
321 | if (a and b):
322 | print("Line 1 - a and b are true")
323 |
324 | else:
325 | print("Line 1 - Either a is not true or b is not true")
326 |
327 | if (a or b):
328 | print("Line 2 - Either a is true or b is true or both are true")
329 |
330 | else:
331 | print("Line 2 - Neither a is true nor b is true")
332 |
333 | a = 0
334 | if (a and b):
335 | print("Line 3 - a and b are true")
336 |
337 | else:
338 | print("Line 3 - Either a is not true or b is not true")
339 |
340 | if (a or b):
341 | print("Line 4 - Either a is true or b is true or both are true")
342 |
343 | else:
344 | print("Line 4 - Neither a is true nor b is true")
345 |
346 | if not (a and b):
347 | print("Line 5 - Either a is not true or b is not true or both are not true")
348 | else:
349 | print("Line 5 - a and b are true")
350 |
351 | # 成员运算符
352 | a = 10
353 | b = 20
354 | list = [1, 2, 3, 4, 5]
355 |
356 | if (a in list):
357 | print("Line 1 - a is available in the given list")
358 |
359 | else:
360 | print("Line 1 - a is not available in the given list")
361 |
362 | if (b not in list):
363 | print("Line 2 - b is not available in the given list")
364 |
365 | else:
366 | print("Line 2 - b is available in the given list")
367 |
368 | a = 2
369 | if (a in list):
370 | print("Line 3 - a is available in the given list")
371 |
372 | else:
373 | print("Line 3 - a is not available in the given list")
374 |
375 | # 身份运算符
376 | a = 20
377 | b = 20
378 |
379 | if (a is b):
380 | print("Line 1 - a and b have same identity")
381 |
382 | else:
383 | print("Line 1 - a and b do not have same identity")
384 |
385 | if (id(a) == id(b)):
386 | print("Line 2 - a and b have same identity")
387 |
388 | else:
389 | print("Line 2 - a and b do not have same identity")
390 |
391 | b = 30
392 | if (a is b):
393 | print("Line 3 - a and b have same identity")
394 |
395 | else:
396 | print("Line 3 - a and b do not have same identity")
397 |
398 | if (a is not b):
399 | print("Line 4 - a and b do not have same identity")
400 |
401 | else:
402 | print("Line 4 - a and b have same identity")
403 |
# Operator precedence
a, b, c, d, e = 20, 10, 15, 5, 0

# Each entry pairs the expression's text with its evaluated result.
results = (
    ("(a + b) * c / d", (a + b) * c / d),      # (30 * 15) / 5
    ("((a + b) * c) / d", ((a + b) * c) / d),  # (30 * 15) / 5
    ("(a + b) * (c / d)", (a + b) * (c / d)),  # 30 * (15 / 5)
    ("a + (b * c) / d", a + (b * c) / d),      # 20 + (150 / 5)
)

for expression, e in results:
    print("Value of {} is ".format(expression), e)
422 |
--------------------------------------------------------------------------------
/2.Python数据分析/week02/Python函数.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # 自定义函数
3 | '''
4 | def functionname( parameters ):
5 | "函数_文档字符串"
6 | function_suite
7 | return [expression]
8 | '''
9 |
10 |
def printme(str):
    """Print the given string to standard output."""
    print(str)


# Call the function twice with different messages.
for message in ("我要调用用户自定义函数!", "再次调用同一函数"):
    printme(message)
20 |
21 |
# A function that mutates its argument.
def changeme(mylist):
    """Append the nested list [1, 2, 3, 4] to ``mylist`` in place and print it."""
    extra = [1, 2, 3, 4]
    mylist.append(extra)
    print("函数内取值: ", mylist)


# Call changeme; the caller's list shows the same change afterwards.
mylist = [10, 20, 30]
changeme(mylist)
print("函数外取值: ", mylist)
34 |
35 |
# Required arguments
def printme(str):
    """Print whatever string is passed in.

    ``str`` is a required positional argument; calling the function without
    it raises TypeError.
    """
    print(str)


# Calling printme without its required argument raises TypeError. Catch it so
# the demonstration shows the error without aborting the rest of the script.
try:
    printme()
except TypeError as error:
    print("TypeError:", error)
46 |
47 |
def printme(str):
    """Print whatever string is passed in."""
    print(str)


# Call printme, naming the argument explicitly.
printme(str="My string")
56 |
57 |
def printinfo(name, age):
    """Print the name and the age, each on its own line."""
    for label, value in (("Name: ", name), ("Age ", age)):
        print(label, value)


# Keyword arguments may be given in any order.
printinfo(age=50, name="miki")
67 |
68 |
def printinfo(name, age=35):
    """Print the name and the age; ``age`` falls back to 35 when omitted."""
    # Two spaces after each label reproduce print()'s separator after a
    # label that already ends in a space.
    print("Name:  {}\nAge  {}".format(name, age))


# Call once with an explicit age and once relying on the default.
printinfo(age=50, name="miki")
printinfo(name="miki")
79 |
80 | # 不定长参数
81 | '''
82 | def functionname([formal_args,] *var_args_tuple ):
83 | "函数_文档字符串"
84 | function_suite
85 | return [expression]
86 | '''
87 |
88 |
def printinfo(arg1, *vartuple):
    """Print a header, then ``arg1`` and every extra positional argument."""
    print("输出: ")
    # The required argument and the extras are printed the same way, one per line.
    for item in (arg1,) + vartuple:
        print(item)


# Call with only the required argument, then with extra arguments.
printinfo(10)
printinfo(70, 60, 50)
101 |
102 | # 匿名函数
103 | '''
104 | lambda [arg1 [,arg2,.....argn]]:expression
105 | '''
106 |
# NOTE: the name `sum` shadows the built-in of the same name from here on.
sum = lambda arg1, arg2: arg1 + arg2
# Call the lambda with two pairs of operands.
for left, right in ((10, 20), (20, 20)):
    print("相加后的值为 : ", sum(left, right))
112 |
113 |
# The return statement
def sum(arg1, arg2):
    """Add the two arguments, print the result, and return it."""
    result = arg1 + arg2
    print("函数内 : ", result)
    return result


# Call sum and keep the returned value.
total = sum(10, 20)
print("函数外 : ", total)
125 |
# Variable scope
total = 0  # module-level (global) variable


def sum(arg1, arg2):
    """Add the two arguments using a local ``total`` and return the result."""
    # This assignment creates a local name; the global `total` is untouched.
    total = arg1 + arg2
    print("函数内是局部变量 : ", total)
    return total


# Call sum; the global total printed afterwards is still 0.
sum(10, 20)
print("函数外是全局变量 : ", total)
141 |
142 | # 键盘输入
143 | str = input("Please enter:");
144 | print("你输入的内容是: ", str)
145 |
# Opening and closing files
# Open a file and inspect the attributes of the file object.
fo = open("foo.txt", "wb")
print("文件名: ", fo.name)

print("是否已关闭 : ", fo.closed)

print("访问模式 : ", fo.mode)

# File objects have no `softspace` attribute in Python 3, so it is not
# printed here. Close the handle before the file is opened again.
fo.close()

# Open a file, show its name, then close it.
fo = open("foo.txt", "wb")
print("文件名: ", fo.name)

fo.close()

# Write to the file. It is opened in binary mode, so the payload must be
# bytes; the with-block closes the file even if the write fails.
with open("foo.txt", "wb") as fo:
    fo.write(b"hello\npython")

# Read the first 10 characters back.
with open("foo.txt", "r+") as fo:
    str = fo.read(10)
    print("读取的字符串是 : ", str)

# Read, report the current position, rewind, and read again.
with open("foo.txt", "r+") as fo:
    str = fo.read(10)
    print("读取的字符串是 : ", str)

    # Report the current position in the file.
    position = fo.tell()
    print("当前文件位置 : ", position)

    # Move back to the beginning of the file and read again.
    position = fo.seek(0, 0)
    str = fo.read(10)
    print("重新读取字符串 : ", str)
194 |
import os

# Rename test1.txt to test2.txt. os.rename raises FileNotFoundError when the
# source is missing, so only attempt it when the file exists.
if os.path.exists("test1.txt"):
    os.rename("test1.txt", "test2.txt")

# Delete test2.txt, again only when it is actually present.
if os.path.exists("test2.txt"):
    os.remove("test2.txt")
204 |
# Exception handling
# try / except / else: the else branch runs only when no IOError was raised.
# The with-block closes the file on both the success and the failure path.
try:
    with open("testfile", "w") as fh:
        fh.write("This is my test file for exception handling!!")
except IOError:
    print("Error: can\'t find file or read data")
else:
    print("Written content in the file successfully")

# Writing to a file opened read-only raises io.UnsupportedOperation, which is
# an OSError (IOError) subclass, so the except branch runs here.
try:
    with open("testfile", "r") as fh:
        fh.write("This is my test file for exception handling!!")
except IOError:
    print("Error: can\'t find file or read data")
else:
    print("Written content in the file successfully")

# try / finally: the finally branch runs whether or not the write succeeded,
# so it is used for cleanup rather than for reporting an error.
fh = open("testfile", "w")
try:
    fh.write("This is my test file for exception handling!!")
finally:
    fh.close()
    print("File closed")

# Nested form: the inner finally always closes the file, and the outer except
# reports any IOError raised by open() or write().
try:
    fh = open("testfile", "w")
    try:
        fh.write("This is my test file for exception handling!!")
    finally:
        print("Going to close the file")
        fh.close()
except IOError:
    print("Error: can\'t find file or read data")
242 |
243 | '''
244 | try:
245 | Business
246 | Logic
247 | here...
248 | except "Invalid level!":
249 | Exception
250 | handling
251 | here...
252 | else:
253 | Rest
254 | of
255 | the
256 | code
257 | here...
258 | '''
259 |
260 |
# User-defined exception
class Networkerror(RuntimeError):
    """Error raised for network problems; carries a single message argument."""

    def __init__(self, arg):
        # Hand the message to the base class so that ``args`` becomes the
        # one-element tuple ``(arg,)``. Assigning a string directly to
        # ``self.args`` would split it into a tuple of single characters.
        super().__init__(arg)


try:
    raise Networkerror("Bad hostname")
except Networkerror as e:
    # Show the arguments the exception was raised with.
    print(e.args)
271 |
--------------------------------------------------------------------------------
/2.Python数据分析/week02/Python条件语句.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # 条件语句
3 | '''
4 | if 判断条件:
5 | 执行语句……
6 | else:
7 | 执行语句……
8 | '''
9 |
10 | flag = False
11 | name = 'python'
12 | if name == 'python': # 判断变量否为'python'
13 | flag = True # 条件成立时设置标志为真
14 | print('welcome boss') # 并输出欢迎信息
15 | else:
16 | print(name) # 条件不成立时输出变量名称
17 |
18 | '''
19 | if 判断条件1:
20 | 执行语句1……
21 | elif 判断条件2:
22 | 执行语句2……
23 | elif 判断条件3:
24 | 执行语句3……
25 | else:
26 | 执行语句4……
27 | '''
28 |
29 | num = 2
30 | if num == 3: # 判断num的值
31 | print('boss')
32 |
33 | elif num == 2:
34 | print('user')
35 |
36 | elif num == 1:
37 | print('worker')
38 |
39 | elif num < 0: # 值小于零时输出
40 | print('error')
41 |
42 | else:
43 | print('roadman') # 条件均不成立时输出
44 |
45 | num = 9
46 | if num >= 0 and num <= 10: # 判断值是否在0~10之间
47 | print('hello')
48 |
49 | num = 10
50 | if num < 0 or num > 10: # 判断值是否在小于0或大于10
51 | print('hello')
52 |
53 | else:
54 | print('undefine')
55 |
56 | num = 8
57 | # 判断值是否在0~5或者10~15之间
58 | if (num >= 0 and num <= 5) or (num >= 10 and num <= 15):
59 | print('hello')
60 |
61 | else:
62 | print('undefine')
63 |
64 | var = 100
65 | if (var == 100): print("变量 var 的值为100")
66 |
67 | print("Good bye!")
68 |
69 | # while语句
70 | '''
71 | while 判断条件:
72 | 执行语句……
73 | '''
74 | count = 0
75 | while (count < 9):
76 | print('The count is:', count)
77 |
78 | count = count + 1
79 |
80 | print("Good bye!")
81 |
# Using continue and break

i = 1
while i < 10:
    i += 1
    # Only even values are printed: 2, 4, 6, 8, 10.
    if i % 2 == 0:
        print(i)

i = 1
while True:  # runs until the break below
    print(i)  # prints 1 through 10

    i += 1
    if i > 10:  # leave the loop once i has passed 10
        break
98 |
99 | # 死循环
100 | '''
101 | var = 1
102 | while var == 1 : # 该条件永远为true,循环将无限执行下去
103 | num = raw_input("Enter a number :")
104 | print "You entered: ", num
105 |
106 | print "Good bye!"
107 | '''
108 |
109 | # while … else
110 | count = 0
111 | while count < 5:
112 | print(count, " is less than 5")
113 |
114 | count = count + 1
115 | else:
116 | print(count, " is not less than 5")
117 |
118 | # 简单语句组
119 | flag = 0
120 | while (flag): print('Given flag is really true!')
121 |
122 | flag = 0
123 | print("Good bye!")
124 |
125 | # for语句
126 | '''
127 | for iterating_var in sequence:
128 | statements(s)
129 | '''
130 | for letter in 'Python': # 第一个实例
131 | print('当前字母 :', letter)
132 |
133 | fruits = ['banana', 'apple', 'mango']
134 | for fruit in fruits: # 第二个实例
135 | print('当前水果 :', fruit)
136 |
137 | print("Good bye!")
138 |
139 | # 序列索引迭代
140 | fruits = ['banana', 'apple', 'mango']
141 | for index in range(len(fruits)):
142 | print('当前水果 :', fruits[index])
143 |
144 | print("Good bye!")
145 |
# for...else
for num in range(10, 20):  # the numbers 10 through 19
    for i in range(2, num):  # candidate factors
        if num % i == 0:  # first factor found
            # Floor division keeps the cofactor an int; `/` would produce a
            # float in Python 3.
            j = num // i
            print('%d 等于 %d * %d' % (num, i, j))
            break  # leave the inner loop
    else:  # runs only when the inner loop finished without a break
        print(num, '是一个质数')
155 |
# Nested loops: print the prime numbers below 100.
i = 2
while i < 100:
    j = 2
    # Trial division by every j with j <= i / j.
    while j <= i / j:
        if i % j == 0:
            break
        j += 1
    # No divisor was found if the scan ran past the i / j bound.
    if j > i / j:
        print(i, " 是素数")
    i += 1

print("Good bye!")
167 |
168 | # break语句
169 | for letter in 'Python': # First Example
170 | if letter == 'h':
171 | break
172 | print('Current Letter :', letter)
173 |
174 | var = 10 # Second Example
175 | while var > 0:
176 | print('Current variable value :', var)
177 |
178 | var = var - 1
179 | if var == 5:
180 | break
181 |
182 | print("Good bye!")
183 |
184 | # continue语句
185 | for letter in 'Python': # 第一个实例
186 | if letter == 'h':
187 | continue
188 | print('当前字母 :', letter)
189 |
190 | var = 10 # 第二个实例
191 | while var > 0:
192 | var = var - 1
193 | if var == 5:
194 | continue
195 | print('当前变量值 :', var)
196 |
197 | print("Good bye!")
198 |
199 | # pass语句
200 | # 输出 Python 的每个字母
201 | for letter in 'Python':
202 | if letter == 'h':
203 | pass
204 | print('这是 pass 块')
205 | print('当前字母 :', letter)
206 |
207 | print("Good bye!")
208 |
209 | # 格式字符串
210 | print("My name is %s and weight is %d kg!" % ('Zara', 21))
211 |
212 | # 时间与日期
213 | import time # This is required to include time module.
214 |
215 | ticks = time.time()
216 | print("Number of ticks since 12:00am, January 1, 1970:", ticks)
217 |
218 | localtime = time.localtime(time.time())
219 | print("Local current time :", localtime)
220 |
221 | localtime = time.asctime(time.localtime(time.time()))
222 | print("Local current time :", localtime)
223 |
224 | import calendar
225 |
226 | cal = calendar.month(2008, 1)
227 | print("Here is the calendar:")
228 |
229 | print(cal)
230 |
--------------------------------------------------------------------------------
/2.Python数据分析/week03/numpy.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # 向量相加-Python
def pythonsum(n):
    """Return the list [i**2 + i**3 for i in 0..n-1], using plain Python only."""
    return [i ** 2 + i ** 3 for i in range(n)]
12 |
13 |
14 | # print(pythonsum(3))
15 |
16 | # 向量相加-NumPy
17 |
18 |
def numpysum(n):
    """Return a NumPy array holding i**2 + i**3 for i in 0..n-1."""
    base = np.arange(n)
    return base ** 2 + base ** 3
24 |
25 |
26 | # print(numpysum(3))
27 |
28 | # 效率比较
29 | from datetime import datetime
30 |
31 | import numpy as np
32 |
33 | size = 1000
34 |
35 | start = datetime.now()
36 | c = pythonsum(size)
37 | delta = datetime.now() - start
38 | # print("The last 2 elements of the sum", c[-2:])
39 | # print("PythonSum elapsed time in microseconds", delta.microseconds)
40 |
41 | start = datetime.now()
42 | c = numpysum(size)
43 | delta = datetime.now() - start
44 | # print("The last 2 elements of the sum", c[-2:])
45 | # print("NumPySum elapsed time in microseconds", delta.microseconds)
46 |
47 |
48 | # numpy数组
49 | a = np.arange(5)
50 | # print(a.dtype)
51 | # print(a)
52 | # print(a.shape)
53 |
54 | # 创建多维数组
55 | m = np.array([np.arange(2), np.arange(2)])
56 | # print(m)
57 | # print(m.shape)
58 | # print(m.dtype)
59 | # print(np.zeros(10))
60 | # print(np.zeros((3, 6)))
61 | # print(np.empty((2, 3, 2)))
62 | # print(np.arange(15))
63 |
64 |
65 | # 选取数组元素
66 | a = np.array([[1, 2], [3, 4]])
67 |
68 | # print("In: a")
69 | # print(a)
70 | #
71 | # print("In: a[0,0]")
72 | # print(a[0, 0])
73 | # print("In: a[0,1]")
74 | # print(a[0, 1])
75 | #
76 | # print("In: a[1,0]")
77 | # print(a[1, 0])
78 | #
79 | # print("In: a[1,1]")
80 | # print(a[1, 1])
81 |
82 | # numpy数据类型
83 | # print("In: float64(42)")
84 | # print(np.float64(42))
85 | #
86 | # print("In: int8(42.0)")
87 | # print(np.int8(42.0))
88 | #
89 | # print("In: bool(42)")
90 | # print(np.bool(42))
91 | #
92 | # print(np.bool(0))
93 | #
94 | # print("In: bool(42.0)")
95 | # print(np.bool(42.0))
96 | #
97 | # print("In: float(True)")
98 | # print(np.float(True))
99 | # print(np.float(False))
100 | #
101 | # print("In: arange(7, dtype=uint16)")
102 | # print(np.arange(7, dtype=np.uint16))
103 | #
104 | # print("In: int(42.0 + 1.j)")
105 | #
106 | # #Type error
107 | # try:
108 | # print(np.int(42.0 + 1.j))
109 | # except TypeError:
110 | # print("TypeError")
111 |
112 | # Type error
113 | # print("In: float(42.0 + 1.j)")
114 | # print(float(42.0 + 1.j))
115 |
116 |
# Data type conversion
arr = np.array([1, 2, 3, 4, 5])
# print(arr.dtype)
# astype returns a new array converted to the requested dtype.
float_arr = arr.astype(np.float64)
# print(float_arr.dtype)

arr = np.array([3.7, -1.2, -2.6, 0.5, 12.9, 10.1])
# print(arr)
# print(arr.astype(np.int32))

# np.bytes_ is the byte-string scalar type. The older np.string_ alias was
# removed in NumPy 2.0, while np.bytes_ is available in both 1.x and 2.x.
numeric_strings = np.array(['1.25', '-9.6', '42'], dtype=np.bytes_)
# print(numeric_strings.astype(float))
129 |
130 |
131 | # 数据类型对象
132 | a = np.array([[1, 2], [3, 4]])
133 |
134 | # print(a.dtype.byteorder)
135 | #
136 | # print(a.dtype.itemsize)
137 |
138 | # 字符编码
139 | # print(np.arange(7, dtype='f'))
140 | # print(np.arange(7, dtype='D'))
141 | #
142 | # print(np.dtype(float))
143 | #
144 | # print(np.dtype('f'))
145 | #
146 | # print(np.dtype('d'))
147 | #
148 | # print(np.dtype('f8'))
149 | #
150 | # print(np.dtype('Float64'))
151 |
# Attributes of the dtype class
# Use the lowercase name: the capitalized alias 'Float64' is not accepted by
# NumPy 1.20 and later.
t = np.dtype('float64')
# print(t.char)
# print(t.type)
# print(t.str)
157 |
158 |
159 | # 创建自定义数据类型
160 | t = np.dtype([('name', np.str_, 40), ('numitems', np.int32), ('price', np.float32)])
161 | # print(t)
162 | # print(t['name'])
163 | itemz = np.array([('Meaning of life DVD', 42, 3.14), ('Butter', 13, 2.72)], dtype=t)
164 | # print(itemz[1])
165 |
166 | # 数组与标量的运算
167 | arr = np.array([[1., 2., 3.], [4., 5., 6.]])
168 | # print(arr)
169 | # print(arr * arr)
170 | # print(arr - arr)
171 | # print(1 / arr)
172 | # print(arr ** 0.5)
173 |
174 |
175 | # 一维数组的索引与切片
176 | a = np.arange(9)
177 | # print(a[3:7])
178 | # 0到7步长为2
179 | # print(a[:7:2])
180 | # 倒序
181 | # print(a[::-1])
182 |
183 | # 同a[3:7:2]
184 | s = slice(3, 7, 2)
185 | # print(a[s])
186 | # 同a[0:0:-1]
187 | s = slice(None, None, -1)
188 | # print(a[s])
189 |
190 | # 多维数组的切片与索引
191 | # 实际上:3*4*2
192 | b = np.arange(24).reshape(2, 3, 4)
193 |
194 | # print(b.shape)
195 | # print("------------------")
196 | # print(b)
197 | # print("------------------")
198 | # print(b[0, 0, 0])
199 | # print("------------------")
200 | # print(b[:, 0, 0])
201 | # print("------------------")
202 | # print(b[0])
203 | # print("------------------")
204 | # print(b[0, :, :])
205 | # print("------------------")
206 | # print(b[0, ...])
207 | # print("------------------")
208 | # print(b[0, 1])
209 | # print("------------------")
210 | # print(b[0, 1, ::2])
211 | # print("------------------")
212 | # print(b[..., 1])
213 | # print("------------------")
214 | # print(b[:, 1])
215 | # print("------------------")
216 | # print(b[0, :, 1])
217 | # print("------------------")
218 | # print(b[0, :, -1])
219 | # print("------------------")
220 | # print(b[0, ::-1, -1])
221 | # print("------------------")
222 | # print(b[0, ::2, -1])
223 | # print("------------------")
224 | # print(b[::-1])
225 | # print("------------------")
226 |
227 | s = slice(None, None, -1)
228 | # print(b[(s, s, s)])
229 |
230 | # 布尔型索引
231 |
232 | names = np.array(['Bob', 'Joe', 'Will', 'Bob', 'Will', 'Joe', 'Joe'])
233 | data = np.random.randn(7, 4)
234 | # print(names)
235 | # print(data)
236 |
237 | # print(names == 'Bob')
238 | # print(data[names == 'Bob'])
239 |
240 | # print(data[names == 'Bob', 2:])
241 | # print(data[names == 'Bob', 3])
242 |
243 | names != 'Bob'
244 | # print(data[-(names == 'Bob')])
245 |
246 | mask = (names == 'Bob') | (names == 'Will')
247 | # print(mask)
248 | # print(data[mask])
249 |
250 | data[data < 0] = 0
251 | # print(data)
252 |
253 | data[names != 'Joe'] = 7
254 | # print(data)
255 |
256 | # 花式索引
257 | arr = np.empty((8, 4))
258 | for i in range(8):
259 | arr[i] = i
260 | # print(arr)
261 |
262 | # print(arr[[4, 3, 0, 6]])
263 | # print(arr[[-3, -5, -7]])
264 |
265 | arr = np.arange(32).reshape((8, 4))
266 | # print(arr)
267 | # 获取4个(m,n)
268 | # print(arr[[1, 5, 7, 2], [0, 3, 1, 2]])
269 | # 单排重排序
270 | # print(arr[[1, 5, 7, 2]][:, [0, 3, 1, 2]])
271 | # 同上
272 | # print(arr[np.ix_([1, 5, 7, 2], [0, 3, 1, 2])])
273 |
274 | # 数组转置
275 | arr = np.arange(15).reshape((3, 5))
276 | # print(arr)
277 | # print(arr.T)
278 |
279 | # 改变数组的维度
280 | b = np.arange(24).reshape(2, 3, 4)
281 | # print(b)
282 | # print(b.ravel())
283 | # print(b.flatten())
284 |
285 | b.shape = (6, 4)
286 | # print(b)
287 | # print(b.transpose())
288 |
289 | b.resize((2, 12))
290 | # print(b)
291 |
292 | # 组合数组
293 | a = np.arange(9).reshape(3, 3)
294 | # print(a)
295 |
296 | b = 2 * a
297 | # print(b)
298 | # 水平组合
299 | # print(np.hstack((a, b)))
300 | # 水平组合
301 | # print(np.concatenate((a, b), axis=1))
302 | # 垂直组合
303 | # print(np.vstack((a, b)))
304 | # 垂直组合
305 | # print(np.concatenate((a, b), axis=0))
306 | # 深度组合
307 | # print(np.dstack((a, b)))
308 |
309 | oned = np.arange(2)
310 | # print(oned)
311 | twice_oned = 2 * oned
312 | # print(twice_oned)
313 | '''
314 | # 列组合
315 | print(np.column_stack((oned, twice_oned)))
316 | # 列组合即水平组合
317 | print(np.column_stack((a, b)))
318 | # 两种方法一样
319 | print(np.column_stack((a, b)) == np.hstack((a, b)))
320 | # 行组合
321 | print(np.row_stack((oned, twice_oned)))
322 | # 行组合即垂直组合
323 | print(np.row_stack((a, b)))
324 | # 两种方法完全一样
325 | print(np.row_stack((a, b)) == np.vstack((a, b)))
326 | '''
327 |
328 | # 数组的分割
329 | a = np.arange(9).reshape(3, 3)
330 | '''
331 | print(a)
332 | # 纵向分割
333 | print(np.hsplit(a, 3))
334 | # 纵向分割
335 | print(np.split(a, 3, axis=1))
336 | # 水平分割
337 | print(np.vsplit(a, 3))
338 | # 水平分割
339 | print(np.split(a, 3, axis=0))
340 | '''
341 |
342 | c = np.arange(27).reshape(3, 3, 3)
343 | # 深度分割
344 | # print(c)
345 | # print(np.dsplit(c, 3))
346 |
347 |
348 | # 数组的属性
349 | b = np.arange(24).reshape(2, 12)
350 | '''
351 | # 维度
352 | print(b.ndim)
353 | # 尺寸(个数)
354 | print(b.size)
355 | # 每个元素的字节数
356 | print(b.itemsize)
357 | # 内存大小,上面的乘积
358 | print(b.nbytes)
359 | '''
360 |
361 | # 虚数
362 | b = np.array([1. + 1.j, 3. + 2.j])
363 | # print(b.real)
364 | # print(b.imag)
365 |
366 | b = np.arange(4).reshape(2, 2)
367 | # 类型
368 | # print(b.flat)
369 | # 展开成以为数组,用索引获取,还可以赋值
370 | # print(b.flat[2])
371 |
372 | # 数组的转换
373 | b = np.array([1. + 1.j, 3. + 2.j])
374 | # print(b)
375 | # 转换成列表
376 | # print(b.tolist())
377 | # 转换成字符串
378 | # print(b.tostring())
379 |
380 | # print(np.fromstring('20:42:52', sep=':', dtype=int))
381 |
382 | # print(b)
383 | # 虚数转实数,有数据丢失风险,因为丢掉了虚部
384 | # print(b.astype(int))
385 | # 实数转虚数
386 | # print(b.astype('complex'))
387 |
--------------------------------------------------------------------------------
/2.Python数据分析/week04/data.csv:
--------------------------------------------------------------------------------
1 | AAPL,28-01-2011, ,344.17,344.4,333.53,336.1,21144800
2 | AAPL,31-01-2011, ,335.8,340.04,334.3,339.32,13473000
3 | AAPL,01-02-2011, ,341.3,345.65,340.98,345.03,15236800
4 | AAPL,02-02-2011, ,344.45,345.25,343.55,344.32,9242600
5 | AAPL,03-02-2011, ,343.8,344.24,338.55,343.44,14064100
6 | AAPL,04-02-2011, ,343.61,346.7,343.51,346.5,11494200
7 | AAPL,07-02-2011, ,347.89,353.25,347.64,351.88,17322100
8 | AAPL,08-02-2011, ,353.68,355.52,352.15,355.2,13608500
9 | AAPL,09-02-2011, ,355.19,359,354.87,358.16,17240800
10 | AAPL,10-02-2011, ,357.39,360,348,354.54,33162400
11 | AAPL,11-02-2011, ,354.75,357.8,353.54,356.85,13127500
12 | AAPL,14-02-2011, ,356.79,359.48,356.71,359.18,11086200
13 | AAPL,15-02-2011, ,359.19,359.97,357.55,359.9,10149000
14 | AAPL,16-02-2011, ,360.8,364.9,360.5,363.13,17184100
15 | AAPL,17-02-2011, ,357.1,360.27,356.52,358.3,18949000
16 | AAPL,18-02-2011, ,358.21,359.5,349.52,350.56,29144500
17 | AAPL,22-02-2011, ,342.05,345.4,337.72,338.61,31162200
18 | AAPL,23-02-2011, ,338.77,344.64,338.61,342.62,23994700
19 | AAPL,24-02-2011, ,344.02,345.15,338.37,342.88,17853500
20 | AAPL,25-02-2011, ,345.29,348.43,344.8,348.16,13572000
21 | AAPL,28-02-2011, ,351.21,355.05,351.12,353.21,14395400
22 | AAPL,01-03-2011, ,355.47,355.72,347.68,349.31,16290300
23 | AAPL,02-03-2011, ,349.96,354.35,348.4,352.12,21521000
24 | AAPL,03-03-2011, ,357.2,359.79,355.92,359.56,17885200
25 | AAPL,04-03-2011, ,360.07,360.29,357.75,360,16188000
26 | AAPL,07-03-2011, ,361.11,361.67,351.31,355.36,19504300
27 | AAPL,08-03-2011, ,354.91,357.4,352.25,355.76,12718000
28 | AAPL,09-03-2011, ,354.69,354.76,350.6,352.47,16192700
29 | AAPL,10-03-2011, ,349.69,349.77,344.9,346.67,18138800
30 | AAPL,11-03-2011, ,345.4,352.32,345,351.99,16824200
31 |
--------------------------------------------------------------------------------
/2.Python数据分析/week04/numpy常用函数.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import division
3 |
4 | import numpy as np
5 | from numpy.random import randn
6 |
7 | # 通用函数
8 | arr = np.arange(10)
9 | # print(arr)
10 | # print(np.sqrt(arr))
11 | # print(np.exp(arr))
12 |
13 | x = randn(8)
14 | y = randn(8)
15 | # print(x)
16 | # print(y)
17 | # print(np.maximum(x, y)) # 元素级最大值
18 |
19 | arr = randn(7)
20 | # print(arr)
21 | # print(np.modf(arr)) # 分解整数小数部分
22 |
23 | # 利用数组进行数据处理
24 | # 向量化
25 | points = np.arange(-5, 5, 0.01) # 1000均等分点
26 | xs, ys = np.meshgrid(points, points)
27 | # print(xs)
28 | # print(ys)
29 |
30 | z = np.sqrt(xs ** 2 + ys ** 2)
31 | # print(z)
32 | # plt.imshow(z, cmap=plt.cm.gray);
33 | # plt.colorbar()
34 | # plt.title("Image plot of $\sqrt{x^2 + y^2}$ for a grid of values")
35 | # plt.draw()
36 | # plt.show()
37 |
# Expressing conditional logic as array operations
xarr = np.array([1.1, 1.2, 1.3, 1.4, 1.5])
yarr = np.array([2.1, 2.2, 2.3, 2.4, 2.5])
cond = np.array([True, False, True, True, False])

# Pure-Python version: pick from xarr where cond is true, otherwise from yarr.
result = []
for x_val, y_val, take_x in zip(xarr, yarr, cond):
    result.append(x_val if take_x else y_val)
# print(result)

# Vectorised version of the same selection.
result = np.where(cond, xarr, yarr)
# print(result)
49 |
50 | arr = randn(4, 4)
51 | # print(arr)
52 | # print(np.where(arr > 0, 2, -2)) # 大于0为2,小于0为-2
53 | # print(np.where(arr > 0, 2, arr)) # 大于0为2
54 |
55 | '''
56 | result = []
57 | for i in range(n):
58 | if cond1[i] and cond2[i]:
59 | result.append(0)
60 | elif cond1[i]:
61 | result.append(1)
62 | elif cond2[i]:
63 | result.append(2)
64 | else:
65 | result.append(3)
66 |
67 | # Not to be executed
68 | np.where(cond1 & cond2, 0,
69 | np.where(cond1, 1,
70 | np.where(cond2, 2, 3)))
71 |
72 | # Not to be executed
73 | result = 1 * cond1 + 2 * cond2 + 3 * -(cond1 | cond2)
74 | '''
75 |
76 | # 数学与统计方法
77 | arr = np.random.randn(5, 4) # 标准正态分布数据
78 | # print(arr)
79 | # print(arr.mean())
80 | # print(np.mean(arr))
81 | # print(arr.sum())
82 |
83 | # print(arr.mean(axis=1)) # 每一行均值
84 | # print(arr.sum(0)) # 每一列和
85 |
86 | arr = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8]])
87 | # print(arr)
88 | # print(arr.cumsum(0)) # 累计和
89 | # print(arr.cumprod(1)) # 累计积
90 |
91 | # 用于布尔型数组的方法
92 | arr = randn(100)
93 | # print(arr)
94 | # print((arr > 0).sum()) # 正值的数量
95 |
96 | bools = np.array([False, False, True, False])
97 | # print(bools.any()) # 是否有一个true
98 | # print(bools.all()) # 是否全部为true
99 |
100 | # 排序
101 | arr = randn(8)
102 | # print(arr)
103 | arr.sort()
104 | # print(arr)
105 |
106 | arr = randn(5, 3)
107 | # print(arr)
108 | arr.sort(1) # 按行维度排序
109 | # print(arr)
110 |
111 | large_arr = randn(1000)
112 | # print(large_arr)
113 | large_arr.sort() # 先排序
114 | # print(large_arr[int(0.05 * len(large_arr))]) # 5%分位数
115 |
116 | # 唯一化以及其他的集合逻辑
117 | names = np.array(['Bob', 'Joe', 'Will', 'Bob', 'Will', 'Joe', 'Joe'])
118 | # print(np.unique(names))
119 | ints = np.array([3, 3, 3, 2, 2, 1, 1, 4, 4])
120 | # print(np.unique(ints))
121 | # print(sorted(set(names)))
122 |
123 | # 元素是否在已有数组中
124 | values = np.array([6, 0, 0, 3, 2, 5, 6])
125 | # print(np.in1d(values, [2, 3, 6]))
126 |
127 | # 线性代数
128 | x = np.array([[1., 2., 3.], [4., 5., 6.]])
129 | y = np.array([[6., 23.], [-1, 7], [8, 9]])
130 | # print(x)
131 | # print(y)
132 | # print(x.dot(y)) # 等价于np.dot(x, y)矩阵乘法
133 | # print(np.dot(x, np.ones(3)))
134 |
135 | # print(np.random.seed(12345))
136 |
137 | from numpy.linalg import qr
138 |
139 | X = randn(5, 5)
140 | # print(X)
141 | mat = X.T.dot(X)
142 | # print(mat)
143 | # print(inv(mat)) # 逆矩阵
144 | # print(mat.dot(inv(mat))) # 单位矩阵
145 |
146 | q, r = qr(mat)
147 | # print(q)
148 | # print(r)
149 |
150 | # 随机数生成
151 | samples = np.random.normal(size=(4, 4))
152 | # print(samples)
153 |
154 | N = 1000000
155 | # print(get_ipython().magic(u'timeit samples = [normalvariate(0, 1) for _ in xrange(N)]'))
156 | # print(get_ipython().magic(u'timeit np.random.normal(size=N)'))
157 |
158 | '''
159 | # 范例:随机漫步
160 | import random
161 |
162 | position = 0
163 | walk = [position]
164 | steps = 1000
165 | for i in range(steps):
166 | step = 1 if random.randint(0, 1) else -1
167 | position += step
168 | walk.append(position)
169 | # print(walk)
170 |
171 | nsteps = 1000
172 | draws = np.random.randint(0, 2, size=nsteps)
173 | steps = np.where(draws > 0, 1, -1)
174 | walk = steps.cumsum()
175 |
176 | print(walk.min())
177 | print(walk.max())
178 | print((np.abs(walk) >= 10).argmax())
179 | '''
180 |
181 | '''
182 | # 一次模拟多个随机漫步
183 | nwalks = 5000
184 | nsteps = 1000
185 | draws = np.random.randint(0, 2, size=(nwalks, nsteps)) # 0 or 1
186 | steps = np.where(draws > 0, 1, -1)
187 | walks = steps.cumsum(1)
188 | print(walks)
189 |
190 | print(walks.max())
191 | print(walks.min())
192 |
193 | hits30 = (np.abs(walks) >= 30).any(1)
194 | print(hits30)
195 | print(hits30.sum()) # 到达30或-30的数量
196 |
197 | crossing_times = (np.abs(walks[hits30]) >= 30).argmax(1)
198 | print(crossing_times.mean())
199 |
200 | steps = np.random.normal(loc=0, scale=0.25,
201 | size=(nwalks, nsteps))
202 | '''
203 |
--------------------------------------------------------------------------------
/2.Python数据分析/week04/numpy股价分析实践.py:
--------------------------------------------------------------------------------
1 | # 利用NumPy进行历史股价分析
2 | import numpy as np
3 |
# Read columns 7 and 8 of the file (closing price, traded volume).
c, v = np.loadtxt('data.csv', delimiter=',', usecols=(6, 7), unpack=True)

# Volume-weighted average price.
vwap = np.average(c, weights=v)
print("VWAP =", vwap)

# Plain arithmetic mean of the closing prices.
print("mean =", c.mean())

# Time-weighted average price: the weight grows with the row index, so
# later rows count more.
t = np.arange(len(c))
print("twap =", np.average(c, weights=t))

# Highest high and lowest low.
h, l = np.loadtxt('data.csv', delimiter=',', usecols=(4, 5), unpack=True)
highest = h.max()
lowest = l.min()
print("highest =", highest)
print("lowest =", lowest)

# Midpoint between the highest high and the lowest low.
print((highest + lowest) / 2)

# Peak-to-peak spread of the daily highs.
print("Spread high price", np.ptp(h))
# Peak-to-peak spread of the daily lows.
print("Spread low price", np.ptp(l))
30 |
# Descriptive statistics of the closing price
c = np.loadtxt('data.csv', delimiter=',', usecols=(6,), unpack=True)
# Median
print("median =", np.median(c))

# Sorted copy of the closing prices. np.sort replaces np.msort, which is
# deprecated and no longer available in NumPy 2.0. The result is stored
# under a name that does not shadow the built-in sorted().
sorted_close = np.sort(c)
print("sorted =", sorted_close)

# Variance, from the library and from its definition
N = len(c)
print("variance =", np.var(c))
print("variance from definition =", np.mean((c - c.mean()) ** 2))
44 |
# Stock returns
c = np.loadtxt('data.csv', delimiter=',', usecols=(6,), unpack=True)

# Simple returns: difference between neighbouring closes, relative to the
# earlier close.
price_changes = np.diff(c)
returns = price_changes / c[: -1]
print("Standard deviation =", np.std(returns))

# Log returns
log_close = np.log(c)
logreturns = np.diff(log_close)
# Positions where the simple return is positive
posretindices = np.where(returns > 0)
print("Indices with positive returns", posretindices)

# Annualised volatility as computed by this script: standard deviation of
# the log returns divided by their mean, then divided by sqrt(1 / 252).
annual_volatility = np.std(logreturns) / np.mean(logreturns)
annual_volatility = annual_volatility / np.sqrt(1. / 252.)
print("Annual volatility", annual_volatility)

# Monthly volatility: the annual figure scaled by sqrt(1 / 12).
monthly_scale = np.sqrt(1. / 12.)
print("Monthly volatility", annual_volatility * monthly_scale)
66 |
67 | # 日期分析
68 | from datetime import datetime
69 |
70 |
71 | # Monday 0
72 | # Tuesday 1
73 | # Wednesday 2
74 | # Thursday 3
75 | # Friday 4
76 | # Saturday 5
77 | # Sunday 6
def datestr2num(s):
    """Convert a ``dd-mm-YYYY`` date field into a weekday number.

    Intended for use as a ``np.loadtxt`` converter. Depending on the NumPy
    version and the ``encoding`` argument, converters receive either
    ``bytes`` or ``str``; both are accepted here.

    Parameters
    ----------
    s : bytes or str
        Date text such as ``b"28-01-2011"`` or ``"28-01-2011"``.

    Returns
    -------
    int
        Weekday index, 0 for Monday through 6 for Sunday.

    Raises
    ------
    ValueError
        If the text does not match the ``%d-%m-%Y`` format.
    """
    if isinstance(s, bytes):
        s = s.decode('ascii')
    return datetime.strptime(s, "%d-%m-%Y").date().weekday()
80 |
81 |
# Read the date column (converted to a weekday number) and the closing price.
dates, close = np.loadtxt('data.csv', delimiter=',', usecols=(1, 6),
                          converters={1: datestr2num}, unpack=True)
print("Dates =", dates)

# One slot per weekday number 0..4 (Monday..Friday).
averages = np.zeros(5)

for i in range(5):
    # Rows that fall on weekday i, and their closing prices.
    indices = np.where(dates == i)
    prices = np.take(close, indices)
    avg = prices.mean()
    print("Day", i, "prices", prices, "Average", avg)
    averages[i] = avg

# Weekday with the highest average close.
top = averages.max()
print("Highest average", top)
print("Top day of the week", averages.argmax())

# Weekday with the lowest average close.
bottom = averages.min()
print("Lowest average", bottom)
print("Bottom day of the week", averages.argmin())
104 |
105 |
106 | # 周汇总
def datestr2num(s):
    """Convert a ``dd-mm-YYYY`` date field into a weekday number.

    Intended for use as a ``np.loadtxt`` converter. Depending on the NumPy
    version and the ``encoding`` argument, converters receive either
    ``bytes`` or ``str``; both are accepted here.

    Parameters
    ----------
    s : bytes or str
        Date text such as ``b"28-01-2011"`` or ``"28-01-2011"``.

    Returns
    -------
    int
        Weekday index, 0 for Monday through 6 for Sunday.

    Raises
    ------
    ValueError
        If the text does not match the ``%d-%m-%Y`` format.
    """
    if isinstance(s, bytes):
        s = s.decode('ascii')
    return datetime.strptime(s, "%d-%m-%Y").date().weekday()
109 |
110 |
# Read date (converted to a weekday number), open, high, low and close columns.
# NOTE: the module-level name `open` shadows the built-in open() from here on.
dates, open, high, low, close = np.loadtxt('data.csv', delimiter=',',
                                           usecols=(1, 3, 4, 5, 6), converters={1: datestr2num}, unpack=True)
# Keep only the first 16 rows (the first three weeks of data).
close = close[:16]
dates = dates[:16]

# Index of the first Monday (weekday 0)
first_monday = np.ravel(np.where(dates == 0))[0]
print("The first Monday index is", first_monday)

# Index of the last Friday (weekday 4)
last_friday = np.ravel(np.where(dates == 4))[-1]
print("The last Friday index is", last_friday)

# Row indices from the first Monday through the last Friday, inclusive
weeks_indices = np.arange(first_monday, last_friday + 1)
print("Weeks indices initial", weeks_indices)

# Split the indices into three equal groups, one per week.
# np.split raises if the number of indices is not divisible by 3.
weeks_indices = np.split(weeks_indices, 3)
print("Weeks indices after split", weeks_indices)
133 |
134 |
135 | # 周汇总
def summarize(a, o, h, l, c, symbol="AAPL"):
    """Summarise one week of price data.

    Parameters
    ----------
    a : sequence of int
        Row indices belonging to one week, in chronological order.
    o, h, l, c : array-like
        Open, high, low and close prices, indexed by the values in ``a``.
    symbol : str, optional
        Ticker written into the summary. Defaults to ``"AAPL"``, the symbol
        used in data.csv (the previous hard-coded ``"APPL"`` was a typo).

    Returns
    -------
    tuple
        ``(symbol, open at a[0], max high, min low, close at a[-1])``.
    """
    monday_open = o[a[0]]
    week_high = np.max(np.take(h, a))
    week_low = np.min(np.take(l, a))
    friday_close = c[a[-1]]

    return (symbol, monday_open, week_high, week_low, friday_close)
143 |
144 |
# Call summarize once per row of week indices (axis 1); open, high, low and
# close are forwarded to summarize as extra positional arguments.
weeksummary = np.apply_along_axis(summarize, 1, weeks_indices, open, high, low, close)
print("Week summary", weeksummary)

# Save the summary to a CSV file, formatting every field as a string.
np.savetxt("weeksummary.csv", weeksummary, delimiter=",", fmt="%s")
150 |
# Average True Range (ATR)
# Load the high, low and close price columns.
h, l, c = np.loadtxt('data.csv', delimiter=',', usecols=(4, 5, 6), unpack=True)

# Keep the last N rows of highs and lows.
N = 20
h = h[-N:]
l = l[-N:]

print("len(h)", len(h), "len(l)", len(l))
print("Close", c)

# Close of the row preceding each of the last N rows.
previousclose = c[-N - 1: -1]

print("len(previousclose)", len(previousclose))
print("Previous close", previousclose)

# True range: element-wise maximum of the three candidate ranges.
# np.maximum is a binary ufunc whose third positional argument is the `out`
# buffer, so np.maximum(a, b, c) would ignore c as an input. Reduce over
# all three arrays instead.
truerange = np.maximum.reduce([h - l, h - previousclose, previousclose - l])
print("True range", truerange)

# Storage for the N ATR values.
atr = np.zeros(N)

# Seed the recursion with the mean of the true range.
atr[0] = np.mean(truerange)

# Recursive update: atr[i] = ((N - 1) * atr[i - 1] + truerange[i]) / N
for i in range(1, N):
    atr[i] = (N - 1) * atr[i - 1] + truerange[i]
    atr[i] /= N

print("ATR", atr)
185 |
186 | # 简单移动平均线
187 | from matplotlib.pyplot import plot
188 | from matplotlib.pyplot import show
189 |
# Length of the sliding window
N = 5

# Equal weights that sum to one.
weights = np.ones(N) / N
print("Weights", weights)

c = np.loadtxt('data.csv', delimiter=',', usecols=(6,), unpack=True)
# Simple moving average (SMA): take the full convolution and keep only the
# part where the window lies entirely inside the data.
full_convolution = np.convolve(weights, c)
sma = full_convolution[N - 1:-N + 1]
# x positions of the rows that have a complete window behind them.
t = np.arange(N - 1, len(c))
closes_with_window = c[N - 1:]
plot(t, closes_with_window, lw=1.0)
plot(t, sma, lw=2.0)
show()
203 |
# Exponential moving average
x = np.arange(5)
print("Exp", np.exp(x))
print("Linspace", np.linspace(-1, 0, 5))

N = 5
# Exponentially increasing weights, normalised so they sum to one.
weights = np.exp(np.linspace(-1., 0., N))
weights /= weights.sum()
print("Weights", weights)

c = np.loadtxt('data.csv', delimiter=',', usecols=(6,), unpack=True)
# np.convolve flips its kernel: output n is sum_k kernel[k] * c[n - k].
# Passing the weights reversed puts the largest weight on the most recent
# price of each window; unreversed, it landed on the oldest one.
ema = np.convolve(weights[::-1], c)[N - 1:-N + 1]
t = np.arange(N - 1, len(c))
plot(t, c[N - 1:], lw=1.0)
plot(t, ema, lw=2.0)
show()
220 |
# Bollinger bands
N = 5

weights = np.ones(N) / N
print("Weights", weights)

c = np.loadtxt('data.csv', delimiter=',', usecols=(6,), unpack=True)
# sma[k] is the mean of c[k:k + N], i.e. the window that ends at row k + N - 1.
sma = np.convolve(weights, c)[N - 1:-N + 1]
deviation = []
C = len(c)

for i in range(N - 1, C):
    # Trailing window of N closes ending at row i, compared against the SMA
    # of that same window. The earlier code used the forward window
    # c[i:i + N] and the index sma[i - N - 1], which for small i wraps
    # around to the end of the array.
    window = c[i - N + 1: i + 1]
    dev = np.sqrt(np.mean((window - sma[i - N + 1]) ** 2))
    deviation.append(dev)

# Band half-width: two standard deviations.
deviation = 2 * np.array(deviation)
print(len(deviation), len(sma))

# Upper band
upperBB = sma + deviation
# Lower band
lowerBB = sma - deviation

# Closes aligned with sma and the bands.
c_slice = c[N - 1:]
between_bands = np.where((c_slice < upperBB) & (c_slice > lowerBB))

print(lowerBB[between_bands])

# between_bands indexes c_slice, not c; indexing c printed the wrong rows.
print(c_slice[between_bands])

print(upperBB[between_bands])

between_bands = len(np.ravel(between_bands))
print("Ratio between bands", float(between_bands) / len(c_slice))

t = np.arange(N - 1, C)
plot(t, c_slice, lw=1.0)
plot(t, sma, lw=2.0)
plot(t, upperBB, lw=3.0)
plot(t, lowerBB, lw=4.0)
show()
271 |
# Linear model: express a close as a linear combination of the N closes
# before it.
N = 5

c = np.loadtxt('data.csv', delimiter=',', usecols=(6,), unpack=True)

# Right-hand side: the last N closes, newest first.
b = c[-N:]
b = b[::-1]
print("b", b)

A = np.zeros((N, N), float)
print("Zeros N by N", A)

# Row i holds the N closes that precede the close stored in b[i].
for i in range(N):
    A[i,] = c[-N - 1 - i: - 1 - i]

print("A", A)

# rcond=None selects the machine-precision cutoff explicitly. Omitting it
# raises a FutureWarning on NumPy 1.14+ and its default differs between
# NumPy versions.
(x, residuals, rank, s) = np.linalg.lstsq(A, b, rcond=None)

print(x, residuals, rank, s)

# Dot product of the fitted coefficients with the latest closes.
print(np.dot(b, x))
294 |
295 |
296 | # 趋势线
def fit_line(t, y):
    """Least-squares fit of the straight line ``y = a * t + b``.

    Parameters
    ----------
    t : array-like, 1-D
        Sample positions.
    y : array-like, 1-D
        Values observed at ``t``; same length as ``t``.

    Returns
    -------
    numpy.ndarray
        Two coefficients ``[a, b]`` (slope, intercept).
    """
    # Design matrix with columns [t, 1].
    A = np.vstack([t, np.ones_like(t)]).T

    # rcond=None selects the machine-precision cutoff explicitly; omitting
    # it raises a FutureWarning on NumPy 1.14+.
    return np.linalg.lstsq(A, y, rcond=None)[0]
301 |
302 |
# Load the high, low and close prices.
h, l, c = np.loadtxt('data.csv', delimiter=',', usecols=(4, 5, 6), unpack=True)

# Pivot point: mean of high, low and close for each row.
pivots = (h + l + c) / 3
print("Pivots", pivots)

t = np.arange(len(c))
# Fit a line through the pivots shifted down (support) and up (resistance)
# by each row's high-low range.
sa, sb = fit_line(t, pivots - (h - l))
ra, rb = fit_line(t, pivots + (h - l))

support = sa * t + sb
resistance = ra * t + rb
# Rows whose close lies strictly between the two lines.
condition = (c > support) & (c < resistance)
print("Condition", condition)

between_bands = np.where(condition)
print(support[between_bands])

print(c[between_bands])

print(resistance[between_bands])

between_bands = len(np.ravel(between_bands))
print("Number points between bands", between_bands)

print("Ratio between bands", float(between_bands) / len(c))

# Extrapolate both lines one step past the last row.
next_t = t[-1] + 1
print("Tomorrows support", sa * next_t + sb)

print("Tomorrows resistance", ra * next_t + rb)

# Second approach: intersect the closes above support with those below
# resistance (compares by value).
a1 = c[c > support]
a2 = c[c < resistance]
print("Number of points between bands 2nd approach", len(np.intersect1d(a1, a2)))

plot(t, c)
plot(t, support)
plot(t, resistance)
show()
341 |
--------------------------------------------------------------------------------
/2.Python数据分析/week04/weeksummary.csv:
--------------------------------------------------------------------------------
1 | APPL,335.8,346.7,334.3,346.5
2 | APPL,347.8,360.0,347.6,356.8
3 | APPL,356.7,364.9,349.5,350.5
4 |
--------------------------------------------------------------------------------
/2.Python数据分析/week05/Pandes_Dataframe.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import numpy as np
4 | import pandas as pd
5 | from pandas import Series, DataFrame
6 |
7 | # Pandas
8 | # Series:有索引的一维列表
9 | obj = Series([4, 7, -5, 3])
10 | print(obj)
11 |
12 | # 数据值
13 | print(obj.values)
14 | # 数据索引
15 | print(obj.index)
16 |
17 | # 自定义索引
18 | obj2 = Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])
19 | print(obj2)
20 | # 数据索引
21 | print(obj2.index)
22 | # 获取数据
23 | print(obj2['a'])
24 |
25 | obj2['d'] = 6
26 | print(obj2[['c', 'a', 'd']])
27 |
28 | print(obj2[obj2 > 0])
29 | print(obj2 * 2)
30 | print(np.exp(obj2))
31 |
32 | print('b' in obj2)
33 | print('e' in obj2)
34 |
35 | # 字典初始化Series
36 | sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
37 | obj3 = Series(sdata)
38 | print(obj3)
39 |
40 | # 列表对齐索引
41 | states = ['California', 'Ohio', 'Oregon', 'Texas']
42 | obj4 = Series(sdata, index=states)
43 | print(obj4)
44 |
45 | # 判断非空
46 | print(pd.isnull(obj4))
47 | print(pd.notnull(obj4))
48 |
49 | print(obj4.isnull())
50 |
51 | # 索引对应运算
52 | print(obj3)
53 | print(obj4)
54 | print(obj3 + obj4)
55 |
56 | # 命名key value
57 | obj4.name = 'population'
58 | obj4.index.name = 'state'
59 | print(obj4)
60 |
61 | # 重命名索引
62 | obj.index = ['Bob', 'Steve', 'Jeff', 'Ryan']
63 | print(obj)
64 |
65 | print('-----------------------------')
66 | print('-----------------------------')
67 |
68 | # Dataframe:表格型数据结构,同一列数据类型相同
69 | data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
70 | 'year': [2000, 2001, 2002, 2001, 2002],
71 | 'pop': [1.5, 1.7, 3.6, 2.4, 2.9]}
72 | frame = DataFrame(data)
73 |
74 | print(frame)
75 |
76 | # 自定义列顺序
77 | print(DataFrame(data, columns=['year', 'state', 'pop']))
78 |
79 | # 多余则缺省
80 | frame2 = DataFrame(data, columns=['year', 'state', 'pop', 'debt'],
81 | index=['one', 'two', 'three', 'four', 'five'])
82 | print(frame2)
83 | # 列名
84 | print(frame2.columns)
85 |
# Select a column, by key or by attribute access
print(frame2['state'])
print(frame2.year)
# Select a row by its index label. .loc replaces the .ix indexer, which was
# deprecated in pandas 0.20 and removed in pandas 1.0.
print(frame2.loc['three'])
91 |
92 | # 赋值列
93 | frame2['debt'] = 16.5
94 | print(frame2)
95 |
96 | # 赋值匹配
97 | frame2['debt'] = np.arange(5.)
98 | print(frame2)
99 |
100 | # Series赋值
101 | val = Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
102 | frame2['debt'] = val
103 | print(frame2)
104 |
105 | frame2['eastern'] = frame2.state == 'Ohio'
106 | print(frame2)
107 |
108 | # 删除列
109 | del frame2['eastern']
110 | print(frame2.columns)
111 |
112 | print('-------------------------')
113 | pop = {'Nevada': {2001: 2.4, 2002: 2.9},
114 | 'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}
115 | frame3 = DataFrame(pop)
116 | print(frame3)
117 |
118 | # 转置行列
119 | print(frame3.T)
120 |
121 | # 自定义索引顺序
122 | print(DataFrame(pop, index=[2001, 2002, 2003]))
123 |
124 | pdata = {'Ohio': frame3['Ohio'][:-1],
125 | 'Nevada': frame3['Nevada'][:2]}
126 | print(DataFrame(pdata))
127 |
128 | # 行列索引命名
129 | frame3.index.name = 'year'
130 | frame3.columns.name = 'state'
131 | print(frame3)
132 | print(frame3.values)
133 | print(frame2.values)
134 |
135 | # 索引对象
136 | obj = Series(range(3), index=['a', 'b', 'c'])
137 | index = obj.index
138 | print(index)
139 |
140 | print(index[1:])
141 |
142 | # index[1] = 'd' # 不可直接更改
143 |
144 | index = pd.Index(np.arange(3))
145 | obj2 = Series([1.5, -2.5, 0], index=index)
146 | print(obj2.index is index)
147 |
148 | print(frame3)
149 | print('Ohio' in frame3.columns)
150 | print(2003 in frame3.index)
151 |
--------------------------------------------------------------------------------
/2.Python数据分析/week05/data/csv_mindex.csv:
--------------------------------------------------------------------------------
1 | key1,key2,value1,value2
2 | one,a,1,2
3 | one,b,3,4
4 | one,c,5,6
5 | one,d,7,8
6 | two,a,9,10
7 | two,b,11,12
8 | two,c,13,14
9 | two,d,15,16
10 |
--------------------------------------------------------------------------------
/2.Python数据分析/week05/data/ex1.csv:
--------------------------------------------------------------------------------
1 | a,b,c,d,message
2 | 1,2,3,4,hello
3 | 5,6,7,8,world
4 | 9,10,11,12,foo
--------------------------------------------------------------------------------
/2.Python数据分析/week05/data/ex2.csv:
--------------------------------------------------------------------------------
1 | 1,2,3,4,hello
2 | 5,6,7,8,world
3 | 9,10,11,12,foo
--------------------------------------------------------------------------------
/2.Python数据分析/week05/data/ex3.csv:
--------------------------------------------------------------------------------
1 | A B C
2 | aaa -0.264438 -1.026059 -0.619500
3 | bbb 0.927272 0.302904 -0.032399
4 | ccc -0.264273 -0.386314 -0.217601
5 | ddd -0.871858 -0.348382 1.100491
--------------------------------------------------------------------------------
/2.Python数据分析/week05/data/ex3.txt:
--------------------------------------------------------------------------------
1 | A B C
2 | aaa -0.264438 -1.026059 -0.619500
3 | bbb 0.927272 0.302904 -0.032399
4 | ccc -0.264273 -0.386314 -0.217601
5 | ddd -0.871858 -0.348382 1.100491
6 |
--------------------------------------------------------------------------------
/2.Python数据分析/week05/data/ex4.csv:
--------------------------------------------------------------------------------
1 | # hey!
2 | a,b,c,d,message
3 | # just wanted to make things more difficult for you
4 | # who reads CSV files with computers, anyway?
5 | 1,2,3,4,hello
6 | 5,6,7,8,world
7 | 9,10,11,12,foo
--------------------------------------------------------------------------------
/2.Python数据分析/week05/data/ex5.csv:
--------------------------------------------------------------------------------
1 | something,a,b,c,d,message
2 | one,1,2,3,4,NA
3 | two,5,6,,8,world
4 | three,9,10,11,12,foo
--------------------------------------------------------------------------------
/2.Python数据分析/week05/data/ex7.csv:
--------------------------------------------------------------------------------
1 | "a","b","c"
2 | "1","2","3"
3 | "1","2","3","4"
4 |
--------------------------------------------------------------------------------
/2.Python数据分析/week05/data/frame_pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/littleheap/Python-Projects/a2bebb14dd49aeb1a0746eeadd8a30db6875ab01/2.Python数据分析/week05/data/frame_pickle
--------------------------------------------------------------------------------
/2.Python数据分析/week05/data/out.csv:
--------------------------------------------------------------------------------
1 | ,something,a,b,c,d,message
2 | 0,one,1,2,3.0,4,
3 | 1,two,5,6,,8,world
4 | 2,three,9,10,11.0,12,foo
5 |
--------------------------------------------------------------------------------
/2.Python数据分析/week05/data/test_file.csv:
--------------------------------------------------------------------------------
1 | "a","b","c"
2 | "1","2","3"
3 | "1","2","3","4"
4 |
--------------------------------------------------------------------------------
/2.Python数据分析/week05/data/tseries.csv:
--------------------------------------------------------------------------------
1 | 2000-01-01,0
2 | 2000-01-02,1
3 | 2000-01-03,2
4 | 2000-01-04,3
5 | 2000-01-05,4
6 | 2000-01-06,5
7 | 2000-01-07,6
8 |
--------------------------------------------------------------------------------
/2.Python数据分析/week05/data/workbook.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/littleheap/Python-Projects/a2bebb14dd49aeb1a0746eeadd8a30db6875ab01/2.Python数据分析/week05/data/workbook.xls
--------------------------------------------------------------------------------
/2.Python数据分析/week05/mydata.csv:
--------------------------------------------------------------------------------
1 | one;two;three
2 | 1;2;3
3 | 4;5;6
4 | 7;8;9
5 |
--------------------------------------------------------------------------------
/2.Python数据分析/week05/tseries.csv:
--------------------------------------------------------------------------------
1 | 2000-01-01,0
2 | 2000-01-02,1
3 | 2000-01-03,2
4 | 2000-01-04,3
5 | 2000-01-05,4
6 | 2000-01-06,5
7 | 2000-01-07,6
8 |
--------------------------------------------------------------------------------
/2.Python数据分析/week05/数据读取.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from pandas import Series, DataFrame
3 |
4 | # 数据读取
5 | # 读取文本格式数据
6 | df = pd.read_csv('d:data/ex1.csv')
7 | print(df)
8 |
9 | # 另一种方法数据读取
10 | pd.read_table('d:data/ex1.csv', sep=',')
11 |
12 | # 不需要以表头方式读取第一行
13 | df = pd.read_csv('d:data/ex2.csv', header=None)
14 | print(df)
15 | # 自定义添加表头
16 | df = pd.read_csv('d:data/ex2.csv', names=['a', 'b', 'c', 'd', 'message'])
17 | print(df)
18 |
19 | names = ['a', 'b', 'c', 'd', 'message']
20 | df = pd.read_csv('d:data/ex2.csv', names=names, index_col='message')
21 | print(df)
22 |
23 | # 归一化数据
24 | parsed = pd.read_csv('d:data/csv_mindex.csv', index_col=['key1', 'key2'])
25 | print(parsed)
26 |
# Read data whose fields are separated by a regular expression
# Read the raw lines once; the context manager closes the handle, which the
# bare open() call left open.
with open('d:data/ex3.txt') as raw_file:
    list(raw_file)
# Raw string: '\s' is an invalid escape sequence in a normal string literal.
result = pd.read_table('d:data/ex3.txt', sep=r'\s+')
print(result)
31 |
32 | # 略过指定行
33 | df = pd.read_csv('d:data/ex4.csv', skiprows=[0, 2, 3])
34 | print(df)
35 |
36 | # 缺失值情况
37 | result = pd.read_csv('d:data/ex5.csv')
38 | print(result)
39 | print(pd.isnull(result))
40 |
41 | # 缺失值处理
42 | result = pd.read_csv('d:data/ex5.csv', na_values=['NULL'])
43 | print(result)
44 |
45 | sentinels = {'message': ['foo', 'NA'], 'something': ['two']}
46 | pd.read_csv('d:data/ex5.csv', na_values=sentinels)
47 |
48 | # 逐行读取文本文件
49 | result = pd.read_csv('d:data/ex6.csv')
50 | # print(result)
51 |
52 | # 读取前五行
53 | df = pd.read_csv('d:data/ex6.csv', nrows=5)
54 | print(df)
55 |
56 | # 分块读取
57 | chunker = pd.read_csv('d:data/ex6.csv', chunksize=1000)
58 | print(chunker)
59 |
60 | chunker = pd.read_csv('d:data/ex6.csv', chunksize=1000)
61 | print(chunker)
62 |
# Accumulate the value counts of the 'key' column over all chunks
# The dtype is given explicitly because the default dtype of an empty Series
# differs between pandas versions (float64 in older releases, object later).
tot = Series([], dtype='float64')
for piece in chunker:
    # fill_value=0 treats keys missing on either side as a count of zero.
    tot = tot.add(piece['key'].value_counts(), fill_value=0)

tot = tot.sort_values()
print(tot[:10])
70 |
# Writing data out
data = pd.read_csv('d:data/ex5.csv')
print(data)
data.to_csv('d:data/out.csv')

import sys
import numpy as np

# to_csv returns None when it is given a target to write to, so the calls
# below write straight to stdout; printing their return value would only
# add a spurious "None" line after each table.

# Custom delimiter.
data.to_csv(sys.stdout, sep='|')

# Custom representation for missing values.
data.to_csv(sys.stdout, na_rep='NULL')

# Omit row and column labels.
data.to_csv(sys.stdout, index=False, header=False)

# Write only the selected columns.
data.to_csv(sys.stdout, index=False, columns=['a', 'b', 'c'])
94 |
# Round-trip a time series through a CSV file.
dates = pd.date_range('1/1/2000', periods=7)
ts = Series(np.arange(7), index=dates)
# header=False is stated explicitly so the file layout ("date,value" rows
# with no header line) does not depend on the pandas version's default.
ts.to_csv('tseries.csv', header=False)

# Series.from_csv no longer exists in current pandas; read_csv with the
# first column as a parsed date index, then take the single data column.
df = pd.read_csv('tseries.csv', header=None, index_col=0,
                 parse_dates=True).iloc[:, 0]
# Drop the positional labels read_csv assigns, leaving an unnamed Series.
df.index.name = None
df.name = None
print(df)
102 |
print('-----------------------------------')
print('-----------------------------------')
# Handling delimited formats by hand
import csv

# with-blocks close the file handles that the original left open.
with open('d:data/ex7.csv') as f:
    reader = csv.reader(f)

    for line in reader:
        print(line)

# Build a dict mapping each header name to the tuple of that column's values.
with open('d:data/ex7.csv') as f:
    lines = list(csv.reader(f))
header, values = lines[0], lines[1:]
# zip(*values) transposes the rows into columns.
data_dict = dict(zip(header, zip(*values)))
print(data_dict)
120 |
121 |
class my_dialect(csv.Dialect):
    """CSV dialect: ';' delimiter, '"' quote character, minimal quoting, LF line ends."""
    # Field separator and quoting rules.
    delimiter = ';'
    quotechar = '"'
    quoting = csv.QUOTE_MINIMAL
    # String written at the end of every row.
    lineterminator = '\n'
127 |
128 |
# newline='' is what the csv module expects for files it writes, so that the
# dialect's lineterminator is emitted without platform newline translation.
with open('mydata.csv', 'w', newline='') as f:
    writer = csv.writer(f, dialect=my_dialect)
    writer.writerows([
        ('one', 'two', 'three'),
        ('1', '2', '3'),
        ('4', '5', '6'),
        ('7', '8', '9'),
    ])

# Read the file back only after the with-block has closed (and flushed) it.
df = pd.read_table('mydata.csv', sep=';')
print(df)
137 |
print('-----------------------------------')
print('-----------------------------------')
# Excel data
# Create an .xls workbook
import xlwt, xlrd

path = 'd:data/'

wb = xlwt.Workbook()
print(wb)

# First sheet allows cells to be overwritten (cell (0, 0) is written twice below).
wb.add_sheet('first_sheet', cell_overwrite_ok=True)
print(wb.get_active_sheet())

ws_1 = wb.get_sheet(0)
print(ws_1)

ws_2 = wb.add_sheet('second_sheet')

# 8x8 grid holding the values 1.0 .. 64.0.
data = np.arange(1.0, 65.0).reshape((8, 8))
print(data)

# write(row, column, value)
ws_1.write(0, 0, 100)

# Fill both sheets cell by cell: ws_1 receives data[c, r] at (r, c),
# ws_2 receives data[r, c] at (r, c).
outer_count, inner_count = data.shape
for c in range(outer_count):
    for r in range(inner_count):
        first_value = data[c, r]
        second_value = data[r, c]
        ws_1.write(r, c, first_value)
        ws_2.write(r, c, second_value)

workbook_file = path + 'workbook.xls'
wb.save(workbook_file)
170 |
# Create an .xlsx workbook (section heading only; no code follows it)

# Read back from the workbook
book = xlrd.open_workbook(path + 'workbook.xls')
print(book)

# Print the sheet names instead of discarding them.
print(book.sheet_names())

# Fetch worksheets by name and by index.
sheet_1 = book.sheet_by_name('first_sheet')
sheet_2 = book.sheet_by_index(1)
print(sheet_1)
print(sheet_2.name)

# Number of columns and rows in the sheet.
print(sheet_1.ncols, sheet_1.nrows)

# Read a single cell: its value and its cell-type code.
cl = sheet_1.cell(0, 0)
print(cl.value)
print(cl.ctype)

print(sheet_2.row(3))

print(sheet_2.col(3))

# Slices of one column / one row.
print(sheet_1.col_values(3, start_rowx=3, end_rowx=7))

print(sheet_1.row_values(3, start_colx=3, end_colx=7))

# Dump every cell, column by column.
for c in range(sheet_1.ncols):
    for r in range(sheet_1.nrows):
        print('%i' % sheet_1.cell(r, c).value)
    # A bare `print` is a no-op expression in Python 3; call it so that a
    # blank line actually separates the columns.
    print()

# Read the same workbook with pandas.
xls_file = pd.ExcelFile(path + 'workbook.xls')
table = xls_file.parse('first_sheet')
print(table)
table = xls_file.parse('second_sheet')
print(table)
212 |
print('-----------------------------------')
print('-----------------------------------')
# JSON data

# JSON document held as a Python string.
obj = """
{"name": "Wes",
"places_lived": ["United States", "Spain", "Germany"],
"pet": null,
"siblings": [{"name": "Scott", "age": 25, "pet": "Zuko"},
{"name": "Katie", "age": 33, "pet": "Cisco"}]
}
"""

import json

# Parse the JSON text into Python objects.
result = json.loads(obj)
print(result)

# Serialise the parsed objects back to a JSON string.
asjson = json.dumps(result)

# Build a table from the list of sibling dicts, keeping two of the fields.
sibling_records = result['siblings']
sibling_columns = ['name', 'age']
siblings = DataFrame(sibling_records, columns=sibling_columns)
print(siblings)
235 |
print('-----------------------------------')
print('-----------------------------------')
# Binary data formats
# pickle: store a DataFrame in a file and load it back.
source_csv = 'd:data/ex1.csv'
pickle_file = 'd:data/frame_pickle'

frame = pd.read_csv(source_csv)
print(frame)
frame.to_pickle(pickle_file)

df = pd.read_pickle(pickle_file)
print(df)

# # HDF5 format (left disabled)
# store = pd.HDFStore('mydata.h5')
# store['obj1'] = frame
# store['obj1_col'] = frame['a']
# store
#
# store['obj1']
#
# store.close()
# os.remove('mydata.h5')
257 |
print('-----------------------------------')
print('-----------------------------------')
# Working with HTML and Web APIs
import requests

url = 'https://api.github.com/repos/pydata/pandas/milestones/28/labels'
# A timeout keeps the script from hanging forever on an unresponsive server.
resp = requests.get(url, timeout=10)
print(resp)
# Stop on an HTTP error status rather than tabulating an error payload.
resp.raise_for_status()

# Decode the JSON response body.
data = resp.json()

issue_labels = DataFrame(data)
print(issue_labels)
271 |
print('-----------------------------------')
print('-----------------------------------')
# Working with databases
import sqlite3

query = """
CREATE TABLE test
(a VARCHAR(20), b VARCHAR(20),
c REAL, d INTEGER
);"""

# In-memory SQLite database holding one table.
con = sqlite3.connect(':memory:')
con.execute(query)
con.commit()

data = [('Atlanta', 'Georgia', 1.25, 6),
        ('Tallahassee', 'Florida', 2.6, 3),
        ('Sacramento', 'California', 1.7, 5)]
stmt = "INSERT INTO test VALUES(?, ?, ?, ?)"

con.executemany(stmt, data)
con.commit()

cursor = con.execute('SELECT * FROM test')
rows = cursor.fetchall()
print(rows)

# The first item of each cursor.description entry is the column name.
# (zip() returns an iterator in Python 3, so it cannot be indexed with [0].)
column_names = [column[0] for column in cursor.description]
rows_frame = DataFrame(rows, columns=column_names)
print(rows_frame)

import pandas.io.sql as sql

# Let pandas run the query and build the DataFrame directly.
result = sql.read_sql('select * from test', con)
print(result)

# Release the database connection once the queries are done.
con.close()
305 |
--------------------------------------------------------------------------------
/2.Python数据分析/week06/data/catering_sale.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/littleheap/Python-Projects/a2bebb14dd49aeb1a0746eeadd8a30db6875ab01/2.Python数据分析/week06/data/catering_sale.xls
--------------------------------------------------------------------------------
/2.Python数据分析/week06/data/electricity_data.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/littleheap/Python-Projects/a2bebb14dd49aeb1a0746eeadd8a30db6875ab01/2.Python数据分析/week06/data/electricity_data.xls
--------------------------------------------------------------------------------
/2.Python数据分析/week06/data/movies.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/littleheap/Python-Projects/a2bebb14dd49aeb1a0746eeadd8a30db6875ab01/2.Python数据分析/week06/data/movies.dat
--------------------------------------------------------------------------------
/2.Python数据分析/week06/data/normalization_data.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/littleheap/Python-Projects/a2bebb14dd49aeb1a0746eeadd8a30db6875ab01/2.Python数据分析/week06/data/normalization_data.xls
--------------------------------------------------------------------------------
/2.Python数据分析/week06/data/olivier.txt:
--------------------------------------------------------------------------------
1 | We currently don't have any high level routines for dealing with the
2 | recommendation systems / matrix completion tasks in scikit-learn.
3 |
4 | However you should be able to run a approximate truncated SVD using
5 | sklearn.decomposition.RandomizedPCA on it (once converted as a
6 | scipy.sparse.csr_matrix) and extract the first 100 components or so.
7 | This can be the base of baseline recommender system as explained in
8 | this blog post http://www.igvita.com/2007/01/15/svd-recommendation-system-in-ruby/
9 |
10 | Also maybe scipy's arpack might be able to compute the SVD of such a
11 | large sparse matrix.
12 |
13 | You also might want to have a look at https://github.com/muricoca/crab
14 | . It has utilities for loading the movielens data and implements some
15 | common recommender systems strategies.
16 |
--------------------------------------------------------------------------------
/2.Python数据分析/week06/data/principal_component.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/littleheap/Python-Projects/a2bebb14dd49aeb1a0746eeadd8a30db6875ab01/2.Python数据分析/week06/data/principal_component.xls
--------------------------------------------------------------------------------
/2.Python数据分析/week06/data/sales.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/littleheap/Python-Projects/a2bebb14dd49aeb1a0746eeadd8a30db6875ab01/2.Python数据分析/week06/data/sales.xls
--------------------------------------------------------------------------------
/2.Python数据分析/week07/data/tips.csv:
--------------------------------------------------------------------------------
1 | total_bill,tip,sex,smoker,day,time,size
2 | 16.99,1.01,Female,No,Sun,Dinner,2
3 | 10.34,1.66,Male,No,Sun,Dinner,3
4 | 21.01,3.5,Male,No,Sun,Dinner,3
5 | 23.68,3.31,Male,No,Sun,Dinner,2
6 | 24.59,3.61,Female,No,Sun,Dinner,4
7 | 25.29,4.71,Male,No,Sun,Dinner,4
8 | 8.77,2.0,Male,No,Sun,Dinner,2
9 | 26.88,3.12,Male,No,Sun,Dinner,4
10 | 15.04,1.96,Male,No,Sun,Dinner,2
11 | 14.78,3.23,Male,No,Sun,Dinner,2
12 | 10.27,1.71,Male,No,Sun,Dinner,2
13 | 35.26,5.0,Female,No,Sun,Dinner,4
14 | 15.42,1.57,Male,No,Sun,Dinner,2
15 | 18.43,3.0,Male,No,Sun,Dinner,4
16 | 14.83,3.02,Female,No,Sun,Dinner,2
17 | 21.58,3.92,Male,No,Sun,Dinner,2
18 | 10.33,1.67,Female,No,Sun,Dinner,3
19 | 16.29,3.71,Male,No,Sun,Dinner,3
20 | 16.97,3.5,Female,No,Sun,Dinner,3
21 | 20.65,3.35,Male,No,Sat,Dinner,3
22 | 17.92,4.08,Male,No,Sat,Dinner,2
23 | 20.29,2.75,Female,No,Sat,Dinner,2
24 | 15.77,2.23,Female,No,Sat,Dinner,2
25 | 39.42,7.58,Male,No,Sat,Dinner,4
26 | 19.82,3.18,Male,No,Sat,Dinner,2
27 | 17.81,2.34,Male,No,Sat,Dinner,4
28 | 13.37,2.0,Male,No,Sat,Dinner,2
29 | 12.69,2.0,Male,No,Sat,Dinner,2
30 | 21.7,4.3,Male,No,Sat,Dinner,2
31 | 19.65,3.0,Female,No,Sat,Dinner,2
32 | 9.55,1.45,Male,No,Sat,Dinner,2
33 | 18.35,2.5,Male,No,Sat,Dinner,4
34 | 15.06,3.0,Female,No,Sat,Dinner,2
35 | 20.69,2.45,Female,No,Sat,Dinner,4
36 | 17.78,3.27,Male,No,Sat,Dinner,2
37 | 24.06,3.6,Male,No,Sat,Dinner,3
38 | 16.31,2.0,Male,No,Sat,Dinner,3
39 | 16.93,3.07,Female,No,Sat,Dinner,3
40 | 18.69,2.31,Male,No,Sat,Dinner,3
41 | 31.27,5.0,Male,No,Sat,Dinner,3
42 | 16.04,2.24,Male,No,Sat,Dinner,3
43 | 17.46,2.54,Male,No,Sun,Dinner,2
44 | 13.94,3.06,Male,No,Sun,Dinner,2
45 | 9.68,1.32,Male,No,Sun,Dinner,2
46 | 30.4,5.6,Male,No,Sun,Dinner,4
47 | 18.29,3.0,Male,No,Sun,Dinner,2
48 | 22.23,5.0,Male,No,Sun,Dinner,2
49 | 32.4,6.0,Male,No,Sun,Dinner,4
50 | 28.55,2.05,Male,No,Sun,Dinner,3
51 | 18.04,3.0,Male,No,Sun,Dinner,2
52 | 12.54,2.5,Male,No,Sun,Dinner,2
53 | 10.29,2.6,Female,No,Sun,Dinner,2
54 | 34.81,5.2,Female,No,Sun,Dinner,4
55 | 9.94,1.56,Male,No,Sun,Dinner,2
56 | 25.56,4.34,Male,No,Sun,Dinner,4
57 | 19.49,3.51,Male,No,Sun,Dinner,2
58 | 38.01,3.0,Male,Yes,Sat,Dinner,4
59 | 26.41,1.5,Female,No,Sat,Dinner,2
60 | 11.24,1.76,Male,Yes,Sat,Dinner,2
61 | 48.27,6.73,Male,No,Sat,Dinner,4
62 | 20.29,3.21,Male,Yes,Sat,Dinner,2
63 | 13.81,2.0,Male,Yes,Sat,Dinner,2
64 | 11.02,1.98,Male,Yes,Sat,Dinner,2
65 | 18.29,3.76,Male,Yes,Sat,Dinner,4
66 | 17.59,2.64,Male,No,Sat,Dinner,3
67 | 20.08,3.15,Male,No,Sat,Dinner,3
68 | 16.45,2.47,Female,No,Sat,Dinner,2
69 | 3.07,1.0,Female,Yes,Sat,Dinner,1
70 | 20.23,2.01,Male,No,Sat,Dinner,2
71 | 15.01,2.09,Male,Yes,Sat,Dinner,2
72 | 12.02,1.97,Male,No,Sat,Dinner,2
73 | 17.07,3.0,Female,No,Sat,Dinner,3
74 | 26.86,3.14,Female,Yes,Sat,Dinner,2
75 | 25.28,5.0,Female,Yes,Sat,Dinner,2
76 | 14.73,2.2,Female,No,Sat,Dinner,2
77 | 10.51,1.25,Male,No,Sat,Dinner,2
78 | 17.92,3.08,Male,Yes,Sat,Dinner,2
79 | 27.2,4.0,Male,No,Thur,Lunch,4
80 | 22.76,3.0,Male,No,Thur,Lunch,2
81 | 17.29,2.71,Male,No,Thur,Lunch,2
82 | 19.44,3.0,Male,Yes,Thur,Lunch,2
83 | 16.66,3.4,Male,No,Thur,Lunch,2
84 | 10.07,1.83,Female,No,Thur,Lunch,1
85 | 32.68,5.0,Male,Yes,Thur,Lunch,2
86 | 15.98,2.03,Male,No,Thur,Lunch,2
87 | 34.83,5.17,Female,No,Thur,Lunch,4
88 | 13.03,2.0,Male,No,Thur,Lunch,2
89 | 18.28,4.0,Male,No,Thur,Lunch,2
90 | 24.71,5.85,Male,No,Thur,Lunch,2
91 | 21.16,3.0,Male,No,Thur,Lunch,2
92 | 28.97,3.0,Male,Yes,Fri,Dinner,2
93 | 22.49,3.5,Male,No,Fri,Dinner,2
94 | 5.75,1.0,Female,Yes,Fri,Dinner,2
95 | 16.32,4.3,Female,Yes,Fri,Dinner,2
96 | 22.75,3.25,Female,No,Fri,Dinner,2
97 | 40.17,4.73,Male,Yes,Fri,Dinner,4
98 | 27.28,4.0,Male,Yes,Fri,Dinner,2
99 | 12.03,1.5,Male,Yes,Fri,Dinner,2
100 | 21.01,3.0,Male,Yes,Fri,Dinner,2
101 | 12.46,1.5,Male,No,Fri,Dinner,2
102 | 11.35,2.5,Female,Yes,Fri,Dinner,2
103 | 15.38,3.0,Female,Yes,Fri,Dinner,2
104 | 44.3,2.5,Female,Yes,Sat,Dinner,3
105 | 22.42,3.48,Female,Yes,Sat,Dinner,2
106 | 20.92,4.08,Female,No,Sat,Dinner,2
107 | 15.36,1.64,Male,Yes,Sat,Dinner,2
108 | 20.49,4.06,Male,Yes,Sat,Dinner,2
109 | 25.21,4.29,Male,Yes,Sat,Dinner,2
110 | 18.24,3.76,Male,No,Sat,Dinner,2
111 | 14.31,4.0,Female,Yes,Sat,Dinner,2
112 | 14.0,3.0,Male,No,Sat,Dinner,2
113 | 7.25,1.0,Female,No,Sat,Dinner,1
114 | 38.07,4.0,Male,No,Sun,Dinner,3
115 | 23.95,2.55,Male,No,Sun,Dinner,2
116 | 25.71,4.0,Female,No,Sun,Dinner,3
117 | 17.31,3.5,Female,No,Sun,Dinner,2
118 | 29.93,5.07,Male,No,Sun,Dinner,4
119 | 10.65,1.5,Female,No,Thur,Lunch,2
120 | 12.43,1.8,Female,No,Thur,Lunch,2
121 | 24.08,2.92,Female,No,Thur,Lunch,4
122 | 11.69,2.31,Male,No,Thur,Lunch,2
123 | 13.42,1.68,Female,No,Thur,Lunch,2
124 | 14.26,2.5,Male,No,Thur,Lunch,2
125 | 15.95,2.0,Male,No,Thur,Lunch,2
126 | 12.48,2.52,Female,No,Thur,Lunch,2
127 | 29.8,4.2,Female,No,Thur,Lunch,6
128 | 8.52,1.48,Male,No,Thur,Lunch,2
129 | 14.52,2.0,Female,No,Thur,Lunch,2
130 | 11.38,2.0,Female,No,Thur,Lunch,2
131 | 22.82,2.18,Male,No,Thur,Lunch,3
132 | 19.08,1.5,Male,No,Thur,Lunch,2
133 | 20.27,2.83,Female,No,Thur,Lunch,2
134 | 11.17,1.5,Female,No,Thur,Lunch,2
135 | 12.26,2.0,Female,No,Thur,Lunch,2
136 | 18.26,3.25,Female,No,Thur,Lunch,2
137 | 8.51,1.25,Female,No,Thur,Lunch,2
138 | 10.33,2.0,Female,No,Thur,Lunch,2
139 | 14.15,2.0,Female,No,Thur,Lunch,2
140 | 16.0,2.0,Male,Yes,Thur,Lunch,2
141 | 13.16,2.75,Female,No,Thur,Lunch,2
142 | 17.47,3.5,Female,No,Thur,Lunch,2
143 | 34.3,6.7,Male,No,Thur,Lunch,6
144 | 41.19,5.0,Male,No,Thur,Lunch,5
145 | 27.05,5.0,Female,No,Thur,Lunch,6
146 | 16.43,2.3,Female,No,Thur,Lunch,2
147 | 8.35,1.5,Female,No,Thur,Lunch,2
148 | 18.64,1.36,Female,No,Thur,Lunch,3
149 | 11.87,1.63,Female,No,Thur,Lunch,2
150 | 9.78,1.73,Male,No,Thur,Lunch,2
151 | 7.51,2.0,Male,No,Thur,Lunch,2
152 | 14.07,2.5,Male,No,Sun,Dinner,2
153 | 13.13,2.0,Male,No,Sun,Dinner,2
154 | 17.26,2.74,Male,No,Sun,Dinner,3
155 | 24.55,2.0,Male,No,Sun,Dinner,4
156 | 19.77,2.0,Male,No,Sun,Dinner,4
157 | 29.85,5.14,Female,No,Sun,Dinner,5
158 | 48.17,5.0,Male,No,Sun,Dinner,6
159 | 25.0,3.75,Female,No,Sun,Dinner,4
160 | 13.39,2.61,Female,No,Sun,Dinner,2
161 | 16.49,2.0,Male,No,Sun,Dinner,4
162 | 21.5,3.5,Male,No,Sun,Dinner,4
163 | 12.66,2.5,Male,No,Sun,Dinner,2
164 | 16.21,2.0,Female,No,Sun,Dinner,3
165 | 13.81,2.0,Male,No,Sun,Dinner,2
166 | 17.51,3.0,Female,Yes,Sun,Dinner,2
167 | 24.52,3.48,Male,No,Sun,Dinner,3
168 | 20.76,2.24,Male,No,Sun,Dinner,2
169 | 31.71,4.5,Male,No,Sun,Dinner,4
170 | 10.59,1.61,Female,Yes,Sat,Dinner,2
171 | 10.63,2.0,Female,Yes,Sat,Dinner,2
172 | 50.81,10.0,Male,Yes,Sat,Dinner,3
173 | 15.81,3.16,Male,Yes,Sat,Dinner,2
174 | 7.25,5.15,Male,Yes,Sun,Dinner,2
175 | 31.85,3.18,Male,Yes,Sun,Dinner,2
176 | 16.82,4.0,Male,Yes,Sun,Dinner,2
177 | 32.9,3.11,Male,Yes,Sun,Dinner,2
178 | 17.89,2.0,Male,Yes,Sun,Dinner,2
179 | 14.48,2.0,Male,Yes,Sun,Dinner,2
180 | 9.6,4.0,Female,Yes,Sun,Dinner,2
181 | 34.63,3.55,Male,Yes,Sun,Dinner,2
182 | 34.65,3.68,Male,Yes,Sun,Dinner,4
183 | 23.33,5.65,Male,Yes,Sun,Dinner,2
184 | 45.35,3.5,Male,Yes,Sun,Dinner,3
185 | 23.17,6.5,Male,Yes,Sun,Dinner,4
186 | 40.55,3.0,Male,Yes,Sun,Dinner,2
187 | 20.69,5.0,Male,No,Sun,Dinner,5
188 | 20.9,3.5,Female,Yes,Sun,Dinner,3
189 | 30.46,2.0,Male,Yes,Sun,Dinner,5
190 | 18.15,3.5,Female,Yes,Sun,Dinner,3
191 | 23.1,4.0,Male,Yes,Sun,Dinner,3
192 | 15.69,1.5,Male,Yes,Sun,Dinner,2
193 | 19.81,4.19,Female,Yes,Thur,Lunch,2
194 | 28.44,2.56,Male,Yes,Thur,Lunch,2
195 | 15.48,2.02,Male,Yes,Thur,Lunch,2
196 | 16.58,4.0,Male,Yes,Thur,Lunch,2
197 | 7.56,1.44,Male,No,Thur,Lunch,2
198 | 10.34,2.0,Male,Yes,Thur,Lunch,2
199 | 43.11,5.0,Female,Yes,Thur,Lunch,4
200 | 13.0,2.0,Female,Yes,Thur,Lunch,2
201 | 13.51,2.0,Male,Yes,Thur,Lunch,2
202 | 18.71,4.0,Male,Yes,Thur,Lunch,3
203 | 12.74,2.01,Female,Yes,Thur,Lunch,2
204 | 13.0,2.0,Female,Yes,Thur,Lunch,2
205 | 16.4,2.5,Female,Yes,Thur,Lunch,2
206 | 20.53,4.0,Male,Yes,Thur,Lunch,4
207 | 16.47,3.23,Female,Yes,Thur,Lunch,3
208 | 26.59,3.41,Male,Yes,Sat,Dinner,3
209 | 38.73,3.0,Male,Yes,Sat,Dinner,4
210 | 24.27,2.03,Male,Yes,Sat,Dinner,2
211 | 12.76,2.23,Female,Yes,Sat,Dinner,2
212 | 30.06,2.0,Male,Yes,Sat,Dinner,3
213 | 25.89,5.16,Male,Yes,Sat,Dinner,4
214 | 48.33,9.0,Male,No,Sat,Dinner,4
215 | 13.27,2.5,Female,Yes,Sat,Dinner,2
216 | 28.17,6.5,Female,Yes,Sat,Dinner,3
217 | 12.9,1.1,Female,Yes,Sat,Dinner,2
218 | 28.15,3.0,Male,Yes,Sat,Dinner,5
219 | 11.59,1.5,Male,Yes,Sat,Dinner,2
220 | 7.74,1.44,Male,Yes,Sat,Dinner,2
221 | 30.14,3.09,Female,Yes,Sat,Dinner,4
222 | 12.16,2.2,Male,Yes,Fri,Lunch,2
223 | 13.42,3.48,Female,Yes,Fri,Lunch,2
224 | 8.58,1.92,Male,Yes,Fri,Lunch,1
225 | 15.98,3.0,Female,No,Fri,Lunch,3
226 | 13.42,1.58,Male,Yes,Fri,Lunch,2
227 | 16.27,2.5,Female,Yes,Fri,Lunch,2
228 | 10.09,2.0,Female,Yes,Fri,Lunch,2
229 | 20.45,3.0,Male,No,Sat,Dinner,4
230 | 13.28,2.72,Male,No,Sat,Dinner,2
231 | 22.12,2.88,Female,Yes,Sat,Dinner,2
232 | 24.01,2.0,Male,Yes,Sat,Dinner,4
233 | 15.69,3.0,Male,Yes,Sat,Dinner,3
234 | 11.61,3.39,Male,No,Sat,Dinner,2
235 | 10.77,1.47,Male,No,Sat,Dinner,2
236 | 15.53,3.0,Male,Yes,Sat,Dinner,2
237 | 10.07,1.25,Male,No,Sat,Dinner,2
238 | 12.6,1.0,Male,Yes,Sat,Dinner,2
239 | 32.83,1.17,Male,Yes,Sat,Dinner,2
240 | 35.83,4.67,Female,No,Sat,Dinner,3
241 | 29.03,5.92,Male,No,Sat,Dinner,3
242 | 27.18,2.0,Female,Yes,Sat,Dinner,2
243 | 22.67,2.0,Male,Yes,Sat,Dinner,2
244 | 17.82,1.75,Male,No,Sat,Dinner,2
245 | 18.78,3.0,Female,No,Thur,Dinner,2
246 |
--------------------------------------------------------------------------------
/2.Python数据分析/week07/figpath.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/littleheap/Python-Projects/a2bebb14dd49aeb1a0746eeadd8a30db6875ab01/2.Python数据分析/week07/figpath.png
--------------------------------------------------------------------------------
/2.Python数据分析/week07/数据可视化.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from __future__ import division
4 | from numpy.random import randn
5 | import numpy as np
6 | import os
7 | import matplotlib.pyplot as plt
8 |
# Fixed seed so the random data below is the same on every run.
np.random.seed(12345)
plt.rc('figure', figsize=(10, 6))
from pandas import Series, DataFrame
import pandas as pd

np.set_printoptions(precision=4)

# get_ipython().magic(u'matplotlib inline')
# get_ipython().magic(u'pwd')

# Creating charts with matplotlib
# y values only.
plt.plot([1, 2, 3, 2, 3, 2, 2, 1])
plt.show()

# Explicit x and y values.
plt.plot([4, 3, 2, 1], [1, 2, 3, 4])
plt.show()

# # More simple chart types, laid out on a 2x3 grid of subplots
x = [1, 2, 3, 4]
y = [5, 4, 3, 2]

plt.figure()

# Line plot.
plt.subplot(2, 3, 1)
plt.plot(x, y)

# Vertical bars.
plt.subplot(2, 3, 2)
plt.bar(x, y)

# Horizontal bars.
plt.subplot(2, 3, 3)
plt.barh(x, y)

# Stacked bars: the second series is drawn starting at the top of the first.
plt.subplot(2, 3, 4)
plt.bar(x, y)
y_top = [7, 8, 5, 3]
plt.bar(x, y_top, bottom=y, color='r')

# Box plot.
plt.subplot(2, 3, 5)
plt.boxplot(x)

# Scatter plot.
plt.subplot(2, 3, 6)
plt.scatter(x, y)

plt.show()
53 |
# Figure and subplot
# The Figure object
fig = plt.figure()

ax1 = fig.add_subplot(2, 2, 1)
ax2 = fig.add_subplot(2, 2, 2)
ax3 = fig.add_subplot(2, 2, 3)

from numpy.random import randn

# plt.plot draws on the current axes, which is the subplot added last.
plt.plot(randn(50).cumsum(), 'k--')

_ = ax1.hist(randn(100), bins=20, color='k', alpha=0.3)

ax2.scatter(np.arange(30), np.arange(30) + 3 * randn(30))

# Show the figure once, after every subplot has been filled. The original
# called plt.show() on the empty subplots and only drew afterwards, then
# closed everything, so the drawn content was never displayed.
plt.show()

plt.close('all')

fig, axes = plt.subplots(2, 3)
print(axes)

# Adjusting the spacing around subplots (None leaves a setting unchanged)
plt.subplots_adjust(left=None, bottom=None, right=None, top=None,
                    wspace=None, hspace=None)

# 2x2 grid with shared axes and no gap between the subplots.
fig, axes = plt.subplots(2, 2, sharex=True, sharey=True)
for i in range(2):
    for j in range(2):
        axes[i, j].hist(randn(500), bins=50, color='k', alpha=0.5)
plt.subplots_adjust(wspace=0, hspace=0)
87 |
# Basic matplotlib settings
# Colors, markers and line styles
plt.figure()

x = [1, 2, 3, 4]
y = [5, 4, 3, 2]

# Style given through keyword arguments.
plt.plot(x, y, color='g', linestyle='--')

# Style given as a format string ...
plt.plot(randn(30).cumsum(), 'ko--')

# ... and the same style spelled out with keywords.
plt.plot(randn(30).cumsum(), marker='o', linestyle='dashed', color='k')

plt.close('all')

data = randn(30).cumsum()

# The same data drawn with the default line and as a step function.
plt.plot(data, 'k--', label='Default')

plt.plot(data, 'k-', label='steps-post', drawstyle='steps-post')

plt.legend(loc='best')

# Title, axis label, ticks and tick labels
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(randn(1000).cumsum())

tick_positions = [0, 250, 500, 750, 1000]
tick_names = ['one', 'two', 'three', 'four', 'five']
ticks = ax.set_xticks(tick_positions)
labels = ax.set_xticklabels(tick_names, rotation=30, fontsize='small')
ax.set_title('My first matplotlib plot')
ax.set_xlabel('Stages')

# Adding a legend
fig = plt.figure()
ax = fig.add_subplot(111)
# One random walk per (format string, legend label) pair.
for line_format, line_label in (('k', 'one'), ('k--', 'two'), ('k.', 'three')):
    ax.plot(randn(1000).cumsum(), line_format, label=line_label)

ax.legend(loc='best')
130 |
# Annotations and drawing on a subplot
from datetime import datetime

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)

data = pd.read_csv('./data/spx.csv', index_col=0, parse_dates=True)
spx = data['SPX']

spx.plot(ax=ax, style='k-')

crisis_data = [
    (datetime(2007, 10, 11), 'Peak of bull market'),
    (datetime(2008, 3, 12), 'Bear Stearns Fails'),
    (datetime(2008, 9, 15), 'Lehman Bankruptcy')
]

# Label each date with an arrow pointing just above the series value
# looked up via spx.asof(date).
for date, label in crisis_data:
    ax.annotate(label, xy=(date, spx.asof(date) + 50),
                xytext=(date, spx.asof(date) + 200),
                arrowprops=dict(facecolor='black'),
                horizontalalignment='left', verticalalignment='top')

# Restrict the view to the chosen date and value ranges.
ax.set_xlim(['1/1/2007', '1/1/2011'])
ax.set_ylim([600, 1800])

ax.set_title('Important dates in 2008-2009 financial crisis')

# Show the figure only after annotations, limits and title are applied; the
# original called plt.show() right after plotting, before any of them.
plt.show()
160 |
fig = plt.figure()
ax = fig.add_subplot(111)

# Three shapes: a rectangle, a circle and a triangle-shaped polygon.
rect = plt.Rectangle((0.2, 0.75), 0.4, 0.15, color='k', alpha=0.3)
circ = plt.Circle((0.7, 0.2), 0.15, color='b', alpha=0.3)
triangle_points = [[0.15, 0.15], [0.35, 0.4], [0.2, 0.6]]
pgon = plt.Polygon(triangle_points, color='g', alpha=0.5)

for shape in (rect, circ, pgon):
    ax.add_patch(shape)

plt.show()

# Saving charts
print(fig)

# Output format follows the file extension.
fig.savefig('figpath.svg')

fig.savefig('figpath.png', dpi=400, bbox_inches='tight')

from io import BytesIO

# Save into an in-memory buffer instead of a file.
buffer = BytesIO()
plt.savefig(buffer)
plot_data = buffer.getvalue()

# matplotlib configuration
plt.rc('figure', figsize=(10, 10))

font_options = dict(family='monospace', weight='bold', size=26.0)

plt.rc('font', **font_options)

plt.show()
198 |
# Plotting functions in pandas
# Line plots
plt.close('all')

# A single random walk, indexed 0, 10, ..., 90.
walk = np.random.randn(10).cumsum()
s = Series(walk, index=np.arange(0, 100, 10))
s.plot()
plt.show()

# Four random walks, one per column, on the same index values.
walks = np.random.randn(10, 4).cumsum(axis=0)
df = DataFrame(walks,
               index=np.arange(0, 100, 10),
               columns=['A', 'B', 'C', 'D'])
df.plot()
plt.show()
212 |
# Bar plots
fig, axes = plt.subplots(2, 1)
data = Series(np.random.rand(16), index=list('abcdefghijklmnop'))
# The same series as vertical bars (top axes) and horizontal bars (bottom axes).
data.plot(kind='bar', ax=axes[0], color='k', alpha=0.7)
data.plot(kind='barh', ax=axes[1], color='k', alpha=0.7)

df = DataFrame(np.random.rand(6, 4),
               index=['one', 'two', 'three', 'four', 'five', 'six'],
               columns=pd.Index(['A', 'B', 'C', 'D'], name='Genus'))
print(df)
df.plot(kind='bar')

plt.figure()

df.plot(kind='barh', stacked=True, alpha=0.5)

# Cross-tabulate the day column against the party size column.
tips = pd.read_csv('./data/tips.csv')
party_counts = pd.crosstab(tips.day, tips['size'])
print(party_counts)

# Keep only party sizes 2 through 5. The `.ix` indexer no longer exists in
# current pandas; `.loc` selects the same columns by label (both ends included).
party_counts = party_counts.loc[:, 2:5]

# Divide each row by its total so the kept columns sum to 1 per day.
party_pcts = party_counts.div(party_counts.sum(axis=1).astype(float), axis=0)
print(party_pcts)

party_pcts.plot(kind='bar', stacked=True)

plt.show()
241 |
# Histograms and density plots
plt.figure()

# Tip as a fraction of the total bill.
tips['tip_pct'] = tips['tip'] / tips['total_bill']
tips['tip_pct'].hist(bins=50)

plt.figure()

tips['tip_pct'].plot(kind='kde')

plt.figure()

# Mixture of two normal samples.
comp1 = np.random.normal(0, 1, size=200)  # N(0, 1)
comp2 = np.random.normal(10, 2, size=200)  # N(10, 4)
values = Series(np.concatenate([comp1, comp2]))
# `density=True` replaces the `normed=True` keyword, which current
# matplotlib no longer accepts; it normalises the histogram the same way.
values.hist(bins=100, alpha=0.3, color='k', density=True)
values.plot(kind='kde', style='k--')

plt.show()
261 |
# Scatter plots
macro = pd.read_csv('./data/macrodata.csv')
data = macro[['cpi', 'm1', 'tbilrate', 'unemp']]
# Log differences of the four selected columns.
trans_data = np.log(data).diff().dropna()
print(trans_data[-5:])

plt.figure()

plt.scatter(trans_data['m1'], trans_data['unemp'])
plt.title('Changes in log %s vs. log %s' % ('m1', 'unemp'))

# scatter_matrix lives in pandas.plotting; the top-level pd.scatter_matrix
# alias no longer exists in current pandas.
pd.plotting.scatter_matrix(trans_data, diagonal='kde', color='k', alpha=0.3)

plt.show()
276 |
# Plotting with Matplotlib
# Bar chart with error bars
# Start at 1: log(0) is -inf (with a RuntimeWarning), which cannot be drawn
# as a bar height.
x = np.arange(1, 10, 1)

y = np.log(x)

# Random non-negative error size for every bar.
xe = 0.1 * np.abs(np.random.randn(len(y)))

plt.bar(x, y, yerr=xe, width=0.4, align='center', ecolor='r', color='cyan',
        label='experiment #1')

plt.xlabel('# measurement')
plt.ylabel('Measured values')
plt.title('Measurements')
plt.legend(loc='upper left')

plt.show()
294 |
# Pie chart
plt.figure(1, figsize=(8, 8))
# Axes rectangle given as [left, bottom, width, height] in figure fractions.
ax = plt.axes([0.1, 0.1, 0.8, 0.8])

labels = ('Spring', 'Summer', 'Autumn', 'Winter')
values = [15, 16, 16, 28]
# Every wedge is offset from the centre by the same amount.
explode = [0.1] * 4

pie_options = dict(explode=explode, labels=labels,
                   autopct='%1.1f%%', startangle=67)
plt.pie(values, **pie_options)

plt.title('Rainy days by season')

plt.show()
309 |
310 | # 等高线图
311 | import matplotlib as mpl
312 |
313 |
def process_signals(x, y):
    """Return (1 - (x**2 + y**2)) * exp(-y**3 / 3) for scalars or NumPy arrays."""
    squared_sum = x ** 2 + y ** 2
    decay = np.exp(-y ** 3 / 3)
    return (1 - squared_sum) * decay
316 |
317 |
# Sample the function on a regular grid over [-1.5, 1.5) in both directions.
x = np.arange(-1.5, 1.5, 0.1)
y = np.arange(-1.5, 1.5, 0.1)

X, Y = np.meshgrid(x, y)

Z = process_signals(X, Y)

# Contour levels to draw.
N = np.arange(-1, 1.5, 0.3)

# Pass X and Y so the axes show the sampled coordinates rather than the
# row/column indices of Z.
CS = plt.contour(X, Y, Z, N, linewidths=2, cmap=mpl.cm.jet)
plt.clabel(CS, inline=True, fmt='%1.1f', fontsize=10)
plt.colorbar(CS)

# Title formula written to match process_signals: 1 - (x^2 + y^2).
plt.title('My function: $z=(1-(x^2+y^2)) e^{-(y^3)/3}$')
plt.show()
333 |
# 3D charts
# 3D bar chart
import matplotlib.dates as mdates
from mpl_toolkits.mplot3d import Axes3D

mpl.rcParams['font.size'] = 10

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

# One row of twelve monthly bars per year, placed along the y direction.
years = [2011, 2012, 2013, 2014]
xs = range(1, 13)
for z in years:
    ys = 1000 * np.random.rand(12)

    # Random colour picked from the Set2 colormap.
    color = plt.cm.Set2(np.random.choice(range(plt.cm.Set2.N)))
    ax.bar(xs, ys, zs=z, zdir='y', color=color, alpha=0.8)

# Tick the month axis at every month and the year axis at every year.
# (The original ticked the year axis with `ys`, the random bar heights.)
ax.xaxis.set_major_locator(mpl.ticker.FixedLocator(xs))
ax.yaxis.set_major_locator(mpl.ticker.FixedLocator(years))

ax.set_xlabel('Month')
ax.set_ylabel('Year')
ax.set_zlabel('Sales Net [usd]')

plt.show()
359 |
# 3D histogram
mpl.rcParams['font.size'] = 10

samples = 25

x = np.random.normal(5, 1, samples)
y = np.random.normal(3, .5, samples)

fig = plt.figure()
ax = fig.add_subplot(211, projection='3d')

# 2D histogram: hist[i, j] counts points in x-bin i and y-bin j.
hist, xedges, yedges = np.histogram2d(x, y, bins=10)

elements = (len(xedges) - 1) * (len(yedges) - 1)
# indexing='ij' makes the flattened bar positions follow the same
# (x-bin, y-bin) order as hist.flatten(); the default 'xy' indexing would
# pair each count with the transposed bin position.
xpos, ypos = np.meshgrid(xedges[:-1] + .25, yedges[:-1] + .25, indexing='ij')

xpos = xpos.flatten()
ypos = ypos.flatten()
zpos = np.zeros(elements)

# Footprint of every bar.
dx = .1 * np.ones_like(zpos)
dy = dx.copy()

# Bar heights are the bin counts.
dz = hist.flatten()

ax.bar3d(xpos, ypos, zpos, dx, dy, dz, color='b', alpha=0.4)
ax.set_xlabel('X Axis')
ax.set_ylabel('Y Axis')
ax.set_zlabel('Z Axis')

# Scatter plot of the same samples for comparison.
ax2 = fig.add_subplot(212)
ax2.scatter(x, y)
ax2.set_xlabel('X Axis')
ax2.set_ylabel('Y Axis')

plt.show()
396 |
--------------------------------------------------------------------------------
/2.Python数据分析/week08/data/tips.csv:
--------------------------------------------------------------------------------
1 | total_bill,tip,sex,smoker,day,time,size
2 | 16.99,1.01,Female,No,Sun,Dinner,2
3 | 10.34,1.66,Male,No,Sun,Dinner,3
4 | 21.01,3.5,Male,No,Sun,Dinner,3
5 | 23.68,3.31,Male,No,Sun,Dinner,2
6 | 24.59,3.61,Female,No,Sun,Dinner,4
7 | 25.29,4.71,Male,No,Sun,Dinner,4
8 | 8.77,2.0,Male,No,Sun,Dinner,2
9 | 26.88,3.12,Male,No,Sun,Dinner,4
10 | 15.04,1.96,Male,No,Sun,Dinner,2
11 | 14.78,3.23,Male,No,Sun,Dinner,2
12 | 10.27,1.71,Male,No,Sun,Dinner,2
13 | 35.26,5.0,Female,No,Sun,Dinner,4
14 | 15.42,1.57,Male,No,Sun,Dinner,2
15 | 18.43,3.0,Male,No,Sun,Dinner,4
16 | 14.83,3.02,Female,No,Sun,Dinner,2
17 | 21.58,3.92,Male,No,Sun,Dinner,2
18 | 10.33,1.67,Female,No,Sun,Dinner,3
19 | 16.29,3.71,Male,No,Sun,Dinner,3
20 | 16.97,3.5,Female,No,Sun,Dinner,3
21 | 20.65,3.35,Male,No,Sat,Dinner,3
22 | 17.92,4.08,Male,No,Sat,Dinner,2
23 | 20.29,2.75,Female,No,Sat,Dinner,2
24 | 15.77,2.23,Female,No,Sat,Dinner,2
25 | 39.42,7.58,Male,No,Sat,Dinner,4
26 | 19.82,3.18,Male,No,Sat,Dinner,2
27 | 17.81,2.34,Male,No,Sat,Dinner,4
28 | 13.37,2.0,Male,No,Sat,Dinner,2
29 | 12.69,2.0,Male,No,Sat,Dinner,2
30 | 21.7,4.3,Male,No,Sat,Dinner,2
31 | 19.65,3.0,Female,No,Sat,Dinner,2
32 | 9.55,1.45,Male,No,Sat,Dinner,2
33 | 18.35,2.5,Male,No,Sat,Dinner,4
34 | 15.06,3.0,Female,No,Sat,Dinner,2
35 | 20.69,2.45,Female,No,Sat,Dinner,4
36 | 17.78,3.27,Male,No,Sat,Dinner,2
37 | 24.06,3.6,Male,No,Sat,Dinner,3
38 | 16.31,2.0,Male,No,Sat,Dinner,3
39 | 16.93,3.07,Female,No,Sat,Dinner,3
40 | 18.69,2.31,Male,No,Sat,Dinner,3
41 | 31.27,5.0,Male,No,Sat,Dinner,3
42 | 16.04,2.24,Male,No,Sat,Dinner,3
43 | 17.46,2.54,Male,No,Sun,Dinner,2
44 | 13.94,3.06,Male,No,Sun,Dinner,2
45 | 9.68,1.32,Male,No,Sun,Dinner,2
46 | 30.4,5.6,Male,No,Sun,Dinner,4
47 | 18.29,3.0,Male,No,Sun,Dinner,2
48 | 22.23,5.0,Male,No,Sun,Dinner,2
49 | 32.4,6.0,Male,No,Sun,Dinner,4
50 | 28.55,2.05,Male,No,Sun,Dinner,3
51 | 18.04,3.0,Male,No,Sun,Dinner,2
52 | 12.54,2.5,Male,No,Sun,Dinner,2
53 | 10.29,2.6,Female,No,Sun,Dinner,2
54 | 34.81,5.2,Female,No,Sun,Dinner,4
55 | 9.94,1.56,Male,No,Sun,Dinner,2
56 | 25.56,4.34,Male,No,Sun,Dinner,4
57 | 19.49,3.51,Male,No,Sun,Dinner,2
58 | 38.01,3.0,Male,Yes,Sat,Dinner,4
59 | 26.41,1.5,Female,No,Sat,Dinner,2
60 | 11.24,1.76,Male,Yes,Sat,Dinner,2
61 | 48.27,6.73,Male,No,Sat,Dinner,4
62 | 20.29,3.21,Male,Yes,Sat,Dinner,2
63 | 13.81,2.0,Male,Yes,Sat,Dinner,2
64 | 11.02,1.98,Male,Yes,Sat,Dinner,2
65 | 18.29,3.76,Male,Yes,Sat,Dinner,4
66 | 17.59,2.64,Male,No,Sat,Dinner,3
67 | 20.08,3.15,Male,No,Sat,Dinner,3
68 | 16.45,2.47,Female,No,Sat,Dinner,2
69 | 3.07,1.0,Female,Yes,Sat,Dinner,1
70 | 20.23,2.01,Male,No,Sat,Dinner,2
71 | 15.01,2.09,Male,Yes,Sat,Dinner,2
72 | 12.02,1.97,Male,No,Sat,Dinner,2
73 | 17.07,3.0,Female,No,Sat,Dinner,3
74 | 26.86,3.14,Female,Yes,Sat,Dinner,2
75 | 25.28,5.0,Female,Yes,Sat,Dinner,2
76 | 14.73,2.2,Female,No,Sat,Dinner,2
77 | 10.51,1.25,Male,No,Sat,Dinner,2
78 | 17.92,3.08,Male,Yes,Sat,Dinner,2
79 | 27.2,4.0,Male,No,Thur,Lunch,4
80 | 22.76,3.0,Male,No,Thur,Lunch,2
81 | 17.29,2.71,Male,No,Thur,Lunch,2
82 | 19.44,3.0,Male,Yes,Thur,Lunch,2
83 | 16.66,3.4,Male,No,Thur,Lunch,2
84 | 10.07,1.83,Female,No,Thur,Lunch,1
85 | 32.68,5.0,Male,Yes,Thur,Lunch,2
86 | 15.98,2.03,Male,No,Thur,Lunch,2
87 | 34.83,5.17,Female,No,Thur,Lunch,4
88 | 13.03,2.0,Male,No,Thur,Lunch,2
89 | 18.28,4.0,Male,No,Thur,Lunch,2
90 | 24.71,5.85,Male,No,Thur,Lunch,2
91 | 21.16,3.0,Male,No,Thur,Lunch,2
92 | 28.97,3.0,Male,Yes,Fri,Dinner,2
93 | 22.49,3.5,Male,No,Fri,Dinner,2
94 | 5.75,1.0,Female,Yes,Fri,Dinner,2
95 | 16.32,4.3,Female,Yes,Fri,Dinner,2
96 | 22.75,3.25,Female,No,Fri,Dinner,2
97 | 40.17,4.73,Male,Yes,Fri,Dinner,4
98 | 27.28,4.0,Male,Yes,Fri,Dinner,2
99 | 12.03,1.5,Male,Yes,Fri,Dinner,2
100 | 21.01,3.0,Male,Yes,Fri,Dinner,2
101 | 12.46,1.5,Male,No,Fri,Dinner,2
102 | 11.35,2.5,Female,Yes,Fri,Dinner,2
103 | 15.38,3.0,Female,Yes,Fri,Dinner,2
104 | 44.3,2.5,Female,Yes,Sat,Dinner,3
105 | 22.42,3.48,Female,Yes,Sat,Dinner,2
106 | 20.92,4.08,Female,No,Sat,Dinner,2
107 | 15.36,1.64,Male,Yes,Sat,Dinner,2
108 | 20.49,4.06,Male,Yes,Sat,Dinner,2
109 | 25.21,4.29,Male,Yes,Sat,Dinner,2
110 | 18.24,3.76,Male,No,Sat,Dinner,2
111 | 14.31,4.0,Female,Yes,Sat,Dinner,2
112 | 14.0,3.0,Male,No,Sat,Dinner,2
113 | 7.25,1.0,Female,No,Sat,Dinner,1
114 | 38.07,4.0,Male,No,Sun,Dinner,3
115 | 23.95,2.55,Male,No,Sun,Dinner,2
116 | 25.71,4.0,Female,No,Sun,Dinner,3
117 | 17.31,3.5,Female,No,Sun,Dinner,2
118 | 29.93,5.07,Male,No,Sun,Dinner,4
119 | 10.65,1.5,Female,No,Thur,Lunch,2
120 | 12.43,1.8,Female,No,Thur,Lunch,2
121 | 24.08,2.92,Female,No,Thur,Lunch,4
122 | 11.69,2.31,Male,No,Thur,Lunch,2
123 | 13.42,1.68,Female,No,Thur,Lunch,2
124 | 14.26,2.5,Male,No,Thur,Lunch,2
125 | 15.95,2.0,Male,No,Thur,Lunch,2
126 | 12.48,2.52,Female,No,Thur,Lunch,2
127 | 29.8,4.2,Female,No,Thur,Lunch,6
128 | 8.52,1.48,Male,No,Thur,Lunch,2
129 | 14.52,2.0,Female,No,Thur,Lunch,2
130 | 11.38,2.0,Female,No,Thur,Lunch,2
131 | 22.82,2.18,Male,No,Thur,Lunch,3
132 | 19.08,1.5,Male,No,Thur,Lunch,2
133 | 20.27,2.83,Female,No,Thur,Lunch,2
134 | 11.17,1.5,Female,No,Thur,Lunch,2
135 | 12.26,2.0,Female,No,Thur,Lunch,2
136 | 18.26,3.25,Female,No,Thur,Lunch,2
137 | 8.51,1.25,Female,No,Thur,Lunch,2
138 | 10.33,2.0,Female,No,Thur,Lunch,2
139 | 14.15,2.0,Female,No,Thur,Lunch,2
140 | 16.0,2.0,Male,Yes,Thur,Lunch,2
141 | 13.16,2.75,Female,No,Thur,Lunch,2
142 | 17.47,3.5,Female,No,Thur,Lunch,2
143 | 34.3,6.7,Male,No,Thur,Lunch,6
144 | 41.19,5.0,Male,No,Thur,Lunch,5
145 | 27.05,5.0,Female,No,Thur,Lunch,6
146 | 16.43,2.3,Female,No,Thur,Lunch,2
147 | 8.35,1.5,Female,No,Thur,Lunch,2
148 | 18.64,1.36,Female,No,Thur,Lunch,3
149 | 11.87,1.63,Female,No,Thur,Lunch,2
150 | 9.78,1.73,Male,No,Thur,Lunch,2
151 | 7.51,2.0,Male,No,Thur,Lunch,2
152 | 14.07,2.5,Male,No,Sun,Dinner,2
153 | 13.13,2.0,Male,No,Sun,Dinner,2
154 | 17.26,2.74,Male,No,Sun,Dinner,3
155 | 24.55,2.0,Male,No,Sun,Dinner,4
156 | 19.77,2.0,Male,No,Sun,Dinner,4
157 | 29.85,5.14,Female,No,Sun,Dinner,5
158 | 48.17,5.0,Male,No,Sun,Dinner,6
159 | 25.0,3.75,Female,No,Sun,Dinner,4
160 | 13.39,2.61,Female,No,Sun,Dinner,2
161 | 16.49,2.0,Male,No,Sun,Dinner,4
162 | 21.5,3.5,Male,No,Sun,Dinner,4
163 | 12.66,2.5,Male,No,Sun,Dinner,2
164 | 16.21,2.0,Female,No,Sun,Dinner,3
165 | 13.81,2.0,Male,No,Sun,Dinner,2
166 | 17.51,3.0,Female,Yes,Sun,Dinner,2
167 | 24.52,3.48,Male,No,Sun,Dinner,3
168 | 20.76,2.24,Male,No,Sun,Dinner,2
169 | 31.71,4.5,Male,No,Sun,Dinner,4
170 | 10.59,1.61,Female,Yes,Sat,Dinner,2
171 | 10.63,2.0,Female,Yes,Sat,Dinner,2
172 | 50.81,10.0,Male,Yes,Sat,Dinner,3
173 | 15.81,3.16,Male,Yes,Sat,Dinner,2
174 | 7.25,5.15,Male,Yes,Sun,Dinner,2
175 | 31.85,3.18,Male,Yes,Sun,Dinner,2
176 | 16.82,4.0,Male,Yes,Sun,Dinner,2
177 | 32.9,3.11,Male,Yes,Sun,Dinner,2
178 | 17.89,2.0,Male,Yes,Sun,Dinner,2
179 | 14.48,2.0,Male,Yes,Sun,Dinner,2
180 | 9.6,4.0,Female,Yes,Sun,Dinner,2
181 | 34.63,3.55,Male,Yes,Sun,Dinner,2
182 | 34.65,3.68,Male,Yes,Sun,Dinner,4
183 | 23.33,5.65,Male,Yes,Sun,Dinner,2
184 | 45.35,3.5,Male,Yes,Sun,Dinner,3
185 | 23.17,6.5,Male,Yes,Sun,Dinner,4
186 | 40.55,3.0,Male,Yes,Sun,Dinner,2
187 | 20.69,5.0,Male,No,Sun,Dinner,5
188 | 20.9,3.5,Female,Yes,Sun,Dinner,3
189 | 30.46,2.0,Male,Yes,Sun,Dinner,5
190 | 18.15,3.5,Female,Yes,Sun,Dinner,3
191 | 23.1,4.0,Male,Yes,Sun,Dinner,3
192 | 15.69,1.5,Male,Yes,Sun,Dinner,2
193 | 19.81,4.19,Female,Yes,Thur,Lunch,2
194 | 28.44,2.56,Male,Yes,Thur,Lunch,2
195 | 15.48,2.02,Male,Yes,Thur,Lunch,2
196 | 16.58,4.0,Male,Yes,Thur,Lunch,2
197 | 7.56,1.44,Male,No,Thur,Lunch,2
198 | 10.34,2.0,Male,Yes,Thur,Lunch,2
199 | 43.11,5.0,Female,Yes,Thur,Lunch,4
200 | 13.0,2.0,Female,Yes,Thur,Lunch,2
201 | 13.51,2.0,Male,Yes,Thur,Lunch,2
202 | 18.71,4.0,Male,Yes,Thur,Lunch,3
203 | 12.74,2.01,Female,Yes,Thur,Lunch,2
204 | 13.0,2.0,Female,Yes,Thur,Lunch,2
205 | 16.4,2.5,Female,Yes,Thur,Lunch,2
206 | 20.53,4.0,Male,Yes,Thur,Lunch,4
207 | 16.47,3.23,Female,Yes,Thur,Lunch,3
208 | 26.59,3.41,Male,Yes,Sat,Dinner,3
209 | 38.73,3.0,Male,Yes,Sat,Dinner,4
210 | 24.27,2.03,Male,Yes,Sat,Dinner,2
211 | 12.76,2.23,Female,Yes,Sat,Dinner,2
212 | 30.06,2.0,Male,Yes,Sat,Dinner,3
213 | 25.89,5.16,Male,Yes,Sat,Dinner,4
214 | 48.33,9.0,Male,No,Sat,Dinner,4
215 | 13.27,2.5,Female,Yes,Sat,Dinner,2
216 | 28.17,6.5,Female,Yes,Sat,Dinner,3
217 | 12.9,1.1,Female,Yes,Sat,Dinner,2
218 | 28.15,3.0,Male,Yes,Sat,Dinner,5
219 | 11.59,1.5,Male,Yes,Sat,Dinner,2
220 | 7.74,1.44,Male,Yes,Sat,Dinner,2
221 | 30.14,3.09,Female,Yes,Sat,Dinner,4
222 | 12.16,2.2,Male,Yes,Fri,Lunch,2
223 | 13.42,3.48,Female,Yes,Fri,Lunch,2
224 | 8.58,1.92,Male,Yes,Fri,Lunch,1
225 | 15.98,3.0,Female,No,Fri,Lunch,3
226 | 13.42,1.58,Male,Yes,Fri,Lunch,2
227 | 16.27,2.5,Female,Yes,Fri,Lunch,2
228 | 10.09,2.0,Female,Yes,Fri,Lunch,2
229 | 20.45,3.0,Male,No,Sat,Dinner,4
230 | 13.28,2.72,Male,No,Sat,Dinner,2
231 | 22.12,2.88,Female,Yes,Sat,Dinner,2
232 | 24.01,2.0,Male,Yes,Sat,Dinner,4
233 | 15.69,3.0,Male,Yes,Sat,Dinner,3
234 | 11.61,3.39,Male,No,Sat,Dinner,2
235 | 10.77,1.47,Male,No,Sat,Dinner,2
236 | 15.53,3.0,Male,Yes,Sat,Dinner,2
237 | 10.07,1.25,Male,No,Sat,Dinner,2
238 | 12.6,1.0,Male,Yes,Sat,Dinner,2
239 | 32.83,1.17,Male,Yes,Sat,Dinner,2
240 | 35.83,4.67,Female,No,Sat,Dinner,3
241 | 29.03,5.92,Male,No,Sat,Dinner,3
242 | 27.18,2.0,Female,Yes,Sat,Dinner,2
243 | 22.67,2.0,Male,Yes,Sat,Dinner,2
244 | 17.82,1.75,Male,No,Sat,Dinner,2
245 | 18.78,3.0,Female,No,Thur,Dinner,2
246 |
--------------------------------------------------------------------------------
/2.Python数据分析/week09/Amtrak.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/littleheap/Python-Projects/a2bebb14dd49aeb1a0746eeadd8a30db6875ab01/2.Python数据分析/week09/Amtrak.xls
--------------------------------------------------------------------------------
/2.Python数据分析/week09/统计基础.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import pandas as pd
4 | from pandas import Series, DataFrame
5 | import numpy as np
6 |
# Sample of 30 integer observations used by the statistics below.
a = [
    98, 83, 65, 72, 79, 76, 75, 94, 91, 77,
    63, 83, 89, 69, 64, 78, 63, 86, 91, 72,
    71, 72, 70, 80, 65, 70, 62, 74, 71, 76,
]

# Arithmetic mean of the sample.
print(np.mean(a))

# Median: mean of the two middle entries (positions 14 and 15) of the sorted sample.
ordered = np.sort(a)
print(np.mean(ordered[14:16]))

# The sorted sample itself.
print(ordered)
15 |
16 |
# Compute the mode(s) of a sample.
def get_mode(arr):
    """Return the most frequent value(s) of ``arr``.

    Args:
        arr: Iterable of hashable values.

    Returns:
        A list holding every value that reaches the highest occurrence count,
        in order of first appearance. ``None`` when ``arr`` is empty or when
        every value occurs exactly once (i.e. there is no mode).
    """
    from collections import Counter  # local import keeps the block self-contained

    # One O(n) counting pass instead of calling arr.count() for every element.
    counts = Counter(arr)
    if not counts:
        return None  # empty input: nothing to report (max() would raise here)

    top = max(counts.values())  # computed once, not once per loop iteration
    if top == 1:
        return None  # every value is unique, so no mode exists

    return [value for value, seen in counts.items() if seen == top]
28 |
29 |
# Mode, variance and standard deviation of the raw list (NumPy defaults).
for statistic in (get_mode, np.var, np.std):
    print(statistic(a))

# Wrap the sample in a pandas Series for skewness, kurtosis and the summary table.
a = Series(a)
for summarise in (a.skew, a.kurt, a.describe):
    print(summarise())

# Covariance and correlation matrices of two random 5-element columns.
random_columns = {'data1': np.random.randn(5),
                  'data2': np.random.randn(5)}
df = DataFrame(random_columns)
print(df.cov())
print(df.corr())

# Hypothesis test: one-sample t-test of the measurements against a mean of 10.
from scipy import stats as ss

measurements = [10.1, 10, 9.8, 10.5, 9.7, 10.1, 9.9, 10.2, 10.3, 9.9]
df = DataFrame({'data': measurements})

print(ss.ttest_1samp(a=df, popmean=10))
56 |
--------------------------------------------------------------------------------
/2.Python数据分析/week10/Advertising.csv:
--------------------------------------------------------------------------------
1 | ,TV,Radio,Newspaper,Sales
2 | 1,230.1,37.8,69.2,22.1
3 | 2,44.5,39.3,45.1,10.4
4 | 3,17.2,45.9,69.3,9.3
5 | 4,151.5,41.3,58.5,18.5
6 | 5,180.8,10.8,58.4,12.9
7 | 6,8.7,48.9,75,7.2
8 | 7,57.5,32.8,23.5,11.8
9 | 8,120.2,19.6,11.6,13.2
10 | 9,8.6,2.1,1,4.8
11 | 10,199.8,2.6,21.2,10.6
12 | 11,66.1,5.8,24.2,8.6
13 | 12,214.7,24,4,17.4
14 | 13,23.8,35.1,65.9,9.2
15 | 14,97.5,7.6,7.2,9.7
16 | 15,204.1,32.9,46,19
17 | 16,195.4,47.7,52.9,22.4
18 | 17,67.8,36.6,114,12.5
19 | 18,281.4,39.6,55.8,24.4
20 | 19,69.2,20.5,18.3,11.3
21 | 20,147.3,23.9,19.1,14.6
22 | 21,218.4,27.7,53.4,18
23 | 22,237.4,5.1,23.5,12.5
24 | 23,13.2,15.9,49.6,5.6
25 | 24,228.3,16.9,26.2,15.5
26 | 25,62.3,12.6,18.3,9.7
27 | 26,262.9,3.5,19.5,12
28 | 27,142.9,29.3,12.6,15
29 | 28,240.1,16.7,22.9,15.9
30 | 29,248.8,27.1,22.9,18.9
31 | 30,70.6,16,40.8,10.5
32 | 31,292.9,28.3,43.2,21.4
33 | 32,112.9,17.4,38.6,11.9
34 | 33,97.2,1.5,30,9.6
35 | 34,265.6,20,0.3,17.4
36 | 35,95.7,1.4,7.4,9.5
37 | 36,290.7,4.1,8.5,12.8
38 | 37,266.9,43.8,5,25.4
39 | 38,74.7,49.4,45.7,14.7
40 | 39,43.1,26.7,35.1,10.1
41 | 40,228,37.7,32,21.5
42 | 41,202.5,22.3,31.6,16.6
43 | 42,177,33.4,38.7,17.1
44 | 43,293.6,27.7,1.8,20.7
45 | 44,206.9,8.4,26.4,12.9
46 | 45,25.1,25.7,43.3,8.5
47 | 46,175.1,22.5,31.5,14.9
48 | 47,89.7,9.9,35.7,10.6
49 | 48,239.9,41.5,18.5,23.2
50 | 49,227.2,15.8,49.9,14.8
51 | 50,66.9,11.7,36.8,9.7
52 | 51,199.8,3.1,34.6,11.4
53 | 52,100.4,9.6,3.6,10.7
54 | 53,216.4,41.7,39.6,22.6
55 | 54,182.6,46.2,58.7,21.2
56 | 55,262.7,28.8,15.9,20.2
57 | 56,198.9,49.4,60,23.7
58 | 57,7.3,28.1,41.4,5.5
59 | 58,136.2,19.2,16.6,13.2
60 | 59,210.8,49.6,37.7,23.8
61 | 60,210.7,29.5,9.3,18.4
62 | 61,53.5,2,21.4,8.1
63 | 62,261.3,42.7,54.7,24.2
64 | 63,239.3,15.5,27.3,15.7
65 | 64,102.7,29.6,8.4,14
66 | 65,131.1,42.8,28.9,18
67 | 66,69,9.3,0.9,9.3
68 | 67,31.5,24.6,2.2,9.5
69 | 68,139.3,14.5,10.2,13.4
70 | 69,237.4,27.5,11,18.9
71 | 70,216.8,43.9,27.2,22.3
72 | 71,199.1,30.6,38.7,18.3
73 | 72,109.8,14.3,31.7,12.4
74 | 73,26.8,33,19.3,8.8
75 | 74,129.4,5.7,31.3,11
76 | 75,213.4,24.6,13.1,17
77 | 76,16.9,43.7,89.4,8.7
78 | 77,27.5,1.6,20.7,6.9
79 | 78,120.5,28.5,14.2,14.2
80 | 79,5.4,29.9,9.4,5.3
81 | 80,116,7.7,23.1,11
82 | 81,76.4,26.7,22.3,11.8
83 | 82,239.8,4.1,36.9,12.3
84 | 83,75.3,20.3,32.5,11.3
85 | 84,68.4,44.5,35.6,13.6
86 | 85,213.5,43,33.8,21.7
87 | 86,193.2,18.4,65.7,15.2
88 | 87,76.3,27.5,16,12
89 | 88,110.7,40.6,63.2,16
90 | 89,88.3,25.5,73.4,12.9
91 | 90,109.8,47.8,51.4,16.7
92 | 91,134.3,4.9,9.3,11.2
93 | 92,28.6,1.5,33,7.3
94 | 93,217.7,33.5,59,19.4
95 | 94,250.9,36.5,72.3,22.2
96 | 95,107.4,14,10.9,11.5
97 | 96,163.3,31.6,52.9,16.9
98 | 97,197.6,3.5,5.9,11.7
99 | 98,184.9,21,22,15.5
100 | 99,289.7,42.3,51.2,25.4
101 | 100,135.2,41.7,45.9,17.2
102 | 101,222.4,4.3,49.8,11.7
103 | 102,296.4,36.3,100.9,23.8
104 | 103,280.2,10.1,21.4,14.8
105 | 104,187.9,17.2,17.9,14.7
106 | 105,238.2,34.3,5.3,20.7
107 | 106,137.9,46.4,59,19.2
108 | 107,25,11,29.7,7.2
109 | 108,90.4,0.3,23.2,8.7
110 | 109,13.1,0.4,25.6,5.3
111 | 110,255.4,26.9,5.5,19.8
112 | 111,225.8,8.2,56.5,13.4
113 | 112,241.7,38,23.2,21.8
114 | 113,175.7,15.4,2.4,14.1
115 | 114,209.6,20.6,10.7,15.9
116 | 115,78.2,46.8,34.5,14.6
117 | 116,75.1,35,52.7,12.6
118 | 117,139.2,14.3,25.6,12.2
119 | 118,76.4,0.8,14.8,9.4
120 | 119,125.7,36.9,79.2,15.9
121 | 120,19.4,16,22.3,6.6
122 | 121,141.3,26.8,46.2,15.5
123 | 122,18.8,21.7,50.4,7
124 | 123,224,2.4,15.6,11.6
125 | 124,123.1,34.6,12.4,15.2
126 | 125,229.5,32.3,74.2,19.7
127 | 126,87.2,11.8,25.9,10.6
128 | 127,7.8,38.9,50.6,6.6
129 | 128,80.2,0,9.2,8.8
130 | 129,220.3,49,3.2,24.7
131 | 130,59.6,12,43.1,9.7
132 | 131,0.7,39.6,8.7,1.6
133 | 132,265.2,2.9,43,12.7
134 | 133,8.4,27.2,2.1,5.7
135 | 134,219.8,33.5,45.1,19.6
136 | 135,36.9,38.6,65.6,10.8
137 | 136,48.3,47,8.5,11.6
138 | 137,25.6,39,9.3,9.5
139 | 138,273.7,28.9,59.7,20.8
140 | 139,43,25.9,20.5,9.6
141 | 140,184.9,43.9,1.7,20.7
142 | 141,73.4,17,12.9,10.9
143 | 142,193.7,35.4,75.6,19.2
144 | 143,220.5,33.2,37.9,20.1
145 | 144,104.6,5.7,34.4,10.4
146 | 145,96.2,14.8,38.9,11.4
147 | 146,140.3,1.9,9,10.3
148 | 147,240.1,7.3,8.7,13.2
149 | 148,243.2,49,44.3,25.4
150 | 149,38,40.3,11.9,10.9
151 | 150,44.7,25.8,20.6,10.1
152 | 151,280.7,13.9,37,16.1
153 | 152,121,8.4,48.7,11.6
154 | 153,197.6,23.3,14.2,16.6
155 | 154,171.3,39.7,37.7,19
156 | 155,187.8,21.1,9.5,15.6
157 | 156,4.1,11.6,5.7,3.2
158 | 157,93.9,43.5,50.5,15.3
159 | 158,149.8,1.3,24.3,10.1
160 | 159,11.7,36.9,45.2,7.3
161 | 160,131.7,18.4,34.6,12.9
162 | 161,172.5,18.1,30.7,14.4
163 | 162,85.7,35.8,49.3,13.3
164 | 163,188.4,18.1,25.6,14.9
165 | 164,163.5,36.8,7.4,18
166 | 165,117.2,14.7,5.4,11.9
167 | 166,234.5,3.4,84.8,11.9
168 | 167,17.9,37.6,21.6,8
169 | 168,206.8,5.2,19.4,12.2
170 | 169,215.4,23.6,57.6,17.1
171 | 170,284.3,10.6,6.4,15
172 | 171,50,11.6,18.4,8.4
173 | 172,164.5,20.9,47.4,14.5
174 | 173,19.6,20.1,17,7.6
175 | 174,168.4,7.1,12.8,11.7
176 | 175,222.4,3.4,13.1,11.5
177 | 176,276.9,48.9,41.8,27
178 | 177,248.4,30.2,20.3,20.2
179 | 178,170.2,7.8,35.2,11.7
180 | 179,276.7,2.3,23.7,11.8
181 | 180,165.6,10,17.6,12.6
182 | 181,156.6,2.6,8.3,10.5
183 | 182,218.5,5.4,27.4,12.2
184 | 183,56.2,5.7,29.7,8.7
185 | 184,287.6,43,71.8,26.2
186 | 185,253.8,21.3,30,17.6
187 | 186,205,45.1,19.6,22.6
188 | 187,139.5,2.1,26.6,10.3
189 | 188,191.1,28.7,18.2,17.3
190 | 189,286,13.9,3.7,15.9
191 | 190,18.7,12.1,23.4,6.7
192 | 191,39.5,41.1,5.8,10.8
193 | 192,75.5,10.8,6,9.9
194 | 193,17.2,4.1,31.6,5.9
195 | 194,166.8,42,3.6,19.6
196 | 195,149.7,35.6,6,17.3
197 | 196,38.2,3.7,13.8,7.6
198 | 197,94.2,4.9,8.1,9.7
199 | 198,177,9.3,6.4,12.8
200 | 199,283.6,42,66.2,25.5
201 | 200,232.1,8.6,8.7,13.4
202 |
--------------------------------------------------------------------------------
/2.Python数据分析/week10/线性回归分析.py:
--------------------------------------------------------------------------------
1 | from numpy import *
2 | import numpy as np
3 | import pandas as pd
4 |
# Linear regression
# Load the data set; the first CSV column becomes the row index.
data = pd.read_csv('Advertising.csv', index_col=0)

# Preview the first and the last five rows.
for caption, preview in (('data head : ', data.head()),
                         ('data tail : ', data.tail())):
    print('\n')
    print(caption)
    print(preview)
16 |
17 | # 画散点图
18 | import seaborn as sns
19 | import matplotlib.pyplot as plt
20 |
21 | # # 单显示数据
22 | # sns.pairplot(data, x_vars=['TV', 'Radio', 'Newspaper'], y_vars='Sales', size=7, aspect=0.8)
23 | #
24 | # # 加上回归线
25 | # sns.pairplot(data, x_vars=['TV', 'Radio', 'Newspaper'], y_vars='Sales', size=7, aspect=0.8, kind='reg')
26 | #
27 | # x = data[['TV', 'Radio', 'Newspaper']]
28 | # y = data['Sales']
29 | # plt.figure(figsize=(9, 12))
30 | # plt.subplot(311)
31 | # plt.plot(data['TV'], y, 'ro')
32 | # plt.title('TV')
33 | # plt.grid()
34 | # plt.subplot(312)
35 | # plt.plot(data['Radio'], y, 'g^')
36 | # plt.title('Radio')
37 | # plt.grid()
38 | # plt.subplot(313)
39 | # plt.plot(data['Newspaper'], y, 'b*')
40 | # plt.title('Newspaper')
41 | # plt.grid()
42 | # plt.tight_layout()
43 | # plt.show()
44 |
# Correlation matrix of all columns.
print('\n')
print('相关矩阵:')
print(data.corr())

# Build the feature matrix X (three advertising channels) and the target y (Sales).
X = data[['TV', 'Radio', 'Newspaper']]
y = data['Sales']

for caption, preview in (('X-head : ', X.head()),
                         ('Y-head : ', y.head())):
    print('\n')
    print(caption)
    print(preview)
60 |
61 |
# Closed-form least squares via the normal equations.
def standRegres(xArr, yArr):
    """Solve ordinary least squares with the normal equations.

    Computes ``ws = (X^T X)^-1 X^T y``.

    Args:
        xArr: 2-D array-like design matrix, one row per observation.
        yArr: 1-D array-like of targets, one value per row of ``xArr``.

    Returns:
        A column ``numpy.matrix`` of coefficients (one row per column of
        ``xArr``), or ``None`` (after printing a message) when ``X^T X`` has a
        determinant of exactly zero and therefore cannot be inverted.
    """
    # np.asmatrix replaces the `mat` alias, which NumPy 2.0 removed; using the
    # explicit `np.` names also avoids depending on the star import.
    xMat = np.asmatrix(xArr)
    yMat = np.asmatrix(yArr).T  # 1-D input becomes a row matrix; transpose to a column
    xTx = xMat.T * xMat
    if np.linalg.det(xTx) == 0.0:  # singular matrix: no inverse exists
        print("This matrix is singular, cannot do inverse")
        return None
    ws = xTx.I * (xMat.T * yMat)  # .I is the matrix inverse
    return ws
73 |
74 |
# Solve for the regression coefficients with the closed-form helper.
# Work on a copy: a plain `X2 = X` only aliases X, so adding the intercept column
# would also add it to X and feed a constant fourth feature into the fits below.
X2 = X.copy()
X2['intercept'] = 1  # column of ones for the intercept; a scalar broadcasts to every row
print('\n')
print('根据系数矩阵公式计算的四个参数:\n', standRegres(X2, y))

# Solve the same problem with scikit-learn.
from sklearn.linear_model import LinearRegression

linreg = LinearRegression()

linreg.fit(X, y)

print('\n')
print('库函数计算的参数:', linreg.coef_)
print('库函数计算的截距:', linreg.intercept_)

# zip() is lazy in Python 3; materialise it so the (feature, coefficient) pairs are printed.
print(list(zip(['TV', 'Radio', 'Newspaper'], linreg.coef_)))
93 |
# Build training and test sets and evaluate on the held-out part.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

linreg.fit(X_train, y_train)

# Results
print('\n')

print('交叉验证计算的系数:', linreg.coef_)

print('交叉验证计算的截距:', linreg.intercept_)

# zip() is lazy in Python 3; wrap it in list() so the pairs are shown, not a zip object.
print(list(zip(['TV', 'Radio', 'Newspaper'], linreg.coef_)))

# Predict on the held-out rows.
y_pred = linreg.predict(X_test)

# Error metrics
from sklearn import metrics

print('\n')
print('三元参数模型评估:')

# Mean of the absolute errors.
print("MAE:", metrics.mean_absolute_error(y_test, y_pred))

# Mean of the squared errors (computed once and reused for the RMSE).
mse = metrics.mean_squared_error(y_test, y_pred)
print("MSE:", mse)

# Square root of the MSE.
print("RMSE:", np.sqrt(mse))
127 |
# Retrain with TV and Radio only; the original note says Newspaper adds little and is dropped.
feature_cols = ['TV', 'Radio']

X = data[feature_cols]
y = data['Sales']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

linreg.fit(X_train, y_train)
y_pred = linreg.predict(X_test)

print('\n')
print('二元参数模型评估:')

# Same three metrics as for the three-feature model, for comparison.
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
print("MAE:", mae)
print("MSE:", mse)
print("RMSE:", np.sqrt(mse))
149 |
--------------------------------------------------------------------------------
/2.Python数据分析/week11/Logistic回归.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
# Logistic regression with automatic feature screening
import pandas as pd

# Parameter initialisation
filename = 'bankloan.xls'  # bank-loan customer data; 700 * (8 + 1) per the original note
data = pd.read_excel(filename)
print('data-head : ')
print(data)

# DataFrame.as_matrix() was removed from pandas; .values yields the same ndarray.
x = data.iloc[:, :8].values  # first 8 columns: feature matrix
y = data.iloc[:, 8].values  # 9th column: target vector
print('\n')
print('X : ', x)
print('Y : ', y)

from sklearn.linear_model import LogisticRegression as LR
# NOTE(review): RandomizedLogisticRegression is no longer shipped by recent
# scikit-learn releases; this import needs an old scikit-learn — confirm the pinned version.
from sklearn.linear_model import RandomizedLogisticRegression as RLR

rlr = RLR()  # randomized logistic regression, used to screen the features
rlr.fit(x, y)  # train the screening model
support = list(rlr.get_support())
support.append(False)  # extend the 8-entry mask to 9 entries so the target column is excluded
print('\n')
print('特征筛选结果:', support)  # screening mask; per-feature scores are available via .scores_
print(u'有效特征为:%s' % ','.join(data.columns[support]))

x = data[data.columns[support]].values  # rebuild the feature matrix from the kept columns

lr = LR()  # plain logistic regression model
lr.fit(x, y)  # train on the screened features
print('\n')
print(u'Logistic模型的平均正确率为:%s' % lr.score(x, y))  # mean accuracy; the original note reports 81.4%
35 |
# Non-linear regression
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn import metrics

# Twelve (x, y) observations, each held as a single-column DataFrame.
x_values = [1.5, 2.8, 4.5, 7.5, 10.5, 13.5, 15.1, 16.5, 19.5, 22.5, 24.5, 26.5]
y_values = [7.0, 5.5, 4.6, 3.6, 2.9, 2.7, 2.5, 2.4, 2.2, 2.1, 1.9, 1.8]
x = pd.DataFrame(x_values)
y = pd.DataFrame(y_values)

# Scatter plot of the raw data.
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
ax.scatter(x, y)
fig.show()

from sklearn.linear_model import LinearRegression

# Baseline: a straight-line fit of y on x.
linreg = LinearRegression()
linreg.fit(x, y)

print('\n')
print('线性回归的Coefficients: ', linreg.coef_)

y_pred = linreg.predict(x)

baseline_mse = metrics.mean_squared_error(y, y_pred)
print("MSE: ", baseline_mse)

# score() is the coefficient of determination of the fit.
print('Variance score: %.2f' % linreg.score(x, y))
63 |
# Polynomial (quadratic) model: regress y on x and x squared.
# Copy x first: a plain `x1 = x` only aliases x, so adding the squared column
# would silently turn x itself into a two-column frame.
x1 = x.copy()
x2 = x ** 2
# Give every column a string name; recent scikit-learn rejects frames whose
# column names mix integers and strings (here 0 and 'x2').
x1.columns = ['x']
x1['x2'] = x2[0]

linreg = LinearRegression()
linreg.fit(x1, y)

print('\n')
print('非线性回归的Coefficients: ', linreg.coef_)

# Predict from the same two-column frame the model was fitted on.
y_pred = linreg.predict(x1)

print("MSE:", metrics.mean_squared_error(y, y_pred))
78 |
# Three linearised models, fitted in turn with the same recipe:
#   logarithmic : y      regressed on ln(x)
#   exponential : ln(y)  regressed on x
#   power       : ln(y)  regressed on ln(x)
x2 = pd.DataFrame(np.log(x[0]))  # ln of the predictor
y2 = pd.DataFrame(np.log(y))  # ln of the response
x_plain = pd.DataFrame(x[0])  # untransformed predictor as a one-column frame

for heading, features, target in (
        ('对数模型的Coefficients: ', x2, y),
        ('指数模型的Coefficients: ', x_plain, y2),
        ('幂函数模型的Coefficients: \n', x2, y2),
):
    linreg = LinearRegression()
    linreg.fit(features, target)

    print('\n')
    print(heading, linreg.coef_)

    # The error is measured on whichever scale the target was fitted in.
    y_pred = linreg.predict(features)
    print("MSE:", metrics.mean_squared_error(target, y_pred))
115 |
--------------------------------------------------------------------------------
/2.Python数据分析/week11/bankloan.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/littleheap/Python-Projects/a2bebb14dd49aeb1a0746eeadd8a30db6875ab01/2.Python数据分析/week11/bankloan.xls
--------------------------------------------------------------------------------
/2.Python数据分析/week11/data1.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/littleheap/Python-Projects/a2bebb14dd49aeb1a0746eeadd8a30db6875ab01/2.Python数据分析/week11/data1.txt
--------------------------------------------------------------------------------
/2.Python数据分析/week11/data2.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/littleheap/Python-Projects/a2bebb14dd49aeb1a0746eeadd8a30db6875ab01/2.Python数据分析/week11/data2.txt
--------------------------------------------------------------------------------
/2.Python数据分析/week12/arima_data.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/littleheap/Python-Projects/a2bebb14dd49aeb1a0746eeadd8a30db6875ab01/2.Python数据分析/week12/arima_data.xls
--------------------------------------------------------------------------------
/2.Python数据分析/week12/时间序列分析法.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from __future__ import division
4 | from pandas import Series, DataFrame
5 | import pandas as pd
6 | from numpy.random import randn
7 | import numpy as np
8 |
9 | pd.options.display.max_rows = 12
10 | np.set_printoptions(precision=4, suppress=True)
11 | import matplotlib.pyplot as plt
12 |
13 | plt.rc('figure', figsize=(12, 4))
14 |
15 | from datetime import datetime
16 |
# Current timestamp and its calendar components.
now = datetime.now()
print('now : ', now)
print('now.year, now.month, now.day : ', now.year, now.month, now.day)

# The difference of two datetimes is a timedelta exposing days and seconds.
later, earlier = datetime(2011, 1, 7), datetime(2008, 6, 24, 8, 15)
delta = later - earlier
print('delta : ', delta)
print('delta.days : ', delta.days)
print('delta.seconds : ', delta.seconds)

from datetime import timedelta

# A timedelta can be scaled and added to / subtracted from a datetime.
start = datetime(2011, 1, 7)
twelve_days = timedelta(12)
print('datetime(2011, 1, 7) + timedelta(12) : ', start + twelve_days)
print('datetime(2011, 1, 7) - 2 * timedelta(12) : ', start - 2 * twelve_days)
31 |
print('-------------------------')
# Converting between strings and datetimes
stamp = datetime(2011, 1, 3)
for rendered in (str(stamp), stamp.strftime('%Y-%m-%d')):
    print('datetime(2011, 1, 3) : ', rendered)

value = '2011-01-03'
print('2011-01-03 : ', datetime.strptime(value, '%Y-%m-%d'))

# Parse a list of month/day/year strings with an explicit format.
datestrs = ['7/6/2011', '8/6/2011']
print(list(map(lambda text: datetime.strptime(text, '%m/%d/%Y'), datestrs)))

from dateutil.parser import parse

# dateutil works out the format by itself.
for text in ('2011-01-03', 'Jan 31, 1997 10:45 PM'):
    print('parse : ', parse(text))
print('parse : ', parse('6/12/2011', dayfirst=True))

print('datestrs : ', datestrs)
print('pd.to_datetime(datestrs) : ', pd.to_datetime(datestrs))

# Append a missing entry (None) and inspect how it is represented.
with_missing = datestrs + [None]
idx = pd.to_datetime(with_missing)
print('pd.to_datetime(datestrs + [None]) : ', idx)
print('idx[2] : ', idx[2])
print('pd.isnull(idx) : ', pd.isnull(idx))
57 |
print('-------------------------')
# Time series in pandas
from datetime import datetime

# Six irregularly spaced days of January 2011 serve as the index.
dates = [datetime(2011, 1, day) for day in (2, 5, 7, 8, 10, 12)]
ts = Series(np.random.randn(6), index=dates)
print(ts)
print(type(ts))

print('ts.index : ', ts.index)
# ts[::2] keeps every second row; the addition pairs values by index label.
every_other = ts[::2]
print('ts + ts[::2] : ', ts + every_other)
print('ts.index.dtype : ', ts.index.dtype)

stamp = ts.index[0]
print('ts.index[0] : ', stamp)
74 |
print('-------------------------')
# Indexing, selection and subsetting
stamp = ts.index[2]
print('ts.index[2] : ', ts[stamp])
# Date strings in different notations are accepted as keys.
for caption, key in (('ts[1/10/2011] : ', '1/10/2011'),
                     ('ts[20110110] : ', '20110110')):
    print(caption, ts[key])

# A longer daily series: 1000 consecutive days starting 2000-01-01.
daily_index = pd.date_range('1/1/2000', periods=1000)
longer_ts = Series(np.random.randn(1000), index=daily_index)
print('longer_ts : ', longer_ts)
# Selecting with a year or a year-month string.
for period in ('2001', '2001-05'):
    print(longer_ts[period])

# Slicing by datetime object, by date strings, and truncating after a date.
print(ts[datetime(2011, 1, 7):])
print('ts : ', ts)
print(ts['1/6/2011':'1/11/2011'])
print(ts.truncate(after='1/9/2011'))
92 |
print('-------------------------')
# A weekly (Wednesday) DataFrame with four random columns.
dates = pd.date_range('1/1/2000', periods=100, freq='W-WED')
long_df = DataFrame(np.random.randn(100, 4),
                    index=dates,
                    columns=['Colorado', 'Texas', 'New York', 'Ohio'])
print(long_df)
# The .ix indexer was removed from pandas; .loc performs the same label-based
# selection, including a month string on a DatetimeIndex.
print(long_df.loc['5-2001'])
100 |
print('-------------------------')
# A series whose index contains the same date three times.
date_labels = ['1/1/2000'] + ['1/2/2000'] * 3 + ['1/3/2000']
dates = pd.DatetimeIndex(date_labels)
dup_ts = Series(np.arange(5), index=dates)
print(dup_ts)
print('dup_ts.index.is_unique : ', dup_ts.index.is_unique)
# Look up a date that occurs once, then the one that is repeated.
for key in ('1/3/2000', '1/2/2000'):
    print(dup_ts[key])

# Aggregate the duplicated timestamps by grouping on the index level.
grouped = dup_ts.groupby(level=0)
for aggregate in (grouped.mean, grouped.count):
    print(aggregate())
113 |
print('-------------------------')
# Date ranges, frequencies and shifting
print(ts)
daily = ts.resample('D')
print(daily.mean())

# A range given by start and end.
index = pd.date_range('4/1/2012', '6/1/2012')
print(index)

# Ranges given by one endpoint plus a number of periods.
for spec in (dict(start='4/1/2012', periods=20),
             dict(end='6/1/2012', periods=20)):
    print(pd.date_range(**spec))

# Range using the 'BM' frequency alias.
print(pd.date_range(start='1/1/2000', end='12/1/2000', freq='BM'))

# A start carrying a time of day, without and with normalize=True.
for extra in ({}, {'normalize': True}):
    print(pd.date_range('5/2/2012 12:56:31', periods=5, **extra))
126 |
print('-------------------------')
from pandas.tseries.offsets import Hour, Minute

# Offset objects: one hour, then a multiple of it.
hour = Hour()
print(hour)

four_hours = Hour(4)
print(four_hours)

# The same spacing expressed as a frequency string.
print(pd.date_range(start='1/1/2000', end='1/3/2000 23:59', freq='4h'))

# Offsets can be added together ...
print(Hour(2) + Minute(30))

# ... or written as one combined frequency string.
print(pd.date_range('1/1/2000', periods=10, freq='1h30min'))

# Range using the 'WOM-3FRI' frequency alias.
rng = pd.date_range('1/1/2012', '9/1/2012', freq='WOM-3FRI')
print(list(rng))

# Four random values on an 'M'-frequency index.
monthly_index = pd.date_range('1/1/2000', periods=4, freq='M')
ts = Series(np.random.randn(4), index=monthly_index)
print(ts)

# Shift the values forwards and backwards while the index stays put.
for periods in (2, -2):
    print(ts.shift(periods))

# Period-over-period relative change.
print(ts / ts.shift(1) - 1)

# Passing a frequency shifts the timestamps instead of the values.
for periods, freq in ((2, 'M'), (3, 'D'), (1, '3D'), (1, '90T')):
    print(ts.shift(periods, freq=freq))
155 |
print('-------------------------')
from pandas.tseries.offsets import Day, MonthEnd

# Adding offsets to a datetime.
now = datetime(2011, 11, 17)
print(now + 3 * Day())
print(now + MonthEnd())
print(now + MonthEnd(2))

# Rolling a date forward / backward with the offset.
offset = MonthEnd()
print(offset.rollforward(now))
print(offset.rollback(now))

# Twenty random values spaced four days apart.
ts = Series(np.random.randn(20),
            index=pd.date_range('1/15/2000', periods=20, freq='4d'))
# Group by the rolled-forward timestamp and average each group.
print(ts.groupby(offset.rollforward).mean())
# resample() no longer takes how=; aggregate on the resampler instead, the same
# way this script already does with ts.resample('D').mean().
print(ts.resample('M').mean())
172 |
print('-------------------------')
# Time-series visualisation
close_px_all = pd.read_csv('stock_px.csv', parse_dates=True, index_col=0)
close_px = close_px_all[['AAPL', 'MSFT', 'XOM']]
# resample() no longer accepts fill_method=; forward-fill on the resampler instead.
close_px = close_px.resample('B').ffill()
# info() prints its report itself and returns None, so it is not wrapped in print().
close_px.info()
close_px['AAPL'].plot()
plt.show()
# .ix was removed from pandas; .loc performs the same label-based selection.
close_px.loc['2009'].plot()
plt.show()
close_px['AAPL'].loc['01-2011':'03-2011'].plot()
plt.show()

# AAPL resampled with the 'Q-DEC' alias, forward-filled.
appl_q = close_px['AAPL'].resample('Q-DEC').ffill()
appl_q.loc['2009':].plot()
plt.show()

# One ffill() is enough; the former fillna(method='ffill') call did the same thing.
close_px = close_px.asfreq('B').ffill()
close_px.AAPL.plot()
plt.show()
193 |
# The top-level pd.rolling_* / pd.ewma helpers were removed from pandas; the
# .rolling() and .ewm() accessors below are their replacements.

# 250-row rolling mean of AAPL.
close_px.AAPL.rolling(250).mean().plot()
plt.show()

plt.figure()

# 250-row rolling standard deviation, emitted once 10 observations are available.
appl_std250 = close_px.AAPL.rolling(250, min_periods=10).std()
print(appl_std250[5:12])
appl_std250.plot()
plt.show()

# Expanding mean written as a rolling mean whose window spans the whole series.
expanding_mean = lambda x: x.rolling(len(x), min_periods=1).mean()
close_px.rolling(60).mean().plot(logy=True)
plt.show()

plt.close('all')

fig, axes = plt.subplots(nrows=2, ncols=1, sharex=True, sharey=True,
                         figsize=(12, 7))

# Simple versus exponentially weighted moving average of AAPL for 2005-2009.
aapl_px = close_px.AAPL['2005':'2009']
ma60 = aapl_px.rolling(60, min_periods=50).mean()
ewma60 = aapl_px.ewm(span=60).mean()

aapl_px.plot(style='k-', ax=axes[0])
ma60.plot(style='k--', ax=axes[0])
aapl_px.plot(style='k-', ax=axes[1])
ewma60.plot(style='k--', ax=axes[1])
axes[0].set_title('Simple MA')
axes[1].set_title('Exponentially-weighted MA')
plt.show()
224 |
print(close_px)
# Simple returns of the SPX column and of the three selected stocks.
spx_px = close_px_all['SPX']
spx_rets = spx_px / spx_px.shift(1) - 1
returns = close_px.pct_change()
# pd.rolling_corr was removed from pandas; use .rolling(...).corr(other).
# 125-row rolling correlation of AAPL returns with SPX returns.
corr = returns.AAPL.rolling(125, min_periods=100).corr(spx_rets)
corr.plot()
plt.show()

# The same rolling correlation for every column of `returns` at once.
corr = returns.rolling(125, min_periods=100).corr(spx_rets)
corr.plot()
plt.show()

from scipy.stats import percentileofscore

# Percentile rank of a 2% return within each 250-row window.
score_at_2percent = lambda x: percentileofscore(x, 0.02)
# pd.rolling_apply was removed from pandas; raw=True hands the callback a plain
# ndarray rather than a Series.
result = returns.AAPL.rolling(250).apply(score_at_2percent, raw=True)
result.plot()
plt.show()
243 |
244 | '''
245 | print('-------------------------')
246 | # 时序案例分析
247 | # 参数初始化
248 | discfile = 'arima_data.xls'
249 | forecastnum = 5
250 |
251 | # 读取数据,指定日期列为指标,Pandas自动将“日期”列识别为Datetime格式
252 | data = pd.read_excel(discfile, index_col=u'日期')
253 | data = pd.DataFrame(data, dtype=np.float64)
254 | print('data : ', data)
255 |
256 | # 时序图
257 | plt.rcParams['font.sans-serif'] = ['SimHei'] # 用来正常显示中文标签
258 | plt.rcParams['axes.unicode_minus'] = False # 用来正常显示负号
259 | data.plot()
260 | plt.show()
261 |
262 | # 自相关图
263 | from statsmodels.graphics.tsaplots import plot_acf
264 |
265 | plot_acf(data).show()
266 |
267 | # 平稳性检测
268 | from statsmodels.tsa.stattools import adfuller as ADF
269 |
270 | print(ADF(data[u'销量']))
271 | # 返回值依次为adf、pvalue、usedlag、nobs、critical values、icbest、regresults、resstore
272 |
273 | # 差分后的结果
274 | D_data = data.diff().dropna()
275 | D_data.columns = [u'销量差分']
276 | D_data.plot() # 时序图
277 | plt.show()
278 | plot_acf(D_data).show() # 自相关图
279 |
280 | from statsmodels.graphics.tsaplots import plot_pacf
281 |
282 | plot_pacf(D_data).show() # 偏自相关图
283 | ADF(D_data[u'销量差分']) # 平稳性检测
284 |
285 | # 白噪声检验
286 | from statsmodels.stats.diagnostic import acorr_ljungbox
287 |
288 | acorr_ljungbox(D_data, lags=1) # 返回统计量和p值
289 |
290 | from statsmodels.tsa.arima_model import ARIMA
291 |
292 | # 定阶
293 | pmax = int(len(D_data) / 10) # 一般阶数不超过length/10
294 | qmax = int(len(D_data) / 10) # 一般阶数不超过length/10
295 | bic_matrix = [] # bic矩阵
296 | for p in range(pmax + 1):
297 | tmp = []
298 | for q in range(qmax + 1):
299 | try: # 存在部分报错,所以用try来跳过报错。
300 | tmp.append(ARIMA(data, (p, 1, q)).fit().bic)
301 | except:
302 | tmp.append(None)
303 | bic_matrix.append(tmp)
304 |
305 | bic_matrix = pd.DataFrame(bic_matrix) # 从中可以找出最小值
306 |
307 | p, q = bic_matrix.stack().idxmin() # 先用stack展平,然后用idxmin找出最小值位置。
308 | print(u'BIC最小的p值和q值为:%s、%s' % (p, q))
309 | model = ARIMA(data, (0, 1, 1)).fit() # 建立ARIMA(0, 1, 1)模型
310 | model.summary() # 给出一份模型报告
311 | model.forecast(5) # 作为期5天的预测,返回预测结果、标准误差、置信区间。
312 | '''
313 |
--------------------------------------------------------------------------------
/2.Python数据分析/week13/分类算法.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import classification_report
from sklearn.naive_bayes import BernoulliNB
from sklearn.feature_extraction.text import TfidfVectorizer

try:
    # The cross_validation module was removed in scikit-learn 0.20;
    # model_selection is its replacement.
    from sklearn.model_selection import train_test_split
except ImportError:  # very old scikit-learn
    from sklearn.cross_validation import train_test_split
import matplotlib.pyplot as plt
import pandas as pd

# KNN (k-nearest neighbours)
inputfile = 'sales_data.xls'
data = pd.read_excel(inputfile, index_col=u'序号')  # load the data

# The cells hold category labels, so convert them to numbers:
# 1 stands for "好" / "是" / "高", and every other value becomes -1.
data[data == u'好'] = 1
data[data == u'是'] = 1
data[data == u'高'] = 1
data[data != 1] = -1
# .values replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
x = data.iloc[:, :3].values.astype(int)  # first three columns are the features
y = data.iloc[:, 3].values.astype(int)  # fourth column is the target

# Split into training and test data (20% held out)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
28 |
# Train the KNN classifier
clf = KNeighborsClassifier(algorithm='kd_tree')
clf.fit(x_train, y_train)

# Evaluate on the held-out split
answer = clf.predict(x_test)
print('x_test : \n', x_test)
print('answer : \n', answer)
print('y_test : \n', y_test)
print('answer == y_test : \n', np.mean(answer == y_test))

# Precision/recall on the training predictions (the results are not used below)
precision, recall, thresholds = precision_recall_curve(y_train, clf.predict(x_train))
# classification_report pairs target_names with the labels in sorted order,
# i.e. -1 first and 1 second. -1 encodes "低" and 1 encodes "高", so the names
# must be listed low-first; the previous ['高', '低'] swapped the two rows.
print('report : ', classification_report(y_test, answer, target_names=['低', '高']))
43 |
44 | print('-----------------------------')
45 | # 贝叶斯分类器
46 | # 训练贝叶斯分类器
47 | clf = BernoulliNB()
48 | clf.fit(x_train, y_train)
49 |
50 | # 测试结果
51 | answer = clf.predict(x_test)
52 | print('x_test : \n', x_test)
53 | print('answer : \n', answer)
54 | print('y_test : \n', y_test)
55 | print('answer == y_test : \n', np.mean(answer == y_test))
56 | print('report : ', classification_report(y_test, answer, target_names=['低', '高']))
57 |
58 | print('-----------------------------')
# Decision tree
from sklearn.tree import DecisionTreeClassifier as DTC

dtc = DTC(criterion='entropy')  # decision tree using information entropy
dtc.fit(x_train, y_train)  # train the model

# Export the fitted tree as a Graphviz .dot file.
# Graphviz must be installed to convert it to pdf/png.
from sklearn.tree import export_graphviz
# sklearn.externals.six was removed in scikit-learn 0.23; the standard library
# provides the same StringIO name (it is not used by this script).
from io import StringIO

with open("tree.dot", 'w') as f:
    # export_graphviz writes into the open handle; do not rebind `f` to its
    # return value while the with-block still owns the file.
    export_graphviz(dtc, out_file=f)

# Evaluate on the held-out split
answer = dtc.predict(x_test)
print('x_test : \n', x_test)
print('answer : \n', answer)
print('y_test : \n', y_test)
print('answer == y_test : \n', np.mean(answer == y_test))
print('report : ', classification_report(y_test, answer, target_names=['低', '高']))
80 |
81 | print('-----------------------------')
82 | # SVM
83 | from sklearn.svm import SVC
84 |
85 | clf = SVC()
86 | clf.fit(x_train, y_train)
87 |
88 | # 测试结果
89 | answer = clf.predict(x_test)
90 | print('x_test : \n', x_test)
91 | print('answer : \n', answer)
92 | print('y_test : \n', y_test)
93 | print('answer == y_test : \n', np.mean(answer == y_test))
94 | print('report : ', classification_report(y_test, answer, target_names=['低', '高']))
95 |
--------------------------------------------------------------------------------
/2.Python数据分析/week14/ex14.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/littleheap/Python-Projects/a2bebb14dd49aeb1a0746eeadd8a30db6875ab01/2.Python数据分析/week14/ex14.csv
--------------------------------------------------------------------------------
/2.Python数据分析/week14/聚类算法.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | from IPython import get_ipython
4 | from sklearn.cluster import AgglomerativeClustering
5 | import matplotlib.pyplot as plt
6 | import numpy as np
7 | from scipy import ndimage
8 | from matplotlib import pyplot as plt
9 | from sklearn import manifold, datasets
10 |
11 | '''
12 | 聚类算法
13 | '''
14 | # 聚类算法:无监督学习
15 | digits = datasets.load_digits(n_class=10) # 模块内数据集
16 | X = digits.data
17 | y = digits.target # 预先分类结果标记
18 | n_samples, n_features = X.shape
19 | print(X[:5, :])
20 | print(n_samples, n_features)
21 |
22 |
23 | # 可视化聚类
def plot_clustering(X_red, X, labels, title=None):
    """Draw a 2-D embedding with one text marker per sample.

    Each sample is drawn as the digit taken from the module-level ``y`` array
    and coloured by its cluster label.

    Parameters:
        X_red: 2-D array of embedded coordinates; columns 0 and 1 are plotted.
        X: unused by the body; kept so existing callers keep working.
        labels: cluster label per sample, used to pick the colour.
        title: optional figure title.
    """
    x_min, x_max = np.min(X_red, axis=0), np.max(X_red, axis=0)
    X_red = (X_red - x_min) / (x_max - x_min)  # rescale each column to [0, 1]

    # `spectral` was removed in Matplotlib 2.2; `nipy_spectral` is the same map.
    # Fall back to the old name only on installations that lack the new one.
    colormap = getattr(plt.cm, 'nipy_spectral', None) or plt.cm.spectral

    plt.figure(figsize=(6, 4))
    for i in range(X_red.shape[0]):
        plt.text(X_red[i, 0], X_red[i, 1], str(y[i]),
                 color=colormap(labels[i] / 10.),
                 fontdict={'weight': 'bold', 'size': 9})
    plt.xticks([])
    plt.yticks([])
    if title is not None:
        plt.title(title, size=17)
    plt.axis('off')
    plt.tight_layout()
38 |
39 |
40 | # 2D embedding of the digits dataset
41 | print("Computing embedding")
42 | X_red = manifold.SpectralEmbedding(n_components=2).fit_transform(X)
43 | print("Done.")
44 |
45 | from sklearn.cluster import AgglomerativeClustering
46 |
47 | for linkage in ('ward', 'average', 'complete'): # 3种聚类算法指标
48 | clustering = AgglomerativeClustering(linkage=linkage, n_clusters=10)
49 | clustering.fit(X_red)
50 | plot_clustering(X_red, X, clustering.labels_, "%s linkage" % linkage)
51 |
52 | plt.show()
53 |
54 | X0 = np.array([7, 5, 7, 3, 4, 1, 0, 2, 8, 6, 5, 3])
55 | X1 = np.array([5, 7, 7, 3, 6, 4, 0, 2, 7, 8, 5, 7])
56 | plt.figure()
57 | plt.axis([-1, 9, -1, 9])
58 | plt.grid(True)
59 | plt.plot(X0, X1, 'k.')
60 | plt.show()
61 |
62 | C1 = [1, 4, 5, 9, 11]
63 | C2 = list(set(range(12)) - set(C1))
64 | X0C1, X1C1 = X0[C1], X1[C1]
65 | X0C2, X1C2 = X0[C2], X1[C2]
66 | plt.figure()
67 | plt.axis([-1, 9, -1, 9])
68 | plt.grid(True)
69 | plt.plot(X0C1, X1C1, 'rx')
70 | plt.plot(X0C2, X1C2, 'g.')
71 | plt.plot(4, 6, 'rx', ms=12.0)
72 | plt.plot(5, 5, 'g.', ms=12.0)
73 | plt.show()
74 |
75 | C1 = [1, 2, 4, 8, 9, 11]
76 | C2 = list(set(range(12)) - set(C1))
77 | X0C1, X1C1 = X0[C1], X1[C1]
78 | X0C2, X1C2 = X0[C2], X1[C2]
79 | plt.figure()
80 | plt.axis([-1, 9, -1, 9])
81 | plt.grid(True)
82 | plt.plot(X0C1, X1C1, 'rx')
83 | plt.plot(X0C2, X1C2, 'g.')
84 | plt.plot(3.8, 6.4, 'rx', ms=12.0)
85 | plt.plot(4.57, 4.14, 'g.', ms=12.0);
86 | plt.show()
87 |
88 | C1 = [0, 1, 2, 4, 8, 9, 10, 11]
89 | C2 = list(set(range(12)) - set(C1))
90 | X0C1, X1C1 = X0[C1], X1[C1]
91 | X0C2, X1C2 = X0[C2], X1[C2]
92 | plt.figure()
93 | plt.axis([-1, 9, -1, 9])
94 | plt.grid(True)
95 | plt.plot(X0C1, X1C1, 'rx')
96 | plt.plot(X0C2, X1C2, 'g.')
97 | plt.plot(5.5, 7.0, 'rx', ms=12.0)
98 | plt.plot(2.2, 2.8, 'g.', ms=12.0)
99 | plt.show()
100 |
101 | '''
102 | K-Means
103 | '''
104 | # 2分类最佳
105 | cluster1 = np.random.uniform(0.5, 1.5, (2, 10))
106 | cluster2 = np.random.uniform(3.5, 4.5, (2, 10))
107 | X = np.hstack((cluster1, cluster2)).T
108 | plt.figure()
109 | plt.axis([0, 5, 0, 5])
110 | plt.grid(True)
111 | plt.plot(X[:, 0], X[:, 1], 'k.')
112 | plt.show()
113 |
114 | from sklearn.cluster import KMeans
115 | from scipy.spatial.distance import cdist
116 |
# Elbow method: fit KMeans for k = 1..9 and record the mean distance from
# each sample to its nearest cluster centre.
K = range(1, 10)
meandistortions = []
for k in K:
    kmeans = KMeans(n_clusters=k)
    kmeans.fit(X)
    # distance of every sample to its closest centroid
    nearest = np.min(cdist(X, kmeans.cluster_centers_, 'euclidean'), axis=1)
    meandistortions.append(sum(nearest) / X.shape[0])

plt.plot(K, meandistortions, 'bx-')
plt.xlabel('k')
plt.ylabel('The average degree of distortion')
plt.title('Best k')
plt.show()
128 |
129 | import numpy as np
130 |
131 | # 3分类最佳
132 | x1 = np.array([1, 2, 3, 1, 5, 6, 5, 5, 6, 7, 8, 9, 7, 9])
133 | x2 = np.array([1, 3, 2, 2, 8, 6, 7, 6, 7, 1, 2, 1, 1, 3])
134 | X = np.array(list(zip(x1, x2))).reshape(len(x1), 2)
135 | plt.figure()
136 | plt.axis([0, 10, 0, 10])
137 | plt.grid(True)
138 | plt.plot(X[:, 0], X[:, 1], 'k.')
139 | plt.show()
140 |
141 | from sklearn.cluster import KMeans
142 | from scipy.spatial.distance import cdist
143 |
# Same elbow search as above, now on the hand-written 14-point data set.
K = range(1, 10)
meandistortions = []
for k in K:
    kmeans = KMeans(n_clusters=k)
    kmeans.fit(X)
    to_centres = cdist(X, kmeans.cluster_centers_, 'euclidean')
    total = sum(np.min(to_centres, axis=1))  # summed nearest-centre distances
    meandistortions.append(total / X.shape[0])

plt.plot(K, meandistortions, 'bx-')
plt.xlabel('k')
plt.ylabel('The average degree of distortion')
plt.title('Best K')
plt.show()
155 |
156 | '''
157 | DBSCAN:基于密度的方法
158 |
159 | '''
160 | import numpy as np
161 |
from sklearn.cluster import DBSCAN
from sklearn import metrics

try:
    # sklearn.datasets.samples_generator was removed in scikit-learn 0.24;
    # make_blobs is exported from sklearn.datasets directly.
    from sklearn.datasets import make_blobs
except ImportError:
    from sklearn.datasets.samples_generator import make_blobs
from sklearn.preprocessing import StandardScaler

# Sample data: three blobs
centers = [[1, 1], [-1, -1], [1, -1]]
X, labels_true = make_blobs(n_samples=750, centers=centers, cluster_std=0.4,
                            random_state=0)

X = StandardScaler().fit_transform(X)

# DBSCAN density-based clustering
db = DBSCAN(eps=0.3, min_samples=10).fit(X)
# Boolean mask that is True for the samples DBSCAN reports as core samples
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_

# Number of clusters, not counting the noise label -1 when present
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

print('Estimated number of clusters: %d' % n_clusters_)
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
print("Adjusted Rand Index: %0.3f"
      % metrics.adjusted_rand_score(labels_true, labels))
print("Adjusted Mutual Information: %0.3f"
      % metrics.adjusted_mutual_info_score(labels_true, labels))
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, labels))
193 |
194 | import matplotlib.pyplot as plt
195 |
# Noise samples (label -1) are drawn in black.
unique_labels = set(labels)
colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
for k, col in zip(unique_labels, colors):
    face = 'k' if k == -1 else col
    in_cluster = (labels == k)

    # Core samples get large markers, the remaining members small ones.
    for subset, size in ((in_cluster & core_samples_mask, 14),
                         (in_cluster & ~core_samples_mask, 6)):
        xy = X[subset]
        plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=face,
                 markeredgecolor='k', markersize=size)

plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()
216 |
--------------------------------------------------------------------------------
/2.Python数据分析/week15/ex15.txt:
--------------------------------------------------------------------------------
1 | "x1" "x2" "x3" "y"
2 | "1" 149.3 4.2 108.1 15.9
3 | "2" 161.2 4.1 114.8 16.4
4 | "3" 171.5 3.1 123.2 19
5 | "4" 175.5 3.1 126.9 19.1
6 | "5" 180.8 1.1 132.1 18.8
7 | "6" 190.7 2.2 137.7 20.4
8 | "7" 202.1 2.1 146 22.7
9 | "8" 212.4 5.6 154.1 26.5
10 | "9" 226.1 5 162.3 28.1
11 | "10" 231.9 5.1 164.3 27.6
12 | "11" 239 0.7 167.6 26.3
13 |
--------------------------------------------------------------------------------
/2.Python数据分析/week15/矩阵基础.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # 协方差矩阵
4 | import numpy as np
5 |
6 | X = [[2, 0, -1.4],
7 | [2.2, 0.2, -1.5],
8 | [2.4, 0.1, -1],
9 | [1.9, 0, -1.2]]
10 |
11 | print(np.cov(np.array(X).T))
12 |
13 | # 特征值与特征向量
14 | w, v = np.linalg.eig(np.array([[1, -2], [2, -3]]))
15 |
16 | print('特征值:{}\n特征向量:{}'.format(w, v))
17 |
18 | a = [[-0.27, -0.3],
19 | [1.23, 1.3],
20 | [0.03, 0.4],
21 | [-0.67, 0.6],
22 | [-0.87, 0.6],
23 | [0.63, 0.1],
24 | [-0.67, -0.7],
25 | [-0.87, -0.7],
26 | [1.33, 1.3],
27 | [0.13, -0.2]]
28 |
29 | b = [[0.73251454], [0.68075138]]
30 |
31 | print(np.dot(a, b))
32 |
33 | # 鸢尾花数据集的降维
34 | import matplotlib.pyplot as plt
35 | from sklearn.decomposition import PCA
36 | from sklearn.datasets import load_iris
37 |
38 | data = load_iris()
39 | y = data.target
40 | X = data.data
41 | pca = PCA(n_components=2)
42 | reduced_X = pca.fit_transform(X)
43 |
# Split the projected points by class: 0 -> red, 1 -> blue, anything else -> green.
red_x = [point[0] for point, label in zip(reduced_X, y) if label == 0]
red_y = [point[1] for point, label in zip(reduced_X, y) if label == 0]
blue_x = [point[0] for point, label in zip(reduced_X, y) if label == 1]
blue_y = [point[1] for point, label in zip(reduced_X, y) if label == 1]
green_x = [point[0] for point, label in zip(reduced_X, y) if label != 0 and label != 1]
green_y = [point[1] for point, label in zip(reduced_X, y) if label != 0 and label != 1]

# One scatter call per class, each with its own colour and marker.
for xs, ys, colour, marker in ((red_x, red_y, 'r', 'x'),
                               (blue_x, blue_y, 'b', 'D'),
                               (green_x, green_y, 'g', '.')):
    plt.scatter(xs, ys, c=colour, marker=marker)
plt.show()
61 |
--------------------------------------------------------------------------------
/2.Python数据分析/案例分析/business_circle.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/littleheap/Python-Projects/a2bebb14dd49aeb1a0746eeadd8a30db6875ab01/2.Python数据分析/案例分析/business_circle.xls
--------------------------------------------------------------------------------
/2.Python数据分析/案例分析/standardized.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/littleheap/Python-Projects/a2bebb14dd49aeb1a0746eeadd8a30db6875ab01/2.Python数据分析/案例分析/standardized.xls
--------------------------------------------------------------------------------
/2.Python数据分析/案例分析/基于基站定位数据的商圈.py:
--------------------------------------------------------------------------------
1 | # 数据标准化到[0,1]
2 | import pandas as pd
3 |
4 | # 参数初始化
5 | filename = 'business_circle.xls' # 原始数据文件
6 | standardizedfile = 'standardized.xls' # 标准化后数据保存路径
7 |
8 | data = pd.read_excel(filename, index_col=u'基站编号') # 读取数据
9 |
10 | data = (data - data.min()) / (data.max() - data.min()) # 离差标准化
11 | data = data.reset_index()
12 |
13 | data.to_excel(standardizedfile, index=False) # 保存结果
14 |
15 | # 参数初始化
16 | standardizedfile = 'standardized.xls' # 标准化后的数据文件
17 | data = pd.read_excel(standardizedfile, index_col=u'基站编号') # 读取数据
18 |
19 | import matplotlib.pyplot as plt
20 | from scipy.cluster.hierarchy import linkage, dendrogram
21 |
22 | # 这里使用scipy的层次聚类函数
23 |
24 | Z = linkage(data, method='ward', metric='euclidean') # 谱系聚类图
25 | P = dendrogram(Z, 0) # 画谱系聚类图
26 | plt.show()
27 |
28 | # 层次聚类算法
29 |
30 | # 参数初始化
31 | standardizedfile = 'standardized.xls' # 标准化后的数据文件
32 | k = 3 # 聚类数
33 | data = pd.read_excel(standardizedfile, index_col=u'基站编号') # 读取数据
34 |
35 | from sklearn.cluster import AgglomerativeClustering # 导入sklearn的层次聚类函数
36 |
37 | model = AgglomerativeClustering(n_clusters=k, linkage='ward')
38 | model.fit(data) # 训练模型
39 |
40 | # 详细输出原始数据及其类别
41 | r = pd.concat([data, pd.Series(model.labels_, index=data.index)], axis=1) # 详细输出每个样本对应的类别
42 | r.columns = list(data.columns) + [u'聚类类别'] # 重命名表头
43 |
44 | import matplotlib.pyplot as plt
45 |
46 | plt.rcParams['font.sans-serif'] = ['SimHei'] # 用来正常显示中文标签
47 | plt.rcParams['axes.unicode_minus'] = False # 用来正常显示负号
48 |
style = ['ro-', 'go-', 'bo-']
xlabels = [u'工作日人均停留时间', u'凌晨人均停留时间', u'周末人均停留时间', u'日均人流量']
pic_output = 'd:/data/type_'  # file-name prefix for the saved figures

ticks = range(1, 5)  # x positions 1..4, one per plotted column
for i in range(k):  # one figure per cluster, each with its own line style
    plt.figure()
    members = r[u'聚类类别'] == i
    tmp = r[members].iloc[:, :4]  # first four columns of this cluster's rows
    for row_pos in range(len(tmp)):
        plt.plot(ticks, tmp.iloc[row_pos], style[i])

    cluster_no = i + 1  # clusters are numbered from 1 in titles and file names
    plt.xticks(ticks, xlabels, rotation=20)  # axis tick labels
    plt.title(u'商圈类别%s' % cluster_no)
    plt.subplots_adjust(bottom=0.15)  # leave room for the rotated labels
    plt.savefig(u'%s%s.png' % (pic_output, cluster_no))  # save the figure
63 |
--------------------------------------------------------------------------------
/2.Python数据分析/案例分析/电信客户流失分析.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import numpy as np
4 | import pandas as pd
5 | import sys
6 | from pandas import Series, DataFrame
7 | import matplotlib.pyplot as plt
8 |
filename = 'telco.xls'
data = pd.read_excel(filename)
data.head()  # result is discarded when this runs as a script

# .values replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
x = data.iloc[:, :37].values  # first 37 columns: features
y = data.iloc[:, 37].values  # column 38: target
15 |
16 | from sklearn.linear_model import LogisticRegression as LR
17 |
18 | lr = LR() # 建立逻辑回归模型
19 | lr.fit(x, y) # 用筛选后的特征数据来训练模型
20 | print(u'逻辑回归模型训练结束。')
21 | print(u'模型的平均正确率为:%s' % lr.score(x, y)) # 给出模型的平均正确率,本例为77.8%
22 |
23 |
def cm_plot(y, yp):
    """Draw the confusion matrix of true labels ``y`` against predictions ``yp``.

    Parameters:
        y: true labels.
        yp: predicted labels.

    Returns:
        The ``matplotlib.pyplot`` module, so the caller can chain ``.show()``.
    """
    from sklearn.metrics import confusion_matrix  # confusion-matrix function

    cm = confusion_matrix(y, yp)

    import matplotlib.pyplot as plt  # plotting library
    plt.matshow(cm, cmap=plt.cm.Greens)  # draw the matrix with the Greens colormap
    plt.colorbar()

    # matshow puts the column index on the x axis and the row index on the y
    # axis, so the count in cm[row, col] belongs at xy=(col, row). The earlier
    # xy=(row, col) transposed the off-diagonal counts. Distinct loop names
    # also avoid shadowing the `y` parameter.
    n_rows, n_cols = cm.shape
    for row in range(n_rows):
        for col in range(n_cols):
            plt.annotate(str(cm[row, col]), xy=(col, row),
                         horizontalalignment='center', verticalalignment='center')

    plt.ylabel('True label')  # rows of the confusion matrix
    plt.xlabel('Predicted label')  # columns of the confusion matrix
    return plt
40 |
41 |
try:
    # The cross_validation module was removed in scikit-learn 0.20.
    from sklearn.model_selection import train_test_split
except ImportError:  # very old scikit-learn
    from sklearn.cross_validation import train_test_split

p = 0.2  # fraction of the data held out for testing
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=p)
46 |
47 | from sklearn.tree import DecisionTreeClassifier # 导入决策树模型
48 |
49 | treefile = 'd:/data/tree.pkl' # 模型输出名字
50 | tree = DecisionTreeClassifier() # 建立决策树模型
51 | tree.fit(x_train, y_train) # 训练
52 |
53 | # 保存模型
54 | from sklearn.externals import joblib
55 |
56 | joblib.dump(tree, treefile)
57 |
58 | cm_plot(y_train, tree.predict(x_train)).show() # 显示混淆矩阵可视化结果
59 | # 注意到Scikit-Learn使用predict方法直接给出预测结果。
60 |
61 | from sklearn.metrics import roc_curve # 导入ROC曲线函数
62 |
63 | fpr, tpr, thresholds = roc_curve(y_test, tree.predict_proba(x_test)[:, 1], pos_label=1)
64 | plt.plot(fpr, tpr, linewidth=2, label='ROC of CART', color='green') # 作出ROC曲线
65 | plt.xlabel('False Positive Rate') # 坐标轴标签
66 | plt.ylabel('True Positive Rate') # 坐标轴标签
67 | plt.ylim(0, 1.05) # 边界范围
68 | plt.xlim(0, 1.05) # 边界范围
69 | plt.legend(loc=4) # 图例
70 | plt.show() # 显示作图结果
71 |
72 | # 训练KNN分类器
73 | from sklearn.neighbors import KNeighborsClassifier
74 | from sklearn.metrics import precision_recall_curve
75 | from sklearn.metrics import classification_report
76 |
77 | clf = KNeighborsClassifier(algorithm='kd_tree')
78 | clf.fit(x_train, y_train)
79 |
80 | # 测试结果
81 | answer = clf.predict(x_test)
82 | print(x_test)
83 | print(answer)
84 | print(y_test)
85 | print(np.mean(answer == y_test))
86 |
87 | # 准确率
88 | precision, recall, thresholds = precision_recall_curve(y_train, clf.predict(x_train))
89 | print(classification_report(y_test, answer, target_names=['高', '低']))
90 |
91 | # 贝叶斯分类器
92 | # 训练贝叶斯分类器
93 | from sklearn.naive_bayes import BernoulliNB
94 |
95 | clf = BernoulliNB()
96 | clf.fit(x_train, y_train)
97 |
98 | # 测试结果
99 | answer = clf.predict(x_test)
100 | print(x_test)
101 | print(answer)
102 | print(y_test)
103 | print(np.mean(answer == y_test))
104 | print(classification_report(y_test, answer, target_names=['低', '高']))
105 |
106 | from sklearn.svm import SVC
107 |
108 | clf = SVC()
109 | clf.fit(x_train, y_train)
110 |
111 | # 测试结果
112 | answer = clf.predict(x_test)
113 | print(x_test)
114 | print(answer)
115 | print(y_test)
116 | print(np.mean(answer == y_test))
117 | print(classification_report(y_test, answer, target_names=['低', '高']))
118 |
--------------------------------------------------------------------------------
/2.Python数据分析/案例分析/股票指数构建.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from datetime import datetime
3 | from sklearn.decomposition import PCA, FactorAnalysis
4 | import matplotlib.pyplot as plt
5 |
# Load the data
data = pd.read_csv("stock_prices.csv")
data['Date'] = [datetime.strptime(x, '%Y-%m-%d') for x in data.Date]
data = data[data['Stock'] != 'DDR']
# pandas 2.0 only accepts keyword arguments for pivot; keywords work on
# older versions as well.
ndat = data.pivot(index='Date', columns='Stock', values='Close')
ndat.head()  # result is discarded when this runs as a script
ndat = ndat.dropna()

# Correlation matrix (DataFrame.corr computes correlations, not covariances)
cor_mat = ndat.corr()
print(cor_mat)
17 |
18 | # 主成分分析
19 | pca = PCA()
20 | reduced_X = pca.fit_transform(ndat)
21 | print(pca.explained_variance_ratio_)
22 |
23 | pca2 = PCA(n_components=1)
24 | market = pca2.fit_transform(ndat)
25 |
26 | # 与道琼斯指数比较
27 | dji = pd.read_csv("DJI.csv")
28 | dji = dji.dropna()
29 | dji['Date'] = [datetime.strptime(x, '%Y/%m/%d') for x in dji.Date]
30 | dji.head()
31 | dji_sub = dji[dji.Date.isin(ndat.index)]
32 | dji_close = dji_sub['Close']
33 |
34 | fig = plt.figure()
35 | plt.scatter(dji_close, market, color='blue')
36 | plt.show()
37 |
# Standardise both series so they fit on one axis; the PCA score is negated
# as in the original script.
dji_close2 = (dji_close - dji_close.mean()) / dji_close.std()
market2 = -(market - market.mean()) / market.std()
fig = plt.figure()
plt.plot(market2, color='red')
# Plot by position: dji_close2 still carries the row index of the filtered DJI
# frame, while market2 is a plain array drawn against 0..n-1.
# NOTE(review): assumes DJI rows are in the same date order as ndat - confirm.
plt.plot(dji_close2.values, color='blue')
plt.show()

# Factor analysis
fa = FactorAnalysis(n_components=1)
market3 = fa.fit_transform(ndat)

fig = plt.figure()
plt.scatter(dji_close, market3, color='blue')
plt.show()

market4 = -(market3 - market3.mean()) / market3.std()
fig = plt.figure()
plt.plot(market4, c="g")
plt.plot(market2, c="r")
plt.plot(dji_close2.values, c='b')
plt.show()  # without this the final figure was never displayed
57 |
--------------------------------------------------------------------------------
/3.Python网络爬虫[从入门到实战]/2.编写第一个网络爬虫/Python使用入门.py:
--------------------------------------------------------------------------------
1 | # 基本命令
2 |
3 | print("Hello World!") # Hello World!
4 |
5 | x = 1
6 | if x == 1:
7 | print("Hello World!") # Hello World!
8 |
9 | # 在前面加上#,代表注释
10 | print("Hello World!") # Hello World!
11 |
12 | # 数据类型
13 |
14 | string1 = 'Python Web Scrappy'
15 | string2 = "by Santos"
16 | string3 = string1 + " " + string2
17 | print(string3) # Python Web Scrappy by Santos
18 |
19 | int1 = 7
20 | float1 = 7.5
21 | trans_int = int(float1)
22 | print(trans_int) # 7
23 |
24 | list1 = ['Python', 'Web', 'Scrappy']
25 | list2 = [1, 2, 3, 4, 5]
26 | list3 = ["a", 2, "c", 4]
27 | print("list1[0]: ", list1[0]) # list1[0]: Python
28 | print("list2[1:3]: ", list2[1:3]) # list2[1:3]: [2, 3]
29 |
30 | list1[1] = "new"
31 | print(list1) # ['Python', 'new', 'Scrappy']
32 |
33 | namebook = {"Name": "Alex", "Age": 7, "Class": "First"}
34 | print(namebook["Name"]) # 可以把相应的键值放入方括号,提取值Alex
35 | print(namebook) # 也可以直接输出整个字典{'Name': 'Alex', 'Age': 7, 'Class': 'First'}
36 |
37 | # 循环提取整个dictionary的key和value
38 | for key, value in namebook.items():
39 | print(key, value)
40 | '''
41 | Name Alex
42 | Age 7
43 | Class First
44 | '''
45 |
46 | # 条件语句和循环语句
47 |
48 | book = "python" # 定义字符串book
49 | if book == "python": # 判断变量是否为'python'
50 | print("You are studying python.") # 条件成立时输出You are studying python.
51 | else:
52 | print("Wrong.") # 条件不成立时输出
53 |
54 | book = "java" # 定义字符串book
55 | if book == "python": # 判断变量是否为'python'
56 | print("You are studying python.") # 条件成立时输出
57 | elif book == "java": # 判断变量是否为'java '
58 | print("You are studying java.") # 条件成立时输出You are studying java.
59 | else:
60 | print("Wrong.") # 条件不成立时输出
61 |
62 | citylist = ["Beijing", "Shanghai", "Guangzhou"]
63 | for eachcity in citylist:
64 | print(eachcity)
65 | '''
66 | Beijing
67 | Shanghai
68 | Guangzhou
69 | '''
70 |
71 | count = 0
72 | while count < 3:
73 | print(count) # 打印出 0,1,2
74 | count += 1 # 与 count = count + 1 一样
75 |
76 |
77 | # 函数
78 |
79 | # 定义函数
def calulus(x):
    """Return ``x`` increased by one."""
    return x + 1
83 |
84 |
85 | # 调用函数
86 | result = calulus(2)
87 | print(result) # 3
88 |
89 |
90 | # 定义函数
def fruit_function(fruit1, fruit2):
    """Return the two fruit names joined by a single space."""
    return " ".join((fruit1, fruit2))
94 |
95 |
96 | # 调用函数
97 | result = fruit_function("apple", "banana")
98 | print(result) # apple banana
99 |
100 |
101 | # 面向对象编程
102 |
class Person:
    """Holds a name and an age and can print both."""

    def __init__(self, name, age):
        # The constructor stores both values on the instance.
        self.name, self.age = name, age

    def detail(self):
        """Print the stored name, then the stored age, one per line."""
        print(self.name)
        print(self.age)
111 |
112 |
113 | obj1 = Person('santos', 18)
114 | obj1.detail() # Python将obj1传给self参数,即:obj1.detail(obj1),此时内部self=obj1
115 | '''
116 | santos
117 | 18
118 | '''
119 |
120 |
def detail(name, age):
    """Print ``name`` and then ``age``, each on its own line."""
    for item in (name, age):
        print(item)
124 |
125 |
126 | obj1 = detail('santos', 18)
127 | '''
128 | santos
129 | 18
130 | '''
131 |
132 |
133 | class Person: # 创建类
134 | def __init__(self, name, age): # __init__()方法称为类的构造方法
135 | self.name = name
136 | self.age = age
137 |
138 |
139 | obj1 = Person('santos', 18) # 将"santos"和 18 分别封装到 obj1 及 self的 name和age属性
140 |
141 |
142 | # 封装
143 | class Person: # 创建类
144 | def __init__(self, name, age): # __init__()方法称为类的构造方法
145 | self.name = name
146 | self.age = age
147 |
148 |
149 | obj1 = Person('santos', 18) # 将"santos"和 18 分别封装到 obj1 及 self的 name和age属性
150 | print(obj1.name) # 直接调用obj1对象的name属性santos
151 | print(obj1.age) # 直接调用obj1对象的age属性18
152 |
153 |
154 | class Person: # 创建类
155 | def __init__(self, name, age): # __init__()方法称为类的构造方法
156 | self.name = name
157 | self.age = age
158 |
159 | def detail(self): # 通过self调用被封装的内容
160 | print(self.name)
161 | print(self.age)
162 |
163 |
164 | obj1 = Person('santos', 18)
165 | obj1.detail() # Python将obj1传给self参数,即:obj1.detail(obj1),此时内部self=obj1
166 |
167 |
168 | # 继承
class Animal:
    """Base class whose methods print an action for ``self.name``.

    The class does not set ``name`` itself; it must be set on the instance
    (the subclasses in this file do so in their constructors).
    """

    def eat(self):
        line = "%s 吃 " % self.name
        print(line)

    def drink(self):
        line = "%s 喝 " % self.name
        print(line)

    def shit(self):
        line = "%s 拉 " % self.name
        print(line)

    def pee(self):
        line = "%s 撒 " % self.name
        print(line)
181 |
182 |
class Cat(Animal):
    """Animal subclass that stores its name and can meow."""

    def __init__(self, name):
        self.name = name

    def cry(self):
        """Print the cat's call."""
        sound = '喵喵叫'
        print(sound)
189 |
190 |
class Dog(Animal):
    """Animal subclass that stores its name and can bark."""

    def __init__(self, name):
        self.name = name

    def cry(self):
        """Print the dog's call."""
        sound = '汪汪叫'
        print(sound)
197 |
198 |
199 | c1 = Cat('小白家的小黑猫')
200 | c1.eat() # 小白家的小黑猫 吃
201 | c1.cry() # 喵喵叫
202 |
203 | d1 = Dog('胖子家的小瘦狗')
204 | d1.eat() # 胖子家的小瘦狗 吃
205 |
--------------------------------------------------------------------------------
/3.Python网络爬虫[从入门到实战]/2.编写第一个网络爬虫/Test_Python基础练习.py:
--------------------------------------------------------------------------------
1 | # 循环输出1-100之间的所有奇数
2 |
# Step through 1, 3, ..., 99 directly instead of testing every number.
for i in range(1, 100, 2):
    print(i)
6 |
7 | # 修改字符串
8 |
9 | str1 = '你好$$$我正在学Python@#@#现在需要&%&%&修改字符串'
10 | str2 = str1.replace('$$$', ' ').replace('@#@#', ' ').replace('&%&%&', ' ')
11 | print(str2) # 你好 我正在学Python 现在需要 修改字符串
12 |
13 | # 输出9*9乘法口诀表
14 |
# One output line per multiplier i; every cell is followed by a tab.
for i in range(1, 10):
    cells = ["%dx%d=%d\t" % (j, i, i * j) for j in range(1, i + 1)]
    print("".join(cells))
19 | '''
20 | 1x1=1
21 | 1x2=2 2x2=4
22 | 1x3=3 2x3=6 3x3=9
23 | 1x4=4 2x4=8 3x4=12 4x4=16
24 | 1x5=5 2x5=10 3x5=15 4x5=20 5x5=25
25 | 1x6=6 2x6=12 3x6=18 4x6=24 5x6=30 6x6=36
26 | 1x7=7 2x7=14 3x7=21 4x7=28 5x7=35 6x7=42 7x7=49
27 | 1x8=8 2x8=16 3x8=24 4x8=32 5x8=40 6x8=48 7x8=56 8x8=64
28 | 1x9=9 2x9=18 3x9=27 4x9=36 5x9=45 6x9=54 7x9=63 8x9=72 9x9=81
29 | '''
30 |
31 |
32 | # 按规则求企业的利润奖金
33 |
def calcute_profit(I):
    """Return the bonus, in yuan, for a net profit of ``I`` yuan.

    The bonus is tiered; each rate applies only to the part of the profit
    inside its band:

        up to 100,000          10%
        100,000 - 200,000      7.5%
        200,000 - 400,000      5%
        400,000 - 600,000      3%
        600,000 - 1,000,000    1.5%
        above 1,000,000        1%

    The earlier closed-form branches had three defects: the first band used
    1% instead of 10%, the 600,000 - 1,000,000 branch tested
    ``I <= 60 and I > 100`` and so could never run, and the constants of the
    fourth and fifth branches made the result jump at the band boundaries.

    Parameters:
        I: net profit in yuan (int or float).

    Returns:
        The bonus in yuan as a float; 0.0 for a profit of zero or less.
    """
    # (lower bound of the band, rate for the profit above that bound),
    # ordered from the highest band down.
    tiers = (
        (1000000, 0.01),
        (600000, 0.015),
        (400000, 0.03),
        (200000, 0.05),
        (100000, 0.075),
        (0, 0.1),
    )
    bonus = 0.0
    remaining = I
    for lower_bound, rate in tiers:
        if remaining > lower_bound:
            bonus += (remaining - lower_bound) * rate
            remaining = lower_bound  # the rest is paid at the lower bands' rates
    return bonus
54 |
55 |
56 | # I = int(input('净利润:'))
57 | # profit = calcute_profit(I)
58 | # print('利润为%d元时,应发奖金总数为%d元' % (I, profit))
59 | # '''
60 | # 净利润:210000
61 | # 利润为210000元时,应发奖金总数为18000元
62 | # '''
63 | #
64 | #
65 | # def calcute_profit(I):
66 | # arr = [1000000, 600000, 400000, 200000, 100000, 0] # 列表列出分界值
67 | # rat = [0.01, 0.015, 0.03, 0.05, 0.075, 0.1] # 列表列出不同分界值对应的奖金比例
68 | # r = 0 # 总奖金初始值
69 | # for idx in range(0, 6): # 6档循环
70 | # if I > arr[idx]:
71 | # r = r + (I - arr[idx]) * rat[idx]
72 | # I = arr[idx]
73 | # return r
74 | #
75 | #
76 | # I = int(input('净利润:'))
77 | # profit = calcute_profit(I)
78 | # print('利润为%d元时,应发奖金总数为%d元' % (I, profit))
79 | # '''
80 | # 净利润:210000
81 | # 利润为210000元时,应发奖金总数为18000元
82 | # '''
83 |
84 | # 字典排序
85 | import operator
86 |
x = {1: 2, 3: 4, 4: 3, 2: 1, 0: 0}
# Order the (key, value) pairs by their value, ascending.
sorted_x = sorted(x.items(), key=lambda pair: pair[1])
print(sorted_x)  # [(0, 0), (2, 1), (1, 2), (4, 3), (3, 4)]
90 |
91 | # 输出判定
92 |
93 | a = 1
94 |
95 |
96 | def fun(a):
97 | a = 2
98 |
99 |
100 | fun(a)
101 | print(a) # 1
102 |
103 | a = []
104 |
105 |
106 | def fun(a):
107 | a.append(1)
108 |
109 |
110 | fun(a)
111 | print(a) # [1]
112 |
113 |
114 | # 输出判定
115 |
116 | class Person:
117 | name = "aaa"
118 |
119 |
120 | p1 = Person()
121 | p2 = Person()
122 | p1.name = "bbb"
123 | print(p1.name) # bbb
124 | print(p2.name) # aaa
125 | print(Person.name) # aaa
126 |
127 |
128 | class Person:
129 | name = []
130 |
131 |
132 | p1 = Person()
133 | p2 = Person()
134 | p1.name.append(1)
135 | print(p1.name) # [1]
136 | print(p2.name) # [1]
137 | print(Person.name) # [1]
138 |
--------------------------------------------------------------------------------
/3.Python网络爬虫[从入门到实战]/2.编写第一个网络爬虫/title_test.txt:
--------------------------------------------------------------------------------
1 | 4.3 通过selenium 模拟浏览器抓取
--------------------------------------------------------------------------------
/3.Python网络爬虫[从入门到实战]/2.编写第一个网络爬虫/简易爬虫.py:
--------------------------------------------------------------------------------
1 | import requests
2 |
3 | # 1.获取页面
4 | link = "http://www.santostang.com/"
5 | headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
6 |
7 | r = requests.get(link, headers=headers)
8 | # print(r.text) # 打印了整个页面HTML源码
9 |
10 | # 2.提取需要的数据
11 | from bs4 import BeautifulSoup # 从bs4这个库中导入BeautifulSoup
12 |
13 | soup = BeautifulSoup(r.text, "html.parser") # 使用BeautifulSoup解析这段代码
14 | title = soup.find("h1", class_="post-title").a.text.strip() # 抓取博客页面第一篇文章标题
15 | print(title) # 打印标题:4.3 通过selenium 模拟浏览器抓取
16 |
17 | # 3.存储数据
# Append the scraped title to the output file. The with-statement closes the
# file on exit, so no explicit close() call is needed.
with open('title_test.txt', "a+") as f:
    f.write(title)
21 |
--------------------------------------------------------------------------------
/3.Python网络爬虫[从入门到实战]/3.静态网页抓取/1.request.py:
--------------------------------------------------------------------------------
import requests

# Request the page. A timeout bounds the wait so an unresponsive host cannot
# hang the script.
r = requests.get('http://www.santostang.com/', timeout=10)

# Report what came back
print("文本编码:", r.encoding)  # sample run: 文本编码: UTF-8
print("响应状态码:", r.status_code)  # 200 means the request succeeded
# print("字符串方式的响应体:", r.text)

# Pass URL parameters: requests encodes the dict into the query string
key_dict = {'key1': 'value1', 'key2': 'value2'}
r = requests.get('http://httpbin.org/get', params=key_dict, timeout=10)

print("URL已经正确编码:", r.url)  # sample run: http://httpbin.org/get?key1=value1&key2=value2
print("字符串方式的响应体: \n", r.text)
17 | '''
18 | 字符串方式的响应体:
19 | {
20 | "args": {
21 | "key1": "value1",
22 | "key2": "value2"
23 | },
24 | "headers": {
25 | "Accept": "*/*",
26 | "Accept-Encoding": "gzip, deflate",
27 | "Connection": "close",
28 | "Host": "httpbin.org",
29 | "User-Agent": "python-requests/2.21.0"
30 | },
31 | "origin": "111.166.36.3",
32 | "url": "http://httpbin.org/get?key1=value1&key2=value2"
33 | }
34 | '''
35 |
--------------------------------------------------------------------------------
/3.Python网络爬虫[从入门到实战]/3.静态网页抓取/2.request+header+POST.py:
--------------------------------------------------------------------------------
1 | import requests
2 |
3 | # 定制请求头
4 | # 去页面信息查找页面对应的Header请求头
5 | headers = {
6 | 'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36',
7 | 'Host': 'www.santostang.com'
8 | }
9 | r = requests.get('http://www.santostang.com/', headers=headers)
10 | print("响应状态码:", r.status_code) # 响应状态码: 200
11 |
12 | # 发送POST请求
13 | key_dict = {'key1': 'value1', 'key2': 'value2'}
14 | r = requests.post('http://httpbin.org/post', data=key_dict)
15 | print("URL已经正确编码:", r.url) # URL已经正确编码: http://httpbin.org/post
16 | print(r.text)
17 | '''
18 | {
19 | "args": {},
20 | "data": "",
21 | "files": {},
22 | "form": {
23 | "key1": "value1",
24 | "key2": "value2"
25 | },
26 | "headers": {
27 | "Accept": "*/*",
28 | "Accept-Encoding": "gzip, deflate",
29 | "Connection": "close",
30 | "Content-Length": "23",
31 | "Content-Type": "application/x-www-form-urlencoded",
32 | "Host": "httpbin.org",
33 | "User-Agent": "python-requests/2.21.0"
34 | },
35 | "json": null,
36 | "origin": "111.166.36.3",
37 | "url": "http://httpbin.org/post"
38 | }
39 | '''
40 |
# Timeout handling
# A 1 ms budget is far too short for a real round trip, so this request is
# expected to time out; catch that case and report it instead of ending the
# script with a traceback.
link = "http://www.santostang.com/"
try:
    r = requests.get(link, timeout=0.001)
except requests.exceptions.Timeout as exc:
    print("请求超时:", exc)
44 |
--------------------------------------------------------------------------------
/3.Python网络爬虫[从入门到实战]/3.静态网页抓取/Test_TOP250电影数据.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 |
4 |
# Download every Top-250 listing page and dump its raw HTML
def get_movies():
    """Request the ten Douban Top-250 listing pages and print each one.

    For every page the HTTP status code is printed, followed by the full
    response body. Nothing is returned.
    """
    request_headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36',
        'Host': 'movie.douban.com'
    }
    # 25 films per page, so ten pages (start=0, 25, ..., 225) cover 250 films
    for page_number, offset in enumerate(range(0, 250, 25), start=1):
        page_url = 'https://movie.douban.com/top250?start=' + str(offset)
        response = requests.get(page_url, headers=request_headers, timeout=10)
        print(str(page_number), "页响应状态码:", response.status_code)
        print(response.text)
17 |
18 |
19 | # 可以成功抓取电影的每个页面信息
20 | # get_movies()
21 |
# Extract the titles of the Top-250 films
def get_movies_name():
    """Collect film titles from the ten Douban Top-250 listing pages.

    Prints the HTTP status code of every page (the recorded sample run showed
    200 for all ten) and returns the titles as a list of strings, in page order.
    """
    request_headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36',
        'Host': 'movie.douban.com'
    }
    titles = []
    for page_index in range(10):
        page_url = 'https://movie.douban.com/top250?start=' + str(page_index * 25)
        response = requests.get(page_url, headers=request_headers, timeout=10)
        print(str(page_index + 1), "页响应状态码:", response.status_code)

        # With the HTML in hand, let BeautifulSoup pick out the target nodes:
        # the title is read from the first <span> inside the <a> of each
        # <div class="hd">.
        parsed = BeautifulSoup(response.text, "lxml")
        titles.extend(
            block.a.span.text.strip()
            for block in parsed.find_all('div', class_='hd')
        )
    return titles
53 |
54 |
55 | movie_list = get_movies_name()
56 | print(movie_list)
57 | '''
58 | ['肖申克的救赎', '霸王别姬', '这个杀手不太冷', '阿甘正传', '美丽人生', '泰坦尼克号', '千与千寻', '辛德勒的名单', '盗梦空间', '机器人总动员', '忠犬八公的故事', '三傻大闹宝莱坞', '海上钢琴师', '放牛班的春天', '大话西游之大圣娶亲', '楚门的世界', '龙猫', '星际穿越', '教父', '熔炉', '无间道', '当幸福来敲门', '触不可及', '怦然心动', '疯狂动物城', '乱世佳人', '蝙蝠侠:黑暗骑士', '活着', '天堂电影院', '少年派的奇幻漂流', '十二怒汉', '鬼子来了', '指环王3:王者无敌', '控方证人', '天空之城', '搏击俱乐部', '飞屋环游记', '大话西游之月光宝盒', '罗马假日', '摔跤吧!爸爸', '窃听风暴', '哈尔的移动城堡', '辩护人', '闻香识女人', '两杆大烟枪', '飞越疯人院', '死亡诗社', 'V字仇杀队', '指环王2:双塔奇兵', '海豚湾', '教父2', '指环王1:魔戒再现', '饮食男女', '美丽心灵', '素媛', '情书', '狮子王', '末代皇帝', '钢琴家', '美国往事', '小鞋子', '七宗罪', '本杰明·巴顿奇事', '被嫌弃的松子的一生', '致命魔术', '西西里的美丽传说', '黑客帝国', '让子弹飞', '看不见的客人', '拯救大兵瑞恩', '天使爱美丽', '音乐之声', '低俗小说', '大闹天宫', '勇敢的心', '剪刀手爱德华', '哈利·波特与魔法石', '沉默的羔羊', '蝴蝶效应', '春光乍泄', '心灵捕手', '入殓师', '猫鼠游戏', '布达佩斯大饭店', '禁闭岛', '玛丽和马克思', '阳光灿烂的日子', '幽灵公主', '第六感', '狩猎', '重庆森林', '穿条纹睡衣的男孩', '致命ID', '断背山', '加勒比海盗', '阿凡达', '摩登时代', '大鱼', '告白', '寻梦环游记', '一一', '射雕英雄传之东成西就', '甜蜜蜜', '阳光姐妹淘', '消失的爱人', '爱在黎明破晓前', '上帝之城', '喜剧之王', '小森林 夏秋篇', '侧耳倾听', '风之谷', '恐怖直播', '倩女幽魂', '超脱', '红辣椒', '爱在日落黄昏时', '菊次郎的夏天', '驯龙高手', '幸福终点站', '神偷奶爸', '借东西的小人阿莉埃蒂', '杀人回忆', '请以你的名字呼唤我', '哈利·波特与死亡圣器(下)', '七武士', '岁月神偷', '小森林 冬春篇', '怪兽电力公司', '萤火虫之墓', '谍影重重3', '电锯惊魂', '7号房的礼物', '东邪西毒', '喜宴', '疯狂原始人', '贫民窟的百万富翁', '萤火之森', '记忆碎片', '黑天鹅', '真爱至上', '超能陆战队', '英雄本色', '唐伯虎点秋香', '蝙蝠侠:黑暗骑士崛起', '雨人', '心迷宫', '卢旺达饭店', '傲慢与偏见', '荒蛮故事', '海洋', '纵横四海', '无人知晓', '时空恋旅人', '海边的曼彻斯特', '教父3', '玩具总动员3', '完美的世界', '花样年华', '虎口脱险', '血战钢锯岭', '达拉斯买家俱乐部', '恋恋笔记本', '燃情岁月', '二十二', '雨中曲', '冰川时代', '魂断蓝桥', '我是山姆', '穿越时空的少女', '猜火车', '人工智能', '头脑特工队', '被解救的姜戈', '爆裂鼓手', '未麻的部屋', '你的名字。', '无敌破坏王', '罗生门', '浪潮', '阿飞正传', '香水', '朗读者', '房间', '模仿游戏', '恐怖游轮', '一个叫欧维的男人决定去死', '可可西里', '忠犬八公物语', '魔女宅急便', '战争之王', '一次别离', '哪吒闹海', '完美陌生人', '谍影重重', '追随', '谍影重重2', '地球上的星星', '牯岭街少年杀人事件', '撞车', '黑客帝国3:矩阵革命', '惊魂记', '青蛇', '梦之安魂曲', '海街日记', '小萝莉的猴神大叔', '再次出发之纽约遇见你', '新龙门客栈', '步履不停', '源代码', '终结者2:审判日', '东京物语', '初恋这件小事', '疯狂的石头', '城市之光', '绿里奇迹', '爱在午夜降临前', '无耻混蛋', '末路狂花', '这个男人来自地球', '秒速5厘米', '勇闯夺命岛', 'E.T. 
外星人', '变脸', '彗星来的那一夜', '碧海蓝天', '卡萨布兰卡', '黄金三镖客', '发条橙', '聚焦', '血钻', '美国丽人', '国王的演讲', '海盗电台', '非常嫌疑犯', '荒野生存', '黑鹰坠落', '我爱你', '千钧一发', '英国病人', '天书奇谭', '遗愿清单', '2001太空漫游', '荒岛余生', '迁徙的鸟', '勇士', '枪火', '海蒂和爷爷', '叫我第一名', '燕尾蝶', '穆赫兰道']
59 | '''
60 |
--------------------------------------------------------------------------------
/3.Python网络爬虫[从入门到实战]/4.动态网页抓取/1.解析json评论数据.py:
--------------------------------------------------------------------------------
1 | import json
2 | import requests
3 |
4 | link = "https://api-zero.livere.com/v1/comments/list?callback=jQuery1124049866736766120545_1506309304525&limit=10&offset=1&repSeq=3871836&requestPath=%2Fv1%2Fcomments%2Flist&consumerSeq=1020&livereSeq=28583&smartloginSeq=5154&_=1506309304527"
5 | headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
6 |
7 | r = requests.get(link, headers=headers)
8 | # print(r.text)
9 |
10 | # 获取json的string
11 | json_string = r.text
12 | json_string = json_string[json_string.find('{'):-2]
13 |
14 | # json格式解析
15 | json_data = json.loads(json_string)
16 | comment_list = json_data['results']['parents']
17 |
18 | for eachone in comment_list:
19 | message = eachone['content']
20 | print(message)
21 | '''
22 | selenium操控firefox抓取博客评论的时候出现:[WinError 10053] 你的主机中的软件中止了一个已建立的连接, 请大神指点
23 | 刀斯林无处不在
24 | 大巴黎!咚咚咚!
25 | 222333
26 | 111
27 | 为什么第四章出版的时候不重新改改呢
28 | 测试评论
29 | 1111111111111111111111111
30 | 1111111111111111111111111
31 | 哪里哪里在哪里?
32 | '''
33 |
34 |
# Parse one page of comments
def single_page_comment(link):
    """Fetch one page of the comment API and print each comment's text.

    The endpoint answers with JSONP (``callback({...});``). The JSON object is
    cut out between the first ``{`` and the last ``}``, so the wrapper's exact
    length and any trailing whitespace do not matter.

    Args:
        link: Full URL of the comment-list endpoint for one page.

    Raises:
        ValueError: If the response body contains no JSON object, or the
            object cannot be decoded (json.JSONDecodeError is a ValueError).
        KeyError: If the decoded payload lacks ``results`` / ``parents`` /
            ``content``.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    r = requests.get(link, headers=headers)

    # Strip the JSONP callback wrapper around the JSON object
    body = r.text
    start = body.find('{')
    end = body.rfind('}')
    if start == -1 or end < start:
        raise ValueError("response does not contain a JSON object")

    json_data = json.loads(body[start:end + 1])
    for eachone in json_data['results']['parents']:
        print(eachone['content'])
48 |
49 |
50 | # 爬取前3页的动态评论(range(1, 4) 依次取 offset=1、2、3)
51 | for page in range(1, 4):
52 | link1 = "https://api-zero.livere.com/v1/comments/list?callback=jQuery112407875296433383039_1506267778283&limit=10&offset="
53 | link2 = "&repSeq=3871836&requestPath=%2Fv1%2Fcomments%2Flist&consumerSeq=1020&livereSeq=28583&smartloginSeq=5154&_=1506267778285"
54 | page_str = str(page)
55 | link = link1 + page_str + link2
56 | print(link)
57 | single_page_comment(link)
--------------------------------------------------------------------------------
/3.Python网络爬虫[从入门到实战]/4.动态网页抓取/2.selenium爬取评论数据.py:
--------------------------------------------------------------------------------
from selenium import webdriver
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary

# export PYTHONPATH=$PYTHONPATH:`pwd`:`pwd`/geckodriver

POST_URL = "http://www.santostang.com/2017/03/02/hello-world/"
# Change these to wherever Firefox is installed on your machine
MAC_FIREFOX = r'/Applications/Firefox.app'
WIN_FIREFOX = r'D:\Program Files\Mozilla Firefox\firefox.exe'


def _open_post(binary_path, profile=None):
    """Start Firefox with marionette disabled and load the sample post.

    Args:
        binary_path: Path handed to FirefoxBinary.
        profile: Optional FirefoxProfile carrying extra preferences.

    Returns:
        The running driver; the caller is responsible for calling quit().
    """
    caps = webdriver.DesiredCapabilities().FIREFOX
    caps["marionette"] = False
    options = {"firefox_binary": FirefoxBinary(binary_path), "capabilities": caps}
    if profile is not None:
        options["firefox_profile"] = profile
    driver = webdriver.Firefox(**options)
    try:
        driver.get(POST_URL)
    except Exception:
        # Do not leave a stray browser behind if the page load fails
        driver.quit()
        raise
    return driver


# Drive a real browser with Selenium and read the first comment
driver = _open_post(MAC_FIREFOX)
try:
    # The comments are rendered inside the "livere" iframe
    driver.switch_to.frame(driver.find_element_by_css_selector("iframe[title='livere']"))
    comment = driver.find_element_by_css_selector('div.reply-content')
    content = comment.find_element_by_tag_name('p')
    print(content.text)
finally:
    driver.quit()

# Read every comment on the first page
driver = _open_post(MAC_FIREFOX)
try:
    driver.switch_to.frame(driver.find_element_by_css_selector("iframe[title='livere']"))
    comments = driver.find_elements_by_css_selector('div.reply-content')
    for eachcomment in comments:
        content = eachcomment.find_element_by_tag_name('p')
        print(content.text)
finally:
    driver.quit()

# Load the page with image loading restricted
fp = webdriver.FirefoxProfile()
fp.set_preference("permissions.default.image", 2)
driver = _open_post(WIN_FIREFOX, profile=fp)
driver.quit()

# Load the page with JavaScript execution disabled
fp = webdriver.FirefoxProfile()
fp.set_preference("javascript.enabled", False)
driver = _open_post(WIN_FIREFOX, profile=fp)
driver.quit()
50 |
--------------------------------------------------------------------------------
/3.Python网络爬虫[从入门到实战]/4.动态网页抓取/Test_RentData.py:
--------------------------------------------------------------------------------
import time

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary

caps = webdriver.DesiredCapabilities().FIREFOX
caps["marionette"] = True
# Change this to the location of Firefox on your machine
binary = FirefoxBinary(r'D:\Program Files\Mozilla Firefox\firefox.exe')

# Launch Firefox through the selenium driver
driver = webdriver.Firefox(firefox_binary=binary, capabilities=caps)
try:
    # Open the Airbnb listing page in the automated browser
    driver.get("https://zh.airbnb.com/s/Shenzhen--China?page=1")

    for i in range(0, 5):
        # Every rental card on the current page
        rent_list = driver.find_elements_by_css_selector('div._1788tsr0')

        for eachhouse in rent_list:
            # Review count: a listing without the element is reported as 0.
            # Only the "element missing" case is caught, so other failures
            # still surface.
            try:
                comment = eachhouse.find_element_by_css_selector('span._gb7fydm').text
            except NoSuchElementException:
                comment = 0

            # Price: the first four characters of the text are dropped
            price = eachhouse.find_element_by_css_selector('span._hylizj6').text[4:]

            # Listing name
            name = eachhouse.find_element_by_css_selector('div._ew0cqip').text

            # Room type and size, separated by " · " in the first matching span
            details = eachhouse.find_elements_by_css_selector('div._saba1yg small div span')
            parts = details[0].text.split(" · ")
            house_type = parts[0]
            bed_number = parts[1]
            print(comment, price, name, house_type, bed_number)

        # Go to the next page, then wait for it to load
        driver.find_element_by_css_selector('li._b8vexar').click()
        time.sleep(5)
finally:
    # Always close the browser, even if a selector lookup fails
    driver.quit()
45 |
--------------------------------------------------------------------------------
/3.Python网络爬虫[从入门到实战]/5.解析网页/1.re正则表达式.py:
--------------------------------------------------------------------------------
1 | import re
2 | import requests
3 |
4 | # 3种正则表达式
5 |
6 | # (1)re.match()
7 | m = re.match('www', 'www.santostang.com')
8 |
9 | print("匹配的结果: ", m)
10 | # 匹配的结果: <_sre.SRE_Match object; span=(0, 3), match='www'>
11 |
12 | print("匹配的起始与终点: ", m.span())
13 | # 匹配的起始与终点: (0, 3)
14 |
15 | print("匹配的起始位置: ", m.start())
16 | # 匹配的起始位置: 0
17 |
18 | print("匹配的终点位置: ", m.end())
19 | # 匹配的终点位置: 3
20 |
21 | line = "Fat cats are smarter than dogs, is it right?"
22 |
23 | m = re.match(r'(.*) are (.*?) dogs', line)
24 |
25 | print('匹配的整句话', m.group(0))
26 | # 匹配的整句话 Fat cats are smarter than dogs
27 |
28 | print('匹配的第一个结果', m.group(1))
29 | # 匹配的第一个结果 Fat cats
30 |
31 | print('匹配的第二个结果', m.group(2))
32 | # 匹配的第二个结果 smarter than
33 |
34 | print('匹配的结果列表', m.groups())
35 | # 匹配的结果列表 ('Fat cats', 'smarter than')
36 |
37 | # (2)re.search()
38 | m_match = re.match('com', 'www.santostang.com')
39 |
40 | m_search = re.search('com', 'www.santostang.com')
41 |
42 | print(m_match)
43 | # None
44 |
45 | print(m_search)
46 | # <_sre.SRE_Match object; span=(15, 18), match='com'>
47 |
48 | # (3)re.findall()
49 | # [0-9]+代表任意长度的数字
50 | m_match = re.match('[0-9]+', '12345 is the first number, 23456 is the sencond')
51 |
52 | m_search = re.search('[0-9]+', 'The first number is 12345, 23456 is the sencond')
53 |
54 | m_findall = re.findall('[0-9]+', '12345 is the first number, 23456 is the sencond')
55 |
56 | print(m_match.group())
57 | # 12345
58 |
59 | print(m_search.group())
60 | # 12345
61 |
62 | print(m_findall)
63 | # ['12345', '23456']
64 |
# Parse the page HTML with a regular expression
link = "http://www.santostang.com/"
headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
r = requests.get(link, headers=headers)
html = r.text

# NOTE(review): the original pattern literal lost its HTML during extraction.
# This one is reconstructed to capture the link text inside each
# <h1 class="post-title">, the same element the BeautifulSoup example in this
# repository selects. Confirm against the upstream file.
title_list = re.findall('<h1 class="post-title"><a href=.*?>(.*?)</a></h1>', html)
72 |
73 | print(html)
74 | '''
75 |
76 | '''
77 |
78 | print(title_list)
79 | '''
80 | ['4.3 通过selenium 模拟浏览器抓取', '4.2 解析真实地址抓取', '第四章- 动态网页抓取 (解析真实地址 + selenium)', '《网络爬虫:从入门到实践》一书勘误', 'Hello world!']
81 | '''
82 |
--------------------------------------------------------------------------------
/3.Python网络爬虫[从入门到实战]/5.解析网页/2.BeautifulSoup.py:
--------------------------------------------------------------------------------
1 | import re
2 | import requests
3 | from bs4 import BeautifulSoup
4 |
5 | # 使用BeautifulSoup抓取标题
6 | link = "http://www.santostang.com/"
7 | headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
8 | r = requests.get(link, headers=headers)
9 |
10 | soup = BeautifulSoup(r.text, "html.parser")
11 | first_title = soup.find("h1", class_="post-title").a.text.strip()
12 | print("第一篇文章的标题是:", first_title)
13 | # 第一篇文章的标题是: 4.3 通过selenium 模拟浏览器抓取
14 |
15 | title_list = soup.find_all("h1", class_="post-title")
16 | for i in range(len(title_list)):
17 | title = title_list[i].a.text.strip()
18 | print('第 %s 篇文章的标题是:%s' % (i + 1, title))
19 | '''
20 | 第 1 篇文章的标题是:4.3 通过selenium 模拟浏览器抓取
21 | 第 2 篇文章的标题是:4.2 解析真实地址抓取
22 | 第 3 篇文章的标题是:第四章- 动态网页抓取 (解析真实地址 + selenium)
23 | 第 4 篇文章的标题是:《网络爬虫:从入门到实践》一书勘误
24 | 第 5 篇文章的标题是:Hello world!
25 | '''
26 |
27 | # 使用BeautifulSoup进行代码美化
28 | html = """
29 |
30 |
45 | """
46 | soup = BeautifulSoup(html, "html.parser")
47 | print(soup.prettify())
48 | '''
49 |
50 |
73 |
74 |
98 | '''
99 |
100 | print(soup.header.h3)
101 | # 大数据@唐松Santos
102 |
103 | print(soup.header.div.contents)
104 | '''
105 | ['\n', , '\n', , '\n', , '\n', ]
106 | '''
107 |
108 | print(soup.header.div.contents[1])
109 | '''
110 |
111 | '''
112 |
113 | for child in soup.header.div.children:
114 | print(child)
115 | '''
116 |
117 |
118 |
119 |
120 |
121 |
122 |
123 |
124 |
125 |
126 |
127 |
128 |
129 | '''
130 |
131 | for child in soup.header.div.descendants:
132 | print(child)
133 | '''
134 |
135 |
136 |
137 |
138 |
139 |
140 |
141 |
142 |
143 |
144 |
145 |
146 |
147 |
148 | '''
149 |
150 | a_tag = soup.header.div.a
151 | print(a_tag)
152 | #
153 | print(a_tag.parent)
154 | '''
155 |
160 | '''
161 |
162 | print(soup.find_all('div', class_='sns'))
163 | '''
164 | []
169 | '''
170 |
171 | # 文档搜索树
172 | for tag in soup.find_all(re.compile("^h")):
173 | print(tag.name)
174 | '''
175 | header
176 | h3
177 | '''
178 |
179 | # CSS选择器
180 | print(soup.select("header h3")) # [大数据@唐松Santos
]
181 |
182 |
183 |
--------------------------------------------------------------------------------
/4.算法图解/1.二分查找/binary_sort.py:
--------------------------------------------------------------------------------
def binary_search(list, item):
    """Locate ``item`` in an ascending-sorted sequence by repeated halving.

    Args:
        list: Sequence sorted in ascending order.
        item: Value to look for.

    Returns:
        The index of ``item`` if found, otherwise None.
    """
    # left/right are the inclusive bounds of the range still worth searching
    left, right = 0, len(list) - 1

    while left <= right:
        middle = (left + right) // 2
        candidate = list[middle]
        if candidate == item:
            return middle
        if candidate > item:
            # Target, if present, lies in the lower half
            right = middle - 1
        else:
            # Target, if present, lies in the upper half
            left = middle + 1

    # Bounds crossed: the item is not in the sequence
    return None
23 |
24 |
25 | my_list = [1, 3, 5, 7, 9]
26 | print(binary_search(my_list, 3)) # => 1
27 |
28 | # None代表数列中没有目标元素
29 | print(binary_search(my_list, -1)) # => None
30 |
--------------------------------------------------------------------------------
/4.算法图解/2.选择排序/selection_sort.py:
--------------------------------------------------------------------------------
# Find the smallest element of a list
def findSmallest(arr):
    """Return the index of the smallest element of ``arr``.

    When the minimum occurs more than once, the first occurrence wins.

    Raises:
        IndexError: If ``arr`` is empty.
    """
    # Smallest value seen so far and where it was seen
    smallest = arr[0]
    smallest_index = 0
    for i in range(1, len(arr)):
        if arr[i] < smallest:
            smallest_index = i
            smallest = arr[i]
    return smallest_index


# Sort a list
def selectionSort(arr):
    """Return a new list with the elements of ``arr`` in ascending order.

    Works on a private copy, so the caller's list is left untouched. The
    previous version popped elements out of ``arr`` and returned with the
    argument emptied.
    """
    remaining = list(arr)
    newArr = []
    while remaining:
        # Move the current minimum from the working copy to the result
        newArr.append(remaining.pop(findSmallest(remaining)))
    return newArr
24 |
25 |
26 | # 选择排序从小到大排序数列
27 | print(selectionSort([5, 3, 6, 2, 10]))
28 | # [2, 3, 5, 6, 10]
29 |
--------------------------------------------------------------------------------
/4.算法图解/3.递归/1.countdown.py:
--------------------------------------------------------------------------------
def countdown(i):
    """Print ``i``, then every smaller value in steps of one until <= 0.

    The value is printed before the base-case check, so the terminating value
    (0 in the sample run) is printed too.
    """
    print(i)
    # Base case: nothing left to count
    if i <= 0:
        return
    # Recursive case: continue from the next lower number
    countdown(i - 1)
9 |
10 |
11 | countdown(5)
12 | '''
13 | 5
14 | 4
15 | 3
16 | 2
17 | 1
18 | 0
19 | '''
--------------------------------------------------------------------------------
/4.算法图解/3.递归/2.greet.py:
--------------------------------------------------------------------------------
def greet2(name):
    """Print the follow-up question addressed to ``name``."""
    # print() joins its arguments with a space, hence the double space here
    print("how are you,  %s ?" % (name,))


def bye():
    """Print the farewell line."""
    print("ok bye!")


def greet(name):
    """Greet ``name``, ask how they are, then say goodbye.

    Shows nested calls: ``greet`` is suspended while ``greet2`` and ``bye``
    each run to completion.
    """
    print("hello,  %s !" % (name,))
    greet2(name)
    print("getting ready to say bye...")
    bye()
14 |
15 |
16 | greet("adit")
17 | '''
18 | hello, adit !
19 | how are you, adit ?
20 | getting ready to say bye...
21 | ok bye!
22 | '''
23 |
--------------------------------------------------------------------------------
/4.算法图解/3.递归/3.factorial.py:
--------------------------------------------------------------------------------
def fact(x):
    """Return x! computed recursively.

    Args:
        x: Non-negative number; 0! and 1! are both 1.

    Raises:
        ValueError: If ``x`` is negative. The old base case ``x == 1`` was
            never reached for 0 or negative input, so the recursion ran until
            the interpreter raised RecursionError.
    """
    if x < 0:
        raise ValueError("factorial is undefined for negative numbers")
    # Base case covers both 0 and 1
    if x <= 1:
        return 1
    # Recursive case
    return x * fact(x - 1)


print(fact(5))  # 120
9 |
--------------------------------------------------------------------------------
/4.算法图解/4.快速排序/quick_sort.py:
--------------------------------------------------------------------------------
def quicksort(array):
    """Return the elements of ``array`` in ascending order.

    The first element is the pivot and the rest is split around it. Inputs
    with fewer than two elements are returned as they are.
    """
    # Base case: an empty or single-element list is already sorted
    if len(array) < 2:
        return array

    # Recursive case: the first element of each sub-list is the pivot
    pivot, rest = array[0], array[1:]
    # Everything less than or equal to the pivot
    not_larger = [value for value in rest if value <= pivot]
    # Everything strictly greater than the pivot
    larger = [value for value in rest if value > pivot]
    return quicksort(not_larger) + [pivot] + quicksort(larger)
13 |
14 |
15 | print(quicksort([10, 5, 2, 3])) # [2, 3, 5, 10]
16 |
--------------------------------------------------------------------------------
/4.算法图解/5.散列表/1.dict.py:
--------------------------------------------------------------------------------
# Hash table mapping each grocery item to its price
book = dict(apple=0.67, milk=1.49, avocado=1.49)

print(book)  # {'apple': 0.67, 'milk': 1.49, 'avocado': 1.49}
4 |
--------------------------------------------------------------------------------
/4.算法图解/5.散列表/2.check_voter.py:
--------------------------------------------------------------------------------
# Hash table of everyone who has already voted
voted = {}


def check_voter(name):
    """Let ``name`` vote once; turn them away on any later attempt.

    A first-time voter is recorded in the module-level ``voted`` table.
    """
    already_voted = voted.get(name)
    if not already_voted:
        voted[name] = True
        print("let them vote!")
        return
    print("kick them out!")


check_voter("tom")  # let them vote!
check_voter("mike")  # let them vote!
check_voter("mike")  # kick them out!
15 |
--------------------------------------------------------------------------------
/4.算法图解/6.广度优先搜索/breadth-first_search.py:
--------------------------------------------------------------------------------
1 | from collections import deque
2 |
3 |
# Decide whether a person is the one we are looking for
def person_is_seller(name):
    """Return True when ``name`` ends with 'm' (the toy "mango seller" rule).

    Uses ``endswith`` so an empty name is simply not a seller instead of
    raising IndexError.
    """
    return name.endswith('m')


# Hash table holding the graph: each person maps to their direct neighbours
graph = {}
graph["you"] = ["alice", "bob", "claire"]
graph["bob"] = ["anuj", "peggy"]
graph["alice"] = ["peggy"]
graph["claire"] = ["thom", "jonny"]
graph["anuj"] = []
graph["peggy"] = []
graph["thom"] = []
graph["jonny"] = []


# Breadth-first search
def search(name):
    """Breadth-first search outward from ``name`` for a mango seller.

    Args:
        name: Starting node; must be a key of ``graph``.

    Returns:
        True as soon as a seller is found (their name is printed), False when
        every reachable person has been checked without finding one.

    Raises:
        KeyError: If ``name`` itself is not in ``graph``.
    """
    # Start the queue with the direct neighbours of the starting person
    search_queue = deque()
    search_queue += graph[name]
    # People already examined; a set keeps the membership test O(1)
    searched = set()
    while search_queue:
        # Take the person at the head of the queue
        person = search_queue.popleft()
        # Skip anyone already examined, which also prevents looping on cycles
        if person in searched:
            continue
        if person_is_seller(person):
            print(person + " is a mango seller!")
            return True
        # Not a seller: queue their neighbours. A person with no entry in the
        # graph is treated as having no neighbours.
        search_queue += graph.get(person, [])
        searched.add(person)
    return False


search("you")  # thom is a mango seller!
47 |
--------------------------------------------------------------------------------
/4.算法图解/7.狄杰斯特拉算法/dijkstras_algorithm.py:
--------------------------------------------------------------------------------
# Weighted graph as nested hash tables: graph[node][neighbor] = edge weight
graph = {
    "start": {
        "a": 6,  # start -> a costs 6
        "b": 2,  # start -> b costs 2
    },
    "a": {
        "fin": 1,  # a -> fin costs 1
    },
    "b": {
        "a": 3,  # b -> a costs 3
        "fin": 5,  # b -> fin costs 5
    },
    "fin": {},  # the finish node has no outgoing edges
}

# Cheapest known cost from start to each node
infinity = float("inf")
costs = {
    "a": 6,
    "b": 2,
    "fin": infinity,  # not reachable yet as far as we know
}

# Parent of each node on its cheapest known path
parents = {
    "a": "start",
    "b": "start",
    "fin": None,  # unknown until some path reaches it
}

# Nodes whose outgoing edges have already been relaxed; a set keeps the
# membership test O(1)
processed = set()


# Pick the cheapest node that has not been processed yet
def find_lowest_cost_node(costs):
    """Return the unprocessed node with the lowest finite cost.

    Args:
        costs: Mapping of node -> current best cost from start.

    Returns:
        The node name, or None when every node is processed or the remaining
        ones cost infinity. Ties go to the node that comes first in ``costs``.
    """
    lowest_cost = float("inf")
    lowest_cost_node = None
    for node in costs:
        cost = costs[node]
        if cost < lowest_cost and node not in processed:
            lowest_cost = cost
            lowest_cost_node = node
    return lowest_cost_node


# Start from the cheapest unprocessed node
node = find_lowest_cost_node(costs)
# Stop once every node has been processed
while node is not None:
    cost = costs[node]
    neighbors = graph[node]
    for n in neighbors.keys():
        # Cost of reaching this neighbour by going through the current node
        new_cost = cost + neighbors[n]
        if costs[n] > new_cost:
            # Cheaper route found: record the new cost and the new parent
            costs[n] = new_cost
            parents[n] = node
    # The current node is done
    processed.add(node)
    # Move on to the next cheapest unprocessed node
    node = find_lowest_cost_node(costs)

print("Cost from the start to each node:")
print(costs)  # {'a': 5, 'b': 2, 'fin': 6}
77 |
--------------------------------------------------------------------------------
/4.算法图解/8.贪婪算法/set_covering.py:
--------------------------------------------------------------------------------
# Radio station set-covering problem, solved greedily

# States that must end up covered
states_needed = {"mt", "wa", "or", "id", "nv", "ut", "ca", "az"}

# States reached by each station
stations = {}
stations["kone"] = {"id", "nv", "ut"}
stations["ktwo"] = {"wa", "id", "mt"}
stations["kthree"] = {"or", "nv", "ca"}
stations["kfour"] = {"nv", "ut"}
stations["kfive"] = {"ca", "az"}

# Stations chosen so far
final_stations = set()

# Keep choosing while some needed state is still uncovered
while states_needed:
    # Station covering the most still-uncovered states in this round
    best_station = None
    # The still-uncovered states that station would cover
    states_covered = set()
    for station, states in stations.items():
        # Needed states this station can actually reach
        covered = states_needed & states
        # Strictly greater, so ties keep the earlier station
        if len(covered) > len(states_covered):
            best_station = station
            states_covered = covered
    if best_station is None:
        # No station reaches any remaining state. Without this guard the loop
        # would spin forever and add None to the result.
        raise ValueError(
            "no station covers the remaining states: {}".format(sorted(states_needed))
        )
    # Those states are now covered
    states_needed -= states_covered
    final_stations.add(best_station)

print(final_stations)  # {'kfive', 'kthree', 'kone', 'ktwo'}
39 |
--------------------------------------------------------------------------------
/4.算法图解/9.动态规划/longest_common_subsequence.py:
--------------------------------------------------------------------------------
1 | if word_a[i] == word_b[j]:
2 | # The letters match.
3 | cell[i][j] = cell[i - 1][j - 1] + 1
4 | else:
5 | # The letters don't match.
6 | cell[i][j] = max(cell[i - 1][j], cell[i][j - 1])
7 |
--------------------------------------------------------------------------------
/PythonCourses.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Python Courses
2 | Lesson |Content
3 | |:-:|:-|
4 | |[lesson01](1.Python编程[从入门到实战]/lesson01) |[位运算](1.Python编程[从入门到实战]/lesson01/%E4%BD%8D%E8%BD%AC%E6%8D%A2.py)
[加法运算性能比较](1.Python编程[从入门到实战]/lesson01/%E5%8A%A0%E6%B3%95%E8%BF%90%E7%AE%97%E6%80%A7%E8%83%BD%E6%AF%94%E8%BE%83.py)
[基本运算符](1.Python编程[从入门到实战]/lesson01/%E5%9F%BA%E6%9C%AC%E8%BF%90%E7%AE%97%E7%AC%A6.py)
[异常_注释_多行](1.Python编程[从入门到实战]/lesson01/%E5%BC%82%E5%B8%B8_%E6%B3%A8%E9%87%8A_%E5%A4%9A%E8%A1%8C.py)
[简易爬虫](1.Python编程[从入门到实战]/lesson01/%E7%AE%80%E6%98%93%E7%88%AC%E8%99%AB.py)|
5 | |[lesson02](1.Python编程[从入门到实战]/lesson02)|[函数](1.Python编程[从入门到实战]/lesson02/%E5%87%BD%E6%95%B0.py)
[判定_循环](1.Python编程[从入门到实战]/lesson02/%E5%88%A4%E5%AE%9A_%E5%BE%AA%E7%8E%AF.py)
[字符串](1.Python编程[从入门到实战]/lesson02/%E5%AD%97%E7%AC%A6%E4%B8%B2.py)
[数据类型](1.Python编程[从入门到实战]/lesson02/%E6%95%B0%E6%8D%AE%E7%B1%BB%E5%9E%8B.py)
[汉诺塔问题](1.Python编程[从入门到实战]/lesson02/%E6%B1%89%E8%AF%BA%E5%A1%94%E9%97%AE%E9%A2%98.py)|
6 | |[lesson03](1.Python编程[从入门到实战]/lesson03)|[数组_元组](1.Python编程[从入门到实战]/lesson03/list_tuple.py)
[反转语句](1.Python编程[从入门到实战]/lesson03/reverse_by_word.py)
[数组特殊创建](1.Python编程[从入门到实战]/lesson03/comprehension.py)
[集合](1.Python编程[从入门到实战]/lesson03/set.py)
[寻找目标加和数值对](1.Python编程[从入门到实战]/lesson03/two_sum.py)
[集合](1.Python编程[从入门到实战]/lesson03/set.py)
[数组切片](1.Python编程[从入门到实战]/lesson03/slice.py)
[生成器](1.Python编程[从入门到实战]/lesson03/generator.py)
[迭代器](1.Python编程[从入门到实战]/lesson03/iter.py)|
7 | |[lesson04](1.Python编程[从入门到实战]/lesson04)|[面向对象](1.Python编程[从入门到实战]/lesson04/%E9%9D%A2%E5%90%91%E5%AF%B9%E8%B1%A1.py)|
8 | |[lesson05](1.Python编程[从入门到实战]/lesson05)|[文件访问](1.Python编程[从入门到实战]/lesson05/%E6%96%87%E4%BB%B6%E8%AE%BF%E9%97%AE.py)
[函数式编程](1.Python编程[从入门到实战]/lesson05/%E5%87%BD%E6%95%B0%E5%BC%8F%E7%BC%96%E7%A8%8B.py)|
9 | |[lesson06](1.Python编程[从入门到实战]/lesson06)|[call回调函数](1.Python编程[从入门到实战]/lesson06/call.py)
[enum匹配数组](1.Python编程[从入门到实战]/lesson06/enum.py)
[exception异常](1.Python编程[从入门到实战]/lesson06/exception.py)
[getitem获取元素](1.Python编程[从入门到实战]/lesson06/getitem.py)
[iter递归](1.Python编程[从入门到实战]/lesson06/iter.py)
[meta动态添加方法](1.Python编程[从入门到实战]/lesson06/meta.py)
[orm异步数据库处理](1.Python编程[从入门到实战]/lesson06/orm.py)
[property属性](1.Python编程[从入门到实战]/lesson06/property.py)
[property描述器](1.Python编程[从入门到实战]/lesson06/property_imp.py)
[slots动态添加属性](1.Python编程[从入门到实战]/lesson06/slots.py)
[重写__str__函数](1.Python编程[从入门到实战]/lesson06/str.py)
[type动态创建方法](1.Python编程[从入门到实战]/lesson06/type.py)
[unittest单元测试](1.Python编程[从入门到实战]/lesson06/unittest.py)|
10 |
--------------------------------------------------------------------------------