作者:[Chris Albon](https://chrisalbon.com/)
4 | >
5 | > 译者:[飞龙](https://github.com/wizardforcel)
6 | >
7 | > 协议:[CC BY-NC-SA 4.0](http://creativecommons.org/licenses/by-nc-sa/4.0/)
8 |
9 | ## 转置矩阵或向量
10 |
11 | ```py
12 | # 加载库
13 | import numpy as np
14 |
15 | # 创建向量
16 | vector = np.array([1, 2, 3, 4, 5, 6])
17 |
18 | # 创建矩阵
19 | matrix = np.array([[1, 2, 3],
20 | [4, 5, 6],
21 | [7, 8, 9]])
22 |
23 | # 转置向量
24 | vector.T
25 |
26 | # array([1, 2, 3, 4, 5, 6])
27 |
28 | # 转置矩阵
29 | matrix.T
30 |
31 | '''
32 | array([[1, 4, 7],
33 | [2, 5, 8],
34 | [3, 6, 9]])
35 | '''
36 | ```
37 |
38 | ## 选择数组中的元素
39 |
40 | ```py
41 | # 加载库
42 | import numpy as np
43 |
44 | # 创建行向量
45 | vector = np.array([1, 2, 3, 4, 5, 6])
46 |
47 | # 选择第二个元素
48 | vector[1]
49 |
50 | # 2
51 |
52 | # 创建矩阵
53 | matrix = np.array([[1, 2, 3],
54 | [4, 5, 6],
55 | [7, 8, 9]])
56 |
57 | # 选择第二行第二列
58 | matrix[1,1]
59 |
60 | # 5
61 |
62 | # 创建矩阵
63 | tensor = np.array([
64 | [[[1, 1], [1, 1]], [[2, 2], [2, 2]]],
65 | [[[3, 3], [3, 3]], [[4, 4], [4, 4]]]
66 | ])
67 |
68 | # 选择每个维度的第二个元素
69 | tensor[1,1,1]
70 |
71 | # array([4, 4])
72 | ```
73 |
74 | ## 数组变形
75 |
76 | ```py
77 | # 加载库
78 | import numpy as np
79 |
80 | # 创建 4x3 矩阵
81 | matrix = np.array([[1, 2, 3],
82 | [4, 5, 6],
83 | [7, 8, 9],
84 | [10, 11, 12]])
85 |
86 | # 将矩阵变形为 2x6 矩阵
87 | matrix.reshape(2, 6)
88 |
89 | '''
90 | array([[ 1, 2, 3, 4, 5, 6],
91 | [ 7, 8, 9, 10, 11, 12]])
92 | '''
93 | ```
94 |
95 | ## 矩阵的逆
96 |
97 | ```py
98 | # 加载库
99 | import numpy as np
100 |
101 | # 创建矩阵
102 | matrix = np.array([[1, 4],
103 | [2, 5]])
104 |
105 | # 计算矩阵的逆
106 | np.linalg.inv(matrix)
107 |
108 | '''
109 | array([[-1.66666667, 1.33333333],
110 | [ 0.66666667, -0.33333333]])
111 | '''
112 | ```
113 |
114 | ## 获取矩阵对角线
115 |
116 | ```py
117 | # 加载库
118 | import numpy as np
119 |
120 | # 创建矩阵
121 | matrix = np.array([[1, 2, 3],
122 | [4, 5, 6],
123 | [7, 8, 9]])
124 |
125 | # 返回对角线元素
126 | matrix.diagonal()
127 |
128 | # array([1, 5, 9])
129 |
130 | # 计算矩阵的迹
131 | matrix.diagonal().sum()
132 |
133 | # 15
134 | ```
135 |
136 | ## 展开矩阵
137 |
138 | ```py
139 | # 加载库
140 | import numpy as np
141 |
142 | # 创建矩阵
143 | matrix = np.array([[1, 2, 3],
144 | [4, 5, 6],
145 | [7, 8, 9]])
146 |
147 | # 展开矩阵
148 | matrix.flatten()
149 |
150 | # array([1, 2, 3, 4, 5, 6, 7, 8, 9])
151 | ```
152 |
153 | ## 寻找矩阵的秩
154 |
155 | ```py
156 | # 加载库
157 | import numpy as np
158 |
159 | # 创建矩阵
160 | matrix = np.array([[1, 2, 3],
161 | [4, 5, 6],
162 | [7, 8, 9]])
163 |
164 | # 返回矩阵的秩
165 | np.linalg.matrix_rank(matrix)
166 |
167 | # 2
168 | ```
169 |
170 | ## 寻找最大值和最小值
171 |
172 | ```py
173 | # 加载库
174 | import numpy as np
175 |
176 | # 创建矩阵
177 | matrix = np.array([[1, 2, 3],
178 | [4, 5, 6],
179 | [7, 8, 9]])
180 |
181 | # 返回最大元素
182 | np.max(matrix)
183 |
184 | # 9
185 |
186 | # 返回最小元素
187 | np.min(matrix)
188 |
189 | # 1
190 |
191 | # 寻找每列的最大元素
192 | np.max(matrix, axis=0)
193 |
194 | # array([7, 8, 9])
195 |
196 | # 寻找每行的最大元素
197 | np.max(matrix, axis=1)
198 |
199 | # array([3, 6, 9])
200 | ```
201 |
202 | ## 描述数组
203 |
204 | ```py
205 | # 加载库
206 | import numpy as np
207 |
208 | # 创建矩阵
209 | matrix = np.array([[1, 2, 3, 4],
210 | [5, 6, 7, 8],
211 | [9, 10, 11, 12]])
212 |
213 | # 查看行和列数
214 | matrix.shape
215 |
216 | # (3, 4)
217 |
218 | # 查看元素数(行乘列)
219 | matrix.size
220 |
221 | # 12
222 |
223 | # 查看维数
224 | matrix.ndim
225 |
226 | # 2
227 | ```
228 |
229 | ## 创建向量
230 |
231 | ```py
232 | # 加载库
233 | import numpy as np
234 |
235 | # 创建行向量
236 | vector_row = np.array([1, 2, 3])
237 |
238 | # 创建列向量
239 | vector_column = np.array([[1],
240 | [2],
241 | [3]])
242 | ```
243 |
244 | ## 创建稀疏矩阵
245 |
246 | ```py
247 | # 加载库
248 | import numpy as np
249 | from scipy import sparse
250 |
251 | # 创建矩阵
252 | matrix = np.array([[0, 0],
253 | [0, 1],
254 | [3, 0]])
255 |
256 | # 创建压缩稀疏行(CSR)矩阵
257 | matrix_sparse = sparse.csr_matrix(matrix)
258 | ```
259 |
260 | 注意:有许多类型的稀疏矩阵。 在上面的示例中,我们使用 CSR,但我们使用的类型应该反映我们的用例。
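
作为补充,下面是一个小示例(沿用上面的`matrix`变量,仅作示意),展示如何创建压缩稀疏列(CSC)和坐标格式(COO)矩阵,以及如何转换回稠密数组;`scipy.sparse`中还有`lil_matrix`等其他格式可选。

```py
# 加载库
import numpy as np
from scipy import sparse

# 创建矩阵
matrix = np.array([[0, 0],
                   [0, 1],
                   [3, 0]])

# 创建压缩稀疏列(CSC)矩阵
matrix_sparse_csc = sparse.csc_matrix(matrix)

# 创建坐标格式(COO)矩阵
matrix_sparse_coo = sparse.coo_matrix(matrix)

# 任何稀疏格式都可以转换回稠密数组
matrix_sparse_csc.toarray()

'''
array([[0, 0],
       [0, 1],
       [3, 0]])
'''
```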
261 |
262 | ## 创建矩阵
263 |
264 | ```py
265 | # 加载库
266 | import numpy as np
267 |
268 | # 创建矩阵
269 | matrix = np.array([[1, 4],
270 | [2, 5]])
271 | ```
272 |
273 | 注意 NumPy 的`mat`数据结构对于我们的目的而言不太灵活,应该避免。
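
下面的小示例(仅作示意)对比了`np.array`和`np.mat`两种创建方式;两者都支持转置等操作,但`np.array`与 NumPy 生态结合得更好,所以本书使用它。

```py
# 加载库
import numpy as np

# 推荐:使用二维 ndarray 表示矩阵
matrix_array = np.array([[1, 4],
                         [2, 5]])

# 不推荐:使用 np.mat 创建 matrix 对象
matrix_mat = np.mat([[1, 4],
                     [2, 5]])

# 两者都支持转置等操作
matrix_array.T

'''
array([[1, 2],
       [4, 5]])
'''
```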
274 |
275 | ## 将字典转换为矩阵
276 |
277 | ```py
278 | # 加载库
279 | from sklearn.feature_extraction import DictVectorizer
280 |
281 | # 我们的数据字典
282 | data_dict = [{'Red': 2, 'Blue': 4},
283 | {'Red': 4, 'Blue': 3},
284 | {'Red': 1, 'Yellow': 2},
285 | {'Red': 2, 'Yellow': 2}]
286 |
287 | # 创建 DictVectorizer 对象
288 | dictvectorizer = DictVectorizer(sparse=False)
289 |
290 | # 将字典转换为特征矩阵
291 | features = dictvectorizer.fit_transform(data_dict)
292 |
293 | # 查看特征矩阵
294 | features
295 |
296 | '''
297 | array([[ 4., 2., 0.],
298 | [ 3., 4., 0.],
299 | [ 0., 1., 2.],
300 | [ 0., 2., 2.]])
301 | '''
302 |
303 | # 查看特征矩阵的列名
304 | dictvectorizer.get_feature_names()
305 |
306 | # ['Blue', 'Red', 'Yellow']
307 | ```
308 |
309 | ## 计算矩阵的迹
310 |
311 | ```py
312 | # 加载库
313 | import numpy as np
314 |
315 | # 创建矩阵
316 | matrix = np.array([[1, 2, 3],
317 | [4, 5, 6],
318 | [7, 8, 9]])
319 |
320 | # 计算矩阵的迹
321 | matrix.diagonal().sum()
322 |
323 | # 15
324 | ```
325 |
326 | ## 计算矩阵的行列式
327 |
328 | ```py
329 | # 加载库
330 | import numpy as np
331 |
332 | # 创建矩阵
333 | matrix = np.array([[1, 2, 3],
334 | [4, 5, 6],
335 | [7, 8, 9]])
336 |
337 | # 返回矩阵的行列式
338 | np.linalg.det(matrix)
339 |
340 | # -9.5161973539299405e-16
341 | ```
342 |
343 | ## 计算均值、方差和标准差
344 |
345 | ```py
346 | # 加载库
347 | import numpy as np
348 |
349 | # 创建矩阵
350 | matrix = np.array([[1, 2, 3],
351 | [4, 5, 6],
352 | [7, 8, 9]])
353 |
354 | # 返回均值
355 | np.mean(matrix)
356 |
357 | # 5.0
358 |
359 | # 返回方差
360 | np.var(matrix)
361 |
362 | # 6.666666666666667
363 |
364 | # 返回标准差
365 | np.std(matrix)
366 |
367 | # 2.5819888974716112
368 | ```
369 |
370 | ## 计算两个向量的点积
371 |
372 | ```py
373 | # 加载库
374 | import numpy as np
375 |
376 | # 创建两个向量
377 | vector_a = np.array([1,2,3])
378 | vector_b = np.array([4,5,6])
379 |
380 | # 计算点积
381 | np.dot(vector_a, vector_b)
382 |
383 | # 32
384 |
385 | # 计算点积
386 | vector_a @ vector_b
387 |
388 | # 32
389 | ```
390 |
391 | ## 对元素应用操作
392 |
393 | ```py
394 | # 加载库
395 | import numpy as np
396 |
397 | # 创建矩阵
398 | matrix = np.array([[1, 2, 3],
399 | [4, 5, 6],
400 | [7, 8, 9]])
401 |
402 | # 创建加上 100 的函数
403 | add_100 = lambda i: i + 100
404 |
405 | # 创建向量化函数
406 | vectorized_add_100 = np.vectorize(add_100)
407 |
408 | # 对矩阵的所有元素应用函数
409 | vectorized_add_100(matrix)
410 |
411 | '''
412 | array([[101, 102, 103],
413 | [104, 105, 106],
414 | [107, 108, 109]])
415 | '''
416 | ```
417 |
418 | ## 矩阵的加和减
419 |
420 | ```py
421 | # 加载库
422 | import numpy as np
423 |
424 | # 创建矩阵
425 | matrix_a = np.array([[1, 1, 1],
426 | [1, 1, 1],
427 | [1, 1, 2]])
428 |
429 | # 创建矩阵
430 | matrix_b = np.array([[1, 3, 1],
431 | [1, 3, 1],
432 | [1, 3, 8]])
433 |
434 | # 将两个矩阵相加
435 | np.add(matrix_a, matrix_b)
436 |
437 | '''
438 | array([[ 2, 4, 2],
439 | [ 2, 4, 2],
440 | [ 2, 4, 10]])
441 | '''
442 |
443 | # 将两个矩阵相减
444 | np.subtract(matrix_a, matrix_b)
445 |
446 | '''
447 | array([[ 0, -2, 0],
448 | [ 0, -2, 0],
449 | [ 0, -2, -6]])
450 | '''
451 | ```
452 |
--------------------------------------------------------------------------------
/5.md:
--------------------------------------------------------------------------------
1 | # 五、文本预处理
2 |
3 | > 作者:[Chris Albon](https://chrisalbon.com/)
4 | >
5 | > 译者:[飞龙](https://github.com/wizardforcel)
6 | >
7 | > 协议:[CC BY-NC-SA 4.0](http://creativecommons.org/licenses/by-nc-sa/4.0/)
8 |
9 | ## 词袋
10 |
11 | 
12 |
13 | ```py
14 | # 加载库
15 | import numpy as np
16 | from sklearn.feature_extraction.text import CountVectorizer
17 | import pandas as pd
18 |
19 | # 创建文本
20 | text_data = np.array(['I love Brazil. Brazil!',
21 | 'Sweden is best',
22 | 'Germany beats both'])
23 |
24 | # 创建词袋特征矩阵
25 | count = CountVectorizer()
26 | bag_of_words = count.fit_transform(text_data)
27 |
28 | # 展示特征矩阵
29 | bag_of_words.toarray()
30 |
31 | '''
32 | array([[0, 0, 0, 2, 0, 0, 1, 0],
33 | [0, 1, 0, 0, 0, 1, 0, 1],
34 | [1, 0, 1, 0, 1, 0, 0, 0]], dtype=int64)
35 | '''
36 |
37 | # 获取特征名称
38 | feature_names = count.get_feature_names()
39 |
40 | # 查看特征名称
41 | feature_names
42 |
43 | # ['beats', 'best', 'both', 'brazil', 'germany', 'is', 'love', 'sweden']
44 |
45 | # 创建数据帧
46 | pd.DataFrame(bag_of_words.toarray(), columns=feature_names)
47 | ```
48 |
49 | | | beats | best | both | brazil | germany | is | love | sweden |
50 | | --- | --- | --- | --- | --- | --- | --- | --- | --- |
51 | | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 1 | 0 |
52 | | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 |
53 | | 2 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 |
54 |
55 | ## 解析 HTML
56 |
57 | ```py
58 | # 加载库
59 | from bs4 import BeautifulSoup
60 |
61 | # 创建一些 HTML 代码
62 | html = "Masego Azra
"
63 |
64 | # 解析 html
65 | soup = BeautifulSoup(html, "lxml")
66 |
67 | # 寻找带有 "full_name" 类的 <div>,展示文本
68 | soup.find("div", { "class" : "full_name" }).text
69 |
70 | # 'Masego Azra'
71 | ```
72 |
73 | ## 移除标点
74 |
75 | ```py
76 | # 加载库
77 | import string
78 | import numpy as np
79 |
80 | # 创建文本
81 | text_data = ['Hi!!!! I. Love. This. Song....',
82 | '10000% Agree!!!! #LoveIT',
83 | 'Right?!?!']
84 |
85 | # 创建函数,使用 string.punctuation 移除所有标点
86 | def remove_punctuation(sentence: str) -> str:
87 | return sentence.translate(str.maketrans('', '', string.punctuation))
88 |
89 | # 应用函数
90 | [remove_punctuation(sentence) for sentence in text_data]
91 |
92 | # ['Hi I Love This Song', '10000 Agree LoveIT', 'Right']
93 | ```
94 |
95 | ## 移除停止词
96 |
97 | ```py
98 | # 加载库
99 | from nltk.corpus import stopwords
100 |
101 | # 你第一次需要下载停止词的集合
102 | import nltk
103 | nltk.download('stopwords')
104 |
105 | '''
106 | [nltk_data] Downloading package stopwords to
107 | [nltk_data] /Users/chrisalbon/nltk_data...
108 | [nltk_data] Package stopwords is already up-to-date!
109 |
110 | True
111 | '''
112 |
113 | # 创建单词标记
114 | tokenized_words = ['i', 'am', 'going', 'to', 'go', 'to', 'the', 'store', 'and', 'park']
115 |
116 | # 加载停止词
117 | stop_words = stopwords.words('english')
118 |
119 | # 展示停止词
120 | stop_words[:5]
121 |
122 | # ['i', 'me', 'my', 'myself', 'we']
123 |
124 | # 移除停止词
125 | [word for word in tokenized_words if word not in stop_words]
126 |
127 | # ['going', 'go', 'store', 'park']
128 | ```
129 |
130 | ## 替换字符
131 |
132 | ```py
133 | # 导入库
134 | import re
135 |
136 | # 创建文本
137 | text_data = ['Interrobang. By Aishwarya Henriette',
138 | 'Parking And Going. By Karl Gautier',
139 | 'Today Is The night. By Jarek Prakash']
140 |
141 | # 移除句号
142 | remove_periods = [string.replace('.', '') for string in text_data]
143 |
144 | # 展示文本
145 | remove_periods
146 |
147 | '''
148 | ['Interrobang By Aishwarya Henriette',
149 | 'Parking And Going By Karl Gautier',
150 | 'Today Is The night By Jarek Prakash']
151 | '''
152 |
153 | # 创建函数
154 | def replace_letters_with_X(string: str) -> str:
155 | return re.sub(r'[a-zA-Z]', 'X', string)
156 |
157 | # 应用函数
158 | [replace_letters_with_X(string) for string in remove_periods]
159 |
160 | '''
161 | ['XXXXXXXXXXX XX XXXXXXXXX XXXXXXXXX',
162 | 'XXXXXXX XXX XXXXX XX XXXX XXXXXXX',
163 | 'XXXXX XX XXX XXXXX XX XXXXX XXXXXXX']
164 | '''
165 | ```
166 |
167 | ## 词干提取
168 |
169 | 
170 |
171 | ```py
172 | # 加载库
173 | from nltk.stem.porter import PorterStemmer
174 |
175 | # 创建单词标记
176 | tokenized_words = ['i', 'am', 'humbled', 'by', 'this', 'traditional', 'meeting']
177 | ```
178 |
179 | 词干提取通过识别和删除词缀(例如动名词)同时保持词的根本意义,将词语简化为词干。 NLTK 的`PorterStemmer`实现了广泛使用的 Porter 词干算法。
180 |
181 | ```py
182 | # 创建提取器
183 | porter = PorterStemmer()
184 |
185 | # 应用提取器
186 | [porter.stem(word) for word in tokenized_words]
187 |
188 | # ['i', 'am', 'humbl', 'by', 'thi', 'tradit', 'meet']
189 | ```
190 |
191 | ## 移除空白
192 |
193 | ```py
194 | # 创建文本
195 | text_data = [' Interrobang. By Aishwarya Henriette ',
196 | 'Parking And Going. By Karl Gautier',
197 | ' Today Is The night. By Jarek Prakash ']
198 |
199 | # 移除空白
200 | strip_whitespace = [string.strip() for string in text_data]
201 |
202 | # 展示文本
203 | strip_whitespace
204 |
205 | '''
206 | ['Interrobang. By Aishwarya Henriette',
207 | 'Parking And Going. By Karl Gautier',
208 | 'Today Is The night. By Jarek Prakash']
209 | '''
210 | ```
211 |
212 | ## 词性标签
213 |
214 | ```py
215 | # 加载库
216 | from nltk import pos_tag
217 | from nltk import word_tokenize
218 |
219 | # 创建文本
220 | text_data = "Chris loved outdoor running"
221 |
222 | # 使用预训练的词性标注器
223 | text_tagged = pos_tag(word_tokenize(text_data))
224 |
225 | # 展示词性
226 | text_tagged
227 |
228 | # [('Chris', 'NNP'), ('loved', 'VBD'), ('outdoor', 'RP'), ('running', 'VBG')]
229 | ```
230 |
231 | 输出是一个元组列表,包含单词和词性的标记。 NLTK 使用 Penn Treebank 词性标签。
232 |
233 | | 标签 | 词性 |
234 | | --- | --- |
235 | | NNP | 专有名词,单数 |
236 | | NN | 名词,单数或集体 |
237 | | RB | 副词 |
238 | | VBD | 动词,过去式 |
239 | | VBG | 动词,动名词或现在分词 |
240 | | JJ | 形容词 |
241 | | PRP | 人称代词 |
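
得到标签后,一个常见用法是按词性过滤单词。下面是一个小示例(仅作示意,复用上面的句子),筛选出标签以`NN`开头的名词:

```py
# 加载库
from nltk import pos_tag
from nltk import word_tokenize

# 创建文本并标注词性
text_tagged = pos_tag(word_tokenize("Chris loved outdoor running"))

# 筛选出标签以 NN 开头的单词(名词)
[word for word, tag in text_tagged if tag.startswith('NN')]

# ['Chris']
```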
242 |
243 | ## TF-IDF
244 |
245 | 
246 |
247 | ```py
248 | # 加载库
249 | import numpy as np
250 | from sklearn.feature_extraction.text import TfidfVectorizer
251 | import pandas as pd
252 |
253 | # 创建文本
254 | text_data = np.array(['I love Brazil. Brazil!',
255 | 'Sweden is best',
256 | 'Germany beats both'])
257 |
258 | # 创建 tf-idf 特征矩阵
259 | tfidf = TfidfVectorizer()
260 | feature_matrix = tfidf.fit_transform(text_data)
261 |
262 | # 展示 tf-idf 特征矩阵
263 | feature_matrix.toarray()
264 |
265 | '''
266 | array([[ 0. , 0. , 0. , 0.89442719, 0. ,
267 | 0. , 0.4472136 , 0. ],
268 | [ 0. , 0.57735027, 0. , 0. , 0. ,
269 | 0.57735027, 0. , 0.57735027],
270 | [ 0.57735027, 0. , 0.57735027, 0. , 0.57735027,
271 | 0. , 0. , 0. ]])
272 | '''
273 |
274 | # 展示 tf-idf 特征矩阵
275 | tfidf.get_feature_names()
276 |
277 | # ['beats', 'best', 'both', 'brazil', 'germany', 'is', 'love', 'sweden']
278 |
279 | # 创建数据帧
280 | pd.DataFrame(feature_matrix.toarray(), columns=tfidf.get_feature_names())
281 | ```
282 |
283 | | | beats | best | both | brazil | germany | is | love | sweden |
284 | | --- | --- | --- | --- | --- | --- | --- | --- | --- |
285 | | 0 | 0.00000 | 0.00000 | 0.00000 | 0.894427 | 0.00000 | 0.00000 | 0.447214 | 0.00000 |
286 | | 1 | 0.00000 | 0.57735 | 0.00000 | 0.000000 | 0.00000 | 0.57735 | 0.000000 | 0.57735 |
287 | | 2 | 0.57735 | 0.00000 | 0.57735 | 0.000000 | 0.57735 | 0.00000 | 0.000000 | 0.00000 |
288 |
289 | ## 文本分词
290 |
291 | ```py
292 | # 加载库
293 | from nltk.tokenize import word_tokenize, sent_tokenize
294 |
295 | # 创建文本
296 | string = "The science of today is the technology of tomorrow. Tomorrow is today."
297 |
298 | # 对文本分词
299 | word_tokenize(string)
300 |
301 | '''
302 | ['The',
303 | 'science',
304 | 'of',
305 | 'today',
306 | 'is',
307 | 'the',
308 | 'technology',
309 | 'of',
310 | 'tomorrow',
311 | '.',
312 | 'Tomorrow',
313 | 'is',
314 | 'today',
315 | '.']
316 | '''
317 |
318 | # 对句子分词
319 | sent_tokenize(string)
320 |
321 | # ['The science of today is the technology of tomorrow.', 'Tomorrow is today.']
322 | ```
323 |
--------------------------------------------------------------------------------
/12.md:
--------------------------------------------------------------------------------
1 | # 十二、逻辑回归
2 |
3 | > 作者:[Chris Albon](https://chrisalbon.com/)
4 | >
5 | > 译者:[飞龙](https://github.com/wizardforcel)
6 | >
7 | > 协议:[CC BY-NC-SA 4.0](http://creativecommons.org/licenses/by-nc-sa/4.0/)
8 |
9 | ## C 超参数快速调优
10 |
11 | 有时,学习算法的特征使我们能够比蛮力或随机模型搜索方法更快地搜索最佳超参数。
12 |
13 | scikit-learn 的`LogisticRegressionCV`方法包含一个参数`Cs`。 如果提供一个列表,`Cs`就是可供选择的候选超参数值。 如果提供一个整数,则会从 0.0001 到 10000 之间的对数标度(`C`的合理取值范围)中生成相应数量的候选值。
14 |
15 | ```py
16 | # 加载库
17 | from sklearn import linear_model, datasets
18 |
19 | # 加载数据
20 | iris = datasets.load_iris()
21 | X = iris.data
22 | y = iris.target
23 |
24 | # 创建逻辑回归的交叉验证
25 | clf = linear_model.LogisticRegressionCV(Cs=100)
26 |
27 | # 训练模型
28 | clf.fit(X, y)
29 |
30 | '''
31 | LogisticRegressionCV(Cs=100, class_weight=None, cv=None, dual=False,
32 | fit_intercept=True, intercept_scaling=1.0, max_iter=100,
33 | multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
34 | refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0)
35 | '''
36 | ```
37 |
38 | ## 在逻辑回归中处理不平衡类别
39 |
40 | 像 scikit-learn 中的许多其他学习算法一样,`LogisticRegression`带有处理不平衡类的内置方法。 如果我们有高度不平衡的类,并且在预处理期间没有解决它,我们可以选择使用`class_weight`参数来对类加权,确保我们拥有每个类的平衡组合。 具体来说,`balanced`参数会自动对类加权,与其频率成反比:
41 |
42 | $$w_j = \frac{n}{k n_j}$$
43 |
44 | 其中 $w_j$ 是类 $j$ 的权重,$n$ 是观测数,$n_j$ 是类 $j$ 中的观测数,$k$ 是类的总数。
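
为了直观理解这个公式,下面用 NumPy 手动计算一个假想的不平衡目标向量的`balanced`权重(仅作示意):

```py
# 加载库
import numpy as np

# 创建不平衡的目标向量:10 个类 0,90 个类 1
y = np.array([0] * 10 + [1] * 90)

# 观测总数 n 和类别总数 k
n = len(y)
k = len(np.unique(y))

# 按照 w_j = n / (k * n_j) 计算每个类的权重
{int(j): n / (k * np.sum(y == j)) for j in np.unique(y)}

# {0: 5.0, 1: 0.5555555555555556}
```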
45 |
46 | ```py
47 | # 加载库
48 | from sklearn.linear_model import LogisticRegression
49 | from sklearn import datasets
50 | from sklearn.preprocessing import StandardScaler
51 | import numpy as np
52 |
53 | # 加载数据
54 | iris = datasets.load_iris()
55 | X = iris.data
56 | y = iris.target
57 |
58 | # 通过移除前 40 个观测,使类高度不均衡
59 | X = X[40:,:]
60 | y = y[40:]
61 |
62 | # 创建目标向量,如果表示类别是否为 0
63 | y = np.where((y == 0), 0, 1)
64 |
65 | # 标准化特征
66 | scaler = StandardScaler()
67 | X_std = scaler.fit_transform(X)
68 |
69 | # 创建决策树分类器对象
70 | clf = LogisticRegression(random_state=0, class_weight='balanced')
71 |
72 | # 训练模型
73 | model = clf.fit(X_std, y)
74 | ```
75 |
76 | ## 逻辑回归
77 |
78 | 尽管其名称中存在“回归”,但逻辑回归实际上是广泛使用的二分类器(即,目标向量只有两个值)。 在逻辑回归中,线性模型(例如 $\beta_0 + \beta_1 x$)被包含在 logistic(也称为 sigmoid)函数 $\frac{1}{1 + e^{-z}}$ 中,满足:
79 |
80 | $$P(y_i = 1 \mid X) = \frac{1}{1 + e^{-(\beta_0 + \beta_1 x)}}$$
81 |
82 | 其中 $P(y_i = 1 \mid X)$ 是第 $i$ 个观测的目标值 $y_i$ 为 1 的概率,$X$ 是训练数据,$\beta_0$ 和 $\beta_1$ 是要学习的参数,$e$ 是自然常数。
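
为了说明这个函数如何把线性模型的输出压缩到 0 和 1 之间,下面用 NumPy 手动计算几个假想的 $\beta_0 + \beta_1 x$ 值对应的概率(仅作示意):

```py
# 加载库
import numpy as np

# 定义 sigmoid 函数
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

# 假想的线性模型输出 beta_0 + beta_1 * x
z = np.array([-2.0, 0.0, 2.0])

# 转换为 0 和 1 之间的概率
sigmoid(z)

# array([0.11920292, 0.5       , 0.88079708])
```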
83 |
84 | ```py
85 | # 加载库
86 | from sklearn.linear_model import LogisticRegression
87 | from sklearn import datasets
88 | from sklearn.preprocessing import StandardScaler
89 |
90 | # 加载只有两个类别的数据
91 | iris = datasets.load_iris()
92 | X = iris.data[:100,:]
93 | y = iris.target[:100]
94 |
95 | # 标准化特征
96 | scaler = StandardScaler()
97 | X_std = scaler.fit_transform(X)
98 |
99 | # 创建逻辑回归对象
100 | clf = LogisticRegression(random_state=0)
101 |
102 | # 训练模型
103 | model = clf.fit(X_std, y)
104 |
105 | # 创建新的观测
106 | new_observation = [[.5, .5, .5, .5]]
107 |
108 | # 预测类别
109 | model.predict(new_observation)
110 |
111 | # array([1])
112 |
113 | # 查看预测的概率
114 | model.predict_proba(new_observation)
115 |
116 | # array([[ 0.18823041, 0.81176959]])
117 | ```
118 |
119 | ## 大量数据上的逻辑回归
120 |
121 | scikit-learn 的`LogisticRegression`提供了许多用于训练逻辑回归的技术,称为求解器。 大多数情况下,scikit-learn 会自动为我们选择最佳求解器,或警告我们,你不能用求解器做一些事情。 但是,我们应该注意一个特殊情况。
122 |
123 | 虽然精确的解释超出了本书的范围,但随机平均梯度下降(SAG)使我们在数据非常大时,能比其他求解器更快地训练模型。 但是,它对特征的尺度也非常敏感,因此标准化我们的特征尤为重要。 我们可以通过设置`solver='sag'`,让学习算法使用这个求解器。
124 |
125 | ```py
126 | # 加载库
127 | from sklearn.linear_model import LogisticRegression
128 | from sklearn import datasets
129 | from sklearn.preprocessing import StandardScaler
130 |
131 | # 加载数据
132 | iris = datasets.load_iris()
133 | X = iris.data
134 | y = iris.target
135 |
136 | # 标准化特征
137 | scaler = StandardScaler()
138 | X_std = scaler.fit_transform(X)
139 |
140 | # 创建使用 SAG 求解器的逻辑回归
141 | clf = LogisticRegression(random_state=0, solver='sag')
142 |
143 | # 训练模型
144 | model = clf.fit(X_std, y)
145 | ```
146 |
147 | ## 带有 L1 正则化的逻辑回归
148 |
149 | L1 正则化(也称为最小绝对误差)是数据科学中的强大工具。 有许多教程解释 L1 正则化,我不会在这里尝试这样做。 相反,本教程将展示正则化参数`C`对系数和模型精度的影响。
150 |
151 | ```py
152 | import numpy as np
153 | from sklearn.linear_model import LogisticRegression
154 | from sklearn import datasets
155 | from sklearn.model_selection import train_test_split
156 | from sklearn.preprocessing import StandardScaler
157 | ```
158 |
159 | 本教程中使用的数据集是著名的[鸢尾花数据集](https://en.wikipedia.org/wiki/Iris_flower_data_set)。鸢尾花数据包含三种鸢尾花(目标`y`)各 50 个样本,以及四个特征变量(`X`)。
160 |
161 | 数据集包含三个类别(三种鸢尾),但是为了简单起见,如果目标数据是二元的,则更容易。因此,我们将从数据中删除最后一种鸢尾。
162 |
163 | ```py
164 | # 加载鸢尾花数据集
165 | iris = datasets.load_iris()
166 |
167 | # 从特征中创建 X
168 | X = iris.data
169 |
170 | # 从目标中创建 y
171 | y = iris.target
172 |
173 | # 重新生成变量,保留所有标签不是 2 的数据
174 | X = X[y != 2]
175 | y = y[y != 2]
176 |
177 | # 查看特征
178 | X[0:5]
179 |
180 | '''
181 | array([[ 5.1, 3.5, 1.4, 0.2],
182 | [ 4.9, 3. , 1.4, 0.2],
183 | [ 4.7, 3.2, 1.3, 0.2],
184 | [ 4.6, 3.1, 1.5, 0.2],
185 | [ 5. , 3.6, 1.4, 0.2]])
186 | '''
187 |
188 | # 查看目标数据
189 | y
190 |
191 | '''
192 | array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
193 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
194 | 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
195 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
196 | 1, 1, 1, 1, 1, 1, 1, 1])
197 | '''
198 |
199 | # 将数据拆分为测试和训练集
200 | # 将 30% 的样本放入测试集
201 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
202 | ```
203 |
204 | 因为正则化惩罚由系数的绝对值之和组成,所以我们需要缩放数据,使系数都基于相同的比例。
205 |
206 | ```py
207 | # 创建缩放器对象
208 | sc = StandardScaler()
209 |
210 | # 将缩放器拟合训练数据,并转换
211 | X_train_std = sc.fit_transform(X_train)
212 |
213 | # 将缩放器应用于测试数据
214 | X_test_std = sc.transform(X_test)
215 | ```
216 |
217 | L1 的用处在于它可以将特征系数逼近 0,从而创建一种特征选择方法。 在下面的代码中,我们运行带有 L1 惩罚的逻辑回归四次,每次都减少了`C`的值。 我们应该期望随着`C`的减少,更多的系数变为 0。
218 |
219 | ```py
220 | C = [10, 1, .1, .001]
221 |
222 | for c in C:
223 | clf = LogisticRegression(penalty='l1', C=c)
224 | clf.fit(X_train, y_train)
225 | print('C:', c)
226 | print('Coefficient of each feature:', clf.coef_)
227 | print('Training accuracy:', clf.score(X_train, y_train))
228 | print('Test accuracy:', clf.score(X_test, y_test))
229 | print('')
230 |
231 | '''
232 | C: 10
233 | Coefficient of each feature: [[-0.0855264 -3.75409972 4.40427765 0. ]]
234 | Training accuracy: 1.0
235 | Test accuracy: 1.0
236 |
237 | C: 1
238 | Coefficient of each feature: [[ 0. -2.28800472 2.5766469 0. ]]
239 | Training accuracy: 1.0
240 | Test accuracy: 1.0
241 |
242 | C: 0.1
243 | Coefficient of each feature: [[ 0. -0.82310456 0.97171847 0. ]]
244 | Training accuracy: 1.0
245 | Test accuracy: 1.0
246 |
247 | C: 0.001
248 | Coefficient of each feature: [[ 0. 0. 0. 0.]]
249 | Training accuracy: 0.5
250 | Test accuracy: 0.5
251 | '''
252 | ```
253 |
254 | 注意,当`C`减小时,模型系数变小(例如,第三个特征的系数从`C = 10`时的`4.40427765`变为`C = 0.1`时的`0.97171847`),直到`C = 0.001`时,所有系数都为零。 这正是正则化惩罚的效果变得越来越显著的表现。
255 |
256 | ## OVR 逻辑回归
257 |
258 | 逻辑回归本身只是二分类器,这意味着它们无法处理具有两个类别以上的目标向量。 但是,逻辑回归有一些聪明的扩展来实现它。 在 One-VS-Rest(OVR)逻辑回归中,针对每个类别训练单独的模型,预测观测是否是该类(因此使其成为二分类问题)。 它假定每个分类问题(例如是不是类 0)是独立的。
259 |
260 | ```py
261 | # 加载库
262 | from sklearn.linear_model import LogisticRegression
263 | from sklearn import datasets
264 | from sklearn.preprocessing import StandardScaler
265 |
266 | # 加载数据
267 | iris = datasets.load_iris()
268 | X = iris.data
269 | y = iris.target
270 |
271 | # 标准化特征
272 | scaler = StandardScaler()
273 | X_std = scaler.fit_transform(X)
274 |
275 | # 创建 OVR 逻辑回归对象
276 | clf = LogisticRegression(random_state=0, multi_class='ovr')
277 |
278 | # 训练模型
279 | model = clf.fit(X_std, y)
280 |
281 | # 创建新的观测
282 | new_observation = [[.5, .5, .5, .5]]
283 |
284 | # 预测类别
285 | model.predict(new_observation)
286 |
287 | # array([2])
288 |
289 | # 查看预测概率
290 | model.predict_proba(new_observation)
291 |
292 | # array([[ 0.0829087 , 0.29697265, 0.62011865]])
293 | ```
--------------------------------------------------------------------------------
/21.md:
--------------------------------------------------------------------------------
1 | # 二十一、统计学
2 |
3 | > 作者:[Chris Albon](https://chrisalbon.com/)
4 | >
5 | > 译者:[飞龙](https://github.com/wizardforcel)
6 | >
7 | > 协议:[CC BY-NC-SA 4.0](http://creativecommons.org/licenses/by-nc-sa/4.0/)
8 |
9 | ## 贝塞尔校正
10 |
11 | 贝塞尔的校正是我们在样本方差和样本标准差的计算中使用 $n-1$ 而不是 $n$ 的原因。
12 |
13 | 样本方差:
14 |
15 | $$s^2 = \frac{\sum_{i=1}^{n}(x_i - \bar{x})^2}{n-1}$$
16 |
17 | 当我们计算样本方差时,我们试图估计总体方差,这是一个未知值。 为了进行这种估计,我们用样本观测与样本均值的平方差的平均值,来估计未知的总体方差。 这种估计技术的负面影响是,因为我们正在采样,我们更有可能观察到偏差较小的观测,因为它们更常见(它们位于分布的中心附近)。 因此按照定义,我们将低估总体方差。
18 |
19 | 弗里德里希·贝塞尔发现,通过将有偏差(未校正)的样本方差 $s_n^2 = \frac{1}{n}\sum_{i=1}^{n}(x_i - \bar{x})^2$ 乘以 $\frac{n}{n-1}$,我们将能够减少这种偏差,从而能够准确估计总体方差和标准差。 乘法的最终结果是无偏样本方差。
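
在 NumPy 中,可以通过`ddof`参数直接看到这种校正:`ddof=0`给出除以 $n$ 的有偏方差,`ddof=1`给出除以 $n-1$ 的贝塞尔校正后的方差。下面是一个小示例(数据是随意选的,仅作示意):

```py
# 加载库
import numpy as np

# 创建一个小样本
sample = np.array([2, 4, 4, 4, 5, 5, 7, 9])

# 有偏的样本方差(除以 n)
np.var(sample, ddof=0)

# 4.0

# 贝塞尔校正后的样本方差(除以 n-1)
np.var(sample, ddof=1)

# 4.571428571428571
```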
20 |
21 | ## 演示中心极限定理
22 |
23 | ```py
24 | # 导入包
25 | import pandas as pd
26 | import numpy as np
27 |
28 | # 将 matplotlib 设为内联
29 | %matplotlib inline
30 |
31 | # 创建空的数据帧
32 | population = pd.DataFrame()
33 |
34 | # 创建一列,它是来自均匀分布的 10000 个随机数
35 | population['numbers'] = np.random.uniform(0,10000,size=10000)
36 |
37 | # 绘制得分数据的直方图
38 | # 这确认了数据不是正态分布的
39 | population['numbers'].hist(bins=100)
40 |
41 | #
42 | ```
43 |
44 | 
45 |
46 | ```py
47 | # 查看数值的均值
48 | population['numbers'].mean()
49 |
50 | # 4983.824612472138
51 |
52 | # 创建列表
53 | sampled_means = []
54 |
55 | # 执行 1000 次
56 | for i in range(0,1000):
57 | # 从总体中随机抽取 100 行
58 | # 计算它们的均值,附加到 sampled_means
59 | sampled_means.append(population.sample(n=100).mean().values[0])
60 |
61 | # 绘制 sampled_means 的直方图
62 | # 它很明显是正态分布的,中心约为 5000
63 | pd.Series(sampled_means).hist(bins=100)
64 |
65 | #
66 | ```
67 |
68 | 
69 |
70 | 这是关键的图表:记住总体分布是均匀的,然而样本均值的分布却接近正态。 这是中心极限定理的关键点,也是我们可以假设样本均值是无偏的原因。
71 |
72 | ```py
73 | # 查看 sampled_means 的均值
74 | pd.Series(sampled_means).mean()
75 |
76 | # 4981.465310909289
77 |
78 | # 将样本均值的均值减去真实的总体均值
79 | error = population['numbers'].mean() - pd.Series(sampled_means).mean()
80 |
81 | # 打印
82 | print('The Mean Sample Mean is only %f different the True Population mean!' % error)
83 |
84 | # The Mean Sample Mean is only 2.359302 different the True Population mean!
85 | ```
86 |
87 | ## 皮尔逊相关系数
88 |
89 | 基于 [cbare](http://stackoverflow.com/users/199166/cbare) 的[这个](http://stackoverflow.com/a/17389980/2935984) StackOverflow 答案。
90 |
91 | ```py
92 | import statistics as stats
93 |
94 | x = [1,2,3,4,5,6,7,8,9]
95 | y = [2,1,2,4.5,7,6.5,6,9,9.5]
96 | ```
97 | 有许多等价的表达方式来计算皮尔逊相关系数(也称为皮尔逊的 r)。下面是其中一种。
98 |
99 | $$r_{xy} = \frac{1}{n-1} \sum_{i=1}^{n} \left( \frac{x_i - \bar{x}}{s_x} \right) \left( \frac{y_i - \bar{y}}{s_y} \right)$$
100 |
101 | 其中 $s_x$ 和 $s_y$ 是 $x$ 和 $y$ 的标准差,$\frac{x_i - \bar{x}}{s_x}$ 和 $\frac{y_i - \bar{y}}{s_y}$ 是 $x$ 和 $y$ 的[标准得分](https://en.wikipedia.org/wiki/Standard_score)。
102 |
103 | ```py
104 | # 创建函数
105 | def pearson(x,y):
106 |
107 | # 创建 n,数据中的观测数量
108 | n = len(x)
109 |
110 | # 创建列表来储存标准得分
111 | standard_score_x = []
112 | standard_score_y = []
113 |
114 | # 计算 x 的均值
115 | mean_x = stats.mean(x)
116 |
117 | # 计算 x 的标准差
118 | standard_deviation_x = stats.stdev(x)
119 |
120 | # 计算 y 的均值
121 | mean_y = stats.mean(y)
122 |
123 | # 计算 y 的标准差
124 | standard_deviation_y = stats.stdev(y)
125 |
126 | # 对于 x 中的每个观测
127 | for observation in x:
128 |
129 | # 计算 x 的标准得分
130 | standard_score_x.append((observation - mean_x)/standard_deviation_x)
131 |
132 | # 对于 y 中的每个观测
133 | for observation in y:
134 |
135 | # 计算 y 的标准得分
136 | standard_score_y.append((observation - mean_y)/standard_deviation_y)
137 |
138 | # 将标准得分加在一起,求和,然后除以 n-1,返回该值
139 | return (sum([i*j for i,j in zip(standard_score_x, standard_score_y)]))/(n-1)
140 |
141 | # 展示皮尔逊相关系数
142 | pearson(x,y)
143 |
144 | # 0.9412443251336238
145 | ```
146 |
147 | ## 概率质量函数(PMF)
148 |
149 | ```py
150 | # 加载库
151 | import matplotlib.pyplot as plt
152 |
153 | # 创建一些随机整数
154 | data = [3,2,3,4,2,3,5,2,2,3,3,5,2,2,5,6,2,2,2,3,6,6,2,4,3,2,3]
155 |
156 | # 创建字典来储存计数
157 | count = {}
158 |
159 | # 对于数据中的每个值
160 | for observation in data:
161 | # 键为观测,值递增
162 | count[observation] = count.get(observation, 0) + 1
163 |
164 | # 计算观测数量 observations
165 | n = len(data)
166 |
167 | # 创建字典
168 | probability_mass_function = {}
169 |
170 | # 对于每个唯一值
171 | for unique_value, count in count.items():
172 | # 将计数除以数据量来归一化,添加到 PMF 字典
173 | probability_mass_function[unique_value] = count / n
174 |
175 | # 绘制概率质量函数
176 | plt.bar(list(probability_mass_function.keys()), probability_mass_function.values(), color='g')
177 | plt.show()
178 | ```
179 |
180 | 
181 |
182 | ## Spearman 排名相关度
183 |
184 | ```py
185 | import numpy as np
186 | import pandas as pd
187 | import scipy.stats
188 |
189 | # 创建两列随机变量
190 | x = [1,2,3,4,5,6,7,8,9]
191 | y = [2,1,2,4.5,7,6.5,6,9,9.5]
192 | ```
193 |
194 | Spearman 的排名相关度,是变量的排名版本的皮尔逊相关系数。
195 |
196 | ```py
197 | # 创建接受 x 和 y 的函数
198 | def spearmans_rank_correlation(xs, ys):
199 |
200 | # 计算 x 的排名
201 | #(也就是排序后元素的位置)
202 | xranks = pd.Series(xs).rank()
203 |
204 | # 计算 y 的排名
205 | yranks = pd.Series(ys).rank()
206 |
207 | # 在数据的排名版本上,计算皮尔逊相关系数
208 | return scipy.stats.pearsonr(xranks, yranks)
209 |
210 | # 运行函数
211 | spearmans_rank_correlation(x, y)[0]
212 |
213 | # 0.90377360145618091
214 |
215 | # 仅仅检查我们的结果,使用 Scipy 的 Spearman
216 | scipy.stats.spearmanr(x, y)[0]
217 |
218 | # 0.90377360145618102
219 | ```
220 |
221 | ## T 检验
222 |
223 | ```py
224 | from scipy import stats
225 | import numpy as np
226 |
227 | # 创建 20 个观测的列表,从均值为 1,
228 | # 标准差为 1.5 的正态分布中随机抽取
229 | x = np.random.normal(1, 1.5, 20)
230 |
231 | # 创建 20 个观测的列表,从均值为 0,
232 | # 标准差为 1.5 的正态分布中随机抽取
233 | y = np.random.normal(0, 1.5, 20)
234 | ```
235 |
236 | ### 单样本双边 T 检验
237 |
238 | 想象一下单样本 T 检验,并绘制一个“正态形状的”山丘,以`1`为中心,并以`1.5`为标准差而“展开”,然后在`0`处放置一个标志并查看标志在山丘上的位置。它靠近顶部吗? 或者远离山丘? 如果标志靠近山丘的底部或更远,则 t 检验的 p 值将低于`0.05`。
239 |
240 | ```py
241 | # 运行 T 检验来检验 x 的均值和 0 相比,是否有统计学显著的差异
242 | pvalue = stats.ttest_1samp(x, 0)[1]
243 |
244 | # 查看 p 值
245 | pvalue
246 |
247 | # 0.00010976647757800537
248 | ```
249 |
250 | ### 双样本非配对等方差双边 T 检验
251 |
252 | 想象一下单样本 T 检验,并根据标准差绘制两个(正态形状的)山丘,以它们的均值为中心,并根据他们的标准差绘制它们的“平坦度”(个体延展度)。 T 检验考察了两座山丘重叠的程度。 它们基本上是彼此覆盖的吗? 山丘的底部几乎没有碰到吗? 如果山丘的尾部刚刚重叠或根本不重叠,则 t 检验的 p 值将低于 0.05。
253 |
254 | ```py
255 | stats.ttest_ind(x, y)[1]
256 |
257 | # 0.00035082056802728071
258 |
259 | stats.ttest_ind(x, y, equal_var=False)[1]
260 |
261 | # 0.00035089238660076095
262 | ```
263 |
264 | ### 双样本配对双边 T 检验
265 |
266 | 当我们采集重复样本,并且想要考虑我们正在测试的两个分布是成对的这一事实时,使用配对 T 检验。
267 |
268 | ```py
269 | stats.ttest_rel(x, y)[1]
270 |
271 | # 0.00034222792790150386
272 | ```
273 |
274 | ## 方差和标准差
275 |
276 | ```py
277 | # 导入包
278 | import math
279 |
280 | # 创建值的列表
281 | data = [3,2,3,4,2,3,5,2,2,33,3,5,2,2,5,6,62,2,2,3,6,6,2,23,3,2,3]
282 | ```
283 |
284 | 方差是衡量数据分布延展度的指标。 方差越大,数据点越“分散”。 方差,通常表示为 $\sigma^2$,计算方式如下:
285 |
286 | $$\sigma^2 = \frac{\sum_{i=1}^{n}(x_i - \bar{x})^2}{n}$$
287 |
288 | $$s^2 = \frac{\sum_{i=1}^{n}(x_i - \bar{x})^2}{n-1}$$
289 |
290 | 其中 $n$ 是观测数,$\bar{x}$ 是观测值的均值,$x_i - \bar{x}$ 是单个观测值减去数据均值。 请注意,如果我们根据来自该总体的样本估计总体的方差,我们应该使用第二个等式,将 $n$ 替换为 $n-1$。
291 |
292 | ```py
293 | # 计算 n
294 | n = len(data)
295 |
296 | # 计算均值
297 | mean = sum(data)/len(data)
298 |
299 | # 从均值创建所有观测的差
300 | all_deviations_from_mean_squared = []
301 |
302 | # 对于数据中的每个观测
303 | for observation in data:
304 |
305 | # 计算到均值的差
306 | deviation_from_mean = (observation - mean)
307 |
308 | # 计算平方
309 | deviation_from_mean_squared = deviation_from_mean**2
310 |
311 | # 将结果添加到列表
312 | all_deviations_from_mean_squared.append(deviation_from_mean_squared)
313 |
314 | # 对于列表中所有平方差求和
315 | sum_of_deviations_from_mean_squared = sum(all_deviations_from_mean_squared)
316 |
317 | # 除以 n
318 | population_variance = sum_of_deviations_from_mean_squared/n
319 |
320 | # 展示方差
321 | population_variance
322 |
323 | # 160.78463648834017
324 | ```
325 |
326 | 标准差就是方差的平方根。
327 |
328 | ```py
329 | # 计算总体方差的平方根
330 | population_standard_deviation = math.sqrt(population_variance)
331 |
332 | # 打印总体标准差
333 | population_standard_deviation
334 |
335 | # 12.68008818929664
336 | ```
--------------------------------------------------------------------------------
/10.md:
--------------------------------------------------------------------------------
1 | # 十、模型选择
2 |
3 | > 作者:[Chris Albon](https://chrisalbon.com/)
4 | >
5 | > 译者:[飞龙](https://github.com/wizardforcel)
6 | >
7 | > 协议:[CC BY-NC-SA 4.0](http://creativecommons.org/licenses/by-nc-sa/4.0/)
8 |
9 | ## 在模型选择期间寻找最佳预处理步骤
10 |
11 | 在进行模型选择时,我们必须小心正确处理预处理。 首先,`GridSearchCV`使用交叉验证来确定哪个模型表现最好。 然而,在交叉验证中,我们把留作测试集的那一折当作不可见的数据,因此它不应参与任何预处理步骤(例如缩放或标准化)的拟合。 出于这个原因,我们不能先对数据预处理,再运行`GridSearchCV`。
12 |
13 | 其次,一些预处理方法有自己的参数,通常必须由用户提供。 通过在搜索空间中包括候选成分值,可以像对待任何想要搜索其他超参数一样对待它们。
14 |
15 | ```py
16 | # 加载库
17 | import numpy as np
18 | from sklearn import datasets
19 | from sklearn.feature_selection import SelectKBest
20 | from sklearn.linear_model import LogisticRegression
21 | from sklearn.model_selection import GridSearchCV
22 | from sklearn.pipeline import Pipeline, FeatureUnion
23 | from sklearn.decomposition import PCA
24 | from sklearn.preprocessing import StandardScaler
25 |
26 | # 设置随机种子
27 | np.random.seed(0)
28 |
29 | # 加载数据
30 | iris = datasets.load_iris()
31 | X = iris.data
32 | y = iris.target
33 | ```
34 |
35 | 我们包括两个不同的预处理步骤:主成分分析和 k 最佳特征选择。
36 |
37 | ```py
38 | # 创建组合预处理对象
39 | preprocess = FeatureUnion([('pca', PCA()), ("kbest", SelectKBest(k=1))])
40 |
41 | # 创建流水线
42 | pipe = Pipeline([('preprocess', preprocess), ('classifier', LogisticRegression())])
43 |
44 | # 创建候选值空间
45 | search_space = [{'preprocess__pca__n_components': [1, 2, 3],
46 | 'classifier__penalty': ['l1', 'l2'],
47 | 'classifier__C': np.logspace(0, 4, 10)}]
48 |
49 | # 创建网格搜索
50 | clf = GridSearchCV(pipe, search_space, cv=5, verbose=0, n_jobs=-1)
51 |
52 | # 拟合网格搜索
53 | best_model = clf.fit(X, y)
54 |
55 | # 查看最佳超参数
56 | print('Best Number Of Princpal Components:', best_model.best_estimator_.get_params()['preprocess__pca__n_components'])
57 | print('Best Penalty:', best_model.best_estimator_.get_params()['classifier__penalty'])
58 | print('Best C:', best_model.best_estimator_.get_params()['classifier__C'])
59 |
60 | '''
61 | Best Number Of Princpal Components: 3
62 | Best Penalty: l1
63 | Best C: 59.9484250319
64 | '''
65 | ```
66 |
67 | ## 使用网格搜索的超参数调优
68 |
69 | 
70 |
71 | ```py
72 | # 加载库
73 | import numpy as np
74 | from sklearn import linear_model, datasets
75 | from sklearn.model_selection import GridSearchCV
76 |
77 | # 加载数据
78 | iris = datasets.load_iris()
79 | X = iris.data
80 | y = iris.target
81 |
82 | # 创建逻辑回归
83 | logistic = linear_model.LogisticRegression()
84 |
85 | # 创建正则化惩罚空间
86 | penalty = ['l1', 'l2']
87 |
88 | # 创建正则化超参数空间
89 | C = np.logspace(0, 4, 10)
90 |
91 | # 创建超参数选项
92 | hyperparameters = dict(C=C, penalty=penalty)
93 |
94 | # 使用 5 折交叉验证创建网格搜索
95 | clf = GridSearchCV(logistic, hyperparameters, cv=5, verbose=0)
96 |
97 | # 拟合网格搜索
98 | best_model = clf.fit(X, y)
99 |
100 | # 查看最佳超参数
101 | print('Best Penalty:', best_model.best_estimator_.get_params()['penalty'])
102 | print('Best C:', best_model.best_estimator_.get_params()['C'])
103 | '''
104 | Best Penalty: l1
105 | Best C: 7.74263682681
106 | '''
107 |
108 | # 预测目标向量
109 | best_model.predict(X)
110 | '''
111 | array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
112 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
113 | 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
114 | 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1,
115 | 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
116 | 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2,
117 | 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
118 | '''
119 | ```
120 |
121 | ## 使用随机搜索的超参数调优
122 |
123 |
124 | ```py
125 | # 加载库
126 | from scipy.stats import uniform
127 | from sklearn import linear_model, datasets
128 | from sklearn.model_selection import RandomizedSearchCV
129 |
130 | # 加载数据
131 | iris = datasets.load_iris()
132 | X = iris.data
133 | y = iris.target
134 |
135 | # 创建逻辑回归
136 | logistic = linear_model.LogisticRegression()
137 |
138 | # 创建正则化惩罚空间
139 | penalty = ['l1', 'l2']
140 |
141 | # 使用均匀分布创建正则化超参数分布
142 | C = uniform(loc=0, scale=4)
143 |
144 | # 创建超参数选项
145 | hyperparameters = dict(C=C, penalty=penalty)
146 |
147 | # 使用 5 折交叉验证和 100 个迭代
148 | clf = RandomizedSearchCV(logistic, hyperparameters, random_state=1, n_iter=100, cv=5, verbose=0, n_jobs=-1)
149 |
150 | # 拟合随机搜索
151 | best_model = clf.fit(X, y)
152 |
153 | # 查看最佳超参数
154 | print('Best Penalty:', best_model.best_estimator_.get_params()['penalty'])
155 | print('Best C:', best_model.best_estimator_.get_params()['C'])
156 | '''
157 | Best Penalty: l1
158 | Best C: 1.66808801881
159 | '''
160 |
161 | # 预测目标向量
162 | best_model.predict(X)
163 | '''
164 | array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
165 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
166 | 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
167 | 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1,
168 | 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
169 | 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2,
170 | 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
171 | '''
172 | ```
173 |
174 | ## 使用网格搜索的模型选择
175 |
176 | 
177 |
178 |
179 | ```py
180 | # 加载库
181 | import numpy as np
182 | from sklearn import datasets
183 | from sklearn.linear_model import LogisticRegression
184 | from sklearn.ensemble import RandomForestClassifier
185 | from sklearn.model_selection import GridSearchCV
186 | from sklearn.pipeline import Pipeline
187 |
188 | # 设置随机种子
189 | np.random.seed(0)
190 |
191 | # 加载数据
192 | iris = datasets.load_iris()
193 | X = iris.data
194 | y = iris.target
195 | ```
196 |
197 | 请注意,我们包括需要搜索的多个可能的学习算法和多个可能的超参数值。
198 |
199 | ```py
200 | # 创建流水线
201 | pipe = Pipeline([('classifier', RandomForestClassifier())])
202 |
203 | # 创建候选学习算法和它们的超参数的空间
204 | search_space = [{'classifier': [LogisticRegression()],
205 | 'classifier__penalty': ['l1', 'l2'],
206 | 'classifier__C': np.logspace(0, 4, 10)},
207 | {'classifier': [RandomForestClassifier()],
208 | 'classifier__n_estimators': [10, 100, 1000],
209 | 'classifier__max_features': [1, 2, 3]}]
210 |
211 | # 创建网格搜索
212 | clf = GridSearchCV(pipe, search_space, cv=5, verbose=0)
213 |
214 | # 拟合网格搜索
215 | best_model = clf.fit(X, y)
216 |
217 | # 查看最佳模型
218 | best_model.best_estimator_.get_params()['classifier']
219 | '''
220 | LogisticRegression(C=7.7426368268112693, class_weight=None, dual=False,
221 | fit_intercept=True, intercept_scaling=1, max_iter=100,
222 | multi_class='ovr', n_jobs=1, penalty='l1', random_state=None,
223 | solver='liblinear', tol=0.0001, verbose=0, warm_start=False)
224 | '''
225 |
226 | # 预测目标向量
227 | best_model.predict(X)
228 | '''
229 | array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
230 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
231 | 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
232 | 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1,
233 | 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
234 | 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2,
235 | 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
236 | '''
237 | ```
238 |
239 | ## 带有参数选项的流水线
240 |
241 |
242 | ```py
243 | # 导入所需的包
244 | import numpy as np
245 | from sklearn import linear_model, decomposition, datasets
246 | from sklearn.pipeline import Pipeline
247 | from sklearn.model_selection import GridSearchCV, cross_val_score
248 | from sklearn.preprocessing import StandardScaler
249 |
250 | # 加载乳腺癌数据集
251 | dataset = datasets.load_breast_cancer()
252 |
253 | # 从数据集特征中创建 X
254 | X = dataset.data
255 |
256 | # 从数据集目标中创建 y
257 | y = dataset.target
258 |
259 | # 创建缩放器对象
260 | sc = StandardScaler()
261 |
262 | # 创建 PCA 对象
263 | pca = decomposition.PCA()
264 |
265 | # 创建逻辑回归对象,带有 L2 惩罚
266 | logistic = linear_model.LogisticRegression()
267 |
268 | # 创建三步流水线。首先,标准化数据。
269 | # 其次,使用 PCA 转换数据。
270 | # 然后在数据上训练逻辑回归。
271 | pipe = Pipeline(steps=[('sc', sc),
272 | ('pca', pca),
273 | ('logistic', logistic)])
274 |
275 | # 创建 1 到 30(X 的特征数)的整数列表
276 | n_components = list(range(1,X.shape[1]+1,1))
277 |
278 | # 创建正则化参数的一列值
279 | C = np.logspace(-4, 4, 50)
280 |
281 | # 为正则化乘法创建一列选项
282 | penalty = ['l1', 'l2']
283 |
284 | # 为所有参数选项创建字典
285 | # 注意,你可以使用 '__' 来访问流水线的步骤的参数
286 | parameters = dict(pca__n_components=n_components,
287 | logistic__C=C,
288 | logistic__penalty=penalty)
289 |
290 | # 创建网格搜索对象
291 | clf = GridSearchCV(pipe, parameters)
292 |
293 | # 拟合网格搜索
294 | clf.fit(X, y)
295 |
296 | # 查看超参数
297 | print('Best Penalty:', clf.best_estimator_.get_params()['logistic__penalty'])
298 | print('Best C:', clf.best_estimator_.get_params()['logistic__C'])
299 | print('Best Number Of Components:', clf.best_estimator_.get_params()['pca__n_components'])
300 |
301 | # 使用 3 折交叉验证拟合网格搜索
302 | cross_val_score(clf, X, y)
303 | ```
--------------------------------------------------------------------------------
/6.md:
--------------------------------------------------------------------------------
1 | # 六、日期时间预处理
2 |
3 | > 作者:[Chris Albon](https://chrisalbon.com/)
4 | >
5 | > 译者:[飞龙](https://github.com/wizardforcel)
6 | >
7 | > 协议:[CC BY-NC-SA 4.0](http://creativecommons.org/licenses/by-nc-sa/4.0/)
8 |
9 | ## 把日期和时间拆成多个特征
10 |
11 | ```py
12 | # 加载库
13 | import pandas as pd
14 |
15 | # 创建数据帧
16 | df = pd.DataFrame()
17 |
18 | # 创建 150 个每周的日期
19 | df['date'] = pd.date_range('1/1/2001', periods=150, freq='W')
20 |
21 | # 为年、月、日、时、分创建特征
22 | df['year'] = df['date'].dt.year
23 | df['month'] = df['date'].dt.month
24 | df['day'] = df['date'].dt.day
25 | df['hour'] = df['date'].dt.hour
26 | df['minute'] = df['date'].dt.minute
27 |
28 | # 展示三行
29 | df.head(3)
30 | ```
31 |
32 | | | date | year | month | day | hour | minute |
33 | | --- | --- | --- | --- | --- | --- | --- |
34 | | 0 | 2001-01-07 | 2001 | 1 | 7 | 0 | 0 |
35 | | 1 | 2001-01-14 | 2001 | 1 | 14 | 0 | 0 |
36 | | 2 | 2001-01-21 | 2001 | 1 | 21 | 0 | 0 |
37 |
38 | ## 计算日期时间之间的差
39 |
40 | ```py
41 | # 加载库
42 | import pandas as pd
43 |
44 | # 创建数据帧
45 | df = pd.DataFrame()
46 |
47 | # 创建两个 datetime 特征
48 | df['Arrived'] = [pd.Timestamp('01-01-2017'), pd.Timestamp('01-04-2017')]
49 | df['Left'] = [pd.Timestamp('01-01-2017'), pd.Timestamp('01-06-2017')]
50 |
51 | # 计算特征之间的间隔
52 | df['Left'] - df['Arrived']
53 |
54 | '''
55 | 0 0 days
56 | 1 2 days
57 | dtype: timedelta64[ns]
58 | '''
59 |
60 | # 计算特征之间的间隔
61 | pd.Series(delta.days for delta in (df['Left'] - df['Arrived']))
62 |
63 | '''
64 | 0 0
65 | 1 2
66 | dtype: int64
67 | '''
68 | ```
69 |
70 | ## 将字符串转换为日期
71 |
72 | ```py
73 | # 加载库
74 | import numpy as np
75 | import pandas as pd
76 |
77 | # 创建字符串
78 | date_strings = np.array(['03-04-2005 11:35 PM',
79 | '23-05-2010 12:01 AM',
80 | '04-09-2009 09:09 PM'])
81 | ```
82 |
83 | 如果设置`errors="coerce"`,那么出现问题时不会抛出错误(抛出错误才是默认行为),而是将导致错误的值设置为`NaT`(即缺失值)。
84 |
85 | | 代码 | 描述 | 示例 |
86 | | --- | --- | --- |
87 | | ` %Y ` | 整年 | `2001` |
88 | | ` %m ` | 零填充的月份 | `04` |
89 | | ` %d ` | 零填充的日期 | `09` |
90 | | ` %I ` | 零填充的小时(12 小时) | `02` |
91 | | ` %p ` | AM 或 PM | `AM` |
92 | | ` %M ` | 零填充的分钟 | `05` |
93 | | ` %S ` | 零填充的秒钟 | `09` |
94 |
95 | ```py
96 | # 转换为 datetime
97 | [pd.to_datetime(date, format="%d-%m-%Y %I:%M %p", errors="coerce") for date in date_strings]
98 |
99 | '''
100 | [Timestamp('2005-04-03 23:35:00'),
101 | Timestamp('2010-05-23 00:01:00'),
102 | Timestamp('2009-09-04 21:09:00')]
103 | '''
104 | ```
105 |
106 | ## 转换 pandas 列的时区
107 |
108 | ```py
109 | # 加载库
110 | import pandas as pd
111 | from pytz import all_timezones
112 |
113 | # 展示十个时区
114 | all_timezones[0:10]
115 |
116 | '''
117 | ['Africa/Abidjan',
118 | 'Africa/Accra',
119 | 'Africa/Addis_Ababa',
120 | 'Africa/Algiers',
121 | 'Africa/Asmara',
122 | 'Africa/Asmera',
123 | 'Africa/Bamako',
124 | 'Africa/Bangui',
125 | 'Africa/Banjul',
126 | 'Africa/Bissau']
127 | '''
128 |
129 | # 创建十个日期
130 | dates = pd.Series(pd.date_range('2/2/2002', periods=10, freq='M'))
131 |
132 | # 设置时区
133 | dates_with_abidjan_time_zone = dates.dt.tz_localize('Africa/Abidjan')
134 |
135 | # 查看 pandas 序列
136 | dates_with_abidjan_time_zone
137 |
138 | '''
139 | 0 2002-02-28 00:00:00+00:00
140 | 1 2002-03-31 00:00:00+00:00
141 | 2 2002-04-30 00:00:00+00:00
142 | 3 2002-05-31 00:00:00+00:00
143 | 4 2002-06-30 00:00:00+00:00
144 | 5 2002-07-31 00:00:00+00:00
145 | 6 2002-08-31 00:00:00+00:00
146 | 7 2002-09-30 00:00:00+00:00
147 | 8 2002-10-31 00:00:00+00:00
148 | 9 2002-11-30 00:00:00+00:00
149 | dtype: datetime64[ns, Africa/Abidjan]
150 | '''
151 |
152 | # 转换时区
153 | dates_with_london_time_zone = dates_with_abidjan_time_zone.dt.tz_convert('Europe/London')
154 |
155 | # 查看 pandas 序列
156 | dates_with_london_time_zone
157 |
158 | '''
159 | 0 2002-02-28 00:00:00+00:00
160 | 1 2002-03-31 00:00:00+00:00
161 | 2 2002-04-30 01:00:00+01:00
162 | 3 2002-05-31 01:00:00+01:00
163 | 4 2002-06-30 01:00:00+01:00
164 | 5 2002-07-31 01:00:00+01:00
165 | 6 2002-08-31 01:00:00+01:00
166 | 7 2002-09-30 01:00:00+01:00
167 | 8 2002-10-31 00:00:00+00:00
168 | 9 2002-11-30 00:00:00+00:00
169 | dtype: datetime64[ns, Europe/London]
170 | '''
171 | ```
172 |
173 | ## 编码星期
174 |
175 | ```py
176 | # 加载库
177 | import pandas as pd
178 |
179 | # 创建数据集
180 | dates = pd.Series(pd.date_range('2/2/2002', periods=3, freq='M'))
181 |
182 | # 查看数据
183 | dates
184 |
185 | '''
186 | 0 2002-02-28
187 | 1 2002-03-31
188 | 2 2002-04-30
189 | dtype: datetime64[ns]
190 | '''
191 |
192 | # 查看星期
193 | dates.dt.weekday_name
194 |
195 | '''
196 | 0 Thursday
197 | 1 Sunday
198 | 2 Tuesday
199 | dtype: object
200 | '''
201 | ```
202 |
203 | ## 处理时间序列中的缺失值
204 |
205 | ```py
206 | # 加载库
207 | import pandas as pd
208 | import numpy as np
209 |
210 | # 创建日期
211 | time_index = pd.date_range('01/01/2010', periods=5, freq='M')
212 |
213 | # 创建数据帧,设置索引
214 | df = pd.DataFrame(index=time_index)
215 |
216 | # 创建带有一些缺失值的特征
217 | df['Sales'] = [1.0,2.0,np.nan,np.nan,5.0]
218 |
219 | # 对缺失值执行插值
220 | df.interpolate()
221 | ```
222 |
223 | | | Sales |
224 | | --- | --- |
225 | | 2010-01-31 | 1.0 |
226 | | 2010-02-28 | 2.0 |
227 | | 2010-03-31 | 3.0 |
228 | | 2010-04-30 | 4.0 |
229 | | 2010-05-31 | 5.0 |
230 |
231 | ```py
232 | # 前向填充
233 | df.ffill()
234 | ```
235 |
236 | | | Sales |
237 | | --- | --- |
238 | | 2010-01-31 | 1.0 |
239 | | 2010-02-28 | 2.0 |
240 | | 2010-03-31 | 2.0 |
241 | | 2010-04-30 | 2.0 |
242 | | 2010-05-31 | 5.0 |
243 |
244 | ```py
245 | # 后向填充
246 | df.bfill()
247 | ```
248 |
249 | | | Sales |
250 | | --- | --- |
251 | | 2010-01-31 | 1.0 |
252 | | 2010-02-28 | 2.0 |
253 | | 2010-03-31 | 5.0 |
254 | | 2010-04-30 | 5.0 |
255 | | 2010-05-31 | 5.0 |
256 |
257 | ```py
258 | # 对缺失值执行插值
259 | df.interpolate(limit=1, limit_direction='forward')
260 | ```
261 |
262 | | | Sales |
263 | | --- | --- |
264 | | 2010-01-31 | 1.0 |
265 | | 2010-02-28 | 2.0 |
266 | | 2010-03-31 | 3.0 |
267 | | 2010-04-30 | NaN |
268 | | 2010-05-31 | 5.0 |
269 |
270 | ## 处理时区
271 |
272 | ```py
273 | # 加载库
274 | import pandas as pd
275 | from pytz import all_timezones
276 |
277 | # 展示十个时区
278 | all_timezones[0:10]
279 |
280 | '''
281 | ['Africa/Abidjan',
282 | 'Africa/Accra',
283 | 'Africa/Addis_Ababa',
284 | 'Africa/Algiers',
285 | 'Africa/Asmara',
286 | 'Africa/Asmera',
287 | 'Africa/Bamako',
288 | 'Africa/Bangui',
289 | 'Africa/Banjul',
290 | 'Africa/Bissau']
291 | '''
292 |
293 | # 创建 datetime
294 | pd.Timestamp('2017-05-01 06:00:00', tz='Europe/London')
295 |
296 | # Timestamp('2017-05-01 06:00:00+0100', tz='Europe/London')
297 |
298 | # 创建 datetime
299 | date = pd.Timestamp('2017-05-01 06:00:00')
300 |
301 | # 设置时区
302 | date_in_london = date.tz_localize('Europe/London')
303 |
304 | # 修改时区
305 | date_in_london.tz_convert('Africa/Abidjan')
306 |
307 | # Timestamp('2017-05-01 05:00:00+0000', tz='Africa/Abidjan')
308 | ```
309 |
310 | ## 平移时间特征
311 |
312 | ```py
313 | # 加载库
314 | import pandas as pd
315 |
316 | # 创建数据帧
317 | df = pd.DataFrame()
318 |
319 | # 创建数据
320 | df['dates'] = pd.date_range('1/1/2001', periods=5, freq='D')
321 | df['stock_price'] = [1.1,2.2,3.3,4.4,5.5]
322 |
323 | # 将值平移一行
324 | df['previous_days_stock_price'] = df['stock_price'].shift(1)
325 |
326 | # 展示数据帧
327 | df
328 | ```
329 |
330 | | | dates | stock_price | previous_days_stock_price |
331 | | --- | --- | --- | --- |
332 | | 0 | 2001-01-01 | 1.1 | NaN |
333 | | 1 | 2001-01-02 | 2.2 | 1.1 |
334 | | 2 | 2001-01-03 | 3.3 | 2.2 |
335 | | 3 | 2001-01-04 | 4.4 | 3.3 |
336 | | 4 | 2001-01-05 | 5.5 | 4.4 |
337 |
338 | ## 滑动时间窗口
339 |
340 | ```py
341 | # 加载库
342 | import pandas as pd
343 |
344 | # 创建 datetime
345 | time_index = pd.date_range('01/01/2010', periods=5, freq='M')
346 |
347 | # 创建数据帧,设置索引
348 | df = pd.DataFrame(index=time_index)
349 |
350 | # 创建特征
351 | df['Stock_Price'] = [1,2,3,4,5]
352 |
353 | # 计算滑动均值
354 | df.rolling(window=2).mean()
355 | ```
356 |
357 | | | Stock_Price |
358 | | --- | --- |
359 | | 2010-01-31 | NaN |
360 | | 2010-02-28 | 1.5 |
361 | | 2010-03-31 | 2.5 |
362 | | 2010-04-30 | 3.5 |
363 | | 2010-05-31 | 4.5 |
364 |
365 | ```py
366 | # 识别滑动时间窗口中的最大值
367 | df.rolling(window=2).max()
368 | ```
369 |
370 | | | Stock_Price |
371 | | --- | --- |
372 | | 2010-01-31 | NaN |
373 | | 2010-02-28 | 2.0 |
374 | | 2010-03-31 | 3.0 |
375 | | 2010-04-30 | 4.0 |
376 | | 2010-05-31 | 5.0 |
377 |
378 | ## 选择日期时间范围
379 |
380 | ```py
381 | # 加载库
382 | import pandas as pd
383 |
384 | # 创建数据帧
385 | df = pd.DataFrame()
386 |
387 | # 创建 datetime
388 | df['date'] = pd.date_range('1/1/2001', periods=100000, freq='H')
389 | ```
390 |
391 | 如果数据帧未按时间索引,请使用此方法。
392 |
393 | ```py
394 | # 选择两个日期时间之间的观测
395 | df[(df['date'] > '2002-1-1 01:00:00') & (df['date'] <= '2002-1-1 04:00:00')]
396 | ```
397 |
398 | | | date |
399 | | --- | --- |
400 | | 8762 | 2002-01-01 02:00:00 |
401 | | 8763 | 2002-01-01 03:00:00 |
402 | | 8764 | 2002-01-01 04:00:00 |
403 |
404 | 如果数据帧按时间索引,请使用此方法。
405 |
406 | ```py
407 | # 设置索引
408 | df = df.set_index(df['date'])
409 |
410 | # 选择两个日期时间之间的观测
411 | df.loc['2002-1-1 01:00:00':'2002-1-1 04:00:00']
412 | ```
413 |
414 | | | date |
415 | | --- | --- |
416 | | date | |
417 | | 2002-01-01 01:00:00 | 2002-01-01 01:00:00 |
418 | | 2002-01-01 02:00:00 | 2002-01-01 02:00:00 |
419 | | 2002-01-01 03:00:00 | 2002-01-01 03:00:00 |
420 | | 2002-01-01 04:00:00 | 2002-01-01 04:00:00 |
421 |
--------------------------------------------------------------------------------
/7.md:
--------------------------------------------------------------------------------
1 | # 七、特征工程
2 |
3 | > 作者:[Chris Albon](https://chrisalbon.com/)
4 | >
5 | > 译者:[飞龙](https://github.com/wizardforcel)
6 | >
7 | > 协议:[CC BY-NC-SA 4.0](http://creativecommons.org/licenses/by-nc-sa/4.0/)
8 |
9 | ## 稀疏特征矩阵上的降维
10 |
11 | ```py
12 | # 加载库
13 | from sklearn.preprocessing import StandardScaler
14 | from sklearn.decomposition import TruncatedSVD
15 | from scipy.sparse import csr_matrix
16 | from sklearn import datasets
17 | import numpy as np
18 |
19 | # 加载数据
20 | digits = datasets.load_digits()
21 |
22 | # 标准化特征矩阵
23 | X = StandardScaler().fit_transform(digits.data)
24 |
25 | # 生成稀疏矩阵
26 | X_sparse = csr_matrix(X)
27 |
28 | # 创建 TSVD
29 | tsvd = TruncatedSVD(n_components=10)
30 |
31 | # 在稀疏矩阵上使用 TSVD
32 | X_sparse_tsvd = tsvd.fit(X_sparse).transform(X_sparse)
33 |
34 | # 展示结果
35 | print('Original number of features:', X_sparse.shape[1])
36 | print('Reduced number of features:', X_sparse_tsvd.shape[1])
37 |
38 | '''
39 | Original number of features: 64
40 | Reduced number of features: 10
41 | '''
42 |
43 | # 前三个主成分的解释方差比之和
44 | tsvd.explained_variance_ratio_[0:3].sum()
45 |
46 | # 0.30039385372588506
47 | ```
48 |
49 | ## 核 PCA 降维
50 |
51 | 
52 |
53 | ```py
54 | # 加载库
55 | from sklearn.decomposition import PCA, KernelPCA
56 | from sklearn.datasets import make_circles
57 |
58 | # 创建线性不可分的数据
59 | X, _ = make_circles(n_samples=1000, random_state=1, noise=0.1, factor=0.1)
60 |
61 | # 应用带有径向基函数(RBF)核的核 PCA
62 | kpca = KernelPCA(kernel="rbf", gamma=15, n_components=1)
63 | X_kpca = kpca.fit_transform(X)
64 |
65 | print('Original number of features:', X.shape[1])
66 | print('Reduced number of features:', X_kpca.shape[1])
67 |
68 | '''
69 | Original number of features: 2
70 | Reduced number of features: 1
71 | '''
72 | ```
73 |
74 | ## 使用 PCA 的降维
75 |
76 | 
77 |
78 | ```py
79 | # 加载库
80 | from sklearn.preprocessing import StandardScaler
81 | from sklearn.decomposition import PCA
82 | from sklearn import datasets
83 |
84 | # 加载数据
85 | digits = datasets.load_digits()
86 |
87 | # 标准化特征矩阵
88 | X = StandardScaler().fit_transform(digits.data)
89 |
90 | # 创建保留 99% 方差的 PCA
91 | pca = PCA(n_components=0.99, whiten=True)
92 |
93 | # 使用 PCA
94 | X_pca = pca.fit_transform(X)
95 |
96 | # 展示结果
97 | print('Original number of features:', X.shape[1])
98 | print('Reduced number of features:', X_pca.shape[1])
99 |
100 | '''
101 | Original number of features: 64
102 | Reduced number of features: 54
103 | '''
104 | ```
105 |
106 | ## PCA 特征提取
107 |
108 | [主成分分析](https://en.wikipedia.org/wiki/Principal_component_analysis)(PCA)是数据科学中常见的特征提取方法。 从技术上讲,PCA 找到具有最高特征值的协方差矩阵的特征向量,然后使用这些特征向量将数据投影到相等或更小维度的新子空间。 实际上,PCA 将 n 个特征矩阵转换为(可能)小于 n 个特征的新数据集。 也就是说,它通过构造新的较少变量来减少特征的数量,这些变量捕获原始特征中找到的信息的重要部分。 但是,本教程的目的不是要解释 PCA 的概念,这在其他地方做得非常好,而是用于演示 PCA 的实际应用。
109 |
110 | ```py
111 | # 导入库
112 | import numpy as np
113 | from sklearn import decomposition, datasets
114 | from sklearn.preprocessing import StandardScaler
115 |
116 | # 加载乳腺癌数据集
117 | dataset = datasets.load_breast_cancer()
118 |
119 | # 加载特征
120 | X = dataset.data
121 | ```
122 |
123 | 请注意,原始数据包含 569 个观测和 30 个特征。
124 |
125 | ```py
126 | # 查看数据集的形状
127 | X.shape
128 |
129 | # (569, 30)
130 | ```
131 |
132 | 这里是数据的样子
133 |
134 | ```py
135 | # 查看数据
136 | X
137 |
138 | '''
139 | array([[ 1.79900000e+01, 1.03800000e+01, 1.22800000e+02, ...,
140 | 2.65400000e-01, 4.60100000e-01, 1.18900000e-01],
141 | [ 2.05700000e+01, 1.77700000e+01, 1.32900000e+02, ...,
142 | 1.86000000e-01, 2.75000000e-01, 8.90200000e-02],
143 | [ 1.96900000e+01, 2.12500000e+01, 1.30000000e+02, ...,
144 | 2.43000000e-01, 3.61300000e-01, 8.75800000e-02],
145 | ...,
146 | [ 1.66000000e+01, 2.80800000e+01, 1.08300000e+02, ...,
147 | 1.41800000e-01, 2.21800000e-01, 7.82000000e-02],
148 | [ 2.06000000e+01, 2.93300000e+01, 1.40100000e+02, ...,
149 | 2.65000000e-01, 4.08700000e-01, 1.24000000e-01],
150 | [ 7.76000000e+00, 2.45400000e+01, 4.79200000e+01, ...,
151 | 0.00000000e+00, 2.87100000e-01, 7.03900000e-02]])
152 | '''
153 |
154 | # 创建缩放器对象
155 | sc = StandardScaler()
156 |
157 | # 使缩放器拟合特征并转换
158 | X_std = sc.fit_transform(X)
159 | ```
160 |
161 | 请注意,PCA 包含一个参数,即成分数。 这是输出特征的数量,需要进行调整。
162 |
163 | ```py
164 | # 创建 PCA 对象,使用两个成分作为参数
165 | pca = decomposition.PCA(n_components=2)
166 |
167 | # 拟合 PCA 并转换数据
168 | X_std_pca = pca.fit_transform(X_std)
169 | ```
170 |
171 | 在 PCA 之后,新数据已降到了两个特征,其行数与原始数据的观测数相同。
172 |
173 | ```py
174 | # 查看新特征数据的形状
175 | X_std_pca.shape
176 |
177 | # (569, 2)
178 |
179 | # 查看新特征数据
180 | X_std_pca
181 |
182 | '''
183 | array([[ 9.19283683, 1.94858307],
184 | [ 2.3878018 , -3.76817174],
185 | [ 5.73389628, -1.0751738 ],
186 | ...,
187 | [ 1.25617928, -1.90229671],
188 | [ 10.37479406, 1.67201011],
189 | [ -5.4752433 , -0.67063679]])
190 | '''
191 | ```
192 |
193 | ## 使用 KMeans 聚类对观测分组
194 |
195 | ```py
196 | # 加载库
197 | from sklearn.datasets import make_blobs
198 | from sklearn.cluster import KMeans
199 | import pandas as pd
200 |
201 | # 制作模拟特征矩阵
202 | X, _ = make_blobs(n_samples = 50,
203 | n_features = 2,
204 | centers = 3,
205 | random_state = 1)
206 |
207 | # 创建 DataFrame
208 | df = pd.DataFrame(X, columns=['feature_1','feature_2'])
209 |
210 | # 创建 KMeans 聚类器
211 | clusterer = KMeans(3, random_state=1)
212 |
213 | # 拟合聚类器
214 | clusterer.fit(X)
215 |
216 | '''
217 | KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
218 | n_clusters=3, n_init=10, n_jobs=1, precompute_distances='auto',
219 | random_state=1, tol=0.0001, verbose=0)
220 | '''
221 |
222 | # 预测值
223 | df['group'] = clusterer.predict(X)
224 |
225 | # 前几个观测
226 | df.head(5)
227 | ```
228 |
229 | | | feature_1 | feature_2 | group |
230 | | --- | --- | --- | --- |
231 | | 0 | -9.877554 | -3.336145 | 0 |
232 | | 1 | -7.287210 | -8.353986 | 2 |
233 | | 2 | -6.943061 | -7.023744 | 2 |
234 | | 3 | -7.440167 | -8.791959 | 2 |
235 | | 4 | -6.641388 | -8.075888 | 2 |
236 |
237 | ## 为 LDA 选择最佳数量的成分
238 |
239 | 在 scikit-learn 中,LDA 是使用`LinearDiscriminantAnalysis`实现的,包含一个参数`n_components`,表示我们想要返回的特征数。 为了找出用于`n_components`的参数值(例如,要保留多少个特征),我们可以利用一个事实:`explained_variance_ratio_`告诉我们每个输出特征的解释方差比,并且是一个有序数组。
240 |
241 | 具体来说,我们可以运行`LinearDiscriminantAnalysis`,将`n_components`设置为`None`,返回每个成分的解释方差比,然后计算需要多少个成分才能超过解释方差的阈值(通常为 0.95 或 0.99)。
242 |
243 | ```py
244 | # 加载库
245 | from sklearn import datasets
246 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
247 |
248 | # 加载鸢尾花数据集
249 | iris = datasets.load_iris()
250 | X = iris.data
251 | y = iris.target
252 |
253 | # 创建并运行 LDA
254 | lda = LinearDiscriminantAnalysis(n_components=None)
255 | X_lda = lda.fit(X, y)
256 |
257 | # 创建解释方差比的数组
258 | lda_var_ratios = lda.explained_variance_ratio_
259 |
260 | # 创建函数
261 | def select_n_components(var_ratio, goal_var: float) -> int:
262 | # 设置目前为止的初始解释方差
263 | total_variance = 0.0
264 |
265 | # 设置初始特征数
266 | n_components = 0
267 |
268 | # 对于每个特征的解释方差
269 | for explained_variance in var_ratio:
270 |
271 | # 将解释方差添加到总体
272 | total_variance += explained_variance
273 |
274 | # 成分数加一
275 | n_components += 1
276 |
277 | # 如果我们达到了我们的解释方差目标
278 | if total_variance >= goal_var:
279 | # 结束循环
280 | break
281 |
282 | # 返回成分数量
283 | return n_components
284 |
285 | # 执行函数
286 | select_n_components(lda_var_ratios, 0.95)
287 |
288 | # 1
289 | ```
290 |
291 | ## 为 TSVD 选择最佳数量的成分
292 |
293 | ```py
294 | # 加载库
295 | from sklearn.preprocessing import StandardScaler
296 | from sklearn.decomposition import TruncatedSVD
297 | from scipy.sparse import csr_matrix
298 | from sklearn import datasets
299 | import numpy as np
300 |
301 | # 加载数据
302 | digits = datasets.load_digits()
303 |
304 | # 标准化特征矩阵
305 | X = StandardScaler().fit_transform(digits.data)
306 |
307 | # 制作稀疏矩阵
308 | X_sparse = csr_matrix(X)
309 |
310 | # 创建并使用特征数减一运行 TSVD
311 | tsvd = TruncatedSVD(n_components=X_sparse.shape[1]-1)
312 | X_tsvd = tsvd.fit(X_sparse)
313 |
314 | # 解释方差的列表
315 | tsvd_var_ratios = tsvd.explained_variance_ratio_
316 |
317 | # 创建函数
318 | def select_n_components(var_ratio, goal_var: float) -> int:
319 |     # 设置目前为止的初始解释方差
320 |     total_variance = 0.0
321 | 
322 |     # 设置初始特征数
323 |     n_components = 0
324 | 
325 |     # 对于每个特征的解释方差
326 |     for explained_variance in var_ratio:
327 | 
328 |         # 将解释方差添加到总体
329 |         total_variance += explained_variance
330 | 
331 |         # 成分数加一
332 |         n_components += 1
333 | 
334 |         # 如果我们达到了我们的解释方差目标
335 |         if total_variance >= goal_var:
336 |             # 结束循环
337 |             break
338 | 
339 |     # 返回成分数量
340 |     return n_components
341 |
342 | # 执行函数
343 | select_n_components(tsvd_var_ratios, 0.95)
344 |
345 | # 40
346 | ```
347 |
348 | ## 将 LDA 用于降维
349 |
350 | ```py
351 | # 加载库
352 | from sklearn import datasets
353 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
354 |
355 | # 加载鸢尾花数据集
356 | iris = datasets.load_iris()
357 | X = iris.data
358 | y = iris.target
359 |
360 | # 创建 LDA,它将数据降维到 1 个特征
361 | lda = LinearDiscriminantAnalysis(n_components=1)
362 |
363 | # 运行 LDA 并使用它转换特征
364 | X_lda = lda.fit(X, y).transform(X)
365 |
366 | # 打印特征数
367 | print('Original number of features:', X.shape[1])
368 | print('Reduced number of features:', X_lda.shape[1])
369 |
370 | '''
371 | Original number of features: 4
372 | Reduced number of features: 1
373 | '''
374 |
375 | # 查看解释方差比
376 | lda.explained_variance_ratio_
377 |
378 | # array([ 0.99147248])
379 | ```
380 |
--------------------------------------------------------------------------------
/11.md:
--------------------------------------------------------------------------------
1 | # 十一、线性回归
2 |
3 | > 作者:[Chris Albon](https://chrisalbon.com/)
4 | >
5 | > 译者:[飞龙](https://github.com/wizardforcel)
6 | >
7 | > 协议:[CC BY-NC-SA 4.0](http://creativecommons.org/licenses/by-nc-sa/4.0/)
8 |
9 | ## 添加交互项
10 |
11 | 
12 |
13 | ```py
14 | # 加载库
15 | from sklearn.linear_model import LinearRegression
16 | from sklearn.datasets import load_boston
17 | from sklearn.preprocessing import PolynomialFeatures
18 | import warnings
19 |
20 | # 屏蔽警告
21 | warnings.filterwarnings(action="ignore", module="scipy", message="^internal gelsd")
22 |
23 | # 加载只有两个特征的数据
24 | boston = load_boston()
25 | X = boston.data[:,0:2]
26 | y = boston.target
27 | ```
28 |
29 | 通过添加一个新的特征,它是交互特征的乘积,来添加交互项。
30 |
31 | 
32 |
33 | 其中  和  分别是两个特征的值, 表示两者之间的交互。使用 scikit-learn 的`PolynomialFeatures`,来为所有特征组合创建交互术项会很有用。 然后,我们可以使用模型选择策略,来识别产生最佳模型的特征和交互项的组合。
34 |
35 | ```py
36 | # 创建交互项(非多项式特征)
37 | interaction = PolynomialFeatures(degree=3, include_bias=False, interaction_only=True)
38 | X_inter = interaction.fit_transform(X)
39 |
40 | # 创建线性回归
41 | regr = LinearRegression()
42 |
43 | # 拟合线性回归
44 | model = regr.fit(X_inter, y)
45 | ```
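
上面提到可以用模型选择策略来比较带交互项和不带交互项的模型。下面是一个最小示意(并非原文代码),用交叉验证的 R 方粗略比较两者,其中 `cv=3` 只是示例参数:

```py
# 加载库(假设上面的 X、y、X_inter 已经创建)
from sklearn.model_selection import cross_val_score

# 不带交互项的模型的平均 R 方
score_plain = cross_val_score(LinearRegression(), X, y, cv=3).mean()

# 带交互项的模型的平均 R 方
score_inter = cross_val_score(LinearRegression(), X_inter, y, cv=3).mean()

score_plain, score_inter
```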
46 |
47 | ## 创建交互特征
48 |
49 | ```py
50 | # 加载库
51 | from sklearn.preprocessing import PolynomialFeatures
52 | import numpy as np
53 |
54 | # 创建特征矩阵
55 | X = np.array([[2, 3],
56 | [2, 3],
57 | [2, 3]])
58 |
59 | # 创建 PolynomialFeatures 对象,它的 interaction_only 设为 True
60 | interaction = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
61 |
62 | # 转换特征矩阵
63 | interaction.fit_transform(X)
64 | '''
65 | array([[ 2., 3., 6.],
66 | [ 2., 3., 6.],
67 | [ 2., 3., 6.]])
68 | '''
69 | ```
70 |
71 | ## Lasso 回归的 Alpha 的效果
72 |
73 | 我们通常希望执行一个称为[正则化](https://en.wikipedia.org/wiki/Regularization)的过程,其中我们会惩罚模型中的系数数量,以便仅保留最重要的系数。 当你拥有带有 100,000 多个系数的数据集时,这一点尤为重要。
74 |
75 | [Lasso 回归](https://en.wikipedia.org/wiki/Lasso_(statistics))是正则化的常用建模技术。 它背后的数学非常有趣,但实际上,你需要知道的是,Lasso 回归带有一个参数`alpha`,而`alpha`越高,大多数特征系数越会为零。
76 |
77 | 也就是说,当`alpha`为`0`时,Lasso 回归产生与线性回归相同的系数。 当`alpha`非常大时,所有系数都为零。
78 |
79 | 在本教程中,我运行三个 Lasso 回归,具有不同的`alpha`值,并显示对系数结果的影响。
80 |
81 | ```py
82 | from sklearn.linear_model import Lasso
83 | from sklearn.preprocessing import StandardScaler
84 | from sklearn.datasets import load_boston
85 | import pandas as pd
86 |
87 | boston = load_boston()
88 | scaler = StandardScaler()
89 | X = scaler.fit_transform(boston["data"])
90 | Y = boston["target"]
91 | names = boston["feature_names"]
92 |
93 | # 创建函数 lasso
94 | def lasso(alphas):
95 |     '''
96 |     接受 alpha 列表。输出数据帧,包含每个 alpha 的 Lasso 回归的系数。
97 |     '''
98 |     # 创建空数据帧
99 |     df = pd.DataFrame()
100 | 
101 |     # 创建特征名称列
102 |     df['Feature Name'] = names
103 | 
104 |     # 对于列表中的每个 alpha 值,
105 |     for alpha in alphas:
106 |         # 创建这个 alpha 值的 lasso 回归,
107 |         lasso = Lasso(alpha=alpha)
108 | 
109 |         # 拟合 lasso 回归
110 |         lasso.fit(X, Y)
111 | 
112 |         # 为这个 alpha 值创建列名称
113 |         column_name = 'Alpha = %f' % alpha
114 | 
115 |         # 创建系数列
116 |         df[column_name] = lasso.coef_
117 | 
118 |     # 返回数据帧
119 |     return df
120 |
121 | # 调用函数 lasso
122 | lasso([.0001, .5, 10])
123 | ```
124 |
125 | | | Feature Name | Alpha = 0.000100 | Alpha = 0.500000 | Alpha = 10.000000 |
126 | | --- | --- | --- | --- | --- |
127 | | 0 | CRIM | -0.920130 | -0.106977 | -0.0 |
128 | | 1 | ZN | 1.080498 | 0.000000 | 0.0 |
129 | | 2 | INDUS | 0.142027 | -0.000000 | -0.0 |
130 | | 3 | CHAS | 0.682235 | 0.397399 | 0.0 |
131 | | 4 | NOX | -2.059250 | -0.000000 | -0.0 |
132 | | 5 | RM | 2.670814 | 2.973323 | 0.0 |
133 | | 6 | AGE | 0.020680 | -0.000000 | -0.0 |
134 | | 7 | DIS | -3.104070 | -0.169378 | 0.0 |
135 | | 8 | RAD | 2.656950 | -0.000000 | -0.0 |
136 | | 9 | TAX | -2.074110 | -0.000000 | -0.0 |
137 | | 10 | PTRATIO | -2.061921 | -1.599574 | -0.0 |
138 | | 11 | B | 0.856553 | 0.545715 | 0.0 |
139 | | 12 | LSTAT | -3.748470 | -3.668884 | -0.0 |
140 |
141 | 请注意,随着 `alpha` 值的增加,更多特征的系数变为 0。
142 |
143 | ## Lasso 回归
144 |
145 | ```py
146 | # 加载库
147 | from sklearn.linear_model import Lasso
148 | from sklearn.datasets import load_boston
149 | from sklearn.preprocessing import StandardScaler
150 |
151 | # 加载数据
152 | boston = load_boston()
153 | X = boston.data
154 | y = boston.target
155 |
156 | # 标准化特征
157 | scaler = StandardScaler()
158 | X_std = scaler.fit_transform(X)
159 | ```
160 |
161 | 超参数 $\alpha$ 让我们控制我们对系数的惩罚程度,更高的 $\alpha$ 值创建更简单的模型。$\alpha$ 的理想值应该像任何其他超参数一样调整。 在 scikit-learn 中,使用`alpha`参数设置 $\alpha$。
162 |
163 | ```py
164 | # 创建带有某个 alpha 值的 Lasso
165 | regr = Lasso(alpha=0.5)
166 |
167 | # 拟合 Lasso 回归
168 | model = regr.fit(X_std, y)
169 | ```
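
上面的 `alpha=0.5` 只是一个固定值。正如前面所说,alpha 应该像其他超参数一样调整;下面是一个最小示意(并非原文代码),用 `LassoCV` 通过交叉验证在几个候选值中选择 alpha,候选值 0.1、0.5、1.0 只是示例:

```py
# 加载库
from sklearn.linear_model import LassoCV

# 创建带有若干候选 alpha 值的 LassoCV
regr_cv = LassoCV(alphas=[0.1, 0.5, 1.0])

# 拟合并查看选出的 alpha
model_cv = regr_cv.fit(X_std, y)
model_cv.alpha_
```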
170 |
171 | ## 线性回归
172 |
173 | 来源:[scikit-learn](http://scikit-learn.org/stable/auto_examples/linear_model/plot_ols.html#example-linear-model-plot-ols-py),[DrawMyData](http://robertgrantstats.co.uk/drawmydata.html).
174 |
175 | 本教程的目的是,简要介绍机器学习中使用的统计模型构建的逻辑。如果你想更加了解本教程背后的理论,请查看[统计学习导论](https://www.amazon.com/Introduction-Statistical-Learning-Applications-Statistics/dp/1461471370)。
176 |
177 | 让我们开始吧。
178 |
179 | ```py
180 | import pandas as pd
181 | from sklearn import linear_model
182 | import random
183 | import numpy as np
184 | %matplotlib inline
185 | ```
186 |
187 | 添加这些库后,让我们加载数据集(数据集可以在他的站点的 GitHub 仓库中找到)。
188 |
189 | ```py
190 | # 加载数据
191 | df = pd.read_csv('../data/simulated_data/battledeaths_n300_cor99.csv')
192 |
193 | # 打乱数据的行(这是必要的,
194 | # 仅仅由于我使用 DrawMyData 创建数据的方式。真正的分析中通常不需要)
195 | df = df.sample(frac=1)
196 | ```
197 |
198 | 让我们看一下数据的前几行,以便了解它。
199 |
200 | ```py
201 | # 查看数据的前几行
202 | df.head()
203 | ```
204 |
205 | | | friendly_battledeaths | enemy_battledeaths |
206 | | --- | --- | --- |
207 | | 7 | 8.2051 | 9.6154 |
208 | | 286 | 88.7179 | 86.1538 |
209 | | 164 | 14.3590 | 8.8462 |
210 | | 180 | 38.9744 | 36.5385 |
211 | | 89 | 93.0769 | 93.0769 |
212 |
213 | 现在让我们绘制数据,以便我们可以看到它的结构。
214 |
215 | ```py
216 | # 绘制两个变量,彼此对照
217 | df.plot(x='friendly_battledeaths', y='enemy_battledeaths', kind='scatter')
218 |
220 | ```
221 |
222 | 
223 |
224 | 现在是真正的工作了。 为了判断我们的模型有多好,我们需要一些东西来测试它。 我们可以使用称为交叉验证的技术来实现这一目标。 交叉验证可以变得更加复杂和强大,但在这个例子中,我们将使用这种技术的最简单版本。
225 |
226 | ### 步骤
227 |
228 | 1. 将数据集划分为两个数据集:我们将用于训练模型的“训练”数据集,和我们将用于判断该模型准确率的“测试”数据集。
229 | 2. 在“训练”数据上训练模型。
230 | 3. 将该模型应用于测试数据的`X`变量,创建模型对测试数据`Y`的猜测。
231 | 4. 比较模型对测试数据`Y`的预测,与实际测试数据`Y`的接近程度。
232 |
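原文在下面的代码中手动把前 30 个观测作为测试集。仅作参考(并非原文做法),同样的切分也可以用 scikit-learn 的 `train_test_split` 随机完成,其中 `test_size=0.1`、`random_state=1` 只是示例参数:

```py
# 一种替代写法:随机划分训练集和测试集
from sklearn.model_selection import train_test_split

X_train_alt, X_test_alt, y_train_alt, y_test_alt = train_test_split(
    df[['friendly_battledeaths']], df['enemy_battledeaths'],
    test_size=0.1, random_state=1)
```
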
233 | ```py
234 | # 创建我们的预测器/自变量
235 | # 以及我们的响应/因变量
236 | X = df['friendly_battledeaths']
237 | y = df['enemy_battledeaths']
238 |
239 | # 从前 30 个观测中创建测试数据
240 | X_test = X[0:30].values.reshape(-1,1)
241 | y_test = y[0:30]
242 |
243 | # 从剩余的观测中创建我们的训练数据
244 | X_train = X[30:].values.reshape(-1,1)
245 | y_train = y[30:]
246 | ```
247 |
248 | 让我们使用我们的训练数据训练模型。
249 |
250 | ```py
251 | # 创建 OLS 回归对象
252 | ols = linear_model.LinearRegression()
253 |
254 | # 使用训练数据来训练模型
255 | model = ols.fit(X_train, y_train)
256 | ```
257 |
258 | 以下是模型的一些基本输出,特别是系数和 R 方得分。
259 |
260 | ```py
261 | # 查看训练模型的系数
262 | model.coef_
263 |
264 | # array([ 0.97696721])
265 |
266 | # 查看 R 方得分
267 | model.score(X_test, y_test)
268 |
269 | # 0.98573393818904709
270 | ```
271 |
272 | 现在我们已经使用训练数据,来训练一个名为`model`的模型,我们可以将它应用于测试数据的`X`,来预测测试数据的`Y`。
273 |
274 | 以前我们使用`X_train`和`y_train`来训练线性回归模型,我们将其存储为一个名为`model`的变量。 代码`model.predict(X_test)`将训练好的模型应用于`X_test`数据,这是模型以前从未见过的数据,来生成`Y`的预测值。
275 |
276 | 只需运行代码即可轻松看到:
277 |
278 | ```py
279 | # 在 X_test 上运行模型并显示前五个结果
280 | list(model.predict(X_test)[0:5])
281 | '''
282 | [7.4633347104887342,
283 | 86.121700007313791,
284 | 13.475493202059415,
285 | 37.523931774900845,
286 | 90.380300060086256]
287 | '''
288 | ```
289 |
290 | 这个数组是模型对测试数据`Y`值的最佳猜测。 将它们与实际测试数据`Y`值进行比较:
291 |
292 | ```py
293 | # 查看前五个测试 Y 值
294 | list(y_test)[0:5]
295 | '''
296 | [9.6153999999999993,
297 | 86.153800000000004,
298 | 8.8461999999999996,
299 | 36.538499999999999,
300 | 93.076899999999995]
301 | '''
302 | ```
303 |
304 | 模型的预测值与实际值之间的差异,是我们判断模型的准确率的方式,因为完全准确的模型没有残差。
305 |
306 | 但是,要判断模型,我们需要一个可用作度量的统计量(数字)。 我们希望这个度量能够捕获数据中所有观测的预测值与实际值之间的差异。
307 |
308 | 用于量化`Y`的最常见统计量是**残差平方和**:
309 |
310 | 
311 |
312 | 不要让数学符号吓到:
313 |
314 | * $\hat{y}_i$ 是我们训练的模型给出的预测值:`model.predict(X_test)`
315 | * $y_i$ 是测试数据的`y`:`y_test`
316 | * $(\cdot)^2$ 是指数(平方):`**2`
317 | * $\sum$ 是求和:`.sum()`
318 |
319 | 在残差的平方和中,对于每个观测,我们找到模型的预测`Y`和实际`Y`值之间的差异,然后将该差异平方来使所有值为正。 然后我们将所有这些平方差加在一起得到一个数字。 最终结果是一个统计量,表示模型的预测与实际值的距离。
320 |
321 | ```py
322 | # 将我们使用训练数据创建的模型
323 | # 应用于测试数据,并计算RSS。
324 | ((y_test - model.predict(X_test)) **2).sum()
325 |
326 | # 313.6087355571951
327 | ```
328 |
329 | 注意:你还可以使用均方误差(MSE),它是 RSS 除以自由度。 但我发现用 RSS 来思考是有帮助的。
330 |
331 | ```py
332 | # 计算 MSE
333 | np.mean((model.predict(X_test) - y_test) **2)
334 |
335 | # 10.45362451857317
336 | ```
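
可以验证一下这两个数字的关系:这里用 `np.mean` 计算的 MSE,正好等于上面的 RSS 除以测试集中观测的数量(30):

```py
# RSS 除以测试观测数,结果与上面的 MSE 相同
313.6087355571951 / 30

# 10.45362451857317
```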
337 |
338 | ## Sklearn 线性回归
339 |
340 | ```py
341 | # 加载库
342 | from sklearn.linear_model import LinearRegression
343 | from sklearn.datasets import load_boston
344 | import warnings
345 |
346 | # 屏蔽警告
347 | warnings.filterwarnings(action="ignore", module="scipy", message="^internal gelsd")
348 |
349 | # 加载数据
350 | boston = load_boston()
351 | X = boston.data
352 | y = boston.target
353 |
354 | # 创建线性回归
355 | regr = LinearRegression()
356 |
357 | # 拟合线性回归
358 | model = regr.fit(X, y)
359 |
360 | # 查看截距(偏差)
361 | model.intercept_
362 |
363 | # 36.491103280361038
364 |
365 | # 查看特征系数(权重)
366 | model.coef_
367 |
368 | '''
369 | array([ -1.07170557e-01, 4.63952195e-02, 2.08602395e-02,
370 | 2.68856140e+00, -1.77957587e+01, 3.80475246e+00,
371 | 7.51061703e-04, -1.47575880e+00, 3.05655038e-01,
372 | -1.23293463e-02, -9.53463555e-01, 9.39251272e-03,
373 | -5.25466633e-01])
374 | '''
375 | ```
376 |
377 | ## 岭回归
378 |
379 | 
380 |
381 | ```py
382 | # 加载库
383 | from sklearn.linear_model import Ridge
384 | from sklearn.datasets import load_boston
385 | from sklearn.preprocessing import StandardScaler
386 |
387 | # 加载数据
388 | boston = load_boston()
389 | X = boston.data
390 | y = boston.target
391 |
392 | # 标准化特征
393 | scaler = StandardScaler()
394 | X_std = scaler.fit_transform(X)
395 | ```
396 |
397 | 超参数 $\alpha$ 让我们控制我们对系数的惩罚程度,更高的 $\alpha$ 值创建更简单的模型。$\alpha$ 的理想值应该像任何其他超参数一样调整。 在 scikit-learn 中,使用`alpha`参数设置 $\alpha$。
398 |
399 | ```py
400 | # 创建带有 alpha 值的岭回归
401 | regr = Ridge(alpha=0.5)
402 |
403 | # 拟合岭回归
404 | model = regr.fit(X_std, y)
405 | ```
406 |
407 | ## 为岭回归选择最佳的 alpha 值
408 |
409 | ```py
410 | # 加载库
411 | from sklearn.linear_model import RidgeCV
412 | from sklearn.datasets import load_boston
413 | from sklearn.preprocessing import StandardScaler
414 |
415 | # 加载数据
416 | boston = load_boston()
417 | X = boston.data
418 | y = boston.target
419 | ```
420 |
421 | 注意:在线性回归中,系数的值部分由特征的尺度决定,而在正则化模型中,所有系数的惩罚会加在一起,因此我们必须确保在训练之前将特征标准化。
422 |
423 | ```py
424 | # 标准化特征
425 | scaler = StandardScaler()
426 | X_std = scaler.fit_transform(X)
427 |
428 | # 创建带有三个可能 alpha 值的岭回归
429 | regr_cv = RidgeCV(alphas=[0.1, 1.0, 10.0])
430 | ```
431 |
432 | scikit-learn 包含`RidgeCV`方法,允许我们为 $\alpha$ 选择理想值:
433 |
434 | ```py
435 | # 拟合岭回归
436 | model_cv = regr_cv.fit(X_std, y)
437 |
438 | # 查看 alpha
439 | model_cv.alpha_
440 |
441 | # 1.0
442 | ```
--------------------------------------------------------------------------------
/4.md:
--------------------------------------------------------------------------------
1 | # 四、图像预处理
2 |
3 | > 作者:[Chris Albon](https://chrisalbon.com/)
4 | >
5 | > 译者:[飞龙](https://github.com/wizardforcel)
6 | >
7 | > 协议:[CC BY-NC-SA 4.0](http://creativecommons.org/licenses/by-nc-sa/4.0/)
8 |
9 | ## 图像二值化
10 |
11 | ```py
12 | # 加载库
13 | import cv2
14 | import numpy as np
15 | from matplotlib import pyplot as plt
16 |
17 | # 将图像加载为灰度
18 | image_grey = cv2.imread('img/plane_256x256.jpg', cv2.IMREAD_GRAYSCALE)
19 |
20 | # 应用自适应阈值
21 | max_output_value = 255
22 | neighborhood_size = 99
23 | subtract_from_mean = 10
24 | image_binarized = cv2.adaptiveThreshold(image_grey,
25 | max_output_value,
26 | cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
27 | cv2.THRESH_BINARY,
28 | neighborhood_size,
29 | subtract_from_mean)
30 |
31 | # 展示图像
32 | plt.imshow(image_binarized, cmap='gray'), plt.axis("off")
33 | plt.show()
34 | ```
35 |
36 | 
37 |
38 | ## 图像模糊
39 |
40 | ```py
41 | # 加载库
42 | import cv2
43 | import numpy as np
44 | from matplotlib import pyplot as plt
45 |
46 | # 将图像加载为灰度
47 | image = cv2.imread('img/plane_256x256.jpg', cv2.IMREAD_GRAYSCALE)
48 |
49 | # 使图像模糊
50 | image_blurry = cv2.blur(image, (5,5))
51 |
52 | # 展示图像
53 | plt.imshow(image_blurry, cmap='gray'), plt.xticks([]), plt.yticks([])
54 | plt.show()
55 | ```
56 |
57 | 
58 |
59 | ## 图像剪裁
60 |
61 | ```py
62 | # 加载库
63 | import cv2
64 | import numpy as np
65 | from matplotlib import pyplot as plt
66 |
67 | # 将图像加载为灰度
68 | image = cv2.imread('img/plane_256x256.jpg', cv2.IMREAD_GRAYSCALE)
69 |
70 | # 选择所有行,和前一半的列
71 | image_cropped = image[:,:126]
72 |
73 | # 查看图像
74 | plt.imshow(image_cropped, cmap='gray'), plt.axis("off")
75 | plt.show()
76 | ```
77 |
78 | 
79 |
80 | ## 边缘检测
81 |
82 | ```py
83 | # 加载库
84 | import cv2
85 | import numpy as np
86 | from matplotlib import pyplot as plt
87 |
88 | # 将图像加载为灰度
89 | image_gray = cv2.imread('img/plane_256x256.jpg', cv2.IMREAD_GRAYSCALE)
90 |
91 | # 计算强度中值
92 | median_intensity = np.median(image_gray)
93 |
94 | # 将阈值设为强度中值上下一个标准差
95 | lower_threshold = int(max(0, (1.0 - 0.33) * median_intensity))
96 | upper_threshold = int(min(255, (1.0 + 0.33) * median_intensity))
97 |
98 | # 应用 canny 边缘检测
99 | image_canny = cv2.Canny(image_gray, lower_threshold, upper_threshold)
100 |
101 | # 展示图像
102 | plt.imshow(image_canny, cmap='gray'), plt.axis("off")
103 | plt.show()
104 | ```
105 |
106 | 
107 |
108 | ## 增强彩色图像的对比度
109 |
110 | ```py
111 | # 加载库
112 | import cv2
113 | import numpy as np
114 | from matplotlib import pyplot as plt
115 |
116 | # 加载图像
117 | image_bgr = cv2.imread('img/plane.jpg')
118 |
119 | # 转换为 YUV
120 | image_yuv = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2YUV)
121 |
122 | # 应用直方图均衡
123 | image_yuv[:, :, 0] = cv2.equalizeHist(image_yuv[:, :, 0])
124 |
125 | # 转换为 RGB
126 | image_rgb = cv2.cvtColor(image_yuv, cv2.COLOR_YUV2RGB)
127 |
128 | # 展示图像
129 | plt.imshow(image_rgb), plt.axis("off")
130 | plt.show()
131 | ```
132 |
133 | 
134 |
135 | ## 增强灰度图像的对比度
136 |
137 | ```py
138 | # 加载库
139 | import cv2
140 | import numpy as np
141 | from matplotlib import pyplot as plt
142 |
143 | # 将图像加载为灰度
144 | image = cv2.imread('img/plane_256x256.jpg', cv2.IMREAD_GRAYSCALE)
145 |
146 | # 增强图像
147 | image_enhanced = cv2.equalizeHist(image)
148 |
149 | # 展示图像
150 | plt.imshow(image_enhanced, cmap='gray'), plt.axis("off")
151 | plt.show()
152 | ```
153 |
154 | 
155 |
156 | ## Harris 角点检测
157 |
158 | Harris 角点检测器是检测两个边缘角点的常用方法。 它寻找窗口(也称为邻域或补丁),其中窗口的小移动(想象摇动窗口)使窗口内的像素内容产生大的变化。
159 |
160 | ```py
161 | # 加载库
162 | import cv2
163 | import numpy as np
164 | from matplotlib import pyplot as plt
165 |
166 | # 加载图像,并转换为灰度
167 | image_bgr = cv2.imread('img/plane_256x256.jpg')
168 | image_gray = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY)
169 | image_gray = np.float32(image_gray)
170 |
171 | # 设置角点检测器的参数
172 | block_size = 2
173 | aperture = 29
174 | free_parameter = 0.04
175 |
176 | # 检测角点
177 | detector_responses = cv2.cornerHarris(image_gray, block_size, aperture, free_parameter)
178 |
179 | # 放大角点标记
180 | detector_responses = cv2.dilate(detector_responses, None)
181 |
182 | # 只保留大于阈值的检测器结果,标记为白色
183 | threshold = 0.02
184 | image_bgr[detector_responses > threshold * detector_responses.max()] = [255,255,255]
185 |
186 | # 转换为灰度
187 | image_gray = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY)
188 |
189 | # 展示图像
190 | plt.imshow(image_gray, cmap='gray'), plt.axis("off")
191 | plt.show()
192 | ```
193 |
194 | 
195 |
196 | ## 安装 OpenCV
197 |
198 | 虽然有许多优秀的图像处理库,但 OpenCV 是最受欢迎且文档最全的一个。 使用 OpenCV 的最大障碍之一就是安装它。 但是,幸运的是,我们可以使用 Anaconda 的软件包管理器工具 conda,在我们的终端中用一行代码安装 OpenCV:
199 |
200 | ```
201 | conda install --channel https://conda.anaconda.org/menpo opencv3
202 | ```
203 |
204 | 之后,我们可以通过打开笔记本,导入 OpenCV 并查看版本号,来确认安装成功:
205 |
206 | ```py
207 | # 加载库
208 | import cv2
209 |
210 | # 查看版本号
211 | cv2.__version__
212 |
213 | # '3.2.0'
214 | ```
215 |
216 | ## 颜色隔离
217 |
218 | ```py
219 | # 加载库
220 | import cv2
221 | import numpy as np
222 | from matplotlib import pyplot as plt
223 |
224 | # 加载图像
225 | image_bgr = cv2.imread('img/plane_256x256.jpg')
226 |
227 | # 将 BGR 转换为 HSV
228 | image_hsv = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2HSV)
229 |
230 | # 定义 HSV 中蓝色值的范围
231 | lower_blue = np.array([50,100,50])
232 | upper_blue = np.array([130,255,255])
233 |
234 | # 创建遮罩
235 | mask = cv2.inRange(image_hsv, lower_blue, upper_blue)
236 |
237 | # 屏蔽图像
238 | image_bgr_masked = cv2.bitwise_and(image_bgr, image_bgr, mask=mask)
239 |
240 | # 将 BGR 转换为 RGB
241 | image_rgb = cv2.cvtColor(image_bgr_masked, cv2.COLOR_BGR2RGB)
242 |
243 | # 展示图像
244 | plt.imshow(image_rgb), plt.axis("off")
245 | plt.show()
246 | ```
247 |
248 | 
249 |
250 | ## 加载图像
251 |
252 | ```py
253 | # 加载库
254 | import cv2
255 | import numpy as np
256 | from matplotlib import pyplot as plt
257 |
258 | # 将图像加载为灰度
259 | image = cv2.imread('img/plane.jpg', cv2.IMREAD_GRAYSCALE)
260 |
261 | # 展示图像
262 | plt.imshow(image, cmap='gray'), plt.axis("off")
263 | plt.show()
264 | ```
265 |
266 | 
267 |
268 | ```py
269 | # 加载彩色图像
270 | image_bgr = cv2.imread('img/plane.jpg', cv2.IMREAD_COLOR)
271 |
272 | # 转换为 RGB
273 | image_rgb = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)
274 |
275 | # 展示图像
276 | plt.imshow(image_rgb), plt.axis("off")
277 | plt.show()
278 | ```
279 |
280 | 
281 |
282 | ```py
283 | # 展示图像数据
284 | image
285 |
286 | '''
287 | array([[140, 136, 146, ..., 132, 139, 134],
288 | [144, 136, 149, ..., 142, 124, 126],
289 | [152, 139, 144, ..., 121, 127, 134],
290 | ...,
291 | [156, 146, 144, ..., 157, 154, 151],
292 | [146, 150, 147, ..., 156, 158, 157],
293 | [143, 138, 147, ..., 156, 157, 157]], dtype=uint8)
294 | '''
295 |
296 | # 展示维度
297 | image.shape
298 |
299 | # (2270, 3600)
300 | ```
301 |
302 | ## 背景移除
303 |
304 | [](https://machinelearningflashcards.com)
305 |
306 | ```py
307 | # 加载库
308 | import cv2
309 | import numpy as np
310 | from matplotlib import pyplot as plt
311 |
312 | # 加载图像
313 | image_bgr = cv2.imread('img/plane_256x256.jpg')
314 |
315 | # 转换为 RGB
316 | image_rgb = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)
317 |
318 | # 矩形值:起点 x,起点 y,宽度,高度
319 | rectangle = (0, 56, 256, 150)
320 |
321 | # 创建初始遮罩
322 | mask = np.zeros(image_rgb.shape[:2], np.uint8)
323 |
324 | # 创建用于 grabCut 的临时数组
325 | bgdModel = np.zeros((1, 65), np.float64)
326 | fgdModel = np.zeros((1, 65), np.float64)
327 |
328 | # 执行 grabCut
329 | cv2.grabCut(image_rgb, # 我们的图像
330 | mask, # 遮罩
331 | rectangle, # 我们的矩形
332 | bgdModel, # 用于背景的临时数组
333 | fgdModel, # 用于前景的临时数组
334 | 5, # 迭代数量
335 | cv2.GC_INIT_WITH_RECT) # 使用我们的矩形来初始化
336 |
337 | # 创建遮罩,其中背景设置为 0,否则为 1
338 | mask_2 = np.where((mask==2) | (mask==0), 0, 1).astype('uint8')
339 |
340 | # 使用新的遮罩移除多个图像的背景
341 | image_rgb_nobg = image_rgb * mask_2[:, :, np.newaxis]
342 |
343 | # 展示图像
344 | plt.imshow(image_rgb_nobg), plt.axis("off")
345 | plt.show()
346 | ```
347 |
348 | 
349 |
350 | ## 保存图像
351 |
352 | ```py
353 | # 加载库
354 | import cv2
355 | import numpy as np
356 | from matplotlib import pyplot as plt
357 |
358 | # 将图像加载为灰度
359 | image = cv2.imread('img/plane.jpg', cv2.IMREAD_GRAYSCALE)
360 |
361 | # 展示图像
362 | plt.imshow(image, cmap='gray'), plt.axis("off")
363 | plt.show()
364 | ```
365 |
366 | 
367 |
368 | ```py
369 | # 保存图像
370 | cv2.imwrite('img/plane_new.jpg', image)
371 |
372 | # True
373 | ```
374 |
375 | ## 图像锐化
376 |
377 | ```py
378 | # 加载库
379 | import cv2
380 | import numpy as np
381 | from matplotlib import pyplot as plt
382 |
383 | # 将图像加载为灰度
384 | image = cv2.imread('img/plane_256x256.jpg', cv2.IMREAD_GRAYSCALE)
385 |
386 | # 创建核
387 | kernel = np.array([[0, -1, 0],
388 | [-1, 5,-1],
389 | [0, -1, 0]])
390 |
391 | # 锐化图像
392 | image_sharp = cv2.filter2D(image, -1, kernel)
393 |
394 | # 展示图像
395 | plt.imshow(image_sharp, cmap='gray'), plt.axis("off")
396 | plt.show()
397 | ```
398 |
399 | 
400 |
401 | ## Shi-Tomasi 角点检测
402 |
403 | ```py
404 | # 加载库
405 | import cv2
406 | import numpy as np
407 | from matplotlib import pyplot as plt
408 |
409 | # 加载图像
410 | image_bgr = cv2.imread('img/plane_256x256.jpg')
411 | image_gray = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY)
412 |
413 | # 要检测的角点数量
414 | corners_to_detect = 10
415 | minimum_quality_score = 0.05
416 | minimum_distance = 25
417 |
418 | # 检测角点
419 | corners = cv2.goodFeaturesToTrack(image_gray,
420 | corners_to_detect,
421 | minimum_quality_score,
422 | minimum_distance)
423 | corners = np.float32(corners)
424 |
425 | # 在每个角点上绘制白色圆圈
426 | for corner in corners:
427 |     x, y = corner[0]
428 |     cv2.circle(image_bgr, (int(x), int(y)), 10, (255,255,255), -1)
429 |
430 | # 转换为灰度
431 | image_gray = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY)
432 |
433 | # 展示图像
434 | plt.imshow(image_gray, cmap='gray'), plt.axis("off")
435 | plt.show()
436 | ```
437 |
438 | 
439 |
440 | ## 使用颜色均值作为特征
441 |
442 | ```py
443 | # 加载库
444 | import cv2
445 | import numpy as np
446 | from matplotlib import pyplot as plt
447 |
448 | # 将图像加载为 BGR
449 | image_bgr = cv2.imread('img/plane_256x256.jpg', cv2.IMREAD_COLOR)
450 |
451 | # 计算每个通道的均值
452 | channels = cv2.mean(image_bgr)
453 |
454 | # 交换蓝色和红色值(使其变成 RGB 而不是 BGR)
455 | observation = np.array([(channels[2], channels[1], channels[0])])
456 |
457 | # 展示通道的均值
458 | observation
459 |
460 | # array([[ 90.53204346, 133.11735535, 169.03074646]])
461 |
462 | # 展示图像
463 | plt.imshow(observation), plt.axis("off")
464 | plt.show()
465 | ```
466 |
467 | 
468 |
--------------------------------------------------------------------------------
/16.md:
--------------------------------------------------------------------------------
1 | # 十六、朴素贝叶斯
2 |
3 | > 作者:[Chris Albon](https://chrisalbon.com/)
4 | >
5 | > 译者:[飞龙](https://github.com/wizardforcel)
6 | >
7 | > 协议:[CC BY-NC-SA 4.0](http://creativecommons.org/licenses/by-nc-sa/4.0/)
8 |
9 | ## 伯努利朴素贝叶斯
10 |
11 | 伯努利朴素贝叶斯分类器假设我们的所有特征都是二元的,它们仅有两个值(例如,已经是独热编码的标称分类特征)。
12 |
13 | ```py
14 | # 加载库
15 | import numpy as np
16 | from sklearn.naive_bayes import BernoulliNB
17 |
18 | # 创建三个二元特征
19 | X = np.random.randint(2, size=(100, 3))
20 |
21 | # 创建二元目标向量
22 | y = np.random.randint(2, size=(100, 1)).ravel()
23 |
24 | # 查看前十个观测
25 | X[0:10]
26 |
27 | '''
28 | array([[1, 1, 1],
29 | [0, 1, 0],
30 | [1, 1, 1],
31 | [0, 0, 0],
32 | [1, 0, 1],
33 | [1, 1, 1],
34 | [0, 1, 1],
35 | [1, 1, 1],
36 | [1, 1, 1],
37 | [1, 1, 0]])
38 | '''
39 |
40 | # 创建伯努利朴素贝叶斯对象,带有每个类别的先验概率
41 | clf = BernoulliNB(class_prior=[0.25, 0.5])
42 |
43 | # 训练模型
44 | model = clf.fit(X, y)
45 | ```
46 |
47 | ## 校准预测概率
48 |
49 | 类别概率是机器学习模型中常见且有用的部分。 在 scikit-learn 中,大多数学习算法允许我们使用`predict_proba`来查看观测属于各个类别的预测概率。 例如,如果我们只想在模型预测某个观测属于某一类的概率超过 90% 时才预测该类,这就非常有用。 然而,一些模型(包括朴素贝叶斯分类器)输出的概率并不反映真实情况。 也就是说,`predict_proba`可能预测某个观测有 0.70 的概率属于某一类,而实际概率是 0.10 或 0.99。 特别是在朴素贝叶斯中,虽然不同目标类别的预测概率的排名是有效的,但原始预测概率倾向于接近 0 和 1 的极值。
50 |
51 | 为了获得有意义的预测概率,我们需要进行所谓的校准。 在 scikit-learn 中,我们可以使用`CalibratedClassifierCV`类,使用 k-fold 交叉验证创建校准良好的预测概率。 在`CalibratedClassifierCV`中,训练集用于训练模型,测试集用于校准预测概率。返回的预测概率是 k 折的平均值。
52 |
53 | ```py
54 | # 加载库
55 | from sklearn import datasets
56 | from sklearn.naive_bayes import GaussianNB
57 | from sklearn.calibration import CalibratedClassifierCV
58 |
59 | # 加载数据
60 | iris = datasets.load_iris()
61 | X = iris.data
62 | y = iris.target
63 |
64 | # 创建高斯朴素贝叶斯对象
65 | clf = GaussianNB()
66 |
67 | # 使用 sigmoid 校准创建校准的交叉验证
68 | clf_sigmoid = CalibratedClassifierCV(clf, cv=2, method='sigmoid')
69 |
70 | # 校准概率
71 | clf_sigmoid.fit(X, y)
72 |
73 | '''
74 | CalibratedClassifierCV(base_estimator=GaussianNB(priors=None), cv=2,
75 | method='sigmoid')
76 | '''
77 |
78 | # 创建新的观测
79 | new_observation = [[ 2.6, 2.6, 2.6, 0.4]]
80 |
81 | # 查看校准概率
82 | clf_sigmoid.predict_proba(new_observation)
83 |
84 | # array([[ 0.31859969, 0.63663466, 0.04476565]])
85 | ```
86 |
87 | ## 高斯朴素贝叶斯分类器
88 |
89 | 
90 |
91 | 由于正态分布的假设,高斯朴素贝叶斯最适用于我们所有特征都是连续的情况。
92 |
93 | ```py
94 | # 加载库
95 | from sklearn import datasets
96 | from sklearn.naive_bayes import GaussianNB
97 |
98 | # 加载数据
99 | iris = datasets.load_iris()
100 | X = iris.data
101 | y = iris.target
102 |
103 | # 创建高斯朴素贝叶斯对象,带有每个类别的先验概率
104 | clf = GaussianNB(priors=[0.25, 0.25, 0.5])
105 |
106 | # 训练模型
107 | model = clf.fit(X, y)
108 |
109 | # 创建新的观测
110 | new_observation = [[ 4, 4, 4, 0.4]]
111 |
112 | # 预测类别
113 | model.predict(new_observation)
114 |
115 | # array([1])
116 | ```
117 |
118 | 注意:来自高斯朴素贝叶斯的原始预测概率(使用`predict_proba`输出)没有校准。 也就是说,它们不应该被当真。 如果我们想要创建有用的预测概率,我们将需要使用保序回归(isotonic regression)或相关方法来校准它们。
119 |
120 | ## 多项式逻辑回归
121 |
122 | 在多项逻辑回归(MLR)中,我们在 Recipe 15.1 中看到的逻辑函数被 softmax 函数替换:
123 |
124 | 
125 |
126 | 其中  是第  个观测的目标值  是类  的概率, 是类的总数。MLR 的一个实际优点是使用`predict_proba`方法预测的概率更可靠(即校准更好)。
127 |
128 | ```py
129 | # 加载库
130 | from sklearn.linear_model import LogisticRegression
131 | from sklearn import datasets
132 | from sklearn.preprocessing import StandardScaler
133 |
134 | # 加载数据
135 | iris = datasets.load_iris()
136 | X = iris.data
137 | y = iris.target
138 |
139 | # 标准化特征
140 | scaler = StandardScaler()
141 | X_std = scaler.fit_transform(X)
142 |
143 | # 创建 OVR 逻辑回归对象
144 | clf = LogisticRegression(random_state=0, multi_class='multinomial', solver='newton-cg')
145 |
146 | # 训练模型
147 | model = clf.fit(X_std, y)
148 |
149 | # 创建新的观测
150 | new_observation = [[.5, .5, .5, .5]]
151 |
152 | # 预测类别
153 | model.predict(new_observation)
154 |
155 | # array([1])
156 |
157 | # 查看预测概率
158 | model.predict_proba(new_observation)
159 |
160 | # array([[ 0.01944996, 0.74469584, 0.2358542 ]])
161 | ```
162 |
163 | ## 多项式朴素贝叶斯分类器
164 |
165 | 多项式朴素贝叶斯的工作方式类似于高斯朴素贝叶斯,但假设这些特征是多项式分布的。 在实践中,这意味着当我们具有离散数据(例如,电影评级范围为 1 到 5)时,通常使用该分类器。
166 |
167 | ```py
168 | # 加载库
169 | import numpy as np
170 | from sklearn.naive_bayes import MultinomialNB
171 | from sklearn.feature_extraction.text import CountVectorizer
172 |
173 | # 创建文本
174 | text_data = np.array(['I love Brazil. Brazil!',
175 | 'Brazil is best',
176 | 'Germany beats both'])
177 |
178 | # 创建词袋
179 | count = CountVectorizer()
180 | bag_of_words = count.fit_transform(text_data)
181 |
182 | # 创建特征矩阵
183 | X = bag_of_words.toarray()
184 |
185 | # 创建目标向量
186 | y = np.array([0,0,1])
187 |
188 | # 创建多项式朴素贝叶斯对象,带有每个类别的先验概率
189 | clf = MultinomialNB(class_prior=[0.25, 0.5])
190 |
191 | # 训练模型
192 | model = clf.fit(X, y)
193 |
194 | # 创建新的观测
195 | new_observation = [[0, 0, 0, 1, 0, 1, 0]]
196 |
197 | # 预测新观测的类别
198 | model.predict(new_observation)
199 |
200 | # array([0])
201 | ```
202 |
203 | ## 从零编写朴素贝叶斯分类器
204 |
205 | 朴素贝叶斯是一种简单的分类器,当只有少量观测可用时,这种分类器表现良好。 在本教程中,我们将从头开始创建一个高斯朴素贝叶斯分类器,并使用它来预测以前未见过的数据点的类别。本教程基于 Wikipedia 的[朴素贝叶斯分类器页面](https://en.wikipedia.org/wiki/Naive_Bayes_classifier)上的示例,我已经用 Python 实现了它并调整了一些符号来改进解释。
206 |
207 | ```py
208 | import pandas as pd
209 | import numpy as np
210 | ```
211 |
212 | 我们的数据集包含八个个体的数据。 我们将使用数据集构建一个分类器,该分类器接收个体的身高,体重和脚码,并输出其性别预测。
213 |
214 | ```py
215 | # 创建空数据帧
216 | data = pd.DataFrame()
217 |
218 | # 创建我们的目标变量
219 | data['Gender'] = ['male','male','male','male','female','female','female','female']
220 |
221 | # 创建我们的特征变量
222 | data['Height'] = [6,5.92,5.58,5.92,5,5.5,5.42,5.75]
223 | data['Weight'] = [180,190,170,165,100,150,130,150]
224 | data['Foot_Size'] = [12,11,12,10,6,8,7,9]
225 |
226 | # 查看数据
227 | data
228 | ```
229 |
230 | | | Gender | Height | Weight | Foot_Size |
231 | | --- | --- | --- | --- | --- |
232 | | 0 | male | 6.00 | 180 | 12 |
233 | | 1 | male | 5.92 | 190 | 11 |
234 | | 2 | male | 5.58 | 170 | 12 |
235 | | 3 | male | 5.92 | 165 | 10 |
236 | | 4 | female | 5.00 | 100 | 6 |
237 | | 5 | female | 5.50 | 150 | 8 |
238 | | 6 | female | 5.42 | 130 | 7 |
239 | | 7 | female | 5.75 | 150 | 9 |
240 |
241 | 上面的数据集用于构造我们的分类器。 下面我们将创建一个新的个体,我们知道它的特征值,但不知道它的性别。我们的目标是预测它的性别。
242 |
243 | ```py
244 | # 创建空数据帧
245 | person = pd.DataFrame()
246 |
247 | # 为这一行创建相同特征值
248 | person['Height'] = [6]
249 | person['Weight'] = [130]
250 | person['Foot_Size'] = [8]
251 |
252 | # 查看数据
253 | person
254 | ```
255 |
256 | | | Height | Weight | Foot_Size |
257 | | --- | --- | --- | --- |
258 | | 0 | 6 | 130 | 8 |
259 |
260 | 贝叶斯定理是一个著名的方程,它允许我们根据数据进行预测。 这是贝叶斯定理的经典版本:
261 |
262 | 
263 |
264 | 这可能过于抽象,所以让我们替换一些变量以使其更具体。 在贝叶斯分类器中,给定数据的情况下,我们有兴趣找出观测的类别(例如男性或女性,垃圾邮件或非垃圾邮件):
265 |
266 | 
267 |
268 | 其中:
269 |
270 | * $\text{class}$ 是特定类别(例如男性)
271 | * $\text{data}$ 是观测的数据
272 | * $P(\text{class} \mid \text{data})$ 称为后验
273 | * $P(\text{data} \mid \text{class})$ 叫做似然
274 | * $P(\text{class})$ 叫做先验
275 | * $P(\text{data})$ 叫做边缘概率
276 |
277 | 在贝叶斯分类器中,我们计算每个观测的每个类的后验(严格来说,我们只计算后验的分子,但现在忽略它)。 然后,基于后验值最大的类别对观测分类。 在我们的例子中,我们为观测预测两个可能的类别(例如男性和女性),因此我们将计算两个后验:一个用于男性,一个用于女性。
278 |
279 | 
280 |
281 | 
282 |
283 | 高斯朴素贝叶斯可能是最受欢迎的贝叶斯分类器。 为了解释这个名称的含义,让我们看一下当我们应用两个类别(男性和女性)和三个特征变量(身高、体重和脚码)时,贝叶斯方程式的样子:
284 |
285 | 
286 |
287 | 
288 |
289 | 现在让我们解释一下上面的方程式:
290 |
291 | * $P(\text{male})$ 是先验概率。正如你所看到的,它只是观测是男性的概率。 这就是数据集中的男性数量除以数据集中的总人数。
292 | * $p(\text{height} \mid \text{male}) \, p(\text{weight} \mid \text{male}) \, p(\text{foot size} \mid \text{male})$ 是似然。注意我们已经把"观测的数据"展开成了数据集中的每个特征。"高斯"和"朴素"来自似然中的两个假设:
293 |     1. 如果你查看似然中的每一项,你会注意到,我们假设每个特征彼此不相关。 也就是说,脚码与体重或身高等无关。这显然不是真的,而且是一个"朴素"的假设 - 因此称为"朴素贝叶斯"。
294 |     2. 其次,我们假设特征的值(例如女性的身高、女性的体重)服从高斯(正态)分布。这意味着 $p(\text{height} \mid \text{female})$ 是通过将所需参数输入正态分布的概率密度函数来计算的:
295 | 
296 |     $p(\text{height} \mid \text{female}) = \frac{1}{\sqrt{2\pi\sigma_{\text{female height}}^2}} \, e^{-\frac{(\text{height} - \mu_{\text{female height}})^2}{2\sigma_{\text{female height}}^2}}$
297 | 
298 | * $P(\text{data})$(边际概率)可能是贝叶斯方法中最令人困惑的部分之一。 在玩具示例(包括我们的)中,完全可以计算边际概率。 但是,在许多实际情况中,要找到边际概率的值极其困难或不可能(解释为什么超出了本教程的范围)。 对于我们的分类器来说,这并不像你想象的那么严重。 为什么? 因为我们不关心真正的后验值是什么,我们只关心哪个类具有最高的后验值。 并且因为边际概率对于所有类别都是相同的,(1)我们可以忽略分母,(2)只计算每个类的后验分子,(3)选择最大的分子。 也就是说,我们可以忽略后验分母,并仅根据后验分子的相对值进行预测。
299 |
300 | 好的! 理论结束。 现在让我们开始计算贝叶斯方程的所有不同部分。
301 |
302 | 先验可以是常数或概率分布。 在我们的例子中,这只是性别的概率。计算这很简单:
303 |
304 | ```py
305 | # 男性数量
306 | n_male = data['Gender'][data['Gender'] == 'male'].count()
307 |
308 | # 女性数量
309 | n_female = data['Gender'][data['Gender'] == 'female'].count()
310 |
311 | # 总行数
312 | total_ppl = data['Gender'].count()
313 |
314 | # 男性比例
315 | P_male = n_male/total_ppl
316 |
317 | # 女性比例
318 | P_female = n_female/total_ppl
319 | ```
320 |
321 | 请记住,我们的似然中的每一项(例如 $p(\text{height} \mid \text{female})$)都可以看做正态分布的 PDF。 例如:
322 | 
323 | $p(\text{height} \mid \text{female}) = \frac{1}{\sqrt{2\pi\sigma_{\text{female height}}^2}} \, e^{-\frac{(\text{height} - \mu_{\text{female height}})^2}{2\sigma_{\text{female height}}^2}}$
324 |
325 | 这意味着对于每个类别(例如女性)和特征(例如身高)组合,我们需要从数据计算方差和均值。Pandas 让这很容易:
326 |
327 | ```py
328 | # 按性别分组数据,并计算每个特征的均值
329 | data_means = data.groupby('Gender').mean()
330 |
331 | # 查看值
332 | data_means
333 | ```
334 |
335 | | | Height | Weight | Foot_Size |
336 | | --- | --- | --- | --- |
337 | | Gender | | | |
338 | | female | 5.4175 | 132.50 | 7.50 |
339 | | male | 5.8550 | 176.25 | 11.25 |
340 |
341 | ```py
342 | # 按性别分组数据,并计算每个特征的方差
343 | data_variance = data.groupby('Gender').var()
344 |
345 | # 查看值
346 | data_variance
347 | ```
348 |
349 | | | Height | Weight | Foot_Size |
350 | | --- | --- | --- | --- |
351 | | Gender | | | |
352 | | female | 0.097225 | 558.333333 | 1.666667 |
353 | | male | 0.035033 | 122.916667 | 0.916667 |
354 |
355 | 现在我们可以创建我们需要的所有变量。 下面的代码可能看起来很复杂,但我们所做的,只是从上面两个表中的每个单元格中创建一个变量。
356 |
357 | ```py
358 | # 男性的均值
359 | male_height_mean = data_means['Height'][data_variance.index == 'male'].values[0]
360 | male_weight_mean = data_means['Weight'][data_variance.index == 'male'].values[0]
361 | male_footsize_mean = data_means['Foot_Size'][data_variance.index == 'male'].values[0]
362 |
363 | # 男性的方差
364 | male_height_variance = data_variance['Height'][data_variance.index == 'male'].values[0]
365 | male_weight_variance = data_variance['Weight'][data_variance.index == 'male'].values[0]
366 | male_footsize_variance = data_variance['Foot_Size'][data_variance.index == 'male'].values[0]
367 |
368 | # 女性的均值
369 | female_height_mean = data_means['Height'][data_variance.index == 'female'].values[0]
370 | female_weight_mean = data_means['Weight'][data_variance.index == 'female'].values[0]
371 | female_footsize_mean = data_means['Foot_Size'][data_variance.index == 'female'].values[0]
372 |
373 | # 女性的方差
374 | female_height_variance = data_variance['Height'][data_variance.index == 'female'].values[0]
375 | female_weight_variance = data_variance['Weight'][data_variance.index == 'female'].values[0]
376 | female_footsize_variance = data_variance['Foot_Size'][data_variance.index == 'female'].values[0]
377 | ```
378 |
379 | 最后,我们需要创建一个函数,来计算每个似然项的概率密度(例如 $p(\text{height} \mid \text{female})$)。
380 |
381 | ```py
382 | # 创建计算 p(x | y) 的函数
383 | def p_x_given_y(x, mean_y, variance_y):
384 | 
385 |     # 将参数输入到概率密度函数
386 |     p = 1/(np.sqrt(2*np.pi*variance_y)) * np.exp((-(x-mean_y)**2)/(2*variance_y))
387 | 
388 |     # 返回 p
389 |     return p
390 | ```
391 |
392 | 好的! 我们的贝叶斯分类器准备就绪。 请记住,既然我们可以忽略边际概率(分母),我们实际计算的是:
393 |
394 | 
395 |
396 | 为此,我们只需要插入未分类个体(`height = 6`)的值,数据集的变量(例如女性身高的均值)和我们上面编写的函数(`p_x_given_y`):
397 |
398 | ```py
399 | # 如果未分类的观测是男性的后验分子
400 | P_male * \
401 | p_x_given_y(person['Height'][0], male_height_mean, male_height_variance) * \
402 | p_x_given_y(person['Weight'][0], male_weight_mean, male_weight_variance) * \
403 | p_x_given_y(person['Foot_Size'][0], male_footsize_mean, male_footsize_variance)
404 |
405 | # 6.1970718438780782e-09
406 | ```
407 |
408 | ```py
409 | # 如果未分类的观测是女性的后验分子
410 | P_female * \
411 | p_x_given_y(person['Height'][0], female_height_mean, female_height_variance) * \
412 | p_x_given_y(person['Weight'][0], female_weight_mean, female_weight_variance) * \
413 | p_x_given_y(person['Foot_Size'][0], female_footsize_mean, female_footsize_variance)
414 |
415 | # 0.00053779091836300176
416 | ```
417 |
418 | 因为女性的后验分子大于男性,所以我们预测这个人是女性。
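
也可以把这一步比较写成代码。下面的示意直接使用上面两个单元格输出的数值(并非原文代码):

```py
# 比较两个后验分子,取较大者对应的类别作为预测
posterior_male = 6.1970718438780782e-09
posterior_female = 0.00053779091836300176

'female' if posterior_female > posterior_male else 'male'

# 'female'
```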
419 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International Public License (CC BY-NC-SA 4.0)
2 |
3 | Copyright © 2020 ApacheCN(apachecn@163.com)
4 |
5 | By exercising the Licensed Rights (defined below), You accept and agree to be bound by the terms and conditions of this Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International Public License ("Public License"). To the extent this Public License may be interpreted as a contract, You are granted the Licensed Rights in consideration of Your acceptance of these terms and conditions, and the Licensor grants You such rights in consideration of benefits the Licensor receives from making the Licensed Material available under these terms and conditions.
6 |
7 | Section 1 – Definitions.
8 |
9 | a. Adapted Material means material subject to Copyright and Similar Rights that is derived from or based upon the Licensed Material and in which the Licensed Material is translated, altered, arranged, transformed, or otherwise modified in a manner requiring permission under the Copyright and Similar Rights held by the Licensor. For purposes of this Public License, where the Licensed Material is a musical work, performance, or sound recording, Adapted Material is always produced where the Licensed Material is synched in timed relation with a moving image.
10 | b. Adapter's License means the license You apply to Your Copyright and Similar Rights in Your contributions to Adapted Material in accordance with the terms and conditions of this Public License.
11 | c. BY-NC-SA Compatible License means a license listed at creativecommons.org/compatiblelicenses, approved by Creative Commons as essentially the equivalent of this Public License.
12 | d. Copyright and Similar Rights means copyright and/or similar rights closely related to copyright including, without limitation, performance, broadcast, sound recording, and Sui Generis Database Rights, without regard to how the rights are labeled or categorized. For purposes of this Public License, the rights specified in Section 2(b)(1)-(2) are not Copyright and Similar Rights.
13 | e. Effective Technological Measures means those measures that, in the absence of proper authority, may not be circumvented under laws fulfilling obligations under Article 11 of the WIPO Copyright Treaty adopted on December 20, 1996, and/or similar international agreements.
14 | f. Exceptions and Limitations means fair use, fair dealing, and/or any other exception or limitation to Copyright and Similar Rights that applies to Your use of the Licensed Material.
15 | g. License Elements means the license attributes listed in the name of a Creative Commons Public License. The License Elements of this Public License are Attribution, NonCommercial, and ShareAlike.
16 | h. Licensed Material means the artistic or literary work, database, or other material to which the Licensor applied this Public License.
17 | i. Licensed Rights means the rights granted to You subject to the terms and conditions of this Public License, which are limited to all Copyright and Similar Rights that apply to Your use of the Licensed Material and that the Licensor has authority to license.
18 | j. Licensor means the individual(s) or entity(ies) granting rights under this Public License.
19 | k. NonCommercial means not primarily intended for or directed towards commercial advantage or monetary compensation. For purposes of this Public License, the exchange of the Licensed Material for other material subject to Copyright and Similar Rights by digital file-sharing or similar means is NonCommercial provided there is no payment of monetary compensation in connection with the exchange.
20 | l. Share means to provide material to the public by any means or process that requires permission under the Licensed Rights, such as reproduction, public display, public performance, distribution, dissemination, communication, or importation, and to make material available to the public including in ways that members of the public may access the material from a place and at a time individually chosen by them.
21 | m. Sui Generis Database Rights means rights other than copyright resulting from Directive 96/9/EC of the European Parliament and of the Council of 11 March 1996 on the legal protection of databases, as amended and/or succeeded, as well as other essentially equivalent rights anywhere in the world.
22 | n. You means the individual or entity exercising the Licensed Rights under this Public License. Your has a corresponding meaning.
23 |
24 | Section 2 – Scope.
25 |
26 | a. License grant.
27 | 1. Subject to the terms and conditions of this Public License, the Licensor hereby grants You a worldwide, royalty-free, non-sublicensable, non-exclusive, irrevocable license to exercise the Licensed Rights in the Licensed Material to:
28 | A. reproduce and Share the Licensed Material, in whole or in part, for NonCommercial purposes only; and
29 | B. produce, reproduce, and Share Adapted Material for NonCommercial purposes only.
30 | 2. Exceptions and Limitations. For the avoidance of doubt, where Exceptions and Limitations apply to Your use, this Public License does not apply, and You do not need to comply with its terms and conditions.
31 | 3. Term. The term of this Public License is specified in Section 6(a).
32 | 4. Media and formats; technical modifications allowed. The Licensor authorizes You to exercise the Licensed Rights in all media and formats whether now known or hereafter created, and to make technical modifications necessary to do so. The Licensor waives and/or agrees not to assert any right or authority to forbid You from making technical modifications necessary to exercise the Licensed Rights, including technical modifications necessary to circumvent Effective Technological Measures. For purposes of this Public License, simply making modifications authorized by this Section 2(a)(4) never produces Adapted Material.
33 | 5. Downstream recipients.
34 | A. Offer from the Licensor – Licensed Material. Every recipient of the Licensed Material automatically receives an offer from the Licensor to exercise the Licensed Rights under the terms and conditions of this Public License.
35 | B. Additional offer from the Licensor – Adapted Material. Every recipient of Adapted Material from You automatically receives an offer from the Licensor to exercise the Licensed Rights in the Adapted Material under the conditions of the Adapter’s License You apply.
36 | C. No downstream restrictions. You may not offer or impose any additional or different terms or conditions on, or apply any Effective Technological Measures to, the Licensed Material if doing so restricts exercise of the Licensed Rights by any recipient of the Licensed Material.
37 | 6. No endorsement. Nothing in this Public License constitutes or may be construed as permission to assert or imply that You are, or that Your use of the Licensed Material is, connected with, or sponsored, endorsed, or granted official status by, the Licensor or others designated to receive attribution as provided in Section 3(a)(1)(A)(i).
38 | b. Other rights.
39 | 1. Moral rights, such as the right of integrity, are not licensed under this Public License, nor are publicity, privacy, and/or other similar personality rights; however, to the extent possible, the Licensor waives and/or agrees not to assert any such rights held by the Licensor to the limited extent necessary to allow You to exercise the Licensed Rights, but not otherwise.
40 | 2. Patent and trademark rights are not licensed under this Public License.
41 | 3. To the extent possible, the Licensor waives any right to collect royalties from You for the exercise of the Licensed Rights, whether directly or through a collecting society under any voluntary or waivable statutory or compulsory licensing scheme. In all other cases the Licensor expressly reserves any right to collect such royalties, including when the Licensed Material is used other than for NonCommercial purposes.
42 |
43 | Section 3 – License Conditions.
44 |
45 | Your exercise of the Licensed Rights is expressly made subject to the following conditions.
46 |
47 | a. Attribution.
48 | 1. If You Share the Licensed Material (including in modified form), You must:
49 | A. retain the following if it is supplied by the Licensor with the Licensed Material:
50 | i. identification of the creator(s) of the Licensed Material and any others designated to receive attribution, in any reasonable manner requested by the Licensor (including by pseudonym if designated);
51 | ii. a copyright notice;
52 | iii. a notice that refers to this Public License;
53 | iv. a notice that refers to the disclaimer of warranties;
54 | v. a URI or hyperlink to the Licensed Material to the extent reasonably practicable;
55 | B. indicate if You modified the Licensed Material and retain an indication of any previous modifications; and
56 | C. indicate the Licensed Material is licensed under this Public License, and include the text of, or the URI or hyperlink to, this Public License.
57 | 2. You may satisfy the conditions in Section 3(a)(1) in any reasonable manner based on the medium, means, and context in which You Share the Licensed Material. For example, it may be reasonable to satisfy the conditions by providing a URI or hyperlink to a resource that includes the required information.
58 | 3. If requested by the Licensor, You must remove any of the information required by Section 3(a)(1)(A) to the extent reasonably practicable.
59 | b. ShareAlike.
60 | In addition to the conditions in Section 3(a), if You Share Adapted Material You produce, the following conditions also apply.
61 | 1. The Adapter’s License You apply must be a Creative Commons license with the same License Elements, this version or later, or a BY-NC-SA Compatible License.
62 | 2. You must include the text of, or the URI or hyperlink to, the Adapter's License You apply. You may satisfy this condition in any reasonable manner based on the medium, means, and context in which You Share Adapted Material.
63 | 3. You may not offer or impose any additional or different terms or conditions on, or apply any Effective Technological Measures to, Adapted Material that restrict exercise of the rights granted under the Adapter's License You apply.
64 |
65 | Section 4 – Sui Generis Database Rights.
66 |
67 | Where the Licensed Rights include Sui Generis Database Rights that apply to Your use of the Licensed Material:
68 |
69 | a. for the avoidance of doubt, Section 2(a)(1) grants You the right to extract, reuse, reproduce, and Share all or a substantial portion of the contents of the database for NonCommercial purposes only;
70 | b. if You include all or a substantial portion of the database contents in a database in which You have Sui Generis Database Rights, then the database in which You have Sui Generis Database Rights (but not its individual contents) is Adapted Material, including for purposes of Section 3(b); and
71 | c. You must comply with the conditions in Section 3(a) if You Share all or a substantial portion of the contents of the database.
72 |
73 | For the avoidance of doubt, this Section 4 supplements and does not replace Your obligations under this Public License where the Licensed Rights include other Copyright and Similar Rights.
74 |
75 | Section 5 – Disclaimer of Warranties and Limitation of Liability.
76 |
77 | a. Unless otherwise separately undertaken by the Licensor, to the extent possible, the Licensor offers the Licensed Material as-is and as-available, and makes no representations or warranties of any kind concerning the Licensed Material, whether express, implied, statutory, or other. This includes, without limitation, warranties of title, merchantability, fitness for a particular purpose, non-infringement, absence of latent or other defects, accuracy, or the presence or absence of errors, whether or not known or discoverable. Where disclaimers of warranties are not allowed in full or in part, this disclaimer may not apply to You.
78 | b. To the extent possible, in no event will the Licensor be liable to You on any legal theory (including, without limitation, negligence) or otherwise for any direct, special, indirect, incidental, consequential, punitive, exemplary, or other losses, costs, expenses, or damages arising out of this Public License or use of the Licensed Material, even if the Licensor has been advised of the possibility of such losses, costs, expenses, or damages. Where a limitation of liability is not allowed in full or in part, this limitation may not apply to You.
79 | c. The disclaimer of warranties and limitation of liability provided above shall be interpreted in a manner that, to the extent possible, most closely approximates an absolute disclaimer and waiver of all liability.
80 |
81 | Section 6 – Term and Termination.
82 |
83 | a. This Public License applies for the term of the Copyright and Similar Rights licensed here. However, if You fail to comply with this Public License, then Your rights under this Public License terminate automatically.
84 | b. Where Your right to use the Licensed Material has terminated under Section 6(a), it reinstates:
85 | 1. automatically as of the date the violation is cured, provided it is cured within 30 days of Your discovery of the violation; or
86 | 2. upon express reinstatement by the Licensor.
87 | For the avoidance of doubt, this Section 6(b) does not affect any right the Licensor may have to seek remedies for Your violations of this Public License.
88 | c. For the avoidance of doubt, the Licensor may also offer the Licensed Material under separate terms or conditions or stop distributing the Licensed Material at any time; however, doing so will not terminate this Public License.
89 | d. Sections 1, 5, 6, 7, and 8 survive termination of this Public License.
90 |
91 | Section 7 – Other Terms and Conditions.
92 |
93 | a. The Licensor shall not be bound by any additional or different terms or conditions communicated by You unless expressly agreed.
94 | b. Any arrangements, understandings, or agreements regarding the Licensed Material not stated herein are separate from and independent of the terms and conditions of this Public License.
95 |
96 | Section 8 – Interpretation.
97 |
98 | a. For the avoidance of doubt, this Public License does not, and shall not be interpreted to, reduce, limit, restrict, or impose conditions on any use of the Licensed Material that could lawfully be made without permission under this Public License.
99 | b. To the extent possible, if any provision of this Public License is deemed unenforceable, it shall be automatically reformed to the minimum extent necessary to make it enforceable. If the provision cannot be reformed, it shall be severed from this Public License without affecting the enforceability of the remaining terms and conditions.
100 | c. No term or condition of this Public License will be waived and no failure to comply consented to unless expressly agreed to by the Licensor.
101 | d. Nothing in this Public License constitutes or may be interpreted as a limitation upon, or waiver of, any privileges and immunities that apply to the Licensor or You, including from the legal processes of any jurisdiction or authority.
--------------------------------------------------------------------------------
/NAV.md:
--------------------------------------------------------------------------------
1 | + 编程
2 | + [JavaTPoint 编程语言中文教程📚](https://apachecn.github.io/javatpoint-prog-zh)
3 | + [JavaTPoint .NET 中文教程📚](https://apachecn.github.io/javatpoint-dotnet-zh)
4 | + [JavaTPoint Java 中文教程📚](https://apachecn.github.io/javatpoint-java-zh)
5 | + [JavaTPoint Python 中文教程📚](https://apachecn.github.io/javatpoint-python-zh)
6 | + [GeeksForGeeks 编程语言杂项中文教程📚](https://apachecn.github.io/geeksforgeeks-lang-misc-zh)
7 | + [GeeksForGeeks C# 中文教程📚](https://apachecn.github.io/geeksforgeeks-csharp-zh)
8 | + [GeeksForGeeks Scala 中文教程📚](https://apachecn.github.io/geeksforgeeks-scala-zh)
9 | + [GeeksForGeeks Python 中文教程📚](https://apachecn.github.io/geeksforgeeks-python-zh)
10 | + [GeeksForGeeks C/C++ 中文教程📚](https://apachecn.github.io/geeksforgeeks-c-cpp-zh)
11 | + [GeeksForGeeks Java 中文教程📚](https://apachecn.github.io/geeksforgeeks-java-zh)
12 | + [GeeksForGeeks JavaScript 中文教程📚](https://apachecn.github.io/geeksforgeeks-js-zh)
13 | + [ApacheCN C# 译文集📚](https://apachecn.github.io/apachecn-csharp-zh)
14 | + [ApacheCN C# 译文集(二)📚](https://apachecn.github.io/apachecn-csharp-zh-pt2)
15 | + [ApacheCN C# 译文集(三)📚](https://apachecn.github.io/apachecn-csharp-zh-pt3)
16 | + [ApacheCN C# 译文集(四)📚](https://apachecn.github.io/apachecn-csharp-zh-pt4)
17 | + [ApacheCN Golang 译文集📚](https://apachecn.github.io/apachecn-golang-zh)
18 | + [ApacheCN Golang 译文集(二)📚](https://apachecn.github.io/apachecn-golang-zh-pt2)
19 | + [ApacheCN C/C++ 译文集📚](https://apachecn.github.io/apachecn-c-cpp-zh)
20 | + [ApacheCN C/C++ 译文集(二)📚](https://apachecn.github.io/apachecn-c-cpp-zh-pt2)
21 | + [ApacheCN C/C++ 译文集(三)📚](https://apachecn.github.io/apachecn-c-cpp-zh-pt3)
22 | + [ApacheCN Java 译文集📚](https://apachecn.github.io/apachecn-java-zh)
23 | + [ApacheCN Java 译文集(二)📚](https://apachecn.github.io/apachecn-java-zh-pt2)
24 | + [ApacheCN Java 译文集(三)📚](https://apachecn.github.io/apachecn-java-zh-pt3)
25 | + [ApacheCN JavaScript 译文集📚](https://apachecn.github.io/apachecn-js-zh)
26 | + [ApacheCN JavaScript 译文集(二)📚](https://apachecn.github.io/apachecn-js-zh-pt2)
27 | + [ApacheCN JavaScript 译文集(三)📚](https://apachecn.github.io/apachecn-js-zh-pt3)
28 | + [ApacheCN JavaScript 译文集(四)📚](https://apachecn.github.io/apachecn-js-zh-pt4)
29 | + [ApacheCN Python 译文集📚](https://apachecn.github.io/apachecn-python-zh)
30 | + [ApacheCN Python 译文集(二)📚](https://apachecn.github.io/apachecn-python-zh-pt2)
31 | + [ApacheCN Python 译文集(三)📚](https://apachecn.github.io/apachecn-python-zh-pt3)
32 | + [ApacheCN Python 译文集(四)📚](https://apachecn.github.io/apachecn-python-zh-pt4)
33 | + [ApacheCN Ruby 译文集📚](https://apachecn.github.io/apachecn-ruby-zh)
34 | + [BeginnersBook 中文系列教程📚](https://apachecn.github.io/beginnersbook-zh)
35 | + [JavaScript 编程精解 中文第三版](https://apachecn.github.io/eloquent-js-3e-zh)
36 | + [Guru99 中文系列教程📚🚧](https://apachecn.github.io/guru99-zh)
37 | + [HowToDoInJava 中文系列教程📚](https://apachecn.github.io/howtodoinjava-zh)
38 | + [OverIQ 中文系列教程📚](https://apachecn.github.io/overiq-zh)
39 | + [LearnETutroials 中文系列教程📚](https://apachecn.github.io/learnetutorials-zh)
40 | + [StudyTonight 中文系列教程📚](https://apachecn.github.io/studytonight-zh)
41 | + [TutorialGateway 中文系列教程📚](https://apachecn.github.io/tutorialgateway-zh)
42 | + [TutorialGateway BI 中文系列教程📚](https://apachecn.github.io/tutorialgateway-bi-zh)
43 | + [TutorialsTeacher 中文系列教程📚](https://apachecn.github.io/tutorialsteacher-zh)
44 | + [通过示例学 Golang 2020 中文版](https://apachecn.github.io/golang-by-example-2020-zh)
45 | + [写给不耐烦程序员的 JavaScript🚧](https://apachecn.github.io/impatient-js-zh)
46 | + [JavaBeginnersTutorial 中文系列教程📚](https://apachecn.github.io/jbt-zh)
47 | + [JavaTutorialNetwork 中文系列教程📚](https://apachecn.github.io/jtn-zh)
48 | + [笨办法学C 中文版](https://apachecn.github.io/lcthw-zh)
49 | + [笨办法学 Python · 续 中文版](https://apachecn.github.io/lmpythw-zh)
50 | + [Programiz 中文系列教程📚](https://apachecn.github.io/programiz-zh)
51 | + [PythonBasics 中文系列教程📚](https://apachecn.github.io/pythonbasics-zh)
52 | + [PythonGuru 中文系列教程📚](https://apachecn.github.io/pythonguru-zh)
53 | + [PythonSpot 中文系列教程📚](https://apachecn.github.io/pythonspot-zh)
54 | + [Think Python](https://apachecn.github.io/think-py-2e-zh)
55 | + [ZetCode 中文系列教程📚](https://apachecn.github.io/zetcode-zh)
56 | + 前端
57 | + [JavaTPoint 移动开发中文教程📚](https://apachecn.github.io/javatpoint-mobi-zh)
58 | + [GeeksForGeeks Web 杂项中文教程📚](https://apachecn.github.io/geeksforgeeks-web-misc-zh)
59 | + [GeeksForGeeks Angular/Vue/React 中文教程📚](https://apachecn.github.io/geeksforgeeks-ng-vue-react-zh)
60 | + [GeeksForGeeks jQuery 中文教程📚](https://apachecn.github.io/geeksforgeeks-jquery-zh)
61 | + [GeeksForGeeks CSS 中文教程📚](https://apachecn.github.io/geeksforgeeks-css-zh)
62 | + [GeeksForGeeks HTML 中文教程📚](https://apachecn.github.io/geeksforgeeks-html-zh)
63 | + [ApacheCN Vue 译文集📚](https://apachecn.github.io/apachecn-vue-zh)
64 | + [ApacheCN Angular 译文集📚](https://apachecn.github.io/apachecn-angular-zh)
65 | + [ApacheCN React 译文集📚](https://apachecn.github.io/apachecn-react-zh)
66 | + [ApacheCN jQuery 译文集📚](https://apachecn.github.io/apachecn-jquery-zh)
67 | + [ApacheCN jQuery 译文集(二)📚](https://apachecn.github.io/apachecn-jquery-zh-pt2)
68 | + 后端/大数据
+ [JavaTPoint 大数据中文教程📚](https://apachecn.github.io/javatpoint-bigdata-zh)
+ [JavaTPoint Web 开发中文教程📚](https://apachecn.github.io/javatpoint-web-zh)
+ [JavaTPoint 数据库中文教程📚](https://apachecn.github.io/javatpoint-db-zh)
+ [JavaTPoint PHP 中文教程📚](https://apachecn.github.io/javatpoint-php-zh)
+ [GeeksForGeeks ASP 中文教程📚](https://apachecn.github.io/geeksforgeeks-asp-zh)
+ [GeeksForGeeks SQL 中文教程📚](https://apachecn.github.io/geeksforgeeks-sql-zh)
+ [GeeksForGeeks NodeJS 中文教程📚](https://apachecn.github.io/geeksforgeeks-nodejs-zh)
+ [GeeksForGeeks PHP 中文教程📚](https://apachecn.github.io/geeksforgeeks-php-zh)
+ [ApacheCN 数据库译文集📚](https://apachecn.github.io/apachecn-db-zh)
+ [ApacheCN 数据库译文集(二)📚](https://apachecn.github.io/apachecn-db-zh-pt2)
+ [ApacheCN Python Web 译文集📚](https://apachecn.github.io/apachecn-pythonweb-zh)
+ [ApacheCN Python Web 译文集(二)📚](https://apachecn.github.io/apachecn-pythonweb-zh-pt2)
+ [ApacheCN Asp.NET 译文集📚](https://apachecn.github.io/apachecn-asp-dotnet-zh)
+ [ApacheCN Asp.NET 译文集(二)📚](https://apachecn.github.io/apachecn-asp-dotnet-zh-pt2)
+ [ApacheCN Asp.NET 译文集(三)📚](https://apachecn.github.io/apachecn-asp-dotnet-zh-pt3)
+ [ApacheCN Asp.NET 译文集(四)📚](https://apachecn.github.io/apachecn-asp-dotnet-zh-pt4)
+ [ApacheCN NodeJS 译文集📚](https://apachecn.github.io/apachecn-node-zh)
+ [ApacheCN NodeJS 译文集(二)📚](https://apachecn.github.io/apachecn-node-zh-pt2)
+ [ApacheCN PHP 译文集📚](https://apachecn.github.io/apachecn-php-zh)
+ [ApacheCN PHP 译文集(二)📚](https://apachecn.github.io/apachecn-php-zh-pt2)
+ [ApacheCN 大数据译文集📚](https://apachecn.github.io/apachecn-bigdata-zh)
+ [ApacheCN 大数据译文集(二)📚](https://apachecn.github.io/apachecn-bigdata-zh-pt2)
+ [ApacheCN 大数据译文集(三)📚](https://apachecn.github.io/apachecn-bigdata-zh-pt3)
+ [ApacheCN Java Web 译文集📚](https://apachecn.github.io/apachecn-javaweb-zh)
+ [ApacheCN Java Web 译文集(二)📚](https://apachecn.github.io/apachecn-javaweb-zh-pt2)
+ [Airflow 中文文档](https://apachecn.github.io/airflow-doc-zh)
+ [Elasticsearch 5.4 中文文档](https://apachecn.github.io/elasticsearch-doc-zh)
+ [Flink 中文文档](https://apachecn.github.io/flink-doc-zh)
+ [HBase™ 中文参考指南 3.0🚧](https://apachecn.github.io/hbase-doc-zh)
+ [HighScalability 中文示例📚🚧](https://apachecn.github.io/highscalability-examples-zh)
+ [Kibana 5.2 中文文档](https://apachecn.github.io/kibana-doc-zh)
+ [Kudu 1.4.0 中文文档](https://apachecn.github.io/kudu-doc-zh)
+ [Apache Spark 官方文档中文版](https://apachecn.github.io/spark-doc-zh)
+ [Apache Kafka 官方文档中文版](https://apachecn.github.io/kafka-site-zh)
+ [Spring Boot 1.5.2 中文文档](https://apachecn.github.io/spring-boot-doc-zh)
+ [Storm 1.1.0 中文文档](https://apachecn.github.io/storm-doc-zh)
+ [Zeppelin 0.7.2 中文文档](https://apachecn.github.io/zeppelin-doc-zh)
+ Tools
+ [JavaTPoint 实用工具中文教程📚](https://apachecn.github.io/javatpoint-util-zh)
+ [ApacheCN DevOps 译文集📚](https://apachecn.github.io/apachecn-devops-zh)
+ [ApacheCN DevOps 译文集(二)📚](https://apachecn.github.io/apachecn-devops-zh-pt2)
+ [ApacheCN DevOps 译文集(三)📚](https://apachecn.github.io/apachecn-devops-zh-pt3)
+ [ApacheCN DevOps 译文集(四)📚](https://apachecn.github.io/apachecn-devops-zh-pt4)
+ [ApacheCN DevOps 译文集(五)📚](https://apachecn.github.io/apachecn-devops-zh-pt5)
+ [ApacheCN Linux 译文集📚](https://apachecn.github.io/apachecn-linux-zh)
+ [ApacheCN Linux 译文集(二)📚](https://apachecn.github.io/apachecn-linux-zh-pt2)
+ [ApacheCN Linux 译文集(三)📚](https://apachecn.github.io/apachecn-linux-zh-pt3)
+ [Cython 3.0 中文文档🚧](https://apachecn.github.io/cython-doc-zh)
+ [Git 中文参考🚧](https://apachecn.github.io/git-doc-zh)
+ [Gitlab 中文文档🚧](https://apachecn.github.io/gitlab-doc-zh)
+ [笨办法学 Linux 中文版](https://apachecn.github.io/llthw-zh)
+ [Numba 0.44 中文文档🚧](https://apachecn.github.io/numba-doc-zh)
+ [PyQt4 中文文档🚧](https://apachecn.github.io/pyqt4-doc-zh)
+ [Scrapy 1.6 中文文档](https://apachecn.github.io/scrapy-doc-zh)
+ Data Science
+ [ApacheCN 数据科学译文集📚](https://apachecn.github.io/apachecn-ds-zh)
+ [ApacheCN 数据科学译文集(二)📚](https://apachecn.github.io/apachecn-ds-zh-pt2)
+ [ApacheCN 数据科学译文集(三)📚](https://apachecn.github.io/apachecn-ds-zh-pt3)
+ [MIT 18.03 面向初学者的微积分🚧](https://apachecn.github.io/calc4b-zh)
+ [UCB Data8 计算与推断思维](https://apachecn.github.io/data8-textbook-zh)
+ [数据可视化的基础知识](https://apachecn.github.io/dataviz-zh)
+ [数据科学和人工智能技术笔记](https://apachecn.github.io/ds-ai-tech-notes)
+ [数据科学 IPython 笔记本📚](https://apachecn.github.io/ds-ipynb-zh)
+ [UCB DS100 数据科学的原理与技巧🚧](https://apachecn.github.io/ds100-textbook-zh)
+ [ApacheCN 数据科学和人工智能知识库](https://apachecn.github.io/dsai-wiki)
+ [Matplotlib 用户指南](https://apachecn.github.io/matplotlib-doc-zh)
+ [MIT 18.06 线性代数笔记](https://apachecn.github.io/mit-18.06-linalg-notes)
+ [利用 Python 进行数据分析 · 第 2 版](https://apachecn.github.io/pyda-2e-zh)
+ [QuantLearning](https://apachecn.github.io/quant-learning)
+ [seaborn 0.9 中文文档](https://apachecn.github.io/seaborn-doc-zh)
+ [社交媒体挖掘 - 翻译版](https://apachecn.github.io/socialmediamining-zh)
+ [斯坦福 Stats60 21 世纪的统计思维🚧](https://apachecn.github.io/stats-thinking-21-zh)
+ [复杂性思维 中文第二版](https://apachecn.github.io/think-comp-2e-zh)
+ [PyMiner 开发者指南](https://apachecn.github.io/pyminer-dev-guide)
+ Artificial Intelligence
+ [JavaTPoint 数据科学与人工智能中文教程📚](https://apachecn.github.io/javatpoint-dsai-zh)
+ [GeeksForGeeks 人工智能中文教程📚](https://apachecn.github.io/geeksforgeeks-ai-zh)
+ [AILearning📚](https://apachecn.github.io/ailearning)
+ [ApacheCN 计算机视觉译文集📚](https://apachecn.github.io/apachecn-cv-zh)
+ [ApacheCN 计算机视觉译文集(二)📚](https://apachecn.github.io/apachecn-cv-zh-pt2)
+ [ApacheCN 深度学习译文集📚](https://apachecn.github.io/apachecn-dl-zh)
+ [ApacheCN 深度学习译文集(二)📚](https://apachecn.github.io/apachecn-dl-zh-pt2)
+ [ApacheCN 深度学习译文集(三)📚](https://apachecn.github.io/apachecn-dl-zh-pt3)
+ [ApacheCN 机器学习译文集📚](https://apachecn.github.io/apachecn-ml-zh)
+ [ApacheCN 机器学习译文集(二)📚](https://apachecn.github.io/apachecn-ml-zh-pt2)
+ [ApacheCN 机器学习译文集(三)📚](https://apachecn.github.io/apachecn-ml-zh-pt3)
+ [FastText 中文文档](https://apachecn.github.io/fasttext-doc-zh)
+ [面向机器学习的特征工程](https://apachecn.github.io/fe4ml-zh)
+ [Gensim 中文文档](https://apachecn.github.io/gensim-doc-zh)
+ [Sklearn 与 TensorFlow 机器学习实用指南第二版](https://apachecn.github.io/hands-on-ml-2e-zh)
+ [LightGBM 中文文档](https://apachecn.github.io/lightgbm-doc-zh)
+ [Machine Learning Mastery 博客文章翻译📚🚧](https://apachecn.github.io/ml-mastery-zh)
+ [Machine Learning Mastery 博客文章翻译(二)📚🚧](https://apachecn.github.io/ml-mastery-zh-pt2)
+ [Python 自然语言处理 第二版](https://apachecn.github.io/nlp-py-2e-zh)
+ [PyTorch 自然语言处理](https://apachecn.github.io/nlp-pytorch-zh)
+ [台湾大学林轩田机器学习笔记](https://apachecn.github.io/ntu-hsuantienlin-ml)
+ [OpenCV 中文文档 4.0.0](https://apachecn.github.io/opencv-doc-zh)
+ [PythonProgramming.net 系列教程📚](https://apachecn.github.io/python-programming-net-zh)
+ [PyTorch 中文教程](https://apachecn.github.io/pytorch-doc-zh)
+ [scikit-learn (sklearn) 官方文档中文版](https://apachecn.github.io/sklearn-doc-zh)
+ [XGBoost 中文文档](https://apachecn.github.io/xgboost-doc-zh)
+ Computer Science
+ [JavaTPoint 计算机科学中文教程📚](https://apachecn.github.io/javatpoint-cs-zh)
+ [ApacheCN 数据结构与算法译文集📚](https://apachecn.github.io/apachecn-algo-zh)
+ [ApacheCN 计算机系统译文集📚](https://apachecn.github.io/apachecn-sys-zh)
+ [NUS CS1101s SICP JavaScript 描述🚧](https://apachecn.github.io/sicp-js-zh)
+ [UCB CS61a SICP Python 描述](https://apachecn.github.io/sicp-py-zh)
+ [数据结构思维中文版](https://apachecn.github.io/think-dast-zh)
+ [UIUC CS241 系统编程中文讲义🚧](https://apachecn.github.io/uiuc-cs241-notes-zh)
+ Security
+ [ApacheCN Kali Linux 译文集📚](https://apachecn.github.io/apachecn-kali-zh)
+ [ApacheCN 网络安全译文集📚](https://apachecn.github.io/apachecn-sec-zh)
+ [ApacheCN 网络安全译文集(二)📚](https://apachecn.github.io/apachecn-sec-zh-pt2)
+ [SecLearning——零组文库备份📚](https://apachecn.github.io/sec-learning)
+ [ApacheCN 安全知识库📚](https://apachecn.github.io/sec-wiki)
+ [Web Hacking 101 中文版](https://apachecn.github.io/web-hacking-101-zh)
+ Miscellaneous
+ [生化环材劝退文集](https://apachecn.github.io/bio-chem-env-mat-discourage)
+ [5 分钟商学院精细笔记](https://apachecn.github.io/business-5min-notes)
+ [iBooker 布客](https://apachecn.github.io/home)
+ [iBooker 布客老实人报](https://apachecn.github.io/ibooker-plain-dealer)
+ [使用 Qiskit 学习量子计算 - 翻译版](https://apachecn.github.io/lqcuq-zh)
+ [原则 · 中文版](https://apachecn.github.io/principles-zh)
+ [斯坦福 CS183 & YC 创业课系列中文笔记📚](https://apachecn.github.io/stanford-cs183-notes)
+ [iBooker 团队知识库📚](https://apachecn.github.io/team-wiki)
+ [ApacheCN 技术评论](https://apachecn.github.io/tech-review)
+ [通往财富自由之路精细笔记](https://apachecn.github.io/the-way-to-wealth-freedom-notes)