├── .idea
│   ├── kmeans.iml
│   ├── misc.xml
│   ├── modules.xml
│   └── vcs.xml
├── data
│   ├── Folds5x2_pp.xlsx
│   ├── all_musicers.xlsx
│   ├── drugName_dic.txt
│   ├── full_user.avsc
│   ├── kv1.txt
│   ├── ndy.txt
│   ├── outputfile.xlsx
│   ├── people.json
│   ├── people.txt
│   ├── sample_kmeans_data.txt
│   ├── sample_lda_data.txt
│   ├── sample_lda_libsvm_data.txt
│   ├── sampling.csv
│   ├── sampling.txt
│   ├── sampling.xlsx
│   ├── test.xlsx
│   ├── text.xlsx
│   ├── user.avsc
│   ├── users.avro
│   └── users.parquet
├── pythonadvance
│   ├── Gensimtest.py
│   ├── Regression_analyse.py
│   ├── StatsModelstest.py
│   ├── __init__.py
│   ├── cipin.py
│   ├── clustermeric.py
│   ├── dxckeras.py
│   ├── dxcprec.py
│   ├── dxcsvm.py
│   ├── full_linearRession.py
│   ├── nltktest.py
│   ├── numpytest.py
│   ├── pandatest.py
│   ├── pymysqltest.py
│   ├── regular.py
│   ├── scipytest.py
│   ├── sklearnkmeans.py
│   ├── yaofang_fenxi_text.py
│   └── yichangtest.py
├── pythonbasic
│   ├── classtest.py
│   ├── decoratortest.py
│   ├── dictionary.py
│   ├── huatu.py
│   ├── ossys.py
│   ├── pandasql.py
│   ├── pythonbasic.py
│   └── randomtest.py
├── pythondata
│   ├── __init__.py
│   └── datasets.py
├── sparkml
│   ├── __init__.py
│   ├── mlkmeans.py
│   ├── sparkSession.py
│   ├── sparkio.py
│   ├── sparklda.py
│   ├── sparklr.py
│   ├── sparkpipline.py
│   ├── sparkpipline2.py
│   ├── sparktf-itf.py
│   └── tokenizer.py
└── text_analyse2
    ├── __init__.py
    ├── extract.txt
    ├── jiebatest.py
    └── userdict.txt
/.idea/kmeans.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/data/Folds5x2_pp.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hailiang-wang/SmartQA-System/db659597eea7ad5759e183d8affa9053e4076a34/data/Folds5x2_pp.xlsx
--------------------------------------------------------------------------------
/data/all_musicers.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hailiang-wang/SmartQA-System/db659597eea7ad5759e183d8affa9053e4076a34/data/all_musicers.xlsx
--------------------------------------------------------------------------------
/data/full_user.avsc:
--------------------------------------------------------------------------------
1 | {"type": "record", "namespace": "example.avro", "name": "User", "fields": [{"type": "string", "name": "name"}, {"type": ["string", "null"], "name": "favorite_color"}, {"type": {"items": "int", "type": "array"}, "name": "favorite_numbers"}]}
--------------------------------------------------------------------------------
/data/kv1.txt:
--------------------------------------------------------------------------------
1 | 238val_238
2 | 86val_86
3 | 311val_311
4 | 27val_27
5 | 165val_165
6 | 409val_409
7 | 255val_255
8 | 278val_278
9 | 98val_98
10 | 484val_484
11 | 265val_265
12 | 193val_193
13 | 401val_401
14 | 150val_150
15 | 273val_273
16 | 224val_224
17 | 369val_369
18 | 66val_66
19 | 128val_128
20 | 213val_213
21 | 146val_146
22 | 406val_406
23 | 429val_429
24 | 374val_374
25 | 152val_152
26 | 469val_469
27 | 145val_145
28 | 495val_495
29 | 37val_37
30 | 327val_327
31 | 281val_281
32 | 277val_277
33 | 209val_209
34 | 15val_15
35 | 82val_82
36 | 403val_403
37 | 166val_166
38 | 417val_417
39 | 430val_430
40 | 252val_252
41 | 292val_292
42 | 219val_219
43 | 287val_287
44 | 153val_153
45 | 193val_193
46 | 338val_338
47 | 446val_446
48 | 459val_459
49 | 394val_394
50 | 237val_237
51 | 482val_482
52 | 174val_174
53 | 413val_413
54 | 494val_494
55 | 207val_207
56 | 199val_199
57 | 466val_466
58 | 208val_208
59 | 174val_174
60 | 399val_399
61 | 396val_396
62 | 247val_247
63 | 417val_417
64 | 489val_489
65 | 162val_162
66 | 377val_377
67 | 397val_397
68 | 309val_309
69 | 365val_365
70 | 266val_266
71 | 439val_439
72 | 342val_342
73 | 367val_367
74 | 325val_325
75 | 167val_167
76 | 195val_195
77 | 475val_475
78 | 17val_17
79 | 113val_113
80 | 155val_155
81 | 203val_203
82 | 339val_339
83 | 0val_0
84 | 455val_455
85 | 128val_128
86 | 311val_311
87 | 316val_316
88 | 57val_57
89 | 302val_302
90 | 205val_205
91 | 149val_149
92 | 438val_438
93 | 345val_345
94 | 129val_129
95 | 170val_170
96 | 20val_20
97 | 489val_489
98 | 157val_157
99 | 378val_378
100 | 221val_221
101 | 92val_92
102 | 111val_111
103 | 47val_47
104 | 72val_72
105 | 4val_4
106 | 280val_280
107 | 35val_35
108 | 427val_427
109 | 277val_277
110 | 208val_208
111 | 356val_356
112 | 399val_399
113 | 169val_169
114 | 382val_382
115 | 498val_498
116 | 125val_125
117 | 386val_386
118 | 437val_437
119 | 469val_469
120 | 192val_192
121 | 286val_286
122 | 187val_187
123 | 176val_176
124 | 54val_54
125 | 459val_459
126 | 51val_51
127 | 138val_138
128 | 103val_103
129 | 239val_239
130 | 213val_213
131 | 216val_216
132 | 430val_430
133 | 278val_278
134 | 176val_176
135 | 289val_289
136 | 221val_221
137 | 65val_65
138 | 318val_318
139 | 332val_332
140 | 311val_311
141 | 275val_275
142 | 137val_137
143 | 241val_241
144 | 83val_83
145 | 333val_333
146 | 180val_180
147 | 284val_284
148 | 12val_12
149 | 230val_230
150 | 181val_181
151 | 67val_67
152 | 260val_260
153 | 404val_404
154 | 384val_384
155 | 489val_489
156 | 353val_353
157 | 373val_373
158 | 272val_272
159 | 138val_138
160 | 217val_217
161 | 84val_84
162 | 348val_348
163 | 466val_466
164 | 58val_58
165 | 8val_8
166 | 411val_411
167 | 230val_230
168 | 208val_208
169 | 348val_348
170 | 24val_24
171 | 463val_463
172 | 431val_431
173 | 179val_179
174 | 172val_172
175 | 42val_42
176 | 129val_129
177 | 158val_158
178 | 119val_119
179 | 496val_496
180 | 0val_0
181 | 322val_322
182 | 197val_197
183 | 468val_468
184 | 393val_393
185 | 454val_454
186 | 100val_100
187 | 298val_298
188 | 199val_199
189 | 191val_191
190 | 418val_418
191 | 96val_96
192 | 26val_26
193 | 165val_165
194 | 327val_327
195 | 230val_230
196 | 205val_205
197 | 120val_120
198 | 131val_131
199 | 51val_51
200 | 404val_404
201 | 43val_43
202 | 436val_436
203 | 156val_156
204 | 469val_469
205 | 468val_468
206 | 308val_308
207 | 95val_95
208 | 196val_196
209 | 288val_288
210 | 481val_481
211 | 457val_457
212 | 98val_98
213 | 282val_282
214 | 197val_197
215 | 187val_187
216 | 318val_318
217 | 318val_318
218 | 409val_409
219 | 470val_470
220 | 137val_137
221 | 369val_369
222 | 316val_316
223 | 169val_169
224 | 413val_413
225 | 85val_85
226 | 77val_77
227 | 0val_0
228 | 490val_490
229 | 87val_87
230 | 364val_364
231 | 179val_179
232 | 118val_118
233 | 134val_134
234 | 395val_395
235 | 282val_282
236 | 138val_138
237 | 238val_238
238 | 419val_419
239 | 15val_15
240 | 118val_118
241 | 72val_72
242 | 90val_90
243 | 307val_307
244 | 19val_19
245 | 435val_435
246 | 10val_10
247 | 277val_277
248 | 273val_273
249 | 306val_306
250 | 224val_224
251 | 309val_309
252 | 389val_389
253 | 327val_327
254 | 242val_242
255 | 369val_369
256 | 392val_392
257 | 272val_272
258 | 331val_331
259 | 401val_401
260 | 242val_242
261 | 452val_452
262 | 177val_177
263 | 226val_226
264 | 5val_5
265 | 497val_497
266 | 402val_402
267 | 396val_396
268 | 317val_317
269 | 395val_395
270 | 58val_58
271 | 35val_35
272 | 336val_336
273 | 95val_95
274 | 11val_11
275 | 168val_168
276 | 34val_34
277 | 229val_229
278 | 233val_233
279 | 143val_143
280 | 472val_472
281 | 322val_322
282 | 498val_498
283 | 160val_160
284 | 195val_195
285 | 42val_42
286 | 321val_321
287 | 430val_430
288 | 119val_119
289 | 489val_489
290 | 458val_458
291 | 78val_78
292 | 76val_76
293 | 41val_41
294 | 223val_223
295 | 492val_492
296 | 149val_149
297 | 449val_449
298 | 218val_218
299 | 228val_228
300 | 138val_138
301 | 453val_453
302 | 30val_30
303 | 209val_209
304 | 64val_64
305 | 468val_468
306 | 76val_76
307 | 74val_74
308 | 342val_342
309 | 69val_69
310 | 230val_230
311 | 33val_33
312 | 368val_368
313 | 103val_103
314 | 296val_296
315 | 113val_113
316 | 216val_216
317 | 367val_367
318 | 344val_344
319 | 167val_167
320 | 274val_274
321 | 219val_219
322 | 239val_239
323 | 485val_485
324 | 116val_116
325 | 223val_223
326 | 256val_256
327 | 263val_263
328 | 70val_70
329 | 487val_487
330 | 480val_480
331 | 401val_401
332 | 288val_288
333 | 191val_191
334 | 5val_5
335 | 244val_244
336 | 438val_438
337 | 128val_128
338 | 467val_467
339 | 432val_432
340 | 202val_202
341 | 316val_316
342 | 229val_229
343 | 469val_469
344 | 463val_463
345 | 280val_280
346 | 2val_2
347 | 35val_35
348 | 283val_283
349 | 331val_331
350 | 235val_235
351 | 80val_80
352 | 44val_44
353 | 193val_193
354 | 321val_321
355 | 335val_335
356 | 104val_104
357 | 466val_466
358 | 366val_366
359 | 175val_175
360 | 403val_403
361 | 483val_483
362 | 53val_53
363 | 105val_105
364 | 257val_257
365 | 406val_406
366 | 409val_409
367 | 190val_190
368 | 406val_406
369 | 401val_401
370 | 114val_114
371 | 258val_258
372 | 90val_90
373 | 203val_203
374 | 262val_262
375 | 348val_348
376 | 424val_424
377 | 12val_12
378 | 396val_396
379 | 201val_201
380 | 217val_217
381 | 164val_164
382 | 431val_431
383 | 454val_454
384 | 478val_478
385 | 298val_298
386 | 125val_125
387 | 431val_431
388 | 164val_164
389 | 424val_424
390 | 187val_187
391 | 382val_382
392 | 5val_5
393 | 70val_70
394 | 397val_397
395 | 480val_480
396 | 291val_291
397 | 24val_24
398 | 351val_351
399 | 255val_255
400 | 104val_104
401 | 70val_70
402 | 163val_163
403 | 438val_438
404 | 119val_119
405 | 414val_414
406 | 200val_200
407 | 491val_491
408 | 237val_237
409 | 439val_439
410 | 360val_360
411 | 248val_248
412 | 479val_479
413 | 305val_305
414 | 417val_417
415 | 199val_199
416 | 444val_444
417 | 120val_120
418 | 429val_429
419 | 169val_169
420 | 443val_443
421 | 323val_323
422 | 325val_325
423 | 277val_277
424 | 230val_230
425 | 478val_478
426 | 178val_178
427 | 468val_468
428 | 310val_310
429 | 317val_317
430 | 333val_333
431 | 493val_493
432 | 460val_460
433 | 207val_207
434 | 249val_249
435 | 265val_265
436 | 480val_480
437 | 83val_83
438 | 136val_136
439 | 353val_353
440 | 172val_172
441 | 214val_214
442 | 462val_462
443 | 233val_233
444 | 406val_406
445 | 133val_133
446 | 175val_175
447 | 189val_189
448 | 454val_454
449 | 375val_375
450 | 401val_401
451 | 421val_421
452 | 407val_407
453 | 384val_384
454 | 256val_256
455 | 26val_26
456 | 134val_134
457 | 67val_67
458 | 384val_384
459 | 379val_379
460 | 18val_18
461 | 462val_462
462 | 492val_492
463 | 100val_100
464 | 298val_298
465 | 9val_9
466 | 341val_341
467 | 498val_498
468 | 146val_146
469 | 458val_458
470 | 362val_362
471 | 186val_186
472 | 285val_285
473 | 348val_348
474 | 167val_167
475 | 18val_18
476 | 273val_273
477 | 183val_183
478 | 281val_281
479 | 344val_344
480 | 97val_97
481 | 469val_469
482 | 315val_315
483 | 84val_84
484 | 28val_28
485 | 37val_37
486 | 448val_448
487 | 152val_152
488 | 348val_348
489 | 307val_307
490 | 194val_194
491 | 414val_414
492 | 477val_477
493 | 222val_222
494 | 126val_126
495 | 90val_90
496 | 169val_169
497 | 403val_403
498 | 400val_400
499 | 200val_200
500 | 97val_97
501 |
--------------------------------------------------------------------------------
/data/ndy.txt:
--------------------------------------------------------------------------------
1 | 尿道炎 罗红霉素150MG+呋喃妥因100MG+三金片,很管用的全科医疗 尿道炎的治疗:柴胡2ml+丁胺卡那0.2,im;呋兰妥因2片,一日四次,效果蛮好,两三天即愈。 痛经的治疗:炎痛喜康一次两片,每次在月经快来的前两天服用,一天一次服用四天,即可不痛。注意有胃病的忌服
--------------------------------------------------------------------------------
/data/outputfile.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hailiang-wang/SmartQA-System/db659597eea7ad5759e183d8affa9053e4076a34/data/outputfile.xlsx
--------------------------------------------------------------------------------
/data/people.json:
--------------------------------------------------------------------------------
1 | {"name":"Michael"}
2 | {"name":"Andy", "age":30}
3 | {"name":"Justin", "age":19}
4 |
--------------------------------------------------------------------------------
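
people.json is the stock Spark SQL sample input, one JSON record per line. A minimal sketch of reading it into a DataFrame, assuming a local SparkSession; the repo's sparkml/sparkio.py presumably covers this in more detail:

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.appName("sparkio").getOrCreate()
    df = spark.read.json("data/people.json")   # schema (age, name) is inferred from the lines
    df.printSchema()
    df.show()
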
/data/people.txt:
--------------------------------------------------------------------------------
1 | Michael, 29
2 | Andy, 30
3 | Justin, 19
4 |
--------------------------------------------------------------------------------
/data/sample_kmeans_data.txt:
--------------------------------------------------------------------------------
1 | 0 1:0.0 2:0.0 3:0.0
2 | 1 1:0.1 2:0.1 3:0.1
3 | 2 1:0.2 2:0.2 3:0.2
4 | 3 1:9.0 2:9.0 3:9.0
5 | 4 1:9.1 2:9.1 3:9.1
6 | 5 1:9.2 2:9.2 3:9.2
7 |
--------------------------------------------------------------------------------
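
sample_kmeans_data.txt is in LIBSVM format (a label followed by index:value pairs). A minimal sketch of loading it and fitting a k-means model with Spark ML, along the lines of the commented-out block in pythonadvance/cipin.py; the path and k=2 are assumptions:

    from pyspark.sql import SparkSession
    from pyspark.ml.clustering import KMeans

    spark = SparkSession.builder.appName("mlkmeans").getOrCreate()
    dataset = spark.read.format("libsvm").load("data/sample_kmeans_data.txt")
    model = KMeans().setK(2).setSeed(1).fit(dataset)   # two well-separated clusters in this file
    for center in model.clusterCenters():
        print(center)
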
/data/sample_lda_data.txt:
--------------------------------------------------------------------------------
1 | 1 2 6 0 2 3 1 1 0 0 3
2 | 1 3 0 1 3 0 0 2 0 0 1
3 | 1 4 1 0 0 4 9 0 1 2 0
4 | 2 1 0 3 0 0 5 0 2 3 9
5 | 3 1 1 9 3 0 2 0 0 1 3
6 | 4 2 0 3 4 5 1 1 1 4 0
7 | 2 1 0 3 0 0 5 0 2 2 9
8 | 1 1 1 9 2 1 2 0 0 1 3
9 | 4 4 0 3 4 2 1 3 0 0 0
10 | 2 8 2 0 3 0 2 0 2 7 2
11 | 1 1 1 9 0 2 2 0 0 3 3
12 | 4 1 0 0 4 5 1 3 0 1 0
13 |
--------------------------------------------------------------------------------
/data/sample_lda_libsvm_data.txt:
--------------------------------------------------------------------------------
1 | 0 1:1 2:2 3:6 4:0 5:2 6:3 7:1 8:1 9:0 10:0 11:3
2 | 1 1:1 2:3 3:0 4:1 5:3 6:0 7:0 8:2 9:0 10:0 11:1
3 | 2 1:1 2:4 3:1 4:0 5:0 6:4 7:9 8:0 9:1 10:2 11:0
4 | 3 1:2 2:1 3:0 4:3 5:0 6:0 7:5 8:0 9:2 10:3 11:9
5 | 4 1:3 2:1 3:1 4:9 5:3 6:0 7:2 8:0 9:0 10:1 11:3
6 | 5 1:4 2:2 3:0 4:3 5:4 6:5 7:1 8:1 9:1 10:4 11:0
7 | 6 1:2 2:1 3:0 4:3 5:0 6:0 7:5 8:0 9:2 10:2 11:9
8 | 7 1:1 2:1 3:1 4:9 5:2 6:1 7:2 8:0 9:0 10:1 11:3
9 | 8 1:4 2:4 3:0 4:3 5:4 6:2 7:1 8:3 9:0 10:0 11:0
10 | 9 1:2 2:8 3:2 4:0 5:3 6:0 7:2 8:0 9:2 10:7 11:2
11 | 10 1:1 2:1 3:1 4:9 5:0 6:2 7:2 8:0 9:0 10:3 11:3
12 | 11 1:4 2:1 3:0 4:0 5:4 6:5 7:1 8:3 9:0 10:1 11:0
13 |
--------------------------------------------------------------------------------
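
This file holds the same term-count matrix as sample_lda_data.txt, rewritten in LIBSVM form for Spark ML's DataFrame-based LDA. A minimal sketch (k and maxIter are arbitrary choices, not taken from the repo):

    from pyspark.sql import SparkSession
    from pyspark.ml.clustering import LDA

    spark = SparkSession.builder.appName("sparklda").getOrCreate()
    dataset = spark.read.format("libsvm").load("data/sample_lda_libsvm_data.txt")
    model = LDA(k=10, maxIter=10).fit(dataset)
    print("log likelihood:", model.logLikelihood(dataset))
    model.describeTopics(3).show(truncate=False)
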
/data/sampling.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hailiang-wang/SmartQA-System/db659597eea7ad5759e183d8affa9053e4076a34/data/sampling.xlsx
--------------------------------------------------------------------------------
/data/test.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hailiang-wang/SmartQA-System/db659597eea7ad5759e183d8affa9053e4076a34/data/test.xlsx
--------------------------------------------------------------------------------
/data/text.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hailiang-wang/SmartQA-System/db659597eea7ad5759e183d8affa9053e4076a34/data/text.xlsx
--------------------------------------------------------------------------------
/data/user.avsc:
--------------------------------------------------------------------------------
1 | {"namespace": "example.avro",
2 | "type": "record",
3 | "name": "User",
4 | "fields": [
5 | {"name": "name", "type": "string"},
6 | {"name": "favorite_color", "type": ["string", "null"]}
7 | ]
8 | }
9 |
--------------------------------------------------------------------------------
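
user.avsc and full_user.avsc are Avro record schemas, and users.avro holds records serialized against them. A minimal sketch of writing and reading such records with the avro package under Python 2 (the file names follow the repo, the sample records are made up):

    import avro.schema
    from avro.datafile import DataFileReader, DataFileWriter
    from avro.io import DatumReader, DatumWriter

    schema = avro.schema.parse(open("data/user.avsc", "rb").read())

    writer = DataFileWriter(open("data/users.avro", "wb"), DatumWriter(), schema)
    writer.append({"name": "Alyssa", "favorite_color": None})
    writer.append({"name": "Ben", "favorite_color": "red"})
    writer.close()

    reader = DataFileReader(open("data/users.avro", "rb"), DatumReader())
    for user in reader:
        print user
    reader.close()
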
/data/users.avro:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hailiang-wang/SmartQA-System/db659597eea7ad5759e183d8affa9053e4076a34/data/users.avro
--------------------------------------------------------------------------------
/data/users.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hailiang-wang/SmartQA-System/db659597eea7ad5759e183d8affa9053e4076a34/data/users.parquet
--------------------------------------------------------------------------------
/pythonadvance/Gensimtest.py:
--------------------------------------------------------------------------------
1 | #-*- coding:utf-8 -*-
2 |
3 | import gensim,logging
4 | import numpy as np
5 | import os
6 |
7 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',level=logging.INFO)
8 |
9 | sentences=[['first','sentence'],['second','sentence']] #shows the expected input format: a list of tokenized sentences, i.e. a list of lists of words
10 |
11 | model=gensim.models.Word2Vec(sentences,min_count=1,size=100) #min_count drops words that occur too few times, so min_count=2 would raise an error here because 'first' gets dropped; size is the dimensionality of the word vectors.
12 |
13 | #print model['three']
14 |
15 |
16 | print(model['first'])
17 | print model.similarity('first','second')
18 | print model.most_similar(positive=['first'], negative=['sentence'])
19 | print ("first sentence second sentence , which word does not match? word2vec says: "+model.doesnt_match("first sentence second sentence".split()))
20 |
21 | print model['first'].shape
22 |
23 | #not yet fully understood
24 | class TextLoader(object):
25 | def __init__(self):
26 | pass
27 |
28 | def __iter__(self):
29 | input = open('corpus-seg.txt', 'r')
30 | line = str(input.readline())
31 | counter = 0
32 | while line != None and len(line) > 4:
33 | # print line
34 | segments = line.split(' ')
35 | yield segments
36 | line = str(input.readline())
37 |
38 | #not yet fully understood
39 | class MySentences(object):
40 | def __init__(self, dirname):
41 | self.dirname = dirname
42 |
43 | def __iter__(self):
44 | for fname in os.listdir(self.dirname):
45 | for line in open(os.path.join(self.dirname, fname)):
46 | yield line.split()
47 |
48 | #sentences = MySentences('/some/directory') # a memory-friendly iterator
49 | #model = gensim.models.Word2Vec(sentences)
50 |
51 | def fab(max):
52 | n, a, b = 0, 0, 1
53 | while n < max:
54 | yield b
55 | # print b
56 | a, b = b, a + b
57 | n = n + 1
58 | print type(fab(5))
59 | next(fab(5))
60 | for i in fab(5):
61 | print i
62 | from gensim import corpora
63 |
64 | documents = ["Human machine interface for lab abc computer applications",
65 | "A survey of user opinion of computer system response time",
66 | "The EPS user interface management system",
67 | "System and human system engineering testing of EPS",
68 | "Relation of user perceived response time to error measurement",
69 | "The generation of random binary unordered trees",
70 | "The intersection graph of paths in trees",
71 | "Graph minors IV Widths of trees and well quasi ordering",
72 | "Graph minors A survey"]
73 |
74 | # remove common words and tokenize
75 | stoplist = set('for a of the and to in'.split())
76 | texts = [[word for word in document.lower().split() if word not in stoplist]
77 | for document in documents]
78 |
79 | # remove words that appear only once
80 | from collections import defaultdict
81 | frequency = defaultdict(int)
82 | for text in texts:
83 | for token in text:
84 | frequency[token] += 1
85 | print 'success'
86 | texts = [[token for token in text if frequency[token] > 1]
87 | for text in texts]
88 | from pprint import pprint # pretty-printer
89 | pprint(texts)
90 | print type(texts)
91 |
92 | dictionary = corpora.Dictionary(texts)
93 | #dictionary.save('../tmp/deerwester.dict') # store the dictionary, for future reference
94 | print(dictionary)
95 | print(dictionary.token2id)
96 | new_doc = "Human computer interaction"
97 | new_vec = dictionary.doc2bow(new_doc.lower().split())
98 | print(new_vec)
99 |
100 | corpus = [dictionary.doc2bow(text) for text in texts]
101 | #corpora.MmCorpus.serialize('/tmp/deerwester.mm', corpus) # store to disk, for later use
102 | print(corpus)
103 |
--------------------------------------------------------------------------------
/pythonadvance/Regression_analyse.py:
--------------------------------------------------------------------------------
1 | #-*- coding:utf-8 -*-
2 | import matplotlib.pyplot as plt
3 | import numpy as np
4 | import scipy as sp
5 | from scipy.stats import norm
6 | from sklearn.pipeline import Pipeline
7 | from sklearn.linear_model import LinearRegression
8 | from sklearn.preprocessing import PolynomialFeatures
9 | from sklearn import linear_model
10 |
11 | ''' Generate the data '''
12 | x = np.arange(0, 1, 0.002)
13 | y = norm.rvs(0, size=500, scale=0.1)#draw 500 normally distributed values with mean 0 and standard deviation 0.1
14 | y = y + x ** 2
15 |
16 | ''' Root mean squared error '''
17 |
18 |
19 | def rmse(y_test, y):
20 | return sp.sqrt(sp.mean((y_test - y) ** 2))
21 |
22 |
23 | ''' How much better the model is than simply predicting the mean, typically in [0, 1]: 0 means no better than the mean, 1 means a perfect prediction. This implementation follows the scikit-learn documentation. '''
24 |
25 | #R2 compares the predictions against always predicting the mean and measures how much better they are. It usually lies in (0, 1): 0 means no better than just taking the mean, while 1 means every prediction matches the true value exactly.
26 |
27 | #Definitions of R2 differ slightly between references. The R2 function here follows the scikit-learn documentation and agrees with clf.score.
28 | def R2(y_test, y_true):
29 | return 1 - ((y_test - y_true) ** 2).sum() / ((y_true - y_true.mean()) ** 2).sum()
30 |
31 |
32 | ''' This is the version from Conway & White, "Machine Learning for Hackers" '''
33 |
34 |
35 | def R22(y_test, y_true):
36 | y_mean = np.array(y_true)
37 | y_mean[:] = y_mean.mean()
38 | return 1 - rmse(y_test, y_true) / rmse(y_mean, y_true)
39 |
40 |
41 | plt.scatter(x, y, s=5)
42 | degree = [1, 2, 100]
43 | y_test = []
44 | y_test = np.array(y_test)
45 |
46 | for d in degree:
47 | clf = Pipeline([('poly', PolynomialFeatures(degree=d)),
48 | ('linear', LinearRegression(fit_intercept=False))])
49 | clf.fit(x[:, np.newaxis], y)
50 | y_test = clf.predict(x[:, np.newaxis])
51 |
52 | print(clf.named_steps['linear'].coef_)
53 | print('rmse=%.2f, R2=%.2f, R22=%.2f, clf.score=%.2f' %
54 | (rmse(y_test, y),
55 | R2(y_test, y),
56 | R22(y_test, y),
57 | clf.score(x[:, np.newaxis], y)))
58 |
59 | plt.plot(x, y_test, linewidth=2)
60 |
61 | plt.grid()
62 | plt.legend(['1', '2', '100'], loc='upper left')
63 | plt.show()
--------------------------------------------------------------------------------
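
As the comments above note, R2() follows the scikit-learn definition and agrees with clf.score. A quick cross-check against sklearn.metrics.r2_score (not used in the file itself), evaluated for the last fitted degree:

    from sklearn.metrics import r2_score

    # same quantity as the hand-rolled R2(y_test, y): 1 - SS_res / SS_tot,
    # and the same value Pipeline.score reports on this data
    print(r2_score(y, y_test))
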
/pythonadvance/StatsModelstest.py:
--------------------------------------------------------------------------------
1 | #-*- coding:utf-8 -*-
2 | from statsmodels.tsa.stattools import adfuller as ADF
3 | import numpy as np
4 |
5 | print ADF(np.random.rand(100))
--------------------------------------------------------------------------------
/pythonadvance/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hailiang-wang/SmartQA-System/db659597eea7ad5759e183d8affa9053e4076a34/pythonadvance/__init__.py
--------------------------------------------------------------------------------
/pythonadvance/cipin.py:
--------------------------------------------------------------------------------
1 | # from numpy import array
2 | # from math import sqrt
3 | #
4 | # from pyspark.mllib.clustering import KMeans, KMeansModel
5 | # from pyspark import SparkContext
6 | # sc = SparkContext("local",appName="KMeans")
7 | #
8 | # data = sc.textFile("D:\\PycharmProjects\\data\\mllib\\kmeans_data.txt")
9 | # parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))
10 | #
11 | # # Build the model (cluster the data)
12 | # clusters = KMeans.train(parsedData, 2, maxIterations=10,
13 | # runs=10, initializationMode="random")
14 | #
15 | # # Evaluate clustering by computing Within Set Sum of Squared Errors
16 | # def error(point):
17 | # center = clusters.centers[clusters.predict(point)]
18 | # return sqrt(sum([x**2 for x in (point - center)]))
19 | #
20 | # WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
21 | # print("Within Set Sum of Squared Error = " + str(WSSSE))
22 | #
23 | # # Save and load model
24 | # clusters.save(sc, "D:\\PycharmProjects\\KmeansModel")
25 | # sameModel = KMeansModel.load(sc, "D:\\PycharmProjects\\KmeansModel")
26 |
27 |
28 | from pyspark import SparkContext
29 |
30 | logFile = "D:\spark-1.6.2-bin-hadoop2.6\README.md"
31 | sc = SparkContext("local","Simple App")
32 | logData = sc.textFile(logFile).cache()
33 |
34 | numAs = logData.filter(lambda s: 'a' in s).count()
35 | numBs = logData.filter(lambda s: 'b' in s).count()
36 |
37 | print("Lines with a: %i, lines with b: %i"%(numAs, numBs))
38 |
39 |
40 |
41 | # from pyspark.ml.clustering import KMeans
42 | #
43 | # from pyspark.sql import SparkSession
44 | #
45 | # # Loads data.
46 | #
47 | # spark = SparkSession \
48 | # .builder \
49 | # .appName("mlkmeans") \
50 | # .getOrCreate()
51 | # dataset = spark.read.format("libsvm").load("D:\\PycharmProjects\\data\\mllib\\sample_kmeans_data.txt")
52 | #
53 | # # Trains a k-means model.
54 | # kmeans = KMeans().setK(2).setSeed(1)
55 | # model = kmeans.fit(dataset)
56 | #
57 | # # Evaluate clustering by computing Within Set Sum of Squared Errors.
58 | # wssse = model.computeCost(dataset)
59 | # print("Within Set Sum of Squared Errors = " + str(wssse))
60 | #
61 | # # Shows the result.
62 | # centers = model.clusterCenters()
63 | # print("Cluster Centers: ")
64 | # for center in centers:
65 | # print(center)
66 |
67 |
68 | # from pyspark import SparkContext
69 | #
70 | # sc = SparkContext('local')
71 | # doc = sc.parallelize([['a','b','c'],['b','d','d']])
72 | # words = doc.flatMap(lambda d:d).distinct().collect()
73 | # word_dict = {w:i for w,i in zip(words,range(len(words)))}
74 | # word_dict_b = sc.broadcast(word_dict)
75 | #
76 | # def wordCountPerDoc(d):
77 | # dict={}
78 | # wd = word_dict_b.value
79 | # for w in d:
80 | # if dict.has_key(wd[w]):
81 | # dict[wd[w]] +=1
82 | # else:
83 | # dict[wd[w]] = 1
84 | # return dict
85 | # print doc.map(wordCountPerDoc).collect()
86 | # print "successful!"
87 |
--------------------------------------------------------------------------------
/pythonadvance/clustermeric.py:
--------------------------------------------------------------------------------
1 | from sklearn.cluster import AffinityPropagation
2 | from sklearn import metrics
3 | from sklearn.datasets.samples_generator import make_blobs
4 |
5 |
6 | centers = [[1, 1], [-1, -1], [1, -1]]
7 | X, labels_true = make_blobs(n_samples=300, centers=centers, cluster_std=0.5,
8 | random_state=0)
9 | print type(X)
10 | print X
11 | af = AffinityPropagation(preference=-50).fit(X)
12 | cluster_centers_indices = af.cluster_centers_indices_
13 | labels = af.labels_
14 | print type(labels)
15 | print labels
16 | n_clusters_ = len(cluster_centers_indices)
17 |
18 | print('Estimated number of clusters: %d' % n_clusters_)
19 | print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
20 | print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
21 | print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
22 | print("Adjusted Rand Index: %0.3f"
23 | % metrics.adjusted_rand_score(labels_true, labels))
24 | print("Adjusted Mutual Information: %0.3f"
25 | % metrics.adjusted_mutual_info_score(labels_true, labels))
26 | print("Silhouette Coefficient: %0.3f"
27 | % metrics.silhouette_score(X, labels, metric='sqeuclidean'))
28 |
--------------------------------------------------------------------------------
/pythonadvance/dxckeras.py:
--------------------------------------------------------------------------------
1 | from keras.models import Sequential
2 | from keras.layers.core import Dense ,Dropout,Activation
3 | from keras.optimizers import SGD
4 | import pandas as pd
5 | import matplotlib.pyplot as plt
6 |
7 | print 'qx'
8 | datas=pd.read_excel('../data/sampling.xlsx')
9 | X=datas.iloc[:,1:].as_matrix()
10 | y=datas.iloc[:,0].as_matrix()
11 | print y
12 | model= Sequential()
13 | model.add(Dense(26,input_dim=26))
14 | model.add(Activation('linear'))
15 |
16 | model.add(Dense(26,input_dim=26))
17 | model.add(Activation('linear'))
18 | model.add(Dropout(0.5))
19 | model.add(Dense(1,input_dim=26))
20 | #model.add(Activation('linear'))
21 | # sgd = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)
22 | # model.compile(loss='mean_squared_error',optimizer=sgd,metrics=["accuracy"])
23 | model.compile(loss='mean_squared_error', optimizer='rmsprop')
24 | model.fit(X, y, batch_size=5, nb_epoch=100, shuffle=True,verbose=0,validation_split=0.2)
25 | score=model.evaluate(X,y,batch_size=16)
26 | p=model.predict(X,batch_size=16,verbose=0)
27 | print p
28 |
29 | fig, ax = plt.subplots()
30 | ax.scatter(y, p)
31 |
32 | ax.plot([y.min(),y.max()],[y.min(),y.max()],'g',lw=4)
33 | plt.show()
34 |
35 |
36 | #from keras.models import Sequential
37 | #
38 | # from keras.layers import LSTM, Dense
39 | #
40 | # import numpy as np
41 | #
42 | # data_dim = 16
43 | #
44 | # timesteps = 8
45 | #
46 | # nb_classes = 10
47 | #
48 | # # expected input data shape: (batch_size, timesteps, data_dim)
49 | #
50 | # model = Sequential()
51 | #
52 | # model.add(LSTM(32, return_sequences=True,
53 | #
54 | # input_shape=(timesteps, data_dim)))
55 | #
56 | # model.add(LSTM(32, return_sequences=True))
57 | #
58 | # model.add(LSTM(32))
59 | #
60 | # model.add(Dense(10, activation='softmax'))
61 | #
62 | # model.compile(loss='categorical_crossentropy',
63 | #
64 | # optimizer='rmsprop',
65 | #
66 | # metrics=['accuracy'])
67 | #
68 | # # generate dummy training data
69 | #
70 | # x_train = np.random.random((1000, timesteps, data_dim))
71 | #
72 | # y_train = np.random.random((1000, nb_classes))
73 | #
74 | # # generate dummy validation data
75 | #
76 | # x_val = np.random.random((100, timesteps, data_dim))
77 | #
78 | # y_val = np.random.random((100, nb_classes))
79 | #
80 | # model.fit(x_train, y_train,
81 | #
82 | # batch_size=64, nb_epoch=5,
83 | #
84 | # validation_data=(x_val, y_val))
85 |
--------------------------------------------------------------------------------
/pythonadvance/dxcprec.py:
--------------------------------------------------------------------------------
1 | #-*- coding: utf-8 -*-
2 | import pandas as pd
3 | import matplotlib.pyplot as plt
4 | import numpy as np
5 |
6 | datas=pd.read_excel('../data/sampling.xlsx')
7 | # print datas
8 | data=datas.iloc[:,1:].as_matrix()
9 | target=datas.iloc[:,0].as_matrix()
10 | print data.shape
11 |
12 | print type(target)
13 | from sklearn.linear_model import LinearRegression
14 | model=LinearRegression()
15 | model.fit(data,target)
16 | # print model.predict(data[0])
17 | #
18 | print np.matrix(model.coef_)
19 | from sklearn.model_selection import cross_val_predict
20 | predicted = cross_val_predict(model, data, target, cv=10)
21 |
22 | fig, ax = plt.subplots()
23 | ax.scatter(target, predicted)
24 | ax.plot([target.min(), target.max()], [target.min(), target.max()], 'g', lw=1)
25 | ax.set_xlabel('Measured')
26 | ax.set_ylabel('Predicted')
27 | plt.show()
28 | print model.score(data,target)
--------------------------------------------------------------------------------
/pythonadvance/dxcsvm.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from sklearn.svm import SVR
3 | import matplotlib.pyplot as plt
4 | import numpy as np
5 |
6 |
7 |
8 | datas=pd.read_excel('../data/sampling.xlsx')
9 | X=datas.iloc[:,1].as_matrix()
10 | y=datas.iloc[:,0].as_matrix()
11 | # print X
12 | # print y
13 | print X.shape
14 | # import xlrd
15 | # datas=xlrd.open_workbook('sampling.xlsx')
16 | # table = datas.sheet_by_name(u'Sheet1')
17 | # X= np.matrix(table.col_values(4))
18 | # y=np.matrix(table.col_values(0))
19 | # print X
20 | # print y
21 | svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.1)
22 | svr_lin = SVR(kernel='linear', C=1e3)
23 | svr_poly = SVR(kernel='poly', C=1e3, degree=2)
24 | svr_rbf.fit(X, y)
25 | y_rbf = svr_rbf.predict(X)
26 | svr_lin.fit(X, y)
27 | y_lin = svr_lin.predict(X)
28 | svr_poly.fit(X, y)
29 | y_poly = svr_poly.predict(X)
30 | lw = 2
31 | plt.scatter(X, y, color='darkorange', label='data')
32 | plt.hold('on')
33 | plt.plot(X, y_rbf, color='navy', lw=lw, label='RBF model')
34 | plt.plot(X, y_lin, color='c', lw=lw, label='Linear model')
35 | plt.plot(X, y_poly, color='cornflowerblue', lw=lw, label='Polynomial model')
36 | plt.xlabel('data')
37 | plt.ylabel('target')
38 | plt.title('Support Vector Regression')
39 | plt.legend()
40 | plt.show()
41 |
--------------------------------------------------------------------------------
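
One caveat about dxcsvm.py: it passes a one-dimensional X straight into SVR.fit, which newer scikit-learn releases reject because they expect a 2-D feature matrix of shape (n_samples, n_features). A hedged sketch of the adjustment, keeping column 1 as in the file and using .values in place of the deprecated .as_matrix():

    X = datas.iloc[:, 1].values.reshape(-1, 1)   # shape (n_samples, 1)
    y = datas.iloc[:, 0].values
    svr_rbf.fit(X, y)                            # the other two SVR models are fitted the same way
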
/pythonadvance/full_linearRession.py:
--------------------------------------------------------------------------------
1 | #-*- coding:utf-8 -*-
2 | import matplotlib.pyplot as plt
3 |
4 | import pandas as pd
5 | import numpy as np
6 | from sklearn.linear_model import LinearRegression
7 |
8 | data=pd.read_excel('../data/Folds5x2_pp.xlsx')
9 | print data.shape
10 | X=data.iloc[:,0:4]
11 | y=data.iloc[:,4]
12 |
13 |
14 | from sklearn.cross_validation import train_test_split
15 |
16 | X_train,x_test,y_train,y_test=train_test_split(X,y)
17 | #print X_train
18 | #print y_train
19 | #print x_test
20 | #print y_test
21 |
22 |
23 | model=LinearRegression()
24 | model.fit(X_train,y_train)
25 | print 'success'
26 | y_pred=model.predict(x_test)
27 |
28 | from sklearn import metrics
29 | #print model.score(X,y)
30 |
31 | # compute MSE with scikit-learn
32 | print "MSE:",metrics.mean_squared_error(y_test, y_pred)
33 | # compute RMSE with scikit-learn
34 | print "RMSE:",np.sqrt(metrics.mean_squared_error(y_test, y_pred))
35 |
36 | from sklearn.model_selection import cross_val_predict
37 | print 'success'
38 | y_pred=cross_val_predict(model,X,y,cv=10)
39 |
40 | print "MSE",metrics.mean_squared_error(y,y_pred)
41 | print "RMSE:",np.sqrt(metrics.mean_squared_error(y,y_pred))
42 |
43 | ax=plt.subplot()
44 | ax.scatter(y,y_pred)
45 | ax.plot([y.min(),y.max()],[y.min(),y.max()])
46 | ax.set_xlabel('Measured')
47 | ax.set_ylabel('Predicted')
48 | plt.show()
--------------------------------------------------------------------------------
/pythonadvance/nltktest.py:
--------------------------------------------------------------------------------
1 | #-*- coding:utf-8 -*-
2 | import nltk
3 | # nltk.download()
4 | sentence="I love python."
5 | tokens=nltk.word_tokenize(sentence)
6 |
7 |
8 | to=nltk.pos_tag(tokens)
9 | for i in to:
10 | print i
11 | h=nltk.pos_tag(['美'])
12 |
13 | for i in h:
14 | print i[0]+'---'+i[1]
15 | # from nltk.corpus import webtext
16 | #
17 | # webtext.fileids() #ids of all files in the corpus
18 | #
19 | # webtext.raw(fileid) #the raw text of the given file
20 | #
21 | # webtext.words(fileid) #all words in the file
22 | #
23 | # webtext.sents(fileid) #all sentences in the file
--------------------------------------------------------------------------------
/pythonadvance/numpytest.py:
--------------------------------------------------------------------------------
1 | #-*- coding: utf-8 -*-
2 | import numpy as np
3 |
4 | a=np.array([2,0,1,5])
5 | print a
6 | print a[:3]
7 | print a.min()
8 | a.sort()
9 | b=np.array([[1,2,3],[4,5,6]])
10 | print (b*b) #element-wise product, not matrix multiplication in the usual sense
11 |
12 | print type(b) #still an ordinary numpy ndarray
13 |
14 |
15 | a=np.arange(1,4).cumprod() #cumulative product, i.e. the factorials 1!, 2!, 3!
16 | print a
17 | b=np.array([2]*3).cumprod() #powers of 2 from 2^1 to 2^3 ([2]*3 produces three 2s)
18 | print b
19 |
20 | #all of these np calls return arrays
21 | print np.linspace(1,2,10) #arrays print without commas, which distinguishes them from lists
22 |
23 | print np.array([2]*3)
24 | print 'matrix'
25 | d= np.array([[1,2,3],[4,5,6]])
26 | print d.cumsum(0) #axis 0 accumulates down the columns, axis 1 across the rows
27 | print d.cumsum(1)
28 | print d.cumprod(1)
29 | print d.cumprod(0)
30 |
31 | e=np.random.randn(3,4)
32 | print type(e) #numpy returns one- or higher-dimensional arrays
33 |
34 | t1=np.linspace(0,2,10)
35 | print t1
36 | t2=np.linspace(-1,1,20)
37 | print t1,t2
38 | #t=np.concatenate(t1,t2)
39 | #print t
40 |
41 |
42 |
43 | a = np.matrix([ [1, 2, 3, 4],
44 | [5, 5, 6, 8],
45 | [7, 9, 9, 1],
46 | [4, 6, 7, 1]
47 | ])
48 |
49 | #matrix addition and subtraction:
50 | e = a + a
51 | #or
52 | e = a - a
53 |
54 | #matrix multiplication:
55 | b = a * a #for np.matrix, * already is matrix multiplication (only np.array uses * element-wise)
56 | print type(b)
57 | #or
58 | c = np.dot(a, a) #matrix multiplication
59 | #or
60 | d = a
61 | np.dot(a, a, d) #matrix multiplication
62 |
63 | #transpose
64 | g = a.transpose()
65 | #or
66 | h = a.T #.T gives the same transpose
67 |
68 | #逆矩阵(inverse)
69 | #The inverse of a matrix A is the matrix B such that AB=I where I is the identity matrix consisting of ones down the main diagonal. Usually B is denoted B=A-1 .
70 | #In SciPy, the matrix inverse of the Numpy array, A, is obtained using linalg.inv (A) , or using A.I
71 | f = np.linalg.inv(a)
72 | #or
73 | f = a ** (-1)
74 | #or
75 | f = a.I
76 |
77 | #determinant
78 | j = np.linalg.det(a)
79 |
80 | #adjugate (classical adjoint) matrix
81 | #(needs more testing)
82 | m = np.dot(np.linalg.det(a), np.linalg.inv(a)) # A^-1 = adj(A) / |A|  ==>  adj(A) = A^-1 * |A|
83 |
84 | #matrix norm
85 | k = np.linalg.norm(a)
86 |
87 | l1=[1,2,3]
88 | l2=[1,2,3]
89 | l3=[1,2,3]
90 | l1=np.array(l1)
91 | l2=np.array(l2)
92 | l3=np.array(l3)
93 | l=list((l1+l2+l3)/3)
94 | print l
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
--------------------------------------------------------------------------------
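
The distinction the comments in numpytest.py circle around is np.array versus np.matrix: for arrays, * is element-wise and the matrix product needs np.dot, while np.matrix overloads * as the matrix product. A small sketch of the contrast (values are arbitrary):

    import numpy as np

    A = np.array([[1, 2], [3, 4]])
    M = np.matrix([[1, 2], [3, 4]])

    print(A * A)          # element-wise product
    print(np.dot(A, A))   # matrix product for arrays
    print(M * M)          # matrix product: np.matrix overloads *
    print(A.T, M.T)       # .T is the transpose for both
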
/pythonadvance/pandatest.py:
--------------------------------------------------------------------------------
1 | #-- coding:utf-8 --#
2 | import pandas as pd
3 | from pandas import Series,DataFrame
4 | import numpy as np
5 |
6 | l=Series(data=[1,2,3])
7 | print l
8 | print l.data
9 |
10 | for i in l.index:
11 |     print l[i] #prints only the data values
12 | print l[1:] #slices the whole Series
13 | #print l['a']
14 | print l.index
15 | m=[1,2,3,4]
16 | print m #a list and an Index are different data structures: Index is its own object type, but a plain list can be assigned to index or columns. Elements are accessed by position, including slices.
17 |
18 | s=Series([2,3,4],index=['b','a','c'])
19 |
20 | d=DataFrame({'e':4,'d':5,'f':6},index=['a','b','c'])
21 | print d.index
22 | print d.columns
23 | print d.values
24 | print d.describe()
25 | print 'testing iterrows'
26 | for i ,series in d.iterrows():#yields each row's index label plus that row's value for every column.
27 |     print i,'---',series
28 |
29 | print 'testing iloc'
30 | print d
31 | print d.iloc[0:2]#the first position indexes rows by default.
32 |
33 | print set(d['d'].tolist())
34 | print s
35 | print d
36 | sd=pd.concat((s,d),axis=1)
37 | print sd
38 |
39 | d.head(1)
40 | d.tail(1)
41 | #d.to_excel('../data/test.xlsx')
42 | r=pd.read_excel('../data/sampling.xlsx')
43 | #print r
44 |
45 | dates=pd.date_range('20170217',periods=2)
46 | data=pd.DataFrame(np.random.randn(2,4),index=dates,columns=['a','b','c','d'])
47 | print data
48 | print 'testing tolist'
49 | print list(set(data['d'].tolist()[0]))
50 | #doc_word = list(set(doc['content'].tolist()[0]))
51 | a=Series([1,2,3,4,None,5])
52 | print a.isnull() #returns a boolean Series
53 | print type(a[a.isnull()]) #keeps only the entries where the mask is True, still a Series; every pandas column is a Series
54 |
55 | print a[range(1,3)] #a Series can also be indexed with a list of positions.
56 |
57 |
--------------------------------------------------------------------------------
/pythonadvance/pymysqltest.py:
--------------------------------------------------------------------------------
1 | #-*- coding: utf-8 -*-
2 | import pymysql
3 |
4 | conn=pymysql.connect(host='192.168.1.111',user='root',passwd='1234',db='zmap_empi',charset='utf8')
5 |
6 | cur=conn.cursor()
7 | query='select count(*) from zmap_r_patient_empi_jb'
8 |
9 | cur.execute(query)
10 | result = cur.fetchall() # result is a tuple of row tuples, i.e. ((), (), ... ())
11 |
12 |
13 | for i in result:
14 | print i
--------------------------------------------------------------------------------
/pythonadvance/regular.py:
--------------------------------------------------------------------------------
1 |
2 | # -*- coding: UTF-8 -*-
3 |
4 | import re
5 | print re.match('www', 'www.runoob.com').span() # matches at the start of the string
6 | print re.match('www','www.runoob.com')
7 | #print re.match('com', 'www.runoob.com').span() # 'com' is not at the start: re.match only tries to match the pattern at the beginning of the string and returns None otherwise, so .span() would fail here.
8 |
9 |
10 | import re
11 |
12 | test = "我是123周小旭,来自1bd江西ab九江"
13 |
14 | result = re.findall(ur'[\u4e00-\u9fa5]', test.decode('utf-8'))
15 |
16 | print result
17 |
18 |
19 | print ''.join(result)
20 |
21 | result=re.findall(r'[0-9]',test)
22 | print result
23 |
24 | result=re.findall(r'[a-z]',test)
25 | print result
26 |
27 | text = "JGood is a handsome boy, he is cool, clever, and so on..."
28 | m = re.search(r'\shan(ds)ome\s', text)
29 | if m:
30 | print m.group(0), m.group(1)
31 | else:
32 | print 'not search'
33 |
34 |
35 | text = "JGood is a handsome boy, he is cool, clever, and so on..."
36 | print re.sub(r'\s+', '-', text)
37 | #re.split(r'\s+', text) splits the string on whitespace into a list of words
38 |
39 | #re.findall returns every match in the string, e.g. re.findall(r'\w*oo\w*', text) returns all words containing 'oo'.
40 |
41 | text = "JGood is a handsome boy, he is cool, clever, and so on..."
42 | regex = re.compile(r'\w*oo\w*')
43 | print regex.findall(text) #find every word containing 'oo'
44 | print regex.sub(lambda m: '[' + m.group(0) + ']', text) #wrap every word containing 'oo' in square brackets.
45 | inputStr = "hello 123 world 456"
46 | replacedStr = re.sub("\d+", "222", inputStr)
47 | print replacedStr
48 |
49 | str='''江苏省人民医院心血管内科 感谢信56封礼物40个[图片]职 称:副主任医师 副教授[图片]擅 长:
50 | 心房颤动、室性心动过速与各种室上性心动过速的导管消融与缓慢性心律失常的起搏治疗
51 | 心房颤动、室性心动过速与各种室上性心动过速的导管消融与缓慢性心律失常的起搏治疗[图片]执业经历:
52 | 张凤祥,男,医学博士,副主任医师、副教授,硕士生导师;中国医师协会心律失常分会青委会副主任委员,中华医学会心电生理和起搏分会青年委员,中国医师协会心血管内科医师分会青年委员,中华医学会江苏省心血管病分会青年委员会副主委,中华全科医学杂志编委;2007年毕业于南京医科大学并任职于南京医科大学第一附属医院(江苏省人民医院)。熟练掌握各种心律失常的诊断与治疗。主要研究方向:心律失常的临床治疗与基础研究。擅长1)心房颤动、室性早搏、室性心动过速、房性心动过速、阵发性室上速、等心律失常的导管消融治疗;2) 房室传导阻滞、病态窦房结综合征等缓慢心律失常的起搏治疗;3)Brugada综合征、长QT综合征、短QT、儿茶酚胺敏感室速等心脏性猝死预防。发表学术论文50篇,其中SCI文章近20篇。主持国家自然科学基金3项,江苏省六大人才高峰课题1项,中国医师协会课题1项;参与973、十二五等重大科研课题;荣获江苏省卫生厅新技术引进二等奖2项。<< 收起'''
53 | print str
54 | str=str.replace('\n','')
55 | print str
56 | str1='''通过好大夫在线提前预约名大夫的办法
57 |
58 | 准备去北京或者其他大城市看病或者做手术的人,可以提前通过好大夫在线联系好大夫,否则,直接去了这些名大夫的号很难挂。下边以去北京安贞找马长生治疗房颤为例说明办法:
59 | ,
60 | 第一,提前拍摄好自己的病历材料,(手头没有可以凭病号身份证去医院病案室复印)。要求最少提供:
61 | (1)发作时期的心电图或者记录有发作症状的24小时心电图报告页。
62 | (2)心脏彩超。必须带数据部分
63 | (3)血液生化全部项目报告单
64 | (4)你治疗期间做的其他价格高的检查,比如心脏CTA等等
65 | (5)最近一次因为房颤住院的出院小结
66 | 图片要求拍摄清晰,可以适当掩盖住名字部分。
67 | 写一份详细的生病状况和治疗情况说明,内容要有发病症状,所入医院名称和科室。大概治疗用药物,写上出院小结内大夫给的诊断结论还有出院大夫要求吃的药物。
68 | 第二登陆马长生的好大夫在线的个人网站地址是 http://machangsheng.haodf.com/ (如果你找其他大夫看病就去其他大夫的网站)
69 | 第三点击 网上咨询。一般都提示你注册,注册的时候所留手机号码必须真实有效。以后要用来接收大夫回复以后的短信通知。
70 | 第四填写各种内容和上传图片
71 | 第五,点击最下面的确定 等待大夫的回复
72 | 大夫回复后,有时候他会提问你问题,也可能要求补充病历材料等,你可以根据情况继续上传和给出说明
73 | 一般来说,一个注册号码一次可以得到3次提问机会,超过部分是要收费的。当然,到第三次的时候大夫如果认为有必要,一般会再给你三次机会。
74 | 如果大夫认为你适合找他看病或者手术,一般他会给你一个住院管理大夫的电话,你电话过去说明是马长生叫你打的电话,要求给定病床。等他有病床了会提前通知你。这样你再去北京就不用等待了,免去了很多外地人到北京住院治疗的烦恼
75 |
76 | 以下是心脏内科导管消融的好大夫
77 |
78 | http://machangsheng.haodf.com/ 马长生 北京安贞
79 |
80 | http://dongjianzeng.haodf.com/ 董建增 北京安贞
81 |
82 | http://yaoyan.haodf.com/ 姚焰 北京阜外
83 |
84 | http://liuxu001.haodf.com/ 刘旭 上海市胸科医院 心内科
85 |
86 | http://wxqing1212.haodf.com/ 王现青.河南省人民医院 心血管内科
87 |
88 | http://huanghe1977.haodf.com/ 黄鹤 武汉大学人民医院 心血管内科
89 |
90 | http://xpliu71.haodf.com/ 刘兴鹏 北京朝阳医院 心脏中心
91 |
92 | http://jiangchenyang.haodf.com/ 蒋晨阳 浙江大学医学院附属邵逸夫医院 心内科
93 |
94 | http://liushaowen.haodf.com/ 刘少稳上海市第一人民医院 心内科
95 |
96 |
97 |
98 |
99 | 以下是心脏外科用胸腔镜治疗房颤,的好大夫,
100 |
101 | http://zhengzhe.haodf.com/ 郑哲 北京阜外医院
102 |
103 | http://mxu263.haodf.com/ 孟旭 北京安贞医院 心脏外科中心
104 |
105 | http://xuchunlei.haodf.com/ 许春雷 北京安贞医院 心脏外科中心
106 |
107 | http://meiju.haodf.com/ 上海新华 梅举
108 |
109 | http://chengyunge.haodf.com/ 上海远大程云阁 国内唯一用胸腔镜做迷宫三手术的大夫,价格4.5万最便宜
110 |
111 |
112 | 病友阵发房颤根据亲身经历整理
113 | '''
114 | print str1
115 | str1=str1.replace('/r/n','')
116 | print str1
--------------------------------------------------------------------------------
/pythonadvance/scipytest.py:
--------------------------------------------------------------------------------
1 | from scipy import stats
2 | X=stats.norm.rvs(0,size=500,scale=0.1)
3 | #X =stats.norm(loc=1.0,scale=2.0,size = 100)
4 |
5 |
6 | print stats.norm.fit(X)
7 | from scipy import stats
8 | import numpy as np
9 | import matplotlib.pyplot as plt
10 | fs_meetsig = np.random.random(30)
11 | fs_xk = np.sort(fs_meetsig)
12 | fs_pk = np.ones_like(fs_xk) / len(fs_xk)
13 | fs_rv_dist = stats.rv_discrete(name='fs_rv_dist', values=(fs_xk, fs_pk))
14 |
15 | plt.plot(fs_xk, fs_rv_dist.cdf(fs_xk), 'b-', ms=12, mec='r', label='friend')
16 | plt.show()
17 |
18 | age = [23, 23, 27, 27, 39, 41, 47, 49, 50, 52, 54, 54, 56, 57, 58, 58, 60, 61]
19 | fat_percent = [9.5, 26.5, 7.8, 17.8, 31.4, 25.9, 27.4, 27.2, 31.2, 34.6, 42.5, 28.8, 33.4, 30.2, 34.1, 32.9, 41.2, 35.7]
20 | age = np.array(age)
21 | fat_percent = np.array(fat_percent)
22 | data = np.vstack([age, fat_percent]).T  # pair each age with its fat percentage; reshape([-1, 2]) would scramble the pairs
23 |
24 | print(stats.describe(data))
25 |
26 | for key, value in stats.describe(data)._asdict().items():
27 | print(key, ':', value)
28 |
29 | # shannon_entropy = stats.entropy(ij/sum(ij), base=None)
30 | # print(shannon_entropy)
--------------------------------------------------------------------------------
/pythonadvance/sklearnkmeans.py:
--------------------------------------------------------------------------------
1 | #-*- coding:utf-8 -*-
2 | import pandas
3 | import matplotlib.pyplot as plt
4 |
5 | inputfile='../data/sampling.xlsx'
6 | outputfile='../data/outputfile.xlsx'
7 | data=pandas.read_excel(inputfile)
8 | print data.index #the index can also be set with index_col when reading the file.
9 | #print data.values
10 | #data=data[2:]
11 | data_zs=1.0*(data-data.mean())/data.std()
12 |
13 |
14 | from sklearn.cluster import KMeans
15 | model=KMeans(n_clusters=100,n_jobs=1,max_iter=500)
16 | model.fit(data_zs)
17 | # fig=plt.figure()
18 | # ax=fig.add_subplot(111)
19 | # ax.scatter()
20 |
21 | t=pandas.concat([data,pandas.Series(model.labels_,index=data.index)],axis=1)
22 | t.columns=list(data.columns)+[u'聚类类别']
23 |
24 | #print type(t)
25 | #print t
26 |
27 | #print len( model.labels_)
28 | r1=pandas.Series(model.labels_).value_counts()
29 | #r1.index=range(100)
30 | #print len(model.labels_)
31 | #print model.cluster_centers_
32 | r2=pandas.DataFrame(model.cluster_centers_)
33 | r=pandas.concat([r2,r1],axis=1)
34 | r.columns=list(data.columns) + [u'聚类类别']
35 | #print r
36 |
37 |
38 | l= r[r[u'聚类类别']<2].index
39 | #print l
40 | #print r
41 | l=l.tolist()
42 | # print type(l)
43 | # print l
44 | # t=t.iloc[l,:]
45 | # print t
46 | #t=t[t[u'聚类类别'] in l]
47 | # print set(l)
48 | # print type(t[u'聚类类别'])
49 | t=t[ t[u'聚类类别'].isin(l)]
50 |
51 |
52 | # for i in l:
53 | # (t[t[u'聚类类别']==i].index)
54 | # t[t[u'聚类类别']==i]
55 | # r=r.append(t[t[u'聚类类别']==i])
56 |
57 | #r=r[99:]
58 |
59 | #print r
60 | #print data(x)
61 | # for i in x:
62 | # print data(index=i)
63 | t.to_excel(outputfile)
64 |
--------------------------------------------------------------------------------
/pythonadvance/yaofang_fenxi_text.py:
--------------------------------------------------------------------------------
1 | #encoding=utf-8
2 | #from __future__ import unicode_literals
3 | #import sys
4 | #sys.path.append("../")
5 |
6 | import jieba
7 | import jieba.posseg
8 | import jieba.analyse
9 |
10 | print('Keyword extraction')
11 | print('-'*40)
12 | print(' TF-IDF')
13 | print('-'*40)
14 |
15 | f = open("../data/ndy.txt","r")
16 | s = f.read()
17 | print type(s)
18 |
19 | for x, w in jieba.analyse.extract_tags(s, withWeight=True):
20 | print('%s %s' % (x, w))
21 |
22 | print('-'*40)
23 | print(' TextRank')
24 | print('-'*40)
25 |
26 | for x, w in jieba.analyse.textrank(s, withWeight=True):
27 | print('%s %s' % (x, w))
28 |
29 | seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
30 | print "Full Mode:", "/ ".join(seg_list) #全模式
--------------------------------------------------------------------------------
/pythonadvance/yichangtest.py:
--------------------------------------------------------------------------------
1 | #-*- coding: utf-8 -*-
2 | import pandas as pd
3 | number = '../data/all_musicers.xlsx' #path to the play-count data, in the data directory one level above this script.
4 | data = pd.read_excel(number)
5 |
6 | data1=data.iloc[:,0:10]#183 days of music play counts for 10 artists
7 | #data2=data.iloc[:,10:20]
8 | #data3=data.iloc[:,20:30]
9 | #data4=data.iloc[:,30:40]
10 | #data5=data.iloc[:,40:50]
11 | import matplotlib.pyplot as plt #plotting library
12 | plt.rcParams['font.sans-serif'] = ['SimHei'] #so Chinese labels render correctly
13 | plt.rcParams['axes.unicode_minus'] = False #so the minus sign renders correctly
14 | plt.figure(1, figsize=(13, 26))#set the figure size
15 | #plt.figure() #create the figure
16 | p = data1.boxplot(return_type = 'dict') #draw box plots directly with the DataFrame method; this alone already shows the outliers, the code below only annotates their values.
17 | #for i in range(0,4):
18 | x = p['fliers'][2].get_xdata() # 'fliers' holds the outlier markers: [0] annotates the outliers of the 1st artist, and in general [i] those of artist i+1.
19 | y = p['fliers'][2].get_ydata()
20 | y.sort() #sort ascending
21 | print x
22 | print y
23 | for i in range(len(x)):
24 | if i>0:
25 | plt.annotate(y[i], xy = (x[i],y[i]), xytext=(x[i]+0.05 -0.8/(y[i]-y[i-1]),y[i]))
26 | else:
27 | plt.annotate(y[i], xy = (x[i],y[i]), xytext=(x[i]+0.08,y[i]))
28 |
29 | plt.show() #display the box plots
30 | #In the output the + markers are the (statistically defined) outliers. In practice, judge them against the application context: points very close to the whiskers can be treated as normal values.
31 |
32 | for i in range(0,182):
33 | if data1.iloc[:,1][i]>125:
34 | data1.iloc[:,1][i]=(data1.iloc[:,1][i+1]+data1.iloc[:,1][i-1])/2
35 | for i in range(0,182):
36 | if data1.iloc[:,2][i]>600:
37 |         data1.iloc[:,2][i]=(data1.iloc[:,2][i+1]+data1.iloc[:,2][i-1])/2
38 | for i in range(0,182):
39 | if data1.iloc[:,4][i]>225:
40 | data1.iloc[:,4][i]=(data1.iloc[:,4][i+1]+data1.iloc[:,4][i-1])/2
41 | for i in range(0,182):
42 | if data1.iloc[:,7][i]>60:
43 | data1.iloc[:,7][i]=(data1.iloc[:,7][i+1]+data1.iloc[:,7][i-1])/2
44 | for i in range(0,182):
45 | if data1.iloc[:,8][i]>2500:
46 | data1.iloc[:,8][i]=(data1.iloc[:,8][i+1]+data1.iloc[:,8][i-1])/2
47 |
48 | data1.to_csv("train_innoraml.csv")
--------------------------------------------------------------------------------
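
The five copy-pasted loops above replace values beyond a hand-picked threshold with the mean of the two neighbouring days. A hedged sketch of the same idea driven by a column-to-threshold dict (the thresholds are copied from the file, the rest is an assumption); indexing with iloc[i, col] also avoids the chained assignment the original relies on:

    thresholds = {1: 125, 2: 600, 4: 225, 7: 60, 8: 2500}
    for col, limit in thresholds.items():
        for i in range(1, 182):   # start at 1 so i-1 stays in range
            if data1.iloc[i, col] > limit:
                data1.iloc[i, col] = (data1.iloc[i + 1, col] + data1.iloc[i - 1, col]) / 2.0
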
/pythonbasic/classtest.py:
--------------------------------------------------------------------------------
1 | #-*- coding:utf-8 -*-
2 | class Person(object):
3 | pass
4 |
5 | xiaoming = Person()
6 | xiaohong = Person()
7 |
8 | print xiaoming
9 | print xiaohong
10 | print xiaoming==xiaohong
11 |
12 | print cmp(2,3)
13 | #after defining a class you can create instances and give each instance its own attributes
14 | class Person(object):
15 | pass
16 |
17 | p1 = Person()
18 | p1.name = 'Bart'
19 |
20 | p2 = Person()
21 | p2.name = 'Adam'
22 |
23 | p3 = Person()
24 | p3.name = 'Lisa'
25 | p3.job ='programmer'
26 |
27 | L1 = [p1, p2, p3]
28 | L2 = sorted(L1,lambda p1,p2:cmp(p1.name,p2.name))
29 |
30 | print L2[0].name
31 | print L2[1].name
32 | print L2[2].name
33 | print L2[2].job
34 | #attributes can also be set when the instance is initialized
35 | class Person(object):
36 | def __init__(self,nam,gender,birth,**kw):
37 | self.name=nam
38 | self.gender=gender
39 | self.birth=birth
40 | for k,v in kw.iteritems():
41 | setattr(self,k,v)
42 |
43 | xiaoming = Person('Xiao Ming', 'Male', '1990-1-1', job='Student')
44 |
45 | print xiaoming.name
46 | print xiaoming.job
47 | #Python controls attribute access through the name: an attribute whose name starts with a double underscore (__) cannot be accessed from outside the class.
48 | class Person(object):
49 | def __init__(self, name, score):
50 | self.name=name
51 | self.__score=score
52 |
53 | p = Person('Bob', 59)
54 |
55 | print p.name
56 | try:
57 | print p.__score
58 | except AttributeError:
59 | print 'attributeerror'
60 | #each instance owns its own instance attributes, independent of the others, while a class attribute exists exactly once; change the class attribute and every instance sees the change
61 | class Person(object):
62 | count= 0
63 | def __init__(self,name):
64 | Person.count= Person.count + 1
65 | self.name= name
66 |
67 | p1 = Person('Bob')
68 | print Person.count
69 |
70 | p2 = Person('Alice')
71 | print Person.count
72 |
73 | p3 = Person('Tim')
74 | print Person.count
75 |
76 | #when an instance attribute and a class attribute share a name, the instance attribute takes priority and hides the class attribute. Never modify a class attribute through an instance: that does not change the class attribute, it just binds a new instance attribute.
77 |
78 | class Person(object):
79 |
80 | __count = 0
81 |
82 | def __init__(self, name):
83 | Person.__count=Person.__count+1
84 | self.name=name
85 | print Person.__count
86 |
87 | p1 = Person('Bob')
88 | p2 = Person('Alice')
89 |
90 | try:
91 | print Person.__count
92 | except AttributeError:
93 | print 'attributeerror'
94 |
95 | #an instance method is simply a function defined inside the class; its first parameter is always self, the instance it is called on, and the remaining parameters behave exactly like those of an ordinary function:
96 | #instance methods must be called on an instance, i.e. the object has to be created first
97 | class Person(object):
98 |
99 | def __init__(self, name, score):
100 | self.name=name
101 | self.__score=score
102 |
103 | def get_grade(self):
104 | if self.__score>80:
105 | return 'A'
106 | elif self.__score>=60:
107 | return 'B'
108 | elif self.__score<60:
109 | return 'C'
110 |
111 | p1 = Person('Bob', 90)
112 | p2 = Person('Alice', 65)
113 | p3 = Person('Tim', 48)
114 |
115 | print p1.get_grade()
116 | print p2.get_grade()
117 | print p3.get_grade()
118 |
119 |
120 |
--------------------------------------------------------------------------------
/pythonbasic/decoratortest.py:
--------------------------------------------------------------------------------
1 | #-*- coding:utf-8 -*-
2 | from __future__ import division
3 | def cmp_ignore_case(s1, s2):
4 | u1=s1.upper()
5 | u2=s2.upper()
6 |     if u1 < u2:
7 |         return -1
8 |     if u1 > u2:
9 |         return 1
10 |
11 | #sorted is a higher-order function: the function passed to it here is a comparison function returning -1 or 1
12 | print sorted(['bob', 'about', 'Zoo', 'Credit'], cmp_ignore_case) #returning -1 puts the smaller element first, giving ascending order; the opposite gives descending order
13 |
14 |
15 | def count():
16 | fs = []
17 | for i in range(1, 4):
18 |         def f(j): #already a closure in itself: the inner function captures the outer function's argument and the inner function is returned
19 | def g():
20 | return j*j
21 | return g
22 | r=f(i)
23 | fs.append(r)
24 | return fs
25 |
26 | f1, f2, f3 = count()
27 | print f1(), f2(), f3() #here f1=f(1), f2=f(2), f3=f(3)
28 |
29 | #the point of a decorator is to write one decorator function that serves every function with the same need, e.g. if several functions need logging, a single log decorator covers them all
30 |
31 | import time
32 |
33 | def performance(f):
34 | def fn(*args,**kw):
35 | t1=time.time()
36 | r=f(*args,**kw)
37 | t2=time.time()
38 | print 'call %s()in %fs' % (f.__name__,(t2-t1))
39 | return r
40 | return fn
41 |
42 | @performance
43 | def factorial(n):
44 | return reduce(lambda x,y: x*y, range(1, n+1))
45 |
46 | print factorial(10)
47 |
48 | def log(f):
49 | def fn(x):
50 | print 'call ' + f.__name__ + '()...'
51 | return f(x)
52 | return fn
53 | @log
54 | def factorial(n):
55 | return reduce(lambda x,y: x*y, range(1, n+1))
56 | print factorial(10)
57 |
58 | def log(prefix):
59 | def log_decorator(f):
60 | def wrapper(*args, **kw):
61 | print '[%s] %s()...' % (prefix, f.__name__)
62 | return f(*args, **kw)
63 | return wrapper
64 | return log_decorator
65 |
66 | @log('DEBUG')
67 | def test():
68 | pass
69 | print test()
70 |
71 | import time
72 |
73 | def performance(unit):
74 | def per_decorator(f):
75 | def fn(*args,**kw):
76 | t1=time.time()
77 | r=f(*args,**kw)
78 | t2=time.time()
79 |             t=(t2-t1)*1000 if unit=='ms' else (t2-t1) #still unsure about this part
80 | print'call %s()in %f%s'%(f.__name__,t,unit)
81 | return r
82 | return fn
83 | return per_decorator
84 |
85 | @performance('ms')
86 | def factorial(n):
87 | return reduce(lambda x,y: x*y, range(1, n+1))
88 |
89 | print factorial(10)
90 |
91 | #a partial function saves you from passing the same default arguments again and again; functools.partial is the standard way to build one
92 | import functools
93 |
94 | sorted_ignore_case = functools.partial(sorted,cmp=lambda s1,s2:cmp(s1.upper(),s2.upper()))
95 |
96 | print sorted_ignore_case(['bob', 'about', 'Zoo', 'Credit'])
97 |
98 | #to use Python 3.x behaviour in 2.7, import it from __future__
99 | print 10/3
100 |
101 |
102 | print 10 / 3
103 | print 10 // 3
104 | #from Python 3.0 on, the u prefix is no longer needed for unicode strings
105 |
106 | s = 'am I an unicode?'
107 | print isinstance(s, unicode)
--------------------------------------------------------------------------------
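
One wrinkle the decorators above leave open: the wrapper replaces the decorated function's __name__ (factorial shows up as fn or wrapper). A sketch of the usual fix with functools.wraps, in the same Python 2 style as the file:

    import time
    import functools

    def performance(f):
        @functools.wraps(f)          # keep f.__name__ and docstring on the wrapper
        def fn(*args, **kw):
            t1 = time.time()
            r = f(*args, **kw)
            t2 = time.time()
            print 'call %s() in %fs' % (f.__name__, t2 - t1)
            return r
        return fn
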
/pythonbasic/dictionary.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import jieba, os
3 | import codecs
4 | from gensim import corpora, models, similarities
5 | from pprint import pprint
6 | from collections import defaultdict
7 | import sys
8 | import pickle
9 |
10 | reload(sys)
11 | sys.setdefaultencoding('utf-8')
12 |
13 |
14 | def print_dict(dict):
15 | for key in dict:
16 | print type(key), key, str(dict[key]),
17 | print
18 |
19 |
20 | def test3():
21 |     '''
22 |     Learning gensim's Dictionary
23 |     '''
24 | a = [['一','一','二'],['一','二','三']]
25 | b = ['一','一','三','四','四']
26 | dictionary = corpora.Dictionary(a)
27 |     print "########dictionary info##########"
28 |     print str(dictionary) #
29 |     print "dict, {word id: number of documents it appears in}"
30 |     print dictionary.dfs #dict, {word id: number of documents it appears in}
31 |     print "number of documents"
32 |     print dictionary.num_docs #number of documents
33 |     print "dictionary.items()"
34 |     print_dict(dict(dictionary.items())) #
35 |     print "dict, {word id: word}"
36 |     print_dict(dictionary.id2token) #dict, {word id: word}
37 |     print "dict, {word: word id}"
38 |     print_dict(dictionary.token2id) #dict, {word: word id}
39 |     print "total number of processed words"
40 |     print dictionary.num_pos #total number of processed words
41 |     print "sum over documents of the number of distinct words per document"
42 |     print dictionary.num_nnz #sum over documents of the number of distinct words per document
43 |     print "########doc2bow##########"
44 |     #dictionary.add_documents([b])
45 |     #allow_update -> update the current dictionary; return_missing -> also return the words missing from the dictionary
46 |     #result is the bag of words for document b, a list of (word id, count) pairs
47 |     result, missing = dictionary.doc2bow(b, allow_update=False, return_missing=True)
48 |     print "bag of words for b, list of (word id, count)"
49 |     print result
50 |     print "words not in the dictionary and their counts, dict {word: count}"
51 |     print_dict(missing)
52 |     print "########bow info##########"
53 |     for id, freq in result:
54 |         print id, dictionary.id2token[id], freq
55 |     print "########dictionary info##########"
56 |     #keep tokens that appear in at least no_below documents and in at most no_above*num_docs documents, then keep the keep_n most frequent
57 |     dictionary.filter_extremes(no_below=1, no_above=0.5, keep_n=10)
58 |
59 | return
60 |
61 | test3()
62 |
63 |
64 |
--------------------------------------------------------------------------------
/pythonbasic/huatu.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import pandas as pd
3 | import numpy as np
4 |
5 |
6 | fig=plt.figure()
7 | ax=fig.add_subplot(111)
8 | for i in range(1,6):
9 | for j in range(1,6):
10 | ax.scatter(i,j)
11 |
12 | ax.plot([0,5],[0,5],'k',lw=4)
13 | plt.show()
14 |
15 | dates=pd.date_range('2/17/2017',periods=1000)
16 | nd=pd.DataFrame(np.random.randn(1000,4),index=dates,columns=['a','b','c','d'])
17 | print nd
18 | nd=nd.cumsum()
19 | plt.figure()
20 | nd.plot()
21 | plt.show()
22 |
23 | t=np.arange(0.0,5.0,0.01)
24 | s=np.cos(2*np.pi*t)
25 | line,=plt.plot(t,s,lw=2)
26 | plt.annotate('local max',xy=(2,1),xytext=(3,1.5),arrowprops=dict(facecolor='black',shrink=0.05))
27 | plt.ylim(-2,2)
28 |
29 | plt.show()
--------------------------------------------------------------------------------
/pythonbasic/ossys.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | os.system('ping www.baidu.com')
4 |
--------------------------------------------------------------------------------
/pythonbasic/pandasql.py:
--------------------------------------------------------------------------------
1 |
2 | from pandasql import sqldf, load_meat, load_births
3 | from pandasql import *
4 |
5 |
6 | pysqldf = lambda q: sqldf(q, globals())
7 | meat = load_meat()
8 | births = load_births()
9 | print pysqldf("SELECT * FROM meat LIMIT 10;").head()
--------------------------------------------------------------------------------
/pythonbasic/pythonbasic.py:
--------------------------------------------------------------------------------
1 | #-- coding:utf-8 --#
2 |
3 | # automatic list generation and slicing
4 |
5 | l=range(10)
6 | print l
7 | l2=l[:-9-1:-1]
8 | print l2
9 | l1=l[9:2:-2] # the third slice parameter is the step
10 | print l1
11 |
12 | print '\\' # double-quoted strings behave the same, nothing changes
13 | print r'\\' # raw strings have no escape sequences
14 | print u'\\' # u gives a unicode string; plain literals are byte (ASCII) strings
15 | print '\n'
16 | print '\t'
17 |
18 | a = 'python'
19 | print 'hello,', a and 'world' # for a and b: if a is falsy the whole expression must be falsy, so a is returned; if a is truthy the result depends on b, so b is returned
20 | b = ''
21 | print 'hello,', b or 'world' # for a or b: if a is truthy the whole expression must be truthy, so a is returned; if a is falsy the result depends on b, so b is returned
22 |
23 | c=['a','b','c']
24 | for i,j in enumerate(c):
25 | print i,'-',j
26 | for i ,j in zip(range(1,len(c)+1),c):
27 | print i ,'-',j
28 |
29 | print filter(lambda s:s and len(s.strip())>0, ['test', None, '', 'str', ' ', 'END']) # before the lambda's colon are its parameters; the expression after the colon is its return value
30 | #print type(None)
31 | # the map function in Python
32 | def format_name(s):
33 | return s[0].upper()+s[1:].lower()
34 |
35 | print map(format_name, ['adam', 'LISA', 'barT']) # the mapped function returns a new element for each input
36 | # the filter function in Python
37 | import math
38 | def is_sqr(x):
39 | return x and math.sqrt(x)%1==0
40 | print is_sqr(100)
41 | print filter(is_sqr, range(1, 101)) # the predicate returns a boolean-like value
42 | # the reduce function in Python
43 | def prod(x, y):
44 | return x*y
45 |
46 | print reduce(prod, [2, 4, 5, 7, 12])
47 |
48 |
49 | str="abcd"
50 | l=list(str)
51 | print l
52 |
53 | a="a"
54 | b="b"
55 | #print str(a) raises an error because str was rebound to "abcd" above and is no longer callable
56 | c=100
57 | a=12
58 | print chr(a)
59 | a='10'
60 | print int(a)
61 | #print a+b+chr(c)
62 |
63 | #print int(a)+int(b)+c raises an error because b is "b", which cannot be converted to int
64 | i=int(raw_input('Enter i: '))
65 | l=range(i)
66 | for i in range(i):
67 | l[i]=i+2
68 |
69 | print l
70 | fo=open('../data/people.txt')
71 | print type(fo)
72 | for i in fo:
73 | i.strip(',') # strip returns a new string with the given characters removed from both ends; the result is discarded here
74 | i=i.split(',')
75 | print type(i)
76 | for i in i:
77 | print i
78 |
79 |
80 |
81 | def temp_convert(var):
82 | try:
83 | return int(var)
84 | except ValueError, Argument:
85 | print "参数没有包含数字\n", Argument
86 |
87 | # call the function
88 | temp_convert("xyz")
89 |
90 | a=[[1,2,3],[4,5,6]]
91 | print type(a)
92 |
93 | dataSet = [[1, 1, 'yes'],
94 | [1, 1, 'yes'],
95 | [1, 0, 'no'],
96 | [0, 1, 'no'],
97 | [0, 1, 'no']]
98 | print type(dataSet)
99 |
100 | print range(1,5)
101 | print range(6,10)
102 | print range(1,5)+range(6,10) # adding lists does not add elements pairwise; it concatenates them into one longer list
103 |
104 | i='####口疮####飞滋####口炎疮####复发性口疮####复发性'
105 | i=i.strip('####')
106 | print i
107 | j='###我们###你们'
108 | j=j.strip('####')
109 | print j
110 |
111 | l=[2,5,4,3]
112 | l.sort()
113 | print l
114 | l=[2,5,4,3]
115 | sorted(l) # sorted() returns a new list and leaves l unchanged
116 | print l
117 | m={'a':1,'b':2}
118 | print m['a']
119 | #print m[2] raises KeyError: a dict is looked up by key, not by position, and 2 is not a key here
120 |
121 | l=[2,5,4,3]
122 | print l.index(2)
123 | import numpy.linalg as la
124 | import numpy as np
125 | a=np.array([1,2,3])
126 | b=np.array([2,3,5])
127 | print b-a
128 | print la.norm(b-a)
129 | import math
130 |
131 | print math.sqrt(6)
132 |
133 | l=[1,2,3]
134 | print l[-1]
135 | str='你好吗?\n' \
136 | '我很好' \
137 | ' ' \
138 | '你好吗?'
139 |
140 | if '吗' in str or '呀' in str or '?' in str: # each substring must be tested with "in" separately
141 | print str
142 |
143 | import time
144 | a = "2016-04-19 17:37:01"
145 | c="2016-04-19 17:37:46"
146 | b=time.mktime(time.strptime(a,'%Y-%m-%d %H:%M:%S'))
147 | d=time.mktime(time.strptime(c,'%Y-%m-%d %H:%M:%S'))
148 | print b
149 | print d
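# Small follow-up (not in the original): the difference of the two timestamps
# is the elapsed time in seconds.
print d - b   # -> 45.0, the number of seconds between the two datetimes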
150 |
151 | import numpy as np
152 | a=np.array(([1,2,3,4]))
153 | print np.median(a) # median
154 | print np.percentile(a,75) # 75th percentile
155 |
156 | a='a我'
157 | print len(a)
158 |
159 | str='我很开心'
160 | if '?' in str:
161 | print str
162 |
163 | for i in range(6):
164 | print i
165 | if i==3:
166 | i=i+2
167 | print i
168 |
169 |
--------------------------------------------------------------------------------
/pythonbasic/randomtest.py:
--------------------------------------------------------------------------------
1 | #-*- coding:utf-8 -*-
2 | import numpy
3 | import random
4 | import pandas as pd
5 |
6 | print numpy.random.randn(6,5) # matrix of samples from the standard normal distribution (mean 0)
7 | print numpy.random.rand(3,4) # matrix of samples from the uniform distribution on [0, 1)
8 |
9 | print random.random()
10 |
11 | print random.uniform(10, 20)
12 |
13 | print random.randint(12, 20)
14 |
15 | print random.choice('abcdefg%^*f')
16 |
17 | print random.sample('abcdefghij',3) # pick a given number of distinct characters
18 |
19 | import string
20 |
21 | print string.join(random.sample(['a','b','c','d','e','f','g','h','i','j'], 3)).replace(" ","")
22 |
23 | print random.choice ( ['apple', 'pear', 'peach', 'orange', 'lemon'] )
24 |
25 | items = [1, 2, 3, 4, 5, 6]
26 | random.shuffle(items)
27 | print items
28 |
--------------------------------------------------------------------------------
/pythondata/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hailiang-wang/SmartQA-System/db659597eea7ad5759e183d8affa9053e4076a34/pythondata/__init__.py
--------------------------------------------------------------------------------
/pythondata/datasets.py:
--------------------------------------------------------------------------------
1 | from sklearn.datasets import load_digits
2 | import matplotlib.pyplot as plt
3 | import numpy as np
4 |
5 | digits=load_digits()
6 | print digits.data.shape
7 | #print digits
8 | #digits.reshape()
9 | np.savetxt("filename.txt",digits.data) # save the pixel matrix; the Bunch object itself cannot be written directly
10 |
11 | plt.gray()
12 | plt.matshow(digits.images[0])
13 | plt.show()
14 |
15 |
16 |
--------------------------------------------------------------------------------
/sparkml/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hailiang-wang/SmartQA-System/db659597eea7ad5759e183d8affa9053e4076a34/sparkml/__init__.py
--------------------------------------------------------------------------------
/sparkml/mlkmeans.py:
--------------------------------------------------------------------------------
1 | #-- coding:utf-8 --#
2 | from pyspark.ml.clustering import KMeans
3 | from pyspark.ml.linalg import Vectors
4 | from pyspark.sql import SparkSession
5 | from pyspark.sql import Row
6 | import pandas as pd
7 | spark=SparkSession \
8 | .builder \
9 | .appName("kmeans") \
10 | .getOrCreate()
11 |
12 | inputfile='../data/sampling.xlsx'
13 | outputfile='../data/outputfile.xlsx'
14 | data=pd.read_excel(inputfile)
15 |
16 | print spark.createDataFrame(data).collect() # Spark can build a DataFrame from a pandas.DataFrame; df.toPandas() converts back
17 | # a DataFrame can also be created from a list of tuples or from dictionaries
18 |
19 | data=spark.read.format('csv').load('../data/sampling.csv') # read the file directly, as an alternative to createDataFrame
20 | data.show()
21 | #dataset=spark.read.format('libsvm').load('../data/sample_kmeans_data.txt')
22 | data=spark.sparkContext.textFile('../data/sampling.csv')
23 | parts=data.map(lambda l: l.split(","))
24 | dataset = parts.map(lambda p: Row(label=p[0], features=Vectors.dense([int(p[1]),int(p[2]),int(p[3]),int(p[4]),int(p[5]),int(p[6]),int(p[7]),int(p[8]),int(p[9]),int(p[10]),int(p[11]),int(p[12]),int(p[13]),int(p[14]),int(p[15]),int(p[16]),int(p[17]),int(p[18]),int(p[19]),int(p[20]),int(p[21]),int(p[22]),int(p[23]),int(p[24]),int(p[25])]))) # before the lambda's colon are its parameters; the expression after the colon is its return value
25 | # an RDD is a collection of arbitrary objects; for the DataFrame the feature vector could simply be built as Vectors.dense(p[1:])
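# A more compact alternative to the hand-written mapping above (sketch only,
# not used downstream): build the feature vector from every column after the
# label in one slice instead of listing the 25 indices explicitly.
dataset_alt = parts.map(lambda p: Row(label=p[0],
                                      features=Vectors.dense([float(x) for x in p[1:]])))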
26 |
27 | # Infer the schema, and register the DataFrame as a table.
28 | #schemaPeople = spark.createDataFrame(people) # one of the two ways to produce the DataFrame structure
29 | datasets=spark.createDataFrame(dataset)
30 |
31 | datasets.show()
32 |
33 | # Trains a k-means model.
34 | kmeans = KMeans().setK(2).setSeed(1)
35 | model = kmeans.fit(datasets)
36 |
37 | # Evaluate clustering by computing Within Set Sum of Squared Errors.
38 | wssse = model.computeCost(datasets)
39 | print("Within Set Sum of Squared Errors = " + str(wssse))
40 |
41 | # Shows the result.
42 | centers = model.clusterCenters()
43 |
44 | df=pd.DataFrame(centers)
45 | print df.dtypes
46 | df.to_excel(outputfile)
47 |
48 | print("Cluster Centers: ")
49 | for center in centers:
50 | print(center)
51 |
52 |
--------------------------------------------------------------------------------
/sparkml/sparkSession.py:
--------------------------------------------------------------------------------
1 | #-- coding:utf-8 --#
2 | from pyspark.sql import SparkSession
3 | from pyspark.sql import Row
4 | from pyspark import SparkContext
5 | spark=SparkSession \
6 | .builder \
7 | .appName("sparkSession") \
8 | .getOrCreate()
9 | #.config("spark.some.config.option", "some-value") # optional; the defaults are used when it is not set
10 | df=spark.read.json('../data/people.json')
11 |
12 | df.show()
13 | print '-------'
14 | df.take(1) # take(1) returns the first row as a list of Row objects; it does not print anything by itself
15 | print '--------'
16 | df.printSchema()
17 | df.select(df['name'],df['age']).show()
18 | #df.select(df['name'], df['age'] ).show()
19 |
20 | df.filter(df['age']>20).show()
21 | df.groupby(df['age']).count().show()
22 |
23 | # to run SQL, a view must first be registered with the function below
24 | df.createOrReplaceTempView('people')
25 | sqldf=spark.sql('select * from people')
26 | sqldf.show()
27 | # create a global temporary view
28 |
29 | # df.createGlobalTempView('people')
30 | # spark.sql("select * from global_temp.people").show()
31 | # the earlier error came from the misspelled database name: global temporary views live in the `global_temp` database
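# Working sketch of the global temporary view (uses the `df` loaded above;
# the view name is illustrative): global temp views are registered under the
# `global_temp` database and must be queried with that prefix.
df.createGlobalTempView("people_global")
spark.sql("SELECT * FROM global_temp.people_global").show()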
32 |
33 | # interaction between DataFrame and RDD
34 | # the RDD is still a data structure that has to be learned
35 | #sc=SparkContext(appName="rddtest") # creating a separate SparkContext like this is no longer the preferred approach
36 | sc=spark.sparkContext
37 | lines=sc.textFile('../data/people.txt')
38 | parts=lines.map(lambda l: l.split(","))
39 | people = parts.map(lambda p: Row(name=p[0], age=int(p[1]))) # before the lambda's colon are its parameters; the expression after the colon is its return value
40 | # an RDD is a collection of arbitrary objects, while a DataFrame is a table of Rows with a schema
41 |
42 | # Infer the schema, and register the DataFrame as a table.
43 | #schemaPeople = spark.createDataFrame(people) # one of the two ways to produce the DataFrame structure
44 | schemaPeople=people.toDF()
45 | schemaPeople.createOrReplaceTempView("people")
46 |
47 | # SQL can be run over DataFrames that have been registered as a table.
48 | teenagers = spark.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19")
49 |
50 | # The results of SQL queries are Dataframe objects.
51 | # rdd returns the content as an :class:`pyspark.RDD` of :class:`Row`.
52 | teenNames = teenagers.rdd.map(lambda p: "Name: " + p.name).collect()
53 | for name in teenNames:
54 | print(name)
55 |
56 |
57 | # can a DataFrame be produced from an ndarray, or converted from a pandas DataFrame/Series?
58 | # the data-input question for Spark algorithms still has to be solved
59 |
60 | df = spark.read.load("../data/users.parquet")
61 | #df.select("name", "favorite_color").write.save("namesAndFavColors.parquet")
62 | df.show()
63 |
64 | peopleDF = spark.read.json("../data/people.json")
65 |
66 | # DataFrames can be saved as Parquet files, maintaining the schema information.
67 | peopleDF.write.parquet("../data/people.parquet")
68 |
69 | # Read in the Parquet file created above.
70 | # Parquet files are self-describing so the schema is preserved.
71 | # The result of loading a parquet file is also a DataFrame.
72 | parquetFile = spark.read.parquet("../data/people.parquet")
73 |
74 | # Parquet files can also be used to create a temporary view and then used in SQL statements.
75 | parquetFile.createOrReplaceTempView("parquetFile")
76 | teenagers = spark.sql("SELECT name FROM parquetFile WHERE age >= 13 AND age <= 19")
77 |
--------------------------------------------------------------------------------
/sparkml/sparkio.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hailiang-wang/SmartQA-System/db659597eea7ad5759e183d8affa9053e4076a34/sparkml/sparkio.py
--------------------------------------------------------------------------------
/sparkml/sparklda.py:
--------------------------------------------------------------------------------
1 | #--coding:utf-8 --#
2 | from pyspark.ml.clustering import LDA
3 | from pyspark.sql import SparkSession
4 | spark=SparkSession \
5 | .builder \
6 | .appName('sparklda') \
7 | .getOrCreate()
8 |
9 | # libsvm is a sparse text format: each line is "label index:value index:value ...", e.g. "0 1:0.5 3:2.0"
10 | # a plain txt file has no such structure, which is why spark.read.format("libsvm") is used here
11 | # Loads data.
12 | dataset = spark.read.format("libsvm").load("../data/sample_lda_libsvm_data.txt")
13 |
14 | # Trains a LDA model.
15 | lda = LDA(k=10, maxIter=10)
16 | model = lda.fit(dataset)
17 |
18 | ll = model.logLikelihood(dataset)
19 | lp = model.logPerplexity(dataset)
20 | print("The lower bound on the log likelihood of the entire corpus: " + str(ll))
21 | print("The upper bound bound on perplexity: " + str(lp))
22 |
23 | # Describe topics.
24 | topics = model.describeTopics(3)
25 | print("The topics described by their top-weighted terms:")
26 | topics.show(truncate=False)
27 |
28 | # Shows the result
29 | transformed = model.transform(dataset)
30 | transformed.show(truncate=False)
--------------------------------------------------------------------------------
/sparkml/sparklr.py:
--------------------------------------------------------------------------------
1 | #-- coding:utf-8 --#
2 | from pyspark.ml.linalg import Vectors
3 | from pyspark.ml.classification import LogisticRegression
4 | from pyspark.sql import SparkSession
5 |
6 | spark=SparkSession \
7 | .builder \
8 | .appName('lr') \
9 | .getOrCreate()
10 |
11 | # Prepare training data from a list of (label, features) tuples.
12 | training = spark.createDataFrame([
13 | (1.0, Vectors.dense([0.0, 1.1, 0.1])),
14 | (0.0, Vectors.dense([2.0, 1.0, -1.0])),
15 | (0.0, Vectors.dense([2.0, 1.3, 1.0])),
16 | (1.0, Vectors.dense([0.0, 1.2, -0.5]))], ["label", "features"])
17 | # DataFrame-based Spark ML algorithms expect "features" and "label" columns
18 | # different algorithms may expect different column names
19 |
20 | # Create a LogisticRegression instance. This instance is an Estimator.
21 | lr = LogisticRegression(maxIter=10, regParam=0.01)
22 | # Print out the parameters, documentation, and any default values.
23 | print("LogisticRegression parameters:\n" + lr.explainParams() + "\n")
24 |
25 | # Learn a LogisticRegression model. This uses the parameters stored in lr.
26 | model1 = lr.fit(training)
27 |
28 | # Since model1 is a Model (i.e., a transformer produced by an Estimator),
29 | # we can view the parameters it used during fit().
30 | # This prints the parameter (name: value) pairs, where names are unique IDs for this
31 | # LogisticRegression instance.
32 | print("Model 1 was fit using parameters: ")
33 | print(model1.extractParamMap())
34 |
35 | # We may alternatively specify parameters using a Python dictionary as a paramMap
36 | paramMap = {lr.maxIter: 20}
37 | paramMap[lr.maxIter] = 30 # Specify 1 Param, overwriting the original maxIter.
38 | paramMap.update({lr.regParam: 0.1, lr.threshold: 0.55}) # Specify multiple Params.
39 |
40 | # You can combine paramMaps, which are python dictionaries.
41 | paramMap2 = {lr.probabilityCol: "myProbability"} # Change output column name
42 | paramMapCombined = paramMap.copy()
43 | paramMapCombined.update(paramMap2)
44 |
45 | # Now learn a new model using the paramMapCombined parameters.
46 | # paramMapCombined overrides all parameters set earlier via lr.set* methods.
47 | model2 = lr.fit(training, paramMapCombined)
48 | print("Model 2 was fit using parameters: ")
49 | print(model2.extractParamMap())
50 |
51 | # Prepare test data
52 | test = spark.createDataFrame([
53 | (1.0, Vectors.dense([-1.0, 1.5, 1.3])),
54 | (0.0, Vectors.dense([3.0, 2.0, -0.1])),
55 | (1.0, Vectors.dense([0.0, 2.2, -1.5]))], ["label", "features"])
56 |
57 | # Make predictions on test data using the Transformer.transform() method.
58 | # LogisticRegression.transform will only use the 'features' column.
59 | # Note that model2.transform() outputs a "myProbability" column instead of the usual
60 | # 'probability' column since we renamed the lr.probabilityCol parameter previously.
61 | prediction = model2.transform(test)
62 | result = prediction.select("features", "label", "myProbability", "prediction") \
63 | .collect()
64 |
65 | for row in result:
66 | print("features=%s, label=%s -> prob=%s, prediction=%s"
67 | % (row.features, row.label, row.myProbability, row.prediction))
--------------------------------------------------------------------------------
/sparkml/sparkpipline.py:
--------------------------------------------------------------------------------
1 | #-- coding:utf-8 --#
2 | from pyspark.ml import Pipeline
3 | from pyspark.ml.classification import LogisticRegression
4 | from pyspark.ml.feature import HashingTF, Tokenizer
5 | from pyspark.sql import SparkSession
6 |
7 | spark=SparkSession \
8 | .builder \
9 | .appName('pipline') \
10 | .getOrCreate()
11 |
12 | # Prepare training documents from a list of (id, text, label) tuples.
13 | # a list of tuples
14 | # SparkSession itself provides the sqlContext functionality
15 | training = spark.createDataFrame([
16 | (0, "a b c d e spark", 1.0),
17 | (1, "b d", 0.0),
18 | (2, "spark f g h", 1.0),
19 | (3, "hadoop mapreduce", 0.0)
20 | ], ["id", "text", "label"])
21 |
22 | # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
23 | tokenizer = Tokenizer(inputCol="text", outputCol="words")
24 | tokenizerdata=tokenizer.transform(training)
25 | tokenizerdata.show()
26 | hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
27 | hashingTFdata=hashingTF.transform(tokenizerdata)
28 | hashingTFdata.select('features').show()
29 | lr = LogisticRegression(maxIter=10, regParam=0.001)
30 | lr.fit(hashingTFdata)
31 | pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
32 |
33 | # Fit the pipeline to training documents.
34 | model = pipeline.fit(training)
35 | # the estimator stage also relies on the expected DataFrame column layout
36 | # Prepare test documents, which are unlabeled (id, text) tuples.
37 | test = spark.createDataFrame([
38 | (4, "spark i j k"),
39 | (5, "l m n"),
40 | (6, "spark hadoop spark"),
41 | (7, "apache hadoop")
42 | ], ["id", "text"])
43 |
44 | # Make predictions on test documents and print columns of interest.
45 | prediction = model.transform(test)
46 | print prediction
47 | selected = prediction.select("id", "text", "probability", "prediction")
48 | selected.show()
49 | print selected.collect() # output the DataFrame as a list of Row objects
50 |
51 | for row in selected.collect():
52 | print row
53 | rid, text, prob, prediction = row
54 | print("(%d, %s) --> prob=%s, prediction=%f" % (rid, text, str(prob), prediction))
55 |
56 |
57 | # the spark.read / DataFrame API deserves a closer look
58 | # question 1: why does writing the results raise an error?
59 | # question 2: how to convert a DataFrame into the structure the algorithm needs (the label and features column names can be configured)
60 | # an RDD map followed by Row(...) is one way to assign the label and features fields
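# Sketch answering question 2 above (shown in comments; `raw_df` and the column
# names are hypothetical, not part of this project): VectorAssembler builds the
# "features" column an estimator expects from plain numeric columns.
#
#   from pyspark.ml.feature import VectorAssembler
#   assembler = VectorAssembler(inputCols=["x1", "x2"], outputCol="features")
#   algo_ready = assembler.transform(raw_df).select("label", "features")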
--------------------------------------------------------------------------------
/sparkml/sparkpipline2.py:
--------------------------------------------------------------------------------
1 | #-- coding:utf-8 --#
2 | from pyspark.ml import Pipeline
3 | from pyspark.ml.classification import LogisticRegression
4 | from pyspark.ml.feature import HashingTF, Tokenizer
5 | from pyspark.sql import SparkSession
6 |
7 | spark=SparkSession \
8 | .builder \
9 | .appName('pipline') \
10 | .getOrCreate()
11 |
12 | # Prepare training documents from a list of (id, text, label) tuples.
13 | # a list of tuples
14 | training = spark.createDataFrame([
15 | (0, "a b c d e spark", 1.0),
16 | (1, "b d", 0.0),
17 | (2, "spark f g h", 1.0),
18 | (3, "hadoop mapreduce", 0.0)
19 | ], ["id", "text", "label"])
20 |
21 | # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
22 | tokenizer = Tokenizer(inputCol="text", outputCol="words")
23 | hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
24 | lr = LogisticRegression(maxIter=10, regParam=0.001)
25 | pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
26 |
27 | # Fit the pipeline to training documents.
28 | model = pipeline.fit(training)
29 |
30 | # Prepare test documents, which are unlabeled (id, text) tuples.
31 | test = spark.createDataFrame([
32 | (4, "spark i j k"),
33 | (5, "l m n"),
34 | (6, "spark hadoop spark"),
35 | (7, "apache hadoop")
36 | ], ["id", "text"])
37 |
38 | # Make predictions on test documents and print columns of interest.
39 | prediction = model.transform(test)
40 | print prediction
41 | selected = prediction.select("id", "text", "probability", "prediction")
42 | selected.show()
43 | print selected.collect() # output the DataFrame as a list of Row objects
44 |
45 | for row in selected.collect():
46 | print row
47 | rid, text, prob, prediction = row
48 | print("(%d, %s) --> prob=%s, prediction=%f" % (rid, text, str(prob), prediction))
49 |
50 |
51 | # the spark.read / DataFrame API deserves a closer look
--------------------------------------------------------------------------------
/sparkml/sparktf-itf.py:
--------------------------------------------------------------------------------
1 | #-- coding:utf-8 --#
2 | from pyspark.ml.feature import HashingTF, IDF, Tokenizer
3 | from pyspark.sql import SparkSession
4 |
5 | spark=SparkSession \
6 | .builder \
7 | .appName('sparktfitf') \
8 | .getOrCreate()
9 |
10 | sentenceData = spark.createDataFrame([
11 | (0.0, "Hi I heard about Spark"),
12 | (0.0, "I wish Java could use case classes"),
13 | (1.0, "Logistic regression models are neat")
14 | ], ["label", "sentence"])
15 |
16 | tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
17 | wordsData = tokenizer.transform(sentenceData)
18 | wordsData.show()
19 |
20 | hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
21 | featurizedData = hashingTF.transform(wordsData)
22 | featurizedData.show()
23 | # alternatively, CountVectorizer can also be used to get term frequency vectors
24 |
25 | idf = IDF(inputCol="rawFeatures", outputCol="features")
26 | idfModel = idf.fit(featurizedData) # IDF has to be fit on the data; the other stages are pure transformations
27 | rescaledData = idfModel.transform(featurizedData)
28 |
29 | rescaledData.select("label", "features").show()
--------------------------------------------------------------------------------
/sparkml/tokenizer.py:
--------------------------------------------------------------------------------
1 | from pyspark.ml.feature import Tokenizer, RegexTokenizer
2 | from pyspark.sql.functions import col, udf
3 | from pyspark.sql.types import IntegerType
4 | from pyspark.sql import SparkSession
5 |
6 | spark=SparkSession \
7 | .builder \
8 | .appName('tokenizerchinese') \
9 | .getOrCreate()
10 | sentenceDataFrame = spark.createDataFrame([
11 | (0, "Hi I heard about Spark"),
12 | (1, "I wish Java could use case classes"),
13 | (2, "Logistic,regression,models,are,neat")
14 | ], ["id", "sentence"])
15 |
16 | tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
17 |
18 | regexTokenizer = RegexTokenizer(inputCol="sentence", outputCol="words", pattern="\\W")
19 | # alternatively, pattern="\\w+", gaps(False)
20 |
21 | countTokens = udf(lambda words: len(words), IntegerType())
22 |
23 | tokenized = tokenizer.transform(sentenceDataFrame)
24 | tokenized.select("sentence", "words")\
25 | .withColumn("tokens", countTokens(col("words"))).show(truncate=False)
26 |
27 | regexTokenized = regexTokenizer.transform(sentenceDataFrame)
28 | regexTokenized.select("sentence", "words") \
29 | .withColumn("tokens", countTokens(col("words"))).show(truncate=False)
--------------------------------------------------------------------------------
/text_analyse2/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hailiang-wang/SmartQA-System/db659597eea7ad5759e183d8affa9053e4076a34/text_analyse2/__init__.py
--------------------------------------------------------------------------------
/text_analyse2/extract.txt:
--------------------------------------------------------------------------------
1 | 圣诞消费旺季即将到来,不得不推迟出货
--------------------------------------------------------------------------------
/text_analyse2/jiebatest.py:
--------------------------------------------------------------------------------
1 | #-*- coding:utf-8 -*-
2 |
3 |
4 | # jieba word segmentation
5 | import jieba
6 | import jieba.analyse
7 |
8 | seg_list = jieba.cut("我来到北京清华大学",cut_all=True)
9 | print "Full Mode:", "/ ".join(seg_list) #全模式
10 |
11 | seg_list = jieba.cut("我来到北京清华大学",cut_all=False)
12 | print "Default Mode:", "/ ".join(seg_list) #精确模式
13 |
14 | seg_list = jieba.cut("他来到了网易杭研大厦") # accurate mode is the default
15 | print ", ".join(seg_list)
16 |
17 | seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") # search-engine mode
18 | print ", ".join(seg_list)
19 |
20 |
21 | w= open('result.txt','w')
22 |
23 | s= '圣诞消费旺季即将到来,不得不推迟出货'
24 | content=open('extract.txt').read()
25 |
26 | seglist = list(jieba.cut(s,cut_all=False))
27 | print ",".join(seglist)
28 | for i in seglist:
29 | w.write(i.encode('utf-8')) # or w.write(i.encode('gbk'))
30 | w.write(',')
31 | w.close()
32 |
33 | # add a custom user dictionary to jieba
34 | #jieba.load_userdict(file_name) # file_name is the path to the custom dictionary
35 |
36 | import sys
37 | sys.path.append("../")
38 | import jieba
39 | jieba.load_userdict("userdict.txt")
40 | import jieba.posseg as pseg
41 |
42 | test_sent = "李小福是创新办主任也是云计算方面的专家;"
43 | test_sent += "例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类型"
44 | words = jieba.cut(test_sent)
45 | print type(words)
46 |
47 | for w in words:
48 | print w
49 |
50 |
51 | result = pseg.cut(test_sent)
52 |
53 | for w in result:
54 | print w.word, "/", w.flag, ", ",
55 |
56 |
57 | print "\n========"
58 |
59 | terms = jieba.cut('easy_install is great')
60 | for t in terms:
61 | print t
62 | print '-------------------------'
63 | terms = jieba.cut('python 的正则表达式是好用的')
64 | for t in terms:
65 | print t
66 |
67 | import jieba.analyse
68 | strx = '网络让我们之间的距离变的如此之近,也同时让我们变的如此遥远。世界上最远的距离不是南极到北极,也不是喜马拉雅之巅到马里亚纳之渊;而是相对而坐,却各自忙着刷手机。暂别网络世界,去和爱人道一句早安,去和朋友聊一夜往事,去和家人吃一顿饭,其实也是挺好的'
69 | s= '结巴分词是一个Python下的中文分词组件'
70 | rt = jieba.analyse.extract_tags(strx,5)
71 | print jieba.analyse.extract_tags(s,2) # this prints as a list, the same situation as before: it is not an encoding problem, the keywords just show up escaped because they are inside a list
72 | for r in rt:
73 | print r
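# Follow-up sketch (not in the original file): extract_tags can also return the
# weight of each keyword, which sidesteps the list-printing issue noted above.
for word, weight in jieba.analyse.extract_tags(strx, topK=5, withWeight=True):
    print word, weight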
--------------------------------------------------------------------------------
/text_analyse2/userdict.txt:
--------------------------------------------------------------------------------
1 | 李小福 2 nr
2 | 创新办 3 i
3 | easy_install 3 eng
4 | 好用 300
5 | 韩玉赏鉴 3 nz
--------------------------------------------------------------------------------