├── .idea ├── kmeans.iml ├── misc.xml ├── modules.xml └── vcs.xml ├── data ├── Folds5x2_pp.xlsx ├── all_musicers.xlsx ├── drugName_dic.txt ├── full_user.avsc ├── kv1.txt ├── ndy.txt ├── outputfile.xlsx ├── people.json ├── people.txt ├── sample_kmeans_data.txt ├── sample_lda_data.txt ├── sample_lda_libsvm_data.txt ├── sampling.csv ├── sampling.txt ├── sampling.xlsx ├── test.xlsx ├── text.xlsx ├── user.avsc ├── users.avro └── users.parquet ├── pythonadvance ├── Gensimtest.py ├── Regression_analyse.py ├── StatsModelstest.py ├── __init__.py ├── cipin.py ├── clustermeric.py ├── dxckeras.py ├── dxcprec.py ├── dxcsvm.py ├── full_linearRession.py ├── nltktest.py ├── numpytest.py ├── pandatest.py ├── pymysqltest.py ├── regular.py ├── scipytest.py ├── sklearnkmeans.py ├── yaofang_fenxi_text.py └── yichangtest.py ├── pythonbasic ├── classtest.py ├── decoratortest.py ├── dictionary.py ├── huatu.py ├── ossys.py ├── pandasql.py ├── pythonbasic.py └── randomtest.py ├── pythondata ├── __init__.py └── datasets.py ├── sparkml ├── __init__.py ├── mlkmeans.py ├── sparkSession.py ├── sparkio.py ├── sparklda.py ├── sparklr.py ├── sparkpipline.py ├── sparkpipline2.py ├── sparktf-itf.py └── tokenizer.py └── text_analyse2 ├── __init__.py ├── extract.txt ├── jiebatest.py └── userdict.txt /.idea/kmeans.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 11 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | AngularJS 14 | 15 | 16 | 17 | 18 | 19 | 20 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /data/Folds5x2_pp.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hailiang-wang/SmartQA-System/db659597eea7ad5759e183d8affa9053e4076a34/data/Folds5x2_pp.xlsx -------------------------------------------------------------------------------- /data/all_musicers.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hailiang-wang/SmartQA-System/db659597eea7ad5759e183d8affa9053e4076a34/data/all_musicers.xlsx -------------------------------------------------------------------------------- /data/full_user.avsc: -------------------------------------------------------------------------------- 1 | {"type": "record", "namespace": "example.avro", "name": "User", "fields": [{"type": "string", "name": "name"}, {"type": ["string", "null"], "name": "favorite_color"}, {"type": {"items": "int", "type": "array"}, "name": "favorite_numbers"}]} -------------------------------------------------------------------------------- /data/kv1.txt: -------------------------------------------------------------------------------- 1 | 238val_238 2 | 86val_86 3 | 311val_311 4 | 27val_27 5 | 165val_165 6 | 409val_409 7 | 255val_255 8 | 278val_278 9 | 98val_98 10 | 484val_484 11 | 265val_265 12 | 193val_193 13 | 401val_401 14 | 
150val_150 15 | 273val_273 16 | 224val_224 17 | 369val_369 18 | 66val_66 19 | 128val_128 20 | 213val_213 21 | 146val_146 22 | 406val_406 23 | 429val_429 24 | 374val_374 25 | 152val_152 26 | 469val_469 27 | 145val_145 28 | 495val_495 29 | 37val_37 30 | 327val_327 31 | 281val_281 32 | 277val_277 33 | 209val_209 34 | 15val_15 35 | 82val_82 36 | 403val_403 37 | 166val_166 38 | 417val_417 39 | 430val_430 40 | 252val_252 41 | 292val_292 42 | 219val_219 43 | 287val_287 44 | 153val_153 45 | 193val_193 46 | 338val_338 47 | 446val_446 48 | 459val_459 49 | 394val_394 50 | 237val_237 51 | 482val_482 52 | 174val_174 53 | 413val_413 54 | 494val_494 55 | 207val_207 56 | 199val_199 57 | 466val_466 58 | 208val_208 59 | 174val_174 60 | 399val_399 61 | 396val_396 62 | 247val_247 63 | 417val_417 64 | 489val_489 65 | 162val_162 66 | 377val_377 67 | 397val_397 68 | 309val_309 69 | 365val_365 70 | 266val_266 71 | 439val_439 72 | 342val_342 73 | 367val_367 74 | 325val_325 75 | 167val_167 76 | 195val_195 77 | 475val_475 78 | 17val_17 79 | 113val_113 80 | 155val_155 81 | 203val_203 82 | 339val_339 83 | 0val_0 84 | 455val_455 85 | 128val_128 86 | 311val_311 87 | 316val_316 88 | 57val_57 89 | 302val_302 90 | 205val_205 91 | 149val_149 92 | 438val_438 93 | 345val_345 94 | 129val_129 95 | 170val_170 96 | 20val_20 97 | 489val_489 98 | 157val_157 99 | 378val_378 100 | 221val_221 101 | 92val_92 102 | 111val_111 103 | 47val_47 104 | 72val_72 105 | 4val_4 106 | 280val_280 107 | 35val_35 108 | 427val_427 109 | 277val_277 110 | 208val_208 111 | 356val_356 112 | 399val_399 113 | 169val_169 114 | 382val_382 115 | 498val_498 116 | 125val_125 117 | 386val_386 118 | 437val_437 119 | 469val_469 120 | 192val_192 121 | 286val_286 122 | 187val_187 123 | 176val_176 124 | 54val_54 125 | 459val_459 126 | 51val_51 127 | 138val_138 128 | 103val_103 129 | 239val_239 130 | 213val_213 131 | 216val_216 132 | 430val_430 133 | 278val_278 134 | 176val_176 135 | 289val_289 136 | 221val_221 137 | 65val_65 138 | 318val_318 139 | 332val_332 140 | 311val_311 141 | 275val_275 142 | 137val_137 143 | 241val_241 144 | 83val_83 145 | 333val_333 146 | 180val_180 147 | 284val_284 148 | 12val_12 149 | 230val_230 150 | 181val_181 151 | 67val_67 152 | 260val_260 153 | 404val_404 154 | 384val_384 155 | 489val_489 156 | 353val_353 157 | 373val_373 158 | 272val_272 159 | 138val_138 160 | 217val_217 161 | 84val_84 162 | 348val_348 163 | 466val_466 164 | 58val_58 165 | 8val_8 166 | 411val_411 167 | 230val_230 168 | 208val_208 169 | 348val_348 170 | 24val_24 171 | 463val_463 172 | 431val_431 173 | 179val_179 174 | 172val_172 175 | 42val_42 176 | 129val_129 177 | 158val_158 178 | 119val_119 179 | 496val_496 180 | 0val_0 181 | 322val_322 182 | 197val_197 183 | 468val_468 184 | 393val_393 185 | 454val_454 186 | 100val_100 187 | 298val_298 188 | 199val_199 189 | 191val_191 190 | 418val_418 191 | 96val_96 192 | 26val_26 193 | 165val_165 194 | 327val_327 195 | 230val_230 196 | 205val_205 197 | 120val_120 198 | 131val_131 199 | 51val_51 200 | 404val_404 201 | 43val_43 202 | 436val_436 203 | 156val_156 204 | 469val_469 205 | 468val_468 206 | 308val_308 207 | 95val_95 208 | 196val_196 209 | 288val_288 210 | 481val_481 211 | 457val_457 212 | 98val_98 213 | 282val_282 214 | 197val_197 215 | 187val_187 216 | 318val_318 217 | 318val_318 218 | 409val_409 219 | 470val_470 220 | 137val_137 221 | 369val_369 222 | 316val_316 223 | 169val_169 224 | 413val_413 225 | 85val_85 226 | 77val_77 227 | 0val_0 228 | 490val_490 229 | 87val_87 230 | 364val_364 231 | 179val_179 232 | 118val_118 
233 | 134val_134 234 | 395val_395 235 | 282val_282 236 | 138val_138 237 | 238val_238 238 | 419val_419 239 | 15val_15 240 | 118val_118 241 | 72val_72 242 | 90val_90 243 | 307val_307 244 | 19val_19 245 | 435val_435 246 | 10val_10 247 | 277val_277 248 | 273val_273 249 | 306val_306 250 | 224val_224 251 | 309val_309 252 | 389val_389 253 | 327val_327 254 | 242val_242 255 | 369val_369 256 | 392val_392 257 | 272val_272 258 | 331val_331 259 | 401val_401 260 | 242val_242 261 | 452val_452 262 | 177val_177 263 | 226val_226 264 | 5val_5 265 | 497val_497 266 | 402val_402 267 | 396val_396 268 | 317val_317 269 | 395val_395 270 | 58val_58 271 | 35val_35 272 | 336val_336 273 | 95val_95 274 | 11val_11 275 | 168val_168 276 | 34val_34 277 | 229val_229 278 | 233val_233 279 | 143val_143 280 | 472val_472 281 | 322val_322 282 | 498val_498 283 | 160val_160 284 | 195val_195 285 | 42val_42 286 | 321val_321 287 | 430val_430 288 | 119val_119 289 | 489val_489 290 | 458val_458 291 | 78val_78 292 | 76val_76 293 | 41val_41 294 | 223val_223 295 | 492val_492 296 | 149val_149 297 | 449val_449 298 | 218val_218 299 | 228val_228 300 | 138val_138 301 | 453val_453 302 | 30val_30 303 | 209val_209 304 | 64val_64 305 | 468val_468 306 | 76val_76 307 | 74val_74 308 | 342val_342 309 | 69val_69 310 | 230val_230 311 | 33val_33 312 | 368val_368 313 | 103val_103 314 | 296val_296 315 | 113val_113 316 | 216val_216 317 | 367val_367 318 | 344val_344 319 | 167val_167 320 | 274val_274 321 | 219val_219 322 | 239val_239 323 | 485val_485 324 | 116val_116 325 | 223val_223 326 | 256val_256 327 | 263val_263 328 | 70val_70 329 | 487val_487 330 | 480val_480 331 | 401val_401 332 | 288val_288 333 | 191val_191 334 | 5val_5 335 | 244val_244 336 | 438val_438 337 | 128val_128 338 | 467val_467 339 | 432val_432 340 | 202val_202 341 | 316val_316 342 | 229val_229 343 | 469val_469 344 | 463val_463 345 | 280val_280 346 | 2val_2 347 | 35val_35 348 | 283val_283 349 | 331val_331 350 | 235val_235 351 | 80val_80 352 | 44val_44 353 | 193val_193 354 | 321val_321 355 | 335val_335 356 | 104val_104 357 | 466val_466 358 | 366val_366 359 | 175val_175 360 | 403val_403 361 | 483val_483 362 | 53val_53 363 | 105val_105 364 | 257val_257 365 | 406val_406 366 | 409val_409 367 | 190val_190 368 | 406val_406 369 | 401val_401 370 | 114val_114 371 | 258val_258 372 | 90val_90 373 | 203val_203 374 | 262val_262 375 | 348val_348 376 | 424val_424 377 | 12val_12 378 | 396val_396 379 | 201val_201 380 | 217val_217 381 | 164val_164 382 | 431val_431 383 | 454val_454 384 | 478val_478 385 | 298val_298 386 | 125val_125 387 | 431val_431 388 | 164val_164 389 | 424val_424 390 | 187val_187 391 | 382val_382 392 | 5val_5 393 | 70val_70 394 | 397val_397 395 | 480val_480 396 | 291val_291 397 | 24val_24 398 | 351val_351 399 | 255val_255 400 | 104val_104 401 | 70val_70 402 | 163val_163 403 | 438val_438 404 | 119val_119 405 | 414val_414 406 | 200val_200 407 | 491val_491 408 | 237val_237 409 | 439val_439 410 | 360val_360 411 | 248val_248 412 | 479val_479 413 | 305val_305 414 | 417val_417 415 | 199val_199 416 | 444val_444 417 | 120val_120 418 | 429val_429 419 | 169val_169 420 | 443val_443 421 | 323val_323 422 | 325val_325 423 | 277val_277 424 | 230val_230 425 | 478val_478 426 | 178val_178 427 | 468val_468 428 | 310val_310 429 | 317val_317 430 | 333val_333 431 | 493val_493 432 | 460val_460 433 | 207val_207 434 | 249val_249 435 | 265val_265 436 | 480val_480 437 | 83val_83 438 | 136val_136 439 | 353val_353 440 | 172val_172 441 | 214val_214 442 | 462val_462 443 | 233val_233 444 | 406val_406 445 | 133val_133 446 | 
175val_175 447 | 189val_189 448 | 454val_454 449 | 375val_375 450 | 401val_401 451 | 421val_421 452 | 407val_407 453 | 384val_384 454 | 256val_256 455 | 26val_26 456 | 134val_134 457 | 67val_67 458 | 384val_384 459 | 379val_379 460 | 18val_18 461 | 462val_462 462 | 492val_492 463 | 100val_100 464 | 298val_298 465 | 9val_9 466 | 341val_341 467 | 498val_498 468 | 146val_146 469 | 458val_458 470 | 362val_362 471 | 186val_186 472 | 285val_285 473 | 348val_348 474 | 167val_167 475 | 18val_18 476 | 273val_273 477 | 183val_183 478 | 281val_281 479 | 344val_344 480 | 97val_97 481 | 469val_469 482 | 315val_315 483 | 84val_84 484 | 28val_28 485 | 37val_37 486 | 448val_448 487 | 152val_152 488 | 348val_348 489 | 307val_307 490 | 194val_194 491 | 414val_414 492 | 477val_477 493 | 222val_222 494 | 126val_126 495 | 90val_90 496 | 169val_169 497 | 403val_403 498 | 400val_400 499 | 200val_200 500 | 97val_97 501 | -------------------------------------------------------------------------------- /data/ndy.txt: -------------------------------------------------------------------------------- 1 | 尿道炎 罗红霉素150MG+呋喃妥因100MG+三金片,很管用的全科医疗 尿道炎的治疗:柴胡2ml+丁胺卡那0.2,im;呋兰妥因2片,一日四次,效果蛮好,两三天即愈。 痛经的治疗:炎痛喜康一次两片,每次在月经快来的前两天服用,一天一次服用四天,即可不痛。注意有胃病的忌服 -------------------------------------------------------------------------------- /data/outputfile.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hailiang-wang/SmartQA-System/db659597eea7ad5759e183d8affa9053e4076a34/data/outputfile.xlsx -------------------------------------------------------------------------------- /data/people.json: -------------------------------------------------------------------------------- 1 | {"name":"Michael"} 2 | {"name":"Andy", "age":30} 3 | {"name":"Justin", "age":19} 4 | -------------------------------------------------------------------------------- /data/people.txt: -------------------------------------------------------------------------------- 1 | Michael, 29 2 | Andy, 30 3 | Justin, 19 4 | -------------------------------------------------------------------------------- /data/sample_kmeans_data.txt: -------------------------------------------------------------------------------- 1 | 0 1:0.0 2:0.0 3:0.0 2 | 1 1:0.1 2:0.1 3:0.1 3 | 2 1:0.2 2:0.2 3:0.2 4 | 3 1:9.0 2:9.0 3:9.0 5 | 4 1:9.1 2:9.1 3:9.1 6 | 5 1:9.2 2:9.2 3:9.2 7 | -------------------------------------------------------------------------------- /data/sample_lda_data.txt: -------------------------------------------------------------------------------- 1 | 1 2 6 0 2 3 1 1 0 0 3 2 | 1 3 0 1 3 0 0 2 0 0 1 3 | 1 4 1 0 0 4 9 0 1 2 0 4 | 2 1 0 3 0 0 5 0 2 3 9 5 | 3 1 1 9 3 0 2 0 0 1 3 6 | 4 2 0 3 4 5 1 1 1 4 0 7 | 2 1 0 3 0 0 5 0 2 2 9 8 | 1 1 1 9 2 1 2 0 0 1 3 9 | 4 4 0 3 4 2 1 3 0 0 0 10 | 2 8 2 0 3 0 2 0 2 7 2 11 | 1 1 1 9 0 2 2 0 0 3 3 12 | 4 1 0 0 4 5 1 3 0 1 0 13 | -------------------------------------------------------------------------------- /data/sample_lda_libsvm_data.txt: -------------------------------------------------------------------------------- 1 | 0 1:1 2:2 3:6 4:0 5:2 6:3 7:1 8:1 9:0 10:0 11:3 2 | 1 1:1 2:3 3:0 4:1 5:3 6:0 7:0 8:2 9:0 10:0 11:1 3 | 2 1:1 2:4 3:1 4:0 5:0 6:4 7:9 8:0 9:1 10:2 11:0 4 | 3 1:2 2:1 3:0 4:3 5:0 6:0 7:5 8:0 9:2 10:3 11:9 5 | 4 1:3 2:1 3:1 4:9 5:3 6:0 7:2 8:0 9:0 10:1 11:3 6 | 5 1:4 2:2 3:0 4:3 5:4 6:5 7:1 8:1 9:1 10:4 11:0 7 | 6 1:2 2:1 3:0 4:3 5:0 6:0 7:5 8:0 9:2 10:2 11:9 8 | 7 1:1 2:1 3:1 4:9 5:2 6:1 7:2 8:0 9:0 10:1 11:3 9 | 8 1:4 2:4 3:0 4:3 5:4 6:2 7:1 8:3 9:0 10:0 11:0 10 | 
9 1:2 2:8 3:2 4:0 5:3 6:0 7:2 8:0 9:2 10:7 11:2 11 | 10 1:1 2:1 3:1 4:9 5:0 6:2 7:2 8:0 9:0 10:3 11:3 12 | 11 1:4 2:1 3:0 4:0 5:4 6:5 7:1 8:3 9:0 10:1 11:0 13 | -------------------------------------------------------------------------------- /data/sampling.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hailiang-wang/SmartQA-System/db659597eea7ad5759e183d8affa9053e4076a34/data/sampling.xlsx -------------------------------------------------------------------------------- /data/test.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hailiang-wang/SmartQA-System/db659597eea7ad5759e183d8affa9053e4076a34/data/test.xlsx -------------------------------------------------------------------------------- /data/text.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hailiang-wang/SmartQA-System/db659597eea7ad5759e183d8affa9053e4076a34/data/text.xlsx -------------------------------------------------------------------------------- /data/user.avsc: -------------------------------------------------------------------------------- 1 | {"namespace": "example.avro", 2 | "type": "record", 3 | "name": "User", 4 | "fields": [ 5 | {"name": "name", "type": "string"}, 6 | {"name": "favorite_color", "type": ["string", "null"]} 7 | ] 8 | } 9 | -------------------------------------------------------------------------------- /data/users.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hailiang-wang/SmartQA-System/db659597eea7ad5759e183d8affa9053e4076a34/data/users.avro -------------------------------------------------------------------------------- /data/users.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hailiang-wang/SmartQA-System/db659597eea7ad5759e183d8affa9053e4076a34/data/users.parquet -------------------------------------------------------------------------------- /pythonadvance/Gensimtest.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 -*- 2 | 3 | import gensim,logging 4 | import numpy as np 5 | import os 6 | 7 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',level=logging.INFO) 8 | 9 | sentences=[['first','sentence'],['second','sentence']] #说明了模型的输入是什么形式的,必须是这种列表性质的 10 | 11 | model=gensim.models.Word2Vec(sentences,min_count=1,size=100) #min_count是可以忽略的出现次数少的词,因此当min_count设置为2时,报错,因为first被忽略了。size代表向量的维度。 12 | 13 | #print model['three'] 14 | 15 | 16 | print(model['first']) 17 | print model.similarity('first','second') 18 | print model.most_similar(positive=['first'], negative=['sentence']) 19 | print ("first sentence second sentence , 有哪个是不匹配的? 
word2vec结果说是:"+model.doesnt_match("first sentence second sentence".split())) 20 | 21 | print model['first'].shape 22 | 23 | #没有理解 24 | class TextLoader(object): 25 | def __init__(self): 26 | pass 27 | 28 | def __iter__(self): 29 | input = open('corpus-seg.txt', 'r') 30 | line = str(input.readline()) 31 | counter = 0 32 | while line != None and len(line) > 4: 33 | # print line 34 | segments = line.split(' ') 35 | yield segments 36 | line = str(input.readline()) 37 | 38 | #没有理解 39 | class MySentences(object): 40 | def __init__(self, dirname): 41 | self.dirname = dirname 42 | 43 | def __iter__(self): 44 | for fname in os.listdir(self.dirname): 45 | for line in open(os.path.join(self.dirname, fname)): 46 | yield line.split() 47 | 48 | #sentences = MySentences('/some/directory') # a memory-friendly iterator 49 | #model = gensim.models.Word2Vec(sentences) 50 | 51 | def fab(max): 52 | n, a, b = 0, 0, 1 53 | while n < max: 54 | yield b 55 | # print b 56 | a, b = b, a + b 57 | n = n + 1 58 | print type(fab(5)) 59 | next(fab(5)) 60 | for i in fab(5): 61 | print i 62 | from gensim import corpora 63 | 64 | documents = ["Human machine interface for lab abc computer applications", 65 | "A survey of user opinion of computer system response time", 66 | "The EPS user interface management system", 67 | "System and human system engineering testing of EPS", 68 | "Relation of user perceived response time to error measurement", 69 | "The generation of random binary unordered trees", 70 | "The intersection graph of paths in trees", 71 | "Graph minors IV Widths of trees and well quasi ordering", 72 | "Graph minors A survey"] 73 | 74 | # remove common words and tokenize 75 | stoplist = set('for a of the and to in'.split()) 76 | texts = [[word for word in document.lower().split() if word not in stoplist] 77 | for document in documents] 78 | 79 | # remove words that appear only once 80 | from collections import defaultdict 81 | frequency = defaultdict(int) 82 | for text in texts: 83 | for token in text: 84 | frequency[token] += 1 85 | print 'sucess' 86 | texts = [[token for token in text if frequency[token] > 1] 87 | for text in texts] 88 | from pprint import pprint # pretty-printer 89 | pprint(texts) 90 | print type(texts) 91 | 92 | dictionary = corpora.Dictionary(texts) 93 | #dictionary.save('../tmp/deerwester.dict') # store the dictionary, for future reference 94 | print(dictionary) 95 | print(dictionary.token2id) 96 | new_doc = "Human computer interaction" 97 | new_vec = dictionary.doc2bow(new_doc.lower().split()) 98 | print(new_vec) 99 | 100 | corpus = [dictionary.doc2bow(text) for text in texts] 101 | #corpora.MmCorpus.serialize('/tmp/deerwester.mm', corpus) # store to disk, for later use 102 | print(corpus) 103 | -------------------------------------------------------------------------------- /pythonadvance/Regression_analyse.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 -*- 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | import scipy as sp 5 | from scipy.stats import norm 6 | from sklearn.pipeline import Pipeline 7 | from sklearn.linear_model import LinearRegression 8 | from sklearn.preprocessing import PolynomialFeatures 9 | from sklearn import linear_model 10 | 11 | ''''' 数据生成 ''' 12 | x = np.arange(0, 1, 0.002) 13 | y = norm.rvs(0, size=500, scale=0.1)#产生正态分布的数0是均值,0.1是方差,500是数量 14 | y = y + x ** 2 15 | 16 | ''''' 均方误差根 ''' 17 | 18 | 19 | def rmse(y_test, y): 20 | return sp.sqrt(sp.mean((y_test - y) ** 2)) 21 | 22 | 23 | ''''' 
与均值相比的优秀程度,介于[0~1]。0表示不如均值。1表示完美预测.这个版本的实现是参考scikit-learn官网文档 ''' 24 | 25 | #R2方法是将预测值跟只使用均值的情况下相比,看能好多少。其区间通常在(0,1)之间。0表示还不如什么都不预测,直接取均值的情况,而1表示所有预测跟真实结果完美匹配的情况。 26 | 27 | #R2的计算方法,不同的文献稍微有不同。如本文中函数R2是依据scikit-learn官网文档实现的,跟clf.score函数结果一致。 28 | def R2(y_test, y_true): 29 | return 1 - ((y_test - y_true) ** 2).sum() / ((y_true - y_true.mean()) ** 2).sum() 30 | 31 | 32 | ''''' 这是Conway&White《机器学习使用案例解析》里的版本 ''' 33 | 34 | 35 | def R22(y_test, y_true): 36 | y_mean = np.array(y_true) 37 | y_mean[:] = y_mean.mean() 38 | return 1 - rmse(y_test, y_true) / rmse(y_mean, y_true) 39 | 40 | 41 | plt.scatter(x, y, s=5) 42 | degree = [1, 2, 100] 43 | y_test = [] 44 | y_test = np.array(y_test) 45 | 46 | for d in degree: 47 | clf = Pipeline([('poly', PolynomialFeatures(degree=d)), 48 | ('linear', LinearRegression(fit_intercept=False))]) 49 | clf.fit(x[:, np.newaxis], y) 50 | y_test = clf.predict(x[:, np.newaxis]) 51 | 52 | print(clf.named_steps['linear'].coef_) 53 | print('rmse=%.2f, R2=%.2f, R22=%.2f, clf.score=%.2f' % 54 | (rmse(y_test, y), 55 | R2(y_test, y), 56 | R22(y_test, y), 57 | clf.score(x[:, np.newaxis], y))) 58 | 59 | plt.plot(x, y_test, linewidth=2) 60 | 61 | plt.grid() 62 | plt.legend(['1', '2', '100'], loc='upper left') 63 | plt.show() -------------------------------------------------------------------------------- /pythonadvance/StatsModelstest.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 -*- 2 | from statsmodels.tsa.stattools import adfuller as ADF 3 | import numpy as np 4 | 5 | print ADF(np.random.rand(100)) -------------------------------------------------------------------------------- /pythonadvance/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hailiang-wang/SmartQA-System/db659597eea7ad5759e183d8affa9053e4076a34/pythonadvance/__init__.py -------------------------------------------------------------------------------- /pythonadvance/cipin.py: -------------------------------------------------------------------------------- 1 | # from numpy import array 2 | # from math import sqrt 3 | # 4 | # from pyspark.mllib.clustering import KMeans, KMeansModel 5 | # from pyspark import SparkContext 6 | # sc = SparkContext("local",appName="KMeans") 7 | # 8 | # data = sc.textFile("D:\\PycharmProjects\\data\\mllib\\kmeans_data.txt") 9 | # parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')])) 10 | # 11 | # # Build the model (cluster the data) 12 | # clusters = KMeans.train(parsedData, 2, maxIterations=10, 13 | # runs=10, initializationMode="random") 14 | # 15 | # # Evaluate clustering by computing Within Set Sum of Squared Errors 16 | # def error(point): 17 | # center = clusters.centers[clusters.predict(point)] 18 | # return sqrt(sum([x**2 for x in (point - center)])) 19 | # 20 | # WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y) 21 | # print("Within Set Sum of Squared Error = " + str(WSSSE)) 22 | # 23 | # # Save and load model 24 | # clusters.save(sc, "D:\\PycharmProjects\\KmeansModel") 25 | # sameModel = KMeansModel.load(sc, "D:\\PycharmProjects\\KmeansModel") 26 | 27 | 28 | from pyspark import SparkContext 29 | 30 | logFile = "D:\spark-1.6.2-bin-hadoop2.6\README.md" 31 | sc = SparkContext("local","Simple App") 32 | logData = sc.textFile(logFile).cache() 33 | 34 | numAs = logData.filter(lambda s: 'a' in s).count() 35 | numBs = logData.filter(lambda s: 'b' in s).count() 36 | 37 | print("Lines with a: %i, 
lines with b: %i"%(numAs, numBs)) 38 | 39 | 40 | 41 | # from pyspark.ml.clustering import KMeans 42 | # 43 | # from pyspark.sql import SparkSession 44 | # 45 | # # Loads data. 46 | # 47 | # spark = SparkSession \ 48 | # .builder \ 49 | # .appName("mlkmeans") \ 50 | # .getOrCreate() 51 | # dataset = spark.read.format("libsvm").load("D:\\PycharmProjects\\data\\mllib\\sample_kmeans_data.txt") 52 | # 53 | # # Trains a k-means model. 54 | # kmeans = KMeans().setK(2).setSeed(1) 55 | # model = kmeans.fit(dataset) 56 | # 57 | # # Evaluate clustering by computing Within Set Sum of Squared Errors. 58 | # wssse = model.computeCost(dataset) 59 | # print("Within Set Sum of Squared Errors = " + str(wssse)) 60 | # 61 | # # Shows the result. 62 | # centers = model.clusterCenters() 63 | # print("Cluster Centers: ") 64 | # for center in centers: 65 | # print(center) 66 | 67 | 68 | # from pyspark import SparkContext 69 | # 70 | # sc = SparkContext('local') 71 | # doc = sc.parallelize([['a','b','c'],['b','d','d']]) 72 | # words = doc.flatMap(lambda d:d).distinct().collect() 73 | # word_dict = {w:i for w,i in zip(words,range(len(words)))} 74 | # word_dict_b = sc.broadcast(word_dict) 75 | # 76 | # def wordCountPerDoc(d): 77 | # dict={} 78 | # wd = word_dict_b.value 79 | # for w in d: 80 | # if dict.has_key(wd[w]): 81 | # dict[wd[w]] +=1 82 | # else: 83 | # dict[wd[w]] = 1 84 | # return dict 85 | # print doc.map(wordCountPerDoc).collect() 86 | # print "successful!" 87 | -------------------------------------------------------------------------------- /pythonadvance/clustermeric.py: -------------------------------------------------------------------------------- 1 | from sklearn.cluster import AffinityPropagation 2 | from sklearn import metrics 3 | from sklearn.datasets.samples_generator import make_blobs 4 | from sklearn import 5 | 6 | centers = [[1, 1], [-1, -1], [1, -1]] 7 | X, labels_true = make_blobs(n_samples=300, centers=centers, cluster_std=0.5, 8 | random_state=0) 9 | print type(X) 10 | print X 11 | af = AffinityPropagation(preference=-50).fit(X) 12 | cluster_centers_indices = af.cluster_centers_indices_ 13 | labels = af.labels_ 14 | print type(labels) 15 | print labels 16 | n_clusters_ = len(cluster_centers_indices) 17 | 18 | print('Estimated number of clusters: %d' % n_clusters_) 19 | print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels)) 20 | print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels)) 21 | print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels)) 22 | print("Adjusted Rand Index: %0.3f" 23 | % metrics.adjusted_rand_score(labels_true, labels)) 24 | print("Adjusted Mutual Information: %0.3f" 25 | % metrics.adjusted_mutual_info_score(labels_true, labels)) 26 | print("Silhouette Coefficient: %0.3f" 27 | % metrics.silhouette_score(X, labels, metric='sqeuclidean')) 28 | -------------------------------------------------------------------------------- /pythonadvance/dxckeras.py: -------------------------------------------------------------------------------- 1 | from keras.models import Sequential 2 | from keras.layers.core import Dense ,Dropout,Activation 3 | from keras.optimizers import SGD 4 | import pandas as pd 5 | import matplotlib.pyplot as plt 6 | 7 | print 'qx' 8 | datas=pd.read_excel('../data/sampling.xlsx') 9 | X=datas.iloc[:,1:].as_matrix() 10 | y=datas.iloc[:,0].as_matrix() 11 | print y 12 | model= Sequential() 13 | model.add(Dense(26,input_dim=26)) 14 | model.add(Activation('linear')) 15 | 16 | 
model.add(Dense(26,input_dim=26)) 17 | model.add(Activation('linear')) 18 | model.add(Dropout(0.5)) 19 | model.add(Dense(1,input_dim=26)) 20 | #model.add(Activation('linear')) 21 | # sgd = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True) 22 | # model.compile(loss='mean_squared_error',optimizer=sgd,metrics=["accuracy"]) 23 | model.compile(loss='mean_squared_error', optimizer='rmsprop') 24 | model.fit(X, y, batch_size=5, nb_epoch=100, shuffle=True,verbose=0,validation_split=0.2) 25 | score=model.evaluate(X,y,batch_size=16) 26 | p=model.predict(X,batch_size=16,verbose=0) 27 | print p 28 | 29 | fig, ax = plt.subplots() 30 | ax.scatter(y, p) 31 | 32 | ax.plot([y.min(),y.max()],[y.min(),y.max()],'g',lw=4) 33 | plt.show() 34 | 35 | 36 | #from keras.models import Sequential 37 | # 38 | # from keras.layers import LSTM, Dense 39 | # 40 | # import numpy as np 41 | # 42 | # data_dim = 16 43 | # 44 | # timesteps = 8 45 | # 46 | # nb_classes = 10 47 | # 48 | # # expected input data shape: (batch_size, timesteps, data_dim) 49 | # 50 | # model = Sequential() 51 | # 52 | # model.add(LSTM(32, return_sequences=True, 53 | # 54 | # input_shape=(timesteps, data_dim))) 55 | # 56 | # model.add(LSTM(32, return_sequences=True)) 57 | # 58 | # model.add(LSTM(32)) 59 | # 60 | # model.add(Dense(10, activation='softmax')) 61 | # 62 | # model.compile(loss='categorical_crossentropy', 63 | # 64 | # optimizer='rmsprop', 65 | # 66 | # metrics=['accuracy']) 67 | # 68 | # # generate dummy training data 69 | # 70 | # x_train = np.random.random((1000, timesteps, data_dim)) 71 | # 72 | # y_train = np.random.random((1000, nb_classes)) 73 | # 74 | # # generate dummy validation data 75 | # 76 | # x_val = np.random.random((100, timesteps, data_dim)) 77 | # 78 | # y_val = np.random.random((100, nb_classes)) 79 | # 80 | # model.fit(x_train, y_train, 81 | # 82 | # batch_size=64, nb_epoch=5, 83 | # 84 | # validation_data=(x_val, y_val)) 85 | -------------------------------------------------------------------------------- /pythonadvance/dxcprec.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf-8 -*- 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | 6 | datas=pd.read_excel('../data/sampling.xlsx') 7 | # print datas 8 | data=datas.iloc[:,1:].as_matrix() 9 | target=datas.iloc[:,0].as_matrix() 10 | print data.shape 11 | 12 | print type(target) 13 | from sklearn.linear_model import LinearRegression 14 | model=LinearRegression() 15 | model.fit(data,target) 16 | # print model.predict(data[0]) 17 | # 18 | print np.matrix(model.coef_) 19 | from sklearn.model_selection import cross_val_predict 20 | predicted = cross_val_predict(model, data, target, cv=10) 21 | 22 | fig, ax = plt.subplots() 23 | ax.scatter(target, predicted) 24 | ax.plot([target.min(), target.max()], [target.min(), target.max()], 'g', lw=1) 25 | ax.set_xlabel('Measured') 26 | ax.set_ylabel('Predicted') 27 | plt.show() 28 | print model.score(data,target) -------------------------------------------------------------------------------- /pythonadvance/dxcsvm.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from sklearn.svm import SVR 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | 6 | 7 | 8 | datas=pd.read_excel('../data/sampling.xlsx') 9 | X=datas.iloc[:,1].as_matrix() 10 | y=datas.iloc[:,0].as_matrix() 11 | # print X 12 | # print y 13 | print X.shape 14 | # import xlrd 15 | # 
datas=xlrd.open_workbook('sampling.xlsx') 16 | # table = datas.sheet_by_name(u'Sheet1') 17 | # X= np.matrix(table.col_values(4)) 18 | # y=np.matrix(table.col_values(0)) 19 | # print X 20 | # print y 21 | svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.1) 22 | svr_lin = SVR(kernel='linear', C=1e3) 23 | svr_poly = SVR(kernel='poly', C=1e3, degree=2) 24 | svr_rbf.fit(X, y) 25 | y_rbf = svr_rbf.predict(X) 26 | svr_lin.fit(X, y) 27 | y_lin = svr_lin.predict(X) 28 | svr_poly.fit(X, y) 29 | y_poly = svr_poly.predict(X) 30 | lw = 2 31 | plt.scatter(X, y, color='darkorange', label='data') 32 | plt.hold('on') 33 | plt.plot(X, y_rbf, color='navy', lw=lw, label='RBF model') 34 | plt.plot(X, y_lin, color='c', lw=lw, label='Linear model') 35 | plt.plot(X, y_poly, color='cornflowerblue', lw=lw, label='Polynomial model') 36 | plt.xlabel('data') 37 | plt.ylabel('target') 38 | plt.title('Support Vector Regression') 39 | plt.legend() 40 | plt.show() 41 | -------------------------------------------------------------------------------- /pythonadvance/full_linearRession.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 -*- 2 | import matplotlib.pyplot as plt 3 | 4 | import pandas as pd 5 | import numpy as np 6 | from sklearn.linear_model import LinearRegression 7 | 8 | data=pd.read_excel('../data/Folds5x2_pp.xlsx') 9 | print data.shape 10 | X=data.iloc[:,0:4] 11 | y=data.iloc[:,4] 12 | 13 | 14 | from sklearn.cross_validation import train_test_split 15 | 16 | X_train,x_test,y_train,y_test=train_test_split(X,y) 17 | #print X_train 18 | #print y_train 19 | #print x_test 20 | #print y_test 21 | 22 | 23 | model=LinearRegression() 24 | model.fit(X_train,y_train) 25 | print 'sucess' 26 | y_pred=model.predict(x_test) 27 | 28 | from sklearn import metrics 29 | #print model.score(X,y) 30 | 31 | # 用scikit-learn计算MSE 32 | print "MSE:",metrics.mean_squared_error(y_test, y_pred) 33 | # 用scikit-learn计算RMSE 34 | print "RMSE:",np.sqrt(metrics.mean_squared_error(y_test, y_pred)) 35 | 36 | from sklearn.model_selection import cross_val_predict 37 | print 'sucess' 38 | y_pred=cross_val_predict(model,X,y,cv=10) 39 | 40 | print "MSE",metrics.mean_squared_error(y,y_pred) 41 | print "RMSE:",np.sqrt(metrics.mean_squared_error(y,y_pred)) 42 | 43 | ax=plt.subplot() 44 | ax.scatter(y,y_pred) 45 | ax.plot([y.min(),y.max()],[y.min(),y.max()]) 46 | ax.set_xlabel('Measured') 47 | ax.set_ylabel('Predicted') 48 | plt.show() -------------------------------------------------------------------------------- /pythonadvance/nltktest.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 -*- 2 | import nltk 3 | # nltk.download() 4 | sentence="I love python." 
5 | tokens=nltk.word_tokenize(sentence) 6 | 7 | 8 | to=nltk.pos_tag(tokens) 9 | for i in to: 10 | print i 11 | h=nltk.pos_tag(['美']) 12 | 13 | for i in h: 14 | print i[0]+'---'+i[1] 15 | # from nltk.corpus import webtext 16 | # 17 | # webtext.fileids() #得到语料中所有文件的id集合 18 | # 19 | # webtext.raw(fileid) #给定文件的所有字符集合 20 | # 21 | # webtext.words(fileid) #所有单词集合 22 | # 23 | # webtext.sents(fileid) #所有句子集合 -------------------------------------------------------------------------------- /pythonadvance/numpytest.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf-8 -*- 2 | import numpy as np 3 | 4 | a=np.array([2,0,1,5]) 5 | print a 6 | print a[:3] 7 | print a.min() 8 | a.sort() 9 | b=np.array([[1,2,3],[4,5,6]]) 10 | print (b*b) #是点乘不是传统意义上的矩阵相乘 11 | 12 | print type(b) #numpy 仍然是传统意义上的数组 13 | 14 | 15 | a=np.arange(1,4).cumprod() #每个数取得是连乘就是阶乘了 16 | print a 17 | b=np.array([2]*3).cumprod() #2的一次方到2的三次方,里边是产生了3个2 18 | print b 19 | 20 | #np后边产生的都是数组 21 | print np.linspace(1,2,10) #数组与列表的区别就在于没有逗号 22 | 23 | print np.array([2]*3) 24 | print 'matrix' 25 | d= np.array([[1,2,3],[4,5,6]]) 26 | print d.cumsum(0) #0是指一列,1是指一行 27 | print d.cumsum(1) 28 | print d.cumprod(1) 29 | print d.cumprod(0) 30 | 31 | e=np.random.randn(3,4) 32 | print type(e) #np产生的都是一维或者高维数组 33 | 34 | t1=np.linspace(0,2,10) 35 | print t1 36 | t2=np.linspace(-1,1,20) 37 | print t1,t2 38 | #t=np.concatenate(t1,t2) 39 | #print t 40 | 41 | 42 | 43 | a = np.matrix([ [1, 2, 3, 4], 44 | [5, 5, 6, 8], 45 | [7, 9, 9, 1], 46 | [4, 6, 7, 1] 47 | ]) 48 | 49 | #矩阵加减法: 50 | e = a + a 51 | #or 52 | e = a - a 53 | 54 | #矩阵乘法: 55 | b = a * a #not matrix multiplication! 56 | print type(b) 57 | #or 58 | c = np.dot(a, a) #matrix multiplication 59 | #or 60 | d = a 61 | np.dot(a, a, d) #matrix multiplication 62 | 63 | #转置矩阵(transpose) 64 | g = a.transpose() 65 | #or 66 | h = a.T #not matrix transpose! 67 | 68 | #逆矩阵(inverse) 69 | #The inverse of a matrix A is the matrix B such that AB=I where I is the identity matrix consisting of ones down the main diagonal. Usually B is denoted B=A-1 . 
70 | #In SciPy, the matrix inverse of the Numpy array, A, is obtained using linalg.inv (A) , or using A.I 71 | f = np.linalg.inv(a) 72 | #or 73 | f = a ** (-1) 74 | #or 75 | f = a.I 76 | 77 | #行列式(determinant) 78 | j = np.linalg.det(a) 79 | 80 | #伴随矩阵(adjoint) 81 | #(need more test) 82 | m = np.dot(np.linalg.det(a), np.linalg.inv(a)) # A-1 = A'' / |A| ==> A''= A-1|A| 83 | 84 | #矩阵范数(matrix norms) 85 | k = np.linalg.norm(a) 86 | 87 | l1=[1,2,3] 88 | l2=[1,2,3] 89 | l3=[1,2,3] 90 | l1=np.array(l1) 91 | l2=np.array(l2) 92 | l3=np.array(l3) 93 | l=list((l1+l2+l3)/3) 94 | print l 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | -------------------------------------------------------------------------------- /pythonadvance/pandatest.py: -------------------------------------------------------------------------------- 1 | #-- coding:utf-8 --# 2 | import pandas as pd 3 | from pandas import Series,DataFrame 4 | import numpy as np 5 | 6 | l=Series(data=[1,2,3]) 7 | print l 8 | print l.data 9 | 10 | for i in l.index: 11 | print l[i] #只输出了data部分的数 12 | print l[1:] #是对整个Series进行了切分。 13 | #print l['a'] 14 | print l.index 15 | m=[1,2,3,4] 16 | print m #列表与index不是同一种数据结构,index是一种对象类型,但是对index或者column赋值可以使用列表对其进行赋值。访问他们只能通过序号,可以是切片的形式。 17 | 18 | s=Series([2,3,4],index=['b','a','c']) 19 | 20 | d=DataFrame({'e':4,'d':5,'f':6},index=['a','b','c']) 21 | print d.index 22 | print d.columns 23 | print d.values 24 | print d.describe() 25 | print '测试iterrows' 26 | for i ,series in d.iterrows():#返回每行的序列号,及每行对应每个列的值。 27 | print i,'---',series 28 | 29 | print '测试iloc' 30 | print d 31 | print d.iloc[0:2]#前边默认是行。 32 | 33 | print set(d['d'].tolist()) 34 | print s 35 | print d 36 | sd=pd.concat((s,d),axis=1) 37 | print sd 38 | 39 | d.head(1) 40 | d.tail(1) 41 | #d.to_excel('../data/test.xlsx') 42 | r=pd.read_excel('../data/sampling.xlsx') 43 | #print r 44 | 45 | dates=pd.date_range('20170217',periods=2) 46 | data=pd.DataFrame(np.random.randn(2,4),index=dates,columns=['a','b','c','d']) 47 | print data 48 | print '测试to_list' 49 | print list(set(data['d'].tolist()[0])) 50 | #doc_word = list(set(doc['content'].tolist()[0])) 51 | a=Series([1,2,3,4,None,5]) 52 | print a.isnull() #类型仍然是list 53 | print type(a[a.isnull()]) #只出现结果是true的值,类型仍然是list ,pandas的任何一列都是Series 54 | 55 | print a[range(1,3)] #Series的读取问题,可以根据列表序号进行读取。 56 | 57 | -------------------------------------------------------------------------------- /pythonadvance/pymysqltest.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf-8 -*- 2 | import pymysql 3 | 4 | conn=pymysql.connect(host='192.168.1.111',user='root',passwd='1234',db='zmap_empi',charset='utf8') 5 | 6 | cur=conn.cursor() 7 | query='select count(*) from zmap_r_patient_empi_jb' 8 | 9 | cur.execute(query) 10 | result = cur.fetchall() # result为tuple类型,记录存放是((),(),...()) 这样的形式 11 | 12 | 13 | for i in result: 14 | print i -------------------------------------------------------------------------------- /pythonadvance/regular.py: -------------------------------------------------------------------------------- 1 | 2 | # -*- coding: UTF-8 -*- 3 | 4 | import re 5 | print re.match('www', 'www.runoob.com').span() # 在起始位置匹配 6 | print re.match('www','www.runoob.com') 7 | #print re.match('com', 'www.runoob.com').span() # 不在起始位置匹配,re.match 尝试从字符串的起始位置匹配一个模式,如果不是起始位置匹配成功的话,match()就返回none。 8 | 9 | 10 | import re 11 | 12 | test = "我是123周小旭,来自1bd江西ab九江" 13 | 14 | result = re.findall(ur'[\u4e00-\u9fa5]', test.decode('utf-8')) 15 | 16 | print result 17 | 18 | 19 | print 
''.join(result) 20 | 21 | result=re.findall(r'[0-9]',test) 22 | print result 23 | 24 | result=re.findall(r'[a-z]',test) 25 | print result 26 | 27 | text = "JGood is a handsome boy, he is cool, clever, and so on..." 28 | m = re.search(r'\shan(ds)ome\s', text) 29 | if m: 30 | print m.group(0), m.group(1) 31 | else: 32 | print 'not search' 33 | 34 | 35 | text = "JGood is a handsome boy, he is cool, clever, and so on..." 36 | print re.sub(r'\s+', '-', text) 37 | #re.split(r'\s+', text);将字符串按空格分割成一个单词列表 38 | 39 | #re.findall可以获取字符串中所有匹配的字符串。如:re.findall(r'\w*oo\w*', text);获取字符串中,包含'oo'的所有单词。 40 | 41 | text = "JGood is a handsome boy, he is cool, clever, and so on..." 42 | regex = re.compile(r'\w*oo\w*') 43 | print regex.findall(text) #查找所有包含'oo'的单词 44 | print regex.sub(lambda m: '[' + m.group(0) + ']', text) #将字符串中含有'oo'的单词用[]括起来。 45 | inputStr = "hello 123 world 456" 46 | replacedStr = re.sub("\d+", "222", inputStr) 47 | print replacedStr 48 | 49 | str='''江苏省人民医院心血管内科 感谢信56封礼物40个[图片]职  称:副主任医师 副教授[图片]擅  长: 50 | 心房颤动、室性心动过速与各种室上性心动过速的导管消融与缓慢性心律失常的起搏治疗 51 | 心房颤动、室性心动过速与各种室上性心动过速的导管消融与缓慢性心律失常的起搏治疗[图片]执业经历: 52 | 张凤祥,男,医学博士,副主任医师、副教授,硕士生导师;中国医师协会心律失常分会青委会副主任委员,中华医学会心电生理和起搏分会青年委员,中国医师协会心血管内科医师分会青年委员,中华医学会江苏省心血管病分会青年委员会副主委,中华全科医学杂志编委;2007年毕业于南京医科大学并任职于南京医科大学第一附属医院(江苏省人民医院)。熟练掌握各种心律失常的诊断与治疗。主要研究方向:心律失常的临床治疗与基础研究。擅长1)心房颤动、室性早搏、室性心动过速、房性心动过速、阵发性室上速、等心律失常的导管消融治疗;2) 房室传导阻滞、病态窦房结综合征等缓慢心律失常的起搏治疗;3)Brugada综合征、长QT综合征、短QT、儿茶酚胺敏感室速等心脏性猝死预防。发表学术论文50篇,其中SCI文章近20篇。主持国家自然科学基金3项,江苏省六大人才高峰课题1项,中国医师协会课题1项;参与973、十二五等重大科研课题;荣获江苏省卫生厅新技术引进二等奖2项。<< 收起''' 53 | print str 54 | str=str.replace('\n','') 55 | print str 56 | str1='''通过好大夫在线提前预约名大夫的办法 57 | 58 | 准备去北京或者其他大城市看病或者做手术的人,可以提前通过好大夫在线联系好大夫,否则,直接去了这些名大夫的号很难挂。下边以去北京安贞找马长生治疗房颤为例说明办法: 59 | , 60 | 第一,提前拍摄好自己的病历材料,(手头没有可以凭病号身份证去医院病案室复印)。要求最少提供: 61 | (1)发作时期的心电图或者记录有发作症状的24小时心电图报告页。 62 | (2)心脏彩超。必须带数据部分 63 | (3)血液生化全部项目报告单 64 | (4)你治疗期间做的其他价格高的检查,比如心脏CTA等等 65 | (5)最近一次因为房颤住院的出院小结 66 | 图片要求拍摄清晰,可以适当掩盖住名字部分。 67 | 写一份详细的生病状况和治疗情况说明,内容要有发病症状,所入医院名称和科室。大概治疗用药物,写上出院小结内大夫给的诊断结论还有出院大夫要求吃的药物。 68 | 第二登陆马长生的好大夫在线的个人网站地址是 http://machangsheng.haodf.com/ (如果你找其他大夫看病就去其他大夫的网站) 69 | 第三点击 网上咨询。一般都提示你注册,注册的时候所留手机号码必须真实有效。以后要用来接收大夫回复以后的短信通知。 70 | 第四填写各种内容和上传图片 71 | 第五,点击最下面的确定 等待大夫的回复 72 | 大夫回复后,有时候他会提问你问题,也可能要求补充病历材料等,你可以根据情况继续上传和给出说明 73 | 一般来说,一个注册号码一次可以得到3次提问机会,超过部分是要收费的。当然,到第三次的时候大夫如果认为有必要,一般会再给你三次机会。 74 | 如果大夫认为你适合找他看病或者手术,一般他会给你一个住院管理大夫的电话,你电话过去说明是马长生叫你打的电话,要求给定病床。等他有病床了会提前通知你。这样你再去北京就不用等待了,免去了很多外地人到北京住院治疗的烦恼 75 | 76 | 以下是心脏内科导管消融的好大夫 77 | 78 | http://machangsheng.haodf.com/ 马长生 北京安贞 79 | 80 | http://dongjianzeng.haodf.com/ 董建增 北京安贞 81 | 82 | http://yaoyan.haodf.com/ 姚焰 北京阜外 83 | 84 | http://liuxu001.haodf.com/ 刘旭 上海市胸科医院 心内科 85 | 86 | http://wxqing1212.haodf.com/ 王现青.河南省人民医院 心血管内科 87 | 88 | http://huanghe1977.haodf.com/ 黄鹤 武汉大学人民医院 心血管内科 89 | 90 | http://xpliu71.haodf.com/ 刘兴鹏 北京朝阳医院 心脏中心 91 | 92 | http://jiangchenyang.haodf.com/ 蒋晨阳 浙江大学医学院附属邵逸夫医院 心内科 93 | 94 | http://liushaowen.haodf.com/ 刘少稳上海市第一人民医院 心内科 95 | 96 | 97 | 98 | 99 | 以下是心脏外科用胸腔镜治疗房颤,的好大夫, 100 | 101 | http://zhengzhe.haodf.com/ 郑哲 北京阜外医院 102 | 103 | http://mxu263.haodf.com/ 孟旭 北京安贞医院 心脏外科中心 104 | 105 | http://xuchunlei.haodf.com/ 许春雷 北京安贞医院 心脏外科中心 106 | 107 | http://meiju.haodf.com/ 上海新华 梅举 108 | 109 | http://chengyunge.haodf.com/ 上海远大程云阁 国内唯一用胸腔镜做迷宫三手术的大夫,价格4.5万最便宜 110 | 111 | 112 | 病友阵发房颤根据亲身经历整理 113 | ''' 114 | print str1 115 | str1=str1.replace('/r/n','') 116 | print str1 -------------------------------------------------------------------------------- /pythonadvance/scipytest.py: 
-------------------------------------------------------------------------------- 1 | from scipy import stats 2 | X=stats.norm.rvs(0,size=500,scale=0.1) 3 | #X =stats.norm(loc=1.0,scale=2.0,size = 100) 4 | 5 | 6 | print stats.norm.fit(X) 7 | from scipy import stats 8 | import numpy as np 9 | import matplotlib.pyplot as plt 10 | fs_meetsig = np.random.random(30) 11 | fs_xk = np.sort(fs_meetsig) 12 | fs_pk = np.ones_like(fs_xk) / len(fs_xk) 13 | fs_rv_dist = stats.rv_discrete(name='fs_rv_dist', values=(fs_xk, fs_pk)) 14 | 15 | plt.plot(fs_xk, fs_rv_dist.cdf(fs_xk), 'b-', ms=12, mec='r', label='friend') 16 | plt.show() 17 | 18 | age = [23, 23, 27, 27, 39, 41, 47, 49, 50, 52, 54, 54, 56, 57, 58, 58, 60, 61] 19 | fat_percent = [9.5, 26.5, 7.8, 17.8, 31.4, 25.9, 27.4, 27.2, 31.2, 34.6, 42.5, 28.8, 33.4, 30.2, 34.1, 32.9, 41.2, 35.7] 20 | age = np.array(age) 21 | fat_percent = np.array(fat_percent) 22 | data = np.vstack([age, fat_percent]).reshape([-1, 2]) 23 | 24 | print(stats.describe(data)) 25 | 26 | for key, value in stats.describe(data)._asdict().items(): 27 | print(key, ':', value) 28 | 29 | # shannon_entropy = stats.entropy(ij/sum(ij), base=None) 30 | # print(shannon_entropy) -------------------------------------------------------------------------------- /pythonadvance/sklearnkmeans.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 -*- 2 | import pandas 3 | import matplotlib.pyplot as plt 4 | 5 | inputfile='../data/sampling.xlsx' 6 | outputfile='../data/outputfile.xlsx' 7 | data=pandas.read_excel(inputfile) 8 | print data.index #可以同过index_col设置index的。 9 | #print data.values 10 | #data=data[2:] 11 | data_zs=1.0*(data-data.mean())/data.std() 12 | 13 | 14 | from sklearn.cluster import KMeans 15 | model=KMeans(n_clusters=100,n_jobs=1,max_iter=500) 16 | model.fit(data_zs) 17 | # fig=plt.figure() 18 | # ax=fig.add_subplot(111) 19 | # ax.scatter() 20 | 21 | t=pandas.concat([data,pandas.Series(model.labels_,index=data.index)],axis=1) 22 | t.columns=list(data.columns)+[u'聚类类别'] 23 | 24 | #print type(t) 25 | #print t 26 | 27 | #print len( model.labels_) 28 | r1=pandas.Series(model.labels_).value_counts() 29 | #r1.index=range(100) 30 | #print len(model.labels_) 31 | #print model.cluster_centers_ 32 | r2=pandas.DataFrame(model.cluster_centers_) 33 | r=pandas.concat([r2,r1],axis=1) 34 | r.columns=list(data.columns) + [u'聚类类别'] 35 | #print r 36 | 37 | 38 | l= r[r[u'聚类类别']<2].index 39 | #print l 40 | #print r 41 | l=l.tolist() 42 | # print type(l) 43 | # print l 44 | # t=t.iloc[l,:] 45 | # print t 46 | #t=t[t[u'聚类类别'] in l] 47 | # print set(l) 48 | # print type(t[u'聚类类别']) 49 | t=t[ t[u'聚类类别'].isin(l)] 50 | 51 | 52 | # for i in l: 53 | # (t[t[u'聚类类别']==i].index) 54 | # t[t[u'聚类类别']==i] 55 | # r=r.append(t[t[u'聚类类别']==i]) 56 | 57 | #r=r[99:] 58 | 59 | #print r 60 | #print data(x) 61 | # for i in x: 62 | # print data(index=i) 63 | t.to_excel(outputfile) 64 | -------------------------------------------------------------------------------- /pythonadvance/yaofang_fenxi_text.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | #from __future__ import unicode_literals 3 | #import sys 4 | #sys.path.append("../") 5 | 6 | import jieba 7 | import jieba.posseg 8 | import jieba.analyse 9 | 10 | print('关键词提取') 11 | print('-'*40) 12 | print(' TF-IDF') 13 | print('-'*40) 14 | 15 | f = open("../data/ndy.txt","r") 16 | s = f.read() 17 | print type(s) 18 | 19 | for x, w in jieba.analyse.extract_tags(s, 
withWeight=True): 20 | print('%s %s' % (x, w)) 21 | 22 | print('-'*40) 23 | print(' TextRank') 24 | print('-'*40) 25 | 26 | for x, w in jieba.analyse.textrank(s, withWeight=True): 27 | print('%s %s' % (x, w)) 28 | 29 | seg_list = jieba.cut("我来到北京清华大学", cut_all=True) 30 | print "Full Mode:", "/ ".join(seg_list) #全模式 -------------------------------------------------------------------------------- /pythonadvance/yichangtest.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf-8 -*- 2 | import pandas as pd 3 | number = '../data/all_musicers.xlsx' #设定播放数据路径,该路径为代码所在路径的上一个目录data中. 4 | data = pd.read_excel(number) 5 | 6 | data1=data.iloc[:,0:10]#10位歌手的183天音乐播放量 7 | #data2=data.iloc[:,10:20] 8 | #data3=data.iloc[:,20:30] 9 | #data4=data.iloc[:,30:40] 10 | #data5=data.iloc[:,40:50] 11 | import matplotlib.pyplot as plt #导入图像库 12 | plt.rcParams['font.sans-serif'] = ['SimHei'] #用来正常显示中文标签 13 | plt.rcParams['axes.unicode_minus'] = False #用来正常显示负号 14 | plt.figure(1, figsize=(13, 26))#可设定图像大小 15 | #plt.figure() #建立图像 16 | p = data1.boxplot(return_type = 'dict') #画箱线图,直接使用DataFrame的方法.代码到这为止,就已经可以显示带有异常值的箱型图了,但为了标注出异常值的数值,还需要以下代码进行标注. 17 | #for i in range(0,4): 18 | x = p['fliers'][2].get_xdata() # 'flies'即为异常值的标签.[0]是用来标注第1位歌手的异常值数值,同理[i]标注第i+1位歌手的异常值. 19 | y = p['fliers'][2].get_ydata() 20 | y.sort() #从小到大排序 21 | print x 22 | print y 23 | for i in range(len(x)): 24 | if i>0: 25 | plt.annotate(y[i], xy = (x[i],y[i]), xytext=(x[i]+0.05 -0.8/(y[i]-y[i-1]),y[i])) 26 | else: 27 | plt.annotate(y[i], xy = (x[i],y[i]), xytext=(x[i]+0.08,y[i])) 28 | 29 | plt.show() #展示箱线图 30 | #输出结果如下:其中,+所表示的均是(统计学认为的)异常值.工作中,要结合数据应用背景, 距离箱型图上下界很近的可归为正常值. 31 | 32 | for i in range(0,182): 33 | if data1.iloc[:,1][i]>125: 34 | data1.iloc[:,1][i]=(data1.iloc[:,1][i+1]+data1.iloc[:,1][i-1])/2 35 | for i in range(0,182): 36 | if data1.iloc[:,2][i]>600: 37 | data1.iloc[:,2][i]=(data1.iloc[:,2][i+1]+data1.iloc[:,1][i-1])/2 38 | for i in range(0,182): 39 | if data1.iloc[:,4][i]>225: 40 | data1.iloc[:,4][i]=(data1.iloc[:,4][i+1]+data1.iloc[:,4][i-1])/2 41 | for i in range(0,182): 42 | if data1.iloc[:,7][i]>60: 43 | data1.iloc[:,7][i]=(data1.iloc[:,7][i+1]+data1.iloc[:,7][i-1])/2 44 | for i in range(0,182): 45 | if data1.iloc[:,8][i]>2500: 46 | data1.iloc[:,8][i]=(data1.iloc[:,8][i+1]+data1.iloc[:,8][i-1])/2 47 | 48 | data1.to_csv("train_innoraml.csv") -------------------------------------------------------------------------------- /pythonbasic/classtest.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 -*- 2 | class Person(object): 3 | pass 4 | 5 | xiaoming = Person() 6 | xiaohong = Person() 7 | 8 | print xiaoming 9 | print xiaohong 10 | print xiaoming==xiaohong 11 | 12 | print cmp(2,3) 13 | #定义类之后,可以实例化后,为不同的实例赋予不同的属性 14 | class Person(object): 15 | pass 16 | 17 | p1 = Person() 18 | p1.name = 'Bart' 19 | 20 | p2 = Person() 21 | p2.name = 'Adam' 22 | 23 | p3 = Person() 24 | p3.name = 'Lisa' 25 | p3.job ='programmer' 26 | 27 | L1 = [p1, p2, p3] 28 | L2 = sorted(L1,lambda p1,p2:cmp(p1.name,p2.name)) 29 | 30 | print L2[0].name 31 | print L2[1].name 32 | print L2[2].name 33 | print L2[2].job 34 | #也可以在实例初始化时,就设定好 35 | class Person(object): 36 | def __init__(self,nam,gender,birth,**kw): 37 | self.name=nam 38 | self.gender=gender 39 | self.birth=birth 40 | for k,v in kw.iteritems(): 41 | setattr(self,k,v) 42 | 43 | xiaoming = Person('Xiao Ming', 'Male', '1990-1-1', job='Student') 44 | 45 | print xiaoming.name 46 | print xiaoming.job 47 
| #Python对属性权限的控制是通过属性名来实现的,如果一个属性由双下划线开头(__),该属性就无法被外部访问。 48 | class Person(object): 49 | def __init__(self, name, score): 50 | self.name=name 51 | self.__score=score 52 | 53 | p = Person('Bob', 59) 54 | 55 | print p.name 56 | try: 57 | print p.__score 58 | except AttributeError: 59 | print 'attributeerror' 60 | #实例属性每个实例各自拥有,互相独立,而类属性有且只有一份。类属性发生改变,所有的都改变 61 | class Person(object): 62 | count= 0 63 | def __init__(self,name): 64 | Person.count= Person.count + 1 65 | self.name= name 66 | 67 | p1 = Person('Bob') 68 | print Person.count 69 | 70 | p2 = Person('Alice') 71 | print Person.count 72 | 73 | p3 = Person('Tim') 74 | print Person.count 75 | 76 | #当实例属性和类属性重名时,实例属性优先级高,它将屏蔽掉对类属性的访问。 千万不要在实例上修改类属性,它实际上并没有修改类属性,而是给实例绑定了一个实例属性。 77 | 78 | class Person(object): 79 | 80 | __count = 0 81 | 82 | def __init__(self, name): 83 | Person.__count=Person.__count+1 84 | self.name=name 85 | print Person.__count 86 | 87 | p1 = Person('Bob') 88 | p2 = Person('Alice') 89 | 90 | try: 91 | print Person.__count 92 | except AttributeError: 93 | print 'attributeerror' 94 | 95 | #实例的方法就是在类中定义的函数,它的第一个参数永远是 self,指向调用该方法的实例本身,其他参数和一个普通函数是完全一样的: 96 | #调用实例方法必须在实例上调用:也就是说必须先初始化 97 | class Person(object): 98 | 99 | def __init__(self, name, score): 100 | self.name=name 101 | self.__score=score 102 | 103 | def get_grade(self): 104 | if self.__score>80: 105 | return 'A' 106 | elif self.__score>=60: 107 | return 'B' 108 | elif self.__score<60: 109 | return 'C' 110 | 111 | p1 = Person('Bob', 90) 112 | p2 = Person('Alice', 65) 113 | p3 = Person('Tim', 48) 114 | 115 | print p1.get_grade() 116 | print p2.get_grade() 117 | print p3.get_grade() 118 | 119 | 120 | -------------------------------------------------------------------------------- /pythonbasic/decoratortest.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 -*- 2 | from __future__ import division 3 | def cmp_ignore_case(s1, s2): 4 | u1=s1.upper() 5 | u2=s2.upper() 6 | if u1u2: 9 | return 1 10 | 11 | #sorted函数是一个高阶函数,其包含的函数是一个比较函数,返回值是-1或者1 12 | print sorted(['bob', 'about', 'Zoo', 'Credit'], cmp_ignore_case) #小的在前面是-1,是从小到大排列,反之从大到小; 13 | 14 | 15 | def count(): 16 | fs = [] 17 | for i in range(1, 4): 18 | def f(j): #本身已经是一个闭包,内部定义函数接受外部函数参数,并返回内部函数 19 | def g(): 20 | return j*j 21 | return g 22 | r=f(i) 23 | fs.append(r) 24 | return fs 25 | 26 | f1, f2, f3 = count() 27 | print f1(), f2(), f3() #这里f1=f(1),f2=f(2),f3=f(3) 28 | 29 | #装饰器的作用是写一个装饰器函数,能够一次满足所有有类似需求的函数。如:都有输出log的需求,那么写一个log装饰器即可 30 | 31 | import time 32 | 33 | def performance(f): 34 | def fn(*args,**kw): 35 | t1=time.time() 36 | r=f(*args,**kw) 37 | t2=time.time() 38 | print 'call %s()in %fs' % (f.__name__,(t2-t1)) 39 | return r 40 | return fn 41 | 42 | @performance 43 | def factorial(n): 44 | return reduce(lambda x,y: x*y, range(1, n+1)) 45 | 46 | print factorial(10) 47 | 48 | def log(f): 49 | def fn(x): 50 | print 'call ' + f.__name__ + '()...' 51 | return f(x) 52 | return fn 53 | @log 54 | def factorial(n): 55 | return reduce(lambda x,y: x*y, range(1, n+1)) 56 | print factorial(10) 57 | 58 | def log(prefix): 59 | def log_decorator(f): 60 | def wrapper(*args, **kw): 61 | print '[%s] %s()...' 
% (prefix, f.__name__) 62 | return f(*args, **kw) 63 | return wrapper 64 | return log_decorator 65 | 66 | @log('DEBUG') 67 | def test(): 68 | pass 69 | print test() 70 | 71 | import time 72 | 73 | def performance(unit): 74 | def per_decorator(f): 75 | def fn(*args,**kw): 76 | t1=time.time() 77 | r=f(*args,**kw) 78 | t2=time.time() 79 | t=(t2-t1)*1000 if unit=='ms'else(t2-t1) #这个地方有疑问 80 | print'call %s()in %f%s'%(f.__name__,t,unit) 81 | return r 82 | return fn 83 | return per_decorator 84 | 85 | @performance('ms') 86 | def factorial(n): 87 | return reduce(lambda x,y: x*y, range(1, n+1)) 88 | 89 | print factorial(10) 90 | 91 | #偏函数的意义减少函数默认参数的设置 functools.partial是偏函数的基本格式 92 | import functools 93 | 94 | sorted_ignore_case = functools.partial(sorted,cmp=lambda s1,s2:cmp(s1.upper(),s2.upper())) 95 | 96 | print sorted_ignore_case(['bob', 'about', 'Zoo', 'Credit']) 97 | 98 | #2.7使用3.几之后的功能使用 __future 99 | print 10/3 100 | 101 | 102 | print 10 / 3 103 | print 10 // 3 104 | #3.0之后unicode不需要加4 105 | 106 | s = 'am I an unicode?' 107 | print isinstance(s, unicode) -------------------------------------------------------------------------------- /pythonbasic/dictionary.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import jieba, os 3 | import codecs 4 | from gensim import corpora, models, similarities 5 | from pprint import pprint 6 | from collections import defaultdict 7 | import sys 8 | import pickle 9 | 10 | reload(sys) 11 | sys.setdefaultencoding('utf-8') 12 | 13 | 14 | def print_dict(dict): 15 | for key in dict: 16 | print type(key), key, str(dict[key]), 17 | print 18 | 19 | 20 | def test3(): 21 | '''' 22 | gensim学习之Dictionary 23 | ''' 24 | a = [['一','一','二'],['一','二','三']] 25 | b = ['一','一','三','四','四'] 26 | dictionary = corpora.Dictionary(a) 27 | print "########dictionary信息##########" 28 | print str(dictionary) # 29 | print "字典,{单词id,在多少文档中出现}" 30 | print dictionary.dfs #字典,{单词id,在多少文档中出现} 31 | print "文档数目" 32 | print dictionary.num_docs #文档数目 33 | print "dictionary.items()" 34 | print_dict(dict(dictionary.items())) # 35 | print "字典,{单词id,对应的词}" 36 | print_dict(dictionary.id2token) #字典,{单词id,对应的词} 37 | print "字典,{词,对应的单词id}" 38 | print_dict(dictionary.token2id) #字典,{词,对应的单词id} 39 | print "所有词的个数" 40 | print dictionary.num_pos #所有词的个数 41 | print "每个文件中不重复词个数的和" 42 | print dictionary.num_nnz #每个文件中不重复词个数的和 43 | print "########doc2bow##########" 44 | #dictionary.add_documents([b]) 45 | #allow_update->更新当前字典;return_missing->返回字典中不存在的词 46 | #result为b文章转换得到的词袋,列表[(单词id,词频)] 47 | result, missing = dictionary.doc2bow(b, allow_update=False, return_missing=True) 48 | print "词袋b,列表[(单词id,词频)]" 49 | print result 50 | print "不在字典中的词及其词频,字典[(单词,词频)]" 51 | print_dict(missing) 52 | print "########bow信息##########" 53 | for id, freq in result: 54 | print id, dictionary.id2token[id], freq 55 | print "########dictionary信息##########" 56 | #过滤文档频率大于no_below,小于no_above*num_docs的词 57 | dictionary.filter_extremes(no_below=1, no_above=0.5, keep_n=10) 58 | 59 | return 60 | 61 | test3() 62 | 63 | 64 | -------------------------------------------------------------------------------- /pythonbasic/huatu.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import pandas as pd 3 | import numpy as np 4 | 5 | 6 | fig=plt.figure() 7 | ax=fig.add_subplot(111) 8 | for i in range(1,6): 9 | for j in range(1,6): 10 | ax.scatter(i,j) 11 | 12 | ax.plot([0,5],[0,5],'k',lw=4) 13 | plt.show() 14 | 15 
--------------------------------------------------------------------------------
/pythonbasic/huatu.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import pandas as pd
3 | import numpy as np
4 | 
5 | 
6 | fig=plt.figure()
7 | ax=fig.add_subplot(111)
8 | for i in range(1,6):
9 |     for j in range(1,6):
10 |         ax.scatter(i,j)
11 | 
12 | ax.plot([0,5],[0,5],'k',lw=4)
13 | plt.show()
14 | 
15 | dates=pd.date_range('2/17/2017',periods=1000)
16 | nd=pd.DataFrame(np.random.randn(1000,4),index=dates,columns=['a','b','c','d'])
17 | print nd
18 | nd=nd.cumsum()
19 | plt.figure()
20 | nd.plot()
21 | plt.show()
22 | 
23 | t=np.arange(0.0,5.0,0.01)
24 | s=np.cos(2*np.pi*t)
25 | line,=plt.plot(t,s,lw=2)
26 | plt.annotate('local max',xy=(2,1),xytext=(3,1.5),arrowprops=dict(facecolor='black',shrink=0.05))
27 | plt.ylim(-2,2)
28 | 
29 | plt.show()
--------------------------------------------------------------------------------
/pythonbasic/ossys.py:
--------------------------------------------------------------------------------
1 | import os
2 | 
3 | os.system('ping www.baidu.com')
4 | 
--------------------------------------------------------------------------------
/pythonbasic/pandasql.py:
--------------------------------------------------------------------------------
1 | 
2 | from pandasql import sqldf, load_meat, load_births
3 | from pandasql import *
4 | 
5 | 
6 | pysqldf = lambda q: sqldf(q, globals())
7 | meat = load_meat()
8 | births = load_births()
9 | print pysqldf("SELECT * FROM meat LIMIT 10;").head()
--------------------------------------------------------------------------------
/pythonbasic/pythonbasic.py:
--------------------------------------------------------------------------------
1 | #-- coding:utf-8 --#
2 | 
3 | # list slicing
4 | 
5 | l=range(10)
6 | print l
7 | l2=l[:-9-1:-1]
8 | print l2
9 | l1=l[9:2:-2]  # the third slice argument is the step
10 | print l1
11 | 
12 | print '\\'   # double-quoted strings behave the same way
13 | print r'\\'  # raw strings have no escape sequences
14 | print u'\\'  # u'' is a unicode literal, a plain '' literal is a byte (ASCII) string
15 | print '\n'
16 | print '\t'
17 | 
18 | a = 'python'
19 | print 'hello,', a and 'world'  # for a and b: if a is falsy the whole expression is falsy, so a is returned; if a is truthy the result depends on b, so b is returned
20 | b = ''
21 | print 'hello,', b or 'world'  # for a or b: if a is truthy the whole expression is truthy, so a is returned; if a is falsy the result depends on b, so b is returned
22 | 
23 | c=['a','b','c']
24 | for i,j in enumerate(c):
25 |     print i,'-',j
26 | for i ,j in zip(range(1,len(c)+1),c):
27 |     print i ,'-',j
28 | 
29 | print filter(lambda s:s and len(s.strip())>0, ['test', None, '', 'str', ' ', 'END'])  # in a lambda, the parameters come before the colon and the expression after it is the return value
30 | #print type(None)
31 | # Python's map function
32 | def format_name(s):
33 |     return s[0].upper()+s[1:].lower()
34 | 
35 | print map(format_name, ['adam', 'LISA', 'barT'])  # the function returns the transformed element
36 | # Python's filter function
37 | import math
38 | def is_sqr(x):
39 |     return x and math.sqrt(x)%1==0
40 | print is_sqr(100)
41 | print filter(is_sqr, range(1, 101))  # the function returns a boolean
42 | # Python's reduce function
43 | def prod(x, y):
44 |     return x*y
45 | 
46 | print reduce(prod, [2, 4, 5, 7, 12])
47 | 
48 | 
49 | str="abcd"
50 | l=list(str)
51 | print l
52 | 
53 | a="a"
54 | b="b"
55 | #print str(a)  # raises an error: str was rebound to "abcd" above, so it is no longer callable
56 | c=100
57 | a=12
58 | print chr(a)
59 | a='10'
60 | print int(a)
61 | #print a+b+chr(c)
62 | 
63 | #print int(a)+int(b)+c  # raises an error: int("b") is not a valid conversion
64 | i=int(raw_input('please enter i: '))
65 | l=range(i)
66 | for i in range(i):
67 |     l[i]=i+2
68 | 
69 | print l
70 | fo=open('../data/people.txt')
71 | print type(fo)
72 | for i in fo:
73 |     i.strip(',')  # strip(',') removes leading/trailing commas, but the result is not assigned here
74 |     i=i.split(',')
75 |     print type(i)
76 |     for i in i:
77 |         print i
78 | 
79 | 
80 | 
81 | def temp_convert(var):
82 |     try:
83 |         return int(var)
84 |     except ValueError, Argument:
85 |         print "the argument does not contain a number\n", Argument
86 | 
87 | # call the function
88 | temp_convert("xyz")
89 | 
90 | a=[[1,2,3],[4,5,6]]
91 | print type(a)
92 | 
93 | dataSet = [[1, 1, 'yes'],
94 |            [1, 1, 'yes'],
95 |            [1, 0, 'no'],
96 |            [0, 1, 'no'],
97 |            [0, 1, 'no']]
98 | print type(dataSet)
99 | 
100 | print range(1,5)
101 | print range(6,10)
102 | print range(1,5)+range(6,10)  # adding lists does not add element-wise, it concatenates them into one longer list
103 | 
104 | i='####口疮####飞滋####口炎疮####复发性口疮####复发性'
105 | i=i.strip('####')
106 | print i
107 | j='###我们###你们'
108 | j=j.strip('####')
109 | print j
110 | 
111 | l=[2,5,4,3]
112 | l.sort()
113 | print l
114 | l=[2,5,4,3]
115 | sorted(l)  # sorted() returns a new list; l itself is left unchanged
116 | print l
117 | m={'a':1,'b':2}
118 | print m['a']
119 | #print m[2]  # a dict is looked up by key, it cannot be indexed by value or position
120 | 
121 | l=[2,5,4,3]
122 | print l.index(2)
123 | import numpy.linalg as la
124 | import numpy as np
125 | a=np.array([1,2,3])
126 | b=np.array([2,3,5])
127 | print b-a
128 | print la.norm(b-a)
129 | import math
130 | 
131 | print math.sqrt(6)
132 | 
133 | l=[1,2,3]
134 | print l[-1]
135 | str='你好吗?\n' \
136 |     '我很好' \
137 |     ' ' \
138 |     '你好吗?'
139 | 
140 | if '吗' in str or '呀' in str or '?' in str:
141 |     print str
142 | 
143 | import time
144 | a = "2016-04-19 17:37:01"
145 | c="2016-04-19 17:37:46"
146 | b=time.mktime(time.strptime(a,'%Y-%m-%d %H:%M:%S'))
147 | d=time.mktime(time.strptime(c,'%Y-%m-%d %H:%M:%S'))
148 | print b
149 | print d
150 | 
151 | import numpy as np
152 | a=np.array(([1,2,3,4]))
153 | print np.median(a)  # median
154 | print np.percentile(a,75)  # 75th percentile
155 | 
156 | a='a我'
157 | print len(a)
158 | 
159 | str='我很开心'
160 | if '?' in str:
161 |     print str
162 | 
163 | for i in range(6):
164 |     print i
165 |     if i==3:
166 |         i=i+2
167 |         print i  # rebinding i inside the body does not change the range iteration
168 | 
169 | 
--------------------------------------------------------------------------------
/pythonbasic/randomtest.py:
--------------------------------------------------------------------------------
1 | #-*- coding:utf-8 -*-
2 | import numpy
3 | import random
4 | import pandas as pd
5 | 
6 | print numpy.random.randn(6,5)  # matrix drawn from a standard normal distribution (mean 0)
7 | print numpy.random.rand(3,4)   # matrix drawn from a uniform distribution on [0, 1)
8 | 
9 | print random.random()
10 | 
11 | print random.uniform(10, 20)
12 | 
13 | print random.randint(12, 20)
14 | 
15 | print random.choice('abcdefg&#%^*f')
16 | 
17 | print random.sample('abcdefghij',3)  # pick a given number of characters
18 | 
19 | import string
20 | 
21 | print string.join(random.sample(['a','b','c','d','e','f','g','h','i','j'], 3)).replace(" ","")
22 | 
23 | print random.choice ( ['apple', 'pear', 'peach', 'orange', 'lemon'] )
24 | 
25 | items = [1, 2, 3, 4, 5, 6]
26 | random.shuffle(items)
27 | print items
28 | 
--------------------------------------------------------------------------------
/pythondata/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hailiang-wang/SmartQA-System/db659597eea7ad5759e183d8affa9053e4076a34/pythondata/__init__.py
--------------------------------------------------------------------------------
/pythondata/datasets.py:
--------------------------------------------------------------------------------
1 | from sklearn.datasets import load_digits
2 | import matplotlib.pyplot as plt
3 | import numpy as np
4 | 
5 | digits=load_digits()
6 | print digits.data.shape
7 | #print digits
8 | #digits.reshape()
9 | np.savetxt("filename.txt", digits.data)  # savetxt needs the 2-D array, not the whole Bunch object
10 | 
11 | plt.gray()
12 | plt.matshow(digits.images[0])
13 | plt.show()
14 | 
15 | 
16 | 
--------------------------------------------------------------------------------
/sparkml/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hailiang-wang/SmartQA-System/db659597eea7ad5759e183d8affa9053e4076a34/sparkml/__init__.py
--------------------------------------------------------------------------------
/sparkml/mlkmeans.py:
--------------------------------------------------------------------------------
1 | #-- coding:utf-8 --#
2 | from pyspark.ml.clustering import KMeans
3 | from pyspark.ml.linalg import Vectors
4 | from pyspark.sql import SparkSession
5 | from pyspark.sql import Row
6 | import pandas as pd
7 | spark=SparkSession \
8 |     .builder \
9 |     .appName("kmeans") \
10 |     .getOrCreate()
11 | 
12 | inputfile='../data/sampling.xlsx'
13 | outputfile='../data/outputfile.xlsx'
14 | data=pd.read_excel(inputfile)
15 | 
16 | print spark.createDataFrame(data).collect()  # createDataFrame turns a pandas.DataFrame into a Spark DataFrame; df.toPandas() goes the other way
17 | # a DataFrame can also be built from a list of tuples or from dicts
18 | 
19 | data=spark.read.format('csv').load('../data/sampling.csv')  # read the file directly, as an alternative to createDataFrame
20 | data.show()
21 | #dataset=spark.read.format('libsvm').load('../data/sample_kmeans_data.txt')
22 | data=spark.sparkContext.textFile('../data/sampling.csv')
23 | parts=data.map(lambda l: l.split(","))
24 | dataset = parts.map(lambda p: Row(label=p[0], features=Vectors.dense([int(x) for x in p[1:26]])))  # in the lambda, the parameter is before the colon and the expression after it is the return value
25 | # an RDD is a collection of objects; here each object is a Row holding a label plus a dense feature vector
26 | 
27 | # Infer the schema, and register the DataFrame as a table.
28 | #schemaPeople = spark.createDataFrame(people)  # one of the two ways of producing a DataFrame (see sparkSession.py)
29 | datasets=spark.createDataFrame(dataset)
30 | 
31 | datasets.show()
32 | 
33 | # Trains a k-means model.
34 | kmeans = KMeans().setK(2).setSeed(1)
35 | model = kmeans.fit(datasets)
36 | 
37 | # Evaluate clustering by computing Within Set Sum of Squared Errors.
38 | wssse = model.computeCost(datasets)
39 | print("Within Set Sum of Squared Errors = " + str(wssse))
40 | 
41 | # Shows the result.
42 | centers = model.clusterCenters()
43 | 
44 | df=pd.DataFrame(centers)
45 | print df.dtypes
46 | df.to_excel(outputfile)
47 | 
48 | print("Cluster Centers: ")
49 | for center in centers:
50 |     print(center)
51 | 
52 | 
--------------------------------------------------------------------------------
/sparkml/sparkSession.py:
--------------------------------------------------------------------------------
1 | #-- coding:utf-8 --#
2 | from pyspark.sql import SparkSession
3 | from pyspark.sql import Row
4 | from pyspark import SparkContext
5 | spark=SparkSession \
6 |     .builder \
7 |     .appName("sparkSession") \
8 |     .getOrCreate()
9 | #.config("spark.some.config.option", "some-value")  # optional; when it is not set the defaults are used
10 | df=spark.read.json('../data/people.json')
11 | 
12 | df.show()
13 | print '-------'
14 | df.take(1)  # take(1) returns the first row as a list of Row objects; it does not print anything by itself
15 | print '--------'
16 | df.printSchema()
17 | df.select(df['name'],df['age']).show()
18 | #df.select(df['name'], df['age'] ).show()
19 | 
20 | df.filter(df['age']>20).show()
21 | df.groupby(df['age']).count().show()
22 | 
23 | # to run SQL the DataFrame first has to be registered as a view, as below
24 | df.createOrReplaceTempView('people')
25 | sqldf=spark.sql('select * from people')
26 | sqldf.show()
27 | # create a global temporary view
28 | 
29 | # df.createGlobalTempView('people')
30 | # spark.sql("select * from global_temp.people").show()
31 | # note: global temporary views live in the global_temp database, so the table name has to be qualified as global_temp.people
32 | 
33 | # interaction between DataFrames and RDDs
34 | # RDDs are still a data structure worth learning
35 | #sc=SparkContext(appName="rddtest")  # no need to build a separate SparkContext; reuse the one owned by the session
36 | sc=spark.sparkContext
37 | lines=sc.textFile('../data/people.txt')
38 | parts=lines.map(lambda l: l.split(","))
39 | people = parts.map(lambda p: Row(name=p[0], age=int(p[1])))  # in the lambda, the parameter is before the colon and the expression after it is the return value
40 | # an RDD is a collection of objects, while a DataFrame also carries a schema of named columns
41 | 
42 | # Infer the schema, and register the DataFrame as a table.
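# (Added aside, a minimal sketch.) Besides an RDD of Rows, createDataFrame also accepts a
# list of tuples or a pandas.DataFrame (a bare numpy ndarray usually has to go through
# pandas or a plain list first). The column names below are made up for illustration and
# assume pandas/numpy are installed:
import numpy as np
import pandas as pd
pdf = pd.DataFrame(np.random.randn(3, 2), columns=["x", "y"])
spark.createDataFrame(pdf).show()                                          # pandas -> Spark DataFrame
spark.createDataFrame([("Tom", 10), ("Ann", 12)], ["name", "age"]).show()  # list of tuples plus column names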
43 | #schemaPeople = spark.createDataFrame(people)  # the other of the two ways of producing a DataFrame
44 | schemaPeople=people.toDF()
45 | schemaPeople.createOrReplaceTempView("people")
46 | 
47 | # SQL can be run over DataFrames that have been registered as a table.
48 | teenagers = spark.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19")
49 | 
50 | # The results of SQL queries are Dataframe objects.
51 | # rdd returns the content as an :class:`pyspark.RDD` of :class:`Row`.
52 | teenNames = teenagers.rdd.map(lambda p: "Name: " + p.name).collect()
53 | for name in teenNames:
54 |     print(name)
55 | 
56 | 
57 | # can a DataFrame be produced from an ndarray, or converted from a pandas DataFrame/Series? (see the sketch above)
58 | # getting data into the shape Spark's algorithms expect is the key problem to solve
59 | 
60 | df = spark.read.load("../data/users.parquet")
61 | #df.select("name", "favorite_color").write.save("namesAndFavColors.parquet")
62 | df.show()
63 | 
64 | peopleDF = spark.read.json("../data/people.json")
65 | 
66 | # DataFrames can be saved as Parquet files, maintaining the schema information.
67 | peopleDF.write.parquet("../data/people.parquet")
68 | 
69 | # Read in the Parquet file created above.
70 | # Parquet files are self-describing so the schema is preserved.
71 | # The result of loading a parquet file is also a DataFrame.
72 | parquetFile = spark.read.parquet("../data/people.parquet")
73 | 
74 | # Parquet files can also be used to create a temporary view and then used in SQL statements.
75 | parquetFile.createOrReplaceTempView("parquetFile")
76 | teenagers = spark.sql("SELECT name FROM parquetFile WHERE age >= 13 AND age <= 19")
77 | 
--------------------------------------------------------------------------------
/sparkml/sparkio.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hailiang-wang/SmartQA-System/db659597eea7ad5759e183d8affa9053e4076a34/sparkml/sparkio.py
--------------------------------------------------------------------------------
/sparkml/sparklda.py:
--------------------------------------------------------------------------------
1 | #--coding:utf-8 --#
2 | from pyspark.ml.clustering import LDA
3 | from pyspark.sql import SparkSession
4 | spark=SparkSession \
5 |     .builder \
6 |     .appName('sparklda') \
7 |     .getOrCreate()
8 | 
9 | # libsvm is a sparse text format: each line is "label index1:value1 index2:value2 ..."
10 | # a plain txt/csv file works too, but then the label and features columns have to be assembled by hand
11 | # Loads data.
12 | dataset = spark.read.format("libsvm").load("../data/sample_lda_libsvm_data.txt")
13 | 
14 | # Trains a LDA model.
15 | lda = LDA(k=10, maxIter=10)
16 | model = lda.fit(dataset)
17 | 
18 | ll = model.logLikelihood(dataset)
19 | lp = model.logPerplexity(dataset)
20 | print("The lower bound on the log likelihood of the entire corpus: " + str(ll))
21 | print("The upper bound on perplexity: " + str(lp))
22 | 
23 | # Describe topics.
24 | topics = model.describeTopics(3)
25 | print("The topics described by their top-weighted terms:")
26 | topics.show(truncate=False)
27 | 
28 | # Shows the result
29 | transformed = model.transform(dataset)
30 | transformed.show(truncate=False)
--------------------------------------------------------------------------------
/sparkml/sparklr.py:
--------------------------------------------------------------------------------
1 | #-- coding:utf-8 --#
2 | from pyspark.ml.linalg import Vectors
3 | from pyspark.ml.classification import LogisticRegression
4 | from pyspark.sql import SparkSession
5 | 
6 | spark=SparkSession \
7 |     .builder \
8 |     .appName('lr') \
9 |     .getOrCreate()
10 | 
11 | # Prepare training data from a list of (label, features) tuples.
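# (Added aside, a minimal sketch.) DataFrame-based estimators look for a vector column named
# "features" and a double column named "label" by default; both names are ordinary Params
# (featuresCol / labelCol) and can be changed per estimator. When the data starts out as plain
# numeric columns, VectorAssembler can build the vector column. The column names x1/x2 below
# are made up for illustration:
from pyspark.ml.feature import VectorAssembler
raw = spark.createDataFrame([(1.0, 0.0, 1.1), (0.0, 2.0, 1.0)], ["label", "x1", "x2"])
assembler = VectorAssembler(inputCols=["x1", "x2"], outputCol="features")
assembler.transform(raw).select("label", "features").show()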
12 | training = spark.createDataFrame([
13 |     (1.0, Vectors.dense([0.0, 1.1, 0.1])),
14 |     (0.0, Vectors.dense([2.0, 1.0, -1.0])),
15 |     (0.0, Vectors.dense([2.0, 1.3, 1.0])),
16 |     (1.0, Vectors.dense([0.0, 1.2, -0.5]))], ["label", "features"])
17 | # DataFrame-based Spark ML estimators expect "features" and "label" columns
18 | # different algorithms can be pointed at differently named columns via their labelCol/featuresCol params (see the sketch above)
19 | 
20 | # Create a LogisticRegression instance. This instance is an Estimator.
21 | lr = LogisticRegression(maxIter=10, regParam=0.01)
22 | # Print out the parameters, documentation, and any default values.
23 | print("LogisticRegression parameters:\n" + lr.explainParams() + "\n")
24 | 
25 | # Learn a LogisticRegression model. This uses the parameters stored in lr.
26 | model1 = lr.fit(training)
27 | 
28 | # Since model1 is a Model (i.e., a transformer produced by an Estimator),
29 | # we can view the parameters it used during fit().
30 | # This prints the parameter (name: value) pairs, where names are unique IDs for this
31 | # LogisticRegression instance.
32 | print("Model 1 was fit using parameters: ")
33 | print(model1.extractParamMap())
34 | 
35 | # We may alternatively specify parameters using a Python dictionary as a paramMap
36 | paramMap = {lr.maxIter: 20}
37 | paramMap[lr.maxIter] = 30  # Specify 1 Param, overwriting the original maxIter.
38 | paramMap.update({lr.regParam: 0.1, lr.threshold: 0.55})  # Specify multiple Params.
39 | 
40 | # You can combine paramMaps, which are python dictionaries.
41 | paramMap2 = {lr.probabilityCol: "myProbability"}  # Change output column name
42 | paramMapCombined = paramMap.copy()
43 | paramMapCombined.update(paramMap2)
44 | 
45 | # Now learn a new model using the paramMapCombined parameters.
46 | # paramMapCombined overrides all parameters set earlier via lr.set* methods.
47 | model2 = lr.fit(training, paramMapCombined)
48 | print("Model 2 was fit using parameters: ")
49 | print(model2.extractParamMap())
50 | 
51 | # Prepare test data
52 | test = spark.createDataFrame([
53 |     (1.0, Vectors.dense([-1.0, 1.5, 1.3])),
54 |     (0.0, Vectors.dense([3.0, 2.0, -0.1])),
55 |     (1.0, Vectors.dense([0.0, 2.2, -1.5]))], ["label", "features"])
56 | 
57 | # Make predictions on test data using the Transformer.transform() method.
58 | # LogisticRegression.transform will only use the 'features' column.
59 | # Note that model2.transform() outputs a "myProbability" column instead of the usual
60 | # 'probability' column since we renamed the lr.probabilityCol parameter previously.
61 | prediction = model2.transform(test)
62 | result = prediction.select("features", "label", "myProbability", "prediction") \
63 |     .collect()
64 | 
65 | for row in result:
66 |     print("features=%s, label=%s -> prob=%s, prediction=%s"
67 |           % (row.features, row.label, row.myProbability, row.prediction))
--------------------------------------------------------------------------------
/sparkml/sparkpipline.py:
--------------------------------------------------------------------------------
1 | #-- coding:utf-8 --#
2 | from pyspark.ml import Pipeline
3 | from pyspark.ml.classification import LogisticRegression
4 | from pyspark.ml.feature import HashingTF, Tokenizer
5 | from pyspark.sql import SparkSession
6 | 
7 | spark=SparkSession \
8 |     .builder \
9 |     .appName('pipline') \
10 |     .getOrCreate()
11 | 
12 | # Prepare training documents from a list of (id, text, label) tuples.
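# (Added aside.) The same (id, text, label) columns could also come from a file rather than the
# inline list below; a hedged sketch with a made-up path, assuming a CSV file with a header row:
#   df = spark.read.csv("../data/docs.csv", header=True, inferSchema=True)
#   training = df.select("id", "text", df["label"].cast("double"))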
13 | # a list of tuples
14 | # the SparkSession itself provides the old sqlContext functionality
15 | training = spark.createDataFrame([
16 |     (0, "a b c d e spark", 1.0),
17 |     (1, "b d", 0.0),
18 |     (2, "spark f g h", 1.0),
19 |     (3, "hadoop mapreduce", 0.0)
20 | ], ["id", "text", "label"])
21 | 
22 | # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
23 | tokenizer = Tokenizer(inputCol="text", outputCol="words")
24 | tokenizerdata=tokenizer.transform(training)
25 | tokenizerdata.show()
26 | hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
27 | hashingTFdata=hashingTF.transform(tokenizerdata)
28 | hashingTFdata.select('features').show()
29 | lr = LogisticRegression(maxIter=10, regParam=0.001)
30 | lr.fit(hashingTFdata)  # fitted here only to inspect the intermediate data; the pipeline below refits all stages
31 | pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
32 | 
33 | # Fit the pipeline to training documents.
34 | model = pipeline.fit(training)
35 | # the estimator stage also relies on the expected DataFrame columns (words -> features, plus label)
36 | # Prepare test documents, which are unlabeled (id, text) tuples.
37 | test = spark.createDataFrame([
38 |     (4, "spark i j k"),
39 |     (5, "l m n"),
40 |     (6, "spark hadoop spark"),
41 |     (7, "apache hadoop")
42 | ], ["id", "text"])
43 | 
44 | # Make predictions on test documents and print columns of interest.
45 | prediction = model.transform(test)
46 | print prediction
47 | selected = prediction.select("id", "text", "probability", "prediction")
48 | selected.show()
49 | print selected.collect()  # collect() returns the DataFrame contents as a list of Row objects
50 | 
51 | for row in selected.collect():
52 |     print row
53 |     rid, text, prob, prediction = row
54 |     print("(%d, %s) --> prob=%s, prediction=%f" % (rid, text, str(prob), prediction))
55 | 
56 | 
57 | # TODO: take a closer look at spark.read and DataFrames
58 | # question 1: why does writing the results out raise an error?
59 | # question 2: how to reshape a DataFrame into the (label, features) structure an algorithm needs; both column names can be configured
60 | # could an RDD map() be used to reshape the rows, and then Row objects used to assign the label and features?
--------------------------------------------------------------------------------
/sparkml/sparkpipline2.py:
--------------------------------------------------------------------------------
1 | #-- coding:utf-8 --#
2 | from pyspark.ml import Pipeline
3 | from pyspark.ml.classification import LogisticRegression
4 | from pyspark.ml.feature import HashingTF, Tokenizer
5 | from pyspark.sql import SparkSession
6 | 
7 | spark=SparkSession \
8 |     .builder \
9 |     .appName('pipline') \
10 |     .getOrCreate()
11 | 
12 | # Prepare training documents from a list of (id, text, label) tuples.
13 | # a list of tuples
14 | training = spark.createDataFrame([
15 |     (0, "a b c d e spark", 1.0),
16 |     (1, "b d", 0.0),
17 |     (2, "spark f g h", 1.0),
18 |     (3, "hadoop mapreduce", 0.0)
19 | ], ["id", "text", "label"])
20 | 
21 | # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
22 | tokenizer = Tokenizer(inputCol="text", outputCol="words")
23 | hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
24 | lr = LogisticRegression(maxIter=10, regParam=0.001)
25 | pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
26 | 
27 | # Fit the pipeline to training documents.
28 | model = pipeline.fit(training)
29 | 
30 | # Prepare test documents, which are unlabeled (id, text) tuples.
31 | test = spark.createDataFrame([
32 |     (4, "spark i j k"),
33 |     (5, "l m n"),
34 |     (6, "spark hadoop spark"),
35 |     (7, "apache hadoop")
36 | ], ["id", "text"])
37 | 
38 | # Make predictions on test documents and print columns of interest.
39 | prediction = model.transform(test)
40 | print prediction
41 | selected = prediction.select("id", "text", "probability", "prediction")
42 | selected.show()
43 | print selected.collect()  # collect() returns the DataFrame contents as a list of Row objects
44 | 
45 | for row in selected.collect():
46 |     print row
47 |     rid, text, prob, prediction = row
48 |     print("(%d, %s) --> prob=%s, prediction=%f" % (rid, text, str(prob), prediction))
49 | 
50 | 
51 | # TODO: take a closer look at spark.read and DataFrames
--------------------------------------------------------------------------------
/sparkml/sparktf-itf.py:
--------------------------------------------------------------------------------
1 | #-- coding:utf-8 --#
2 | from pyspark.ml.feature import HashingTF, IDF, Tokenizer
3 | from pyspark.sql import SparkSession
4 | 
5 | spark=SparkSession \
6 |     .builder \
7 |     .appName('sparktfitf') \
8 |     .getOrCreate()
9 | 
10 | sentenceData = spark.createDataFrame([
11 |     (0.0, "Hi I heard about Spark"),
12 |     (0.0, "I wish Java could use case classes"),
13 |     (1.0, "Logistic regression models are neat")
14 | ], ["label", "sentence"])
15 | 
16 | tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
17 | wordsData = tokenizer.transform(sentenceData)
18 | wordsData.show()
19 | 
20 | hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
21 | featurizedData = hashingTF.transform(wordsData)
22 | featurizedData.show()
23 | # alternatively, CountVectorizer can also be used to get term frequency vectors
24 | 
25 | idf = IDF(inputCol="rawFeatures", outputCol="features")
26 | idfModel = idf.fit(featurizedData)  # IDF has to be fit; the other stages here are pure transformations
27 | rescaledData = idfModel.transform(featurizedData)
28 | 
29 | rescaledData.select("label", "features").show()
--------------------------------------------------------------------------------
/sparkml/tokenizer.py:
--------------------------------------------------------------------------------
1 | from pyspark.ml.feature import Tokenizer, RegexTokenizer
2 | from pyspark.sql.functions import col, udf
3 | from pyspark.sql.types import IntegerType
4 | from pyspark.sql import SparkSession
5 | 
6 | spark=SparkSession \
7 |     .builder \
8 |     .appName('tokenizerchinese') \
9 |     .getOrCreate()
10 | sentenceDataFrame = spark.createDataFrame([
11 |     (0, "Hi I heard about Spark"),
12 |     (1, "I wish Java could use case classes"),
13 |     (2, "Logistic,regression,models,are,neat")
14 | ], ["id", "sentence"])
15 | 
16 | tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
17 | 
18 | regexTokenizer = RegexTokenizer(inputCol="sentence", outputCol="words", pattern="\\W")
19 | # alternatively, pattern="\\w+", gaps(False)
20 | 
21 | countTokens = udf(lambda words: len(words), IntegerType())
22 | 
23 | tokenized = tokenizer.transform(sentenceDataFrame)
24 | tokenized.select("sentence", "words")\
25 |     .withColumn("tokens", countTokens(col("words"))).show(truncate=False)
26 | 
27 | regexTokenized = regexTokenizer.transform(sentenceDataFrame)
28 | regexTokenized.select("sentence", "words") \
29 |     .withColumn("tokens", countTokens(col("words"))).show(truncate=False)
--------------------------------------------------------------------------------
/text_analyse2/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hailiang-wang/SmartQA-System/db659597eea7ad5759e183d8affa9053e4076a34/text_analyse2/__init__.py
--------------------------------------------------------------------------------
/text_analyse2/extract.txt:
--------------------------------------------------------------------------------
1 | 圣诞消费旺季即将到来,不得不推迟出货
--------------------------------------------------------------------------------
/text_analyse2/jiebatest.py:
--------------------------------------------------------------------------------
1 | #-*- coding:utf-8 -*-
2 | 
3 | 
4 | # jieba word segmentation
5 | import jieba
6 | import jieba.analyse
7 | 
8 | seg_list = jieba.cut("我来到北京清华大学",cut_all=True)
9 | print "Full Mode:", "/ ".join(seg_list)  # full mode
10 | 
11 | seg_list = jieba.cut("我来到北京清华大学",cut_all=False)
12 | print "Default Mode:", "/ ".join(seg_list)  # accurate mode
13 | 
14 | seg_list = jieba.cut("他来到了网易杭研大厦")  # accurate mode is the default
15 | print ", ".join(seg_list)
16 | 
17 | seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造")  # search-engine mode
18 | print ", ".join(seg_list)
19 | 
20 | 
21 | w= open('result.txt','w')
22 | 
23 | s= '圣诞消费旺季即将到来,不得不推迟出货'
24 | content=open('extract.txt').read()
25 | 
26 | seglist = list(jieba.cut(s,cut_all=False))
27 | print ",".join(seglist)
28 | for i in seglist:
29 |     w.write(i.encode('utf-8'))  # or w.write(i.encode('gbk'))
30 |     w.write(',')
31 | w.close()
32 | 
33 | # adding a custom user dictionary to jieba
34 | #jieba.load_userdict(file_name)  # file_name is the path of the custom dictionary
35 | 
36 | import sys
37 | sys.path.append("../")
38 | import jieba
39 | jieba.load_userdict("userdict.txt")
40 | import jieba.posseg as pseg
41 | 
42 | test_sent = "李小福是创新办主任也是云计算方面的专家;"
43 | test_sent += "例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类型"
44 | words = jieba.cut(test_sent)
45 | print type(words)
46 | 
47 | for w in words:
48 |     print w
49 | 
50 | 
51 | result = pseg.cut(test_sent)
52 | 
53 | for w in result:
54 |     print w.word, "/", w.flag, ", ",
55 | 
56 | 
57 | print "\n========"
58 | 
59 | terms = jieba.cut('easy_install is great')
60 | for t in terms:
61 |     print t
62 | print '-------------------------'
63 | terms = jieba.cut('python 的正则表达式是好用的')
64 | for t in terms:
65 |     print t
66 | 
67 | import jieba.analyse
68 | strx = '网络让我们之间的距离变的如此之近,也同时让我们变的如此遥远。世界上最远的距离不是南极到北极,也不是喜马拉雅之巅到马里亚纳之渊;而是相对而坐,却各自忙着刷手机。暂别网络世界,去和爱人道一句早安,去和朋友聊一夜往事,去和家人吃一顿饭,其实也是挺好的'
69 | s= '结巴分词是一个Python下的中文分词组件'
70 | rt = jieba.analyse.extract_tags(strx,5)
71 | print jieba.analyse.extract_tags(s,2)  # printing the list shows escaped unicode; this is not an encoding problem, the keywords are simply inside a list, so print them one by one (as below) to see the characters
72 | for r in rt:
73 |     print r
--------------------------------------------------------------------------------
/text_analyse2/userdict.txt:
--------------------------------------------------------------------------------
1 | 李小福 2 nr
2 | 创新办 3 i
3 | easy_install 3 eng
4 | 好用 300
5 | 韩玉赏鉴 3 nz
--------------------------------------------------------------------------------