├── BloodTestReportOCR ├── README.md ├── bloodtestdata.json ├── caffe_predict.py ├── checkpoint ├── classifier.py ├── config.py ├── digits ├── imageFilter.py ├── imgproc.py ├── lenet.prototxt ├── lenet_iter_800.caffemodel ├── model.ckpt.data-00000-of-00001 ├── model.ckpt.index ├── nn_model │ ├── checkpoint │ ├── model.ckpt.data-00000-of-00001 │ └── model.ckpt.index ├── origin_pics │ ├── bloodtestreport1.jpg │ ├── bloodtestreport2.jpg │ ├── bloodtestreport3.jpg │ ├── bloodtestreport4.jpg │ ├── bloodtestreport5.jpg │ ├── bloodtestreport6.jpg │ ├── bloodtestreport7.jpg │ └── region.jpg ├── pHash.py ├── pd_predict.py ├── rnn_model │ ├── rnn_age_model │ │ ├── checkpoint │ │ ├── model.ckpt.data-00000-of-00001 │ │ ├── model.ckpt.index │ │ └── model.ckpt.meta │ └── rnn_sex_model │ │ ├── checkpoint │ │ ├── model.ckpt.data-00000-of-00001 │ │ ├── model.ckpt.index │ │ ├── model.ckpt.meta │ │ └── rnn_age_model │ │ ├── checkpoint │ │ ├── model.ckpt.data-00000-of-00001 │ │ ├── model.ckpt.index │ │ └── model.ckpt.meta ├── rnn_predict.py ├── static │ ├── index.html │ └── index_with_fileinput_plugin.html ├── temp_pics │ └── README.md ├── tf_predict.py └── view.py ├── Caffe ├── README.md ├── caffe_sex_train_predict.py ├── config.prototxt ├── draw_net.py ├── lenet_train.prototxt └── model_prod_prototxt ├── DigitRecogn ├── README.md ├── index.html ├── neural_network_design.py ├── ocr.js ├── ocr.py └── server.py ├── Keras ├── .gitignore ├── KerasDistinguishAge.py ├── README.md ├── gender_age_predict_cnn.py ├── kerashandwritetest.py └── train.py ├── LICENSE ├── MxNet └── README.md ├── PaddlePaddle ├── README.md ├── __init__.py ├── dataprovider.py ├── predict_age.sh ├── predict_sex.sh ├── prediction.py ├── prediction_age.py ├── prediction_sex.py ├── preprocess.py ├── preprocess.sh ├── test.bmp ├── test.list ├── train.list ├── train.sh ├── train_age.sh ├── train_sex.sh ├── trainer_config_age.py ├── trainer_config_sex.py └── vgg.py ├── README.md ├── Spark ├── BloodTestReportDeepLearning │ ├── BTR_binary_classification.py │ ├── BTR_decision_tree.py │ ├── BTR_gradient_boosting.py │ ├── BloodTestReportbyLR.py │ ├── BloodTestReportbyNB.py │ ├── BloodTestReportbyRF.py │ ├── BloodTestReportbySVM.py │ ├── README.md │ ├── data_set.csv │ ├── dataformat.py │ └── spark单机安装15122016.md ├── DigitRecogn_Spark │ ├── Readme.md │ ├── index.html │ ├── ocr.js │ └── server.py └── README.md ├── TensorFlow ├── LSTM.py ├── README.md ├── age_predict.py ├── agepredict_v2.0.py ├── predict.csv ├── rnn.py ├── sex_predict.py └── train.csv ├── Traindata.py ├── dealdata.py ├── matlab ├── nn │ ├── create_nn.m │ ├── network_hit139.mat │ ├── readme.md │ └── test_nn.m └── svm_with_pca │ ├── readme.md │ └── svm_with_pca.m ├── sklearn ├── .idea │ ├── bloodpredict.iml │ ├── misc.xml │ ├── modules.xml │ └── workspace.xml ├── README.md ├── age_predict.py ├── bloodpredict.py ├── download.sh └── gender_predict.py └── weixin ├── README.md ├── reply_text.xml └── wx.py /BloodTestReportOCR/README.md: -------------------------------------------------------------------------------- 1 | 2 | # 血常规检验报告OCR 3 | 4 | 5 | 6 | ## 运行环境 7 | 8 | ``` 9 | # 安装numpy, 10 | sudo apt-get install python-numpy # http://www.numpy.org/ 11 | # 安装opencv 12 | sudo apt-get install python-opencv # http://opencv.org/ 13 | 14 | ##安装OCR和预处理相关依赖 15 | sudo apt-get install tesseract-ocr 16 | sudo pip install pytesseract 17 | sudo apt-get install python-tk 18 | sudo pip install pillow 19 | 20 | # 安装Flask框架、mongo 21 | sudo pip install Flask 22 | sudo apt-get install mongodb # 如果找不到可以先sudo apt-get 
update 23 | sudo service mongodb started 24 | sudo pip install pymongo 25 | ``` 26 | 27 | ## 运行 28 | 29 | ``` 30 | cd BloodTestReportOCR 31 | python view.py # upload图像,在浏览器打开http://yourip:8080 32 | 33 | ``` 34 | 35 | ## view.py 36 | 37 | Web 端上传图片到服务器,存入mongodb并获取oid,稍作修整,希望能往REST架构设计,目前还不完善; 38 | 前端采用了vue.js, mvvm模式。写了两个版本,一个是index.html无插件,另一个使用了bootstrap-fileinput插件,有点问题; 39 | 40 | ## imageFilter.py 41 | 对图像透视裁剪和OCR进行了简单的封装,以便于模块间的交互,规定适当的接口 42 | ``` 43 | imageFilter = ImageFilter() # 可以传入一个opencv格式打开的图片 44 | 45 | num = 22 46 | print imageFilter.ocr(num) 47 | ``` 48 | 49 | #### ocr函数 - 模块主函数返回识别数据 50 | 51 | 用于对img进行ocr识别,他会先进行剪切,之后进一步做ocr识别,返回一个json对象 52 | 如果剪切失败,则返回None 53 | @num 规定剪切项目数 54 | 55 | #### perspect函数做 - 初步的矫正图片 56 | 57 | 用于透视image,他会缓存一个透视后的opencv numpy矩阵,并返回该矩阵 58 | 透视失败,则会返回None,并打印不是报告 59 | @param 透视参数 60 | 61 | * 关于param 62 | 63 | 参数的形式为[p1, p2, p3 ,p4 ,p5]。 64 | p1,p2,p3,p4,p5都是整型,其中p1必须是奇数。 65 | 66 | p1是高斯模糊的参数,p2和p3是canny边缘检测的高低阈值,p4和p5是和筛选有关的乘数。 67 | 68 | 如果化验报告单放在桌子上时,有的边缘会稍微翘起,产生比较明显的阴影,这种阴影有可能被识别出来,导致定位失败。 69 | 解决的方法是调整p2和p3,来将阴影线筛选掉。但是如果将p2和p3调的比较高,就会导致其他图里的黑线也被筛选掉了。 70 | 参数的选择是一个问题。 71 | 我在getinfo.default中设置的是一个较低的阈值,p2=70,p3=30,这个阈值不会屏蔽阴影线。 72 | 如果改为p2=70,p3=50则可以屏蔽,但是会导致其他图片识别困难。 73 | 74 | 就现在来看,得到较好结果的前提主要有三个 75 | - 化验单尽量平整 76 | - 图片中应该包含全部的三条黑线 77 | - 图片尽量不要包含化验单的边缘,如果有的话,请尽量避开有阴影的边缘。 78 | 79 | #### filter函数 - 过滤掉不合格的或非报告图片 80 | 81 | 返回img经过透视过后的PIL格式的Image对象,如果缓存中有PerspectivImg则直接使用,没有先进行透视 82 | 过滤失败则返回None 83 | @param filter参数 84 | 85 | 86 | #### autocut函数 - 将图片中性别、年龄、日期和各项目名称数据分别剪切出来 87 | 88 | 用于剪切ImageFilter中的img成员,剪切之后临时图片保存在out_path, 89 | 如果剪切失败,返回-1,成功返回0 90 | @num 剪切项目数 91 | @param 剪切参数 92 | 93 | 剪切出来的图片在BloodTestReportOCR/temp_pics/ 文件夹下 94 | 95 | 函数输出为data0.jpg,data1.jpg......等一系列图片,分别是白细胞计数,中性粒细胞记数等的数值的图片。 96 | 97 | #### classifier.py 98 | 99 | 用于判定裁剪矫正后的报告和裁剪出检测项目的编号 100 | 101 | #### imgproc.py 102 | 将识别的图像进行处理二值化等操作,提高识别率 103 | 包括对中文和数字的处理 104 | 105 | #### digits 106 | 将该文件替换Tesseract-OCR\tessdata\configs中的digits 107 | -------------------------------------------------------------------------------- /BloodTestReportOCR/bloodtestdata.json: -------------------------------------------------------------------------------- 1 | { 2 | "_id": "bbca5d6a-2156-41c4-89da-0329e8c99a4f", 3 | "originPicture": "bbca5d6a-2156-41c4-89da-0329e8c99a4f", 4 | "date": "2016-09-21", 5 | "profile": { 6 | "gender": "Man", 7 | "age": 30 8 | }, 9 | "bloodtest": [ 10 | {"name": "白细胞记数", "alias": "WBC", "value": 0, "range": "4-10", "unit": "10E9/L"}, 11 | {"name": "中性粒细胞计数", "alias": "GRA", "value": 0, "range": "1.8-6.4", "unit": "10E9/L"}, 12 | {"name": "淋巴细胞计数", "alias": "LYM", "value": 0, "range": "1-3.3", "unit": "10E9/L"}, 13 | {"name": "单核细胞计数", "alias": "MONO","value": 0, "range": "0.2~1", "unit": "10E9/L"}, 14 | {"name": "嗜酸性粒细胞记数","alias": "EO", "value": 0, "range": "0-0.5", "unit": "10E9/L"}, 15 | {"name": "嗜碱性粒细胞记数","alias": "BASO","value": 0, "range": "0.02-0.1", "unit": "%" }, 16 | {"name": "中性粒细胞百分比","alias": "GRA%","value": 0, "range": "40-75", "unit": "%" }, 17 | {"name": "淋巴细胞百分比", "alias": "LYM%","value": 0, "range": "18-40", "unit": "%" }, 18 | {"name": "单核细胞百分比", "alias": "MONO%","value": 0,"range": "3.5-10", "unit": "%" }, 19 | {"name": "嗜酸性粒细胞百分比","alias": "EO%", "value": 0, "range": "0-0.5", "unit": "%" }, 20 | {"name": "嗜碱性粒细胞百分比","alias": "BASO%","value": 0, "range": "0-1.5", "unit": "%" }, 21 | {"name": "红细胞记数", "alias": "RBC", "value": 0, "range": "4-5.5", "unit": "10E12/L"}, 22 | {"name": "血红蛋白", "alias": "HGB", "value": 0, "range": "120-160", 
"unit": "g/L" }, 23 | {"name": "红细胞压积", "alias": "HCT", "value": 0, "range": "42-49", "unit": "L/L" }, 24 | {"name": "红细胞平均体积", "alias": "MCV", "value": 0, "range": "82-95", "unit": "fL" }, 25 | {"name": "平均血红蛋白", "alias": "MCH", "value": 0, "range": "27-33", "unit": "pg" }, 26 | {"name": "平均血红蛋白浓度","alias": "MCHC","value": 0, "range": "320-360", "unit": "g/L" }, 27 | {"name": "红细胞分布宽度", "alias": "RDW%","value": 0, "range": "10.6-15", "unit": "%" }, 28 | {"name": "血小板记数", "alias": "PLT", "value": 0, "range": "100-300", "unit": "10E9/L"}, 29 | {"name": "血小板压积", "alias": "PCT", "value": 0, "range": "0.11-0.28","unit": "L/L" }, 30 | {"name": "血小板分布宽度", "alias": "PDW%","value": 0, "range": "15.1-18.1","unit": "%" }, 31 | {"name": "平均血小板体积", "alias": "MPV", "value": 0, "range": "6-14", "unit": "fL" } 32 | ] 33 | } -------------------------------------------------------------------------------- /BloodTestReportOCR/caffe_predict.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import numpy as np 3 | import pdb 4 | import sys,os 5 | import caffe 6 | 7 | 8 | def predict(): 9 | 10 | # 设置当前的工作环境目录 11 | root = '/home/liucan/ocr/' 12 | # 我们也把caffe/python也添加到当前环境 13 | #sys.path.insert(0, '/home/gzr/caffe/python') 14 | #os.chdir('/home/gzr/caffe')#更换工作目录 15 | # 设置网络结构 16 | net_file='./lenet.prototxt' 17 | # 添加训练之后的参数 18 | caffe_model='./lenet_iter_800.caffemodel' 19 | # 均值文件 20 | #mean_file= '/home/liucan/ocr/mean.npy' 21 | 22 | # 这里对任何一个程序都是通用的,就是处理图片 23 | # 把上面添加的两个变量都作为参数构造一个Net 24 | net = caffe.Net(net_file,caffe_model,caffe.TEST) 25 | # 得到data的形状,这里的图片是默认matplotlib底层加载的 26 | transformer = caffe.io.Transformer({'data': net.blobs['data'].data.shape}) 27 | # matplotlib加载的image是像素[0-1],图片的数据格式[weight,high,channels],RGB 28 | # caffe加载的图片需要的是[0-255]像素,数据格式[channels,weight,high],BGR,那么就需要转换 29 | 30 | #pdb.set_trace() 31 | 32 | # channel 放到前面 33 | transformer.set_transpose('data',(2, 0, 1)) 34 | #transformer.set_mean('data', np.load(mean_file).mean(1).mean(1)) 35 | # 图片像素放大到[0-255] 36 | transformer.set_raw_scale('data', 255) 37 | # RGB-->BGR 转换 38 | #transformer.set_channel_swap('data',(2, 1, 0)) 39 | 40 | # 这里才是加载图片 41 | im=caffe.io.load_image(root+'img/p9.jpg', color=True) 42 | #grayim = im[:,:,0] 43 | #im = np.reshape(grayim,(170,37,3)) 44 | 45 | # 用上面的transformer.preprocess来处理刚刚加载图片 46 | net.blobs['data'].data[...] 
= transformer.preprocess('data',im) 47 | #注意,网络开始向前传播啦 48 | out = net.forward() 49 | # 最终的结果: 当前这个图片的属于哪个物体的概率(列表表示) 50 | output_prob = net.blobs['prob'].data[0] 51 | # 找出最大的那个概率 52 | print 'predicted class is:', output_prob.argmax() 53 | return output_prob.argmax() 54 | 55 | if __name__=='__main__': 56 | predict() 57 | 58 | -------------------------------------------------------------------------------- /BloodTestReportOCR/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "model.ckpt" 2 | all_model_checkpoint_paths: "model.ckpt" 3 | -------------------------------------------------------------------------------- /BloodTestReportOCR/classifier.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import pHash 3 | from PIL import Image 4 | import os 5 | # 判断是否血常规检验报告,输入经过矫正后的报告图像 6 | def isReport(img): 7 | # add your code here 8 | image = Image.open(os.getcwd() + '/origin_pics/region.jpg') 9 | rate=pHash.classify_DCT(image,img)/64.0 10 | 11 | if(rate>0.6): 12 | return True 13 | else: 14 | return False 15 | 16 | # 根据剪裁好的项目名称图片获得该项目的分类号,注意不是检验报告上的编号,是我们存储的编号 17 | num = 0 18 | def getItemNum(img): 19 | # replace your code 20 | global num 21 | if num >= 22: 22 | num = 0 23 | ret = num 24 | num = num + 1 25 | return ret 26 | 27 | # unit test 28 | if __name__ == '__main__': 29 | import classifier 30 | 31 | img = [] 32 | if classifier.isReport(img) : 33 | print 'classifier.isReport(img) is True' 34 | for i in range(33): 35 | print classifier.getItemNum(img) 36 | -------------------------------------------------------------------------------- /BloodTestReportOCR/config.py: -------------------------------------------------------------------------------- 1 | 2 | ALLOWED_EXTENSIONS = set(['png', 'jpeg', 'jpg']) 3 | 4 | DB_HOST = 'localhost' 5 | DB_PORT = 27017 6 | 7 | SERVER_HOST = '0.0.0.0' 8 | SERVER_PORT = 8080 9 | 10 | DEBUG=True 11 | 12 | MODEL=0 13 | -------------------------------------------------------------------------------- /BloodTestReportOCR/digits: -------------------------------------------------------------------------------- 1 | tessedit_char_whitelist 0123456789.-. 
2 | -------------------------------------------------------------------------------- /BloodTestReportOCR/imgproc.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import cv2 3 | 4 | def digitsimg(src): 5 | 6 | #灰度化 7 | img_gray = cv2.cvtColor(src,cv2.COLOR_BGR2GRAY) 8 | 9 | #Otsu thresholding 二值化 10 | ret,result= cv2.threshold(img_gray,0,255,cv2.THRESH_BINARY+cv2.THRESH_OTSU) 11 | 12 | #腐蚀去除一些小的点 13 | kernel = cv2.getStructuringElement(cv2.MORPH_CROSS,(3,2)) 14 | eroded = cv2.erode(result,kernel) 15 | 16 | #将结果放大便于识别 17 | result = cv2.resize(result,(128,128),interpolation=cv2.INTER_CUBIC) 18 | 19 | # cv2.imshow('result',result) 20 | # cv2.waitKey(0) 21 | 22 | #腐蚀去除放大后的一些小的点 23 | eroded = cv2.erode(result,kernel) 24 | # cv2.imshow('eroded',eroded) 25 | # cv2.waitKey(0) 26 | #膨胀使数字更饱满 27 | result = cv2.dilate(eroded,kernel) 28 | # cv2.imshow('dilated',result) 29 | 30 | #直方图均衡化使图像更清晰 31 | cv2.equalizeHist(result) 32 | #中值滤波去除噪点 33 | result = cv2.medianBlur(result,5) 34 | # cv2.imshow('median',result) 35 | # cv2.waitKey(0) 36 | return result 37 | ''' 38 | def chineseimg(src): 39 | 40 | 41 | 42 | #灰度化 43 | img_gray = cv2.cvtColor(src,cv2.COLOR_BGR2GRAY) 44 | 45 | 46 | #Otsu thresholding 二值化 47 | ret,result= cv2.threshold(img_gray,0,255,cv2.THRESH_BINARY+cv2.THRESH_OTSU) 48 | # cv2.imshow('otsu',result) 49 | # cv2.waitKey(0) 50 | 51 | 52 | #直方图均衡化使图像更清晰 53 | cv2.equalizeHist(result) 54 | # cv2.imshow('直方图',result) 55 | # cv2.waitKey(0) 56 | return result 57 | 58 | #将结果放大便于识别 59 | result = cv2.resize(result,(256,128),interpolation=cv2.INTER_CUBIC) 60 | 61 | #腐蚀去除放大后的一些小的点 62 | kernel = cv2.getStructuringElement(cv2.MORPH_CROSS,(3,2)) 63 | eroded = cv2.erode(result,kernel) 64 | cv2.imshow('eroded',eroded) 65 | cv2.waitKey(0) 66 | 67 | #膨胀使数字更饱满 68 | result = cv2.dilate(eroded,kernel) 69 | cv2.imshow('dilated',result) 70 | cv2.waitKey(0) 71 | 72 | #直方图均衡化使图像更清晰 73 | cv2.equalizeHist(result) 74 | #中值滤波去除噪点 75 | result = cv2.medianBlur(result,5) 76 | cv2.imshow('median',result) 77 | cv2.waitKey(0)''' 78 | 79 | 80 | -------------------------------------------------------------------------------- /BloodTestReportOCR/lenet.prototxt: -------------------------------------------------------------------------------- 1 | name: "LeNet" 2 | layer { 3 | name: "data" 4 | type: "Input" 5 | top: "data" 6 | input_param { shape: { dim: 4 dim: 3 dim: 37 dim: 170 } } 7 | } 8 | layer { 9 | name: "conv1" 10 | type: "Convolution" 11 | bottom: "data" 12 | top: "conv1" 13 | param { 14 | lr_mult: 1 15 | } 16 | param { 17 | lr_mult: 2 18 | } 19 | convolution_param { 20 | num_output: 20 21 | kernel_size: 5 22 | stride: 1 23 | weight_filler { 24 | type: "xavier" 25 | } 26 | bias_filler { 27 | type: "constant" 28 | } 29 | } 30 | } 31 | layer { 32 | name: "pool1" 33 | type: "Pooling" 34 | bottom: "conv1" 35 | top: "pool1" 36 | pooling_param { 37 | pool: MAX 38 | kernel_size: 2 39 | stride: 2 40 | } 41 | } 42 | layer { 43 | name: "conv2" 44 | type: "Convolution" 45 | bottom: "pool1" 46 | top: "conv2" 47 | param { 48 | lr_mult: 1 49 | } 50 | param { 51 | lr_mult: 2 52 | } 53 | convolution_param { 54 | num_output: 50 55 | kernel_size: 5 56 | stride: 1 57 | weight_filler { 58 | type: "xavier" 59 | } 60 | bias_filler { 61 | type: "constant" 62 | } 63 | } 64 | } 65 | layer { 66 | name: "pool2" 67 | type: "Pooling" 68 | bottom: "conv2" 69 | top: "pool2" 70 | pooling_param { 71 | pool: MAX 72 | kernel_size: 2 73 | stride: 2 74 | } 75 | } 76 | layer { 77 | name: "ip1" 78 
| type: "InnerProduct" 79 | bottom: "pool2" 80 | top: "ip1" 81 | param { 82 | lr_mult: 1 83 | } 84 | param { 85 | lr_mult: 2 86 | } 87 | inner_product_param { 88 | num_output: 500 89 | weight_filler { 90 | type: "xavier" 91 | } 92 | bias_filler { 93 | type: "constant" 94 | } 95 | } 96 | } 97 | layer { 98 | name: "relu1" 99 | type: "ReLU" 100 | bottom: "ip1" 101 | top: "ip1" 102 | } 103 | layer { 104 | name: "ip2" 105 | type: "InnerProduct" 106 | bottom: "ip1" 107 | top: "ip2" 108 | param { 109 | lr_mult: 1 110 | } 111 | param { 112 | lr_mult: 2 113 | } 114 | inner_product_param { 115 | num_output: 22 116 | weight_filler { 117 | type: "xavier" 118 | } 119 | bias_filler { 120 | type: "constant" 121 | } 122 | } 123 | } 124 | layer { 125 | name: "prob" 126 | type: "Softmax" 127 | bottom: "ip2" 128 | top: "prob" 129 | } 130 | -------------------------------------------------------------------------------- /BloodTestReportOCR/lenet_iter_800.caffemodel: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csxiaoyaojianxian/BloodTestReportOCR/95d058e4999806fa50bbcf6d10fe8a0af5746759/BloodTestReportOCR/lenet_iter_800.caffemodel -------------------------------------------------------------------------------- /BloodTestReportOCR/model.ckpt.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csxiaoyaojianxian/BloodTestReportOCR/95d058e4999806fa50bbcf6d10fe8a0af5746759/BloodTestReportOCR/model.ckpt.data-00000-of-00001 -------------------------------------------------------------------------------- /BloodTestReportOCR/model.ckpt.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csxiaoyaojianxian/BloodTestReportOCR/95d058e4999806fa50bbcf6d10fe8a0af5746759/BloodTestReportOCR/model.ckpt.index -------------------------------------------------------------------------------- /BloodTestReportOCR/nn_model/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "model.ckpt" 2 | all_model_checkpoint_paths: "model.ckpt" 3 | -------------------------------------------------------------------------------- /BloodTestReportOCR/nn_model/model.ckpt.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csxiaoyaojianxian/BloodTestReportOCR/95d058e4999806fa50bbcf6d10fe8a0af5746759/BloodTestReportOCR/nn_model/model.ckpt.data-00000-of-00001 -------------------------------------------------------------------------------- /BloodTestReportOCR/nn_model/model.ckpt.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csxiaoyaojianxian/BloodTestReportOCR/95d058e4999806fa50bbcf6d10fe8a0af5746759/BloodTestReportOCR/nn_model/model.ckpt.index -------------------------------------------------------------------------------- /BloodTestReportOCR/origin_pics/bloodtestreport1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csxiaoyaojianxian/BloodTestReportOCR/95d058e4999806fa50bbcf6d10fe8a0af5746759/BloodTestReportOCR/origin_pics/bloodtestreport1.jpg -------------------------------------------------------------------------------- /BloodTestReportOCR/origin_pics/bloodtestreport2.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/csxiaoyaojianxian/BloodTestReportOCR/95d058e4999806fa50bbcf6d10fe8a0af5746759/BloodTestReportOCR/origin_pics/bloodtestreport2.jpg -------------------------------------------------------------------------------- /BloodTestReportOCR/origin_pics/bloodtestreport3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csxiaoyaojianxian/BloodTestReportOCR/95d058e4999806fa50bbcf6d10fe8a0af5746759/BloodTestReportOCR/origin_pics/bloodtestreport3.jpg -------------------------------------------------------------------------------- /BloodTestReportOCR/origin_pics/bloodtestreport4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csxiaoyaojianxian/BloodTestReportOCR/95d058e4999806fa50bbcf6d10fe8a0af5746759/BloodTestReportOCR/origin_pics/bloodtestreport4.jpg -------------------------------------------------------------------------------- /BloodTestReportOCR/origin_pics/bloodtestreport5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csxiaoyaojianxian/BloodTestReportOCR/95d058e4999806fa50bbcf6d10fe8a0af5746759/BloodTestReportOCR/origin_pics/bloodtestreport5.jpg -------------------------------------------------------------------------------- /BloodTestReportOCR/origin_pics/bloodtestreport6.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csxiaoyaojianxian/BloodTestReportOCR/95d058e4999806fa50bbcf6d10fe8a0af5746759/BloodTestReportOCR/origin_pics/bloodtestreport6.jpg -------------------------------------------------------------------------------- /BloodTestReportOCR/origin_pics/bloodtestreport7.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csxiaoyaojianxian/BloodTestReportOCR/95d058e4999806fa50bbcf6d10fe8a0af5746759/BloodTestReportOCR/origin_pics/bloodtestreport7.jpg -------------------------------------------------------------------------------- /BloodTestReportOCR/origin_pics/region.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csxiaoyaojianxian/BloodTestReportOCR/95d058e4999806fa50bbcf6d10fe8a0af5746759/BloodTestReportOCR/origin_pics/region.jpg -------------------------------------------------------------------------------- /BloodTestReportOCR/pHash.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Fri Dec 2 20:03:39 2016 5 | 6 | @author: zhao 7 | """ 8 | import numpy as np 9 | from PIL import Image 10 | from PIL import ImageFilter 11 | from PIL import ImageOps 12 | import math 13 | #得到hamming code 14 | def get_code(List,middle): 15 | 16 | result = [] 17 | for index in range(0,len(List)): 18 | if List[index] > middle: 19 | result.append("1") 20 | else: 21 | result.append("0") 22 | return result 23 | 24 | 25 | #比较hamming code 26 | def comp_code(code1,code2): 27 | num = 0 28 | for index in range(0,len(code1)): 29 | if str(code1[index]) == str(code2[index]): 30 | num+=1 31 | return num 32 | 33 | #计算平均值 34 | def get_middle(List): 35 | li = List[:] 36 | li.sort() 37 | value = 0 38 | if len(li)%2==0: 39 | index = int((len(li)/2)) - 1 40 | 41 | value = li[index] 42 | 
else: 43 | index = int((len(li)/2)) 44 | value = (li[index]+li[index-1])/2 45 | return value 46 | 47 | #得到像素矩阵 48 | def get_matrix(image): 49 | 50 | matrix = [] 51 | size = image.size 52 | for height in range(0,size[1]): 53 | pixel = [] 54 | for width in range(0,size[0]): 55 | pixel_value = image.getpixel((width,height)) 56 | pixel.append(pixel_value) 57 | matrix.append(pixel) 58 | 59 | return matrix 60 | 61 | #求离散余弦变换的系数矩阵[A] 62 | def get_coefficient(n): 63 | matrix = [] 64 | PI = math.pi 65 | sqr = math.sqrt(1/n) 66 | value = [] 67 | for i in range(0,n): 68 | value.append(sqr) 69 | matrix.append(value) 70 | 71 | for i in range(1,n): 72 | value=[] 73 | for j in range (0,n): 74 | data = math.sqrt(2.0/n) * math.cos(i*PI*(j+0.5)/n); 75 | value.append(data) 76 | matrix.append(value) 77 | 78 | return matrix 79 | 80 | #转置 81 | def get_transposing(matrix): 82 | new_matrix = [] 83 | 84 | for i in range(0,len(matrix)): 85 | value = [] 86 | for j in range(0,len(matrix[i])): 87 | value.append(matrix[j][i]) 88 | new_matrix.append(value) 89 | 90 | return new_matrix 91 | #矩阵乘法 92 | def get_mult(matrix1,matrix2): 93 | new_matrix = [] 94 | 95 | for i in range(0,len(matrix1)): 96 | value_list = [] 97 | for j in range(0,len(matrix1)): 98 | t = 0.0 99 | for k in range(0,len(matrix1)): 100 | t += matrix1[i][k] * matrix2[k][j] 101 | value_list.append(t) 102 | new_matrix.append(value_list) 103 | 104 | return new_matrix 105 | 106 | #计算DCT 107 | def DCT(double_matrix): 108 | n = len(double_matrix) 109 | A = get_coefficient(n) 110 | AT = get_transposing(A) 111 | 112 | temp = get_mult(double_matrix, A) 113 | DCT_matrix = get_mult(temp, AT) 114 | 115 | return DCT_matrix 116 | 117 | #缩小DCT 118 | def sub_matrix_to_list(DCT_matrix,part_size): 119 | w,h = part_size 120 | List = [] 121 | for i in range(0,h): 122 | for j in range(0,w): 123 | List.append(DCT_matrix[i][j]) 124 | return List 125 | 126 | 127 | 128 | def classify_DCT(image1,image2,size=(32,32),part_size=(8,8)): 129 | 130 | assert size[0]==size[1],"size error" 131 | assert part_size[0]==part_size[1],"part_size error" 132 | 133 | image1 = image1.resize(size).convert('L').filter(ImageFilter.BLUR) 134 | image1 = ImageOps.equalize(image1) 135 | matrix = get_matrix(image1) 136 | DCT_matrix = DCT(matrix) 137 | List = sub_matrix_to_list(DCT_matrix, part_size) 138 | middle = get_middle(List) 139 | code1 = get_code(List, middle) 140 | 141 | 142 | image2 = image2.resize(size).convert('L').filter(ImageFilter.BLUR) 143 | image2 = ImageOps.equalize(image2) 144 | matrix = get_matrix(image2) 145 | DCT_matrix = DCT(matrix) 146 | List = sub_matrix_to_list(DCT_matrix, part_size) 147 | middle = get_middle(List) 148 | code2 = get_code(List, middle) 149 | 150 | return comp_code(code1, code2) 151 | 152 | 153 | 154 | -------------------------------------------------------------------------------- /BloodTestReportOCR/pd_predict.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from py_paddle import swig_paddle 3 | import sys 4 | sys.path.append("..") 5 | from PaddlePaddle import prediction_sex,prediction_age 6 | def predict(arr): 7 | swig_paddle.initPaddle("--use_gpu=0") 8 | data = [arr.tolist()] 9 | #直接填充4个0 10 | for i in range(4): 11 | data[0][0].append(0) 12 | sex = prediction_sex.predict(data) 13 | age = prediction_age.predict(data) 14 | return sex,age 15 | -------------------------------------------------------------------------------- /BloodTestReportOCR/rnn_model/rnn_age_model/checkpoint: 
-------------------------------------------------------------------------------- 1 | model_checkpoint_path: "model.ckpt" 2 | all_model_checkpoint_paths: "model.ckpt" 3 | -------------------------------------------------------------------------------- /BloodTestReportOCR/rnn_model/rnn_age_model/model.ckpt.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csxiaoyaojianxian/BloodTestReportOCR/95d058e4999806fa50bbcf6d10fe8a0af5746759/BloodTestReportOCR/rnn_model/rnn_age_model/model.ckpt.data-00000-of-00001 -------------------------------------------------------------------------------- /BloodTestReportOCR/rnn_model/rnn_age_model/model.ckpt.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csxiaoyaojianxian/BloodTestReportOCR/95d058e4999806fa50bbcf6d10fe8a0af5746759/BloodTestReportOCR/rnn_model/rnn_age_model/model.ckpt.index -------------------------------------------------------------------------------- /BloodTestReportOCR/rnn_model/rnn_age_model/model.ckpt.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csxiaoyaojianxian/BloodTestReportOCR/95d058e4999806fa50bbcf6d10fe8a0af5746759/BloodTestReportOCR/rnn_model/rnn_age_model/model.ckpt.meta -------------------------------------------------------------------------------- /BloodTestReportOCR/rnn_model/rnn_sex_model/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "model.ckpt" 2 | all_model_checkpoint_paths: "model.ckpt" 3 | -------------------------------------------------------------------------------- /BloodTestReportOCR/rnn_model/rnn_sex_model/model.ckpt.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csxiaoyaojianxian/BloodTestReportOCR/95d058e4999806fa50bbcf6d10fe8a0af5746759/BloodTestReportOCR/rnn_model/rnn_sex_model/model.ckpt.data-00000-of-00001 -------------------------------------------------------------------------------- /BloodTestReportOCR/rnn_model/rnn_sex_model/model.ckpt.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csxiaoyaojianxian/BloodTestReportOCR/95d058e4999806fa50bbcf6d10fe8a0af5746759/BloodTestReportOCR/rnn_model/rnn_sex_model/model.ckpt.index -------------------------------------------------------------------------------- /BloodTestReportOCR/rnn_model/rnn_sex_model/model.ckpt.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csxiaoyaojianxian/BloodTestReportOCR/95d058e4999806fa50bbcf6d10fe8a0af5746759/BloodTestReportOCR/rnn_model/rnn_sex_model/model.ckpt.meta -------------------------------------------------------------------------------- /BloodTestReportOCR/rnn_model/rnn_sex_model/rnn_age_model/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "model.ckpt" 2 | all_model_checkpoint_paths: "model.ckpt" 3 | -------------------------------------------------------------------------------- /BloodTestReportOCR/rnn_model/rnn_sex_model/rnn_age_model/model.ckpt.data-00000-of-00001: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/csxiaoyaojianxian/BloodTestReportOCR/95d058e4999806fa50bbcf6d10fe8a0af5746759/BloodTestReportOCR/rnn_model/rnn_sex_model/rnn_age_model/model.ckpt.data-00000-of-00001 -------------------------------------------------------------------------------- /BloodTestReportOCR/rnn_model/rnn_sex_model/rnn_age_model/model.ckpt.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csxiaoyaojianxian/BloodTestReportOCR/95d058e4999806fa50bbcf6d10fe8a0af5746759/BloodTestReportOCR/rnn_model/rnn_sex_model/rnn_age_model/model.ckpt.index -------------------------------------------------------------------------------- /BloodTestReportOCR/rnn_model/rnn_sex_model/rnn_age_model/model.ckpt.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csxiaoyaojianxian/BloodTestReportOCR/95d058e4999806fa50bbcf6d10fe8a0af5746759/BloodTestReportOCR/rnn_model/rnn_sex_model/rnn_age_model/model.ckpt.meta -------------------------------------------------------------------------------- /BloodTestReportOCR/rnn_predict.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | import tensorflow as tf 5 | from tensorflow.python.ops import rnn, rnn_cell 6 | 7 | def predict_sex(data_predict): 8 | tf.reset_default_graph() 9 | 10 | # Network Parameters 11 | n_input = 11 # MNIST data input (img shape: 28*28) 12 | n_steps = 2 # timesteps 13 | n_hidden = 128 # hidden layer num of features 14 | n_classes = 2 # MNIST total classes (0-9 digits) 15 | 16 | data_predict = np.reshape(data_predict, (1,n_steps, n_input)) 17 | 18 | 19 | 20 | 21 | # tf Graph input 22 | x = tf.placeholder("float", [None, n_steps, n_input]) 23 | y = tf.placeholder("float", [None, n_classes]) 24 | 25 | # Define weights 26 | weights = { 27 | 'out': tf.Variable(tf.random_normal([n_hidden, n_classes])) 28 | } 29 | biases = { 30 | 'out': tf.Variable(tf.random_normal([n_classes])) 31 | } 32 | 33 | def RNN(x, weights, biases): 34 | 35 | # Prepare data shape to match `rnn` function requirements 36 | # Current data input shape: (batch_size, n_steps, n_input) 37 | # Required shape: 'n_steps' tensors list of shape (batch_size, n_input) 38 | 39 | 40 | # Permuting batch_size and n_steps 41 | x = tf.transpose(x, [1, 0, 2]) 42 | #to : (n_steps, batch_size, n_input) 43 | 44 | 45 | # Dimensionality reduction 46 | x = tf.reshape(x, [-1, n_input]) 47 | # Reshaping to (n_steps*batch_size, n_input) 48 | 49 | # Split to get a list of 'n_steps' tensors of shape (batch_size, n_input) 50 | x = tf.split(0, n_steps, x) 51 | 52 | # Define a lstm cell with tensorflow 53 | lstm_cell = rnn_cell.BasicLSTMCell(n_hidden, forget_bias=1.0) 54 | 55 | # Get lstm cell output 56 | outputs, states = rnn.rnn(lstm_cell, x, dtype=tf.float32) 57 | 58 | # Linear activation, using rnn inner loop last output 59 | return tf.matmul(outputs[-1], weights['out']) + biases['out'] 60 | 61 | pred = RNN(x, weights, biases) 62 | 63 | # Initializing the variables 64 | init = tf.global_variables_initializer() 65 | 66 | ###### 67 | saver = tf.train.Saver() 68 | 69 | # Launch the graph 70 | with tf.Session() as sess: 71 | sess.run(init) 72 | saver.restore(sess,"./rnn_model/rnn_sex_model/model.ckpt") 73 | p = sess.run(pred, feed_dict={x:data_predict}) 74 | 75 | 76 | 77 | if p[0][0] > p[0][1]: 78 | sex_result = 0 79 | else: 80 | sex_result = 1 81 | 82 | 83 | return sex_result 84 | 85 
| 86 | 87 | def predict_age(data_predict): 88 | tf.reset_default_graph() 89 | 90 | # Network Parameters 91 | n_input = 11 # MNIST data input (img shape: 28*28) 92 | n_steps = 2 # timesteps 93 | n_hidden = 128 # hidden layer num of features 94 | n_classes = 10 #MNIST total classes (0-9 digits) 95 | 96 | data_predict = np.reshape(data_predict, (1,n_steps, n_input)) 97 | 98 | 99 | 100 | 101 | # tf Graph input 102 | x = tf.placeholder("float", [None, n_steps, n_input]) 103 | y = tf.placeholder("float", [None, n_classes]) 104 | 105 | # Define weights 106 | weights = { 107 | 'out': tf.Variable(tf.random_normal([n_hidden, n_classes])) 108 | } 109 | biases = { 110 | 'out': tf.Variable(tf.random_normal([n_classes])) 111 | } 112 | 113 | def RNN(x, weights, biases): 114 | 115 | # Prepare data shape to match `rnn` function requirements 116 | # Current data input shape: (batch_size, n_steps, n_input) 117 | # Required shape: 'n_steps' tensors list of shape (batch_size, n_input) 118 | 119 | 120 | # Permuting batch_size and n_steps 121 | 122 | x = tf.transpose(x, [1, 0, 2]) 123 | # Reshaping to (n_steps*batch_size, n_input) 124 | x = tf.reshape(x, [-1, n_input]) 125 | # Split to get a list of 'n_steps' tensors of shape (batch_size, n_input) 126 | x = tf.split(0, n_steps, x) 127 | 128 | # Define a lstm cell with tensorflow 129 | lstm_cell = rnn_cell.BasicLSTMCell(n_hidden, forget_bias=1.0) 130 | 131 | # Get lstm cell output 132 | outputs, states = rnn.rnn(lstm_cell, x, dtype=tf.float32) 133 | 134 | # Linear activation, using rnn inner loop last output 135 | return tf.matmul(outputs[-1], weights['out']) + biases['out'] 136 | 137 | pred = RNN(x, weights, biases) 138 | 139 | # Initializing the variables 140 | init = tf.global_variables_initializer() 141 | 142 | ###### 143 | saver = tf.train.Saver() 144 | 145 | # Launch the graph 146 | with tf.Session() as sess: 147 | sess.run(init) 148 | saver.restore(sess,"./rnn_model/rnn_age_model/model.ckpt") 149 | p = sess.run(pred, feed_dict={x:data_predict}) 150 | 151 | # print(tf.argmax(p, 1)) 152 | max = p[0][0] 153 | max_i = 0 154 | for i in range(n_classes): 155 | if p[0][i] > max: 156 | max_i = i 157 | max = p[0][i] 158 | 159 | 160 | age_result = str(max_i * 10) + "~" + str((max_i+1) *10 -1) 161 | 162 | return age_result -------------------------------------------------------------------------------- /BloodTestReportOCR/static/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | BloodTestOCR 9 | 10 | 11 | 12 | 13 | 15 | 16 | 17 | 18 | 19 | 20 | 21 |
[body of BloodTestReportOCR/static/index.html omitted: the HTML/JS markup of the upload page was stripped when this dump was generated, leaving only bare line numbers; nothing of the page body is recoverable here]
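This page drives the Flask routes defined in `BloodTestReportOCR/view.py` (listed further below). For reference, here is a minimal command-line client sketch of the same flow. It is not part of the repository: it assumes the server is running on `localhost:8080` (the defaults in `config.py`), that the third-party `requests` package is installed, and that `fid` is copied manually out of the HTML fragment returned by `/upload`.

```python
# -*- coding: utf-8 -*-
# Hypothetical client for the Flask API in view.py (illustration only, not repo code).
import json
import requests  # assumed installed: pip install requests

BASE = 'http://localhost:8080'  # SERVER_HOST / SERVER_PORT from config.py

# 1. POST the report photo; view.py reads it from the 'imagefile' form field.
with open('origin_pics/bloodtestreport1.jpg', 'rb') as f:
    resp = requests.post(BASE + '/upload', files={'imagefile': f}).json()
print(resp)  # on success the response embeds the MongoDB file id (fid)

fid = '<fid copied from the upload response>'  # placeholder, filled in by hand

# 2. Fetch the OCR result that was stored in MongoDB for this fid.
report = requests.get(BASE + '/report/' + fid).json()
print(report)

# 3. Ask for the age/sex prediction; view.py expects a form field 'data'
#    whose 'value' entry holds the 22 blood-test values.
values = [0] * 22  # replace with the (possibly corrected) values from the report
pred = requests.post(BASE + '/predict/' + fid,
                     data={'data': json.dumps({'value': values})})
print(pred.json())  # {"sex": ..., "age": ...}
```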
106 | 245 | 246 | 247 | -------------------------------------------------------------------------------- /BloodTestReportOCR/temp_pics/README.md: -------------------------------------------------------------------------------- 1 | temp pictures 2 | -------------------------------------------------------------------------------- /BloodTestReportOCR/tf_predict.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | import tensorflow as tf 5 | 6 | def normalized(a,b): 7 | for i in range(22): 8 | tmp = np.mean(a[:, i]) 9 | 10 | a[:, i] = a[:, i] - tmp 11 | b[:, i] = b[:, i] - tmp 12 | 13 | 14 | if np.min(a[:, i]) != np.max(a[:, i]): 15 | b[:, i] = 2 * (b[:, i] - np.min(a[:, i])) / (np.max(a[:, i]) - np.min(a[:, i])) - 1 16 | else: 17 | b[:, i] = 0 18 | return b 19 | 20 | def predict(data_predict): 21 | tf.reset_default_graph() 22 | data_nor = np.loadtxt(open("./data.csv", "rb"), delimiter=",", skiprows=0) 23 | 24 | data_predict = normalized(data_nor[:, 2:], data_predict) 25 | 26 | ''' 27 | 参数 28 | ''' 29 | learning_rate = 0.005 30 | display_step = 100 31 | n_input = 22 32 | 33 | n_hidden_1_age = 32 34 | n_hidden_2_age = 16 35 | n_classes_age = 1 36 | 37 | n_hidden_1_sex = 16 38 | n_hidden_2_sex = 8 39 | n_classes_sex = 2 40 | data = np.loadtxt(open("./data.csv", "rb"), delimiter=",", skiprows=0) 41 | ''' 42 | 建立年龄模型 43 | ''' 44 | x_age = tf.placeholder("float", [None, n_input]) 45 | y_age = tf.placeholder("float", [None, n_classes_age]) 46 | 47 | def multilayer_perceptron_age(x_age, weights_age, biases_age): 48 | # Hidden layer with RELU activation 49 | layer_1 = tf.add(tf.matmul(x_age, weights_age['h1']), biases_age['b1']) 50 | layer_1 = tf.nn.relu(layer_1) 51 | # Hidden layer with RELU activation 52 | layer_2 = tf.add(tf.matmul(layer_1, weights_age['h2']), biases_age['b2']) 53 | layer_2 = tf.nn.relu(layer_2) 54 | # Output layer with linear activation 55 | out_layer = tf.matmul(layer_2, weights_age['out']) + biases_age['out'] 56 | return out_layer 57 | 58 | weights_age = { 59 | 'h1': tf.Variable(tf.random_normal([n_input, n_hidden_1_age])), 60 | 'h2': tf.Variable(tf.random_normal([n_hidden_1_age, n_hidden_2_age])), 61 | 'out': tf.Variable(tf.random_normal([n_hidden_2_age, n_classes_age])) 62 | } 63 | biases_age = { 64 | 'b1': tf.Variable(tf.random_normal([n_hidden_1_age])), 65 | 'b2': tf.Variable(tf.random_normal([n_hidden_2_age])), 66 | 'out': tf.Variable(tf.random_normal([n_classes_age])) 67 | } 68 | pred_age = multilayer_perceptron_age(x_age, weights_age, biases_age) 69 | ''' 70 | 建立性别模型 71 | ''' 72 | x_sex = tf.placeholder("float", [None, n_input]) 73 | y_sex = tf.placeholder("float", [None, n_classes_sex]) 74 | 75 | def multilayer_perceptron_sex(x_sex, weights_sex, biases_sex): 76 | # Hidden layer with RELU activation 77 | layer_1 = tf.add(tf.matmul(x_sex, weights_sex['h1']), biases_sex['b1']) 78 | layer_1 = tf.nn.relu(layer_1) 79 | # Hidden layer with RELU activation 80 | layer_2 = tf.add(tf.matmul(layer_1, weights_sex['h2']), biases_sex['b2']) 81 | layer_2 = tf.nn.relu(layer_2) 82 | # Output layer with linear activation 83 | out_layer = tf.matmul(layer_2, weights_sex['out']) + biases_sex['out'] 84 | return out_layer 85 | 86 | weights_sex = { 87 | 'h1': tf.Variable(tf.random_normal([n_input, n_hidden_1_sex])), 88 | 'h2': tf.Variable(tf.random_normal([n_hidden_1_sex, n_hidden_2_sex])), 89 | 'out': tf.Variable(tf.random_normal([n_hidden_2_sex, n_classes_sex])) 90 | } 91 | biases_sex = { 92 | 'b1': 
tf.Variable(tf.random_normal([n_hidden_1_sex])), 93 | 'b2': tf.Variable(tf.random_normal([n_hidden_2_sex])), 94 | 'out': tf.Variable(tf.random_normal([n_classes_sex])) 95 | } 96 | pred_sex = multilayer_perceptron_sex(x_sex, weights_sex, biases_sex) 97 | 98 | ''' 99 | 共同的初始化 100 | ''' 101 | saver = tf.train.Saver() 102 | init = tf.global_variables_initializer() 103 | with tf.Session() as sess: 104 | saver.restore(sess, "./nn_model/model.ckpt") 105 | print ("load model success!") 106 | p_sex = sess.run(pred_sex, feed_dict={x_sex: data_predict}) 107 | p_age = sess.run(pred_age, feed_dict={x_age: data_predict}) 108 | if p_sex[0][0] > p_sex[0][1]: 109 | sex_result = 1 110 | else: 111 | sex_result = 0 112 | 113 | age_result = p_age[0][0] * 50 +50 114 | 115 | return sex_result,age_result -------------------------------------------------------------------------------- /BloodTestReportOCR/view.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import json 5 | from cStringIO import StringIO 6 | 7 | import bson 8 | import cv2 9 | import flask 10 | import numpy 11 | from PIL import Image 12 | from bson.json_util import dumps 13 | from flask import Flask, request, Response, jsonify, redirect, json 14 | from pymongo import MongoClient 15 | from werkzeug.utils import secure_filename 16 | 17 | import tf_predict 18 | from imageFilter import ImageFilter 19 | import rnn_predict 20 | import pd_predict 21 | 22 | app = Flask(__name__, static_url_path="") 23 | 24 | # 读取配置文件 25 | app.config.from_object('config') 26 | 27 | # 连接数据库,并获取数据库对象 28 | db = MongoClient(app.config['DB_HOST'], app.config['DB_PORT']).test 29 | 30 | 31 | # 将矫正后图片与图片识别结果(JSON)存入数据库 32 | def save_file(file_str, f, report_data): 33 | content = StringIO(file_str) 34 | 35 | try: 36 | mime = Image.open(content).format.lower() 37 | print 'content of mime is:', mime 38 | if mime not in app.config['ALLOWED_EXTENSIONS']: 39 | raise IOError() 40 | except IOError: 41 | abort(400) 42 | c = dict(report_data=report_data, content=bson.binary.Binary(content.getvalue()), filename=secure_filename(f.name), 43 | mime=mime) 44 | db.files.save(c) 45 | return c['_id'], c['filename'] 46 | 47 | 48 | @app.route('/', methods=['GET', 'POST']) 49 | def index(): 50 | return redirect('/index.html') 51 | 52 | 53 | @app.route('/upload', methods=['POST']) 54 | def upload(): 55 | if request.method == 'POST': 56 | if 'imagefile' not in request.files: 57 | flash('No file part') 58 | return jsonify({"error": "No file part"}) 59 | imgfile = request.files['imagefile'] 60 | if imgfile.filename == '': 61 | flash('No selected file') 62 | return jsonify({"error": "No selected file"}) 63 | if imgfile: 64 | # pil = StringIO(imgfile) 65 | # pil = Image.open(pil) 66 | # print 'imgfile:', imgfile 67 | img = cv2.imdecode(numpy.fromstring(imgfile.read(), numpy.uint8), cv2.CV_LOAD_IMAGE_UNCHANGED) 68 | report_data = ImageFilter(image=img).ocr(22) 69 | if report_data == None: 70 | data = { 71 | "error": 1, 72 | } 73 | return jsonify(data) 74 | 75 | with open('temp_pics/region.jpg') as f: 76 | if f is None: 77 | print 'Error! f is None!' 
78 | else: 79 | 80 | ''' 81 | 定义file_str存储矫正后的图片文件f的内容(str格式),方便之后对图片做二次透视以及将图片内容存储至数据库中 82 | ''' 83 | file_str = f.read() 84 | ''' 85 | 使用矫正后的图片,将矫正后图片与识别结果(JSON数据)一并存入mongoDB, 86 | 这样前台点击生成报告时将直接从数据库中取出JSON数据,而不需要再进行图像透视,缩短生成报告的响应时间 87 | ''' 88 | #img_region = cv2.imdecode(numpy.fromstring(file_str, numpy.uint8), cv2.CV_LOAD_IMAGE_UNCHANGED) 89 | #report_data = ImageFilter(image=img).ocr(22) 90 | fid, filename = save_file(file_str, f, report_data) 91 | print 'fid:', fid 92 | if fid is not None: 93 | templates = "
" % ( 94 | fid) 95 | data = { 96 | "templates": templates, 97 | } 98 | return jsonify(data) 99 | # return render_template("result.html", filename=filename, fileid=fid) 100 | # return render_template("error.html", errormessage="No POST methods") 101 | return jsonify({"error": "No POST methods"}) 102 | 103 | 104 | ''' 105 | 根据图像oid,在mongodb中查询,并返回Binary对象 106 | ''' 107 | 108 | 109 | @app.route('/file/') 110 | def find_file(fid): 111 | try: 112 | file = db.files.find_one(bson.objectid.ObjectId(fid)) 113 | if file is None: 114 | raise bson.errors.InvalidId() 115 | return Response(file['content'], mimetype='image/' + file['mime']) 116 | except bson.errors.InvalidId: 117 | flask.abort(404) 118 | 119 | 120 | ''' 121 | 直接从数据库中取出之前识别好的JSON数据,并且用bson.json_util.dumps将其从BSON转换为JSON格式的str类型 122 | ''' 123 | 124 | 125 | @app.route('/report/') 126 | def get_report(fid): 127 | # print 'get_report(fid):', fid 128 | try: 129 | file = db.files.find_one(bson.objectid.ObjectId(fid)) 130 | if file is None: 131 | raise bson.errors.InvalidId() 132 | 133 | print 'type before transform:\n', type(file['report_data']) 134 | 135 | report_data = bson.json_util.dumps(file['report_data']) 136 | 137 | print 'type after transform:\n', type(report_data) 138 | if report_data is None: 139 | print 'report_data is NONE! Error!!!!' 140 | return jsonify({"error": "can't ocr'"}) 141 | return jsonify(report_data) 142 | except bson.errors.InvalidId: 143 | flask.abort(404) 144 | 145 | 146 | def update_report(fid,ss): 147 | # load json example 148 | with open('bloodtestdata.json') as json_file: 149 | data = json.load(json_file) 150 | 151 | for i in range(22): 152 | data['bloodtest'][i]['value'] = ss[i] 153 | json_data = json.dumps(data, ensure_ascii=False, indent=4) 154 | 155 | db.files.update_one({ 156 | '_id': bson.objectid.ObjectId(fid)}, { 157 | '$set': { 158 | 'report_data': json_data 159 | } 160 | }, upsert=False) 161 | 162 | 163 | file = db.files.find_one(bson.objectid.ObjectId(fid)) 164 | report_data = bson.json_util.dumps(file['report_data']) 165 | print report_data 166 | 167 | 168 | 169 | @app.route('/predict/', methods=['POST']) 170 | def predict(fid): 171 | 172 | 173 | print ("predict now!") 174 | 175 | data = json.loads(request.form.get('data')) 176 | ss = data['value'] 177 | 178 | 179 | # 若用户在输入框中对数值进行修正,则更新mongodb中的数据 180 | update_report(fid,ss) 181 | 182 | arr = numpy.array(ss) 183 | arr = numpy.reshape(arr, [1, 22]) 184 | 185 | 186 | if app.config['MODEL'] == 'rnn': 187 | sex = rnn_predict.predict_sex(arr) 188 | age = rnn_predict.predict_age(arr) 189 | result = { 190 | "sex": sex, 191 | "age": age 192 | } 193 | 194 | elif app.config['MODEL'] == 'tf': 195 | sex, age = tf_predict.predict(arr) 196 | result = { 197 | "sex": sex, 198 | "age": int(age) 199 | } 200 | elif app.config['MODEL'] == 'pd': 201 | sex, age = pd_predict.predict(arr) 202 | result = { 203 | "sex": sex, 204 | "age": int(age) 205 | } 206 | 207 | 208 | 209 | return json.dumps(result) 210 | 211 | 212 | 213 | if __name__ == '__main__': 214 | 215 | app.run(host=app.config['SERVER_HOST'], port=app.config['SERVER_PORT']) 216 | 217 | -------------------------------------------------------------------------------- /Caffe/README.md: -------------------------------------------------------------------------------- 1 | ##文件说明 2 | 3 | - caffe_sex_train_predict.py 性别预测demo主要代码,完成数据格式转换,训练及预测流程控制 4 | - config.prototxt 训练网络配置文件 5 | - lenet_train.prototxt 训练网络设置 6 | - model_prod_prototxt 预测网络设置 7 | - draw_net.py 网络绘图代码(未整合至主代码文件中) 8 | 9 | ##caffe的安装: 10 | **1、安装基本依赖** 11 
| 12 | ``` 13 | sudo apt-get install libprotobuf-dev libleveldb-dev libsnappy-dev libopencv-dev libhdf5-serial-dev protobuf-compiler 14 | ``` 15 | 16 | ``` 17 | sudo apt-get install --no-install-recommends libboost-all-dev 18 | ``` 19 | 20 | 由于ubuntu的库有各种依赖关系,apt-get可能无法解决,建议使用aptitude,会给出多个解决方案,实测可行! 21 | sudo aptitude install ... 22 | 23 | **2、若不使用gpu,可以跳过安装cuda!(而且好像16.04已经带有cuda8)** 24 | 25 | **3、安装ATLAS** 26 | 27 | ``` 28 | sudo apt-get install libatlas-base-dev 29 | ``` 30 | 31 | **4、下载caffe** 32 | 33 | ``` 34 | git clone https://github.com/BVLC/caffe.git 35 | ``` 36 | 37 | **5、修改Makefile.config** 38 | 39 | ``` 40 | cd caffe 41 | cp Makefile.config.example Makefile.config 42 | gedit Makefile.config 43 | ``` 44 | 45 | 将# cpu_only := 1的注释去掉,找到并修改为: 46 | 47 | ``` 48 | INCLUDE_DIRS := $(PYTHON_INCLUDE) /usr/local/include /usr/include/hdf5/serial 49 | LIBRARY_DIRS := $(PYTHON_LIB) /usr/local/lib /usr/lib /usr/lib/i386-linux-gnu/hdf5/serial 50 | ``` 51 | 如果是ubuntu16.04 64位版本,需要将第二项改为 : 52 | ``` 53 | LIBRARY_DIRS := $(PYTHON_LIB) /usr/local/lib /usr/lib /usr/lib/x86_64-linux-gnu /usr/lib/x86_64-linux-gnu/hdf5/serial 54 | ``` 55 | 56 | 如果make all依然有错,你可能需要进行下一步 57 | ``` 58 | cd /usr/lib/x86_64-linux-gnu 59 | 60 | sudo ln -s libhdf5_serial.so.10.1.0 libhdf5.so 61 | 62 | sudo ln -s libhdf5_serial_hl.so.10.0.2 libhdf5_hl.so 63 | ``` 64 | 这依然是版本的锅。 65 | 66 | **6、编译安装** 67 | 68 | ``` 69 | make all 70 | make test 71 | make runtest 72 | ``` 73 | 74 | 到此caffe安装已经完成! 75 | 若有需要用到python或matlab接口的,先设置好Makefile.config中的路径,再另外编译: 76 | 77 | ``` 78 | make pycaffe 79 | make matcaffe 80 | ``` 81 | ubuntu16.04 64位出错可能的解决方法: 82 | ``` 83 | # (Python 2.7 development files) 84 | sudo apt-get install -y python-dev 85 | sudo apt-get install -y python-numpy python-scipy 86 | ``` 87 | 修改Makefile.config中 88 | ``` 89 | PYTHON_INCLUDE := /usr/include/python2.7 /usr/local/lib/python2.7/dist-packages/numpy/core/include 90 | 91 | WITH_PYTHON_LAYER := 1 92 | ``` 93 | 这是因为numpy安装路径可能不一样。 94 | 95 | 添加python环境变量,方便以后imoprt caffe,打开/etc/bash.bashrc末尾添加: 96 | 97 | ``` 98 | PYTHONPATH=/xxx/xxx/caffe/python:$PYTHONPATH 99 | ``` 100 | 101 | 另外pycaffe的接口暴露在caff目录下的python文件夹,只需要import caffe就可以直接调用。matcaffe接口官网有介绍。 102 | 103 | ##prototxt网络模型绘制成可视化图片 104 | 105 | draw_net.py可以将网络模型由prototxt变成一张图片,draw_net.py存放在caffe根目录下python文件夹中。 106 | 107 | 绘制网络模型前,先安装两个库:GraphViz和pydot 108 | 109 | **1.安装GraphViz** 110 | 111 | Graphviz的是一款图形绘制工具,用来被python程序调用绘制图片 112 | 113 | sudo apt-get install GraphViz 114 | 115 | **2.安装pydot** 116 | 117 | pydot是python的支持画图的库 118 | 119 | sudo pip install pydot 120 | 121 | **3.编译pycaffe** 122 | 123 | make pycaffe 124 | 125 | 完成上面三个步骤之后,就可以绘制网络模型了,draw_net.py执行的时候带三个参数 126 | 127 | 第一个参数:网络模型的prototxt文件 128 | 129 | 第二个参数:保存的图片路径及名字 130 | 131 | 第二个参数:–rankdir=x , x 有四种选项,分别是LR, RL, TB, BT 。用来表示网络的方向,分别是从左到右,从右到左,从上到小,从下到上。默认为LR。 132 | 133 | **绘制Lenet模型** 134 | 135 | 在caffe根目录下 136 | 137 | python python/draw_net.py examples/mnist/lenet_train_test.prototxt ./lenet_train_test.jpg --rankdir=BT 138 | 139 | 绘制完成后将会生成lenet_train_test.jpg 140 | 141 | ## 利用CAFFE预测病人性别,正确率只有70%,还可以通过优化网络结构进行提升 142 | 143 | ### 环境配置(Ubuntu 14.04或以上版本) 144 | 145 | 如果还有模块没有安装,可以使用如下命令安装 146 | ``` 147 | sudo pip install module_name 148 | ``` 149 | 获取的数据来源: 150 | 151 | 同项目目录下`Spark/BllodTestReportDeeplearning/data_set.csv` 152 | 153 | ### 使用 154 | - 在当前目录下建立两个数据库文件夹,test_data_lmdb,train_data_lmdb 155 | 156 | ``` 157 | mkdir test_data_lmdb train_datalmdb 158 | ``` 159 | - 运行caffe_sex_train_predict.py 160 | 161 | ``` 162 | python 
caffe_sex_train_predict.py 163 | ``` 164 | 165 | 注意:重复运行create_data_lmdb()并不会覆盖原来的文件,而是会在原文件结尾处继续生成新数据,如 166 | 果需要重新调试,可以删除两个文件 167 | 168 | 相关资料链接: 169 | 官网上神经网络搭建实例: 170 | http://nbviewer.ipython.org/github/joyofdata/joyofdata-articles/blob/master/deeplearning-with-caffe/Neural-Networks-with-Caffe-on-the-GPU.ipynb 171 | 172 | layer 详解: 173 | http://blog.csdn.net/u011762313/article/details/47361571#sigmoid 174 | -------------------------------------------------------------------------------- /Caffe/caffe_sex_train_predict.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | import random 4 | import subprocess 5 | import platform 6 | import sys,os 7 | sys.path.append('/home/summer/Desktop/caffe/python') 8 | import caffe 9 | import lmdb 10 | from sklearn.cross_validation import StratifiedShuffleSplit 11 | import pandas as pd 12 | import numpy as np 13 | import matplotlib.pyplot as plt 14 | 15 | 16 | 17 | def extract(filename): 18 | matrix = np.loadtxt(filename, dtype='string', skiprows= 1,delimiter=',', usecols=(1,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28)) 19 | matrix = matrix_filter(matrix) 20 | matrix = np.asarray(matrix) 21 | 22 | data = matrix[:,1:27] 23 | sex = matrix[:,0] 24 | data = data.astype(np.float) #convert string to float 25 | for i in range(len(sex)): 26 | if sex[i] == '\xc4\xd0': 27 | sex[i] = 1 28 | else : 29 | if sex[i] != '1': 30 | sex[i] = 0 31 | return data,sex 32 | #filter the row which contains the wrong elements 33 | def matrix_filter(matrix): 34 | count = 0 35 | flag = 0 36 | for row in matrix: 37 | for cloumn in row: 38 | if cloumn == '--.--' or cloumn == '': #Discard the wrong value 39 | flag = 1 40 | 41 | if count == 0 and flag == 0: 42 | table = row 43 | count = 1 44 | continue 45 | if flag == 0 : 46 | table = np.c_[table,row] #Add the elements by extend cloumns 47 | else: 48 | flag = 0 49 | table = table.transpose() #Transpose the matrix 50 | return table 51 | 52 | #nomalize the data 53 | def nomalize(X_train, X_test): 54 | 55 | ave = X_train.mean(axis=0) # get the average of cloumns 56 | std = X_train.std(axis=0) # get the standard deviation of cloumns 57 | train_table = [(row - ave)/std for row in X_train] 58 | X_train = (np.asarray(train_table)) 59 | 60 | test_table = [(row - ave)/std for row in X_test] 61 | X_test = (np.asarray(test_table)) 62 | return X_train, X_test 63 | 64 | #load data into lmdb 65 | def load_data_into_lmdb(lmdb_name, features, labels=None): 66 | env = lmdb.open(lmdb_name, map_size=features.nbytes*10) 67 | 68 | features = features[:,:,None,None] 69 | for i in range(features.shape[0]): 70 | datum = caffe.proto.caffe_pb2.Datum() 71 | 72 | datum.channels = features.shape[1] # features's number(26) 73 | datum.height = 1 # due to eachone only have one data 74 | datum.width = 1 # so the size is 1x1 75 | 76 | if features.dtype == np.int: # convert data to string 77 | datum.data = features[i].tostring() 78 | elif features.dtype == np.float: 79 | datum.float_data.extend(features[i].flat) 80 | else: 81 | raise Exception("features.dtype unknown.") 82 | 83 | if labels is not None: 84 | datum.label = int(labels[i]) 85 | 86 | str_id = '{:08}'.format(i) 87 | with env.begin(write=True) as txn: 88 | txn.put(str_id, datum.SerializeToString()) 89 | 90 | def get_data_from_lmdb_evalue(lmdb_name): 91 | lmdb_env = lmdb.open(lmdb_name, readonly=True) 92 | lmdb_txn = lmdb_env.begin() 93 | lmdb_cursor = lmdb_txn.cursor() 94 | datum = 
caffe.proto.caffe_pb2.Datum() 95 | success = 0 96 | count = 0 97 | #raw_datum = lmdb_txn.get() 98 | for key, value in lmdb_cursor: 99 | 100 | datum.ParseFromString(value) 101 | label = datum.label 102 | feature = caffe.io.datum_to_array(datum) 103 | out = net.forward(**{net.inputs[0]: np.asarray([feature])}) 104 | count+=1 105 | if np.argmax(out["prob"][0]) == label : 106 | success+=1 107 | print "success", out 108 | return count,success 109 | 110 | def create_data_lmdb(): 111 | 112 | #prefit 113 | X, y = extract('data_set.csv') 114 | vec_log = np.vectorize(lambda x: x) 115 | vec_int = np.vectorize(lambda str: int(str[-1])) 116 | features = vec_log(X) 117 | labels = vec_int(y) 118 | 119 | #train : test = 9 : 1 120 | sss = StratifiedShuffleSplit(labels, 1, test_size=0.1, random_state=0) 121 | sss = list(sss)[0] 122 | 123 | features_training = features[sss[0],] 124 | labels_training = labels[sss[0],] 125 | 126 | features_testing = features[sss[1],] 127 | labels_testing = labels[sss[1],] 128 | 129 | #nomalized data 66%, unnomalized data 53% 130 | features_training, features_testing = nomalize(features_training, features_testing) 131 | 132 | load_data_into_lmdb("train_data_lmdb", features_training, labels_training) 133 | load_data_into_lmdb("test_data_lmdb", features_testing, labels_testing) 134 | 135 | if __name__=='__main__': 136 | #建立lmdb格式数据库,只需创建一次,再次创建需要清除原来数据文件 137 | create_data_lmdb(); 138 | #根据配置文件开始训练模型 139 | solver = caffe.get_solver("config.prototxt") 140 | solver.solve() 141 | 142 | net = caffe.Net("model_prod_prototxt","_iter_500000.caffemodel", caffe.TEST) 143 | 144 | # if the index of the largest element matches the integer 145 | # label we stored for that case - then the prediction is right 146 | total,success = get_data_from_lmdb_evalue("test_data_lmdb/") 147 | print "accuracy:", success*100/total,"%" 148 | -------------------------------------------------------------------------------- /Caffe/config.prototxt: -------------------------------------------------------------------------------- 1 | test_iter: 50 #测试的批次 2 | test_interval: 10000 #每10000次迭代后测试一次 3 | base_lr: 0.01 #基础学习率 4 | display: 10000 #每10000次迭代显示一次数据 5 | max_iter: 500000 #最大迭代次数 6 | 7 | lr_policy: "inv" #学习率变化公式 8 | # - fixed: always return base_lr. 9 | # - step: return base_lr * gamma ^ (floor(iter / step)) 10 | # - exp: return base_lr * gamma ^ iter 11 | 12 | # - inv: return base_lr * (1 + gamma * iter) ^ (- power) 13 | # - sigmoid: the effective learning rate follows a sigmod decay 14 | # return base_lr ( 1/(1 + exp(-gamma * (iter - stepsize)))) 15 | 16 | 17 | # - multistep: similar to step but it allows non uniform steps defined by 18 | # stepvalue 19 | # - poly: the effective learning rate follows a polynomial decay, to be 20 | # zero by the max_iter. return base_lr (1 - iter/max_iter) ^ (power) 21 | 22 | gamma: 0.0001 #学习率变化参数 23 | power: 0.75 #指数参数 24 | momentum: 0.9 #学习的参数 25 | weight_decay: 0.0005 #后向传播的权重比例 26 | solver_mode: CPU #运行模式为cpu模式 27 | net: "lenet_train.prototxt" #训练网络 28 | stepsize: 100000 #每100000次迭代减少学习率 29 | -------------------------------------------------------------------------------- /Caffe/draw_net.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Draw a graph of the net architecture. 
4 | """ 5 | from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter 6 | from google.protobuf import text_format 7 | 8 | import caffe 9 | import caffe.draw 10 | from caffe.proto import caffe_pb2 11 | 12 | 13 | def parse_args(): 14 | """Parse input arguments 15 | """ 16 | 17 | parser = ArgumentParser(description=__doc__, 18 | formatter_class=ArgumentDefaultsHelpFormatter) 19 | 20 | parser.add_argument('input_net_proto_file', 21 | help='Input network prototxt file') 22 | parser.add_argument('output_image_file', 23 | help='Output image file') 24 | parser.add_argument('--rankdir', 25 | help=('One of TB (top-bottom, i.e., vertical), ' 26 | 'RL (right-left, i.e., horizontal), or another ' 27 | 'valid dot option; see ' 28 | 'http://www.graphviz.org/doc/info/' 29 | 'attrs.html#k:rankdir'), 30 | default='LR') 31 | parser.add_argument('--phase', 32 | help=('Which network phase to draw: can be TRAIN, ' 33 | 'TEST, or ALL. If ALL, then all layers are drawn ' 34 | 'regardless of phase.'), 35 | default="ALL") 36 | 37 | args = parser.parse_args() 38 | return args 39 | 40 | 41 | def main(): 42 | args = parse_args() 43 | net = caffe_pb2.NetParameter() 44 | text_format.Merge(open(args.input_net_proto_file).read(), net) 45 | print('Drawing net to %s' % args.output_image_file) 46 | phase=None; 47 | if args.phase == "TRAIN": 48 | phase = caffe.TRAIN 49 | elif args.phase == "TEST": 50 | phase = caffe.TEST 51 | elif args.phase != "ALL": 52 | raise ValueError("Unknown phase: " + args.phase) 53 | caffe.draw.draw_net_to_file(net, args.output_image_file, args.rankdir, 54 | phase) 55 | 56 | 57 | if __name__ == '__main__': 58 | main() 59 | -------------------------------------------------------------------------------- /Caffe/lenet_train.prototxt: -------------------------------------------------------------------------------- 1 | name: "LeNet" 2 | layer { 3 | name: "mnist" 4 | type: "Data" #type 定义层次类型 5 | top: "data" #输入数据来自一些’bottom’ blobs, 输出一些’top’ blobs 6 | top: "label" 7 | include { 8 | phase: TRAIN #指明训练网络 9 | } 10 | transform_param { #数据的预处理,一般图像设置为该值,1/255,本项目中用不到。 11 | #scale: 0.00390625 12 | } 13 | data_param { #数据来源,批处理大小,和LMDB数据格式 14 | source: "train_data_lmdb" 15 | batch_size: 1 16 | backend: LMDB 17 | } 18 | } 19 | layer { 20 | name: "mnist" 21 | type: "Data" 22 | top: "data" 23 | top: "label" 24 | include { 25 | phase: TEST #两个放在一起代表训练时采用的train和test网络用的同一个网络结构 26 | } 27 | transform_param { #数据的预处理 28 | #scale: 0.00390625 29 | } 30 | data_param { 31 | source: "test_data_lmdb" 32 | batch_size: 100 33 | backend: LMDB 34 | } 35 | } 36 | layer { 37 | name: "ip1" 38 | type: "InnerProduct" #内积层又叫全连接层,输入当做一个一维向量,产生的输出也是以向量的形式输出, 39 | bottom: "data" 40 | top: "ip1" 41 | param { #层的权值和偏置相关参数 42 | lr_mult: 1 43 | } 44 | param { 45 | lr_mult: 2 46 | } 47 | inner_product_param { #全连接层节点数设置,或者说滤波器数目(必须设置) 48 | num_output: 300 49 | weight_filler { 50 | type: "xavier" #滤波器类型 51 | } 52 | bias_filler { #偏执类型,默认值 53 | type: "constant" 54 | } 55 | } 56 | } 57 | layer { #激励层:常见激励函数有,relu:max(x, 0) 58 | name: "relu1" # sigmod,TanH,AbsVal 等等 59 | type: "ReLU" 60 | bottom: "ip1" 61 | top: "ip1" 62 | } 63 | layer { 64 | name: "ip2" 65 | type: "InnerProduct" 66 | bottom: "ip1" 67 | top: "ip2" 68 | param { 69 | lr_mult: 1 70 | } 71 | param { 72 | lr_mult: 2 73 | } 74 | inner_product_param { 75 | num_output: 2 76 | weight_filler { 77 | type: "xavier" 78 | } 79 | bias_filler { 80 | type: "constant" 81 | } 82 | } 83 | } 84 | layer { 85 | name: "accuracy" #test 网络输出准确率 86 | type: "Accuracy" 87 | bottom: "ip2" 88 | bottom: 
"label" 89 | top: "accuracy" 90 | include { 91 | phase: TEST 92 | } 93 | } 94 | layer { 95 | name: "loss" 96 | type: "SoftmaxWithLoss" #SoftmaxWithLoss(广义线性回归分析损失层) 97 | bottom: "ip2" 98 | bottom: "label" 99 | top: "loss" 100 | } 101 | -------------------------------------------------------------------------------- /Caffe/model_prod_prototxt: -------------------------------------------------------------------------------- 1 | name: "otto" 2 | input: "data" //指定输入的shape.num,channels,height,width 3 | input_dim: 1 4 | input_dim: 26 5 | input_dim: 1 6 | input_dim: 1 7 | layer { 8 | name: "ip1" 9 | type: "InnerProduct" 10 | bottom: "data" 11 | top: "ip1" 12 | inner_product_param { 13 | num_output: 300 14 | weight_filler { 15 | type: "xavier" 16 | } 17 | bias_filler { 18 | type: "constant" 19 | value: 0 20 | } 21 | } 22 | } 23 | layer { 24 | name: "relu1" 25 | type: "ReLU" 26 | bottom: "ip1" 27 | top: "ip1" 28 | } 29 | layer { 30 | name: "ip2" 31 | type: "InnerProduct" 32 | bottom: "ip1" 33 | top: "ip2" 34 | inner_product_param { 35 | num_output: 2 36 | weight_filler { 37 | type: "xavier" 38 | } 39 | bias_filler { 40 | type: "constant" 41 | value: 0 42 | } 43 | } 44 | } 45 | layer { 46 | name: "prob" 47 | type: "Softmax" 48 | bottom: "ip2" 49 | top: "prob" 50 | } 51 | -------------------------------------------------------------------------------- /DigitRecogn/README.md: -------------------------------------------------------------------------------- 1 | ### 神经网络实现手写字符识别系统 2 | 3 | - BP神经网络 4 | - 输入层维数 400 5 | - 隐藏层神经元 15 6 | - 输出层维数 10 7 | - 学习率 0.1 8 | - 激活函数 sigmoid 9 | - 参数保存在 nn.json 10 | 11 | #### 环境配置(如果在本地运行) 12 | - 系统: ubuntu 14.04 64位 13 | 14 | ``` 15 | # 安装pip 16 | sudo apt-get install python-pip 17 | 18 | # 用pip安装numpy和scipy, 使用科大镜像加速 19 | pip install --user numpy scipy -i https://pypi.mirrors.ustc.edu.cn/simple 20 | 21 | # 如果上一步安装失败就使用ubuntu的包管理器试试 22 | sudo apt-get install python-numpy python-scipy 23 | 24 | # 安装sklearn, neural_network_design.py需要调用它做交叉验证 25 | pip install -U scikit-learn -i https://pypi.mirrors.ustc.edu.cn/simple 26 | 27 | # 如果在服务器上运行,修改ocr.js里的HOST为服务器的地址,如http://2016.mc2lab.com 28 | 29 | ``` 30 | 31 | 32 | #### 运行 33 | 34 | 1. 下载图像和标签数据 35 | 36 | 37 | wget http://labfile.oss.aliyuncs.com/courses/593/data.csv 38 | wget http://labfile.oss.aliyuncs.com/courses/593/dataLabels.csv 39 | 40 | 41 | 2. 训练模型 42 | 43 | python neural_network_design.py 44 | 45 | 3. 创建服务器 46 | 47 | python -m SimpleHTTPServer 3000 48 | 49 | 4. 加载服务器 50 | 51 | python server.py 52 | 53 | 5. 访问 54 | 55 | localhost:3000 56 | 57 | 58 | * 实现指导见https://www.shiyanlou.com/courses/593 59 | -------------------------------------------------------------------------------- /DigitRecogn/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |
9 | OCR Demo 10 | 11 | 12 | Digit: 13 | 14 | 15 | 16 | 17 | 
18 | 19 | 20 | -------------------------------------------------------------------------------- /DigitRecogn/neural_network_design.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # neural_network_design.py 3 | 4 | import numpy as np 5 | from ocr import OCRNeuralNetwork 6 | from sklearn.cross_validation import train_test_split 7 | 8 | def test(data_matrix, data_labels, test_indices, nn): 9 | correct_guess_count = 0 10 | for i in test_indices: 11 | test = data_matrix[i] 12 | prediction = nn.predict(test) 13 | if data_labels[i] == prediction: 14 | correct_guess_count += 1 15 | return correct_guess_count / float(len(test_indices)) 16 | 17 | data_matrix = np.loadtxt(open('data.csv', 'rb'), delimiter = ',').tolist() 18 | data_labels = np.loadtxt(open('dataLabels.csv', 'rb')).tolist() 19 | 20 | # Create training and testing sets. 21 | train_indices, test_indices = train_test_split(list(range(5000))) 22 | 23 | print "PERFORMANCE" 24 | print "-----------" 25 | 26 | for i in xrange(5, 50, 5): 27 | nn = OCRNeuralNetwork(i, data_matrix, data_labels, train_indices, False) 28 | performance = str(test(data_matrix, data_labels, test_indices, nn)) 29 | print "{i} Hidden Nodes: {val}".format(i=i, val=performance) 30 | -------------------------------------------------------------------------------- /DigitRecogn/ocr.js: -------------------------------------------------------------------------------- 1 | var ocrDemo = { 2 | CANVAS_WIDTH: 200, 3 | TRANSLATED_WIDTH: 20, 4 | PIXEL_WIDTH: 10, // TRANSLATED_WIDTH = CANVAS_WIDTH / PIXEL_WIDTH 5 | BATCH_SIZE: 1, 6 | 7 | // 服务器端参数 8 | PORT: "9000", 9 | HOST: "http://localhost", 10 | 11 | // 颜色变量 12 | BLACK: "#000000", 13 | BLUE: "#0000ff", 14 | 15 | // 客户端训练数据集 16 | trainArray: [], 17 | trainingRequestCount: 0, 18 | 19 | onLoadFunction: function() { 20 | this.resetCanvas(); 21 | }, 22 | 23 | resetCanvas: function() { 24 | var canvas = document.getElementById('canvas'); 25 | var ctx = canvas.getContext('2d'); 26 | 27 | this.data = []; 28 | ctx.fillStyle = this.BLACK; 29 | ctx.fillRect(0, 0, this.CANVAS_WIDTH, this.CANVAS_WIDTH); 30 | var matrixSize = 400; 31 | while (matrixSize--) this.data.push(0); 32 | this.drawGrid(ctx); 33 | 34 | // 绑定事件操作 35 | canvas.onmousemove = function(e) { this.onMouseMove(e, ctx, canvas) }.bind(this); 36 | canvas.onmousedown = function(e) { this.onMouseDown(e, ctx, canvas) }.bind(this); 37 | canvas.onmouseup = function(e) { this.onMouseUp(e, ctx) }.bind(this); 38 | }, 39 | 40 | drawGrid: function(ctx) { 41 | for (var x = this.PIXEL_WIDTH, y = this.PIXEL_WIDTH; x < this.CANVAS_WIDTH; x += this.PIXEL_WIDTH, y += this.PIXEL_WIDTH) { 42 | ctx.strokeStyle = this.BLUE; 43 | ctx.beginPath(); 44 | ctx.moveTo(x, 0); 45 | ctx.lineTo(x, this.CANVAS_WIDTH); 46 | ctx.stroke(); 47 | 48 | ctx.beginPath(); 49 | ctx.moveTo(0, y); 50 | ctx.lineTo(this.CANVAS_WIDTH, y); 51 | ctx.stroke(); 52 | } 53 | }, 54 | 55 | onMouseMove: function(e, ctx, canvas) { 56 | if (!canvas.isDrawing) { 57 | return; 58 | } 59 | this.fillSquare(ctx, e.clientX - canvas.offsetLeft, e.clientY - canvas.offsetTop); 60 | }, 61 | 62 | onMouseDown: function(e, ctx, canvas) { 63 | canvas.isDrawing = true; 64 | this.fillSquare(ctx, e.clientX - canvas.offsetLeft, e.clientY - canvas.offsetTop); 65 | }, 66 | 67 | onMouseUp: function(e) { 68 | canvas.isDrawing = false; 69 | }, 70 | 71 | fillSquare: function(ctx, x, y) { 72 | var xPixel = Math.floor(x / this.PIXEL_WIDTH); 73 | var yPixel = Math.floor(y / this.PIXEL_WIDTH); 74 | 
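// Note on the indexing below (descriptive comment, not part of the original file): the 200x200
// canvas is divided into a 20x20 grid (TRANSLATED_WIDTH cells per side), and the cell hit at
// (xPixel, yPixel) is flattened into the 400-element data array as
// (xPixel - 1) * TRANSLATED_WIDTH + yPixel - 1, i.e. column-major with a one-cell offset.
// The network only ever sees this as a flat 400-dimensional vector, so any fixed mapping works
// as long as training and prediction both go through this same function.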
// 存储手写输入数据 75 | this.data[((xPixel - 1) * this.TRANSLATED_WIDTH + yPixel) - 1] = 1; 76 | 77 | ctx.fillStyle = '#ffffff'; 78 | ctx.fillRect(xPixel * this.PIXEL_WIDTH, yPixel * this.PIXEL_WIDTH, this.PIXEL_WIDTH, this.PIXEL_WIDTH); 79 | }, 80 | 81 | train: function() { 82 | var digitVal = document.getElementById("digit").value; 83 | if (!digitVal || this.data.indexOf(1) < 0) { 84 | alert("Please type and draw a digit value in order to train the network"); 85 | return; 86 | } 87 | // 将数据加入客户端训练数据集 88 | this.trainArray.push({"y0": this.data, "label": parseInt(digitVal)}); 89 | this.trainingRequestCount++; 90 | 91 | // 将客服端训练数据集发送给服务器端 92 | if (this.trainingRequestCount == this.BATCH_SIZE) { 93 | alert("Sending training data to server..."); 94 | var json = { 95 | trainArray: this.trainArray, 96 | train: true 97 | }; 98 | 99 | this.sendData(json); 100 | this.trainingRequestCount = 0; 101 | this.trainArray = []; 102 | } 103 | }, 104 | 105 | // 发送预测请求 106 | test: function() { 107 | if (this.data.indexOf(1) < 0) { 108 | alert("Please draw a digit in order to test the network"); 109 | return; 110 | } 111 | var json = { 112 | image: this.data, 113 | predict: true 114 | }; 115 | this.sendData(json); 116 | }, 117 | 118 | // 处理服务器响应 119 | receiveResponse: function(xmlHttp) { 120 | if (xmlHttp.status != 200) { 121 | alert("Server returned status " + xmlHttp.status); 122 | return; 123 | } 124 | var responseJSON = JSON.parse(xmlHttp.responseText); 125 | if (xmlHttp.responseText && responseJSON.type == "test") { 126 | alert("The neural network predicts you wrote a \'" + responseJSON.result + '\''); 127 | } 128 | }, 129 | 130 | onError: function(e) { 131 | alert("Error occurred while connecting to server: " + e.target.statusText); 132 | }, 133 | 134 | sendData: function(json) { 135 | var xmlHttp = new XMLHttpRequest(); 136 | xmlHttp.open('POST', this.HOST + ":" + this.PORT, false); 137 | xmlHttp.onload = function() { this.receiveResponse(xmlHttp); }.bind(this); 138 | xmlHttp.onerror = function() { this.onError(xmlHttp) }.bind(this); 139 | var msg = JSON.stringify(json); 140 | xmlHttp.setRequestHeader('Content-length', msg.length); 141 | xmlHttp.setRequestHeader("Connection", "close"); 142 | xmlHttp.send(msg); 143 | } 144 | } 145 | 146 | -------------------------------------------------------------------------------- /DigitRecogn/ocr.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | 3 | import csv 4 | import numpy as np 5 | from numpy import matrix 6 | from math import pow 7 | from collections import namedtuple 8 | import math 9 | import random 10 | import os 11 | import json 12 | 13 | class OCRNeuralNetwork: 14 | LEARNING_RATE = 0.1 15 | WIDTH_IN_PIXELS = 20 16 | # 保存神经网络的文件路径 17 | NN_FILE_PATH = 'nn.json' 18 | 19 | def __init__(self, num_hidden_nodes, data_matrix, data_labels, training_indices, use_file=True): 20 | # sigmoid函数 21 | self.sigmoid = np.vectorize(self._sigmoid_scalar) 22 | # sigmoid求导函数 23 | self.sigmoid_prime = np.vectorize(self._sigmoid_prime_scalar) 24 | # 决定了要不要导入nn.json 25 | self._use_file = use_file 26 | # 数据集 27 | self.data_matrix = data_matrix 28 | self.data_labels = data_labels 29 | 30 | if (not os.path.isfile(OCRNeuralNetwork.NN_FILE_PATH) or not use_file): 31 | # 初始化神经网络 32 | self.theta1 = self._rand_initialize_weights(400, num_hidden_nodes) 33 | self.theta2 = self._rand_initialize_weights(num_hidden_nodes, 10) 34 | self.input_layer_bias = self._rand_initialize_weights(1, num_hidden_nodes) 35 | self.hidden_layer_bias 
= self._rand_initialize_weights(1, 10) 36 | 37 | # 训练并保存 38 | TrainData = namedtuple('TrainData', ['y0', 'label']) 39 | self.train([TrainData(self.data_matrix[i], int(self.data_labels[i])) for i in training_indices]) 40 | self.save() 41 | else: 42 | # 如果nn.json存在则加载 43 | self._load() 44 | 45 | def _rand_initialize_weights(self, size_in, size_out): 46 | return [((x * 0.12) - 0.06) for x in np.random.rand(size_out, size_in)] 47 | 48 | def _sigmoid_scalar(self, z): 49 | return 1 / (1 + math.e ** -z) 50 | 51 | def _sigmoid_prime_scalar(self, z): 52 | return self.sigmoid(z) * (1 - self.sigmoid(z)) 53 | 54 | 55 | def train(self, training_data_array): 56 | for data in training_data_array: 57 | # 前向传播得到结果向量 58 | y1 = np.dot(np.mat(self.theta1), np.mat(data.y0).T) 59 | sum1 = y1 + np.mat(self.input_layer_bias) 60 | y1 = self.sigmoid(sum1) 61 | 62 | y2 = np.dot(np.array(self.theta2), y1) 63 | y2 = np.add(y2, self.hidden_layer_bias) 64 | y2 = self.sigmoid(y2) 65 | 66 | # 后向传播得到误差向量 67 | actual_vals = [0] * 10 68 | actual_vals[data.label] = 1 69 | output_errors = np.mat(actual_vals).T - np.mat(y2) 70 | hidden_errors = np.multiply(np.dot(np.mat(self.theta2).T, output_errors), self.sigmoid_prime(sum1)) 71 | 72 | # 更新权重矩阵与偏置向量 73 | self.theta1 += self.LEARNING_RATE * np.dot(np.mat(hidden_errors), np.mat(data.y0)) 74 | self.theta2 += self.LEARNING_RATE * np.dot(np.mat(output_errors), np.mat(y1).T) 75 | self.hidden_layer_bias += self.LEARNING_RATE * output_errors 76 | self.input_layer_bias += self.LEARNING_RATE * hidden_errors 77 | 78 | def predict(self, test): 79 | y1 = np.dot(np.mat(self.theta1), np.mat(test).T) 80 | y1 = y1 + np.mat(self.input_layer_bias) # Add the bias 81 | y1 = self.sigmoid(y1) 82 | 83 | y2 = np.dot(np.array(self.theta2), y1) 84 | y2 = np.add(y2, self.hidden_layer_bias) # Add the bias 85 | y2 = self.sigmoid(y2) 86 | 87 | results = y2.T.tolist()[0] 88 | return results.index(max(results)) 89 | 90 | def save(self): 91 | if not self._use_file: 92 | return 93 | 94 | json_neural_network = { 95 | "theta1":[np_mat.tolist()[0] for np_mat in self.theta1], 96 | "theta2":[np_mat.tolist()[0] for np_mat in self.theta2], 97 | "b1":self.input_layer_bias[0].tolist()[0], 98 | "b2":self.hidden_layer_bias[0].tolist()[0] 99 | }; 100 | with open(OCRNeuralNetwork.NN_FILE_PATH,'w') as nnFile: 101 | json.dump(json_neural_network, nnFile) 102 | 103 | def _load(self): 104 | if not self._use_file: 105 | return 106 | 107 | with open(OCRNeuralNetwork.NN_FILE_PATH) as nnFile: 108 | nn = json.load(nnFile) 109 | self.theta1 = [np.array(li) for li in nn['theta1']] 110 | self.theta2 = [np.array(li) for li in nn['theta2']] 111 | self.input_layer_bias = [np.array(nn['b1'][0])] 112 | self.hidden_layer_bias = [np.array(nn['b2'][0])] 113 | -------------------------------------------------------------------------------- /DigitRecogn/server.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import BaseHTTPServer 3 | import json 4 | from ocr import OCRNeuralNetwork 5 | import numpy as np 6 | import random 7 | from collections import namedtuple 8 | 9 | #服务器端配置 10 | HOST_NAME = '' 11 | PORT_NUMBER = 9000 12 | #这个值是通过运行神经网络设计脚本得到的最优值 13 | HIDDEN_NODE_COUNT = 15 14 | 15 | # 加载数据集 16 | data_matrix = np.loadtxt(open('data.csv', 'rb'), delimiter = ',') 17 | data_labels = np.loadtxt(open('dataLabels.csv', 'rb')) 18 | 19 | # 转换成list类型 20 | data_matrix = data_matrix.tolist() 21 | data_labels = data_labels.tolist() 22 | 23 | # 数据集一共5000个数据,train_indice存储用来训练的数据的序号 24 | 
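# Note (descriptive comment, not part of the original file): train_indice ends up holding all
# 5000 indices, so the server-side network is trained on the full data set with no held-out
# test split; shuffling only changes the order in which samples are presented. This file
# targets Python 2 (BaseHTTPServer, print statements); under Python 3, range() would have to
# be wrapped in list() before random.shuffle() could operate on it.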
train_indice = range(5000) 25 | # 打乱训练顺序 26 | random.shuffle(train_indice) 27 | 28 | nn = OCRNeuralNetwork(HIDDEN_NODE_COUNT, data_matrix, data_labels, train_indice); 29 | 30 | class JSONHandler(BaseHTTPServer.BaseHTTPRequestHandler): 31 | """处理接收到的POST请求""" 32 | def do_POST(self): 33 | response_code = 200 34 | response = "" 35 | var_len = int(self.headers.get('Content-Length')) 36 | content = self.rfile.read(var_len); 37 | payload = json.loads(content); 38 | 39 | # 如果是训练请求,训练然后保存训练完的神经网络 40 | if payload.get('train'): 41 | # 转化数据格式 42 | TrainData = namedtuple('TrainData', ['y0', 'label']) 43 | nn.train([TrainData(payload['trainArray'][0]['y0'],payload['trainArray'][0]['label'])]) 44 | nn.save() 45 | # 如果是预测请求,返回预测值 46 | elif payload.get('predict'): 47 | try: 48 | print nn.predict(data_matrix[0]) 49 | response = {"type":"test", "result":str(nn.predict(payload['image']))} 50 | except: 51 | response_code = 500 52 | else: 53 | response_code = 400 54 | 55 | self.send_response(response_code) 56 | self.send_header("Content-type", "application/json") 57 | self.send_header("Access-Control-Allow-Origin", "*") 58 | self.end_headers() 59 | if response: 60 | self.wfile.write(json.dumps(response)) 61 | return 62 | 63 | if __name__ == '__main__': 64 | server_class = BaseHTTPServer.HTTPServer; 65 | httpd = server_class((HOST_NAME, PORT_NUMBER), JSONHandler) 66 | 67 | try: 68 | #启动服务器 69 | httpd.serve_forever() 70 | except KeyboardInterrupt: 71 | pass 72 | else: 73 | print "Unexpected server exception occurred." 74 | finally: 75 | httpd.server_close() 76 | 77 | -------------------------------------------------------------------------------- /Keras/.gitignore: -------------------------------------------------------------------------------- 1 | *.csv 2 | -------------------------------------------------------------------------------- /Keras/KerasDistinguishAge.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | np.random.seed(1337) # for reproducibility 4 | from keras.models import Sequential, model_from_json 5 | from keras.layers.core import Dense, Dropout, Activation 6 | from keras.optimizers import SGD, Adam, RMSprop, Adagrad 7 | from keras.utils import np_utils 8 | from time import sleep 9 | 10 | batch_size = 128 11 | nb_classes = 20 12 | nb_epoch = 100 13 | def load_data(): 14 | x_train=[] 15 | Y_train=[] 16 | x_test=[] 17 | Y_test=[] 18 | 19 | f = open("train.txt","r") 20 | i = 0 21 | for line in f.readlines(): 22 | line = line.strip("\n").split(",") 23 | if i>0: 24 | Y_train.append(int(float(line[2])/5)) 25 | del line[0] 26 | del line[0] 27 | del line[0] 28 | x_train.append(line) 29 | i += 1 30 | x1=np.array(x_train) 31 | y1=np.array(Y_train) 32 | f.close() 33 | 34 | f = open("test.txt","r") 35 | i = 0 36 | for line in f.readlines(): 37 | line = line.strip("\n").split(",") 38 | if i>0: 39 | Y_test.append(int(float(line[2])/5)) 40 | del line[0] 41 | del line[0] 42 | del line[0] 43 | x_test.append(line) 44 | i += 1 45 | x2=np.array(x_test) 46 | y2=np.array(Y_test) 47 | f.close() 48 | 49 | return (x1, y1), (x2, y2) 50 | 51 | # the data, shuffled and split between train and test sets 52 | (X_train, y_train), (X_test, y_test) = load_data() 53 | X_train = X_train.reshape(1858, 26) 54 | X_test = X_test.reshape(200, 26) 55 | X_train = X_train.astype('float32') 56 | X_test = X_test.astype('float32') 57 | X_train /= 255 58 | X_test /= 255 59 | print(X_train.shape[0], 'train samples') 60 | print(X_test.shape[0], 'test 
samples') 61 | # convert class vectors to binary class matrices 62 | Y_train = np_utils.to_categorical(y_train, nb_classes) 63 | Y_test = np_utils.to_categorical(y_test, nb_classes) 64 | 65 | model = Sequential() 66 | model.add(Dense(16, input_shape=(26,))) 67 | model.add(Activation('relu')) 68 | model.add(Dropout(0.1)) 69 | model.add(Dense(output_dim=247)) 70 | model.add(Activation('relu')) 71 | model.add(Dropout(0.1)) 72 | model.add(Dense(output_dim=125)) 73 | model.add(Activation('relu')) 74 | model.add(Dropout(0.2)) 75 | model.add(Dense(output_dim=20)) 76 | model.add(Activation('softmax')) 77 | 78 | adagrad=Adagrad(lr=0.02, epsilon=1e-4) 79 | #model.compile(loss='categorical_crossentropy',optimizer=RMSprop(),metrics=['accuracy']) 80 | model.compile(loss='categorical_crossentropy',optimizer=adagrad,metrics=['accuracy']) 81 | history = model.fit(X_train, Y_train,batch_size=batch_size, 82 | nb_epoch=nb_epoch,verbose=1, 83 | validation_data=(X_test, Y_test)) 84 | # 输出结果看看 85 | # result = [] 86 | # result = model.predict_classes(X_test,batch_size=batch_size,verbose=1) 87 | # 88 | # for r in result: 89 | # print r 90 | 91 | score = model.evaluate(X_test, Y_test, verbose=1) 92 | # print('Test score:', score[0]) 93 | print('Test accuracy:', score[1]) 94 | print "end" 95 | 96 | # #保存模型 97 | # json_string = model.to_json() 98 | # open("model/model_json.json","w").write(json_string) 99 | # #保存权重,暂时保存权重会出错 100 | # model.save_weights("model/model_json_weight.h5") 101 | # #加载保存的模型 102 | # model = model_from_json(open("model/model_json.json").read()) 103 | # model.load_weights("model/model_json_weight.h5") 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | -------------------------------------------------------------------------------- /Keras/README.md: -------------------------------------------------------------------------------- 1 | Keras手写识别字符集Demo 2 | 3 | 1.关于Keras 4 | 5 | Keras是基于Theano和TensorFlow的一个深度学习框架,它的设计参考了Torch,用Python语言编写,是一个高度模块化的神经网络库,支持GPU和CPU。 6 | 7 | 2.关于Keras的Sequential模型 8 | 9 | Keras有两种类型的模型,顺序模型(Sequential)和泛型模型(Model): 10 | 11 | 2.1 Sequential是多个网络层的线性堆叠,可以通过向Sequential模型传递一个layer的list来构造该模型: 12 | from keras.models import Sequential 13 | from keras.layers import Dense, Activation 14 | model = Sequential([Dense(32, input_dim=784),Activation('relu'),Dense(10),Activation('softmax'),]) 15 | 也可以通过.add()方法一个个的将layer加入模型中: 16 | model = Sequential() 17 | model.add(Dense(32, input_dim=784)) 18 | model.add(Activation('relu')) 19 | 20 | 2.2 模型需要知道输入数据的shape 21 | Sequential的第一层需要接受一个关于输入数据shape的参数,后面的各个层则可以自动的推导出中间数据的shape,因此不需要为每个层都指定这个参数。 22 | 23 | 有几种方法来为第一层指定输入数据的shape: 24 | 1.传递一个input_shape的关键字参数给第一层,input_shape是一个tuple类型的数据,其中也可以填入None,如果填入None则表示此位置可能是任何正整数。数据的batch大小不应包含在其中。 25 | 2.传递一个batch_input_shape的关键字参数给第一层,该参数包含数据的batch大小。该参数在指定固定大小batch时比较有用,例如在stateful RNNs中。事实上,Keras在内部会通过添加一个None将input_shape转化为batch_input_shape 26 | 3.有些2D层,如Dense,支持通过指定其输入维度input_dim来隐含的指定输入数据shape。一些3D的时域层支持通过参数input_dim和input_length来指定输入shape。 27 | 下面的三个指定输入数据shape的方法是严格等价的: 28 | 1.model = Sequential() 29 | model.add(Dense(32, input_shape=(784,))) 30 | 2.model = Sequential() 31 | model.add(Dense(32, batch_input_shape=(None, 784))) 32 | 3.model = Sequential() 33 | model.add(Dense(32, input_dim=784)) 34 | 35 | 2.3 Sequential模型常用方法 36 | 37 | 2.3.1 compile 38 | compile(self, optimizer, loss, metrics=[], sample_weight_mode=None) 39 | 编译用来配置模型的学习过程,其参数有: 40 | optimizer:字符串(预定义优化器名)或优化器对象,参考优化器 41 | loss:字符串(预定义损失函数名)或目标函数,参考目标函数 42 | metrics:列表,包含评估模型在训练和测试时的网络性能的指标,典型用法是metrics=['accuracy'] 43 | 
sample_weight_mode:如果你需要按时间步为样本赋权(2D权矩阵),将该值设为“temporal”。默认为“None”,代表按样本赋权(1D权)。在下面fit函数的解释中有相关的参考内容。 44 | kwargs:使用TensorFlow作为后端请忽略该参数,若使用Theano作为后端,kwargs的值将会传递给 K.function 45 | 代码示例: 46 | model = Sequential() 47 | model.add(Dense(32, input_shape=(500,))) 48 | model.add(Dense(10, activation='softmax')) 49 | model.compile(optimizer='rmsprop',loss='categorical_crossentropy',metrics=['accuracy']) 50 | 51 | 2.3.2 fit 52 | fit(self, x, y, batch_size=32, nb_epoch=10, verbose=1, callbacks=[], validation_split=0.0, validation_data=None, shuffle=True, class_weight=None, sample_weight=None) 53 | 本函数将模型训练nb_epoch轮,其参数有: 54 | x:输入数据。如果模型只有一个输入,那么x的类型是numpy array,如果模型有多个输入,那么x的类型应当为list,list的元素是对应于各个输入的numpy array 55 | y:标签,numpy array 56 | batch_size:整数,指定进行梯度下降时每个batch包含的样本数。训练时一个batch的样本会被计算一次梯度下降,使目标函数优化一步。 57 | nb_epoch:整数,训练的轮数,训练数据将会被遍历nb_epoch次。Keras中nb开头的变量均为"number of"的意思 58 | verbose:日志显示,0为不在标准输出流输出日志信息,1为输出进度条记录,2为每个epoch输出一行记录 59 | callbacks:list,其中的元素是keras.callbacks.Callback的对象。这个list中的回调函数将会在训练过程中的适当时机被调用,参考回调函数 60 | validation_split:0~1之间的浮点数,用来指定训练集的一定比例数据作为验证集。验证集将不参与训练,并在每个epoch结束后测试的模型的指标,如损失函数、精确度等。 61 | validation_data:形式为(X,y)的tuple,是指定的验证集。此参数将覆盖validation_spilt。 62 | shuffle:布尔值或字符串,一般为布尔值,表示是否在训练过程中随机打乱输入样本的顺序。若为字符串“batch”,则是用来处理HDF5数据的特殊情况,它将在batch内部将数据打乱。 63 | class_weight:字典,将不同的类别映射为不同的权值,该参数用来在训练过程中调整损失函数(只能用于训练) 64 | sample_weight:权值的numpy array,用于在训练时调整损失函数(仅用于训练)。可以传递一个1D的与样本等长的向量用于对样本进行1对1的加权,或者在面对时序数据时,传递一个的形式为(samples,sequence_length)的矩阵来为每个时间步上的样本赋不同的权。这种情况下请确定在编译模型时添加了sample_weight_mode='temporal'。 65 | 66 | 3.关于Keras的常用层 67 | 68 | 3.1 Dense层 69 | Dense(output_dim, init='glorot_uniform', activation='linear', weights=None, W_regularizer=None, b_regularizer=None, activity_regularizer=None, W_constraint=None, b_constraint=None, bias=True, input_dim=None) 70 | Dense就是常用的全连接层,这里是一个使用示例: 71 | model = Sequential() 72 | model.add(Dense(32, input_dim=16)) 73 | 74 | model = Sequential() 75 | model.add(Dense(32, input_shape=(16,))) 76 | 77 | model.add(Dense(32)) 78 | 部分常用参数: 79 | output_dim:大于0的整数,代表该层的输出维度。模型中非首层的全连接层其输入维度可以自动推断,因此非首层的全连接定义时不需要指定输入维度。 80 | init:初始化方法,为预定义初始化方法名的字符串,或用于初始化权重的Theano函数。该参数仅在不传递weights参数时才有意义。 81 | activation:激活函数,为预定义的激活函数名(参考激活函数),或逐元素(element-wise)的Theano函数。如果不指定该参数,将不会使用任何激活函数(即使用线性激活函数:a(x)=x) 82 | input_dim:整数,输入数据的维度。当Dense层作为网络的第一层时,必须指定该参数或input_shape参数。 83 | 84 | 3.2 Activation层 85 | Activation(activation) 86 | 激活层对一个层的输出施加激活函数。 87 | 88 | 3.3 Dropout层 89 | Dropout(p) 90 | 为输入数据施加Dropout。Dropout将在训练过程中每次更新参数时随机断开一定百分比(p)的输入神经元连接,Dropout层用于防止过拟合。 91 | 92 | 3.4 Flatten层 93 | Flatten() 94 | Flatten层用来将输入“压平”,即把多维的输入一维化,常用在从卷积层到全连接层的过渡。Flatten不影响batch的大小。这里是一个使用例子: 95 | model = Sequential() 96 | model.add(Convolution2D(64, 3, 3, border_mode='same', input_shape=(3, 32, 32))) 97 | # 模型输出形状 == (None, 64, 32, 32) 98 | model.add(Flatten()) 99 | # 模型输出形状 == (None, 65536) 100 | 101 | 3.5 Convolution1D层 102 | Convolution1D(nb_filter, filter_length, init='uniform', activation='linear', weights=None, border_mode='valid', subsample_length=1, W_regularizer=None, b_regularizer=None, activity_regularizer=None, W_constraint=None, b_constraint=None, bias=True, input_dim=None, input_length=None) 103 | 一维卷积层,用以在一维输入信号上进行邻域滤波。当使用该层作为首层时,需要提供关键字参数input_dim或input_shape。 104 | 105 | 3.6 Convolution2D层 106 | 二维卷积层对二维输入进行滑动窗卷积,当使用该层作为第一层时,应提供input_shape参数。 107 | 108 | 3.7 MaxPooling1D层 109 | MaxPooling1D(pool_length=2, stride=None, border_mode='valid') 110 | 对时域1D信号进行最大值池化 111 | 参数: 112 | pool_length:下采样因子,如取2则将输入下采样到一半长度 113 | stride:整数或None,步长值 114 | 
border_mode:‘valid’或者‘same’ 115 | 116 | 4.数据集 117 | 118 | kerashandwerite.py的数据集下载地址:http://pan.baidu.com/s/1nvEuc8D 119 | 120 | ## 基于Keras的CNN性别年龄预测预测 121 | 本次request通过keras实现了性别预测 122 | 123 | - 性别预测:cnn方法将train.csv的数据二分,对训练集本身的准确率在一段时间后会极高,但预测集在73%-78%。进一步增加训练次数后,反而将导致测试准确率下降。 124 | 125 | - 但是年龄预测由于训练数据过少,分类较多,目前无法得到明显的准确率提升,仅能维持在10选1的概率下达到26%-30%。 126 | 127 | 128 | ### 其他简要说明 129 | 130 | 1. gender-predict-cnn实现了一个初级的cnn性别预测算法。 131 | 132 | 2. 下一步方向是参数调优和尝试使用Inception-v4或者ResNet/VGG来实现性别或年龄预测(这可以在数据不全的场合下提高准确率)。 133 | 134 | 3. (年龄预测可以尝试按照gender-predict-cnn中的注释修改实现预测) 135 | 136 | 4. 数据集为train.csv 137 | 138 | 5. 具体函数作用参看注释和本节之前的说明。 139 | 140 | -------------------------------------------------------------------------------- /Keras/gender_age_predict_cnn.py: -------------------------------------------------------------------------------- 1 | #-*- coding: UTF-8 -*- 2 | 3 | import numpy as np 4 | from keras.utils import np_utils 5 | from keras.models import Sequential 6 | from keras.layers import Dense,Activation,Convolution2D,MaxPooling2D,Flatten 7 | from keras.optimizers import Adam 8 | import csv 9 | import string 10 | 11 | # 性别是2分类 年龄是10分类 12 | age = 10 13 | gender = 2 14 | 15 | #修改这个地方可以选择预测性别还是年龄 16 | #但年龄的准确率不忍直视 17 | test_what = gender 18 | 19 | #数据的分组边界 20 | splitor=1400 21 | 22 | # 准备数据 23 | age_orign = [] 24 | data_orign = [] 25 | sex_orign = [] 26 | with open('train.csv','rb') as precsv: 27 | reader = csv.reader(precsv) 28 | for line in reader: 29 | # 忽略第一行 30 | if reader.line_num == 1: 31 | continue 32 | if(line[1] == '\xc4\xd0'): 33 | sex_orign.append(0) # 性别数据 34 | else: 35 | sex_orign.append(1) 36 | age_orign.append(int(float(line[2])/10)) # 年龄(按照10岁为一个阶段分组) 37 | data_orign.append(line[4:]) # 血检数据 38 | 39 | # 将数据分为训练集和测试集 40 | age_train = np.array(age_orign[:splitor]) 41 | data_train = np.array(data_orign[:splitor]) 42 | sex_train = np.array(sex_orign[:splitor]) 43 | 44 | age_predict = np.array(age_orign[splitor:]) 45 | data_predict = np.array(data_orign[splitor:]) 46 | sex_predict = np.array(sex_orign[splitor:]) 47 | 48 | # 数据的维度(数据含有多少项) 49 | data_dim = data_train.shape[1] 50 | 51 | 52 | if test_what == age: 53 | XT = data_train.reshape(-1,data_dim,1,1) 54 | YT = np_utils.to_categorical(age_train,nb_classes=age) 55 | XT2 = data_predict.reshape(-1,data_dim,1,1) 56 | YT2 = np_utils.to_categorical(age_predict,nb_classes=age) 57 | output_dim = age 58 | loss_str = 'categorical_crossentropy' 59 | else: 60 | XT = data_train.reshape(-1,data_dim,1,1) 61 | YT = np_utils.to_categorical(sex_train,nb_classes=gender) 62 | XT2 = data_predict.reshape(-1,data_dim,1,1) 63 | YT2 = np_utils.to_categorical(sex_predict,nb_classes=gender) 64 | output_dim = gender 65 | loss_str = 'binary_crossentropy' 66 | 67 | # 68 | model = Sequential() 69 | 70 | # 71 | model.add( Convolution2D( 72 | nb_filter=data_dim*data_dim, 73 | nb_row=5, 74 | nb_col=5, 75 | border_mode='same', 76 | input_shape=(data_dim,1,1) 77 | )) 78 | model.add(Activation('relu')) 79 | 80 | # pooling 81 | 82 | model.add( MaxPooling2D( 83 | pool_size=(2,2), 84 | strides=(2,2), 85 | border_mode='same' 86 | )) 87 | 88 | model.add( Convolution2D(64,5,5,border_mode='same')) 89 | model.add( Flatten()) 90 | model.add( Dense(1024) ) 91 | #model.add( Activation('relu')) 92 | 93 | model.add( Activation('relu')) 94 | model.add(Dense(output_dim)) 95 | model.add( Activation('softmax')) 96 | adam = Adam(lr=0.0001) 97 | model.compile( 98 | loss=loss_str, 99 | optimizer=adam, 100 | metrics=['accuracy'] 101 | ) 102 | 103 | 
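# A minimal sketch (an assumption added here, not part of the original script, using the same
# Keras 1.x fit() API as the rest of this file): to monitor the over-fitting described in the
# README, the held-out arrays XT2/YT2 could be passed to fit() so validation accuracy is
# reported after every epoch, e.g.
#   model.fit(XT, YT, nb_epoch=100, batch_size=32, validation_data=(XT2, YT2))
# The call below is the original one, which trains on XT/YT only and evaluates once at the end.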
model.fit(XT,YT,nb_epoch=100,batch_size=32) 104 | 105 | print '====' 106 | loss,accuracy = model.evaluate(XT2,YT2) 107 | print loss 108 | print accuracy 109 | 110 | 111 | 112 | 113 | -------------------------------------------------------------------------------- /Keras/kerashandwritetest.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from keras.preprocessing.image import ImageDataGenerator 3 | from keras.models import Sequential 4 | from keras.layers.core import Dense, Dropout, Activation, Flatten 5 | from keras.layers.advanced_activations import PReLU 6 | from keras.layers.convolutional import Convolution2D, MaxPooling2D 7 | from keras.optimizers import SGD, Adadelta, Adagrad 8 | from keras.utils import np_utils, generic_utils 9 | from six.moves import range 10 | import random 11 | import os 12 | from PIL import Image 13 | import numpy as np 14 | from keras import backend 15 | backend.set_image_dim_ordering('th') 16 | 17 | #读取文件夹mnist下的42000张图片,图片为灰度图,所以为1通道,图像大小28*28 18 | #如果是将彩色图作为输入,则将1替换为3,并且data[i,:,:,:] = arr改为data[i,:,:,:] = [arr[:,:,0],arr[:,:,1],arr[:,:,2]] 19 | def load_data(): 20 | data = np.empty((42000,1,28,28),dtype="float32") 21 | label = np.empty((42000,),dtype="uint8") 22 | 23 | imgs = os.listdir("./mnist") 24 | num = len(imgs) 25 | for i in range(num): 26 | if '.jpg' in imgs[i]: 27 | img = Image.open("./mnist/"+imgs[i]) 28 | arr = np.asarray(img,dtype="float32") 29 | data[i,:,:,:] = arr 30 | label[i] = int(imgs[i].split('.')[0]) 31 | return data,label 32 | 33 | #加载数据 34 | data, label = load_data() 35 | #打乱数据 36 | index = [i for i in range(len(data))] 37 | random.shuffle(index) 38 | data = data[index] 39 | label = label[index] 40 | print(data.shape[0], ' samples') 41 | 42 | #label为0~9共10个类别,keras要求格式为binary class matrices,转化一下,直接调用keras提供的这个函数 43 | label = np_utils.to_categorical(label, 10) 44 | 45 | #开始建立CNN模型 46 | #生成一个model,可以通过向 Sequential模型传递一个layer的list来构造该model 47 | model = Sequential() 48 | 49 | #第一个卷积层,4个卷积核,每个卷积核大小5*5。1表示输入的图片的通道,灰度图为1通道。 50 | #border_mode边界模式 可以是valid或者full,valid只适用于完整的图像补丁的过滤器 51 | model.add(Convolution2D(4, 5, 5, border_mode='valid', input_shape=(1,28,28)))#input_shape在后面的层可以推导出来,不需要为每一个层都指定这个参数 52 | model.add(Activation('tanh')) 53 | #model.add(Dropout(0.5))#训练过程更新参数随机断开一定比例的神经元连接,避免过拟合,它们在正向传播过程中对于下游神经元的贡献效果暂时消失了,反向传播时该神经元也不会有任何权重的更新。 54 | 55 | #第二个卷积层,8个卷积核,每个卷积核大小3*3。 4表示输入的特征图个数,等于上一层的卷积核个数 56 | #激活函数用tanh 57 | #采用maxpooling,poolsize为(2,2) 58 | model.add(Convolution2D(8, 3, 3, border_mode='valid')) 59 | model.add(Activation('tanh')) 60 | model.add(MaxPooling2D(pool_size=(2, 2))) 61 | 62 | #第三个卷积层,16个卷积核,每个卷积核大小3*3 63 | #激活函数用tanh 64 | #采用maxpooling,poolsize为(2,2)最大化池操作,也就是下采样,这里是2维代表两个方向(竖直,水平)的 ,对输入进行size为(2,2)的下采样操作的话,结果就剩下了输入的每一维度的一半,即总的结果是原输入的四分之一。 65 | model.add(Convolution2D(16, 3, 3, border_mode='valid')) 66 | model.add(Activation('tanh')) 67 | model.add(MaxPooling2D(pool_size=(2, 2))) 68 | 69 | #全连接层,先将前一层输出的二维特征图flatten为一维的,常常用在卷积层到全链接层的过度。 70 | #Dense就是隐藏层。16就是上一层输出的特征图个数。4是根据每个卷积层计算出来的:(28-5+1)得到24,(24-3+1)/2得到11,(11-3+1)/2得到4 71 | #全连接有128个神经元节点,初始化方式为normal 72 | model.add(Flatten()) 73 | model.add(Dense(input_dim=256, output_dim=128))#256=16*4*4 74 | model.add(Activation('tanh')) 75 | 76 | #Softmax分类,输出是10类别 77 | model.add(Dense(input_dim=128, output_dim=10)) 78 | model.add(Activation('softmax')) 79 | 80 | #开始训练模型 81 | #使用SGD优化函数 ,lr学习速率, momentum参数更新动量,decay是学习速率的衰减系数(每个epoch衰减一次),Nesterov的值是False或者True,表示使不使用Nesterov momentum 82 | 
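# Additional note on decay (descriptive comment, based on the Keras SGD implementation): the
# decay factor is applied per parameter update, roughly lr_t = lr / (1 + decay * iterations),
# rather than once per epoch, so with decay=1e-6 the learning rate changes only slightly over
# this single-epoch run.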
#model.compile里的参数loss就是损失函数(目标函数), optimizer是使用的优化器,metrics列表包含评估模型在训练和测试时的网络性能的指标 83 | sgd = SGD(lr=0.05, decay=1e-6, momentum=0.9, nesterov=True) 84 | model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy']) 85 | 86 | 87 | #调用fit方法,就是一个训练过程 88 | #输入的数据,标签,进行梯度下降的时候每个batch包含的样本数,训练的轮数,是否打乱,日志显示(0不输出日志信息,1输出进度条,2每轮训练输出一条记录),是否显示精确度,选择作为验证集的比例 89 | model.fit(data, label, batch_size=100, nb_epoch=1,shuffle=True,verbose=1,show_accuracy=True,validation_split=0.2) 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | -------------------------------------------------------------------------------- /Keras/train.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | np.random.seed(1337) # for reproducibility 4 | from keras.models import Sequential 5 | from keras.layers.core import Dense, Dropout, Activation 6 | from keras.optimizers import SGD, Adam, RMSprop 7 | from keras.utils import np_utils 8 | from time import sleep 9 | batch_size = 128 10 | nb_classes = 2 11 | nb_epoch = 100 12 | def load_data(): 13 | x_train=[] 14 | Y_train=[] 15 | x_test=[] 16 | Y_test=[] 17 | 18 | f = open("train.txt","r") 19 | i = 0 20 | for line in f.readlines(): 21 | line = line.split(",") 22 | if i>0: 23 | if line[1] == "男": 24 | Y_train.append(0) 25 | else: 26 | Y_train.append(1) 27 | del line[0] 28 | del line[0] 29 | del line[0] 30 | x_train.append(line) 31 | i += 1 32 | x1=np.array(x_train) 33 | y1=np.array(Y_train) 34 | f.close() 35 | 36 | f = open("test.txt","r") 37 | i = 0 38 | for line in f.readlines(): 39 | line = line.split(",") 40 | if i>0: 41 | if line[1] == "男": 42 | Y_test.append(0) 43 | else: 44 | Y_test.append(1) 45 | del line[0] 46 | del line[0] 47 | del line[0] 48 | x_test.append(line) 49 | i += 1 50 | x2=np.array(x_test) 51 | y2=np.array(Y_test) 52 | f.close() 53 | 54 | return (x1, y1), (x2, y2) 55 | 56 | (X_train, y_train), (X_test, y_test) = load_data() 57 | X_train = X_train.reshape(1858, 26) 58 | X_test = X_test.reshape(200, 26) 59 | X_train = X_train.astype('float32') 60 | X_test = X_test.astype('float32') 61 | X_train /= 255 62 | X_test /= 255 63 | 64 | print(X_train.shape[0], 'train samples') 65 | print(X_test.shape[0], 'test samples') 66 | 67 | Y_train = np_utils.to_categorical(y_train, nb_classes) 68 | Y_test = np_utils.to_categorical(y_test, nb_classes) 69 | #分成3层,中间隐层有32个节点 70 | model = Sequential() 71 | model.add(Dense(32, input_shape=(26,))) 72 | model.add(Activation('relu')) 73 | model.add(Dropout(0.2)) 74 | model.add(Dense(output_dim=32)) 75 | model.add(Activation('relu')) 76 | model.add(Dropout(0.2)) 77 | model.add(Dense(output_dim=2)) 78 | model.add(Activation('softmax')) 79 | 80 | model.compile(loss='categorical_crossentropy', 81 | optimizer=RMSprop(), 82 | metrics=['accuracy']) 83 | history = model.fit(X_train, Y_train, 84 | batch_size=batch_size, nb_epoch=nb_epoch, 85 | verbose=1, validation_data=(X_test, Y_test)) 86 | #输出预测结果看一下 87 | ''' 88 | result = [] 89 | result = model.predict_classes(X_test,batch_size=batch_size,verbose=1) 90 | for r in result: 91 | print r 92 | ''' 93 | 94 | score = model.evaluate(X_test, Y_test, verbose=1) 95 | 96 | #print('Test score:', score[0]) 97 | print('Test accuracy:', score[1]) 98 | print "end" 99 | 100 | 101 | 102 | -------------------------------------------------------------------------------- /LICENSE: 
-------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, and 10 | distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by the copyright 13 | owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all other entities 16 | that control, are controlled by, or are under common control with that entity. 17 | For the purposes of this definition, "control" means (i) the power, direct or 18 | indirect, to cause the direction or management of such entity, whether by 19 | contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the 20 | outstanding shares, or (iii) beneficial ownership of such entity. 21 | 22 | "You" (or "Your") shall mean an individual or Legal Entity exercising 23 | permissions granted by this License. 24 | 25 | "Source" form shall mean the preferred form for making modifications, including 26 | but not limited to software source code, documentation source, and configuration 27 | files. 28 | 29 | "Object" form shall mean any form resulting from mechanical transformation or 30 | translation of a Source form, including but not limited to compiled object code, 31 | generated documentation, and conversions to other media types. 32 | 33 | "Work" shall mean the work of authorship, whether in Source or Object form, made 34 | available under the License, as indicated by a copyright notice that is included 35 | in or attached to the work (an example is provided in the Appendix below). 36 | 37 | "Derivative Works" shall mean any work, whether in Source or Object form, that 38 | is based on (or derived from) the Work and for which the editorial revisions, 39 | annotations, elaborations, or other modifications represent, as a whole, an 40 | original work of authorship. For the purposes of this License, Derivative Works 41 | shall not include works that remain separable from, or merely link (or bind by 42 | name) to the interfaces of, the Work and Derivative Works thereof. 43 | 44 | "Contribution" shall mean any work of authorship, including the original version 45 | of the Work and any modifications or additions to that Work or Derivative Works 46 | thereof, that is intentionally submitted to Licensor for inclusion in the Work 47 | by the copyright owner or by an individual or Legal Entity authorized to submit 48 | on behalf of the copyright owner. For the purposes of this definition, 49 | "submitted" means any form of electronic, verbal, or written communication sent 50 | to the Licensor or its representatives, including but not limited to 51 | communication on electronic mailing lists, source code control systems, and 52 | issue tracking systems that are managed by, or on behalf of, the Licensor for 53 | the purpose of discussing and improving the Work, but excluding communication 54 | that is conspicuously marked or otherwise designated in writing by the copyright 55 | owner as "Not a Contribution." 56 | 57 | "Contributor" shall mean Licensor and any individual or Legal Entity on behalf 58 | of whom a Contribution has been received by Licensor and subsequently 59 | incorporated within the Work. 60 | 61 | 2. Grant of Copyright License. 
62 | 63 | Subject to the terms and conditions of this License, each Contributor hereby 64 | grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, 65 | irrevocable copyright license to reproduce, prepare Derivative Works of, 66 | publicly display, publicly perform, sublicense, and distribute the Work and such 67 | Derivative Works in Source or Object form. 68 | 69 | 3. Grant of Patent License. 70 | 71 | Subject to the terms and conditions of this License, each Contributor hereby 72 | grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, 73 | irrevocable (except as stated in this section) patent license to make, have 74 | made, use, offer to sell, sell, import, and otherwise transfer the Work, where 75 | such license applies only to those patent claims licensable by such Contributor 76 | that are necessarily infringed by their Contribution(s) alone or by combination 77 | of their Contribution(s) with the Work to which such Contribution(s) was 78 | submitted. If You institute patent litigation against any entity (including a 79 | cross-claim or counterclaim in a lawsuit) alleging that the Work or a 80 | Contribution incorporated within the Work constitutes direct or contributory 81 | patent infringement, then any patent licenses granted to You under this License 82 | for that Work shall terminate as of the date such litigation is filed. 83 | 84 | 4. Redistribution. 85 | 86 | You may reproduce and distribute copies of the Work or Derivative Works thereof 87 | in any medium, with or without modifications, and in Source or Object form, 88 | provided that You meet the following conditions: 89 | 90 | You must give any other recipients of the Work or Derivative Works a copy of 91 | this License; and 92 | You must cause any modified files to carry prominent notices stating that You 93 | changed the files; and 94 | You must retain, in the Source form of any Derivative Works that You distribute, 95 | all copyright, patent, trademark, and attribution notices from the Source form 96 | of the Work, excluding those notices that do not pertain to any part of the 97 | Derivative Works; and 98 | If the Work includes a "NOTICE" text file as part of its distribution, then any 99 | Derivative Works that You distribute must include a readable copy of the 100 | attribution notices contained within such NOTICE file, excluding those notices 101 | that do not pertain to any part of the Derivative Works, in at least one of the 102 | following places: within a NOTICE text file distributed as part of the 103 | Derivative Works; within the Source form or documentation, if provided along 104 | with the Derivative Works; or, within a display generated by the Derivative 105 | Works, if and wherever such third-party notices normally appear. The contents of 106 | the NOTICE file are for informational purposes only and do not modify the 107 | License. You may add Your own attribution notices within Derivative Works that 108 | You distribute, alongside or as an addendum to the NOTICE text from the Work, 109 | provided that such additional attribution notices cannot be construed as 110 | modifying the License. 
111 | You may add Your own copyright statement to Your modifications and may provide 112 | additional or different license terms and conditions for use, reproduction, or 113 | distribution of Your modifications, or for any such Derivative Works as a whole, 114 | provided Your use, reproduction, and distribution of the Work otherwise complies 115 | with the conditions stated in this License. 116 | 117 | 5. Submission of Contributions. 118 | 119 | Unless You explicitly state otherwise, any Contribution intentionally submitted 120 | for inclusion in the Work by You to the Licensor shall be under the terms and 121 | conditions of this License, without any additional terms or conditions. 122 | Notwithstanding the above, nothing herein shall supersede or modify the terms of 123 | any separate license agreement you may have executed with Licensor regarding 124 | such Contributions. 125 | 126 | 6. Trademarks. 127 | 128 | This License does not grant permission to use the trade names, trademarks, 129 | service marks, or product names of the Licensor, except as required for 130 | reasonable and customary use in describing the origin of the Work and 131 | reproducing the content of the NOTICE file. 132 | 133 | 7. Disclaimer of Warranty. 134 | 135 | Unless required by applicable law or agreed to in writing, Licensor provides the 136 | Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, 137 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, 138 | including, without limitation, any warranties or conditions of TITLE, 139 | NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are 140 | solely responsible for determining the appropriateness of using or 141 | redistributing the Work and assume any risks associated with Your exercise of 142 | permissions under this License. 143 | 144 | 8. Limitation of Liability. 145 | 146 | In no event and under no legal theory, whether in tort (including negligence), 147 | contract, or otherwise, unless required by applicable law (such as deliberate 148 | and grossly negligent acts) or agreed to in writing, shall any Contributor be 149 | liable to You for damages, including any direct, indirect, special, incidental, 150 | or consequential damages of any character arising as a result of this License or 151 | out of the use or inability to use the Work (including but not limited to 152 | damages for loss of goodwill, work stoppage, computer failure or malfunction, or 153 | any and all other commercial damages or losses), even if such Contributor has 154 | been advised of the possibility of such damages. 155 | 156 | 9. Accepting Warranty or Additional Liability. 157 | 158 | While redistributing the Work or Derivative Works thereof, You may choose to 159 | offer, and charge a fee for, acceptance of support, warranty, indemnity, or 160 | other liability obligations and/or rights consistent with this License. However, 161 | in accepting such obligations, You may act only on Your own behalf and on Your 162 | sole responsibility, not on behalf of any other Contributor, and only if You 163 | agree to indemnify, defend, and hold each Contributor harmless for any liability 164 | incurred by, or claims asserted against, such Contributor by reason of your 165 | accepting any such warranty or additional liability. 
166 | 167 | END OF TERMS AND CONDITIONS 168 | 169 | APPENDIX: How to apply the Apache License to your work 170 | 171 | To apply the Apache License to your work, attach the following boilerplate 172 | notice, with the fields enclosed by brackets "{}" replaced with your own 173 | identifying information. (Don't include the brackets!) The text should be 174 | enclosed in the appropriate comment syntax for the file format. We also 175 | recommend that a file or class name and description of purpose be included on 176 | the same "printed page" as the copyright notice for easier identification within 177 | third-party archives. 178 | 179 | Copyright 2016 mengning 180 | 181 | Licensed under the Apache License, Version 2.0 (the "License"); 182 | you may not use this file except in compliance with the License. 183 | You may obtain a copy of the License at 184 | 185 | http://www.apache.org/licenses/LICENSE-2.0 186 | 187 | Unless required by applicable law or agreed to in writing, software 188 | distributed under the License is distributed on an "AS IS" BASIS, 189 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 190 | See the License for the specific language governing permissions and 191 | limitations under the License. 192 | -------------------------------------------------------------------------------- /MxNet/README.md: -------------------------------------------------------------------------------- 1 | *Mxnet是一个轻量化分布式可移植深度学习计算平台,支持多机多节点、多GPU的计算,其openMP+MPI/SSH+Cuda/Cudnn的框架是的计算速度很快,且能与分布式文件系统结合实现大数据的深度学习。* 2 | ##Mxnet单节点的安装: 3 | **1、安装基本依赖** 4 | ``` 5 | sudo apt-get update 6 | ``` 7 | 8 | ``` 9 | sudo apt-get install -y build-essential git libblas-dev libopencv-dev 10 | ``` 11 | **2、下载mxnet** 12 | ``` 13 | git clone --recursive https://github.com/dmlc/mxnet.git 14 | ``` 15 | **3、安装CUDA** 16 | ``` 17 | 具体参见http://blog.csdn.net/xizero00/article/details/43227019 18 | ``` 19 | **4、编译支持GPU的MXnet** 20 | 21 | 将mxnet/目录里找到mxnet/make/子目录,把该目录下的config.mk复制到mxnet/目录,用文本编辑器打开,找到并修改以下两行: 22 | ``` 23 | USE_CUDA = 1 24 | 25 | USE_CUDA_PATH = /usr/local/cuda 26 | ``` 27 | 修改之后,在mxnet/目录下编译 28 | ``` 29 | make -j4 30 | ``` 31 | **5、安装Python支持** 32 | ``` 33 | cd python; 34 | 35 | python setup.py install 36 | ``` 37 | 有些时候需要安装setuptools和numpy(sudo apt-get install python-numpy)。 38 | **6、运行Mnist手写体识别实例** 39 | 在mxnet/example/image-classification里可以找到MXnet自带MNIST的识别样例 40 | ``` 41 | cd mxnet/example/image-classification 42 | 43 | python train_mnist.py 44 | ``` -------------------------------------------------------------------------------- /PaddlePaddle/README.md: -------------------------------------------------------------------------------- 1 |  2 | # PaddlePaddle图像分类demo 3 | 4 | 5 | 6 | 7 | 8 | ## 安装PaddlePaddle 9 | 10 | ``` 11 | 12 | # 下载安装包 13 | 14 | wget https://github.com/PaddlePaddle/Paddle/releases/download/V0.8.0b1/paddle-cpu-0.8.0b1-Linux.deb 15 | 16 | 17 | # 安装 18 | 19 | gdebi paddle-cpu-0.8.0b1-Linux.deb 20 | 21 | 如果 gdebi 没有安装,则需要使用 sudo apt-get install gdebi, 来安装 gdebi 。 22 | 23 | 或者使用下面一条命令安装. 
24 | 25 | dpkg -i paddle-cpu-0.8.0b1-Linux.deb 26 | 27 | apt-get install -f 28 | 在 dpkg -i 的时候如果报一些依赖未找到的错误是正常的, 在 apt-get install -f 里会继续安装 PaddlePaddle 29 | 30 | 官方教程http://www.paddlepaddle.org/doc_cn/build_and_install/install/ubuntu_install.html 31 | 32 | 33 | ``` 34 | 35 | 36 | ## 下载MNIST数据集 37 | 38 | 下载地址https://pan.baidu.com/s/1kUNBkyz 39 | 40 | 在当前目录建立data文件夹,将MNIST.rar里的train和test文件夹解压到data文件夹下 41 | 42 | 注该数据集将原版MNIST二进制文件中的图片提取出来分别放入train和test文件夹,用户可以自行添加图片到train和test文件夹下,但要修改源码中关于图像大小的参数 43 | 44 | 45 | 46 | ## 训练MNIST 47 | 48 | 49 | ``` 50 | 51 | sh preprocess.sh # 调用preprocess.py 预处理 52 | 53 | sh train.sh # 调用vgg.py训练,该脚本文件可设置训练模型存放路径和训练线程数等参数 54 | 55 | python prediction.py # 预测,注意设置其中模型路径model_path 56 | 57 | ``` 58 | 59 | 60 | 61 | ## 训练性别 62 | 63 | 64 | 训练前把train.csv,predict.csv拷贝到当前路径 65 | ``` 66 | 67 | sh train_sex.sh # 调用trainer_config_sex.py训练,注意设置num_passes训练次数,训练三十次错误率能降到30%左右 68 | 69 | sh predict_sex.sh # 调用trainer_config_sex.py预测,注意设置模型路径model_path 70 | 71 | ``` 72 | 73 | 74 | 75 | ## 训练年龄 76 | 77 | 78 | 训练前把train.csv,predict.csv拷贝到当前路径 79 | ``` 80 | 81 | sh train_age.sh # 调用trainer_config_age.py训练,注意设置num_passes训练次数,如果以5分段预测,训练100次错误率在85%左右,不分段错误率在95%左右 82 | 83 | sh predict_age.sh # 调用trainer_config_age.py预测,注意设置模型路径model_path 84 | 85 | ``` 86 | 87 | 88 | 89 | ## preprocess.py 90 | 91 | 92 | 预处理模块,将data文件夹下的图片转换为PaddlePaddle格式 93 | 94 | 转换后的数据存放在data/batches文件夹下 95 | 96 | 97 | 98 | ## vgg.py 99 | 100 | 101 | 训练模块,使用VGG网络训练,该网络在ILSVRC2014的图像分类项目上获第二名 102 | 103 | 训练后的模型存放在vgg_model/pass-n文件夹下,n表示第几次训练,每训练一次会生成一个模型文件夹,理论上训练次数越多的模型效果越好 104 | 105 | 注使用CPU训练速度很慢,平均训练一次需要近半小时,目前PaddlePaddle使用CPU训练出来的模型和GPU训练出来的模型不一样,所以用CPU训练只能用CPU预测,用GPU训练只能用GPU预测,而且用GPU预测要安装GPU版的PaddlePaddle和CUDA,cudnn,并且需要NVIDIA显卡支持,所以这里用的是CPU版的 106 | 107 | 108 | 109 | ## prediction.py 110 | 111 | 112 | 预测模块,其中image参数为要识别的图像路径 113 | 114 | 115 | 116 | ## dataprovider.py 117 | 118 | 119 | 实现向PaddlePaddle提供数据的接口,详见dataprovider.py注释 120 | 121 | 122 | 123 | ## trainer_config_sex.py 124 | 125 | 126 | 性别训练网络配置 127 | 128 | 129 | 130 | ## trainer_config_age.py 131 | 132 | 133 | 年龄训练网络配置 134 | 135 | 136 | 137 | ## predict_age.sh & predict_sex.sh 138 | 139 | 140 | 预测脚本文件,预测的结果保存在当前路径下的result.txt文件,第一个数为预测的结果,后面的数是每个标签的概率 141 | 142 | 143 | 144 | ## prediction_age.py & prediction_sex.py 145 | 146 | 147 | 提供预测接口,也可单独执行,接口输入为一个形如[[[0,1,2,...]]]的list,输出为性别或年龄的标签 148 | 149 | 150 | 151 | ## train.list & test.list 152 | 153 | 154 | 训练文件和测试文件的列表 155 | 156 | 157 | 158 | ## __init__.py 159 | 160 | 161 | 使A2的文件能导入本文件夹下的模块 -------------------------------------------------------------------------------- /PaddlePaddle/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /PaddlePaddle/dataprovider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import io 3 | import random 4 | import paddle.utils.image_util as image_util 5 | from paddle.trainer.PyDataProvider2 import * 6 | import csv 7 | 8 | @provider(input_types=[ 9 | #训练数据大小 10 | dense_vector(26), 11 | #标签种类 12 | integer_value(2) 13 | ]) 14 | #提供性别训练数据的函数 15 | def process_sex(settings, file_name): 16 | csvfile = file('train.csv', 'rb') 17 | reader = csv.reader(csvfile) 18 | for row in reader: 19 | if row[0]!='id': 20 | sex=0 21 | if(row[1]=='\xc4\xd0'): 22 | sex=1 23 | del row[0] 24 | del row[0] 25 | del row[0] 26 | pixels = [] 27 | for j in row: 28 | if(j!=''): 29 | 
pixels.append(float(j)) 30 | if(len(pixels)==26): 31 | yield pixels,int(sex) 32 | csvfile.close() 33 | 34 | def predict_initializer(settings, **kwargs): 35 | settings.input_types=[ 36 | dense_vector(26) 37 | ] 38 | #提供性别预测数据的函数 39 | @provider(init_hook=predict_initializer, should_shuffle=False) 40 | def process_predict_sex(settings, file_name): 41 | csvfile = file('predict.csv', 'rb') 42 | reader = csv.reader(csvfile) 43 | rows= [row for row in reader] 44 | #预测第一行 45 | row=rows[1] 46 | sex='女' 47 | if(row[1]=='\xc4\xd0'): 48 | sex='男' 49 | print '实际性别:'+sex 50 | del row[0] 51 | del row[0] 52 | del row[0] 53 | pixels = [] 54 | for j in row: 55 | pixels.append(float(j)) 56 | if(len(pixels)==26): 57 | yield pixels 58 | 59 | @provider(input_types=[ 60 | dense_vector(26), 61 | integer_value(100) 62 | ]) 63 | #提供年龄训练数据的函数 64 | def process_age(settings, file_name): 65 | csvfile = file('train.csv', 'rb') 66 | reader = csv.reader(csvfile) 67 | for row in reader: 68 | if row[0]!='id': 69 | age=int(row[2]) 70 | del row[0] 71 | del row[0] 72 | del row[0] 73 | pixels = [] 74 | for j in row: 75 | if(j!=''): 76 | pixels.append(float(j)) 77 | if(len(pixels)==26): 78 | yield pixels,age 79 | csvfile.close() 80 | 81 | def predict_initializer(settings, **kwargs): 82 | settings.input_types=[ 83 | dense_vector(26) 84 | ] 85 | #提供年龄预测数据的函数 86 | @provider(init_hook=predict_initializer, should_shuffle=False) 87 | def process_predict_age(settings, file_name): 88 | csvfile = file('predict.csv', 'rb') 89 | reader = csv.reader(csvfile) 90 | rows= [row for row in reader] 91 | row=rows[1] 92 | print '实际年龄:'+row[2] 93 | del row[0] 94 | del row[0] 95 | del row[0] 96 | pixels = [] 97 | for j in row: 98 | if(j!=''): 99 | pixels.append(float(j)) 100 | if(len(pixels)==26): 101 | yield pixels 102 | csvfile.close() 103 | 104 | def hook(settings, img_size, mean_img_size, num_classes, color, meta, use_jpeg, 105 | is_train, **kwargs): 106 | settings.mean_img_size = mean_img_size 107 | settings.img_size = img_size 108 | settings.num_classes = num_classes 109 | settings.color = color 110 | settings.is_train = is_train 111 | 112 | if settings.color: 113 | settings.img_raw_size = settings.img_size * settings.img_size * 3 114 | else: 115 | settings.img_raw_size = settings.img_size * settings.img_size 116 | 117 | settings.meta_path = meta 118 | settings.use_jpeg = use_jpeg 119 | 120 | settings.img_mean = image_util.load_meta(settings.meta_path, 121 | settings.mean_img_size, 122 | settings.img_size, 123 | settings.color) 124 | 125 | settings.logger.info('Image size: %s', settings.img_size) 126 | settings.logger.info('Meta path: %s', settings.meta_path) 127 | ''' 128 | PaddlePaddle的数据包括四种主要类型,和三种序列模式。其中,四种数据类型是 129 | 130 | dense_vector 表示稠密的浮点数向量。 131 | sparse_binary_vector 表示稀疏的零一向量,即大部分值为0,有值的位置只能取1 132 | sparse_float_vector 表示稀疏的向量,即大部分值为0,有值的部分可以是任何浮点数 133 | integer 表示整数标签。 134 | 而三种序列模式为 135 | 136 | SequenceType.NO_SEQUENCE 即不是一条序列 137 | SequenceType.SEQUENCE 即是一条时间序列 138 | SequenceType.SUB_SEQUENCE 即是一条时间序列,且序列的每一个元素还是一个时间序列。 139 | ''' 140 | settings.input_types = [ 141 | dense_vector(settings.img_raw_size), # image feature 142 | integer_value(settings.num_classes)] # labels 143 | 144 | settings.logger.info('DataProvider Initialization finished') 145 | ''' 146 | @provider 是一个Python的 Decorator ,他可以将某一个函数标记成一个PyDataProvider 147 | PyDataProvider是PaddlePaddle使用Python提供数据的推荐接口。使用该接口用户可以只关注如何从文件中读取每一条数据,而不用关心数据如何传输给PaddlePaddle,数据如何存储等等。该数据接口使用多线程读取数据,并提供了简单的Cache功能 148 | init_hook可以传入一个函数。这个函数在初始化的时候会被调用。这个函数的参数是: 149 | 150 | 第一个参数是 
settings 对象。这个对象和process的第一个参数一致。具有的属性有 151 | settings.input_types 设置输入类型。参考 input_types 152 | settings.logger 一个logging对象 153 | 其他参数都使用key word argument传入。这些参数包括paddle定义的参数,和用户传入的参数。 154 | Paddle定义的参数包括: 155 | is_train bool参数,表示这个DataProvider是训练用的DataProvider或者测试用的 DataProvider 156 | file_list 所有文件列表。 157 | 用户定义的参数使用args在训练配置中设置。 158 | 159 | 注意,PaddlePaddle保留添加参数的权力,所以init_hook尽量使用 **kwargs , 来接受不使用的 函数来保证兼容性。 160 | 详见http://www.paddlepaddle.org/doc_cn/ui/data_provider/pydataprovider2.html 161 | ''' 162 | @provider(init_hook=hook) 163 | def processData(settings, file_name): 164 | """ 165 | 加载数据 166 | 迭代每一批的所有图像和标签 167 | file_name: 批文件名 168 | """ 169 | #使用pickle类来进行python对象的序列化,而cPickle提供了一个更快速简单的接口,如python文档所说的:“cPickle -- A faster pickle” 170 | data = cPickle.load(io.open(file_name, 'rb')) 171 | #list() 方法用于将元组转换为列表,元组与列表的区别在于元组的元素值不能修改,元组是放在括号中,列表是放于方括号中。 172 | indexes = list(range(len(data['images']))) 173 | if settings.is_train: 174 | random.shuffle(indexes) 175 | for i in indexes: 176 | if settings.use_jpeg == 1: 177 | img = image_util.decode_jpeg(data['images'][i]) 178 | else: 179 | img = data['images'][i] 180 | #如果不是训练, 裁剪图像中心区域.否则随机裁剪, 181 | img_feat = image_util.preprocess_img(img, settings.img_mean, 182 | settings.img_size, settings.is_train, 183 | settings.color) 184 | label = data['labels'][i] 185 | ''' 186 | 包含yield语句的函数会被特地编译成生成器。当函数被调用时,他们返回一个生成器对象 187 | 不像一般函数生成值后退出,生成器函数生成值后会自动挂起并暂停他们的执行和状态,他的本地变量将保存状态信息,这些信息在函数恢复时将再度有效 188 | 执行到 yield时,processData 函数就返回一个迭代值,下次迭代时,代码从 yield的下一条语句继续执行 189 | ''' 190 | yield img_feat.tolist(), int(label) 191 | -------------------------------------------------------------------------------- /PaddlePaddle/predict_age.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | paddle train \ 5 | --config=trainer_config_age.py \ 6 | --use_gpu=1 \ 7 | --job=test \ 8 | --init_model_path="output_age/pass-00099" \ 9 | --config_args=is_predict=1 \ 10 | --predict_output_dir=. \ 11 | 12 | mv rank-00000 result.txt 13 | -------------------------------------------------------------------------------- /PaddlePaddle/predict_sex.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | paddle train \ 5 | --config=trainer_config_sex.py \ 6 | --use_gpu=1 \ 7 | --job=test \ 8 | --init_model_path="output_sex/pass-00029" \ 9 | --config_args=is_predict=1 \ 10 | --predict_output_dir=. 
\ 11 | 12 | mv rank-00000 result.txt 13 | -------------------------------------------------------------------------------- /PaddlePaddle/prediction.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import os,sys 3 | import numpy as np 4 | import logging 5 | from PIL import Image 6 | from optparse import OptionParser 7 | 8 | import paddle.utils.image_util as image_util 9 | 10 | from py_paddle import swig_paddle, DataProviderConverter 11 | from paddle.trainer.PyDataProvider2 import dense_vector 12 | from paddle.trainer.config_parser import parse_config 13 | 14 | logging.basicConfig(format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s') 15 | logging.getLogger().setLevel(logging.INFO) 16 | 17 | class ImageClassifier(): 18 | def __init__(self, 19 | train_conf, 20 | use_gpu=True, 21 | model_dir=None, 22 | resize_dim=None, 23 | crop_dim=None, 24 | mean_file=None, 25 | oversample=False, 26 | is_color=False): 27 | """ 28 | train_conf: 网络配置文件 29 | model_dir: 模型路径 30 | resize_dim: 设为原图大小 31 | crop_dim: 图像裁剪大小,一般设为原图大小 32 | oversample: bool, oversample表示多次裁剪,这里禁用 33 | """ 34 | self.train_conf = train_conf 35 | self.model_dir = model_dir 36 | if model_dir is None: 37 | self.model_dir = os.path.dirname(train_conf) 38 | 39 | self.resize_dim = resize_dim 40 | self.crop_dims = [crop_dim, crop_dim] 41 | self.oversample = oversample 42 | self.is_color = is_color 43 | 44 | self.transformer = image_util.ImageTransformer(is_color = is_color) 45 | self.transformer.set_transpose((2,0,1)) 46 | 47 | self.mean_file = mean_file 48 | mean = np.load(self.mean_file)['data_mean'] 49 | mean = mean.reshape(1, self.crop_dims[0], self.crop_dims[1]) 50 | self.transformer.set_mean(mean) # mean pixel 51 | gpu = 1 if use_gpu else 0 52 | conf_args = "is_test=1,use_gpu=%d,is_predict=1" % (gpu) 53 | #使用 parse_config() 解析训练时的配置文件 54 | conf = parse_config(train_conf, conf_args) 55 | #PaddlePaddle目前使用Swig对其常用的预测接口进行了封装,使在Python环境下的预测接口更加简单 56 | #使用 swig_paddle.initPaddle() 传入命令行参数初始化 PaddlePaddle 57 | swig_paddle.initPaddle("--use_gpu=%d" % (int(use_gpu))) 58 | #使用 swig_paddle.GradientMachine.createFromConfigproto() 根据上一步解析好的配置创建神经网络 59 | self.network = swig_paddle.GradientMachine.createFromConfigProto(conf.model_config) 60 | assert isinstance(self.network, swig_paddle.GradientMachine) 61 | #从模型文件加载参数 62 | self.network.loadParameters(self.model_dir) 63 | 64 | data_size = 1 * self.crop_dims[0] * self.crop_dims[1] 65 | slots = [dense_vector(data_size)] 66 | ''' 67 | 创建一个 DataProviderConverter 对象converter。 68 | swig_paddle接受的原始数据是C++的Matrix,也就是直接写内存的float数组。 这个接口并不用户友好。所以,我们提供了一个工具类DataProviderConverter。 这个工具类接收和PyDataProvider2一样的输入数据 69 | ''' 70 | self.converter = DataProviderConverter(slots) 71 | 72 | def get_data(self, img_path): 73 | """ 74 | 1. 读取图片. 75 | 2. resize 或 oversampling. 76 | 3. transformer data: transpose, sub mean. 77 | return K x H x W ndarray. 
78 | """ 79 | image = image_util.load_image(img_path, self.is_color) 80 | if self.oversample: 81 | # image_util.resize_image: short side is self.resize_dim 82 | image = image_util.resize_image(image, self.resize_dim) 83 | image = np.array(image) 84 | input = np.zeros((1, image.shape[0], image.shape[1],1), 85 | dtype=np.float32) 86 | if self.is_color: 87 | input[0] = image.astype(np.float32) 88 | else: 89 | for i in range(0,self.resize_dim): 90 | for j in range(0,self.resize_dim): 91 | input[0][i][j][0]=image[i][j] 92 | input = image_util.oversample(input, self.crop_dims) 93 | else: 94 | image = image.resize(self.crop_dims, Image.ANTIALIAS) 95 | image = np.array(image) 96 | input = np.zeros((1, self.crop_dims[0], self.crop_dims[1],1), 97 | dtype=np.float32) 98 | if self.is_color: 99 | input[0] = image.astype(np.float32) 100 | else: 101 | for i in range(0,self.resize_dim): 102 | for j in range(0,self.resize_dim): 103 | input[0][i][j][0]=image[i][j] 104 | 105 | data_in = [] 106 | for img in input: 107 | img = self.transformer.transformer(img).flatten() 108 | data_in.append([img.tolist()]) 109 | return data_in 110 | 111 | def forward(self, input_data): 112 | in_arg = self.converter(input_data) 113 | return self.network.forwardTest(in_arg) 114 | 115 | def forward(self, data, output_layer): 116 | #返回每种标签的概率,详见http://www.paddlepaddle.org/doc_cn/ui/predict/swig_py_paddle.html 117 | input = self.converter(data) 118 | self.network.forwardTest(input) 119 | output = self.network.getLayerOutputs(output_layer) 120 | return output[output_layer].mean(0) 121 | 122 | def predict(self, image=None, output_layer=None): 123 | assert isinstance(image, basestring) 124 | assert isinstance(output_layer, basestring) 125 | data = self.get_data(image)#读取图片 126 | prob = self.forward(data, output_layer) 127 | lab = np.argsort(-prob)#按降序排列,返回的是数组值的索引值 128 | logging.info("Label of %s is: %d", image, lab[0]) 129 | 130 | if __name__ == '__main__': 131 | image_size=28#图像大小 132 | crop_size=28#图像大小 133 | multi_crop=0#多次裁剪 134 | config="vgg.py"#网络配置文件 135 | output_layer="__fc_layer_1__" 136 | mean_path="data/batches/batches.meta" 137 | model_path="vgg_model/pass-00000/"#模型路径 138 | image="test.bmp"#要识别的图片路径 139 | use_gpu=0#是否使用GPU 140 | 141 | obj = ImageClassifier(train_conf=config, 142 | model_dir=model_path, 143 | resize_dim=image_size, 144 | crop_dim=crop_size, 145 | mean_file=mean_path, 146 | use_gpu=use_gpu, 147 | oversample=multi_crop) 148 | obj.predict(image, output_layer) 149 | -------------------------------------------------------------------------------- /PaddlePaddle/prediction_age.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | from py_paddle import swig_paddle, DataProviderConverter 3 | from paddle.trainer.PyDataProvider2 import dense_vector 4 | from paddle.trainer.config_parser import parse_config 5 | import numpy as np 6 | import csv 7 | import os 8 | 9 | def predict(data): 10 | path=os.path.split(os.path.realpath(__file__))[0] 11 | conf = parse_config(path+"/trainer_config_age.py", "is_predict=1") 12 | print conf.data_config.load_data_args 13 | network = swig_paddle.GradientMachine.createFromConfigProto(conf.model_config) 14 | network.loadParameters(path+"/output_age/pass-00099") 15 | converter = DataProviderConverter([dense_vector(26)]) 16 | inArg = converter(data) 17 | network.forwardTest(inArg) 18 | output = network.getLayerOutputs("__fc_layer_0__") 19 | #print output 20 | prob = output["__fc_layer_0__"][0] 21 | #print prob 22 | lab = 
np.argsort(-prob) 23 | #print lab 24 | return lab[0] 25 | 26 | if __name__ == '__main__': 27 | swig_paddle.initPaddle("--use_gpu=0") 28 | csvfile = file('predict.csv', 'rb') 29 | reader = csv.reader(csvfile) 30 | rows= [row for row in reader] 31 | row=rows[1] 32 | print '实际年龄:'+row[2] 33 | del row[0] 34 | del row[0] 35 | del row[0] 36 | data = [[[]]] 37 | for j in row: 38 | data[0][0].append(float(j)) 39 | csvfile.close() 40 | print '预测年龄:'+str(predict(data)) 41 | -------------------------------------------------------------------------------- /PaddlePaddle/prediction_sex.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | from py_paddle import swig_paddle, DataProviderConverter 3 | from paddle.trainer.PyDataProvider2 import dense_vector 4 | from paddle.trainer.config_parser import parse_config 5 | import numpy as np 6 | import csv 7 | import os 8 | 9 | def predict(data): 10 | path=os.path.split(os.path.realpath(__file__))[0] 11 | conf = parse_config(path+"/trainer_config_sex.py", "is_predict=1") 12 | print conf.data_config.load_data_args 13 | network = swig_paddle.GradientMachine.createFromConfigProto(conf.model_config) 14 | network.loadParameters(path+"/output_sex/pass-00029") 15 | converter = DataProviderConverter([dense_vector(26)]) 16 | inArg = converter(data) 17 | network.forwardTest(inArg) 18 | output = network.getLayerOutputs("__fc_layer_0__") 19 | prob = output["__fc_layer_0__"][0] 20 | lab = np.argsort(-prob) 21 | return lab[0] 22 | 23 | if __name__ == '__main__': 24 | swig_paddle.initPaddle("--use_gpu=0") 25 | csvfile = file('predict.csv', 'rb') 26 | reader = csv.reader(csvfile) 27 | rows= [row for row in reader] 28 | #预测第一行 29 | row=rows[1] 30 | sex='女' 31 | if(row[1]=='\xc4\xd0'): 32 | sex='男' 33 | print '实际性别:'+sex 34 | del row[0] 35 | del row[0] 36 | del row[0] 37 | data = [[[]]] 38 | for j in row: 39 | data[0][0].append(float(j)) 40 | csvfile.close() 41 | if(predict(data)==1): 42 | print '预测性别:男' 43 | else: 44 | print '预测性别:女' 45 | -------------------------------------------------------------------------------- /PaddlePaddle/preprocess.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | from paddle.utils.preprocess_img import ImageClassificationDatasetCreater 3 | from optparse import OptionParser 4 | 5 | #处理命令行参数 6 | def option_parser(): 7 | parser = OptionParser(usage="usage: python preprcoess.py "\ 8 | "-i data_dir [options]") 9 | parser.add_option("-i", "--input", action="store", 10 | dest="input", help="图片路径") 11 | parser.add_option("-s", "--size", action="store", 12 | dest="size", help="图片大小") 13 | parser.add_option("-c", "--color", action="store", 14 | dest="color", help="图片有没有颜色") 15 | return parser.parse_args() 16 | 17 | if __name__ == '__main__': 18 | options, args = option_parser() 19 | data_dir = options.input 20 | processed_image_size = int(options.size) 21 | color = options.color == "1" 22 | data_creator = ImageClassificationDatasetCreater(data_dir, 23 | processed_image_size, 24 | color) 25 | #每个训练文件包含的图片数 26 | data_creator.num_per_batch = 1000 27 | data_creator.overwrite = True 28 | data_creator.create_batches() 29 | -------------------------------------------------------------------------------- /PaddlePaddle/preprocess.sh: -------------------------------------------------------------------------------- 1 | 2 | set -e 3 | data_dir=./data 4 | python preprocess.py -i $data_dir -s 28 -c 0 5 | 6 | #-i后为训练数据存放路径,-s后为图像大小,-c后为图像有没有颜色 7 | 
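The two prediction scripts above (prediction_age.py and prediction_sex.py) both feed the network through DataProviderConverter([dense_vector(26)]), which expects a nested batch -> sample -> slot structure; that is why they build data = [[[]]]. Below is a minimal, PaddlePaddle-free sketch of just that conversion step. It is illustrative only and assumes the column layout used throughout this project: id, sex, age followed by 26 blood-test values.

```
# -*- coding: UTF-8 -*-
# Illustrative sketch (not a repository file): build the nested list that
# DataProviderConverter([dense_vector(26)]) accepts (batch -> sample -> slot).
# Assumed column layout: id, sex, age, then 26 numeric features.
from __future__ import print_function
import csv

def row_to_input(row):
    # drop id / sex / age and skip empty cells, as process_predict_age does
    features = [float(v) for v in row[3:] if v != '']
    assert len(features) == 26, 'expected 26 features, got %d' % len(features)
    return [[features]]          # one batch, one sample, one dense_vector slot

if __name__ == '__main__':
    with open('predict.csv', 'rb') as f:
        rows = list(csv.reader(f))
    data = row_to_input(rows[1])                # rows[0] is the CSV header
    print('slot length: %d' % len(data[0][0]))  # 26, matching dense_vector(26)
```

The resulting data can be passed straight to converter(data) and network.forwardTest(), exactly as the two scripts above do.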
-------------------------------------------------------------------------------- /PaddlePaddle/test.bmp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csxiaoyaojianxian/BloodTestReportOCR/95d058e4999806fa50bbcf6d10fe8a0af5746759/PaddlePaddle/test.bmp -------------------------------------------------------------------------------- /PaddlePaddle/test.list: -------------------------------------------------------------------------------- 1 | predict.csv 2 | -------------------------------------------------------------------------------- /PaddlePaddle/train.list: -------------------------------------------------------------------------------- 1 | train.csv 2 | -------------------------------------------------------------------------------- /PaddlePaddle/train.sh: -------------------------------------------------------------------------------- 1 | 2 | set -e 3 | config=vgg.py 4 | output=./vgg_model 5 | log=train.log 6 | 7 | paddle train \ 8 | --config=$config \ 9 | --use_gpu=0 \ 10 | --trainer_count=8 \ 11 | --num_passes=10 \ 12 | --save_dir=$output \ 13 | 2>&1 | tee $log 14 | 15 | python -m paddle.utils.plotcurve -i $log > plot.png 16 | 17 | :<<' 18 | use_gpu是否使用GPU训练 19 | trainer_count训练线程数,使用CPU时建议设为CPU的线程数,使用GPU时设为GPU的数目 20 | num_passes训练次数,每训练一次会生成一个模型文件夹 21 | output模型存放路径 22 | ' 23 | 24 | -------------------------------------------------------------------------------- /PaddlePaddle/train_age.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | paddle train \ 5 | --config=trainer_config_age.py \ 6 | --save_dir=./output_age \ 7 | --trainer_count=1 \ 8 | --num_passes=100 \ 9 | --use_gpu=1 \ 10 | 2>&1 | tee 'train_age.log' 11 | -------------------------------------------------------------------------------- /PaddlePaddle/train_sex.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | paddle train \ 5 | --config=trainer_config_sex.py \ 6 | --save_dir=./output_sex \ 7 | --trainer_count=1 \ 8 | --num_passes=30 \ 9 | --use_gpu=1 \ 10 | 2>&1 | tee 'train_sex.log' 11 | -------------------------------------------------------------------------------- /PaddlePaddle/trainer_config_age.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | from paddle.trainer_config_helpers import * 3 | import csv 4 | 5 | is_predict = get_config_arg('is_predict', bool, False) 6 | define_py_data_sources2( 7 | #训练文件列表 8 | train_list='train.list' if not is_predict else None, 9 | #测试文件列表 10 | test_list='test.list', 11 | #指明提供数据的函数 12 | module="dataprovider", 13 | obj='process_age' if not is_predict else 'process_predict_age') 14 | 15 | settings( 16 | #批尺寸 17 | batch_size=128 if not is_predict else 1, 18 | #学习速率 19 | learning_rate=2e-3, 20 | #学习方式 21 | learning_method=AdamOptimizer(), 22 | #权重衰减 23 | regularization=L2Regularization(8e-4)) 24 | #输入数据大小 25 | data = data_layer(name="data", size=26) 26 | #直接全连接,指明输出数据大小,激活函数是Softmax 27 | output = fc_layer(name="__fc_layer_0__",input=data, size=100, act=SoftmaxActivation()) 28 | if is_predict: 29 | #获得最大概率的标签 30 | maxid = maxid_layer(output) 31 | outputs([maxid, output]) 32 | else: 33 | #标签大小 34 | label = data_layer(name="label", size=100) 35 | #计算误差 36 | cls = classification_cost(input=output, label=label) 37 | outputs(cls) 38 | -------------------------------------------------------------------------------- 
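trainer_config_age.py above is a single 26 -> 100 fully connected layer with a Softmax activation, and maxid_layer simply picks the most probable of the 100 age classes. As a reading aid, here is a numpy-only sketch of the same forward computation. W and b are placeholders for the parameters Paddle learns into output_age/, so the number printed is meaningless; only the shape of the computation matters.

```
# Illustrative numpy sketch of what fc_layer(..., act=SoftmaxActivation())
# followed by maxid_layer computes. W (26x100) and b (100,) stand in for the
# trained parameters and are random here.
from __future__ import print_function
import numpy as np

def softmax(z):
    z = z - z.max()               # subtract the max for numerical stability
    e = np.exp(z)
    return e / e.sum()

def predict_age(features, W, b):
    prob = softmax(np.dot(features, W) + b)   # fc_layer + SoftmaxActivation
    return int(np.argmax(prob))               # maxid_layer

rng = np.random.RandomState(0)
print(predict_age(rng.rand(26), rng.rand(26, 100), rng.rand(100)))
```

trainer_config_sex.py, listed next, is identical apart from the output size (2 classes instead of 100).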
/PaddlePaddle/trainer_config_sex.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | from paddle.trainer_config_helpers import * 3 | import csv 4 | 5 | is_predict = get_config_arg('is_predict', bool, False) 6 | define_py_data_sources2( 7 | #训练文件列表 8 | train_list='train.list' if not is_predict else None, 9 | #测试文件列表 10 | test_list='test.list', 11 | #指明提供数据的函数 12 | module="dataprovider", 13 | obj='process_sex' if not is_predict else 'process_predict_sex') 14 | 15 | settings( 16 | #批尺寸 17 | batch_size=128 if not is_predict else 1, 18 | #学习速率 19 | learning_rate=2e-3, 20 | #学习方式 21 | learning_method=AdamOptimizer(), 22 | #权重衰减 23 | regularization=L2Regularization(8e-4)) 24 | #输入数据大小 25 | data = data_layer(name="data", size=26) 26 | #直接全连接,指明输出数据大小,激活函数是Softmax 27 | output = fc_layer(name="__fc_layer_0__",input=data, size=2, act=SoftmaxActivation()) 28 | if is_predict: 29 | #找出最大概率的标签 30 | maxid = maxid_layer(output) 31 | outputs([maxid, output]) 32 | else: 33 | #标签大小 34 | label = data_layer(name="label", size=2) 35 | #计算误差 36 | cls = classification_cost(input=output, label=label) 37 | outputs(cls) 38 | -------------------------------------------------------------------------------- /PaddlePaddle/vgg.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | from paddle.trainer_config_helpers import * 3 | 4 | is_predict = get_config_arg("is_predict", bool, False) 5 | 6 | ####################Data Configuration ################## 7 | if not is_predict: 8 | data_dir='data/batches/' 9 | meta_path=data_dir+'batches.meta' 10 | 11 | ''' 12 | mean_img_size,img_size图像大小 13 | num_classes分类类别数 14 | color图像有无颜色 15 | ''' 16 | args = {'meta':meta_path,'mean_img_size': 28, 17 | 'img_size': 28,'num_classes': 10, 18 | 'use_jpeg': 1,'color': 0} 19 | 20 | #引用image_provider.py中的processData函数 21 | define_py_data_sources2(train_list=data_dir+"train.list", 22 | test_list=data_dir+'test.list', 23 | module='dataprovider', 24 | obj='processData', 25 | args=args) 26 | 27 | ######################Algorithm Configuration ############# 28 | settings( 29 | #批尺寸,一次训练多少数据 30 | batch_size = 128, 31 | #学习速率 32 | learning_rate = 0.1 / 128.0, 33 | #学习方式,详见http://www.paddlepaddle.org/doc/ui/api/trainer_config_helpers/optimizers.html 34 | learning_method = MomentumOptimizer(0.9), 35 | #权重衰减,防过拟合 36 | regularization = L2Regularization(0.0005 * 128) 37 | ) 38 | 39 | #######################Network Configuration ############# 40 | #图片大小,通道数×长×宽 41 | data_size=1*28*28 42 | #分类数量 43 | label_size=10 44 | #关于layer,详见http://www.paddlepaddle.org/doc/ui/api/trainer_config_helpers/layers.html 45 | img = data_layer(name='image', 46 | size=data_size) 47 | #small_vgg在trainer_config_helpers.network预定义 48 | #关于网络详见http://www.paddlepaddle.org/doc/ui/api/trainer_config_helpers/networks.html 49 | predict = small_vgg(input_image=img, 50 | num_channels=1,#图像通道数,灰度图像为1 51 | num_classes=label_size) 52 | 53 | if not is_predict: 54 | lbl = data_layer(name="label", size=label_size) 55 | outputs(classification_cost(input=predict, label=lbl)) 56 | else: 57 | #预测网络直接输出最后一层的结果而不是像训练时以cost layer作为输出 58 | outputs(predict) 59 | 60 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 对血常规检验报告的OCR识别、深度学习与分析 2 | 3 | * 将血常规检验报告的图片识别出年龄、性别及血常规检验的各项数据 4 | * 图片上传页面,提交的结果是图片存储到了mongodb数据库得到一个OID或到指定目录到一个path 5 | * 
图片识别得到一个json数据存储到了mongodb数据库得到一个OID,[json数据](https://coding.net/u/mengning/p/np2016/git/blob/master/BloodTestReportOCR/bloodtestdata.json) 6 | * 自动截取目标区域,已经能不同旋转角度的图片自动准备截取目标区域,但对倾斜透视的图片处理效果不佳,[具体用法](https://coding.net/u/mengning/p/np2016/git/blob/master/BloodTestReportOCR/README.md) 7 | * 预处理,比如增加对比度、锐化 8 | * 识别 9 | 10 | * 识别结果页面,上部是原始图片,下部是一个显示识别数据的表格,以便对照识别结果 11 | * 学习血常规检验的各项数据及对应的年龄性别 12 | * 根据血常规检验的各项数据预测年龄和性别 13 | 14 | ## Links 15 | 16 | * [我的博客](http://www.csxiaoyao.com/blog/2017/01/01/ustc-np2016%E8%AF%BE%E7%A8%8B%E5%AD%A6%E4%B9%A0%E6%80%BB%E7%BB%93/) 17 | -------------------------------------------------------------------------------- /Spark/BloodTestReportDeepLearning/BTR_binary_classification.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | #基于spark血常规检验报告深度学习 3 | #by Islotus 4 | #2016.12.15 5 | 6 | from __future__ import print_function 7 | 8 | import sys 9 | import math 10 | 11 | from pyspark.sql import SparkSession 12 | from pyspark.mllib.classification import LogisticRegressionWithLBFGS 13 | from pyspark.mllib.evaluation import BinaryClassificationMetrics 14 | from pyspark.mllib.regression import LabeledPoint 15 | 16 | from pyspark import SparkContext 17 | #from pyspark.mllib.classification import SVMWithSGD, SVMModel 18 | from pyspark.mllib.util import MLUtils 19 | 20 | if __name__ == "__main__": 21 | 22 | sc = SparkContext(appName="BloodTestReportPythonBinaryClassificationMerticsExample") 23 | 24 | #读取数据 25 | print('Begin Load Data File!') 26 | sexData = MLUtils.loadLabeledPoints(sc, "LabeledPointsdata_sex.txt") 27 | print('Data File has been Loaded!') 28 | 29 | accuracySex = [] 30 | 31 | for i in range(0,100): 32 | #将数据随机分隔为9:1, 分别作为训练数据(training)和预测数据(test) 33 | sexTraining, sexTest = sexData.randomSplit([0.9, 0.1]) 34 | 35 | #训练二分类模型 36 | sexModel = LogisticRegressionWithLBFGS.train(sexTraining) 37 | 38 | #对test数据进行预测,输出预测准确度 39 | sexPredictionAndLabels = sexTest.map(lambda lp: (float(sexModel.predict(lp.features)), lp.label)) 40 | accuracySex.append(1.0 * sexPredictionAndLabels.filter(lambda (x, v): x == v).count() / sexTest.count()) 41 | 42 | #AVG:平均数 MSE:均方差 43 | SexRDD = sc.parallelize(accuracySex) 44 | SexPAAVG = SexRDD.reduce(lambda x,y:x+y)/SexRDD.count() 45 | SexPAMSE = math.sqrt(SexRDD.map(lambda x:(x - SexPAAVG)*(x - SexPAAVG)).reduce(lambda x,y:x+y)/SexRDD.count()) 46 | 47 | print('Sex Prediction Accuracy AVG:{}'.format(SexPAAVG)) 48 | print('Sex Prediction Accuracy MSE:{}'.format(SexPAMSE)) 49 | 50 | output = open('BinaryClassificationMetricsResult.txt', 'w') 51 | output.write('Sex Prediction Accuracy AVG is:' + str(SexPAAVG) + "\n") 52 | output.write('Sex Prediction Accuracy MSE is:' + str(SexPAMSE) + "\n") 53 | for i in accuracySex: 54 | output.write(str(i)+",") 55 | output.write("\n") 56 | output.close() 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /Spark/BloodTestReportDeepLearning/BTR_decision_tree.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | #基于spark血常规检验报告深度学习 3 | #by Islotus 4 | #2016.12.15 5 | 6 | from __future__ import print_function 7 | 8 | import sys 9 | import math 10 | from pyspark import SparkContext 11 | from pyspark.mllib.tree import DecisionTree 12 | from pyspark.mllib.util import MLUtils 13 | 14 | if __name__ == "__main__": 15 | 16 | sc = SparkContext(appName="BloodTestReportPythonDecisionTreeExample") 17 | 18 | #读取数据 19 | 
print('Begin Load Data File!') 20 | sexData = MLUtils.loadLabeledPoints(sc, "LabeledPointsdata_sex.txt") 21 | ageData = MLUtils.loadLabeledPoints(sc, "LabeledPointsdata_age.txt") 22 | 23 | print('Data File has been Loaded!') 24 | 25 | accuracySex = [] 26 | accuracyAge = [] 27 | for i in range(0,100): 28 | #将数据随机分割为9:1, 分别作为训练数据(training)和预测数据(test) 29 | sexTraining, sexTest = sexData.randomSplit([0.9, 0.1]) 30 | ageTraining, ageTest = ageData.randomSplit([0.9, 0.1]) 31 | 32 | #训练决策树模型 33 | sexModel = DecisionTree.trainClassifier(sexTraining, numClasses=2, categoricalFeaturesInfo={}, 34 | impurity='gini', maxDepth=5, maxBins=32) 35 | ageModel = DecisionTree.trainClassifier(ageTraining, numClasses=1000, categoricalFeaturesInfo={}, 36 | impurity='gini', maxDepth=5, maxBins=32) 37 | 38 | #对test数据进行预测,输出预测准确度 39 | sexPredictionAndLabel = sexTest.map(lambda p: p.label).zip(sexModel.predict(sexTest.map(lambda x: x.features))) 40 | agePredictionAndLabel = ageTest.map(lambda p: p.label).zip(ageModel.predict(ageTest.map(lambda x: x.features))) 41 | 42 | accuracySex.append(1.0 * sexPredictionAndLabel.filter(lambda (x, v): x == v).count() / sexTest.count()) 43 | accuracyAge.append(1.0 * agePredictionAndLabel.filter(lambda (x, v): abs((x-v)<=5)).count() / ageTest.count()) 44 | 45 | #AVG:平均数 MSE:均方差 46 | SexRDD = sc.parallelize(accuracySex) 47 | AgeRDD = sc.parallelize(accuracyAge) 48 | SexPAAVG = SexRDD.reduce(lambda x,y:x+y)/SexRDD.count() 49 | AgePAAVG = AgeRDD.reduce(lambda x,y:x+y)/AgeRDD.count() 50 | SexPAMSE = math.sqrt(SexRDD.map(lambda x:(x - SexPAAVG)*(x - SexPAAVG)).reduce(lambda x,y:x+y)/SexRDD.count()) 51 | AgePAMSE = math.sqrt(AgeRDD.map(lambda x:(x - AgePAAVG)*(x - AgePAAVG)).reduce(lambda x,y:x+y)/AgeRDD.count()) 52 | #print(sum(accuracySex) / len(accuracySex)) 53 | #print(sum(accuracyAge) / len(accuracyAge)) 54 | 55 | print('Sex Prediction Accuracy AVG:{}'.format(SexPAAVG)) 56 | print('Sex Prediction Accuracy MSE:{}'.format(SexPAMSE)) 57 | print('AGE Prediction Accuracy AVG:{}'.format(AgePAAVG)) 58 | print('AGE Prediction Accuracy MSE:{}'.format(AgePAMSE)) 59 | 60 | output = open('DecisionTreeResult.txt', 'w') 61 | output.write('Sex Prediction Accuracy AVG is:' + str(SexPAAVG) + "\n") 62 | output.write('Sex Prediction Accuracy MSE is:' + str(SexPAMSE) + "\n") 63 | for i in accuracySex: 64 | output.write(str(i)+",") 65 | output.write("\n") 66 | output.write('Age Prediction Accuracy AVG is:' + str(AgePAAVG) + "\n") 67 | output.write('Age Prediction Accuracy MSE is:' + str(AgePAMSE) + "\n") 68 | for i in accuracyAge: 69 | output.write(str(i) + ",") 70 | output.write("\n") 71 | output.close() 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | -------------------------------------------------------------------------------- /Spark/BloodTestReportDeepLearning/BTR_gradient_boosting.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | #基于Spark血常规检验报告深度学习 3 | #by Islotus 4 | #2016.12.15 5 | 6 | from __future__ import print_function 7 | 8 | import sys 9 | import math 10 | from pyspark import SparkContext 11 | 12 | from pyspark.mllib.tree import GradientBoostedTrees 13 | from pyspark.mllib.util import MLUtils 14 | 15 | if __name__ == "__main__": 16 | 17 | sc = SparkContext(appName="BloodTestReportPythonGradientBoostedTreesClassificationExample") 18 | 19 | #读取数据 20 | print('Begin Load Data File!') 21 | sexData = 
MLUtils.loadLabeledPoints(sc, "LabeledPointsdata_sex.txt") 22 | 23 | print('Data File has been Loaded!') 24 | 25 | accuracySex = [] 26 | 27 | for i in range(0,100): 28 | #将数据随机分割成9:1,分别作为训练数据(training)和预测数据(test) 29 | sexTraining, sexTest = sexData.randomSplit([0.9, 0.1]) 30 | 31 | #训练梯度增强树模型 32 | sexModel = GradientBoostedTrees.trainClassifier(sexTraining, categoricalFeaturesInfo={}, numIterations=3) 33 | 34 | #对test数据进行预测,输出预测准确度 35 | sexPredictionAndLabel = sexTest.map(lambda lp: lp.label).zip(sexModel.predict(sexTest.map(lambda x: x.features))) 36 | accuracySex.append(1.0 * sexPredictionAndLabel.filter(lambda (x, v): x == v).count() / sexTest.count()) 37 | 38 | #AVG:平均数 MSE:均方差 39 | SexRDD = sc.parallelize(accuracySex) 40 | SexPAAVG = SexRDD.reduce(lambda x,y:x+y)/SexRDD.count() 41 | SexPAMSE = math.sqrt(SexRDD.map(lambda x:(x - SexPAAVG)*(x - SexPAAVG)).reduce(lambda x,y:x+y)/SexRDD.count()) 42 | #print(sum(accuracySex) / len(accuracySex)) 43 | #print(sum(accuracyAge) / len(accuracyAge)) 44 | 45 | print('Sex Prediction Accuracy AVG:{}'.format(SexPAAVG)) 46 | print('Sex Prediction Accuracy MSE:{}'.format(SexPAMSE)) 47 | 48 | output = open('GradientBoostedTreesClassificationResult.txt', 'w') 49 | output.write('Sex Prediction Accuracy AVG is:' + str(SexPAAVG) + "\n") 50 | output.write('Sex Prediction Accuracy MSE is:' + str(SexPAMSE) + "\n") 51 | for i in accuracySex: 52 | output.write(str(i)+",") 53 | output.write("\n") 54 | 55 | output.close() 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | -------------------------------------------------------------------------------- /Spark/BloodTestReportDeepLearning/BloodTestReportbyLR.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csxiaoyaojianxian/BloodTestReportOCR/95d058e4999806fa50bbcf6d10fe8a0af5746759/Spark/BloodTestReportDeepLearning/BloodTestReportbyLR.py -------------------------------------------------------------------------------- /Spark/BloodTestReportDeepLearning/BloodTestReportbyNB.py: -------------------------------------------------------------------------------- 1 | # -*- coding: cp936 -*- 2 | #基于Spark的朴素贝叶斯血常规检验报告深度学习系统 3 | #2016.12.14 4 | 5 | from __future__ import print_function 6 | 7 | import sys 8 | import math 9 | from pyspark import SparkContext 10 | from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel 11 | from pyspark.mllib.util import MLUtils 12 | 13 | 14 | if __name__ == "__main__": 15 | 16 | sc = SparkContext(appName="BloodTestReportPythonNaiveBayesExample") 17 | 18 | # 读取数据. 19 | print('Begin Load Data File!') 20 | sexData = MLUtils.loadLabeledPoints(sc, "LabeledPointsdata_sex.txt") 21 | ageData = MLUtils.loadLabeledPoints(sc, "LabeledPointsdata_age.txt") 22 | #print(data.collect()) 23 | print('Data File has been Loaded!') 24 | #for(d in data.take(3)): 25 | # print(d) 26 | accuracySex = [] 27 | accuracyAge = [] 28 | for i in range(0,100): 29 | # 将数据随机分割为9:1,分别作为训练数据(training)和预测数据(test). 30 | sexTraining, sexTest = sexData.randomSplit([0.9, 0.1]) 31 | ageTraining, ageTest = ageData.randomSplit([0.9, 0.1]) 32 | 33 | # 训练朴素贝叶斯模型. 34 | #print('Begin NaiveBayes tranning!') 35 | sexModel = NaiveBayes.train(sexTraining, 1.0) 36 | ageModel = NaiveBayes.train(ageTraining, 1.0) 37 | #print('Trainning over!') 38 | # 对test数据进行预测,输出预测准确度. 
39 | sexPredictionAndLabel = sexTest.map(lambda p: (sexModel.predict(p.features), p.label)) 40 | agePredictionAndLabel = ageTest.map(lambda p: (ageModel.predict(p.features), p.label)) 41 | #print(predictionAndLabel.collect()) 42 | accuracySex.append(1.0 * sexPredictionAndLabel.filter(lambda (x, v): x == v).count() / sexTest.count()) 43 | accuracyAge.append(1.0 * agePredictionAndLabel.filter(lambda (x, v): abs((x-v)<=5)).count() / ageTest.count()) 44 | #AVG:平均数 MSE:均方差 45 | SexRDD = sc.parallelize(accuracySex) 46 | AgeRDD = sc.parallelize(accuracyAge) 47 | SexPAAVG = SexRDD.reduce(lambda x,y:x+y)/SexRDD.count() 48 | AgePAAVG = AgeRDD.reduce(lambda x,y:x+y)/AgeRDD.count() 49 | SexPAMSE = math.sqrt(SexRDD.map(lambda x:(x - SexPAAVG)*(x - SexPAAVG)).reduce(lambda x,y:x+y)/SexRDD.count()) 50 | AgePAMSE = math.sqrt(AgeRDD.map(lambda x:(x - AgePAAVG)*(x - AgePAAVG)).reduce(lambda x,y:x+y)/AgeRDD.count()) 51 | #print(sum(accuracySex) / len(accuracySex)) 52 | #print(sum(accuracyAge) / len(accuracyAge)) 53 | 54 | print('Sex Prediction Accuracy AVG:{}'.format(SexPAAVG)) 55 | print('Sex Prediction Accuracy MSE:{}'.format(SexPAMSE)) 56 | print('AGE Prediction Accuracy AVG:{}'.format(AgePAAVG)) 57 | print('AGE Prediction Accuracy MSE:{}'.format(AgePAMSE)) 58 | 59 | output = open('NaiveBayesResult.txt', 'w') 60 | output.write('Sex Prediction Accuracy AVG is:' + str(SexPAAVG) + "\n") 61 | output.write('Sex Prediction Accuracy MSE is:' + str(SexPAMSE) + "\n") 62 | for i in accuracySex: 63 | output.write(str(i)+",") 64 | output.write("\n") 65 | output.write('Age Prediction Accuracy AVG is:' + str(AgePAAVG) + "\n") 66 | output.write('Age Prediction Accuracy MSE is:' + str(AgePAMSE) + "\n") 67 | for i in accuracyAge: 68 | output.write(str(i) + ",") 69 | output.write("\n") 70 | output.close() 71 | 72 | -------------------------------------------------------------------------------- /Spark/BloodTestReportDeepLearning/BloodTestReportbyRF.py: -------------------------------------------------------------------------------- 1 | # -*- coding: cp936 -*- 2 | #基于Spark的随机树血常规检验报告深度学习系统 3 | #2016.12.14 4 | 5 | from __future__ import print_function 6 | 7 | import sys 8 | import math 9 | from pyspark import SparkContext 10 | from pyspark.mllib.tree import RandomForest 11 | from pyspark.mllib.util import MLUtils 12 | 13 | 14 | if __name__ == "__main__": 15 | 16 | sc = SparkContext(appName="BloodTestReportPythonRandomForestExample") 17 | 18 | # 读取数据. 19 | print('Begin Load Data File!') 20 | sexData = MLUtils.loadLabeledPoints(sc, "LabeledPointsdata_sex.txt") 21 | ageData = MLUtils.loadLabeledPoints(sc, "LabeledPointsdata_age.txt") 22 | #print(data.collect()) 23 | print('Data File has been Loaded!') 24 | #for(d in data.take(3)): 25 | # print(d) 26 | accuracySex = [] 27 | accuracyAge = [] 28 | for i in range(0,100): 29 | # 将数据随机分割为9:1,分别作为训练数据(training)和预测数据(test). 30 | sexTraining, sexTest = sexData.randomSplit([0.9, 0.1]) 31 | ageTraining, ageTest = ageData.randomSplit([0.9, 0.1]) 32 | 33 | # 训练随机树模型. 34 | #print('Begin RandomForest tranning!') 35 | sexModel = RandomForest.trainClassifier(sexTraining,numClasses=2, categoricalFeaturesInfo={}, 36 | numTrees=3, featureSubsetStrategy="auto", 37 | impurity='gini', maxDepth=4, maxBins=32) 38 | ageModel = RandomForest.trainClassifier(ageTraining,numClasses=1000, categoricalFeaturesInfo={}, 39 | numTrees=3, featureSubsetStrategy="auto", 40 | impurity='gini', maxDepth=4, maxBins=32) 41 | #print('Trainning over!') 42 | # 对test数据进行预测,输出预测准确度. 
43 | sexPredictionAndLabel = sexTest.map(lambda p: p.label).zip(sexModel.predict(sexTest.map(lambda x: x.features))) 44 | agePredictionAndLabel = ageTest.map(lambda p: p.label).zip(ageModel.predict(ageTest.map(lambda x: x.features))) 45 | #print(predictionAndLabel.collect()) 46 | accuracySex.append(1.0 * sexPredictionAndLabel.filter(lambda (x, v): x == v).count() / sexTest.count()) 47 | accuracyAge.append(1.0 * agePredictionAndLabel.filter(lambda (x, v): abs((x-v)<=5)).count() / ageTest.count()) 48 | #AVG:平均数 MSE:均方差 49 | SexRDD = sc.parallelize(accuracySex) 50 | AgeRDD = sc.parallelize(accuracyAge) 51 | SexPAAVG = SexRDD.reduce(lambda x,y:x+y)/SexRDD.count() 52 | AgePAAVG = AgeRDD.reduce(lambda x,y:x+y)/AgeRDD.count() 53 | SexPAMSE = math.sqrt(SexRDD.map(lambda x:(x - SexPAAVG)*(x - SexPAAVG)).reduce(lambda x,y:x+y)/SexRDD.count()) 54 | AgePAMSE = math.sqrt(AgeRDD.map(lambda x:(x - AgePAAVG)*(x - AgePAAVG)).reduce(lambda x,y:x+y)/AgeRDD.count()) 55 | #print(sum(accuracySex) / len(accuracySex)) 56 | #print(sum(accuracyAge) / len(accuracyAge)) 57 | 58 | print('Sex Prediction Accuracy AVG:{}'.format(SexPAAVG)) 59 | print('Sex Prediction Accuracy MSE:{}'.format(SexPAMSE)) 60 | print('AGE Prediction Accuracy AVG:{}'.format(AgePAAVG)) 61 | print('AGE Prediction Accuracy MSE:{}'.format(AgePAMSE)) 62 | 63 | output = open('RandomForestResult.txt', 'w') 64 | output.write('Sex Prediction Accuracy AVG is:' + str(SexPAAVG) + "\n") 65 | output.write('Sex Prediction Accuracy MSE is:' + str(SexPAMSE) + "\n") 66 | for i in accuracySex: 67 | output.write(str(i)+",") 68 | output.write("\n") 69 | output.write('Age Prediction Accuracy AVG is:' + str(AgePAAVG) + "\n") 70 | output.write('Age Prediction Accuracy MSE is:' + str(AgePAMSE) + "\n") 71 | for i in accuracyAge: 72 | output.write(str(i) + ",") 73 | output.write("\n") 74 | output.close() 75 | 76 | -------------------------------------------------------------------------------- /Spark/BloodTestReportDeepLearning/BloodTestReportbySVM.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csxiaoyaojianxian/BloodTestReportOCR/95d058e4999806fa50bbcf6d10fe8a0af5746759/Spark/BloodTestReportDeepLearning/BloodTestReportbySVM.py -------------------------------------------------------------------------------- /Spark/BloodTestReportDeepLearning/README.md: -------------------------------------------------------------------------------- 1 | # 基于Spark的血常规检验报告深度学习系统 2 | ##构造训练测试数据 3 | 原始数据在data_set.csv中,运行 4 | ``` 5 | python ./dataformat.py 6 | ``` 7 | 8 | 生成Spark使用的labeled point数据,分别保存在LabeledPointsdata_age.txt和LabeledPointsdata_sex.txt中 9 | 10 | ##运行 11 | 12 | 所有示例都自动对两个数据集中的数据随机分为9:1,9份做模型训练,1份做预测测试。重复100次后分别计算年龄和性别的预测准确度和方差,在屏幕输出的同时,保存在对应的 算法名+result.txt文件中。 13 | 14 | ###朴素贝叶斯算法(支持多分类) 15 | ``` 16 | python ./BloodTestReportbyNB.py 17 | ``` 18 | 19 | 结果: 20 | ``` 21 | Sex Prediction Accuracy AVG is:0.621970740283 22 | Sex Prediction Accuracy MSE is:0.0339853457575 23 | Age Prediction Accuracy AVG is:0.539635804425 24 | Age Prediction Accuracy MSE is:0.039652048965 25 | ``` 26 | ###线性支持向量机(仅支持二分类) 27 | ``` 28 | python ./BloodTestReportbySVM.py 29 | ``` 30 | 31 | 结果(迭代次数=100): 32 | ``` 33 | Sex Prediction Accuracy AVG is:0.528946440893 34 | Sex Prediction Accuracy MSE is:0.0499342692342 35 | ``` 36 | 37 | ###逻辑回归(仅支持二分类) 38 | 39 | ``` 40 | python ./BloodTestReportbyLR.py 41 | ``` 42 | 43 | 结果(迭代次数=100): 44 | ``` 45 | Sex Prediction Accuracy AVG is:0.717975697167 46 | Sex Prediction Accuracy MSE 
is:0.0303414723843 47 | ``` 48 | 49 | ###随机树(支持多分类) 50 | ``` 51 | python ./BloodTestReportbyRF.py 52 | ``` 53 | 54 | 结果(树=3,最大深度=4,最大叶子数=32,纯度计算方式:基尼系数,性别分类=2,年龄分类=1000(此处取值与纯度计算方式有关,实际年龄label只有92个,具体算法还未完全掌握)): 55 | ``` 56 | Sex Prediction Accuracy AVG is:0.71622711581 57 | Sex Prediction Accuracy MSE is:0.0255871783772 58 | Age Prediction Accuracy AVG is:0.561977173151 59 | Age Prediction Accuracy MSE is:0.0622593472121 60 | ``` 61 | 62 | ###梯度提升树(仅支持二分类) 63 | 64 | ``` 65 | python ./BTR_gradient_boosting.py 66 | ``` 67 | 68 | 结果(迭代次数=100): 69 | ``` 70 | Sex Prediction Accuracy AVG is:0.728212518228 71 | Sex Prediction Accuracy MSE is:0.0305777571064 72 | ``` 73 | 74 | ###二分类(仅支持二分类) 75 | 76 | ``` 77 | python ./BTR_binary_classification.py 78 | ``` 79 | 80 | 结果(迭代次数=100): 81 | ``` 82 | Sex Prediction Accuracy AVG is:0.718756411999 83 | Sex Prediction Accuracy MSE is:0.0311279215968 84 | ``` 85 | 86 | ###决策树(支持多分类) 87 | ``` 88 | python ./BTR_decision_tree.py 89 | ``` 90 | 91 | 结果: 92 | ``` 93 | Sex Prediction Accuracy AVG is:0.707608775434 94 | Sex Prediction Accuracy MSE is:0.0292234440441 95 | Age Prediction Accuracy AVG is:0.552560046229 96 | Age Prediction Accuracy MSE is:0.05098502703 97 | ``` -------------------------------------------------------------------------------- /Spark/BloodTestReportDeepLearning/data_set.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csxiaoyaojianxian/BloodTestReportOCR/95d058e4999806fa50bbcf6d10fe8a0af5746759/Spark/BloodTestReportDeepLearning/data_set.csv -------------------------------------------------------------------------------- /Spark/BloodTestReportDeepLearning/dataformat.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csxiaoyaojianxian/BloodTestReportOCR/95d058e4999806fa50bbcf6d10fe8a0af5746759/Spark/BloodTestReportDeepLearning/dataformat.py -------------------------------------------------------------------------------- /Spark/BloodTestReportDeepLearning/spark单机安装15122016.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csxiaoyaojianxian/BloodTestReportOCR/95d058e4999806fa50bbcf6d10fe8a0af5746759/Spark/BloodTestReportDeepLearning/spark单机安装15122016.md -------------------------------------------------------------------------------- /Spark/DigitRecogn_Spark/Readme.md: -------------------------------------------------------------------------------- 1 | 2 | #基于Spark的Ocr手写字符识别系统Demo 3 | 4 | ##构造训练测试数据 5 | ###下载数据集 6 | ``` 7 | 8 | wget http://labfile.oss.aliyuncs.com/courses/593/data.csv 9 | ``` 10 | 该数据集是https://www.shiyanlou.com/courses/593/labs/1966/document 中为反向神经网络训练用的数据集 11 | ###格式化数据集 12 | Spark深度学习常用的两种训练数据格式为Labeled point和LibSVM,在此,我们使用Labeled point作为训练数据格式。 13 | 14 | 15 | labeled point 是一个局部向量,要么是密集型的要么是稀疏型的,用一个label/response进行关联。在Spark里,labeled points 被用来监督学习算法。我们使用一个double数来存储一个label,因此我们能够使用labeled points进行回归和分类。 16 | 在二进制分类里,一个label可以是 0(负数)或者 1(正数)。在多级分类中,labels可以是class的索引,从0开始:0,1,2,...... 
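To make the labeled point description above concrete, the sketch below writes two toy samples in the plain "label,feature feature ..." text layout used by this demo (server.py, listed further down, writes LabeledPointsdata.txt in exactly this form, and MLUtils.loadLabeledPoints() reads it back). The file name and feature values here are made up for illustration.

```
# Illustrative only: the "label,f1 f2 f3 ..." text form of a labeled point.
from __future__ import print_function

samples = [
    (1, [0, 0, 1, 1, 0]),   # label 1 with a toy 5-value feature vector
    (0, [1, 0, 0, 0, 1]),   # label 0
]

with open('LabeledPointsdata_toy.txt', 'w') as out:
    for label, features in samples:
        out.write('%d,%s\n' % (label, ' '.join(str(v) for v in features)))

print(open('LabeledPointsdata_toy.txt').read())
# 1,0 0 1 1 0
# 0,1 0 0 0 1
```

Loading such a file back is then just MLUtils.loadLabeledPoints(sc, 'LabeledPointsdata_toy.txt') followed by randomSplit([0.9, 0.1]), the same pattern used by the BloodTestReport scripts above.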
17 | 18 | 本Demo采用朴素贝叶斯作为训练、预测模型,特征值必须是非负数。 19 | 20 | 程序在运行过程中先读取并格式化./data.csv中的数据,然后和网页前端传来的训练数据一起格式化为labeled points格式 21 | 新生成的LabeledPoints数据保存在LabeledPointsdata.txt中。 22 | 23 | 需要预测时,先将LabeledPointsdata.txt中的数据读取为Spark 专用 RDD 形式,然后训练到model中 24 | 25 | ##运行 26 | 27 | 28 | ###创建服务器 29 | ``` 30 | python -m SimpleHTTPServer 3000 31 | ``` 32 | 33 | ###加载服务器 34 | ``` 35 | python server.py 36 | 37 | ``` 38 | ###访问 39 | ``` 40 | localhost:3000 41 | ``` -------------------------------------------------------------------------------- /Spark/DigitRecogn_Spark/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |
9 | ... 17 | [index.html cannot be reproduced here: the HTML tags were stripped when this listing was exported, and only the visible text survives, namely the page title "OCR Demo" and the "Digit:" input label. See the index.html file in the repository for the actual markup.]
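Because the page markup is not readable here, the ocr.js and server.py listings that follow define the working interface: the browser POSTs JSON to localhost:9000 with either a training or a prediction payload. Below is a minimal Python sketch of a client speaking that protocol. It is illustrative only; it assumes server.py is already running on port 9000 and reuses the field names found in ocr.js (trainArray, y0, label, image).

```
# -*- coding: UTF-8 -*-
# Illustrative client sketch for the JSON protocol used by ocr.js / server.py.
# Assumes server.py (listed below) is running on localhost:9000.
from __future__ import print_function
import json
import urllib2   # Python 2 standard library, like the rest of this demo

URL = 'http://localhost:9000'

def send(payload):
    req = urllib2.Request(URL, json.dumps(payload),
                          {'Content-Type': 'application/json'})
    return urllib2.urlopen(req).read()

drawing = [0] * 400   # the 20x20 canvas flattened to 400 zeros/ones

# training request: one labelled drawing, mirroring ocr.js train()
send({'train': True, 'trainArray': [{'y0': drawing, 'label': 0}]})

# prediction request: the server answers {"type": "test", "result": "<digit>"}
print(send({'predict': True, 'image': drawing}))
```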
18 | 19 | 20 | -------------------------------------------------------------------------------- /Spark/DigitRecogn_Spark/ocr.js: -------------------------------------------------------------------------------- 1 | var ocrDemo = { 2 | CANVAS_WIDTH: 200, 3 | TRANSLATED_WIDTH: 20, 4 | PIXEL_WIDTH: 10, // TRANSLATED_WIDTH = CANVAS_WIDTH / PIXEL_WIDTH 5 | BATCH_SIZE: 1, 6 | 7 | // 服务器端参数 8 | PORT: "9000", 9 | HOST: "http://localhost", 10 | 11 | // 颜色变量 12 | BLACK: "#000000", 13 | BLUE: "#0000ff", 14 | 15 | // 客户端训练数据集 16 | trainArray: [], 17 | trainingRequestCount: 0, 18 | 19 | onLoadFunction: function() { 20 | this.resetCanvas(); 21 | }, 22 | 23 | resetCanvas: function() { 24 | var canvas = document.getElementById('canvas'); 25 | var ctx = canvas.getContext('2d'); 26 | 27 | this.data = []; 28 | ctx.fillStyle = this.BLACK; 29 | ctx.fillRect(0, 0, this.CANVAS_WIDTH, this.CANVAS_WIDTH); 30 | var matrixSize = 400; 31 | while (matrixSize--) this.data.push(0); 32 | this.drawGrid(ctx); 33 | 34 | // 绑定事件操作 35 | canvas.onmousemove = function(e) { this.onMouseMove(e, ctx, canvas) }.bind(this); 36 | canvas.onmousedown = function(e) { this.onMouseDown(e, ctx, canvas) }.bind(this); 37 | canvas.onmouseup = function(e) { this.onMouseUp(e, ctx) }.bind(this); 38 | }, 39 | 40 | drawGrid: function(ctx) { 41 | for (var x = this.PIXEL_WIDTH, y = this.PIXEL_WIDTH; x < this.CANVAS_WIDTH; x += this.PIXEL_WIDTH, y += this.PIXEL_WIDTH) { 42 | ctx.strokeStyle = this.BLUE; 43 | ctx.beginPath(); 44 | ctx.moveTo(x, 0); 45 | ctx.lineTo(x, this.CANVAS_WIDTH); 46 | ctx.stroke(); 47 | 48 | ctx.beginPath(); 49 | ctx.moveTo(0, y); 50 | ctx.lineTo(this.CANVAS_WIDTH, y); 51 | ctx.stroke(); 52 | } 53 | }, 54 | 55 | onMouseMove: function(e, ctx, canvas) { 56 | if (!canvas.isDrawing) { 57 | return; 58 | } 59 | this.fillSquare(ctx, e.clientX - canvas.offsetLeft, e.clientY - canvas.offsetTop); 60 | }, 61 | 62 | onMouseDown: function(e, ctx, canvas) { 63 | canvas.isDrawing = true; 64 | this.fillSquare(ctx, e.clientX - canvas.offsetLeft, e.clientY - canvas.offsetTop); 65 | }, 66 | 67 | onMouseUp: function(e) { 68 | canvas.isDrawing = false; 69 | }, 70 | 71 | fillSquare: function(ctx, x, y) { 72 | var xPixel = Math.floor(x / this.PIXEL_WIDTH); 73 | var yPixel = Math.floor(y / this.PIXEL_WIDTH); 74 | // 存储手写输入数据 75 | this.data[((xPixel - 1) * this.TRANSLATED_WIDTH + yPixel) - 1] = 1; 76 | 77 | ctx.fillStyle = '#ffffff'; 78 | ctx.fillRect(xPixel * this.PIXEL_WIDTH, yPixel * this.PIXEL_WIDTH, this.PIXEL_WIDTH, this.PIXEL_WIDTH); 79 | }, 80 | 81 | train: function() { 82 | var digitVal = document.getElementById("digit").value; 83 | if (!digitVal || this.data.indexOf(1) < 0) { 84 | alert("Please type and draw a digit value in order to train the network"); 85 | return; 86 | } 87 | // 将数据加入客户端训练数据集 88 | this.trainArray.push({"y0": this.data, "label": parseInt(digitVal)}); 89 | this.trainingRequestCount++; 90 | 91 | // 将客服端训练数据集发送给服务器端 92 | if (this.trainingRequestCount == this.BATCH_SIZE) { 93 | alert("Sending training data to server..."); 94 | var json = { 95 | trainArray: this.trainArray, 96 | train: true 97 | }; 98 | 99 | this.sendData(json); 100 | this.trainingRequestCount = 0; 101 | this.trainArray = []; 102 | } 103 | }, 104 | 105 | // 发送预测请求 106 | test: function() { 107 | if (this.data.indexOf(1) < 0) { 108 | alert("Please draw a digit in order to test the network"); 109 | return; 110 | } 111 | var json = { 112 | image: this.data, 113 | predict: true 114 | }; 115 | this.sendData(json); 116 | }, 117 | 118 | // 处理服务器响应 119 | receiveResponse: 
function(xmlHttp) { 120 | if (xmlHttp.status != 200) { 121 | alert("Server returned status " + xmlHttp.status); 122 | return; 123 | } 124 | var responseJSON = JSON.parse(xmlHttp.responseText); 125 | if (xmlHttp.responseText && responseJSON.type == "test") { 126 | alert("The neural network predicts you wrote a \'" + responseJSON.result + '\''); 127 | } 128 | }, 129 | 130 | onError: function(e) { 131 | alert("Error occurred while connecting to server: " + e.target.statusText); 132 | }, 133 | 134 | sendData: function(json) { 135 | var xmlHttp = new XMLHttpRequest(); 136 | xmlHttp.open('POST', this.HOST + ":" + this.PORT, false); 137 | xmlHttp.onload = function() { this.receiveResponse(xmlHttp); }.bind(this); 138 | xmlHttp.onerror = function() { this.onError(xmlHttp) }.bind(this); 139 | var msg = JSON.stringify(json); 140 | xmlHttp.setRequestHeader('Content-length', msg.length); 141 | xmlHttp.setRequestHeader("Connection", "close"); 142 | xmlHttp.send(msg); 143 | } 144 | } 145 | 146 | -------------------------------------------------------------------------------- /Spark/DigitRecogn_Spark/server.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | from __future__ import print_function 3 | import BaseHTTPServer 4 | import json 5 | import csv 6 | import shutil 7 | from pyspark import SparkContext 8 | from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel 9 | from pyspark.mllib.util import MLUtils 10 | 11 | #服务器端配置 12 | HOST_NAME = 'localhost' 13 | PORT_NUMBER = 9000 14 | reader = csv.reader(file('./data.csv', 'rb')) 15 | output = open('LabeledPointsdata.txt', 'a') 16 | reader = csv.reader(file('./data.csv', 'rb')) 17 | output = open('LabeledPointsdata.txt', 'w') 18 | n = 0 19 | 20 | sc = SparkContext(appName="PythonNaiveBayesExample") 21 | 22 | for line in reader: 23 | outputline ='%d' % int(n/500) + "," #每500行为一个数字的训练集 24 | n = n + 1 25 | for c in line: 26 | if "0.0000000000"==c: 27 | outputline += '0 ' 28 | else: 29 | outputline += '1 ' 30 | outputline += '\n' 31 | output.write(outputline) 32 | output.close() 33 | print('Format Successful!') 34 | 35 | class JSONHandler(BaseHTTPServer.BaseHTTPRequestHandler): 36 | 37 | """处理接收到的POST请求""" 38 | def do_POST(self): 39 | response_code = 200 40 | response = "" 41 | var_len = int(self.headers.get('Content-Length')) 42 | content = self.rfile.read(var_len); 43 | payload = json.loads(content); 44 | 45 | # 如果是训练请求,训练然后保存训练完的神经网络 46 | if payload.get('train'): 47 | # 转化数据格式 48 | TrainData = "" 49 | for d in payload['trainArray'][0]['y0']: 50 | TrainData = TrainData + " " + ('%d' % d) 51 | TrainData = '%d' % (payload['trainArray'][0]['label']) + "," + TrainData.lstrip() +"\n" 52 | print(TrainData) 53 | Addoutput = open('LabeledPointsdata.txt', 'a') 54 | Addoutput.write(TrainData) 55 | Addoutput.close() 56 | 57 | 58 | # 如果是预测请求,返回预测值 59 | elif payload.get('predict'): 60 | try: 61 | training = MLUtils.loadLabeledPoints(sc, "LabeledPointsdata.txt") 62 | print('Begin NaiveBayes tranning!') 63 | model = NaiveBayes.train(training, 1.0) 64 | print('Trainning over!') 65 | print(payload['image']) 66 | response = {"type":"test", "result":str(model.predict(payload['image']))} 67 | except: 68 | response_code = 500 69 | else: 70 | response_code = 400 71 | 72 | self.send_response(response_code) 73 | self.send_header("Content-type", "application/json") 74 | self.send_header("Access-Control-Allow-Origin", "*") 75 | self.end_headers() 76 | if response: 77 | self.wfile.write(json.dumps(response)) 78 
| return 79 | 80 | if __name__ == '__main__': 81 | server_class = BaseHTTPServer.HTTPServer; 82 | httpd = server_class((HOST_NAME, PORT_NUMBER), JSONHandler) 83 | 84 | try: 85 | #启动服务器 86 | httpd.serve_forever() 87 | print("Server started.") 88 | except KeyboardInterrupt: 89 | pass 90 | else: 91 | print ("Unexpected server exception occurred.") 92 | finally: 93 | httpd.server_close() 94 | 95 | -------------------------------------------------------------------------------- /Spark/README.md: -------------------------------------------------------------------------------- 1 | # 血常规检验报告深度学习系统 on Spark 2 | 3 | Spark是UC Berkeley AMP lab (加州大学伯克利分校的AMP实验室)所开源的类Hadoop MapReduce的通用并行框架,Spark,拥有Hadoop MapReduce所具有的优点;但不同于MapReduce的是Job中间输出结果可以保存在内存中,从而不再需要读写HDFS,因此Spark能更好地适用于数据挖掘与机器学习等需要迭代的MapReduce的算法。 4 | 5 | 该Demo主要演示Spark的深度学习功能,数据由Spark直接读取,尚未使用Hadoop等数据库。 6 | 7 | ##运行环境 8 | ###安装JDK 9 | ``` 10 | java -version 11 | ``` 12 | 如果未安装,请下载最新JDK并设置相应的JAVA_HOME、JRE_HOME、CLASSPATH、PATH变量 13 | 14 | ###安装Scala并添加Scala_HOME,更新PATH 15 | 16 | ``` 17 | sudo apt-get install scala 18 | ``` 19 | 20 | 下载Spark并解压 21 | 22 | 官网下载地址:http://spark.apache.org/downloads.html 23 | ###配置Spark环境 24 | ``` 25 | cp ./conf/spark-env.sh.template ./conf/spark-env.sh 26 | ``` 27 | ###安装Python依赖包 28 | ``` 29 | sudo apt-get install python-numpy 30 | ``` 31 | ###设置Python依赖路径 32 | ``` 33 | sudo vim /etc/profile 34 | ``` 35 | 在结尾处添加 36 | ``` 37 | export SPARK_HOME=/home/hadoop/spark #你的Spark解压目录 38 | 39 | export PYTHONPATH=$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.1-src.zip:$PYTHONPATH #py4j及pysqrk的相关依赖路径,py4j-0.10.1-src文件名可能会因Spark版本不同而不同,请设置为自己对应目录下的文件名 40 | ``` 41 | ###启动SPARK 42 | ``` 43 | sudo ./sbin/start-all.sh 44 | ``` 45 | 在root下输入jps应该可以看到Master和Worker两个进程 46 | 47 | 也可以登陆 48 | ``` 49 | http://127.0.0.1:8080/ 50 | ``` 51 | 查看Spark状态 52 | 53 | ##安装MongoDB Connector for Hadoop 54 | 55 | MongoDB Connector for Hadoop是一个类库,他允许包括Spark、Pig、Hive、Mapreduce等在内的多种Hadoop架构中的组件使用MongoDB作为数据源。 56 | ###第三方软件安装 57 | 使用Maven安装: 58 | ``` 59 | 60 | org.mongodb.mongo-hadoop 61 | mongo-hadoop-core 62 | 1.5.1 63 | 64 | ``` 65 | 或使用Gradle安装: 66 | ``` 67 | compile 'org.mongodb.mongo-hadoop:mongo-hadoop-core:1.5.1' 68 | ``` 69 | ###独立安装 70 | 克隆源代码: 71 | ``` 72 | git clone https://github.com/mongodb/mongo-hadoop.git 73 | ``` 74 | 75 | 源代码克隆后需要编译,编译过程需连接外网进行下载,国内下载速度较慢,建议使用VPN 76 | ``` 77 | ./gradlew jar 78 | ``` 79 | 编译后的文件会放在core/build/libs目录下。若安装了Hadoop,则将三个文件分别拷贝至以下目录 80 | 81 | -$HADOOP_PREFIX/lib/ 82 | -$HADOOP_PREFIX/share/hadoop/mapreduce/ 83 | -$HADOOP_PREFIX/share/hadoop/lib/ 84 | 若是Spark独立部署,则将其拷贝至本机pyspark目录下即可。 85 | 86 | -------------------------------------------------------------------------------- /TensorFlow/LSTM.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import tensorflow as tf 3 | from tensorflow.python.ops import rnn, rnn_cell 4 | import numpy as np 5 | 6 | learning_rate = 0.002 7 | training_iters = 1858 8 | Text_iters = 200 9 | display_step = 10 10 | 11 | 12 | 13 | n_input = 13 14 | n_steps = 2 15 | n_hidden = 64 16 | n_classes = 2 17 | 18 | def one_hot(a, length): 19 | b = np.zeros([length, 2]) 20 | for i in range(length): 21 | if a[i] == 0: 22 | b[i][1] = 1 23 | else: 24 | b[i][0] = 1 25 | return b 26 | 27 | 28 | train_data = np.loadtxt(open("./train.csv","rb"),delimiter=",",skiprows=0) 29 | test_data = np.loadtxt(open("./predict.csv","rb"),delimiter=",",skiprows=0) 30 | #selet rows and column 31 | train_label_sex = train_data[:, 
1:2] 32 | 33 | train_label_sex = one_hot(train_label_sex,train_data.shape[0]) 34 | 35 | 36 | train_data = train_data[:, 3:] 37 | 38 | train_data = np.reshape(train_data, (1858,n_steps,n_input)) 39 | 40 | 41 | test_label_sex = test_data[:, 1:2] 42 | test_label_sex = one_hot(test_label_sex,test_data.shape[0]) 43 | test_data = test_data[:, 3:] 44 | test_data = np.reshape(test_data, (200,n_steps,n_input)) 45 | 46 | 47 | 48 | x = tf.placeholder("float", [None, n_steps, n_input]) 49 | 50 | # Tensorflow LSTM cell requires 2x n_hidden length (state & cell) 51 | istate = tf.placeholder("float", [None, 2 * n_hidden]) 52 | y = tf.placeholder("float", [None, n_classes]) 53 | 54 | weights = { 55 | 'hidden': tf.Variable(tf.random_normal([n_input, n_hidden])), 56 | 'out': tf.Variable(tf.random_normal([n_hidden, n_classes])) 57 | } 58 | biases = { 59 | 'hidden': tf.Variable(tf.random_normal([n_hidden])), 60 | 'out': tf.Variable(tf.random_normal([n_classes])) 61 | } 62 | 63 | 64 | def RNN(_x, _istate, _weights, _biases): 65 | 66 | 67 | # Permuting n_steps 68 | 69 | _x = tf.transpose(_x, [1, 0, 2]) 70 | # Reshaping to (n_steps*batch_size, n_input) 71 | _x = tf.reshape(_x, [-1, n_input]) 72 | # Split to get a list of 'n_steps' tensors of shape (batch_size, n_input) 73 | 74 | 75 | _x = tf.matmul(_x, _weights['hidden']) + _biases['hidden'] 76 | 77 | lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(n_hidden, forget_bias=1.0,state_is_tuple=False) 78 | _x = tf.split(0, n_steps, _x) 79 | # Get lstm cell output 80 | outputs, states = tf.nn.rnn(lstm_cell, _x, dtype=tf.float32, initial_state=_istate) 81 | 82 | # Linear activation, using rnn inner loop last output 83 | return tf.matmul(outputs[-1], _weights['out']) + _biases['out'] 84 | 85 | pred = RNN(x, istate, weights, biases) 86 | 87 | cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(pred, y)) 88 | 89 | optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost) 90 | 91 | 92 | correct_pred = tf.equal(tf.argmax(pred,1), tf.argmax(y,1)) 93 | accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32)) 94 | 95 | init = tf.global_variables_initializer() 96 | 97 | 98 | with tf.Session() as sess: 99 | sess.run(init) 100 | step = 1 101 | 102 | while step < 300: 103 | 104 | sess.run(optimizer, feed_dict={x: train_data, y: train_label_sex, istate: np.zeros((training_iters, 2 * n_hidden))}) 105 | if step % display_step == 0: 106 | # Calculate batch accuracy 107 | acc = sess.run(accuracy, feed_dict={x: train_data, y: train_label_sex, istate: np.zeros((training_iters, 2 * n_hidden))}) 108 | # Calculate batch loss 109 | loss = sess.run(cost, feed_dict={x: train_data, y: train_label_sex,istate: np.zeros((training_iters, 2 * n_hidden))}) 110 | print("Iter " + str(step) + ", Loss= " + \ 111 | "{:.6f}".format(loss) + ", Training Accuracy= " + \ 112 | "{:.5f}".format(acc)) 113 | step += 1 114 | print("Optimization Finished!") 115 | 116 | 117 | print("Testing Accuracy:", \ 118 | sess.run(accuracy, feed_dict={x: test_data, y: test_label_sex, 119 | istate: np.zeros((Text_iters, 2 * n_hidden))})) 120 | -------------------------------------------------------------------------------- /TensorFlow/README.md: -------------------------------------------------------------------------------- 1 | # Tensorflow框架下的mnist手写字符识别 2 | - 简单双隐层 26->238->512->100(年龄)/ 2(性别) 3 | - 学习率0.01/0.1 4 | - 训练数据集A2的csv血液数据报告文件 5 | - 输出层用softmax函数做分类器,损失函数是cross entropy 6 | - 批处理大小为17 7 | - 本内容皆在提供tensorflow标准数据格式的预处理范例 8 | 9 | ### 环境配置 10 | 系统: UBUNTU系列, 有N卡支持CUDA请装GPU版本并在sess出处使用GPU执行训练 11 
| 12 | # 安装numpy 13 | sudo apt-get install python-numpy 14 | 15 | # 安装PIL 16 | sudo apt-get install python-imaging 17 | 18 | # 安装Tensorflow 19 | pip install --upgrade https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.12.0rc0-cp27-none-linux_x86_64.whl 20 | 21 | 22 | ### 运行 23 | mkdir ckpt_age 24 | mkdir ckpt_sex 25 | python age_predict.py # 网络结构未优化,准确率40%上下 26 | python sex_predict.py # 同样由于网络结构问题,损失函数不收敛 27 | ### 解释 28 | 1. age_predict.py 29 | 训练网络,并预测一条记录(预测样本放在代码最后) 30 | 2. tfrecords后缀文件和ckpt文件夹下内容 31 | 第一次运行会根据数据集产生tfrecord文件,文件feed以及分batch均需要构建为这个标准数据格式,如需要扩充变化数据集请删除tfrecords内容(如要变化数据格式请重新) 32 | 第一次运行会在ckpt下状态保存点,如果需要调参再训练,请删除ckpt文件夹下内容; 33 | 34 | ### 注意 35 | 如果不是用的最新版tensorflow,请去旧版文档查询并更改Saver()和Initializer()函数,0.11及以下版本使用的API名称是不同的 36 | 37 | ### agepredictv2.0.py注释 38 | 定义了添加层函数。通过升维,使不同年龄段输出节点不同,调参找到比较好结果,设置2隐藏层。隐藏层节点数约为输入75%。使年龄预测率提高到24%左右。 39 | 其典型的bp神经网络模型流要更具有普适性。可直接利用该文件夹俩csv文件运行。 ——SA312 40 | 41 | ### TensorBoard可视化 42 | 程序运行完毕之后, 会产生logs目录 , 使用命令 tensorboard --logdir='logs/',然后打开浏览器查看 43 | -------------------------------------------------------------------------------- /TensorFlow/age_predict.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import tensorflow as tf 4 | import numpy as np 5 | import csv 6 | import random 7 | 8 | # id,sex,age,WBC,RBC,HGB,HCT,MCV,MCH,MCHC,RDW,PLT,MPV,PCT,PDW,LYM,LYM%,MON,MON%,NEU,NEU%,EOS,EOS%,BAS,BAS%,ALY,ALY%,LIC,LIC% 9 | 10 | # 预测的正确结果定义为 |X - Y| <= 5 11 | 12 | ''' 13 | 数据处理部分 14 | ''' 15 | # 数据集路径 16 | cwd = os.getcwd() 17 | 18 | train = csv.reader(open(cwd + '/train.csv', 'rb')) 19 | predict = csv.reader(open(cwd + '/predict.csv', 'rb')) 20 | 21 | 22 | # 转化标签为one-hot格式, 类别为100类(0 ~ 99岁) 23 | def dense_to_one_hot(labels_dense, num_classes=100): 24 | labels_dense = np.array(labels_dense) 25 | num_labels = labels_dense.shape[0] 26 | index_offset = np.arange(num_labels) * num_classes 27 | labels_one_hot = np.zeros((num_labels, num_classes)) 28 | labels_one_hot.flat[index_offset + labels_dense.ravel()] = 1 29 | return labels_one_hot 30 | 31 | 32 | # 读取数据 33 | def write_to_tensor(name, csv_name): 34 | if os.path.exists(name): 35 | return 36 | csv_file = csv.reader(open(cwd + '/' + csv_name, 'rb')) 37 | writer = tf.python_io.TFRecordWriter(name) 38 | i = 0 39 | for line in csv_file: 40 | if not line: 41 | break 42 | if len(line) is not 29: 43 | continue 44 | if line[2] is '1.5': 45 | print line[2] 46 | continue 47 | index = [int(line[2])] 48 | data = map(float, line)[3:29] 49 | # 注意list类型, Feature或FeatureList等 50 | example = tf.train.Example(features=tf.train.Features(feature={ 51 | "label": tf.train.Feature(int64_list=tf.train.Int64List(value=index)), 52 | 'content': tf.train.Feature(float_list=tf.train.FloatList(value=data)) 53 | })) 54 | print data, index 55 | # 序列化并写入tfrecord 56 | writer.write(example.SerializeToString()) 57 | i += 1 58 | print i, "Data dealed" 59 | writer.close() 60 | 61 | 62 | # 读取数据并解析 63 | def read_and_decode(filename): 64 | # 根据文件名生成一个队列 65 | filename_queue = tf.train.string_input_producer([filename]) 66 | # 创建tfrecord reader 67 | reader = tf.TFRecordReader() 68 | # 返回文件名和文件 69 | _, serialized_example = reader.read(filename_queue) 70 | # 读取时要注意fix shape 71 | features = tf.parse_single_example(serialized_example, 72 | features={ 73 | 'label': tf.FixedLenFeature([], tf.int64), 74 | 'content': tf.FixedLenFeature([26], tf.float32), 75 | }) 76 | 77 | data = tf.cast(features['content'], tf.float32) 78 | label = 
tf.cast(features['label'], tf.int32) 79 | return data, label 80 | 81 | 82 | ''' 83 | 网络结构部分 84 | ''' 85 | 86 | # 定义占位符 87 | x = tf.placeholder(tf.float32, shape=[None, 26]) 88 | y_ = tf.placeholder(tf.float32, shape=[None, 100]) 89 | 90 | 91 | # 定义权重参数格式函数 参数初始值为随机数 0 ~ 0.2 92 | def weight_variable(shape): 93 | initial = tf.truncated_normal(shape, stddev=random.uniform(0, 0.2)) 94 | return tf.Variable(initial) 95 | 96 | 97 | def bias_variable(shape): 98 | initial = tf.constant(random.uniform(0, 0.2), shape=shape) 99 | return tf.Variable(initial) 100 | 101 | 102 | # 调整输入尺寸,一维展开以适应输入层 103 | # 全连接层参数格式 104 | # 全连接层1参数格式 105 | W_fc1 = weight_variable([26, 64]) 106 | b_fc1 = bias_variable([64]) 107 | 108 | # 全连接层1reshape 109 | h_pool2_flat = tf.reshape(x, [-1, 26]) 110 | 111 | # 激励函数fc1 112 | h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1) 113 | 114 | 115 | # 全连接层2参数格式 116 | W_fc2 = weight_variable([64, 512]) 117 | b_fc2 = bias_variable([512]) 118 | 119 | # 全连接层2输入reshape 120 | h_fc1_2 = tf.reshape(h_fc1, [-1, 64]) 121 | 122 | # 激励函数fc2 123 | h_fc2 = tf.nn.relu(tf.matmul(h_fc1_2, W_fc2) + b_fc2) 124 | 125 | # dropout层 126 | keep_prob = tf.placeholder(tf.float32) 127 | h_fc1_drop = tf.nn.dropout(h_fc2, keep_prob) 128 | 129 | # 输出层参数格式 130 | W_fc3 = weight_variable([512, 100]) 131 | b_fc3 = bias_variable([100]) 132 | 133 | # 输出内容为y_result 134 | y_result = tf.matmul(h_fc1_drop, W_fc3) + b_fc3 135 | 136 | # 定义损失函数 交叉熵 137 | cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(y_result, y_)) 138 | 139 | # 定义训练op 140 | train_step = tf.train.AdamOptimizer(0.01).minimize(cross_entropy) 141 | 142 | # 定义正确预测 |y - Y| <= 5 143 | correct_prediction = tf.less_equal(tf.abs(tf.sub(tf.argmax(y_result, 1), tf.argmax(y_, 1))), 5) 144 | # correct_prediction = tf.equal(tf.argmax(y_result, 1), tf.argmax(y_, 1)) 145 | 146 | # 定义正确率 147 | accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) 148 | 149 | # 定义Model Saver op 150 | saver = tf.train.Saver() 151 | 152 | # 定义计算图激活op 153 | init_op = tf.global_variables_initializer() 154 | 155 | ''' 156 | 训练部分 157 | ''' 158 | 159 | # 如果没有保存模型则训练一个新的 160 | 161 | if not os.path.exists("./ckpt_age/checkpoint"): 162 | # 创建tfrecord 163 | write_to_tensor('train_age.tfrecords', 'train.csv') 164 | write_to_tensor('predict_age.tfrecords', 'predict.csv') 165 | # 读取tfrecord 166 | train_img, train_label = read_and_decode("train_age.tfrecords") 167 | test_img, test_label = read_and_decode("predict_age.tfrecords") 168 | 169 | # 使用shuffle_batch分batch并打乱顺序 170 | img_batch, label_batch = tf.train.shuffle_batch([train_img, train_label], 171 | batch_size=17, capacity=2000, 172 | min_after_dequeue=1000) 173 | test_img_batch, test_label_batch = tf.train.shuffle_batch([test_img, test_label], 174 | batch_size=200, capacity=20000, 175 | min_after_dequeue=10000) 176 | with tf.Session() as sess: 177 | # 激活计算图 178 | sess.run(init_op) 179 | # 启动队列 180 | threads = tf.train.start_queue_runners(sess=sess) 181 | # 迭代次数 = 10000 182 | for i in range(10000): 183 | # batch 184 | image, label = sess.run([img_batch, label_batch]) 185 | # 输出局部正确率 186 | if i % 100 == 0: 187 | train_accuracy = accuracy.eval(feed_dict={ 188 | x: image, y_: dense_to_one_hot(label), keep_prob: 1.0}) 189 | print("step %d, training accuracy %g" % (i, train_accuracy)) 190 | train_step.run(feed_dict={x: image, y_: dense_to_one_hot(label), keep_prob: 0.5}) 191 | # 加载测试集 192 | test_img, test_label = sess.run([test_img_batch, test_label_batch]) 193 | # 输出整体正确率 194 | print("test accuracy %g" % 
accuracy.eval(feed_dict={ 195 | x: test_img, y_: dense_to_one_hot(test_label), keep_prob: 1.0})) 196 | # 保存模型 197 | save_path = saver.save(sess, cwd + "/ckpt_age/age.ckpt", write_meta_graph=None) 198 | print("Model saved in file: %s" % save_path) 199 | 200 | ''' 201 | 预测部分 202 | ''' 203 | 204 | def preloadedata(data): 205 | return tf.reshape(np.array(map(float, data[3:29])), [1, 26]).eval() 206 | 207 | # 加载模型 208 | with tf.Session() as sess: 209 | # 恢复checkpoint. 210 | saver.restore(sess, cwd + "/ckpt_age/age.ckpt") 211 | print("Model restored.") 212 | # 读取数据 213 | predict_data = csv.reader(open(cwd + '/predict.csv', 'rb')) 214 | # 预处理数据 215 | my_data = [108,0,7,8.2,7.2,0.191,10.2,2.87,35.1,0.79,9.6,4.38,53.5,0.05,4.8,0.6,0.1,1.2,0.09,1.1,0.14,1.7,139,0.403,84,29,346,10.3,267] 216 | my_data = preloadedata(my_data) 217 | # 输出预测结果 218 | print "predictions", tf.argmax(y_result, 1).eval(feed_dict={x: my_data, keep_prob: 1.0}, session=sess) 219 | # 输出各年龄概率 220 | # print "probabilities", tf.nn.softmax(y_result.eval(feed_dict={x: my_data, keep_prob: 1.0}, session=sess)).eval() 221 | 222 | -------------------------------------------------------------------------------- /TensorFlow/agepredict_v2.0.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | ''' 3 | 定义了添加层函数,进行年龄预测作业。通过升维,使不同数输出节点不同, 4 | 以及非常长时间的调参,找到比较好的结果, 5 | 设置2隐藏层。隐藏层节点数按出入节点75%。 6 | 使年龄预测率提高到23%左右,最高25%。 7 | 其典型的bp神经网络模型流程可借鉴。———SA312 8 | 9 | 增加了损失的输出,增加了TensorBoard可视化。———SA458 10 | ''' 11 | 12 | import tensorflow as tf 13 | import numpy as np 14 | import csv 15 | import math 16 | 17 | 18 | label_orign2 = [] 19 | data_orign2 = [] 20 | sex_orign2 = [] 21 | age_orign2 = [] 22 | 23 | #读预测数据 24 | with open('predict.csv','rb') as precsv2: 25 | reader2 = csv.reader(precsv2) 26 | for line2 in reader2: 27 | 28 | if reader2.line_num == 1: 29 | continue 30 | label_origntemp2 = [0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0] #升维度 31 | label_origntemp2.insert(int(math.floor(float(line2[2])/10)),float(math.floor(float(line2[2])/10))) 32 | label_orign2.append(label_origntemp2) 33 | data_orign2.append(line2[3:]) 34 | label_np_arr2 = np.array(label_orign2) 35 | data_np_arr2 = np.array(data_orign2) 36 | sex_np_arr2 = np.array(sex_orign2) 37 | 38 | data_len2 = data_np_arr2.shape[1] 39 | data_num2 = data_np_arr2.shape[0] 40 | 41 | 42 | 43 | label_orign = [] 44 | data_orign = [] 45 | sex_orign = [] 46 | age_orign = [] 47 | #读训练数据 48 | with open('train.csv','rb') as precsv: 49 | reader = csv.reader(precsv) 50 | for line in reader: 51 | 52 | if reader.line_num == 1: 53 | continue 54 | label_origntemp = [0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0] #升维度 55 | label_origntemp.insert(int(math.floor(float(line[2])/10)),float(math.floor(float(line[2])/10))) 56 | label_orign.append(label_origntemp) 57 | data_orign.append(line[3:]) 58 | label_np_arr = np.array(label_orign) 59 | data_np_arr = np.array(data_orign) 60 | #sex_np_arr = np.array(sex_orign) 61 | 62 | 63 | data_len = data_np_arr.shape[1] 64 | data_num = data_np_arr.shape[0] 65 | 66 | #添加层函数 67 | def add_layer(inputs,in_size,out_size,n_layer,activation_function=None): 68 | layer_name='layer%s'%n_layer 69 | with tf.name_scope('layer'): 70 | with tf.name_scope('weights'): 71 | Ws = tf.Variable(tf.random_normal([in_size,out_size])) 72 | tf.histogram_summary(layer_name+'/weights',Ws) 73 | with tf.name_scope('baises'): 74 | bs = tf.Variable(tf.zeros([1,out_size])+0.5) 75 | tf.histogram_summary(layer_name+'/baises',bs) 76 | with tf.name_scope('Wx_plus_b'): 77 | Wxpb 
= tf.matmul(inputs,Ws) + bs 78 | 79 | if activation_function is None: 80 | outputs = Wxpb 81 | else: 82 | outputs = activation_function(Wxpb) 83 | tf.histogram_summary(layer_name+'/outputs',outputs) 84 | return outputs 85 | #比较函数 86 | def compute_accuracy(v_xs,v_ys): 87 | global prediction 88 | y_pre = sess.run(prediction,feed_dict={xs:v_xs}) 89 | correct_prediction = tf.equal(tf.argmax(y_pre,1),tf.argmax(v_ys,1)) 90 | accuracy = tf.reduce_mean(tf.cast(correct_prediction,tf.float32)) 91 | result = sess.run(accuracy,feed_dict={xs:v_xs,ys:v_ys}) 92 | return result 93 | 94 | # define placeholder for inputs to network 95 | with tf.name_scope('inputs'): 96 | xs = tf.placeholder(tf.float32,[None,data_len]) 97 | ys = tf.placeholder(tf.float32,[None,10]) 98 | 99 | #3个隐藏层 100 | l1 = add_layer(xs,data_len,19,n_layer=1,activation_function=tf.nn.sigmoid) 101 | l2 = add_layer(l1,19,19,n_layer=2,activation_function=tf.nn.sigmoid) 102 | l3 = add_layer(l2,19,19,n_layer=3,activation_function=tf.nn.sigmoid) 103 | # add output layer 104 | prediction = add_layer(l3,19,10,n_layer=4,activation_function=tf.nn.softmax) 105 | 106 | 107 | 108 | with tf.name_scope('loss'): 109 | cross_entropy = tf.reduce_mean(-tf.reduce_sum(ys*tf.log(prediction),reduction_indices=[1])) 110 | tf.scalar_summary('loss',cross_entropy) #show in evernt 111 | with tf.name_scope('train'): 112 | train_step = tf.train.GradientDescentOptimizer(0.1).minimize(cross_entropy) 113 | 114 | 115 | init = tf.initialize_all_variables() 116 | 117 | saver = tf.train.Saver() 118 | sess = tf.Session() 119 | merged = tf.merge_all_summaries() 120 | writer = tf.train.SummaryWriter("logs/", sess.graph) 121 | sess.run(init) 122 | 123 | for i in range(10000): 124 | _, cost = sess.run([train_step, cross_entropy], feed_dict={xs:data_np_arr, 125 | ys:label_np_arr.reshape((data_num,10))}) 126 | #sess.run(train_step,feed_dict={xs:data_np_arr,ys:label_np_arr.reshape((data_num,10))}) 127 | if i%50 == 0: 128 | print("Epoch:", '%04d' % (i), "cost=", \ 129 | "{:.9f}".format(cost),"Accuracy:",compute_accuracy(data_np_arr2,label_np_arr2.reshape((data_num2,10)))) 130 | result = sess.run(merged,feed_dict={xs:data_np_arr, 131 | ys:label_np_arr.reshape((data_num,10))}) 132 | writer.add_summary(result,i) 133 | 134 | print("Optimization Finished!") 135 | 136 | -------------------------------------------------------------------------------- /TensorFlow/rnn.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import tensorflow as tf 3 | from tensorflow.python.ops import rnn, rnn_cell 4 | import numpy as np 5 | 6 | # Parameters 7 | learning_rate = 0.001 8 | training_iters = 200 9 | display_step = 10 10 | 11 | # Network Parameters 12 | n_input = 13 # MNIST data input (img shape: 28*28) 13 | n_steps = 2 # timesteps 14 | n_hidden = 64 # hidden layer num of features 15 | n_classes = 2 # MNIST total classes (0-9 digits) 16 | 17 | def one_hot(a, length): 18 | b = np.zeros([length, 2]) 19 | for i in range(length): 20 | if a[i] == 0: 21 | b[i][1] = 1 22 | else: 23 | b[i][0] = 1 24 | return b 25 | 26 | #1858+200 27 | train_data = np.loadtxt(open("./train.csv","rb"),delimiter=",",skiprows=0) 28 | test_data = np.loadtxt(open("./predict.csv","rb"),delimiter=",",skiprows=0) 29 | train_label_sex = train_data[:, 1:2] 30 | train_label_sex = one_hot(train_label_sex,train_data.shape[0]) 31 | train_data = train_data[:, 3:] 32 | train_data = np.reshape(train_data, (1858,n_steps,n_input)) 33 | 34 | 35 | 36 | test_label_sex = 
test_data[:, 1:2] 37 | test_label_sex = one_hot(test_label_sex,test_data.shape[0]) 38 | test_data = test_data[:, 3:] 39 | test_data = np.reshape(test_data, (200,n_steps,n_input)) 40 | 41 | 42 | # tf Graph input 43 | x = tf.placeholder("float", [None, n_steps, n_input]) 44 | y = tf.placeholder("float", [None, n_classes]) 45 | 46 | # Define weights 47 | weights = { 48 | 'out': tf.Variable(tf.random_normal([n_hidden, n_classes])) 49 | } 50 | biases = { 51 | 'out': tf.Variable(tf.random_normal([n_classes])) 52 | } 53 | 54 | 55 | def RNN(x, weights, biases): 56 | 57 | # Prepare data shape to match `rnn` function requirements 58 | # Current data input shape: (batch_size, n_steps, n_input) 59 | # Required shape: 'n_steps' tensors list of shape (batch_size, n_input) 60 | 61 | # Permuting n_steps 62 | 63 | x = tf.transpose(x, [1, 0, 2]) 64 | # Reshaping to (n_steps*batch_size, n_input) 65 | x = tf.reshape(x, [-1, n_input]) 66 | # Split to get a list of 'n_steps' tensors of shape (batch_size, n_input) 67 | x = tf.split(0, n_steps, x) 68 | 69 | # Define a lstm cell with tensorflow 70 | lstm_cell = rnn_cell.BasicLSTMCell(n_hidden, forget_bias=1.0) 71 | 72 | # Get lstm cell output 73 | outputs, states = rnn.rnn(lstm_cell, x, dtype=tf.float32) 74 | 75 | # Linear activation, using rnn inner loop last output 76 | return tf.matmul(outputs[-1], weights['out']) + biases['out'] 77 | 78 | pred = RNN(x, weights, biases) 79 | 80 | # Define loss and optimizer 81 | cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(pred, y)) 82 | optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost) 83 | 84 | # Evaluate model 85 | correct_pred = tf.equal(tf.argmax(pred,1), tf.argmax(y,1)) 86 | accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32)) 87 | 88 | # Initializing the variables 89 | init = tf.global_variables_initializer() 90 | 91 | # Launch the graph 92 | with tf.Session() as sess: 93 | sess.run(init) 94 | step = 1 95 | # Keep training until reach max iterations 96 | while step < training_iters: 97 | # Reshape data to get 28 seq of 28 elements 98 | # Run optimization op (backprop) 99 | sess.run(optimizer, feed_dict={x: train_data, y: train_label_sex}) 100 | if step % display_step == 0: 101 | # Calculate batch accuracy 102 | acc = sess.run(accuracy, feed_dict={x: train_data, y: train_label_sex}) 103 | # Calculate batch loss 104 | loss = sess.run(cost, feed_dict={x: train_data, y: train_label_sex}) 105 | print("Iter " + str(step) + ", Loss= " + \ 106 | "{:.6f}".format(loss) + ", Training Accuracy= " + \ 107 | "{:.5f}".format(acc)) 108 | step += 1 109 | print("Optimization Finished!") 110 | 111 | 112 | print("Testing Accuracy:", \ 113 | sess.run(accuracy, feed_dict={x: test_data, y: test_label_sex})) 114 | -------------------------------------------------------------------------------- /TensorFlow/sex_predict.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import tensorflow as tf 4 | import numpy as np 5 | import csv 6 | import random 7 | # id,sex,age,WBC,RBC,HGB,HCT,MCV,MCH,MCHC,RDW,PLT,MPV,PCT,PDW,LYM,LYM%,MON,MON%,NEU,NEU%,EOS,EOS%,BAS,BAS%,ALY,ALY%,LIC,LIC% 8 | 9 | # 1为男, 0为女 10 | 11 | ''' 12 | 数据处理部分 13 | ''' 14 | # 数据集路径 15 | cwd = os.getcwd() 16 | 17 | train = csv.reader(open(cwd + '/train.csv', 'rb')) 18 | predict = csv.reader(open(cwd + '/predict.csv', 'rb')) 19 | 20 | 21 | # 转化标签为one-hot格式(类别为两类,男和女) 22 | def dense_to_one_hot(labels_dense, num_classes=2): 23 | labels_dense = 
np.array(labels_dense) 24 | num_labels = labels_dense.shape[0] 25 | index_offset = np.arange(num_labels) * num_classes 26 | labels_one_hot = np.zeros((num_labels, num_classes)) 27 | labels_one_hot.flat[index_offset + labels_dense.ravel()] = 1 28 | return labels_one_hot 29 | 30 | 31 | # 读取数据 32 | def write_to_tensor(name, csv_name): 33 | if os.path.exists(name): 34 | return 35 | csv_file = csv.reader(open(cwd + '/' + csv_name, 'rb')) 36 | writer = tf.python_io.TFRecordWriter(name) 37 | for line in csv_file: 38 | if not line: 39 | break 40 | if len(line) is not 29: 41 | continue 42 | index = [int(line[1])] 43 | # 提取从第4列到第28列 44 | data = map(float, line)[3:29] 45 | # 注意list类型, Feature或FeatureList等 46 | example = tf.train.Example(features=tf.train.Features(feature={ 47 | "label": tf.train.Feature(int64_list=tf.train.Int64List(value=index)), 48 | 'content': tf.train.Feature(float_list=tf.train.FloatList(value=data)) 49 | })) 50 | print data, index 51 | # 序列化并写入tfrecord 52 | writer.write(example.SerializeToString()) 53 | writer.close() 54 | 55 | 56 | # 读取数据并解析 57 | def read_and_decode(filename): 58 | # 根据文件名生成一个队列 59 | filename_queue = tf.train.string_input_producer([filename]) 60 | # 创建tfrecord reader 61 | reader = tf.TFRecordReader() 62 | # 返回文件名和文件 63 | _, serialized_example = reader.read(filename_queue) 64 | # 读取时要注意fix shape 65 | features = tf.parse_single_example(serialized_example, 66 | features={ 67 | 'label': tf.FixedLenFeature([], tf.int64), 68 | 'content': tf.FixedLenFeature([26], tf.float32), 69 | }) 70 | data = tf.cast(features['content'], tf.float32) 71 | label = tf.cast(features['label'], tf.int32) 72 | return data, label 73 | 74 | 75 | ''' 76 | 网络结构部分 结构 双隐层 26 - 64 - 512 - 2 均为全连接层 77 | ''' 78 | 79 | #添加层函数 80 | def add_layer(inputs,in_size,out_size,n_layer,activation_function=None): 81 | layer_name='layer%s'%n_layer 82 | with tf.name_scope('layer'): 83 | with tf.name_scope('weights'): 84 | Ws = tf.Variable(tf.random_normal([in_size,out_size])) 85 | tf.histogram_summary(layer_name+'/weights',Ws) 86 | with tf.name_scope('baises'): 87 | bs = tf.Variable(tf.zeros([1,out_size])+0.5) 88 | tf.histogram_summary(layer_name+'/baises',bs) 89 | with tf.name_scope('Wx_plus_b'): 90 | Wxpb = tf.matmul(inputs,Ws) + bs 91 | 92 | if activation_function is None: 93 | outputs = Wxpb 94 | else: 95 | outputs = activation_function(Wxpb) 96 | tf.histogram_summary(layer_name+'/outputs',outputs) 97 | return outputs 98 | 99 | # 定义占位符 100 | with tf.name_scope('inputs'): 101 | x = tf.placeholder(tf.float32, shape=[None, 26]) 102 | y_ = tf.placeholder(tf.float32, shape=[None, 2]) 103 | 104 | #2个隐藏层 105 | l1 = add_layer(tf.reshape(x, [-1, 26]),26,64,n_layer=1,activation_function=tf.nn.relu) 106 | l2 = add_layer(l1,64,512,n_layer=2,activation_function=tf.nn.relu) 107 | # add output layer 108 | y_result = add_layer(l2,512,2,n_layer=3) 109 | 110 | 111 | # 定义损失函数 交叉熵 112 | with tf.name_scope('loss'): 113 | cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(y_result, y_)) 114 | tf.scalar_summary('loss',cross_entropy) 115 | # 定义训练op 116 | with tf.name_scope('train'): 117 | train_step = tf.train.AdamOptimizer(0.1).minimize(cross_entropy) 118 | 119 | # 定义正确预测 120 | # correct_prediction = tf.less_equal(tf.abs(tf.sub(tf.argmax(y_result, 1), tf.argmax(y_, 1))), 5) 121 | correct_prediction = tf.equal(tf.argmax(y_result, 1), tf.argmax(y_, 1)) 122 | 123 | # 定义正确率 124 | accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) 125 | 126 | # 定义Model Saver op 127 | saver = tf.train.Saver() 128 | 
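# 说明: dense_to_one_hot(num_classes=2) 将性别标签 1(男) 编码为 [0, 1], 0(女) 编码为 [1, 0],
# 因此后续训练与预测部分中 tf.argmax(y_result, 1) 取 1 时即表示预测为男性。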
129 | # 定义计算图激活op 130 | init_op = tf.global_variables_initializer() 131 | 132 | ''' 133 | 训练部分 134 | ''' 135 | # 如果没有保存模型则训练一个新的 136 | if not os.path.exists("./ckpt_sex/checkpoint"): 137 | # 创建tfrecord 138 | write_to_tensor('train_sex.tfrecords', 'train.csv') 139 | write_to_tensor('predict_sex.tfrecords', 'predict.csv') 140 | # 读取tfrecord 141 | train_img, train_label = read_and_decode("train_sex.tfrecords") 142 | test_img, test_label = read_and_decode("predict_sex.tfrecords") 143 | 144 | # 使用shuffle_batch分batch并打乱顺序 145 | img_batch, label_batch = tf.train.shuffle_batch([train_img, train_label], 146 | batch_size=17, capacity=2000, 147 | min_after_dequeue=1000) 148 | test_img_batch, test_label_batch = tf.train.shuffle_batch([test_img, test_label], 149 | batch_size=200, capacity=20000, 150 | min_after_dequeue=10000) 151 | with tf.Session() as sess: 152 | 153 | merged = tf.merge_all_summaries() 154 | writer = tf.train.SummaryWriter("logs/", sess.graph) 155 | # 激活计算图 156 | sess.run(init_op) 157 | # 启动队列 158 | threads = tf.train.start_queue_runners(sess=sess) 159 | # 迭代次数 = 10000 160 | for i in range(10000): 161 | # batch 162 | image, label = sess.run([img_batch, label_batch]) 163 | # 输出局部正确率 164 | if i % 100 == 0: 165 | train_accuracy = accuracy.eval(feed_dict={ 166 | x: image, y_: dense_to_one_hot(label)}) 167 | print("step %d, training accuracy %g" % (i, train_accuracy)) 168 | result = sess.run(merged,feed_dict={x: image, 169 | y_: dense_to_one_hot(label)}) 170 | writer.add_summary(result,i) 171 | train_step.run(feed_dict={x: image, y_: dense_to_one_hot(label)}) 172 | # 加载测试集 173 | test_img, test_label = sess.run([test_img_batch, test_label_batch]) 174 | # 输出整体正确率 175 | print("test accuracy %g" % accuracy.eval(feed_dict={ 176 | x: test_img, y_: dense_to_one_hot(test_label)})) 177 | # 保存模型 178 | save_path = saver.save(sess, cwd + "/ckpt_sex/sex.ckpt", write_meta_graph=None) 179 | print("Model saved in file: %s" % save_path) 180 | 181 | ''' 182 | 预测部分 183 | 给出预测数据格式CSV中任意一行(包括id有29个数据)即可 184 | ''' 185 | 186 | def preloadedata(data): 187 | return tf.reshape(np.array(map(float, data[3:29])), [1, 26]).eval() 188 | 189 | # 加载模型 190 | with tf.Session() as sess: 191 | # 恢复checkpoint. 
192 | saver.restore(sess, cwd + "/ckpt_sex/sex.ckpt") 193 | print("Model restored.") 194 | # 读取数据 195 | predict_data = csv.reader(open(cwd + '/predict.csv', 'rb')) 196 | # 预处理数据 197 | my_data = [37,1,66,8.7,6.9,0.111,10.8,0.55,6.3,0.4,4.6,7.61,87.7,0.1,3.78,1.1,0.03,0.3,0.03,0.4,0.16,1.8,122,0.352,93,32.1,345,11.4,160] 198 | my_data = preloadedata(my_data) 199 | # 输出预测结果(本网络无dropout层, feed_dict中不需要keep_prob) 200 | print "predictions", tf.argmax(y_result, 1).eval(feed_dict={x: my_data}, session=sess) 201 | # 输出男女概率 202 | print "probabilities", tf.nn.softmax(y_result.eval(feed_dict={x: my_data}, session=sess)).eval() 203 | 204 | -------------------------------------------------------------------------------- /Traindata.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Tue Dec 13 17:19:21 2016 5 | 6 | @author: zhao 7 | """ 8 | import random 9 | import numpy as np 10 | import pandas as pd 11 | 12 | class Traindata: 13 | def __init__(self): 14 | self.df = pd.read_csv('trainurl', index_col = 0) 15 | #将性别转化为2维矩阵,行代表病人id,列代表性别,为男则第一列置1,女则第二列置1 16 | self.gender = np.zeros((1858,2)) 17 | for i in range(1858): 18 | if self.df.iloc[i,0]==1: 19 | self.gender[i,0]=1 20 | else: 21 | self.gender[i,1]=1 22 | self.age = self.df.loc[1:,['age']] 23 | #将26项指标转换为26列的矩阵 24 | self.parameter = self.df.loc[1:,['WBC','RBC','HGB','HCT','MCV','MCH','MCHC','ROW','PLT','MPV','PCT','PDW','LYM','LYM%','MON','MON%','NEU','NEU%','EOS','EOS%','BAS','BAS%','ALY','ALY%','LIC','LIC%']] 25 | self.parameter = np.array(self.parameter) 26 | #可以返回随机的n个数据 27 | def next_batch_gender(self,n): 28 | lable = np.zeros((n,2)) 29 | para = np.zeros((n,26)) 30 | for i in range(n): 31 | k=random.randint(0, 1857) 32 | if self.gender[k,0]==1: 33 | lable[i,0]=1 34 | else: 35 | lable[i,1]=1 36 | para[i] = self.parameter[k] 37 | return para,lable 38 | 39 | def next_batch_age(self,n): 40 | para = np.zeros((n,26)) 41 | for i in range(n): 42 | k=random.randint(0, 1857) 43 | if(i==0): 44 | age = pd.DataFrame([self.age.iloc[k]]) 45 | else: 46 | age = age.append(self.age.iloc[k]) 47 | para[i] = self.parameter[k] 48 | return para,age 49 | 50 | 51 | 52 | 53 | -------------------------------------------------------------------------------- /dealdata.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import csv as cv 3 | import numpy as np 4 | import re 5 | csv_file1_object = cv.reader(open('table1.csv','rb')) 6 | csv_file2_object = cv.reader(open('table2.csv','rb')) 7 | csv_file3=open('table3.csv','wb') 8 | csv_file3_object=cv.writer(csv_file3) 9 | head1 = csv_file1_object.next() 10 | head2 = csv_file2_object.next() 11 | data_2=[] 12 | data2=[] 13 | for row in csv_file2_object: 14 | #print len(row[1]) 15 | if len(row[1])<10 and int(row[0])<=26: 16 | data_2.append(row) 17 | else: 18 | pass 19 | 20 | data2=np.array(data_2) 21 | col=0 22 | data2=data2[np.argsort(data2[:,col])] 23 | 24 | 25 | csv_file3_object.writerow(['id','sex','age','WBC','RBC','HGB','HCT','MCV','MCH'\ 26 | ,'MCHC','RDW','PLT','MPV','PCT','PDW','LYM','LYM%','MON','MON%','NEU','NEU%','EOS'\ 27 | ,'EOS%','BAS','BAS%','ALY','ALY%','LIC','LIC%']) 28 | i=1 29 | for row in csv_file1_object: 30 | right_only_stats= data2[(data2[0::,2]==row[2] ) & (data2[0::,3]==row[3]),1] 31 | 32 | #right_only_stats=np.column_stack((right_only_stats,np.array([row[0],row[1]]))) 33 | right_only_stats=np.insert(right_only_stats,0,values=i,axis=None) 34 |
i=i+1 35 | right_only_stats=np.insert(right_only_stats,1,values=row[0],axis=None) 36 | right_only_stats=np.insert(right_only_stats,2,values=row[1],axis=None) 37 | #right_only_stats= data2[0::,2]==row[2] 38 | if len(right_only_stats)==29: 39 | csv_file3_object.writerow(right_only_stats) -------------------------------------------------------------------------------- /matlab/nn/create_nn.m: -------------------------------------------------------------------------------- 1 | load('predict_input_transpose.mat') 2 | load('predict_output_transpose.mat') 3 | load('train_input_transpose.mat') 4 | load('train_output_transpose.mat') 5 | net=newff(train_input_transpose,train_output_transpose,{10,2}); 6 | [net,tr]=train(net,train_input_transpose,train_output_transpose); -------------------------------------------------------------------------------- /matlab/nn/network_hit139.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csxiaoyaojianxian/BloodTestReportOCR/95d058e4999806fa50bbcf6d10fe8a0af5746759/matlab/nn/network_hit139.mat -------------------------------------------------------------------------------- /matlab/nn/readme.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csxiaoyaojianxian/BloodTestReportOCR/95d058e4999806fa50bbcf6d10fe8a0af5746759/matlab/nn/readme.md -------------------------------------------------------------------------------- /matlab/nn/test_nn.m: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csxiaoyaojianxian/BloodTestReportOCR/95d058e4999806fa50bbcf6d10fe8a0af5746759/matlab/nn/test_nn.m -------------------------------------------------------------------------------- /matlab/svm_with_pca/readme.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csxiaoyaojianxian/BloodTestReportOCR/95d058e4999806fa50bbcf6d10fe8a0af5746759/matlab/svm_with_pca/readme.md -------------------------------------------------------------------------------- /matlab/svm_with_pca/svm_with_pca.m: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csxiaoyaojianxian/BloodTestReportOCR/95d058e4999806fa50bbcf6d10fe8a0af5746759/matlab/svm_with_pca/svm_with_pca.m -------------------------------------------------------------------------------- /sklearn/.idea/bloodpredict.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 12 | -------------------------------------------------------------------------------- /sklearn/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /sklearn/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /sklearn/README.md: -------------------------------------------------------------------------------- 1 | # 利用scikit-learn预测病人性别以及年龄 2 | 3 | ## 环境配置(Ubuntu 14.04或以上版本) 4 | 5 | ``` 6 | sudo apt-get install python-numpy cython python-scipy python-matplotlib 7 | pip install -U scikit-learn(如果不行就加sudo) 8 | pip install pandas 9 | ``` 10 | 11 | ## 使用 12 | 
1. 下载预处理过的数据集 13 | 14 | ``` 15 | chmod +x download.sh 16 | ./download.sh 17 | ``` 18 | 19 | 2. 预测 20 | 21 | ``` 22 | python gender_predict.py 23 | python age_predict.py 24 | ``` -------------------------------------------------------------------------------- /sklearn/age_predict.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | pandas 0.18.1 4 | scikit-learn 0.18.1 5 | matplotlib 1.5.3 6 | numpy 1.11.1 7 | """ 8 | 9 | import pandas as pd 10 | import numpy as np 11 | import matplotlib.pyplot as plt 12 | 13 | from sklearn.model_selection import train_test_split 14 | from sklearn.feature_selection import SelectFromModel 15 | from sklearn.ensemble import RandomForestRegressor 16 | from sklearn.ensemble import AdaBoostRegressor 17 | 18 | # 使用了预处理的第二组数据集 19 | class_names_train2 = ['sex','age','WBC','RBC','BAS#','HGB','HCT','MCV', 20 | 'MCH','MCHC','RDW-CV','PLT','MPV','PCT','PDW','LYM#', 21 | 'LYM%','MONO','MONO%','NEU#','NEU%','EOS#','EOS%','BAS%', 22 | 'IG#','IG%','NRBC#','NRBC%','P-LCR'] 23 | 24 | 25 | def load_data(): 26 | # 数据集已合并, 去掉了标签行, sex预处理为数字 27 | df = pd.DataFrame(pd.read_csv('train2.csv', names=class_names_train2)) 28 | # 转化为字符串 29 | df = df.convert_objects(convert_numeric=True) 30 | # 使用平均值填充缺失值 31 | df = df.fillna(df.mean()) 32 | return df 33 | 34 | 35 | def split_data(df, low, high): 36 | """ 37 | :param df: 输入的dataframe 38 | :param low: 截取区间的低阈值 39 | :param high: 截取区间的高阈值(不包含) 40 | :return: 截取的dataframe 41 | """ 42 | df_lowcut = df[df['age'] >= low] 43 | df_cut = df_lowcut[df_lowcut['age'] < high] 44 | 45 | selected_names = [x for x in class_names_train2 if (x != 'age' and x != 'sex')] 46 | x_data = df_cut[selected_names].as_matrix() 47 | y_data = df_cut['age'].as_matrix() 48 | # 用平均值填充nan 49 | def fill_nan(np_array): 50 | col_mean = np.nanmean(np_array, axis=0) 51 | nan_ids = np.where(np.isnan(np_array)) 52 | np_array[nan_ids] = np.take(col_mean, nan_ids[1]) 53 | return np_array 54 | 55 | x_data = fill_nan(x_data) 56 | print 'x有没有nan值:', np.any(np.isnan(x_data)) 57 | print 'y有没有nan值:', np.any(np.isnan(y_data)) 58 | 59 | return x_data, y_data 60 | 61 | 62 | def draw(labels, prediction): 63 | """ 64 | 绘制折线图比较结果 65 | :param labels: 1维numpy数组 66 | :param prediction: 1维numpy数组 67 | :return: 68 | """ 69 | result = [] 70 | for i in range(labels.shape[0]): 71 | result.append([labels[i], prediction[i]]) 72 | 73 | # 将年龄按照大小排序 74 | result = sorted(result, key=lambda x: x[0]) 75 | labels = [row[0] for row in result] 76 | prediction = [row[1] for row in result] 77 | 78 | plt.plot(labels, label='labels') 79 | plt.plot(prediction, label='predict') 80 | plt.legend(loc='upper left') 81 | plt.show() 82 | 83 | 84 | # 评估测试集 85 | def evalue(clf, X_test, y_test): 86 | pd = clf.predict(X_test) 87 | 88 | delta = [x1 - x2 for (x1, x2) in zip(y_test, pd)] 89 | correct_indices = [x for x in delta if abs(x) < 5] 90 | precision = float(len(correct_indices)) / len(pd) 91 | 92 | print '准确率为: ' + str(precision) 93 | draw(y_test, pd) 94 | 95 | 96 | def feature_select(clf, X_train, y_train, X_test): 97 | # 预训练 98 | print '特征选择预训练中...' 99 | clf.fit(X_train, y_train) 100 | 101 | # 评估特征 102 | importances = clf.feature_importances_ 103 | indices = np.argsort(importances)[::-1] 104 | print("特征权值分布为: ") 105 | for f in range(X_train.shape[1]): 106 | print("%d. 
%s %d (%f)" % (f + 1, class_names_train2[indices[f]], indices[f], importances[indices[f]])) 107 | 108 | # 过滤掉权值小于threshold的特征 109 | model = SelectFromModel(clf, threshold=0.01, prefit=True) 110 | X_train_new = model.transform(X_train) 111 | X_test_new = model.transform(X_test) 112 | print '训练集和测试集的容量以及选择的特征数为: ', X_train_new.shape, X_test_new.shape 113 | # 返回压缩特征之后的训练集和测试集 114 | return X_train_new, X_test_new 115 | 116 | 117 | if __name__ == '__main__': 118 | #载入数据 119 | df = load_data() 120 | x1, y1 = split_data(df, 0, 25) 121 | x2, y2 = split_data(df, 25, 60) 122 | x3, y3 = split_data(df, 60, 80) 123 | 124 | def test_data(X_data, y_data): 125 | # 按9:1分裂训练集/测试集 126 | X_train, X_test, y_train, y_test = \ 127 | train_test_split(X_data, y_data, test_size=0.1, random_state=0) 128 | # 使用随机森林 129 | clf = RandomForestRegressor(max_features=None, n_estimators=20, max_depth=None) 130 | # 特征选择 131 | X_train_compressed, X_test_compressed = feature_select(clf, X_train, y_train, X_test) 132 | # 使用提取的特征重新训练 133 | clf.fit(X_train_compressed, y_train) 134 | # 评估训练集效果 135 | evalue(clf, X_train_compressed, y_train) 136 | # 评估测试集效果 137 | evalue(clf, X_test_compressed, y_test) 138 | 139 | test_data(x1, y1) 140 | test_data(x2, y2) 141 | test_data(x3, y3) 142 | -------------------------------------------------------------------------------- /sklearn/bloodpredict.py: -------------------------------------------------------------------------------- 1 | #coding = utf-8 2 | import pickle 3 | import numpy as np 4 | from sklearn import svm 5 | from sklearn import metrics 6 | from sklearn.cross_validation import train_test_split 7 | 8 | 9 | def extract(filename): 10 | X = np.loadtxt(filename, skiprows= 1,delimiter=',', usecols=(3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28)) 11 | y = np.loadtxt(filename, dtype='string', skiprows= 1,delimiter=',', usecols=(1,)) 12 | for i in range(len(y)): 13 | if y[i] == '\xc4\xd0': 14 | y[i] = 1 15 | else: 16 | y[i] = 0 17 | return X,y 18 | 19 | def split_test(X,y): 20 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1) 21 | return X_train, X_test, y_train, y_test 22 | 23 | def save_model(model,name): 24 | pickle.dump(model, open(str(name)+'.pkl', 'w')) 25 | 26 | def load_model(name): 27 | model = pickle.load(open(str(name)+'.pkl')) 28 | return model 29 | 30 | if __name__ == "__main__": 31 | X, y = extract('train.csv') 32 | X_train, X_test, y_train, y_test = split_test(X, y) 33 | clf = svm.SVC(kernel='linear', gamma=0.7, C = 1.0).fit(X_train, y_train) 34 | y_predicted = clf.predict(X_test) 35 | print metrics.classification_report(y_test, y_predicted) 36 | print 37 | print "test_accuracy_score" 38 | print metrics.accuracy_score(y_test, y_predicted) 39 | save_model(clf,'sex') 40 | 41 | X, y =extract('predict.csv') 42 | clf2 = load_model('sex') 43 | y2_predicted = clf2.predict(X) 44 | print "accuracy_score" 45 | print metrics.accuracy_score(y, y2_predicted) 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /sklearn/download.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | FILE=data.csv 4 | URL=http://home.ustc.edu.cn/~xxuan/$FILE 5 | 6 | echo "Downloading data.csv..." 7 | wget $URL -O $FILE 8 | echo "Done." 9 | 10 | FILE=train2.csv 11 | echo "Downloading train2.csv..." 12 | wget $URL -O $FILE 13 | echo "Done." 
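download.sh fetches the two preprocessed CSVs used by the sklearn scripts; note that the download URL has to be rebuilt for each file name (in the script above, URL still points at data.csv when train2.csv is fetched). A rough Python 2 equivalent is sketched below; the train2.csv location is an assumption (same base path as data.csv), so adjust it if the file is hosted elsewhere.

```python
# Hedged sketch of a Python 2 downloader equivalent to download.sh.
# Assumption: train2.csv is published under the same base path as data.csv.
import urllib2

BASE_URL = 'http://home.ustc.edu.cn/~xxuan/'

def fetch(name):
    # Rebuild the full URL for every file instead of reusing a stale one.
    body = urllib2.urlopen(BASE_URL + name).read()
    with open(name, 'wb') as f:
        f.write(body)
    print 'Downloaded', name

if __name__ == '__main__':
    for csv_name in ('data.csv', 'train2.csv'):
        fetch(csv_name)
```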
-------------------------------------------------------------------------------- /sklearn/gender_predict.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | pandas 0.18.1 4 | scikit-learn 0.18.1 5 | matplotlib 1.5.3 6 | numpy 1.11.1 7 | """ 8 | 9 | import pandas as pd 10 | import numpy as np 11 | 12 | from sklearn.model_selection import train_test_split 13 | from sklearn.feature_selection import SelectFromModel 14 | from sklearn.ensemble import AdaBoostClassifier 15 | 16 | class_names_train2 = ['sex', 'age', 'WBC', 'RBC', 'BAS#', 'HGB', 'HCT', 'MCV', 17 | 'MCH', 'MCHC', 'RDW-CV', 'PLT', 'MPV', 'PCT', 'PDW', 'LYM#', 18 | 'LYM%', 'MONO', 'MONO%', 'NEU#', 'NEU%', 'EOS#', 'EOS%', 'BAS%', 19 | 'IG#', 'IG%', 'NRBC#', 'NRBC%', 'P-LCR'] 20 | 21 | 22 | def load_data(): 23 | # 数据集已合并, 去掉了标签行 24 | # sex预处理: 男是1, 女是0 25 | 26 | df = pd.DataFrame(pd.read_csv('train2.csv', names=class_names_train2)) 27 | df = df.convert_objects(convert_numeric=True) 28 | df = df.fillna(df.mean()) 29 | 30 | # 去掉id, 分裂标签 31 | selected_names = [x for x in class_names_train2 if (x != 'sex' and x != 'age')] 32 | X_data = df[selected_names].as_matrix() 33 | y_data = df['sex'].as_matrix().astype(int) 34 | return X_data, y_data 35 | 36 | 37 | def data_preprocess(X_data, y_data): 38 | # 按3:1分裂训练集/测试集 39 | X_train, X_test, y_train, y_test = \ 40 | train_test_split(X_data, y_data, test_size=0.25) 41 | return X_train, X_test, y_train, y_test 42 | 43 | 44 | def evalue(clf, X_test, y_test): 45 | """ 46 | 评估模型在测试集上的性能 47 | :param clf: 模型 48 | :param X_test: 测试集数据 49 | :param y_test: 测试集标记 50 | :return: 51 | """ 52 | pd = clf.predict(X_test) 53 | 54 | correct_pairs = [(x, y) for (x, y) in zip(y_test, pd) if x == y] 55 | precision = float(len(correct_pairs)) / len(pd) 56 | 57 | print '准确率为: ' + str(precision) 58 | 59 | 60 | def feature_select(clf, X_train, y_train, X_test): 61 | # 预训练 62 | clf.fit(X_train, y_train) 63 | 64 | # 评估特征 65 | importances = clf.feature_importances_ 66 | indices = np.argsort(importances)[::-1] 67 | print("特征权值分布为: ") 68 | for f in range(X_train.shape[1]): 69 | print("%d. 
%s %d (%f)" % (f + 1, class_names_train2[indices[f]], indices[f], importances[indices[f]])) 70 | 71 | # 过滤掉权值小于threshold的特征 72 | model = SelectFromModel(clf, threshold=0.04, prefit=True) 73 | X_train_new = model.transform(X_train) 74 | X_test_new = model.transform(X_test) 75 | print '训练集和测试集的容量以及选择的特征数为: ', X_train_new.shape, X_test_new.shape 76 | # 返回压缩特征之后的训练集和测试集 77 | return X_train_new, X_test_new 78 | 79 | 80 | if __name__ == '__main__': 81 | # 载入数据 82 | X_data, y_data = load_data() 83 | X_train, X_test, y_train, y_test = data_preprocess(X_data, y_data) 84 | 85 | # 使用adaboost 86 | clf = clf = AdaBoostClassifier() 87 | # 选择特征, 压缩数据 88 | X_train_compressed, X_test_compressed = feature_select(clf, X_train, y_train, X_test) 89 | 90 | # 使用选择的特征重新训练 91 | clf.fit(X_train_compressed, y_train) 92 | # 评估模型 93 | evalue(clf, X_test_compressed, y_test) 94 | -------------------------------------------------------------------------------- /weixin/README.md: -------------------------------------------------------------------------------- 1 | # 关于微信公众号的开发 2 | ##1.环境配置 3 | ###安装lxml 4 | sudo apt-get install python-lxml 5 | ##2.运行 6 | ###1)将wx.py内token代码换成自己公众号接口配置的token 7 | ###2)开始 8 | sudo python wx.py 80 9 | 10 | 11 | -------------------------------------------------------------------------------- /weixin/reply_text.xml: -------------------------------------------------------------------------------- 1 | $def with (toUser,fromUser,createTime,content) 2 | 3 | 4 | 5 | $createTime 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /weixin/wx.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import sys 3 | sys.path.append("..") 4 | import web 5 | import hashlib 6 | import urllib2 7 | import time 8 | from lxml import etree 9 | from PIL import Image 10 | import BloodTestReportOCR.tf_predict 11 | from BloodTestReportOCR.imageFilter import ImageFilter 12 | import numpy, cv2 13 | import json 14 | urls = ( 15 | '/weixin', 'Weixin' 16 | ) 17 | 18 | token = "galigeigei" 19 | 20 | class Weixin: 21 | def __init__(self): 22 | self.render = web.template.render('./') 23 | 24 | def POST(self): 25 | str_xml = web.data() 26 | xml = etree.fromstring(str_xml) 27 | msgType = xml.find("MsgType").text 28 | fromUser = xml.find("FromUserName").text 29 | toUser = xml.find("ToUserName").text 30 | 31 | res = '请输入图片' 32 | if msgType == 'image': 33 | print('gali') 34 | url = xml.find('PicUrl').text 35 | img = cv2.imdecode(numpy.fromstring(urllib2.urlopen(url).read(), numpy.uint8), cv2.CV_LOAD_IMAGE_UNCHANGED) 36 | data = ImageFilter(image=img).ocr(22) 37 | if data: 38 | data = json.loads(data) 39 | pre = [str(data['bloodtest'][i]['value']) for i in range(22)] 40 | for i in range(22): 41 | if pre[i] == '': pre[i] = 0 42 | else: 43 | tmp = pre[i].replace('.', '', pre[i].count('.')-1) 44 | pre[i] = float(tmp) 45 | 46 | arr = numpy.array(pre) 47 | arr = numpy.reshape(arr, [1, 22]) 48 | 49 | sex, age = tf_predict.predict(arr) 50 | res = 'sex:'+['女','男'][sex] + ' age:'+str(int(age)) 51 | else: 52 | res = '请输入正确图片' 53 | 54 | return self.render.reply_text(fromUser, toUser, int(time.time()), res) 55 | 56 | def GET(self): 57 | data = web.input() 58 | signature = data.signature 59 | timestamp = data.timestamp 60 | nonce = data.nonce 61 | echostr = data.echostr 62 | list = [token, timestamp, nonce] 63 | list.sort() 64 | str = list[0] + list[1] + list[2] 65 | hashcode = hashlib.sha1(str).hexdigest() 66 | if hashcode == signature: return 
echostr 67 | 68 | app = web.application(urls, globals()) 69 | 70 | if __name__ == '__main__': 71 | app.run() 72 | --------------------------------------------------------------------------------
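wx.py verifies each GET request the way the WeChat platform expects: it sorts the token, timestamp and nonce, concatenates them, and compares the SHA-1 digest with the signature query parameter. The minimal sketch below reproduces that computation so the endpoint can be exercised locally; it is not part of wx.py, the timestamp and nonce are arbitrary example values, and the token is assumed to match the one configured above.

```python
# Local test helper for the /weixin GET handler (illustrative sketch, not part of wx.py).
import hashlib

def make_signature(token, timestamp, nonce):
    # WeChat signs the lexicographically sorted concatenation of the three strings.
    parts = [token, timestamp, nonce]
    parts.sort()
    return hashlib.sha1(''.join(parts)).hexdigest()

if __name__ == '__main__':
    sig = make_signature('galigeigei', '1482226800', 'testnonce')
    # A request such as:
    #   /weixin?signature=<sig>&timestamp=1482226800&nonce=testnonce&echostr=hello
    # should then be answered with "hello" if the token matches.
    print 'signature:', sig
```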