├── BloodTestReportOCR
├── README.md
├── bloodtestdata.json
├── caffe_predict.py
├── checkpoint
├── classifier.py
├── config.py
├── digits
├── imageFilter.py
├── imgproc.py
├── lenet.prototxt
├── lenet_iter_800.caffemodel
├── model.ckpt.data-00000-of-00001
├── model.ckpt.index
├── nn_model
│ ├── checkpoint
│ ├── model.ckpt.data-00000-of-00001
│ └── model.ckpt.index
├── origin_pics
│ ├── bloodtestreport1.jpg
│ ├── bloodtestreport2.jpg
│ ├── bloodtestreport3.jpg
│ ├── bloodtestreport4.jpg
│ ├── bloodtestreport5.jpg
│ ├── bloodtestreport6.jpg
│ ├── bloodtestreport7.jpg
│ └── region.jpg
├── pHash.py
├── pd_predict.py
├── rnn_model
│ ├── rnn_age_model
│ │ ├── checkpoint
│ │ ├── model.ckpt.data-00000-of-00001
│ │ ├── model.ckpt.index
│ │ └── model.ckpt.meta
│ └── rnn_sex_model
│ │ ├── checkpoint
│ │ ├── model.ckpt.data-00000-of-00001
│ │ ├── model.ckpt.index
│ │ ├── model.ckpt.meta
│ │ └── rnn_age_model
│ │ ├── checkpoint
│ │ ├── model.ckpt.data-00000-of-00001
│ │ ├── model.ckpt.index
│ │ └── model.ckpt.meta
├── rnn_predict.py
├── static
│ ├── index.html
│ └── index_with_fileinput_plugin.html
├── temp_pics
│ └── README.md
├── tf_predict.py
└── view.py
├── Caffe
├── README.md
├── caffe_sex_train_predict.py
├── config.prototxt
├── draw_net.py
├── lenet_train.prototxt
└── model_prod_prototxt
├── DigitRecogn
├── README.md
├── index.html
├── neural_network_design.py
├── ocr.js
├── ocr.py
└── server.py
├── Keras
├── .gitignore
├── KerasDistinguishAge.py
├── README.md
├── gender_age_predict_cnn.py
├── kerashandwritetest.py
└── train.py
├── LICENSE
├── MxNet
└── README.md
├── PaddlePaddle
├── README.md
├── __init__.py
├── dataprovider.py
├── predict_age.sh
├── predict_sex.sh
├── prediction.py
├── prediction_age.py
├── prediction_sex.py
├── preprocess.py
├── preprocess.sh
├── test.bmp
├── test.list
├── train.list
├── train.sh
├── train_age.sh
├── train_sex.sh
├── trainer_config_age.py
├── trainer_config_sex.py
└── vgg.py
├── README.md
├── Spark
├── BloodTestReportDeepLearning
│ ├── BTR_binary_classification.py
│ ├── BTR_decision_tree.py
│ ├── BTR_gradient_boosting.py
│ ├── BloodTestReportbyLR.py
│ ├── BloodTestReportbyNB.py
│ ├── BloodTestReportbyRF.py
│ ├── BloodTestReportbySVM.py
│ ├── README.md
│ ├── data_set.csv
│ ├── dataformat.py
│ └── spark单机安装15122016.md
├── DigitRecogn_Spark
│ ├── Readme.md
│ ├── index.html
│ ├── ocr.js
│ └── server.py
└── README.md
├── TensorFlow
├── LSTM.py
├── README.md
├── age_predict.py
├── agepredict_v2.0.py
├── predict.csv
├── rnn.py
├── sex_predict.py
└── train.csv
├── Traindata.py
├── dealdata.py
├── matlab
├── nn
│ ├── create_nn.m
│ ├── network_hit139.mat
│ ├── readme.md
│ └── test_nn.m
└── svm_with_pca
│ ├── readme.md
│ └── svm_with_pca.m
├── sklearn
├── .idea
│ ├── bloodpredict.iml
│ ├── misc.xml
│ ├── modules.xml
│ └── workspace.xml
├── README.md
├── age_predict.py
├── bloodpredict.py
├── download.sh
└── gender_predict.py
└── weixin
├── README.md
├── reply_text.xml
└── wx.py
/BloodTestReportOCR/README.md:
--------------------------------------------------------------------------------
1 |
2 | # Blood Test Report OCR
3 |
4 |
5 |
6 | ## Runtime environment
7 |
8 | ```
9 | # install numpy
10 | sudo apt-get install python-numpy # http://www.numpy.org/
11 | # install opencv
12 | sudo apt-get install python-opencv # http://opencv.org/
13 |
14 | ## install OCR and preprocessing dependencies
15 | sudo apt-get install tesseract-ocr
16 | sudo pip install pytesseract
17 | sudo apt-get install python-tk
18 | sudo pip install pillow
19 |
20 | # install the Flask framework and MongoDB
21 | sudo pip install Flask
22 | sudo apt-get install mongodb # if the package is not found, run sudo apt-get update first
23 | sudo service mongodb start
24 | sudo pip install pymongo
25 | ```
26 |
27 | ## Run
28 |
29 | ```
30 | cd BloodTestReportOCR
31 | python view.py # open http://yourip:8080 in a browser to upload images
32 |
33 | ```
34 |
35 | ## view.py
36 |
37 | The web front end uploads an image to the server, which stores it in MongoDB and gets back its oid. The code has been tidied up a little and is meant to move toward a REST-style design, but it is not complete yet;
38 | the front end uses vue.js in an MVVM pattern. There are two versions: index.html, which uses no plugin, and another page that uses the bootstrap-fileinput plugin and still has some issues;
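
A minimal client-side sketch for exercising the /upload endpoint; it assumes the third-party `requests` library and the default host/port from config.py (the form field name `imagefile` matches view.py):

```
import requests

# POST a report photo to the running view.py server
with open('origin_pics/bloodtestreport1.jpg', 'rb') as f:
    r = requests.post('http://localhost:8080/upload', files={'imagefile': f})
print r.json()
```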
39 |
40 | ## imageFilter.py
41 | A thin wrapper around perspective correction, cropping and OCR of the image, so that modules interact through a well-defined interface
42 | ```
43 | imageFilter = ImageFilter() # an image opened with OpenCV can be passed in
44 |
45 | num = 22
46 | print imageFilter.ocr(num)
47 | ```
48 |
49 | #### ocr function - the module's main entry point, returns the recognized data
50 |
51 | Runs OCR on img: it first crops the image and then performs OCR on the crops, returning a JSON object.
52 | If cropping fails, it returns None.
53 | @num number of items to crop
54 |
55 | #### perspect function - initial rectification of the image
56 |
57 | Applies a perspective transform to the image; it caches the transformed OpenCV numpy matrix and returns it.
58 | If the transform fails, it returns None and prints that the image is not a report.
59 | @param perspective parameters
60 |
61 | * About param
62 |
63 | The parameter has the form [p1, p2, p3, p4, p5].
64 | p1 through p5 are all integers, and p1 must be odd.
65 |
66 | p1 is the Gaussian blur parameter, p2 and p3 are the high and low thresholds for Canny edge detection, and p4 and p5 are multipliers used in the line filtering.
67 |
68 | When the lab report sheet lies on a table, some edges may curl up slightly and cast a clearly visible shadow; such shadow lines can be detected and cause the localization to fail.
69 | The fix is to adjust p2 and p3 so the shadow lines are filtered out, but setting p2 and p3 too high also filters out the black reference lines in other images.
70 | Choosing the parameters is therefore a trade-off.
71 | In getinfo.default I set a relatively low threshold, p2=70, p3=30, which does not suppress shadow lines.
72 | Changing it to p2=70, p3=50 does suppress them, but makes other images harder to recognize.
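
A minimal sketch, assuming OpenCV's Python bindings, of how parameters of this shape are typically applied (this is only an illustration, not the actual imageFilter.py code; the p4/p5 values below are placeholders):

```
import cv2

def edge_map(img, param=(3, 70, 30, 4, 6)):
    p1, p2, p3, p4, p5 = param              # p4/p5 (filtering multipliers) unused in this sketch
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    blurred = cv2.GaussianBlur(gray, (p1, p1), 0)   # p1 must be odd
    return cv2.Canny(blurred, p3, p2)       # cv2.Canny takes (low, high) = (p3, p2)
```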
73 |
74 | For now, there are three main prerequisites for getting good results:
75 | - keep the report sheet as flat as possible
76 | - the photo should contain all three black reference lines
77 | - try not to include the edges of the report sheet in the photo; if edges are visible, avoid those with shadows.
78 |
79 | #### filter function - filters out unqualified or non-report images
80 |
81 | Returns the perspective-corrected img as a PIL Image object; if a PerspectivImg is already cached it is used directly, otherwise the perspective transform is performed first.
82 | Returns None if filtering fails.
83 | @param filter parameters
84 |
85 |
86 | #### autocut function - crops the gender, age, date and each test-item value out of the image
87 |
88 | Crops the img member of ImageFilter; the temporary crop images are saved to out_path.
89 | Returns -1 if cropping fails and 0 on success.
90 | @num number of items to crop
91 | @param cropping parameters
92 |
93 | The cropped images are placed under the BloodTestReportOCR/temp_pics/ folder.
94 |
95 | The function outputs a series of images data0.jpg, data1.jpg, ..., each containing the numeric value of one item such as the white blood cell count or the neutrophil count; see the usage sketch below.
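
A hypothetical usage sketch based on the description above (the exact autocut signature and default cropping parameters are defined in imageFilter.py):

```
import cv2
from imageFilter import ImageFilter

imageFilter = ImageFilter(image=cv2.imread('origin_pics/bloodtestreport1.jpg'))
if imageFilter.autocut(22) == 0:        # 0 on success, -1 on failure
    # one cropped value image per test item
    crops = [cv2.imread('temp_pics/data%d.jpg' % i) for i in range(22)]
```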
96 |
97 | #### classifier.py
98 |
99 | Used to verify that the cropped and rectified image really is a report, and to classify each cropped item-name image into its stored item number
100 |
101 | #### imgproc.py
102 | Preprocesses the images before recognition (binarization and related operations) to improve the recognition rate,
103 | including separate handling for Chinese characters and digits
104 |
105 | #### digits
106 | Replace the digits file in Tesseract-OCR\tessdata\configs with this file; it whitelists the digit characters used by the OCR (see the sketch below).
107 |
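A sketch of how such a whitelist is typically applied through pytesseract (the project's real OCR call lives in imageFilter.py; the file below is only an example crop):

```
import pytesseract
from PIL import Image

# 'digits' refers to the config file placed in Tesseract-OCR/tessdata/configs
value = pytesseract.image_to_string(Image.open('temp_pics/data0.jpg'), config='digits')
print value
```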
--------------------------------------------------------------------------------
/BloodTestReportOCR/bloodtestdata.json:
--------------------------------------------------------------------------------
1 | {
2 | "_id": "bbca5d6a-2156-41c4-89da-0329e8c99a4f",
3 | "originPicture": "bbca5d6a-2156-41c4-89da-0329e8c99a4f",
4 | "date": "2016-09-21",
5 | "profile": {
6 | "gender": "Man",
7 | "age": 30
8 | },
9 | "bloodtest": [
10 | {"name": "白细胞记数", "alias": "WBC", "value": 0, "range": "4-10", "unit": "10E9/L"},
11 | {"name": "中性粒细胞计数", "alias": "GRA", "value": 0, "range": "1.8-6.4", "unit": "10E9/L"},
12 | {"name": "淋巴细胞计数", "alias": "LYM", "value": 0, "range": "1-3.3", "unit": "10E9/L"},
13 | {"name": "单核细胞计数", "alias": "MONO","value": 0, "range": "0.2~1", "unit": "10E9/L"},
14 | {"name": "嗜酸性粒细胞记数","alias": "EO", "value": 0, "range": "0-0.5", "unit": "10E9/L"},
15 | {"name": "嗜碱性粒细胞记数","alias": "BASO","value": 0, "range": "0.02-0.1", "unit": "%" },
16 | {"name": "中性粒细胞百分比","alias": "GRA%","value": 0, "range": "40-75", "unit": "%" },
17 | {"name": "淋巴细胞百分比", "alias": "LYM%","value": 0, "range": "18-40", "unit": "%" },
18 | {"name": "单核细胞百分比", "alias": "MONO%","value": 0,"range": "3.5-10", "unit": "%" },
19 | {"name": "嗜酸性粒细胞百分比","alias": "EO%", "value": 0, "range": "0-0.5", "unit": "%" },
20 | {"name": "嗜碱性粒细胞百分比","alias": "BASO%","value": 0, "range": "0-1.5", "unit": "%" },
21 | {"name": "红细胞记数", "alias": "RBC", "value": 0, "range": "4-5.5", "unit": "10E12/L"},
22 | {"name": "血红蛋白", "alias": "HGB", "value": 0, "range": "120-160", "unit": "g/L" },
23 | {"name": "红细胞压积", "alias": "HCT", "value": 0, "range": "42-49", "unit": "L/L" },
24 | {"name": "红细胞平均体积", "alias": "MCV", "value": 0, "range": "82-95", "unit": "fL" },
25 | {"name": "平均血红蛋白", "alias": "MCH", "value": 0, "range": "27-33", "unit": "pg" },
26 | {"name": "平均血红蛋白浓度","alias": "MCHC","value": 0, "range": "320-360", "unit": "g/L" },
27 | {"name": "红细胞分布宽度", "alias": "RDW%","value": 0, "range": "10.6-15", "unit": "%" },
28 | {"name": "血小板记数", "alias": "PLT", "value": 0, "range": "100-300", "unit": "10E9/L"},
29 | {"name": "血小板压积", "alias": "PCT", "value": 0, "range": "0.11-0.28","unit": "L/L" },
30 | {"name": "血小板分布宽度", "alias": "PDW%","value": 0, "range": "15.1-18.1","unit": "%" },
31 | {"name": "平均血小板体积", "alias": "MPV", "value": 0, "range": "6-14", "unit": "fL" }
32 | ]
33 | }
--------------------------------------------------------------------------------
/BloodTestReportOCR/caffe_predict.py:
--------------------------------------------------------------------------------
1 | # -*- coding: UTF-8 -*-
2 | import numpy as np
3 | import pdb
4 | import sys,os
5 | import caffe
6 |
7 |
8 | def predict():
9 |
10 | # set the current working directory
11 | root = '/home/liucan/ocr/'
12 | # optionally add caffe/python to the environment as well
13 | #sys.path.insert(0, '/home/gzr/caffe/python')
14 | #os.chdir('/home/gzr/caffe') # change working directory
15 | # network definition
16 | net_file='./lenet.prototxt'
17 | # trained weights
18 | caffe_model='./lenet_iter_800.caffemodel'
19 | # mean file
20 | #mean_file= '/home/liucan/ocr/mean.npy'
21 |
22 | # the image handling below is generic for any program
23 | # build a Net from the two files defined above
24 | net = caffe.Net(net_file,caffe_model,caffe.TEST)
25 | # get the shape of the data blob; images are loaded through matplotlib by default
26 | transformer = caffe.io.Transformer({'data': net.blobs['data'].data.shape})
27 | # matplotlib loads images with pixels in [0-1], layout [height, width, channels], RGB
28 | # caffe expects pixels in [0-255], layout [channels, height, width], BGR, so a conversion is needed
29 |
30 | #pdb.set_trace()
31 |
32 | # move the channel dimension to the front
33 | transformer.set_transpose('data',(2, 0, 1))
34 | #transformer.set_mean('data', np.load(mean_file).mean(1).mean(1))
35 | # scale pixel values up to [0-255]
36 | transformer.set_raw_scale('data', 255)
37 | # RGB --> BGR conversion
38 | #transformer.set_channel_swap('data',(2, 1, 0))
39 |
40 | # load the image
41 | im=caffe.io.load_image(root+'img/p9.jpg', color=True)
42 | #grayim = im[:,:,0]
43 | #im = np.reshape(grayim,(170,37,3))
44 |
45 | # preprocess the loaded image with the transformer defined above
46 | net.blobs['data'].data[...] = transformer.preprocess('data',im)
47 | # run the forward pass
48 | out = net.forward()
49 | # final result: the class probabilities for this image (as a list)
50 | output_prob = net.blobs['prob'].data[0]
51 | # take the class with the highest probability
52 | print 'predicted class is:', output_prob.argmax()
53 | return output_prob.argmax()
54 |
55 | if __name__=='__main__':
56 | predict()
57 |
58 |
--------------------------------------------------------------------------------
/BloodTestReportOCR/checkpoint:
--------------------------------------------------------------------------------
1 | model_checkpoint_path: "model.ckpt"
2 | all_model_checkpoint_paths: "model.ckpt"
3 |
--------------------------------------------------------------------------------
/BloodTestReportOCR/classifier.py:
--------------------------------------------------------------------------------
1 | # -*- coding: UTF-8 -*-
2 | import pHash
3 | from PIL import Image
4 | import os
5 | # Determine whether the input (a rectified report image) is a blood test report
6 | def isReport(img):
7 | # add your code here
8 | image = Image.open(os.getcwd() + '/origin_pics/region.jpg')
9 | rate=pHash.classify_DCT(image,img)/64.0
10 |
11 | if(rate>0.6):
12 | return True
13 | else:
14 | return False
15 |
16 | # Get the classification number of a test item from its cropped item-name image; note this is our stored index, not the number printed on the report
17 | num = 0
18 | def getItemNum(img):
19 | # replace your code
20 | global num
21 | if num >= 22:
22 | num = 0
23 | ret = num
24 | num = num + 1
25 | return ret
26 |
27 | # unit test
28 | if __name__ == '__main__':
29 | import classifier
30 |
31 | img = []
32 | if classifier.isReport(img) :
33 | print 'classifier.isReport(img) is True'
34 | for i in range(33):
35 | print classifier.getItemNum(img)
36 |
--------------------------------------------------------------------------------
/BloodTestReportOCR/config.py:
--------------------------------------------------------------------------------
1 |
2 | ALLOWED_EXTENSIONS = set(['png', 'jpeg', 'jpg'])
3 |
4 | DB_HOST = 'localhost'
5 | DB_PORT = 27017
6 |
7 | SERVER_HOST = '0.0.0.0'
8 | SERVER_PORT = 8080
9 |
10 | DEBUG=True
11 |
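# Note: view.py's predict() expects MODEL to be 'rnn', 'tf' or 'pd'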
12 | MODEL=0
13 |
--------------------------------------------------------------------------------
/BloodTestReportOCR/digits:
--------------------------------------------------------------------------------
1 | tessedit_char_whitelist 0123456789.-.
2 |
--------------------------------------------------------------------------------
/BloodTestReportOCR/imgproc.py:
--------------------------------------------------------------------------------
1 | # -*- coding: UTF-8 -*-
2 | import cv2
3 |
4 | def digitsimg(src):
5 |
6 | # convert to grayscale
7 | img_gray = cv2.cvtColor(src,cv2.COLOR_BGR2GRAY)
8 |
9 | # binarize with Otsu thresholding
10 | ret,result= cv2.threshold(img_gray,0,255,cv2.THRESH_BINARY+cv2.THRESH_OTSU)
11 |
12 | # erode to remove small speckles
13 | kernel = cv2.getStructuringElement(cv2.MORPH_CROSS,(3,2))
14 | eroded = cv2.erode(result,kernel)
15 |
16 | # enlarge the result to make recognition easier
17 | result = cv2.resize(result,(128,128),interpolation=cv2.INTER_CUBIC)
18 |
19 | # cv2.imshow('result',result)
20 | # cv2.waitKey(0)
21 |
22 | # erode again to remove small speckles introduced by the enlargement
23 | eroded = cv2.erode(result,kernel)
24 | # cv2.imshow('eroded',eroded)
25 | # cv2.waitKey(0)
26 | # dilate to make the digits fuller
27 | result = cv2.dilate(eroded,kernel)
28 | # cv2.imshow('dilated',result)
29 |
30 | # histogram equalization to sharpen the image
31 | result = cv2.equalizeHist(result) # equalizeHist returns a new image; the original call discarded it
32 | # median blur to remove noise
33 | result = cv2.medianBlur(result,5)
34 | # cv2.imshow('median',result)
35 | # cv2.waitKey(0)
36 | return result
37 | '''
38 | def chineseimg(src):
39 |
40 |
41 |
42 | #灰度化
43 | img_gray = cv2.cvtColor(src,cv2.COLOR_BGR2GRAY)
44 |
45 |
46 | #Otsu thresholding 二值化
47 | ret,result= cv2.threshold(img_gray,0,255,cv2.THRESH_BINARY+cv2.THRESH_OTSU)
48 | # cv2.imshow('otsu',result)
49 | # cv2.waitKey(0)
50 |
51 |
52 | #直方图均衡化使图像更清晰
53 | cv2.equalizeHist(result)
54 | # cv2.imshow('直方图',result)
55 | # cv2.waitKey(0)
56 | return result
57 |
58 | #将结果放大便于识别
59 | result = cv2.resize(result,(256,128),interpolation=cv2.INTER_CUBIC)
60 |
61 | #腐蚀去除放大后的一些小的点
62 | kernel = cv2.getStructuringElement(cv2.MORPH_CROSS,(3,2))
63 | eroded = cv2.erode(result,kernel)
64 | cv2.imshow('eroded',eroded)
65 | cv2.waitKey(0)
66 |
67 | #膨胀使数字更饱满
68 | result = cv2.dilate(eroded,kernel)
69 | cv2.imshow('dilated',result)
70 | cv2.waitKey(0)
71 |
72 | #直方图均衡化使图像更清晰
73 | cv2.equalizeHist(result)
74 | #中值滤波去除噪点
75 | result = cv2.medianBlur(result,5)
76 | cv2.imshow('median',result)
77 | cv2.waitKey(0)'''
78 |
79 |
80 |
--------------------------------------------------------------------------------
/BloodTestReportOCR/lenet.prototxt:
--------------------------------------------------------------------------------
1 | name: "LeNet"
2 | layer {
3 | name: "data"
4 | type: "Input"
5 | top: "data"
6 | input_param { shape: { dim: 4 dim: 3 dim: 37 dim: 170 } }
7 | }
8 | layer {
9 | name: "conv1"
10 | type: "Convolution"
11 | bottom: "data"
12 | top: "conv1"
13 | param {
14 | lr_mult: 1
15 | }
16 | param {
17 | lr_mult: 2
18 | }
19 | convolution_param {
20 | num_output: 20
21 | kernel_size: 5
22 | stride: 1
23 | weight_filler {
24 | type: "xavier"
25 | }
26 | bias_filler {
27 | type: "constant"
28 | }
29 | }
30 | }
31 | layer {
32 | name: "pool1"
33 | type: "Pooling"
34 | bottom: "conv1"
35 | top: "pool1"
36 | pooling_param {
37 | pool: MAX
38 | kernel_size: 2
39 | stride: 2
40 | }
41 | }
42 | layer {
43 | name: "conv2"
44 | type: "Convolution"
45 | bottom: "pool1"
46 | top: "conv2"
47 | param {
48 | lr_mult: 1
49 | }
50 | param {
51 | lr_mult: 2
52 | }
53 | convolution_param {
54 | num_output: 50
55 | kernel_size: 5
56 | stride: 1
57 | weight_filler {
58 | type: "xavier"
59 | }
60 | bias_filler {
61 | type: "constant"
62 | }
63 | }
64 | }
65 | layer {
66 | name: "pool2"
67 | type: "Pooling"
68 | bottom: "conv2"
69 | top: "pool2"
70 | pooling_param {
71 | pool: MAX
72 | kernel_size: 2
73 | stride: 2
74 | }
75 | }
76 | layer {
77 | name: "ip1"
78 | type: "InnerProduct"
79 | bottom: "pool2"
80 | top: "ip1"
81 | param {
82 | lr_mult: 1
83 | }
84 | param {
85 | lr_mult: 2
86 | }
87 | inner_product_param {
88 | num_output: 500
89 | weight_filler {
90 | type: "xavier"
91 | }
92 | bias_filler {
93 | type: "constant"
94 | }
95 | }
96 | }
97 | layer {
98 | name: "relu1"
99 | type: "ReLU"
100 | bottom: "ip1"
101 | top: "ip1"
102 | }
103 | layer {
104 | name: "ip2"
105 | type: "InnerProduct"
106 | bottom: "ip1"
107 | top: "ip2"
108 | param {
109 | lr_mult: 1
110 | }
111 | param {
112 | lr_mult: 2
113 | }
114 | inner_product_param {
115 | num_output: 22
116 | weight_filler {
117 | type: "xavier"
118 | }
119 | bias_filler {
120 | type: "constant"
121 | }
122 | }
123 | }
124 | layer {
125 | name: "prob"
126 | type: "Softmax"
127 | bottom: "ip2"
128 | top: "prob"
129 | }
130 |
--------------------------------------------------------------------------------
/BloodTestReportOCR/lenet_iter_800.caffemodel:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csxiaoyaojianxian/BloodTestReportOCR/95d058e4999806fa50bbcf6d10fe8a0af5746759/BloodTestReportOCR/lenet_iter_800.caffemodel
--------------------------------------------------------------------------------
/BloodTestReportOCR/model.ckpt.data-00000-of-00001:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csxiaoyaojianxian/BloodTestReportOCR/95d058e4999806fa50bbcf6d10fe8a0af5746759/BloodTestReportOCR/model.ckpt.data-00000-of-00001
--------------------------------------------------------------------------------
/BloodTestReportOCR/model.ckpt.index:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csxiaoyaojianxian/BloodTestReportOCR/95d058e4999806fa50bbcf6d10fe8a0af5746759/BloodTestReportOCR/model.ckpt.index
--------------------------------------------------------------------------------
/BloodTestReportOCR/nn_model/checkpoint:
--------------------------------------------------------------------------------
1 | model_checkpoint_path: "model.ckpt"
2 | all_model_checkpoint_paths: "model.ckpt"
3 |
--------------------------------------------------------------------------------
/BloodTestReportOCR/nn_model/model.ckpt.data-00000-of-00001:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csxiaoyaojianxian/BloodTestReportOCR/95d058e4999806fa50bbcf6d10fe8a0af5746759/BloodTestReportOCR/nn_model/model.ckpt.data-00000-of-00001
--------------------------------------------------------------------------------
/BloodTestReportOCR/nn_model/model.ckpt.index:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csxiaoyaojianxian/BloodTestReportOCR/95d058e4999806fa50bbcf6d10fe8a0af5746759/BloodTestReportOCR/nn_model/model.ckpt.index
--------------------------------------------------------------------------------
/BloodTestReportOCR/origin_pics/bloodtestreport1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csxiaoyaojianxian/BloodTestReportOCR/95d058e4999806fa50bbcf6d10fe8a0af5746759/BloodTestReportOCR/origin_pics/bloodtestreport1.jpg
--------------------------------------------------------------------------------
/BloodTestReportOCR/origin_pics/bloodtestreport2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csxiaoyaojianxian/BloodTestReportOCR/95d058e4999806fa50bbcf6d10fe8a0af5746759/BloodTestReportOCR/origin_pics/bloodtestreport2.jpg
--------------------------------------------------------------------------------
/BloodTestReportOCR/origin_pics/bloodtestreport3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csxiaoyaojianxian/BloodTestReportOCR/95d058e4999806fa50bbcf6d10fe8a0af5746759/BloodTestReportOCR/origin_pics/bloodtestreport3.jpg
--------------------------------------------------------------------------------
/BloodTestReportOCR/origin_pics/bloodtestreport4.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csxiaoyaojianxian/BloodTestReportOCR/95d058e4999806fa50bbcf6d10fe8a0af5746759/BloodTestReportOCR/origin_pics/bloodtestreport4.jpg
--------------------------------------------------------------------------------
/BloodTestReportOCR/origin_pics/bloodtestreport5.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csxiaoyaojianxian/BloodTestReportOCR/95d058e4999806fa50bbcf6d10fe8a0af5746759/BloodTestReportOCR/origin_pics/bloodtestreport5.jpg
--------------------------------------------------------------------------------
/BloodTestReportOCR/origin_pics/bloodtestreport6.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csxiaoyaojianxian/BloodTestReportOCR/95d058e4999806fa50bbcf6d10fe8a0af5746759/BloodTestReportOCR/origin_pics/bloodtestreport6.jpg
--------------------------------------------------------------------------------
/BloodTestReportOCR/origin_pics/bloodtestreport7.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csxiaoyaojianxian/BloodTestReportOCR/95d058e4999806fa50bbcf6d10fe8a0af5746759/BloodTestReportOCR/origin_pics/bloodtestreport7.jpg
--------------------------------------------------------------------------------
/BloodTestReportOCR/origin_pics/region.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csxiaoyaojianxian/BloodTestReportOCR/95d058e4999806fa50bbcf6d10fe8a0af5746759/BloodTestReportOCR/origin_pics/region.jpg
--------------------------------------------------------------------------------
/BloodTestReportOCR/pHash.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python2
2 | # -*- coding: utf-8 -*-
3 | """
4 | Created on Fri Dec 2 20:03:39 2016
5 |
6 | @author: zhao
7 | """
8 | import numpy as np
9 | from PIL import Image
10 | from PIL import ImageFilter
11 | from PIL import ImageOps
12 | import math
13 | # build the binary hash code (one bit per DCT coefficient)
14 | def get_code(List,middle):
15 |
16 | result = []
17 | for index in range(0,len(List)):
18 | if List[index] > middle:
19 | result.append("1")
20 | else:
21 | result.append("0")
22 | return result
23 |
24 |
25 | # compare two hash codes (count matching bits)
26 | def comp_code(code1,code2):
27 | num = 0
28 | for index in range(0,len(code1)):
29 | if str(code1[index]) == str(code2[index]):
30 | num+=1
31 | return num
32 |
33 | # get the (approximate) median value, used as the threshold
34 | def get_middle(List):
35 | li = List[:]
36 | li.sort()
37 | value = 0
38 | if len(li)%2==0:
39 | index = int((len(li)/2)) - 1
40 |
41 | value = li[index]
42 | else:
43 | index = int((len(li)/2))
44 | value = (li[index]+li[index-1])/2
45 | return value
46 |
47 | # get the pixel matrix
48 | def get_matrix(image):
49 |
50 | matrix = []
51 | size = image.size
52 | for height in range(0,size[1]):
53 | pixel = []
54 | for width in range(0,size[0]):
55 | pixel_value = image.getpixel((width,height))
56 | pixel.append(pixel_value)
57 | matrix.append(pixel)
58 |
59 | return matrix
60 |
61 | # build the DCT coefficient matrix [A]
62 | def get_coefficient(n):
63 | matrix = []
64 | PI = math.pi
65 | sqr = math.sqrt(1.0/n) # float division; 1/n would be 0 under Python 2 integer division
66 | value = []
67 | for i in range(0,n):
68 | value.append(sqr)
69 | matrix.append(value)
70 |
71 | for i in range(1,n):
72 | value=[]
73 | for j in range (0,n):
74 | data = math.sqrt(2.0/n) * math.cos(i*PI*(j+0.5)/n);
75 | value.append(data)
76 | matrix.append(value)
77 |
78 | return matrix
79 |
80 | # transpose
81 | def get_transposing(matrix):
82 | new_matrix = []
83 |
84 | for i in range(0,len(matrix)):
85 | value = []
86 | for j in range(0,len(matrix[i])):
87 | value.append(matrix[j][i])
88 | new_matrix.append(value)
89 |
90 | return new_matrix
91 | # matrix multiplication
92 | def get_mult(matrix1,matrix2):
93 | new_matrix = []
94 |
95 | for i in range(0,len(matrix1)):
96 | value_list = []
97 | for j in range(0,len(matrix1)):
98 | t = 0.0
99 | for k in range(0,len(matrix1)):
100 | t += matrix1[i][k] * matrix2[k][j]
101 | value_list.append(t)
102 | new_matrix.append(value_list)
103 |
104 | return new_matrix
105 |
106 | # compute the DCT
107 | def DCT(double_matrix):
108 | n = len(double_matrix)
109 | A = get_coefficient(n)
110 | AT = get_transposing(A)
111 |
112 | temp = get_mult(double_matrix, A)
113 | DCT_matrix = get_mult(temp, AT)
114 |
115 | return DCT_matrix
116 |
117 | # keep only the top-left (low-frequency) part of the DCT matrix
118 | def sub_matrix_to_list(DCT_matrix,part_size):
119 | w,h = part_size
120 | List = []
121 | for i in range(0,h):
122 | for j in range(0,w):
123 | List.append(DCT_matrix[i][j])
124 | return List
125 |
126 |
127 |
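# Compare two images via a DCT-based perceptual hash: returns the number of
# matching hash bits, out of part_size[0]*part_size[1] (64 by default).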
128 | def classify_DCT(image1,image2,size=(32,32),part_size=(8,8)):
129 |
130 | assert size[0]==size[1],"size error"
131 | assert part_size[0]==part_size[1],"part_size error"
132 |
133 | image1 = image1.resize(size).convert('L').filter(ImageFilter.BLUR)
134 | image1 = ImageOps.equalize(image1)
135 | matrix = get_matrix(image1)
136 | DCT_matrix = DCT(matrix)
137 | List = sub_matrix_to_list(DCT_matrix, part_size)
138 | middle = get_middle(List)
139 | code1 = get_code(List, middle)
140 |
141 |
142 | image2 = image2.resize(size).convert('L').filter(ImageFilter.BLUR)
143 | image2 = ImageOps.equalize(image2)
144 | matrix = get_matrix(image2)
145 | DCT_matrix = DCT(matrix)
146 | List = sub_matrix_to_list(DCT_matrix, part_size)
147 | middle = get_middle(List)
148 | code2 = get_code(List, middle)
149 |
150 | return comp_code(code1, code2)
151 |
152 |
153 |
154 |
--------------------------------------------------------------------------------
/BloodTestReportOCR/pd_predict.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from py_paddle import swig_paddle
3 | import sys
4 | sys.path.append("..")
5 | from PaddlePaddle import prediction_sex,prediction_age
6 | def predict(arr):
7 | swig_paddle.initPaddle("--use_gpu=0")
8 | data = [arr.tolist()]
9 | # pad the feature vector with 4 zeros
10 | for i in range(4):
11 | data[0][0].append(0)
12 | sex = prediction_sex.predict(data)
13 | age = prediction_age.predict(data)
14 | return sex,age
15 |
--------------------------------------------------------------------------------
/BloodTestReportOCR/rnn_model/rnn_age_model/checkpoint:
--------------------------------------------------------------------------------
1 | model_checkpoint_path: "model.ckpt"
2 | all_model_checkpoint_paths: "model.ckpt"
3 |
--------------------------------------------------------------------------------
/BloodTestReportOCR/rnn_model/rnn_age_model/model.ckpt.data-00000-of-00001:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csxiaoyaojianxian/BloodTestReportOCR/95d058e4999806fa50bbcf6d10fe8a0af5746759/BloodTestReportOCR/rnn_model/rnn_age_model/model.ckpt.data-00000-of-00001
--------------------------------------------------------------------------------
/BloodTestReportOCR/rnn_model/rnn_age_model/model.ckpt.index:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csxiaoyaojianxian/BloodTestReportOCR/95d058e4999806fa50bbcf6d10fe8a0af5746759/BloodTestReportOCR/rnn_model/rnn_age_model/model.ckpt.index
--------------------------------------------------------------------------------
/BloodTestReportOCR/rnn_model/rnn_age_model/model.ckpt.meta:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csxiaoyaojianxian/BloodTestReportOCR/95d058e4999806fa50bbcf6d10fe8a0af5746759/BloodTestReportOCR/rnn_model/rnn_age_model/model.ckpt.meta
--------------------------------------------------------------------------------
/BloodTestReportOCR/rnn_model/rnn_sex_model/checkpoint:
--------------------------------------------------------------------------------
1 | model_checkpoint_path: "model.ckpt"
2 | all_model_checkpoint_paths: "model.ckpt"
3 |
--------------------------------------------------------------------------------
/BloodTestReportOCR/rnn_model/rnn_sex_model/model.ckpt.data-00000-of-00001:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csxiaoyaojianxian/BloodTestReportOCR/95d058e4999806fa50bbcf6d10fe8a0af5746759/BloodTestReportOCR/rnn_model/rnn_sex_model/model.ckpt.data-00000-of-00001
--------------------------------------------------------------------------------
/BloodTestReportOCR/rnn_model/rnn_sex_model/model.ckpt.index:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csxiaoyaojianxian/BloodTestReportOCR/95d058e4999806fa50bbcf6d10fe8a0af5746759/BloodTestReportOCR/rnn_model/rnn_sex_model/model.ckpt.index
--------------------------------------------------------------------------------
/BloodTestReportOCR/rnn_model/rnn_sex_model/model.ckpt.meta:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csxiaoyaojianxian/BloodTestReportOCR/95d058e4999806fa50bbcf6d10fe8a0af5746759/BloodTestReportOCR/rnn_model/rnn_sex_model/model.ckpt.meta
--------------------------------------------------------------------------------
/BloodTestReportOCR/rnn_model/rnn_sex_model/rnn_age_model/checkpoint:
--------------------------------------------------------------------------------
1 | model_checkpoint_path: "model.ckpt"
2 | all_model_checkpoint_paths: "model.ckpt"
3 |
--------------------------------------------------------------------------------
/BloodTestReportOCR/rnn_model/rnn_sex_model/rnn_age_model/model.ckpt.data-00000-of-00001:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csxiaoyaojianxian/BloodTestReportOCR/95d058e4999806fa50bbcf6d10fe8a0af5746759/BloodTestReportOCR/rnn_model/rnn_sex_model/rnn_age_model/model.ckpt.data-00000-of-00001
--------------------------------------------------------------------------------
/BloodTestReportOCR/rnn_model/rnn_sex_model/rnn_age_model/model.ckpt.index:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csxiaoyaojianxian/BloodTestReportOCR/95d058e4999806fa50bbcf6d10fe8a0af5746759/BloodTestReportOCR/rnn_model/rnn_sex_model/rnn_age_model/model.ckpt.index
--------------------------------------------------------------------------------
/BloodTestReportOCR/rnn_model/rnn_sex_model/rnn_age_model/model.ckpt.meta:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csxiaoyaojianxian/BloodTestReportOCR/95d058e4999806fa50bbcf6d10fe8a0af5746759/BloodTestReportOCR/rnn_model/rnn_sex_model/rnn_age_model/model.ckpt.meta
--------------------------------------------------------------------------------
/BloodTestReportOCR/rnn_predict.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import numpy as np
4 | import tensorflow as tf
5 | from tensorflow.python.ops import rnn, rnn_cell
6 |
7 | def predict_sex(data_predict):
8 | tf.reset_default_graph()
9 |
10 | # Network Parameters
11 | n_input = 11 # features per time step (the 22 blood-test values are split into 2 steps of 11)
12 | n_steps = 2 # timesteps
13 | n_hidden = 128 # hidden layer num of features
14 | n_classes = 2 # two output classes: male / female
15 |
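# reshape the flat 22-value feature vector into (batch=1, n_steps=2, n_input=11)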
16 | data_predict = np.reshape(data_predict, (1,n_steps, n_input))
17 |
18 |
19 |
20 |
21 | # tf Graph input
22 | x = tf.placeholder("float", [None, n_steps, n_input])
23 | y = tf.placeholder("float", [None, n_classes])
24 |
25 | # Define weights
26 | weights = {
27 | 'out': tf.Variable(tf.random_normal([n_hidden, n_classes]))
28 | }
29 | biases = {
30 | 'out': tf.Variable(tf.random_normal([n_classes]))
31 | }
32 |
33 | def RNN(x, weights, biases):
34 |
35 | # Prepare data shape to match `rnn` function requirements
36 | # Current data input shape: (batch_size, n_steps, n_input)
37 | # Required shape: 'n_steps' tensors list of shape (batch_size, n_input)
38 |
39 |
40 | # Permuting batch_size and n_steps
41 | x = tf.transpose(x, [1, 0, 2])
42 | #to : (n_steps, batch_size, n_input)
43 |
44 |
45 | # Dimensionality reduction
46 | x = tf.reshape(x, [-1, n_input])
47 | # Reshaping to (n_steps*batch_size, n_input)
48 |
49 | # Split to get a list of 'n_steps' tensors of shape (batch_size, n_input)
50 | x = tf.split(0, n_steps, x)
51 |
52 | # Define a lstm cell with tensorflow
53 | lstm_cell = rnn_cell.BasicLSTMCell(n_hidden, forget_bias=1.0)
54 |
55 | # Get lstm cell output
56 | outputs, states = rnn.rnn(lstm_cell, x, dtype=tf.float32)
57 |
58 | # Linear activation, using rnn inner loop last output
59 | return tf.matmul(outputs[-1], weights['out']) + biases['out']
60 |
61 | pred = RNN(x, weights, biases)
62 |
63 | # Initializing the variables
64 | init = tf.global_variables_initializer()
65 |
66 | ######
67 | saver = tf.train.Saver()
68 |
69 | # Launch the graph
70 | with tf.Session() as sess:
71 | sess.run(init)
72 | saver.restore(sess,"./rnn_model/rnn_sex_model/model.ckpt")
73 | p = sess.run(pred, feed_dict={x:data_predict})
74 |
75 |
76 |
77 | if p[0][0] > p[0][1]:
78 | sex_result = 0
79 | else:
80 | sex_result = 1
81 |
82 |
83 | return sex_result
84 |
85 |
86 |
87 | def predict_age(data_predict):
88 | tf.reset_default_graph()
89 |
90 | # Network Parameters
91 | n_input = 11 # features per time step (the 22 blood-test values are split into 2 steps of 11)
92 | n_steps = 2 # timesteps
93 | n_hidden = 128 # hidden layer num of features
94 | n_classes = 10 # ten age buckets of 10 years each (0-9, 10-19, ..., 90-99)
95 |
96 | data_predict = np.reshape(data_predict, (1,n_steps, n_input))
97 |
98 |
99 |
100 |
101 | # tf Graph input
102 | x = tf.placeholder("float", [None, n_steps, n_input])
103 | y = tf.placeholder("float", [None, n_classes])
104 |
105 | # Define weights
106 | weights = {
107 | 'out': tf.Variable(tf.random_normal([n_hidden, n_classes]))
108 | }
109 | biases = {
110 | 'out': tf.Variable(tf.random_normal([n_classes]))
111 | }
112 |
113 | def RNN(x, weights, biases):
114 |
115 | # Prepare data shape to match `rnn` function requirements
116 | # Current data input shape: (batch_size, n_steps, n_input)
117 | # Required shape: 'n_steps' tensors list of shape (batch_size, n_input)
118 |
119 |
120 | # Permuting batch_size and n_steps
121 |
122 | x = tf.transpose(x, [1, 0, 2])
123 | # Reshaping to (n_steps*batch_size, n_input)
124 | x = tf.reshape(x, [-1, n_input])
125 | # Split to get a list of 'n_steps' tensors of shape (batch_size, n_input)
126 | x = tf.split(0, n_steps, x)
127 |
128 | # Define a lstm cell with tensorflow
129 | lstm_cell = rnn_cell.BasicLSTMCell(n_hidden, forget_bias=1.0)
130 |
131 | # Get lstm cell output
132 | outputs, states = rnn.rnn(lstm_cell, x, dtype=tf.float32)
133 |
134 | # Linear activation, using rnn inner loop last output
135 | return tf.matmul(outputs[-1], weights['out']) + biases['out']
136 |
137 | pred = RNN(x, weights, biases)
138 |
139 | # Initializing the variables
140 | init = tf.global_variables_initializer()
141 |
142 | ######
143 | saver = tf.train.Saver()
144 |
145 | # Launch the graph
146 | with tf.Session() as sess:
147 | sess.run(init)
148 | saver.restore(sess,"./rnn_model/rnn_age_model/model.ckpt")
149 | p = sess.run(pred, feed_dict={x:data_predict})
150 |
151 | # print(tf.argmax(p, 1))
152 | max = p[0][0]
153 | max_i = 0
154 | for i in range(n_classes):
155 | if p[0][i] > max:
156 | max_i = i
157 | max = p[0][i]
158 |
159 |
160 | age_result = str(max_i * 10) + "~" + str((max_i+1) *10 -1)
161 |
162 | return age_result
--------------------------------------------------------------------------------
/BloodTestReportOCR/static/index.html:
--------------------------------------------------------------------------------
(HTML/JavaScript markup was stripped in this extract; only the page title "BloodTestOCR" is recoverable. This is the plugin-free vue.js upload page described in the BloodTestReportOCR README.)
--------------------------------------------------------------------------------
/BloodTestReportOCR/temp_pics/README.md:
--------------------------------------------------------------------------------
1 | temp pictures
2 |
--------------------------------------------------------------------------------
/BloodTestReportOCR/tf_predict.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import numpy as np
4 | import tensorflow as tf
5 |
6 | def normalized(a,b):
7 | for i in range(22):
8 | tmp = np.mean(a[:, i])
9 |
10 | a[:, i] = a[:, i] - tmp
11 | b[:, i] = b[:, i] - tmp
12 |
13 |
14 | if np.min(a[:, i]) != np.max(a[:, i]):
15 | b[:, i] = 2 * (b[:, i] - np.min(a[:, i])) / (np.max(a[:, i]) - np.min(a[:, i])) - 1
16 | else:
17 | b[:, i] = 0
18 | return b
19 |
20 | def predict(data_predict):
21 | tf.reset_default_graph()
22 | data_nor = np.loadtxt(open("./data.csv", "rb"), delimiter=",", skiprows=0)
23 |
24 | data_predict = normalized(data_nor[:, 2:], data_predict)
25 |
26 | '''
27 | Hyperparameters
28 | '''
29 | learning_rate = 0.005
30 | display_step = 100
31 | n_input = 22
32 |
33 | n_hidden_1_age = 32
34 | n_hidden_2_age = 16
35 | n_classes_age = 1
36 |
37 | n_hidden_1_sex = 16
38 | n_hidden_2_sex = 8
39 | n_classes_sex = 2
40 | data = np.loadtxt(open("./data.csv", "rb"), delimiter=",", skiprows=0)
41 | '''
42 | Build the age model
43 | '''
44 | x_age = tf.placeholder("float", [None, n_input])
45 | y_age = tf.placeholder("float", [None, n_classes_age])
46 |
47 | def multilayer_perceptron_age(x_age, weights_age, biases_age):
48 | # Hidden layer with RELU activation
49 | layer_1 = tf.add(tf.matmul(x_age, weights_age['h1']), biases_age['b1'])
50 | layer_1 = tf.nn.relu(layer_1)
51 | # Hidden layer with RELU activation
52 | layer_2 = tf.add(tf.matmul(layer_1, weights_age['h2']), biases_age['b2'])
53 | layer_2 = tf.nn.relu(layer_2)
54 | # Output layer with linear activation
55 | out_layer = tf.matmul(layer_2, weights_age['out']) + biases_age['out']
56 | return out_layer
57 |
58 | weights_age = {
59 | 'h1': tf.Variable(tf.random_normal([n_input, n_hidden_1_age])),
60 | 'h2': tf.Variable(tf.random_normal([n_hidden_1_age, n_hidden_2_age])),
61 | 'out': tf.Variable(tf.random_normal([n_hidden_2_age, n_classes_age]))
62 | }
63 | biases_age = {
64 | 'b1': tf.Variable(tf.random_normal([n_hidden_1_age])),
65 | 'b2': tf.Variable(tf.random_normal([n_hidden_2_age])),
66 | 'out': tf.Variable(tf.random_normal([n_classes_age]))
67 | }
68 | pred_age = multilayer_perceptron_age(x_age, weights_age, biases_age)
69 | '''
70 | Build the gender model
71 | '''
72 | x_sex = tf.placeholder("float", [None, n_input])
73 | y_sex = tf.placeholder("float", [None, n_classes_sex])
74 |
75 | def multilayer_perceptron_sex(x_sex, weights_sex, biases_sex):
76 | # Hidden layer with RELU activation
77 | layer_1 = tf.add(tf.matmul(x_sex, weights_sex['h1']), biases_sex['b1'])
78 | layer_1 = tf.nn.relu(layer_1)
79 | # Hidden layer with RELU activation
80 | layer_2 = tf.add(tf.matmul(layer_1, weights_sex['h2']), biases_sex['b2'])
81 | layer_2 = tf.nn.relu(layer_2)
82 | # Output layer with linear activation
83 | out_layer = tf.matmul(layer_2, weights_sex['out']) + biases_sex['out']
84 | return out_layer
85 |
86 | weights_sex = {
87 | 'h1': tf.Variable(tf.random_normal([n_input, n_hidden_1_sex])),
88 | 'h2': tf.Variable(tf.random_normal([n_hidden_1_sex, n_hidden_2_sex])),
89 | 'out': tf.Variable(tf.random_normal([n_hidden_2_sex, n_classes_sex]))
90 | }
91 | biases_sex = {
92 | 'b1': tf.Variable(tf.random_normal([n_hidden_1_sex])),
93 | 'b2': tf.Variable(tf.random_normal([n_hidden_2_sex])),
94 | 'out': tf.Variable(tf.random_normal([n_classes_sex]))
95 | }
96 | pred_sex = multilayer_perceptron_sex(x_sex, weights_sex, biases_sex)
97 |
98 | '''
99 | Shared initialization
100 | '''
101 | saver = tf.train.Saver()
102 | init = tf.global_variables_initializer()
103 | with tf.Session() as sess:
104 | saver.restore(sess, "./nn_model/model.ckpt")
105 | print ("load model success!")
106 | p_sex = sess.run(pred_sex, feed_dict={x_sex: data_predict})
107 | p_age = sess.run(pred_age, feed_dict={x_age: data_predict})
108 | if p_sex[0][0] > p_sex[0][1]:
109 | sex_result = 1
110 | else:
111 | sex_result = 0
112 |
113 | age_result = p_age[0][0] * 50 +50
114 |
115 | return sex_result,age_result
--------------------------------------------------------------------------------
/BloodTestReportOCR/view.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | import json
5 | from cStringIO import StringIO
6 |
7 | import bson
8 | import cv2
9 | import flask
10 | import numpy
11 | from PIL import Image
12 | from bson.json_util import dumps
13 | from flask import Flask, abort, flash, request, Response, jsonify, redirect, json
14 | from pymongo import MongoClient
15 | from werkzeug.utils import secure_filename
16 |
17 | import tf_predict
18 | from imageFilter import ImageFilter
19 | import rnn_predict
20 | import pd_predict
21 |
22 | app = Flask(__name__, static_url_path="")
23 |
24 | # load the configuration file
25 | app.config.from_object('config')
26 |
27 | # connect to the database and get the database object
28 | db = MongoClient(app.config['DB_HOST'], app.config['DB_PORT']).test
29 |
30 |
31 | # store the rectified image together with the recognition result (JSON) in the database
32 | def save_file(file_str, f, report_data):
33 | content = StringIO(file_str)
34 |
35 | try:
36 | mime = Image.open(content).format.lower()
37 | print 'content of mime is:', mime
38 | if mime not in app.config['ALLOWED_EXTENSIONS']:
39 | raise IOError()
40 | except IOError:
41 | abort(400)
42 | c = dict(report_data=report_data, content=bson.binary.Binary(content.getvalue()), filename=secure_filename(f.name),
43 | mime=mime)
44 | db.files.save(c)
45 | return c['_id'], c['filename']
46 |
47 |
48 | @app.route('/', methods=['GET', 'POST'])
49 | def index():
50 | return redirect('/index.html')
51 |
52 |
53 | @app.route('/upload', methods=['POST'])
54 | def upload():
55 | if request.method == 'POST':
56 | if 'imagefile' not in request.files:
57 | flash('No file part')
58 | return jsonify({"error": "No file part"})
59 | imgfile = request.files['imagefile']
60 | if imgfile.filename == '':
61 | flash('No selected file')
62 | return jsonify({"error": "No selected file"})
63 | if imgfile:
64 | # pil = StringIO(imgfile)
65 | # pil = Image.open(pil)
66 | # print 'imgfile:', imgfile
67 | img = cv2.imdecode(numpy.fromstring(imgfile.read(), numpy.uint8), cv2.CV_LOAD_IMAGE_UNCHANGED)
68 | report_data = ImageFilter(image=img).ocr(22)
69 | if report_data == None:
70 | data = {
71 | "error": 1,
72 | }
73 | return jsonify(data)
74 |
75 | with open('temp_pics/region.jpg') as f:
76 | if f is None:
77 | print 'Error! f is None!'
78 | else:
79 |
80 | '''
81 | file_str holds the content of the rectified image file f (as a str), so the image can later be re-transformed and stored in the database
82 | '''
83 | file_str = f.read()
84 | '''
85 | Store the rectified image together with the recognition result (JSON data) in MongoDB,
86 | so that when the front end asks to generate the report, the JSON is fetched directly from the database without re-running the perspective transform, shortening the response time
87 | '''
88 | #img_region = cv2.imdecode(numpy.fromstring(file_str, numpy.uint8), cv2.CV_LOAD_IMAGE_UNCHANGED)
89 | #report_data = ImageFilter(image=img).ocr(22)
90 | fid, filename = save_file(file_str, f, report_data)
91 | print 'fid:', fid
92 | if fid is not None:
93 | templates = "
" % (
94 | fid)
95 | data = {
96 | "templates": templates,
97 | }
98 | return jsonify(data)
99 | # return render_template("result.html", filename=filename, fileid=fid)
100 | # return render_template("error.html", errormessage="No POST methods")
101 | return jsonify({"error": "No POST methods"})
102 |
103 |
104 | '''
105 | Look up the image in MongoDB by its oid and return the Binary object
106 | '''
107 |
108 |
109 | @app.route('/file/')
110 | def find_file(fid):
111 | try:
112 | file = db.files.find_one(bson.objectid.ObjectId(fid))
113 | if file is None:
114 | raise bson.errors.InvalidId()
115 | return Response(file['content'], mimetype='image/' + file['mime'])
116 | except bson.errors.InvalidId:
117 | flask.abort(404)
118 |
119 |
120 | '''
121 | Fetch the previously recognized JSON data directly from the database and convert it from BSON to a JSON str with bson.json_util.dumps
122 | '''
123 |
124 |
125 | @app.route('/report/')
126 | def get_report(fid):
127 | # print 'get_report(fid):', fid
128 | try:
129 | file = db.files.find_one(bson.objectid.ObjectId(fid))
130 | if file is None:
131 | raise bson.errors.InvalidId()
132 |
133 | print 'type before transform:\n', type(file['report_data'])
134 |
135 | report_data = bson.json_util.dumps(file['report_data'])
136 |
137 | print 'type after transform:\n', type(report_data)
138 | if report_data is None:
139 | print 'report_data is NONE! Error!!!!'
140 | return jsonify({"error": "can't ocr'"})
141 | return jsonify(report_data)
142 | except bson.errors.InvalidId:
143 | flask.abort(404)
144 |
145 |
146 | def update_report(fid,ss):
147 | # load json example
148 | with open('bloodtestdata.json') as json_file:
149 | data = json.load(json_file)
150 |
151 | for i in range(22):
152 | data['bloodtest'][i]['value'] = ss[i]
153 | json_data = json.dumps(data, ensure_ascii=False, indent=4)
154 |
155 | db.files.update_one({
156 | '_id': bson.objectid.ObjectId(fid)}, {
157 | '$set': {
158 | 'report_data': json_data
159 | }
160 | }, upsert=False)
161 |
162 |
163 | file = db.files.find_one(bson.objectid.ObjectId(fid))
164 | report_data = bson.json_util.dumps(file['report_data'])
165 | print report_data
166 |
167 |
168 |
169 | @app.route('/predict/', methods=['POST'])
170 | def predict(fid):
171 |
172 |
173 | print ("predict now!")
174 |
175 | data = json.loads(request.form.get('data'))
176 | ss = data['value']
177 |
178 |
179 | # if the user corrected any values in the input fields, update the data in MongoDB
180 | update_report(fid,ss)
181 |
182 | arr = numpy.array(ss)
183 | arr = numpy.reshape(arr, [1, 22])
184 |
185 |
186 | if app.config['MODEL'] == 'rnn':
187 | sex = rnn_predict.predict_sex(arr)
188 | age = rnn_predict.predict_age(arr)
189 | result = {
190 | "sex": sex,
191 | "age": age
192 | }
193 |
194 | elif app.config['MODEL'] == 'tf':
195 | sex, age = tf_predict.predict(arr)
196 | result = {
197 | "sex": sex,
198 | "age": int(age)
199 | }
200 | elif app.config['MODEL'] == 'pd':
201 | sex, age = pd_predict.predict(arr)
202 | result = {
203 | "sex": sex,
204 | "age": int(age)
205 | }
206 |
207 |
208 |
209 | return json.dumps(result)
210 |
211 |
212 |
213 | if __name__ == '__main__':
214 |
215 | app.run(host=app.config['SERVER_HOST'], port=app.config['SERVER_PORT'])
216 |
217 |
--------------------------------------------------------------------------------
/Caffe/README.md:
--------------------------------------------------------------------------------
1 | ## File descriptions
2 |
3 | - caffe_sex_train_predict.py  main code of the gender prediction demo: data format conversion, training and prediction flow control
4 | - config.prototxt  solver configuration for training
5 | - lenet_train.prototxt  training network definition
6 | - model_prod_prototxt  prediction network definition
7 | - draw_net.py  network drawing code (not integrated into the main code file)
8 |
9 | ## Installing caffe
10 | **1. Install basic dependencies**
11 |
12 | ```
13 | sudo apt-get install libprotobuf-dev libleveldb-dev libsnappy-dev libopencv-dev libhdf5-serial-dev protobuf-compiler
14 | ```
15 |
16 | ```
17 | sudo apt-get install --no-install-recommends libboost-all-dev
18 | ```
19 |
20 | Because of the dependency tangles among Ubuntu packages, apt-get may fail to resolve them; aptitude is recommended, since it offers several resolution plans and has been verified to work:
21 | sudo aptitude install ...
22 |
23 | **2. If you do not use a GPU, you can skip installing CUDA (and Ubuntu 16.04 apparently already ships CUDA 8)**
24 |
25 | **3. Install ATLAS**
26 |
27 | ```
28 | sudo apt-get install libatlas-base-dev
29 | ```
30 |
31 | **4. Download caffe**
32 |
33 | ```
34 | git clone https://github.com/BVLC/caffe.git
35 | ```
36 |
37 | **5. Edit Makefile.config**
38 |
39 | ```
40 | cd caffe
41 | cp Makefile.config.example Makefile.config
42 | gedit Makefile.config
43 | ```
44 |
45 | Uncomment `# CPU_ONLY := 1`, then find the following lines and change them to:
46 |
47 | ```
48 | INCLUDE_DIRS := $(PYTHON_INCLUDE) /usr/local/include /usr/include/hdf5/serial
49 | LIBRARY_DIRS := $(PYTHON_LIB) /usr/local/lib /usr/lib /usr/lib/i386-linux-gnu/hdf5/serial
50 | ```
51 | On 64-bit Ubuntu 16.04, the second line needs to be changed to:
52 | ```
53 | LIBRARY_DIRS := $(PYTHON_LIB) /usr/local/lib /usr/lib /usr/lib/x86_64-linux-gnu /usr/lib/x86_64-linux-gnu/hdf5/serial
54 | ```
55 |
56 | If make all still fails, you may need the following step:
57 | ```
58 | cd /usr/lib/x86_64-linux-gnu
59 |
60 | sudo ln -s libhdf5_serial.so.10.1.0 libhdf5.so
61 |
62 | sudo ln -s libhdf5_serial_hl.so.10.0.2 libhdf5_hl.so
63 | ```
64 | This, again, is due to library version differences.
65 |
66 | **6. Build and install**
67 |
68 | ```
69 | make all
70 | make test
71 | make runtest
72 | ```
73 |
74 | At this point the caffe installation is complete!
75 | If you need the Python or MATLAB interfaces, first set the paths in Makefile.config and then build them separately:
76 |
77 | ```
78 | make pycaffe
79 | make matcaffe
80 | ```
81 | A possible fix for errors on 64-bit Ubuntu 16.04:
82 | ```
83 | # (Python 2.7 development files)
84 | sudo apt-get install -y python-dev
85 | sudo apt-get install -y python-numpy python-scipy
86 | ```
87 | Then edit the following in Makefile.config:
88 | ```
89 | PYTHON_INCLUDE := /usr/include/python2.7 /usr/local/lib/python2.7/dist-packages/numpy/core/include
90 |
91 | WITH_PYTHON_LAYER := 1
92 | ```
93 | This is needed because the numpy installation path may differ.
94 |
95 | Add the Python environment variable so that `import caffe` works later; open /etc/bash.bashrc and append at the end:
96 |
97 | ```
98 | PYTHONPATH=/xxx/xxx/caffe/python:$PYTHONPATH
99 | ```
100 |
101 | In addition, the pycaffe interface lives in the python folder under the caffe directory, so a plain `import caffe` is enough to use it. The matcaffe interface is documented on the official website.
102 |
103 | ## Rendering a prototxt network model as an image
104 |
105 | draw_net.py turns a prototxt network model into a picture; it lives in the python folder under the caffe root directory.
106 |
107 | Before drawing a network model, install two libraries: GraphViz and pydot.
108 |
109 | **1. Install GraphViz**
110 |
111 | Graphviz is a graph drawing tool that the Python program calls to render the picture.
112 |
113 | sudo apt-get install GraphViz
114 |
115 | **2. Install pydot**
116 |
117 | pydot is the Python library that provides the drawing support.
118 |
119 | sudo pip install pydot
120 |
121 | **3. Build pycaffe**
122 |
123 | make pycaffe
124 |
125 | After the three steps above you can draw the network model. draw_net.py takes three arguments when run:
126 |
127 | First argument: the prototxt file of the network model
128 |
129 | Second argument: the path and file name of the image to save
130 |
131 | Third argument: --rankdir=x, where x is one of LR, RL, TB, BT, giving the direction in which the network is drawn: left-to-right, right-to-left, top-to-bottom, or bottom-to-top. The default is LR.
132 |
133 | **Drawing the LeNet model**
134 |
135 | From the caffe root directory:
136 |
137 | python python/draw_net.py examples/mnist/lenet_train_test.prototxt ./lenet_train_test.jpg --rankdir=BT
138 |
139 | When it finishes, lenet_train_test.jpg will be generated.
140 |
141 | ## Predicting patient gender with Caffe; accuracy is only about 70% so far and can be improved by tuning the network structure
142 |
143 | ### Environment setup (Ubuntu 14.04 or later)
144 |
145 | If any module is still missing, it can be installed with:
146 | ```
147 | sudo pip install module_name
148 | ```
149 | Data source:
150 |
151 | `Spark/BloodTestReportDeepLearning/data_set.csv` in this repository
152 |
153 | ### Usage
154 | - Create two database folders, test_data_lmdb and train_data_lmdb, in the current directory
155 |
156 | ```
157 | mkdir test_data_lmdb train_data_lmdb
158 | ```
159 | - Run caffe_sex_train_predict.py
160 |
161 | ```
162 | python caffe_sex_train_predict.py
163 | ```
164 |
165 | Note: running create_data_lmdb() repeatedly does not overwrite the existing files; new records are appended to the end of the existing data.
166 | If you need to start over, delete the two lmdb folders first.
167 |
168 | Related links:
169 | A neural network construction example from the official site:
170 | http://nbviewer.ipython.org/github/joyofdata/joyofdata-articles/blob/master/deeplearning-with-caffe/Neural-Networks-with-Caffe-on-the-GPU.ipynb
171 |
172 | Detailed explanation of layers:
173 | http://blog.csdn.net/u011762313/article/details/47361571#sigmoid
174 |
--------------------------------------------------------------------------------
/Caffe/caffe_sex_train_predict.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import numpy as np
3 | import random
4 | import subprocess
5 | import platform
6 | import sys,os
7 | sys.path.append('/home/summer/Desktop/caffe/python')
8 | import caffe
9 | import lmdb
10 | from sklearn.cross_validation import StratifiedShuffleSplit
11 | import pandas as pd
12 | import numpy as np
13 | import matplotlib.pyplot as plt
14 |
15 |
16 |
17 | def extract(filename):
18 | matrix = np.loadtxt(filename, dtype='string', skiprows= 1,delimiter=',', usecols=(1,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28))
19 | matrix = matrix_filter(matrix)
20 | matrix = np.asarray(matrix)
21 |
22 | data = matrix[:,1:27]
23 | sex = matrix[:,0]
24 | data = data.astype(np.float) #convert string to float
25 | for i in range(len(sex)):
26 | if sex[i] == '\xc4\xd0':
27 | sex[i] = 1
28 | else :
29 | if sex[i] != '1':
30 | sex[i] = 0
31 | return data,sex
32 | #filter the row which contains the wrong elements
33 | def matrix_filter(matrix):
34 | count = 0
35 | flag = 0
36 | for row in matrix:
37 | for cloumn in row:
38 | if cloumn == '--.--' or cloumn == '': #Discard the wrong value
39 | flag = 1
40 |
41 | if count == 0 and flag == 0:
42 | table = row
43 | count = 1
44 | continue
45 | if flag == 0 :
46 | table = np.c_[table,row] #Add the elements by extend cloumns
47 | else:
48 | flag = 0
49 | table = table.transpose() #Transpose the matrix
50 | return table
51 |
52 | #nomalize the data
53 | def nomalize(X_train, X_test):
54 |
55 | ave = X_train.mean(axis=0) # get the average of cloumns
56 | std = X_train.std(axis=0) # get the standard deviation of cloumns
57 | train_table = [(row - ave)/std for row in X_train]
58 | X_train = (np.asarray(train_table))
59 |
60 | test_table = [(row - ave)/std for row in X_test]
61 | X_test = (np.asarray(test_table))
62 | return X_train, X_test
63 |
64 | #load data into lmdb
65 | def load_data_into_lmdb(lmdb_name, features, labels=None):
66 | env = lmdb.open(lmdb_name, map_size=features.nbytes*10)
67 |
68 | features = features[:,:,None,None]
69 | for i in range(features.shape[0]):
70 | datum = caffe.proto.caffe_pb2.Datum()
71 |
72 | datum.channels = features.shape[1] # features's number(26)
73 | datum.height = 1 # due to eachone only have one data
74 | datum.width = 1 # so the size is 1x1
75 |
76 | if features.dtype == np.int: # convert data to string
77 | datum.data = features[i].tostring()
78 | elif features.dtype == np.float:
79 | datum.float_data.extend(features[i].flat)
80 | else:
81 | raise Exception("features.dtype unknown.")
82 |
83 | if labels is not None:
84 | datum.label = int(labels[i])
85 |
86 | str_id = '{:08}'.format(i)
87 | with env.begin(write=True) as txn:
88 | txn.put(str_id, datum.SerializeToString())
89 |
90 | def get_data_from_lmdb_evalue(lmdb_name):
91 | lmdb_env = lmdb.open(lmdb_name, readonly=True)
92 | lmdb_txn = lmdb_env.begin()
93 | lmdb_cursor = lmdb_txn.cursor()
94 | datum = caffe.proto.caffe_pb2.Datum()
95 | success = 0
96 | count = 0
97 | #raw_datum = lmdb_txn.get()
98 | for key, value in lmdb_cursor:
99 |
100 | datum.ParseFromString(value)
101 | label = datum.label
102 | feature = caffe.io.datum_to_array(datum)
103 | out = net.forward(**{net.inputs[0]: np.asarray([feature])})
104 | count+=1
105 | if np.argmax(out["prob"][0]) == label :
106 | success+=1
107 | print "success", out
108 | return count,success
109 |
110 | def create_data_lmdb():
111 |
112 | #prefit
113 | X, y = extract('data_set.csv')
114 | vec_log = np.vectorize(lambda x: x)
115 | vec_int = np.vectorize(lambda str: int(str[-1]))
116 | features = vec_log(X)
117 | labels = vec_int(y)
118 |
119 | #train : test = 9 : 1
120 | sss = StratifiedShuffleSplit(labels, 1, test_size=0.1, random_state=0)
121 | sss = list(sss)[0]
122 |
123 | features_training = features[sss[0],]
124 | labels_training = labels[sss[0],]
125 |
126 | features_testing = features[sss[1],]
127 | labels_testing = labels[sss[1],]
128 |
129 | #nomalized data 66%, unnomalized data 53%
130 | features_training, features_testing = nomalize(features_training, features_testing)
131 |
132 | load_data_into_lmdb("train_data_lmdb", features_training, labels_training)
133 | load_data_into_lmdb("test_data_lmdb", features_testing, labels_testing)
134 |
135 | if __name__=='__main__':
136 | # build the lmdb databases; only needs to run once, delete the old data folders before re-creating them
137 | create_data_lmdb();
138 | # start training the model according to the solver configuration
139 | solver = caffe.get_solver("config.prototxt")
140 | solver.solve()
141 |
142 | net = caffe.Net("model_prod_prototxt","_iter_500000.caffemodel", caffe.TEST)
143 |
144 | # if the index of the largest element matches the integer
145 | # label we stored for that case - then the prediction is right
146 | total,success = get_data_from_lmdb_evalue("test_data_lmdb/")
147 | print "accuracy:", success*100/total,"%"
148 |
--------------------------------------------------------------------------------
/Caffe/config.prototxt:
--------------------------------------------------------------------------------
1 | test_iter: 50 # number of test batches
2 | test_interval: 10000 # run a test pass every 10000 iterations
3 | base_lr: 0.01 # base learning rate
4 | display: 10000 # print training info every 10000 iterations
5 | max_iter: 500000 # maximum number of iterations
6 |
7 | lr_policy: "inv" # learning-rate decay policy
8 | # - fixed: always return base_lr.
9 | # - step: return base_lr * gamma ^ (floor(iter / step))
10 | # - exp: return base_lr * gamma ^ iter
11 |
12 | # - inv: return base_lr * (1 + gamma * iter) ^ (- power)
13 | # - sigmoid: the effective learning rate follows a sigmod decay
14 | # return base_lr ( 1/(1 + exp(-gamma * (iter - stepsize))))
15 |
16 |
17 | # - multistep: similar to step but it allows non uniform steps defined by
18 | # stepvalue
19 | # - poly: the effective learning rate follows a polynomial decay, to be
20 | # zero by the max_iter. return base_lr (1 - iter/max_iter) ^ (power)
21 |
22 | gamma: 0.0001 # learning-rate decay parameter
23 | power: 0.75 # exponent used by the decay policy
24 | momentum: 0.9 # momentum
25 | weight_decay: 0.0005 # weight decay (regularization strength)
26 | solver_mode: CPU # run in CPU mode
27 | net: "lenet_train.prototxt" # training network definition
28 | stepsize: 100000 # reduce the learning rate every 100000 iterations
29 |
--------------------------------------------------------------------------------
/Caffe/draw_net.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """
3 | Draw a graph of the net architecture.
4 | """
5 | from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
6 | from google.protobuf import text_format
7 |
8 | import caffe
9 | import caffe.draw
10 | from caffe.proto import caffe_pb2
11 |
12 |
13 | def parse_args():
14 | """Parse input arguments
15 | """
16 |
17 | parser = ArgumentParser(description=__doc__,
18 | formatter_class=ArgumentDefaultsHelpFormatter)
19 |
20 | parser.add_argument('input_net_proto_file',
21 | help='Input network prototxt file')
22 | parser.add_argument('output_image_file',
23 | help='Output image file')
24 | parser.add_argument('--rankdir',
25 | help=('One of TB (top-bottom, i.e., vertical), '
26 | 'RL (right-left, i.e., horizontal), or another '
27 | 'valid dot option; see '
28 | 'http://www.graphviz.org/doc/info/'
29 | 'attrs.html#k:rankdir'),
30 | default='LR')
31 | parser.add_argument('--phase',
32 | help=('Which network phase to draw: can be TRAIN, '
33 | 'TEST, or ALL. If ALL, then all layers are drawn '
34 | 'regardless of phase.'),
35 | default="ALL")
36 |
37 | args = parser.parse_args()
38 | return args
39 |
40 |
41 | def main():
42 | args = parse_args()
43 | net = caffe_pb2.NetParameter()
44 | text_format.Merge(open(args.input_net_proto_file).read(), net)
45 | print('Drawing net to %s' % args.output_image_file)
46 | phase=None;
47 | if args.phase == "TRAIN":
48 | phase = caffe.TRAIN
49 | elif args.phase == "TEST":
50 | phase = caffe.TEST
51 | elif args.phase != "ALL":
52 | raise ValueError("Unknown phase: " + args.phase)
53 | caffe.draw.draw_net_to_file(net, args.output_image_file, args.rankdir,
54 | phase)
55 |
56 |
57 | if __name__ == '__main__':
58 | main()
59 |
--------------------------------------------------------------------------------
/Caffe/lenet_train.prototxt:
--------------------------------------------------------------------------------
1 | name: "LeNet"
2 | layer {
3 | name: "mnist"
4 | type: "Data" # "type" defines the layer type
5 | top: "data" # a layer takes input from 'bottom' blobs and produces output 'top' blobs
6 | top: "label"
7 | include {
8 | phase: TRAIN # this layer belongs to the training network
9 | }
10 | transform_param { # data preprocessing; for images this is usually a 1/255 scale, unused in this project
11 | #scale: 0.00390625
12 | }
13 | data_param { # data source, batch size, and backend format (LMDB)
14 | source: "train_data_lmdb"
15 | batch_size: 1
16 | backend: LMDB
17 | }
18 | }
19 | layer {
20 | name: "mnist"
21 | type: "Data"
22 | top: "data"
23 | top: "label"
24 | include {
25 | phase: TEST # together with the TRAIN layer above, the train and test networks share the same structure
26 | }
27 | transform_param { # data preprocessing
28 | #scale: 0.00390625
29 | }
30 | data_param {
31 | source: "test_data_lmdb"
32 | batch_size: 100
33 | backend: LMDB
34 | }
35 | }
36 | layer {
37 | name: "ip1"
38 | type: "InnerProduct" # inner-product (fully connected) layer: the input is treated as a 1D vector and the output is also a vector
39 | bottom: "data"
40 | top: "ip1"
41 | param { # learning-rate multipliers for the layer's weights and bias
42 | lr_mult: 1
43 | }
44 | param {
45 | lr_mult: 2
46 | }
47 | inner_product_param { # number of output nodes (filters) of the fully connected layer (required)
48 | num_output: 300
49 | weight_filler {
50 | type: "xavier" # weight initialization scheme
51 | }
52 | bias_filler { # bias initialization, constant by default
53 | type: "constant"
54 | }
55 | }
56 | }
57 | layer { # activation layer: common activations include ReLU (max(x, 0)),
58 | name: "relu1" # Sigmoid, TanH, AbsVal, etc.
59 | type: "ReLU"
60 | bottom: "ip1"
61 | top: "ip1"
62 | }
63 | layer {
64 | name: "ip2"
65 | type: "InnerProduct"
66 | bottom: "ip1"
67 | top: "ip2"
68 | param {
69 | lr_mult: 1
70 | }
71 | param {
72 | lr_mult: 2
73 | }
74 | inner_product_param {
75 | num_output: 2
76 | weight_filler {
77 | type: "xavier"
78 | }
79 | bias_filler {
80 | type: "constant"
81 | }
82 | }
83 | }
84 | layer {
85 | name: "accuracy" # the test network reports accuracy through this layer
86 | type: "Accuracy"
87 | bottom: "ip2"
88 | bottom: "label"
89 | top: "accuracy"
90 | include {
91 | phase: TEST
92 | }
93 | }
94 | layer {
95 | name: "loss"
96 | type: "SoftmaxWithLoss" # softmax followed by multinomial logistic loss
97 | bottom: "ip2"
98 | bottom: "label"
99 | top: "loss"
100 | }
101 |
--------------------------------------------------------------------------------
/Caffe/model_prod_prototxt:
--------------------------------------------------------------------------------
1 | name: "otto"
2 | input: "data" # input blob shape: num, channels, height, width
3 | input_dim: 1
4 | input_dim: 26
5 | input_dim: 1
6 | input_dim: 1
7 | layer {
8 | name: "ip1"
9 | type: "InnerProduct"
10 | bottom: "data"
11 | top: "ip1"
12 | inner_product_param {
13 | num_output: 300
14 | weight_filler {
15 | type: "xavier"
16 | }
17 | bias_filler {
18 | type: "constant"
19 | value: 0
20 | }
21 | }
22 | }
23 | layer {
24 | name: "relu1"
25 | type: "ReLU"
26 | bottom: "ip1"
27 | top: "ip1"
28 | }
29 | layer {
30 | name: "ip2"
31 | type: "InnerProduct"
32 | bottom: "ip1"
33 | top: "ip2"
34 | inner_product_param {
35 | num_output: 2
36 | weight_filler {
37 | type: "xavier"
38 | }
39 | bias_filler {
40 | type: "constant"
41 | value: 0
42 | }
43 | }
44 | }
45 | layer {
46 | name: "prob"
47 | type: "Softmax"
48 | bottom: "ip2"
49 | top: "prob"
50 | }
51 |
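To make the structure of this deploy definition concrete, here is a rough numpy equivalent of its forward pass; the weight arrays are placeholders, not the trained values:

```
# Hedged sketch of the deploy net above: 26 inputs -> ip1 (300, ReLU) -> ip2 (2) -> softmax.
import numpy as np

def forward(x, W1, b1, W2, b2):
    ip1 = np.maximum(W1.dot(x) + b1, 0.0)   # InnerProduct "ip1" followed by ReLU
    ip2 = W2.dot(ip1) + b2                  # InnerProduct "ip2"
    e = np.exp(ip2 - ip2.max())             # Softmax "prob"
    return e / e.sum()

x = np.zeros(26)                            # one normalized blood-test record
W1, b1 = np.zeros((300, 26)), np.zeros(300)
W2, b2 = np.zeros((2, 300)), np.zeros(2)
print forward(x, W1, b1, W2, b2)            # two class probabilities
```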
--------------------------------------------------------------------------------
/DigitRecogn/README.md:
--------------------------------------------------------------------------------
1 | ### Handwritten digit recognition with a neural network
2 |
3 | - BP (backpropagation) neural network
4 | - input layer size: 400
5 | - hidden layer neurons: 15
6 | - output layer size: 10
7 | - learning rate: 0.1
8 | - activation function: sigmoid
9 | - trained parameters are saved in nn.json
10 |
11 | #### Environment setup (if running locally)
12 | - OS: Ubuntu 14.04, 64-bit
13 |
14 | ```
15 | # install pip
16 | sudo apt-get install python-pip
17 |
18 | # install numpy and scipy with pip, using the USTC mirror for speed
19 | pip install --user numpy scipy -i https://pypi.mirrors.ustc.edu.cn/simple
20 |
21 | # if the previous step fails, try Ubuntu's package manager instead
22 | sudo apt-get install python-numpy python-scipy
23 |
24 | # install scikit-learn; neural_network_design.py uses it for cross-validation
25 | pip install -U scikit-learn -i https://pypi.mirrors.ustc.edu.cn/simple
26 |
27 | # if running on a server, change HOST in ocr.js to the server address, e.g. http://2016.mc2lab.com
28 |
29 | ```
30 |
31 |
32 | #### Running
33 |
34 | 1. Download the image and label data
35 |
36 |
37 | wget http://labfile.oss.aliyuncs.com/courses/593/data.csv
38 | wget http://labfile.oss.aliyuncs.com/courses/593/dataLabels.csv
39 |
40 |
41 | 2. Train the model
42 |
43 | python neural_network_design.py
44 |
45 | 3. Start a static file server for the demo page
46 |
47 | python -m SimpleHTTPServer 3000
48 |
49 | 4. Start the prediction server
50 |
51 | python server.py
52 |
53 | 5. Open the demo in a browser
54 |
55 | localhost:3000
56 |
57 |
58 | * Implementation guide: https://www.shiyanlou.com/courses/593
59 |
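Once server.py is running, a prediction request can also be sent from Python instead of the browser. A minimal sketch (the all-zero image is just a placeholder; a real request carries the 400 canvas pixels that ocr.js collects):

```
# Hedged sketch: query the running server.py (port 9000) the same way ocr.js does.
import json
import urllib2

payload = {"image": [0] * 400, "predict": True}
req = urllib2.Request("http://localhost:9000", json.dumps(payload),
                      {"Content-Type": "application/json"})
print urllib2.urlopen(req).read()   # e.g. {"type": "test", "result": "3"}
```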
--------------------------------------------------------------------------------
/DigitRecogn/index.html:
--------------------------------------------------------------------------------
1 | <!-- The original markup of this page was lost during text extraction; only the title "OCR Demo" survives. -->
2 | <!-- The page hosts the drawing canvas, digit input and train/test buttons wired up by ocr.js. -->
--------------------------------------------------------------------------------
/DigitRecogn/neural_network_design.py:
--------------------------------------------------------------------------------
1 | # -*- coding: UTF-8 -*-
2 | # neural_network_design.py
3 |
4 | import numpy as np
5 | from ocr import OCRNeuralNetwork
6 | from sklearn.cross_validation import train_test_split
7 |
8 | def test(data_matrix, data_labels, test_indices, nn):
9 | correct_guess_count = 0
10 | for i in test_indices:
11 | test = data_matrix[i]
12 | prediction = nn.predict(test)
13 | if data_labels[i] == prediction:
14 | correct_guess_count += 1
15 | return correct_guess_count / float(len(test_indices))
16 |
17 | data_matrix = np.loadtxt(open('data.csv', 'rb'), delimiter = ',').tolist()
18 | data_labels = np.loadtxt(open('dataLabels.csv', 'rb')).tolist()
19 |
20 | # Create training and testing sets.
21 | train_indices, test_indices = train_test_split(list(range(5000)))
22 |
23 | print "PERFORMANCE"
24 | print "-----------"
25 |
26 | for i in xrange(5, 50, 5):
27 | nn = OCRNeuralNetwork(i, data_matrix, data_labels, train_indices, False)
28 | performance = str(test(data_matrix, data_labels, test_indices, nn))
29 | print "{i} Hidden Nodes: {val}".format(i=i, val=performance)
30 |
--------------------------------------------------------------------------------
/DigitRecogn/ocr.js:
--------------------------------------------------------------------------------
1 | var ocrDemo = {
2 | CANVAS_WIDTH: 200,
3 | TRANSLATED_WIDTH: 20,
4 | PIXEL_WIDTH: 10, // TRANSLATED_WIDTH = CANVAS_WIDTH / PIXEL_WIDTH
5 | BATCH_SIZE: 1,
6 |
7 | // server connection parameters
8 | PORT: "9000",
9 | HOST: "http://localhost",
10 |
11 | // color constants
12 | BLACK: "#000000",
13 | BLUE: "#0000ff",
14 |
15 | // client-side training data batch
16 | trainArray: [],
17 | trainingRequestCount: 0,
18 |
19 | onLoadFunction: function() {
20 | this.resetCanvas();
21 | },
22 |
23 | resetCanvas: function() {
24 | var canvas = document.getElementById('canvas');
25 | var ctx = canvas.getContext('2d');
26 |
27 | this.data = [];
28 | ctx.fillStyle = this.BLACK;
29 | ctx.fillRect(0, 0, this.CANVAS_WIDTH, this.CANVAS_WIDTH);
30 | var matrixSize = 400;
31 | while (matrixSize--) this.data.push(0);
32 | this.drawGrid(ctx);
33 |
34 | // bind the mouse event handlers
35 | canvas.onmousemove = function(e) { this.onMouseMove(e, ctx, canvas) }.bind(this);
36 | canvas.onmousedown = function(e) { this.onMouseDown(e, ctx, canvas) }.bind(this);
37 | canvas.onmouseup = function(e) { this.onMouseUp(e, ctx) }.bind(this);
38 | },
39 |
40 | drawGrid: function(ctx) {
41 | for (var x = this.PIXEL_WIDTH, y = this.PIXEL_WIDTH; x < this.CANVAS_WIDTH; x += this.PIXEL_WIDTH, y += this.PIXEL_WIDTH) {
42 | ctx.strokeStyle = this.BLUE;
43 | ctx.beginPath();
44 | ctx.moveTo(x, 0);
45 | ctx.lineTo(x, this.CANVAS_WIDTH);
46 | ctx.stroke();
47 |
48 | ctx.beginPath();
49 | ctx.moveTo(0, y);
50 | ctx.lineTo(this.CANVAS_WIDTH, y);
51 | ctx.stroke();
52 | }
53 | },
54 |
55 | onMouseMove: function(e, ctx, canvas) {
56 | if (!canvas.isDrawing) {
57 | return;
58 | }
59 | this.fillSquare(ctx, e.clientX - canvas.offsetLeft, e.clientY - canvas.offsetTop);
60 | },
61 |
62 | onMouseDown: function(e, ctx, canvas) {
63 | canvas.isDrawing = true;
64 | this.fillSquare(ctx, e.clientX - canvas.offsetLeft, e.clientY - canvas.offsetTop);
65 | },
66 |
67 | onMouseUp: function(e) {
68 | canvas.isDrawing = false;
69 | },
70 |
71 | fillSquare: function(ctx, x, y) {
72 | var xPixel = Math.floor(x / this.PIXEL_WIDTH);
73 | var yPixel = Math.floor(y / this.PIXEL_WIDTH);
74 | // record the handwritten pixel in the input vector
75 | this.data[((xPixel - 1) * this.TRANSLATED_WIDTH + yPixel) - 1] = 1;
76 |
77 | ctx.fillStyle = '#ffffff';
78 | ctx.fillRect(xPixel * this.PIXEL_WIDTH, yPixel * this.PIXEL_WIDTH, this.PIXEL_WIDTH, this.PIXEL_WIDTH);
79 | },
80 |
81 | train: function() {
82 | var digitVal = document.getElementById("digit").value;
83 | if (!digitVal || this.data.indexOf(1) < 0) {
84 | alert("Please type and draw a digit value in order to train the network");
85 | return;
86 | }
87 | // add this sample to the client-side training batch
88 | this.trainArray.push({"y0": this.data, "label": parseInt(digitVal)});
89 | this.trainingRequestCount++;
90 |
91 | // send the client-side training batch to the server
92 | if (this.trainingRequestCount == this.BATCH_SIZE) {
93 | alert("Sending training data to server...");
94 | var json = {
95 | trainArray: this.trainArray,
96 | train: true
97 | };
98 |
99 | this.sendData(json);
100 | this.trainingRequestCount = 0;
101 | this.trainArray = [];
102 | }
103 | },
104 |
105 | // send a prediction request
106 | test: function() {
107 | if (this.data.indexOf(1) < 0) {
108 | alert("Please draw a digit in order to test the network");
109 | return;
110 | }
111 | var json = {
112 | image: this.data,
113 | predict: true
114 | };
115 | this.sendData(json);
116 | },
117 |
118 | // handle the server's response
119 | receiveResponse: function(xmlHttp) {
120 | if (xmlHttp.status != 200) {
121 | alert("Server returned status " + xmlHttp.status);
122 | return;
123 | }
124 | var responseJSON = JSON.parse(xmlHttp.responseText);
125 | if (xmlHttp.responseText && responseJSON.type == "test") {
126 | alert("The neural network predicts you wrote a \'" + responseJSON.result + '\'');
127 | }
128 | },
129 |
130 | onError: function(e) {
131 | alert("Error occurred while connecting to server: " + e.target.statusText);
132 | },
133 |
134 | sendData: function(json) {
135 | var xmlHttp = new XMLHttpRequest();
136 | xmlHttp.open('POST', this.HOST + ":" + this.PORT, false);
137 | xmlHttp.onload = function() { this.receiveResponse(xmlHttp); }.bind(this);
138 | xmlHttp.onerror = function() { this.onError(xmlHttp) }.bind(this);
139 | var msg = JSON.stringify(json);
140 | xmlHttp.setRequestHeader('Content-length', msg.length);
141 | xmlHttp.setRequestHeader("Connection", "close");
142 | xmlHttp.send(msg);
143 | }
144 | }
145 |
146 |
--------------------------------------------------------------------------------
/DigitRecogn/ocr.py:
--------------------------------------------------------------------------------
1 | # -*- coding: UTF-8 -*-
2 |
3 | import csv
4 | import numpy as np
5 | from numpy import matrix
6 | from math import pow
7 | from collections import namedtuple
8 | import math
9 | import random
10 | import os
11 | import json
12 |
13 | class OCRNeuralNetwork:
14 | LEARNING_RATE = 0.1
15 | WIDTH_IN_PIXELS = 20
16 | # path of the file the trained network is saved to
17 | NN_FILE_PATH = 'nn.json'
18 |
19 | def __init__(self, num_hidden_nodes, data_matrix, data_labels, training_indices, use_file=True):
20 | # vectorized sigmoid function
21 | self.sigmoid = np.vectorize(self._sigmoid_scalar)
22 | # vectorized sigmoid derivative
23 | self.sigmoid_prime = np.vectorize(self._sigmoid_prime_scalar)
24 | # whether to load the saved network from nn.json
25 | self._use_file = use_file
26 | # dataset
27 | self.data_matrix = data_matrix
28 | self.data_labels = data_labels
29 |
30 | if (not os.path.isfile(OCRNeuralNetwork.NN_FILE_PATH) or not use_file):
31 | # initialize the network weights randomly
32 | self.theta1 = self._rand_initialize_weights(400, num_hidden_nodes)
33 | self.theta2 = self._rand_initialize_weights(num_hidden_nodes, 10)
34 | self.input_layer_bias = self._rand_initialize_weights(1, num_hidden_nodes)
35 | self.hidden_layer_bias = self._rand_initialize_weights(1, 10)
36 |
37 | # train and save
38 | TrainData = namedtuple('TrainData', ['y0', 'label'])
39 | self.train([TrainData(self.data_matrix[i], int(self.data_labels[i])) for i in training_indices])
40 | self.save()
41 | else:
42 | # otherwise load the existing nn.json
43 | self._load()
44 |
45 | def _rand_initialize_weights(self, size_in, size_out):
46 | return [((x * 0.12) - 0.06) for x in np.random.rand(size_out, size_in)]
47 |
48 | def _sigmoid_scalar(self, z):
49 | return 1 / (1 + math.e ** -z)
50 |
51 | def _sigmoid_prime_scalar(self, z):
52 | return self.sigmoid(z) * (1 - self.sigmoid(z))
53 |
54 |
55 | def train(self, training_data_array):
56 | for data in training_data_array:
57 | # forward pass: compute the output vector
58 | y1 = np.dot(np.mat(self.theta1), np.mat(data.y0).T)
59 | sum1 = y1 + np.mat(self.input_layer_bias)
60 | y1 = self.sigmoid(sum1)
61 |
62 | y2 = np.dot(np.array(self.theta2), y1)
63 | y2 = np.add(y2, self.hidden_layer_bias)
64 | y2 = self.sigmoid(y2)
65 |
66 | # backward pass: compute the error vectors
67 | actual_vals = [0] * 10
68 | actual_vals[data.label] = 1
69 | output_errors = np.mat(actual_vals).T - np.mat(y2)
70 | hidden_errors = np.multiply(np.dot(np.mat(self.theta2).T, output_errors), self.sigmoid_prime(sum1))
71 |
72 | # update the weight matrices and bias vectors
73 | self.theta1 += self.LEARNING_RATE * np.dot(np.mat(hidden_errors), np.mat(data.y0))
74 | self.theta2 += self.LEARNING_RATE * np.dot(np.mat(output_errors), np.mat(y1).T)
75 | self.hidden_layer_bias += self.LEARNING_RATE * output_errors
76 | self.input_layer_bias += self.LEARNING_RATE * hidden_errors
77 |
78 | def predict(self, test):
79 | y1 = np.dot(np.mat(self.theta1), np.mat(test).T)
80 | y1 = y1 + np.mat(self.input_layer_bias) # Add the bias
81 | y1 = self.sigmoid(y1)
82 |
83 | y2 = np.dot(np.array(self.theta2), y1)
84 | y2 = np.add(y2, self.hidden_layer_bias) # Add the bias
85 | y2 = self.sigmoid(y2)
86 |
87 | results = y2.T.tolist()[0]
88 | return results.index(max(results))
89 |
90 | def save(self):
91 | if not self._use_file:
92 | return
93 |
94 | json_neural_network = {
95 | "theta1":[np_mat.tolist()[0] for np_mat in self.theta1],
96 | "theta2":[np_mat.tolist()[0] for np_mat in self.theta2],
97 | "b1":self.input_layer_bias[0].tolist()[0],
98 | "b2":self.hidden_layer_bias[0].tolist()[0]
99 | };
100 | with open(OCRNeuralNetwork.NN_FILE_PATH,'w') as nnFile:
101 | json.dump(json_neural_network, nnFile)
102 |
103 | def _load(self):
104 | if not self._use_file:
105 | return
106 |
107 | with open(OCRNeuralNetwork.NN_FILE_PATH) as nnFile:
108 | nn = json.load(nnFile)
109 | self.theta1 = [np.array(li) for li in nn['theta1']]
110 | self.theta2 = [np.array(li) for li in nn['theta2']]
111 | self.input_layer_bias = [np.array(nn['b1'][0])]
112 | self.hidden_layer_bias = [np.array(nn['b2'][0])]
113 |
--------------------------------------------------------------------------------
/DigitRecogn/server.py:
--------------------------------------------------------------------------------
1 | # -*- coding: UTF-8 -*-
2 | import BaseHTTPServer
3 | import json
4 | from ocr import OCRNeuralNetwork
5 | import numpy as np
6 | import random
7 | from collections import namedtuple
8 |
9 | # server configuration
10 | HOST_NAME = ''
11 | PORT_NUMBER = 9000
12 | # best value found by running neural_network_design.py
13 | HIDDEN_NODE_COUNT = 15
14 |
15 | # load the dataset
16 | data_matrix = np.loadtxt(open('data.csv', 'rb'), delimiter = ',')
17 | data_labels = np.loadtxt(open('dataLabels.csv', 'rb'))
18 |
19 | # convert to plain lists
20 | data_matrix = data_matrix.tolist()
21 | data_labels = data_labels.tolist()
22 |
23 | # the dataset has 5000 samples; train_indice holds the indices used for training
24 | train_indice = range(5000)
25 | # shuffle the training order
26 | random.shuffle(train_indice)
27 |
28 | nn = OCRNeuralNetwork(HIDDEN_NODE_COUNT, data_matrix, data_labels, train_indice);
29 |
30 | class JSONHandler(BaseHTTPServer.BaseHTTPRequestHandler):
31 | """处理接收到的POST请求"""
32 | def do_POST(self):
33 | response_code = 200
34 | response = ""
35 | var_len = int(self.headers.get('Content-Length'))
36 | content = self.rfile.read(var_len);
37 | payload = json.loads(content);
38 |
39 | # for a training request, train and then save the updated network
40 | if payload.get('train'):
41 | # convert the payload into TrainData tuples
42 | TrainData = namedtuple('TrainData', ['y0', 'label'])
43 | nn.train([TrainData(payload['trainArray'][0]['y0'],payload['trainArray'][0]['label'])])
44 | nn.save()
45 | # for a prediction request, return the predicted digit
46 | elif payload.get('predict'):
47 | try:
48 | print nn.predict(data_matrix[0])
49 | response = {"type":"test", "result":str(nn.predict(payload['image']))}
50 | except:
51 | response_code = 500
52 | else:
53 | response_code = 400
54 |
55 | self.send_response(response_code)
56 | self.send_header("Content-type", "application/json")
57 | self.send_header("Access-Control-Allow-Origin", "*")
58 | self.end_headers()
59 | if response:
60 | self.wfile.write(json.dumps(response))
61 | return
62 |
63 | if __name__ == '__main__':
64 | server_class = BaseHTTPServer.HTTPServer;
65 | httpd = server_class((HOST_NAME, PORT_NUMBER), JSONHandler)
66 |
67 | try:
68 | # start the server
69 | httpd.serve_forever()
70 | except KeyboardInterrupt:
71 | pass
72 | else:
73 | print "Unexpected server exception occurred."
74 | finally:
75 | httpd.server_close()
76 |
77 |
--------------------------------------------------------------------------------
/Keras/.gitignore:
--------------------------------------------------------------------------------
1 | *.csv
2 |
--------------------------------------------------------------------------------
/Keras/KerasDistinguishAge.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import numpy as np
3 | np.random.seed(1337) # for reproducibility
4 | from keras.models import Sequential, model_from_json
5 | from keras.layers.core import Dense, Dropout, Activation
6 | from keras.optimizers import SGD, Adam, RMSprop, Adagrad
7 | from keras.utils import np_utils
8 | from time import sleep
9 |
10 | batch_size = 128
11 | nb_classes = 20
12 | nb_epoch = 100
13 | def load_data():
14 | x_train=[]
15 | Y_train=[]
16 | x_test=[]
17 | Y_test=[]
18 |
19 | f = open("train.txt","r")
20 | i = 0
21 | for line in f.readlines():
22 | line = line.strip("\n").split(",")
23 | if i>0:
24 | Y_train.append(int(float(line[2])/5))
25 | del line[0]
26 | del line[0]
27 | del line[0]
28 | x_train.append(line)
29 | i += 1
30 | x1=np.array(x_train)
31 | y1=np.array(Y_train)
32 | f.close()
33 |
34 | f = open("test.txt","r")
35 | i = 0
36 | for line in f.readlines():
37 | line = line.strip("\n").split(",")
38 | if i>0:
39 | Y_test.append(int(float(line[2])/5))
40 | del line[0]
41 | del line[0]
42 | del line[0]
43 | x_test.append(line)
44 | i += 1
45 | x2=np.array(x_test)
46 | y2=np.array(Y_test)
47 | f.close()
48 |
49 | return (x1, y1), (x2, y2)
50 |
51 | # the data, shuffled and split between train and test sets
52 | (X_train, y_train), (X_test, y_test) = load_data()
53 | X_train = X_train.reshape(1858, 26)
54 | X_test = X_test.reshape(200, 26)
55 | X_train = X_train.astype('float32')
56 | X_test = X_test.astype('float32')
57 | X_train /= 255
58 | X_test /= 255
59 | print(X_train.shape[0], 'train samples')
60 | print(X_test.shape[0], 'test samples')
61 | # convert class vectors to binary class matrices
62 | Y_train = np_utils.to_categorical(y_train, nb_classes)
63 | Y_test = np_utils.to_categorical(y_test, nb_classes)
64 |
65 | model = Sequential()
66 | model.add(Dense(16, input_shape=(26,)))
67 | model.add(Activation('relu'))
68 | model.add(Dropout(0.1))
69 | model.add(Dense(output_dim=247))
70 | model.add(Activation('relu'))
71 | model.add(Dropout(0.1))
72 | model.add(Dense(output_dim=125))
73 | model.add(Activation('relu'))
74 | model.add(Dropout(0.2))
75 | model.add(Dense(output_dim=20))
76 | model.add(Activation('softmax'))
77 |
78 | adagrad=Adagrad(lr=0.02, epsilon=1e-4)
79 | #model.compile(loss='categorical_crossentropy',optimizer=RMSprop(),metrics=['accuracy'])
80 | model.compile(loss='categorical_crossentropy',optimizer=adagrad,metrics=['accuracy'])
81 | history = model.fit(X_train, Y_train,batch_size=batch_size,
82 | nb_epoch=nb_epoch,verbose=1,
83 | validation_data=(X_test, Y_test))
84 | # optionally print the predicted classes
85 | # result = []
86 | # result = model.predict_classes(X_test,batch_size=batch_size,verbose=1)
87 | #
88 | # for r in result:
89 | # print r
90 |
91 | score = model.evaluate(X_test, Y_test, verbose=1)
92 | # print('Test score:', score[0])
93 | print('Test accuracy:', score[1])
94 | print "end"
95 |
96 | # # save the model architecture
97 | # json_string = model.to_json()
98 | # open("model/model_json.json","w").write(json_string)
99 | # # save the weights (currently raises an error)
100 | # model.save_weights("model/model_json_weight.h5")
101 | # # load the saved model
102 | # model = model_from_json(open("model/model_json.json").read())
103 | # model.load_weights("model/model_json_weight.h5")
104 |
105 |
106 |
107 |
108 |
109 |
110 |
111 |
--------------------------------------------------------------------------------
/Keras/README.md:
--------------------------------------------------------------------------------
1 | Keras handwritten character recognition demo
2 |
3 | 1. About Keras
4 |
5 | Keras is a deep learning framework built on top of Theano and TensorFlow. Its design was inspired by Torch; it is written in Python, is a highly modular neural network library, and supports both GPU and CPU.
6 |
7 | 2. The Keras Sequential model
8 |
9 | Keras has two kinds of models, the sequential model (Sequential) and the functional model (Model):
10 |
11 | 2.1 A Sequential model is a linear stack of layers; it can be built by passing a list of layers to the Sequential constructor:
12 | from keras.models import Sequential
13 | from keras.layers import Dense, Activation
14 | model = Sequential([Dense(32, input_dim=784),Activation('relu'),Dense(10),Activation('softmax'),])
15 | Layers can also be added one at a time with .add():
16 | model = Sequential()
17 | model.add(Dense(32, input_dim=784))
18 | model.add(Activation('relu'))
19 |
20 | 2.2 The model needs to know the shape of its input
21 | The first layer of a Sequential model must be given the input shape; later layers can infer the shapes of intermediate data automatically, so the parameter does not need to be specified for every layer.
22 |
23 | There are several ways to specify the input shape for the first layer:
24 | 1. Pass an input_shape keyword argument to the first layer. input_shape is a tuple; entries may be None, meaning any positive integer is allowed at that position. The batch size should not be included.
25 | 2. Pass a batch_input_shape keyword argument to the first layer, which does include the batch size. This is useful when a fixed batch size is needed, for example with stateful RNNs. Internally, Keras converts input_shape into batch_input_shape by prepending a None.
26 | 3. Some 2D layers, such as Dense, can specify the input shape implicitly through input_dim. Some 3D temporal layers accept input_dim and input_length.
27 | The following three ways of specifying the input shape are strictly equivalent:
28 | 1.model = Sequential()
29 | model.add(Dense(32, input_shape=(784,)))
30 | 2.model = Sequential()
31 | model.add(Dense(32, batch_input_shape=(None, 784)))
32 | 3.model = Sequential()
33 | model.add(Dense(32, input_dim=784))
34 |
35 | 2.3 Commonly used Sequential methods
36 |
37 | 2.3.1 compile
38 | compile(self, optimizer, loss, metrics=[], sample_weight_mode=None)
39 | compile configures the model's learning process; its parameters are:
40 | optimizer: a string (name of a predefined optimizer) or an optimizer object
41 | loss: a string (name of a predefined loss) or an objective function
42 | metrics: a list of metrics used to evaluate the network during training and testing; a typical usage is metrics=['accuracy']
43 | sample_weight_mode: set to "temporal" if samples must be weighted per time step (a 2D weight matrix); the default "None" means one weight per sample (1D weights). See the notes on fit below.
44 | kwargs: ignored with the TensorFlow backend; with Theano the kwargs are passed on to K.function
45 | Example:
46 | model = Sequential()
47 | model.add(Dense(32, input_shape=(500,)))
48 | model.add(Dense(10, activation='softmax'))
49 | model.compile(optimizer='rmsprop',loss='categorical_crossentropy',metrics=['accuracy'])
50 |
51 | 2.3.2 fit
52 | fit(self, x, y, batch_size=32, nb_epoch=10, verbose=1, callbacks=[], validation_split=0.0, validation_data=None, shuffle=True, class_weight=None, sample_weight=None)
53 | fit trains the model for nb_epoch epochs; its parameters are (a combined compile/fit sketch follows this subsection):
54 | x: input data. If the model has a single input, x is a numpy array; if it has several inputs, x should be a list of numpy arrays, one per input
55 | y: labels, a numpy array
56 | batch_size: integer, the number of samples per gradient update; each batch is used for one step of gradient descent on the objective
57 | nb_epoch: integer, the number of epochs; the training data is iterated over nb_epoch times (in Keras, names starting with nb mean "number of")
58 | verbose: logging mode; 0 = silent, 1 = progress bar, 2 = one line per epoch
59 | callbacks: a list of keras.callbacks.Callback objects, invoked at the appropriate points during training
60 | validation_split: a float between 0 and 1, the fraction of the training data held out as a validation set; it is not used for training, and the model's metrics (loss, accuracy, ...) are evaluated on it after every epoch
61 | validation_data: a tuple (X, y) to use as an explicit validation set; overrides validation_split
62 | shuffle: boolean or string; normally a boolean indicating whether to shuffle the samples during training. The string "batch" is a special case for HDF5 data, shuffling within each batch
63 | class_weight: a dict mapping classes to weights, used to scale the loss during training (training only)
64 | sample_weight: a numpy array of weights used to scale the loss during training (training only). Either a 1D array of the same length as the samples (one weight per sample), or, for temporal data, a matrix of shape (samples, sequence_length) giving one weight per time step; in that case make sure sample_weight_mode='temporal' was passed to compile.
65 |
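A short combined sketch of compile() and fit() with the parameters described above, using the 26-feature blood-test shape that the scripts in this folder use; the data arrays here are zero-filled placeholders:

```
# Illustration of compile() + fit() (Keras 1.x style, matching the rest of this repo).
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Activation

model = Sequential()
model.add(Dense(32, input_shape=(26,)))
model.add(Activation('relu'))
model.add(Dense(2))
model.add(Activation('softmax'))

model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

X, Y = np.zeros((100, 26)), np.zeros((100, 2))   # placeholders: features and one-hot labels
model.fit(X, Y, batch_size=32, nb_epoch=10, verbose=1, validation_split=0.1)
```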
66 | 3. Commonly used Keras layers
67 |
68 | 3.1 Dense
69 | Dense(output_dim, init='glorot_uniform', activation='linear', weights=None, W_regularizer=None, b_regularizer=None, activity_regularizer=None, W_constraint=None, b_constraint=None, bias=True, input_dim=None)
70 | Dense is the ordinary fully connected layer; a usage example:
71 | model = Sequential()
72 | model.add(Dense(32, input_dim=16))
73 |
74 | model = Sequential()
75 | model.add(Dense(32, input_shape=(16,)))
76 |
77 | model.add(Dense(32))
78 | Some common parameters:
79 | output_dim: a positive integer, the output dimension of the layer. For fully connected layers other than the first, the input dimension is inferred automatically and need not be specified.
80 | init: the initialization scheme, either the name of a predefined initializer or a Theano function used to initialize the weights. Only meaningful when the weights argument is not given.
81 | activation: the activation function, either the name of a predefined activation or an element-wise Theano function. If omitted, no activation is applied (i.e. the linear activation a(x)=x).
82 | input_dim: integer, the input dimension. When Dense is the first layer of the network, either this or input_shape must be given.
83 |
84 | 3.2 Activation
85 | Activation(activation)
86 | The activation layer applies an activation function to the output of the previous layer.
87 |
88 | 3.3 Dropout
89 | Dropout(p)
90 | Applies Dropout to the input. During training, each parameter update randomly drops a fraction p of the input units; Dropout is used to prevent overfitting.
91 |
92 | 3.4 Flatten
93 | Flatten()
94 | Flatten "flattens" the input, i.e. turns a multi-dimensional input into a 1D one; it is commonly used in the transition from convolutional layers to fully connected layers. Flatten does not affect the batch size. An example:
95 | model = Sequential()
96 | model.add(Convolution2D(64, 3, 3, border_mode='same', input_shape=(3, 32, 32)))
97 | # output shape == (None, 64, 32, 32)
98 | model.add(Flatten())
99 | # output shape == (None, 65536)
100 |
101 | 3.5 Convolution1D
102 | Convolution1D(nb_filter, filter_length, init='uniform', activation='linear', weights=None, border_mode='valid', subsample_length=1, W_regularizer=None, b_regularizer=None, activity_regularizer=None, W_constraint=None, b_constraint=None, bias=True, input_dim=None, input_length=None)
103 | A 1D convolution layer for neighborhood filtering over one-dimensional input signals. When used as the first layer, input_dim or input_shape must be provided.
104 |
105 | 3.6 Convolution2D
106 | A 2D convolution layer performing sliding-window convolution over 2D input; when used as the first layer, input_shape must be provided.
107 |
108 | 3.7 MaxPooling1D
109 | MaxPooling1D(pool_length=2, stride=None, border_mode='valid')
110 | Max pooling over temporal 1D signals.
111 | Parameters:
112 | pool_length: the downsampling factor; 2 halves the input length
113 | stride: integer or None, the stride
114 | border_mode: 'valid' or 'same'
115 |
116 | 4. Dataset
117 |
118 | The dataset for kerashandwritetest.py can be downloaded from: http://pan.baidu.com/s/1nvEuc8D
119 |
120 | ## CNN-based gender and age prediction with Keras
121 | This pull request implements gender prediction with Keras.
122 |
123 | - Gender prediction: the CNN splits the data in train.csv into two parts; after a while the accuracy on the training set itself becomes very high, but accuracy on the prediction set stays at 73%-78%. Training for longer actually makes the test accuracy drop.
124 |
125 | - Age prediction, however, has too little training data and too many classes, so no clear accuracy gain has been obtained yet; it only reaches 26%-30% on a 1-in-10 choice.
126 |
127 |
128 | ### Other brief notes
129 |
130 | 1. gender-predict-cnn implements a basic CNN gender prediction algorithm.
131 |
132 | 2. The next steps are parameter tuning and trying Inception-v4 or ResNet/VGG for gender or age prediction (which can improve accuracy when the data is incomplete).
133 |
134 | 3. (Age prediction can be tried by changing the code as indicated in the comments of gender-predict-cnn.)
135 |
136 | 4. The dataset is train.csv.
137 |
138 | 5. See the comments and the notes earlier in this section for what each function does.
140 |
--------------------------------------------------------------------------------
/Keras/gender_age_predict_cnn.py:
--------------------------------------------------------------------------------
1 | #-*- coding: UTF-8 -*-
2 |
3 | import numpy as np
4 | from keras.utils import np_utils
5 | from keras.models import Sequential
6 | from keras.layers import Dense,Activation,Convolution2D,MaxPooling2D,Flatten
7 | from keras.optimizers import Adam
8 | import csv
9 | import string
10 |
11 | # gender is a 2-class problem, age a 10-class problem
12 | age = 10
13 | gender = 2
14 |
15 | # change this to choose between predicting gender and age
16 | # (age accuracy is, frankly, poor)
17 | test_what = gender
18 |
19 | # boundary between the training and prediction splits
20 | splitor=1400
21 |
22 | # load the data
23 | age_orign = []
24 | data_orign = []
25 | sex_orign = []
26 | with open('train.csv','rb') as precsv:
27 | reader = csv.reader(precsv)
28 | for line in reader:
29 | # skip the header row
30 | if reader.line_num == 1:
31 | continue
32 | if(line[1] == '\xc4\xd0'):
33 | sex_orign.append(0) # gender label
34 | else:
35 | sex_orign.append(1)
36 | age_orign.append(int(float(line[2])/10)) # age, bucketed into 10-year groups
37 | data_orign.append(line[4:]) # blood test values
38 |
39 | # split the data into training and prediction sets
40 | age_train = np.array(age_orign[:splitor])
41 | data_train = np.array(data_orign[:splitor])
42 | sex_train = np.array(sex_orign[:splitor])
43 |
44 | age_predict = np.array(age_orign[splitor:])
45 | data_predict = np.array(data_orign[splitor:])
46 | sex_predict = np.array(sex_orign[splitor:])
47 |
48 | # dimensionality of the data (number of features per record)
49 | data_dim = data_train.shape[1]
50 |
51 |
52 | if test_what == age:
53 | XT = data_train.reshape(-1,data_dim,1,1)
54 | YT = np_utils.to_categorical(age_train,nb_classes=age)
55 | XT2 = data_predict.reshape(-1,data_dim,1,1)
56 | YT2 = np_utils.to_categorical(age_predict,nb_classes=age)
57 | output_dim = age
58 | loss_str = 'categorical_crossentropy'
59 | else:
60 | XT = data_train.reshape(-1,data_dim,1,1)
61 | YT = np_utils.to_categorical(sex_train,nb_classes=gender)
62 | XT2 = data_predict.reshape(-1,data_dim,1,1)
63 | YT2 = np_utils.to_categorical(sex_predict,nb_classes=gender)
64 | output_dim = gender
65 | loss_str = 'binary_crossentropy'
66 |
67 | #
68 | model = Sequential()
69 |
70 | #
71 | model.add( Convolution2D(
72 | nb_filter=data_dim*data_dim,
73 | nb_row=5,
74 | nb_col=5,
75 | border_mode='same',
76 | input_shape=(data_dim,1,1)
77 | ))
78 | model.add(Activation('relu'))
79 |
80 | # pooling
81 |
82 | model.add( MaxPooling2D(
83 | pool_size=(2,2),
84 | strides=(2,2),
85 | border_mode='same'
86 | ))
87 |
88 | model.add( Convolution2D(64,5,5,border_mode='same'))
89 | model.add( Flatten())
90 | model.add( Dense(1024) )
91 | #model.add( Activation('relu'))
92 |
93 | model.add( Activation('relu'))
94 | model.add(Dense(output_dim))
95 | model.add( Activation('softmax'))
96 | adam = Adam(lr=0.0001)
97 | model.compile(
98 | loss=loss_str,
99 | optimizer=adam,
100 | metrics=['accuracy']
101 | )
102 |
103 | model.fit(XT,YT,nb_epoch=100,batch_size=32)
104 |
105 | print '===='
106 | loss,accuracy = model.evaluate(XT2,YT2)
107 | print loss
108 | print accuracy
109 |
110 |
111 |
112 |
113 |
--------------------------------------------------------------------------------
/Keras/kerashandwritetest.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from keras.preprocessing.image import ImageDataGenerator
3 | from keras.models import Sequential
4 | from keras.layers.core import Dense, Dropout, Activation, Flatten
5 | from keras.layers.advanced_activations import PReLU
6 | from keras.layers.convolutional import Convolution2D, MaxPooling2D
7 | from keras.optimizers import SGD, Adadelta, Adagrad
8 | from keras.utils import np_utils, generic_utils
9 | from six.moves import range
10 | import random
11 | import os
12 | from PIL import Image
13 | import numpy as np
14 | from keras import backend
15 | backend.set_image_dim_ordering('th')
16 |
17 | # Read the 42000 images in the mnist folder; they are grayscale (1 channel), 28*28 pixels.
18 | # For color input, replace the 1 with 3 and change data[i,:,:,:] = arr to data[i,:,:,:] = [arr[:,:,0],arr[:,:,1],arr[:,:,2]]
19 | def load_data():
20 | data = np.empty((42000,1,28,28),dtype="float32")
21 | label = np.empty((42000,),dtype="uint8")
22 |
23 | imgs = os.listdir("./mnist")
24 | num = len(imgs)
25 | for i in range(num):
26 | if '.jpg' in imgs[i]:
27 | img = Image.open("./mnist/"+imgs[i])
28 | arr = np.asarray(img,dtype="float32")
29 | data[i,:,:,:] = arr
30 | label[i] = int(imgs[i].split('.')[0])
31 | return data,label
32 |
33 | # load the data
34 | data, label = load_data()
35 | # shuffle the data
36 | index = [i for i in range(len(data))]
37 | random.shuffle(index)
38 | data = data[index]
39 | label = label[index]
40 | print(data.shape[0], ' samples')
41 |
42 | # labels are the 10 classes 0-9; Keras wants binary class matrices, so convert them with the helper Keras provides
43 | label = np_utils.to_categorical(label, 10)
44 |
45 | # build the CNN model
46 | # create a model; it could equally be built by passing a list of layers to Sequential
47 | model = Sequential()
48 |
49 | # first convolution layer: 4 kernels of size 5*5; the 1 is the number of input channels (grayscale = 1)
50 | # border_mode can be 'valid' or 'full'; 'valid' only applies the filter to complete image patches
51 | model.add(Convolution2D(4, 5, 5, border_mode='valid', input_shape=(1,28,28)))# input_shape can be inferred for later layers and does not need to be given for each of them
52 | model.add(Activation('tanh'))
53 | #model.add(Dropout(0.5))# during training, randomly drop a fraction of the connections at every update to avoid overfitting; dropped units contribute nothing to downstream neurons in the forward pass and receive no weight updates in the backward pass
54 |
55 | # second convolution layer: 8 kernels of size 3*3; the 4 input feature maps equal the number of kernels in the previous layer
56 | # tanh activation
57 | # max pooling with pool size (2,2)
58 | model.add(Convolution2D(8, 3, 3, border_mode='valid'))
59 | model.add(Activation('tanh'))
60 | model.add(MaxPooling2D(pool_size=(2, 2)))
61 |
62 | # third convolution layer: 16 kernels of size 3*3
63 | # tanh activation
64 | # max pooling with pool size (2,2): downsampling in both directions (vertical and horizontal); a (2,2) pooling halves each spatial dimension, leaving a quarter of the original input
65 | model.add(Convolution2D(16, 3, 3, border_mode='valid'))
66 | model.add(Activation('tanh'))
67 | model.add(MaxPooling2D(pool_size=(2, 2)))
68 |
69 | # fully connected part: first flatten the 2D feature maps from the previous layer into 1D (the usual transition from convolutional to dense layers)
70 | # Dense is the hidden layer; 16 is the number of feature maps output by the previous layer, and the 4 follows from the convolutions: (28-5+1)=24, (24-3+1)/2=11, (11-3+1)/2=4
71 | # the fully connected layer has 128 neurons
72 | model.add(Flatten())
73 | model.add(Dense(input_dim=256, output_dim=128))#256=16*4*4
74 | model.add(Activation('tanh'))
75 |
76 | # softmax classifier with 10 output classes
77 | model.add(Dense(input_dim=128, output_dim=10))
78 | model.add(Activation('softmax'))
79 |
80 | # train the model
81 | # SGD optimizer: lr is the learning rate, momentum the update momentum, decay the per-epoch learning-rate decay, and nesterov (True/False) toggles Nesterov momentum
82 | # in model.compile, loss is the objective function, optimizer the optimizer to use, and metrics a list of metrics used to evaluate the network during training and testing
83 | sgd = SGD(lr=0.05, decay=1e-6, momentum=0.9, nesterov=True)
84 | model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])
85 |
86 |
87 | # calling fit runs the training loop
88 | # arguments: the input data, the labels, the batch size per gradient step, the number of epochs, whether to shuffle, the log verbosity (0 silent, 1 progress bar, 2 one line per epoch), whether to report accuracy, and the fraction held out for validation
89 | model.fit(data, label, batch_size=100, nb_epoch=1,shuffle=True,verbose=1,show_accuracy=True,validation_split=0.2)
90 |
91 |
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
111 |
112 |
--------------------------------------------------------------------------------
/Keras/train.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import numpy as np
3 | np.random.seed(1337) # for reproducibility
4 | from keras.models import Sequential
5 | from keras.layers.core import Dense, Dropout, Activation
6 | from keras.optimizers import SGD, Adam, RMSprop
7 | from keras.utils import np_utils
8 | from time import sleep
9 | batch_size = 128
10 | nb_classes = 2
11 | nb_epoch = 100
12 | def load_data():
13 | x_train=[]
14 | Y_train=[]
15 | x_test=[]
16 | Y_test=[]
17 |
18 | f = open("train.txt","r")
19 | i = 0
20 | for line in f.readlines():
21 | line = line.split(",")
22 | if i>0:
23 | if line[1] == "男":
24 | Y_train.append(0)
25 | else:
26 | Y_train.append(1)
27 | del line[0]
28 | del line[0]
29 | del line[0]
30 | x_train.append(line)
31 | i += 1
32 | x1=np.array(x_train)
33 | y1=np.array(Y_train)
34 | f.close()
35 |
36 | f = open("test.txt","r")
37 | i = 0
38 | for line in f.readlines():
39 | line = line.split(",")
40 | if i>0:
41 | if line[1] == "男":
42 | Y_test.append(0)
43 | else:
44 | Y_test.append(1)
45 | del line[0]
46 | del line[0]
47 | del line[0]
48 | x_test.append(line)
49 | i += 1
50 | x2=np.array(x_test)
51 | y2=np.array(Y_test)
52 | f.close()
53 |
54 | return (x1, y1), (x2, y2)
55 |
56 | (X_train, y_train), (X_test, y_test) = load_data()
57 | X_train = X_train.reshape(1858, 26)
58 | X_test = X_test.reshape(200, 26)
59 | X_train = X_train.astype('float32')
60 | X_test = X_test.astype('float32')
61 | X_train /= 255
62 | X_test /= 255
63 |
64 | print(X_train.shape[0], 'train samples')
65 | print(X_test.shape[0], 'test samples')
66 |
67 | Y_train = np_utils.to_categorical(y_train, nb_classes)
68 | Y_test = np_utils.to_categorical(y_test, nb_classes)
69 | # three-layer network; the hidden layers have 32 nodes each
70 | model = Sequential()
71 | model.add(Dense(32, input_shape=(26,)))
72 | model.add(Activation('relu'))
73 | model.add(Dropout(0.2))
74 | model.add(Dense(output_dim=32))
75 | model.add(Activation('relu'))
76 | model.add(Dropout(0.2))
77 | model.add(Dense(output_dim=2))
78 | model.add(Activation('softmax'))
79 |
80 | model.compile(loss='categorical_crossentropy',
81 | optimizer=RMSprop(),
82 | metrics=['accuracy'])
83 | history = model.fit(X_train, Y_train,
84 | batch_size=batch_size, nb_epoch=nb_epoch,
85 | verbose=1, validation_data=(X_test, Y_test))
86 | # optionally print the predictions
87 | '''
88 | result = []
89 | result = model.predict_classes(X_test,batch_size=batch_size,verbose=1)
90 | for r in result:
91 | print r
92 | '''
93 |
94 | score = model.evaluate(X_test, Y_test, verbose=1)
95 |
96 | #print('Test score:', score[0])
97 | print('Test accuracy:', score[1])
98 | print "end"
99 |
100 |
101 |
102 |
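As a small, hedged addendum continuing from the `model` trained above: the Sequential model can be persisted with the same Keras 1.x calls that appear (commented out) in KerasDistinguishAge.py; saving weights requires the h5py package.

```
# Sketch: save the architecture as JSON and the weights as HDF5, then restore both.
from keras.models import model_from_json

json_string = model.to_json()
open("model_json.json", "w").write(json_string)
model.save_weights("model_json_weight.h5")      # needs h5py

model2 = model_from_json(open("model_json.json").read())
model2.load_weights("model_json_weight.h5")
```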
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction, and
10 | distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by the copyright
13 | owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all other entities
16 | that control, are controlled by, or are under common control with that entity.
17 | For the purposes of this definition, "control" means (i) the power, direct or
18 | indirect, to cause the direction or management of such entity, whether by
19 | contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the
20 | outstanding shares, or (iii) beneficial ownership of such entity.
21 |
22 | "You" (or "Your") shall mean an individual or Legal Entity exercising
23 | permissions granted by this License.
24 |
25 | "Source" form shall mean the preferred form for making modifications, including
26 | but not limited to software source code, documentation source, and configuration
27 | files.
28 |
29 | "Object" form shall mean any form resulting from mechanical transformation or
30 | translation of a Source form, including but not limited to compiled object code,
31 | generated documentation, and conversions to other media types.
32 |
33 | "Work" shall mean the work of authorship, whether in Source or Object form, made
34 | available under the License, as indicated by a copyright notice that is included
35 | in or attached to the work (an example is provided in the Appendix below).
36 |
37 | "Derivative Works" shall mean any work, whether in Source or Object form, that
38 | is based on (or derived from) the Work and for which the editorial revisions,
39 | annotations, elaborations, or other modifications represent, as a whole, an
40 | original work of authorship. For the purposes of this License, Derivative Works
41 | shall not include works that remain separable from, or merely link (or bind by
42 | name) to the interfaces of, the Work and Derivative Works thereof.
43 |
44 | "Contribution" shall mean any work of authorship, including the original version
45 | of the Work and any modifications or additions to that Work or Derivative Works
46 | thereof, that is intentionally submitted to Licensor for inclusion in the Work
47 | by the copyright owner or by an individual or Legal Entity authorized to submit
48 | on behalf of the copyright owner. For the purposes of this definition,
49 | "submitted" means any form of electronic, verbal, or written communication sent
50 | to the Licensor or its representatives, including but not limited to
51 | communication on electronic mailing lists, source code control systems, and
52 | issue tracking systems that are managed by, or on behalf of, the Licensor for
53 | the purpose of discussing and improving the Work, but excluding communication
54 | that is conspicuously marked or otherwise designated in writing by the copyright
55 | owner as "Not a Contribution."
56 |
57 | "Contributor" shall mean Licensor and any individual or Legal Entity on behalf
58 | of whom a Contribution has been received by Licensor and subsequently
59 | incorporated within the Work.
60 |
61 | 2. Grant of Copyright License.
62 |
63 | Subject to the terms and conditions of this License, each Contributor hereby
64 | grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free,
65 | irrevocable copyright license to reproduce, prepare Derivative Works of,
66 | publicly display, publicly perform, sublicense, and distribute the Work and such
67 | Derivative Works in Source or Object form.
68 |
69 | 3. Grant of Patent License.
70 |
71 | Subject to the terms and conditions of this License, each Contributor hereby
72 | grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free,
73 | irrevocable (except as stated in this section) patent license to make, have
74 | made, use, offer to sell, sell, import, and otherwise transfer the Work, where
75 | such license applies only to those patent claims licensable by such Contributor
76 | that are necessarily infringed by their Contribution(s) alone or by combination
77 | of their Contribution(s) with the Work to which such Contribution(s) was
78 | submitted. If You institute patent litigation against any entity (including a
79 | cross-claim or counterclaim in a lawsuit) alleging that the Work or a
80 | Contribution incorporated within the Work constitutes direct or contributory
81 | patent infringement, then any patent licenses granted to You under this License
82 | for that Work shall terminate as of the date such litigation is filed.
83 |
84 | 4. Redistribution.
85 |
86 | You may reproduce and distribute copies of the Work or Derivative Works thereof
87 | in any medium, with or without modifications, and in Source or Object form,
88 | provided that You meet the following conditions:
89 |
90 | You must give any other recipients of the Work or Derivative Works a copy of
91 | this License; and
92 | You must cause any modified files to carry prominent notices stating that You
93 | changed the files; and
94 | You must retain, in the Source form of any Derivative Works that You distribute,
95 | all copyright, patent, trademark, and attribution notices from the Source form
96 | of the Work, excluding those notices that do not pertain to any part of the
97 | Derivative Works; and
98 | If the Work includes a "NOTICE" text file as part of its distribution, then any
99 | Derivative Works that You distribute must include a readable copy of the
100 | attribution notices contained within such NOTICE file, excluding those notices
101 | that do not pertain to any part of the Derivative Works, in at least one of the
102 | following places: within a NOTICE text file distributed as part of the
103 | Derivative Works; within the Source form or documentation, if provided along
104 | with the Derivative Works; or, within a display generated by the Derivative
105 | Works, if and wherever such third-party notices normally appear. The contents of
106 | the NOTICE file are for informational purposes only and do not modify the
107 | License. You may add Your own attribution notices within Derivative Works that
108 | You distribute, alongside or as an addendum to the NOTICE text from the Work,
109 | provided that such additional attribution notices cannot be construed as
110 | modifying the License.
111 | You may add Your own copyright statement to Your modifications and may provide
112 | additional or different license terms and conditions for use, reproduction, or
113 | distribution of Your modifications, or for any such Derivative Works as a whole,
114 | provided Your use, reproduction, and distribution of the Work otherwise complies
115 | with the conditions stated in this License.
116 |
117 | 5. Submission of Contributions.
118 |
119 | Unless You explicitly state otherwise, any Contribution intentionally submitted
120 | for inclusion in the Work by You to the Licensor shall be under the terms and
121 | conditions of this License, without any additional terms or conditions.
122 | Notwithstanding the above, nothing herein shall supersede or modify the terms of
123 | any separate license agreement you may have executed with Licensor regarding
124 | such Contributions.
125 |
126 | 6. Trademarks.
127 |
128 | This License does not grant permission to use the trade names, trademarks,
129 | service marks, or product names of the Licensor, except as required for
130 | reasonable and customary use in describing the origin of the Work and
131 | reproducing the content of the NOTICE file.
132 |
133 | 7. Disclaimer of Warranty.
134 |
135 | Unless required by applicable law or agreed to in writing, Licensor provides the
136 | Work (and each Contributor provides its Contributions) on an "AS IS" BASIS,
137 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied,
138 | including, without limitation, any warranties or conditions of TITLE,
139 | NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are
140 | solely responsible for determining the appropriateness of using or
141 | redistributing the Work and assume any risks associated with Your exercise of
142 | permissions under this License.
143 |
144 | 8. Limitation of Liability.
145 |
146 | In no event and under no legal theory, whether in tort (including negligence),
147 | contract, or otherwise, unless required by applicable law (such as deliberate
148 | and grossly negligent acts) or agreed to in writing, shall any Contributor be
149 | liable to You for damages, including any direct, indirect, special, incidental,
150 | or consequential damages of any character arising as a result of this License or
151 | out of the use or inability to use the Work (including but not limited to
152 | damages for loss of goodwill, work stoppage, computer failure or malfunction, or
153 | any and all other commercial damages or losses), even if such Contributor has
154 | been advised of the possibility of such damages.
155 |
156 | 9. Accepting Warranty or Additional Liability.
157 |
158 | While redistributing the Work or Derivative Works thereof, You may choose to
159 | offer, and charge a fee for, acceptance of support, warranty, indemnity, or
160 | other liability obligations and/or rights consistent with this License. However,
161 | in accepting such obligations, You may act only on Your own behalf and on Your
162 | sole responsibility, not on behalf of any other Contributor, and only if You
163 | agree to indemnify, defend, and hold each Contributor harmless for any liability
164 | incurred by, or claims asserted against, such Contributor by reason of your
165 | accepting any such warranty or additional liability.
166 |
167 | END OF TERMS AND CONDITIONS
168 |
169 | APPENDIX: How to apply the Apache License to your work
170 |
171 | To apply the Apache License to your work, attach the following boilerplate
172 | notice, with the fields enclosed by brackets "{}" replaced with your own
173 | identifying information. (Don't include the brackets!) The text should be
174 | enclosed in the appropriate comment syntax for the file format. We also
175 | recommend that a file or class name and description of purpose be included on
176 | the same "printed page" as the copyright notice for easier identification within
177 | third-party archives.
178 |
179 | Copyright 2016 mengning
180 |
181 | Licensed under the Apache License, Version 2.0 (the "License");
182 | you may not use this file except in compliance with the License.
183 | You may obtain a copy of the License at
184 |
185 | http://www.apache.org/licenses/LICENSE-2.0
186 |
187 | Unless required by applicable law or agreed to in writing, software
188 | distributed under the License is distributed on an "AS IS" BASIS,
189 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
190 | See the License for the specific language governing permissions and
191 | limitations under the License.
192 |
--------------------------------------------------------------------------------
/MxNet/README.md:
--------------------------------------------------------------------------------
1 | *MXNet is a lightweight, distributed, portable deep learning platform that supports multi-machine, multi-node and multi-GPU computation. Its openMP + MPI/SSH + CUDA/cuDNN stack makes it fast, and it can be combined with distributed file systems for deep learning on large data.*
2 | ## Single-node MXNet installation
3 | **1. Install the basic dependencies**
4 | ```
5 | sudo apt-get update
6 | ```
7 |
8 | ```
9 | sudo apt-get install -y build-essential git libblas-dev libopencv-dev
10 | ```
11 | **2. Download mxnet**
12 | ```
13 | git clone --recursive https://github.com/dmlc/mxnet.git
14 | ```
15 | **3. Install CUDA**
16 | ```
17 | See http://blog.csdn.net/xizero00/article/details/43227019 for details
18 | ```
19 | **4. Build MXNet with GPU support**
20 |
21 | In the mxnet/ directory, locate the mxnet/make/ subdirectory, copy its config.mk into mxnet/, open it in a text editor, and change the following two lines:
22 | ```
23 | USE_CUDA = 1
24 |
25 | USE_CUDA_PATH = /usr/local/cuda
26 | ```
27 | After the changes, build in the mxnet/ directory
28 | ```
29 | make -j4
30 | ```
31 | **5. Install the Python bindings**
32 | ```
33 | cd python;
34 |
35 | python setup.py install
36 | ```
37 | Sometimes setuptools and numpy need to be installed first (sudo apt-get install python-numpy).
38 | **6. Run the MNIST handwritten digit example**
39 | MXNet's bundled MNIST example can be found in mxnet/example/image-classification
40 | ```
41 | cd mxnet/example/image-classification
42 |
43 | python train_mnist.py
44 | ```
--------------------------------------------------------------------------------
/PaddlePaddle/README.md:
--------------------------------------------------------------------------------
1 |
2 | # PaddlePaddle image classification demo
3 |
4 |
5 |
6 |
7 |
8 | ## Installing PaddlePaddle
9 |
10 | ```
11 |
12 | # download the installation package
13 |
14 | wget https://github.com/PaddlePaddle/Paddle/releases/download/V0.8.0b1/paddle-cpu-0.8.0b1-Linux.deb
15 |
16 |
17 | # install
18 |
19 | gdebi paddle-cpu-0.8.0b1-Linux.deb
20 |
21 | If gdebi is not installed, install it first with sudo apt-get install gdebi.
22 |
23 | Alternatively, install with the following commands:
24 |
25 | dpkg -i paddle-cpu-0.8.0b1-Linux.deb
26 |
27 | apt-get install -f
28 | It is normal for dpkg -i to report missing dependencies; apt-get install -f then finishes installing PaddlePaddle.
29 |
30 | Official guide: http://www.paddlepaddle.org/doc_cn/build_and_install/install/ubuntu_install.html
31 |
32 |
33 | ```
34 |
35 |
36 | ## Downloading the MNIST dataset
37 |
38 | Download: https://pan.baidu.com/s/1kUNBkyz
39 |
40 | Create a data folder in the current directory and extract the train and test folders from MNIST.rar into it
41 |
42 | Note: this dataset consists of the images extracted from the original MNIST binaries, placed into train and test folders. You may add your own images to train and test, but then the image-size parameters in the source must be adjusted
43 |
44 |
45 |
46 | ## Training on MNIST
47 |
48 |
49 | ```
50 |
51 | sh preprocess.sh # runs preprocess.py to preprocess the data
52 |
53 | sh train.sh # trains via vgg.py; the script sets the model output path, number of trainer threads and other parameters
54 |
55 | python prediction.py # prediction; make sure model_path is set inside
56 |
57 | ```
58 |
59 |
60 |
61 | ## Training the gender model
62 |
63 |
64 | Before training, copy train.csv and predict.csv into the current directory
65 | ```
66 |
67 | sh train_sex.sh # trains with trainer_config_sex.py; set num_passes (about 30 passes brings the error rate down to roughly 30%)
68 |
69 | sh predict_sex.sh # predicts with trainer_config_sex.py; make sure model_path is set
70 |
71 | ```
72 |
73 |
74 |
75 | ## Training the age model
76 |
77 |
78 | Before training, copy train.csv and predict.csv into the current directory
79 | ```
80 |
81 | sh train_age.sh # trains with trainer_config_age.py; set num_passes (with 5-year age buckets the error rate is around 85% after 100 passes, and around 95% without bucketing)
82 |
83 | sh predict_age.sh # predicts with trainer_config_age.py; make sure model_path is set
84 |
85 | ```
86 |
87 |
88 |
89 | ## preprocess.py
90 |
91 |
92 | Preprocessing module; converts the images under data/ into PaddlePaddle's format
93 |
94 | The converted data is stored under data/batches
95 |
96 |
97 |
98 | ## vgg.py
99 |
100 |
101 | Training module; trains a VGG network (runner-up in the ILSVRC2014 image classification task)
102 |
103 | Trained models are stored under vgg_model/pass-n, where n is the pass number; each pass produces one model folder, and in principle models from later passes perform better
104 |
105 | Note: training on the CPU is slow (close to half an hour per pass). At the moment PaddlePaddle models trained on the CPU differ from those trained on the GPU, so a CPU-trained model can only be used for CPU prediction and a GPU-trained model only for GPU prediction; GPU prediction additionally requires the GPU build of PaddlePaddle, CUDA, cuDNN and an NVIDIA card, which is why the CPU version is used here
106 |
107 |
108 |
109 | ## prediction.py
110 |
111 |
112 | Prediction module; the image parameter is the path of the image to classify
113 |
114 |
115 |
116 | ## dataprovider.py
117 |
118 |
119 | Implements the data-feeding interface for PaddlePaddle; see the comments in dataprovider.py
120 |
121 |
122 |
123 | ## trainer_config_sex.py
124 |
125 |
126 | Network configuration for gender training
127 |
128 |
129 |
130 | ## trainer_config_age.py
131 |
132 |
133 | Network configuration for age training
134 |
135 |
136 |
137 | ## predict_age.sh & predict_sex.sh
138 |
139 |
140 | Prediction scripts; the result is saved to result.txt in the current directory, where the first number is the predicted label and the following numbers are the per-label probabilities
141 |
142 |
143 |
144 | ## prediction_age.py & prediction_sex.py
145 |
146 |
147 | Provide the prediction interface (they can also be run standalone); the input is a list of the form [[[0,1,2,...]]] and the output is the gender or age label (see the sketch below)
148 |
149 |
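To illustrate the [[[...]]] input shape mentioned above, here is a hedged sketch of turning one record from predict.csv into that form, mirroring what dataprovider.py does (columns 0-2 are id, sex and age; the remaining 26 columns are the blood-test values):

```
# Sketch: build the nested-list input expected by the prediction modules.
import csv

with open('predict.csv', 'rb') as f:
    row = list(csv.reader(f))[1]                      # first data row (row 0 is the header)
features = [float(v) for v in row[3:] if v != '']    # the 26 blood-test values
sample = [[features]]                                 # shape [[[v0, v1, ..., v25]]]
```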
150 |
151 | ## train.list & test.list
152 |
153 |
154 | Lists of the training and test files
155 |
156 |
157 |
158 | ## __init__.py
159 |
160 |
161 | Lets the files in A2 import the modules in this folder
--------------------------------------------------------------------------------
/PaddlePaddle/__init__.py:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/PaddlePaddle/dataprovider.py:
--------------------------------------------------------------------------------
1 | # -*- coding: UTF-8 -*-
2 | import io
3 | import random
4 | import paddle.utils.image_util as image_util
5 | from paddle.trainer.PyDataProvider2 import *
6 | import csv, cPickle  # cPickle is needed by processData below
7 |
8 | @provider(input_types=[
9 | # size of one training feature vector
10 | dense_vector(26),
11 | # number of label classes
12 | integer_value(2)
13 | ])
14 | # provides the gender training data
15 | def process_sex(settings, file_name):
16 | csvfile = file('train.csv', 'rb')
17 | reader = csv.reader(csvfile)
18 | for row in reader:
19 | if row[0]!='id':
20 | sex=0
21 | if(row[1]=='\xc4\xd0'):
22 | sex=1
23 | del row[0]
24 | del row[0]
25 | del row[0]
26 | pixels = []
27 | for j in row:
28 | if(j!=''):
29 | pixels.append(float(j))
30 | if(len(pixels)==26):
31 | yield pixels,int(sex)
32 | csvfile.close()
33 |
34 | def predict_initializer(settings, **kwargs):
35 | settings.input_types=[
36 | dense_vector(26)
37 | ]
38 | # provides the gender prediction data
39 | @provider(init_hook=predict_initializer, should_shuffle=False)
40 | def process_predict_sex(settings, file_name):
41 | csvfile = file('predict.csv', 'rb')
42 | reader = csv.reader(csvfile)
43 | rows= [row for row in reader]
44 | # predict on the first data row
45 | row=rows[1]
46 | sex='女'
47 | if(row[1]=='\xc4\xd0'):
48 | sex='男'
49 | print '实际性别:'+sex
50 | del row[0]
51 | del row[0]
52 | del row[0]
53 | pixels = []
54 | for j in row:
55 | pixels.append(float(j))
56 | if(len(pixels)==26):
57 | yield pixels
58 |
59 | @provider(input_types=[
60 | dense_vector(26),
61 | integer_value(100)
62 | ])
63 | # provides the age training data
64 | def process_age(settings, file_name):
65 | csvfile = file('train.csv', 'rb')
66 | reader = csv.reader(csvfile)
67 | for row in reader:
68 | if row[0]!='id':
69 | age=int(row[2])
70 | del row[0]
71 | del row[0]
72 | del row[0]
73 | pixels = []
74 | for j in row:
75 | if(j!=''):
76 | pixels.append(float(j))
77 | if(len(pixels)==26):
78 | yield pixels,age
79 | csvfile.close()
80 |
81 | def predict_initializer(settings, **kwargs):
82 | settings.input_types=[
83 | dense_vector(26)
84 | ]
85 | # provides the age prediction data
86 | @provider(init_hook=predict_initializer, should_shuffle=False)
87 | def process_predict_age(settings, file_name):
88 | csvfile = file('predict.csv', 'rb')
89 | reader = csv.reader(csvfile)
90 | rows= [row for row in reader]
91 | row=rows[1]
92 | print '实际年龄:'+row[2]
93 | del row[0]
94 | del row[0]
95 | del row[0]
96 | pixels = []
97 | for j in row:
98 | if(j!=''):
99 | pixels.append(float(j))
100 | if(len(pixels)==26):
101 | yield pixels
102 | csvfile.close()
103 |
104 | def hook(settings, img_size, mean_img_size, num_classes, color, meta, use_jpeg,
105 | is_train, **kwargs):
106 | settings.mean_img_size = mean_img_size
107 | settings.img_size = img_size
108 | settings.num_classes = num_classes
109 | settings.color = color
110 | settings.is_train = is_train
111 |
112 | if settings.color:
113 | settings.img_raw_size = settings.img_size * settings.img_size * 3
114 | else:
115 | settings.img_raw_size = settings.img_size * settings.img_size
116 |
117 | settings.meta_path = meta
118 | settings.use_jpeg = use_jpeg
119 |
120 | settings.img_mean = image_util.load_meta(settings.meta_path,
121 | settings.mean_img_size,
122 | settings.img_size,
123 | settings.color)
124 |
125 | settings.logger.info('Image size: %s', settings.img_size)
126 | settings.logger.info('Meta path: %s', settings.meta_path)
127 | '''
128 | PaddlePaddle的数据包括四种主要类型,和三种序列模式。其中,四种数据类型是
129 |
130 | dense_vector 表示稠密的浮点数向量。
131 | sparse_binary_vector 表示稀疏的零一向量,即大部分值为0,有值的位置只能取1
132 | sparse_float_vector 表示稀疏的向量,即大部分值为0,有值的部分可以是任何浮点数
133 | integer 表示整数标签。
134 | 而三种序列模式为
135 |
136 | SequenceType.NO_SEQUENCE 即不是一条序列
137 | SequenceType.SEQUENCE 即是一条时间序列
138 | SequenceType.SUB_SEQUENCE 即是一条时间序列,且序列的每一个元素还是一个时间序列。
139 | '''
140 | settings.input_types = [
141 | dense_vector(settings.img_raw_size), # image feature
142 | integer_value(settings.num_classes)] # labels
143 |
144 | settings.logger.info('DataProvider Initialization finished')
145 | '''
146 | @provider 是一个Python的 Decorator ,他可以将某一个函数标记成一个PyDataProvider
147 | PyDataProvider是PaddlePaddle使用Python提供数据的推荐接口。使用该接口用户可以只关注如何从文件中读取每一条数据,而不用关心数据如何传输给PaddlePaddle,数据如何存储等等。该数据接口使用多线程读取数据,并提供了简单的Cache功能
148 | init_hook可以传入一个函数。这个函数在初始化的时候会被调用。这个函数的参数是:
149 |
150 | 第一个参数是 settings 对象。这个对象和process的第一个参数一致。具有的属性有
151 | settings.input_types 设置输入类型。参考 input_types
152 | settings.logger 一个logging对象
153 | 其他参数都使用key word argument传入。这些参数包括paddle定义的参数,和用户传入的参数。
154 | Paddle定义的参数包括:
155 | is_train bool参数,表示这个DataProvider是训练用的DataProvider或者测试用的 DataProvider
156 | file_list 所有文件列表。
157 | 用户定义的参数使用args在训练配置中设置。
158 |
159 | 注意,PaddlePaddle保留添加参数的权力,所以init_hook尽量使用 **kwargs , 来接受不使用的 函数来保证兼容性。
160 | 详见http://www.paddlepaddle.org/doc_cn/ui/data_provider/pydataprovider2.html
161 | '''
162 | @provider(init_hook=hook)
163 | def processData(settings, file_name):
164 | """
165 | 加载数据
166 | 迭代每一批的所有图像和标签
167 | file_name: 批文件名
168 | """
169 | #使用pickle类来进行python对象的序列化,而cPickle提供了一个更快速简单的接口,如python文档所说的:“cPickle -- A faster pickle”
170 | data = cPickle.load(io.open(file_name, 'rb'))
171 | #list() 方法用于将元组转换为列表,元组与列表的区别在于元组的元素值不能修改,元组是放在括号中,列表是放于方括号中。
172 | indexes = list(range(len(data['images'])))
173 | if settings.is_train:
174 | random.shuffle(indexes)
175 | for i in indexes:
176 | if settings.use_jpeg == 1:
177 | img = image_util.decode_jpeg(data['images'][i])
178 | else:
179 | img = data['images'][i]
180 | #如果不是训练, 裁剪图像中心区域.否则随机裁剪,
181 | img_feat = image_util.preprocess_img(img, settings.img_mean,
182 | settings.img_size, settings.is_train,
183 | settings.color)
184 | label = data['labels'][i]
185 | '''
186 | 包含yield语句的函数会被特地编译成生成器。当函数被调用时,他们返回一个生成器对象
187 | 不像一般函数生成值后退出,生成器函数生成值后会自动挂起并暂停他们的执行和状态,他的本地变量将保存状态信息,这些信息在函数恢复时将再度有效
188 | 执行到 yield时,processData 函数就返回一个迭代值,下次迭代时,代码从 yield的下一条语句继续执行
189 | '''
190 | yield img_feat.tolist(), int(label)
191 |
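A condensed sketch of the PyDataProvider2 pattern used throughout this file (the three-feature size, the hook name and the comma-separated file layout are illustrative assumptions, not part of this project):

```python
from paddle.trainer.PyDataProvider2 import provider, dense_vector, integer_value

def init_hook(settings, **kwargs):
    # declare one dense feature vector plus one integer label
    settings.input_types = [dense_vector(3), integer_value(2)]

@provider(init_hook=init_hook, should_shuffle=True)
def process(settings, file_name):
    # yield one (features, label) pair per line of the file listed in train.list
    with open(file_name) as f:
        for line in f:
            values = map(float, line.strip().split(','))
            yield values[:3], int(values[3])
```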
--------------------------------------------------------------------------------
/PaddlePaddle/predict_age.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | paddle train \
5 | --config=trainer_config_age.py \
6 | --use_gpu=1 \
7 | --job=test \
8 | --init_model_path="output_age/pass-00099" \
9 | --config_args=is_predict=1 \
10 | --predict_output_dir=.
11 |
12 | mv rank-00000 result.txt
13 |
--------------------------------------------------------------------------------
/PaddlePaddle/predict_sex.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | paddle train \
5 | --config=trainer_config_sex.py \
6 | --use_gpu=1 \
7 | --job=test \
8 | --init_model_path="output_sex/pass-00029" \
9 | --config_args=is_predict=1 \
10 | --predict_output_dir=.
11 |
12 | mv rank-00000 result.txt
13 |
--------------------------------------------------------------------------------
/PaddlePaddle/prediction.py:
--------------------------------------------------------------------------------
1 | # -*- coding: UTF-8 -*-
2 | import os,sys
3 | import numpy as np
4 | import logging
5 | from PIL import Image
6 | from optparse import OptionParser
7 |
8 | import paddle.utils.image_util as image_util
9 |
10 | from py_paddle import swig_paddle, DataProviderConverter
11 | from paddle.trainer.PyDataProvider2 import dense_vector
12 | from paddle.trainer.config_parser import parse_config
13 |
14 | logging.basicConfig(format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s')
15 | logging.getLogger().setLevel(logging.INFO)
16 |
17 | class ImageClassifier():
18 | def __init__(self,
19 | train_conf,
20 | use_gpu=True,
21 | model_dir=None,
22 | resize_dim=None,
23 | crop_dim=None,
24 | mean_file=None,
25 | oversample=False,
26 | is_color=False):
27 | """
28 | train_conf: 网络配置文件
29 | model_dir: 模型路径
30 | resize_dim: 设为原图大小
31 | crop_dim: 图像裁剪大小,一般设为原图大小
32 | oversample: bool, oversample表示多次裁剪,这里禁用
33 | """
34 | self.train_conf = train_conf
35 | self.model_dir = model_dir
36 | if model_dir is None:
37 | self.model_dir = os.path.dirname(train_conf)
38 |
39 | self.resize_dim = resize_dim
40 | self.crop_dims = [crop_dim, crop_dim]
41 | self.oversample = oversample
42 | self.is_color = is_color
43 |
44 | self.transformer = image_util.ImageTransformer(is_color = is_color)
45 | self.transformer.set_transpose((2,0,1))
46 |
47 | self.mean_file = mean_file
48 | mean = np.load(self.mean_file)['data_mean']
49 | mean = mean.reshape(1, self.crop_dims[0], self.crop_dims[1])
50 | self.transformer.set_mean(mean) # mean pixel
51 | gpu = 1 if use_gpu else 0
52 | conf_args = "is_test=1,use_gpu=%d,is_predict=1" % (gpu)
53 | #使用 parse_config() 解析训练时的配置文件
54 | conf = parse_config(train_conf, conf_args)
55 | #PaddlePaddle目前使用Swig对其常用的预测接口进行了封装,使在Python环境下的预测接口更加简单
56 | #使用 swig_paddle.initPaddle() 传入命令行参数初始化 PaddlePaddle
57 | swig_paddle.initPaddle("--use_gpu=%d" % (int(use_gpu)))
58 | #使用 swig_paddle.GradientMachine.createFromConfigproto() 根据上一步解析好的配置创建神经网络
59 | self.network = swig_paddle.GradientMachine.createFromConfigProto(conf.model_config)
60 | assert isinstance(self.network, swig_paddle.GradientMachine)
61 | #从模型文件加载参数
62 | self.network.loadParameters(self.model_dir)
63 |
64 | data_size = 1 * self.crop_dims[0] * self.crop_dims[1]
65 | slots = [dense_vector(data_size)]
66 | '''
67 | 创建一个 DataProviderConverter 对象converter。
68 | swig_paddle接受的原始数据是C++的Matrix,也就是直接写内存的float数组。 这个接口并不用户友好。所以,我们提供了一个工具类DataProviderConverter。 这个工具类接收和PyDataProvider2一样的输入数据
69 | '''
70 | self.converter = DataProviderConverter(slots)
71 |
72 | def get_data(self, img_path):
73 | """
74 | 1. 读取图片.
75 | 2. resize 或 oversampling.
76 | 3. transformer data: transpose, sub mean.
77 | return K x H x W ndarray.
78 | """
79 | image = image_util.load_image(img_path, self.is_color)
80 | if self.oversample:
81 | # image_util.resize_image: short side is self.resize_dim
82 | image = image_util.resize_image(image, self.resize_dim)
83 | image = np.array(image)
84 | input = np.zeros((1, image.shape[0], image.shape[1],1),
85 | dtype=np.float32)
86 | if self.is_color:
87 | input[0] = image.astype(np.float32)
88 | else:
89 | for i in range(0,self.resize_dim):
90 | for j in range(0,self.resize_dim):
91 | input[0][i][j][0]=image[i][j]
92 | input = image_util.oversample(input, self.crop_dims)
93 | else:
94 | image = image.resize(self.crop_dims, Image.ANTIALIAS)
95 | image = np.array(image)
96 | input = np.zeros((1, self.crop_dims[0], self.crop_dims[1],1),
97 | dtype=np.float32)
98 | if self.is_color:
99 | input[0] = image.astype(np.float32)
100 | else:
101 | for i in range(0,self.resize_dim):
102 | for j in range(0,self.resize_dim):
103 | input[0][i][j][0]=image[i][j]
104 |
105 | data_in = []
106 | for img in input:
107 | img = self.transformer.transformer(img).flatten()
108 | data_in.append([img.tolist()])
109 | return data_in
110 |
111 |     def forward(self, input_data):  # note: shadowed by forward(data, output_layer) below, so this version is never used
112 | in_arg = self.converter(input_data)
113 | return self.network.forwardTest(in_arg)
114 |
115 | def forward(self, data, output_layer):
116 | #返回每种标签的概率,详见http://www.paddlepaddle.org/doc_cn/ui/predict/swig_py_paddle.html
117 | input = self.converter(data)
118 | self.network.forwardTest(input)
119 | output = self.network.getLayerOutputs(output_layer)
120 | return output[output_layer].mean(0)
121 |
122 | def predict(self, image=None, output_layer=None):
123 | assert isinstance(image, basestring)
124 | assert isinstance(output_layer, basestring)
125 | data = self.get_data(image)#读取图片
126 | prob = self.forward(data, output_layer)
127 | lab = np.argsort(-prob)#按降序排列,返回的是数组值的索引值
128 | logging.info("Label of %s is: %d", image, lab[0])
129 |
130 | if __name__ == '__main__':
131 | image_size=28#图像大小
132 | crop_size=28#图像大小
133 | multi_crop=0#多次裁剪
134 | config="vgg.py"#网络配置文件
135 | output_layer="__fc_layer_1__"
136 | mean_path="data/batches/batches.meta"
137 | model_path="vgg_model/pass-00000/"#模型路径
138 | image="test.bmp"#要识别的图片路径
139 | use_gpu=0#是否使用GPU
140 |
141 | obj = ImageClassifier(train_conf=config,
142 | model_dir=model_path,
143 | resize_dim=image_size,
144 | crop_dim=crop_size,
145 | mean_file=mean_path,
146 | use_gpu=use_gpu,
147 | oversample=multi_crop)
148 | obj.predict(image, output_layer)
149 |
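The `np.argsort(-prob)` trick used in `predict()` above returns label indices ordered from most to least probable; a tiny self-contained illustration:

```python
import numpy as np

prob = np.array([0.1, 0.7, 0.2])   # per-label probabilities
order = np.argsort(-prob)          # negate so the sort is descending
print order                        # -> [1 2 0]; order[0] is the predicted label
```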
--------------------------------------------------------------------------------
/PaddlePaddle/prediction_age.py:
--------------------------------------------------------------------------------
1 | # -*- coding: UTF-8 -*-
2 | from py_paddle import swig_paddle, DataProviderConverter
3 | from paddle.trainer.PyDataProvider2 import dense_vector
4 | from paddle.trainer.config_parser import parse_config
5 | import numpy as np
6 | import csv
7 | import os
8 |
9 | def predict(data):
10 | path=os.path.split(os.path.realpath(__file__))[0]
11 | conf = parse_config(path+"/trainer_config_age.py", "is_predict=1")
12 | print conf.data_config.load_data_args
13 | network = swig_paddle.GradientMachine.createFromConfigProto(conf.model_config)
14 | network.loadParameters(path+"/output_age/pass-00099")
15 | converter = DataProviderConverter([dense_vector(26)])
16 | inArg = converter(data)
17 | network.forwardTest(inArg)
18 | output = network.getLayerOutputs("__fc_layer_0__")
19 | #print output
20 | prob = output["__fc_layer_0__"][0]
21 | #print prob
22 | lab = np.argsort(-prob)
23 | #print lab
24 | return lab[0]
25 |
26 | if __name__ == '__main__':
27 | swig_paddle.initPaddle("--use_gpu=0")
28 | csvfile = file('predict.csv', 'rb')
29 | reader = csv.reader(csvfile)
30 | rows= [row for row in reader]
31 | row=rows[1]
32 | print '实际年龄:'+row[2]
33 | del row[0]
34 | del row[0]
35 | del row[0]
36 | data = [[[]]]
37 | for j in row:
38 | data[0][0].append(float(j))
39 | csvfile.close()
40 | print '预测年龄:'+str(predict(data))
41 |
--------------------------------------------------------------------------------
/PaddlePaddle/prediction_sex.py:
--------------------------------------------------------------------------------
1 | # -*- coding: UTF-8 -*-
2 | from py_paddle import swig_paddle, DataProviderConverter
3 | from paddle.trainer.PyDataProvider2 import dense_vector
4 | from paddle.trainer.config_parser import parse_config
5 | import numpy as np
6 | import csv
7 | import os
8 |
9 | def predict(data):
10 | path=os.path.split(os.path.realpath(__file__))[0]
11 | conf = parse_config(path+"/trainer_config_sex.py", "is_predict=1")
12 | print conf.data_config.load_data_args
13 | network = swig_paddle.GradientMachine.createFromConfigProto(conf.model_config)
14 | network.loadParameters(path+"/output_sex/pass-00029")
15 | converter = DataProviderConverter([dense_vector(26)])
16 | inArg = converter(data)
17 | network.forwardTest(inArg)
18 | output = network.getLayerOutputs("__fc_layer_0__")
19 | prob = output["__fc_layer_0__"][0]
20 | lab = np.argsort(-prob)
21 | return lab[0]
22 |
23 | if __name__ == '__main__':
24 | swig_paddle.initPaddle("--use_gpu=0")
25 | csvfile = file('predict.csv', 'rb')
26 | reader = csv.reader(csvfile)
27 | rows= [row for row in reader]
28 | #预测第一行
29 | row=rows[1]
30 | sex='女'
31 |     if(row[1]=='\xc4\xd0'):  # GBK-encoded '男' (male)
32 | sex='男'
33 | print '实际性别:'+sex
34 | del row[0]
35 | del row[0]
36 | del row[0]
37 | data = [[[]]]
38 | for j in row:
39 | data[0][0].append(float(j))
40 | csvfile.close()
41 | if(predict(data)==1):
42 | print '预测性别:男'
43 | else:
44 | print '预测性别:女'
45 |
--------------------------------------------------------------------------------
/PaddlePaddle/preprocess.py:
--------------------------------------------------------------------------------
1 | # -*- coding: UTF-8 -*-
2 | from paddle.utils.preprocess_img import ImageClassificationDatasetCreater
3 | from optparse import OptionParser
4 |
5 | #处理命令行参数
6 | def option_parser():
7 |     parser = OptionParser(usage="usage: python preprocess.py "\
8 | "-i data_dir [options]")
9 | parser.add_option("-i", "--input", action="store",
10 | dest="input", help="图片路径")
11 | parser.add_option("-s", "--size", action="store",
12 | dest="size", help="图片大小")
13 | parser.add_option("-c", "--color", action="store",
14 | dest="color", help="图片有没有颜色")
15 | return parser.parse_args()
16 |
17 | if __name__ == '__main__':
18 | options, args = option_parser()
19 | data_dir = options.input
20 | processed_image_size = int(options.size)
21 | color = options.color == "1"
22 | data_creator = ImageClassificationDatasetCreater(data_dir,
23 | processed_image_size,
24 | color)
25 | #每个训练文件包含的图片数
26 | data_creator.num_per_batch = 1000
27 | data_creator.overwrite = True
28 | data_creator.create_batches()
29 |
--------------------------------------------------------------------------------
/PaddlePaddle/preprocess.sh:
--------------------------------------------------------------------------------
1 |
2 | set -e
3 | data_dir=./data
4 | python preprocess.py -i $data_dir -s 28 -c 0
5 |
6 | #-i后为训练数据存放路径,-s后为图像大小,-c后为图像有没有颜色
7 |
--------------------------------------------------------------------------------
/PaddlePaddle/test.bmp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csxiaoyaojianxian/BloodTestReportOCR/95d058e4999806fa50bbcf6d10fe8a0af5746759/PaddlePaddle/test.bmp
--------------------------------------------------------------------------------
/PaddlePaddle/test.list:
--------------------------------------------------------------------------------
1 | predict.csv
2 |
--------------------------------------------------------------------------------
/PaddlePaddle/train.list:
--------------------------------------------------------------------------------
1 | train.csv
2 |
--------------------------------------------------------------------------------
/PaddlePaddle/train.sh:
--------------------------------------------------------------------------------
1 |
2 | set -e
3 | config=vgg.py
4 | output=./vgg_model
5 | log=train.log
6 |
7 | paddle train \
8 | --config=$config \
9 | --use_gpu=0 \
10 | --trainer_count=8 \
11 | --num_passes=10 \
12 | --save_dir=$output \
13 | 2>&1 | tee $log
14 |
15 | python -m paddle.utils.plotcurve -i $log > plot.png
16 |
17 | : <<'COMMENT'
18 | use_gpu: whether to train on the GPU
19 | trainer_count: number of training threads; with CPU set it to the number of CPU threads, with GPU set it to the number of GPUs
20 | num_passes: number of training passes; each pass saves a model directory
21 | output: where the models are saved
22 | COMMENT
23 |
24 |
--------------------------------------------------------------------------------
/PaddlePaddle/train_age.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | paddle train \
5 | --config=trainer_config_age.py \
6 | --save_dir=./output_age \
7 | --trainer_count=1 \
8 | --num_passes=100 \
9 | --use_gpu=1 \
10 | 2>&1 | tee 'train_age.log'
11 |
--------------------------------------------------------------------------------
/PaddlePaddle/train_sex.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | paddle train \
5 | --config=trainer_config_sex.py \
6 | --save_dir=./output_sex \
7 | --trainer_count=1 \
8 | --num_passes=30 \
9 | --use_gpu=1 \
10 | 2>&1 | tee 'train_sex.log'
11 |
--------------------------------------------------------------------------------
/PaddlePaddle/trainer_config_age.py:
--------------------------------------------------------------------------------
1 | # -*- coding: UTF-8 -*-
2 | from paddle.trainer_config_helpers import *
3 | import csv
4 |
5 | is_predict = get_config_arg('is_predict', bool, False)
6 | define_py_data_sources2(
7 | #训练文件列表
8 | train_list='train.list' if not is_predict else None,
9 | #测试文件列表
10 | test_list='test.list',
11 | #指明提供数据的函数
12 | module="dataprovider",
13 | obj='process_age' if not is_predict else 'process_predict_age')
14 |
15 | settings(
16 | #批尺寸
17 | batch_size=128 if not is_predict else 1,
18 | #学习速率
19 | learning_rate=2e-3,
20 | #学习方式
21 | learning_method=AdamOptimizer(),
22 | #权重衰减
23 | regularization=L2Regularization(8e-4))
24 | #输入数据大小
25 | data = data_layer(name="data", size=26)
26 | #直接全连接,指明输出数据大小,激活函数是Softmax
27 | output = fc_layer(name="__fc_layer_0__",input=data, size=100, act=SoftmaxActivation())
28 | if is_predict:
29 | #获得最大概率的标签
30 | maxid = maxid_layer(output)
31 | outputs([maxid, output])
32 | else:
33 | #标签大小
34 | label = data_layer(name="label", size=100)
35 | #计算误差
36 | cls = classification_cost(input=output, label=label)
37 | outputs(cls)
38 |
--------------------------------------------------------------------------------
/PaddlePaddle/trainer_config_sex.py:
--------------------------------------------------------------------------------
1 | # -*- coding: UTF-8 -*-
2 | from paddle.trainer_config_helpers import *
3 | import csv
4 |
5 | is_predict = get_config_arg('is_predict', bool, False)
6 | define_py_data_sources2(
7 | #训练文件列表
8 | train_list='train.list' if not is_predict else None,
9 | #测试文件列表
10 | test_list='test.list',
11 | #指明提供数据的函数
12 | module="dataprovider",
13 | obj='process_sex' if not is_predict else 'process_predict_sex')
14 |
15 | settings(
16 | #批尺寸
17 | batch_size=128 if not is_predict else 1,
18 | #学习速率
19 | learning_rate=2e-3,
20 | #学习方式
21 | learning_method=AdamOptimizer(),
22 | #权重衰减
23 | regularization=L2Regularization(8e-4))
24 | #输入数据大小
25 | data = data_layer(name="data", size=26)
26 | #直接全连接,指明输出数据大小,激活函数是Softmax
27 | output = fc_layer(name="__fc_layer_0__",input=data, size=2, act=SoftmaxActivation())
28 | if is_predict:
29 | #找出最大概率的标签
30 | maxid = maxid_layer(output)
31 | outputs([maxid, output])
32 | else:
33 | #标签大小
34 | label = data_layer(name="label", size=2)
35 | #计算误差
36 | cls = classification_cost(input=output, label=label)
37 | outputs(cls)
38 |
--------------------------------------------------------------------------------
/PaddlePaddle/vgg.py:
--------------------------------------------------------------------------------
1 | # -*- coding: UTF-8 -*-
2 | from paddle.trainer_config_helpers import *
3 |
4 | is_predict = get_config_arg("is_predict", bool, False)
5 |
6 | ####################Data Configuration ##################
7 | if not is_predict:
8 | data_dir='data/batches/'
9 | meta_path=data_dir+'batches.meta'
10 |
11 | '''
12 | mean_img_size,img_size图像大小
13 | num_classes分类类别数
14 | color图像有无颜色
15 | '''
16 | args = {'meta':meta_path,'mean_img_size': 28,
17 | 'img_size': 28,'num_classes': 10,
18 | 'use_jpeg': 1,'color': 0}
19 |
20 | #引用image_provider.py中的processData函数
21 | define_py_data_sources2(train_list=data_dir+"train.list",
22 | test_list=data_dir+'test.list',
23 | module='dataprovider',
24 | obj='processData',
25 | args=args)
26 |
27 | ######################Algorithm Configuration #############
28 | settings(
29 | #批尺寸,一次训练多少数据
30 | batch_size = 128,
31 | #学习速率
32 | learning_rate = 0.1 / 128.0,
33 | #学习方式,详见http://www.paddlepaddle.org/doc/ui/api/trainer_config_helpers/optimizers.html
34 | learning_method = MomentumOptimizer(0.9),
35 | #权重衰减,防过拟合
36 | regularization = L2Regularization(0.0005 * 128)
37 | )
38 |
39 | #######################Network Configuration #############
40 | #图片大小,通道数×长×宽
41 | data_size=1*28*28
42 | #分类数量
43 | label_size=10
44 | #关于layer,详见http://www.paddlepaddle.org/doc/ui/api/trainer_config_helpers/layers.html
45 | img = data_layer(name='image',
46 | size=data_size)
47 | #small_vgg在trainer_config_helpers.network预定义
48 | #关于网络详见http://www.paddlepaddle.org/doc/ui/api/trainer_config_helpers/networks.html
49 | predict = small_vgg(input_image=img,
50 | num_channels=1,#图像通道数,灰度图像为1
51 | num_classes=label_size)
52 |
53 | if not is_predict:
54 | lbl = data_layer(name="label", size=label_size)
55 | outputs(classification_cost(input=predict, label=lbl))
56 | else:
57 | #预测网络直接输出最后一层的结果而不是像训练时以cost layer作为输出
58 | outputs(predict)
59 |
60 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # OCR recognition, deep learning and analysis of routine blood test reports
2 |
3 | * Recognize the age, sex and each blood-test value from a photo of a routine blood test report
4 | * Image upload page: on submission the image is stored in MongoDB (yielding an OID) or saved to a configured directory (yielding a path)
5 | * Recognition produces a JSON record that is stored in MongoDB under an OID, [sample JSON](https://coding.net/u/mengning/p/np2016/git/blob/master/BloodTestReportOCR/bloodtestdata.json)
6 | * Automatic cropping of the target region; images rotated at various angles are handled, but skewed or perspective-distorted images are cropped poorly, [usage details](https://coding.net/u/mengning/p/np2016/git/blob/master/BloodTestReportOCR/README.md)
7 | * Preprocessing, e.g. contrast enhancement and sharpening
8 | * Recognition
9 |
10 | * Result page: the original image on top and a table of the recognized values below, for side-by-side checking
11 | * Learn from the blood-test values together with the corresponding age and sex
12 | * Predict age and sex from the blood-test values
13 |
14 | ## Links
15 |
16 | * [My blog](http://www.csxiaoyao.com/blog/2017/01/01/ustc-np2016%E8%AF%BE%E7%A8%8B%E5%AD%A6%E4%B9%A0%E6%80%BB%E7%BB%93/)
17 |
--------------------------------------------------------------------------------
/Spark/BloodTestReportDeepLearning/BTR_binary_classification.py:
--------------------------------------------------------------------------------
1 | # -*- coding: UTF-8 -*-
2 | #基于spark血常规检验报告深度学习
3 | #by Islotus
4 | #2016.12.15
5 |
6 | from __future__ import print_function
7 |
8 | import sys
9 | import math
10 |
11 | from pyspark.sql import SparkSession
12 | from pyspark.mllib.classification import LogisticRegressionWithLBFGS
13 | from pyspark.mllib.evaluation import BinaryClassificationMetrics
14 | from pyspark.mllib.regression import LabeledPoint
15 |
16 | from pyspark import SparkContext
17 | #from pyspark.mllib.classification import SVMWithSGD, SVMModel
18 | from pyspark.mllib.util import MLUtils
19 |
20 | if __name__ == "__main__":
21 |
22 | sc = SparkContext(appName="BloodTestReportPythonBinaryClassificationMerticsExample")
23 |
24 | #读取数据
25 | print('Begin Load Data File!')
26 | sexData = MLUtils.loadLabeledPoints(sc, "LabeledPointsdata_sex.txt")
27 | print('Data File has been Loaded!')
28 |
29 | accuracySex = []
30 |
31 | for i in range(0,100):
32 | #将数据随机分隔为9:1, 分别作为训练数据(training)和预测数据(test)
33 | sexTraining, sexTest = sexData.randomSplit([0.9, 0.1])
34 |
35 | #训练二分类模型
36 | sexModel = LogisticRegressionWithLBFGS.train(sexTraining)
37 |
38 | #对test数据进行预测,输出预测准确度
39 | sexPredictionAndLabels = sexTest.map(lambda lp: (float(sexModel.predict(lp.features)), lp.label))
40 | accuracySex.append(1.0 * sexPredictionAndLabels.filter(lambda (x, v): x == v).count() / sexTest.count())
41 |
42 | #AVG:平均数 MSE:均方差
43 | SexRDD = sc.parallelize(accuracySex)
44 | SexPAAVG = SexRDD.reduce(lambda x,y:x+y)/SexRDD.count()
45 | SexPAMSE = math.sqrt(SexRDD.map(lambda x:(x - SexPAAVG)*(x - SexPAAVG)).reduce(lambda x,y:x+y)/SexRDD.count())
46 |
47 | print('Sex Prediction Accuracy AVG:{}'.format(SexPAAVG))
48 | print('Sex Prediction Accuracy MSE:{}'.format(SexPAMSE))
49 |
50 | output = open('BinaryClassificationMetricsResult.txt', 'w')
51 | output.write('Sex Prediction Accuracy AVG is:' + str(SexPAAVG) + "\n")
52 | output.write('Sex Prediction Accuracy MSE is:' + str(SexPAMSE) + "\n")
53 | for i in accuracySex:
54 | output.write(str(i)+",")
55 | output.write("\n")
56 | output.close()
57 |
--------------------------------------------------------------------------------
/Spark/BloodTestReportDeepLearning/BTR_decision_tree.py:
--------------------------------------------------------------------------------
1 | # -*- coding: UTF-8 -*-
2 | #基于spark血常规检验报告深度学习
3 | #by Islotus
4 | #2016.12.15
5 |
6 | from __future__ import print_function
7 |
8 | import sys
9 | import math
10 | from pyspark import SparkContext
11 | from pyspark.mllib.tree import DecisionTree
12 | from pyspark.mllib.util import MLUtils
13 |
14 | if __name__ == "__main__":
15 |
16 | sc = SparkContext(appName="BloodTestReportPythonDecisionTreeExample")
17 |
18 | #读取数据
19 | print('Begin Load Data File!')
20 | sexData = MLUtils.loadLabeledPoints(sc, "LabeledPointsdata_sex.txt")
21 | ageData = MLUtils.loadLabeledPoints(sc, "LabeledPointsdata_age.txt")
22 |
23 | print('Data File has been Loaded!')
24 |
25 | accuracySex = []
26 | accuracyAge = []
27 | for i in range(0,100):
28 | #将数据随机分割为9:1, 分别作为训练数据(training)和预测数据(test)
29 | sexTraining, sexTest = sexData.randomSplit([0.9, 0.1])
30 | ageTraining, ageTest = ageData.randomSplit([0.9, 0.1])
31 |
32 | #训练决策树模型
33 | sexModel = DecisionTree.trainClassifier(sexTraining, numClasses=2, categoricalFeaturesInfo={},
34 | impurity='gini', maxDepth=5, maxBins=32)
35 | ageModel = DecisionTree.trainClassifier(ageTraining, numClasses=1000, categoricalFeaturesInfo={},
36 | impurity='gini', maxDepth=5, maxBins=32)
37 |
38 | #对test数据进行预测,输出预测准确度
39 | sexPredictionAndLabel = sexTest.map(lambda p: p.label).zip(sexModel.predict(sexTest.map(lambda x: x.features)))
40 | agePredictionAndLabel = ageTest.map(lambda p: p.label).zip(ageModel.predict(ageTest.map(lambda x: x.features)))
41 |
42 | accuracySex.append(1.0 * sexPredictionAndLabel.filter(lambda (x, v): x == v).count() / sexTest.count())
43 |         accuracyAge.append(1.0 * agePredictionAndLabel.filter(lambda (x, v): abs(x - v) <= 5).count() / ageTest.count())
44 |
45 | #AVG:平均数 MSE:均方差
46 | SexRDD = sc.parallelize(accuracySex)
47 | AgeRDD = sc.parallelize(accuracyAge)
48 | SexPAAVG = SexRDD.reduce(lambda x,y:x+y)/SexRDD.count()
49 | AgePAAVG = AgeRDD.reduce(lambda x,y:x+y)/AgeRDD.count()
50 | SexPAMSE = math.sqrt(SexRDD.map(lambda x:(x - SexPAAVG)*(x - SexPAAVG)).reduce(lambda x,y:x+y)/SexRDD.count())
51 | AgePAMSE = math.sqrt(AgeRDD.map(lambda x:(x - AgePAAVG)*(x - AgePAAVG)).reduce(lambda x,y:x+y)/AgeRDD.count())
52 | #print(sum(accuracySex) / len(accuracySex))
53 | #print(sum(accuracyAge) / len(accuracyAge))
54 |
55 | print('Sex Prediction Accuracy AVG:{}'.format(SexPAAVG))
56 | print('Sex Prediction Accuracy MSE:{}'.format(SexPAMSE))
57 | print('AGE Prediction Accuracy AVG:{}'.format(AgePAAVG))
58 | print('AGE Prediction Accuracy MSE:{}'.format(AgePAMSE))
59 |
60 | output = open('DecisionTreeResult.txt', 'w')
61 | output.write('Sex Prediction Accuracy AVG is:' + str(SexPAAVG) + "\n")
62 | output.write('Sex Prediction Accuracy MSE is:' + str(SexPAMSE) + "\n")
63 | for i in accuracySex:
64 | output.write(str(i)+",")
65 | output.write("\n")
66 | output.write('Age Prediction Accuracy AVG is:' + str(AgePAAVG) + "\n")
67 | output.write('Age Prediction Accuracy MSE is:' + str(AgePAMSE) + "\n")
68 | for i in accuracyAge:
69 | output.write(str(i) + ",")
70 | output.write("\n")
71 | output.close()
72 |
--------------------------------------------------------------------------------
/Spark/BloodTestReportDeepLearning/BTR_gradient_boosting.py:
--------------------------------------------------------------------------------
1 | # -*- coding: UTF-8 -*-
2 | #基于Spark血常规检验报告深度学习
3 | #by Islotus
4 | #2016.12.15
5 |
6 | from __future__ import print_function
7 |
8 | import sys
9 | import math
10 | from pyspark import SparkContext
11 |
12 | from pyspark.mllib.tree import GradientBoostedTrees
13 | from pyspark.mllib.util import MLUtils
14 |
15 | if __name__ == "__main__":
16 |
17 | sc = SparkContext(appName="BloodTestReportPythonGradientBoostedTreesClassificationExample")
18 |
19 | #读取数据
20 | print('Begin Load Data File!')
21 | sexData = MLUtils.loadLabeledPoints(sc, "LabeledPointsdata_sex.txt")
22 |
23 | print('Data File has been Loaded!')
24 |
25 | accuracySex = []
26 |
27 | for i in range(0,100):
28 | #将数据随机分割成9:1,分别作为训练数据(training)和预测数据(test)
29 | sexTraining, sexTest = sexData.randomSplit([0.9, 0.1])
30 |
31 | #训练梯度增强树模型
32 | sexModel = GradientBoostedTrees.trainClassifier(sexTraining, categoricalFeaturesInfo={}, numIterations=3)
33 |
34 | #对test数据进行预测,输出预测准确度
35 | sexPredictionAndLabel = sexTest.map(lambda lp: lp.label).zip(sexModel.predict(sexTest.map(lambda x: x.features)))
36 | accuracySex.append(1.0 * sexPredictionAndLabel.filter(lambda (x, v): x == v).count() / sexTest.count())
37 |
38 | #AVG:平均数 MSE:均方差
39 | SexRDD = sc.parallelize(accuracySex)
40 | SexPAAVG = SexRDD.reduce(lambda x,y:x+y)/SexRDD.count()
41 | SexPAMSE = math.sqrt(SexRDD.map(lambda x:(x - SexPAAVG)*(x - SexPAAVG)).reduce(lambda x,y:x+y)/SexRDD.count())
42 | #print(sum(accuracySex) / len(accuracySex))
43 | #print(sum(accuracyAge) / len(accuracyAge))
44 |
45 | print('Sex Prediction Accuracy AVG:{}'.format(SexPAAVG))
46 | print('Sex Prediction Accuracy MSE:{}'.format(SexPAMSE))
47 |
48 | output = open('GradientBoostedTreesClassificationResult.txt', 'w')
49 | output.write('Sex Prediction Accuracy AVG is:' + str(SexPAAVG) + "\n")
50 | output.write('Sex Prediction Accuracy MSE is:' + str(SexPAMSE) + "\n")
51 | for i in accuracySex:
52 | output.write(str(i)+",")
53 | output.write("\n")
54 |
55 | output.close()
56 |
--------------------------------------------------------------------------------
/Spark/BloodTestReportDeepLearning/BloodTestReportbyLR.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csxiaoyaojianxian/BloodTestReportOCR/95d058e4999806fa50bbcf6d10fe8a0af5746759/Spark/BloodTestReportDeepLearning/BloodTestReportbyLR.py
--------------------------------------------------------------------------------
/Spark/BloodTestReportDeepLearning/BloodTestReportbyNB.py:
--------------------------------------------------------------------------------
1 | # -*- coding: cp936 -*-
2 | #基于Spark的朴素贝叶斯血常规检验报告深度学习系统
3 | #2016.12.14
4 |
5 | from __future__ import print_function
6 |
7 | import sys
8 | import math
9 | from pyspark import SparkContext
10 | from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel
11 | from pyspark.mllib.util import MLUtils
12 |
13 |
14 | if __name__ == "__main__":
15 |
16 | sc = SparkContext(appName="BloodTestReportPythonNaiveBayesExample")
17 |
18 | # 读取数据.
19 | print('Begin Load Data File!')
20 | sexData = MLUtils.loadLabeledPoints(sc, "LabeledPointsdata_sex.txt")
21 | ageData = MLUtils.loadLabeledPoints(sc, "LabeledPointsdata_age.txt")
22 | #print(data.collect())
23 | print('Data File has been Loaded!')
24 | #for(d in data.take(3)):
25 | # print(d)
26 | accuracySex = []
27 | accuracyAge = []
28 | for i in range(0,100):
29 | # 将数据随机分割为9:1,分别作为训练数据(training)和预测数据(test).
30 | sexTraining, sexTest = sexData.randomSplit([0.9, 0.1])
31 | ageTraining, ageTest = ageData.randomSplit([0.9, 0.1])
32 |
33 | # 训练朴素贝叶斯模型.
34 | #print('Begin NaiveBayes tranning!')
35 | sexModel = NaiveBayes.train(sexTraining, 1.0)
36 | ageModel = NaiveBayes.train(ageTraining, 1.0)
37 | #print('Trainning over!')
38 | # 对test数据进行预测,输出预测准确度.
39 | sexPredictionAndLabel = sexTest.map(lambda p: (sexModel.predict(p.features), p.label))
40 | agePredictionAndLabel = ageTest.map(lambda p: (ageModel.predict(p.features), p.label))
41 | #print(predictionAndLabel.collect())
42 | accuracySex.append(1.0 * sexPredictionAndLabel.filter(lambda (x, v): x == v).count() / sexTest.count())
43 |         accuracyAge.append(1.0 * agePredictionAndLabel.filter(lambda (x, v): abs(x - v) <= 5).count() / ageTest.count())
44 | #AVG:平均数 MSE:均方差
45 | SexRDD = sc.parallelize(accuracySex)
46 | AgeRDD = sc.parallelize(accuracyAge)
47 | SexPAAVG = SexRDD.reduce(lambda x,y:x+y)/SexRDD.count()
48 | AgePAAVG = AgeRDD.reduce(lambda x,y:x+y)/AgeRDD.count()
49 | SexPAMSE = math.sqrt(SexRDD.map(lambda x:(x - SexPAAVG)*(x - SexPAAVG)).reduce(lambda x,y:x+y)/SexRDD.count())
50 | AgePAMSE = math.sqrt(AgeRDD.map(lambda x:(x - AgePAAVG)*(x - AgePAAVG)).reduce(lambda x,y:x+y)/AgeRDD.count())
51 | #print(sum(accuracySex) / len(accuracySex))
52 | #print(sum(accuracyAge) / len(accuracyAge))
53 |
54 | print('Sex Prediction Accuracy AVG:{}'.format(SexPAAVG))
55 | print('Sex Prediction Accuracy MSE:{}'.format(SexPAMSE))
56 | print('AGE Prediction Accuracy AVG:{}'.format(AgePAAVG))
57 | print('AGE Prediction Accuracy MSE:{}'.format(AgePAMSE))
58 |
59 | output = open('NaiveBayesResult.txt', 'w')
60 | output.write('Sex Prediction Accuracy AVG is:' + str(SexPAAVG) + "\n")
61 | output.write('Sex Prediction Accuracy MSE is:' + str(SexPAMSE) + "\n")
62 | for i in accuracySex:
63 | output.write(str(i)+",")
64 | output.write("\n")
65 | output.write('Age Prediction Accuracy AVG is:' + str(AgePAAVG) + "\n")
66 | output.write('Age Prediction Accuracy MSE is:' + str(AgePAMSE) + "\n")
67 | for i in accuracyAge:
68 | output.write(str(i) + ",")
69 | output.write("\n")
70 | output.close()
71 |
72 |
--------------------------------------------------------------------------------
/Spark/BloodTestReportDeepLearning/BloodTestReportbyRF.py:
--------------------------------------------------------------------------------
1 | # -*- coding: cp936 -*-
2 | #基于Spark的随机树血常规检验报告深度学习系统
3 | #2016.12.14
4 |
5 | from __future__ import print_function
6 |
7 | import sys
8 | import math
9 | from pyspark import SparkContext
10 | from pyspark.mllib.tree import RandomForest
11 | from pyspark.mllib.util import MLUtils
12 |
13 |
14 | if __name__ == "__main__":
15 |
16 | sc = SparkContext(appName="BloodTestReportPythonRandomForestExample")
17 |
18 | # 读取数据.
19 | print('Begin Load Data File!')
20 | sexData = MLUtils.loadLabeledPoints(sc, "LabeledPointsdata_sex.txt")
21 | ageData = MLUtils.loadLabeledPoints(sc, "LabeledPointsdata_age.txt")
22 | #print(data.collect())
23 | print('Data File has been Loaded!')
24 | #for(d in data.take(3)):
25 | # print(d)
26 | accuracySex = []
27 | accuracyAge = []
28 | for i in range(0,100):
29 | # 将数据随机分割为9:1,分别作为训练数据(training)和预测数据(test).
30 | sexTraining, sexTest = sexData.randomSplit([0.9, 0.1])
31 | ageTraining, ageTest = ageData.randomSplit([0.9, 0.1])
32 |
33 | # 训练随机树模型.
34 | #print('Begin RandomForest tranning!')
35 | sexModel = RandomForest.trainClassifier(sexTraining,numClasses=2, categoricalFeaturesInfo={},
36 | numTrees=3, featureSubsetStrategy="auto",
37 | impurity='gini', maxDepth=4, maxBins=32)
38 | ageModel = RandomForest.trainClassifier(ageTraining,numClasses=1000, categoricalFeaturesInfo={},
39 | numTrees=3, featureSubsetStrategy="auto",
40 | impurity='gini', maxDepth=4, maxBins=32)
41 | #print('Trainning over!')
42 | # 对test数据进行预测,输出预测准确度.
43 | sexPredictionAndLabel = sexTest.map(lambda p: p.label).zip(sexModel.predict(sexTest.map(lambda x: x.features)))
44 | agePredictionAndLabel = ageTest.map(lambda p: p.label).zip(ageModel.predict(ageTest.map(lambda x: x.features)))
45 | #print(predictionAndLabel.collect())
46 | accuracySex.append(1.0 * sexPredictionAndLabel.filter(lambda (x, v): x == v).count() / sexTest.count())
47 |         accuracyAge.append(1.0 * agePredictionAndLabel.filter(lambda (x, v): abs(x - v) <= 5).count() / ageTest.count())
48 | #AVG:平均数 MSE:均方差
49 | SexRDD = sc.parallelize(accuracySex)
50 | AgeRDD = sc.parallelize(accuracyAge)
51 | SexPAAVG = SexRDD.reduce(lambda x,y:x+y)/SexRDD.count()
52 | AgePAAVG = AgeRDD.reduce(lambda x,y:x+y)/AgeRDD.count()
53 | SexPAMSE = math.sqrt(SexRDD.map(lambda x:(x - SexPAAVG)*(x - SexPAAVG)).reduce(lambda x,y:x+y)/SexRDD.count())
54 | AgePAMSE = math.sqrt(AgeRDD.map(lambda x:(x - AgePAAVG)*(x - AgePAAVG)).reduce(lambda x,y:x+y)/AgeRDD.count())
55 | #print(sum(accuracySex) / len(accuracySex))
56 | #print(sum(accuracyAge) / len(accuracyAge))
57 |
58 | print('Sex Prediction Accuracy AVG:{}'.format(SexPAAVG))
59 | print('Sex Prediction Accuracy MSE:{}'.format(SexPAMSE))
60 | print('AGE Prediction Accuracy AVG:{}'.format(AgePAAVG))
61 | print('AGE Prediction Accuracy MSE:{}'.format(AgePAMSE))
62 |
63 | output = open('RandomForestResult.txt', 'w')
64 | output.write('Sex Prediction Accuracy AVG is:' + str(SexPAAVG) + "\n")
65 | output.write('Sex Prediction Accuracy MSE is:' + str(SexPAMSE) + "\n")
66 | for i in accuracySex:
67 | output.write(str(i)+",")
68 | output.write("\n")
69 | output.write('Age Prediction Accuracy AVG is:' + str(AgePAAVG) + "\n")
70 | output.write('Age Prediction Accuracy MSE is:' + str(AgePAMSE) + "\n")
71 | for i in accuracyAge:
72 | output.write(str(i) + ",")
73 | output.write("\n")
74 | output.close()
75 |
76 |
--------------------------------------------------------------------------------
/Spark/BloodTestReportDeepLearning/BloodTestReportbySVM.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csxiaoyaojianxian/BloodTestReportOCR/95d058e4999806fa50bbcf6d10fe8a0af5746759/Spark/BloodTestReportDeepLearning/BloodTestReportbySVM.py
--------------------------------------------------------------------------------
/Spark/BloodTestReportDeepLearning/README.md:
--------------------------------------------------------------------------------
1 | # Deep learning on routine blood test reports with Spark
2 | ## Building the training/test data
3 | The raw data is in data_set.csv. Run
4 | ```
5 | python ./dataformat.py
6 | ```
7 |
8 | This generates the labeled point data used by Spark, saved in LabeledPointsdata_age.txt and LabeledPointsdata_sex.txt respectively.
9 |
10 | ## Running
11 |
12 | All the examples automatically split each of the two data sets 9:1 at random, 9 parts for training and 1 part for testing. This is repeated 100 times, and the average accuracy and spread of the age and sex predictions are then computed, printed to the screen and saved to the corresponding <algorithm name>Result.txt file (see the sketch below for what AVG and "MSE" mean here).
13 |
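For reference, the AVG and "MSE" figures reported below are the mean and the standard deviation of the 100 per-run accuracies (the scripts label the standard deviation "MSE"); a plain-Python sketch with made-up numbers:

```python
import math

accuracies = [0.70, 0.72, 0.68, 0.71]   # illustrative values only
avg = sum(accuracies) / len(accuracies)
std = math.sqrt(sum((a - avg) ** 2 for a in accuracies) / len(accuracies))
print 'Prediction Accuracy AVG:', avg
print 'Prediction Accuracy MSE:', std   # computed as a standard deviation
```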
14 | ### Naive Bayes (supports multi-class)
15 | ```
16 | python ./BloodTestReportbyNB.py
17 | ```
18 |
19 | Results:
20 | ```
21 | Sex Prediction Accuracy AVG is:0.621970740283
22 | Sex Prediction Accuracy MSE is:0.0339853457575
23 | Age Prediction Accuracy AVG is:0.539635804425
24 | Age Prediction Accuracy MSE is:0.039652048965
25 | ```
26 | ### Linear SVM (binary classification only)
27 | ```
28 | python ./BloodTestReportbySVM.py
29 | ```
30 |
31 | Results (iterations=100):
32 | ```
33 | Sex Prediction Accuracy AVG is:0.528946440893
34 | Sex Prediction Accuracy MSE is:0.0499342692342
35 | ```
36 |
37 | ### Logistic regression (binary classification only)
38 |
39 | ```
40 | python ./BloodTestReportbyLR.py
41 | ```
42 |
43 | Results (iterations=100):
44 | ```
45 | Sex Prediction Accuracy AVG is:0.717975697167
46 | Sex Prediction Accuracy MSE is:0.0303414723843
47 | ```
48 |
49 | ### Random forest (supports multi-class)
50 | ```
51 | python ./BloodTestReportbyRF.py
52 | ```
53 |
54 | Results (trees=3, max depth=4, maxBins=32, impurity: Gini, sex classes=2, age classes=1000; this value is tied to the impurity computation, there are actually only 92 distinct age labels, and the details of the algorithm are not fully worked out yet):
55 | ```
56 | Sex Prediction Accuracy AVG is:0.71622711581
57 | Sex Prediction Accuracy MSE is:0.0255871783772
58 | Age Prediction Accuracy AVG is:0.561977173151
59 | Age Prediction Accuracy MSE is:0.0622593472121
60 | ```
61 |
62 | ### Gradient-boosted trees (binary classification only)
63 |
64 | ```
65 | python ./BTR_gradient_boosting.py
66 | ```
67 |
68 | Results (iterations=100):
69 | ```
70 | Sex Prediction Accuracy AVG is:0.728212518228
71 | Sex Prediction Accuracy MSE is:0.0305777571064
72 | ```
73 |
74 | ### Binary classification (LBFGS logistic regression; binary only)
75 |
76 | ```
77 | python ./BTR_binary_classification.py
78 | ```
79 |
80 | Results (iterations=100):
81 | ```
82 | Sex Prediction Accuracy AVG is:0.718756411999
83 | Sex Prediction Accuracy MSE is:0.0311279215968
84 | ```
85 |
86 | ### Decision tree (supports multi-class)
87 | ```
88 | python ./BTR_decision_tree.py
89 | ```
90 |
91 | Results:
92 | ```
93 | Sex Prediction Accuracy AVG is:0.707608775434
94 | Sex Prediction Accuracy MSE is:0.0292234440441
95 | Age Prediction Accuracy AVG is:0.552560046229
96 | Age Prediction Accuracy MSE is:0.05098502703
97 | ```
--------------------------------------------------------------------------------
/Spark/BloodTestReportDeepLearning/data_set.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csxiaoyaojianxian/BloodTestReportOCR/95d058e4999806fa50bbcf6d10fe8a0af5746759/Spark/BloodTestReportDeepLearning/data_set.csv
--------------------------------------------------------------------------------
/Spark/BloodTestReportDeepLearning/dataformat.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csxiaoyaojianxian/BloodTestReportOCR/95d058e4999806fa50bbcf6d10fe8a0af5746759/Spark/BloodTestReportDeepLearning/dataformat.py
--------------------------------------------------------------------------------
/Spark/BloodTestReportDeepLearning/spark单机安装15122016.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csxiaoyaojianxian/BloodTestReportOCR/95d058e4999806fa50bbcf6d10fe8a0af5746759/Spark/BloodTestReportDeepLearning/spark单机安装15122016.md
--------------------------------------------------------------------------------
/Spark/DigitRecogn_Spark/Readme.md:
--------------------------------------------------------------------------------
1 |
2 | # Handwritten digit recognition (OCR) demo on Spark
3 |
4 | ## Building the training/test data
5 | ### Download the data set
6 | ```
7 |
8 | wget http://labfile.oss.aliyuncs.com/courses/593/data.csv
9 | ```
10 | This is the data set used to train the back-propagation neural network in https://www.shiyanlou.com/courses/593/labs/1966/document
11 | ### Formatting the data set
12 | The two training-data formats commonly used for Spark machine learning are labeled points and LibSVM; here we use labeled points as the training-data format.
13 |
14 |
15 | A labeled point is a local vector, either dense or sparse, associated with a label/response. In Spark, labeled points are used by the supervised learning algorithms. The label is stored as a double, so labeled points can be used for both regression and classification (see the snippet below).
16 | For binary classification a label is either 0 (negative) or 1 (positive). For multi-class classification, labels are class indices starting from 0: 0, 1, 2, ...
17 |
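A minimal PySpark illustration of constructing one labeled point (the label and the four binary features are made up):

```python
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.linalg import Vectors

# label 1.0 paired with a dense binary feature vector
lp = LabeledPoint(1.0, Vectors.dense([0.0, 1.0, 1.0, 0.0]))
print lp.label, lp.features
```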
18 | This demo uses Naive Bayes as the training/prediction model, so the feature values must be non-negative.
19 |
20 | While running, the program first reads and formats the data in ./data.csv, then formats it together with the training samples sent from the web front end into labeled points.
21 | The newly generated labeled point data is saved in LabeledPointsdata.txt.
22 |
23 | When a prediction is needed, the data in LabeledPointsdata.txt is first loaded into a Spark RDD and the model is trained from it.
24 |
25 | ## Running
26 |
27 |
28 | ### Start the static file server
29 | ```
30 | python -m SimpleHTTPServer 3000
31 | ```
32 |
33 | ### Start the back-end server
34 | ```
35 | python server.py
36 |
37 | ```
38 | ### Access
39 | ```
40 | localhost:3000
41 | ```
--------------------------------------------------------------------------------
/Spark/DigitRecogn_Spark/index.html:
--------------------------------------------------------------------------------
(The HTML markup of this page did not survive the text dump; only the page title "OCR Demo" is recoverable. The page presumably hosts the drawing canvas and the train/test controls that ocr.js drives.)
--------------------------------------------------------------------------------
/Spark/DigitRecogn_Spark/ocr.js:
--------------------------------------------------------------------------------
1 | var ocrDemo = {
2 | CANVAS_WIDTH: 200,
3 | TRANSLATED_WIDTH: 20,
4 | PIXEL_WIDTH: 10, // TRANSLATED_WIDTH = CANVAS_WIDTH / PIXEL_WIDTH
5 | BATCH_SIZE: 1,
6 |
7 | // 服务器端参数
8 | PORT: "9000",
9 | HOST: "http://localhost",
10 |
11 | // 颜色变量
12 | BLACK: "#000000",
13 | BLUE: "#0000ff",
14 |
15 | // 客户端训练数据集
16 | trainArray: [],
17 | trainingRequestCount: 0,
18 |
19 | onLoadFunction: function() {
20 | this.resetCanvas();
21 | },
22 |
23 | resetCanvas: function() {
24 | var canvas = document.getElementById('canvas');
25 | var ctx = canvas.getContext('2d');
26 |
27 | this.data = [];
28 | ctx.fillStyle = this.BLACK;
29 | ctx.fillRect(0, 0, this.CANVAS_WIDTH, this.CANVAS_WIDTH);
30 | var matrixSize = 400;
31 | while (matrixSize--) this.data.push(0);
32 | this.drawGrid(ctx);
33 |
34 | // 绑定事件操作
35 | canvas.onmousemove = function(e) { this.onMouseMove(e, ctx, canvas) }.bind(this);
36 | canvas.onmousedown = function(e) { this.onMouseDown(e, ctx, canvas) }.bind(this);
37 | canvas.onmouseup = function(e) { this.onMouseUp(e, ctx) }.bind(this);
38 | },
39 |
40 | drawGrid: function(ctx) {
41 | for (var x = this.PIXEL_WIDTH, y = this.PIXEL_WIDTH; x < this.CANVAS_WIDTH; x += this.PIXEL_WIDTH, y += this.PIXEL_WIDTH) {
42 | ctx.strokeStyle = this.BLUE;
43 | ctx.beginPath();
44 | ctx.moveTo(x, 0);
45 | ctx.lineTo(x, this.CANVAS_WIDTH);
46 | ctx.stroke();
47 |
48 | ctx.beginPath();
49 | ctx.moveTo(0, y);
50 | ctx.lineTo(this.CANVAS_WIDTH, y);
51 | ctx.stroke();
52 | }
53 | },
54 |
55 | onMouseMove: function(e, ctx, canvas) {
56 | if (!canvas.isDrawing) {
57 | return;
58 | }
59 | this.fillSquare(ctx, e.clientX - canvas.offsetLeft, e.clientY - canvas.offsetTop);
60 | },
61 |
62 | onMouseDown: function(e, ctx, canvas) {
63 | canvas.isDrawing = true;
64 | this.fillSquare(ctx, e.clientX - canvas.offsetLeft, e.clientY - canvas.offsetTop);
65 | },
66 |
67 | onMouseUp: function(e) {
68 | canvas.isDrawing = false;
69 | },
70 |
71 | fillSquare: function(ctx, x, y) {
72 | var xPixel = Math.floor(x / this.PIXEL_WIDTH);
73 | var yPixel = Math.floor(y / this.PIXEL_WIDTH);
74 | // 存储手写输入数据
75 | this.data[((xPixel - 1) * this.TRANSLATED_WIDTH + yPixel) - 1] = 1;
76 |
77 | ctx.fillStyle = '#ffffff';
78 | ctx.fillRect(xPixel * this.PIXEL_WIDTH, yPixel * this.PIXEL_WIDTH, this.PIXEL_WIDTH, this.PIXEL_WIDTH);
79 | },
80 |
81 | train: function() {
82 | var digitVal = document.getElementById("digit").value;
83 | if (!digitVal || this.data.indexOf(1) < 0) {
84 | alert("Please type and draw a digit value in order to train the network");
85 | return;
86 | }
87 | // 将数据加入客户端训练数据集
88 | this.trainArray.push({"y0": this.data, "label": parseInt(digitVal)});
89 | this.trainingRequestCount++;
90 |
91 | // 将客服端训练数据集发送给服务器端
92 | if (this.trainingRequestCount == this.BATCH_SIZE) {
93 | alert("Sending training data to server...");
94 | var json = {
95 | trainArray: this.trainArray,
96 | train: true
97 | };
98 |
99 | this.sendData(json);
100 | this.trainingRequestCount = 0;
101 | this.trainArray = [];
102 | }
103 | },
104 |
105 | // 发送预测请求
106 | test: function() {
107 | if (this.data.indexOf(1) < 0) {
108 | alert("Please draw a digit in order to test the network");
109 | return;
110 | }
111 | var json = {
112 | image: this.data,
113 | predict: true
114 | };
115 | this.sendData(json);
116 | },
117 |
118 | // 处理服务器响应
119 | receiveResponse: function(xmlHttp) {
120 | if (xmlHttp.status != 200) {
121 | alert("Server returned status " + xmlHttp.status);
122 | return;
123 | }
124 | var responseJSON = JSON.parse(xmlHttp.responseText);
125 | if (xmlHttp.responseText && responseJSON.type == "test") {
126 | alert("The neural network predicts you wrote a \'" + responseJSON.result + '\'');
127 | }
128 | },
129 |
130 | onError: function(e) {
131 | alert("Error occurred while connecting to server: " + e.target.statusText);
132 | },
133 |
134 | sendData: function(json) {
135 | var xmlHttp = new XMLHttpRequest();
136 | xmlHttp.open('POST', this.HOST + ":" + this.PORT, false);
137 | xmlHttp.onload = function() { this.receiveResponse(xmlHttp); }.bind(this);
138 | xmlHttp.onerror = function() { this.onError(xmlHttp) }.bind(this);
139 | var msg = JSON.stringify(json);
140 | xmlHttp.setRequestHeader('Content-length', msg.length);
141 | xmlHttp.setRequestHeader("Connection", "close");
142 | xmlHttp.send(msg);
143 | }
144 | }
145 |
146 |
--------------------------------------------------------------------------------
/Spark/DigitRecogn_Spark/server.py:
--------------------------------------------------------------------------------
1 | # -*- coding: UTF-8 -*-
2 | from __future__ import print_function
3 | import BaseHTTPServer
4 | import json
5 | import csv
6 | import shutil
7 | from pyspark import SparkContext
8 | from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel
9 | from pyspark.mllib.util import MLUtils
10 |
11 | #服务器端配置
12 | HOST_NAME = 'localhost'
13 | PORT_NUMBER = 9000
14 | # read data.csv once and rebuild the LabeledPoints training file from scratch
15 | reader = csv.reader(file('./data.csv', 'rb'))
16 | output = open('LabeledPointsdata.txt', 'w')
17 |
18 | n = 0
19 |
20 | sc = SparkContext(appName="PythonNaiveBayesExample")
21 |
22 | for line in reader:
23 | outputline ='%d' % int(n/500) + "," #每500行为一个数字的训练集
24 | n = n + 1
25 | for c in line:
26 | if "0.0000000000"==c:
27 | outputline += '0 '
28 | else:
29 | outputline += '1 '
30 | outputline += '\n'
31 | output.write(outputline)
32 | output.close()
33 | print('Format Successful!')
34 |
35 | class JSONHandler(BaseHTTPServer.BaseHTTPRequestHandler):
36 |
37 | """处理接收到的POST请求"""
38 | def do_POST(self):
39 | response_code = 200
40 | response = ""
41 | var_len = int(self.headers.get('Content-Length'))
42 | content = self.rfile.read(var_len);
43 | payload = json.loads(content);
44 |
45 | # 如果是训练请求,训练然后保存训练完的神经网络
46 | if payload.get('train'):
47 | # 转化数据格式
48 | TrainData = ""
49 | for d in payload['trainArray'][0]['y0']:
50 | TrainData = TrainData + " " + ('%d' % d)
51 | TrainData = '%d' % (payload['trainArray'][0]['label']) + "," + TrainData.lstrip() +"\n"
52 | print(TrainData)
53 | Addoutput = open('LabeledPointsdata.txt', 'a')
54 | Addoutput.write(TrainData)
55 | Addoutput.close()
56 |
57 |
58 | # 如果是预测请求,返回预测值
59 | elif payload.get('predict'):
60 | try:
61 | training = MLUtils.loadLabeledPoints(sc, "LabeledPointsdata.txt")
62 | print('Begin NaiveBayes tranning!')
63 | model = NaiveBayes.train(training, 1.0)
64 | print('Trainning over!')
65 | print(payload['image'])
66 | response = {"type":"test", "result":str(model.predict(payload['image']))}
67 | except:
68 | response_code = 500
69 | else:
70 | response_code = 400
71 |
72 | self.send_response(response_code)
73 | self.send_header("Content-type", "application/json")
74 | self.send_header("Access-Control-Allow-Origin", "*")
75 | self.end_headers()
76 | if response:
77 | self.wfile.write(json.dumps(response))
78 | return
79 |
80 | if __name__ == '__main__':
81 | server_class = BaseHTTPServer.HTTPServer;
82 | httpd = server_class((HOST_NAME, PORT_NUMBER), JSONHandler)
83 |
84 |     try:
85 |         # start the server; print first, since serve_forever() blocks
86 |         print("Server started.")
87 |         httpd.serve_forever()
88 |     except KeyboardInterrupt:
89 |         pass
90 |     except Exception as e:
91 |         print("Unexpected server exception occurred:", e)
92 |     finally:
93 |         httpd.server_close()
94 |
95 |
--------------------------------------------------------------------------------
/Spark/README.md:
--------------------------------------------------------------------------------
1 | # 血常规检验报告深度学习系统 on Spark
2 |
3 | Spark是UC Berkeley AMP lab (加州大学伯克利分校的AMP实验室)所开源的类Hadoop MapReduce的通用并行框架,Spark,拥有Hadoop MapReduce所具有的优点;但不同于MapReduce的是Job中间输出结果可以保存在内存中,从而不再需要读写HDFS,因此Spark能更好地适用于数据挖掘与机器学习等需要迭代的MapReduce的算法。
4 |
5 | 该Demo主要演示Spark的深度学习功能,数据由Spark直接读取,尚未使用Hadoop等数据库。
6 |
7 | ##运行环境
8 | ###安装JDK
9 | ```
10 | java -version
11 | ```
12 | 如果未安装,请下载最新JDK并设置相应的JAVA_HOME、JRE_HOME、CLASSPATH、PATH变量
13 |
14 | ###安装Scala并添加Scala_HOME,更新PATH
15 |
16 | ```
17 | sudo apt-get install scala
18 | ```
19 |
20 | 下载Spark并解压
21 |
22 | 官网下载地址:http://spark.apache.org/downloads.html
23 | ###配置Spark环境
24 | ```
25 | cp ./conf/spark-env.sh.template ./conf/spark-env.sh
26 | ```
27 | ###安装Python依赖包
28 | ```
29 | sudo apt-get install python-numpy
30 | ```
31 | ###设置Python依赖路径
32 | ```
33 | sudo vim /etc/profile
34 | ```
35 | 在结尾处添加
36 | ```
37 | export SPARK_HOME=/home/hadoop/spark #你的Spark解压目录
38 |
39 | export PYTHONPATH=$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.1-src.zip:$PYTHONPATH #py4j及pysqrk的相关依赖路径,py4j-0.10.1-src文件名可能会因Spark版本不同而不同,请设置为自己对应目录下的文件名
40 | ```
41 | ###启动SPARK
42 | ```
43 | sudo ./sbin/start-all.sh
44 | ```
45 | 在root下输入jps应该可以看到Master和Worker两个进程
46 |
47 | 也可以登陆
48 | ```
49 | http://127.0.0.1:8080/
50 | ```
51 | 查看Spark状态
52 |
53 | ##安装MongoDB Connector for Hadoop
54 |
55 | MongoDB Connector for Hadoop是一个类库,他允许包括Spark、Pig、Hive、Mapreduce等在内的多种Hadoop架构中的组件使用MongoDB作为数据源。
56 | ###第三方软件安装
57 | 使用Maven安装:
58 | ```
59 | <dependency>
60 |     <groupId>org.mongodb.mongo-hadoop</groupId>
61 |     <artifactId>mongo-hadoop-core</artifactId>
62 |     <version>1.5.1</version>
63 | </dependency>
64 | ```
65 | 或使用Gradle安装:
66 | ```
67 | compile 'org.mongodb.mongo-hadoop:mongo-hadoop-core:1.5.1'
68 | ```
69 | ###独立安装
70 | 克隆源代码:
71 | ```
72 | git clone https://github.com/mongodb/mongo-hadoop.git
73 | ```
74 |
75 | 源代码克隆后需要编译,编译过程需连接外网进行下载,国内下载速度较慢,建议使用VPN
76 | ```
77 | ./gradlew jar
78 | ```
79 | 编译后的文件会放在core/build/libs目录下。若安装了Hadoop,则将三个文件分别拷贝至以下目录
80 |
81 | -$HADOOP_PREFIX/lib/
82 | -$HADOOP_PREFIX/share/hadoop/mapreduce/
83 | -$HADOOP_PREFIX/share/hadoop/lib/
84 | 若是Spark独立部署,则将其拷贝至本机pyspark目录下即可。
85 |
86 |
--------------------------------------------------------------------------------
/TensorFlow/LSTM.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 | import tensorflow as tf
3 | from tensorflow.python.ops import rnn, rnn_cell
4 | import numpy as np
5 |
6 | learning_rate = 0.002
7 | training_iters = 1858
8 | Test_iters = 200  # number of test samples
9 | display_step = 10
10 |
11 |
12 |
13 | n_input = 13
14 | n_steps = 2
15 | n_hidden = 64
16 | n_classes = 2
17 |
18 | def one_hot(a, length):
19 | b = np.zeros([length, 2])
20 | for i in range(length):
21 | if a[i] == 0:
22 | b[i][1] = 1
23 | else:
24 | b[i][0] = 1
25 | return b
26 |
27 |
28 | train_data = np.loadtxt(open("./train.csv","rb"),delimiter=",",skiprows=0)
29 | test_data = np.loadtxt(open("./predict.csv","rb"),delimiter=",",skiprows=0)
30 | # select the label column and the feature columns
31 | train_label_sex = train_data[:, 1:2]
32 |
33 | train_label_sex = one_hot(train_label_sex,train_data.shape[0])
34 |
35 |
36 | train_data = train_data[:, 3:]
37 |
38 | train_data = np.reshape(train_data, (1858,n_steps,n_input))
39 |
40 |
41 | test_label_sex = test_data[:, 1:2]
42 | test_label_sex = one_hot(test_label_sex,test_data.shape[0])
43 | test_data = test_data[:, 3:]
44 | test_data = np.reshape(test_data, (200,n_steps,n_input))
45 |
46 |
47 |
48 | x = tf.placeholder("float", [None, n_steps, n_input])
49 |
50 | # Tensorflow LSTM cell requires 2x n_hidden length (state & cell)
51 | istate = tf.placeholder("float", [None, 2 * n_hidden])
52 | y = tf.placeholder("float", [None, n_classes])
53 |
54 | weights = {
55 | 'hidden': tf.Variable(tf.random_normal([n_input, n_hidden])),
56 | 'out': tf.Variable(tf.random_normal([n_hidden, n_classes]))
57 | }
58 | biases = {
59 | 'hidden': tf.Variable(tf.random_normal([n_hidden])),
60 | 'out': tf.Variable(tf.random_normal([n_classes]))
61 | }
62 |
63 |
64 | def RNN(_x, _istate, _weights, _biases):
65 |
66 |
67 | # Permuting n_steps
68 |
69 | _x = tf.transpose(_x, [1, 0, 2])
70 | # Reshaping to (n_steps*batch_size, n_input)
71 | _x = tf.reshape(_x, [-1, n_input])
72 | # Split to get a list of 'n_steps' tensors of shape (batch_size, n_input)
73 |
74 |
75 | _x = tf.matmul(_x, _weights['hidden']) + _biases['hidden']
76 |
77 | lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(n_hidden, forget_bias=1.0,state_is_tuple=False)
78 | _x = tf.split(0, n_steps, _x)
79 | # Get lstm cell output
80 | outputs, states = tf.nn.rnn(lstm_cell, _x, dtype=tf.float32, initial_state=_istate)
81 |
82 | # Linear activation, using rnn inner loop last output
83 | return tf.matmul(outputs[-1], _weights['out']) + _biases['out']
84 |
85 | pred = RNN(x, istate, weights, biases)
86 |
87 | cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(pred, y))
88 |
89 | optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)
90 |
91 |
92 | correct_pred = tf.equal(tf.argmax(pred,1), tf.argmax(y,1))
93 | accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
94 |
95 | init = tf.global_variables_initializer()
96 |
97 |
98 | with tf.Session() as sess:
99 | sess.run(init)
100 | step = 1
101 |
102 | while step < 300:
103 |
104 | sess.run(optimizer, feed_dict={x: train_data, y: train_label_sex, istate: np.zeros((training_iters, 2 * n_hidden))})
105 | if step % display_step == 0:
106 | # Calculate batch accuracy
107 | acc = sess.run(accuracy, feed_dict={x: train_data, y: train_label_sex, istate: np.zeros((training_iters, 2 * n_hidden))})
108 | # Calculate batch loss
109 | loss = sess.run(cost, feed_dict={x: train_data, y: train_label_sex,istate: np.zeros((training_iters, 2 * n_hidden))})
110 | print("Iter " + str(step) + ", Loss= " + \
111 | "{:.6f}".format(loss) + ", Training Accuracy= " + \
112 | "{:.5f}".format(acc))
113 | step += 1
114 | print("Optimization Finished!")
115 |
116 |
117 | print("Testing Accuracy:", \
118 | sess.run(accuracy, feed_dict={x: test_data, y: test_label_sex,
119 |                              istate: np.zeros((Test_iters, 2 * n_hidden))}))
120 |
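How the 26 blood-test values reach the LSTM above: each record is reshaped into n_steps=2 time steps of n_input=13 features. A small numpy illustration with dummy data:

```python
import numpy as np

record = np.arange(26, dtype=np.float32)   # one sample's 26 feature values (dummy)
steps = record.reshape(2, 13)              # (n_steps, n_input), as consumed by RNN() above
print steps.shape                          # -> (2, 13)
```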
--------------------------------------------------------------------------------
/TensorFlow/README.md:
--------------------------------------------------------------------------------
1 | # Age/sex prediction from routine blood test data with TensorFlow
2 | - A simple two-hidden-layer network: 26 -> 238 -> 512 -> 100 (age) / 2 (sex)
3 | - Learning rate 0.01 / 0.1
4 | - Training data: the A2 csv blood report files
5 | - The output layer uses softmax as the classifier; the loss is cross entropy (see the sketch after this list)
6 | - Batch size 17
7 | - The goal of this code is to provide a preprocessing example for TensorFlow's standard data format
8 |
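A stripped-down, single-layer sketch of the softmax output plus cross-entropy loss described above (the networks in this folder add the hidden layers; the 0.1 learning rate and zero initialisation here are only illustrative):

```python
import tensorflow as tf

x = tf.placeholder(tf.float32, [None, 26])   # 26 blood-test features
y_ = tf.placeholder(tf.float32, [None, 2])   # one-hot sex label
W = tf.Variable(tf.zeros([26, 2]))
b = tf.Variable(tf.zeros([2]))
y = tf.nn.softmax(tf.matmul(x, W) + b)       # softmax classifier on the output layer

# cross-entropy loss averaged over the batch
cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y), reduction_indices=[1]))
train_step = tf.train.GradientDescentOptimizer(0.1).minimize(cross_entropy)
```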
9 | ### Environment
10 | OS: the Ubuntu family. If you have an NVIDIA card with CUDA support, install the GPU build and run the training session on the GPU.
11 |
12 | # install numpy
13 | sudo apt-get install python-numpy
14 |
15 | # install PIL
16 | sudo apt-get install python-imaging
17 |
18 | # install TensorFlow
19 | pip install --upgrade https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.12.0rc0-cp27-none-linux_x86_64.whl
20 |
21 |
22 | ### 运行
23 | mkdir ckpt_age
24 | mkdir ckpt_sex
25 | python age_predict.py # 网络结构未优化,准确率40%上下
26 | python sex_predict.py # 同样由于网络结构问题,损失函数不收敛
27 | ### 解释
28 | 1. age_predict.py
29 | 训练网络,并预测一条记录(预测样本放在代码最后)
30 | 2. tfrecords后缀文件和ckpt文件夹下内容
31 | 第一次运行会根据数据集产生tfrecord文件,文件feed以及分batch均需要构建为这个标准数据格式,如需要扩充变化数据集请删除tfrecords内容(如要变化数据格式请重新)
32 | 第一次运行会在ckpt下状态保存点,如果需要调参再训练,请删除ckpt文件夹下内容;
33 |
34 | ### 注意
35 | 如果不是用的最新版tensorflow,请去旧版文档查询并更改Saver()和Initializer()函数,0.11及以下版本使用的API名称是不同的
36 |
37 | ### agepredictv2.0.py注释
38 | 定义了添加层函数。通过升维,使不同年龄段输出节点不同,调参找到比较好结果,设置2隐藏层。隐藏层节点数约为输入75%。使年龄预测率提高到24%左右。
39 | 其典型的bp神经网络模型流要更具有普适性。可直接利用该文件夹俩csv文件运行。 ——SA312
40 |
41 | ### TensorBoard可视化
42 | 程序运行完毕之后, 会产生logs目录 , 使用命令 tensorboard --logdir='logs/',然后打开浏览器查看
43 |
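44 | ### Inspecting the generated tfrecords (sketch)
45 |
46 | A minimal sketch, not part of the original scripts, for checking that a generated file such as train_age.tfrecords carries the expected label/content features; it assumes the feature names used in age_predict.py and only prints the first few records.
47 |
48 | ```
49 | # -*- coding: utf-8 -*-
50 | import tensorflow as tf
51 |
52 | # iterate over the serialized records and decode them back into tf.train.Example
53 | for i, record in enumerate(tf.python_io.tf_record_iterator('train_age.tfrecords')):
54 |     example = tf.train.Example.FromString(record)
55 |     label = example.features.feature['label'].int64_list.value
56 |     content = example.features.feature['content'].float_list.value
57 |     print i, list(label), len(content)  # expect one integer label and 26 float features
58 |     if i >= 4:
59 |         break
60 | ```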
--------------------------------------------------------------------------------
/TensorFlow/age_predict.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import os
3 | import tensorflow as tf
4 | import numpy as np
5 | import csv
6 | import random
7 |
8 | # id,sex,age,WBC,RBC,HGB,HCT,MCV,MCH,MCHC,RDW,PLT,MPV,PCT,PDW,LYM,LYM%,MON,MON%,NEU,NEU%,EOS,EOS%,BAS,BAS%,ALY,ALY%,LIC,LIC%
9 |
10 | # a prediction is counted as correct when |X - Y| <= 5 (see the worked example at the end of this file)
11 |
12 | '''
13 | 数据处理部分
14 | '''
15 | # 数据集路径
16 | cwd = os.getcwd()
17 |
18 | train = csv.reader(open(cwd + '/train.csv', 'rb'))
19 | predict = csv.reader(open(cwd + '/predict.csv', 'rb'))
20 |
21 |
22 | # convert labels to one-hot format, 100 classes (ages 0 - 99)
23 | def dense_to_one_hot(labels_dense, num_classes=100):
24 | labels_dense = np.array(labels_dense)
25 | num_labels = labels_dense.shape[0]
26 | index_offset = np.arange(num_labels) * num_classes
27 | labels_one_hot = np.zeros((num_labels, num_classes))
28 | labels_one_hot.flat[index_offset + labels_dense.ravel()] = 1
29 | return labels_one_hot
30 |
31 |
32 | # 读取数据
33 | def write_to_tensor(name, csv_name):
34 | if os.path.exists(name):
35 | return
36 | csv_file = csv.reader(open(cwd + '/' + csv_name, 'rb'))
37 | writer = tf.python_io.TFRecordWriter(name)
38 | i = 0
39 | for line in csv_file:
40 | if not line:
41 | break
42 |         if len(line) != 29:
43 | continue
44 |         if line[2] == '1.5':
45 | print line[2]
46 | continue
47 | index = [int(line[2])]
48 | data = map(float, line)[3:29]
49 | # 注意list类型, Feature或FeatureList等
50 | example = tf.train.Example(features=tf.train.Features(feature={
51 | "label": tf.train.Feature(int64_list=tf.train.Int64List(value=index)),
52 | 'content': tf.train.Feature(float_list=tf.train.FloatList(value=data))
53 | }))
54 | print data, index
55 | # 序列化并写入tfrecord
56 | writer.write(example.SerializeToString())
57 | i += 1
58 | print i, "Data dealed"
59 | writer.close()
60 |
61 |
62 | # 读取数据并解析
63 | def read_and_decode(filename):
64 | # 根据文件名生成一个队列
65 | filename_queue = tf.train.string_input_producer([filename])
66 | # 创建tfrecord reader
67 | reader = tf.TFRecordReader()
68 | # 返回文件名和文件
69 | _, serialized_example = reader.read(filename_queue)
70 | # 读取时要注意fix shape
71 | features = tf.parse_single_example(serialized_example,
72 | features={
73 | 'label': tf.FixedLenFeature([], tf.int64),
74 | 'content': tf.FixedLenFeature([26], tf.float32),
75 | })
76 |
77 | data = tf.cast(features['content'], tf.float32)
78 | label = tf.cast(features['label'], tf.int32)
79 | return data, label
80 |
81 |
82 | '''
83 | 网络结构部分
84 | '''
85 |
86 | # 定义占位符
87 | x = tf.placeholder(tf.float32, shape=[None, 26])
88 | y_ = tf.placeholder(tf.float32, shape=[None, 100])
89 |
90 |
91 | # 定义权重参数格式函数 参数初始值为随机数 0 ~ 0.2
92 | def weight_variable(shape):
93 | initial = tf.truncated_normal(shape, stddev=random.uniform(0, 0.2))
94 | return tf.Variable(initial)
95 |
96 |
97 | def bias_variable(shape):
98 | initial = tf.constant(random.uniform(0, 0.2), shape=shape)
99 | return tf.Variable(initial)
100 |
101 |
102 | # 调整输入尺寸,一维展开以适应输入层
103 | # 全连接层参数格式
104 | # 全连接层1参数格式
105 | W_fc1 = weight_variable([26, 64])
106 | b_fc1 = bias_variable([64])
107 |
108 | # 全连接层1reshape
109 | h_pool2_flat = tf.reshape(x, [-1, 26])
110 |
111 | # 激励函数fc1
112 | h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)
113 |
114 |
115 | # 全连接层2参数格式
116 | W_fc2 = weight_variable([64, 512])
117 | b_fc2 = bias_variable([512])
118 |
119 | # 全连接层2输入reshape
120 | h_fc1_2 = tf.reshape(h_fc1, [-1, 64])
121 |
122 | # 激励函数fc2
123 | h_fc2 = tf.nn.relu(tf.matmul(h_fc1_2, W_fc2) + b_fc2)
124 |
125 | # dropout层
126 | keep_prob = tf.placeholder(tf.float32)
127 | h_fc1_drop = tf.nn.dropout(h_fc2, keep_prob)
128 |
129 | # 输出层参数格式
130 | W_fc3 = weight_variable([512, 100])
131 | b_fc3 = bias_variable([100])
132 |
133 | # 输出内容为y_result
134 | y_result = tf.matmul(h_fc1_drop, W_fc3) + b_fc3
135 |
136 | # 定义损失函数 交叉熵
137 | cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(y_result, y_))
138 |
139 | # 定义训练op
140 | train_step = tf.train.AdamOptimizer(0.01).minimize(cross_entropy)
141 |
142 | # 定义正确预测 |y - Y| <= 5
143 | correct_prediction = tf.less_equal(tf.abs(tf.sub(tf.argmax(y_result, 1), tf.argmax(y_, 1))), 5)
144 | # correct_prediction = tf.equal(tf.argmax(y_result, 1), tf.argmax(y_, 1))
145 |
146 | # 定义正确率
147 | accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
148 |
149 | # 定义Model Saver op
150 | saver = tf.train.Saver()
151 |
152 | # 定义计算图激活op
153 | init_op = tf.global_variables_initializer()
154 |
155 | '''
156 | 训练部分
157 | '''
158 |
159 | # 如果没有保存模型则训练一个新的
160 |
161 | if not os.path.exists("./ckpt_age/checkpoint"):
162 | # 创建tfrecord
163 | write_to_tensor('train_age.tfrecords', 'train.csv')
164 | write_to_tensor('predict_age.tfrecords', 'predict.csv')
165 | # 读取tfrecord
166 | train_img, train_label = read_and_decode("train_age.tfrecords")
167 | test_img, test_label = read_and_decode("predict_age.tfrecords")
168 |
169 | # 使用shuffle_batch分batch并打乱顺序
170 | img_batch, label_batch = tf.train.shuffle_batch([train_img, train_label],
171 | batch_size=17, capacity=2000,
172 | min_after_dequeue=1000)
173 | test_img_batch, test_label_batch = tf.train.shuffle_batch([test_img, test_label],
174 | batch_size=200, capacity=20000,
175 | min_after_dequeue=10000)
176 | with tf.Session() as sess:
177 | # 激活计算图
178 | sess.run(init_op)
179 | # 启动队列
180 | threads = tf.train.start_queue_runners(sess=sess)
181 | # 迭代次数 = 10000
182 | for i in range(10000):
183 | # batch
184 | image, label = sess.run([img_batch, label_batch])
185 | # 输出局部正确率
186 | if i % 100 == 0:
187 | train_accuracy = accuracy.eval(feed_dict={
188 | x: image, y_: dense_to_one_hot(label), keep_prob: 1.0})
189 | print("step %d, training accuracy %g" % (i, train_accuracy))
190 | train_step.run(feed_dict={x: image, y_: dense_to_one_hot(label), keep_prob: 0.5})
191 | # 加载测试集
192 | test_img, test_label = sess.run([test_img_batch, test_label_batch])
193 | # 输出整体正确率
194 | print("test accuracy %g" % accuracy.eval(feed_dict={
195 | x: test_img, y_: dense_to_one_hot(test_label), keep_prob: 1.0}))
196 | # 保存模型
197 | save_path = saver.save(sess, cwd + "/ckpt_age/age.ckpt", write_meta_graph=None)
198 | print("Model saved in file: %s" % save_path)
199 |
200 | '''
201 | 预测部分
202 | '''
203 |
204 | def preloadedata(data):
205 | return tf.reshape(np.array(map(float, data[3:29])), [1, 26]).eval()
206 |
207 | # 加载模型
208 | with tf.Session() as sess:
209 | # 恢复checkpoint.
210 | saver.restore(sess, cwd + "/ckpt_age/age.ckpt")
211 | print("Model restored.")
212 | # 读取数据
213 | predict_data = csv.reader(open(cwd + '/predict.csv', 'rb'))
214 | # 预处理数据
215 | my_data = [108,0,7,8.2,7.2,0.191,10.2,2.87,35.1,0.79,9.6,4.38,53.5,0.05,4.8,0.6,0.1,1.2,0.09,1.1,0.14,1.7,139,0.403,84,29,346,10.3,267]
216 | my_data = preloadedata(my_data)
217 | # 输出预测结果
218 | print "predictions", tf.argmax(y_result, 1).eval(feed_dict={x: my_data, keep_prob: 1.0}, session=sess)
219 | # 输出各年龄概率
220 | # print "probabilities", tf.nn.softmax(y_result.eval(feed_dict={x: my_data, keep_prob: 1.0}, session=sess)).eval()
221 |
222 |
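223 |
224 | # --- Worked example for the labels and accuracy criterion above (editor's note) ---
225 | # dense_to_one_hot([37]) returns a 1x100 row that is all zeros except a 1.0 at index 37.
226 | # A prediction is counted as correct when |argmax(prediction) - true age| <= 5, so for a
227 | # true age of 37 any predicted age from 32 to 42 counts as a hit.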
--------------------------------------------------------------------------------
/TensorFlow/agepredict_v2.0.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | '''
3 | Defines a layer-adding function and performs the age prediction task. By expanding the label dimension, different age brackets map to different output nodes;
4 | after a very long round of parameter tuning, a reasonably good result was found,
5 | using 2 hidden layers, each sized at about 75% of the input nodes.
6 | This raises the age prediction accuracy to about 23%, at best 25%.
7 | Its typical BP neural network pipeline can be reused (a worked example of the label encoding is in the comments at the end of this file). -- SA312
8 |
9 | Added printing of the loss and TensorBoard visualization. -- SA458
10 | '''
11 |
12 | import tensorflow as tf
13 | import numpy as np
14 | import csv
15 | import math
16 |
17 |
18 | label_orign2 = []
19 | data_orign2 = []
20 | sex_orign2 = []
21 | age_orign2 = []
22 |
23 | #读预测数据
24 | with open('predict.csv','rb') as precsv2:
25 | reader2 = csv.reader(precsv2)
26 | for line2 in reader2:
27 |
28 | if reader2.line_num == 1:
29 | continue
30 | label_origntemp2 = [0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0] #升维度
31 | label_origntemp2.insert(int(math.floor(float(line2[2])/10)),float(math.floor(float(line2[2])/10)))
32 | label_orign2.append(label_origntemp2)
33 | data_orign2.append(line2[3:])
34 | label_np_arr2 = np.array(label_orign2)
35 | data_np_arr2 = np.array(data_orign2)
36 | sex_np_arr2 = np.array(sex_orign2)
37 |
38 | data_len2 = data_np_arr2.shape[1]
39 | data_num2 = data_np_arr2.shape[0]
40 |
41 |
42 |
43 | label_orign = []
44 | data_orign = []
45 | sex_orign = []
46 | age_orign = []
47 | #读训练数据
48 | with open('train.csv','rb') as precsv:
49 | reader = csv.reader(precsv)
50 | for line in reader:
51 |
52 | if reader.line_num == 1:
53 | continue
54 | label_origntemp = [0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0] #升维度
55 | label_origntemp.insert(int(math.floor(float(line[2])/10)),float(math.floor(float(line[2])/10)))
56 | label_orign.append(label_origntemp)
57 | data_orign.append(line[3:])
58 | label_np_arr = np.array(label_orign)
59 | data_np_arr = np.array(data_orign)
60 | #sex_np_arr = np.array(sex_orign)
61 |
62 |
63 | data_len = data_np_arr.shape[1]
64 | data_num = data_np_arr.shape[0]
65 |
66 | #添加层函数
67 | def add_layer(inputs,in_size,out_size,n_layer,activation_function=None):
68 | layer_name='layer%s'%n_layer
69 | with tf.name_scope('layer'):
70 | with tf.name_scope('weights'):
71 | Ws = tf.Variable(tf.random_normal([in_size,out_size]))
72 | tf.histogram_summary(layer_name+'/weights',Ws)
73 | with tf.name_scope('baises'):
74 | bs = tf.Variable(tf.zeros([1,out_size])+0.5)
75 | tf.histogram_summary(layer_name+'/baises',bs)
76 | with tf.name_scope('Wx_plus_b'):
77 | Wxpb = tf.matmul(inputs,Ws) + bs
78 |
79 | if activation_function is None:
80 | outputs = Wxpb
81 | else:
82 | outputs = activation_function(Wxpb)
83 | tf.histogram_summary(layer_name+'/outputs',outputs)
84 | return outputs
85 | #比较函数
86 | def compute_accuracy(v_xs,v_ys):
87 | global prediction
88 | y_pre = sess.run(prediction,feed_dict={xs:v_xs})
89 | correct_prediction = tf.equal(tf.argmax(y_pre,1),tf.argmax(v_ys,1))
90 | accuracy = tf.reduce_mean(tf.cast(correct_prediction,tf.float32))
91 | result = sess.run(accuracy,feed_dict={xs:v_xs,ys:v_ys})
92 | return result
93 |
94 | # define placeholder for inputs to network
95 | with tf.name_scope('inputs'):
96 | xs = tf.placeholder(tf.float32,[None,data_len])
97 | ys = tf.placeholder(tf.float32,[None,10])
98 |
99 | #3个隐藏层
100 | l1 = add_layer(xs,data_len,19,n_layer=1,activation_function=tf.nn.sigmoid)
101 | l2 = add_layer(l1,19,19,n_layer=2,activation_function=tf.nn.sigmoid)
102 | l3 = add_layer(l2,19,19,n_layer=3,activation_function=tf.nn.sigmoid)
103 | # add output layer
104 | prediction = add_layer(l3,19,10,n_layer=4,activation_function=tf.nn.softmax)
105 |
106 |
107 |
108 | with tf.name_scope('loss'):
109 | cross_entropy = tf.reduce_mean(-tf.reduce_sum(ys*tf.log(prediction),reduction_indices=[1]))
110 |     tf.scalar_summary('loss',cross_entropy) # logged to the TensorBoard events file
111 | with tf.name_scope('train'):
112 | train_step = tf.train.GradientDescentOptimizer(0.1).minimize(cross_entropy)
113 |
114 |
115 | init = tf.initialize_all_variables()
116 |
117 | saver = tf.train.Saver()
118 | sess = tf.Session()
119 | merged = tf.merge_all_summaries()
120 | writer = tf.train.SummaryWriter("logs/", sess.graph)
121 | sess.run(init)
122 |
123 | for i in range(10000):
124 | _, cost = sess.run([train_step, cross_entropy], feed_dict={xs:data_np_arr,
125 | ys:label_np_arr.reshape((data_num,10))})
126 | #sess.run(train_step,feed_dict={xs:data_np_arr,ys:label_np_arr.reshape((data_num,10))})
127 | if i%50 == 0:
128 | print("Epoch:", '%04d' % (i), "cost=", \
129 | "{:.9f}".format(cost),"Accuracy:",compute_accuracy(data_np_arr2,label_np_arr2.reshape((data_num2,10))))
130 | result = sess.run(merged,feed_dict={xs:data_np_arr,
131 | ys:label_np_arr.reshape((data_num,10))})
132 | writer.add_summary(result,i)
133 |
134 | print("Optimization Finished!")
135 |
136 |
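137 |
138 | # --- Worked example of the label encoding above (editor's note) ---
139 | # For a record with age 37: floor(37/10) = 3, so the value 3.0 is inserted at index 3 of a
140 | # length-9 zero list, giving the length-10 label vector
141 | #   [0.0, 0.0, 0.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
142 | # i.e. each decade 0-9 owns one output node. Note that for ages 0-9 the inserted value is 0.0,
143 | # so that label vector is all zeros; inserting a plain 1.0 (a true one-hot) would avoid this.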
--------------------------------------------------------------------------------
/TensorFlow/rnn.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 | import tensorflow as tf
3 | from tensorflow.python.ops import rnn, rnn_cell
4 | import numpy as np
5 |
6 | # Parameters
7 | learning_rate = 0.001
8 | training_iters = 200
9 | display_step = 10
10 |
11 | # Network Parameters
12 | n_input = 13 # features per time step (the 26 blood test features are split into 2 steps of 13)
13 | n_steps = 2 # timesteps
14 | n_hidden = 64 # hidden layer num of features
15 | n_classes = 2 # output classes (male / female)
16 |
17 | def one_hot(a, length):
18 | b = np.zeros([length, 2])
19 | for i in range(length):
20 | if a[i] == 0:
21 | b[i][1] = 1
22 | else:
23 | b[i][0] = 1
24 | return b
25 |
26 | #1858+200
27 | train_data = np.loadtxt(open("./train.csv","rb"),delimiter=",",skiprows=0)
28 | test_data = np.loadtxt(open("./predict.csv","rb"),delimiter=",",skiprows=0)
29 | train_label_sex = train_data[:, 1:2]
30 | train_label_sex = one_hot(train_label_sex,train_data.shape[0])
31 | train_data = train_data[:, 3:]
32 | train_data = np.reshape(train_data, (1858,n_steps,n_input))
33 |
34 |
35 |
36 | test_label_sex = test_data[:, 1:2]
37 | test_label_sex = one_hot(test_label_sex,test_data.shape[0])
38 | test_data = test_data[:, 3:]
39 | test_data = np.reshape(test_data, (200,n_steps,n_input))
40 |
41 |
42 | # tf Graph input
43 | x = tf.placeholder("float", [None, n_steps, n_input])
44 | y = tf.placeholder("float", [None, n_classes])
45 |
46 | # Define weights
47 | weights = {
48 | 'out': tf.Variable(tf.random_normal([n_hidden, n_classes]))
49 | }
50 | biases = {
51 | 'out': tf.Variable(tf.random_normal([n_classes]))
52 | }
53 |
54 |
55 | def RNN(x, weights, biases):
56 |
57 | # Prepare data shape to match `rnn` function requirements
58 | # Current data input shape: (batch_size, n_steps, n_input)
59 | # Required shape: 'n_steps' tensors list of shape (batch_size, n_input)
60 |
61 | # Permuting n_steps
62 |
63 | x = tf.transpose(x, [1, 0, 2])
64 | # Reshaping to (n_steps*batch_size, n_input)
65 | x = tf.reshape(x, [-1, n_input])
66 | # Split to get a list of 'n_steps' tensors of shape (batch_size, n_input)
67 | x = tf.split(0, n_steps, x)
68 |
69 | # Define a lstm cell with tensorflow
70 | lstm_cell = rnn_cell.BasicLSTMCell(n_hidden, forget_bias=1.0)
71 |
72 | # Get lstm cell output
73 | outputs, states = rnn.rnn(lstm_cell, x, dtype=tf.float32)
74 |
75 | # Linear activation, using rnn inner loop last output
76 | return tf.matmul(outputs[-1], weights['out']) + biases['out']
77 |
78 | pred = RNN(x, weights, biases)
79 |
80 | # Define loss and optimizer
81 | cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(pred, y))
82 | optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)
83 |
84 | # Evaluate model
85 | correct_pred = tf.equal(tf.argmax(pred,1), tf.argmax(y,1))
86 | accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
87 |
88 | # Initializing the variables
89 | init = tf.global_variables_initializer()
90 |
91 | # Launch the graph
92 | with tf.Session() as sess:
93 | sess.run(init)
94 | step = 1
95 | # Keep training until reach max iterations
96 | while step < training_iters:
97 |         # The whole training set is fed as a single batch
98 |         # Run optimization op (backprop)
99 | sess.run(optimizer, feed_dict={x: train_data, y: train_label_sex})
100 | if step % display_step == 0:
101 | # Calculate batch accuracy
102 | acc = sess.run(accuracy, feed_dict={x: train_data, y: train_label_sex})
103 | # Calculate batch loss
104 | loss = sess.run(cost, feed_dict={x: train_data, y: train_label_sex})
105 | print("Iter " + str(step) + ", Loss= " + \
106 | "{:.6f}".format(loss) + ", Training Accuracy= " + \
107 | "{:.5f}".format(acc))
108 | step += 1
109 | print("Optimization Finished!")
110 |
111 |
112 | print("Testing Accuracy:", \
113 | sess.run(accuracy, feed_dict={x: test_data, y: test_label_sex}))
114 |
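115 |
116 | # --- Shape walk-through for RNN() above (editor's note) ---
117 | # train_data is (1858, 2, 13): 1858 reports, n_steps=2 steps of n_input=13 features each.
118 | # tf.transpose(x, [1, 0, 2]) -> (2, 1858, 13)
119 | # tf.reshape(x, [-1, 13])    -> (3716, 13)
120 | # tf.split(0, 2, x)          -> a list of 2 tensors of shape (1858, 13), as the old rnn.rnn API expects.
121 | # With the whole training set fed as one batch, outputs[-1] is (1858, 64); multiplying by
122 | # weights['out'] gives the (1858, 2) class scores.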
--------------------------------------------------------------------------------
/TensorFlow/sex_predict.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import os
3 | import tensorflow as tf
4 | import numpy as np
5 | import csv
6 | import random
7 | # id,sex,age,WBC,RBC,HGB,HCT,MCV,MCH,MCHC,RDW,PLT,MPV,PCT,PDW,LYM,LYM%,MON,MON%,NEU,NEU%,EOS,EOS%,BAS,BAS%,ALY,ALY%,LIC,LIC%
8 |
9 | # 1 = male, 0 = female
10 |
11 | '''
12 | 数据处理部分
13 | '''
14 | # 数据集路径
15 | cwd = os.getcwd()
16 |
17 | train = csv.reader(open(cwd + '/train.csv', 'rb'))
18 | predict = csv.reader(open(cwd + '/predict.csv', 'rb'))
19 |
20 |
21 | # 转化标签为one-hot格式(类别为两类,男和女)
22 | def dense_to_one_hot(labels_dense, num_classes=2):
23 | labels_dense = np.array(labels_dense)
24 | num_labels = labels_dense.shape[0]
25 | index_offset = np.arange(num_labels) * num_classes
26 | labels_one_hot = np.zeros((num_labels, num_classes))
27 | labels_one_hot.flat[index_offset + labels_dense.ravel()] = 1
28 | return labels_one_hot
29 |
30 |
31 | # 读取数据
32 | def write_to_tensor(name, csv_name):
33 | if os.path.exists(name):
34 | return
35 | csv_file = csv.reader(open(cwd + '/' + csv_name, 'rb'))
36 | writer = tf.python_io.TFRecordWriter(name)
37 | for line in csv_file:
38 | if not line:
39 | break
40 |         if len(line) != 29:
41 | continue
42 | index = [int(line[1])]
43 |         # take the 26 test values (columns 4 through 29)
44 | data = map(float, line)[3:29]
45 | # 注意list类型, Feature或FeatureList等
46 | example = tf.train.Example(features=tf.train.Features(feature={
47 | "label": tf.train.Feature(int64_list=tf.train.Int64List(value=index)),
48 | 'content': tf.train.Feature(float_list=tf.train.FloatList(value=data))
49 | }))
50 | print data, index
51 | # 序列化并写入tfrecord
52 | writer.write(example.SerializeToString())
53 | writer.close()
54 |
55 |
56 | # 读取数据并解析
57 | def read_and_decode(filename):
58 | # 根据文件名生成一个队列
59 | filename_queue = tf.train.string_input_producer([filename])
60 | # 创建tfrecord reader
61 | reader = tf.TFRecordReader()
62 | # 返回文件名和文件
63 | _, serialized_example = reader.read(filename_queue)
64 | # 读取时要注意fix shape
65 | features = tf.parse_single_example(serialized_example,
66 | features={
67 | 'label': tf.FixedLenFeature([], tf.int64),
68 | 'content': tf.FixedLenFeature([26], tf.float32),
69 | })
70 | data = tf.cast(features['content'], tf.float32)
71 | label = tf.cast(features['label'], tf.int32)
72 | return data, label
73 |
74 |
75 | '''
76 | Network structure: two hidden layers, 26 - 64 - 512 - 2, all fully connected
77 | '''
78 |
79 | #添加层函数
80 | def add_layer(inputs,in_size,out_size,n_layer,activation_function=None):
81 | layer_name='layer%s'%n_layer
82 | with tf.name_scope('layer'):
83 | with tf.name_scope('weights'):
84 | Ws = tf.Variable(tf.random_normal([in_size,out_size]))
85 | tf.histogram_summary(layer_name+'/weights',Ws)
86 | with tf.name_scope('baises'):
87 | bs = tf.Variable(tf.zeros([1,out_size])+0.5)
88 | tf.histogram_summary(layer_name+'/baises',bs)
89 | with tf.name_scope('Wx_plus_b'):
90 | Wxpb = tf.matmul(inputs,Ws) + bs
91 |
92 | if activation_function is None:
93 | outputs = Wxpb
94 | else:
95 | outputs = activation_function(Wxpb)
96 | tf.histogram_summary(layer_name+'/outputs',outputs)
97 | return outputs
98 |
99 | # 定义占位符
100 | with tf.name_scope('inputs'):
101 | x = tf.placeholder(tf.float32, shape=[None, 26])
102 | y_ = tf.placeholder(tf.float32, shape=[None, 2])
103 |
104 | #2个隐藏层
105 | l1 = add_layer(tf.reshape(x, [-1, 26]),26,64,n_layer=1,activation_function=tf.nn.relu)
106 | l2 = add_layer(l1,64,512,n_layer=2,activation_function=tf.nn.relu)
107 | # add output layer
108 | y_result = add_layer(l2,512,2,n_layer=3)
109 |
110 |
111 | # 定义损失函数 交叉熵
112 | with tf.name_scope('loss'):
113 | cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(y_result, y_))
114 | tf.scalar_summary('loss',cross_entropy)
115 | # 定义训练op
116 | with tf.name_scope('train'):
117 | train_step = tf.train.AdamOptimizer(0.1).minimize(cross_entropy)
118 |
119 | # 定义正确预测
120 | # correct_prediction = tf.less_equal(tf.abs(tf.sub(tf.argmax(y_result, 1), tf.argmax(y_, 1))), 5)
121 | correct_prediction = tf.equal(tf.argmax(y_result, 1), tf.argmax(y_, 1))
122 |
123 | # 定义正确率
124 | accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
125 |
126 | # 定义Model Saver op
127 | saver = tf.train.Saver()
128 |
129 | # 定义计算图激活op
130 | init_op = tf.global_variables_initializer()
131 |
132 | '''
133 | 训练部分
134 | '''
135 | # 如果没有保存模型则训练一个新的
136 | if not os.path.exists("./ckpt_sex/checkpoint"):
137 | # 创建tfrecord
138 | write_to_tensor('train_sex.tfrecords', 'train.csv')
139 | write_to_tensor('predict_sex.tfrecords', 'predict.csv')
140 | # 读取tfrecord
141 | train_img, train_label = read_and_decode("train_sex.tfrecords")
142 | test_img, test_label = read_and_decode("predict_sex.tfrecords")
143 |
144 | # 使用shuffle_batch分batch并打乱顺序
145 | img_batch, label_batch = tf.train.shuffle_batch([train_img, train_label],
146 | batch_size=17, capacity=2000,
147 | min_after_dequeue=1000)
148 | test_img_batch, test_label_batch = tf.train.shuffle_batch([test_img, test_label],
149 | batch_size=200, capacity=20000,
150 | min_after_dequeue=10000)
151 | with tf.Session() as sess:
152 |
153 | merged = tf.merge_all_summaries()
154 | writer = tf.train.SummaryWriter("logs/", sess.graph)
155 | # 激活计算图
156 | sess.run(init_op)
157 | # 启动队列
158 | threads = tf.train.start_queue_runners(sess=sess)
159 | # 迭代次数 = 10000
160 | for i in range(10000):
161 | # batch
162 | image, label = sess.run([img_batch, label_batch])
163 | # 输出局部正确率
164 | if i % 100 == 0:
165 | train_accuracy = accuracy.eval(feed_dict={
166 | x: image, y_: dense_to_one_hot(label)})
167 | print("step %d, training accuracy %g" % (i, train_accuracy))
168 | result = sess.run(merged,feed_dict={x: image,
169 | y_: dense_to_one_hot(label)})
170 | writer.add_summary(result,i)
171 | train_step.run(feed_dict={x: image, y_: dense_to_one_hot(label)})
172 | # 加载测试集
173 | test_img, test_label = sess.run([test_img_batch, test_label_batch])
174 | # 输出整体正确率
175 | print("test accuracy %g" % accuracy.eval(feed_dict={
176 | x: test_img, y_: dense_to_one_hot(test_label)}))
177 | # 保存模型
178 | save_path = saver.save(sess, cwd + "/ckpt_sex/sex.ckpt", write_meta_graph=None)
179 | print("Model saved in file: %s" % save_path)
180 |
181 | '''
182 | Prediction
183 | Pass any row in the prediction CSV format (29 values including the id)
184 | '''
185 |
186 | def preloadedata(data):
187 | return tf.reshape(np.array(map(float, data[3:29])), [1, 26]).eval()
188 |
189 | # 加载模型
190 | with tf.Session() as sess:
191 | # 恢复checkpoint.
192 | saver.restore(sess, cwd + "/ckpt_sex/sex.ckpt")
193 | print("Model restored.")
194 | # 读取数据
195 | predict_data = csv.reader(open(cwd + '/predict.csv', 'rb'))
196 | # 预处理数据
197 | my_data = [37,1,66,8.7,6.9,0.111,10.8,0.55,6.3,0.4,4.6,7.61,87.7,0.1,3.78,1.1,0.03,0.3,0.03,0.4,0.16,1.8,122,0.352,93,32.1,345,11.4,160]
198 | my_data = preloadedata(my_data)
199 | # 输出预测结果
200 |     print "predictions", tf.argmax(y_result, 1).eval(feed_dict={x: my_data}, session=sess)
201 |     # print the male/female probabilities
202 |     print "probabilities", tf.nn.softmax(y_result.eval(feed_dict={x: my_data}, session=sess)).eval()
203 |
204 |
--------------------------------------------------------------------------------
/Traindata.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python2
2 | # -*- coding: utf-8 -*-
3 | """
4 | Created on Tue Dec 13 17:19:21 2016
5 |
6 | @author: zhao
7 | """
8 | import random
9 | import numpy as np
10 | import pandas as pd
11 |
12 | class Traindata:
13 | def __init__(self):
14 | self.df = pd.read_csv('trainurl', index_col = 0)
15 | #将性别转化为2维矩阵,行代表病人id,列代表性别,为男则第一列置1,女则第二列置1
16 | self.gender = np.zeros((1858,2))
17 | for i in range(1858):
18 | if self.df.iloc[i,0]==1:
19 | self.gender[i,0]=1
20 | else:
21 | self.gender[i,1]=1
22 | self.age = self.df.loc[1:,['age']]
23 | #将26项指标转换为26列的矩阵
24 | self.parameter = self.df.loc[1:,['WBC','RBC','HGB','HCT','MCV','MCH','MCHC','ROW','PLT','MPV','PCT','PDW','LYM','LYM%','MON','MON%','NEU','NEU%','EOS','EOS%','BAS','BAS%','ALY','ALY%','LIC','LIC%']]
25 | self.parameter = np.array(self.parameter)
26 |     # returns n randomly sampled records
27 |     def next_batch_gender(self,n):
28 |         lable = np.zeros((n,2))
29 |         para = np.zeros((n,26))
30 |         for i in range(n):
31 |             k = random.randint(0, 1857)  # valid row indices are 0..1857
32 |             if self.gender[k,0]==1:
33 |                 lable[i,0]=1
34 |             else:
35 |                 lable[i,1]=1
36 |             para[i] = self.parameter[k]
37 |         return para,lable
38 |
39 |     def next_batch_age(self,n):
40 |         para = np.zeros((n,26))
41 |         for i in range(n):
42 |             k = random.randint(0, 1857)
43 |             if(i==0):
44 |                 age = pd.DataFrame([self.age.iloc[k]])
45 |             else:
46 |                 age = age.append(self.age.iloc[k])  # DataFrame.append returns a new frame
47 |             para[i] = self.parameter[k]
48 |         return para,age
49 |
50 |
51 |
52 |
53 |
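54 |
55 | # Minimal usage sketch (editor's note): assumes the 'trainurl' placeholder above has been
56 | # replaced with the real path of a training CSV containing the columns referenced in __init__.
57 | if __name__ == '__main__':
58 |     td = Traindata()
59 |     para, lable = td.next_batch_gender(32)  # 32 random samples: (32, 26) features, (32, 2) one-hot sex
60 |     print para.shape, lable.shape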
--------------------------------------------------------------------------------
/dealdata.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | import csv as cv
3 | import numpy as np
4 | import re
5 | csv_file1_object = cv.reader(open('table1.csv','rb'))
6 | csv_file2_object = cv.reader(open('table2.csv','rb'))
7 | csv_file3=open('table3.csv','wb')
8 | csv_file3_object=cv.writer(csv_file3)
9 | head1 = csv_file1_object.next()
10 | head2 = csv_file2_object.next()
11 | data_2=[]
12 | data2=[]
13 | for row in csv_file2_object:
14 | #print len(row[1])
15 | if len(row[1])<10 and int(row[0])<=26:
16 | data_2.append(row)
17 | else:
18 | pass
19 |
20 | data2=np.array(data_2)
21 | col=0
22 | data2=data2[np.argsort(data2[:,col])]
23 |
24 |
25 | csv_file3_object.writerow(['id','sex','age','WBC','RBC','HGB','HCT','MCV','MCH'\
26 | ,'MCHC','RDW','PLT','MPV','PCT','PDW','LYM','LYM%','MON','MON%','NEU','NEU%','EOS'\
27 | ,'EOS%','BAS','BAS%','ALY','ALY%','LIC','LIC%'])
28 | i=1
29 | for row in csv_file1_object:
30 | right_only_stats= data2[(data2[0::,2]==row[2] ) & (data2[0::,3]==row[3]),1]
31 |
32 | #right_only_stats=np.column_stack((right_only_stats,np.array([row[0],row[1]])))
33 | right_only_stats=np.insert(right_only_stats,0,values=i,axis=None)
34 | i=i+1
35 | right_only_stats=np.insert(right_only_stats,1,values=row[0],axis=None)
36 | right_only_stats=np.insert(right_only_stats,2,values=row[1],axis=None)
37 | #right_only_stats= data2[0::,2]==row[2]
38 | if len(right_only_stats)==29:
39 | csv_file3_object.writerow(right_only_stats)
--------------------------------------------------------------------------------
/matlab/nn/create_nn.m:
--------------------------------------------------------------------------------
1 | load('predict_input_transpose.mat')
2 | load('predict_output_transpose.mat')
3 | load('train_input_transpose.mat')
4 | load('train_output_transpose.mat')
5 | net=newff(train_input_transpose,train_output_transpose,{10,2});
6 | [net,tr]=train(net,train_input_transpose,train_output_transpose);
--------------------------------------------------------------------------------
/matlab/nn/network_hit139.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csxiaoyaojianxian/BloodTestReportOCR/95d058e4999806fa50bbcf6d10fe8a0af5746759/matlab/nn/network_hit139.mat
--------------------------------------------------------------------------------
/matlab/nn/readme.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csxiaoyaojianxian/BloodTestReportOCR/95d058e4999806fa50bbcf6d10fe8a0af5746759/matlab/nn/readme.md
--------------------------------------------------------------------------------
/matlab/nn/test_nn.m:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csxiaoyaojianxian/BloodTestReportOCR/95d058e4999806fa50bbcf6d10fe8a0af5746759/matlab/nn/test_nn.m
--------------------------------------------------------------------------------
/matlab/svm_with_pca/readme.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csxiaoyaojianxian/BloodTestReportOCR/95d058e4999806fa50bbcf6d10fe8a0af5746759/matlab/svm_with_pca/readme.md
--------------------------------------------------------------------------------
/matlab/svm_with_pca/svm_with_pca.m:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csxiaoyaojianxian/BloodTestReportOCR/95d058e4999806fa50bbcf6d10fe8a0af5746759/matlab/svm_with_pca/svm_with_pca.m
--------------------------------------------------------------------------------
/sklearn/.idea/bloodpredict.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/sklearn/.idea/misc.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/sklearn/.idea/modules.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/sklearn/README.md:
--------------------------------------------------------------------------------
1 | # Predicting patient sex and age with scikit-learn
2 |
3 | ## Environment setup (Ubuntu 14.04 or later)
4 |
5 | ```
6 | sudo apt-get install python-numpy cython python-scipy python-matplotlib
7 | pip install -U scikit-learn    # prepend sudo if this fails
8 | pip install pandas
9 | ```
10 |
11 | ## Usage
12 | 1. Download the preprocessed dataset
13 |
14 | ```
15 | chmod +x download.sh
16 | ./download.sh
17 | ```
18 |
19 | 2. Predict (a quick sanity check of the downloaded data is sketched at the end of this README)
20 |
21 | ```
22 | python gender_predict.py
23 | python age_predict.py
24 | ```
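25 |
26 | ### Quick sanity check of the downloaded data (sketch)
27 |
28 | A minimal sketch, not part of the original scripts: it assumes train2.csv uses the column layout hard-coded in gender_predict.py (class_names_train2) and only confirms the shape and the sex distribution before training.
29 |
30 | ```
31 | # -*- coding: utf-8 -*-
32 | import pandas as pd
33 |
34 | cols = ['sex', 'age', 'WBC', 'RBC', 'BAS#', 'HGB', 'HCT', 'MCV',
35 |         'MCH', 'MCHC', 'RDW-CV', 'PLT', 'MPV', 'PCT', 'PDW', 'LYM#',
36 |         'LYM%', 'MONO', 'MONO%', 'NEU#', 'NEU%', 'EOS#', 'EOS%', 'BAS%',
37 |         'IG#', 'IG%', 'NRBC#', 'NRBC%', 'P-LCR']
38 |
39 | df = pd.read_csv('train2.csv', names=cols)  # the file has no header row
40 | print df.shape                              # expect rows x 29 columns
41 | print df['sex'].value_counts()              # rough male/female balance
42 | ```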
--------------------------------------------------------------------------------
/sklearn/age_predict.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | pandas 0.18.1
4 | scikit-learn 0.18.1
5 | matplotlib 1.5.3
6 | numpy 1.11.1
7 | """
8 |
9 | import pandas as pd
10 | import numpy as np
11 | import matplotlib.pyplot as plt
12 |
13 | from sklearn.model_selection import train_test_split
14 | from sklearn.feature_selection import SelectFromModel
15 | from sklearn.ensemble import RandomForestRegressor
16 | from sklearn.ensemble import AdaBoostRegressor
17 |
18 | # 使用了预处理的第二组数据集
19 | class_names_train2 = ['sex','age','WBC','RBC','BAS#','HGB','HCT','MCV',
20 | 'MCH','MCHC','RDW-CV','PLT','MPV','PCT','PDW','LYM#',
21 | 'LYM%','MONO','MONO%','NEU#','NEU%','EOS#','EOS%','BAS%',
22 | 'IG#','IG%','NRBC#','NRBC%','P-LCR']
23 |
24 |
25 | def load_data():
26 |     # the dataset has already been merged, the header row removed, and sex preprocessed to a number
27 |     df = pd.DataFrame(pd.read_csv('train2.csv', names=class_names_train2))
28 |     # convert columns to numeric
29 |     df = df.convert_objects(convert_numeric=True)
30 |     # fill missing values with the column mean
31 |     df = df.fillna(df.mean())
32 | return df
33 |
34 |
35 | def split_data(df, low, high):
36 | """
37 | :param df: 输入的dataframe
38 | :param low: 截取区间的低阈值
39 | :param high: 截取区间的高阈值(不包含)
40 | :return: 截取的dataframe
41 | """
42 | df_lowcut = df[df['age'] >= low]
43 | df_cut = df_lowcut[df_lowcut['age'] < high]
44 |
45 | selected_names = [x for x in class_names_train2 if (x != 'age' and x != 'sex')]
46 | x_data = df_cut[selected_names].as_matrix()
47 | y_data = df_cut['age'].as_matrix()
48 | # 用平均值填充nan
49 | def fill_nan(np_array):
50 | col_mean = np.nanmean(np_array, axis=0)
51 | nan_ids = np.where(np.isnan(np_array))
52 | np_array[nan_ids] = np.take(col_mean, nan_ids[1])
53 | return np_array
54 |
55 | x_data = fill_nan(x_data)
56 | print 'x有没有nan值:', np.any(np.isnan(x_data))
57 | print 'y有没有nan值:', np.any(np.isnan(y_data))
58 |
59 | return x_data, y_data
60 |
61 |
62 | def draw(labels, prediction):
63 | """
64 | 绘制折线图比较结果
65 | :param labels: 1维numpy数组
66 | :param prediction: 1维numpy数组
67 | :return:
68 | """
69 | result = []
70 | for i in range(labels.shape[0]):
71 | result.append([labels[i], prediction[i]])
72 |
73 | # 将年龄按照大小排序
74 | result = sorted(result, key=lambda x: x[0])
75 | labels = [row[0] for row in result]
76 | prediction = [row[1] for row in result]
77 |
78 | plt.plot(labels, label='labels')
79 | plt.plot(prediction, label='predict')
80 | plt.legend(loc='upper left')
81 | plt.show()
82 |
83 |
84 | # 评估测试集
85 | def evalue(clf, X_test, y_test):
86 | pd = clf.predict(X_test)
87 |
88 | delta = [x1 - x2 for (x1, x2) in zip(y_test, pd)]
89 | correct_indices = [x for x in delta if abs(x) < 5]
90 | precision = float(len(correct_indices)) / len(pd)
91 |
92 | print '准确率为: ' + str(precision)
93 | draw(y_test, pd)
94 |
95 |
96 | def feature_select(clf, X_train, y_train, X_test):
97 | # 预训练
98 | print '特征选择预训练中...'
99 | clf.fit(X_train, y_train)
100 |
101 | # 评估特征
102 | importances = clf.feature_importances_
103 | indices = np.argsort(importances)[::-1]
104 | print("特征权值分布为: ")
105 | for f in range(X_train.shape[1]):
106 | print("%d. %s %d (%f)" % (f + 1, class_names_train2[indices[f]], indices[f], importances[indices[f]]))
107 |
108 | # 过滤掉权值小于threshold的特征
109 | model = SelectFromModel(clf, threshold=0.01, prefit=True)
110 | X_train_new = model.transform(X_train)
111 | X_test_new = model.transform(X_test)
112 | print '训练集和测试集的容量以及选择的特征数为: ', X_train_new.shape, X_test_new.shape
113 | # 返回压缩特征之后的训练集和测试集
114 | return X_train_new, X_test_new
115 |
116 |
117 | if __name__ == '__main__':
118 | #载入数据
119 | df = load_data()
120 | x1, y1 = split_data(df, 0, 25)
121 | x2, y2 = split_data(df, 25, 60)
122 | x3, y3 = split_data(df, 60, 80)
123 |
124 | def test_data(X_data, y_data):
125 | # 按9:1分裂训练集/测试集
126 | X_train, X_test, y_train, y_test = \
127 | train_test_split(X_data, y_data, test_size=0.1, random_state=0)
128 | # 使用随机森林
129 | clf = RandomForestRegressor(max_features=None, n_estimators=20, max_depth=None)
130 | # 特征选择
131 | X_train_compressed, X_test_compressed = feature_select(clf, X_train, y_train, X_test)
132 | # 使用提取的特征重新训练
133 | clf.fit(X_train_compressed, y_train)
134 | # 评估训练集效果
135 | evalue(clf, X_train_compressed, y_train)
136 | # 评估测试集效果
137 | evalue(clf, X_test_compressed, y_test)
138 |
139 | test_data(x1, y1)
140 | test_data(x2, y2)
141 | test_data(x3, y3)
142 |
--------------------------------------------------------------------------------
/sklearn/bloodpredict.py:
--------------------------------------------------------------------------------
1 | #coding = utf-8
2 | import pickle
3 | import numpy as np
4 | from sklearn import svm
5 | from sklearn import metrics
6 | from sklearn.cross_validation import train_test_split
7 |
8 |
9 | def extract(filename):
10 | X = np.loadtxt(filename, skiprows= 1,delimiter=',', usecols=(3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28))
11 | y = np.loadtxt(filename, dtype='string', skiprows= 1,delimiter=',', usecols=(1,))
12 | for i in range(len(y)):
13 | if y[i] == '\xc4\xd0':
14 | y[i] = 1
15 | else:
16 | y[i] = 0
17 | return X,y
18 |
19 | def split_test(X,y):
20 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
21 | return X_train, X_test, y_train, y_test
22 |
23 | def save_model(model,name):
24 | pickle.dump(model, open(str(name)+'.pkl', 'w'))
25 |
26 | def load_model(name):
27 | model = pickle.load(open(str(name)+'.pkl'))
28 | return model
29 |
30 | if __name__ == "__main__":
31 | X, y = extract('train.csv')
32 | X_train, X_test, y_train, y_test = split_test(X, y)
33 | clf = svm.SVC(kernel='linear', gamma=0.7, C = 1.0).fit(X_train, y_train)
34 | y_predicted = clf.predict(X_test)
35 | print metrics.classification_report(y_test, y_predicted)
36 | print
37 | print "test_accuracy_score"
38 | print metrics.accuracy_score(y_test, y_predicted)
39 | save_model(clf,'sex')
40 |
41 | X, y =extract('predict.csv')
42 | clf2 = load_model('sex')
43 | y2_predicted = clf2.predict(X)
44 | print "accuracy_score"
45 | print metrics.accuracy_score(y, y2_predicted)
46 |
47 |
48 |
49 |
--------------------------------------------------------------------------------
/sklearn/download.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | FILE=data.csv
4 | URL=http://home.ustc.edu.cn/~xxuan/$FILE
5 |
6 | echo "Downloading data.csv..."
7 | wget $URL -O $FILE
8 | echo "Done."
9 |
10 | FILE=train2.csv
11 | echo "Downloading train2.csv..."
12 | wget http://home.ustc.edu.cn/~xxuan/$FILE -O $FILE
13 | echo "Done."
--------------------------------------------------------------------------------
/sklearn/gender_predict.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | pandas 0.18.1
4 | scikit-learn 0.18.1
5 | matplotlib 1.5.3
6 | numpy 1.11.1
7 | """
8 |
9 | import pandas as pd
10 | import numpy as np
11 |
12 | from sklearn.model_selection import train_test_split
13 | from sklearn.feature_selection import SelectFromModel
14 | from sklearn.ensemble import AdaBoostClassifier
15 |
16 | class_names_train2 = ['sex', 'age', 'WBC', 'RBC', 'BAS#', 'HGB', 'HCT', 'MCV',
17 | 'MCH', 'MCHC', 'RDW-CV', 'PLT', 'MPV', 'PCT', 'PDW', 'LYM#',
18 | 'LYM%', 'MONO', 'MONO%', 'NEU#', 'NEU%', 'EOS#', 'EOS%', 'BAS%',
19 | 'IG#', 'IG%', 'NRBC#', 'NRBC%', 'P-LCR']
20 |
21 |
22 | def load_data():
23 |     # the dataset has already been merged and the header row removed
24 |     # sex preprocessing: male = 1, female = 0
25 |
26 | df = pd.DataFrame(pd.read_csv('train2.csv', names=class_names_train2))
27 | df = df.convert_objects(convert_numeric=True)
28 | df = df.fillna(df.mean())
29 |
30 |     # drop the label columns (sex and age) from the features
31 | selected_names = [x for x in class_names_train2 if (x != 'sex' and x != 'age')]
32 | X_data = df[selected_names].as_matrix()
33 | y_data = df['sex'].as_matrix().astype(int)
34 | return X_data, y_data
35 |
36 |
37 | def data_preprocess(X_data, y_data):
38 | # 按3:1分裂训练集/测试集
39 | X_train, X_test, y_train, y_test = \
40 | train_test_split(X_data, y_data, test_size=0.25)
41 | return X_train, X_test, y_train, y_test
42 |
43 |
44 | def evalue(clf, X_test, y_test):
45 | """
46 | 评估模型在测试集上的性能
47 | :param clf: 模型
48 | :param X_test: 测试集数据
49 | :param y_test: 测试集标记
50 | :return:
51 | """
52 | pd = clf.predict(X_test)
53 |
54 | correct_pairs = [(x, y) for (x, y) in zip(y_test, pd) if x == y]
55 | precision = float(len(correct_pairs)) / len(pd)
56 |
57 | print '准确率为: ' + str(precision)
58 |
59 |
60 | def feature_select(clf, X_train, y_train, X_test):
61 | # 预训练
62 | clf.fit(X_train, y_train)
63 |
64 | # 评估特征
65 | importances = clf.feature_importances_
66 | indices = np.argsort(importances)[::-1]
67 | print("特征权值分布为: ")
68 | for f in range(X_train.shape[1]):
69 | print("%d. %s %d (%f)" % (f + 1, class_names_train2[indices[f]], indices[f], importances[indices[f]]))
70 |
71 | # 过滤掉权值小于threshold的特征
72 | model = SelectFromModel(clf, threshold=0.04, prefit=True)
73 | X_train_new = model.transform(X_train)
74 | X_test_new = model.transform(X_test)
75 | print '训练集和测试集的容量以及选择的特征数为: ', X_train_new.shape, X_test_new.shape
76 | # 返回压缩特征之后的训练集和测试集
77 | return X_train_new, X_test_new
78 |
79 |
80 | if __name__ == '__main__':
81 | # 载入数据
82 | X_data, y_data = load_data()
83 | X_train, X_test, y_train, y_test = data_preprocess(X_data, y_data)
84 |
85 | # 使用adaboost
86 |     clf = AdaBoostClassifier()
87 | # 选择特征, 压缩数据
88 | X_train_compressed, X_test_compressed = feature_select(clf, X_train, y_train, X_test)
89 |
90 | # 使用选择的特征重新训练
91 | clf.fit(X_train_compressed, y_train)
92 | # 评估模型
93 | evalue(clf, X_test_compressed, y_test)
94 |
--------------------------------------------------------------------------------
/weixin/README.md:
--------------------------------------------------------------------------------
1 | # WeChat official account development
2 | ## 1. Environment setup
3 | ### Install lxml
4 | sudo apt-get install python-lxml
5 | ## 2. Running
6 | ### 1) Replace the token in wx.py with the token configured for your own official account interface
7 | ### 2) Start
8 | sudo python wx.py 80
9 | A sketch of the signature verification that the token is used for is given below.
10 |
11 |
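12 | ### Signature verification (sketch)
13 |
14 | A minimal sketch, not part of the original code, of the check the GET handler in wx.py performs when WeChat calls back: it sorts token, timestamp and nonce, concatenates them, and compares the SHA-1 digest with the signature parameter. The values used below are made-up examples.
15 |
16 | ```
17 | # -*- coding: utf-8 -*-
18 | import hashlib
19 |
20 | def check_signature(token, timestamp, nonce, signature):
21 |     # sort the three strings, join them, and hash with SHA-1
22 |     parts = sorted([token, timestamp, nonce])
23 |     digest = hashlib.sha1(''.join(parts)).hexdigest()
24 |     return digest == signature
25 |
26 | # example with made-up values; replace with the parameters WeChat actually sends
27 | print check_signature('galigeigei', '1482000000', '12345', 'expected-signature-here')
28 | ```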
--------------------------------------------------------------------------------
/weixin/reply_text.xml:
--------------------------------------------------------------------------------
1 | $def with (toUser,fromUser,createTime,content)
2 | <xml>
3 | <ToUserName><![CDATA[$toUser]]></ToUserName>
4 | <FromUserName><![CDATA[$fromUser]]></FromUserName>
5 | <CreateTime>$createTime</CreateTime>
6 | <MsgType><![CDATA[text]]></MsgType>
7 | <Content><![CDATA[$content]]></Content>
8 | </xml>
9 |
--------------------------------------------------------------------------------
/weixin/wx.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | import sys
3 | sys.path.append("..")
4 | import web
5 | import hashlib
6 | import urllib2
7 | import time
8 | from lxml import etree
9 | from PIL import Image
10 | from BloodTestReportOCR import tf_predict
11 | from BloodTestReportOCR.imageFilter import ImageFilter
12 | import numpy, cv2
13 | import json
14 | urls = (
15 | '/weixin', 'Weixin'
16 | )
17 |
18 | token = "galigeigei"
19 |
20 | class Weixin:
21 | def __init__(self):
22 | self.render = web.template.render('./')
23 |
24 | def POST(self):
25 | str_xml = web.data()
26 | xml = etree.fromstring(str_xml)
27 | msgType = xml.find("MsgType").text
28 | fromUser = xml.find("FromUserName").text
29 | toUser = xml.find("ToUserName").text
30 |
31 | res = '请输入图片'
32 | if msgType == 'image':
33 | print('gali')
34 | url = xml.find('PicUrl').text
35 | img = cv2.imdecode(numpy.fromstring(urllib2.urlopen(url).read(), numpy.uint8), cv2.CV_LOAD_IMAGE_UNCHANGED)
36 | data = ImageFilter(image=img).ocr(22)
37 | if data:
38 | data = json.loads(data)
39 | pre = [str(data['bloodtest'][i]['value']) for i in range(22)]
40 | for i in range(22):
41 | if pre[i] == '': pre[i] = 0
42 | else:
43 | tmp = pre[i].replace('.', '', pre[i].count('.')-1)
44 | pre[i] = float(tmp)
45 |
46 | arr = numpy.array(pre)
47 | arr = numpy.reshape(arr, [1, 22])
48 |
49 | sex, age = tf_predict.predict(arr)
50 | res = 'sex:'+['女','男'][sex] + ' age:'+str(int(age))
51 | else:
52 | res = '请输入正确图片'
53 |
54 | return self.render.reply_text(fromUser, toUser, int(time.time()), res)
55 |
56 | def GET(self):
57 | data = web.input()
58 | signature = data.signature
59 | timestamp = data.timestamp
60 | nonce = data.nonce
61 | echostr = data.echostr
62 |         params = [token, timestamp, nonce]
63 |         params.sort()
64 |         raw = params[0] + params[1] + params[2]
65 |         hashcode = hashlib.sha1(raw).hexdigest()
66 | if hashcode == signature: return echostr
67 |
68 | app = web.application(urls, globals())
69 |
70 | if __name__ == '__main__':
71 | app.run()
72 |
--------------------------------------------------------------------------------