├── LICENSE
├── README.md
├── cnn.py
├── main.py
├── train.py
└── tt.py

/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2022 AST

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# CNN_MINST

CNN handwritten digit recognition on MNIST. Third major assignment of Introduction to Artificial Intelligence, Department of Automation, University of Science and Technology of China, 2022.

The following is quoted from the project report.

Handwritten digit recognition based on the MNIST dataset

# Environment

----

python 3.10.2

PyTorch 1.11.0

pillow 9.1.0

opencv-python 4.5.5.64

numpy 1.22.3
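
If you want to check that a local installation matches the versions listed above, a quick script along these lines can be used (a minimal sketch; the printed version strings will of course depend on the machine):

    import sys

    import cv2
    import numpy
    import PIL
    import torch

    # Print the interpreter and library versions this project was developed against.
    print('python ', sys.version.split()[0])
    print('pytorch', torch.__version__)
    print('pillow ', PIL.__version__)
    print('opencv ', cv2.__version__)
    print('numpy  ', numpy.__version__)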

# Abstract

----

A small convolutional neural network (two convolutional layers) is defined with PyTorch, trained on the MNIST dataset, and then used to recognize handwritten digits; a script for inspecting the MNIST dataset is provided as well. A trained model, `CNN_for_MNIST.pth`, is already included in the repository. The `__pycache__` folder is generated automatically by Python to make subsequent runs start slightly faster; you can ignore it, or simply delete it if it bothers you.

# Code structure

----

- `cnn.py` defines the convolutional neural network: two convolutional layers, two pooling layers and a fully connected classifier
- `train.py` trains the model using the network defined in `cnn.py` (on the MNIST dataset -> [MNIST dataset](http://yann.lecun.com/exdb/mnist/))
- `tt.py` offers a way to look at the MNIST dataset: run it to see the images behind the binary training data. Because of how the pyplot windows are handled here, the current window has to be closed before the next image is shown. If you are not interested in the training set you can ignore this file.
- `main.py` uses the trained model to recognize handwritten digits

# Code details

----

- `cnn.py` defines a CNN class, in which

      self.layer1 = nn.Sequential(              # convolution
          nn.Conv2d(1, 25, kernel_size=3),      # input channels, output channels, kernel size
          nn.BatchNorm2d(25),                   # argument is the number of output channels
          nn.ReLU(inplace=True)                 # rectified linear activation
      )

      self.layer2 = nn.Sequential(              # pooling
          nn.MaxPool2d(kernel_size=2, stride=2)
      )

      self.layer3 = nn.Sequential(              # convolution
          nn.Conv2d(25, 50, kernel_size=3),
          nn.BatchNorm2d(50),
          nn.ReLU(inplace=True)
      )

      self.layer4 = nn.Sequential(              # pooling
          nn.MaxPool2d(kernel_size=2, stride=2)
      )

  defines the network's two convolutional layers and two pooling layers, while the block

      self.fc = nn.Sequential(
          nn.Linear(50 * 5 * 5, 1024),
          nn.ReLU(inplace=True),
          nn.Linear(1024, 128),
          nn.ReLU(inplace=True),
          nn.Linear(128, 10)
      )

  defines the subsequent three linear layers and two ReLU activations. A 28*28 input shrinks to a 50-channel 5*5 feature map after the two convolution/pooling stages, which is where the 50 * 5 * 5 input size of the first linear layer comes from (a quick shape check is sketched right after `cnn.py` below).

- `train.py`: the individual operations are commented in the code, so only the overall procedure is described here

  - Define the batch size (the number of samples used per training step), the learning rate and the data preprocessing (converting the images into the tensor format torch works with); the loss function and optimizer chosen after selecting the model also belong to this part (both already exist in torch and are simply instantiated here)
  - Download the data (already downloaded into the data folder)
  - Select the model defined in cnn, train it on the data, and print the current loss every 50 training steps
  - Save the model once training is finished (since a trained model is already provided, this line is commented out in the submitted code)
  - Measure the recognition accuracy on the test set; the code prints predict and label so the results can be inspected directly

- `main.py`: as above, only the procedure is described

  - First define a few parameters: canvas size and line width
  - Define the line-drawing callback
  - In a loop, read what has been drawn and classify it: after writing a digit, press `space` to recognize it, press `space` again to clear the canvas and continue; press `esc` at any time to quit

--------------------------------------------------------------------------------
/cnn.py:
--------------------------------------------------------------------------------
from torch import nn


class CNN(nn.Module):  # the convolutional neural network

    def __init__(self):
        super(CNN, self).__init__()

        self.layer1 = nn.Sequential(              # convolution
            nn.Conv2d(1, 25, kernel_size=3),      # input channels, output channels, kernel size
            nn.BatchNorm2d(25),                   # argument is the number of output channels
            nn.ReLU(inplace=True)                 # rectified linear activation
        )

        self.layer2 = nn.Sequential(              # pooling
            nn.MaxPool2d(kernel_size=2, stride=2)
        )

        self.layer3 = nn.Sequential(              # convolution
            nn.Conv2d(25, 50, kernel_size=3),
            nn.BatchNorm2d(50),
            nn.ReLU(inplace=True)
        )

        self.layer4 = nn.Sequential(              # pooling
            nn.MaxPool2d(kernel_size=2, stride=2)
        )

        self.fc = nn.Sequential(                  # classifier: 50*5*5 features -> 10 digit classes
            nn.Linear(50 * 5 * 5, 1024),
            nn.ReLU(inplace=True),
            nn.Linear(1024, 128),
            nn.ReLU(inplace=True),
            nn.Linear(128, 10)
        )

    def forward(self, x):
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = x.view(x.size(0), -1)  # flatten to (batch, 50*5*5) for the linear layers
        x = self.fc(x)
        return x
--------------------------------------------------------------------------------
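
As a quick sanity check of the shapes mentioned in the README (a 28x28 input becomes 26x26 after the first 3x3 convolution, 13x13 after pooling, 11x11 after the second convolution and 5x5 after the second pooling, hence the 50 * 5 * 5 input of the first linear layer), the network can be run on a dummy batch. This is an illustrative sketch only, not a file in the repository:

    import torch

    import cnn

    model = cnn.CNN()
    dummy = torch.randn(1, 1, 28, 28)   # one fake grayscale 28x28 image
    with torch.no_grad():
        out = model(dummy)
    print(out.shape)                    # expected: torch.Size([1, 10]), one score per digit class
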
/main.py:
--------------------------------------------------------------------------------
# Imports. A lot of things were written while testing, so several unused packages had been
# imported; the unused ones have been removed.
import torch

import torchvision.transforms as transforms

import cv2

import numpy as np

import cnn  # the CNN class must be importable so that torch.load can rebuild the saved model

clean = 0           # toggles between "recognize" and "clear the canvas"
board = 200         # canvas size in pixels
WINDOWNAME = 'Win'
line = 10           # line width in pixels

# Mouse callback: draw while the left button is held down
def draw_line(event, x, y, flags, param):
    global ix, iy
    if event == cv2.EVENT_LBUTTONDOWN:
        ix, iy = x, y
    elif (event == cv2.EVENT_MOUSEMOVE) & (flags == cv2.EVENT_FLAG_LBUTTON):
        cv2.line(img, (ix, iy), (x, y), 255, line)  # white line, `line` pixels wide
        ix, iy = x, y


img = np.zeros((board, board, 1), np.uint8)
cv2.namedWindow(WINDOWNAME)
cv2.setMouseCallback(WINDOWNAME, draw_line)

transf = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize([0.5], [0.5])
     ])

# Load the trained model once, on whatever device is available, and put it in evaluation mode
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = torch.load('CNN_for_MNIST.pth', map_location=device)
model.eval()

while True:
    # Show the drawing window: write a digit (0-9) on it with the mouse, press space to recognize it,
    # press space again to clear the canvas and write the next one; press esc to quit
    cv2.imshow(WINDOWNAME, img)
    key = cv2.waitKey(20)

    if key == 32:
        if clean:  # re-initialize the canvas
            img = np.zeros((board, board, 1), np.uint8)
            cv2.imshow(WINDOWNAME, img)
        else:
            # Resize the drawing to the standard MNIST size of 28*28
            resized_img = cv2.resize(img, (28, 28), interpolation=cv2.INTER_CUBIC)
            im = transf(resized_img)
            im = torch.unsqueeze(im, dim=0)  # add a batch dimension: tensors are [batch, channel, height, width]
            im = im.to(device)
            with torch.no_grad():
                out = model(im)
            c = out.tolist()[0]
            print(c)                   # raw scores for the ten classes
            print(c.index(max(c)))     # index of the largest score = predicted digit
        clean = not clean
    elif key == 27:
        break

cv2.destroyAllWindows()
--------------------------------------------------------------------------------
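
If you would rather classify an existing image file than draw on the canvas, the same preprocessing as in main.py can be reused. The sketch below is illustrative only: the file name my_digit.png is a placeholder, and the image is assumed to be a grayscale white-on-dark digit like the MNIST samples:

    import cv2
    import torch
    import torchvision.transforms as transforms

    import cnn  # needed so torch.load can rebuild the CNN class

    transf = transforms.Compose([transforms.ToTensor(),
                                 transforms.Normalize([0.5], [0.5])])

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = torch.load('CNN_for_MNIST.pth', map_location=device)
    model.eval()

    img = cv2.imread('my_digit.png', cv2.IMREAD_GRAYSCALE)          # placeholder file name
    img = cv2.resize(img, (28, 28), interpolation=cv2.INTER_CUBIC)
    img = img[:, :, None]                                           # add a channel axis -> (28, 28, 1)

    with torch.no_grad():
        out = model(transf(img).unsqueeze(0).to(device))
    print('predicted digit:', out.argmax(dim=1).item())
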
/train.py:
--------------------------------------------------------------------------------
# Model training: downloads the MNIST dataset (into the data folder in this directory), trains the
# network on it and saves the trained model as CNN_for_MNIST.pth in the same directory
import torch

from torch import nn, optim

from torch.utils.data import DataLoader

from torchvision import datasets, transforms

import cnn


# A few hyper-parameters

batch_size = 64        # mini-batch size

learning_rate = 0.02   # learning rate


# Data preprocessing. transforms.ToTensor() converts each image into a PyTorch Tensor with values scaled to [0, 1].
# transforms.Normalize() then subtracts the mean and divides by the standard deviation; its two arguments are the mean and the standard deviation.
# transforms.Compose() chains the preprocessing operations together.

data_tf = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize([0.5], [0.5])
     ])


# Dataset downloaders: fetch the dataset into the data folder

train_dataset = datasets.MNIST(root='./data', train=True, transform=data_tf, download=True)  # training set

test_dataset = datasets.MNIST(root='./data', train=False, transform=data_tf)                 # test set

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


# Select the model

model = cnn.CNN()

if torch.cuda.is_available():  # whether to use the GPU: I did not actually use one, but since it appears in
                               # every example online it is kept here; the GPU may well be faster, but for this
                               # problem and model the CPU is perfectly adequate (the same applies below)
    model = model.cuda()


# Loss function and optimizer

criterion = nn.CrossEntropyLoss()  # cross entropy, the usual loss for multi-class classification

optimizer = optim.SGD(model.parameters(), lr=learning_rate)


# Train the model: a single pass over the training set, batch by batch

step = 0  # counts mini-batches, not full epochs

for data in train_loader:
    img, label = data

    if torch.cuda.is_available():
        img = img.cuda()
        label = label.cuda()

    out = model(img)
    loss = criterion(out, label)   # loss for this batch

    optimizer.zero_grad()          # reset the gradients
    loss.backward()                # back-propagate to get the gradient of every parameter
    optimizer.step()               # one gradient-descent update

    step += 1
    if step % 50 == 0:             # print progress
        print('step: {}, loss: {:.4}'.format(step, loss.item()))


# Save (and later load) the whole model

#torch.save(model, 'CNN_for_MNIST.pth')


# Model evaluation

model.eval()

eval_loss = 0
eval_acc = 0

for data in test_loader:
    img, label = data

    if torch.cuda.is_available():  # CPU or GPU
        img = img.cuda()
        label = label.cuda()

    with torch.no_grad():          # gradients are not needed for evaluation
        out = model(img)
        loss = criterion(out, label)

    eval_loss += loss.item() * label.size(0)
    _, pred = torch.max(out, 1)          # pred holds the index of the largest score per sample; for MNIST that index is the predicted digit itself
    num_correct = (pred == label).sum()  # summing the boolean tensor yields a one-element tensor with the number of matches, i.e. the number of correct predictions
    print('pred', pred)
    print('label', label)
    #print(type(num_correct))
    #print(num_correct)
    #print((pred == label).sum())

    eval_acc += num_correct.item()

print('Test Loss: {:.6f}, Acc: {:.6f}'.format(  # overall test loss and accuracy
    eval_loss / (len(test_dataset)),
    eval_acc / (len(test_dataset))
))
--------------------------------------------------------------------------------
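
train.py makes a single pass over the training data. If you want to train for several full epochs before saving, the loop can be wrapped as in the sketch below, which reuses the objects defined in train.py (model, criterion, optimizer, train_loader, train_dataset); the number of epochs is an arbitrary choice for illustration:

    num_epochs = 5  # arbitrary, for illustration

    for epoch in range(num_epochs):
        running_loss = 0.0
        for img, label in train_loader:
            if torch.cuda.is_available():
                img, label = img.cuda(), label.cuda()
            out = model(img)
            loss = criterion(out, label)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            running_loss += loss.item() * label.size(0)
        # average training loss over the whole epoch
        print('epoch: {}, loss: {:.4f}'.format(epoch + 1, running_loss / len(train_dataset)))

    torch.save(model, 'CNN_for_MNIST.pth')  # save the fully trained model
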
/tt.py:
--------------------------------------------------------------------------------
import torch
import torchvision
import matplotlib.pyplot as plt  # used to display the images
import os
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Select the device to run on
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Download the dataset; each image has 28*28 = 784 pixels
dataset_train = torchvision.datasets.MNIST(root='./data', train=True, transform=torchvision.transforms.ToTensor(), download=True)
dataset_test = torchvision.datasets.MNIST(root='./data', train=False, transform=torchvision.transforms.ToTensor(), download=False)

# Load the datasets batch by batch
data_loader_train = torch.utils.data.DataLoader(dataset=dataset_train, batch_size=100, shuffle=True)  # 600 batches of 100 samples each
data_loader_test = torch.utils.data.DataLoader(dataset=dataset_test, batch_size=100, shuffle=False)

#for epoch in range(5):  # five epochs in total; one epoch is (len(dataset_train)=60000)/(batch_size=100) = 600 batches
for i, (images, labels) in enumerate(data_loader_train):

    #print(i, images[0].shape, labels[0].shape)
    '''
    Each epoch consists of 600 batches (i = 0..599);
    data_loader_train holds those 600 batches, i.e. the whole training set;
    every batch contains 100 images and the 100 matching labels, len(images[0]) = 1;
    images holds the batch's 100 images (images[0].shape = torch.Size([1, 28, 28])),
    labels holds the batch's 100 labels, each in the range 0-9
    '''

    # Every third batch, plot all the images of that batch
    if (i + 1) % 3 == 0:
        for j in range(len(images)):
            #if(labels[j] == 1):
            print('batch_number [{}/{}]'.format(i + 1, len(data_loader_train)))
            image = images[j].reshape(28, 28)  # (1, 28, 28) -> (28, 28); reshape instead of the deprecated Tensor.resize
            print(image)
            plt.imshow(image)   # display the image
            plt.axis('off')     # hide the axes
            plt.title("The {} picture in {} batch, label={}".format(j + 1, i + 1, labels[j]))
            plt.show()
--------------------------------------------------------------------------------
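
As the README mentions, tt.py shows one image at a time and each pyplot window has to be closed before the next one appears. A common alternative is to draw a whole batch as a grid in a single window. The following stand-alone sketch illustrates the idea (the 4x8 grid and batch size of 32 are arbitrary choices, not part of the repository):

    import matplotlib.pyplot as plt
    import torch
    import torchvision

    dataset = torchvision.datasets.MNIST(root='./data', train=True,
                                         transform=torchvision.transforms.ToTensor(), download=True)
    loader = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=True)

    images, labels = next(iter(loader))              # one batch of 32 images and their labels

    fig, axes = plt.subplots(4, 8, figsize=(12, 6))  # one subplot per image
    for ax, image, label in zip(axes.flat, images, labels):
        ax.imshow(image.squeeze(0), cmap='gray')     # (1, 28, 28) -> (28, 28)
        ax.set_title(str(label.item()))
        ax.axis('off')
    plt.tight_layout()
    plt.show()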