├── .gitignore
├── LICENSE
├── README.md
├── docs
├── ai-cover.jpg
├── cnn-convolution.gif
├── cnn-convolution.png
├── cnn-layers.png
├── cnn-maxpooling.png
├── cnn-pooling.png
├── conv-with-padding.png
├── fe-ai.md
├── img-grayscale.jpeg
├── img-regression.jpg
├── picture-55.jpg
├── picture-rgb.png
├── regression-result.jpg
└── tensorflow-playground.png
├── index.html
└── net
├── car.js
└── convnet.js
/.gitignore:
--------------------------------------------------------------------------------
1 | # MAC
2 | .DS_Store
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 Ranjay
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## 项目说明
2 |
3 | 前端CNN图像识别项目:
4 |
5 | * 打开页面,调出开发者工具控制台并切换到console面板
6 | * 等待console输出“模型训练好了”字样
7 | * 点击识别按钮即可输出识别结果
8 |
9 | 
10 |
11 | ## 目录结构
12 |
13 | ```
14 | ├── net 依赖库及资源文件
15 | ├── docs 项目文档
16 | ├── index.html 项目主文件
17 | └── .gitignore
18 | ```
19 |
20 | ## 项目文档
21 |
22 | [《前端AI实战——告诉世界前端也能做AI》](docs/fe-ai.md)
23 |
24 | ## 其他
25 |
26 | 项目具体知识点详见文件`index.html`内代码注释
27 |
--------------------------------------------------------------------------------
/docs/ai-cover.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jerryOnlyZRJ/image-regression/4d076078bcd7e0aab461cadbe71d84e49aa477af/docs/ai-cover.jpg
--------------------------------------------------------------------------------
/docs/cnn-convolution.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jerryOnlyZRJ/image-regression/4d076078bcd7e0aab461cadbe71d84e49aa477af/docs/cnn-convolution.gif
--------------------------------------------------------------------------------
/docs/cnn-convolution.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jerryOnlyZRJ/image-regression/4d076078bcd7e0aab461cadbe71d84e49aa477af/docs/cnn-convolution.png
--------------------------------------------------------------------------------
/docs/cnn-layers.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jerryOnlyZRJ/image-regression/4d076078bcd7e0aab461cadbe71d84e49aa477af/docs/cnn-layers.png
--------------------------------------------------------------------------------
/docs/cnn-maxpooling.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jerryOnlyZRJ/image-regression/4d076078bcd7e0aab461cadbe71d84e49aa477af/docs/cnn-maxpooling.png
--------------------------------------------------------------------------------
/docs/cnn-pooling.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jerryOnlyZRJ/image-regression/4d076078bcd7e0aab461cadbe71d84e49aa477af/docs/cnn-pooling.png
--------------------------------------------------------------------------------
/docs/conv-with-padding.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jerryOnlyZRJ/image-regression/4d076078bcd7e0aab461cadbe71d84e49aa477af/docs/conv-with-padding.png
--------------------------------------------------------------------------------
/docs/fe-ai.md:
--------------------------------------------------------------------------------
1 | # 前端AI实战——告诉世界前端也能做AI
2 |
3 | 我想大多数人和我一样,第一次听见“人工智能”这个词的时候都会觉得是一个很高大上、遥不可及的概念,特别像我这样一个平凡的前端,和大部分人一样,都觉得人工智能其实离我们很遥远,我们对它的印象总是停留在各种各样神奇而又复杂的算法,这些仿佛都是那些技术专家或者海归博士才有能力去做的工作。我也曾一度以为自己和这个行业没有太多缘分,但自从Tensorflow发布了JS版本之后,这一领域又引起了我的注意。在python垄断的时代,发布JS工具库不就是意味着我们前端工程师也可以参与其中?
4 |
5 | 当我决定开始投身这片领域做一些自己感兴趣的事情的时候,却发现身边的人投来的都是鄙夷的目光,他们对前端的印象,还总是停留在上个年代那些只会写写页面脚本的切图仔,只有身处这片领域的我们才知道大前端时代早已发生了翻天覆地的变革。
6 |
7 | 今天,我就带领大家从原理开始,尽可能用最通俗易懂的方式,让JS的爱好者们快速上手人工智能。
8 |
9 | 具体项目可参照:https://github.com/jerryOnlyZRJ/image-regression 。
10 |
11 | 本文就单拿人工智能下的一块小领域——“图像识别”作一些简单介绍和实战指引,当然这些都只是这片大领域下的冰山一角,还有很多很多知识等着你去发掘。
12 |
13 | ## 1.CNN卷积神经网络原理剖析
14 |
15 | 如果我不讲解这部分内容,而是直接教你们怎么使用一个现成的库,那这篇文章就没什么价值了,看完之后给你们留下的也一定都会是“开局一张图,过程全靠编”的错觉。因此,要真正了解人工智能,就应该进入这个黑盒,里面的思想才是精华。
16 |
17 | ### 1.1.图像灰度级与灰度图
18 |
19 | #### 1.1.1.基本概念
20 |
21 | 要做图像识别,我们肯定要先从图像下手,大家先理解一个概念——图像灰度级。
22 |
23 | 众所周知,我们的图片都是由许多像素点组成的,就好像一张100*100像素的图片,就表示它是由10000个像素点呈现的。但你可曾想过,这些像素点可以由一系列的数字表示嘛?
24 |
25 | 就先不拿彩色的图片吧,彩色太复杂了,我们就先拿一张黑白的图片为例,假设我们以黑色像素的深浅为基准,将白色到黑色的渐变过程分为不同的等级,这样,图片上每一个像素点都能用一个最为临近的等级数字表示出来:
26 |
27 | 
28 |
29 | 如果我们用1表示白色,用0表示黑色,将图像二值化,最后以矢量(数字)的形式呈现出来,结果大概就是这样:(下图是一张5*5的二值化图像,没有具体表示含义,只作示例)
30 |
31 | 
32 |
33 | 同理,如果是彩色的图像,那我们是不是可以把R、G、B三个维度的像素单独提取出来分别处理呢?这样,每一个维度不就可以单独视为一张灰度图。
34 |
35 | 
36 |
37 | #### 1.1.2.平滑图像与特征点
38 |
39 | 如果一张图像没有什么像素突变,比如一张全白的图片,如果以数字表示,自然都是0,那我们可以称这张图片的像素点是平滑的。再比如这张全白的图片上有一个黑点,自然,灰度图上就会有一个突兀的数值,我们就把它称作特征点,通常来说,图像的特征点有可能是噪声、边缘或者图片的实际特征。
40 |
41 | ### 1.2.神经网络与模型训练
42 |
43 | tensorflow在发布了JS版本的工具库后,也同时制作了一个[Tensorflow游乐场](http://playground.tensorflow.org/),打开之后,映入眼帘的网页中央这个东西便是神经网络:
44 |
45 | 
46 |
47 | 从图中,我们可以看到神经网络有很多不同的层级,就是图中的Layers,每一层都是前一层经过滤波器计算后的结果,越多的层级以及越多的“神经元”经过一次计算过程计算出来的结果误差越小,同时,计算的时间也会增加。神经网络正是模仿了我们人类脑袋里的神经元经过了一系列计算然后学习事物的过程。这里推荐阮一峰的[《神经网络入门》](http://www.ruanyifeng.com/blog/2017/07/neural-network.html)这篇文章,能够帮助大家更加浅显地了解神经网络是什么。
48 |
49 | 在我们的卷积神经网络中,这些层级都有不同的名字:输入层、卷积层、池化层以及输出层。
50 |
51 | * 输入层:我们输入的矢量化之后的图像
52 | * 卷积层:经过滤波器卷积计算之后的图像
53 | * 池化层:经过池化滤波器卷积计算之后的图像
54 | * 输出层:输出数据
55 |
56 | Features就是我们的算子,也称为滤波器,但是每种不同的滤波器对最后的输出结果都会有不同的影响,经过训练之后,机器会通过我们赋予的算法(比如激活函数等等)计算出哪些滤波器会对输出结果造成较大的误差,哪些滤波器对输出结果压根没有影响(原理很简单,第一次计算使用所有滤波器,第二次计算拿掉某一个滤波器,然后观察误差值(Training loss)就可以知道这个被拿掉的滤波器所起到的作用了),机器会为比较重要的滤波器赋予较高的权重,我们将这样一个过程称为“训练”。最终,我们将得到的整个带有权重的神经网络称为我们通过机器训练出的“模型”,我们可以拿着这个模型去让机器学习事物。
57 |
58 | 这就是机器学习中“训练模型”的过程,Tensorflow.js就是为我们提供的训练模型的工具库,当你真正掌握了模型训练的奥义之后,Tensorflow对你而言就像jQuery用起来一般简单。
59 |
60 | 大家看完这些介绍之后肯定还是一脸茫然,什么是滤波器?什么又是卷积计算?不着急,下一个版块的内容将会为大家揭开所有谜题。
61 |
62 | ### 1.3.卷积算法揭秘
63 |
64 | #### 1.3.1.卷积算法
65 |
66 | 还记得我们在1.1.1里说到一张图片可以用矢量的形式表示每个像素点嘛?卷积计算就是在这基础上,使用某些算子对这些像素点进行处理,而这些算子,就是我们刚刚提到的滤波器(比如左边,就是一张经过二值化处理的5\*5的图片,中间的就是我们的滤波器):
67 |
68 | 
69 |
70 | 那计算的过程又是怎样的呢?卷积这东西听起来感觉很复杂,但实际上就是把我们的滤波器套到图像上,乘积求和,然后将图像上位于滤波器中心的值用计算结果替换,大概的效果就是下面这张动图这样:
71 |
72 | 
73 |
74 | 对,所谓高大上的卷积就是这样一个过程,我们的滤波器每次计算之后就向右移动一个像素,所以我们可以称滤波器的步长为1,以此类推。不过我们发现,经过滤波器处理后的图像,好像“变小了”!原来是5\*5的图片这下变成了3\*3,这是卷积运算带来的必然副作用,如果不想让图片变小,我们可以为原图像加上一定像素且值均为0的边界(padding)去抵消副作用,就像下面这样:
75 |
76 | 
77 |
78 | #### 1.3.2.池化算法
79 |
80 | 其实在平时训练模型的过程中,我们输入的图像肯定不只有5\*5像素这么小,我们最经常见到的图片许多都是100\*100像素以上的,这样使用我们的机器去计算起来肯定是比较复杂的,因此,我们常常会使用池化算法进行特征提取或者图像平滑处理,池化的过程其实就是按照某种规律将图片等比缩小,过程就像下面这样:
81 |
82 | 
83 |
84 | 而池化算法最常用的有两大类:取均值算法和取最大值算法,顾名思义,取均值算法就是取滤波器中的平均值作为结果,取最大值算法就是取滤波器中的最大值作为输出结果:
85 |
86 | 
87 |
88 | 上图就是取最大值算法的处理过程,大家也能很直观的看出,在池化层中,滤波器的步长大都是等于滤波器自身大小的(比较方便控制缩放比例)。并且,取最大值算法肯定是很容易取到滤波器中的特征点(还记得特征点嘛?忘记的话快回去1.1.2看看哦~),所以我们可以将取最大值算法的池化处理称为特征提取;同理,取均值算法因为把所有的像素点的灰度级都平均了,所以我们可以称之为平滑处理。
89 |
90 | 关于卷积神经网络的知识,可以具体参照这篇文章:[《卷积神经网络(1)卷积层和池化层学习》](https://www.cnblogs.com/zf-blog/p/6075286.html)。了解了这些知识之后,就可以开始我们的实战啦~
91 |
92 | ## 2.图像识别实战
93 |
94 | 说了那么多理论,也不比实操来得有感觉。在大家了解了卷积神经网络的基本原理之后,就可以使用我们的工具库来帮助我们完成相关工作,这里我推荐[ConvNetJS](https://github.com/karpathy/convnetjs)。这款工具库的本质就是我们在1.2中提到的别人训练好的模型,我们只需要拿来“学习”即可。
95 |
96 | ### 2.1.使用ConvNetJS
97 |
98 | 我们可以看到在ConvNetJS的README里有这样一段官方demo,具体的含义我已经用注释在代码里标注:
99 |
100 | ```js
101 | // 定义一个神经网络
102 | var layer_defs = [];
103 | // 输入层:即是32*32*3的图像
104 | layer_defs.push({type:'input', out_sx:32, out_sy:32, out_depth:3});
105 | // 卷积层
106 | // filter:用16个5*5的滤波器去卷积
107 | // stride:卷积步长为1
108 | // padding:填充宽度为2(为保证输出的图像大小不会发生变化)
109 | // activation:激活函数为relu(还有Tanh、Sigmoid等等函数,功能不同)
110 | layer_defs.push({type:'conv', sx:5, filters:16, stride:1, pad:2, activation:'relu'});
111 | // 池化层
112 | // 池化滤波器的大小为2*2
113 | // stride:步长为2
114 | // 在这里我们无法看出这个框架池化是使用的Avy Pooling还是Max Pooling算法,先视为后者
115 | layer_defs.push({type:'pool', sx:2, stride:2});
116 | // 反复卷积和池化减小模型误差
117 | layer_defs.push({type:'conv', sx:5, filters:20, stride:1, pad:2, activation:'relu'});
118 | layer_defs.push({type:'pool', sx:2, stride:2});
119 | layer_defs.push({type:'conv', sx:5, filters:20, stride:1, pad:2, activation:'relu'});
120 | layer_defs.push({type:'pool', sx:2, stride:2});
121 | // 输出层
122 | // 分类器:输出10种不同的类别
123 | layer_defs.push({type:'softmax', num_classes:10});
124 |
125 | // 实例化一个神经网络
126 | net = new convnetjs.Net();
127 | net.makeLayers(layer_defs);
128 |
129 | // 模型训练
130 | const trainer = new convnetjs.SGDTrainer(net, { learning_rate: 0.01, momentum: 0.9, batch_size: 5, l2_decay: 0.0 });
131 | trainer.train(imgVol, classIndex);
132 |
133 | // 使用训练好的模型进行图像识别
134 | var x = convnetjs.img_to_vol(document.getElementById('some_image'))
135 | var output_probabilities_vol = net.forward(x)
136 | ```
137 |
138 | 如果想要更形象点,上述过程可以用这样一幅图表示:
139 |
140 | 
141 |
142 | 中间的“卷积-池化-卷积-池化……”就是我们定义并训练的神经网络,我们输入矢量化处理后的图像后,先进行卷积运算,不同的滤波器得到了不同的结果,官方demo里是使用了16个不同的滤波器(PS:这里给大家留一个思考的问题,一个3\*3的二值化滤波器,能写出多少种可能?),自然能卷积出16种不同的结果,再拿着这些结果池化处理,不断重复这个过程,最终得出图像识别结果:
143 |
144 | 
145 |
146 | ### 2.2.实战项目解析
147 |
148 | 来,我们一起详细梳理一下使用ConvNetJS这个工具库完成整个图像识别的具体流程,
149 |
150 | (PS:项目代码具体参照:https://github.com/jerryOnlyZRJ/image-regression )
151 |
152 | 首先,我们必须先有数据供我们的模型去学习,至少你该让这个模型知道啥是啥对吧,在项目里的 `net` 文件夹里的 `car.js` 文件,存放的就是我们的学习数据,如果你们感兴趣可以打开看看,里面的数据就是告诉机器什么样的车标对应的是车的什么品牌。
153 |
154 | 在我们的项目里,是通过这样一段代码完成机器学习的:
155 |
156 | ```js
157 | const trainer = new convnetjs.SGDTrainer(net, { learning_rate: 0.01, momentum: 0.9, batch_size: 5, l2_decay: 0.0 });
158 | let imageList = [];
159 | const loadData = i => {
160 | return function () {
161 | return new Promise(function (resolve, reject) {
162 | let image = new Image();
163 | image.crossOrigin = "anonymous";
164 | image.src = carList[i].url;
165 | image.onload = function () {
166 | let vol = convnetjs.img_to_vol(image);
167 | // 逐张训练图片
168 | trainer.train(vol, i);
169 | resolve();
170 | };
171 | image.onerror = reject;
172 | })
173 | }
174 | }
175 | // 遍历图片资源
176 | for (let j = 0; j < carList.length; j++) {
177 | imageList.push(loadData(j));
178 | }
179 | var testBtn = document.getElementById("test")
180 | function training(){
181 | testBtn.disabled = true
182 | return new Promise((resolve, reject) => {
183 | Promise.all(imageList.map(imageContainer => imageContainer())).then(() => {
184 | console.log("模型训练好了!!!👌")
185 | testBtn.disabled = false
186 | resolve()
187 | })
188 | })
189 | }
190 | ```
191 |
192 | 我们试着去打印一下图像识别的输出结果,得到的是这样一个东西:
193 |
194 | 
195 |
196 | 从识别结果中我们可以看到,我们得到的是一个数组,这就是经过分类器分类的10个不同类别,对应的自然是我们的车的品牌,值就是每个类别对应的概率。所以,我们只要拿到概率的最大值,就是预测得出的最倾向的结果。
197 |
198 | ## 3.结语
199 |
200 | 随着JS引擎的计算能力不断增强,人工智能领域的不断发展,可以预见的是,在不久的将来,肯定能有一些简单的算法可以被移植到用户前端执行,这样既能减少请求,又能分担后端压力。这一切并不是无稽之谈,为什么tensorflow.js会应运而生,正是因为JS的社区在不断壮大,JS这款便捷的语言也在得到更为普遍的使用。所以,请对你所从事的这份前端事业,有足够的信心!
201 |
202 | 还是那句老话:
203 |
204 | **技术从来不会受限于语言,受限你的,永远只是思想。**
205 |
206 | 我并不是什么算法工程师,我也不是CS专业出来的科班生,我只是一枚普普通通的前端,和绝大多数人一样,没有多深厚的基础,但我愿意去学,我享受克服困难的过程,而那份对人工智能的执着,只是来源于那份不满足于现状的倔性和对这片领域一成不变的初心。
207 |
208 | 如果您觉得这篇文章对您有帮助,还请麻烦您为文章提供的示例demo项目点个star;如果您对我的其他项目感兴趣,也欢迎follow哦~
209 |
210 | ## 4.鸣谢
211 |
212 | 本文项目资源大部分来自[京程一灯](https://yd.ke.qq.com),感谢[京程一灯](https://yd.ke.qq.com/)袁志佳老师对本文以及我个人提供的支持和帮助,如果你也在前端前进路上感到迷茫,[京程一灯](https://yd.ke.qq.com/)也许是你不错的选择。
--------------------------------------------------------------------------------
/docs/img-grayscale.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jerryOnlyZRJ/image-regression/4d076078bcd7e0aab461cadbe71d84e49aa477af/docs/img-grayscale.jpeg
--------------------------------------------------------------------------------
/docs/img-regression.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jerryOnlyZRJ/image-regression/4d076078bcd7e0aab461cadbe71d84e49aa477af/docs/img-regression.jpg
--------------------------------------------------------------------------------
/docs/picture-55.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jerryOnlyZRJ/image-regression/4d076078bcd7e0aab461cadbe71d84e49aa477af/docs/picture-55.jpg
--------------------------------------------------------------------------------
/docs/picture-rgb.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jerryOnlyZRJ/image-regression/4d076078bcd7e0aab461cadbe71d84e49aa477af/docs/picture-rgb.png
--------------------------------------------------------------------------------
/docs/regression-result.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jerryOnlyZRJ/image-regression/4d076078bcd7e0aab461cadbe71d84e49aa477af/docs/regression-result.jpg
--------------------------------------------------------------------------------
/docs/tensorflow-playground.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jerryOnlyZRJ/image-regression/4d076078bcd7e0aab461cadbe71d84e49aa477af/docs/tensorflow-playground.png
--------------------------------------------------------------------------------
/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 | 利用卷积神经网络算法识别车logo
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
88 |
89 |
90 |
--------------------------------------------------------------------------------
/net/car.js:
--------------------------------------------------------------------------------
1 | var carList = [
2 | {
3 | "name": "奥迪",
4 | "url": "http://p.pstatp.com/avatar/100x100/1dd5000048d6334c26b4.png",
5 | "index": 0
6 | },
7 | {
8 | "name": "奔驰",
9 | "url": "http://p.pstatp.com/avatar/100x100/1dd50000190229abeec8.png",
10 | "index": 1
11 | },
12 | {
13 | "name": "宝马",
14 | "url": "http://p.pstatp.com/avatar/100x100/1dd5000018fab5bd782b.png",
15 | "index": 2
16 | },
17 | {
18 | "name": "本田",
19 | "url": "http://p.pstatp.com/avatar/100x100/1dd5000018fc7c108922.png",
20 | "index": 3
21 | },
22 | {
23 | "name": "别克",
24 | "url": "http://p.pstatp.com/avatar/100x100/1dd5000018fefd5c26b7.png",
25 | "index": 4
26 | },
27 | {
28 | "name": "比亚迪",
29 | "url": "http://p.pstatp.com/avatar/100x100/1dd5000018f0e7000aa7.png",
30 | "index": 5
31 | },
32 | {
33 | "name": "保时捷",
34 | "url": "http://p.pstatp.com/avatar/100x100/1dd50000190122c26c84.png",
35 | "index": 6
36 | },
37 | {
38 | "name": "大众",
39 | "url": "http://p.pstatp.com/avatar/100x100/1dd500001906761eb47e.png",
40 | "index": 7
41 | },
42 | {
43 | "name": "哈弗",
44 | "url": "http://p.pstatp.com/avatar/100x100/1dd50000193811739d87.png",
45 | "index": 8
46 | },
47 | {
48 | "name": "兰博基尼",
49 | "url": "http://p.pstatp.com/avatar/100x100/1dd50000191738275961.png",
50 | "index": 9
51 | },
52 | {
53 | "name": "雪佛兰",
54 | "url": "http://p.pstatp.com/avatar/100x100/1dd50000196a38ee3f43.png",
55 | "index": 10
56 | },
57 | {
58 | "name": "现代",
59 | "url": "http://p.pstatp.com/avatar/100x100/1dd50000196d2f0b9a05.png",
60 | "index": 11
61 | }
62 | ]
--------------------------------------------------------------------------------
/net/convnet.js:
--------------------------------------------------------------------------------
1 | var convnetjs = convnetjs || { REVISION: 'ALPHA' };
2 | (function(global) {
3 | "use strict";
4 |
5 | // Random number utilities
6 | var return_v = false;
7 | var v_val = 0.0;
8 | var gaussRandom = function() {
9 | if(return_v) {
10 | return_v = false;
11 | return v_val;
12 | }
13 | var u = 2*Math.random()-1;
14 | var v = 2*Math.random()-1;
15 | var r = u*u + v*v;
16 | if(r == 0 || r > 1) return gaussRandom();
17 | var c = Math.sqrt(-2*Math.log(r)/r);
18 | v_val = v*c; // cache this
19 | return_v = true;
20 | return u*c;
21 | }
22 | var randf = function(a, b) { return Math.random()*(b-a)+a; }
23 | var randi = function(a, b) { return Math.floor(Math.random()*(b-a)+a); }
24 | var randn = function(mu, std){ return mu+gaussRandom()*std; }
25 |
26 | // Array utilities
27 | var zeros = function(n) {
28 | if(typeof(n)==='undefined' || isNaN(n)) { return []; }
29 | if(typeof ArrayBuffer === 'undefined') {
30 | // lacking browser support
31 | var arr = new Array(n);
32 | for(var i=0;i maxv) { maxv = w[i]; maxi = i; }
66 | if(w[i] < minv) { minv = w[i]; mini = i; }
67 | }
68 | return {maxi: maxi, maxv: maxv, mini: mini, minv: minv, dv:maxv-minv};
69 | }
70 |
71 | // create random permutation of numbers, in range [0...n-1]
72 | var randperm = function(n) {
73 | var i = n,
74 | j = 0,
75 | temp;
76 | var array = [];
77 | for(var q=0;qright
259 | var augment = function(V, crop, dx, dy, fliplr) {
260 | // note assumes square outputs of size crop x crop
261 | if(typeof(fliplr)==='undefined') var fliplr = false;
262 | if(typeof(dx)==='undefined') var dx = global.randi(0, V.sx - crop);
263 | if(typeof(dy)==='undefined') var dy = global.randi(0, V.sy - crop);
264 |
265 | // randomly sample a crop in the input volume
266 | var W;
267 | if(crop !== V.sx || dx!==0 || dy!==0) {
268 | W = new Vol(crop, crop, V.depth, 0.0);
269 | for(var x=0;x=V.sx || y+dy<0 || y+dy>=V.sy) continue; // oob
272 | for(var d=0;d=0 && oy=0 && ox=0 && oy=0 && ox=0 && oy=0 && ox a) { a = v; winx=ox; winy=oy;}
689 | }
690 | }
691 | }
692 | this.switchx[n] = winx;
693 | this.switchy[n] = winy;
694 | n++;
695 | A.set(ax, ay, d, a);
696 | }
697 | }
698 | }
699 | this.out_act = A;
700 | return this.out_act;
701 | },
702 | backward: function() {
703 | // pooling layers have no parameters, so simply compute
704 | // gradient wrt data here
705 | var V = this.in_act;
706 | V.dw = global.zeros(V.w.length); // zero out gradient wrt data
707 | var A = this.out_act; // computed in forward pass
708 |
709 | var n = 0;
710 | for(var d=0;d amax) amax = as[i];
841 | }
842 |
843 | // compute exponentials (carefully to not blow up)
844 | var es = global.zeros(this.out_depth);
845 | var esum = 0.0;
846 | for(var i=0;i 0) {
1004 | // violating dimension, apply loss
1005 | x.dw[i] += 1;
1006 | x.dw[y] -= 1;
1007 | loss += ydiff;
1008 | }
1009 | }
1010 |
1011 | return loss;
1012 | },
1013 | getParamsAndGrads: function() {
1014 | return [];
1015 | },
1016 | toJSON: function() {
1017 | var json = {};
1018 | json.out_depth = this.out_depth;
1019 | json.out_sx = this.out_sx;
1020 | json.out_sy = this.out_sy;
1021 | json.layer_type = this.layer_type;
1022 | json.num_inputs = this.num_inputs;
1023 | return json;
1024 | },
1025 | fromJSON: function(json) {
1026 | this.out_depth = json.out_depth;
1027 | this.out_sx = json.out_sx;
1028 | this.out_sy = json.out_sy;
1029 | this.layer_type = json.layer_type;
1030 | this.num_inputs = json.num_inputs;
1031 | }
1032 | }
1033 |
1034 | global.RegressionLayer = RegressionLayer;
1035 | global.SoftmaxLayer = SoftmaxLayer;
1036 | global.SVMLayer = SVMLayer;
1037 |
1038 | })(convnetjs);
1039 |
1040 | (function(global) {
1041 | "use strict";
1042 | var Vol = global.Vol; // convenience
1043 |
1044 | // Implements ReLU nonlinearity elementwise
1045 | // x -> max(0, x)
1046 | // the output is in [0, inf)
1047 | var ReluLayer = function(opt) {
1048 | var opt = opt || {};
1049 |
1050 | // computed
1051 | this.out_sx = opt.in_sx;
1052 | this.out_sy = opt.in_sy;
1053 | this.out_depth = opt.in_depth;
1054 | this.layer_type = 'relu';
1055 | }
1056 | ReluLayer.prototype = {
1057 | forward: function(V, is_training) {
1058 | this.in_act = V;
1059 | var V2 = V.clone();
1060 | var N = V.w.length;
1061 | var V2w = V2.w;
1062 | for(var i=0;i 1/(1+e^(-x))
1099 | // so the output is between 0 and 1.
1100 | var SigmoidLayer = function(opt) {
1101 | var opt = opt || {};
1102 |
1103 | // computed
1104 | this.out_sx = opt.in_sx;
1105 | this.out_sy = opt.in_sy;
1106 | this.out_depth = opt.in_depth;
1107 | this.layer_type = 'sigmoid';
1108 | }
1109 | SigmoidLayer.prototype = {
1110 | forward: function(V, is_training) {
1111 | this.in_act = V;
1112 | var V2 = V.cloneAndZero();
1113 | var N = V.w.length;
1114 | var V2w = V2.w;
1115 | var Vw = V.w;
1116 | for(var i=0;i max(x)
1153 | // where x is a vector of size group_size. Ideally of course,
1154 | // the input size should be exactly divisible by group_size
1155 | var MaxoutLayer = function(opt) {
1156 | var opt = opt || {};
1157 |
1158 | // required
1159 | this.group_size = typeof opt.group_size !== 'undefined' ? opt.group_size : 2;
1160 |
1161 | // computed
1162 | this.out_sx = opt.in_sx;
1163 | this.out_sy = opt.in_sy;
1164 | this.out_depth = Math.floor(opt.in_depth / this.group_size);
1165 | this.layer_type = 'maxout';
1166 |
1167 | this.switches = global.zeros(this.out_sx*this.out_sy*this.out_depth); // useful for backprop
1168 | }
1169 | MaxoutLayer.prototype = {
1170 | forward: function(V, is_training) {
1171 | this.in_act = V;
1172 | var N = this.out_depth;
1173 | var V2 = new Vol(this.out_sx, this.out_sy, this.out_depth, 0.0);
1174 |
1175 | // optimization branch. If we're operating on 1D arrays we dont have
1176 | // to worry about keeping track of x,y,d coordinates inside
1177 | // input volumes. In convnets we do :(
1178 | if(this.out_sx === 1 && this.out_sy === 1) {
1179 | for(var i=0;i a) {
1186 | a = a2;
1187 | ai = j;
1188 | }
1189 | }
1190 | V2.w[i] = a;
1191 | this.switches[i] = ix + ai;
1192 | }
1193 | } else {
1194 | var n=0; // counter for switches
1195 | for(var x=0;x a) {
1204 | a = a2;
1205 | ai = j;
1206 | }
1207 | }
1208 | V2.set(x,y,i,a);
1209 | this.switches[n] = ix + ai;
1210 | n++;
1211 | }
1212 | }
1213 | }
1214 |
1215 | }
1216 | this.out_act = V2;
1217 | return this.out_act;
1218 | },
1219 | backward: function() {
1220 | var V = this.in_act; // we need to set dw of this
1221 | var V2 = this.out_act;
1222 | var N = this.out_depth;
1223 | V.dw = global.zeros(V.w.length); // zero out gradient wrt data
1224 |
1225 | // pass the gradient through the appropriate switch
1226 | if(this.out_sx === 1 && this.out_sy === 1) {
1227 | for(var i=0;i tanh(x)
1274 | // so the output is between -1 and 1.
1275 | var TanhLayer = function(opt) {
1276 | var opt = opt || {};
1277 |
1278 | // computed
1279 | this.out_sx = opt.in_sx;
1280 | this.out_sy = opt.in_sy;
1281 | this.out_depth = opt.in_depth;
1282 | this.layer_type = 'tanh';
1283 | }
1284 | TanhLayer.prototype = {
1285 | forward: function(V, is_training) {
1286 | this.in_act = V;
1287 | var V2 = V.cloneAndZero();
1288 | var N = V.w.length;
1289 | for(var i=0;i= 2, 'Error! At least one input layer and one loss layer are required.');
1538 | assert(defs[0].type === 'input', 'Error! First layer must be the input layer, to declare size of inputs');
1539 |
1540 | // desugar layer_defs for adding activation, dropout layers etc
1541 | var desugar = function() {
1542 | var new_defs = [];
1543 | for(var i=0;i0) {
1595 | var prev = this.layers[i-1];
1596 | def.in_sx = prev.out_sx;
1597 | def.in_sy = prev.out_sy;
1598 | def.in_depth = prev.out_depth;
1599 | }
1600 |
1601 | switch(def.type) {
1602 | case 'fc': this.layers.push(new global.FullyConnLayer(def)); break;
1603 | case 'lrn': this.layers.push(new global.LocalResponseNormalizationLayer(def)); break;
1604 | case 'dropout': this.layers.push(new global.DropoutLayer(def)); break;
1605 | case 'input': this.layers.push(new global.InputLayer(def)); break;
1606 | case 'softmax': this.layers.push(new global.SoftmaxLayer(def)); break;
1607 | case 'regression': this.layers.push(new global.RegressionLayer(def)); break;
1608 | case 'conv': this.layers.push(new global.ConvLayer(def)); break;
1609 | case 'pool': this.layers.push(new global.PoolLayer(def)); break;
1610 | case 'relu': this.layers.push(new global.ReluLayer(def)); break;
1611 | case 'sigmoid': this.layers.push(new global.SigmoidLayer(def)); break;
1612 | case 'tanh': this.layers.push(new global.TanhLayer(def)); break;
1613 | case 'maxout': this.layers.push(new global.MaxoutLayer(def)); break;
1614 | case 'svm': this.layers.push(new global.SVMLayer(def)); break;
1615 | default: console.log('ERROR: UNRECOGNIZED LAYER TYPE: ' + def.type);
1616 | }
1617 | }
1618 | },
1619 |
1620 | // forward prop the network.
1621 | // The trainer class passes is_training = true, but when this function is
1622 | // called from outside (not from the trainer), it defaults to prediction mode
1623 | forward: function(V, is_training) {
1624 | if(typeof(is_training) === 'undefined') is_training = false;
1625 | var act = this.layers[0].forward(V, is_training);
1626 | for(var i=1;i=0;i--) { // first layer assumed input
1644 | this.layers[i].backward();
1645 | }
1646 | return loss;
1647 | },
1648 | getParamsAndGrads: function() {
1649 | // accumulate parameters and gradients for the entire network
1650 | var response = [];
1651 | for(var i=0;i maxv) { maxv = p[i]; maxi = i;}
1670 | }
1671 | return maxi; // return index of the class with highest class probability
1672 | },
1673 | toJSON: function() {
1674 | var json = {};
1675 | json.layers = [];
1676 | for(var i=0;i 0.0)) {
1754 | // only vanilla sgd doesnt need either lists
1755 | // momentum needs gsum
1756 | // adagrad needs gsum
1757 | // adadelta needs gsum and xsum
1758 | for(var i=0;i 0 ? 1 : -1);
1785 | var l2grad = l2_decay * (p[j]);
1786 |
1787 | var gij = (l2grad + l1grad + g[j]) / this.batch_size; // raw batch gradient
1788 |
1789 | var gsumi = this.gsum[i];
1790 | var xsumi = this.xsum[i];
1791 | if(this.method === 'adagrad') {
1792 | // adagrad update
1793 | gsumi[j] = gsumi[j] + gij * gij;
1794 | var dx = - this.learning_rate / Math.sqrt(gsumi[j] + this.eps) * gij;
1795 | p[j] += dx;
1796 | } else if(this.method === 'windowgrad') {
1797 | // this is adagrad but with a moving window weighted average
1798 | // so the gradient is not accumulated over the entire history of the run.
1799 | // it's also referred to as Idea #1 in Zeiler paper on Adadelta. Seems reasonable to me!
1800 | gsumi[j] = this.ro * gsumi[j] + (1-this.ro) * gij * gij;
1801 | var dx = - this.learning_rate / Math.sqrt(gsumi[j] + this.eps) * gij; // eps added for better conditioning
1802 | p[j] += dx;
1803 | } else if(this.method === 'adadelta') {
1804 | // assume adadelta if not sgd or adagrad
1805 | gsumi[j] = this.ro * gsumi[j] + (1-this.ro) * gij * gij;
1806 | var dx = - Math.sqrt((xsumi[j] + this.eps)/(gsumi[j] + this.eps)) * gij;
1807 | xsumi[j] = this.ro * xsumi[j] + (1-this.ro) * dx * dx; // yes, xsum lags behind gsum by 1.
1808 | p[j] += dx;
1809 | } else if(this.method === 'nesterov') {
1810 | var dx = gsumi[j];
1811 | gsumi[j] = gsumi[j] * this.momentum + this.learning_rate * gij;
1812 | dx = this.momentum * dx - (1.0 + this.momentum) * gsumi[j];
1813 | p[j] += dx;
1814 | } else {
1815 | // assume SGD
1816 | if(this.momentum > 0.0) {
1817 | // momentum update
1818 | var dx = this.momentum * gsumi[j] - this.learning_rate * gij; // step
1819 | gsumi[j] = dx; // back this up for next iteration of momentum
1820 | p[j] += dx; // apply corrected gradient
1821 | } else {
1822 | // vanilla sgd
1823 | p[j] += - this.learning_rate * gij;
1824 | }
1825 | }
1826 | g[j] = 0.0; // zero out gradient so that we can begin accumulating anew
1827 | }
1828 | }
1829 | }
1830 |
1831 | // appending softmax_loss for backwards compatibility, but from now on we will always use cost_loss
1832 | // in future, TODO: have to completely redo the way loss is done around the network as currently
1833 | // loss is a bit of a hack. Ideally, user should specify arbitrary number of loss functions on any layer
1834 | // and it should all be computed correctly and automatically.
1835 | return {fwd_time: fwd_time, bwd_time: bwd_time,
1836 | l2_decay_loss: l2_decay_loss, l1_decay_loss: l1_decay_loss,
1837 | cost_loss: cost_loss, softmax_loss: cost_loss,
1838 | loss: cost_loss + l1_decay_loss + l2_decay_loss}
1839 | }
1840 | }
1841 |
1842 | global.Trainer = Trainer;
1843 | global.SGDTrainer = Trainer; // backwards compatibility
1844 | })(convnetjs);
1845 |
1846 | (function(global) {
1847 | "use strict";
1848 |
1849 | // used utilities, make explicit local references
1850 | var randf = global.randf;
1851 | var randi = global.randi;
1852 | var Net = global.Net;
1853 | var Trainer = global.Trainer;
1854 | var maxmin = global.maxmin;
1855 | var randperm = global.randperm;
1856 | var weightedSample = global.weightedSample;
1857 | var getopt = global.getopt;
1858 | var arrUnique = global.arrUnique;
1859 |
1860 | /*
1861 | A MagicNet takes data: a list of convnetjs.Vol(), and labels
1862 | which for now are assumed to be class indeces 0..K. MagicNet then:
1863 | - creates data folds for cross-validation
1864 | - samples candidate networks
1865 | - evaluates candidate networks on all data folds
1866 | - produces predictions by model-averaging the best networks
1867 | */
1868 | var MagicNet = function(data, labels, opt) {
1869 | var opt = opt || {};
1870 | if(typeof data === 'undefined') { data = []; }
1871 | if(typeof labels === 'undefined') { labels = []; }
1872 |
1873 | // required inputs
1874 | this.data = data; // store these pointers to data
1875 | this.labels = labels;
1876 |
1877 | // optional inputs
1878 | this.train_ratio = getopt(opt, 'train_ratio', 0.7);
1879 | this.num_folds = getopt(opt, 'num_folds', 10);
1880 | this.num_candidates = getopt(opt, 'num_candidates', 50); // we evaluate several in parallel
1881 | // how many epochs of data to train every network? for every fold?
1882 | // higher values mean higher accuracy in final results, but more expensive
1883 | this.num_epochs = getopt(opt, 'num_epochs', 50);
1884 | // number of best models to average during prediction. Usually higher = better
1885 | this.ensemble_size = getopt(opt, 'ensemble_size', 10);
1886 |
1887 | // candidate parameters
1888 | this.batch_size_min = getopt(opt, 'batch_size_min', 10);
1889 | this.batch_size_max = getopt(opt, 'batch_size_max', 300);
1890 | this.l2_decay_min = getopt(opt, 'l2_decay_min', -4);
1891 | this.l2_decay_max = getopt(opt, 'l2_decay_max', 2);
1892 | this.learning_rate_min = getopt(opt, 'learning_rate_min', -4);
1893 | this.learning_rate_max = getopt(opt, 'learning_rate_max', 0);
1894 | this.momentum_min = getopt(opt, 'momentum_min', 0.9);
1895 | this.momentum_max = getopt(opt, 'momentum_max', 0.9);
1896 | this.neurons_min = getopt(opt, 'neurons_min', 5);
1897 | this.neurons_max = getopt(opt, 'neurons_max', 30);
1898 |
1899 | // computed
1900 | this.folds = []; // data fold indices, gets filled by sampleFolds()
1901 | this.candidates = []; // candidate networks that are being currently evaluated
1902 | this.evaluated_candidates = []; // history of all candidates that were fully evaluated on all folds
1903 | this.unique_labels = arrUnique(labels);
1904 | this.iter = 0; // iteration counter, goes from 0 -> num_epochs * num_training_data
1905 | this.foldix = 0; // index of active fold
1906 |
1907 | // callbacks
1908 | this.finish_fold_callback = null;
1909 | this.finish_batch_callback = null;
1910 |
1911 | // initializations
1912 | if(this.data.length > 0) {
1913 | this.sampleFolds();
1914 | this.sampleCandidates();
1915 | }
1916 | };
1917 |
1918 | MagicNet.prototype = {
1919 |
1920 | // sets this.folds to a sampling of this.num_folds folds
1921 | sampleFolds: function() {
1922 | var N = this.data.length;
1923 | var num_train = Math.floor(this.train_ratio * N);
1924 | this.folds = []; // flush folds, if any
1925 | for(var i=0;i= lastiter) {
2007 | // finished evaluation of this fold. Get final validation
2008 | // accuracies, record them, and go on to next fold.
2009 | var val_acc = this.evalValErrors();
2010 | for(var k=0;k= this.folds.length) {
2023 | // we finished all folds as well! Record these candidates
2024 | // and sample new ones to evaluate.
2025 | for(var k=0;k (b.accv / b.acc.length)
2032 | ? -1 : 1;
2033 | });
2034 | // and clip only to the top few ones (lets place limit at 3*ensemble_size)
2035 | // otherwise there are concerns with keeping these all in memory
2036 | // if MagicNet is being evaluated for a very long time
2037 | if(this.evaluated_candidates.length > 3 * this.ensemble_size) {
2038 | this.evaluated_candidates = this.evaluated_candidates.slice(0, 3 * this.ensemble_size);
2039 | }
2040 | if(this.finish_batch_callback !== null) {
2041 | this.finish_batch_callback();
2042 | }
2043 | this.sampleCandidates(); // begin with new candidates
2044 | this.foldix = 0; // reset this
2045 | } else {
2046 | // we will go on to another fold. reset all candidates nets
2047 | for(var k=0;k