├── .nojekyll ├── 99~参考资料 ├── d2l.ai │ └── README.md ├── 周志华~《机器学习》 │ └── README.md ├── AI 研报 │ └── 2021~中金~AI 十年展望 │ │ ├── 03~AI 视角下的自动驾驶行业全解析.md │ │ ├── 02~边际成本决定竞争力,算法龙头主导格局优化.md │ │ ├── README.md │ │ └── 01~底层模拟人脑,算力决定上限.md ├── TensorFlow-in-Practice │ └── README.md ├── DeepLearning-Specialization │ ├── NeuralNetworks-And-DeepLearning │ │ ├── Week1 │ │ │ ├── 基于 Logistic 回归的图像分类实践.md │ │ │ ├── 浅层神经网络.md │ │ │ ├── 基于 Numpy 的 Python 向量操作.md │ │ │ ├── 神经网络、有监督学习与深度学习.md │ │ │ ├── 二元分类与 Logistic 回归.md │ │ │ └── 梯度下降与向量化操作.md │ │ └── Week2 │ │ │ └── Python+Basics+With+Numpy+v3.ipynb │ ├── Sequence Models │ │ ├── Week1 │ │ │ └── Music Generation │ │ │ │ └── my_music.midi │ │ └── Week2 │ │ │ └── Operations+on+word+vectors+-+v2.ipynb │ ├── Convolutional_Neural_Networks │ │ └── Week4 │ │ │ ├── Art_generation_output │ │ │ ├── style1.png │ │ │ ├── content1.png │ │ │ └── generated1.png │ │ │ └── Face+Recognition+for+the+Happy+House+-+v3.ipynb │ ├── README.md │ └── Improving Deep Neural Networks │ │ └── Week1 │ │ └── Gradient+Checking+v1.ipynb └── FastAI │ └── README.md ├── 06~行业应用 ├── 社会网络 │ ├── 社团发现 │ │ └── README.md │ └── README.md ├── 推荐系统 │ ├── 99~参考资料 │ │ ├── 2016-推荐系统实践篇.md │ │ ├── 2016-推荐系统理论篇.md │ │ └── 2017-架构师-用户画像实践.md │ └── README.md ├── README.md ├── RPA │ └── README.md └── 无人驾驶 │ └── 概述.md ├── 04~自然语言处理 └── README.link ├── 01~AI 数学基础 └── README.link ├── .DS_Store ├── 02~机器学习 └── README.link ├── 03~深度学习 └── README.link ├── 05~工具与工程化 └── README.link ├── .github ├── .DS_Store ├── ISSUE_TEMPLATE │ ├── custom.md │ ├── feature_request.md │ └── bug_report.md └── ABOUT.md ├── .gitattributes ├── .gitignore ├── _sidebar.md ├── index.html ├── README.md ├── header.svg └── LICENSE /.nojekyll: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /99~参考资料/d2l.ai/README.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /06~行业应用/社会网络/社团发现/README.md: -------------------------------------------------------------------------------- 1 | # 社团发现 2 | -------------------------------------------------------------------------------- /99~参考资料/周志华~《机器学习》/README.md: -------------------------------------------------------------------------------- 1 | # 周志华《机器学习》 2 | -------------------------------------------------------------------------------- /99~参考资料/AI 研报/2021~中金~AI 十年展望/03~AI 视角下的自动驾驶行业全解析.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /06~行业应用/社会网络/README.md: -------------------------------------------------------------------------------- 1 | # 社会网络分析 2 | 3 | # Links 4 | -------------------------------------------------------------------------------- /99~参考资料/AI 研报/2021~中金~AI 十年展望/02~边际成本决定竞争力,算法龙头主导格局优化.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /04~自然语言处理/README.link: -------------------------------------------------------------------------------- 1 | https://github.com/wx-chevalier/NLP-Notes 2 | -------------------------------------------------------------------------------- /01~AI 数学基础/README.link: -------------------------------------------------------------------------------- 1 | https://github.com/wx-chevalier/AI-Math-Notes 2 | 
-------------------------------------------------------------------------------- /99~参考资料/TensorFlow-in-Practice/README.md: -------------------------------------------------------------------------------- 1 | # TensorFlow-in-Practice 2 | -------------------------------------------------------------------------------- /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wx-chevalier/AI-Notes/master/.DS_Store -------------------------------------------------------------------------------- /02~机器学习/README.link: -------------------------------------------------------------------------------- 1 | https://github.com/wx-chevalier/MachineLearning-Notes 2 | -------------------------------------------------------------------------------- /03~深度学习/README.link: -------------------------------------------------------------------------------- 1 | https://github.com/wx-chevalier/DeepLearning-Notes 2 | -------------------------------------------------------------------------------- /05~工具与工程化/README.link: -------------------------------------------------------------------------------- 1 | https://github.com/wx-chevalier/AI-Toolkits-Notes 2 | -------------------------------------------------------------------------------- /06~行业应用/推荐系统/99~参考资料/2016-推荐系统实践篇.md: -------------------------------------------------------------------------------- 1 | > 参考地址:https://cowtransfer.com/s/854c9d86a08843 2 | -------------------------------------------------------------------------------- /06~行业应用/推荐系统/99~参考资料/2016-推荐系统理论篇.md: -------------------------------------------------------------------------------- 1 | > 参考地址:https://cowtransfer.com/s/854c9d86a08843 2 | -------------------------------------------------------------------------------- /.github/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wx-chevalier/AI-Notes/master/.github/.DS_Store -------------------------------------------------------------------------------- /06~行业应用/推荐系统/99~参考资料/2017-架构师-用户画像实践.md: -------------------------------------------------------------------------------- 1 | > 参考地址:https://cowtransfer.com/s/854c9d86a08843 2 | -------------------------------------------------------------------------------- /06~行业应用/README.md: -------------------------------------------------------------------------------- 1 | # 人工智能与深度学习的行业应用 2 | 3 | ![人工智能的行业](https://s2.ax1x.com/2019/10/26/KBZR29.png) 4 | -------------------------------------------------------------------------------- /99~参考资料/AI 研报/2021~中金~AI 十年展望/README.md: -------------------------------------------------------------------------------- 1 | > [原文地址](https://mp.weixin.qq.com/s/27MRrC6qlDdbh1cIwDOgrg) 2 | -------------------------------------------------------------------------------- /99~参考资料/DeepLearning-Specialization/NeuralNetworks-And-DeepLearning/Week1/基于 Logistic 回归的图像分类实践.md: -------------------------------------------------------------------------------- 1 | # 基于 Logistic 回归的图像分类实践 2 | 3 | # 延伸阅读 4 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.xmind filter=lfs diff=lfs merge=lfs -text 2 | *.zip filter=lfs diff=lfs merge=lfs -text 3 | *.pdf filter=lfs diff=lfs merge=lfs -text 4 | -------------------------------------------------------------------------------- /06~行业应用/推荐系统/README.md: 
-------------------------------------------------------------------------------- 1 | # 推荐系统 2 | 3 | # Links 4 | 5 | - https://mp.weixin.qq.com/s/CZwQkE76daQ7BRxWbKlMpg 6 | - https://mp.weixin.qq.com/s/VbstSv8-L8cd6CkTcSCLrA 7 | -------------------------------------------------------------------------------- /99~参考资料/FastAI/README.md: -------------------------------------------------------------------------------- 1 | # Fast AI 2 | 3 | [Paper Space](https://www.paperspace.com/) 4 | 5 | ```sh 6 | $ curl http://files.fast.ai/setup/paperspace | bash 7 | ``` 8 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/custom.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Custom issue template 3 | about: Describe this issue template's purpose here. 4 | title: "" 5 | labels: "" 6 | assignees: "" 7 | --- 8 | -------------------------------------------------------------------------------- /99~参考资料/DeepLearning-Specialization/Sequence Models/Week1/Music Generation/my_music.midi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wx-chevalier/AI-Notes/master/99~参考资料/DeepLearning-Specialization/Sequence Models/Week1/Music Generation/my_music.midi -------------------------------------------------------------------------------- /99~参考资料/DeepLearning-Specialization/Convolutional_Neural_Networks/Week4/Art_generation_output/style1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wx-chevalier/AI-Notes/master/99~参考资料/DeepLearning-Specialization/Convolutional_Neural_Networks/Week4/Art_generation_output/style1.png -------------------------------------------------------------------------------- /99~参考资料/DeepLearning-Specialization/Convolutional_Neural_Networks/Week4/Art_generation_output/content1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wx-chevalier/AI-Notes/master/99~参考资料/DeepLearning-Specialization/Convolutional_Neural_Networks/Week4/Art_generation_output/content1.png -------------------------------------------------------------------------------- /99~参考资料/DeepLearning-Specialization/Convolutional_Neural_Networks/Week4/Art_generation_output/generated1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wx-chevalier/AI-Notes/master/99~参考资料/DeepLearning-Specialization/Convolutional_Neural_Networks/Week4/Art_generation_output/generated1.png -------------------------------------------------------------------------------- /99~参考资料/DeepLearning-Specialization/README.md: -------------------------------------------------------------------------------- 1 | # 深度学习课程笔记 2 | 3 | # 狗粮 4 | 5 | 如果觉得本系列对你有所帮助,欢迎给我家布丁买点狗粮(支付宝扫码)~ 6 | 7 | # 版权 8 | 9 | ![](https://parg.co/bDY) ![](https://parg.co/bDm) 10 | 11 | 笔者所有文章遵循 [知识共享 署名-非商业性使用-禁止演绎 4.0 国际许可协议](https://creativecommons.org/licenses/by-nc-nd/4.0/deed.zh),欢迎转载,尊重版权。 12 | -------------------------------------------------------------------------------- /.github/ABOUT.md: -------------------------------------------------------------------------------- 1 | # ABOUT | 关于 2 | 3 | ![default](https://user-images.githubusercontent.com/5803001/44629091-c0c56180-a97c-11e8-8aff-52d51a8aec1f.jpg) 4 | 5 | 本系列使用的参考资料请前往 [Awesome-Lists/DataScienceAI](https://github.com/wx-chevalier/Awesome-Lists),本系列的精华缩略版可以参考 
[Awesome-CheatSheet/DataScienceAI](https://github.com/wx-chevalier/Awesome-CheatSheets)。 6 | 7 | ## 规划 8 | 9 | ## 致谢 10 | 11 | 由于笔者平日忙于工作,几乎所有线上的文档都是我夫人帮忙整理,在此特别致谢;同时也感谢我家的布丁安静的趴在脚边,不再那么粪发涂墙。 12 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: "" 5 | labels: "" 6 | assignees: "" 7 | --- 8 | 9 | **Is your feature request related to a problem? Please describe.** 10 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 11 | 12 | **Describe the solution you'd like** 13 | A clear and concise description of what you want to happen. 14 | 15 | **Describe alternatives you've considered** 16 | A clear and concise description of any alternative solutions or features you've considered. 17 | 18 | **Additional context** 19 | Add any other context or screenshots about the feature request here. 20 | -------------------------------------------------------------------------------- /99~参考资料/DeepLearning-Specialization/NeuralNetworks-And-DeepLearning/Week1/浅层神经网络.md: -------------------------------------------------------------------------------- 1 | # 浅层神经网络 2 | 3 | # 单隐层计算 4 | 5 | 在单隐层网络中,我们可以按照与输入层相似的结构,依次计算得到隐层与输出层的输出 6 | 7 | # Activation Function 8 | 9 | 构建神经网络中非常重要的一个环节就是选择合适的激活函数(Activation Function),激活函数是为了增加神经网络模型的非线性,也可以看做从数据空间到最终表达空间的一种映射。仅就 Sigmoid 与 tanh 相比时,在大部分情况下我们应该优先使用 tanh 函数;除了在最终的输出层,因为输出层我们需要得到的是 0~1 范围内的概率表示。譬如在上面介绍的浅层神经网络中,我们就可以使用 tanh 作为隐层的激活函数,而使用 Sigmoid 作为输出层的激活函数。 10 | 11 | 不过 Sigmoid 与 tanh 同样都存在在极大值或者极小值处梯度较小、收敛缓慢的问题。并且采用 Sigmoid 等函数,算激活函数时(指数运算),计算量大,反向传播求误差梯度时,求导涉及除法,计算量相对大;而采用 ReLU(rectified linear unit) 激活函数,整个过程的计算量节省很多。此外,ReLU 会使一部分神经元的输出为 0,这样就造成了网络的稀疏性,并且减少了参数的相互依存关系,缓解了过拟合问题的发生。 12 | 13 | ![](https://raw.githubusercontent.com/wx-chevalier/OSS/master/2017/8/1/activation_function.png) 14 | 15 | ## 为什么需要非线性激活函数?
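一个直观的回答是:若各层都采用线性(恒等)激活,无论堆叠多少层,整体仍然等价于一个线性变换,隐层也就失去了意义。下面给出一段仅作示意的 NumPy 小实验来验证这一点,其中的层数、维度与随机种子均为演示而假设,并非课程原始代码:

```py
import numpy as np

np.random.seed(0)  # 随机种子仅为示意

# 假设的两层网络参数:输入 3 维,隐层 4 维,输出 1 维
W1, b1 = np.random.randn(4, 3), np.random.randn(4, 1)
W2, b2 = np.random.randn(1, 4), np.random.randn(1, 1)
x = np.random.randn(3, 1)

# 情形一:隐层使用恒等(线性)激活
h_linear = W1 @ x + b1
y_linear = W2 @ h_linear + b2

# 两层线性层可以合并为一个等价的单层线性变换 W x + b
W = W2 @ W1
b = W2 @ b1 + b2
assert np.allclose(y_linear, W @ x + b)

# 情形二:隐层使用 tanh 非线性激活,则不再存在这种等价的单层表示
h_tanh = np.tanh(W1 @ x + b1)
y_tanh = W2 @ h_tanh + b2
print(y_linear.ravel(), y_tanh.ravel())
```

可以看到,线性激活下的两层网络完全可以被一个单层线性模型所替代;只有引入非线性,增加隐层才能真正提升模型的表达能力。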
16 | 17 | 如果我们选择了所谓的 Identity Activation Function,即直接将输入值作为输出返回,这样我们最终的输出值也就自然与输入值存在线性相关性。 18 | 19 | ## 激活函数的导数 20 | 21 | # Gradient Descent | 梯度下降 22 | 23 | ## 反向传播的直观解释 24 | 25 | ## 随机初始化 26 | 27 | # 延伸阅读 28 | -------------------------------------------------------------------------------- /06~行业应用/RPA/README.md: -------------------------------------------------------------------------------- 1 | # RPA 2 | 3 | 单从概念上说:机器人流程自动化(Robotic process automation),指的是依托机器人流程自动化技术的电脑程序,代替人类用来自动执行任务,以达到快速降低成本、提升表现的目的。最适合采用这类技术的是涉及大量重复性手工处理、有固定规则、结构化数据较多的机构,比如金融财务服务行业。RPA 能够替代人工,将繁琐的业务操作流程自动化,并更加精准地完成工作。 4 | 5 | # 背景特性 6 | 7 | 以最常见的 RPA 财务分析机器人为例,处理财务分析报表时,机器人会打开指定的企业财务数据页面,一键提取页面中相应的数据信息,并自动汇总在表格软件当中。一些功能较多的 RPA 软件甚至可以将汇总好的表格数据自动生成柱状图、饼图等,并直接按照要求置入 PPT 文件里,大部分需要人工几小时甚至十几小时完成的任务,RPA 软件可以在几分钟之内搞定,极大程度上缩短业务流程。当然,这只是财务分析业务这一种 RPA 机器人的案例,在其他领域还有更多不同的 RPA 软件能够提供相应的服务。如果只看上面的案例,可能会有读者认为 RPA 与 AI 并无差别,但实际上,RPA 从技术层面来说并不能等同于人工智能,只是在很多环节上会用到人工智能的技术。 8 | 9 | 当机器人打开需要分析的企业财务页面时,就会用到人工智能中的 NLP(自然语言处理)技术,识别页面中企业名称、财务数据等相关字段,从而更加精准的整理成表格文件。 10 | 11 | # 技术分析 12 | 13 | RPA 要实现的最核心的功能是通过模拟人类手工在电脑的操作,包括键盘、鼠标的输入等等,以达到节省人工操作、提升效率的目的,所以模拟技术可以说是 RPA 的核心之一。 14 | 15 | 多数主流 RPA 产品采用的技术主要有以下几种: 16 | 17 | - 利用 WIN32 API 基于屏幕坐标点进行鼠标的移动点击拖放以及键盘操作; 18 | - 利用 WIN32 API 获取句柄从而在鼠标光标不动情况下直接操作 windows OS 上运行的任意窗体及其内的控件; 19 | - 利用 IE DOM 类库在鼠标光标不动情况下直接操作 IE 浏览器所打开的网页中的 HTML 元素; 20 | - 利用图像识别和 OCR 技术基于屏幕区域截图比对来获取屏幕坐标位置并用 WIN32 API 触发鼠标键盘操作。 21 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: "" 5 | labels: "" 6 | assignees: "" 7 | --- 8 | 9 | **Describe the bug** 10 | A clear and concise description of what the bug is. 11 | 12 | **To Reproduce** 13 | Steps to reproduce the behavior: 14 | 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Desktop (please complete the following information):** 27 | 28 | - OS: [e.g. iOS] 29 | - Browser [e.g. chrome, safari] 30 | - Version [e.g. 22] 31 | 32 | **Smartphone (please complete the following information):** 33 | 34 | - Device: [e.g. iPhone6] 35 | - OS: [e.g. iOS8.1] 36 | - Browser [e.g. stock browser, safari] 37 | - Version [e.g. 22] 38 | 39 | **Additional context** 40 | Add any other context about the problem here. 
41 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore all 2 | * 3 | 4 | # Unignore all with extensions 5 | !*.* 6 | 7 | # Unignore all dirs 8 | !*/ 9 | 10 | .DS_Store 11 | 12 | # Logs 13 | logs 14 | *.log 15 | npm-debug.log* 16 | yarn-debug.log* 17 | yarn-error.log* 18 | 19 | # Runtime data 20 | pids 21 | *.pid 22 | *.seed 23 | *.pid.lock 24 | 25 | # Directory for instrumented libs generated by jscoverage/JSCover 26 | lib-cov 27 | 28 | # Coverage directory used by tools like istanbul 29 | coverage 30 | 31 | # nyc test coverage 32 | .nyc_output 33 | 34 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files) 35 | .grunt 36 | 37 | # Bower dependency directory (https://bower.io/) 38 | bower_components 39 | 40 | # node-waf configuration 41 | .lock-wscript 42 | 43 | # Compiled binary addons (https://nodejs.org/api/addons.html) 44 | build/Release 45 | 46 | # Dependency directories 47 | node_modules/ 48 | jspm_packages/ 49 | 50 | # TypeScript v1 declaration files 51 | typings/ 52 | 53 | # Optional npm cache directory 54 | .npm 55 | 56 | # Optional eslint cache 57 | .eslintcache 58 | 59 | # Optional REPL history 60 | .node_repl_history 61 | 62 | # Output of 'npm pack' 63 | *.tgz 64 | 65 | # Yarn Integrity file 66 | .yarn-integrity 67 | 68 | # dotenv environment variables file 69 | .env 70 | 71 | # next.js build output 72 | .next 73 | -------------------------------------------------------------------------------- /99~参考资料/DeepLearning-Specialization/NeuralNetworks-And-DeepLearning/Week1/基于 Numpy 的 Python 向量操作.md: -------------------------------------------------------------------------------- 1 | # Broadcasting 2 | 3 | Numpy 会自动进行矩阵扩展操作以适应指定的矩阵运算 4 | 5 | ```py 6 | A (2d array): 5 x 4 7 | B (1d array): 1 8 | Result (2d array): 5 x 4 9 | 10 | 11 | A (2d array): 5 x 4 12 | B (1d array): 4 13 | Result (2d array): 5 x 4 14 | 15 | 16 | A (3d array): 15 x 3 x 5 17 | B (3d array): 15 x 1 x 5 18 | Result (3d array): 15 x 3 x 5 19 | 20 | 21 | A (3d array): 15 x 3 x 5 22 | B (2d array): 3 x 5 23 | Result (3d array): 15 x 3 x 5 24 | 25 | 26 | A (3d array): 15 x 3 x 5 27 | B (2d array): 3 x 1 28 | Result (3d array): 15 x 3 x 5 29 | 30 | 31 | A (4d array): 8 x 1 x 6 x 1 32 | B (3d array): 7 x 1 x 5 33 | Result (4d array): 8 x 7 x 6 x 5 34 | ``` 35 | 36 | ```py 37 | a = np.random.randn(2, 3) # a.shape = (2, 3) 38 | b = np.random.randn(2, 1) # b.shape = (2, 1) 39 | c = a + b # c.shape = (2, 3), b is broadcast across the columns 40 | ``` 41 | 42 | Note that the following example uses a different pair of shapes, which cannot be broadcast together. In numpy the "\*" operator indicates element-wise multiplication. It is different from "np.dot()". If you would try "c = np.dot(a,b)" you would get c.shape = (4, 2). 43 | 44 | Also, the broadcasting cannot happen because of the shape of b. b should have been something like (4, 1) or (1, 3) to broadcast properly. So a\*b leads to an error!
45 | 46 | ```py 47 | a = np.random.randn(4, 3) # a.shape = (4, 3) 48 | b = np.random.randn(3, 2) # b.shape = (3, 2) 49 | c = a*b 50 | 51 | // ValueError: operands could not be broadcast together with shapes (4,3) (3,2) 52 | ``` 53 | 54 | # 延伸阅读 55 | -------------------------------------------------------------------------------- /99~参考资料/AI 研报/2021~中金~AI 十年展望/01~底层模拟人脑,算力决定上限.md: -------------------------------------------------------------------------------- 1 | > [原文地址](https://research.cicc.com/frontend/recommend/detail?id=2176) 2 | 3 | # AI 十年展望(一):底层模拟人脑,算力决定上限 4 | 5 | AI 技术中长期对社会的潜在影响深远,影响几乎所有行业。本文介绍了以深度学习为代表的人工智能理论的基本原理,并指出了由于目前的人工智能具备坚实的数学基础、神经学基础,未来随着底层算力的不断增长,人工智能影响边界将会不断扩宽,行业的发展潜力目前仍处在被市场低估状态。 6 | 7 | # 摘要 8 | 9 | AI 能够从底层模拟人脑主要工作机制,基于其理论的模型能够达到的智能水平上限较高。人的神经元近似一个基于阈值的二进制的逻辑门,与数字电路 0/1 的机制相似,深度学习能从底层上模拟人脑神经元工作机制,只要网络层数、神经元个数足够多,AI 将在某些维度接近甚至超过人脑智能。人工神经网络 4 大理论支柱为“阈值逻辑”、“Hebb 学习率”、“梯度下降”、“反向传播”,前 2 个理论解决了单个神经元层面的建模问题,后 2 个理论则解决了多层神经网络训练问题。2006 年 Hinton 首次实现了 5 层神经网络的训练,之后行业迎来爆发式发展,不断验证了该技术的潜力。 10 | 11 | 深度学习具备坚实的数学理论基础支撑。人脑绝大多数活动本质上都是广义计算问题,因此人脑其实是一个复杂的函数,深度学习就是去找到这个函数,万能近似定理则从数学上证明了一定条件下深度神经网络模型能够模拟任意的函数。 12 | 13 | 用于深度学习的算力以 6 年 30 万倍的速度增加,算力是核心瓶颈也是未来提升的关键。从进化角度看,人的智能是一个随着神经元数量提升,从量变到质变的过程,这个量变过程对应人工智能中模型所用算力的提升过程。目前能够实现的 AI 模型中不论从神经元个数还是连接数量看,与真实人类还有较大的差距,未来随着现有芯片技术的不断推进,和突破冯诺依曼架构的类脑芯片等新技术的发展,算力的持续突破将会不断释放人工智能技术的潜力。 14 | 15 | # 全文概要 16 | 17 | 1、为什么说深度学习技术的潜在上限高? 18 | 19 | 因为深度学习从底层模拟人脑神经元的主要工作机制。智能很大程度是广义计算问题,人工神经网络尽管无法做到完全“复制”人脑,但已经能较好地模拟其主要底层机制,因为神经元可近似为基于阈值的二进制单元,类似数字电路 0/1 机制。 20 | 21 | 从生物进化的角度看,人的智能是量变到质变的过程。在完成单个神经元主要工作机制模型后,只要网络层数、神经元个数足够多,AI 将在某些维度接近甚至超过人脑智能。 22 | 23 | 此外,从数学角度,万能近似定理论证了深度学习有坚实的数学基础。该定理证明了深度学习数学模型能够以任意精度逼近任意的函数,而人的智能很大程度即广义计算问题,进而深度学习模型能够模拟人脑的绝大部分活动,具备很高的上限。 24 | 25 | ![人工神经网络首先从单个神经元维度模拟的人脑主要机制](https://ngte-superbed.oss-cn-beijing.aliyuncs.com/item/20240110184658.png) 26 | 27 | 2、为什么直到近几年深度学习才爆发? 28 | 29 | 因为早期都是浅层神经网络,直到 2006 年才首次实现 5 层隐层网络的训练。2006 年 Hinton 首次把 ANN 提升到了 5 层,解决了深度神经网络的训练问题;2012 年 Hinton 在 ImageNet 挑战赛中引进深度学习算法,在随后几年内,深度学习成功地大幅降低了图片识别错误率,随后行业迎来爆发式发展,深度学习的商用进程得以加速。 30 | -------------------------------------------------------------------------------- /06~行业应用/无人驾驶/概述.md: -------------------------------------------------------------------------------- 1 | # 无人驾驶概述 2 | 3 | 1. 感知(Perception):主要涉及的技术点包括场景理解、交通状况分析、路面检测、空间检测、障碍物检测、行人检测、路沿检测、车道检测。还有一个比较新颖有趣的是通过胎压去检测道路质量。 4 | 5 | 2. 运动规划(Motion Planning):主要涉及的技术点包括运动规划、轨迹规划、速度规划、运动模型。比较有趣的一些进展包括通过赛车游戏去学习基于网格的运动规划,重量级货车的避障规划,普适于无人驾驶的双轮模型等等。 6 | 7 | 3. 防碰撞(CollisionAvoidance):主要涉及如何通过车内的感知系统以及 V2X 系统去辅助防碰撞。比较有趣的一些进展包括如何实时地去评估当前驾驶行为的危险性,如何通过当前道路的拓扑去增强自行车骑士的安全性等等。 8 | 9 | 4. 地图与定位(Mapping andLocalization):主要涉及如何通过不同的传感器,包括激光雷达、视觉、GNSS,以及 V2X 去建图与定位。比较有趣的一些进展包括如何在一些特殊的场景去定位,比如在长隧道里面,既没有 GNSS 信号,也没有太好的激光或者视觉特征的时候如何定位。 10 | 11 | 5. 合作系统(CooperativeSystems):主要涉及如何协同多个无人车去完成一些任务,比如在多个无人车同时在一个十字路口出现时如何调度,还有就是当有多个无人车同时在停车场试如何有序的停车。 12 | 13 | 6. 控制策略(Control Strategy):主要研究在不同的细分场景下的控制策略,比如在十字路口如何控制,转线如何控制,在感知数据不可靠时如何尽量安全的控制等等。 14 | 15 | 7. 车辆检测与跟踪(VehicleDetection and Tracking):主要关注如何通过激光雷达、视觉,以及毫米波雷达进行车辆检测与跟踪。比较有趣的工作包括通过深度学习与深度视觉的结合进行车辆跟踪,通过单目视觉深度学习去尽量估计车体大小,通过传统视觉边缘检测方法去判断是否车体等。 16 | 17 | 8. 静态物体检测(Static ObjectDetection):主要涉及通过视觉以及激光雷达去检测一些静态的物体,包括交通灯、交通指示牌、路沿、路面等等,每个物体品类的检测都是一个细分方向。 18 | 19 | 9. 动态物体检测(Moving ObjectDetection):主要涉及通过视觉、激光雷达、毫米波雷达,以及传感器融合的方法去检测一些动态的物体,包括行人、车辆、自行车骑士等,并根据这些动态物体的动作去预测行为。 20 | 21 | 10. 道路与路口检测(Road andIntersection Detection):道路与路口检测由于其特殊性以及对安全的影响,被单独列出作为一个细分的小方向。研究的前沿一般涉及一些细分场景,比如建筑工地的检测、停车位的检测等。 22 | 23 | 11. 
决策系统(Planning andDecision):主要涉及每个无人车的动作的决策,比如加速、刹车、换线、超车、调头等等。研究的前沿一般涉及在高速行驶中如何安全的换线,在通过视觉理解了场景后如何决策,在感知信息缺失的时候(比如在隧道里面)如何决策等。 24 | 25 | 12. 主动与被动安全(Active andPassive Safety):主要涉及如何通过不同传感器的感知去确保无人驾驶以及行人安全,比较有趣的一些研究包括通过对 CAN 总线的异常检测去评估车辆的安全性,通过对停车场的视频监控去训练自动泊车模型等。 26 | 27 | 13. 无人车与交通的交互(AutonomousVehicles:Interaction with Traffic):主要研究无人车如何与现有的交通生态共存,特别是传统车与无人车的共存。比较有趣的一些研究包括 V2X 虚拟交通标志,通过视觉去评估旁边车道司机的驾驶行为等。 28 | 29 | 14. 视觉定位(SLAM and VisualOdometry):主要研究如何用视觉与激光雷达进行实时定位与建图。比较有趣的一些研究包括视觉的线上校准,使用车道线进行实时定位与导航等。 30 | 31 | 15. 环境学习与建图(Mapping andLearning the Environment):主要研究如何建立精准的环境信息图。比较有趣的一些研究包括使用低空无人机去创建给无人驾驶使用的地图,以及通过停车场监控摄像头建立辅助自动泊车的地图等。 32 | -------------------------------------------------------------------------------- /_sidebar.md: -------------------------------------------------------------------------------- 1 | - [1 01~数理统计](/01~数理统计/README.md) 2 | 3 | - [2 02~机器学习](/02~机器学习/README.md) 4 | 5 | - [3 03~深度学习](/03~深度学习/README.md) 6 | 7 | - [4 04~自然语言处理](/04~自然语言处理/README.md) 8 | 9 | - [5 05~工具与工程化](/05~工具与工程化/README.md) 10 | 11 | - [6 06~行业应用 [4]](/06~行业应用/README.md) 12 | - [6.1 RPA](/06~行业应用/RPA/README.md) 13 | 14 | - [6.2 推荐系统](/06~行业应用/推荐系统/README.md) 15 | 16 | - 6.3 无人驾驶 [1] 17 | - [6.3.1 概述](/06~行业应用/无人驾驶/概述.md) 18 | - [6.4 社会网络 [1]](/06~行业应用/社会网络/README.md) 19 | - [6.4.1 社团发现](/06~行业应用/社会网络/社团发现/README.md) 20 | 21 | - 7 99~参考资料 [6] 22 | - 7.1 AI 研报 [1] 23 | - [7.1.1 2021~中金~AI 十年展望 [3]](/99~参考资料/AI%20研报/2021~中金~AI%20十年展望/README.md) 24 | - [7.1.1.1 01~底层模拟人脑,算力决定上限](/99~参考资料/AI%20研报/2021~中金~AI%20十年展望/01~底层模拟人脑,算力决定上限.md) 25 | - [7.1.1.2 02~边际成本决定竞争力,算法龙头主导格局优化](/99~参考资料/AI%20研报/2021~中金~AI%20十年展望/02~边际成本决定竞争力,算法龙头主导格局优化.md) 26 | - [7.1.1.3 03~AI 视角下的自动驾驶行业全解析](/99~参考资料/AI%20研报/2021~中金~AI%20十年展望/03~AI%20视角下的自动驾驶行业全解析.md) 27 | - [7.2 DeepLearning Specialization [1]](/99~参考资料/DeepLearning-Specialization/README.md) 28 | - 7.2.1 NeuralNetworks And DeepLearning [1] 29 | - 7.2.1.1 Week1 [6] 30 | - [7.2.1.1.1 二元分类与 Logistic 回归](/99~参考资料/DeepLearning-Specialization/NeuralNetworks-And-DeepLearning/Week1/二元分类与%20Logistic%20回归.md) 31 | - [7.2.1.1.2 基于 Logistic 回归的图像分类实践](/99~参考资料/DeepLearning-Specialization/NeuralNetworks-And-DeepLearning/Week1/基于%20Logistic%20回归的图像分类实践.md) 32 | - [7.2.1.1.3 基于 Numpy 的 Python 向量操作](/99~参考资料/DeepLearning-Specialization/NeuralNetworks-And-DeepLearning/Week1/基于%20Numpy%20的%20Python%20向量操作.md) 33 | - [7.2.1.1.4 梯度下降与向量化操作](/99~参考资料/DeepLearning-Specialization/NeuralNetworks-And-DeepLearning/Week1/梯度下降与向量化操作.md) 34 | - [7.2.1.1.5 浅层神经网络](/99~参考资料/DeepLearning-Specialization/NeuralNetworks-And-DeepLearning/Week1/浅层神经网络.md) 35 | - [7.2.1.1.6 神经网络、有监督学习与深度学习](/99~参考资料/DeepLearning-Specialization/NeuralNetworks-And-DeepLearning/Week1/神经网络、有监督学习与深度学习.md) 36 | - [7.3 FastAI](/99~参考资料/FastAI/README.md) 37 | 38 | - [7.4 TensorFlow in Practice](/99~参考资料/TensorFlow-in-Practice/README.md) 39 | 40 | - [7.5 d2l.ai](/99~参考资料/d2l.ai/README.md) 41 | 42 | - [7.6 周志华 机器学习](/99~参考资料/周志华-机器学习/README.md) 43 | -------------------------------------------------------------------------------- /99~参考资料/DeepLearning-Specialization/NeuralNetworks-And-DeepLearning/Week1/神经网络、有监督学习与深度学习.md: -------------------------------------------------------------------------------- 1 | # 神经网络、有监督学习与深度学习 2 | 3 | 深度学习正在逐步地改变世界,从网络搜索、广告推荐这样传统的互联网业务到健康医疗、自动驾驶等不同的行业领域。百年前的电气革命为社会带来了新的支柱产业,而如今 AI 正是新时代的电力基础,驱动社会技术的快速发展。本课程的第一部分,关注于如何构建包括深度神经网络在内的神经网络以及如何利用数据训练该网络,课程的最后将会构建出可识别动物的深度神经网络。本课程的第二部分将会讲解更多有关于深度学习实践的相关资料,包括超参数调优、正则化、如何从 Momentum Armrest Prop 以及 AD Authorization 
等算法中选择合适的优化算法。第三部分将会学习如何自定义机器学习项目,如何进行数据预处理、将数据应用到模型训练、提取交叉校验的训练集与验证集等内容。第四部分将会着眼于卷积神经网络 CNN,如何构建经典的 CNN 模型。在最后的第五部分,将会学习构建序列模型(Seq2Seq 等)以解决自然语言处理相关的任务;典型的序列模型包括了 RNNs、LSTM 等。 4 | 5 | # 神经网络 6 | 7 | 深度学习往往关联于大规模神经网络的训练,本章我们即来了解下何谓神经网络。以经典的房屋价格预测为例,假设我们拥有六组房屋数据,每组包含了房屋的面积以及价格;我们希望寻找合适的函数来根据房屋的尺寸预测房屋价格。如果我们采用线性回归来解决这个问题,我们会画出一条 `y = kx + b` 这样的函数线,其形式如下黑色线所示: 8 | 9 | 我们知道房屋的价格不可能为负数,因此我们可以将使用 ReLU(Rectified Linear Unit)函数来描述尺寸与价格之间的关系,如上图蓝色线型所示。我们可以将该问题抽象为输入为房间的尺寸 x,输出为房间的价格 y,而某个神经元即为接受输入并且进行合适的运算之后输出目标值的函数: 10 | 11 | 如上图所示即是最简单的单元神经网络,而复杂的神经网络即是由无数的神经元分层连接堆叠而成。譬如实际的房屋价格会由尺寸、卧室数目、所属区域(Zip Code)以及社区的富裕程度影响。我们理想的神经网络即是能够自动帮我们构建隐层(Hidden Units),即输入单元之间的关系以进行最好地预测: 12 | 13 | 给定输入之后,神经网络的任务之一即是为我们自动构建隐层;每个隐层单元都会输入输入层的全部特征作为输入值。 14 | 15 | # 有监督学习 16 | 17 | 神经网络的分类很多,不过截止到目前大多数的有价值的神经网络都还是基于机器学习中所谓的有监督学习(Supervised Learning)。在有监督学习中,我们的训练数据集中已知了特征与结果输出之间的对应关系,而目标就是寻找正确的输入与输出之间的关系表示。譬如目前最赚钱的深度学习应用之一,在线广告中就是输入有关于网站展示的信息以及部分用户的信息,神经网络会预测用户是否会点击该广告;通过为不同的用户展示他们最感兴趣的广告,来增加用户的实际点击率。下表即列举了几种常见的领域应用与它们的输入输出: 18 | 19 | 计算机视觉近年来也发展迅速,典型的应用之一即是图片标注;我们可能随机输入一张图片来寻找与它最相近的图片。语音识别则是能够将用户输入的语音数据转化为文字表述;机器翻译则是能将不同语言间的语句进行自由转化,譬如将某个英文段落转化为对应的中文表述。而在自动驾驶中,我们可能会输入某张从雷达中获取的车前图片,来判断路上其他车的相对位置。而对于这些不同的行业领域我们也需要应用不同类型的神经网络,譬如对上文提及的房价预测,我们就可以使用标准的神经网络;而对于图片应用则会优先考虑使用卷积神经网络(CNN)。 20 | 21 | 而对于序列数据,譬如随时间播放的音频流,其可以表示为一维的时间序列,我们通常会使用 RNN 来处理这个类型的数据。而在文本处理中,我们也常常将文本表示为字符序列,因此也往往会使用 RNN 来处理这个类型的数据。对于自动驾驶这样更为复杂的应用,我们可能会需要同时处理图片、文本等多种类别的数据,因此会使用混合网络架构。 22 | 23 | 模型训练中我们常见的另一组概念就是结构化数据与非结构化数据,结构化数据有点类似于关系型数据库中存放的数据;譬如上面介绍的房屋价格预测中,我们会拥有包含了尺寸、卧室数等列的数据表,这种形式的就是所谓结构化数据。结构化数据中每个特征,譬如房间尺寸、卧室数目、用户年龄等都有可解释的意义;而非结构化数据的典型代表,语音、文本或者图片,往往会使用像素值或者单个词作为特征向量的组成,这些特征值往往很难有实际意义的解释。人类经过长时间的进化之后能够较好地辨析非结构化数据,而利用深度学习技术,现在机器也在不断提升自身对于非结构化数据的辨析能力。 24 | 25 | # 深度学习 26 | 27 | 深度学习背后的理论基础与技术概念已经出现了有数十年,本部分我们即来讨论为何直到近些年深度学习才得到了爆炸性的发展。我们可以用下图来描述数据集的大小与算法性能(准确率、推准率等)之间的关系: 28 | 29 | 对于支持向量机、Logistics 回归这样经典的机器学习算法而言,在数据量从零递增的初始阶段,其性能会不断提升;不过很快就会触碰到天花板,此时性能很难再随着数据集的增长而提升。而伴随着移动互联网时代的到来,我们能够从网站、移动应用或者其他安装在电子终端设备上的传感器中获取到海量的数据;这些数据在开启大数据时代的同时也为深度学习的发展提供了坚实的基础。我们在上图中也可以看出,越是大型的神经网络随着数据量的增加,其性能提升的越快,并且其性能天花板也是越高。 30 | 31 | 深度学习崛起的另一个重要基石就是计算能力的提升,这里不仅指新一代 CPU 或者 GPU 设备,还有是在许多基础优化算法上的革新,都使得我们能够更快地训练出神经网络。譬如早期我们会使用 Sigmod 函数作为神经网络的激活函数,随着 x 的增大其梯度会逐渐趋近于零,这就导致了模型收敛变得相对缓慢;而 ReLU 则能较好地避免这个问题,其在正无穷大时梯度值依然保持恒定。简单地从 Sigmod 函数迁移到 ReLU 即能够为模型训练带来极大的效率提升,这也方便了我们构建出更复杂的神经网络。 32 | -------------------------------------------------------------------------------- /99~参考资料/DeepLearning-Specialization/NeuralNetworks-And-DeepLearning/Week1/二元分类与 Logistic 回归.md: -------------------------------------------------------------------------------- 1 | # 二元分类与 Logistic 回归 2 | 3 | 本部分将会介绍神经网格构建与训练的基础知识;一般来说,网络的计算过程由正向传播(Forward Propagation)与反向传播(Back Propagation)两部分组成。这里我们将会以简单的 Logistic 回归为例,讲解如何解决常见的二元分类(Binary Classification)问题。这里我们将会尝试训练出简单的神经网络以自动识别某个图片是否为猫,为猫则输出 1,否则输出 0。计算机中的图片往往表示为红、绿、蓝三个通道的像素值;如果我们的图像是 64 _ 64 像素值大小,我们的单张图片的特征维度即为 64 _ 64 \* 3 = 12288,即可以使用 $n_x = 12288$ 来表示特征向量的维度。 4 | 5 | # 深度学习的标准术语约定 6 | 7 | ## 神经网络的符号 8 | 9 | 上标 $^{(i)}$ 表示第 $i$ 个训练用例,而上标 $^{[l]}$ 则表示第 $l$ 层。 10 | 11 | ### 尺寸 12 | 13 | - $m$:数据集中用例的数目。 14 | - $n_x$:输入的单个用例的特征向量维度。 15 | - $n_y$:输出的维度(待分类的数目)。 16 | - $n_h^{[l]}$:第 $l$ 个隐层的单元数目,在循环中,我们可能定义 $n_x = n_h^{[0]}$ 以及 $n_y = n_h^{number \, of \, layers + 1}$。 17 | - $L$:神经网络的层数。 18 | 19 | ### 对象 20 | 21 | - $X \in R^{n_x \times m}$:输入的矩阵,即包含了 $m$ 个用例,每个用例的特征向量维度为 $n_x$。 22 | - $x^{(i)} \in R^{n_x}$:第 $i$ 个用例的特征向量,表示为列向量。 23 | - $Y \in R^{n_y \times m}$:标签矩阵。 24 | - $y^{(i)} \in R^{n_y}$:第 $i$ 个用例的输出标签。 25 | - $W^{[l]} \in R^{number \, of \, units \, in 
\, next \, layer \times number \, of \, unites \, in \, the \, previous \, layer}$:第 $l$ 层与第 $l+1$ 层之间的权重矩阵,在简单的二元分类且仅有输入层与输出层的情况下,其维度就是 $ 1 \times n_x$。 26 | - $b^{[l]} \in R^{number \, of \, units \, in \, next \, layer}$:第 $l$ 层的偏差矩阵。 27 | - $\hat{y} \in R^{n_y}$:输出的预测向量,也可以表示为 $a^{[L]}$,其中 $L$ 表示网络中的总层数。 28 | 29 | ### 通用前向传播等式 30 | 31 | - $ a = g^{[l]}(W_xx^{(i)} + b_1) = g^{[l]}(z_1) $,其中 $g^{[l]}$ 表示第 $l$ 层的激活函数。 32 | - $\hat{y}^{(i)} = softmax(W_hh + b_2)$。 33 | - 通用激活公式:$a*j^{[l]} = g^{[l]}(\sum_kw*{jk}^{[l]}a_k^{[l-1]} + b_j^{[l]}) = g^{[l]}(z_j^{[l]})$。 34 | - $J(x, W, b, y)$ 或者 $J(\hat{y}, y)$ 表示损失函数。 35 | 36 | ### 损失函数 37 | 38 | - $J*{CE(\hat{y},y)} = - \sum*{i=0}^m y^{(i)}log \hat{y}^{(i)}$ 39 | - $J*{1(\hat{y},y)} = \sum*{i=0}^m | y^{(i)} - \hat{y}^{(i)} |$ 40 | 41 | ## 深度学习的表示 42 | 43 | 在深度学习中,使用结点代表输入、激活函数或者数据,边代表权重或者偏差,下图即是两个典型的神经网络: 44 | 45 | # Logistic 回归 46 | 47 | ## 基础模型 48 | 49 | 在猫咪识别问题中,我们给定了未知的图片,可以将其表示为 $X \in R^{n_x}$ 的特征向量;我们的任务就是寻找合适的算法,来根据特征向量推导出该图片是猫咪的概率。在上面的介绍中我们假设了 Logistic 函数的参数为 $w \in R^{n_x} $ 以及 $b \in R$,则输出的计算公式可以表示为: 50 | 51 | $$ 52 | \hat{y} = \sigma(w^Tx + b) 53 | $$ 54 | 55 | 这里的 $\sigma$ 表示 Sigmoid 函数,该函数的表达式与线型如下: 56 | 57 | ![](https://upload.wikimedia.org/wikipedia/commons/thumb/5/53/Sigmoid-function-2.svg/2000px-Sigmoid-function-2.svg.png) 58 | 59 | 上图中可以发现,当 $t$ 非常大时,$e^{-t}$ 趋近于 0,整体的函数值趋近于 1;反之,如果 $t$ 非常小的时候,整体的函数值趋近于 0。 60 | 61 | ## 损失函数与代价函数 62 | 63 | 我们的训练目标是在给定训练数据 $\{(x^{(1)}, y^{(1)}),...,(x^{(m)},y^{(m)})\}$ 的情况下使得 $\hat{y}^{(i)}$ 尽可能接近 $y^{(i)}$,而所谓的损失函数即是用于衡量预测结果与真实值之间的误差。最简单的损失函数定义方式为平方差损失: 64 | 65 | $$ 66 | L(\hat{y},y) = \frac{1}{2} (\hat{y} - y)^2 67 | $$ 68 | 69 | 不过 Logistic 回归中我们并不倾向于使用这样的损失函数,因为其对于梯度下降并不友好,很多情况下会陷入非凸状态而只能得到局部最优解。这里我们将会使用如下的损失函数: 70 | 71 | $$ 72 | L(\hat{y},y) = -(ylog\hat{y} + (1-y)log(1-\hat{y})) 73 | $$ 74 | 75 | 我们的优化目标是希望损失函数值越小越好,这里我们考虑两个极端情况,当 $y = 1$ 时,损失函数值为 $-log\hat{y}$;此时如果 $\hat{y} = 1$,则损失函数为 0。反之如果 $\hat{y} = 0$,则损失函数值趋近于无穷大。当 $y = 0$ 时,损失函数值为 $-log(1-\hat{y})$;如果 $\hat{y} = 1$,则损失函数值也变得无穷大。这样我们可以将 Logistic 回归中总的代价函数定义为: 76 | 77 | $$ 78 | J(w,b) = 79 | \frac{1}{m}\sum*{i=1}^mL(\hat{y}^{(i)} - y^{(i)}) = 80 | -\frac{1}{m} \sum*{i=1}^m [y^{(i)}log\hat{y}^{(i)} + (1-y^{(i)})log(1-\hat{y}^{(i)})] 81 | $$ 82 | 83 | 在深度学习的模型训练中我们常常会接触到损失函数(Loss Function)与代价函数(Cost Function)的概念,其中损失函数代指单个训练用例的错误程度,而代价函数往往是整个训练集中所有训练用例的损失函数值的平均。 84 | -------------------------------------------------------------------------------- /99~参考资料/DeepLearning-Specialization/NeuralNetworks-And-DeepLearning/Week1/梯度下降与向量化操作.md: -------------------------------------------------------------------------------- 1 | # 梯度下降与向量化操作 2 | 3 | 我们在前文[二元分类与 Logistic 回归](https://zhuanlan.zhihu.com/p/28530027)中建立了 Logistic 回归的预测公式: 4 | 5 | $$ 6 | \hat{y} = \sigma(w^Tx + b), \, \sigma(z) = \frac{1}{1+e^{-z}} 7 | $$ 8 | 9 | 整个训练集的损失函数为: 10 | 11 | $$ 12 | J(w,b) = 13 | \frac{1}{m}\sum*{i=1}^mL(\hat{y}^{(i)} - y^{(i)}) = \\ 14 | -\frac{1}{m} \sum*{i=1}^m [y^{(i)}log\hat{y}^{(i)} + (1-y^{(i)})log(1-\hat{y}^{(i)})] 15 | $$ 16 | 17 | 模型的训练目标即是寻找合适的 $w$ 与 $b$ 以最小化代价函数值;简单起见我们先假设 $w$ 与 $b$ 都是一维实数,那么可以得到如下的 $J$ 关于 $w$ 与 $b$ 的图: 18 | 19 | 上图所示的函数 $J$ 即是典型的凸函数,与非凸函数的区别在于其不含有多个局部最低点;选择这样的代价函数就保证了无论我们初始化模型参数如何,都能够寻找到合适的最优解。如果我们仅考虑对于 $w$ 参数进行更新,则可以得到如下的一维图形: 20 | 21 | 参数 $w$ 的更新公式为: 22 | 23 | $$ 24 | w := w - \alpha \frac{dJ(w)}{dw} 25 | $$ 26 | 27 | 其中 $\alpha$ 表示学习速率,即每次更新的 $w$ 的步伐长度;当 $w$ 大于最优解 $w'$ 时,导数大于 0;即 $\frac{dJ(w)}{dw}$ 的值大于 0,那么 $w$ 就会向更小的方向更新。反之当 $w$ 小于最优解 $w'$ 时,导数小于 0,那么 $w$ 就会向更大的方向更新。 28 | 29 | # 导数 30 | 31 | 
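这里先用一小段 Python 示例,把导数的数值近似与上一节的参数更新公式串联起来;示例中的目标函数 J(w) = (w - 3)^2、初始值与学习率均为笔者为演示而假设的,仅作示意:

```py
def J(w):
    return (w - 3.0) ** 2  # 假设的凸代价函数,最优解为 w = 3

def numerical_grad(f, w, eps=1e-6):
    # 用双侧差分 (f(w + eps) - f(w - eps)) / (2 * eps) 近似 dJ/dw
    return (f(w + eps) - f(w - eps)) / (2 * eps)

w, alpha = 0.0, 0.1  # 初始参数与学习速率均为假设值
for step in range(50):
    w -= alpha * numerical_grad(J, w)  # 即 w := w - alpha * dJ(w)/dw

print(w)  # 多次迭代后应收敛到最优解 3 附近
```

当 w 大于最优解时导数为正,更新会把 w 往小的方向推;反之则往大的方向推,这正对应上文对更新公式的直观解释。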
本部分是对于微积分中导数(Derivative)相关理论进行简单讲解,熟悉的同学可以跳过。 32 | 33 | 上图中,$a = 2$ 时,$f(a) = 6$;$a = 2.001$ 时,$f(a) = 6.003$,导数为 $\frac{6.003 - 6}{2.001 - 2} = 3$;在某个直线型函数中,其导数值是恒定不变的。我们继续考虑下述二次函数: 34 | 35 | 上图中,$a = 2$ 时,$f(a) = 4$;$a = 2.001$ 时,$f(a) \approx 4.004$,此处的导数即为 4。而当 $a = 5$ 时,此处的导数为 10;可以发现二次函数的导数值随着 $x$ 值的变化而变化。下表列举出了常见的导数: 36 | 37 | ![](http://durofy.com/wp-content/uploads/2012/10/basic_derivatives.jpg) 38 | 39 | 下表列举了常用的导数复合运算公式: 40 | 41 | # 计算图(Computation Graph) 42 | 43 | 神经网络中的计算即是由多个计算网络输出的前向传播与计算梯度的后向传播构成,我们可以将复杂的代价计算函数切割为多个子过程: 44 | 45 | $$ 46 | J(a, b, c) = 3 \times (a + bc) 47 | $$ 48 | 49 | 定义 $u = bc$ 以及 $v = a + u$ 和 $J = 3v$,那么整个计算图可以定义如下: 50 | 51 | 根据导数计算公式,我们可知: 52 | 53 | $$ 54 | \frac{dJ}{dv} = 3, \, 55 | \frac{dJ}{da} = \frac{dJ}{dv} \frac{dv}{da} = 3 56 | $$ 57 | 58 | 在复杂的计算图中,我们往往需要经过大量的中间计算才能得到最终输出值与原始参数的导数 $dvar = \frac{dFinalOutputVar}{d{var}'}$,这里的 $dvar$ 即表示了最终输出值相对于任意中间变量的导数。而所谓的反向传播(Back Propagation)即是当我们需要计算最终值相对于某个特征变量的导数时,我们需要利用计算图中上一步的结点定义。 59 | 60 | # Logistic 回归中的导数计算 61 | 62 | 我们在上文中讨论过 Logistic 回归的损失函数计算公式为: 63 | 64 | $$ 65 | z = w^Tx + b \\ 66 | \hat{y} = a = \sigma(z) \\ 67 | L(a,y) = -( ylog(a) + (1-y)log(1-a) ) 68 | $$ 69 | 70 | 这里我们假设输入的特征向量维度为 2,即输入参数共有 $x_1, w_1, x_2, w_2, b$ 这五个;可以推导出如下的计算图: 71 | 72 | 首先我们反向求出 $L$ 对于 $a$ 的导数: 73 | 74 | $$ 75 | da = \frac{dL(a,y)}{da} = -\frac{y}{a} + \frac{1-y}{1-a} 76 | $$ 77 | 78 | 然后继续反向求出 $L$ 对于 $z$ 的导数: 79 | 80 | $$ 81 | dz = \frac{dL}{dz} 82 | =\frac{dL(a,y)}{dz} 83 | = \frac{dL}{da} \frac{da}{dz} 84 | = a-y 85 | $$ 86 | 87 | 依次类推求出最终的损失函数相较于原始参数的导数之后,根据如下公式进行参数更新: 88 | 89 | $$ 90 | w_1 := w_1 - \alpha dw_1 \\ 91 | w_2 := w_2 - \alpha dw_2 \\ 92 | 93 | b := b - \alpha db 94 | $$ 95 | 96 | 接下来我们需要将对于单个用例的损失函数扩展到整个训练集的代价函数: 97 | 98 | $$ 99 | J(w,b) = \frac{1}{m} \sum*{i=1}^m L(a^{(i)},y) \\ 100 | a^{(i)} = \hat{y}^{(i)} = \sigma(z^{(i)}) = \sigma(w^Tx^{(i)} + b) 101 | $$ 102 | 103 | 我们可以对于某个权重参数 $w_1$,其导数计算为: 104 | 105 | $$ 106 | \frac{\partial J(w,b)}{\partial w_1} = \frac{1}{m} \sum*{i=1}^m \frac{\partial}{\partial w_1}L(a^{(i)},y^{(i)}) 107 | $$ 108 | 109 | 完整的 Logistic 回归中某次训练的流程如下,这里仅假设特征向量的维度为 2: 110 | 111 | # 向量化操作 112 | 113 | 在上述的 $m$ 个训练用例的 Logistic 回归中,每次训练我们需要进行两层循环,外层循环遍历所有的特征,内层循环遍历所有的训练用例;如果特征向量的维度或者训练用例非常多时,多层循环无疑会大大降低运行效率,因此我们使用向量化(Vectorization)操作来进行实际的训练。我们首先来讨论下何谓向量化操作。在 Logistic 回归中,我们需要计算 $z = w^Tx + b$,如果是非向量化的循环方式操作,我们可能会写出如下的代码: 114 | 115 | ``` 116 | z = 0; 117 | for i in range(n_x): 118 | z += w[i] * x[i] 119 | 120 | z += b 121 | ``` 122 | 123 | 而如果是向量化的操作,我们的代码则会简洁很多: 124 | 125 | ``` 126 | z = np.dot(w, x) + b 127 | ``` 128 | 129 | 在[未来的章节](https://parg.co/bjz)中我们会实际比较循环操作与向量化操作二者的性能差异,可以发现向量化操作能够带来近百倍的性能提升;目前无论 GPU 还是 CPU 环境都内置了并行指令集,SIMD(Single Instruction Multiple Data),因此无论何时我们都应该尽可能避免使用显式的循环。Numpy 还为我们提供了很多便捷的向量转化操作,譬如 `np.exp(v)` 用于进行指数计算,`np.log(v)` 用于进行对数计算,`np.abs(v)` 用于进行绝对值计算。 130 | 131 | 下面我们将上述的 Logistic 回归流程转化为向量化操作,其中输入数据可以变为 $n*x \times m$ 的矩阵,即共有 $m$ 个训练用例,每个用例的维度为 $n_x$: 132 | 133 | $$ 134 | Z = np.dot(W^TX) + b \\ 135 | A = [a^{(1)},a^{(2)},...,a^{(m)}] = \sigma(z) 136 | $$ 137 | 138 | 我们可以得到各个变量梯度计算公式为: 139 | 140 | $$ 141 | dZ = A - Y = [a^{(1)} y^{(1)}...] \\ 142 | db = \frac{1}{m}\sum*{i=1}^mdz^{(i)}=\frac{1}{m}np.sum(dZ) \\ 143 | dW = \frac{1}{m} X dZ^{T}= 144 | \frac{1}{m} 145 | \begin{bmatrix} 146 | \vdots \\ 147 | 148 | x^{(i1)} ... 
x^{(im)} \\ 149 | 150 | \vdots \\ 151 | \end{bmatrix} 152 | \begin{bmatrix} 153 | \vdots \\ 154 | 155 | dz^{(i)} \\ 156 | 157 | \vdots \\ 158 | \end{bmatrix} \\ 159 | = \frac{1}{m} 160 | \begin{bmatrix} 161 | \vdots \\ 162 | 163 | x^{(1)}dz^{(1)} + ... + x^{(m)}dz^{(m)} \\ 164 | 165 | \vdots \\ 166 | \end{bmatrix} \\ 167 | $$ 168 | 169 | # 延伸阅读 170 | 171 | - [机器学习、深度学习与自然语言处理领域推荐的书籍列表](https://zhuanlan.zhihu.com/p/25612011) 172 | 173 | - [Andrew NG 深度学习课程笔记:神经网络、有监督学习与深度学习](https://zhuanlan.zhihu.com/p/28488349) 174 | 175 | - [基于 Python 的简单自然语言处理实践](https://zhuanlan.zhihu.com/p/26249110) 176 | -------------------------------------------------------------------------------- /index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | AIDL Series 7 | 8 | 9 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 34 | 38 | 40 | 45 |
46 | 63 | 96 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 142 | 143 | 144 | 145 | 154 | 155 | 156 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Contributors][contributors-shield]][contributors-url] 2 | [![Forks][forks-shield]][forks-url] 3 | [![Stargazers][stars-shield]][stars-url] 4 | [![Issues][issues-shield]][issues-url] 5 | [![license: CC BY-NC-SA 4.0](https://ngte-superbed.oss-cn-beijing.aliyuncs.com/item/license-CC%20BY--NC--SA%204.0-lightgrey.svg)][license-url] 6 | 7 | 8 |
9 |

10 | 11 | Logo 12 | Logo 13 | 14 | 15 |

16 | 在线阅读 >> 17 |
18 |
19 | 代码案例 20 | · 21 | 参考资料 22 | 23 |

24 |

25 | 26 | 27 | 28 | # AI Series | 人工智能与深度学习实战 29 | 30 | 在本系列中,你可能会接触到数据挖掘、机器学习、深度学习、自然语言处理、人工智能等很多的概念。值得说明的是,鉴于很多的算法实现是以文字结合代码介绍为最佳,因此工具实践篇中的大部分内容是以 Juypter Notebook 的形式放在该仓库中,并且在 Colab 中进行浏览、编辑与实验。 31 | 32 | ![default](https://ngte-superbed.oss-cn-beijing.aliyuncs.com/item/611143c55132923bf8ccc10d.png) 33 | 34 | # Nav | 导航 35 | 36 | 本系列目前分为[机器学习](https://github.com/wx-chevalier/MachineLearning-Notes)、[自然语言处理](https://github.com/wx-chevalier/NLP-Notes)等部分。如果您是新人,建议从[机器学习](./机器学习)篇开始阅读,也可以在[数理统计](https://github.com/wx-chevalier/Mathematics-Notes)篇中夯实理论基础。如果想了解更多实践操作,建议阅读[深度学习](https://github.com/wx-chevalier/DeepLearning-Notes)、[自然语言处理](./自然语言处理)、[工具与工程化](https://github.com/wx-chevalier/AI-Toolkits-Notes)([TensorFlow Series](https://github.com/wx-chevalier/TensorFlow-Notes))等部分。 37 | 38 | # Preface | 前言 39 | 40 | 1956 年,几个计算机科学家相聚在达特茅斯会议,提出了“人工智能”的概念,梦想着用当时刚刚出现的计算机来构造复杂的、拥有与人类智慧同样本质特性的机器。2012 年以后,得益于数据量的上涨、运算力的提升和机器学习新算法(深度学习)的出现,人工智能开始大爆发。 41 | 42 | ![AI 算法大分类](https://ngte-superbed.oss-cn-beijing.aliyuncs.com/item/20221225150539.png) 43 | 44 | 机器学习是一种实现人工智能的方法,机器学习最基本的做法,是使用算法来解析数据、从中学习,然后对真实世界中的事件做出决策和预测。与传统的为解决特定任务、硬编码的软件程序不同,机器学习是用大量的数据来“训练”,通过各种算法从数据中学习如何完成任务。机器学习直接来源于早期的人工智能领域,传统的模型算法包括决策树、聚类、贝叶斯分类、支持向量机、EM、Adaboost 等等。从任务类型上来分,机器学习算法可以分为监督学习(如分类问题)、无监督学习(如聚类问题)、半监督学习、集成学习和强化学习的等。 45 | 46 | - 弱人工智能 Artificial Narrow Intelligence (ANI): 弱人工智能是擅长于单个方面的人工智能。比如有能战胜象棋世界冠军的人工智能,但是它只会下象棋,你要问它怎样更好地在硬盘上储存数据,它就不知道怎么回答你了。 47 | 48 | - 强人工智能 Artificial General Intelligence (AGI): 人类级别的人工智能。强人工智能是指在各方面都能和人类比肩的人工智能,人类能干的脑力活它都能干。创造强人工智能比创造弱人工智能难得多,我们现在还做不到。Linda Gottfredson 教授把智能定义为“一种宽泛的心理能力,能够进行思考、计划、解决问题、抽象思维、理解复杂理念、快速学习和从经验中学习等操作。”强人工智能在进行这些操作时应该和人类一样得心应手。 49 | 50 | - 超人工智能 Artificial Superintelligence (ASI): 牛津哲学家,知名人工智能思想家 Nick Bostrom 把超级智能定义为“在几乎所有领域都比最聪明的人类大脑都聪明很多,包括科学创新、通识和社交技能。”超人工智能可以是各方面都比人类强一点,也可以是各方面都比人类强万亿倍的。超人工智能也正是为什么人工智能这个话题这么火热的缘故。 51 | 52 | ## 人工智能与数据科学 53 | 54 | ![人工智能 Venn 图](https://ngte-superbed.oss-cn-beijing.aliyuncs.com/item/waPjrd.png) 55 | 56 | ## 人工智能与深度学习 57 | 58 | 传统的机器学习算法在指纹识别、基于 Haar 的人脸检测、基于 HoG 特征的物体检测等领域的应用基本达到了商业化的要求或者特定场景的商业化水平,但每前进一步都异常艰难,直到深度学习算法的出现。 59 | 60 | ![人工智能与深度学习的关系](https://ngte-superbed.oss-cn-beijing.aliyuncs.com/item/KBeQG4.png) 61 | 62 | 深度学习是一种实现机器学习的技术,深度学习本来并不是一种独立的学习方法,其本身也会用到有监督和无监督的学习方法来训练深度神经网络。最初的深度学习是利用深度神经网络来解决特征表达的一种学习过程。深度神经网络本身并不是一个全新的概念,可大致理解为包含多个隐含层的神经网络结构。为了提高深层神经网络的训练效果,人们对神经元的连接方法和激活函数等方面做出相应的调整。其实有不少想法早年间也曾有过,但由于当时训练数据量不足、计算能力落后,因此最终的效果不尽如人意。深度学习摧枯拉朽般地实现了各种任务,使得似乎所有的机器辅助功能都变为可能。无人驾驶汽车,预防性医疗保健,甚至是更好的电影推荐,都近在眼前,或者即将实现。 63 | 64 | ## 人工智能的发展阶段 65 | 66 | - 第一阶段(20 世纪 50 年代中期到 80 年代初期):深耕细作,30 年技术发展为人工智能产业化奠定基础。在 1956 年之前,人工智能就已经开始孕育。神经元模型、图灵测试的提出以及 SNARC 神经网络计算机的发明,为人工智能的诞生奠定了基础。1956 年的达特茅斯会议代表人工智能正式诞生和兴起。此后人工智能快速发展,深度学习模型以及 AlphaGo 增强学习的雏形——感知器均在这个阶段得以发明。随后由于早期的系统适用于更宽的问题选择和更难的问题时效果均不理想,因此美国、英国相继缩减经费支持,人工智能进入低谷。 67 | 68 | - 第二阶段(20 世纪 80 年代初期至 21 世纪初期):急功近利,人工智能成功商用但跨越式发展失败。80 年代初期,人工智能逐渐成为产业,第一个成功的商用专家系统 R1 为 DEC 公司每年节约 4000 万美元左右的费用。截止到 20 世纪 80 年代末,几乎一半的“财富 500 强”都在开发或使用“专家系统”。受此鼓励,日本、美国等国家投入巨资开发第 5 代计算机——人工智能计算机。在 90 年代初,IBM、苹果推出的台式机进入普通百姓家庭中,奠定了计算机工业的发展方向。第 5 代计算机由于技术路线明显背离计算机工业的发展方向,项目宣告失败,人工智能再一次进入低谷。尽管如此,浅层学习如支持向量机、Boosting 和最大熵方法等在 90 年代得到了广泛应用。 69 | 70 | - 第三阶段(21 世纪初期至今):量变产生质变,人工智能有望实现规模化应用。摩尔定律和云计算带来的计算能力的提升,以及互联网和大数据广泛应用带来的海量数据量的积累,使得深度学习算法在各行业得到快速应用,并推动语音识别、图像识别等技术快速发展并迅速产业化。2006 年,Geoffrey Hinton 和他的学生在《Science》上提出基于深度信念网络(Deep Belief Networks,DBN)可使用非监督学习的训练算法,使得深度学习在学术界持续升温。2012 年,DNN 技术在图像识别领域的应用使得 Hinton 的学生在 ImageNet 评测中取得了非常好的成绩。深度学习算法的应用使得语音识别、图像识别技术取得了突破性进展,围绕语音、图像、机器人、自动驾驶等人工智能技术的创新创业大量涌现,人工智能迅速进入发展热潮。 
71 | 72 | 不同的模型、策略、算法的搭配,不断地推动着人工智能的发展,其又可以被分为三个阶段:计算智能、感知智能和认知智能。 73 | 74 | - 第一阶段的计算智能即快速计算和记忆存储,像机器人战胜围棋大师,靠的就是超强的记忆能力和运算速度。人脑的逻辑能力再强大,也敌不过人工智能每天和自己下几百盘棋,通过强大的计算能力对十几步后的结果做出预测,从这一角度来说,人工智能多次战败世界级围棋选手,足以证明这一领域发展之成熟。 75 | 76 | - 第二阶段的感知智能,即让机器拥有视觉、听觉、触觉等感知能力。自动驾驶汽车做的就是这一方面的研究,使机器通过传感器对周围的环境进行感知和处理,从而实现自动驾驶。感知智能方面的技术目前发展比较成熟的领域有语音识别和图像识别,比如做安全领域人脸识别技术的 Face++,以成熟的计算机视觉技术深耕电商、短视频等领域的 Yi+,能够对多种语言进行准确识别翻译的科大讯飞等。 77 | 78 | - 第三阶段的认知智能就是让机器拥有自己的认知,能理解会思考。认知智能是目前机器和人差距最大的领域,因为这不仅涉及逻辑和技术,还涉及心理学、哲学和语言学等学科。 79 | 80 | # About 81 | 82 | ## Copyright & More | 延伸阅读 83 | 84 | ![License: CC BY-NC-SA 4.0](https://img.shields.io/badge/License-CC%20BY--NC--SA%204.0-lightgrey.svg) ![](https://ngte-superbed.oss-cn-beijing.aliyuncs.com/item/bDm) 85 | 86 | 笔者所有文章遵循[知识共享 署名-非商业性使用-禁止演绎 4.0 国际许可协议](https://creativecommons.org/licenses/by-nc-nd/4.0/deed.zh),欢迎转载,尊重版权。 87 | 88 | ![技术视野](https://ngte-superbed.oss-cn-beijing.aliyuncs.com/item/yTSKdH.png) 89 | 90 | 您还可以前往 [NGTE Books](https://ng-tech.icu/books-gallery/) 主页浏览包含知识体系、编程语言、软件工程、模式与架构、Web 与大前端、服务端开发实践与工程架构、分布式基础架构、人工智能与深度学习、产品运营与创业等多类目的书籍列表: 91 | 92 | ![NGTE Books](https://ngte-superbed.oss-cn-beijing.aliyuncs.com/item/19uXtI.png) 93 | 94 | ## Links 95 | 96 | - https://github.com/KeKe-Li/tutorial 97 | - https://github.com/fengdu78/Data-Science-Notes/blob/master/README.md 98 | 99 | 100 | 101 | 102 | [contributors-shield]: https://img.shields.io/github/contributors/wx-chevalier/AI-Notes.svg?style=flat-square 103 | [contributors-url]: https://github.com/wx-chevalier/AI-Notes/graphs/contributors 104 | [forks-shield]: https://img.shields.io/github/forks/wx-chevalier/AI-Notes.svg?style=flat-square 105 | [forks-url]: https://github.com/wx-chevalier/AI-Notes/network/members 106 | [stars-shield]: https://img.shields.io/github/stars/wx-chevalier/AI-Notes.svg?style=flat-square 107 | [stars-url]: https://github.com/wx-chevalier/AI-Notes/stargazers 108 | [issues-shield]: https://img.shields.io/github/issues/wx-chevalier/AI-Notes.svg?style=flat-square 109 | [issues-url]: https://github.com/wx-chevalier/AI-Notes/issues 110 | [license-shield]: https://img.shields.io/github/license/wx-chevalier/AI-Notes.svg?style=flat-square 111 | [license-url]: https://github.com/wx-chevalier/AI-Notes/blob/master/LICENSE.txt 112 | -------------------------------------------------------------------------------- /header.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 | 101 | 366 |
367 |

368 | AI Series by 王下邀月熊 369 |

370 |

371 | 人工智能与深度学习实战 372 |

373 |
374 | 375 |
376 |
377 |
378 |
379 |
380 |
381 |
382 |
383 |
384 |
385 |
386 |
387 |
388 |
389 |
390 | 391 | 392 |
393 |
394 |
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International 2 | Public License 3 | 4 | By exercising the Licensed Rights (defined below), You accept and agree 5 | to be bound by the terms and conditions of this Creative Commons 6 | Attribution-NonCommercial-ShareAlike 4.0 International Public License 7 | ("Public License"). To the extent this Public License may be 8 | interpreted as a contract, You are granted the Licensed Rights in 9 | consideration of Your acceptance of these terms and conditions, and the 10 | Licensor grants You such rights in consideration of benefits the 11 | Licensor receives from making the Licensed Material available under 12 | these terms and conditions. 13 | 14 | 15 | Section 1 -- Definitions. 16 | 17 | a. Adapted Material means material subject to Copyright and Similar 18 | Rights that is derived from or based upon the Licensed Material 19 | and in which the Licensed Material is translated, altered, 20 | arranged, transformed, or otherwise modified in a manner requiring 21 | permission under the Copyright and Similar Rights held by the 22 | Licensor. For purposes of this Public License, where the Licensed 23 | Material is a musical work, performance, or sound recording, 24 | Adapted Material is always produced where the Licensed Material is 25 | synched in timed relation with a moving image. 26 | 27 | b. Adapter's License means the license You apply to Your Copyright 28 | and Similar Rights in Your contributions to Adapted Material in 29 | accordance with the terms and conditions of this Public License. 30 | 31 | c. BY-NC-SA Compatible License means a license listed at 32 | creativecommons.org/compatiblelicenses, approved by Creative 33 | Commons as essentially the equivalent of this Public License. 34 | 35 | d. Copyright and Similar Rights means copyright and/or similar rights 36 | closely related to copyright including, without limitation, 37 | performance, broadcast, sound recording, and Sui Generis Database 38 | Rights, without regard to how the rights are labeled or 39 | categorized. For purposes of this Public License, the rights 40 | specified in Section 2(b)(1)-(2) are not Copyright and Similar 41 | Rights. 42 | 43 | e. Effective Technological Measures means those measures that, in the 44 | absence of proper authority, may not be circumvented under laws 45 | fulfilling obligations under Article 11 of the WIPO Copyright 46 | Treaty adopted on December 20, 1996, and/or similar international 47 | agreements. 48 | 49 | f. Exceptions and Limitations means fair use, fair dealing, and/or 50 | any other exception or limitation to Copyright and Similar Rights 51 | that applies to Your use of the Licensed Material. 52 | 53 | g. License Elements means the license attributes listed in the name 54 | of a Creative Commons Public License. The License Elements of this 55 | Public License are Attribution, NonCommercial, and ShareAlike. 56 | 57 | h. Licensed Material means the artistic or literary work, database, 58 | or other material to which the Licensor applied this Public 59 | License. 60 | 61 | i. Licensed Rights means the rights granted to You subject to the 62 | terms and conditions of this Public License, which are limited to 63 | all Copyright and Similar Rights that apply to Your use of the 64 | Licensed Material and that the Licensor has authority to license. 65 | 66 | j. 
Licensor means the individual(s) or entity(ies) granting rights 67 | under this Public License. 68 | 69 | k. NonCommercial means not primarily intended for or directed towards 70 | commercial advantage or monetary compensation. For purposes of 71 | this Public License, the exchange of the Licensed Material for 72 | other material subject to Copyright and Similar Rights by digital 73 | file-sharing or similar means is NonCommercial provided there is 74 | no payment of monetary compensation in connection with the 75 | exchange. 76 | 77 | l. Share means to provide material to the public by any means or 78 | process that requires permission under the Licensed Rights, such 79 | as reproduction, public display, public performance, distribution, 80 | dissemination, communication, or importation, and to make material 81 | available to the public including in ways that members of the 82 | public may access the material from a place and at a time 83 | individually chosen by them. 84 | 85 | m. Sui Generis Database Rights means rights other than copyright 86 | resulting from Directive 96/9/EC of the European Parliament and of 87 | the Council of 11 March 1996 on the legal protection of databases, 88 | as amended and/or succeeded, as well as other essentially 89 | equivalent rights anywhere in the world. 90 | 91 | n. You means the individual or entity exercising the Licensed Rights 92 | under this Public License. Your has a corresponding meaning. 93 | 94 | 95 | Section 2 -- Scope. 96 | 97 | a. License grant. 98 | 99 | 1. Subject to the terms and conditions of this Public License, 100 | the Licensor hereby grants You a worldwide, royalty-free, 101 | non-sublicensable, non-exclusive, irrevocable license to 102 | exercise the Licensed Rights in the Licensed Material to: 103 | 104 | a. reproduce and Share the Licensed Material, in whole or 105 | in part, for NonCommercial purposes only; and 106 | 107 | b. produce, reproduce, and Share Adapted Material for 108 | NonCommercial purposes only. 109 | 110 | 2. Exceptions and Limitations. For the avoidance of doubt, where 111 | Exceptions and Limitations apply to Your use, this Public 112 | License does not apply, and You do not need to comply with 113 | its terms and conditions. 114 | 115 | 3. Term. The term of this Public License is specified in Section 116 | 6(a). 117 | 118 | 4. Media and formats; technical modifications allowed. The 119 | Licensor authorizes You to exercise the Licensed Rights in 120 | all media and formats whether now known or hereafter created, 121 | and to make technical modifications necessary to do so. The 122 | Licensor waives and/or agrees not to assert any right or 123 | authority to forbid You from making technical modifications 124 | necessary to exercise the Licensed Rights, including 125 | technical modifications necessary to circumvent Effective 126 | Technological Measures. For purposes of this Public License, 127 | simply making modifications authorized by this Section 2(a) 128 | (4) never produces Adapted Material. 129 | 130 | 5. Downstream recipients. 131 | 132 | a. Offer from the Licensor -- Licensed Material. Every 133 | recipient of the Licensed Material automatically 134 | receives an offer from the Licensor to exercise the 135 | Licensed Rights under the terms and conditions of this 136 | Public License. 137 | 138 | b. Additional offer from the Licensor -- Adapted Material. 
139 | Every recipient of Adapted Material from You 140 | automatically receives an offer from the Licensor to 141 | exercise the Licensed Rights in the Adapted Material 142 | under the conditions of the Adapter's License You apply. 143 | 144 | c. No downstream restrictions. You may not offer or impose 145 | any additional or different terms or conditions on, or 146 | apply any Effective Technological Measures to, the 147 | Licensed Material if doing so restricts exercise of the 148 | Licensed Rights by any recipient of the Licensed 149 | Material. 150 | 151 | 6. No endorsement. Nothing in this Public License constitutes or 152 | may be construed as permission to assert or imply that You 153 | are, or that Your use of the Licensed Material is, connected 154 | with, or sponsored, endorsed, or granted official status by, 155 | the Licensor or others designated to receive attribution as 156 | provided in Section 3(a)(1)(A)(i). 157 | 158 | b. Other rights. 159 | 160 | 1. Moral rights, such as the right of integrity, are not 161 | licensed under this Public License, nor are publicity, 162 | privacy, and/or other similar personality rights; however, to 163 | the extent possible, the Licensor waives and/or agrees not to 164 | assert any such rights held by the Licensor to the limited 165 | extent necessary to allow You to exercise the Licensed 166 | Rights, but not otherwise. 167 | 168 | 2. Patent and trademark rights are not licensed under this 169 | Public License. 170 | 171 | 3. To the extent possible, the Licensor waives any right to 172 | collect royalties from You for the exercise of the Licensed 173 | Rights, whether directly or through a collecting society 174 | under any voluntary or waivable statutory or compulsory 175 | licensing scheme. In all other cases the Licensor expressly 176 | reserves any right to collect such royalties, including when 177 | the Licensed Material is used other than for NonCommercial 178 | purposes. 179 | 180 | 181 | Section 3 -- License Conditions. 182 | 183 | Your exercise of the Licensed Rights is expressly made subject to the 184 | following conditions. 185 | 186 | a. Attribution. 187 | 188 | 1. If You Share the Licensed Material (including in modified 189 | form), You must: 190 | 191 | a. retain the following if it is supplied by the Licensor 192 | with the Licensed Material: 193 | 194 | i. identification of the creator(s) of the Licensed 195 | Material and any others designated to receive 196 | attribution, in any reasonable manner requested by 197 | the Licensor (including by pseudonym if 198 | designated); 199 | 200 | ii. a copyright notice; 201 | 202 | iii. a notice that refers to this Public License; 203 | 204 | iv. a notice that refers to the disclaimer of 205 | warranties; 206 | 207 | v. a URI or hyperlink to the Licensed Material to the 208 | extent reasonably practicable; 209 | 210 | b. indicate if You modified the Licensed Material and 211 | retain an indication of any previous modifications; and 212 | 213 | c. indicate the Licensed Material is licensed under this 214 | Public License, and include the text of, or the URI or 215 | hyperlink to, this Public License. 216 | 217 | 2. You may satisfy the conditions in Section 3(a)(1) in any 218 | reasonable manner based on the medium, means, and context in 219 | which You Share the Licensed Material. For example, it may be 220 | reasonable to satisfy the conditions by providing a URI or 221 | hyperlink to a resource that includes the required 222 | information. 223 | 3. 
If requested by the Licensor, You must remove any of the 224 | information required by Section 3(a)(1)(A) to the extent 225 | reasonably practicable. 226 | 227 | b. ShareAlike. 228 | 229 | In addition to the conditions in Section 3(a), if You Share 230 | Adapted Material You produce, the following conditions also apply. 231 | 232 | 1. The Adapter's License You apply must be a Creative Commons 233 | license with the same License Elements, this version or 234 | later, or a BY-NC-SA Compatible License. 235 | 236 | 2. You must include the text of, or the URI or hyperlink to, the 237 | Adapter's License You apply. You may satisfy this condition 238 | in any reasonable manner based on the medium, means, and 239 | context in which You Share Adapted Material. 240 | 241 | 3. You may not offer or impose any additional or different terms 242 | or conditions on, or apply any Effective Technological 243 | Measures to, Adapted Material that restrict exercise of the 244 | rights granted under the Adapter's License You apply. 245 | 246 | 247 | Section 4 -- Sui Generis Database Rights. 248 | 249 | Where the Licensed Rights include Sui Generis Database Rights that 250 | apply to Your use of the Licensed Material: 251 | 252 | a. for the avoidance of doubt, Section 2(a)(1) grants You the right 253 | to extract, reuse, reproduce, and Share all or a substantial 254 | portion of the contents of the database for NonCommercial purposes 255 | only; 256 | 257 | b. if You include all or a substantial portion of the database 258 | contents in a database in which You have Sui Generis Database 259 | Rights, then the database in which You have Sui Generis Database 260 | Rights (but not its individual contents) is Adapted Material, 261 | including for purposes of Section 3(b); and 262 | 263 | c. You must comply with the conditions in Section 3(a) if You Share 264 | all or a substantial portion of the contents of the database. 265 | 266 | For the avoidance of doubt, this Section 4 supplements and does not 267 | replace Your obligations under this Public License where the Licensed 268 | Rights include other Copyright and Similar Rights. 269 | 270 | 271 | Section 5 -- Disclaimer of Warranties and Limitation of Liability. 272 | 273 | a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE 274 | EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS 275 | AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF 276 | ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, 277 | IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, 278 | WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR 279 | PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, 280 | ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT 281 | KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT 282 | ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. 283 | 284 | b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE 285 | TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, 286 | NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, 287 | INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, 288 | COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR 289 | USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN 290 | ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR 291 | DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR 292 | IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. 
293 | 294 | c. The disclaimer of warranties and limitation of liability provided 295 | above shall be interpreted in a manner that, to the extent 296 | possible, most closely approximates an absolute disclaimer and 297 | waiver of all liability. 298 | 299 | 300 | Section 6 -- Term and Termination. 301 | 302 | a. This Public License applies for the term of the Copyright and 303 | Similar Rights licensed here. However, if You fail to comply with 304 | this Public License, then Your rights under this Public License 305 | terminate automatically. 306 | 307 | b. Where Your right to use the Licensed Material has terminated under 308 | Section 6(a), it reinstates: 309 | 310 | 1. automatically as of the date the violation is cured, provided 311 | it is cured within 30 days of Your discovery of the 312 | violation; or 313 | 314 | 2. upon express reinstatement by the Licensor. 315 | 316 | For the avoidance of doubt, this Section 6(b) does not affect any 317 | right the Licensor may have to seek remedies for Your violations 318 | of this Public License. 319 | 320 | c. For the avoidance of doubt, the Licensor may also offer the 321 | Licensed Material under separate terms or conditions or stop 322 | distributing the Licensed Material at any time; however, doing so 323 | will not terminate this Public License. 324 | 325 | d. Sections 1, 5, 6, 7, and 8 survive termination of this Public 326 | License. 327 | 328 | 329 | Section 7 -- Other Terms and Conditions. 330 | 331 | a. The Licensor shall not be bound by any additional or different 332 | terms or conditions communicated by You unless expressly agreed. 333 | 334 | b. Any arrangements, understandings, or agreements regarding the 335 | Licensed Material not stated herein are separate from and 336 | independent of the terms and conditions of this Public License. 337 | 338 | 339 | Section 8 -- Interpretation. 340 | 341 | a. For the avoidance of doubt, this Public License does not, and 342 | shall not be interpreted to, reduce, limit, restrict, or impose 343 | conditions on any use of the Licensed Material that could lawfully 344 | be made without permission under this Public License. 345 | 346 | b. To the extent possible, if any provision of this Public License is 347 | deemed unenforceable, it shall be automatically reformed to the 348 | minimum extent necessary to make it enforceable. If the provision 349 | cannot be reformed, it shall be severed from this Public License 350 | without affecting the enforceability of the remaining terms and 351 | conditions. 352 | 353 | c. No term or condition of this Public License will be waived and no 354 | failure to comply consented to unless expressly agreed to by the 355 | Licensor. 356 | 357 | d. Nothing in this Public License constitutes or may be interpreted 358 | as a limitation upon, or waiver of, any privileges and immunities 359 | that apply to the Licensor or You, including from the legal 360 | processes of any jurisdiction or authority. 361 | -------------------------------------------------------------------------------- /99~参考资料/DeepLearning-Specialization/Improving Deep Neural Networks/Week1/Gradient+Checking+v1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Gradient Checking\n", 8 | "\n", 9 | "Welcome to the final assignment for this week! In this assignment you will learn to implement and use gradient checking. 
\n", 10 | "\n", 11 | "You are part of a team working to make mobile payments available globally, and are asked to build a deep learning model to detect fraud--whenever someone makes a payment, you want to see if the payment might be fraudulent, such as if the user's account has been taken over by a hacker. \n", 12 | "\n", 13 | "But backpropagation is quite challenging to implement, and sometimes has bugs. Because this is a mission-critical application, your company's CEO wants to be really certain that your implementation of backpropagation is correct. Your CEO says, \"Give me a proof that your backpropagation is actually working!\" To give this reassurance, you are going to use \"gradient checking\".\n", 14 | "\n", 15 | "Let's do it!" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 1, 21 | "metadata": { 22 | "collapsed": true 23 | }, 24 | "outputs": [], 25 | "source": [ 26 | "# Packages\n", 27 | "import numpy as np\n", 28 | "from testCases import *\n", 29 | "from gc_utils import sigmoid, relu, dictionary_to_vector, vector_to_dictionary, gradients_to_vector" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "## 1) How does gradient checking work?\n", 37 | "\n", 38 | "Backpropagation computes the gradients $\\frac{\\partial J}{\\partial \\theta}$, where $\\theta$ denotes the parameters of the model. $J$ is computed using forward propagation and your loss function.\n", 39 | "\n", 40 | "Because forward propagation is relatively easy to implement, you're confident you got that right, and so you're almost 100% sure that you're computing the cost $J$ correctly. Thus, you can use your code for computing $J$ to verify the code for computing $\\frac{\\partial J}{\\partial \\theta}$. \n", 41 | "\n", 42 | "Let's look back at the definition of a derivative (or gradient):\n", 43 | "$$ \\frac{\\partial J}{\\partial \\theta} = \\lim_{\\varepsilon \\to 0} \\frac{J(\\theta + \\varepsilon) - J(\\theta - \\varepsilon)}{2 \\varepsilon} \\tag{1}$$\n", 44 | "\n", 45 | "If you're not familiar with the \"$\\displaystyle \\lim_{\\varepsilon \\to 0}$\" notation, it's just a way of saying \"when $\\varepsilon$ is really really small.\"\n", 46 | "\n", 47 | "We know the following:\n", 48 | "\n", 49 | "- $\\frac{\\partial J}{\\partial \\theta}$ is what you want to make sure you're computing correctly. \n", 50 | "- You can compute $J(\\theta + \\varepsilon)$ and $J(\\theta - \\varepsilon)$ (in the case that $\\theta$ is a real number), since you're confident your implementation for $J$ is correct. \n", 51 | "\n", 52 | "Lets use equation (1) and a small value for $\\varepsilon$ to convince your CEO that your code for computing $\\frac{\\partial J}{\\partial \\theta}$ is correct!" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "## 2) 1-dimensional gradient checking\n", 60 | "\n", 61 | "Consider a 1D linear function $J(\\theta) = \\theta x$. The model contains only a single real-valued parameter $\\theta$, and takes $x$ as input.\n", 62 | "\n", 63 | "You will implement code to compute $J(.)$ and its derivative $\\frac{\\partial J}{\\partial \\theta}$. You will then use gradient checking to make sure your derivative computation for $J$ is correct. \n", 64 | "\n", 65 | "\n", 66 | "
**Figure 1** : **1D linear model**
\n", 67 | "\n", 68 | "The diagram above shows the key computation steps: First start with $x$, then evaluate the function $J(x)$ (\"forward propagation\"). Then compute the derivative $\\frac{\\partial J}{\\partial \\theta}$ (\"backward propagation\"). \n", 69 | "\n", 70 | "**Exercise**: implement \"forward propagation\" and \"backward propagation\" for this simple function. I.e., compute both $J(.)$ (\"forward propagation\") and its derivative with respect to $\\theta$ (\"backward propagation\"), in two separate functions. " 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 2, 76 | "metadata": { 77 | "collapsed": true 78 | }, 79 | "outputs": [], 80 | "source": [ 81 | "# GRADED FUNCTION: forward_propagation\n", 82 | "\n", 83 | "def forward_propagation(x, theta):\n", 84 | " \"\"\"\n", 85 | " Implement the linear forward propagation (compute J) presented in Figure 1 (J(theta) = theta * x)\n", 86 | " \n", 87 | " Arguments:\n", 88 | " x -- a real-valued input\n", 89 | " theta -- our parameter, a real number as well\n", 90 | " \n", 91 | " Returns:\n", 92 | " J -- the value of function J, computed using the formula J(theta) = theta * x\n", 93 | " \"\"\"\n", 94 | " \n", 95 | " ### START CODE HERE ### (approx. 1 line)\n", 96 | " J = theta*x\n", 97 | " ### END CODE HERE ###\n", 98 | " \n", 99 | " return J" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 3, 105 | "metadata": {}, 106 | "outputs": [ 107 | { 108 | "name": "stdout", 109 | "output_type": "stream", 110 | "text": [ 111 | "J = 8\n" 112 | ] 113 | } 114 | ], 115 | "source": [ 116 | "x, theta = 2, 4\n", 117 | "J = forward_propagation(x, theta)\n", 118 | "print (\"J = \" + str(J))" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": {}, 124 | "source": [ 125 | "**Expected Output**:\n", 126 | "\n", 127 | "\n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | "
** J ** 8
" 133 | ] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "metadata": {}, 138 | "source": [ 139 | "**Exercise**: Now, implement the backward propagation step (derivative computation) of Figure 1. That is, compute the derivative of $J(\\theta) = \\theta x$ with respect to $\\theta$. To save you from doing the calculus, you should get $dtheta = \\frac { \\partial J }{ \\partial \\theta} = x$." 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 4, 145 | "metadata": { 146 | "collapsed": true 147 | }, 148 | "outputs": [], 149 | "source": [ 150 | "# GRADED FUNCTION: backward_propagation\n", 151 | "\n", 152 | "def backward_propagation(x, theta):\n", 153 | " \"\"\"\n", 154 | " Computes the derivative of J with respect to theta (see Figure 1).\n", 155 | " \n", 156 | " Arguments:\n", 157 | " x -- a real-valued input\n", 158 | " theta -- our parameter, a real number as well\n", 159 | " \n", 160 | " Returns:\n", 161 | " dtheta -- the gradient of the cost with respect to theta\n", 162 | " \"\"\"\n", 163 | " \n", 164 | " ### START CODE HERE ### (approx. 1 line)\n", 165 | " dtheta = x\n", 166 | " ### END CODE HERE ###\n", 167 | " \n", 168 | " return dtheta" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 5, 174 | "metadata": { 175 | "scrolled": true 176 | }, 177 | "outputs": [ 178 | { 179 | "name": "stdout", 180 | "output_type": "stream", 181 | "text": [ 182 | "dtheta = 2\n" 183 | ] 184 | } 185 | ], 186 | "source": [ 187 | "x, theta = 2, 4\n", 188 | "dtheta = backward_propagation(x, theta)\n", 189 | "print (\"dtheta = \" + str(dtheta))" 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "metadata": {}, 195 | "source": [ 196 | "**Expected Output**:\n", 197 | "\n", 198 | "\n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | "
** dtheta ** 2
" 204 | ] 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "metadata": {}, 209 | "source": [ 210 | "**Exercise**: To show that the `backward_propagation()` function is correctly computing the gradient $\\frac{\\partial J}{\\partial \\theta}$, let's implement gradient checking.\n", 211 | "\n", 212 | "**Instructions**:\n", 213 | "- First compute \"gradapprox\" using the formula above (1) and a small value of $\\varepsilon$. Here are the Steps to follow:\n", 214 | " 1. $\\theta^{+} = \\theta + \\varepsilon$\n", 215 | " 2. $\\theta^{-} = \\theta - \\varepsilon$\n", 216 | " 3. $J^{+} = J(\\theta^{+})$\n", 217 | " 4. $J^{-} = J(\\theta^{-})$\n", 218 | " 5. $gradapprox = \\frac{J^{+} - J^{-}}{2 \\varepsilon}$\n", 219 | "- Then compute the gradient using backward propagation, and store the result in a variable \"grad\"\n", 220 | "- Finally, compute the relative difference between \"gradapprox\" and the \"grad\" using the following formula:\n", 221 | "$$ difference = \\frac {\\mid\\mid grad - gradapprox \\mid\\mid_2}{\\mid\\mid grad \\mid\\mid_2 + \\mid\\mid gradapprox \\mid\\mid_2} \\tag{2}$$\n", 222 | "You will need 3 Steps to compute this formula:\n", 223 | " - 1'. compute the numerator using np.linalg.norm(...)\n", 224 | " - 2'. compute the denominator. You will need to call np.linalg.norm(...) twice.\n", 225 | " - 3'. divide them.\n", 226 | "- If this difference is small (say less than $10^{-7}$), you can be quite confident that you have computed your gradient correctly. Otherwise, there may be a mistake in the gradient computation. \n" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": 8, 232 | "metadata": { 233 | "collapsed": true 234 | }, 235 | "outputs": [], 236 | "source": [ 237 | "# GRADED FUNCTION: gradient_check\n", 238 | "\n", 239 | "def gradient_check(x, theta, epsilon = 1e-7):\n", 240 | " \"\"\"\n", 241 | " Implement the backward propagation presented in Figure 1.\n", 242 | " \n", 243 | " Arguments:\n", 244 | " x -- a real-valued input\n", 245 | " theta -- our parameter, a real number as well\n", 246 | " epsilon -- tiny shift to the input to compute approximated gradient with formula(1)\n", 247 | " \n", 248 | " Returns:\n", 249 | " difference -- difference (2) between the approximated gradient and the backward propagation gradient\n", 250 | " \"\"\"\n", 251 | " \n", 252 | " # Compute gradapprox using left side of formula (1). epsilon is small enough, you don't need to worry about the limit.\n", 253 | " ### START CODE HERE ### (approx. 5 lines)\n", 254 | " thetaplus = theta + epsilon # Step 1\n", 255 | " thetaminus = theta - epsilon # Step 2\n", 256 | " J_plus = thetaplus*x # Step 3\n", 257 | " J_minus = thetaminus*x # Step 4\n", 258 | " gradapprox = (J_plus - J_minus)/(2*epsilon) # Step 5\n", 259 | " ### END CODE HERE ###\n", 260 | " \n", 261 | " # Check if gradapprox is close enough to the output of backward_propagation()\n", 262 | " ### START CODE HERE ### (approx. 1 line)\n", 263 | " grad = x\n", 264 | " ### END CODE HERE ###\n", 265 | " \n", 266 | " ### START CODE HERE ### (approx. 
1 line)\n", 267 | " numerator = np.linalg.norm((grad-gradapprox),keepdims=True) # Step 1'\n", 268 | " denominator = np.linalg.norm((grad),keepdims=True) + np.linalg.norm((gradapprox),keepdims=True) # Step 2'\n", 269 | " difference = numerator/denominator # Step 3'\n", 270 | " ### END CODE HERE ###\n", 271 | " \n", 272 | " if difference < 1e-7:\n", 273 | " print (\"The gradient is correct!\")\n", 274 | " else:\n", 275 | " print (\"The gradient is wrong!\")\n", 276 | " \n", 277 | " return difference" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": 9, 283 | "metadata": { 284 | "scrolled": true 285 | }, 286 | "outputs": [ 287 | { 288 | "name": "stdout", 289 | "output_type": "stream", 290 | "text": [ 291 | "The gradient is correct!\n", 292 | "difference = 2.91933588329e-10\n" 293 | ] 294 | } 295 | ], 296 | "source": [ 297 | "x, theta = 2, 4\n", 298 | "difference = gradient_check(x, theta)\n", 299 | "print(\"difference = \" + str(difference))" 300 | ] 301 | }, 302 | { 303 | "cell_type": "markdown", 304 | "metadata": {}, 305 | "source": [ 306 | "**Expected Output**:\n", 307 | "The gradient is correct!\n", 308 | "\n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | "
** difference ** 2.9193358103083e-10
" 314 | ] 315 | }, 316 | { 317 | "cell_type": "markdown", 318 | "metadata": {}, 319 | "source": [ 320 | "Congrats, the difference is smaller than the $10^{-7}$ threshold. So you can have high confidence that you've correctly computed the gradient in `backward_propagation()`. \n", 321 | "\n", 322 | "Now, in the more general case, your cost function $J$ has more than a single 1D input. When you are training a neural network, $\\theta$ actually consists of multiple matrices $W^{[l]}$ and biases $b^{[l]}$! It is important to know how to do a gradient check with higher-dimensional inputs. Let's do it!" 323 | ] 324 | }, 325 | { 326 | "cell_type": "markdown", 327 | "metadata": {}, 328 | "source": [ 329 | "## 3) N-dimensional gradient checking" 330 | ] 331 | }, 332 | { 333 | "cell_type": "markdown", 334 | "metadata": { 335 | "collapsed": true 336 | }, 337 | "source": [ 338 | "The following figure describes the forward and backward propagation of your fraud detection model.\n", 339 | "\n", 340 | "\n", 341 | "
**Figure 2** : **deep neural network**
*LINEAR -> RELU -> LINEAR -> RELU -> LINEAR -> SIGMOID*
\n", 342 | "\n", 343 | "Let's look at your implementations for forward propagation and backward propagation. " 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": 10, 349 | "metadata": { 350 | "collapsed": true 351 | }, 352 | "outputs": [], 353 | "source": [ 354 | "def forward_propagation_n(X, Y, parameters):\n", 355 | " \"\"\"\n", 356 | " Implements the forward propagation (and computes the cost) presented in Figure 3.\n", 357 | " \n", 358 | " Arguments:\n", 359 | " X -- training set for m examples\n", 360 | " Y -- labels for m examples \n", 361 | " parameters -- python dictionary containing your parameters \"W1\", \"b1\", \"W2\", \"b2\", \"W3\", \"b3\":\n", 362 | " W1 -- weight matrix of shape (5, 4)\n", 363 | " b1 -- bias vector of shape (5, 1)\n", 364 | " W2 -- weight matrix of shape (3, 5)\n", 365 | " b2 -- bias vector of shape (3, 1)\n", 366 | " W3 -- weight matrix of shape (1, 3)\n", 367 | " b3 -- bias vector of shape (1, 1)\n", 368 | " \n", 369 | " Returns:\n", 370 | " cost -- the cost function (logistic cost for one example)\n", 371 | " \"\"\"\n", 372 | " \n", 373 | " # retrieve parameters\n", 374 | " m = X.shape[1]\n", 375 | " W1 = parameters[\"W1\"]\n", 376 | " b1 = parameters[\"b1\"]\n", 377 | " W2 = parameters[\"W2\"]\n", 378 | " b2 = parameters[\"b2\"]\n", 379 | " W3 = parameters[\"W3\"]\n", 380 | " b3 = parameters[\"b3\"]\n", 381 | "\n", 382 | " # LINEAR -> RELU -> LINEAR -> RELU -> LINEAR -> SIGMOID\n", 383 | " Z1 = np.dot(W1, X) + b1\n", 384 | " A1 = relu(Z1)\n", 385 | " Z2 = np.dot(W2, A1) + b2\n", 386 | " A2 = relu(Z2)\n", 387 | " Z3 = np.dot(W3, A2) + b3\n", 388 | " A3 = sigmoid(Z3)\n", 389 | "\n", 390 | " # Cost\n", 391 | " logprobs = np.multiply(-np.log(A3),Y) + np.multiply(-np.log(1 - A3), 1 - Y)\n", 392 | " cost = 1./m * np.sum(logprobs)\n", 393 | " \n", 394 | " cache = (Z1, A1, W1, b1, Z2, A2, W2, b2, Z3, A3, W3, b3)\n", 395 | " \n", 396 | " return cost, cache" 397 | ] 398 | }, 399 | { 400 | "cell_type": "markdown", 401 | "metadata": {}, 402 | "source": [ 403 | "Now, run backward propagation." 
404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "execution_count": 11, 409 | "metadata": { 410 | "collapsed": true 411 | }, 412 | "outputs": [], 413 | "source": [ 414 | "def backward_propagation_n(X, Y, cache):\n", 415 | " \"\"\"\n", 416 | " Implement the backward propagation presented in figure 2.\n", 417 | " \n", 418 | " Arguments:\n", 419 | " X -- input datapoint, of shape (input size, 1)\n", 420 | " Y -- true \"label\"\n", 421 | " cache -- cache output from forward_propagation_n()\n", 422 | " \n", 423 | " Returns:\n", 424 | " gradients -- A dictionary with the gradients of the cost with respect to each parameter, activation and pre-activation variables.\n", 425 | " \"\"\"\n", 426 | " \n", 427 | " m = X.shape[1]\n", 428 | " (Z1, A1, W1, b1, Z2, A2, W2, b2, Z3, A3, W3, b3) = cache\n", 429 | " \n", 430 | " dZ3 = A3 - Y\n", 431 | " dW3 = 1./m * np.dot(dZ3, A2.T)\n", 432 | " db3 = 1./m * np.sum(dZ3, axis=1, keepdims = True)\n", 433 | " \n", 434 | " dA2 = np.dot(W3.T, dZ3)\n", 435 | " dZ2 = np.multiply(dA2, np.int64(A2 > 0))\n", 436 | " dW2 = 1./m * np.dot(dZ2, A1.T) * 2\n", 437 | " db2 = 1./m * np.sum(dZ2, axis=1, keepdims = True)\n", 438 | " \n", 439 | " dA1 = np.dot(W2.T, dZ2)\n", 440 | " dZ1 = np.multiply(dA1, np.int64(A1 > 0))\n", 441 | " dW1 = 1./m * np.dot(dZ1, X.T)\n", 442 | " db1 = 4./m * np.sum(dZ1, axis=1, keepdims = True)\n", 443 | " \n", 444 | " gradients = {\"dZ3\": dZ3, \"dW3\": dW3, \"db3\": db3,\n", 445 | " \"dA2\": dA2, \"dZ2\": dZ2, \"dW2\": dW2, \"db2\": db2,\n", 446 | " \"dA1\": dA1, \"dZ1\": dZ1, \"dW1\": dW1, \"db1\": db1}\n", 447 | " \n", 448 | " return gradients" 449 | ] 450 | }, 451 | { 452 | "cell_type": "markdown", 453 | "metadata": { 454 | "collapsed": true 455 | }, 456 | "source": [ 457 | "You obtained some results on the fraud detection test set but you are not 100% sure of your model. Nobody's perfect! Let's implement gradient checking to verify if your gradients are correct." 458 | ] 459 | }, 460 | { 461 | "cell_type": "markdown", 462 | "metadata": {}, 463 | "source": [ 464 | "**How does gradient checking work?**.\n", 465 | "\n", 466 | "As in 1) and 2), you want to compare \"gradapprox\" to the gradient computed by backpropagation. The formula is still:\n", 467 | "\n", 468 | "$$ \\frac{\\partial J}{\\partial \\theta} = \\lim_{\\varepsilon \\to 0} \\frac{J(\\theta + \\varepsilon) - J(\\theta - \\varepsilon)}{2 \\varepsilon} \\tag{1}$$\n", 469 | "\n", 470 | "However, $\\theta$ is not a scalar anymore. It is a dictionary called \"parameters\". We implemented a function \"`dictionary_to_vector()`\" for you. It converts the \"parameters\" dictionary into a vector called \"values\", obtained by reshaping all parameters (W1, b1, W2, b2, W3, b3) into vectors and concatenating them.\n", 471 | "\n", 472 | "The inverse function is \"`vector_to_dictionary`\" which outputs back the \"parameters\" dictionary.\n", 473 | "\n", 474 | "\n", 475 | "
**Figure 3** : **dictionary_to_vector() and vector_to_dictionary()**
You will need these functions in gradient_check_n()
\n", 476 | "\n", 477 | "We have also converted the \"gradients\" dictionary into a vector \"grad\" using gradients_to_vector(). You don't need to worry about that.\n", 478 | "\n", 479 | "**Exercise**: Implement gradient_check_n().\n", 480 | "\n", 481 | "**Instructions**: Here is pseudo-code that will help you implement the gradient check.\n", 482 | "\n", 483 | "For each i in num_parameters:\n", 484 | "- To compute `J_plus[i]`:\n", 485 | " 1. Set $\\theta^{+}$ to `np.copy(parameters_values)`\n", 486 | " 2. Set $\\theta^{+}_i$ to $\\theta^{+}_i + \\varepsilon$\n", 487 | " 3. Calculate $J^{+}_i$ using to `forward_propagation_n(x, y, vector_to_dictionary(`$\\theta^{+}$ `))`. \n", 488 | "- To compute `J_minus[i]`: do the same thing with $\\theta^{-}$\n", 489 | "- Compute $gradapprox[i] = \\frac{J^{+}_i - J^{-}_i}{2 \\varepsilon}$\n", 490 | "\n", 491 | "Thus, you get a vector gradapprox, where gradapprox[i] is an approximation of the gradient with respect to `parameter_values[i]`. You can now compare this gradapprox vector to the gradients vector from backpropagation. Just like for the 1D case (Steps 1', 2', 3'), compute: \n", 492 | "$$ difference = \\frac {\\| grad - gradapprox \\|_2}{\\| grad \\|_2 + \\| gradapprox \\|_2 } \\tag{3}$$" 493 | ] 494 | }, 495 | { 496 | "cell_type": "code", 497 | "execution_count": 16, 498 | "metadata": { 499 | "collapsed": true 500 | }, 501 | "outputs": [], 502 | "source": [ 503 | "# GRADED FUNCTION: gradient_check_n\n", 504 | "\n", 505 | "def gradient_check_n(parameters, gradients, X, Y, epsilon = 1e-7):\n", 506 | " \"\"\"\n", 507 | " Checks if backward_propagation_n computes correctly the gradient of the cost output by forward_propagation_n\n", 508 | " \n", 509 | " Arguments:\n", 510 | " parameters -- python dictionary containing your parameters \"W1\", \"b1\", \"W2\", \"b2\", \"W3\", \"b3\":\n", 511 | " grad -- output of backward_propagation_n, contains gradients of the cost with respect to the parameters. \n", 512 | " x -- input datapoint, of shape (input size, 1)\n", 513 | " y -- true \"label\"\n", 514 | " epsilon -- tiny shift to the input to compute approximated gradient with formula(1)\n", 515 | " \n", 516 | " Returns:\n", 517 | " difference -- difference (2) between the approximated gradient and the backward propagation gradient\n", 518 | " \"\"\"\n", 519 | " \n", 520 | " # Set-up variables\n", 521 | " parameters_values, _ = dictionary_to_vector(parameters)\n", 522 | " grad = gradients_to_vector(gradients)\n", 523 | " num_parameters = parameters_values.shape[0]\n", 524 | " J_plus = np.zeros((num_parameters, 1))\n", 525 | " J_minus = np.zeros((num_parameters, 1))\n", 526 | " gradapprox = np.zeros((num_parameters, 1))\n", 527 | " \n", 528 | " # Compute gradapprox\n", 529 | " for i in range(num_parameters):\n", 530 | " \n", 531 | " # Compute J_plus[i]. Inputs: \"parameters_values, epsilon\". Output = \"J_plus[i]\".\n", 532 | " # \"_\" is used because the function you have to outputs two parameters but we only care about the first one\n", 533 | " ### START CODE HERE ### (approx. 3 lines)\n", 534 | " thetaplus = np.copy(parameters_values) # Step 1\n", 535 | " thetaplus[i][0] = thetaplus[i][0] + epsilon # Step 2\n", 536 | " J_plus[i], _ = forward_propagation_n(X,Y,vector_to_dictionary(thetaplus)) # Step 3\n", 537 | " ### END CODE HERE ###\n", 538 | " \n", 539 | " # Compute J_minus[i]. Inputs: \"parameters_values, epsilon\". Output = \"J_minus[i]\".\n", 540 | " ### START CODE HERE ### (approx. 
3 lines)\n", 541 | " thetaminus = np.copy(parameters_values) # Step 1\n", 542 | " thetaminus[i][0] = thetaminus[i][0] - epsilon # Step 2 \n", 543 | " J_minus[i], _ = forward_propagation_n(X,Y,vector_to_dictionary(thetaminus)) # Step 3\n", 544 | " ### END CODE HERE ###\n", 545 | " \n", 546 | " # Compute gradapprox[i]\n", 547 | " ### START CODE HERE ### (approx. 1 line)\n", 548 | " gradapprox[i] = (J_plus[i] - J_minus[i])/(2*epsilon)\n", 549 | " ### END CODE HERE ###\n", 550 | " \n", 551 | " # Compare gradapprox to backward propagation gradients by computing difference.\n", 552 | " ### START CODE HERE ### (approx. 1 line)\n", 553 | " numerator = np.linalg.norm((grad-gradapprox),keepdims=True) # Step 1'\n", 554 | " denominator = np.linalg.norm((grad),keepdims=True) + np.linalg.norm((gradapprox),keepdims=True) # Step 2'\n", 555 | " difference = numerator/denominator # Step 3'\n", 556 | " ### END CODE HERE ###\n", 557 | "\n", 558 | " if difference > 2e-7:\n", 559 | " print (\"\\033[93m\" + \"There is a mistake in the backward propagation! difference = \" + str(difference) + \"\\033[0m\")\n", 560 | " else:\n", 561 | " print (\"\\033[92m\" + \"Your backward propagation works perfectly fine! difference = \" + str(difference) + \"\\033[0m\")\n", 562 | " \n", 563 | " return difference" 564 | ] 565 | }, 566 | { 567 | "cell_type": "code", 568 | "execution_count": 17, 569 | "metadata": { 570 | "scrolled": false 571 | }, 572 | "outputs": [ 573 | { 574 | "name": "stdout", 575 | "output_type": "stream", 576 | "text": [ 577 | "\u001b[93mThere is a mistake in the backward propagation! difference = [[ 0.28509316]]\u001b[0m\n" 578 | ] 579 | } 580 | ], 581 | "source": [ 582 | "X, Y, parameters = gradient_check_n_test_case()\n", 583 | "\n", 584 | "cost, cache = forward_propagation_n(X, Y, parameters)\n", 585 | "gradients = backward_propagation_n(X, Y, cache)\n", 586 | "difference = gradient_check_n(parameters, gradients, X, Y)" 587 | ] 588 | }, 589 | { 590 | "cell_type": "markdown", 591 | "metadata": {}, 592 | "source": [ 593 | "**Expected output**:\n", 594 | "\n", 595 | "\n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | "
** There is a mistake in the backward propagation!** difference = 0.285093156781
" 601 | ] 602 | }, 603 | { 604 | "cell_type": "markdown", 605 | "metadata": {}, 606 | "source": [ 607 | "It seems that there were errors in the `backward_propagation_n` code we gave you! Good that you've implemented the gradient check. Go back to `backward_propagation` and try to find/correct the errors *(Hint: check dW2 and db1)*. Rerun the gradient check when you think you've fixed it. Remember you'll need to re-execute the cell defining `backward_propagation_n()` if you modify the code. \n", 608 | "\n", 609 | "Can you get gradient check to declare your derivative computation correct? Even though this part of the assignment isn't graded, we strongly urge you to try to find the bug and re-run gradient check until you're convinced backprop is now correctly implemented. \n", 610 | "\n", 611 | "**Note** \n", 612 | "- Gradient Checking is slow! Approximating the gradient with $\\frac{\\partial J}{\\partial \\theta} \\approx \\frac{J(\\theta + \\varepsilon) - J(\\theta - \\varepsilon)}{2 \\varepsilon}$ is computationally costly. For this reason, we don't run gradient checking at every iteration during training. Just a few times to check if the gradient is correct. \n", 613 | "- Gradient Checking, at least as we've presented it, doesn't work with dropout. You would usually run the gradient check algorithm without dropout to make sure your backprop is correct, then add dropout. \n", 614 | "\n", 615 | "Congrats, you can be confident that your deep learning model for fraud detection is working correctly! You can even use this to convince your CEO. :) \n", 616 | "\n", 617 | "\n", 618 | "**What you should remember from this notebook**:\n", 619 | "- Gradient checking verifies closeness between the gradients from backpropagation and the numerical approximation of the gradient (computed using forward propagation).\n", 620 | "- Gradient checking is slow, so we don't run it in every iteration of training. You would usually run it only to make sure your code is correct, then turn it off and use backprop for the actual learning process. " 621 | ] 622 | }, 623 | { 624 | "cell_type": "code", 625 | "execution_count": null, 626 | "metadata": { 627 | "collapsed": true 628 | }, 629 | "outputs": [], 630 | "source": [] 631 | } 632 | ], 633 | "metadata": { 634 | "coursera": { 635 | "course_slug": "deep-neural-network", 636 | "graded_item_id": "n6NBD", 637 | "launcher_item_id": "yfOsE" 638 | }, 639 | "kernelspec": { 640 | "display_name": "Python 3", 641 | "language": "python", 642 | "name": "python3" 643 | }, 644 | "language_info": { 645 | "codemirror_mode": { 646 | "name": "ipython", 647 | "version": 3 648 | }, 649 | "file_extension": ".py", 650 | "mimetype": "text/x-python", 651 | "name": "python", 652 | "nbconvert_exporter": "python", 653 | "pygments_lexer": "ipython3", 654 | "version": "3.6.0" 655 | } 656 | }, 657 | "nbformat": 4, 658 | "nbformat_minor": 1 659 | } 660 | -------------------------------------------------------------------------------- /99~参考资料/DeepLearning-Specialization/Convolutional_Neural_Networks/Week4/Face+Recognition+for+the+Happy+House+-+v3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Face Recognition for the Happy House\n", 8 | "\n", 9 | "Welcome to the first assignment of week 4! Here you will build a face recognition system. Many of the ideas presented here are from [FaceNet](https://arxiv.org/pdf/1503.03832.pdf). 
In lecture, we also talked about [DeepFace](https://research.fb.com/wp-content/uploads/2016/11/deepface-closing-the-gap-to-human-level-performance-in-face-verification.pdf). \n", 10 | "\n", 11 | "Face recognition problems commonly fall into two categories: \n", 12 | "\n", 13 | "- **Face Verification** - \"is this the claimed person?\". For example, at some airports, you can pass through customs by letting a system scan your passport and then verifying that you (the person carrying the passport) are the correct person. A mobile phone that unlocks using your face is also using face verification. This is a 1:1 matching problem. \n", 14 | "- **Face Recognition** - \"who is this person?\". For example, the video lecture showed a face recognition video (https://www.youtube.com/watch?v=wr4rx0Spihs) of Baidu employees entering the office without needing to otherwise identify themselves. This is a 1:K matching problem. \n", 15 | "\n", 16 | "FaceNet learns a neural network that encodes a face image into a vector of 128 numbers. By comparing two such vectors, you can then determine if two pictures are of the same person.\n", 17 | " \n", 18 | "**In this assignment, you will:**\n", 19 | "- Implement the triplet loss function\n", 20 | "- Use a pretrained model to map face images into 128-dimensional encodings\n", 21 | "- Use these encodings to perform face verification and face recognition\n", 22 | "\n", 23 | "In this exercise, we will be using a pre-trained model which represents ConvNet activations using a \"channels first\" convention, as opposed to the \"channels last\" convention used in lecture and previous programming assignments. In other words, a batch of images will be of shape $(m, n_C, n_H, n_W)$ instead of $(m, n_H, n_W, n_C)$. Both of these conventions have a reasonable amount of traction among open-source implementations; there isn't a uniform standard yet within the deep learning community. \n", 24 | "\n", 25 | "Let's load the required packages. \n" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 1, 31 | "metadata": {}, 32 | "outputs": [ 33 | { 34 | "name": "stderr", 35 | "output_type": "stream", 36 | "text": [ 37 | "Using TensorFlow backend.\n" 38 | ] 39 | } 40 | ], 41 | "source": [ 42 | "from keras.models import Sequential\n", 43 | "from keras.layers import Conv2D, ZeroPadding2D, Activation, Input, concatenate\n", 44 | "from keras.models import Model\n", 45 | "from keras.layers.normalization import BatchNormalization\n", 46 | "from keras.layers.pooling import MaxPooling2D, AveragePooling2D\n", 47 | "from keras.layers.merge import Concatenate\n", 48 | "from keras.layers.core import Lambda, Flatten, Dense\n", 49 | "from keras.initializers import glorot_uniform\n", 50 | "from keras.engine.topology import Layer\n", 51 | "from keras import backend as K\n", 52 | "K.set_image_data_format('channels_first')\n", 53 | "import cv2\n", 54 | "import os\n", 55 | "import numpy as np\n", 56 | "from numpy import genfromtxt\n", 57 | "import pandas as pd\n", 58 | "import tensorflow as tf\n", 59 | "from fr_utils import *\n", 60 | "from inception_blocks_v2 import *\n", 61 | "\n", 62 | "%matplotlib inline\n", 63 | "%load_ext autoreload\n", 64 | "%autoreload 2\n", 65 | "\n", 66 | "np.set_printoptions(threshold=np.nan)" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": {}, 72 | "source": [ 73 | "## 0 - Naive Face Verification\n", 74 | "\n", 75 | "In Face Verification, you're given two images and you have to tell if they are of the same person. 
The simplest way to do this is to compare the two images pixel-by-pixel. If the distance between the raw images is less than a chosen threshold, it may be the same person! \n", 76 | "\n", 77 | "
**Figure 1**
" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": { 84 | "collapsed": true 85 | }, 86 | "source": [ 87 | "Of course, this algorithm performs really poorly, since the pixel values change dramatically due to variations in lighting, orientation of the person's face, even minor changes in head position, and so on. \n", 88 | "\n", 89 | "You'll see that rather than using the raw image, you can learn an encoding $f(img)$ so that element-wise comparisons of this encoding gives more accurate judgements as to whether two pictures are of the same person." 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "## 1 - Encoding face images into a 128-dimensional vector \n", 97 | "\n", 98 | "### 1.1 - Using an ConvNet to compute encodings\n", 99 | "\n", 100 | "The FaceNet model takes a lot of data and a long time to train. So following common practice in applied deep learning settings, let's just load weights that someone else has already trained. The network architecture follows the Inception model from [Szegedy *et al.*](https://arxiv.org/abs/1409.4842). We have provided an inception network implementation. You can look in the file `inception_blocks.py` to see how it is implemented (do so by going to \"File->Open...\" at the top of the Jupyter notebook). \n" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "The key things you need to know are:\n", 108 | "\n", 109 | "- This network uses 96x96 dimensional RGB images as its input. Specifically, inputs a face image (or batch of $m$ face images) as a tensor of shape $(m, n_C, n_H, n_W) = (m, 3, 96, 96)$ \n", 110 | "- It outputs a matrix of shape $(m, 128)$ that encodes each input face image into a 128-dimensional vector\n", 111 | "\n", 112 | "Run the cell below to create the model for face images." 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 2, 118 | "metadata": { 119 | "collapsed": true 120 | }, 121 | "outputs": [], 122 | "source": [ 123 | "FRmodel = faceRecoModel(input_shape=(3, 96, 96))" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 3, 129 | "metadata": {}, 130 | "outputs": [ 131 | { 132 | "name": "stdout", 133 | "output_type": "stream", 134 | "text": [ 135 | "Total Params: 3743280\n" 136 | ] 137 | } 138 | ], 139 | "source": [ 140 | "print(\"Total Params:\", FRmodel.count_params())" 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": {}, 146 | "source": [ 147 | "** Expected Output **\n", 148 | "\n", 149 | "
\n", 150 | "Total Params: 3743280\n", 151 | "
\n", 152 | "
\n" 153 | ] 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "metadata": {}, 158 | "source": [ 159 | "By using a 128-neuron fully connected layer as its last layer, the model ensures that the output is an encoding vector of size 128. You then use the encodings the compare two face images as follows:\n", 160 | "\n", 161 | "\n", 162 | "
**Figure 2**:
By computing a distance between two encodings and thresholding, you can determine if the two pictures represent the same person
\n", 163 | "\n", 164 | "So, an encoding is a good one if: \n", 165 | "- The encodings of two images of the same person are quite similar to each other \n", 166 | "- The encodings of two images of different persons are very different\n", 167 | "\n", 168 | "The triplet loss function formalizes this, and tries to \"push\" the encodings of two images of the same person (Anchor and Positive) closer together, while \"pulling\" the encodings of two images of different persons (Anchor, Negative) further apart. \n", 169 | "\n", 170 | "\n", 171 | "
\n", 172 | "
**Figure 3**:
In the next part, we will call the pictures from left to right: Anchor (A), Positive (P), Negative (N)
" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": {}, 178 | "source": [ 179 | "\n", 180 | "\n", 181 | "### 1.2 - The Triplet Loss\n", 182 | "\n", 183 | "For an image $x$, we denote its encoding $f(x)$, where $f$ is the function computed by the neural network.\n", 184 | "\n", 185 | "\n", 186 | "\n", 187 | "\n", 190 | "\n", 191 | "Training will use triplets of images $(A, P, N)$: \n", 192 | "\n", 193 | "- A is an \"Anchor\" image--a picture of a person. \n", 194 | "- P is a \"Positive\" image--a picture of the same person as the Anchor image.\n", 195 | "- N is a \"Negative\" image--a picture of a different person than the Anchor image.\n", 196 | "\n", 197 | "These triplets are picked from our training dataset. We will write $(A^{(i)}, P^{(i)}, N^{(i)})$ to denote the $i$-th training example. \n", 198 | "\n", 199 | "You'd like to make sure that an image $A^{(i)}$ of an individual is closer to the Positive $P^{(i)}$ than to the Negative image $N^{(i)}$) by at least a margin $\\alpha$:\n", 200 | "\n", 201 | "$$\\mid \\mid f(A^{(i)}) - f(P^{(i)}) \\mid \\mid_2^2 + \\alpha < \\mid \\mid f(A^{(i)}) - f(N^{(i)}) \\mid \\mid_2^2$$\n", 202 | "\n", 203 | "You would thus like to minimize the following \"triplet cost\":\n", 204 | "\n", 205 | "$$\\mathcal{J} = \\sum^{m}_{i=1} \\large[ \\small \\underbrace{\\mid \\mid f(A^{(i)}) - f(P^{(i)}) \\mid \\mid_2^2}_\\text{(1)} - \\underbrace{\\mid \\mid f(A^{(i)}) - f(N^{(i)}) \\mid \\mid_2^2}_\\text{(2)} + \\alpha \\large ] \\small_+ \\tag{3}$$\n", 206 | "\n", 207 | "Here, we are using the notation \"$[z]_+$\" to denote $max(z,0)$. \n", 208 | "\n", 209 | "Notes:\n", 210 | "- The term (1) is the squared distance between the anchor \"A\" and the positive \"P\" for a given triplet; you want this to be small. \n", 211 | "- The term (2) is the squared distance between the anchor \"A\" and the negative \"N\" for a given triplet, you want this to be relatively large, so it thus makes sense to have a minus sign preceding it. \n", 212 | "- $\\alpha$ is called the margin. It is a hyperparameter that you should pick manually. We will use $\\alpha = 0.2$. \n", 213 | "\n", 214 | "Most implementations also normalize the encoding vectors to have norm equal one (i.e., $\\mid \\mid f(img)\\mid \\mid_2$=1); you won't have to worry about that here.\n", 215 | "\n", 216 | "**Exercise**: Implement the triplet loss as defined by formula (3). Here are the 4 steps:\n", 217 | "1. Compute the distance between the encodings of \"anchor\" and \"positive\": $\\mid \\mid f(A^{(i)}) - f(P^{(i)}) \\mid \\mid_2^2$\n", 218 | "2. Compute the distance between the encodings of \"anchor\" and \"negative\": $\\mid \\mid f(A^{(i)}) - f(N^{(i)}) \\mid \\mid_2^2$\n", 219 | "3. Compute the formula per training example: $ \\mid \\mid f(A^{(i)}) - f(P^{(i)}) \\mid - \\mid \\mid f(A^{(i)}) - f(N^{(i)}) \\mid \\mid_2^2 + \\alpha$\n", 220 | "3. 
Compute the full formula by taking the max with zero and summing over the training examples:\n", 221 | "$$\\mathcal{J} = \\sum^{m}_{i=1} \\large[ \\small \\mid \\mid f(A^{(i)}) - f(P^{(i)}) \\mid \\mid_2^2 - \\mid \\mid f(A^{(i)}) - f(N^{(i)}) \\mid \\mid_2^2+ \\alpha \\large ] \\small_+ \\tag{3}$$\n", 222 | "\n", 223 | "Useful functions: `tf.reduce_sum()`, `tf.square()`, `tf.subtract()`, `tf.add()`, `tf.maximum()`.\n", 224 | "For steps 1 and 2, you will need to sum over the entries of $\\mid \\mid f(A^{(i)}) - f(P^{(i)}) \\mid \\mid_2^2$ and $\\mid \\mid f(A^{(i)}) - f(N^{(i)}) \\mid \\mid_2^2$ while for step 4 you will need to sum over the training examples." 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": 34, 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "# GRADED FUNCTION: triplet_loss\n", 234 | "\n", 235 | "def triplet_loss(y_true, y_pred, alpha = 0.2):\n", 236 | " \"\"\"\n", 237 | " Implementation of the triplet loss as defined by formula (3)\n", 238 | " \n", 239 | " Arguments:\n", 240 | " y_true -- true labels, required when you define a loss in Keras, you don't need it in this function.\n", 241 | " y_pred -- python list containing three objects:\n", 242 | " anchor -- the encodings for the anchor images, of shape (None, 128)\n", 243 | " positive -- the encodings for the positive images, of shape (None, 128)\n", 244 | " negative -- the encodings for the negative images, of shape (None, 128)\n", 245 | " \n", 246 | " Returns:\n", 247 | " loss -- real number, value of the loss\n", 248 | " \"\"\"\n", 249 | " \n", 250 | " anchor, positive, negative = y_pred[0], y_pred[1], y_pred[2]\n", 251 | " \n", 252 | " #print(y_pred[0].get_shape())\n", 253 | " ### START CODE HERE ### (≈ 4 lines)\n", 254 | " # Step 1: Compute the (encoding) distance between the anchor and the positive, you will need to sum over axis=-1\n", 255 | " pos_dist = tf.reduce_sum(tf.square(tf.subtract(anchor,positive)), axis = -1)\n", 256 | " # Step 2: Compute the (encoding) distance between the anchor and the negative, you will need to sum over axis=-1\n", 257 | " neg_dist = tf.reduce_sum(tf.square(tf.subtract(anchor,negative)), axis = -1)\n", 258 | " # Step 3: subtract the two previous distances and add alpha.\n", 259 | " basic_loss = pos_dist - neg_dist + alpha\n", 260 | " # Step 4: Take the maximum of basic_loss and 0.0. Sum over the training examples.\n", 261 | " loss = tf.reduce_sum(tf.maximum(basic_loss,0.0))\n", 262 | " ### END CODE HERE ###\n", 263 | " \n", 264 | " return loss" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 35, 270 | "metadata": {}, 271 | "outputs": [ 272 | { 273 | "name": "stdout", 274 | "output_type": "stream", 275 | "text": [ 276 | "loss = 528.143\n" 277 | ] 278 | } 279 | ], 280 | "source": [ 281 | "with tf.Session() as test:\n", 282 | " tf.set_random_seed(1)\n", 283 | " y_true = (None, None, None)\n", 284 | " y_pred = (tf.random_normal([3, 128], mean=6, stddev=0.1, seed = 1),\n", 285 | " tf.random_normal([3, 128], mean=1, stddev=1, seed = 1),\n", 286 | " tf.random_normal([3, 128], mean=3, stddev=4, seed = 1))\n", 287 | " loss = triplet_loss(y_true, y_pred)\n", 288 | " \n", 289 | " print(\"loss = \" + str(loss.eval()))" 290 | ] 291 | }, 292 | { 293 | "cell_type": "markdown", 294 | "metadata": {}, 295 | "source": [ 296 | "**Expected Output**:\n", 297 | "\n", 298 | "\n", 299 | " \n", 300 | " \n", 303 | " \n", 306 | " \n", 307 | "\n", 308 | "
\n", 301 | " **loss**\n", 302 | " \n", 304 | " 528.143\n", 305 | "
" 309 | ] 310 | }, 311 | { 312 | "cell_type": "markdown", 313 | "metadata": {}, 314 | "source": [ 315 | "## 2 - Loading the trained model\n", 316 | "\n", 317 | "FaceNet is trained by minimizing the triplet loss. But since training requires a lot of data and a lot of computation, we won't train it from scratch here. Instead, we load a previously trained model. Load a model using the following cell; this might take a couple of minutes to run. " 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": 6, 323 | "metadata": { 324 | "collapsed": true 325 | }, 326 | "outputs": [], 327 | "source": [ 328 | "FRmodel.compile(optimizer = 'adam', loss = triplet_loss, metrics = ['accuracy'])\n", 329 | "load_weights_from_FaceNet(FRmodel)" 330 | ] 331 | }, 332 | { 333 | "cell_type": "markdown", 334 | "metadata": {}, 335 | "source": [ 336 | "Here're some examples of distances between the encodings between three individuals:\n", 337 | "\n", 338 | "\n", 339 | "
\n", 340 | "
**Figure 4**:
Example of distance outputs between three individuals' encodings
\n", 341 | "\n", 342 | "Let's now use this model to perform face verification and face recognition! " 343 | ] 344 | }, 345 | { 346 | "cell_type": "markdown", 347 | "metadata": {}, 348 | "source": [ 349 | "## 3 - Applying the model" 350 | ] 351 | }, 352 | { 353 | "cell_type": "markdown", 354 | "metadata": {}, 355 | "source": [ 356 | "Back to the Happy House! Residents are living blissfully since you implemented happiness recognition for the house in an earlier assignment. \n", 357 | "\n", 358 | "However, several issues keep coming up: The Happy House became so happy that every happy person in the neighborhood is coming to hang out in your living room. It is getting really crowded, which is having a negative impact on the residents of the house. All these random happy people are also eating all your food. \n", 359 | "\n", 360 | "So, you decide to change the door entry policy, and not just let random happy people enter anymore, even if they are happy! Instead, you'd like to build a **Face verification** system so as to only let people from a specified list come in. To get admitted, each person has to swipe an ID card (identification card) to identify themselves at the door. The face recognition system then checks that they are who they claim to be. " 361 | ] 362 | }, 363 | { 364 | "cell_type": "markdown", 365 | "metadata": {}, 366 | "source": [ 367 | "### 3.1 - Face Verification\n", 368 | "\n", 369 | "Let's build a database containing one encoding vector for each person allowed to enter the happy house. To generate the encoding we use `img_to_encoding(image_path, model)` which basically runs the forward propagation of the model on the specified image. \n", 370 | "\n", 371 | "Run the following code to build the database (represented as a python dictionary). This database maps each person's name to a 128-dimensional encoding of their face." 
372 | ] 373 | }, 374 | { 375 | "cell_type": "code", 376 | "execution_count": 7, 377 | "metadata": { 378 | "collapsed": true 379 | }, 380 | "outputs": [], 381 | "source": [ 382 | "database = {}\n", 383 | "database[\"danielle\"] = img_to_encoding(\"images/danielle.png\", FRmodel)\n", 384 | "database[\"younes\"] = img_to_encoding(\"images/younes.jpg\", FRmodel)\n", 385 | "database[\"tian\"] = img_to_encoding(\"images/tian.jpg\", FRmodel)\n", 386 | "database[\"andrew\"] = img_to_encoding(\"images/andrew.jpg\", FRmodel)\n", 387 | "database[\"kian\"] = img_to_encoding(\"images/kian.jpg\", FRmodel)\n", 388 | "database[\"dan\"] = img_to_encoding(\"images/dan.jpg\", FRmodel)\n", 389 | "database[\"sebastiano\"] = img_to_encoding(\"images/sebastiano.jpg\", FRmodel)\n", 390 | "database[\"bertrand\"] = img_to_encoding(\"images/bertrand.jpg\", FRmodel)\n", 391 | "database[\"kevin\"] = img_to_encoding(\"images/kevin.jpg\", FRmodel)\n", 392 | "database[\"felix\"] = img_to_encoding(\"images/felix.jpg\", FRmodel)\n", 393 | "database[\"benoit\"] = img_to_encoding(\"images/benoit.jpg\", FRmodel)\n", 394 | "database[\"arnaud\"] = img_to_encoding(\"images/arnaud.jpg\", FRmodel)" 395 | ] 396 | }, 397 | { 398 | "cell_type": "markdown", 399 | "metadata": {}, 400 | "source": [ 401 | "Now, when someone shows up at your front door and swipes their ID card (thus giving you their name), you can look up their encoding in the database, and use it to check if the person standing at the front door matches the name on the ID.\n", 402 | "\n", 403 | "**Exercise**: Implement the verify() function which checks if the front-door camera picture (`image_path`) is actually the person called \"identity\". You will have to go through the following steps:\n", 404 | "1. Compute the encoding of the image from image_path\n", 405 | "2. Compute the distance about this encoding and the encoding of the identity image stored in the database\n", 406 | "3. Open the door if the distance is less than 0.7, else do not open.\n", 407 | "\n", 408 | "As presented above, you should use the L2 distance (np.linalg.norm). (Note: In this implementation, compare the L2 distance, not the square of the L2 distance, to the threshold 0.7.) " 409 | ] 410 | }, 411 | { 412 | "cell_type": "code", 413 | "execution_count": 17, 414 | "metadata": { 415 | "collapsed": true 416 | }, 417 | "outputs": [], 418 | "source": [ 419 | "# GRADED FUNCTION: verify\n", 420 | "\n", 421 | "def verify(image_path, identity, database, model):\n", 422 | " \"\"\"\n", 423 | " Function that verifies if the person on the \"image_path\" image is \"identity\".\n", 424 | " \n", 425 | " Arguments:\n", 426 | " image_path -- path to an image\n", 427 | " identity -- string, name of the person you'd like to verify the identity. Has to be a resident of the Happy house.\n", 428 | " database -- python dictionary mapping names of allowed people's names (strings) to their encodings (vectors).\n", 429 | " model -- your Inception model instance in Keras\n", 430 | " \n", 431 | " Returns:\n", 432 | " dist -- distance between the image_path and the image of \"identity\" in the database.\n", 433 | " door_open -- True, if the door should open. False otherwise.\n", 434 | " \"\"\"\n", 435 | " \n", 436 | " ### START CODE HERE ###\n", 437 | " \n", 438 | " # Step 1: Compute the encoding for the image. Use img_to_encoding() see example above. 
(≈ 1 line)\n", 439 | " encoding = img_to_encoding(image_path, model)\n", 440 | " \n", 441 | " # Step 2: Compute distance with identity's image (≈ 1 line)\n", 442 | " dist = np.linalg.norm(database[identity]-encoding)\n", 443 | " \n", 444 | " # Step 3: Open the door if dist < 0.7, else don't open (≈ 3 lines)\n", 445 | " if dist < 0.7:\n", 446 | " print(\"It's \" + str(identity) + \", welcome home!\")\n", 447 | " door_open = True\n", 448 | " else:\n", 449 | " print(\"It's not \" + str(identity) + \", please go away\")\n", 450 | " door_open = False\n", 451 | " \n", 452 | " \n", 453 | " ### END CODE HERE ###\n", 454 | " \n", 455 | " return dist, door_open" 456 | ] 457 | }, 458 | { 459 | "cell_type": "markdown", 460 | "metadata": {}, 461 | "source": [ 462 | "Younes is trying to enter the Happy House and the camera takes a picture of him (\"images/camera_0.jpg\"). Let's run your verification algorithm on this picture:\n", 463 | "\n", 464 | "" 465 | ] 466 | }, 467 | { 468 | "cell_type": "code", 469 | "execution_count": 18, 470 | "metadata": {}, 471 | "outputs": [ 472 | { 473 | "name": "stdout", 474 | "output_type": "stream", 475 | "text": [ 476 | "It's younes, welcome home!\n" 477 | ] 478 | }, 479 | { 480 | "data": { 481 | "text/plain": [ 482 | "(0.65939283, True)" 483 | ] 484 | }, 485 | "execution_count": 18, 486 | "metadata": {}, 487 | "output_type": "execute_result" 488 | } 489 | ], 490 | "source": [ 491 | "verify(\"images/camera_0.jpg\", \"younes\", database, FRmodel)" 492 | ] 493 | }, 494 | { 495 | "cell_type": "markdown", 496 | "metadata": { 497 | "collapsed": true 498 | }, 499 | "source": [ 500 | "**Expected Output**:\n", 501 | "\n", 502 | "\n", 503 | " \n", 504 | " \n", 507 | " \n", 510 | " \n", 511 | "\n", 512 | "
\n", 505 | " **It's younes, welcome home!**\n", 506 | " \n", 508 | " (0.65939283, True)\n", 509 | "
" 513 | ] 514 | }, 515 | { 516 | "cell_type": "markdown", 517 | "metadata": { 518 | "collapsed": true 519 | }, 520 | "source": [ 521 | "Benoit, who broke the aquarium last weekend, has been banned from the house and removed from the database. He stole Kian's ID card and came back to the house to try to present himself as Kian. The front-door camera took a picture of Benoit (\"images/camera_2.jpg). Let's run the verification algorithm to check if benoit can enter.\n", 522 | "" 523 | ] 524 | }, 525 | { 526 | "cell_type": "code", 527 | "execution_count": 10, 528 | "metadata": {}, 529 | "outputs": [ 530 | { 531 | "name": "stdout", 532 | "output_type": "stream", 533 | "text": [ 534 | "It's not kian, please go away\n" 535 | ] 536 | }, 537 | { 538 | "data": { 539 | "text/plain": [ 540 | "(0.86224014, False)" 541 | ] 542 | }, 543 | "execution_count": 10, 544 | "metadata": {}, 545 | "output_type": "execute_result" 546 | } 547 | ], 548 | "source": [ 549 | "verify(\"images/camera_2.jpg\", \"kian\", database, FRmodel)" 550 | ] 551 | }, 552 | { 553 | "cell_type": "markdown", 554 | "metadata": {}, 555 | "source": [ 556 | "**Expected Output**:\n", 557 | "\n", 558 | "\n", 559 | " \n", 560 | " \n", 563 | " \n", 566 | " \n", 567 | "\n", 568 | "
\n", 561 | " **It's not kian, please go away**\n", 562 | " \n", 564 | " (0.86224014, False)\n", 565 | "
" 569 | ] 570 | }, 571 | { 572 | "cell_type": "markdown", 573 | "metadata": {}, 574 | "source": [ 575 | "### 3.2 - Face Recognition\n", 576 | "\n", 577 | "Your face verification system is mostly working well. But since Kian got his ID card stolen, when he came back to the house that evening he couldn't get in! \n", 578 | "\n", 579 | "To reduce such shenanigans, you'd like to change your face verification system to a face recognition system. This way, no one has to carry an ID card anymore. An authorized person can just walk up to the house, and the front door will unlock for them! \n", 580 | "\n", 581 | "You'll implement a face recognition system that takes as input an image, and figures out if it is one of the authorized persons (and if so, who). Unlike the previous face verification system, we will no longer get a person's name as another input. \n", 582 | "\n", 583 | "**Exercise**: Implement `who_is_it()`. You will have to go through the following steps:\n", 584 | "1. Compute the target encoding of the image from image_path\n", 585 | "2. Find the encoding from the database that has smallest distance with the target encoding. \n", 586 | " - Initialize the `min_dist` variable to a large enough number (100). It will help you keep track of what is the closest encoding to the input's encoding.\n", 587 | " - Loop over the database dictionary's names and encodings. To loop use `for (name, db_enc) in database.items()`.\n", 588 | " - Compute L2 distance between the target \"encoding\" and the current \"encoding\" from the database.\n", 589 | " - If this distance is less than the min_dist, then set min_dist to dist, and identity to name." 590 | ] 591 | }, 592 | { 593 | "cell_type": "code", 594 | "execution_count": 19, 595 | "metadata": { 596 | "collapsed": true 597 | }, 598 | "outputs": [], 599 | "source": [ 600 | "# GRADED FUNCTION: who_is_it\n", 601 | "\n", 602 | "def who_is_it(image_path, database, model):\n", 603 | " \"\"\"\n", 604 | " Implements face recognition for the happy house by finding who is the person on the image_path image.\n", 605 | " \n", 606 | " Arguments:\n", 607 | " image_path -- path to an image\n", 608 | " database -- database containing image encodings along with the name of the person on the image\n", 609 | " model -- your Inception model instance in Keras\n", 610 | " \n", 611 | " Returns:\n", 612 | " min_dist -- the minimum distance between image_path encoding and the encodings from the database\n", 613 | " identity -- string, the name prediction for the person on image_path\n", 614 | " \"\"\"\n", 615 | " \n", 616 | " ### START CODE HERE ### \n", 617 | " \n", 618 | " ## Step 1: Compute the target \"encoding\" for the image. Use img_to_encoding() see example above. ## (≈ 1 line)\n", 619 | " encoding = img_to_encoding(image_path, model)\n", 620 | " \n", 621 | " ## Step 2: Find the closest encoding ##\n", 622 | " \n", 623 | " # Initialize \"min_dist\" to a large value, say 100 (≈1 line)\n", 624 | " min_dist = 100\n", 625 | " \n", 626 | " # Loop over the database dictionary's names and encodings.\n", 627 | " for (name, db_enc) in database.items():\n", 628 | " \n", 629 | " # Compute L2 distance between the target \"encoding\" and the current \"emb\" from the database. (≈ 1 line)\n", 630 | " dist = np.linalg.norm(db_enc-encoding)\n", 631 | "\n", 632 | " # If this distance is less than the min_dist, then set min_dist to dist, and identity to name. 
(≈ 3 lines)\n", 633 | " if dist < min_dist:\n", 634 | " min_dist = dist\n", 635 | " identity = name\n", 636 | "\n", 637 | " ### END CODE HERE ###\n", 638 | " \n", 639 | " if min_dist > 0.7:\n", 640 | " print(\"Not in the database.\")\n", 641 | " else:\n", 642 | " print (\"it's \" + str(identity) + \", the distance is \" + str(min_dist))\n", 643 | " \n", 644 | " return min_dist, identity" 645 | ] 646 | }, 647 | { 648 | "cell_type": "markdown", 649 | "metadata": {}, 650 | "source": [ 651 | "Younes is at the front-door and the camera takes a picture of him (\"images/camera_0.jpg\"). Let's see if your who_it_is() algorithm identifies Younes. " 652 | ] 653 | }, 654 | { 655 | "cell_type": "code", 656 | "execution_count": 20, 657 | "metadata": { 658 | "scrolled": false 659 | }, 660 | "outputs": [ 661 | { 662 | "name": "stdout", 663 | "output_type": "stream", 664 | "text": [ 665 | "it's younes, the distance is 0.659393\n" 666 | ] 667 | }, 668 | { 669 | "data": { 670 | "text/plain": [ 671 | "(0.65939283, 'younes')" 672 | ] 673 | }, 674 | "execution_count": 20, 675 | "metadata": {}, 676 | "output_type": "execute_result" 677 | } 678 | ], 679 | "source": [ 680 | "who_is_it(\"images/camera_0.jpg\", database, FRmodel)" 681 | ] 682 | }, 683 | { 684 | "cell_type": "markdown", 685 | "metadata": {}, 686 | "source": [ 687 | "**Expected Output**:\n", 688 | "\n", 689 | "\n", 690 | " \n", 691 | " \n", 694 | " \n", 697 | " \n", 698 | "\n", 699 | "
\n", 692 | " **it's younes, the distance is 0.659393**\n", 693 | " \n", 695 | " (0.65939283, 'younes')\n", 696 | "
" 700 | ] 701 | }, 702 | { 703 | "cell_type": "markdown", 704 | "metadata": {}, 705 | "source": [ 706 | "You can change \"`camera_0.jpg`\" (picture of younes) to \"`camera_1.jpg`\" (picture of bertrand) and see the result." 707 | ] 708 | }, 709 | { 710 | "cell_type": "markdown", 711 | "metadata": {}, 712 | "source": [ 713 | "Your Happy House is running well. It only lets in authorized persons, and people don't need to carry an ID card around anymore! \n", 714 | "\n", 715 | "You've now seen how a state-of-the-art face recognition system works.\n", 716 | "\n", 717 | "Although we won't implement it here, here're some ways to further improve the algorithm:\n", 718 | "- Put more images of each person (under different lighting conditions, taken on different days, etc.) into the database. Then given a new image, compare the new face to multiple pictures of the person. This would increae accuracy.\n", 719 | "- Crop the images to just contain the face, and less of the \"border\" region around the face. This preprocessing removes some of the irrelevant pixels around the face, and also makes the algorithm more robust.\n" 720 | ] 721 | }, 722 | { 723 | "cell_type": "markdown", 724 | "metadata": {}, 725 | "source": [ 726 | "\n", 727 | "**What you should remember**:\n", 728 | "- Face verification solves an easier 1:1 matching problem; face recognition addresses a harder 1:K matching problem. \n", 729 | "- The triplet loss is an effective loss function for training a neural network to learn an encoding of a face image.\n", 730 | "- The same encoding can be used for verification and recognition. Measuring distances between two images' encodings allows you to determine whether they are pictures of the same person. " 731 | ] 732 | }, 733 | { 734 | "cell_type": "markdown", 735 | "metadata": {}, 736 | "source": [ 737 | "Congrats on finishing this assignment! \n" 738 | ] 739 | }, 740 | { 741 | "cell_type": "markdown", 742 | "metadata": {}, 743 | "source": [ 744 | "### References:\n", 745 | "\n", 746 | "- Florian Schroff, Dmitry Kalenichenko, James Philbin (2015). [FaceNet: A Unified Embedding for Face Recognition and Clustering](https://arxiv.org/pdf/1503.03832.pdf)\n", 747 | "- Yaniv Taigman, Ming Yang, Marc'Aurelio Ranzato, Lior Wolf (2014). 
[DeepFace: Closing the gap to human-level performance in face verification](https://research.fb.com/wp-content/uploads/2016/11/deepface-closing-the-gap-to-human-level-performance-in-face-verification.pdf) \n", 748 | "- The pretrained model we use is inspired by Victor Sy Wang's implementation and was loaded using his code: https://github.com/iwantooxxoox/Keras-OpenFace.\n", 749 | "- Our implementation also took a lot of inspiration from the official FaceNet github repository: https://github.com/davidsandberg/facenet \n" 750 | ] 751 | } 752 | ], 753 | "metadata": { 754 | "coursera": { 755 | "course_slug": "convolutional-neural-networks", 756 | "graded_item_id": "IaknP", 757 | "launcher_item_id": "5UMr4" 758 | }, 759 | "kernelspec": { 760 | "display_name": "Python 3", 761 | "language": "python", 762 | "name": "python3" 763 | }, 764 | "language_info": { 765 | "codemirror_mode": { 766 | "name": "ipython", 767 | "version": 3 768 | }, 769 | "file_extension": ".py", 770 | "mimetype": "text/x-python", 771 | "name": "python", 772 | "nbconvert_exporter": "python", 773 | "pygments_lexer": "ipython3", 774 | "version": "3.6.0" 775 | } 776 | }, 777 | "nbformat": 4, 778 | "nbformat_minor": 2 779 | } 780 | -------------------------------------------------------------------------------- /99~参考资料/DeepLearning-Specialization/Sequence Models/Week2/Operations+on+word+vectors+-+v2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Operations on word vectors\n", 8 | "\n", 9 | "Welcome to your first assignment of this week! \n", 10 | "\n", 11 | "Because word embeddings are very computionally expensive to train, most ML practitioners will load a pre-trained set of embeddings. \n", 12 | "\n", 13 | "**After this assignment you will be able to:**\n", 14 | "\n", 15 | "- Load pre-trained word vectors, and measure similarity using cosine similarity\n", 16 | "- Use word embeddings to solve word analogy problems such as Man is to Woman as King is to ______. \n", 17 | "- Modify word embeddings to reduce their gender bias \n", 18 | "\n", 19 | "Let's get started! Run the following cell to load the packages you will need." 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 1, 25 | "metadata": {}, 26 | "outputs": [ 27 | { 28 | "name": "stderr", 29 | "output_type": "stream", 30 | "text": [ 31 | "Using TensorFlow backend.\n" 32 | ] 33 | } 34 | ], 35 | "source": [ 36 | "import numpy as np\n", 37 | "from w2v_utils import *" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "Next, lets load the word vectors. For this assignment, we will use 50-dimensional GloVe vectors to represent words. Run the following cell to load the `word_to_vec_map`. " 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 2, 50 | "metadata": { 51 | "collapsed": true 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "words, word_to_vec_map = read_glove_vecs('data/glove.6B.50d.txt')" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "You've loaded:\n", 63 | "- `words`: set of words in the vocabulary.\n", 64 | "- `word_to_vec_map`: dictionary mapping words to their GloVe vector representation.\n", 65 | "\n", 66 | "You've seen that one-hot vectors do not do a good job cpaturing what words are similar. GloVe vectors provide much more useful information about the meaning of individual words. 
Lets now see how you can use GloVe vectors to decide how similar two words are. \n", 67 | "\n" 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": {}, 73 | "source": [ 74 | "# 1 - Cosine similarity\n", 75 | "\n", 76 | "To measure how similar two words are, we need a way to measure the degree of similarity between two embedding vectors for the two words. Given two vectors $u$ and $v$, cosine similarity is defined as follows: \n", 77 | "\n", 78 | "$$\\text{CosineSimilarity(u, v)} = \\frac {u . v} {||u||_2 ||v||_2} = cos(\\theta) \\tag{1}$$\n", 79 | "\n", 80 | "where $u.v$ is the dot product (or inner product) of two vectors, $||u||_2$ is the norm (or length) of the vector $u$, and $\\theta$ is the angle between $u$ and $v$. This similarity depends on the angle between $u$ and $v$. If $u$ and $v$ are very similar, their cosine similarity will be close to 1; if they are dissimilar, the cosine similarity will take a smaller value. \n", 81 | "\n", 82 | "\n", 83 | "
**Figure 1**: The cosine of the angle between two vectors is a measure of how similar they are
\n", 84 | "\n", 85 | "**Exercise**: Implement the function `cosine_similarity()` to evaluate similarity between word vectors.\n", 86 | "\n", 87 | "**Reminder**: The norm of $u$ is defined as $ ||u||_2 = \\sqrt{\\sum_{i=1}^{n} u_i^2}$" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 3, 93 | "metadata": { 94 | "collapsed": true 95 | }, 96 | "outputs": [], 97 | "source": [ 98 | "# GRADED FUNCTION: cosine_similarity\n", 99 | "\n", 100 | "def cosine_similarity(u, v):\n", 101 | " \"\"\"\n", 102 | " Cosine similarity reflects the degree of similariy between u and v\n", 103 | " \n", 104 | " Arguments:\n", 105 | " u -- a word vector of shape (n,) \n", 106 | " v -- a word vector of shape (n,)\n", 107 | "\n", 108 | " Returns:\n", 109 | " cosine_similarity -- the cosine similarity between u and v defined by the formula above.\n", 110 | " \"\"\"\n", 111 | " \n", 112 | " distance = 0.0\n", 113 | " \n", 114 | " ### START CODE HERE ###\n", 115 | " # Compute the dot product between u and v (≈1 line)\n", 116 | " dot = np.dot(u,v)\n", 117 | " # Compute the L2 norm of u (≈1 line)\n", 118 | " norm_u = np.sqrt(np.sum(u*u))\n", 119 | " \n", 120 | " # Compute the L2 norm of v (≈1 line)\n", 121 | " norm_v = np.sqrt(np.sum(v*v))\n", 122 | " # Compute the cosine similarity defined by formula (1) (≈1 line)\n", 123 | " cosine_similarity = dot/(norm_u*norm_v)\n", 124 | " ### END CODE HERE ###\n", 125 | " \n", 126 | " return cosine_similarity" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 4, 132 | "metadata": {}, 133 | "outputs": [ 134 | { 135 | "name": "stdout", 136 | "output_type": "stream", 137 | "text": [ 138 | "cosine_similarity(father, mother) = 0.890903844289\n", 139 | "cosine_similarity(ball, crocodile) = 0.274392462614\n", 140 | "cosine_similarity(france - paris, rome - italy) = -0.675147930817\n" 141 | ] 142 | } 143 | ], 144 | "source": [ 145 | "father = word_to_vec_map[\"father\"]\n", 146 | "mother = word_to_vec_map[\"mother\"]\n", 147 | "ball = word_to_vec_map[\"ball\"]\n", 148 | "crocodile = word_to_vec_map[\"crocodile\"]\n", 149 | "france = word_to_vec_map[\"france\"]\n", 150 | "italy = word_to_vec_map[\"italy\"]\n", 151 | "paris = word_to_vec_map[\"paris\"]\n", 152 | "rome = word_to_vec_map[\"rome\"]\n", 153 | "\n", 154 | "print(\"cosine_similarity(father, mother) = \", cosine_similarity(father, mother))\n", 155 | "print(\"cosine_similarity(ball, crocodile) = \",cosine_similarity(ball, crocodile))\n", 156 | "print(\"cosine_similarity(france - paris, rome - italy) = \",cosine_similarity(france - paris, rome - italy))" 157 | ] 158 | }, 159 | { 160 | "cell_type": "markdown", 161 | "metadata": {}, 162 | "source": [ 163 | "**Expected Output**:\n", 164 | "\n", 165 | "\n", 166 | " \n", 167 | " \n", 170 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 178 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 186 | " \n", 189 | " \n", 190 | "
\n", 168 | " **cosine_similarity(father, mother)** =\n", 169 | " \n", 171 | " 0.890903844289\n", 172 | "
\n", 176 | " **cosine_similarity(ball, crocodile)** =\n", 177 | " \n", 179 | " 0.274392462614\n", 180 | "
\n", 184 | " **cosine_similarity(france - paris, rome - italy)** =\n", 185 | " \n", 187 | " -0.675147930817\n", 188 | "
" 191 | ] 192 | }, 193 | { 194 | "cell_type": "markdown", 195 | "metadata": {}, 196 | "source": [ 197 | "After you get the correct expected output, please feel free to modify the inputs and measure the cosine similarity between other pairs of words! Playing around the cosine similarity of other inputs will give you a better sense of how word vectors behave. " 198 | ] 199 | }, 200 | { 201 | "cell_type": "markdown", 202 | "metadata": {}, 203 | "source": [ 204 | "## 2 - Word analogy task\n", 205 | "\n", 206 | "In the word analogy task, we complete the sentence \"*a* is to *b* as *c* is to **____**\". An example is '*man* is to *woman* as *king* is to *queen*' . In detail, we are trying to find a word *d*, such that the associated word vectors $e_a, e_b, e_c, e_d$ are related in the following manner: $e_b - e_a \\approx e_d - e_c$. We will measure the similarity between $e_b - e_a$ and $e_d - e_c$ using cosine similarity. \n", 207 | "\n", 208 | "**Exercise**: Complete the code below to be able to perform word analogies!" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": 7, 214 | "metadata": { 215 | "collapsed": true 216 | }, 217 | "outputs": [], 218 | "source": [ 219 | "# GRADED FUNCTION: complete_analogy\n", 220 | "\n", 221 | "def complete_analogy(word_a, word_b, word_c, word_to_vec_map):\n", 222 | " \"\"\"\n", 223 | " Performs the word analogy task as explained above: a is to b as c is to ____. \n", 224 | " \n", 225 | " Arguments:\n", 226 | " word_a -- a word, string\n", 227 | " word_b -- a word, string\n", 228 | " word_c -- a word, string\n", 229 | " word_to_vec_map -- dictionary that maps words to their corresponding vectors. \n", 230 | " \n", 231 | " Returns:\n", 232 | " best_word -- the word such that v_b - v_a is close to v_best_word - v_c, as measured by cosine similarity\n", 233 | " \"\"\"\n", 234 | " \n", 235 | " # convert words to lower case\n", 236 | " word_a, word_b, word_c = word_a.lower(), word_b.lower(), word_c.lower()\n", 237 | " \n", 238 | " ### START CODE HERE ###\n", 239 | " # Get the word embeddings v_a, v_b and v_c (≈1-3 lines)\n", 240 | " e_a, e_b, e_c = word_to_vec_map[word_a], word_to_vec_map[word_b], word_to_vec_map[word_c]\n", 241 | " ### END CODE HERE ###\n", 242 | " \n", 243 | " words = word_to_vec_map.keys()\n", 244 | " max_cosine_sim = -100 # Initialize max_cosine_sim to a large negative number\n", 245 | " best_word = None # Initialize best_word with None, it will help keep track of the word to output\n", 246 | "\n", 247 | " # loop over the whole word vector set\n", 248 | " for w in words: \n", 249 | " # to avoid best_word being one of the input words, pass on them.\n", 250 | " if w in [word_a, word_b, word_c] :\n", 251 | " continue\n", 252 | " \n", 253 | " ### START CODE HERE ###\n", 254 | " # Compute cosine similarity between the vector (e_b - e_a) and the vector ((w's vector representation) - e_c) (≈1 line)\n", 255 | " cosine_sim = cosine_similarity(e_b - e_a, word_to_vec_map[w] - e_c)\n", 256 | " \n", 257 | " # If the cosine_sim is more than the max_cosine_sim seen so far,\n", 258 | " # then: set the new max_cosine_sim to the current cosine_sim and the best_word to the current word (≈3 lines)\n", 259 | " if cosine_sim > max_cosine_sim:\n", 260 | " max_cosine_sim = cosine_sim\n", 261 | " best_word = w\n", 262 | " ### END CODE HERE ###\n", 263 | " \n", 264 | " return best_word" 265 | ] 266 | }, 267 | { 268 | "cell_type": "markdown", 269 | "metadata": {}, 270 | "source": [ 271 | "Run the cell below to test your code, this may take 1-2 
minutes." 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": 8, 277 | "metadata": {}, 278 | "outputs": [ 279 | { 280 | "name": "stdout", 281 | "output_type": "stream", 282 | "text": [ 283 | "italy -> italian :: spain -> spanish\n", 284 | "india -> delhi :: japan -> tokyo\n", 285 | "man -> woman :: boy -> girl\n", 286 | "small -> smaller :: large -> larger\n" 287 | ] 288 | } 289 | ], 290 | "source": [ 291 | "triads_to_try = [('italy', 'italian', 'spain'), ('india', 'delhi', 'japan'), ('man', 'woman', 'boy'), ('small', 'smaller', 'large')]\n", 292 | "for triad in triads_to_try:\n", 293 | " print ('{} -> {} :: {} -> {}'.format( *triad, complete_analogy(*triad,word_to_vec_map)))" 294 | ] 295 | }, 296 | { 297 | "cell_type": "markdown", 298 | "metadata": {}, 299 | "source": [ 300 | "**Expected Output**:\n", 301 | "\n", 302 | "\n", 303 | " \n", 304 | " \n", 307 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 315 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 323 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 331 | " \n", 334 | " \n", 335 | "
\n", 305 | " **italy -> italian** ::\n", 306 | " \n", 308 | " spain -> spanish\n", 309 | "
\n", 313 | " **india -> delhi** ::\n", 314 | " \n", 316 | " japan -> tokyo\n", 317 | "
\n", 321 | " **man -> woman ** ::\n", 322 | " \n", 324 | " boy -> girl\n", 325 | "
\n", 329 | " **small -> smaller ** ::\n", 330 | " \n", 332 | " large -> larger\n", 333 | "
" 336 | ] 337 | }, 338 | { 339 | "cell_type": "markdown", 340 | "metadata": {}, 341 | "source": [ 342 | "Once you get the correct expected output, please feel free to modify the input cells above to test your own analogies. Try to find some other analogy pairs that do work, but also find some where the algorithm doesn't give the right answer: For example, you can try small->smaller as big->?. " 343 | ] 344 | }, 345 | { 346 | "cell_type": "markdown", 347 | "metadata": {}, 348 | "source": [ 349 | "### Congratulations!\n", 350 | "\n", 351 | "You've come to the end of this assignment. Here are the main points you should remember:\n", 352 | "\n", 353 | "- Cosine similarity a good way to compare similarity between pairs of word vectors. (Though L2 distance works too.) \n", 354 | "- For NLP applications, using a pre-trained set of word vectors from the internet is often a good way to get started. \n", 355 | "\n", 356 | "Even though you have finished the graded portions, we recommend you take a look too at the rest of this notebook. \n", 357 | "\n", 358 | "Congratulations on finishing the graded portions of this notebook! \n" 359 | ] 360 | }, 361 | { 362 | "cell_type": "markdown", 363 | "metadata": {}, 364 | "source": [ 365 | "## 3 - Debiasing word vectors (OPTIONAL/UNGRADED) " 366 | ] 367 | }, 368 | { 369 | "cell_type": "markdown", 370 | "metadata": {}, 371 | "source": [ 372 | "In the following exercise, you will examine gender biases that can be reflected in a word embedding, and explore algorithms for reducing the bias. In addition to learning about the topic of debiasing, this exercise will also help hone your intuition about what word vectors are doing. This section involves a bit of linear algebra, though you can probably complete it even without being expert in linear algebra, and we encourage you to give it a shot. This portion of the notebook is optional and is not graded. \n", 373 | "\n", 374 | "Lets first see how the GloVe word embeddings relate to gender. You will first compute a vector $g = e_{woman}-e_{man}$, where $e_{woman}$ represents the word vector corresponding to the word *woman*, and $e_{man}$ corresponds to the word vector corresponding to the word *man*. The resulting vector $g$ roughly encodes the concept of \"gender\". (You might get a more accurate representation if you compute $g_1 = e_{mother}-e_{father}$, $g_2 = e_{girl}-e_{boy}$, etc. and average over them. But just using $e_{woman}-e_{man}$ will give good enough results for now.) \n" 375 | ] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "execution_count": 9, 380 | "metadata": {}, 381 | "outputs": [ 382 | { 383 | "name": "stdout", 384 | "output_type": "stream", 385 | "text": [ 386 | "[-0.087144 0.2182 -0.40986 -0.03922 -0.1032 0.94165\n", 387 | " -0.06042 0.32988 0.46144 -0.35962 0.31102 -0.86824\n", 388 | " 0.96006 0.01073 0.24337 0.08193 -1.02722 -0.21122\n", 389 | " 0.695044 -0.00222 0.29106 0.5053 -0.099454 0.40445\n", 390 | " 0.30181 0.1355 -0.0606 -0.07131 -0.19245 -0.06115\n", 391 | " -0.3204 0.07165 -0.13337 -0.25068714 -0.14293 -0.224957\n", 392 | " -0.149 0.048882 0.12191 -0.27362 -0.165476 -0.20426\n", 393 | " 0.54376 -0.271425 -0.10245 -0.32108 0.2516 -0.33455\n", 394 | " -0.04371 0.01258 ]\n" 395 | ] 396 | } 397 | ], 398 | "source": [ 399 | "g = word_to_vec_map['woman'] - word_to_vec_map['man']\n", 400 | "print(g)" 401 | ] 402 | }, 403 | { 404 | "cell_type": "markdown", 405 | "metadata": {}, 406 | "source": [ 407 | "Now, you will consider the cosine similarity of different words with $g$. 
Consider what a positive value of similarity means vs a negative cosine similarity. " 408 | ] 409 | }, 410 | { 411 | "cell_type": "code", 412 | "execution_count": 10, 413 | "metadata": { 414 | "scrolled": false 415 | }, 416 | "outputs": [ 417 | { 418 | "name": "stdout", 419 | "output_type": "stream", 420 | "text": [ 421 | "List of names and their similarities with constructed vector:\n", 422 | "john -0.23163356146\n", 423 | "marie 0.315597935396\n", 424 | "sophie 0.318687898594\n", 425 | "ronaldo -0.312447968503\n", 426 | "priya 0.17632041839\n", 427 | "rahul -0.169154710392\n", 428 | "danielle 0.243932992163\n", 429 | "reza -0.079304296722\n", 430 | "katy 0.283106865957\n", 431 | "yasmin 0.233138577679\n" 432 | ] 433 | } 434 | ], 435 | "source": [ 436 | "print ('List of names and their similarities with constructed vector:')\n", 437 | "\n", 438 | "# girls and boys name\n", 439 | "name_list = ['john', 'marie', 'sophie', 'ronaldo', 'priya', 'rahul', 'danielle', 'reza', 'katy', 'yasmin']\n", 440 | "\n", 441 | "for w in name_list:\n", 442 | " print (w, cosine_similarity(word_to_vec_map[w], g))" 443 | ] 444 | }, 445 | { 446 | "cell_type": "markdown", 447 | "metadata": {}, 448 | "source": [ 449 | "As you can see, female first names tend to have a positive cosine similarity with our constructed vector $g$, while male first names tend to have a negative cosine similarity. This is not suprising, and the result seems acceptable. \n", 450 | "\n", 451 | "But let's try with some other words." 452 | ] 453 | }, 454 | { 455 | "cell_type": "code", 456 | "execution_count": 11, 457 | "metadata": {}, 458 | "outputs": [ 459 | { 460 | "name": "stdout", 461 | "output_type": "stream", 462 | "text": [ 463 | "Other words and their similarities:\n", 464 | "lipstick 0.276919162564\n", 465 | "guns -0.18884855679\n", 466 | "science -0.0608290654093\n", 467 | "arts 0.00818931238588\n", 468 | "literature 0.0647250443346\n", 469 | "warrior -0.209201646411\n", 470 | "doctor 0.118952894109\n", 471 | "tree -0.0708939917548\n", 472 | "receptionist 0.330779417506\n", 473 | "technology -0.131937324476\n", 474 | "fashion 0.0356389462577\n", 475 | "teacher 0.179209234318\n", 476 | "engineer -0.0803928049452\n", 477 | "pilot 0.00107644989919\n", 478 | "computer -0.103303588739\n", 479 | "singer 0.185005181365\n" 480 | ] 481 | } 482 | ], 483 | "source": [ 484 | "print('Other words and their similarities:')\n", 485 | "word_list = ['lipstick', 'guns', 'science', 'arts', 'literature', 'warrior','doctor', 'tree', 'receptionist', \n", 486 | " 'technology', 'fashion', 'teacher', 'engineer', 'pilot', 'computer', 'singer']\n", 487 | "for w in word_list:\n", 488 | " print (w, cosine_similarity(word_to_vec_map[w], g))" 489 | ] 490 | }, 491 | { 492 | "cell_type": "markdown", 493 | "metadata": {}, 494 | "source": [ 495 | "Do you notice anything surprising? It is astonishing how these results reflect certain unhealthy gender stereotypes. For example, \"computer\" is closer to \"man\" while \"literature\" is closer to \"woman\". Ouch! \n", 496 | "\n", 497 | "We'll see below how to reduce the bias of these vectors, using an algorithm due to [Boliukbasi et al., 2016](https://arxiv.org/abs/1607.06520). Note that some word pairs such as \"actor\"/\"actress\" or \"grandmother\"/\"grandfather\" should remain gender specific, while other words such as \"receptionist\" or \"technology\" should be neutralized, i.e. not be gender-related. 
You will have to treat these two types of words differently when debiasing.\n", 498 | "\n", 499 | "### 3.1 - Neutralize bias for non-gender specific words \n", 500 | "\n", 501 | "The figure below should help you visualize what neutralizing does. If you're using a 50-dimensional word embedding, the 50 dimensional space can be split into two parts: The bias-direction $g$, and the remaining 49 dimensions, which we'll call $g_{\\perp}$. In linear algebra, we say that the 49 dimensional $g_{\\perp}$ is perpendicular (or \"orthogonal\") to $g$, meaning it is at 90 degrees to $g$. The neutralization step takes a vector such as $e_{receptionist}$ and zeros out the component in the direction of $g$, giving us $e_{receptionist}^{debiased}$. \n", 502 | "\n", 503 | "Even though $g_{\\perp}$ is 49 dimensional, given the limitations of what we can draw on a screen, we illustrate it using a 1 dimensional axis below. \n", 504 | "\n", 505 | "\n", 506 | "
**Figure 2**: The word vector for \"receptionist\" represented before and after applying the neutralize operation.
\n", 507 | "\n", 508 | "**Exercise**: Implement `neutralize()` to remove the bias of words such as \"receptionist\" or \"scientist\". Given an input embedding $e$, you can use the following formulas to compute $e^{debiased}$: \n", 509 | "\n", 510 | "$$e^{bias\\_component} = \\frac{e \\cdot g}{||g||_2^2} * g\\tag{2}$$\n", 511 | "$$e^{debiased} = e - e^{bias\\_component}\\tag{3}$$\n", 512 | "\n", 513 | "If you are an expert in linear algebra, you may recognize $e^{bias\\_component}$ as the projection of $e$ onto the direction $g$. If you're not an expert in linear algebra, don't worry about this.\n", 514 | "\n", 515 | " " 520 | ] 521 | }, 522 | { 523 | "cell_type": "code", 524 | "execution_count": 24, 525 | "metadata": { 526 | "collapsed": true 527 | }, 528 | "outputs": [], 529 | "source": [ 530 | "def neutralize(word, g, word_to_vec_map):\n", 531 | " \"\"\"\n", 532 | " Removes the bias of \"word\" by projecting it on the space orthogonal to the bias axis. \n", 533 | " This function ensures that gender neutral words are zero in the gender subspace.\n", 534 | " \n", 535 | " Arguments:\n", 536 | " word -- string indicating the word to debias\n", 537 | " g -- numpy-array of shape (50,), corresponding to the bias axis (such as gender)\n", 538 | " word_to_vec_map -- dictionary mapping words to their corresponding vectors.\n", 539 | " \n", 540 | " Returns:\n", 541 | " e_debiased -- neutralized word vector representation of the input \"word\"\n", 542 | " \"\"\"\n", 543 | " \n", 544 | " ### START CODE HERE ###\n", 545 | " # Select word vector representation of \"word\". Use word_to_vec_map. (≈ 1 line)\n", 546 | " e = word_to_vec_map[word]\n", 547 | " # print(np.dot(g,g)*g)\n", 548 | " # Compute e_biascomponent using the formula give above. (≈ 1 line)\n", 549 | " e_biascomponent = (np.dot(e,g)/np.dot(g,g))*g\n", 550 | " \n", 551 | " # Neutralize e by substracting e_biascomponent from it \n", 552 | " # e_debiased should be equal to its orthogonal projection. (≈ 1 line)\n", 553 | " e_debiased = e - e_biascomponent\n", 554 | " ### END CODE HERE ###\n", 555 | " \n", 556 | " return e_debiased" 557 | ] 558 | }, 559 | { 560 | "cell_type": "code", 561 | "execution_count": 25, 562 | "metadata": {}, 563 | "outputs": [ 564 | { 565 | "name": "stdout", 566 | "output_type": "stream", 567 | "text": [ 568 | "cosine similarity between receptionist and g, before neutralizing: 0.330779417506\n", 569 | "cosine similarity between receptionist and g, after neutralizing: -5.60374039375e-17\n" 570 | ] 571 | } 572 | ], 573 | "source": [ 574 | "e = \"receptionist\"\n", 575 | "print(\"cosine similarity between \" + e + \" and g, before neutralizing: \", cosine_similarity(word_to_vec_map[\"receptionist\"], g))\n", 576 | "\n", 577 | "e_debiased = neutralize(\"receptionist\", g, word_to_vec_map)\n", 578 | "print(\"cosine similarity between \" + e + \" and g, after neutralizing: \", cosine_similarity(e_debiased, g))" 579 | ] 580 | }, 581 | { 582 | "cell_type": "markdown", 583 | "metadata": {}, 584 | "source": [ 585 | "**Expected Output**: The second result is essentially 0, up to numerical roundof (on the order of $10^{-17}$).\n", 586 | "\n", 587 | "\n", 588 | "\n", 589 | " \n", 590 | " \n", 593 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 601 | " \n", 604 | "
\n", 591 | " **cosine similarity between receptionist and g, before neutralizing:** :\n", 592 | " \n", 594 | " 0.330779417506\n", 595 | "
\n", 599 | " **cosine similarity between receptionist and g, after neutralizing:** :\n", 600 | " \n", 602 | " -3.26732746085e-17\n", 603 | "
" 605 | ] 606 | }, 607 | { 608 | "cell_type": "markdown", 609 | "metadata": {}, 610 | "source": [ 611 | "### 3.2 - Equalization algorithm for gender-specific words\n", 612 | "\n", 613 | "Next, lets see how debiasing can also be applied to word pairs such as \"actress\" and \"actor.\" Equalization is applied to pairs of words that you might want to have differ only through the gender property. As a concrete example, suppose that \"actress\" is closer to \"babysit\" than \"actor.\" By applying neutralizing to \"babysit\" we can reduce the gender-stereotype associated with babysitting. But this still does not guarantee that \"actor\" and \"actress\" are equidistant from \"babysit.\" The equalization algorithm takes care of this. \n", 614 | "\n", 615 | "The key idea behind equalization is to make sure that a particular pair of words are equi-distant from the 49-dimensional $g_\\perp$. The equalization step also ensures that the two equalized steps are now the same distance from $e_{receptionist}^{debiased}$, or from any other work that has been neutralized. In pictures, this is how equalization works: \n", 616 | "\n", 617 | "\n", 618 | "\n", 619 | "\n", 620 | "The derivation of the linear algebra to do this is a bit more complex. (See Bolukbasi et al., 2016 for details.) But the key equations are: \n", 621 | "\n", 622 | "$$ \\mu = \\frac{e_{w1} + e_{w2}}{2}\\tag{4}$$ \n", 623 | "\n", 624 | "$$ \\mu_{B} = \\frac {\\mu \\cdot \\text{bias_axis}}{||\\text{bias_axis}||_2^2} *\\text{bias_axis}\n", 625 | "\\tag{5}$$ \n", 626 | "\n", 627 | "$$\\mu_{\\perp} = \\mu - \\mu_{B} \\tag{6}$$\n", 628 | "\n", 629 | "$$ e_{w1B} = \\frac {e_{w1} \\cdot \\text{bias_axis}}{||\\text{bias_axis}||_2^2} *\\text{bias_axis}\n", 630 | "\\tag{7}$$ \n", 631 | "$$ e_{w2B} = \\frac {e_{w2} \\cdot \\text{bias_axis}}{||\\text{bias_axis}||_2^2} *\\text{bias_axis}\n", 632 | "\\tag{8}$$\n", 633 | "\n", 634 | "\n", 635 | "$$e_{w1B}^{corrected} = \\sqrt{ |{1 - ||\\mu_{\\perp} ||^2_2} |} * \\frac{e_{\\text{w1B}} - \\mu_B} {|(e_{w1} - \\mu_{\\perp}) - \\mu_B)|} \\tag{9}$$\n", 636 | "\n", 637 | "\n", 638 | "$$e_{w2B}^{corrected} = \\sqrt{ |{1 - ||\\mu_{\\perp} ||^2_2} |} * \\frac{e_{\\text{w2B}} - \\mu_B} {|(e_{w2} - \\mu_{\\perp}) - \\mu_B)|} \\tag{10}$$\n", 639 | "\n", 640 | "$$e_1 = e_{w1B}^{corrected} + \\mu_{\\perp} \\tag{11}$$\n", 641 | "$$e_2 = e_{w2B}^{corrected} + \\mu_{\\perp} \\tag{12}$$\n", 642 | "\n", 643 | "\n", 644 | "**Exercise**: Implement the function below. Use the equations above to get the final equalized version of the pair of words. Good luck!" 645 | ] 646 | }, 647 | { 648 | "cell_type": "code", 649 | "execution_count": null, 650 | "metadata": { 651 | "collapsed": true 652 | }, 653 | "outputs": [], 654 | "source": [ 655 | "def equalize(pair, bias_axis, word_to_vec_map):\n", 656 | " \"\"\"\n", 657 | " Debias gender specific words by following the equalize method described in the figure above.\n", 658 | " \n", 659 | " Arguments:\n", 660 | " pair -- pair of strings of gender specific words to debias, e.g. (\"actress\", \"actor\") \n", 661 | " bias_axis -- numpy-array of shape (50,), vector corresponding to the bias axis, e.g. gender\n", 662 | " word_to_vec_map -- dictionary mapping words to their corresponding vectors\n", 663 | " \n", 664 | " Returns\n", 665 | " e_1 -- word vector corresponding to the first word\n", 666 | " e_2 -- word vector corresponding to the second word\n", 667 | " \"\"\"\n", 668 | " \n", 669 | " ### START CODE HERE ###\n", 670 | " # Step 1: Select word vector representation of \"word\". 
Use word_to_vec_map. (≈ 2 lines)\n", 671 | " w1, w2 = None\n", 672 | " e_w1, e_w2 = None\n", 673 | " \n", 674 | " # Step 2: Compute the mean of e_w1 and e_w2 (≈ 1 line)\n", 675 | " mu = None\n", 676 | "\n", 677 | " # Step 3: Compute the projections of mu over the bias axis and the orthogonal axis (≈ 2 lines)\n", 678 | " mu_B = None\n", 679 | " mu_orth = None\n", 680 | "\n", 681 | " # Step 4: Use equations (7) and (8) to compute e_w1B and e_w2B (≈2 lines)\n", 682 | " e_w1B = None\n", 683 | " e_w2B = None\n", 684 | " \n", 685 | " # Step 5: Adjust the Bias part of e_w1B and e_w2B using the formulas (9) and (10) given above (≈2 lines)\n", 686 | " corrected_e_w1B = None\n", 687 | " corrected_e_w2B = None\n", 688 | "\n", 689 | " # Step 6: Debias by equalizing e1 and e2 to the sum of their corrected projections (≈2 lines)\n", 690 | " e1 = None\n", 691 | " e2 = None\n", 692 | " \n", 693 | " ### END CODE HERE ###\n", 694 | " \n", 695 | " return e1, e2" 696 | ] 697 | }, 698 | { 699 | "cell_type": "code", 700 | "execution_count": null, 701 | "metadata": { 702 | "collapsed": true, 703 | "scrolled": true 704 | }, 705 | "outputs": [], 706 | "source": [ 707 | "print(\"cosine similarities before equalizing:\")\n", 708 | "print(\"cosine_similarity(word_to_vec_map[\\\"man\\\"], gender) = \", cosine_similarity(word_to_vec_map[\"man\"], g))\n", 709 | "print(\"cosine_similarity(word_to_vec_map[\\\"woman\\\"], gender) = \", cosine_similarity(word_to_vec_map[\"woman\"], g))\n", 710 | "print()\n", 711 | "e1, e2 = equalize((\"man\", \"woman\"), g, word_to_vec_map)\n", 712 | "print(\"cosine similarities after equalizing:\")\n", 713 | "print(\"cosine_similarity(e1, gender) = \", cosine_similarity(e1, g))\n", 714 | "print(\"cosine_similarity(e2, gender) = \", cosine_similarity(e2, g))" 715 | ] 716 | }, 717 | { 718 | "cell_type": "markdown", 719 | "metadata": {}, 720 | "source": [ 721 | "**Expected Output**:\n", 722 | "\n", 723 | "cosine similarities before equalizing:\n", 724 | "\n", 725 | " \n", 726 | " \n", 729 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 737 | " \n", 740 | " \n", 741 | "
\n", 727 | " **cosine_similarity(word_to_vec_map[\"man\"], gender)** =\n", 728 | " \n", 730 | " -0.117110957653\n", 731 | "
\n", 735 | " **cosine_similarity(word_to_vec_map[\"woman\"], gender)** =\n", 736 | " \n", 738 | " 0.356666188463\n", 739 | "
\n", 742 | "\n", 743 | "cosine similarities after equalizing:\n", 744 | "\n", 745 | " \n", 746 | " \n", 749 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 757 | " \n", 760 | " \n", 761 | "
\n", 747 | " **cosine_similarity(u1, gender)** =\n", 748 | " \n", 750 | " -0.700436428931\n", 751 | "
\n", 755 | " **cosine_similarity(u2, gender)** =\n", 756 | " \n", 758 | " 0.700436428931\n", 759 | "
" 762 | ] 763 | }, 764 | { 765 | "cell_type": "markdown", 766 | "metadata": { 767 | "collapsed": true 768 | }, 769 | "source": [ 770 | "Please feel free to play with the input words in the cell above, to apply equalization to other pairs of words. \n", 771 | "\n", 772 | "These debiasing algorithms are very helpful for reducing bias, but are not perfect and do not eliminate all traces of bias. For example, one weakness of this implementation was that the bias direction $g$ was defined using only the pair of words _woman_ and _man_. As discussed earlier, if $g$ were defined by computing $g_1 = e_{woman} - e_{man}$; $g_2 = e_{mother} - e_{father}$; $g_3 = e_{girl} - e_{boy}$; and so on and averaging over them, you would obtain a better estimate of the \"gender\" dimension in the 50 dimensional word embedding space. Feel free to play with such variants as well. \n", 773 | " " 774 | ] 775 | }, 776 | { 777 | "cell_type": "markdown", 778 | "metadata": {}, 779 | "source": [ 780 | "### Congratulations\n", 781 | "\n", 782 | "You have come to the end of this notebook, and have seen a lot of the ways that word vectors can be used as well as modified. \n", 783 | "\n", 784 | "Congratulations on finishing this notebook! \n" 785 | ] 786 | }, 787 | { 788 | "cell_type": "markdown", 789 | "metadata": {}, 790 | "source": [ 791 | "**References**:\n", 792 | "- The debiasing algorithm is from Bolukbasi et al., 2016, [Man is to Computer Programmer as Woman is to\n", 793 | "Homemaker? Debiasing Word Embeddings](https://papers.nips.cc/paper/6228-man-is-to-computer-programmer-as-woman-is-to-homemaker-debiasing-word-embeddings.pdf)\n", 794 | "- The GloVe word embeddings were due to Jeffrey Pennington, Richard Socher, and Christopher D. Manning. (https://nlp.stanford.edu/projects/glove/)\n" 795 | ] 796 | } 797 | ], 798 | "metadata": { 799 | "coursera": { 800 | "course_slug": "nlp-sequence-models", 801 | "graded_item_id": "8hb5s", 802 | "launcher_item_id": "5NrJ6" 803 | }, 804 | "kernelspec": { 805 | "display_name": "Python 3", 806 | "language": "python", 807 | "name": "python3" 808 | }, 809 | "language_info": { 810 | "codemirror_mode": { 811 | "name": "ipython", 812 | "version": 3 813 | }, 814 | "file_extension": ".py", 815 | "mimetype": "text/x-python", 816 | "name": "python", 817 | "nbconvert_exporter": "python", 818 | "pygments_lexer": "ipython3", 819 | "version": "3.6.0" 820 | } 821 | }, 822 | "nbformat": 4, 823 | "nbformat_minor": 2 824 | } 825 | -------------------------------------------------------------------------------- /99~参考资料/DeepLearning-Specialization/NeuralNetworks-And-DeepLearning/Week2/Python+Basics+With+Numpy+v3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Python Basics with Numpy (optional assignment)\n", 8 | "\n", 9 | "Welcome to your first assignment. This exercise gives you a brief introduction to Python. Even if you've used Python before, this will help familiarize you with functions we'll need. \n", 10 | "\n", 11 | "**Instructions:**\n", 12 | "- You will be using Python 3.\n", 13 | "- Avoid using for-loops and while-loops, unless you are explicitly told to do so.\n", 14 | "- Do not modify the (# GRADED FUNCTION [function name]) comment in some cells. Your work would not be graded if you change this. 
Each cell containing that comment should only contain one function.\n", 15 | "- After coding your function, run the cell right below it to check if your result is correct.\n", 16 | "\n", 17 | "**After this assignment you will:**\n", 18 | "- Be able to use iPython Notebooks\n", 19 | "- Be able to use numpy functions and numpy matrix/vector operations\n", 20 | "- Understand the concept of \"broadcasting\"\n", 21 | "- Be able to vectorize code\n", 22 | "\n", 23 | "Let's get started!" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "## About iPython Notebooks ##\n", 31 | "\n", 32 | "iPython Notebooks are interactive coding environments embedded in a webpage. You will be using iPython notebooks in this class. You only need to write code between the ### START CODE HERE ### and ### END CODE HERE ### comments. After writing your code, you can run the cell by either pressing \"SHIFT\"+\"ENTER\" or by clicking on \"Run Cell\" (denoted by a play symbol) in the upper bar of the notebook. \n", 33 | "\n", 34 | "We will often specify \"(≈ X lines of code)\" in the comments to tell you about how much code you need to write. It is just a rough estimate, so don't feel bad if your code is longer or shorter.\n", 35 | "\n", 36 | "**Exercise**: Set test to `\"Hello World\"` in the cell below to print \"Hello World\" and run the two cells below." 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 1, 42 | "metadata": { 43 | "collapsed": false 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "### START CODE HERE ### (≈ 1 line of code)\n", 48 | "test = \"Hello World\"\n", 49 | "### END CODE HERE ###" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 2, 55 | "metadata": { 56 | "collapsed": false 57 | }, 58 | "outputs": [ 59 | { 60 | "name": "stdout", 61 | "output_type": "stream", 62 | "text": [ 63 | "test: Hello World\n" 64 | ] 65 | } 66 | ], 67 | "source": [ 68 | "print (\"test: \" + test)" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "**Expected output**:\n", 76 | "test: Hello World" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "\n", 84 | "**What you need to remember**:\n", 85 | "- Run your cells using SHIFT+ENTER (or \"Run cell\")\n", 86 | "- Write code in the designated areas using Python 3 only\n", 87 | "- Do not modify the code outside of the designated areas" 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": {}, 93 | "source": [ 94 | "## 1 - Building basic functions with numpy ##\n", 95 | "\n", 96 | "Numpy is the main package for scientific computing in Python. It is maintained by a large community (www.numpy.org). In this exercise you will learn several key numpy functions such as np.exp, np.log, and np.reshape. You will need to know how to use these functions for future assignments.\n", 97 | "\n", 98 | "### 1.1 - sigmoid function, np.exp() ###\n", 99 | "\n", 100 | "Before using np.exp(), you will use math.exp() to implement the sigmoid function. You will then see why np.exp() is preferable to math.exp().\n", 101 | "\n", 102 | "**Exercise**: Build a function that returns the sigmoid of a real number x. Use math.exp(x) for the exponential function.\n", 103 | "\n", 104 | "**Reminder**:\n", 105 | "$sigmoid(x) = \\frac{1}{1+e^{-x}}$ is sometimes also known as the logistic function. 
It is a non-linear function used not only in Machine Learning (Logistic Regression), but also in Deep Learning.\n", 106 | "\n", 107 | "\n", 108 | "\n", 109 | "To refer to a function belonging to a specific package you could call it using package_name.function(). Run the code below to see an example with math.exp()." 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 5, 115 | "metadata": { 116 | "collapsed": true 117 | }, 118 | "outputs": [], 119 | "source": [ 120 | "# GRADED FUNCTION: basic_sigmoid\n", 121 | "\n", 122 | "import math\n", 123 | "\n", 124 | "def basic_sigmoid(x):\n", 125 | " \"\"\"\n", 126 | " Compute sigmoid of x.\n", 127 | "\n", 128 | " Arguments:\n", 129 | " x -- A scalar\n", 130 | "\n", 131 | " Return:\n", 132 | " s -- sigmoid(x)\n", 133 | " \"\"\"\n", 134 | " \n", 135 | " ### START CODE HERE ### (≈ 1 line of code)\n", 136 | " s = 1.0/(1+math.exp(-x))\n", 137 | " ### END CODE HERE ###\n", 138 | " \n", 139 | " return s" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 6, 145 | "metadata": { 146 | "collapsed": false 147 | }, 148 | "outputs": [ 149 | { 150 | "data": { 151 | "text/plain": [ 152 | "0.9525741268224334" 153 | ] 154 | }, 155 | "execution_count": 6, 156 | "metadata": {}, 157 | "output_type": "execute_result" 158 | } 159 | ], 160 | "source": [ 161 | "basic_sigmoid(3)" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "**Expected Output**: \n", 169 | "\n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | "\n", 175 | "
** basic_sigmoid(3) ** 0.9525741268224334
" 176 | ] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "metadata": {}, 181 | "source": [ 182 | "Actually, we rarely use the \"math\" library in deep learning because the inputs of the functions are real numbers. In deep learning we mostly use matrices and vectors. This is why numpy is more useful. " 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 7, 188 | "metadata": { 189 | "collapsed": false 190 | }, 191 | "outputs": [ 192 | { 193 | "ename": "TypeError", 194 | "evalue": "bad operand type for unary -: 'list'", 195 | "output_type": "error", 196 | "traceback": [ 197 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 198 | "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", 199 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m### One reason why we use \"numpy\" instead of \"math\" in Deep Learning ###\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mx\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m3\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mbasic_sigmoid\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# you will see this give an error when you run it, because x is a vector.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 200 | "\u001b[0;32m\u001b[0m in \u001b[0;36mbasic_sigmoid\u001b[0;34m(x)\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[0;31m### START CODE HERE ### (≈ 1 line of code)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 17\u001b[0;31m \u001b[0ms\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m1.0\u001b[0m\u001b[0;34m/\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0mmath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexp\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 18\u001b[0m \u001b[0;31m### END CODE HERE ###\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 201 | "\u001b[0;31mTypeError\u001b[0m: bad operand type for unary -: 'list'" 202 | ] 203 | } 204 | ], 205 | "source": [ 206 | "### One reason why we use \"numpy\" instead of \"math\" in Deep Learning ###\n", 207 | "x = [1, 2, 3]\n", 208 | "basic_sigmoid(x) # you will see this give an error when you run it, because x is a vector." 209 | ] 210 | }, 211 | { 212 | "cell_type": "markdown", 213 | "metadata": {}, 214 | "source": [ 215 | "In fact, if $ x = (x_1, x_2, ..., x_n)$ is a row vector then $np.exp(x)$ will apply the exponential function to every element of x. 
The output will thus be: $np.exp(x) = (e^{x_1}, e^{x_2}, ..., e^{x_n})$" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 8, 221 | "metadata": { 222 | "collapsed": false 223 | }, 224 | "outputs": [ 225 | { 226 | "name": "stdout", 227 | "output_type": "stream", 228 | "text": [ 229 | "[ 2.71828183 7.3890561 20.08553692]\n" 230 | ] 231 | } 232 | ], 233 | "source": [ 234 | "import numpy as np\n", 235 | "\n", 236 | "# example of np.exp\n", 237 | "x = np.array([1, 2, 3])\n", 238 | "print(np.exp(x)) # result is (exp(1), exp(2), exp(3))" 239 | ] 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "metadata": {}, 244 | "source": [ 245 | "Furthermore, if x is a vector, then a Python operation such as $s = x + 3$ or $s = \\frac{1}{x}$ will output s as a vector of the same size as x." 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": 9, 251 | "metadata": { 252 | "collapsed": false 253 | }, 254 | "outputs": [ 255 | { 256 | "name": "stdout", 257 | "output_type": "stream", 258 | "text": [ 259 | "[4 5 6]\n" 260 | ] 261 | } 262 | ], 263 | "source": [ 264 | "# example of vector operation\n", 265 | "x = np.array([1, 2, 3])\n", 266 | "print (x + 3)" 267 | ] 268 | }, 269 | { 270 | "cell_type": "markdown", 271 | "metadata": {}, 272 | "source": [ 273 | "Any time you need more info on a numpy function, we encourage you to look at [the official documentation](https://docs.scipy.org/doc/numpy-1.10.1/reference/generated/numpy.exp.html). \n", 274 | "\n", 275 | "You can also create a new cell in the notebook and write `np.exp?` (for example) to get quick access to the documentation.\n", 276 | "\n", 277 | "**Exercise**: Implement the sigmoid function using numpy. \n", 278 | "\n", 279 | "**Instructions**: x could now be either a real number, a vector, or a matrix. The data structures we use in numpy to represent these shapes (vectors, matrices...) are called numpy arrays. You don't need to know more for now.\n", 280 | "$$ \\text{For } x \\in \\mathbb{R}^n \\text{, } sigmoid(x) = sigmoid\\begin{pmatrix}\n", 281 | " x_1 \\\\\n", 282 | " x_2 \\\\\n", 283 | " ... \\\\\n", 284 | " x_n \\\\\n", 285 | "\\end{pmatrix} = \\begin{pmatrix}\n", 286 | " \\frac{1}{1+e^{-x_1}} \\\\\n", 287 | " \\frac{1}{1+e^{-x_2}} \\\\\n", 288 | " ... 
\\\\\n", 289 | " \\frac{1}{1+e^{-x_n}} \\\\\n", 290 | "\\end{pmatrix}\\tag{1} $$" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": 12, 296 | "metadata": { 297 | "collapsed": false 298 | }, 299 | "outputs": [], 300 | "source": [ 301 | "# GRADED FUNCTION: sigmoid\n", 302 | "\n", 303 | "import numpy as np # this means you can access numpy functions by writing np.function() instead of numpy.function()\n", 304 | "\n", 305 | "def sigmoid(x):\n", 306 | " \"\"\"\n", 307 | " Compute the sigmoid of x\n", 308 | "\n", 309 | " Arguments:\n", 310 | " x -- A scalar or numpy array of any size\n", 311 | "\n", 312 | " Return:\n", 313 | " s -- sigmoid(x)\n", 314 | " \"\"\"\n", 315 | " \n", 316 | " ### START CODE HERE ### (≈ 1 line of code)\n", 317 | " s = 1.0/(1+np.exp(-x))\n", 318 | " ### END CODE HERE ###\n", 319 | " \n", 320 | " return s" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": 13, 326 | "metadata": { 327 | "collapsed": false 328 | }, 329 | "outputs": [ 330 | { 331 | "data": { 332 | "text/plain": [ 333 | "array([ 0.73105858, 0.88079708, 0.95257413])" 334 | ] 335 | }, 336 | "execution_count": 13, 337 | "metadata": {}, 338 | "output_type": "execute_result" 339 | } 340 | ], 341 | "source": [ 342 | "x = np.array([1, 2, 3])\n", 343 | "sigmoid(x)" 344 | ] 345 | }, 346 | { 347 | "cell_type": "markdown", 348 | "metadata": {}, 349 | "source": [ 350 | "**Expected Output**: \n", 351 | "\n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | "
**sigmoid([1,2,3])** array([ 0.73105858, 0.88079708, 0.95257413])
\n" 357 | ] 358 | }, 359 | { 360 | "cell_type": "markdown", 361 | "metadata": {}, 362 | "source": [ 363 | "### 1.2 - Sigmoid gradient\n", 364 | "\n", 365 | "As you've seen in lecture, you will need to compute gradients to optimize loss functions using backpropagation. Let's code your first gradient function.\n", 366 | "\n", 367 | "**Exercise**: Implement the function sigmoid_grad() to compute the gradient of the sigmoid function with respect to its input x. The formula is: $$sigmoid\\_derivative(x) = \\sigma'(x) = \\sigma(x) (1 - \\sigma(x))\\tag{2}$$\n", 368 | "You often code this function in two steps:\n", 369 | "1. Set s to be the sigmoid of x. You might find your sigmoid(x) function useful.\n", 370 | "2. Compute $\\sigma'(x) = s(1-s)$" 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": 14, 376 | "metadata": { 377 | "collapsed": false 378 | }, 379 | "outputs": [], 380 | "source": [ 381 | "# GRADED FUNCTION: sigmoid_derivative\n", 382 | "\n", 383 | "def sigmoid_derivative(x):\n", 384 | " \"\"\"\n", 385 | " Compute the gradient (also called the slope or derivative) of the sigmoid function with respect to its input x.\n", 386 | " You can store the output of the sigmoid function into variables and then use it to calculate the gradient.\n", 387 | " \n", 388 | " Arguments:\n", 389 | " x -- A scalar or numpy array\n", 390 | "\n", 391 | " Return:\n", 392 | " ds -- Your computed gradient.\n", 393 | " \"\"\"\n", 394 | " \n", 395 | " ### START CODE HERE ### (≈ 2 lines of code)\n", 396 | " s = 1.0/(1+np.exp(-x))\n", 397 | " ds = s*(1-s)\n", 398 | " ### END CODE HERE ###\n", 399 | " \n", 400 | " return ds" 401 | ] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "execution_count": 15, 406 | "metadata": { 407 | "collapsed": false 408 | }, 409 | "outputs": [ 410 | { 411 | "name": "stdout", 412 | "output_type": "stream", 413 | "text": [ 414 | "sigmoid_derivative(x) = [ 0.19661193 0.10499359 0.04517666]\n" 415 | ] 416 | } 417 | ], 418 | "source": [ 419 | "x = np.array([1, 2, 3])\n", 420 | "print (\"sigmoid_derivative(x) = \" + str(sigmoid_derivative(x)))" 421 | ] 422 | }, 423 | { 424 | "cell_type": "markdown", 425 | "metadata": {}, 426 | "source": [ 427 | "**Expected Output**: \n", 428 | "\n", 429 | "\n", 430 | "\n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | "
**sigmoid_derivative([1,2,3])** [ 0.19661193 0.10499359 0.04517666]
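A hedged numerical cross-check (not required by the assignment): a centred finite difference of `sigmoid()` should agree with `sigmoid_derivative()` to well within numpy's default tolerances.

```python
eps = 1e-5
x = np.array([1.0, 2.0, 3.0])
numerical = (sigmoid(x + eps) - sigmoid(x - eps)) / (2 * eps)   # centred finite difference
print(np.allclose(numerical, sigmoid_derivative(x)))            # True
```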
\n", 436 | "\n" 437 | ] 438 | }, 439 | { 440 | "cell_type": "markdown", 441 | "metadata": {}, 442 | "source": [ 443 | "### 1.3 - Reshaping arrays ###\n", 444 | "\n", 445 | "Two common numpy functions used in deep learning are [np.shape](https://docs.scipy.org/doc/numpy/reference/generated/numpy.ndarray.shape.html) and [np.reshape()](https://docs.scipy.org/doc/numpy/reference/generated/numpy.reshape.html). \n", 446 | "- X.shape is used to get the shape (dimension) of a matrix/vector X. \n", 447 | "- X.reshape(...) is used to reshape X into some other dimension. \n", 448 | "\n", 449 | "For example, in computer science, an image is represented by a 3D array of shape $(length, height, depth = 3)$. However, when you read an image as the input of an algorithm you convert it to a vector of shape $(length*height*3, 1)$. In other words, you \"unroll\", or reshape, the 3D array into a 1D vector.\n", 450 | "\n", 451 | "\n", 452 | "\n", 453 | "**Exercise**: Implement `image2vector()` that takes an input of shape (length, height, 3) and returns a vector of shape (length\\*height\\*3, 1). For example, if you would like to reshape an array v of shape (a, b, c) into a vector of shape (a*b,c) you would do:\n", 454 | "``` python\n", 455 | "v = v.reshape((v.shape[0]*v.shape[1], v.shape[2])) # v.shape[0] = a ; v.shape[1] = b ; v.shape[2] = c\n", 456 | "```\n", 457 | "- Please don't hardcode the dimensions of image as a constant. Instead look up the quantities you need with `image.shape[0]`, etc. " 458 | ] 459 | }, 460 | { 461 | "cell_type": "code", 462 | "execution_count": 16, 463 | "metadata": { 464 | "collapsed": false 465 | }, 466 | "outputs": [], 467 | "source": [ 468 | "# GRADED FUNCTION: image2vector\n", 469 | "def image2vector(image):\n", 470 | " \"\"\"\n", 471 | " Argument:\n", 472 | " image -- a numpy array of shape (length, height, depth)\n", 473 | " \n", 474 | " Returns:\n", 475 | " v -- a vector of shape (length*height*depth, 1)\n", 476 | " \"\"\"\n", 477 | " \n", 478 | " ### START CODE HERE ### (≈ 1 line of code)\n", 479 | " v = image.reshape((image.shape[0]*image.shape[1]*image.shape[2],1))\n", 480 | " ### END CODE HERE ###\n", 481 | " \n", 482 | " return v" 483 | ] 484 | }, 485 | { 486 | "cell_type": "code", 487 | "execution_count": 17, 488 | "metadata": { 489 | "collapsed": false 490 | }, 491 | "outputs": [ 492 | { 493 | "name": "stdout", 494 | "output_type": "stream", 495 | "text": [ 496 | "image2vector(image) = [[ 0.67826139]\n", 497 | " [ 0.29380381]\n", 498 | " [ 0.90714982]\n", 499 | " [ 0.52835647]\n", 500 | " [ 0.4215251 ]\n", 501 | " [ 0.45017551]\n", 502 | " [ 0.92814219]\n", 503 | " [ 0.96677647]\n", 504 | " [ 0.85304703]\n", 505 | " [ 0.52351845]\n", 506 | " [ 0.19981397]\n", 507 | " [ 0.27417313]\n", 508 | " [ 0.60659855]\n", 509 | " [ 0.00533165]\n", 510 | " [ 0.10820313]\n", 511 | " [ 0.49978937]\n", 512 | " [ 0.34144279]\n", 513 | " [ 0.94630077]]\n" 514 | ] 515 | } 516 | ], 517 | "source": [ 518 | "# This is a 3 by 3 by 2 array, typically images will be (num_px_x, num_px_y,3) where 3 represents the RGB values\n", 519 | "image = np.array([[[ 0.67826139, 0.29380381],\n", 520 | " [ 0.90714982, 0.52835647],\n", 521 | " [ 0.4215251 , 0.45017551]],\n", 522 | "\n", 523 | " [[ 0.92814219, 0.96677647],\n", 524 | " [ 0.85304703, 0.52351845],\n", 525 | " [ 0.19981397, 0.27417313]],\n", 526 | "\n", 527 | " [[ 0.60659855, 0.00533165],\n", 528 | " [ 0.10820313, 0.49978937],\n", 529 | " [ 0.34144279, 0.94630077]]])\n", 530 | "\n", 531 | "print (\"image2vector(image) = \" + 
str(image2vector(image)))" 532 | ] 533 | }, 534 | { 535 | "cell_type": "markdown", 536 | "metadata": {}, 537 | "source": [ 538 | "**Expected Output**: \n", 539 | "\n", 540 | "\n", 541 | "\n", 542 | " \n", 543 | " \n", 544 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | "
**image2vector(image)** [[ 0.67826139]\n", 545 | " [ 0.29380381]\n", 546 | " [ 0.90714982]\n", 547 | " [ 0.52835647]\n", 548 | " [ 0.4215251 ]\n", 549 | " [ 0.45017551]\n", 550 | " [ 0.92814219]\n", 551 | " [ 0.96677647]\n", 552 | " [ 0.85304703]\n", 553 | " [ 0.52351845]\n", 554 | " [ 0.19981397]\n", 555 | " [ 0.27417313]\n", 556 | " [ 0.60659855]\n", 557 | " [ 0.00533165]\n", 558 | " [ 0.10820313]\n", 559 | " [ 0.49978937]\n", 560 | " [ 0.34144279]\n", 561 | " [ 0.94630077]]
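A hedged alternative for the reshape above: numpy can infer the leading dimension with -1, which avoids multiplying the shape terms by hand and still does not hardcode any image dimension. For the 3 x 3 x 2 example it yields the same (18, 1) column vector.

```python
v_alt = image.reshape(-1, 1)   # numpy infers 3*3*2 = 18 for the first dimension
print(v_alt.shape)             # (18, 1)
```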
" 566 | ] 567 | }, 568 | { 569 | "cell_type": "markdown", 570 | "metadata": {}, 571 | "source": [ 572 | "### 1.4 - Normalizing rows\n", 573 | "\n", 574 | "Another common technique we use in Machine Learning and Deep Learning is to normalize our data. It often leads to a better performance because gradient descent converges faster after normalization. Here, by normalization we mean changing x to $ \\frac{x}{\\| x\\|} $ (dividing each row vector of x by its norm).\n", 575 | "\n", 576 | "For example, if $$x = \n", 577 | "\\begin{bmatrix}\n", 578 | " 0 & 3 & 4 \\\\\n", 579 | " 2 & 6 & 4 \\\\\n", 580 | "\\end{bmatrix}\\tag{3}$$ then $$\\| x\\| = np.linalg.norm(x, axis = 1, keepdims = True) = \\begin{bmatrix}\n", 581 | " 5 \\\\\n", 582 | " \\sqrt{56} \\\\\n", 583 | "\\end{bmatrix}\\tag{4} $$and $$ x\\_normalized = \\frac{x}{\\| x\\|} = \\begin{bmatrix}\n", 584 | " 0 & \\frac{3}{5} & \\frac{4}{5} \\\\\n", 585 | " \\frac{2}{\\sqrt{56}} & \\frac{6}{\\sqrt{56}} & \\frac{4}{\\sqrt{56}} \\\\\n", 586 | "\\end{bmatrix}\\tag{5}$$ Note that you can divide matrices of different sizes and it works fine: this is called broadcasting and you're going to learn about it in part 5.\n", 587 | "\n", 588 | "\n", 589 | "**Exercise**: Implement normalizeRows() to normalize the rows of a matrix. After applying this function to an input matrix x, each row of x should be a vector of unit length (meaning length 1)." 590 | ] 591 | }, 592 | { 593 | "cell_type": "code", 594 | "execution_count": 22, 595 | "metadata": { 596 | "collapsed": false 597 | }, 598 | "outputs": [], 599 | "source": [ 600 | "# GRADED FUNCTION: normalizeRows\n", 601 | "\n", 602 | "def normalizeRows(x):\n", 603 | " \"\"\"\n", 604 | " Implement a function that normalizes each row of the matrix x (to have unit length).\n", 605 | " \n", 606 | " Argument:\n", 607 | " x -- A numpy matrix of shape (n, m)\n", 608 | " \n", 609 | " Returns:\n", 610 | " x -- The normalized (by row) numpy matrix. You are allowed to modify x.\n", 611 | " \"\"\"\n", 612 | " \n", 613 | " ### START CODE HERE ### (≈ 2 lines of code)\n", 614 | " # Compute x_norm as the norm 2 of x. Use np.linalg.norm(..., ord = 2, axis = ..., keepdims = True)\n", 615 | " x_norm = np.linalg.norm(x,axis = 1,ord = 2, keepdims = True)\n", 616 | " \n", 617 | " print(x_norm.shape)\n", 618 | " \n", 619 | " # Divide x by its norm.\n", 620 | " x = x/x_norm\n", 621 | " print(x.shape)\n", 622 | " ### END CODE HERE ###\n", 623 | "\n", 624 | " return x" 625 | ] 626 | }, 627 | { 628 | "cell_type": "code", 629 | "execution_count": 23, 630 | "metadata": { 631 | "collapsed": false 632 | }, 633 | "outputs": [ 634 | { 635 | "name": "stdout", 636 | "output_type": "stream", 637 | "text": [ 638 | "(2, 1)\n", 639 | "(2, 3)\n", 640 | "normalizeRows(x) = [[ 0. 0.6 0.8 ]\n", 641 | " [ 0.13736056 0.82416338 0.54944226]]\n" 642 | ] 643 | } 644 | ], 645 | "source": [ 646 | "x = np.array([\n", 647 | " [0, 3, 4],\n", 648 | " [1, 6, 4]])\n", 649 | "print(\"normalizeRows(x) = \" + str(normalizeRows(x)))" 650 | ] 651 | }, 652 | { 653 | "cell_type": "markdown", 654 | "metadata": {}, 655 | "source": [ 656 | "**Expected Output**: \n", 657 | "\n", 658 | "\n", 659 | "\n", 660 | " \n", 661 | " \n", 662 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | "
**normalizeRows(x)** [[ 0. 0.6 0.8 ]\n", 663 | " [ 0.13736056 0.82416338 0.54944226]]
" 668 | ] 669 | }, 670 | { 671 | "cell_type": "markdown", 672 | "metadata": {}, 673 | "source": [ 674 | "**Note**:\n", 675 | "In normalizeRows(), you can try to print the shapes of x_norm and x, and then rerun the assessment. You'll find out that they have different shapes. This is normal given that x_norm takes the norm of each row of x. So x_norm has the same number of rows but only 1 column. So how did it work when you divided x by x_norm? This is called broadcasting and we'll talk about it now! " 676 | ] 677 | }, 678 | { 679 | "cell_type": "markdown", 680 | "metadata": {}, 681 | "source": [ 682 | "### 1.5 - Broadcasting and the softmax function ####\n", 683 | "A very important concept to understand in numpy is \"broadcasting\". It is very useful for performing mathematical operations between arrays of different shapes. For the full details on broadcasting, you can read the official [broadcasting documentation](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)." 684 | ] 685 | }, 686 | { 687 | "cell_type": "markdown", 688 | "metadata": {}, 689 | "source": [ 690 | "**Exercise**: Implement a softmax function using numpy. You can think of softmax as a normalizing function used when your algorithm needs to classify two or more classes. You will learn more about softmax in the second course of this specialization.\n", 691 | "\n", 692 | "**Instructions**:\n", 693 | "- $ \\text{for } x \\in \\mathbb{R}^{1\\times n} \\text{, } softmax(x) = softmax(\\begin{bmatrix}\n", 694 | " x_1 &&\n", 695 | " x_2 &&\n", 696 | " ... &&\n", 697 | " x_n \n", 698 | "\\end{bmatrix}) = \\begin{bmatrix}\n", 699 | " \\frac{e^{x_1}}{\\sum_{j}e^{x_j}} &&\n", 700 | " \\frac{e^{x_2}}{\\sum_{j}e^{x_j}} &&\n", 701 | " ... &&\n", 702 | " \\frac{e^{x_n}}{\\sum_{j}e^{x_j}} \n", 703 | "\\end{bmatrix} $ \n", 704 | "\n", 705 | "- $\\text{for a matrix } x \\in \\mathbb{R}^{m \\times n} \\text{, $x_{ij}$ maps to the element in the $i^{th}$ row and $j^{th}$ column of $x$, thus we have: }$ $$softmax(x) = softmax\\begin{bmatrix}\n", 706 | " x_{11} & x_{12} & x_{13} & \\dots & x_{1n} \\\\\n", 707 | " x_{21} & x_{22} & x_{23} & \\dots & x_{2n} \\\\\n", 708 | " \\vdots & \\vdots & \\vdots & \\ddots & \\vdots \\\\\n", 709 | " x_{m1} & x_{m2} & x_{m3} & \\dots & x_{mn}\n", 710 | "\\end{bmatrix} = \\begin{bmatrix}\n", 711 | " \\frac{e^{x_{11}}}{\\sum_{j}e^{x_{1j}}} & \\frac{e^{x_{12}}}{\\sum_{j}e^{x_{1j}}} & \\frac{e^{x_{13}}}{\\sum_{j}e^{x_{1j}}} & \\dots & \\frac{e^{x_{1n}}}{\\sum_{j}e^{x_{1j}}} \\\\\n", 712 | " \\frac{e^{x_{21}}}{\\sum_{j}e^{x_{2j}}} & \\frac{e^{x_{22}}}{\\sum_{j}e^{x_{2j}}} & \\frac{e^{x_{23}}}{\\sum_{j}e^{x_{2j}}} & \\dots & \\frac{e^{x_{2n}}}{\\sum_{j}e^{x_{2j}}} \\\\\n", 713 | " \\vdots & \\vdots & \\vdots & \\ddots & \\vdots \\\\\n", 714 | " \\frac{e^{x_{m1}}}{\\sum_{j}e^{x_{mj}}} & \\frac{e^{x_{m2}}}{\\sum_{j}e^{x_{mj}}} & \\frac{e^{x_{m3}}}{\\sum_{j}e^{x_{mj}}} & \\dots & \\frac{e^{x_{mn}}}{\\sum_{j}e^{x_{mj}}}\n", 715 | "\\end{bmatrix} = \\begin{pmatrix}\n", 716 | " softmax\\text{(first row of x)} \\\\\n", 717 | " softmax\\text{(second row of x)} \\\\\n", 718 | " ... 
\\\\\n", 719 | " softmax\\text{(last row of x)} \\\\\n", 720 | "\\end{pmatrix} $$" 721 | ] 722 | }, 723 | { 724 | "cell_type": "code", 725 | "execution_count": 24, 726 | "metadata": { 727 | "collapsed": false 728 | }, 729 | "outputs": [], 730 | "source": [ 731 | "# GRADED FUNCTION: softmax\n", 732 | "\n", 733 | "def softmax(x):\n", 734 | " \"\"\"Calculates the softmax for each row of the input x.\n", 735 | "\n", 736 | " Your code should work for a row vector and also for matrices of shape (n, m).\n", 737 | "\n", 738 | " Argument:\n", 739 | " x -- A numpy matrix of shape (n,m)\n", 740 | "\n", 741 | " Returns:\n", 742 | " s -- A numpy matrix equal to the softmax of x, of shape (n,m)\n", 743 | " \"\"\"\n", 744 | " \n", 745 | " ### START CODE HERE ### (≈ 3 lines of code)\n", 746 | " # Apply exp() element-wise to x. Use np.exp(...).\n", 747 | " x_exp = np.exp(x)\n", 748 | "\n", 749 | " # Create a vector x_sum that sums each row of x_exp. Use np.sum(..., axis = 1, keepdims = True).\n", 750 | " x_sum = np.sum(x_exp,axis = 1, keepdims = True)\n", 751 | " \n", 752 | " # Compute softmax(x) by dividing x_exp by x_sum. It should automatically use numpy broadcasting.\n", 753 | " s = x_exp/x_sum\n", 754 | "\n", 755 | " ### END CODE HERE ###\n", 756 | " \n", 757 | " return s" 758 | ] 759 | }, 760 | { 761 | "cell_type": "code", 762 | "execution_count": 25, 763 | "metadata": { 764 | "collapsed": false 765 | }, 766 | "outputs": [ 767 | { 768 | "name": "stdout", 769 | "output_type": "stream", 770 | "text": [ 771 | "softmax(x) = [[ 9.80897665e-01 8.94462891e-04 1.79657674e-02 1.21052389e-04\n", 772 | " 1.21052389e-04]\n", 773 | " [ 8.78679856e-01 1.18916387e-01 8.01252314e-04 8.01252314e-04\n", 774 | " 8.01252314e-04]]\n" 775 | ] 776 | } 777 | ], 778 | "source": [ 779 | "x = np.array([\n", 780 | " [9, 2, 5, 0, 0],\n", 781 | " [7, 5, 0, 0 ,0]])\n", 782 | "print(\"softmax(x) = \" + str(softmax(x)))" 783 | ] 784 | }, 785 | { 786 | "cell_type": "markdown", 787 | "metadata": {}, 788 | "source": [ 789 | "**Expected Output**:\n", 790 | "\n", 791 | "\n", 792 | "\n", 793 | " \n", 794 | " \n", 795 | " \n", 799 | " \n", 800 | "
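As a quick, optional illustration of that broadcasting division (this check is not part of the graded notebook; it just reuses the example matrix above), the sketch below normalizes the rows by hand and confirms that every row of the result has unit L2 norm:

```python
import numpy as np

x = np.array([[0., 3., 4.],
              [1., 6., 4.]])

# The row norms have shape (2, 1); dividing the (2, 3) matrix by them broadcasts
# each row's norm across that row's columns.
x_norm = np.linalg.norm(x, ord=2, axis=1, keepdims=True)   # shape (2, 1)
x_normalized = x / x_norm                                   # shape (2, 3)

# Every row of the result should now have length 1.
print(np.allclose(np.linalg.norm(x_normalized, ord=2, axis=1), 1.0))  # True
```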
### 1.5 - Broadcasting and the softmax function

A very important concept to understand in numpy is "broadcasting". It is very useful for performing mathematical operations between arrays of different shapes. For the full details on broadcasting, you can read the official [broadcasting documentation](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html).

**Exercise**: Implement a softmax function using numpy. You can think of softmax as a normalizing function used when your algorithm needs to classify two or more classes. You will learn more about softmax in the second course of this specialization.

**Instructions**:

- for $x \in \mathbb{R}^{1\times n}$,

$$softmax(x) = softmax(\begin{bmatrix} x_1 & x_2 & \dots & x_n \end{bmatrix}) = \begin{bmatrix} \frac{e^{x_1}}{\sum_{j}e^{x_j}} & \frac{e^{x_2}}{\sum_{j}e^{x_j}} & \dots & \frac{e^{x_n}}{\sum_{j}e^{x_j}} \end{bmatrix}$$

- for a matrix $x \in \mathbb{R}^{m \times n}$, $x_{ij}$ maps to the element in the $i^{th}$ row and $j^{th}$ column of $x$, thus we have:

$$softmax(x) = softmax\begin{bmatrix}
x_{11} & x_{12} & x_{13} & \dots & x_{1n} \\
x_{21} & x_{22} & x_{23} & \dots & x_{2n} \\
\vdots & \vdots & \vdots & \ddots & \vdots \\
x_{m1} & x_{m2} & x_{m3} & \dots & x_{mn}
\end{bmatrix} = \begin{bmatrix}
\frac{e^{x_{11}}}{\sum_{j}e^{x_{1j}}} & \frac{e^{x_{12}}}{\sum_{j}e^{x_{1j}}} & \frac{e^{x_{13}}}{\sum_{j}e^{x_{1j}}} & \dots & \frac{e^{x_{1n}}}{\sum_{j}e^{x_{1j}}} \\
\frac{e^{x_{21}}}{\sum_{j}e^{x_{2j}}} & \frac{e^{x_{22}}}{\sum_{j}e^{x_{2j}}} & \frac{e^{x_{23}}}{\sum_{j}e^{x_{2j}}} & \dots & \frac{e^{x_{2n}}}{\sum_{j}e^{x_{2j}}} \\
\vdots & \vdots & \vdots & \ddots & \vdots \\
\frac{e^{x_{m1}}}{\sum_{j}e^{x_{mj}}} & \frac{e^{x_{m2}}}{\sum_{j}e^{x_{mj}}} & \frac{e^{x_{m3}}}{\sum_{j}e^{x_{mj}}} & \dots & \frac{e^{x_{mn}}}{\sum_{j}e^{x_{mj}}}
\end{bmatrix} = \begin{pmatrix}
softmax\text{(first row of x)} \\
softmax\text{(second row of x)} \\
\vdots \\
softmax\text{(last row of x)}
\end{pmatrix}$$

```python
# GRADED FUNCTION: softmax

def softmax(x):
    """Calculates the softmax for each row of the input x.

    Your code should work for a row vector and also for matrices of shape (n, m).

    Argument:
    x -- A numpy matrix of shape (n, m)

    Returns:
    s -- A numpy matrix equal to the softmax of x, of shape (n, m)
    """

    ### START CODE HERE ### (≈ 3 lines of code)
    # Apply exp() element-wise to x. Use np.exp(...).
    x_exp = np.exp(x)

    # Create a vector x_sum that sums each row of x_exp. Use np.sum(..., axis=1, keepdims=True).
    x_sum = np.sum(x_exp, axis=1, keepdims=True)

    # Compute softmax(x) by dividing x_exp by x_sum. Numpy broadcasting does the rest.
    s = x_exp / x_sum
    ### END CODE HERE ###

    return s
```

```python
x = np.array([
    [9, 2, 5, 0, 0],
    [7, 5, 0, 0, 0]])
print("softmax(x) = " + str(softmax(x)))
```

```
softmax(x) = [[ 9.80897665e-01  8.94462891e-04  1.79657674e-02  1.21052389e-04  1.21052389e-04]
 [ 8.78679856e-01  1.18916387e-01  8.01252314e-04  8.01252314e-04  8.01252314e-04]]
```

**Expected Output**:

**softmax(x)**:

```
[[ 9.80897665e-01  8.94462891e-04  1.79657674e-02  1.21052389e-04  1.21052389e-04]
 [ 8.78679856e-01  1.18916387e-01  8.01252314e-04  8.01252314e-04  8.01252314e-04]]
```

**Note**:
- If you print the shapes of `x_exp`, `x_sum` and `s` above and rerun the assessment cell, you will see that `x_sum` is of shape (2, 1) while `x_exp` and `s` are of shape (2, 5). **x_exp/x_sum** works due to numpy broadcasting.
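One practical refinement worth knowing about (it is not required by the graded `softmax` above): `np.exp` overflows to `inf` for large inputs, so production implementations usually subtract each row's maximum before exponentiating. Subtracting a per-row constant does not change the result, because the constant cancels between numerator and denominator. A minimal sketch:

```python
import numpy as np

def softmax_stable(x):
    """Row-wise softmax that avoids overflow by shifting each row by its maximum."""
    x_shifted = x - np.max(x, axis=1, keepdims=True)   # largest entry per row becomes 0
    x_exp = np.exp(x_shifted)
    return x_exp / np.sum(x_exp, axis=1, keepdims=True)

x = np.array([[1000., 1001., 1002.],   # np.exp(1000) alone would overflow
              [3., 1., 0.2]])
print(softmax_stable(x))               # rows still sum to 1, with no overflow warnings
```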
Congratulations! You now have a pretty good understanding of python numpy and have implemented a few useful functions that you will be using in deep learning.

**What you need to remember:**
- np.exp(x) works for any np.array x and applies the exponential function to every coordinate
- the sigmoid function and its gradient
- image2vector is commonly used in deep learning
- np.reshape is widely used. In the future, you'll see that keeping your matrix/vector dimensions straight will go toward eliminating a lot of bugs.
- numpy has efficient built-in functions
- broadcasting is extremely useful

## 2) Vectorization

In deep learning, you deal with very large datasets. Hence, a non-computationally-optimal function can become a huge bottleneck in your algorithm and can result in a model that takes ages to run. To make sure that your code is computationally efficient, you will use vectorization. For example, try to tell the difference between the following implementations of the dot/outer/elementwise product.

```python
import time

x1 = [9, 2, 5, 0, 0, 7, 5, 0, 0, 0, 9, 2, 5, 0, 0]
x2 = [9, 2, 2, 9, 0, 9, 2, 5, 0, 0, 9, 2, 5, 0, 0]

### CLASSIC DOT PRODUCT OF VECTORS IMPLEMENTATION ###
tic = time.process_time()
dot = 0
for i in range(len(x1)):
    dot += x1[i] * x2[i]
toc = time.process_time()
print("dot = " + str(dot) + "\n ----- Computation time = " + str(1000 * (toc - tic)) + "ms")

### CLASSIC OUTER PRODUCT IMPLEMENTATION ###
tic = time.process_time()
outer = np.zeros((len(x1), len(x2)))  # we create a len(x1)*len(x2) matrix with only zeros
for i in range(len(x1)):
    for j in range(len(x2)):
        outer[i, j] = x1[i] * x2[j]
toc = time.process_time()
print("outer = " + str(outer) + "\n ----- Computation time = " + str(1000 * (toc - tic)) + "ms")

### CLASSIC ELEMENTWISE IMPLEMENTATION ###
tic = time.process_time()
mul = np.zeros(len(x1))
for i in range(len(x1)):
    mul[i] = x1[i] * x2[i]
toc = time.process_time()
print("elementwise multiplication = " + str(mul) + "\n ----- Computation time = " + str(1000 * (toc - tic)) + "ms")

### CLASSIC GENERAL DOT PRODUCT IMPLEMENTATION ###
W = np.random.rand(3, len(x1))  # Random 3*len(x1) numpy array
tic = time.process_time()
gdot = np.zeros(W.shape[0])
for i in range(W.shape[0]):
    for j in range(len(x1)):
        gdot[i] += W[i, j] * x1[j]
toc = time.process_time()
print("gdot = " + str(gdot) + "\n ----- Computation time = " + str(1000 * (toc - tic)) + "ms")
```

```
dot = 278
 ----- Computation time = 0.17289699999989416ms
outer = [[ 81.  18.  18.  81.   0.  81.  18.  45.   0.   0.  81.  18.  45.   0.   0.]
 [ 18.   4.   4.  18.   0.  18.   4.  10.   0.   0.  18.   4.  10.   0.   0.]
 [ 45.  10.  10.  45.   0.  45.  10.  25.   0.   0.  45.  10.  25.   0.   0.]
 [  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.]
 [  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.]
 [ 63.  14.  14.  63.   0.  63.  14.  35.   0.   0.  63.  14.  35.   0.   0.]
 [ 45.  10.  10.  45.   0.  45.  10.  25.   0.   0.  45.  10.  25.   0.   0.]
 [  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.]
 [  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.]
 [  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.]
 [ 81.  18.  18.  81.   0.  81.  18.  45.   0.   0.  81.  18.  45.   0.   0.]
 [ 18.   4.   4.  18.   0.  18.   4.  10.   0.   0.  18.   4.  10.   0.   0.]
 [ 45.  10.  10.  45.   0.  45.  10.  25.   0.   0.  45.  10.  25.   0.   0.]
 [  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.]
 [  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.]]
 ----- Computation time = 0.35026000000004665ms
elementwise multiplication = [ 81.   4.  10.   0.   0.  63.  10.   0.   0.   0.  81.   4.  25.   0.   0.]
 ----- Computation time = 0.20884099999984862ms
gdot = [ 30.82471128  17.14219967  29.20685098]
 ----- Computation time = 0.4346299999999914ms
```

```python
x1 = [9, 2, 5, 0, 0, 7, 5, 0, 0, 0, 9, 2, 5, 0, 0]
x2 = [9, 2, 2, 9, 0, 9, 2, 5, 0, 0, 9, 2, 5, 0, 0]

### VECTORIZED DOT PRODUCT OF VECTORS ###
tic = time.process_time()
dot = np.dot(x1, x2)
toc = time.process_time()
print("dot = " + str(dot) + "\n ----- Computation time = " + str(1000 * (toc - tic)) + "ms")

### VECTORIZED OUTER PRODUCT ###
tic = time.process_time()
outer = np.outer(x1, x2)
toc = time.process_time()
print("outer = " + str(outer) + "\n ----- Computation time = " + str(1000 * (toc - tic)) + "ms")

### VECTORIZED ELEMENTWISE MULTIPLICATION ###
tic = time.process_time()
mul = np.multiply(x1, x2)
toc = time.process_time()
print("elementwise multiplication = " + str(mul) + "\n ----- Computation time = " + str(1000 * (toc - tic)) + "ms")

### VECTORIZED GENERAL DOT PRODUCT ###
tic = time.process_time()
dot = np.dot(W, x1)
toc = time.process_time()
print("gdot = " + str(dot) + "\n ----- Computation time = " + str(1000 * (toc - tic)) + "ms")
```

```
dot = 278
 ----- Computation time = 0.22006699999987944ms
outer = [[81 18 18 81  0 81 18 45  0  0 81 18 45  0  0]
 [18  4  4 18  0 18  4 10  0  0 18  4 10  0  0]
 [45 10 10 45  0 45 10 25  0  0 45 10 25  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [63 14 14 63  0 63 14 35  0  0 63 14 35  0  0]
 [45 10 10 45  0 45 10 25  0  0 45 10 25  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [81 18 18 81  0 81 18 45  0  0 81 18 45  0  0]
 [18  4  4 18  0 18  4 10  0  0 18  4 10  0  0]
 [45 10 10 45  0 45 10 25  0  0 45 10 25  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]]
 ----- Computation time = 0.24974000000010932ms
elementwise multiplication = [81  4 10  0  0 63 10  0  0  0 81  4 25  0  0]
 ----- Computation time = 0.12139700000002307ms
gdot = [ 30.82471128  17.14219967  29.20685098]
 ----- Computation time = 0.17465999999988213ms
```

As you may have noticed, the vectorized implementation is much cleaner and more efficient. For bigger vectors/matrices, the differences in running time become even bigger.

**Note** that `np.dot()` performs a matrix-matrix or matrix-vector multiplication. This is different from `np.multiply()` and the `*` operator (which is equivalent to `.*` in Matlab/Octave), which perform an element-wise multiplication.
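To see the gap described in the note above, here is a small optional benchmark (not part of the graded notebook) that repeats the dot-product comparison on vectors with a million entries instead of fifteen; the exact timings depend on your machine, so treat the printed numbers as illustrative only:

```python
import time
import numpy as np

n = 1000000                        # much larger than the 15-element toy vectors above
a = np.random.rand(n)
b = np.random.rand(n)

# Pure-Python loop version of the dot product.
tic = time.process_time()
dot_loop = 0.0
for i in range(n):
    dot_loop += a[i] * b[i]
toc = time.process_time()
print("loop dot = %.4f ----- %.1f ms" % (dot_loop, 1000 * (toc - tic)))

# Vectorized version: same result, typically orders of magnitude faster.
tic = time.process_time()
dot_vec = np.dot(a, b)
toc = time.process_time()
print("np.dot   = %.4f ----- %.1f ms" % (dot_vec, 1000 * (toc - tic)))
```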
### 2.1 Implement the L1 and L2 loss functions

**Exercise**: Implement the numpy vectorized version of the L1 loss. You may find the function abs(x) (absolute value of x) useful.

**Reminder**:
- The loss is used to evaluate the performance of your model. The bigger your loss is, the more different your predictions ($\hat{y}$) are from the true values ($y$). In deep learning, you use optimization algorithms like Gradient Descent to train your model and to minimize the cost.
- L1 loss is defined as:

$$L_1(\hat{y}, y) = \sum_{i=0}^m \left|y^{(i)} - \hat{y}^{(i)}\right| \tag{6}$$

```python
# GRADED FUNCTION: L1

def L1(yhat, y):
    """
    Arguments:
    yhat -- vector of size m (predicted labels)
    y -- vector of size m (true labels)

    Returns:
    loss -- the value of the L1 loss function defined above
    """

    ### START CODE HERE ### (≈ 1 line of code)
    loss = np.sum(np.abs(yhat - y))
    ### END CODE HERE ###

    return loss
```

```python
yhat = np.array([.9, 0.2, 0.1, .4, .9])
y = np.array([1, 0, 0, 1, 1])
print("L1 = " + str(L1(yhat, y)))
```

```
L1 = 1.1
```

**Expected Output**:

| | |
|---|---|
| **L1** | 1.1 |
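As an optional cross-check (not part of the graded notebook), the vectorized L1 above can be compared against a plain Python loop over the same five predictions; both should give 1.1 up to floating-point rounding:

```python
import numpy as np

yhat = np.array([.9, 0.2, 0.1, .4, .9])
y = np.array([1, 0, 0, 1, 1])

# Vectorized L1, exactly as in the graded function above.
l1_vectorized = np.sum(np.abs(yhat - y))

# The same sum written as an explicit loop.
l1_loop = 0.0
for i in range(len(y)):
    l1_loop += abs(y[i] - yhat[i])

print(l1_vectorized, l1_loop)   # both approximately 1.1
```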
**Exercise**: Implement the numpy vectorized version of the L2 loss. There are several ways of implementing the L2 loss, but you may find the function np.dot() useful. As a reminder, if $x = [x_1, x_2, ..., x_n]$, then `np.dot(x,x)` = $\sum_{j=0}^n x_j^{2}$.

- L2 loss is defined as

$$L_2(\hat{y}, y) = \sum_{i=0}^m \left(y^{(i)} - \hat{y}^{(i)}\right)^2 \tag{7}$$

```python
# GRADED FUNCTION: L2

def L2(yhat, y):
    """
    Arguments:
    yhat -- vector of size m (predicted labels)
    y -- vector of size m (true labels)

    Returns:
    loss -- the value of the L2 loss function defined above
    """

    ### START CODE HERE ### (≈ 1 line of code)
    loss = np.sum(np.power((yhat - y), 2))
    ### END CODE HERE ###

    return loss
```

```python
yhat = np.array([.9, 0.2, 0.1, .4, .9])
y = np.array([1, 0, 0, 1, 1])
print("L2 = " + str(L2(yhat, y)))
```

```
L2 = 0.43
```
**Expected Output**:

| | |
|---|---|
| **L2** | 0.43 |

Congratulations on completing this assignment. We hope that this little warm-up exercise helps you in the future assignments, which will be more exciting and interesting!

**What to remember:**
- Vectorization is very important in deep learning. It provides computational efficiency and clarity.
- You have reviewed the L1 and L2 loss.
- You are familiar with many numpy functions such as np.sum, np.dot, np.multiply, np.maximum, etc.

*(Notebook metadata: Coursera course "neural-networks-deep-learning", graded item XHpfv, launcher item Zh0CU; kernel: Python 3; language: Python 3.5.2; nbformat 4.2.)*

--------------------------------------------------------------------------------