├── .gitignore ├── Bash Tutorial.ipynb ├── Lab 0.0 Jupyter notebook quickstart.ipynb ├── Lab 0.1 Python 101.ipynb ├── Lab 0.2 Numpy Basics.ipynb ├── Lab 0.3 Scikit-learn.ipynb ├── Pytorch Demo.ipynb ├── README.md ├── Vagrantfile.patreco ├── Vagrantfile.slp ├── install_openfst.sh ├── spell_checker_test_set └── start_jupyter.bash /.gitignore: -------------------------------------------------------------------------------- 1 | Vagrantfile 2 | .vagrant 3 | *.pyc 4 | *checkpoint* 5 | __pycache__ 6 | -------------------------------------------------------------------------------- /Bash Tutorial.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Shell scripting tutorial\n", 8 | "\n", 9 | "Learning shell scripting gives you quick and easy ways to perform much of the work related to a machine learning / text processing project.\n", 10 | "\n", 11 | "Some of the things you can achieve with shell scripting:\n", 12 | "\n", 13 | "- Install project dependencies\n", 14 | "- Build project dependencies from source\n", 15 | "- Download, organize and clean data\n", 16 | "- Extract data statistics (e.g.
word / character counts)\n", 17 | "- Perform text processing\n", 18 | "- Train and evaluate models using existing frameworks that provide a command line interface (Kaldi, openfst, fasttext, fairseq etc.)\n", 19 | "\n" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "First of all, list the current working directory and its contents" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 49, 32 | "metadata": {}, 33 | "outputs": [ 34 | { 35 | "name": "stdout", 36 | "output_type": "stream", 37 | "text": [ 38 | "/home/geopar/projects/bash_tutorial\n" 39 | ] 40 | } 41 | ], 42 | "source": [ 43 | "pwd" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 50, 49 | "metadata": {}, 50 | "outputs": [ 51 | { 52 | "name": "stdout", 53 | "output_type": "stream", 54 | "text": [ 55 | "total 80K\n", 56 | "drwxr-xr-x 6 geopar geopar 4.0K Oct 18 06:48 \u001b[0m\u001b[01;34m.\u001b[0m\n", 57 | "drwxrwxrwx 13 geopar geopar 4.0K Oct 17 19:45 \u001b[34;42m..\u001b[0m\n", 58 | "-rw-r--r-- 1 geopar geopar 55K Oct 18 06:48 'Bash Tutorial.ipynb'\n", 59 | "drwxr-xr-x 2 geopar geopar 4.0K Oct 17 20:52 \u001b[01;34mbin\u001b[0m\n", 60 | "drwxr-xr-x 2 geopar geopar 4.0K Oct 18 06:42 \u001b[01;34mdata\u001b[0m\n", 61 | "drwxr-xr-x 2 geopar geopar 4.0K Oct 17 19:46 \u001b[01;34m.ipynb_checkpoints\u001b[0m\n", 62 | "drwxr-xr-x 9 geopar geopar 4.0K Oct 17 20:41 \u001b[01;34mkenlm\u001b[0m\n" 63 | ] 64 | } 65 | ], 66 | "source": [ 67 | "ls -lah" 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": {}, 73 | "source": [ 74 | "## Installing project dependencies and building a project from source\n", 75 | "\n", 76 | "Let's say we need to build an N-Gram language model for some corpus. One commonly used tool for this is KenLM.
Let's download and build it from source" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "Download KenLM from git repo" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 51, 89 | "metadata": {}, 90 | "outputs": [ 91 | { 92 | "name": "stdout", 93 | "output_type": "stream", 94 | "text": [ 95 | "fatal: destination path 'kenlm' already exists and is not an empty directory.\n" 96 | ] 97 | }, 98 | { 99 | "ename": "", 100 | "evalue": "128", 101 | "output_type": "error", 102 | "traceback": [] 103 | } 104 | ], 105 | "source": [ 106 | "git clone https://github.com/kpu/kenlm" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "Install necessary dependencies for building KenLM (follow docs: https://kheafield.com/code/kenlm/dependencies/)" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 52, 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "# sudo apt-get install build-essential libboost-all-dev cmake zlib1g-dev libbz2-dev liblzma-dev" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "metadata": {}, 128 | "source": [ 129 | "List files in current directory" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 53, 135 | "metadata": {}, 136 | "outputs": [ 137 | { 138 | "name": "stdout", 139 | "output_type": "stream", 140 | "text": [ 141 | "total 80K\n", 142 | "drwxr-xr-x 6 geopar geopar 4.0K Oct 18 06:48 \u001b[0m\u001b[01;34m.\u001b[0m\n", 143 | "drwxrwxrwx 13 geopar geopar 4.0K Oct 17 19:45 \u001b[34;42m..\u001b[0m\n", 144 | "-rw-r--r-- 1 geopar geopar 55K Oct 18 06:48 'Bash Tutorial.ipynb'\n", 145 | "drwxr-xr-x 2 geopar geopar 4.0K Oct 17 20:52 \u001b[01;34mbin\u001b[0m\n", 146 | "drwxr-xr-x 2 geopar geopar 4.0K Oct 18 06:42 \u001b[01;34mdata\u001b[0m\n", 147 | "drwxr-xr-x 2 geopar geopar 4.0K Oct 17 19:46 \u001b[01;34m.ipynb_checkpoints\u001b[0m\n", 148 | "drwxr-xr-x 9 geopar 
geopar 4.0K Oct 17 20:41 \u001b[01;34mkenlm\u001b[0m\n" 149 | ] 150 | } 151 | ], 152 | "source": [ 153 | "ls -lah" 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "metadata": {}, 159 | "source": [ 160 | "Navigate inside the kenlm directory" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 54, 166 | "metadata": {}, 167 | "outputs": [], 168 | "source": [ 169 | "cd kenlm" 170 | ] 171 | }, 172 | { 173 | "cell_type": "markdown", 174 | "metadata": {}, 175 | "source": [ 176 | "Display current working directory" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": 55, 182 | "metadata": {}, 183 | "outputs": [ 184 | { 185 | "name": "stdout", 186 | "output_type": "stream", 187 | "text": [ 188 | "/home/geopar/projects/bash_tutorial/kenlm\n" 189 | ] 190 | } 191 | ], 192 | "source": [ 193 | "pwd" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": 56, 199 | "metadata": {}, 200 | "outputs": [ 201 | { 202 | "name": "stdout", 203 | "output_type": "stream", 204 | "text": [ 205 | "total 212K\n", 206 | "drwxr-xr-x 9 geopar geopar 4.0K Oct 17 20:41 \u001b[0m\u001b[01;34m.\u001b[0m\n", 207 | "drwxr-xr-x 6 geopar geopar 4.0K Oct 18 06:48 \u001b[01;34m..\u001b[0m\n", 208 | "drwxr-xr-x 7 geopar geopar 4.0K Oct 17 20:41 \u001b[01;34mbuild\u001b[0m\n", 209 | "-rw-r--r-- 1 geopar geopar 696 Oct 17 20:40 BUILDING\n", 210 | "-rwxr-xr-x 1 geopar geopar 81 Oct 17 20:40 \u001b[01;32mclean_query_only.sh\u001b[0m\n", 211 | "drwxr-xr-x 3 geopar geopar 4.0K Oct 17 20:40 \u001b[01;34mcmake\u001b[0m\n", 212 | "-rw-r--r-- 1 geopar geopar 3.7K Oct 17 20:40 CMakeLists.txt\n", 213 | "-rwxr-xr-x 1 geopar geopar 1.2K Oct 17 20:40 \u001b[01;32mcompile_query_only.sh\u001b[0m\n", 214 | "-rw-r--r-- 1 geopar geopar 26K Oct 17 20:40 COPYING\n", 215 | "-rw-r--r-- 1 geopar geopar 35K Oct 17 20:40 COPYING.3\n", 216 | "-rw-r--r-- 1 geopar geopar 7.5K Oct 17 20:40 COPYING.LESSER.3\n", 217 | "-rw-r--r-- 1 geopar geopar 63K Oct 17 20:40
Doxyfile\n", 218 | "drwxr-xr-x 6 geopar geopar 4.0K Oct 17 20:40 \u001b[01;34m.git\u001b[0m\n", 219 | "drwxr-xr-x 3 geopar geopar 4.0K Oct 17 20:40 \u001b[01;34m.github\u001b[0m\n", 220 | "-rw-r--r-- 1 geopar geopar 249 Oct 17 20:40 .gitignore\n", 221 | "-rw-r--r-- 1 geopar geopar 1.2K Oct 17 20:40 LICENSE\n", 222 | "drwxr-xr-x 7 geopar geopar 4.0K Oct 17 20:40 \u001b[01;34mlm\u001b[0m\n", 223 | "-rw-r--r-- 1 geopar geopar 220 Oct 17 20:40 MANIFEST.in\n", 224 | "drwxr-xr-x 2 geopar geopar 4.0K Oct 17 20:40 \u001b[01;34mpython\u001b[0m\n", 225 | "-rw-r--r-- 1 geopar geopar 5.3K Oct 17 20:40 README.md\n", 226 | "-rw-r--r-- 1 geopar geopar 1.9K Oct 17 20:40 setup.py\n", 227 | "drwxr-xr-x 4 geopar geopar 4.0K Oct 17 20:40 \u001b[01;34mutil\u001b[0m\n" 228 | ] 229 | } 230 | ], 231 | "source": [ 232 | "ls -lah" 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "metadata": {}, 238 | "source": [ 239 | "Use cmake to compile project (follow instructions: https://github.com/kpu/kenlm)" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 11, 245 | "metadata": {}, 246 | "outputs": [ 247 | { 248 | "name": "stdout", 249 | "output_type": "stream", 250 | "text": [ 251 | "-- The C compiler identification is GNU 9.3.0\n", 252 | "-- The CXX compiler identification is GNU 9.3.0\n", 253 | "-- Check for working C compiler: /usr/bin/cc\n", 254 | "-- Check for working C compiler: /usr/bin/cc -- works\n", 255 | "-- Detecting C compiler ABI info\n", 256 | "-- Detecting C compiler ABI info - done\n", 257 | "-- Detecting C compile features\n", 258 | "-- Detecting C compile features - done\n", 259 | "-- Check for working CXX compiler: /usr/bin/c++\n", 260 | "-- Check for working CXX compiler: /usr/bin/c++ -- works\n", 261 | "-- Detecting CXX compiler ABI info\n", 262 | "-- Detecting CXX compiler ABI info - done\n", 263 | "-- Detecting CXX compile features\n", 264 | "-- Detecting CXX compile features - done\n", 265 | "-- Found Boost: 
/usr/lib/x86_64-linux-gnu/cmake/Boost-1.71.0/BoostConfig.cmake (found suitable version \"1.71.0\", minimum required is \"1.41.0\") found components: program_options system thread unit_test_framework \n", 266 | "-- Check if compiler accepts -pthread\n", 267 | "-- Check if compiler accepts -pthread - yes\n", 268 | "-- Found Threads: TRUE \n", 269 | "-- Found ZLIB: /usr/lib/x86_64-linux-gnu/libz.so (found version \"1.2.11\") \n", 270 | "-- Found BZip2: /usr/lib/x86_64-linux-gnu/libbz2.so (found version \"1.0.8\") \n", 271 | "-- Looking for BZ2_bzCompressInit\n", 272 | "-- Looking for BZ2_bzCompressInit - found\n", 273 | "-- Looking for lzma_auto_decoder in /usr/lib/x86_64-linux-gnu/liblzma.so\n", 274 | "-- Looking for lzma_auto_decoder in /usr/lib/x86_64-linux-gnu/liblzma.so - found\n", 275 | "-- Looking for lzma_easy_encoder in /usr/lib/x86_64-linux-gnu/liblzma.so\n", 276 | "-- Looking for lzma_easy_encoder in /usr/lib/x86_64-linux-gnu/liblzma.so - found\n", 277 | "-- Looking for lzma_lzma_preset in /usr/lib/x86_64-linux-gnu/liblzma.so\n", 278 | "-- Looking for lzma_lzma_preset in /usr/lib/x86_64-linux-gnu/liblzma.so - found\n", 279 | "-- Found LibLZMA: /usr/lib/x86_64-linux-gnu/liblzma.so (found version \"5.2.4\") \n", 280 | "-- Found OpenMP_C: -fopenmp (found version \"4.5\") \n", 281 | "-- Found OpenMP_CXX: -fopenmp (found version \"4.5\") \n", 282 | "-- Found OpenMP: TRUE (found version \"4.5\") \n", 283 | "-- Configuring done\n", 284 | "-- Generating done\n", 285 | "-- Build files have been written to: /home/geopar/projects/bash_tutorial/kenlm/build\n", 286 | "\u001b[35m\u001b[1mScanning dependencies of target kenlm_util\u001b[0m\n", 287 | "[ 1%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/double-conversion/bignum.cc.o\u001b[0m\n", 288 | "[ 2%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/double-conversion/diy-fp.cc.o\u001b[0m\n", 289 | "[ 3%] \u001b[32mBuilding CXX object 
util/CMakeFiles/kenlm_util.dir/double-conversion/bignum-dtoa.cc.o\u001b[0m\n", 290 | "[ 4%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/double-conversion/cached-powers.cc.o\u001b[0m\n", 291 | "[ 5%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/double-conversion/double-conversion.cc.o\u001b[0m\n", 292 | "[ 6%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/double-conversion/fast-dtoa.cc.o\u001b[0m\n", 293 | "[ 7%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/double-conversion/fixed-dtoa.cc.o\u001b[0m\n", 294 | "[ 8%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/double-conversion/strtod.cc.o\u001b[0m\n", 295 | "[ 9%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/stream/chain.cc.o\u001b[0m\n", 296 | "[ 10%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/stream/count_records.cc.o\u001b[0m\n", 297 | "[ 11%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/stream/io.cc.o\u001b[0m\n", 298 | "[ 12%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/stream/line_input.cc.o\u001b[0m\n", 299 | "[ 13%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/stream/multi_progress.cc.o\u001b[0m\n", 300 | "[ 14%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/stream/rewindable_stream.cc.o\u001b[0m\n", 301 | "[ 15%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/bit_packing.cc.o\u001b[0m\n", 302 | "[ 16%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/ersatz_progress.cc.o\u001b[0m\n", 303 | "[ 17%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/exception.cc.o\u001b[0m\n", 304 | "[ 18%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/file.cc.o\u001b[0m\n", 305 | "[ 19%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/file_piece.cc.o\u001b[0m\n", 306 | "[ 20%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/float_to_string.cc.o\u001b[0m\n", 307 | "[ 21%] 
\u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/integer_to_string.cc.o\u001b[0m\n", 308 | "[ 22%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/mmap.cc.o\u001b[0m\n", 309 | "[ 23%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/murmur_hash.cc.o\u001b[0m\n", 310 | "[ 25%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/parallel_read.cc.o\u001b[0m\n", 311 | "[ 26%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/pool.cc.o\u001b[0m\n", 312 | "[ 27%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/read_compressed.cc.o\u001b[0m\n", 313 | "[ 28%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/scoped.cc.o\u001b[0m\n", 314 | "[ 29%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/spaces.cc.o\u001b[0m\n", 315 | "[ 30%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/string_piece.cc.o\u001b[0m\n", 316 | "[ 31%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/usage.cc.o\u001b[0m\n", 317 | "[ 32%] \u001b[32m\u001b[1mLinking CXX static library ../lib/libkenlm_util.a\u001b[0m\n", 318 | "[ 32%] Built target kenlm_util\n", 319 | "\u001b[35m\u001b[1mScanning dependencies of target kenlm\u001b[0m\n", 320 | "\u001b[35m\u001b[1mScanning dependencies of target probing_hash_table_benchmark\u001b[0m\n", 321 | "\u001b[35m\u001b[1mScanning dependencies of target kenlm_filter\u001b[0m\n", 322 | "[ 33%] \u001b[32mBuilding CXX object util/CMakeFiles/probing_hash_table_benchmark.dir/probing_hash_table_benchmark_main.cc.o\u001b[0m\n", 323 | "[ 34%] \u001b[32mBuilding CXX object lm/filter/CMakeFiles/kenlm_filter.dir/arpa_io.cc.o\u001b[0m\n", 324 | "[ 35%] \u001b[32mBuilding CXX object lm/filter/CMakeFiles/kenlm_filter.dir/phrase.cc.o\u001b[0m\n", 325 | "[ 36%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/bhiksha.cc.o\u001b[0m\n", 326 | "[ 37%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/binary_format.cc.o\u001b[0m\n", 327 | "[ 38%] 
\u001b[32mBuilding CXX object lm/filter/CMakeFiles/kenlm_filter.dir/vocab.cc.o\u001b[0m\n", 328 | "[ 39%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/config.cc.o\u001b[0m\n", 329 | "[ 40%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/lm_exception.cc.o\u001b[0m\n", 330 | "[ 41%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/model.cc.o\u001b[0m\n", 331 | "[ 42%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/quantize.cc.o\u001b[0m\n", 332 | "[ 43%] \u001b[32m\u001b[1mLinking CXX static library ../../lib/libkenlm_filter.a\u001b[0m\n", 333 | "[ 43%] Built target kenlm_filter\n", 334 | "[ 44%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/read_arpa.cc.o\u001b[0m\n", 335 | "[ 45%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/search_hashed.cc.o\u001b[0m\n", 336 | "[ 46%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/search_trie.cc.o\u001b[0m\n", 337 | "[ 47%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/sizes.cc.o\u001b[0m\n", 338 | "[ 48%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/trie.cc.o\u001b[0m\n", 339 | "[ 50%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/trie_sort.cc.o\u001b[0m\n", 340 | "[ 51%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/value_build.cc.o\u001b[0m\n", 341 | "[ 52%] \u001b[32m\u001b[1mLinking CXX executable ../bin/probing_hash_table_benchmark\u001b[0m\n", 342 | "[ 52%] Built target probing_hash_table_benchmark\n", 343 | "[ 53%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/virtual_interface.cc.o\u001b[0m\n", 344 | "[ 54%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/vocab.cc.o\u001b[0m\n", 345 | "[ 55%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/common/model_buffer.cc.o\u001b[0m\n", 346 | "[ 56%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/common/print.cc.o\u001b[0m\n", 347 | "[ 57%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/common/renumber.cc.o\u001b[0m\n", 348 | "[ 58%] \u001b[32mBuilding CXX object 
lm/CMakeFiles/kenlm.dir/common/size_option.cc.o\u001b[0m\n", 349 | "[ 59%] \u001b[32m\u001b[1mLinking CXX static library ../lib/libkenlm.a\u001b[0m\n", 350 | "[ 59%] Built target kenlm\n", 351 | "\u001b[35m\u001b[1mScanning dependencies of target fragment\u001b[0m\n", 352 | "\u001b[35m\u001b[1mScanning dependencies of target kenlm_benchmark\u001b[0m\n", 353 | "\u001b[35m\u001b[1mScanning dependencies of target query\u001b[0m\n", 354 | "\u001b[35m\u001b[1mScanning dependencies of target build_binary\u001b[0m\n", 355 | "[ 60%] \u001b[32mBuilding CXX object lm/CMakeFiles/query.dir/query_main.cc.o\u001b[0m\n", 356 | "[ 61%] \u001b[32mBuilding CXX object lm/CMakeFiles/fragment.dir/fragment_main.cc.o\u001b[0m\n", 357 | "[ 62%] \u001b[32mBuilding CXX object lm/CMakeFiles/build_binary.dir/build_binary_main.cc.o\u001b[0m\n", 358 | "[ 63%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm_benchmark.dir/kenlm_benchmark_main.cc.o\u001b[0m\n", 359 | "[ 64%] \u001b[32m\u001b[1mLinking CXX executable ../bin/fragment\u001b[0m\n", 360 | "[ 64%] Built target fragment\n", 361 | "\u001b[35m\u001b[1mScanning dependencies of target kenlm_builder\u001b[0m\n", 362 | "[ 65%] \u001b[32mBuilding CXX object lm/builder/CMakeFiles/kenlm_builder.dir/adjust_counts.cc.o\u001b[0m\n", 363 | "[ 66%] \u001b[32m\u001b[1mLinking CXX executable ../bin/build_binary\u001b[0m\n", 364 | "[ 66%] Built target build_binary\n", 365 | "\u001b[35m\u001b[1mScanning dependencies of target phrase_table_vocab\u001b[0m\n", 366 | "[ 67%] \u001b[32mBuilding CXX object lm/filter/CMakeFiles/phrase_table_vocab.dir/phrase_table_vocab_main.cc.o\u001b[0m\n" 367 | ] 368 | }, 369 | { 370 | "name": "stdout", 371 | "output_type": "stream", 372 | "text": [ 373 | "[ 68%] \u001b[32m\u001b[1mLinking CXX executable ../bin/query\u001b[0m\n", 374 | "[ 68%] Built target query\n", 375 | "\u001b[35m\u001b[1mScanning dependencies of target filter\u001b[0m\n", 376 | "[ 69%] \u001b[32mBuilding CXX object 
lm/filter/CMakeFiles/filter.dir/filter_main.cc.o\u001b[0m\n", 377 | "[ 70%] \u001b[32m\u001b[1mLinking CXX executable ../../bin/phrase_table_vocab\u001b[0m\n", 378 | "[ 70%] Built target phrase_table_vocab\n", 379 | "\u001b[35m\u001b[1mScanning dependencies of target kenlm_interpolate\u001b[0m\n", 380 | "[ 71%] \u001b[32mBuilding CXX object lm/interpolate/CMakeFiles/kenlm_interpolate.dir/backoff_reunification.cc.o\u001b[0m\n", 381 | "[ 72%] \u001b[32mBuilding CXX object lm/builder/CMakeFiles/kenlm_builder.dir/corpus_count.cc.o\u001b[0m\n", 382 | "[ 73%] \u001b[32mBuilding CXX object lm/interpolate/CMakeFiles/kenlm_interpolate.dir/bounded_sequence_encoding.cc.o\u001b[0m\n", 383 | "[ 75%] \u001b[32mBuilding CXX object lm/interpolate/CMakeFiles/kenlm_interpolate.dir/merge_probabilities.cc.o\u001b[0m\n", 384 | "[ 76%] \u001b[32mBuilding CXX object lm/builder/CMakeFiles/kenlm_builder.dir/initial_probabilities.cc.o\u001b[0m\n", 385 | "[ 77%] \u001b[32mBuilding CXX object lm/interpolate/CMakeFiles/kenlm_interpolate.dir/merge_vocab.cc.o\u001b[0m\n", 386 | "[ 78%] \u001b[32mBuilding CXX object lm/interpolate/CMakeFiles/kenlm_interpolate.dir/normalize.cc.o\u001b[0m\n", 387 | "[ 79%] \u001b[32mBuilding CXX object lm/builder/CMakeFiles/kenlm_builder.dir/interpolate.cc.o\u001b[0m\n", 388 | "[ 80%] \u001b[32mBuilding CXX object lm/builder/CMakeFiles/kenlm_builder.dir/output.cc.o\u001b[0m\n", 389 | "[ 81%] \u001b[32m\u001b[1mLinking CXX executable ../bin/kenlm_benchmark\u001b[0m\n", 390 | "[ 81%] Built target kenlm_benchmark\n", 391 | "[ 82%] \u001b[32mBuilding CXX object lm/builder/CMakeFiles/kenlm_builder.dir/pipeline.cc.o\u001b[0m\n", 392 | "[ 83%] \u001b[32mBuilding CXX object lm/interpolate/CMakeFiles/kenlm_interpolate.dir/pipeline.cc.o\u001b[0m\n", 393 | "[ 84%] \u001b[32m\u001b[1mLinking CXX executable ../../bin/filter\u001b[0m\n", 394 | "[ 84%] Built target filter\n", 395 | "[ 85%] \u001b[32mBuilding CXX object 
lm/interpolate/CMakeFiles/kenlm_interpolate.dir/split_worker.cc.o\u001b[0m\n", 396 | "[ 86%] \u001b[32mBuilding CXX object lm/interpolate/CMakeFiles/kenlm_interpolate.dir/tune_derivatives.cc.o\u001b[0m\n", 397 | "[ 87%] \u001b[32mBuilding CXX object lm/interpolate/CMakeFiles/kenlm_interpolate.dir/tune_instances.cc.o\u001b[0m\n", 398 | "[ 88%] \u001b[32mBuilding CXX object lm/interpolate/CMakeFiles/kenlm_interpolate.dir/tune_weights.cc.o\u001b[0m\n", 399 | "[ 89%] \u001b[32mBuilding CXX object lm/interpolate/CMakeFiles/kenlm_interpolate.dir/universal_vocab.cc.o\u001b[0m\n", 400 | "[ 90%] \u001b[32m\u001b[1mLinking CXX static library ../../lib/libkenlm_builder.a\u001b[0m\n", 401 | "[ 90%] Built target kenlm_builder\n", 402 | "\u001b[35m\u001b[1mScanning dependencies of target lmplz\u001b[0m\n", 403 | "\u001b[35m\u001b[1mScanning dependencies of target count_ngrams\u001b[0m\n", 404 | "[ 91%] \u001b[32mBuilding CXX object lm/builder/CMakeFiles/lmplz.dir/lmplz_main.cc.o\u001b[0m\n", 405 | "[ 92%] \u001b[32mBuilding CXX object lm/builder/CMakeFiles/count_ngrams.dir/count_ngrams_main.cc.o\u001b[0m\n", 406 | "[ 93%] \u001b[32m\u001b[1mLinking CXX static library ../../lib/libkenlm_interpolate.a\u001b[0m\n", 407 | "[ 93%] Built target kenlm_interpolate\n", 408 | "\u001b[35m\u001b[1mScanning dependencies of target interpolate\u001b[0m\n", 409 | "\u001b[35m\u001b[1mScanning dependencies of target streaming_example\u001b[0m\n", 410 | "[ 94%] \u001b[32mBuilding CXX object lm/interpolate/CMakeFiles/streaming_example.dir/streaming_example_main.cc.o\u001b[0m\n", 411 | "[ 95%] \u001b[32mBuilding CXX object lm/interpolate/CMakeFiles/interpolate.dir/interpolate_main.cc.o\u001b[0m\n", 412 | "[ 96%] \u001b[32m\u001b[1mLinking CXX executable ../../bin/lmplz\u001b[0m\n", 413 | "[ 96%] Built target lmplz\n", 414 | "[ 97%] \u001b[32m\u001b[1mLinking CXX executable ../../bin/count_ngrams\u001b[0m\n", 415 | "[ 97%] Built target count_ngrams\n", 416 | "[ 98%] \u001b[32m\u001b[1mLinking CXX 
executable ../../bin/interpolate\u001b[0m\n", 417 | "[ 98%] Built target interpolate\n", 418 | "[100%] \u001b[32m\u001b[1mLinking CXX executable ../../bin/streaming_example\u001b[0m\n", 419 | "[100%] Built target streaming_example\n" 420 | ] 421 | } 422 | ], 423 | "source": [ 424 | "mkdir build\n", 425 | "cd build\n", 426 | "cmake ..\n", 427 | "make -j4\n", 428 | "# make install to install kenlm system-wide" 429 | ] 430 | }, 431 | { 432 | "cell_type": "markdown", 433 | "metadata": {}, 434 | "source": [ 435 | "The previous commands built the KenLM binaries inside the build/bin folder. Let's copy them to a more accessible directory" 436 | ] 437 | }, 438 | { 439 | "cell_type": "code", 440 | "execution_count": 57, 441 | "metadata": {}, 442 | "outputs": [ 443 | { 444 | "name": "stdout", 445 | "output_type": "stream", 446 | "text": [ 447 | "total 192\n", 448 | "drwxr-xr-x 7 geopar geopar 4096 Oct 17 20:41 \u001b[0m\u001b[01;34mbuild\u001b[0m\n", 449 | "-rw-r--r-- 1 geopar geopar 696 Oct 17 20:40 BUILDING\n", 450 | "-rwxr-xr-x 1 geopar geopar 81 Oct 17 20:40 \u001b[01;32mclean_query_only.sh\u001b[0m\n", 451 | "drwxr-xr-x 3 geopar geopar 4096 Oct 17 20:40 \u001b[01;34mcmake\u001b[0m\n", 452 | "-rw-r--r-- 1 geopar geopar 3689 Oct 17 20:40 CMakeLists.txt\n", 453 | "-rwxr-xr-x 1 geopar geopar 1154 Oct 17 20:40 \u001b[01;32mcompile_query_only.sh\u001b[0m\n", 454 | "-rw-r--r-- 1 geopar geopar 26530 Oct 17 20:40 COPYING\n", 455 | "-rw-r--r-- 1 geopar geopar 35147 Oct 17 20:40 COPYING.3\n", 456 | "-rw-r--r-- 1 geopar geopar 7637 Oct 17 20:40 COPYING.LESSER.3\n", 457 | "-rw-r--r-- 1 geopar geopar 63537 Oct 17 20:40 Doxyfile\n", 458 | "-rw-r--r-- 1 geopar geopar 1150 Oct 17 20:40 LICENSE\n", 459 | "drwxr-xr-x 7 geopar geopar 4096 Oct 17 20:40 \u001b[01;34mlm\u001b[0m\n", 460 | "-rw-r--r-- 1 geopar geopar 220 Oct 17 20:40 MANIFEST.in\n", 461 | "drwxr-xr-x 2 geopar geopar 4096 Oct 17 20:40 \u001b[01;34mpython\u001b[0m\n", 462 | "-rw-r--r-- 1 geopar geopar 5394 Oct 17 20:40 README.md\n", 463
| "-rw-r--r-- 1 geopar geopar 1937 Oct 17 20:40 setup.py\n", 464 | "drwxr-xr-x 4 geopar geopar 4096 Oct 17 20:40 \u001b[01;34mutil\u001b[0m\n" 465 | ] 466 | } 467 | ], 468 | "source": [ 469 | "ls -l" 470 | ] 471 | }, 472 | { 473 | "cell_type": "code", 474 | "execution_count": 58, 475 | "metadata": {}, 476 | "outputs": [ 477 | { 478 | "name": "stdout", 479 | "output_type": "stream", 480 | "text": [ 481 | "cp: cannot stat 'bin': No such file or directory\n", 482 | "total 56K\n", 483 | "drwxrwxrwx 13 geopar geopar 4.0K Oct 17 19:45 \u001b[0m\u001b[34;42m.\u001b[0m\n", 484 | "drwxr-xr-x 24 geopar geopar 4.0K Oct 17 20:53 \u001b[01;34m..\u001b[0m\n", 485 | "drwxr-xr-x 6 geopar geopar 4.0K Oct 18 06:48 \u001b[01;34mbash_tutorial\u001b[0m\n", 486 | "drwxr-xr-x 4 geopar geopar 4.0K Sep 22 16:06 \u001b[01;34mexam-generator\u001b[0m\n", 487 | "drwxr-xr-x 3 geopar geopar 4.0K Oct 7 20:20 \u001b[01;34mfsspell\u001b[0m\n", 488 | "-rw-r--r-- 1 geopar geopar 174 Oct 8 18:25 mispel.py\n", 489 | "drwxr-xr-x 3 geopar geopar 4.0K Oct 8 18:20 \u001b[01;34m.mypy_cache\u001b[0m\n", 490 | "drwxr-xr-x 4 geopar geopar 4.0K Sep 3 13:23 \u001b[01;34mpython-lab\u001b[0m\n", 491 | "drwxrwxrwx 17 geopar geopar 4.0K Sep 7 06:21 \u001b[34;42mslp\u001b[0m\n", 492 | "drwxr-xr-x 14 geopar geopar 4.0K Sep 28 18:57 \u001b[01;34mslp_daptmlm\u001b[0m\n", 493 | "drwxr-xr-x 10 geopar geopar 4.0K Sep 6 14:21 \u001b[01;34mspace-vim\u001b[0m\n", 494 | "drwxr-xr-x 2 geopar geopar 4.0K Oct 7 18:19 \u001b[01;34mspellweb\u001b[0m\n", 495 | "drwxr-xr-x 14 geopar geopar 4.0K Sep 7 08:09 \u001b[01;34msplchk\u001b[0m\n", 496 | "drwxr-xr-x 15 geopar geopar 4.0K Sep 15 19:20 \u001b[01;34mtransformers\u001b[0m\n" 497 | ] 498 | } 499 | ], 500 | "source": [ 501 | "cp -r bin ../../ # 2 directories up\n", 502 | "cd ../../\n", 503 | "ls -lah" 504 | ] 505 | }, 506 | { 507 | "cell_type": "markdown", 508 | "metadata": {}, 509 | "source": [ 510 | "## Downloading and preprocessing the training corpus\n", 511 | "\n", 512 | "Let's get a
book from project gutenberg and clean it up using bash" 513 | ] 514 | }, 515 | { 516 | "cell_type": "code", 517 | "execution_count": 20, 518 | "metadata": {}, 519 | "outputs": [ 520 | { 521 | "name": "stdout", 522 | "output_type": "stream", 523 | "text": [ 524 | "--2020-10-17 20:54:39-- http://www.gutenberg.org/cache/epub/345/pg345.txt\n", 525 | "Resolving www.gutenberg.org (www.gutenberg.org)... 152.19.134.47, 2610:28:3090:3000:0:bad:cafe:47\n", 526 | "Connecting to www.gutenberg.org (www.gutenberg.org)|152.19.134.47|:80... connected.\n", 527 | "HTTP request sent, awaiting response... 200 OK\n", 528 | "Length: 883160 (862K) [text/plain]\n", 529 | "Saving to: ‘data/dracula.txt’\n", 530 | "\n", 531 | "data/dracula.txt 100%[===================>] 862.46K 272KB/s in 3.2s \n", 532 | "\n", 533 | "2020-10-17 20:54:43 (272 KB/s) - ‘data/dracula.txt’ saved [883160/883160]\n", 534 | "\n" 535 | ] 536 | } 537 | ], 538 | "source": [ 539 | "mkdir data\n", 540 | "wget -O data/dracula.txt http://www.gutenberg.org/cache/epub/345/pg345.txt" 541 | ] 542 | }, 543 | { 544 | "cell_type": "code", 545 | "execution_count": 36, 546 | "metadata": {}, 547 | "outputs": [ 548 | { 549 | "name": "stdout", 550 | "output_type": "stream", 551 | "text": [ 552 | "total 1.7M\n", 553 | "drwxr-xr-x 2 geopar geopar 4.0K Oct 18 06:42 \u001b[0m\u001b[01;34m.\u001b[0m\n", 554 | "drwxr-xr-x 6 geopar geopar 4.0K Oct 18 06:42 \u001b[01;34m..\u001b[0m\n", 555 | "-rw-r--r-- 1 geopar geopar 859K Oct 18 06:42 dracula1.txt\n", 556 | "-rw-r--r-- 1 geopar geopar 863K Oct 1 11:00 dracula.txt\n" 557 | ] 558 | } 559 | ], 560 | "source": [ 561 | "ls -lah data" 562 | ] 563 | }, 564 | { 565 | "cell_type": "markdown", 566 | "metadata": {}, 567 | "source": [ 568 | "Count the number of lines, words and characters using wc" 569 | ] 570 | }, 571 | { 572 | "cell_type": "code", 573 | "execution_count": 83, 574 | "metadata": {}, 575 | "outputs": [ 576 | { 577 | "name": "stdout", 578 | "output_type": "stream", 579 | "text": [ 580 | 
"15973 data/dracula.txt\n" 581 | ] 582 | } 583 | ], 584 | "source": [ 585 | "wc -l data/dracula.txt" 586 | ] 587 | }, 588 | { 589 | "cell_type": "markdown", 590 | "metadata": {}, 591 | "source": [ 592 | "We can format column printing using awk" 593 | ] 594 | }, 595 | { 596 | "cell_type": "code", 597 | "execution_count": 78, 598 | "metadata": {}, 599 | "outputs": [ 600 | { 601 | "name": "stdout", 602 | "output_type": "stream", 603 | "text": [ 604 | "data/dracula.txt contains 15973 lines\n" 605 | ] 606 | } 607 | ], 608 | "source": [ 609 | "wc -l data/dracula.txt | awk '{printf \"%s contains %s lines\\n\", $2, $1}'" 610 | ] 611 | }, 612 | { 613 | "cell_type": "code", 614 | "execution_count": 65, 615 | "metadata": {}, 616 | "outputs": [ 617 | { 618 | "name": "stdout", 619 | "output_type": "stream", 620 | "text": [ 621 | "data/dracula.txt contains 164424 words\n" 622 | ] 623 | } 624 | ], 625 | "source": [ 626 | "wc -w data/dracula.txt | awk '{printf \"%s contains %s words\\n\", $2, $1}'" 627 | ] 628 | }, 629 | { 630 | "cell_type": "code", 631 | "execution_count": 66, 632 | "metadata": {}, 633 | "outputs": [ 634 | { 635 | "name": "stdout", 636 | "output_type": "stream", 637 | "text": [ 638 | "data/dracula.txt contains 883160 characters\n" 639 | ] 640 | } 641 | ], 642 | "source": [ 643 | "wc -c data/dracula.txt | awk '{printf \"%s contains %s characters\\n\", $2, $1}'" 644 | ] 645 | }, 646 | { 647 | "cell_type": "markdown", 648 | "metadata": {}, 649 | "source": [ 650 | "Inspect the first 250 lines using head" 651 | ] 652 | }, 653 | { 654 | "cell_type": "code", 655 | "execution_count": 70, 656 | "metadata": {}, 657 | "outputs": [ 658 | { 659 | "name": "stdout", 660 | "output_type": "stream", 661 | "text": [ 662 | "The Project Gutenberg EBook of Dracula, by Bram Stoker\n", 663 | "\n", 664 | "This eBook is for the use of anyone anywhere at no cost and with\n", 665 | "almost no restrictions whatsoever. 
You may copy it, give it away or\n", 666 | "re-use it under the terms of the Project Gutenberg License included\n", 667 | "with this eBook or online at www.gutenberg.org/license\n", 668 | "\n", 669 | "\n", 670 | "Title: Dracula\n", 671 | "\n", 672 | "Author: Bram Stoker\n", 673 | "\n", 674 | "Release Date: August 16, 2013 [EBook #345]\n", 675 | "\n", 676 | "Language: English\n", 677 | "\n", 678 | "\n", 679 | "*** START OF THIS PROJECT GUTENBERG EBOOK DRACULA ***\n", 680 | "\n", 681 | "\n", 682 | "\n", 683 | "\n", 684 | "Produced by Chuck Greif and the Online Distributed\n", 685 | "Proofreading Team at http://www.pgdp.net (This file was\n", 686 | "produced from images generously made available by The\n", 687 | "Internet Archive)\n", 688 | "\n", 689 | "\n", 690 | "\n", 691 | "\n", 692 | "\n", 693 | "\n", 694 | "\n", 695 | " DRACULA\n", 696 | "\n", 697 | "\n", 698 | "\n", 699 | "\n", 700 | "\n", 701 | " DRACULA\n", 702 | "\n", 703 | " _by_\n", 704 | "\n", 705 | " Bram Stoker\n", 706 | "\n", 707 | " [Illustration: colophon]\n", 708 | "\n", 709 | " NEW YORK\n", 710 | "\n", 711 | " GROSSET & DUNLAP\n", 712 | "\n", 713 | " _Publishers_\n", 714 | "\n", 715 | " Copyright, 1897, in the United States of America, according\n", 716 | " to Act of Congress, by Bram Stoker\n", 717 | "\n", 718 | " [_All rights reserved._]\n", 719 | "\n", 720 | " PRINTED IN THE UNITED STATES\n", 721 | " AT\n", 722 | " THE COUNTRY LIFE PRESS, GARDEN CITY, N.Y.\n", 723 | "\n", 724 | "\n", 725 | "\n", 726 | "\n", 727 | " TO\n", 728 | "\n", 729 | " MY DEAR FRIEND\n", 730 | "\n", 731 | " HOMMY-BEG\n", 732 | "\n", 733 | "\n", 734 | "\n", 735 | "\n", 736 | "CONTENTS\n", 737 | "\n", 738 | "\n", 739 | "CHAPTER I\n", 740 | " Page\n", 741 | "\n", 742 | "Jonathan Harker's Journal 1\n", 743 | "\n", 744 | "CHAPTER II\n", 745 | "\n", 746 | "Jonathan Harker's Journal 14\n", 747 | "\n", 748 | "CHAPTER III\n", 749 | "\n", 750 | "Jonathan Harker's Journal 26\n", 751 | "\n", 752 | "CHAPTER IV\n", 753 | "\n", 754 | 
"Jonathan Harker's Journal 38\n", 755 | "\n", 756 | "CHAPTER V\n", 757 | "\n", 758 | "Letters--Lucy and Mina 51\n", 759 | "\n", 760 | "CHAPTER VI\n", 761 | "\n", 762 | "Mina Murray's Journal 59\n", 763 | "\n", 764 | "CHAPTER VII\n", 765 | "\n", 766 | "Cutting from \"The Dailygraph,\" 8 August 71\n", 767 | "\n", 768 | "CHAPTER VIII\n", 769 | "\n", 770 | "Mina Murray's Journal 84\n", 771 | "\n", 772 | "CHAPTER IX\n", 773 | "\n", 774 | "Mina Murray's Journal 98\n", 775 | "\n", 776 | "CHAPTER X\n", 777 | "\n", 778 | "Mina Murray's Journal 111\n", 779 | "\n", 780 | "CHAPTER XI\n", 781 | "\n", 782 | "Lucy Westenra's Diary 124\n", 783 | "\n", 784 | "CHAPTER XII\n", 785 | "\n", 786 | "Dr. Seward's Diary 136\n", 787 | "\n", 788 | "CHAPTER XIII\n", 789 | "\n", 790 | "Dr. Seward's Diary 152\n", 791 | "\n", 792 | "CHAPTER XIV\n", 793 | "\n", 794 | "Mina Harker's Journal 167\n", 795 | "\n", 796 | "CHAPTER XV\n", 797 | "\n", 798 | "Dr. Seward's Diary 181\n", 799 | "\n", 800 | "CHAPTER XVI\n", 801 | "\n", 802 | "Dr. Seward's Diary 194\n", 803 | "\n", 804 | "CHAPTER XVII\n", 805 | "\n", 806 | "Dr. Seward's Diary 204\n", 807 | "\n", 808 | "CHAPTER XVIII\n", 809 | "\n", 810 | "Dr. Seward's Diary 216\n", 811 | "\n", 812 | "CHAPTER XIX\n", 813 | "\n", 814 | "Jonathan Harker's Journal 231\n", 815 | "\n", 816 | "CHAPTER XX\n", 817 | "\n", 818 | "Jonathan Harker's Journal 243\n", 819 | "\n", 820 | "CHAPTER XXI\n", 821 | "\n", 822 | "Dr. Seward's Diary 256\n", 823 | "\n", 824 | "CHAPTER XXII\n", 825 | "\n", 826 | "Jonathan Harker's Journal 269\n", 827 | "\n", 828 | "CHAPTER XXIII\n", 829 | "\n", 830 | "Dr. Seward's Diary 281\n", 831 | "\n", 832 | "CHAPTER XXIV\n", 833 | "\n", 834 | "Dr. Seward's Phonograph Diary, spoken by Van Helsing 294\n", 835 | "\n", 836 | "CHAPTER XXV\n", 837 | "\n", 838 | "Dr. Seward's Diary 308\n", 839 | "\n", 840 | "CHAPTER XXVI\n", 841 | "\n", 842 | "Dr. 
Seward's Diary 322\n", 843 | "\n", 844 | "CHAPTER XXVII\n", 845 | "\n", 846 | "Mina Harker's Journal 338\n", 847 | "\n", 848 | "\n", 849 | "\n", 850 | "\n", 851 | "DRACULA\n", 852 | "\n", 853 | "\n", 854 | "\n", 855 | "\n", 856 | "CHAPTER I\n", 857 | "\n", 858 | "JONATHAN HARKER'S JOURNAL\n", 859 | "\n", 860 | "(_Kept in shorthand._)\n", 861 | "\n", 862 | "\n", 863 | "_3 May. Bistritz._--Left Munich at 8:35 P. M., on 1st May, arriving at\n", 864 | "Vienna early next morning; should have arrived at 6:46, but train was an\n", 865 | "hour late. Buda-Pesth seems a wonderful place, from the glimpse which I\n", 866 | "got of it from the train and the little I could walk through the\n", 867 | "streets. I feared to go very far from the station, as we had arrived\n", 868 | "late and would start as near the correct time as possible. The\n", 869 | "impression I had was that we were leaving the West and entering the\n", 870 | "East; the most western of splendid bridges over the Danube, which is\n", 871 | "here of noble width and depth, took us among the traditions of Turkish\n", 872 | "rule.\n", 873 | "\n", 874 | "We left in pretty good time, and came after nightfall to Klausenburgh.\n", 875 | "Here I stopped for the night at the Hotel Royale. I had for dinner, or\n", 876 | "rather supper, a chicken done up some way with red pepper, which was\n", 877 | "very good but thirsty. (_Mem._, get recipe for Mina.) I asked the\n", 878 | "waiter, and he said it was called \"paprika hendl,\" and that, as it was a\n", 879 | "national dish, I should be able to get it anywhere along the\n", 880 | "Carpathians. 
I found my smattering of German very useful here; indeed, I\n", 881 | "don't know how I should be able to get on without it.\n", 882 | "\n", 883 | "Having had some time at my disposal when in London, I had visited the\n", 884 | "British Museum, and made search among the books and maps in the library\n", 885 | "regarding Transylvania; it had struck me that some foreknowledge of the\n", 886 | "country could hardly fail to have some importance in dealing with a\n", 887 | "nobleman of that country. I find that the district he named is in the\n", 888 | "extreme east of the country, just on the borders of three states,\n", 889 | "Transylvania, Moldavia and Bukovina, in the midst of the Carpathian\n", 890 | "mountains; one of the wildest and least known portions of Europe. I was\n", 891 | "not able to light on any map or work giving the exact locality of the\n", 892 | "Castle Dracula, as there are no maps of this country as yet to compare\n", 893 | "with our own Ordnance Survey maps; but I found that Bistritz, the post\n", 894 | "town named by Count Dracula, is a fairly well-known place. I shall enter\n", 895 | "here some of my notes, as they may refresh my memory when I talk over my\n", 896 | "travels with Mina.\n", 897 | "\n", 898 | "In the population of Transylvania there are four distinct nationalities:\n", 899 | "Saxons in the South, and mixed with them the Wallachs, who are the\n", 900 | "descendants of the Dacians; Magyars in the West, and Szekelys in the\n", 901 | "East and North. I am going among the latter, who claim to be descended\n", 902 | "from Attila and the Huns. This may be so, for when the Magyars conquered\n", 903 | "the country in the eleventh century they found the Huns settled in it. I\n", 904 | "read that every known superstition in the world is gathered into the\n", 905 | "horseshoe of the Carpathians, as if it were the centre of some sort of\n", 906 | "imaginative whirlpool; if so my stay may be very interesting. 
(_Mem._, I\n", 907 | "must ask the Count all about them.)\n", 908 | "\n", 909 | "I did not sleep well, though my bed was comfortable enough, for I had\n", 910 | "all sorts of queer dreams. There was a dog howling all night under my\n", 911 | "window, which may have had something to do with it; or it may have been\n" 912 | ] 913 | } 914 | ], 915 | "source": [ 916 | "head -250 data/dracula.txt" 917 | ] 918 | }, 919 | { 920 | "cell_type": "markdown", 921 | "metadata": {}, 922 | "source": [ 923 | "We see that the first 200 lines contain Project Gutenberg specific text and the table of contents. We can remove these using sed. Then we inspect the new file using head and wc." 924 | ] 925 | }, 926 | { 927 | "cell_type": "code", 928 | "execution_count": 88, 929 | "metadata": {}, 930 | "outputs": [ 931 | { 932 | "name": "stdout", 933 | "output_type": "stream", 934 | "text": [ 935 | "\n", 936 | "_3 May. Bistritz._--Left Munich at 8:35 P. M., on 1st May, arriving at\n", 937 | "Vienna early next morning; should have arrived at 6:46, but train was an\n", 938 | "hour late. Buda-Pesth seems a wonderful place, from the glimpse which I\n", 939 | "got of it from the train and the little I could walk through the\n", 940 | "streets. I feared to go very far from the station, as we had arrived\n", 941 | "late and would start as near the correct time as possible. 
The\n", 942 | "impression I had was that we were leaving the West and entering the\n", 943 | "East; the most western of splendid bridges over the Danube, which is\n", 944 | "here of noble width and depth, took us among the traditions of Turkish\n" 945 | ] 946 | } 947 | ], 948 | "source": [ 949 | "sed -e \"1,200d\" data/dracula.txt > data/dracula1.txt\n", 950 | "head data/dracula1.txt" 951 | ] 952 | }, 953 | { 954 | "cell_type": "code", 955 | "execution_count": 89, 956 | "metadata": {}, 957 | "outputs": [ 958 | { 959 | "name": "stdout", 960 | "output_type": "stream", 961 | "text": [ 962 | "data/dracula1.txt contains 15773 lines, 164092 words and 878987 characters\n" 963 | ] 964 | } 965 | ], 966 | "source": [ 967 | "wc data/dracula1.txt | \\\n", 968 | " awk '{\n", 969 | " printf \"%s contains %s lines, %s words and %s characters\\n\", $4, $1, $2, $3\n", 970 | " }'" 971 | ] 972 | }, 973 | { 974 | "cell_type": "markdown", 975 | "metadata": {}, 976 | "source": [ 977 | "We can also remove all empty lines using sed" 978 | ] 979 | }, 980 | { 981 | "cell_type": "code", 982 | "execution_count": 92, 983 | "metadata": {}, 984 | "outputs": [ 985 | { 986 | "name": "stdout", 987 | "output_type": "stream", 988 | "text": [ 989 | "data/dracula2.txt contains 13323 lines, 164092 words and 874087 characters\n" 990 | ] 991 | } 992 | ], 993 | "source": [ 994 | "sed -r '/^\\s*$/d' data/dracula1.txt > data/dracula2.txt\n", 995 | "\n", 996 | "wc data/dracula2.txt | \\\n", 997 | " awk '{\n", 998 | " printf \"%s contains %s lines, %s words and %s characters\\n\", $4, $1, $2, $3\n", 999 | " }'" 1000 | ] 1001 | }, 1002 | { 1003 | "cell_type": "markdown", 1004 | "metadata": {}, 1005 | "source": [ 1006 | "Convert all characters to lowercase using tr" 1007 | ] 1008 | }, 1009 | { 1010 | "cell_type": "code", 1011 | "execution_count": 94, 1012 | "metadata": {}, 1013 | "outputs": [ 1014 | { 1015 | "name": "stdout", 1016 | "output_type": "stream", 1017 | "text": [ 1018 | "_3 may. 
bistritz._--left munich at 8:35 p. m., on 1st may, arriving at\n", 1019 | "vienna early next morning; should have arrived at 6:46, but train was an\n", 1020 | "hour late. buda-pesth seems a wonderful place, from the glimpse which i\n", 1021 | "got of it from the train and the little i could walk through the\n", 1022 | "streets. i feared to go very far from the station, as we had arrived\n", 1023 | "late and would start as near the correct time as possible. the\n", 1024 | "impression i had was that we were leaving the west and entering the\n", 1025 | "east; the most western of splendid bridges over the danube, which is\n", 1026 | "here of noble width and depth, took us among the traditions of turkish\n", 1027 | "rule.\n" 1028 | ] 1029 | } 1030 | ], 1031 | "source": [ 1032 | "tr A-Z a-z < data/dracula2.txt > data/dracula3.txt\n", 1033 | "head data/dracula3.txt" 1034 | ] 1035 | }, 1036 | { 1037 | "cell_type": "markdown", 1038 | "metadata": {}, 1039 | "source": [ 1040 | "And remove punctuation and numbers" 1041 | ] 1042 | }, 1043 | { 1044 | "cell_type": "code", 1045 | "execution_count": 155, 1046 | "metadata": {}, 1047 | "outputs": [ 1048 | { 1049 | "name": "stdout", 1050 | "output_type": "stream", 1051 | "text": [ 1052 | " may bistritzleft munich at p m on st may arriving at\n", 1053 | "vienna early next morning should have arrived at but train was an\n", 1054 | "hour late budapesth seems a wonderful place from the glimpse which i\n", 1055 | "got of it from the train and the little i could walk through the\n", 1056 | "streets i feared to go very far from the station as we had arrived\n", 1057 | "late and would start as near the correct time as possible the\n", 1058 | "impression i had was that we were leaving the west and entering the\n", 1059 | "east the most western of splendid bridges over the danube which is\n", 1060 | "here of noble width and depth took us among the traditions of turkish\n", 1061 | "rule\n" 1062 | ] 1063 | } 1064 | ], 1065 | "source": [ 1066 | "cat data/dracula3.txt | 
tr -d [:punct:] | tr -d [:digit:] > data/dracula4.txt\n", 1067 | "head data/dracula4.txt" 1068 | ] 1069 | }, 1070 | { 1071 | "cell_type": "markdown", 1072 | "metadata": {}, 1073 | "source": [ 1074 | "Now we can perform a word frequency analysis using uniq and sort.\n", 1075 | "First we need to substitute spaces with newlines and then sort them to group the same words together. Uniq then will count consecutive lines that are the same and print word frequencies. We reverse sort the result to print most frequent words first" 1076 | ] 1077 | }, 1078 | { 1079 | "cell_type": "code", 1080 | "execution_count": 6, 1081 | "metadata": {}, 1082 | "outputs": [], 1083 | "source": [ 1084 | "# sed -r 's/\\s+/\\n/g' data/dracula4.txt | \\ # Replace spaces with new lines\n", 1085 | "# awk 'NF' | \\ # another way to remove empty lines\n", 1086 | "# sort | \\ # alphabetical sort\n", 1087 | "# uniq -c | \\ # word count \n", 1088 | "# sort -nr | \\ # reverse numeric sort\n", 1089 | "# awk '{$1=$1; print}' | \\ # strip leading and trailing whitespace\n", 1090 | "# awk 'BEGIN { OFS=\"\\t\" } {print $2,$1}' > data/wordcount.txt # reverse columns\n", 1091 | " \n", 1092 | " \n", 1093 | "sed -r 's/\\s+/\\n/g' data/dracula4.txt | \\\n", 1094 | " awk 'NF' | \\\n", 1095 | " sort | \\\n", 1096 | " uniq -c | \\\n", 1097 | " sort -nr | \\\n", 1098 | " awk '{$1=$1; print}' | \\\n", 1099 | " awk 'BEGIN { OFS=\" \" } {print $2,$1}' > data/wordcount.txt" 1100 | ] 1101 | }, 1102 | { 1103 | "cell_type": "code", 1104 | "execution_count": 56, 1105 | "metadata": {}, 1106 | "outputs": [ 1107 | { 1108 | "name": "stdout", 1109 | "output_type": "stream", 1110 | "text": [ 1111 | "the 8027\n", 1112 | "and 5893\n", 1113 | "i 4710\n", 1114 | "to 4538\n", 1115 | "of 3732\n", 1116 | "a 2962\n", 1117 | "in 2555\n", 1118 | "he 2543\n", 1119 | "that 2455\n", 1120 | "it 2138\n" 1121 | ] 1122 | } 1123 | ], 1124 | "source": [ 1125 | "head data/wordcount.txt" 1126 | ] 1127 | }, 1128 | { 1129 | "cell_type": "markdown", 1130 
| "metadata": {}, 1131 | "source": [ 1132 | "We can even create a histogram of word counts using a simple python script." 1133 | ] 1134 | }, 1135 | { 1136 | "cell_type": "code", 1137 | "execution_count": 72, 1138 | "metadata": {}, 1139 | "outputs": [], 1140 | "source": [ 1141 | "function histogram {\n", 1142 | "python3 -c 'import sys\n", 1143 | "for line in sys.stdin:\n", 1144 | " data, width = line.split()\n", 1145 | " print(\"{:<15}{:=<{width}}\".format(data, \"\", width=int(int(width) / 75)))' # each = corresponds to a count of 75\n", 1146 | "\n", 1147 | "}" 1148 | ] 1149 | }, 1150 | { 1151 | "cell_type": "code", 1152 | "execution_count": 73, 1153 | "metadata": {}, 1154 | "outputs": [], 1155 | "source": [ 1156 | "cat data/wordcount.txt | histogram > data/histogram.txt" 1157 | ] 1158 | }, 1159 | { 1160 | "cell_type": "code", 1161 | "execution_count": 74, 1162 | "metadata": { 1163 | "scrolled": true 1164 | }, 1165 | "outputs": [ 1166 | { 1167 | "name": "stdout", 1168 | "output_type": "stream", 1169 | "text": [ 1170 | "the ===========================================================================================================\n", 1171 | "and ==============================================================================\n", 1172 | "i ==============================================================\n", 1173 | "to ============================================================\n", 1174 | "of =================================================\n", 1175 | "a =======================================\n", 1176 | "in ==================================\n", 1177 | "he =================================\n", 1178 | "that ================================\n", 1179 | "it ============================\n", 1180 | "was =========================\n", 1181 | "as =====================\n", 1182 | "we ====================\n", 1183 | "for ====================\n", 1184 | "is ====================\n", 1185 | "his ===================\n", 1186 | "you ===================\n", 1187 | "me 
===================\n", 1188 | "not ==================\n", 1189 | "with =================\n", 1190 | "my ================\n", 1191 | "all ===============\n", 1192 | "be ===============\n", 1193 | "so ==============\n", 1194 | "at ==============\n", 1195 | "on ==============\n", 1196 | "but ==============\n", 1197 | "have ==============\n", 1198 | "her ==============\n", 1199 | "had =============\n", 1200 | "him ============\n", 1201 | "she ==========\n", 1202 | "when ==========\n", 1203 | "there ==========\n", 1204 | "this ========\n", 1205 | "which ========\n", 1206 | "if ========\n", 1207 | "from ========\n", 1208 | "are ========\n", 1209 | "said =======\n", 1210 | "were =======\n", 1211 | "then =======\n", 1212 | "by =======\n", 1213 | "or ======\n", 1214 | "could ======\n", 1215 | "one ======\n", 1216 | "do ======\n", 1217 | "no ======\n", 1218 | "they ======\n", 1219 | "them ======\n", 1220 | "what ======\n", 1221 | "us ======\n", 1222 | "will ======\n", 1223 | "must ======\n", 1224 | "up =====\n", 1225 | "some =====\n", 1226 | "out =====\n", 1227 | "would =====\n", 1228 | "shall =====\n", 1229 | "may =====\n", 1230 | "our =====\n", 1231 | "now =====\n", 1232 | "see =====\n", 1233 | "been =====\n", 1234 | "know =====\n", 1235 | "can =====\n", 1236 | "more ====\n", 1237 | "time ====\n", 1238 | "an ====\n", 1239 | "has ====\n", 1240 | "come ====\n", 1241 | "am ====\n", 1242 | "over ====\n", 1243 | "any ====\n", 1244 | "van ====\n", 1245 | "your ====\n", 1246 | "came ====\n", 1247 | "helsing ===\n", 1248 | "went ===\n", 1249 | "into ===\n", 1250 | "only ===\n", 1251 | "who ===\n", 1252 | "go ===\n", 1253 | "very ===\n", 1254 | "did ===\n", 1255 | "before ===\n", 1256 | "like ===\n", 1257 | "here ===\n", 1258 | "back ===\n", 1259 | "down ===\n", 1260 | "again ===\n", 1261 | "seemed ===\n", 1262 | "about ===\n", 1263 | "well ===\n", 1264 | "even ===\n", 1265 | "such ===\n", 1266 | "way ===\n", 1267 | "took ==\n", 1268 | "lucy ==\n", 1269 | "than ==\n", 1270 | "good 
==\n", 1271 | "dear ==\n", 1272 | "think ==\n", 1273 | "their ==\n", 1274 | "much ==\n", 1275 | "where ==\n", 1276 | "saw ==\n", 1277 | "how ==\n", 1278 | "though ==\n", 1279 | "man ==\n", 1280 | "through ==\n", 1281 | "mina ==\n", 1282 | "too ==\n", 1283 | "night ==\n", 1284 | "hand ==\n", 1285 | "after ==\n", 1286 | "room ==\n", 1287 | "face ==\n", 1288 | "should ==\n", 1289 | "door ==\n", 1290 | "made ==\n", 1291 | "tell ==\n", 1292 | "poor ==\n", 1293 | "old ==\n", 1294 | "own ==\n", 1295 | "other ==\n", 1296 | "eyes ==\n", 1297 | "away ==\n", 1298 | "looked ==\n", 1299 | "work ==\n", 1300 | "great ==\n", 1301 | "friend ==\n", 1302 | "sleep ==\n", 1303 | "once ==\n", 1304 | "jonathan ==\n", 1305 | "dr ==\n", 1306 | "things ==\n", 1307 | "get ==\n", 1308 | "look ==\n", 1309 | "little ==\n", 1310 | "make ==\n", 1311 | "just ==\n", 1312 | "might ==\n", 1313 | "got ==\n", 1314 | "day ==\n", 1315 | "professor ==\n", 1316 | "its ==\n", 1317 | "found ==\n", 1318 | "yet ==\n", 1319 | "count ==\n", 1320 | "off =\n", 1321 | "god =\n", 1322 | "take =\n", 1323 | "long =\n", 1324 | "say =\n", 1325 | "thought =\n", 1326 | "told =\n", 1327 | "men =\n", 1328 | "let =\n", 1329 | "life =\n", 1330 | "asked =\n", 1331 | "without =\n", 1332 | "something =\n", 1333 | "last =\n", 1334 | "till =\n", 1335 | "place =\n", 1336 | "oh =\n", 1337 | "myself =\n", 1338 | "first =\n", 1339 | "fear =\n", 1340 | "arthur =\n", 1341 | "ever =\n", 1342 | "house =\n", 1343 | "heart =\n", 1344 | "two =\n", 1345 | "never =\n", 1346 | "knew =\n", 1347 | "done =\n", 1348 | "himself =\n", 1349 | "these =\n", 1350 | "quite =\n", 1351 | "same =\n", 1352 | "want =\n", 1353 | "find =\n", 1354 | "still =\n", 1355 | "harker =\n", 1356 | "began =\n", 1357 | "nothing =\n", 1358 | "coming =\n", 1359 | "window =\n", 1360 | "round =\n", 1361 | "put =\n", 1362 | "head =\n", 1363 | "many =\n", 1364 | "hands =\n", 1365 | "however =\n", 1366 | "help =\n", 1367 | "right =\n", 1368 | "mr =\n", 1369 | "hear =\n", 1370 | 
"blood =\n", 1371 | "whilst =\n", 1372 | "mind =\n", 1373 | "white =\n", 1374 | "open =\n", 1375 | "full =\n", 1376 | "moment =\n", 1377 | "anything =\n", 1378 | "keep =\n", 1379 | "thing =\n", 1380 | "terrible =\n", 1381 | "morning =\n", 1382 | "left =\n", 1383 | "rest =\n", 1384 | "seen =\n", 1385 | "diary =\n", 1386 | "madam =\n", 1387 | "heard =\n", 1388 | "far =\n", 1389 | "upon =\n", 1390 | "why =\n", 1391 | "mrs =\n", 1392 | "strange =\n", 1393 | "cannot =\n", 1394 | "bed =\n", 1395 | "felt =\n", 1396 | "each =\n", 1397 | "few =\n", 1398 | "both =\n", 1399 | "tonight =\n", 1400 | "since =\n", 1401 | "project =\n", 1402 | "godalming =\n", 1403 | "every =\n", 1404 | "dont =\n", 1405 | "turned =\n", 1406 | "read =\n", 1407 | "those =\n", 1408 | "seward =\n", 1409 | "light =\n", 1410 | "being =\n", 1411 | "others =\n", 1412 | "give =\n", 1413 | "alone =\n", 1414 | "sort =\n", 1415 | "quincey =\n", 1416 | "opened =\n", 1417 | "love =\n", 1418 | "another =\n", 1419 | "stood =\n" 1420 | ] 1421 | } 1422 | ], 1423 | "source": [ 1424 | "head -250 data/histogram.txt " 1425 | ] 1426 | }, 1427 | { 1428 | "cell_type": "markdown", 1429 | "metadata": {}, 1430 | "source": [ 1431 | "## Training an n-gram language model\n", 1432 | "\n", 1433 | "Now we can use KenLM to train a 3-gram Language model on our preprocessed corpus" 1434 | ] 1435 | }, 1436 | { 1437 | "cell_type": "code", 1438 | "execution_count": 78, 1439 | "metadata": {}, 1440 | "outputs": [ 1441 | { 1442 | "name": "stdout", 1443 | "output_type": "stream", 1444 | "text": [ 1445 | "Builds unpruned language models with modified Kneser-Ney smoothing.\n", 1446 | "\n", 1447 | "Please cite:\n", 1448 | "@inproceedings{Heafield-estimate,\n", 1449 | " author = {Kenneth Heafield and Ivan Pouzyrevsky and Jonathan H. 
Clark and Philipp Koehn},\n", 1450 | " title = {Scalable Modified {Kneser-Ney} Language Model Estimation},\n", 1451 | " year = {2013},\n", 1452 | " month = {8},\n", 1453 | " booktitle = {Proceedings of the 51st Annual Meeting of the Association for Computational Linguistics},\n", 1454 | " address = {Sofia, Bulgaria},\n", 1455 | " url = {http://kheafield.com/professional/edinburgh/estimate\\_paper.pdf},\n", 1456 | "}\n", 1457 | "\n", 1458 | "Provide the corpus on stdin. The ARPA file will be written to stdout. Order of\n", 1459 | "the model (-o) is the only mandatory option. As this is an on-disk program,\n", 1460 | "setting the temporary file location (-T) and sorting memory (-S) is recommended.\n", 1461 | "\n", 1462 | "Memory sizes are specified like GNU sort: a number followed by a unit character.\n", 1463 | "Valid units are % for percentage of memory (supported platforms only) and (in\n", 1464 | "increasing powers of 1024): b, K, M, G, T, P, E, Z, Y. Default is K (*1024).\n", 1465 | "This machine has 13238644736 bytes of memory.\n", 1466 | "\n", 1467 | "Language model building options:\n", 1468 | " -h [ --help ] Show this help message\n", 1469 | " -o [ --order ] arg Order of the model\n", 1470 | " --interpolate_unigrams [=arg(=1)] (=1)\n", 1471 | " Interpolate the unigrams (default) as \n", 1472 | " opposed to giving lots of mass to \n", 1473 | " like SRI. 
If you want SRI's behavior \n", 1474 | " with a large and the old lmplz \n", 1475 | " default, use --interpolate_unigrams 0.\n", 1476 | " --skip_symbols Treat , , and as \n", 1477 | " whitespace instead of throwing an \n", 1478 | " exception\n", 1479 | " -T [ --temp_prefix ] arg (=/tmp/) Temporary file prefix\n", 1480 | " -S [ --memory ] arg (=80%) Sorting memory\n", 1481 | " --minimum_block arg (=8K) Minimum block size to allow\n", 1482 | " --sort_block arg (=64M) Size of IO operations for sort \n", 1483 | " (determines arity)\n", 1484 | " --block_count arg (=2) Block count (per order)\n", 1485 | " --vocab_estimate arg (=1000000) Assume this vocabulary size for \n", 1486 | " purposes of calculating memory in step \n", 1487 | " 1 (corpus count) and pre-sizing the \n", 1488 | " hash table\n", 1489 | " --vocab_pad arg (=0) If the vocabulary is smaller than this \n", 1490 | " value, pad with to reach this \n", 1491 | " size. Requires --interpolate_unigrams\n", 1492 | " --verbose_header Add a verbose header to the ARPA file \n", 1493 | " that includes information such as token\n", 1494 | " count, smoothing type, etc.\n", 1495 | " --text arg Read text from a file instead of stdin\n", 1496 | " --arpa arg Write ARPA to a file instead of stdout\n", 1497 | " --intermediate arg Write ngrams to intermediate files. \n", 1498 | " Turns off ARPA output (which can be \n", 1499 | " reactivated by --arpa file). Forces \n", 1500 | " --renumber on.\n", 1501 | " --renumber Renumber the vocabulary identifiers so \n", 1502 | " that they are monotone with the hash of\n", 1503 | " each string. This is consistent with \n", 1504 | " the ordering used by the trie data \n", 1505 | " structure.\n", 1506 | " --collapse_values Collapse probability and backoff into a\n", 1507 | " single value, q that yields the same \n", 1508 | " sentence-level probabilities. 
See \n", 1509 | " http://kheafield.com/professional/edinb\n", 1510 | " urgh/rest_paper.pdf for more details, \n", 1511 | " including a proof.\n", 1512 | " --prune arg Prune n-grams with count less than or \n", 1513 | " equal to the given threshold. Specify \n", 1514 | " one value for each order i.e. 0 0 1 to \n", 1515 | " prune singleton trigrams and above. \n", 1516 | " The sequence of values must be \n", 1517 | " non-decreasing and the last value \n", 1518 | " applies to any remaining orders. \n", 1519 | " Default is to not prune, which is \n", 1520 | " equivalent to --prune 0.\n", 1521 | " --limit_vocab_file arg Read allowed vocabulary separated by \n", 1522 | " whitespace. N-grams that contain \n", 1523 | " vocabulary items not in this list will \n", 1524 | " be pruned. Can be combined with --prune\n", 1525 | " arg\n", 1526 | " --discount_fallback [=arg(=0.5 1 1.5)]\n", 1527 | " The closed-form estimate for Kneser-Ney\n", 1528 | " discounts does not work without \n", 1529 | " singletons or doubletons. It can also \n", 1530 | " fail if these values are out of range. \n", 1531 | " This option falls back to \n", 1532 | " user-specified discounts when the \n", 1533 | " closed-form estimate fails. Note that \n", 1534 | " this option is generally a bad idea: \n", 1535 | " you should deduplicate your corpus \n", 1536 | " instead. However, class-based models \n", 1537 | " need custom discounts because they lack\n", 1538 | " singleton unigrams. 
Provide up to \n", 1539 | " three discounts (for adjusted counts 1,\n", 1540 | " 2, and 3+), which will be applied to \n", 1541 | " all orders where the closed-form \n", 1542 | " estimates fail.\n", 1543 | "\n" 1544 | ] 1545 | }, 1546 | { 1547 | "ename": "", 1548 | "evalue": "1", 1549 | "output_type": "error", 1550 | "traceback": [] 1551 | } 1552 | ], 1553 | "source": [ 1554 | "./bin/lmplz --help" 1555 | ] 1556 | }, 1557 | { 1558 | "cell_type": "code", 1559 | "execution_count": 82, 1560 | "metadata": {}, 1561 | "outputs": [ 1562 | { 1563 | "name": "stdout", 1564 | "output_type": "stream", 1565 | "text": [ 1566 | "=== 1/5 Counting and sorting n-grams ===\n", 1567 | "Reading /home/geopar/projects/bash_tutorial/data/dracula4.txt\n", 1568 | "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n", 1569 | "****************************************************************************************************\n", 1570 | "Unigram tokens 163116 types 10719\n", 1571 | "=== 2/5 Calculating and sorting adjusted counts ===\n", 1572 | "Chain sizes: 1:128628 2:3683752192 3:6907035648\n", 1573 | "Statistics:\n", 1574 | "1 10719 D1=0.644236 D2=1.00333 D3+=1.39803\n", 1575 | "2 72604 D1=0.771329 D2=1.15115 D3+=1.31645\n", 1576 | "3 133828 D1=0.882174 D2=1.25019 D3+=1.4389\n", 1577 | "Memory estimate for binary LM:\n", 1578 | "type kB\n", 1579 | "probing 4326 assuming -p 1.5\n", 1580 | "probing 4793 assuming -r models -p 1.5\n", 1581 | "trie 1828 without quantization\n", 1582 | "trie 1039 assuming -q 8 -b 8 quantization \n", 1583 | "trie 1738 assuming -a 22 array pointer compression\n", 1584 | "trie 949 assuming -a 22 -q 8 -b 8 array pointer compression and quantization\n", 1585 | "=== 3/5 Calculating and sorting initial probabilities ===\n", 1586 | "Chain sizes: 1:128628 2:1161664 3:2676560\n", 1587 | "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n", 1588 | 
"####################################################################################################\n", 1589 | "=== 4/5 Calculating and writing order-interpolated probabilities ===\n", 1590 | "Chain sizes: 1:128628 2:1161664 3:2676560\n", 1591 | "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n", 1592 | "####################################################################################################\n", 1593 | "=== 5/5 Writing ARPA model ===\n", 1594 | "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n", 1595 | "****************************************************************************************************\n", 1596 | "Name:lmplz\tVmPeak:10491128 kB\tVmRSS:10520 kB\tRSSMax:2417444 kB\tuser:0.256158\tsys:0.52217\tCPU:0.778345\treal:0.757482\n" 1597 | ] 1598 | } 1599 | ], 1600 | "source": [ 1601 | "./bin/lmplz -o 3 < data/dracula4.txt > data/dracula.lm.arpa " 1602 | ] 1603 | }, 1604 | { 1605 | "cell_type": "markdown", 1606 | "metadata": {}, 1607 | "source": [ 1608 | "We can see some 1-gram, 2-gram and 3-gram scores using grep" 1609 | ] 1610 | }, 1611 | { 1612 | "cell_type": "code", 1613 | "execution_count": 90, 1614 | "metadata": {}, 1615 | "outputs": [ 1616 | { 1617 | "name": "stdout", 1618 | "output_type": "stream", 1619 | "text": [ 1620 | "\\\u001b[01;31m\u001b[K1-grams\u001b[m\u001b[K:\n", 1621 | "-4.890992\t<unk>\t0\n", 1622 | "0\t<s>\t-0.61173564\n", 1623 | "-1.4264659\t</s>\t0\n", 1624 | "-2.7945037\tmay\t-0.3863618\n", 1625 | "-4.7507243\tbistritzleft\t-0.11276052\n", 1626 | "-4.7507243\tmunich\t-0.11276052\n", 1627 | "-2.1378868\tat\t-0.7028289\n", 1628 | "-3.8816328\tp\t-0.14545205\n", 1629 | "-3.9838684\tm\t-0.19899167\n", 1630 | "-2.1765325\ton\t-0.64976496\n", 1631 | "\u001b[36m\u001b[K--\u001b[m\u001b[K\n", 1632 | "\\\u001b[01;31m\u001b[K2-grams\u001b[m\u001b[K:\n", 1633 | "-1.7202902\t<s> </s>\t0\n", 1634 | "-1.1438557\tmay </s>\t0\n", 1635 | "-1.0149596\tat </s>\t0\n", 1636 | 
"-1.0583433\tp </s>\t0\n", 1637 | "-0.9642732\tm </s>\t0\n", 1638 | "-1.2002825\ton </s>\t0\n", 1639 | "-1.1483785\tearly </s>\t0\n", 1640 | "-0.9711424\tnext </s>\t0\n", 1641 | "-1.4812524\tmorning </s>\t0\n", 1642 | "-1.1897001\tshould </s>\t0\n", 1643 | "\u001b[36m\u001b[K--\u001b[m\u001b[K\n", 1644 | "\\\u001b[01;31m\u001b[K3-grams\u001b[m\u001b[K:\n", 1645 | "-0.8619659\t<s> may </s>\n", 1646 | "-1.1233317\ti may </s>\n", 1647 | "-0.7419162\tof may </s>\n", 1648 | "-0.9894832\tit may </s>\n", 1649 | "-1.0609393\tand may </s>\n", 1650 | "-1.0421566\twe may </s>\n", 1651 | "-1.3042028\the may </s>\n", 1652 | "-1.8012102\tthere may </s>\n", 1653 | "-0.9887751\tthis may </s>\n", 1654 | "-1.196441\tyou may </s>\n" 1655 | ] 1656 | } 1657 | ], 1658 | "source": [ 1659 | "cat data/dracula.lm.arpa | egrep \"1-grams|2-grams|3-grams\" -A10" 1660 | ] 1661 | }, 1662 | { 1663 | "cell_type": "markdown", 1664 | "metadata": {}, 1665 | "source": [ 1666 | "We can also use query to use the trained language model to score the perplexity of a sentence.\n", 1667 | "Lower perplexity indicates a more probable sentence.\n", 1668 | "\n", 1669 | "Let's have the model score two possible endings." 
1670 | ] 1671 | }, 1672 | { 1673 | "cell_type": "code", 1674 | "execution_count": 91, 1675 | "metadata": {}, 1676 | "outputs": [ 1677 | { 1678 | "name": "stdout", 1679 | "output_type": "stream", 1680 | "text": [ 1681 | "./bin/query: invalid option -- 'h'\n", 1682 | "KenLM was compiled with maximum order 6.\n", 1683 | "Usage: ./bin/query [-b] [-n] [-w] [-s] lm_file\n", 1684 | "-b: Do not buffer output.\n", 1685 | "-n: Do not wrap the input in <s> and </s>.\n", 1686 | "-v summary|sentence|word: Print statistics at this level.\n", 1687 | " Can be used multiple times: -v summary -v sentence -v word\n", 1688 | "-l lazy|populate|read|parallel: Load lazily, with populate, or malloc+read\n", 1689 | "The default loading method is populate on Linux and read on others.\n", 1690 | "\n", 1691 | "Each word in the output is formatted as:\n", 1692 | " word=vocab_id ngram_length log10(p(word|context))\n", 1693 | "where ngram_length is the length of n-gram matched. A vocab_id of 0 indicates\n", 1694 | "the unknown word. 
Sentence-level output includes log10 probability of the\n", 1695 | "sentence and OOV count.\n" 1696 | ] 1697 | }, 1698 | { 1699 | "ename": "", 1700 | "evalue": "1", 1701 | "output_type": "error", 1702 | "traceback": [] 1703 | } 1704 | ], 1705 | "source": [ 1706 | "./bin/query -h" 1707 | ] 1708 | }, 1709 | { 1710 | "cell_type": "code", 1711 | "execution_count": 92, 1712 | "metadata": {}, 1713 | "outputs": [], 1714 | "source": [ 1715 | "echo \"harker and mina die a horrible death\" > data/bad_ending\n", 1716 | "echo \"harker and mina live happily ever after\" > data/good_ending" 1717 | ] 1718 | }, 1719 | { 1720 | "cell_type": "code", 1721 | "execution_count": 100, 1722 | "metadata": {}, 1723 | "outputs": [ 1724 | { 1725 | "name": "stdout", 1726 | "output_type": "stream", 1727 | "text": [ 1728 | "harker and mina die a horrible death\n", 1729 | "Perplexity including OOVs:\t456.47716780020465\n" 1730 | ] 1731 | } 1732 | ], 1733 | "source": [ 1734 | "cat data/bad_ending\n", 1735 | "./bin/query data/dracula.lm.arpa < data/bad_ending 2>&1| grep \"Perplexity\" | head -1" 1736 | ] 1737 | }, 1738 | { 1739 | "cell_type": "code", 1740 | "execution_count": 101, 1741 | "metadata": {}, 1742 | "outputs": [ 1743 | { 1744 | "name": "stdout", 1745 | "output_type": "stream", 1746 | "text": [ 1747 | "harker and mina live happily ever after\n", 1748 | "Perplexity including OOVs:\t916.2937036462345\n" 1749 | ] 1750 | } 1751 | ], 1752 | "source": [ 1753 | "cat data/good_ending\n", 1754 | "./bin/query data/dracula.lm.arpa < data/good_ending 2>&1| grep \"Perplexity\" | head -1" 1755 | ] 1756 | }, 1757 | { 1758 | "cell_type": "code", 1759 | "execution_count": null, 1760 | "metadata": {}, 1761 | "outputs": [], 1762 | "source": [] 1763 | } 1764 | ], 1765 | "metadata": { 1766 | "kernelspec": { 1767 | "display_name": "Bash", 1768 | "language": "bash", 1769 | "name": "bash" 1770 | }, 1771 | "language_info": { 1772 | "codemirror_mode": "shell", 1773 | "file_extension": ".sh", 1774 | "mimetype": 
"text/x-sh", 1775 | "name": "bash" 1776 | } 1777 | }, 1778 | "nbformat": 4, 1779 | "nbformat_minor": 4 1780 | } 1781 | -------------------------------------------------------------------------------- /Lab 0.0 Jupyter notebook quickstart.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Jupyter notebook quickstart" 8 | ] 9 | }, 10 | { 11 | "cell_type": "raw", 12 | "metadata": {}, 13 | "source": [ 14 | "Jupyter notebook is an interactive python console.\n", 15 | "\n", 16 | "It allows for python execution in the browser and \n", 17 | "easy inspection of the code results and the plots\n", 18 | "produced.\n", 19 | "\n", 20 | "Furthermore it allows to write plain text along\n", 21 | "with the code that makes it easy to document the code\n", 22 | "and explain the math / intuition behind it.\n", 23 | "\n", 24 | "This is why it is widely used for exploratory data\n", 25 | "analysis and for presentations like this one" 26 | ] 27 | }, 28 | { 29 | "cell_type": "raw", 30 | "metadata": {}, 31 | "source": [ 32 | "1. Use Enter to enter a cell in edit mode\n", 33 | "2. Use Esc to leave edit mode\n", 34 | "3. Use Shift + Enter to execute a cell and move to the next\n", 35 | "4. 
Use Alt + Enter to execute a cell and insert another below" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "%timeit Measures how long a statement takes to execute" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "import random\n", 52 | "L = [random.random() for i in range(100000)]\n", 53 | "%timeit L.sort()" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "%ls Lists the contents of the current directory" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "%ls" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "%pwd shows the current working directory" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "%pwd" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "? shows the documentation of a function / class / module" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "import os\n", 102 | "\n", 103 | "os?" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "?? shows extended documentation and code if available" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "os??" 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "metadata": {}, 125 | "source": [ 126 | "## Further reading\n", 127 | "\n", 128 | "1. https://www.datacamp.com/community/tutorials/tutorial-jupyter-notebook \n", 129 | "2. 
https://jupyter-notebook-beginner-guide.readthedocs.io/en/latest/" 130 | ] 131 | } 132 | ], 133 | "metadata": { 134 | "kernelspec": { 135 | "display_name": "Python 3", 136 | "language": "python", 137 | "name": "python3" 138 | }, 139 | "language_info": { 140 | "codemirror_mode": { 141 | "name": "ipython", 142 | "version": 3 143 | }, 144 | "file_extension": ".py", 145 | "mimetype": "text/x-python", 146 | "name": "python", 147 | "nbconvert_exporter": "python", 148 | "pygments_lexer": "ipython3", 149 | "version": "3.6.4" 150 | } 151 | }, 152 | "nbformat": 4, 153 | "nbformat_minor": 2 154 | } 155 | -------------------------------------------------------------------------------- /Lab 0.1 Python 101.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Introduction" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Python is a great general-purpose programming language on its own, but with the help of a few popular libraries (numpy, scipy, matplotlib) it becomes a powerful environment for scientific computing.\n", 15 | "\n", 16 | "We expect that many of you will have some experience with Python and numpy; for the rest of you, this section will serve as a quick crash course both on the Python programming language and on the use of Python for scientific computing.\n", 17 | "\n", 18 | "Some of you may have previous knowledge in Matlab, in which case we also recommend the numpy for Matlab users page (https://docs.scipy.org/doc/numpy-dev/user/numpy-for-matlab-users.html)." 
19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "In this tutorial, we will cover:\n", 26 | "\n", 27 | "* Basic Python: Basic data types (Containers, Lists, Dictionaries, Sets, Tuples), Functions, Classes\n", 28 | "* Numpy: Arrays, Array indexing, Datatypes, Array math, Broadcasting\n", 29 | "* Matplotlib: Plotting, Subplots, Images\n", 30 | "* IPython: Creating notebooks, Typical workflows" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "## Python Basics" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "### Basic data types" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "#### Numbers" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "Integers and floats work as you would expect from other languages:" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "x = 3\n", 68 | "print(x, type(x))" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "print(x + 1) # Addition;\n", 78 | "print(x - 1) # Subtraction;\n", 79 | "print(x * 2) # Multiplication;\n", 80 | "print(x ** 2) # Exponentiation;" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "print(x / 2) # Floating point division\n", 90 | "print(x // 2) # Integer division" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "x += 1 # x = x + 1 = 4\n", 100 | "print(x) # Prints 4\n", 101 | "x *= 2 # x = x * 2 = 8\n", 102 | "print(x) # Prints 8" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | 
"metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "y = 2.5\n", 112 | "print(type(y)) # Prints <class 'float'>\n", 113 | "print(y, y + 1, y * 2, y ** 2) # Prints 2.5 3.5 5.0 6.25" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "#### Booleans" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "Python implements all of the usual operators for Boolean logic, but uses English words rather than symbols (`&&`, `||`, etc.):" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "t, f = True, False\n", 137 | "print(type(t)) # Prints <class 'bool'>" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "print(t and f) # Logical AND;\n", 147 | "print(t or f) # Logical OR;\n", 148 | "print(not t) # Logical NOT;\n", 149 | "print(t != f) # Logical XOR;" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "#### Strings" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "hello = 'hello' # String literals can use single quotes\n", 166 | "world = \"world\" # or double quotes; it does not matter." 
167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "print(hello, len(hello))" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "metadata": {}, 182 | "outputs": [], 183 | "source": [ 184 | "hw = hello + ' ' + world # String concatenation\n", 185 | "print(hw) # prints \"hello world\"" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": null, 191 | "metadata": {}, 192 | "outputs": [], 193 | "source": [ 194 | "hw12 = '{} {} {}'.format(hello, world, 12) # string formatting\n", 195 | "print(hw12) # prints \"hello world 12\"" 196 | ] 197 | }, 198 | { 199 | "cell_type": "markdown", 200 | "metadata": {}, 201 | "source": [ 202 | "String objects have a bunch of useful methods; for example:" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": null, 208 | "metadata": {}, 209 | "outputs": [], 210 | "source": [ 211 | "s = \"Hello World\"\n", 212 | "print(s.upper()) # Convert a string to uppercase; prints \"HELLO WORLD\"\n", 213 | "print(s.lower()) # Convert a string to lowercase; prints \"hello world\"\n", 214 | "print(s.replace('l', '')) # Replace all instances of one substring with another;\n", 215 | " # prints \"Heo Word\"\n", 216 | "print(' world\\n \\t'.strip()) # Strip leading and trailing whitespace; prints \"world\"\n", 217 | "print(s.split(' ')) # Splits the string into a list of tokens on ' '. prints ['Hello', 'World']" 218 | ] 219 | }, 220 | { 221 | "cell_type": "markdown", 222 | "metadata": {}, 223 | "source": [ 224 | "You can find a list of all string methods in the [documentation](https://docs.python.org/2/library/stdtypes.html#string-methods)." 
225 | ] 226 | }, 227 | { 228 | "cell_type": "markdown", 229 | "metadata": {}, 230 | "source": [ 231 | "### Containers" 232 | ] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "metadata": {}, 237 | "source": [ 238 | "Python includes several built-in container types: lists, dictionaries, sets, and tuples." 239 | ] 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "metadata": {}, 244 | "source": [ 245 | "#### Lists" 246 | ] 247 | }, 248 | { 249 | "cell_type": "markdown", 250 | "metadata": {}, 251 | "source": [ 252 | "A list is the Python equivalent of an array, but is resizeable and can contain elements of different types:" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": null, 258 | "metadata": {}, 259 | "outputs": [], 260 | "source": [ 261 | "xs = [3, 1, 2] # Create a list\n", 262 | "print(xs)\n", 263 | "print(xs[0]) # Indexing starts from zero. prints 3" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": null, 269 | "metadata": {}, 270 | "outputs": [], 271 | "source": [ 272 | "xs[2] = 'foo' # Lists can contain elements of different types\n", 273 | "print(xs)\n", 274 | "\n", 275 | "# DON'T DO THIS" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": null, 281 | "metadata": {}, 282 | "outputs": [], 283 | "source": [ 284 | "xs.append('bar') # Add a new element to the end of the list\n", 285 | "print(xs) " 286 | ] 287 | }, 288 | { 289 | "cell_type": "markdown", 290 | "metadata": {}, 291 | "source": [ 292 | "zip takes 2 lists and returns a list of tuples containing elements of both lists. 
It's useful for iterating 2 lists simultaneously" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": null, 298 | "metadata": {}, 299 | "outputs": [], 300 | "source": [ 301 | "x = [1, 2, 3]\n", 302 | "y = ['a', 'b', 'c']\n", 303 | "\n", 304 | "print(list(zip(x, y))) # prints [(1, 'a'), (2, 'b'), (3, 'c')]\n", 305 | "print(list(zip(y, x))) # prints [('a', 1), ('b', 2), ('c', 3)]" 306 | ] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "metadata": {}, 311 | "source": [ 312 | "As usual, you can find all the details about lists in the [documentation](https://docs.python.org/2/tutorial/datastructures.html#more-on-lists)." 313 | ] 314 | }, 315 | { 316 | "cell_type": "markdown", 317 | "metadata": {}, 318 | "source": [ 319 | "#### Slicing" 320 | ] 321 | }, 322 | { 323 | "cell_type": "markdown", 324 | "metadata": {}, 325 | "source": [ 326 | "In addition to accessing list elements one at a time, Python provides concise syntax to access sublists; this is known as slicing:" 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": null, 332 | "metadata": {}, 333 | "outputs": [], 334 | "source": [ 335 | "xs = [1, 2, 3, 4, 5, 6]\n", 336 | "# Take slices of lists\n", 337 | "# Last index is not inclusive\n", 338 | "print(xs[2:5]) # prints [3, 4, 5]\n", 339 | "\n", 340 | "# Slice to the end\n", 341 | "# If last index is not specified slices to the last element\n", 342 | "print(xs[3:]) # prints [4, 5, 6]\n", 343 | "\n", 344 | "# Slice from the beginning\n", 345 | "# If first index is not specified slices from first element\n", 346 | "print(xs[:4]) # prints [1, 2, 3, 4]\n", 347 | "\n", 348 | "# Negative indexing\n", 349 | "# Last element is -1\n", 350 | "print(xs[-1]) # prints 6\n", 351 | "\n", 352 | "# Second to last element is -2\n", 353 | "# Remember last index is not inclusive\n", 354 | "print(xs[2:-2]) #prints [3, 4]\n", 355 | "\n", 356 | "# Step slices\n", 357 | "# Take a slice of step 2 from second element to the second to last 
not inclusive\n", 358 | "print(xs[1:-2:2]) # prints [2, 4]\n", 359 | "\n", 360 | "# Reverse list\n", 361 | "print(xs[::-1]) # prints [6, 5, 4, 3, 2, 1]" 362 | ] 363 | }, 364 | { 365 | "cell_type": "markdown", 366 | "metadata": {}, 367 | "source": [ 368 | "#### Loops" 369 | ] 370 | }, 371 | { 372 | "cell_type": "markdown", 373 | "metadata": {}, 374 | "source": [ 375 | "You can loop over the elements of a list like this:" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": null, 381 | "metadata": {}, 382 | "outputs": [], 383 | "source": [ 384 | "animals = ['cat', 'dog', 'monkey']\n", 385 | "for animal in animals:\n", 386 | " print(animal)" 387 | ] 388 | }, 389 | { 390 | "cell_type": "markdown", 391 | "metadata": {}, 392 | "source": [ 393 | "If you want access to the index of each element within the body of a loop, use the built-in `enumerate` function:" 394 | ] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "execution_count": null, 399 | "metadata": {}, 400 | "outputs": [], 401 | "source": [ 402 | "animals = ['cat', 'dog', 'monkey']\n", 403 | "for idx, animal in enumerate(animals):\n", 404 | " print('#{}: {}'.format(idx, animal))" 405 | ] 406 | }, 407 | { 408 | "cell_type": "markdown", 409 | "metadata": {}, 410 | "source": [ 411 | "#### List comprehensions:" 412 | ] 413 | }, 414 | { 415 | "cell_type": "markdown", 416 | "metadata": {}, 417 | "source": [ 418 | "A frequent programming operation is transforming all elements in a list. 
As a simple example, consider the following code that computes square numbers:" 419 | ] 420 | }, 421 | { 422 | "cell_type": "code", 423 | "execution_count": null, 424 | "metadata": {}, 425 | "outputs": [], 426 | "source": [ 427 | "xs = [1, 2, 3, 4, 5, 6]\n", 428 | "squares = []\n", 429 | "for x in xs:\n", 430 | " squares.append(x ** 2)\n", 431 | "print(squares) # prints [1, 4, 9, 16, 25, 36]" 432 | ] 433 | }, 434 | { 435 | "cell_type": "markdown", 436 | "metadata": {}, 437 | "source": [ 438 | "A more concise way to do this is using a list comprehension" 439 | ] 440 | }, 441 | { 442 | "cell_type": "code", 443 | "execution_count": null, 444 | "metadata": {}, 445 | "outputs": [], 446 | "source": [ 447 | "squares = [i ** 2 for i in xs] \n", 448 | "print(squares) # prints [1, 4, 9, 16, 25, 36]" 449 | ] 450 | }, 451 | { 452 | "cell_type": "markdown", 453 | "metadata": {}, 454 | "source": [ 455 | "List comprehensions can also contain conditions:" 456 | ] 457 | }, 458 | { 459 | "cell_type": "code", 460 | "execution_count": null, 461 | "metadata": {}, 462 | "outputs": [], 463 | "source": [ 464 | "even_squares = [x ** 2 for x in xs if x % 2 == 0]\n", 465 | "print(even_squares) # prints [4, 16, 36]" 466 | ] 467 | }, 468 | { 469 | "cell_type": "markdown", 470 | "metadata": {}, 471 | "source": [ 472 | "#### Dictionaries" 473 | ] 474 | }, 475 | { 476 | "cell_type": "markdown", 477 | "metadata": {}, 478 | "source": [ 479 | "A dictionary stores (key, value) pairs, similar to a `Map` in Java or an object in Javascript. \n", 480 | "Good generic type for storing data. 
\n", 481 | "Lookup is fast.\n", 482 | "\n", 483 | "You can use it like this:" 484 | ] 485 | }, 486 | { 487 | "cell_type": "code", 488 | "execution_count": null, 489 | "metadata": {}, 490 | "outputs": [], 491 | "source": [ 492 | "x = {}\n", 493 | "\n", 494 | "x['Hello'] = 'World'\n", 495 | "\n", 496 | "print(x['Hello']) # Prints World" 497 | ] 498 | }, 499 | { 500 | "cell_type": "markdown", 501 | "metadata": {}, 502 | "source": [ 503 | "You can access the keys and the values in the dictionary like this" 504 | ] 505 | }, 506 | { 507 | "cell_type": "code", 508 | "execution_count": null, 509 | "metadata": {}, 510 | "outputs": [], 511 | "source": [ 512 | "y = {\n", 513 | " '0': 'zero', \n", 514 | " '1': 'one',\n", 515 | " '2': 'two',\n", 516 | " '3': 'three',\n", 517 | " '4': 'four',\n", 518 | " '5': 'five',\n", 519 | " '6': 'six',\n", 520 | " '7': 'seven',\n", 521 | " '8': 'eight',\n", 522 | " '9': 'nine',\n", 523 | " '10': 'ten'\n", 524 | "}\n", 525 | "\n", 526 | "# The keys (indexes) as a list\n", 527 | "print(y.keys()) # prints ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10']\n", 528 | "\n", 529 | "# The values as a list\n", 530 | "print(y.values()) # prints ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten']" 531 | ] 532 | }, 533 | { 534 | "cell_type": "markdown", 535 | "metadata": {}, 536 | "source": [ 537 | "Check if an element is in y keys" 538 | ] 539 | }, 540 | { 541 | "cell_type": "code", 542 | "execution_count": null, 543 | "metadata": {}, 544 | "outputs": [], 545 | "source": [ 546 | "print('0' in y)\n", 547 | "print(0 in y)" 548 | ] 549 | }, 550 | { 551 | "cell_type": "markdown", 552 | "metadata": {}, 553 | "source": [ 554 | "Iterate elements in dictionary" 555 | ] 556 | }, 557 | { 558 | "cell_type": "code", 559 | "execution_count": null, 560 | "metadata": {}, 561 | "outputs": [], 562 | "source": [ 563 | "# PRO TIP: You can't assume the elements will be in order\n", 564 | "# If you need ordered elements use 
collections.OrderedDict (or lists)\n", 565 | "for k, v in y.items():\n", 566 | " print(\"Key: {}\\t Value: {}\".format(k, v))" 567 | ] 568 | }, 569 | { 570 | "cell_type": "markdown", 571 | "metadata": {}, 572 | "source": [ 573 | "Dictionary comprehensions: These are similar to list comprehensions, and allow you to easily construct/transform dictionaries. For example:" 574 | ] 575 | }, 576 | { 577 | "cell_type": "code", 578 | "execution_count": null, 579 | "metadata": {}, 580 | "outputs": [], 581 | "source": [ 582 | "import pprint # for pretty printing\n", 583 | "z = {k: v.upper() for k, v in y.items()}\n", 584 | "pprint.pprint(z)" 585 | ] 586 | }, 587 | { 588 | "cell_type": "markdown", 589 | "metadata": {}, 590 | "source": [ 591 | "### Functions" 592 | ] 593 | }, 594 | { 595 | "cell_type": "markdown", 596 | "metadata": {}, 597 | "source": [ 598 | "Functions are defined using the def keyword.\n", 599 | "\n", 600 | "\n", 601 | "A typical function declaration is def f(p1, p2, p3 ..., k1=v1, k2=v2, ...)\n", 602 | "where p1, p2 ... are positional (required) arguments \n", 603 | "and k1, k2 ... are keyword (optional) arguments with default values v1, v2, ..." 
604 | ] 605 | }, 606 | { 607 | "cell_type": "code", 608 | "execution_count": null, 609 | "metadata": {}, 610 | "outputs": [], 611 | "source": [ 612 | "def add_num_to_list(xs, num=2):\n", 613 | " return [x + num for x in xs]\n", 614 | "\n", 615 | "print(add_num_to_list([1, 2, 3]))\n", 616 | "print(add_num_to_list([1, 2, 3], num=10))" 617 | ] 618 | }, 619 | { 620 | "cell_type": "code", 621 | "execution_count": null, 622 | "metadata": {}, 623 | "outputs": [], 624 | "source": [ 625 | "def fibonacci(n):\n", 626 | " f0, f1 = 0, 1\n", 627 | " for _ in range(n):\n", 628 | " f0, f1 = f1, f0 + f1\n", 629 | " return f0\n", 630 | "\n", 631 | "print(fibonacci(9))" 632 | ] 633 | }, 634 | { 635 | "cell_type": "code", 636 | "execution_count": null, 637 | "metadata": {}, 638 | "outputs": [], 639 | "source": [ 640 | "def factorial(n):\n", 641 | " res = 1\n", 642 | " if n == 0 or n == 1:\n", 643 | " return res\n", 644 | " else:\n", 645 | " for i in range(2, n + 1):\n", 646 | " res *= i\n", 647 | " return res\n", 648 | "\n", 649 | "print(factorial(5))" 650 | ] 651 | }, 652 | { 653 | "cell_type": "markdown", 654 | "metadata": {}, 655 | "source": [ 656 | "### Classes" 657 | ] 658 | }, 659 | { 660 | "cell_type": "markdown", 661 | "metadata": {}, 662 | "source": [ 663 | "The syntax for defining classes in Python is straightforward:" 664 | ] 665 | }, 666 | { 667 | "cell_type": "code", 668 | "execution_count": null, 669 | "metadata": {}, 670 | "outputs": [], 671 | "source": [ 672 | "class Greeter(object):\n", 673 | "\n", 674 | " # Constructor\n", 675 | " def __init__(self, name):\n", 676 | " self.name = name # Create an instance variable\n", 677 | "\n", 678 | " # Instance method\n", 679 | " def greet(self, loud=False):\n", 680 | " s = ('Hello, {}'.format(self.name))\n", 681 | " if loud:\n", 682 | " s = s.upper()\n", 683 | " print(s)\n", 684 | "\n", 685 | "g = Greeter('Fred') # Construct an instance of the Greeter class\n", 686 | "g.greet() # Call an instance method; prints \"Hello, 
Fred\"\n", 687 | "g.greet(loud=True) # Call an instance method; prints \"HELLO, FRED\"" 688 | ] 689 | }, 690 | { 691 | "cell_type": "markdown", 692 | "metadata": {}, 693 | "source": [ 694 | "Inheritance is also simple. Child class inherits methods and attributes from parent" 695 | ] 696 | }, 697 | { 698 | "cell_type": "code", 699 | "execution_count": null, 700 | "metadata": {}, 701 | "outputs": [], 702 | "source": [ 703 | "class LoudBob(Greeter):\n", 704 | " def __init__(self):\n", 705 | " # Call parent constructor\n", 706 | " super(LoudBob, self).__init__('Bob')\n", 707 | " \n", 708 | " # Override parent method. Notice we change the method signature\n", 709 | " def greet(self):\n", 710 | " # Call parent method\n", 711 | " super(LoudBob, self).greet(loud=True)\n", 712 | " \n", 713 | "bob = LoudBob()\n", 714 | "bob.greet()" 715 | ] 716 | }, 717 | { 718 | "cell_type": "markdown", 719 | "metadata": {}, 720 | "source": [ 721 | "### Caveats" 722 | ] 723 | }, 724 | { 725 | "cell_type": "markdown", 726 | "metadata": {}, 727 | "source": [ 728 | "Python is dynamically typed and interpreted. So you can compare apples to oranges" 729 | ] 730 | }, 731 | { 732 | "cell_type": "code", 733 | "execution_count": null, 734 | "metadata": {}, 735 | "outputs": [], 736 | "source": [ 737 | "x = [1, 2, 3]\n", 738 | "y = 'Hello World'\n", 739 | "\n", 740 | "print(x == y) # evaluates False instead of throwing an exception" 741 | ] 742 | }, 743 | { 744 | "cell_type": "markdown", 745 | "metadata": {}, 746 | "source": [ 747 | "What is wrong with the following function? Why?" 
748 | ] 749 | }, 750 | { 751 | "cell_type": "code", 752 | "execution_count": null, 753 | "metadata": {}, 754 | "outputs": [], 755 | "source": [ 756 | "def list_to_pow(xs, n=2):\n", 757 | " for i in range(len(xs)):\n", 758 | " xs[i] = xs[i] ** n\n", 759 | " return xs\n", 760 | "\n", 761 | "x = [1, 2, 3]\n", 762 | "y = list_to_pow(x, n=3)\n", 763 | "print(\"input is: {}\".format(x))\n", 764 | "print(\"output is: {}\".format(y))" 765 | ] 766 | }, 767 | { 768 | "cell_type": "markdown", 769 | "metadata": {}, 770 | "source": [ 771 | "### Exercise 1" 772 | ] 773 | }, 774 | { 775 | "cell_type": "markdown", 776 | "metadata": {}, 777 | "source": [ 778 | "Fix the previous function." 779 | ] 780 | }, 781 | { 782 | "cell_type": "markdown", 783 | "metadata": {}, 784 | "source": [ 785 | "Insert code in the cell below:" 786 | ] 787 | }, 788 | { 789 | "cell_type": "code", 790 | "execution_count": null, 791 | "metadata": {}, 792 | "outputs": [], 793 | "source": [ 794 | "def list_to_pow(xs, n=2):\n", 795 | " # Insert your code and delete the following line\n", 796 | " raise NotImplementedError" 797 | ] 798 | }, 799 | { 800 | "cell_type": "code", 801 | "execution_count": null, 802 | "metadata": {}, 803 | "outputs": [], 804 | "source": [ 805 | "x = [1, 2, 3]\n", 806 | "y = list_to_pow(x, n=3)\n", 807 | "print(\"input is: {}\".format(x))\n", 808 | "print(\"output is: {}\".format(y))" 809 | ] 810 | }, 811 | { 812 | "cell_type": "markdown", 813 | "metadata": {}, 814 | "source": [ 815 | "### Exercise 2" 816 | ] 817 | }, 818 | { 819 | "cell_type": "markdown", 820 | "metadata": {}, 821 | "source": [ 822 | "Write a function that takes two lists of equal length as input and adds them element by element.\n", 823 | "The result should be a list containing the element-wise sums" 824 | ] 825 | }, 826 | { 827 | "cell_type": "markdown", 828 | "metadata": {}, 829 | "source": [ 830 | "Insert code in the cell below:" 831 | ] 832 | }, 833 | { 834 | "cell_type": "code", 835 | "execution_count": null, 836 | 
"metadata": {}, 837 | "outputs": [], 838 | "source": [ 839 | "def add_lists(xs, ys):\n", 840 | " # Assert checks the condition given and throws an Exception if False\n", 841 | " assert(len(xs) == len(ys))\n", 842 | " raise NotImplementedError" 843 | ] 844 | }, 845 | { 846 | "cell_type": "code", 847 | "execution_count": null, 848 | "metadata": {}, 849 | "outputs": [], 850 | "source": [ 851 | "z = add_lists([1, 2, 3], [10, 10, 10])\n", 852 | "print(\"{}\".format(z))" 853 | ] 854 | }, 855 | { 856 | "cell_type": "code", 857 | "execution_count": null, 858 | "metadata": {}, 859 | "outputs": [], 860 | "source": [ 861 | "print(add_lists([1, 2], [1, 2, 3])) # Should throw AssertionError" 862 | ] 863 | }, 864 | { 865 | "cell_type": "markdown", 866 | "metadata": {}, 867 | "source": [ 868 | "### Exercise 3" 869 | ] 870 | }, 871 | { 872 | "cell_type": "markdown", 873 | "metadata": {}, 874 | "source": [ 875 | "Write a function that takes a string as input and counts how many times each word occurs.\n", 876 | "Input is a string and output is a dictionary with the words as keys and the counts as values\n", 877 | "\n", 878 | "Hint: Use .lower() and .split(), .strip()" 879 | ] 880 | }, 881 | { 882 | "cell_type": "markdown", 883 | "metadata": {}, 884 | "source": [ 885 | "Insert code in the cell below:" 886 | ] 887 | }, 888 | { 889 | "cell_type": "code", 890 | "execution_count": null, 891 | "metadata": {}, 892 | "outputs": [], 893 | "source": [ 894 | "def strip_punctuation(s):\n", 895 | " s = ' '.join(s.split('\\n'))\n", 896 | " return ''.join(c for c in s if c not in '?-.,:;')\n", 897 | "\n", 898 | "def wordcount(s):\n", 899 | " s = strip_punctuation(s)\n", 900 | " raise NotImplementedError" 901 | ] 902 | }, 903 | { 904 | "cell_type": "code", 905 | "execution_count": null, 906 | "metadata": {}, 907 | "outputs": [], 908 | "source": [ 909 | "text = \"\"\"\n", 910 | "'Tis sweet and commendable in your nature, Hamlet,\n", 911 | "To give these mourning duties to your father;\n", 912 | 
"But you must know, your father lost a father,\n", 913 | "That father lost, lost his, and the survivor bound\n", 914 | "In filial obligation, for some term\n", 915 | "To do obsequious sorrow. But to persevere\n", 916 | "In obstinate condolement is a course\n", 917 | "Of impious stubbornness. 'Tis unmanly grief,\n", 918 | "It shows a will most incorrect to heaven,\n", 919 | "A heart unfortified, a mind impatient,\n", 920 | "An understanding simple and unschool'd;\n", 921 | "For what we know must be, and is as common\n", 922 | "As any the most vulgar thing to sense,\n", 923 | "Why should we in our peevish opposition\n", 924 | "Take it to heart? Fie, 'tis a fault to heaven,\n", 925 | "A fault against the dead, a fault to nature,\n", 926 | "To reason most absurd, whose common theme\n", 927 | "Is death of fathers, and who still hath cried,\n", 928 | "From the first corse till he that died today,\n", 929 | "This must be so. We pray you throw to earth\n", 930 | "This unprevailing woe, and think of us\n", 931 | "As of a father; for let the world take note\n", 932 | "You are the most immediate to our throne,\n", 933 | "And with no less nobility of love\n", 934 | "Than that which dearest father bears his son\n", 935 | "Do I impart toward you. For your intent\n", 936 | "In going back to school in Wittenberg,\n", 937 | "It is most retrograde to our desire:\n", 938 | "And we beseech you bend you to remain\n", 939 | "Here in the cheer and comfort of our eye,\n", 940 | "Our chiefest courtier, cousin, and our son.\n", 941 | "\"\"\"\n", 942 | "\n", 943 | "import pprint\n", 944 | "\n", 945 | "pprint.pprint(wordcount(text))" 946 | ] 947 | }, 948 | { 949 | "cell_type": "markdown", 950 | "metadata": {}, 951 | "source": [ 952 | "## Further Reading\n", 953 | "\n", 954 | "1. https://docs.python.org/3/tutorial/ \n", 955 | "2. 
Learn Python the Hard Way (book)" 956 | ] 957 | } 958 | ], 959 | "metadata": { 960 | "kernelspec": { 961 | "display_name": "Python 3", 962 | "language": "python", 963 | "name": "python3" 964 | }, 965 | "language_info": { 966 | "codemirror_mode": { 967 | "name": "ipython", 968 | "version": 3 969 | }, 970 | "file_extension": ".py", 971 | "mimetype": "text/x-python", 972 | "name": "python", 973 | "nbconvert_exporter": "python", 974 | "pygments_lexer": "ipython3", 975 | "version": "3.6.4" 976 | } 977 | }, 978 | "nbformat": 4, 979 | "nbformat_minor": 2 980 | } 981 | -------------------------------------------------------------------------------- /Lab 0.2 Numpy Basics.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Numpy" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Numpy is the core library for scientific computing in Python. It provides a high-performance multidimensional array object, and tools for working with these arrays. If you are already familiar with MATLAB, you might find this [tutorial](http://wiki.scipy.org/NumPy_for_Matlab_Users) useful to get started with Numpy." 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "To use Numpy, we first need to import the `numpy` package:" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "import numpy as np" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "### Arrays" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "A numpy array is a grid of values, all of the same type, and is indexed by a tuple of nonnegative integers. 
The number of dimensions is the rank of the array; the shape of an array is a tuple of integers giving the size of the array along each dimension." 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "Convert python list to numpy array" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "x = np.array([1, 2, 3]) # Create a rank 1 array\n", 61 | "\n", 62 | "print(x, type(x), type(x[0]))" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "Find out the size of a numpy array" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "x = np.array([\n", 79 | " [1, 2, 3], \n", 80 | " [4, 5 ,6]\n", 81 | "]) # Create a rank 2 array\n", 82 | "\n", 83 | "# x has 2 rows, 3 columns\n", 84 | "x.shape" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": {}, 90 | "source": [ 91 | "Access elements in numpy array" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "x = np.array([1, 2, 3]) # Create a rank 1 array\n", 101 | "\n", 102 | "print(x[0]) # Access first element\n", 103 | "print(x[1:]) # Slice from the second element to the end\n", 104 | "print(x[-2]) # Access second to last element\n", 105 | "print(x[::-1]) # Reverse array" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": {}, 111 | "source": [ 112 | "Numpy also provides many functions to create arrays:" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": { 119 | "scrolled": false 120 | }, 121 | "outputs": [], 122 | "source": [ 123 | "a = np.zeros((2,2)) # Create an array of all zeros\n", 124 | "print(a)" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "metadata": 
{}, 131 | "outputs": [], 132 | "source": [ 133 | "b = np.ones((3,4)) # Create an array of all ones\n", 134 | "print(b)" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "c = np.full((2,2), 7) # Create a constant array\n", 144 | "print(c)" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "d = np.eye(2) # Create a 2x2 identity matrix\n", 154 | "print(d)" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "e = np.random.random((2,2)) # Create an array filled with random values\n", 164 | " # Values are drawn from a Uniform distribution ~U(0,1)\n", 165 | " # https://docs.scipy.org/doc/numpy-1.15.0/reference/generated/numpy.random.random.html\n", 166 | "print(e)" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "e = np.random.randn(2,2) # Create an array filled with random values\n", 176 | " # Values are drawn from a Normal distribution ~N(0, 1)\n", 177 | " # https://docs.scipy.org/doc/numpy/reference/generated/numpy.random.randn.html \n", 178 | "print(e)" 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "metadata": {}, 184 | "source": [ 185 | "### Array indexing" 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": {}, 191 | "source": [ 192 | "Numpy offers several ways to index into arrays." 193 | ] 194 | }, 195 | { 196 | "cell_type": "markdown", 197 | "metadata": {}, 198 | "source": [ 199 | "Slicing: Similar to Python lists, numpy arrays can be sliced. 
Since arrays may be multidimensional, you must specify a slice for each dimension of the array:" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": null, 205 | "metadata": {}, 206 | "outputs": [], 207 | "source": [ 208 | "import numpy as np\n", 209 | "\n", 210 | "# Create the following rank 2 array with shape (3, 4)\n", 211 | "# [[ 1 2 3 4]\n", 212 | "# [ 5 6 7 8]\n", 213 | "# [ 9 10 11 12]]\n", 214 | "a = np.array([[1,2,3,4], [5,6,7,8], [9,10,11,12]])\n", 215 | "\n", 216 | "# Use slicing to pull out the subarray consisting of the first 2 rows\n", 217 | "# and columns 1 and 2; b is the following array of shape (2, 2):\n", 218 | "# [[2 3]\n", 219 | "# [6 7]]\n", 220 | "b = a[:2, 1:3]\n", 221 | "print(b)" 222 | ] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "metadata": {}, 227 | "source": [ 228 | "A slice of an array is a view into the same data, so modifying it will modify the original array." 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "print(a[0, 1])\n", 238 | "b[0, 0] = 77 # b[0, 0] is the same piece of data as a[0, 1]\n", 239 | "print(a[0, 1])" 240 | ] 241 | }, 242 | { 243 | "cell_type": "markdown", 244 | "metadata": {}, 245 | "source": [ 246 | "You can also mix integer indexing with slice indexing. However, doing so will yield an array of lower rank than the original array. 
Note that this is quite different from the way that MATLAB handles array slicing:" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": null, 252 | "metadata": {}, 253 | "outputs": [], 254 | "source": [ 255 | "# Create the following rank 2 array with shape (3, 4)\n", 256 | "a = np.array([[1,2,3,4], [5,6,7,8], [9,10,11,12]])\n", 257 | "print(a)" 258 | ] 259 | }, 260 | { 261 | "cell_type": "markdown", 262 | "metadata": {}, 263 | "source": [ 264 | "Three ways of accessing the data in the middle row of the array.\n", 265 | "Mixing integer indexing with slices yields an array of lower rank,\n", 266 | "while using only slices yields an array of the same rank as the\n", 267 | "original array:" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": null, 273 | "metadata": {}, 274 | "outputs": [], 275 | "source": [ 276 | "row_r1 = a[1, :] # Rank 1 view of the second row of a \n", 277 | "row_r2 = a[1:2, :] # Rank 2 view of the second row of a\n", 278 | "row_r3 = a[[1], :] # Rank 2 copy of the second row of a (integer-list indexing returns a copy, not a view)\n", 279 | "print(row_r1, row_r1.shape)\n", 280 | "print(row_r2, row_r2.shape)\n", 281 | "print(row_r3, row_r3.shape)" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": null, 287 | "metadata": {}, 288 | "outputs": [], 289 | "source": [ 290 | "# We can make the same distinction when accessing columns of an array:\n", 291 | "col_r1 = a[:, 1]\n", 292 | "col_r2 = a[:, 1:2]\n", 293 | "print(col_r1, col_r1.shape)\n", 294 | "print(col_r2, col_r2.shape)" 295 | ] 296 | }, 297 | { 298 | "cell_type": "markdown", 299 | "metadata": {}, 300 | "source": [ 301 | "Boolean array indexing: Boolean array indexing lets you pick out arbitrary elements of an array. Frequently this type of indexing is used to select the elements of an array that satisfy some condition. 
Here is an example:" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": null, 307 | "metadata": {}, 308 | "outputs": [], 309 | "source": [ 310 | "import numpy as np\n", 311 | "\n", 312 | "a = np.array([[1,2], [3, 4], [5, 6]])\n", 313 | "\n", 314 | "bool_idx = (a > 2) # Find the elements of a that are bigger than 2;\n", 315 | " # this returns a numpy array of Booleans of the same\n", 316 | " # shape as a, where each slot of bool_idx tells\n", 317 | " # whether that element of a is > 2.\n", 318 | "\n", 319 | "print(bool_idx)" 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": null, 325 | "metadata": {}, 326 | "outputs": [], 327 | "source": [ 328 | "# We use boolean array indexing to construct a rank 1 array\n", 329 | "# consisting of the elements of a corresponding to the True values\n", 330 | "# of bool_idx\n", 331 | "print(a[bool_idx])\n", 332 | "\n", 333 | "# We can do all of the above in a single concise statement:\n", 334 | "print(a[a > 2])" 335 | ] 336 | }, 337 | { 338 | "cell_type": "markdown", 339 | "metadata": {}, 340 | "source": [ 341 | "For brevity we have left out a lot of details about numpy array indexing; if you want to know more you should read the [documentation](https://docs.scipy.org/doc/numpy-1.15.1/reference/arrays.indexing.html)." 342 | ] 343 | }, 344 | { 345 | "cell_type": "markdown", 346 | "metadata": {}, 347 | "source": [ 348 | "### Datatypes" 349 | ] 350 | }, 351 | { 352 | "cell_type": "markdown", 353 | "metadata": {}, 354 | "source": [ 355 | "Every numpy array is a grid of elements of the same type. Numpy provides a large set of numeric datatypes that you can use to construct arrays. Numpy tries to guess a datatype when you create an array, but functions that construct arrays usually also include an optional argument to explicitly specify the datatype. 
Here is an example:" 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": null, 361 | "metadata": {}, 362 | "outputs": [], 363 | "source": [ 364 | "x = np.array([1, 2]) # Let numpy choose the datatype\n", 365 | "y = np.array([1.0, 2.0]) # Let numpy choose the datatype\n", 366 | "z = np.array([1, 2], dtype=np.int64) # Force a particular datatype\n", 367 | "\n", 368 | "print(x.dtype, y.dtype, z.dtype)" 369 | ] 370 | }, 371 | { 372 | "cell_type": "markdown", 373 | "metadata": {}, 374 | "source": [ 375 | "Specify the type of the array elements" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": null, 381 | "metadata": {}, 382 | "outputs": [], 383 | "source": [ 384 | "x = np.array([1, 2, 3], dtype=np.float)\n", 385 | "\n", 386 | "print(x, type(x), type(x[0]))" 387 | ] 388 | }, 389 | { 390 | "cell_type": "markdown", 391 | "metadata": {}, 392 | "source": [ 393 | "You can read all about numpy datatypes in the [documentation](http://docs.scipy.org/doc/numpy/reference/arrays.dtypes.html)." 
394 | ] 395 | }, 396 | { 397 | "cell_type": "markdown", 398 | "metadata": {}, 399 | "source": [ 400 | "### Array math" 401 | ] 402 | }, 403 | { 404 | "cell_type": "markdown", 405 | "metadata": {}, 406 | "source": [ 407 | "Basic mathematical functions operate elementwise on arrays, and are available both as operator overloads and as functions in the numpy module:" 408 | ] 409 | }, 410 | { 411 | "cell_type": "code", 412 | "execution_count": null, 413 | "metadata": {}, 414 | "outputs": [], 415 | "source": [ 416 | "x = np.array([[1,2],[3,4]], dtype=np.float64)\n", 417 | "y = np.array([[5,6],[7,8]], dtype=np.float64)\n", 418 | "\n", 419 | "# Elementwise sum; both produce a numpy array\n", 420 | "print(x + y)\n", 421 | "print(np.add(x, y))" 422 | ] 423 | }, 424 | { 425 | "cell_type": "code", 426 | "execution_count": null, 427 | "metadata": { 428 | "scrolled": false 429 | }, 430 | "outputs": [], 431 | "source": [ 432 | "# Elementwise difference; both produce a numpy array\n", 433 | "print(x - y)\n", 434 | "print(np.subtract(x, y))" 435 | ] 436 | }, 437 | { 438 | "cell_type": "code", 439 | "execution_count": null, 440 | "metadata": { 441 | "scrolled": true 442 | }, 443 | "outputs": [], 444 | "source": [ 445 | "# Elementwise product; both produce a numpy array\n", 446 | "# Notice, unlike MATLAB, this is not matrix multiplication\n", 447 | "print(x * y)\n", 448 | "print(np.multiply(x, y))" 449 | ] 450 | }, 451 | { 452 | "cell_type": "code", 453 | "execution_count": null, 454 | "metadata": { 455 | "scrolled": true 456 | }, 457 | "outputs": [], 458 | "source": [ 459 | "# Elementwise square root; produces the array\n", 460 | "# [[ 1. 1.41421356]\n", 461 | "# [ 1.73205081 2. ]]\n", 462 | "print(np.sqrt(x))" 463 | ] 464 | }, 465 | { 466 | "cell_type": "markdown", 467 | "metadata": {}, 468 | "source": [ 469 | "Note that unlike MATLAB, `*` is elementwise multiplication, not matrix multiplication. 
You can use the following for matrix multiplication" 470 | ] 471 | }, 472 | { 473 | "cell_type": "code", 474 | "execution_count": null, 475 | "metadata": {}, 476 | "outputs": [], 477 | "source": [ 478 | "# Matrix multiplication\n", 479 | "x = np.array([[1,2],[3,4]])\n", 480 | "y = np.array([[5,6],[7,8]])\n", 481 | "\n", 482 | "v = np.array([9,10])\n", 483 | "w = np.array([11, 12])\n", 484 | "\n", 485 | "print(x @ y)\n", 486 | "print(np.matmul(x, y))\n", 487 | "print(x.dot(y))\n", 488 | "print(np.dot(x, y))" 489 | ] 490 | }, 491 | { 492 | "cell_type": "code", 493 | "execution_count": null, 494 | "metadata": {}, 495 | "outputs": [], 496 | "source": [ 497 | "# Inner product of vectors; both produce 219\n", 498 | "print(v @ w)\n", 499 | "print(v.dot(w))" 500 | ] 501 | }, 502 | { 503 | "cell_type": "code", 504 | "execution_count": null, 505 | "metadata": {}, 506 | "outputs": [], 507 | "source": [ 508 | "# Matrix / vector product; both produce the rank 1 array [29 67]\n", 509 | "print(x @ v)\n", 510 | "print(x.dot(v))" 511 | ] 512 | }, 513 | { 514 | "cell_type": "code", 515 | "execution_count": null, 516 | "metadata": {}, 517 | "outputs": [], 518 | "source": [ 519 | "# Matrix / matrix product; both produce the rank 2 array\n", 520 | "# [[19 22]\n", 521 | "# [43 50]]\n", 522 | "print(x @ y)\n", 523 | "print(x.dot(y))" 524 | ] 525 | }, 526 | { 527 | "cell_type": "markdown", 528 | "metadata": {}, 529 | "source": [ 530 | "Numpy provides many useful functions for performing computations on arrays; one of the most useful is `sum`:" 531 | ] 532 | }, 533 | { 534 | "cell_type": "code", 535 | "execution_count": null, 536 | "metadata": {}, 537 | "outputs": [], 538 | "source": [ 539 | "x = np.array([[1,2],[3,4]])\n", 540 | "print(x)\n", 541 | "\n", 542 | "print(np.sum(x)) # Compute sum of all elements; prints \"10\"\n", 543 | "print(np.sum(x, axis=0)) # Compute sum of each column; prints \"[4 6]\"\n", 544 | "print(np.sum(x, axis=1)) # Compute sum of each row; prints \"[3 7]\"" 545 | 
] 546 | }, 547 | { 548 | "cell_type": "markdown", 549 | "metadata": {}, 550 | "source": [ 551 | "You can find the full list of mathematical functions provided by numpy in the [documentation](http://docs.scipy.org/doc/numpy/reference/routines.math.html).\n", 552 | "\n", 553 | "Apart from computing mathematical functions using arrays, we frequently need to reshape or otherwise manipulate data in arrays. The simplest example of this type of operation is transposing a matrix; to transpose a matrix, simply use the T attribute of an array object:" 554 | ] 555 | }, 556 | { 557 | "cell_type": "code", 558 | "execution_count": null, 559 | "metadata": {}, 560 | "outputs": [], 561 | "source": [ 562 | "print(x)\n", 563 | "print(x.T)" 564 | ] 565 | }, 566 | { 567 | "cell_type": "code", 568 | "execution_count": null, 569 | "metadata": {}, 570 | "outputs": [], 571 | "source": [ 572 | "v = np.array([[1,2,3]])\n", 573 | "print(v)\n", 574 | "print(v.T)" 575 | ] 576 | }, 577 | { 578 | "cell_type": "markdown", 579 | "metadata": {}, 580 | "source": [ 581 | "### Broadcasting" 582 | ] 583 | }, 584 | { 585 | "cell_type": "markdown", 586 | "metadata": {}, 587 | "source": [ 588 | "Broadcasting is a powerful mechanism that allows numpy to work with arrays of different shapes when performing arithmetic operations. Frequently we have a smaller array and a larger array, and we want to use the smaller array multiple times to perform some operation on the larger array.\n", 589 | "\n", 590 | "For example, suppose that we want to add a constant vector to each row of a matrix. 
We could do it like this:" 591 | ] 592 | }, 593 | { 594 | "cell_type": "code", 595 | "execution_count": null, 596 | "metadata": {}, 597 | "outputs": [], 598 | "source": [ 599 | "# We will add the vector v to each row of the matrix x,\n", 600 | "# storing the result in the matrix y\n", 601 | "x = np.array([[1,2,3], [4,5,6], [7,8,9], [10, 11, 12]])\n", 602 | "v = np.array([1, 0, 1])\n", 603 | "y = np.empty_like(x) # Create an empty matrix with the same shape as x\n", 604 | "\n", 605 | "# Add the vector v to each row of the matrix x with an explicit loop\n", 606 | "for i in range(4):\n", 607 | " y[i, :] = x[i, :] + v\n", 608 | "\n", 609 | "print(y)" 610 | ] 611 | }, 612 | { 613 | "cell_type": "markdown", 614 | "metadata": {}, 615 | "source": [ 616 | "This works; however when the matrix `x` is very large, computing an explicit loop in Python could be slow. Note that adding the vector v to each row of the matrix `x` is equivalent to forming a matrix `vv` by stacking multiple copies of `v` vertically, then performing elementwise summation of `x` and `vv`. We could implement this approach like this:" 617 | ] 618 | }, 619 | { 620 | "cell_type": "code", 621 | "execution_count": null, 622 | "metadata": {}, 623 | "outputs": [], 624 | "source": [ 625 | "vv = np.tile(v, (4, 1)) # Stack 4 copies of v on top of each other\n", 626 | "print(vv) # Prints \"[[1 0 1]\n", 627 | " # [1 0 1]\n", 628 | " # [1 0 1]\n", 629 | " # [1 0 1]]\"" 630 | ] 631 | }, 632 | { 633 | "cell_type": "code", 634 | "execution_count": null, 635 | "metadata": {}, 636 | "outputs": [], 637 | "source": [ 638 | "y = x + vv # Add x and vv elementwise\n", 639 | "print(y)" 640 | ] 641 | }, 642 | { 643 | "cell_type": "markdown", 644 | "metadata": {}, 645 | "source": [ 646 | "Numpy broadcasting allows us to perform this computation without actually creating multiple copies of v. 
Consider this version, using broadcasting:" 647 | ] 648 | }, 649 | { 650 | "cell_type": "code", 651 | "execution_count": null, 652 | "metadata": {}, 653 | "outputs": [], 654 | "source": [ 655 | "import numpy as np\n", 656 | "\n", 657 | "# We will add the vector v to each row of the matrix x,\n", 658 | "# storing the result in the matrix y\n", 659 | "x = np.array([[1,2,3], [4,5,6], [7,8,9], [10, 11, 12]])\n", 660 | "v = np.array([1, 0, 1])\n", 661 | "y = x + v # Add v to each row of x using broadcasting\n", 662 | "print(y)" 663 | ] 664 | }, 665 | { 666 | "cell_type": "markdown", 667 | "metadata": {}, 668 | "source": [ 669 | "The line `y = x + v` works even though `x` has shape `(4, 3)` and `v` has shape `(3,)` due to broadcasting; this line works as if v actually had shape `(4, 3)`, where each row was a copy of `v`, and the sum was performed elementwise.\n", 670 | "\n", 671 | "Broadcasting two arrays together follows these rules:\n", 672 | "\n", 673 | "1. If the arrays do not have the same rank, prepend the shape of the lower rank array with 1s until both shapes have the same length.\n", 674 | "2. The two arrays are said to be compatible in a dimension if they have the same size in the dimension, or if one of the arrays has size 1 in that dimension.\n", 675 | "3. The arrays can be broadcast together if they are compatible in all dimensions.\n", 676 | "4. After broadcasting, each array behaves as if it had shape equal to the elementwise maximum of shapes of the two input arrays.\n", 677 | "5. 
In any dimension where one array had size 1 and the other array had size greater than 1, the first array behaves as if it were copied along that dimension\n", 678 | "\n", 679 | "If this explanation does not make sense, try reading the explanation from the [documentation](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) or this [explanation](http://wiki.scipy.org/EricsBroadcastingDoc).\n", 680 | "\n", 681 | "Functions that support broadcasting are known as universal functions. You can find the list of all universal functions in the [documentation](http://docs.scipy.org/doc/numpy/reference/ufuncs.html#available-ufuncs).\n", 682 | "\n", 683 | "Here are some applications of broadcasting:" 684 | ] 685 | }, 686 | { 687 | "cell_type": "code", 688 | "execution_count": null, 689 | "metadata": { 690 | "scrolled": true 691 | }, 692 | "outputs": [], 693 | "source": [ 694 | "# Compute outer product of vectors\n", 695 | "v = np.array([1,2,3]) # v has shape (3,)\n", 696 | "w = np.array([4,5]) # w has shape (2,)\n", 697 | "# To compute an outer product, we first reshape v to be a column\n", 698 | "# vector of shape (3, 1); we can then broadcast it against w to yield\n", 699 | "# an output of shape (3, 2), which is the outer product of v and w:\n", 700 | "\n", 701 | "print(v.reshape((3, 1)) * w)" 702 | ] 703 | }, 704 | { 705 | "cell_type": "code", 706 | "execution_count": null, 707 | "metadata": {}, 708 | "outputs": [], 709 | "source": [ 710 | "# Add a vector to each row of a matrix\n", 711 | "x = np.array([[1,2,3], [4,5,6]])\n", 712 | "# x has shape (2, 3) and v has shape (3,) so they broadcast to (2, 3),\n", 713 | "# giving the following matrix:\n", 714 | "\n", 715 | "print(x + v)" 716 | ] 717 | }, 718 | { 719 | "cell_type": "code", 720 | "execution_count": null, 721 | "metadata": {}, 722 | "outputs": [], 723 | "source": [ 724 | "# Add a vector to each column of a matrix\n", 725 | "# x has shape (2, 3) and w has shape (2,).\n", 726 | "# If we transpose x then it has 
shape (3, 2) and can be broadcast\n", 727 | "# against w to yield a result of shape (3, 2); transposing this result\n", 728 | "# yields the final result of shape (2, 3) which is the matrix x with\n", 729 | "# the vector w added to each column. Gives the following matrix:\n", 730 | "\n", 731 | "print((x.T + w).T)" 732 | ] 733 | }, 734 | { 735 | "cell_type": "code", 736 | "execution_count": null, 737 | "metadata": { 738 | "scrolled": true 739 | }, 740 | "outputs": [], 741 | "source": [ 742 | "# Another solution is to reshape w to be a column vector of shape (2, 1);\n", 743 | "# we can then broadcast it directly against x to produce the same\n", 744 | "# output.\n", 745 | "print(x + w.reshape((2, 1)))" 746 | ] 747 | }, 748 | { 749 | "cell_type": "code", 750 | "execution_count": null, 751 | "metadata": { 752 | "scrolled": true 753 | }, 754 | "outputs": [], 755 | "source": [ 756 | "# Multiply a matrix by a constant:\n", 757 | "# x has shape (2, 3). Numpy treats scalars as arrays of shape ();\n", 758 | "# these can be broadcast together to shape (2, 3), producing the\n", 759 | "# following array:\n", 760 | "print(x * 2)" 761 | ] 762 | }, 763 | { 764 | "cell_type": "markdown", 765 | "metadata": {}, 766 | "source": [ 767 | "Broadcasting makes it easy to write vectorized code. As in MATLAB, it is highly recommended to vectorize operations on Numpy arrays. 
It leads to great performance improvements.\n", 768 | "\n", 769 | "Here's an example of the performance difference:" 770 | ] 771 | }, 772 | { 773 | "cell_type": "code", 774 | "execution_count": null, 775 | "metadata": {}, 776 | "outputs": [], 777 | "source": [ 778 | "def add_lists(a, b):\n", 779 | " c = []\n", 780 | " for x, y in zip(a, b):\n", 781 | " c.append(x + y)\n", 782 | " return c\n", 783 | "\n", 784 | "def add_lists2(a, b):\n", 785 | " return a + b" 786 | ] 787 | }, 788 | { 789 | "cell_type": "code", 790 | "execution_count": null, 791 | "metadata": {}, 792 | "outputs": [], 793 | "source": [ 794 | "N = 100000\n", 795 | "x = np.ones((N, 100))\n", 796 | "y = np.arange(N).reshape(N, 1)\n", 797 | "\n", 798 | "z1 = add_lists(x, y)\n", 799 | "z2 = add_lists2(x, y)\n", 800 | "print(np.all(z1 == z2))" 801 | ] 802 | }, 803 | { 804 | "cell_type": "code", 805 | "execution_count": null, 806 | "metadata": {}, 807 | "outputs": [], 808 | "source": [ 809 | "%timeit add_lists(x, y)" 810 | ] 811 | }, 812 | { 813 | "cell_type": "code", 814 | "execution_count": null, 815 | "metadata": {}, 816 | "outputs": [], 817 | "source": [ 818 | "%timeit add_lists2(x, y) # Vectorized operation is >7x faster" 819 | ] 820 | }, 821 | { 822 | "cell_type": "code", 823 | "execution_count": null, 824 | "metadata": {}, 825 | "outputs": [], 826 | "source": [ 827 | "print(x.shape)\n", 828 | "print(y.shape)" 829 | ] 830 | }, 831 | { 832 | "cell_type": "markdown", 833 | "metadata": {}, 834 | "source": [ 835 | "## Matplotlib" 836 | ] 837 | }, 838 | { 839 | "cell_type": "markdown", 840 | "metadata": {}, 841 | "source": [ 842 | "Matplotlib is a plotting library. In this section we give a brief introduction to the `matplotlib.pyplot` module, which provides a plotting system similar to that of MATLAB." 
843 | ] 844 | }, 845 | { 846 | "cell_type": "code", 847 | "execution_count": null, 848 | "metadata": {}, 849 | "outputs": [], 850 | "source": [ 851 | "import matplotlib.pyplot as plt" 852 | ] 853 | }, 854 | { 855 | "cell_type": "markdown", 856 | "metadata": {}, 857 | "source": [ 858 | "By running this special iPython command, we will be displaying plots inline:" 859 | ] 860 | }, 861 | { 862 | "cell_type": "code", 863 | "execution_count": null, 864 | "metadata": {}, 865 | "outputs": [], 866 | "source": [ 867 | "%matplotlib inline" 868 | ] 869 | }, 870 | { 871 | "cell_type": "markdown", 872 | "metadata": {}, 873 | "source": [ 874 | "### Plotting" 875 | ] 876 | }, 877 | { 878 | "cell_type": "markdown", 879 | "metadata": {}, 880 | "source": [ 881 | "The most important function in `matplotlib` is plot, which allows you to plot 2D data. Here is a simple example:" 882 | ] 883 | }, 884 | { 885 | "cell_type": "code", 886 | "execution_count": null, 887 | "metadata": {}, 888 | "outputs": [], 889 | "source": [ 890 | "# Compute the x and y coordinates for points on a sine curve\n", 891 | "x = np.arange(0, 3 * np.pi, 0.1)\n", 892 | "y = np.sin(x)\n", 893 | "\n", 894 | "# Plot the points using matplotlib\n", 895 | "plt.plot(x, y)" 896 | ] 897 | }, 898 | { 899 | "cell_type": "markdown", 900 | "metadata": {}, 901 | "source": [ 902 | "With just a little bit of extra work we can easily plot multiple lines at once, and add a title, legend, and axis labels:" 903 | ] 904 | }, 905 | { 906 | "cell_type": "code", 907 | "execution_count": null, 908 | "metadata": {}, 909 | "outputs": [], 910 | "source": [ 911 | "y_sin = np.sin(x)\n", 912 | "y_cos = np.cos(x)\n", 913 | "\n", 914 | "# Plot the points using matplotlib\n", 915 | "plt.plot(x, y_sin)\n", 916 | "plt.plot(x, y_cos)\n", 917 | "plt.xlabel('x axis label')\n", 918 | "plt.ylabel('y axis label')\n", 919 | "plt.title('Sine and Cosine')\n", 920 | "plt.legend(['Sine', 'Cosine'])" 921 | ] 922 | }, 923 | { 924 | "cell_type": "markdown", 925 | 
"metadata": {}, 926 | "source": [ 927 | "### Subplots " 928 | ] 929 | }, 930 | { 931 | "cell_type": "markdown", 932 | "metadata": {}, 933 | "source": [ 934 | "You can plot different things in the same figure using the subplot function. Here is an example:" 935 | ] 936 | }, 937 | { 938 | "cell_type": "code", 939 | "execution_count": null, 940 | "metadata": {}, 941 | "outputs": [], 942 | "source": [ 943 | "# Compute the x and y coordinates for points on sine and cosine curves\n", 944 | "x = np.arange(0, 3 * np.pi, 0.1)\n", 945 | "y_sin = np.sin(x)\n", 946 | "y_cos = np.cos(x)\n", 947 | "\n", 948 | "# Set up a subplot grid that has height 2 and width 1,\n", 949 | "# and set the first such subplot as active.\n", 950 | "plt.subplot(2, 1, 1)\n", 951 | "\n", 952 | "# Make the first plot\n", 953 | "plt.plot(x, y_sin)\n", 954 | "plt.title('Sine')\n", 955 | "\n", 956 | "# Set the second subplot as active, and make the second plot.\n", 957 | "plt.subplot(2, 1, 2)\n", 958 | "plt.plot(x, y_cos)\n", 959 | "plt.title('Cosine')\n", 960 | "\n", 961 | "# Show the figure.\n", 962 | "plt.show()" 963 | ] 964 | }, 965 | { 966 | "cell_type": "markdown", 967 | "metadata": {}, 968 | "source": [ 969 | "You can read much more about the `subplot` function in the [documentation](http://matplotlib.org/api/pyplot_api.html#matplotlib.pyplot.subplot)." 970 | ] 971 | }, 972 | { 973 | "cell_type": "markdown", 974 | "metadata": {}, 975 | "source": [ 976 | "### Exercise 1" 977 | ] 978 | }, 979 | { 980 | "cell_type": "markdown", 981 | "metadata": {}, 982 | "source": [ 983 | "Write a function to implement the sigmoid function which is given by the following formula. 
\n", 984 | "\n", 985 | "$$s(z) = \\frac{e^{z}}{1 + e^{z}}$$\n", 986 | "\n", 987 | "The function should take as input a numpy array and return the sigmoid element-wise.\n", 988 | "\n", 989 | "Write the test code that prints the sigmoid of $z$, $z \\in [-10, 10]$" 990 | ] 991 | }, 992 | { 993 | "cell_type": "code", 994 | "execution_count": null, 995 | "metadata": {}, 996 | "outputs": [], 997 | "source": [ 998 | "def sigmoid(z):\n", 999 | " raise NotImplementedError" 1000 | ] 1001 | }, 1002 | { 1003 | "cell_type": "code", 1004 | "execution_count": null, 1005 | "metadata": {}, 1006 | "outputs": [], 1007 | "source": [ 1008 | "# Define z in [-10, 10]\n", 1009 | "# Calculate sigmoid(z)\n", 1010 | "# print result" 1011 | ] 1012 | }, 1013 | { 1014 | "cell_type": "markdown", 1015 | "metadata": {}, 1016 | "source": [ 1017 | "### Exercise 2\n", 1018 | "\n", 1019 | "Plot the sigmoid function in the interval $z \\in [-10, 10]$" 1020 | ] 1021 | }, 1022 | { 1023 | "cell_type": "code", 1024 | "execution_count": null, 1025 | "metadata": {}, 1026 | "outputs": [], 1027 | "source": [ 1028 | "def plot_sigmoid(z):\n", 1029 | " s = sigmoid(z)\n", 1030 | " raise NotImplementedError" 1031 | ] 1032 | }, 1033 | { 1034 | "cell_type": "code", 1035 | "execution_count": null, 1036 | "metadata": {}, 1037 | "outputs": [], 1038 | "source": [ 1039 | "# plot_sigmoid(z)" 1040 | ] 1041 | }, 1042 | { 1043 | "cell_type": "markdown", 1044 | "metadata": {}, 1045 | "source": [ 1046 | "### Exercise 3\n", 1047 | "\n", 1048 | "Generate a $1D$ distribution of white noise, following a normal distribution ~$N(3,0.1)$\n", 1049 | "\n", 1050 | "Plot the result in the interval $x \\in [-1, 1]$\n", 1051 | "\n", 1052 | "Function `generate_noise` should return two numpy arrays, the interval $x$ and the generated noise\n", 1053 | "\n", 1054 | "Hint: Use np.linspace " 1055 | ] 1056 | }, 1057 | { 1058 | "cell_type": "code", 1059 | "execution_count": null, 1060 | "metadata": {}, 1061 | "outputs": [], 1062 | "source": [ 1063 | "def 
generate_noise(nsamples, mu=3, sigma=.1, low=-1, high=1):\n", 1064 | " raise NotImplementedError" 1065 | ] 1066 | }, 1067 | { 1068 | "cell_type": "code", 1069 | "execution_count": null, 1070 | "metadata": {}, 1071 | "outputs": [], 1072 | "source": [ 1073 | "# x, n = generate_noise(...)\n", 1074 | "\n", 1075 | "# plot n" 1076 | ] 1077 | }, 1078 | { 1079 | "cell_type": "markdown", 1080 | "metadata": {}, 1081 | "source": [ 1082 | "### Exercise 4\n", 1083 | "\n", 1084 | "Create a function $f$ that is $\\sin(x)$ when $x \\in [-\\pi, \\pi]$ and white noise when $x \\in (\\pi, 2\\pi]$.\n", 1085 | "\n", 1086 | "Plot the result.\n", 1087 | "\n", 1088 | "Function `calculate_f` should return $x$ and $f(x)$, $x \\in [-\\pi, 2\\pi]$\n", 1089 | "\n", 1090 | "Hint: Use np.concatenate and np.linspace" 1091 | ] 1092 | }, 1093 | { 1094 | "cell_type": "code", 1095 | "execution_count": null, 1096 | "metadata": {}, 1097 | "outputs": [], 1098 | "source": [ 1099 | "def calculate_f(nsamples):\n", 1100 | " raise NotImplementedError" 1101 | ] 1102 | }, 1103 | { 1104 | "cell_type": "code", 1105 | "execution_count": null, 1106 | "metadata": {}, 1107 | "outputs": [], 1108 | "source": [ 1109 | "# x, f = calculate_f(...)\n", 1110 | "\n", 1111 | "# plot f(x)" 1112 | ] 1113 | }, 1114 | { 1115 | "cell_type": "markdown", 1116 | "metadata": {}, 1117 | "source": [ 1118 | "## Further Reading\n", 1119 | "\n", 1120 | "1. https://docs.scipy.org/doc/numpy/user/numpy-for-matlab-users.html\n", 1121 | "2. http://www.labri.fr/perso/nrougier/from-python-to-numpy/\n", 1122 | "3. http://www.labri.fr/perso/nrougier/teaching/numpy.100/index.html\n", 1123 | "4. 
http://cs231n.github.io/python-numpy-tutorial/" 1124 | ] 1125 | } 1126 | ], 1127 | "metadata": { 1128 | "kernelspec": { 1129 | "display_name": "Python 3", 1130 | "language": "python", 1131 | "name": "python3" 1132 | }, 1133 | "language_info": { 1134 | "codemirror_mode": { 1135 | "name": "ipython", 1136 | "version": 3 1137 | }, 1138 | "file_extension": ".py", 1139 | "mimetype": "text/x-python", 1140 | "name": "python", 1141 | "nbconvert_exporter": "python", 1142 | "pygments_lexer": "ipython3", 1143 | "version": "3.6.4" 1144 | } 1145 | }, 1146 | "nbformat": 4, 1147 | "nbformat_minor": 2 1148 | } 1149 | -------------------------------------------------------------------------------- /Pytorch Demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import torch\n", 10 | "import numpy as np" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "from sklearn.datasets import make_blobs" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 3, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "X, y = make_blobs(n_samples=10000, n_features=2)" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 4, 34 | "metadata": {}, 35 | "outputs": [ 36 | { 37 | "name": "stdout", 38 | "output_type": "stream", 39 | "text": [ 40 | "Populating the interactive namespace from numpy and matplotlib\n" 41 | ] 42 | } 43 | ], 44 | "source": [ 45 | "%pylab inline" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 5, 51 | "metadata": {}, 52 | "outputs": [ 53 | { 54 | "data": { 55 | "text/plain": [ 56 | "" 57 | ] 58 | }, 59 | "execution_count": 5, 60 | "metadata": {}, 61 | "output_type": "execute_result" 62 | }, 63 | { 64 | "data": { 65 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAYYAAAD8CAYAAABzTgP2AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvIxREBQAAIABJREFUeJzt3Xd4lGXWwOHfmZ6EQCihVwVRsYGAKFYURXTFXnbtBXvZ1d11da37uWtZu2vBvpa1YEPAggoiCkgRpfdiACH01Knn+2NCyCSTBpmZlHNfVyDzvs87cyaTzJmni6pijDHG7ORIdQDGGGPqF0sMxhhjYlhiMMYYE8MSgzHGmBiWGIwxxsSwxGCMMSaGJQZjjDExLDEYY4yJYYnBGGNMDFeqA9gdbdq00e7du6c6DGOMaVBmzZq1SVWzqyvXIBND9+7dmTlzZqrDMMaYBkVEVtekXJ00JYnIKyKyUUTmlTnWSkQmiMjSkv9bVnLtJSVllorIJXURjzHGmN1XV30MrwHDyh27HfhaVXsBX5fcjiEirYB7gMOAgcA9lSUQY4wxyVEniUFVJwNbyh0eAbxe8v3rwOlxLj0JmKCqW1R1KzCBignGGGNMEiVyVFI7VV0PUPJ/2zhlOgG/lrmdU3LMGGNMiqR6uKrEORZ3gwgRGSkiM0VkZm5uboLDMsbsKdUgGt6AaiDVoZhaSmRi2CAiHQBK/t8Yp0wO0KXM7c7Aunh3pqqjVLW/qvbPzq52tJUxJkVUlUjBK+jGw9DcoejGAUS2307EP9OSRAORyMQwBtg5yugS4JM4Zb4AThSRliWdzieWHDPGNFBa9AHkPwmaDxSDFkHRh7D1EnTj4WjxxFSHaKpRV8NV/wdMBXqLSI6IXAE8CAwVkaXA0JLbiEh/EXkJQFW3AP8AZpR83V9yzBiTAKpFaDgX1UjiHqTg2WgyqCAImoduuxkNr40Tmx/V4sTFZWpMGuKez/3791eb4GZMzan60R33QtGngIAjEzLvxpFW94MAI78dCPirKOGGjGsQ31A07zEIzgENAkXR2Nz9kBb/Qlxd6zy2pk5EZqlq/+rKpbrz2RiTBLr9digaCwQAP0Q2wfa/oIEZdXP/kR27Pu1XOxUpCKHl6JbzIfAt6DagAIgAYQjOQjefi0YK0UgeGrf2YRKpQS6JYYypOY1sgeIJRJNCWcVo/vNIqwG7f9+BWej2OyG8BhDUewzo5qovknQI/1rS3BSvxSICWoBuOhki0RGI6jkKyfoX4mhVdTzh39D858E/EbSQaA1kX6TZjYhn959nU2M1BmMau/BGEHcl59bU6C5UA2hwPhraVT7in4JuuRDCK4AQEAT/JCBcxT2lgWt/COdQycj0En6IrC+53xAEvkO3XEhVTd8aXo9u+h0UvRu9VrdHayOBaeiWK1D/pBo9V2M1BmMaP2dXiNvZ7ABPv2ovjxSNgx13E/0kH0Kde4N7fygeHad0qIp7coHnaGh+F2w6robBl7nf0DJ0y0VoxqWI+wDE2T6mhOY/D1pA/MRUjO54AMk+tpaP2zRZYjCmkRNHOtrsaih4ocxoIQFJQzKuq/LaSGA+bP8LENx1MLwg+lUp586C5Y6HIDAJNk2Ovb/aCP4I235EAUXA0QrSr0QyLoPAD1SZmMJrUA0ildWeTClLDMY0AZJxLTg7owUvQCgX3L0h46JobaISGloBW37P7r2JdyR2tZudqhqtVFsKkc2Q/wjq/xYcbSBc1arSXlRDlhhqwPoYjGkCRARJOw3JvAvEAaF50VFJucegwdLV8tHgIiLb/kwk9yx002lEh5DWVpj4SSFRIhCcBsHlgK+KciHIPS6mn8TEZzUGY5oIDW9Ct10dO/lMC9Etl0L25Ogw0a3XE/1U3/DmN0E+uPtD8BeitZzyNZ0Q6Nbo800/GzyDwX0QIvGWbGvarMZgTBOhRZ+CxumY1SK0eAK6/e9AMQ0zKUC0g3o1tBwFLV8
B4s2nUIjkoPlPoFsuRrffmthZ4A2U1RiMaSoiuVScywAQhOLPou31DZ3+BtuuAQ0Qf/Hmsoqg+BvwTQDfScmIrsGwGoMxjZBquOK6Q54qVkIITCM687ih05LF+wJERyhVlxwK0aKPEh9WA2OJwZhGRDVAZPt96IZD0A2HEMk9GQ38GD3pPrSKKwupemJaQxSmZs1i1sdQniUGYxoR3XYbFI0m2oEcgfBydMuVaHAJ4mgO0S1SKrs6SVHWJx4k7YxUB1HvWGIwppHQ8Abwf0PFuQIBtODF6OibzL+wawKaQdLBOzTVUdQ7lhiMaSzCOSDeOCciEFoaHX1T+DzWdFKGhm24ahyWGIxpLFw9QOPNLHaB+6DokhHhX6l6PaOmxrYajccSgzGNhDhaQdqZVJz9K5B+GVr8WclS1KaU++BUR1AvJTQxiEhvEZlT5muHiNxSrsyxIrK9TJm7ExmTMY1a5t/B0bzcQQfk/R8UfZySkOovL9L8jlQHUS8ldIKbqi4GDgEQESewFog3aPg7VT01kbEY0xRIYCKq+eWO+ktWHrUGglgC4kl1EPVSMn9TjgeWq2pVyx8aYyqhqmhoGRpciMZb2gLQwMxKmovC7PZS141WMbppBJGC/1W5AVBTlMwlMc4H/lfJucNF5GdgHXCbqs5PXljG1H8aXIpuuxbCuSDRvRRo8TjiHRRb0NEe8FK3y1s3ZkHIexAFJOOCVAdTbySlxiAiHuA04P04p2cD3VT1YOBpIG5DqIiMFJGZIjIzNzc3ccEaU8+oBkq20FwDFEVrBJHN6Lar0fDGmLKSdjqIzVOonSIoeCrVQdQryWpKOhmYraobyp9Q1R1a0iiqquMBt4i0iVNulKr2V9X+2dnZiY/YmPrCP5G4wyo1XGGdH3G2Rlq+CtIiObE1FpEtqNow3p2SlRguoJJmJBFpLyUzTERkYElMjWCZR2PqSGQTxH3TCkC4wmctxNMXMu9LfFyNibRGxBab3inhPwkRSQeGAleXOXYNgKo+D5wNXCsiIaLbRZ2v1hNkzC7u/sSdrSzpiPfwmEMayUe3XgXBucmJrbFwH5LqCOqVhCcGVS0EWpc79nyZ758Bnkl0HMY0VOLujfpOgOKv2bXVpg9cvcB7XExZzXugJCnYjN5aUWukKMsGNhvTAEiLfyPN74l+snXtD81uQVq9GdP8oapQ9CmWFHZDcD6RHQ+ike2pjqResEY1YxoAEQekn4mkn1lFqQiWFHZXAArfRP1fQ5tPESm/rEjTYjUGY+o5jRSiRZ+iBf9Fg0urKOkAaZW0uBqfAITXodvvQYMLUh1MSlmNwZh6TAM/o1svI7plZRBwoGmnIM3/WXG56MAUWyRvjwWh+CO0+DPUewSS9TQi7lQHlXRWYzCmnlINR2c7az5oAdFmomIo+gz8X1QsXzw2et7UgWLw/4AWvpnqQFLCEoMx9VVwHmhRnBOFaOF7cY7bgnB1qxgK3011EClhicGYequKRe8iW9CCV9Gi8ejOzXlsi8q6p02zM9/6GIypr9wHEf+zmyO6VWfev6PLRu+4H1q/DZF1yY6wkfNA2impDiIlrMZgTD0l4kGyHie6I9vOZiI3oERrE8Fo34NuRbfdDMFfUhVq4yPp4OyCZFxdfdlGyGoMxtRj4j0asr9Ei8ZAZDMUfwaR38qVUgitAnc/oktn2Ioyu813BogP8fQH30lIE93IxxKDMfWcONsjzUYCEPF/U3lBZ7ckRdSI+T9HWo9FXF1SHUlKWVOSMQ2J7zSiG/GU42gB+U9gtYU9pEG08I1UR5FylhiMaUAk4wpw9Yy2gQPgA2lG9E/Z5jDsuRCElqU6iJSzpiRjGhBxpEPr0eCfiAZmI86OqO9kyD0i1aE1El7w9Et1EClnicGYBkbECb4TEN8J0QOqqDQDzUttYA2eI7rHRfrvUx1IyllTkjENnIhAxuVAWrkztvdzrbgOQtp8iDhsIUJLDMY0ApJxLaRfSLTPIS3aB+E7NdV
hNSyRHHB0SHUU9UIytvZcBeQBYSCkqv3LnRfgSWA4UAhcqqqzEx2XMY2JiANp/mc080YI54KzLeBGiz/D9miooch20B0gWamOJOWSVWM4TlUPKZ8USpwM9Cr5Ggk8l6SYjGl0RHyIqwsi3ujmPr7TUx1SAyIgGakOol6oD01JI4D/atQ0IEtErD5nTB2QzOtAMlMdRgMRhp0LEjZxyUgMCnwpIrNEZGSc852AX8vczik5FkNERorITBGZmZubm6BQjWlcxNkRaf0ReIeDozUVO6jNLl7wT0p1EPVCMhLDYFXtR7TJ6HoRObrceYlzTYXpm6o6SlX7q2r/7OzsRMRpTIOlqkQK3iGSO4TIhr5Etlxcuj2luLriaPkEjrZTgVBqA63XrMawU8ITg6quK/l/I/ARMLBckRyg7MIknQFbP9iYWtD8pyDvXxDOia64GpiGbrkArTCLN97nMBMVAO+RqQ6iXkhoYhCRDJFoA6eIZAAnAvPKFRsDXCxRg4Dtqro+kXEZ05hopBAKXgbK7famRejWa9Dwhl3HfCdg8xsqI4izXaqDqBcSXWNoB0wRkZ+BH4Fxqvq5iFwjIteUlBkPrACWAS8C1yU4JmMal3AOSCVv9uE16ObT0cgWACTzLnC0TWJwDYhNbCuV0HkMqroCODjO8efLfK/A9YmMw5hGzdketKptQPPQgreQzBsRZ2vI/grddDqElwORpIWZWtXtU+GF9MuSFUy9Vx+Gqxpj9oA4mkPaaUR3d4snAIFpu8qLG2n9NngGE90ZrrH1OziJLk3uYNcorGqWI/cOia5cawBLDMY0CtL8PvANr+SsA8ptPCOOFjhavYy0nQxZz9Oo+h3Sfo80vxPJ/hY8g6g+8XmQFvdEJwQawBKDMY2CiBtH1iPgOpiKLcReJP3S+Nc5WuHwHQeZd1J5jaMh8UDmX5H086MdycGfqbq2kAbp59nCeeVYYjCmEZFWL4LnCKJNRGngyEaynkDc+1Z9Xfo54OxC3N3hGhSBond33XRWM+cp4zIk8++JDakBssRgTCMijiwcrV5C2k5B2oxFsr9DfMdVf514kdajIeNKcPYAR4XFBxoIPxS9X3pLml1HpW9zkom4D4wuW25iWGIwphESR1bJYno1/xMXRzMcmTfjyP4Cyf4KnHsnMMJE2tV0JL6TwV3J7nYaANc+SYqpYbHEYIypQMSJtPkEvCelOpRa8oHvjJgjkvUw0JzYTmhfdCRSuU55E2WJwRgTl4gHSRsO+FIdSs1IOrgPQDIujD3sbIO0+RC8x0fLONpAxlVI1r9TFGj9Z3s+G2MqpcVfAMWVnK1u0liyOCD9SsQ7CDxHxG0+E1dXpOWzKYitYbLEYIypnGQSbVgoP0M6DTx9IVIQPR/6KfmxAeAC71E4mt+WosdvnCwxGGMqJennoEWfUKHWIB6k5ShEPABE8l+F/AepdtkJgtTNMhw+ohP3OiMt/lUH92fKssRgjKmUuA9CM2+FvH+D7Hy7cCGtXi5NCtFyvVHSgYIq7swbHSEUmEx0C/idXESbpRxUnzh80PxuRFzReRfufjbcNAEsMRhjquTIuARNOw0C06N7InsGIVJulrTn0Oq7HLQQAlPKFfJCi0cRz0FQPAGN7ICCZ4kmiPIykKzHajQvw+wZSwzGmGqJoyX4hlV+XqJv8LrtFsBP/AwRZ/c4cSASRpztIeMiBIiEl0LxN8Q2X/mQ1u8g7t579DxMzdhwVWNMnRDfECT7C3APoMZvLVqEFn0cez8tHob0cyjtR3DujbR6yZJCElmNwRhTZ8TZAbIeQzedAprHrv4CT8n38facjm2WEvEgze9CM+8EgtHaiEmqhNUYRKSLiEwUkYUiMl9Ebo5T5lgR2S4ic0q+7k5UPMaY5BBnW6T1h+AdBtICnJ2h2Z/YtTdCWWlI2lnx70cclhRSJJE1hhBwq6rOLtn3eZaITFDVBeXKfaeqpyYwDmNMkomrC9LyiZhj6jkQ3XpVSfdDCBB
IOwO8x6YgQlOVhCUGVV0PrC/5Pk9EFgKdgPKJwRjTBIhnAGRPAf+XEMkH7xGIq2eqwzJxJKWPQUS6A32B6XFOHy4iPwPrgNtUdX4yYjLGJJ84mkHamakOw1Qj4YlBRJoBHwC3qOqOcqdnA91UNV9EhgMfA70quZ+RwEiArl27JjBiY4xp2hI6XFWis2A+AN5S1Q/Ln1fVHaqaX/L9eMAtIm3i3ZeqjlLV/qraPzu7ml2ZjDHG7LZEjkoS4GVgoao+VkmZ9iXlEJGBJfFsTlRMxhhjqpfIpqTBwEXAXBGZU3LsDqArgKo+D5wNXCsiIaAIOF9V68M6vsYY02QlclTSFGK3TIpX5hngmUTFYIwxpvZsSQxjjDExLDEYY4yJYYnBGGNMDEsMxhhjYlhiMMYYE8MSgzHGmBiWGIwxxsSwxGCMMSaGJQZjjDExLDEYY4yJYYnBGGNMDEsMxhhjYlhiMMYYE8MSgzHGmBiWGIwxxsSwxGCMMSaGJQZjjDExEp4YRGSYiCwWkWUicnuc814Rebfk/HQR6Z7omIwxxlQuoYlBRJzAf4CTgf2BC0Rk/3LFrgC2qmpP4HHgoUTGZIwxpmqJrjEMBJap6gpVDQDvACPKlRkBvF7y/WjgeBGpcq9oY4wxiZPoxNAJ+LXM7ZySY3HLqGoI2A60TnBcxhhjKpHoxBDvk7/uRhlEZKSIzBSRmbm5uXUSnDHGmIoSnRhygC5lbncG1lVWRkRcQAtgS/k7UtVRqtpfVftnZ2cnKFxjjDGJTgwzgF4i0kNEPMD5wJhyZcYAl5R8fzbwjapWqDEYY4xJDlci71xVQyJyA/AF4AReUdX5InI/MFNVxwAvA2+IyDKiNYXzExmTMcaYqiU0MQCo6nhgfLljd5f5vhg4J9FxGGOMqRmb+WyMMSaGJQZjjDExLDEYY4yJYYnBGGNMDEsMxhhjYlhiMMYYE8MSgzHGmBiWGIwxxsSwxGCMMSaGJQZjjDExLDEYY4yJYYnBGGNMjIQvomeqt3rBr0z9dBZuj4ujzh5E2y5tUh2SMaYJs8SQYq/e9T9GPzaWcCiMw+HglTvf5qZnr+KkS49LdWjGmCbKmpJSaMms5Xzw+FgCRQHCwTBBf5BAcZCnrnuRrRu3pzo8Y0wTZYkhhSa9+z2B4mCF4w6ng+ljZ6UgImOMscSQUiKSsseePHoql+5zIyd7z+fSfW9iykfTUxaLMaZ+SUhiEJFHRGSRiPwiIh+JSFYl5VaJyFwRmSMiMxMRS3127HmD8fjcFY5HwhEG/e7Qaq8Ph8NM+Wg6//zDkzxxzQssnL4UgDWL1jJ59FRW/LI67nWT3v2ehy99hrXLfiMUDLN2yXoevPApJo+eumdPyBjTKCSq83kC8LeSPZ8fAv4G/LWSssep6qYExVGv9eq3F2f96XeMfvRTQoEQqgqqDBzeD4D/3PwK416YQDAQwu11cdLlx3PtY5fg8bqJRCLcPeIhfvl2AcUFfsQhTHjjW9p1a8vG1bk43U7CoQi9Du3BA2PvID0zrfRxX7r9LfyFgZhY/EUBXvrbWxx99uFJ/RkYY+qfhNQYVPVLVQ2V3JwGdE7E4zQGl91/PqdddxIOh6ARRRV+HD+bC3tcz8dPf0YwEP0xBv0hxj73BbcPvR+AaWNn8cvkaFIA0IgSKAry66K1+IsCFO4owl/oZ/GPy3jmppdLH09V2bAmN24sv63cmOBna4xpCJLRx3A58Fkl5xT4UkRmicjIqu5EREaKyEwRmZmbG/+NrSHK31bAJ09/RigYLj0WKA7iL/THLT9/2hKWzFrOlI+mU5wfv0xZQX+ISe98TyQSAWDci19V2reR3bn1bjwDY0xjs9tNSSLyFdA+zqk7VfWTkjJ3AiHgrUruZrCqrhORtsAEEVmkqpPjFVTVUcAogP79++vuxl3fzJrwc2mtoCY0oiybvZKMzHQcDiESqf5HEQqGiYQjjBs1gRduewONc4033cul/zi/VrEbYxqn3U4
MqnpCVedF5BLgVOB4VY377qWq60r+3ygiHwEDgbiJoTHZvmkHbz3wAd9/9CN5W/Nrda3D6aBl+yz2OqQbn738Nf6iQLXX9B7QE6fLyet3vxe3JuJwOrjh6csZetExtYrFGNM4JaTzWUSGEe1sPkZVCyspkwE4VDWv5PsTgfsTEU+q5SxZx+evfMOWDdvoO+RAXrvrHTav30q4TPNRTYkI9575CA6H0K1PF1YvzMHjcYNE+w/CwXDM3AhxCBffey5Bf5C8LXlx79PpcjDssiEVjofDYTav3UKzls1iOq+NaWry/H6m/LoaQTiqazcyPJ5Uh5RQUsmH+T27U5FlgBfYXHJomqpeIyIdgZdUdbiI7AV8VHLeBbytqg/U5P779++vM2c2jNGtk0dP5eFLniEUChMOhnF5XISDIXbnx16+6cjpdtKld0dG3HAyy+espGB7IZNHT6uQcESEDnu3Y3vuDgq2V8zTbo+Lj7e9jse365f967e/49lbXsVf6CcSVo46axB/HHU1vnRv7QM3pgH7bOlibp3wOS6JdsmGNcJTw07l+L32TnFktScis1S1f7XlEpEYEq2hJIaAP8jZbS+nKK94j+7nlKtPID0zndGPfVqxf0CIduHXRCVl3V4XNz07kmGXRddn+nnSfO489V8xzU4en5vDTj2Uu9+7dbeegzEN0W/5eQz57ysUh2L7AX0uF5MvvYo26ekpimz31DQx2MznBNmxOY8nrxlVOpx0T4wb9RUfPjE2bqdxjZNCFWWD/hAfPjGWUDD6y/+/Bz+q0BcRKA4y7dNZbMu1NZxM0zF2yWIicT48C/DZsiXJDyhJbHXVPTD767l88sxn7Nicx5FnHsbwq04gLcPHttztXNP3z2zflBf/zby2FMKhyJ7fTxVWL8jhvrP/zf0f/5XFM5bFLeP2uNiyfhtZ2S0SGotp2lZt28p78+eRW5DPsd334sS9e+J2OhP6mMWhII9N/Z7RC+YRDIcZ1KUrDwwZSlEoSChc8W8vFIlQFKy4zlljYYlhN7378Me8ef9oiks+WS+dtYLPXv6GZ6b/i/ceGcP2TXmEajEMNdUi4QjTxs7isauepzCvKH6ZSISOPeONUDambkxYvoybvxhHOBIhGInw2fKlvPTTTN456zy8rqrfriKqzM/diKrSJ7stTkf1DSIb8vNRVU5/7y02FhSUHv965Qq+efkFHhgyFK/LSVG5piSnw8Gx3Xvs3pNsACwx7IYdW/L4773vxYz+8RcF+G3lRia8Pokfx89uUEmhlMLnr35TaZPTGTcNt85nkzCBcJhbJ3wW055fGAyyePMm3ps/l4sO7lvptbPXr+PacWMoDEaHb6e53Nx6xJEs2byJUCTCKb16M6BjJ0SEsUsW8ejU7/l1x3ZQpbK6uAJ3fDOB5l4voXCEoEYQwOdyc/4BB7JP68a7oZYlht2wcOoSXB5XhSWz/YV+vv9kBlltW7B6QU6KottDlSQFt9dFj4O68vBlz7Bt4w6OOG0AQy8+Gm+al03rtvDsza8wbexsHE4Hx553BNc8egnNsjKSG7tp0OZu/C3u8eJQiDFLFlWaGHb4/Vzy8WgKyjTtFASD/O3rL9k5x3/0gvmcue/+dM/K4tFp31foTK7KDr8fj9NJp/RMBnTszKDOXRjaAEck1YYlht2Q2apZ3L4DcQhZ2c1Zu3R9CqJKLFXl0cufI+APohFl7rcL+PT5L3jk63u48bC/seW3bURK2mK/fus7Fs9Yxgtz/o2jBtV5YwB8Tlfcjl6ANHfFVYgh2hQ0avYMwpVct/NoUSjI+wvmEoxEajVeY6dAOMyGggLGLV3M2KWLuePrLzm0YydePe1M0ms5pyEQDuN2OFK67H51LDHshv0G7UOL7OYUF/pjEoTH58ab7mHl3DUpjC4xQsEwocCu+RHFhX7WLl3Ps7e8Rv72wtKkABAKhNiwKpefvp7LoUMPTkW4pgHaP7strXxpFJbr1E1zufnDgbG/RxFV7vj
6Sz5ZvBAl+mZbnUBkzwZwhMpdP2PdWk588zUmX3YVjhq8yU9evYp7v/2aNdu3k+ZycfFBffnj4YNx1cMPT/UvogZARHjoy7vouHc7fBle0pun4U33ct0TlzF93E81WqaioREq/uL7CwP8PGkexfkV52kE/EFWz2+gzWkmIWavX8dp77xJz6cfo98L/+Hp6VMJl3mzFRFeOu0MWqelkeH2kO5y43U6ObfPAQzpvhejF8zjDx++x8UfjebPX37GmCWL8IfDNUoKibIuP4+vVyyvttxP69dxzbhPWLVtGxFVCoJBXvt5Nvd/+00Soqw9m+C2B1SV5XNWUbC9kH0G7I03zcMp6X9omB3P1XC5nTErwO7Us18Pchaviztf45hzDufv7/4pGeGZem7RplzOeu/tmNE9aS4XZ+3Xh/uPi112LRgO892a1WwtLmJgx850bt6cy8d8yI9rc0qvr828zkTbp1VrLut7KKf26h13qQx/KMRVn37MlF8rbpzldTqZfuW1NPcmZ1CHTXBLAhGhZ98eHHxsH7xpHl7+29ulk8Qam1YdWlZoE/Wme7ns/vNxueOPMf9hzAw2rG48S6Sb3ffsjOn4y32yLwqFeH/BPBZs3MjcjRvwh6KbVS3buoVWaWmcts++dGnRgu9/XcOMdWtjkkp9SQoAS7Zs5h+TJ3LUay+yYuuW0uMb8vO59JMPOPD5p+MmhbLl6hvrY6gDkUiEe854hGljZ9av39g60qpjSx768i7+euI/yNuaj4gQCoS46O6zGTi8H4cOPZhv36+4LajT5WTOxHmcdOlxKYja1CcLNuXG7VgORiKc+f7beBxOIqr4XE6Kw2EEwQHcc+xxzN2woUK/Q31TGAxSGAxyytv/JdPrZVCnLsxcv5bcgoJKO8YB/OEwv+7YTq/W9WsvFEsMu2ne94v46o1vCYcjtOuazcwJcxplUoDo5LeNazbx6KR7eeG2N1gyczld9u1InyN6A1RaK9CIktGiYa0lYxJjvzZtWLVta4XkEFElUKafoDAUmwBum/BF0mKsC/5wGH9hIWOXLq7xNS//NJMhPfZKYFS1Z4lhN7z8t7f46Onx0U7mRpoMytq2YTt/PfEfADhcDiKhaKLsLXljAAAfHklEQVSYN2UxNz17JSsqGYUVDIQYeHLlk5JM03HdgEF8s3JFhRnEhuhEu3rG+hhqKWfJOj58chz+wqaRFMqLlFmzyV/o59+X/YdAJaOw0jJ9MUt5m6ZrvzbZvH762eyf3RYB0t1uPAle/6ghcIjQr33HVIdRgdUYaunL/06q1VacjV1Vg9q67d85eYGYeq9/x06MveAiVJW8gJ+BLz6X6pBSzudycdNhh6c6jAosMdTCG/94n3cf+qRuVkxtAiJh+zmZitZs386Ls2eQ4fEQKN6zvUoasjZp6bx15rns1bJVqkOpIGFNSSJyr4isFZE5JV/DKyk3TEQWi8gyEbk9UfHsqWU/reTdhz6OmeFrqrZkxjIa4jwZkzgLcjdyyv/+y7vz57K1CScFiO7IWN9GI+2U6BrD46r678pOiogT+A8wFMgBZojIGFVdkOC44lJVPn/lGz58cjwF2wo47NR+XHjXObTu0JIvXpsY7VcwNaYa7ZPp0rtTqkMx9cT9306s90NPk6Vz8xZ8tmwJC3I30rVFFift3YtmHk+NltdItFQ3JQ0ElqnqCgAReQcYAdRpYggFQ3z6/Jd8/vI3hMMRhl50NGfcNLxCx+izt7zK5y9/U7rHwmcvfcP3H/3IC788yrgXJtRlSE2C0+2wGpaJ8dNv61IdQr3gdjjI2b6dv0z4nIJgEIcIf/3qC1wOB6f02of7jj2e5l5fyuJLdGK4QUQuBmYCt6rq1nLnOwG/lrmdAxxWlwGoKnePeJhfJi8o3a7yjfve54dPZvD4d/8oXf1zy29bGTfqK4L+XZ9mwqEw+dsKeOD8x63DeTdktW1B1/2sA9rs0szjafJNSABhVTYW7toYaOf8jlAkwvi
lS1mxdSsfn/eH0tUGthUX8fmypeQHAhzVrTu9E7wXxB71MYjIVyIyL87XCOA5YG/gEGA98Gi8u4hzLG6jtIiMFJGZIjIzN7fmyywsnLaEud8tiNnD2F8UYOXcNcz4fE7psaWzV+LxVVzaN+gP8fPE+TV+vKbGl+Hl0v87nwOO3BdvySY+Hp+btEwfd713a71eWtgk3yUH98VXzU5sTUFly4sDBCNhlm/dws8bovtTfLd6FYNfGcU/Jk/kkR++44x33+KuiV8ltP9uj14hVT2h+lIgIi8CY+OcygG6lLndGYhb11TVUcAoiC6iV9MY5/+wJO76RUX5xcybsojDhvcDoG2X1nEXiTOVa966GTf95yoK84oozCsio0UaPQ7qyuARAxl+5fE0b52Z6hBNPXPdgEHk7NjBx4sXVljG2sRauXUr+7XJ5rrxn8ZMDAxGIny4cAEn9NibYxK0vWgiRyV1KHPzDGBenGIzgF4i0kNEPMD5wJi6jKN1hyzcnoo1AW+ahzaddg0T63FgN7r36YzTZXP+amrH5nweuuQZnrr+JVb8vJot67exfPZKxr0wAbfXPhWailwOBw8PHcakS64gvZLNd0x0/bV9WrdmWs6vxKt0F4WCfLAwcS0ZiXwXfFhE5orIL8BxwB8BRKSjiIwHUNUQcAPwBbAQeE9V6/TZHnH6QFyeim9STpeT4y4YHHPsrvdsiejaCvqDMcuMBwMhtm3czoT/Tk5hVKa+65jZnOeGn2ZLL1QiQnTjoqoW4Kvq3J5K2Ouiqhep6oGqepCqnqaq60uOr1PV4WXKjVfVfVR1b1V9oK7j8KV7eXTSfXTepwPeNA++dC/tumXz0IS7aN4qtqnjp2/m4/bap5g9VVzoZ/bXv9So7MY1ucz+6hc2/ropwVGZ+uaobt2546hjUx1GvRQIh5mas4bDO3eJ2x+R7nZzxr77Jezxm0R9v8cBXXll4ZOsX7GBSDhCp14d4naKblm/lUCxjbHeU063k/Y92lZZJuAP8uCFTzJ93GzcXjcBf5DBIwbwl9dviNv0ZxoXVeWZGdP4z4zpqQ6l3pq8ZjVHdOnGYyeezC1fjCeiSjAcxudyM3SvnhzfY++EPXaTSAwQ3VSn497tqyzT54jeeNI8cbeqNDXncrv43TUnVlnmlTveZvr4nwgUB0uT8Q9jZvLfe9/nin/+PhlhmhQav3QJz86YntJtOeu7rpktADhx715MvPgKPl2yiLyAn2O79eCQ9vE/3NaVJpMYqlOUX8SEN77FH2eLSlMzbq+LzFbN+MtrN9CpZ4cqy45/8asKq7IGigKMff5LSwwNWG5BAXdP+oqvV64gokp2ejr3HHM8w3r2Ki3jD4X469dfVNjRzeziFOH0/fYvvd2uWTOu7Fftjpx1xhJDibt+9yALpi21tX32wCHHH8D/jflb6aTByqhq6ezy8oryixIRmkmC4lCQ0955gw0FuyZubSgo4LrxY/A6nZy57/7kFhbwzcoV2EDVyrlEeGLYKSkdtWWJAVg5bw2LZiyLmfVsam/u5IXVJgWINuv1HtCTRdOXVji3/+G9ExGaSYIPFy5gc2H8xO4Ph/nf/LlJjqhhObVXbw5q155T9+lN+2apnQNko8WAnMXrcLps05A9VZzvZ+uGbTUqe+MzV+Br5sPpjv7cXW4nac18XP/U5YkM0STADn8x140bw10TvyKkVhfYHUJ0uZAr+/UvTQrhSITvVq/ig4XzWbF1S1LjsRoD0K1PF8JxZj27PE7C4Qhq+wrU2NRPZzH8yuMrPa+q/DxpPqsX5HDzs1exYNpils9ZzT6H7sVZfzyV9t2rHs1k6p/LPvmQeRs3NMUNDeuMAu8tmMfFB/clt7CANKebW74Yx3Z/MQqEI8qwnr149MSTk7L6qiUGoOu+nTj4uAOY883c0hEyIkIoYJ1jtVWwvaDSc/nbCrj1uHtYv3wD4VAYp8tJu+7ZPDrpvgpzSkzDsHBTLos25RK05S32WESVU97
+LxluNwXBYIVE++Xypbw7rxMXHHhwwmOxpqQS93xwGyNuOJlmLTNwe93sMyBxY4Qbs1F/foOXbn8z7rnn//QaaxaupSi/mEBxkKL8YnKWrOc/N7+a5ChNXcnZvh1XDfqVTM0okB8nKQAUhUK8OffnpMRhr2gJj9fNyIcv4qPNrzG+6G1atm2R6pAarHcf+YSZX1b8BZ707g8xy2cAhAIhvhs91UaDNVD7ZWfbXIQkKg4lZ/l/SwyVWL9iQ6pDaLg0Wjt4/99j+GHMDMKh6BtHuJJNe8Iha4ZoqDo3b8Gwnr1Is6W0E87rdHJKr+SM2rPEUIkDjtzPVlrdA2sW5vDynW/z4EVPcfl+N7N1wzYGDDsEhzP2Z+pwOuh/0sG2b0MD9sjQk7n5sCOsSSkBdv5VpLvddG7egquSNMnN0nwlzr/9dCa+M4Wi/GI0Em3mcDhtq8qaUoVwMExRMEygKMjTN7zEDU9fwaIfl1GUV0RxgR9fhhdfho8bn7ky1eGaPeByOBh56ABapaXx94lfWdNSHfE6nZy+7/4EwmGO6NKVU3v1xpukmpk0xLbd/v3768yZMxP+ODlL1vHK3//HL5Pmk9W2BTlL1lmzx25yupyML34bf1GAiW9PYfkvq+hxQDeO/8ORpDVLS3V4pg6oKgNefI4txTZ7fXdJyb/HduvOPccOoWuLrLq9f5FZqlpttcNqDFXovE9H7n7v1tLbw9N+b4mhEm6vmxZtMtny21YiceZ97PwAkpbhY/hVNdr4zzQwIkJB0FYP2F0OhGv7D+ScPgfEJIRwJMIOv59MrzdpzXXWKFgLR4wYYDOkKxEJh3ll0ZMcffbhFX5GO/sRarJchmnY2jXLSHUIDZLL4eCuY47j1iOOjEkKb/0yhwEvPsfhr7xA3xee4anpP1S5X3Rdsb/UWrj+qcvJ7twaj8/2CyjP7XMz55t5XPfEZbTu2JK0Zj4A0pr5yGrbgpufvSrFEZpkuHng4TZCqZZ8Lhcn7d2LC8tNXPtk0QL+OeVbtvmLCYTDFASDvDBrBs/NTPweFgl5BUXkXWDnuKosYJuqHhKn3CogDwgDoZq0fSWTqjJvyiIWTF1C644tOfLMw3h18ZPcetw9LPhhSarDq1ccDgciQst2Wby6+CmmfDCNlXPX0HW/zhx9ziC8ad5Uh2iS4Iz9+pAXCPDYtO/Z4bcl7KvT3OvlhgGDOHv/PjjL1aif+nEaReXmLRSFQoyaNYNr+x+W0KUxEpIYVPW8nd+LyKPA9iqKH6eq9W5fx2AgyJ2n/ouFU5cQ9Ifw+Nw8e/MrPDrpPtp3b2eJoRyNKH2PPwCIThYc8vujUhyRSZWLD+7Lj2tzmLB8KcEGOLglmQLhME9On8qjU6dw51HHcuFBuz4/r8/Pi3tNYTCIPxQiLYHLcie0KUmig9PPBf6XyMdJhDH/+ZwFPyymuMBPOBSmKL+YvK0F3H/uY5x4yTH4MpruJ2BHmfkdbo8LT5qbO9/5o9UKDADzN25g4qoVlhRqoDgUoiAYwB8O888p37JwU27puX1atYl7Tev0dHwJbq5LdB/DUcAGVa248H6UAl+KyCwRGZngWGrl81cn4i8MVDieu2YTHfZqx7DLh+BJ8yDOpjUxq8eBXXE4HKUT1RQYeHI/Bp7cN7WBmXpj2tocQjbfp9YC4TCjF8wrvX37kUdXSABpLhd3HHlMwieE7nZiEJGvRGRenK8RZYpdQNW1hcGq2g84GbheRI6u4vFGishMEZmZm5tbWbE6s3NSW9xzqhz/h6M4+Ng+NKW1hp1uBzlL1hEKhEon+oUCIWZ+MYeZX8wBIBwO8+37U/m/8x/n8atfYPGMZakM2aRA67Q03E4b11JbEVXyA7s+jA7q3IXXRpzFoR06kunxsH+bbJ4adiqn9d4v4bEkbIKbiLiAtcChqppTg/L3Avmq+u/qyiZjgtu7D3/MG/e9j7/cvsQd9mrHxfeewxPXjIpbo2isfBle9j2sF0tmLKcwr+I
EpqEXH8OtL13L34Y/wMKpSygu8CMOweNzc/kDF3DmzaemIGqTCoXBIINeep78YNP5+6gLaS43T518Csf3SNzKzjWd4JbItH4CsKiypCAiGSKSufN74ERgXryyqXDGTcPZu28PfCXDLr3pXtKbp3H7mzfy9A0vN6mk4HA6uOX5kZx23bBdi7eUIRJNAN9//GNpUoBorctfGODlv73N9k07khy1SZV0t5s3zzyHTLenwjkH0eaQtukZeJ02J6isHi2zOK77XqkOA0hsYjifcs1IItJRRMaX3GwHTBGRn4EfgXGq+nkC46kVj8/D45Pv5+73/sQf/n4W1zx6CW+teo5QIEzQn5ylb+sLp8vJto07GDDskLhNbJ40NydecizffTC9NCmU5XK7mDNxfjJCNfXEQe3aM/vq67lxwCDapKfjcjjYp1Vr/jL4aLJ8aewI+PHbmkqlPE4nlx9yaFJ2Z6uJhHVtq+qlcY6tA4aXfL8CSPxWRHvA4XAwYFhfBgyLdqwum7OSv//uXwT9TWvafyQSIRgI4Uv3cs8Ht3HvGY8gDiESiaAR5dw/j2D/w3vz5euTEIdUTB4C6Zm+1ARvUsbpcDCsZy/ygwGKgkH6d+zE37+ZQLElhAoEYUiP+lFbAFsrqVYevPApivKK457zpnvI7tKadcs2xKzA2r5HNoU7itixOT9ZYdY5EWHg8GhyPHTowbyzbhQ/fDKD4gI/A4YdUrpP88lXHM9Xb06u0MzmdDk5ZMgBSY/bpNarc2bzyA/fEQiFiaCMXjifsG0BGsNBNIHu26YNp73zJntlteKGgYPo37FTyuMyNbDlt62sW/5bpefP++vpDDipL+VrgpvXb6PDXu1weWqeg93e+rfkxp3D/1m6eVFG83SGXnQMv7vmxNKkANB7QE8uf+AC3D436ZlppDdPI7NVM/45/g7cnvr3nEzibCos5OHvJ1McChEpGboXikSa0iC+SjXzeOjdug3De+7DpQf3w+lwMHfjBnJ27GDymlVc/PFovl65PKUxWo2hhpwuZ6VDU9t2bcNFd53Dac0vqrD6arA4yJpF60jPTKNge2HpbmZur4tIWFHV0kXnzv3LCA46al/uOOVfCX0uNeF0OUtjDQVCbFm/lX+c9xjPznioyuvOvPlUTrjwGOZMnEdaMx+HDDnAkkITNGXNalwOh/UjlJPmdPHQCScxbO9ehFW5+KPRFbbrLA6FuHfSNwzpvlfKNrCyxFBDLdo0Z+++PVgyYxmRMm3onjQPw686HlWluDD+2jD+Qj8vz3+cN+5/n1lf/ozT5SQ3ZxPhcBg0Ouonq20LTr/xZK4++DbCwdT+MXl8bgLFsf0okYiyev6vbFq7mTadWld5ffPWmRx99uGJDNHUc16XE4k3hI3owLadf0E+p5PuWS3ZXFREUSBA22bNWJ+fV2GNoMbA7XAwuGtXlm7exO1ffUl+oPK1pH7Lz6cwGCTDU3FkVzJYYqiFO96+mT8edReFecWEAiGcLgf7DuzFObeNQETY77BeLJhacQ2lPkf0Jrtza/406hrCoTDndriSUGDXm38oEGJ77nZeueOtuHMEki2rXQs2rq64fJU4pMmNyDK759huPdA4VWyf08VJPXsxa91a0txuLjzoEH5/wEExC8i9+cscHvhuUqOpbfTIyuKAtu04Z/8DmbJmNS/MmlFt4vM4HQlf9qIqlhhqoUOPdry58lmmj5vNxjWb6D1gb/YbtE9pde/GZ67kT8fcTdAfJBQM43K7cPtcXP/U5aX3sWbR2rhvrkF/iJ++mRfdE7MS2V3a0KJtJstmraz7Jwe43E4uvu9civP9vP/opxVGX2Vlt6B9j7aVXG3MLmluNy+cejpXj/0YQYighCMR/nT4YK6sZt/iDpmZDb4ZygEc270Hdx09hG5Z0f0VikNBRo79uELTUXk+l4vfH3BwhdVWk8kSQy253C4Gnz4w7rmefXsw6pdH+eDxsSyfs4qe/Xpw1i2n0q5bdmmZjOZ
phILxfzFatm0BEeW3gopLfhx3/mD+8voNiEM4t/2V8Uc5CWS1y2Lbhm01XqrD4XTQukNLWndqxXl/GcGRZxxGwY5Cvv/4Rzas2URxfjEenxuny8ntb96UsjZP0/Ac0aUr06+8lkmrVlIcCnJU1+5kZ1S/kc8x3XqQ4fY0uN3gnOLA53LSvlkmb595boXnmltQiKOS5jWADLeHUCTM6b334y+DU7s6sSWGOta+e1uuf/LySs+vXrg27nFxCE63E3ecTYAcTgcX33cuLnf05Xr4q3v48wn3UbC9kEhJZ3eL7Obc99GfuWP4PytNCg6nI2YorTfNw1FnDeKv/70xplxG83SenfUwUz6YxpxJ8+nQoy0nXnocrTu0rPK5G1NeutvN8F771Lh8RJWvViynfbNmbCwsSGBkdUuAo7t24+r+AxnQsVPcD1BtMzLirhwAcETnLtxzzPG0a9aM5t7Ur1KcsLWSEikZayUlwtLZK/jj0XdVupyGSPyWJJfbyYgbhnHNo5eWHgsFo01PW3/byr4De9F1v87s2JLHeR1HEgrEr5G4PS4ysjLYnrsDb7qHU68+kcv/eYGNGjL1gqpy0+djmbhyJYWhhlVbAPjnkKGcf8BBVZZ5+sepPD/zx5g+Bp/Lxdtnnssh7TskOsQar5VkNYYkeufBjwgUVf4LX1mODgXDrF4Qu+SUy+1iwEmxm+JltEjHl+Elv5LEEAyE8KS5GV/8Nk6X05qFTL0yc/3aBpsUAJ6aPpXz+hyIAi/OnsmLs2awtbiInq1ac9fRx3Fk127cMGAQWT4fz838kc2FhezbJps7jzo2KUmhNiwxJNHyOavYnRqax+emzxG9qy3ndDq55L5zefGvbxEoil8rCRQFSpukjKlPJq9eRVEDTQoAW4qLWJeXx//m/cyrc2aX1gqWbtnMyLEf8+YZ59CvQ0cuOqgvFx1Uv/cvsZnPSeIv8rNhTe13MHU4BF+Gj1OvObFG5U+/YTg3PnNF6aS5spxuZ6Ud58akWguvD3clK666HA6cSazh7pXVkusHHMZzw09jcJeueBzVrwQbjkRwOoRXyiSFnYpDIZ6Y9n2iwq1zlhiSZNqns3C6q/9xu71u9j2sJ9mdW9MsK4OjzzmcZ2c+RFZ2ixo/1rDLhvDk9/9HWjNf6fIavgwvrdpnccn95+/2czAmkU7dp3fc1UXTXG7GXXARriQN32zu9fLVxZcTjkS44bNP+eHXNYQi1Q+dPaBtWwLhcKUjj5Zu2VLXoSaMtSkkybbcHWg4fjOSOIT0zDSC/iCHndKP29+4CY9vz2Y89h7Qk1cXP8X4F78iZ8k6Dhi8LydcdDRpzdL26H6NSZT2zTJ5atgp3PLF+NIEIcBzp4ygV+s2HN2tO5NXr6rT+Q1lZ2EDeJ1OztqvD+/Pn8vzs2aUHt9ZxiVCKE5zsBN46XdnkOHxxJ3YB9CrVas6izvRLDEkyUHH7E+FFfaAtGY+bnl+JG27tqF9j7bVLjdRG607tOSiu8+ps/szJtFO2KsnM6+6luk5OTgcwmGduuApaV56ctgp3D3xaz5ZvJDgHq7S6nY4SHO7ad8sk5wd24HoqKiD2nXg1kGDOfyVF+JeFy8pALicTpp5vHhdLi475FBenTOrwsijPw4avEcxJ5MlhiTpcUBXjjnncL77YFrpZjbedC+9+u3FMecdgdN2szIGAJ/LzTHde8Q9/tAJJ/H9r2tYn59X4byD6IzrwmCw0vmd3Vtkken1clinLlzR91DaZmQw57f1rNi2ld6t23BA23ZsKiykIFD7HRrzAgG8Lhe3Hj6YFj4vL8yawbbiYnq1as3fjz6Wvh061vo+U8USQxLd9sp19D/pEMaNmkAoEOKEC49m2BVDLCkYU0ObigrZXFQY95zP7eaBIUP56bf1vD335wq1CpfDwacXXFRhYbq+HTrGvGn7XK7oUO5ajCDM8HholRZtphURruo3gKv6Dajx9fXNHiUGETkHuBfYDxioqjPLnPs
bcAUQBm5S1S/iXN8DeAdoBcwGLlLVRruZssPhYMgFRzLkgiNTHYoxDVKG21P58vcZGZzWez8O79yVTxcvYkfAT6gkOaS5XNxy2BE1Wq20mcfD0V27M2n1ygoPNahTZ37e8BvFoVDpuTSXizuOPKbebMtZF/a0m38ecCYwuexBEdmf6J7PfYBhwLMiEu9j8UPA46raC9hKNJEYY0xc6W43J+y1d2m/w05pLhdX9I1O6M3OyGDc7y/mggMOontWFv07duKpYady1aE1/wT/7xNPpk92WzxOJ26HAwcwpMdevHHGOYw+5wKG9NibtukZ9G3fgf8MP40z9+tTl08z5epkSQwRmQTctrPGUFJbQFX/VXL7C+BeVZ1a5hoBcoH2qhoSkcNLypxU3eM11CUxjDF7Ls/v55pxn/DTb+txOxwEwmHO63Mg9xwzpM5n88/fuIGcvB30yW5L5+Y1HzJeX6V6SYxOwLQyt3NKjpXVGtimqqEqypQSkZHASICuXbvWXaTGmAYl0+vlrTPPZdW2razLy2Of1m1ok56ekMfq07Ydfdq2S8h912fVJgYR+QpoH+fUnar6SWWXxTlWvmpSkzK7TqiOAkZBtMZQWTljTNPQPasl3bNsxd9EqDYxqOoJu3G/OUCXMrc7A+vKldkEZImIq6TWEK+MMcaYJEvUHPMxwPki4i0ZedQL+LFsAY12bkwEzi45dAlQWQ3EGGNMkuxRYhCRM0QkBzgcGFfSyYyqzgfeAxYAnwPXq2q45JrxIrJz0PBfgT+JyDKifQ4v70k8xhhj9pxt1GOMMU1ETUcl2eqqxhhjYjTIGoOI5AKrUx1HHWtDtEO+qbLn37SfP9jPIBnPv5uqZldXqEEmhsZIRGbWpIrXWNnzb9rPH+xnUJ+evzUlGWOMiWGJwRhjTAxLDPXHqFQHkGL2/E1T/xnUm+dvfQzGGGNiWI3BGGNMDEsMKSYi54jIfBGJiEj/cuf+JiLLRGSxiFS7HHlDJyL3ishaEZlT8jU81TElg4gMK3mNl4nI7amOJ9lEZJWIzC15zZvEzFUReUVENorIvDLHWonIBBFZWvJ/ylYItMSQenu62VFj87iqHlLyNT7VwSRayWv6H+BkYH/ggpLXvqk5ruQ1rxfDNZPgNaJ/12XdDnxdsnHZ1yW3U8ISQ4qp6kJVXRzn1AjgHVX1q+pKYBkwMLnRmSQYCCxT1RUl29q+Q/S1N42Yqk4GtpQ7PAJ4veT714HTkxpUGZYY6q9OwK9lble5kVEjcoOI/FJS1W4Ki+031de5LAW+FJFZJRtyNVXtVHU9QMn/bVMVSKJ2cDNlJHCzowanqp8F8BzwD6LP8x/Ao8DlyYsuJRrl61xLg1V1nYi0BSaIyKKST9QmRSwxJEECNztqcGr6sxCRF4GxCQ6nPmiUr3NtqOq6kv83ishHRJvXmmJi2CAiHVR1vYh0ADamKhBrSqq/qt3sqLEp+WPY6QyiHfON3Qygl4j0EBEP0QEHY1IcU9KISIaIZO78HjiRpvG6xzOG6IZlkOKNy6zGkGIicgbwNJBNdLOjOap6kqrOF5Gdmx2FKLPZUSP2sIgcQrQpZRVwdWrDSTxVDYnIDcAXgBN4pWSjq6aiHfCRiED0/ehtVf08tSElnoj8DzgWaFOy2dk9wIPAeyJyBbAGOCdl8dnMZ2OMMWVZU5IxxpgYlhiMMcbEsMRgjDEmhiUGY4wxMSwxGGOMiWGJwRhjTAxLDMYYY2JYYjDGGBPj/wH3xqteceVSdwAAAABJRU5ErkJggg==\n", 66 | "text/plain": [ 67 | "
" 68 | ] 69 | }, 70 | "metadata": { 71 | "needs_background": "light" 72 | }, 73 | "output_type": "display_data" 74 | } 75 | ], 76 | "source": [ 77 | "import matplotlib.pyplot as plt\n", 78 | "\n", 79 | "plt.scatter(X[:, 0], X[:, 1], c=y)" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 6, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "from torch.utils.data import Dataset" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 7, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "class BlobData(Dataset):\n", 98 | " def __init__(self, X, y, trans=None):\n", 99 | " self.data = list(zip(X, y))\n", 100 | " self.trans = trans\n", 101 | " \n", 102 | " def __len__(self):\n", 103 | " return len(self.data)\n", 104 | " \n", 105 | " def __getitem__(self, idx):\n", 106 | " if self.trans is not None:\n", 107 | " return self.trans(self.data[idx])\n", 108 | " else:\n", 109 | " return self.data[idx]" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 8, 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "from torchvision import transforms" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 9, 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "class GaussianNoise(object):\n", 128 | " def __init__(self, std=0.1, mean=0.0):\n", 129 | " self.mean = mean\n", 130 | " self.std = std\n", 131 | " \n", 132 | " def __call__(self, datum):\n", 133 | " x, y = datum[0], datum[1]\n", 134 | " return x + (self.std * np.random.randn(2) + self.mean), y" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": 10, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "class ToTensor(object):\n", 144 | " def __init__(self):\n", 145 | " pass\n", 146 | " \n", 147 | " def __call__(self, datum):\n", 148 | " x, y = datum[0], datum[1]\n", 149 | " t = torch.from_numpy(x).type(torch.FloatTensor) \n", 150 | " return t, y" 151 
| ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": 11, 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [ 159 | "my_transform = transforms.Compose([GaussianNoise(), ToTensor()])" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": 12, 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "blob_dataset = BlobData(X, y, trans=my_transform)" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 13, 174 | "metadata": {}, 175 | "outputs": [ 176 | { 177 | "data": { 178 | "text/plain": [ 179 | "10000" 180 | ] 181 | }, 182 | "execution_count": 13, 183 | "metadata": {}, 184 | "output_type": "execute_result" 185 | } 186 | ], 187 | "source": [ 188 | "len(blob_dataset)" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": 14, 194 | "metadata": {}, 195 | "outputs": [ 196 | { 197 | "data": { 198 | "text/plain": [ 199 | "(tensor([ 7.2354, -7.2486], requires_grad=True), 1)" 200 | ] 201 | }, 202 | "execution_count": 14, 203 | "metadata": {}, 204 | "output_type": "execute_result" 205 | } 206 | ], 207 | "source": [ 208 | "blob_dataset[100]" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": 15, 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": [ 217 | "import torch.nn as nn\n", 218 | "import torch.nn.functional as F" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": 16, 224 | "metadata": {}, 225 | "outputs": [], 226 | "source": [ 227 | "class LinearWActivation(nn.Module):\n", 228 | " def __init__(self, in_features, out_features, activation='sigmoid'):\n", 229 | " super(LinearWActivation, self).__init__()\n", 230 | " self.f = nn.Linear(in_features, out_features)\n", 231 | " if activation == 'sigmoid':\n", 232 | " self.a = nn.Sigmoid()\n", 233 | " else:\n", 234 | " self.a = nn.ReLU()\n", 235 | " \n", 236 | " def forward(self, x):\n", 237 | " return self.a(self.f(x))\n", 238 | "\n", 239 | "class 
MyFunnyNet(nn.Module):\n", 240 | " def __init__(self, layers, n_features, n_classes, activation='sigmoid'):\n", 241 | " super(MyFunnyNet, self).__init__()\n", 242 | " layers_in = [n_features] + layers \n", 243 | " layers_out = layers + [n_classes]\n", 244 | " self.f = nn.Sequential(*[\n", 245 | " LinearWActivation(in_feats, out_feats, activation=activation)\n", 246 | " for in_feats, out_feats in zip(layers_in, layers_out)\n", 247 | " ])\n", 248 | " \n", 249 | " def forward(self, x):\n", 250 | " y = self.f(x)\n", 251 | " return y" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": 17, 257 | "metadata": {}, 258 | "outputs": [], 259 | "source": [ 260 | "import torch.optim as optim\n", 261 | "\n", 262 | "EPOCHS = 30\n", 263 | "BATCH_SZ = 32\n", 264 | "\n", 265 | "criterion = nn.CrossEntropyLoss()\n", 266 | "net = MyFunnyNet([100, 100], X.shape[1], len(set(y)))" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": 18, 272 | "metadata": {}, 273 | "outputs": [], 274 | "source": [ 275 | "optimizer = optim.SGD(list(net.parameters()), lr=1e-2)\n" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": 19, 281 | "metadata": {}, 282 | "outputs": [], 283 | "source": [ 284 | "from sklearn.model_selection import train_test_split" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": 20, 290 | "metadata": {}, 291 | "outputs": [], 292 | "source": [ 293 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": 21, 299 | "metadata": {}, 300 | "outputs": [], 301 | "source": [ 302 | "train_data = BlobData(X_train, y_train, trans=my_transform)\n", 303 | "test_data = BlobData(X_test, y_test, trans=my_transform)" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": 22, 309 | "metadata": {}, 310 | "outputs": [], 311 | "source": [ 312 | "from torch.utils.data import DataLoader" 
313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": 23, 318 | "metadata": {}, 319 | "outputs": [], 320 | "source": [ 321 | "train_dl = DataLoader(train_data, batch_size=BATCH_SZ)\n", 322 | "test_dl = DataLoader(test_data, batch_size=BATCH_SZ)" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": 24, 328 | "metadata": { 329 | "scrolled": true 330 | }, 331 | "outputs": [ 332 | { 333 | "name": "stdout", 334 | "output_type": "stream", 335 | "text": [ 336 | "Epoch: 0 \t Batch: 0 \t Loss 1.1050496101379395\n", 337 | "Epoch: 0 \t Batch: 100 \t Loss 1.0912319209315988\n", 338 | "Epoch: 0 \t Batch: 200 \t Loss 1.0853993975701024\n", 339 | "Epoch: 1 \t Batch: 0 \t Loss 1.0742565393447876\n", 340 | "Epoch: 1 \t Batch: 100 \t Loss 1.0655937678743117\n", 341 | "Epoch: 1 \t Batch: 200 \t Loss 1.0592965284983318\n", 342 | "Epoch: 2 \t Batch: 0 \t Loss 1.0462876558303833\n", 343 | "Epoch: 2 \t Batch: 100 \t Loss 1.0373207472338535\n", 344 | "Epoch: 2 \t Batch: 200 \t Loss 1.0296259086523483\n", 345 | "Epoch: 3 \t Batch: 0 \t Loss 1.0136157274246216\n", 346 | "Epoch: 3 \t Batch: 100 \t Loss 1.0027397664466706\n", 347 | "Epoch: 3 \t Batch: 200 \t Loss 0.9932335570677003\n", 348 | "Epoch: 4 \t Batch: 0 \t Loss 0.9736994504928589\n", 349 | "Epoch: 4 \t Batch: 100 \t Loss 0.9604842149385131\n", 350 | "Epoch: 4 \t Batch: 200 \t Loss 0.9492151719420704\n", 351 | "Epoch: 5 \t Batch: 0 \t Loss 0.9271013736724854\n", 352 | "Epoch: 5 \t Batch: 100 \t Loss 0.9115854619753243\n", 353 | "Epoch: 5 \t Batch: 200 \t Loss 0.8993108892322179\n", 354 | "Epoch: 6 \t Batch: 0 \t Loss 0.8759656548500061\n", 355 | "Epoch: 6 \t Batch: 100 \t Loss 0.8597459108522623\n", 356 | "Epoch: 6 \t Batch: 200 \t Loss 0.8476340933818722\n", 357 | "Epoch: 7 \t Batch: 0 \t Loss 0.8254131078720093\n", 358 | "Epoch: 7 \t Batch: 100 \t Loss 0.8098198824589795\n", 359 | "Epoch: 7 \t Batch: 200 \t Loss 0.7988525907791669\n", 360 | "Epoch: 8 \t Batch: 0 \t Loss 
0.7804373502731323\n", 361 | "Epoch: 8 \t Batch: 100 \t Loss 0.7655479465380753\n", 362 | "Epoch: 8 \t Batch: 200 \t Loss 0.7561692933538067\n", 363 | "Epoch: 9 \t Batch: 0 \t Loss 0.7417194843292236\n", 364 | "Epoch: 9 \t Batch: 100 \t Loss 0.7283268523688363\n", 365 | "Epoch: 9 \t Batch: 200 \t Loss 0.7206358770232889\n", 366 | "Epoch: 10 \t Batch: 0 \t Loss 0.7095876932144165\n", 367 | "Epoch: 10 \t Batch: 100 \t Loss 0.6981357371453012\n", 368 | "Epoch: 10 \t Batch: 200 \t Loss 0.6919632239721307\n", 369 | "Epoch: 11 \t Batch: 0 \t Loss 0.6841816902160645\n", 370 | "Epoch: 11 \t Batch: 100 \t Loss 0.6740923706847842\n", 371 | "Epoch: 11 \t Batch: 200 \t Loss 0.6691658197350763\n", 372 | "Epoch: 12 \t Batch: 0 \t Loss 0.6636543273925781\n", 373 | "Epoch: 12 \t Batch: 100 \t Loss 0.655046019813802\n", 374 | "Epoch: 12 \t Batch: 200 \t Loss 0.6511142351734105\n", 375 | "Epoch: 13 \t Batch: 0 \t Loss 0.6484760046005249\n", 376 | "Epoch: 13 \t Batch: 100 \t Loss 0.6399260346252139\n", 377 | "Epoch: 13 \t Batch: 200 \t Loss 0.6367658649511005\n", 378 | "Epoch: 14 \t Batch: 0 \t Loss 0.6353266835212708\n", 379 | "Epoch: 14 \t Batch: 100 \t Loss 0.6279019282595946\n", 380 | "Epoch: 14 \t Batch: 200 \t Loss 0.6253434209088188\n", 381 | "Epoch: 15 \t Batch: 0 \t Loss 0.6252152919769287\n", 382 | "Epoch: 15 \t Batch: 100 \t Loss 0.6181991295059128\n", 383 | "Epoch: 15 \t Batch: 200 \t Loss 0.6161048032751131\n", 384 | "Epoch: 16 \t Batch: 0 \t Loss 0.6164901256561279\n", 385 | "Epoch: 16 \t Batch: 100 \t Loss 0.6102790773505031\n", 386 | "Epoch: 16 \t Batch: 200 \t Loss 0.6085530635136277\n", 387 | "Epoch: 17 \t Batch: 0 \t Loss 0.6098099946975708\n", 388 | "Epoch: 17 \t Batch: 100 \t Loss 0.6038077149060693\n", 389 | "Epoch: 17 \t Batch: 200 \t Loss 0.6023614732780267\n", 390 | "Epoch: 18 \t Batch: 0 \t Loss 0.6039808988571167\n", 391 | "Epoch: 18 \t Batch: 100 \t Loss 0.5984378769846246\n", 392 | "Epoch: 18 \t Batch: 200 \t Loss 0.5971999681411098\n", 393 | "Epoch: 19 
\t Batch: 0 \t Loss 0.5992460250854492\n", 394 | "Epoch: 19 \t Batch: 100 \t Loss 0.593867818317791\n", 395 | "Epoch: 19 \t Batch: 200 \t Loss 0.5928187652013788\n", 396 | "Epoch: 20 \t Batch: 0 \t Loss 0.5951368808746338\n", 397 | "Epoch: 20 \t Batch: 100 \t Loss 0.5900417584003789\n", 398 | "Epoch: 20 \t Batch: 200 \t Loss 0.5891435481422577\n", 399 | "Epoch: 21 \t Batch: 0 \t Loss 0.5914766788482666\n", 400 | "Epoch: 21 \t Batch: 100 \t Loss 0.5867261349564732\n", 401 | "Epoch: 21 \t Batch: 200 \t Loss 0.585960844559456\n", 402 | "Epoch: 22 \t Batch: 0 \t Loss 0.5889554023742676\n", 403 | "Epoch: 22 \t Batch: 100 \t Loss 0.5839473138941397\n", 404 | "Epoch: 22 \t Batch: 200 \t Loss 0.5832366756538847\n", 405 | "Epoch: 23 \t Batch: 0 \t Loss 0.5856987237930298\n", 406 | "Epoch: 23 \t Batch: 100 \t Loss 0.5814635641504042\n", 407 | "Epoch: 23 \t Batch: 200 \t Loss 0.5808482481472528\n", 408 | "Epoch: 24 \t Batch: 0 \t Loss 0.5832868814468384\n", 409 | "Epoch: 24 \t Batch: 100 \t Loss 0.5793179267703896\n", 410 | "Epoch: 24 \t Batch: 200 \t Loss 0.5787759871032108\n", 411 | "Epoch: 25 \t Batch: 0 \t Loss 0.5814157128334045\n", 412 | "Epoch: 25 \t Batch: 100 \t Loss 0.5774194232308039\n", 413 | "Epoch: 25 \t Batch: 200 \t Loss 0.5769225993559728\n", 414 | "Epoch: 26 \t Batch: 0 \t Loss 0.579628050327301\n", 415 | "Epoch: 26 \t Batch: 100 \t Loss 0.5757626135750572\n", 416 | "Epoch: 26 \t Batch: 200 \t Loss 0.5753221769831074\n", 417 | "Epoch: 27 \t Batch: 0 \t Loss 0.5780152082443237\n", 418 | "Epoch: 27 \t Batch: 100 \t Loss 0.5742460577794821\n", 419 | "Epoch: 27 \t Batch: 200 \t Loss 0.5738395965514491\n", 420 | "Epoch: 28 \t Batch: 0 \t Loss 0.5767131447792053\n", 421 | "Epoch: 28 \t Batch: 100 \t Loss 0.5729207343394214\n", 422 | "Epoch: 28 \t Batch: 200 \t Loss 0.5725429321996015\n", 423 | "Epoch: 29 \t Batch: 0 \t Loss 0.5748416781425476\n", 424 | "Epoch: 29 \t Batch: 100 \t Loss 0.5717051424602471\n", 425 | "Epoch: 29 \t Batch: 200 \t Loss 
0.5713635350934309\n" 426 | ] 427 | } 428 | ], 429 | "source": [ 430 | "net.train()\n", 431 | "for epoch in range(EPOCHS):\n", 432 | " running_average_loss = 0\n", 433 | " for i, data in enumerate(train_dl):\n", 434 | " X_batch, y_batch = data\n", 435 | " optimizer.zero_grad()\n", 436 | " out = net(X_batch)\n", 437 | " loss = criterion(out, y_batch)\n", 438 | " loss.backward()\n", 439 | " optimizer.step()\n", 440 | " \n", 441 | " running_average_loss += loss.detach().item()\n", 442 | " if i % 100 == 0:\n", 443 | " print(\"Epoch: {} \\t Batch: {} \\t Loss {}\".format(epoch, i, float(running_average_loss) / (i + 1)))" 444 | ] 445 | }, 446 | { 447 | "cell_type": "code", 448 | "execution_count": 25, 449 | "metadata": {}, 450 | "outputs": [ 451 | { 452 | "name": "stdout", 453 | "output_type": "stream", 454 | "text": [ 455 | "0.9915865384615384\n" 456 | ] 457 | } 458 | ], 459 | "source": [ 460 | "net.eval()\n", 461 | "acc = 0\n", 462 | "n_samples = 0\n", 463 | "with torch.no_grad():\n", 464 | " for i, data in enumerate(test_dl):\n", 465 | " X_batch, y_batch = data\n", 466 | " out = net(X_batch)\n", 467 | " val, y_pred = out.max(1)\n", 468 | " acc += (y_batch == y_pred).sum().detach().item()\n", 469 | " n_samples += 32\n", 470 | "\n", 471 | "print(acc / n_samples)" 472 | ] 473 | }, 474 | { 475 | "cell_type": "code", 476 | "execution_count": null, 477 | "metadata": {}, 478 | "outputs": [], 479 | "source": [] 480 | }, 481 | { 482 | "cell_type": "code", 483 | "execution_count": null, 484 | "metadata": {}, 485 | "outputs": [], 486 | "source": [] 487 | } 488 | ], 489 | "metadata": { 490 | "kernelspec": { 491 | "display_name": "Python 3", 492 | "language": "python", 493 | "name": "python3" 494 | }, 495 | "language_info": { 496 | "codemirror_mode": { 497 | "name": "ipython", 498 | "version": 3 499 | }, 500 | "file_extension": ".py", 501 | "mimetype": "text/x-python", 502 | "name": "python", 503 | "nbconvert_exporter": "python", 504 | "pygments_lexer": "ipython3", 505 | 
"version": "3.6.4" 506 | } 507 | }, 508 | "nbformat": 4, 509 | "nbformat_minor": 2 510 | } 511 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Python - Numpy introductory Lab 2 | 3 | This is the introductory lab for Pattern Recognition and Speech and Language Processing at 4 | NTUA. Material covers an introduction to Python, NumPy and matplotlib. 5 | 6 | Some of the material is adapted from [this 7 | tutorial](http://cs231n.github.io/python-numpy-tutorial/) in the cs231n course at Stanford. 8 | 9 | # Course Virtual Machine 10 | 11 | To ease the setup process, we provide a virtual machine for each course with the required 12 | software preinstalled. 13 | 14 | To run the virtual machine you need to download and install [Vagrant](https://www.vagrantup.com/) and [VirtualBox](https://www.virtualbox.org/wiki/Downloads). 15 | Then you need to open a terminal and run: 16 | 17 | ``` 18 | # For Pattern Recognition course 19 | cd /python-lab 20 | cp Vagrantfile.patreco Vagrantfile 21 | vagrant up 22 | ``` 23 | 24 | or 25 | 26 | ``` 27 | # For Speech and Language Processing course 28 | cd /python-lab 29 | cp Vagrantfile.slp Vagrantfile 30 | vagrant up 31 | ``` 32 | 33 | Setup will take a while, since it installs all dependencies. 34 | 35 | When the setup is finished, a Jupyter notebook will start in the machine. Navigate to the 36 | printed URL. 37 | 38 | Changes to files will be saved automatically to your current working directory. 
39 | 40 | To access the console inside the VM use 41 | 42 | ``` 43 | vagrant ssh 44 | ``` 45 | 46 | To stop the VM use 47 | 48 | ``` 49 | vagrant halt 50 | ``` 51 | 52 | 53 | ## Bash tutorial 54 | 55 | To run the bash tutorial notebook first install the `bash_kernel` for jupyter notebook: 56 | 57 | ``` 58 | pip install bash_kernel 59 | python -m bash_kernel.install 60 | ``` 61 | 62 | and then run 63 | 64 | ``` 65 | jupyter notebook 66 | ``` 67 | 68 | and open `Bash Tutorial.ipynb` 69 | -------------------------------------------------------------------------------- /Vagrantfile.patreco: -------------------------------------------------------------------------------- 1 | ##################################################################################################### 2 | # CONSTANTS ######################################################################################### 3 | ##################################################################################################### 4 | 5 | VAGRANTFILE_API_VERSION = "2" 6 | 7 | VM_NAME = "PATRECO_VM" 8 | VM_BASE = "ubuntu/xenial64" 9 | VM_MEMORY = 4096 10 | 11 | ##################################################################################################### 12 | # INSTALL SYSTEM LEVEL DEPENDENCIES ################################################################# 13 | ##################################################################################################### 14 | 15 | $install_deps = <