├── README.md ├── Nato ├── Alphabet.wav └── characters.csv ├── Intro ├── images │ ├── both.gif │ ├── launch.png │ ├── toolbar.png │ ├── navigator.png │ └── architecture.png └── source │ ├── salida.txt │ ├── location.py │ └── hello.txt ├── LazyEvaluation.ipynb ├── meshgrid.ipynb ├── Polynomial features.ipynb ├── Memoization.ipynb ├── .gitignore ├── partiallyd-scrape.ipynb ├── youtube-captions-2.ipynb ├── python.ipynb ├── casey-neistat-analisys.ipynb ├── tloz-scrape.ipynb ├── youtube-captions.ipynb ├── bokeh └── x.html ├── Intro.ipynb ├── mt-scraper.ipynb ├── Scalers.ipynb ├── bokeh.ipynb └── insta-api.ipynb /README.md: -------------------------------------------------------------------------------- 1 | # nbstuff 2 | Notebook stuff! 3 | -------------------------------------------------------------------------------- /Nato/Alphabet.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thatcsharpguy/nbstuff/master/Nato/Alphabet.wav -------------------------------------------------------------------------------- /Intro/images/both.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thatcsharpguy/nbstuff/master/Intro/images/both.gif -------------------------------------------------------------------------------- /Intro/images/launch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thatcsharpguy/nbstuff/master/Intro/images/launch.png -------------------------------------------------------------------------------- /Intro/images/toolbar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thatcsharpguy/nbstuff/master/Intro/images/toolbar.png -------------------------------------------------------------------------------- /Intro/images/navigator.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/thatcsharpguy/nbstuff/master/Intro/images/navigator.png -------------------------------------------------------------------------------- /Intro/images/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thatcsharpguy/nbstuff/master/Intro/images/architecture.png -------------------------------------------------------------------------------- /Intro/source/salida.txt: -------------------------------------------------------------------------------- 1 | 2 | # 3 | ## 4 | ### 5 | #### 6 | ##### 7 | ###### 8 | ####### 9 | ######## 10 | ######### 11 | -------------------------------------------------------------------------------- /Intro/source/location.py: -------------------------------------------------------------------------------- 1 | class Location: 2 | def __init__(self, name, region): 3 | self.name = name 4 | self.region = region 5 | -------------------------------------------------------------------------------- /Intro/source/hello.txt: -------------------------------------------------------------------------------- 1 | ________ __ ______ __ __ 2 | /_ __/ /_ ____ _/ /_ / ____/_/ // /_ ____ ___ ____ __ 3 | / / / __ \/ __ `/ __/ / / /_ _ __/ / __ `/ / / / / / / 4 | / / / / / / /_/ / /_ / /___/_ _ __/ / /_/ / /_/ / /_/ / 5 | /_/ /_/ /_/\__,_/\__/ \____/ /_//_/ \__, /\__,_/\__, / 6 | /____/ /____/ -------------------------------------------------------------------------------- /LazyEvaluation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "a = 2\n", 12 | "b = 3\n", 13 | "def odds(n):\n", 14 | " \n", 15 | " return a + b\n", 16 | "\n", 17 | "def odds_lazy():\n", 18 | " 
yield a + b" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": { 25 | "collapsed": true 26 | }, 27 | "outputs": [], 28 | "source": [ 29 | "c = calculo()\n", 30 | "print(calculo())\n", 31 | "\n" 32 | ] 33 | } 34 | ], 35 | "metadata": { 36 | "kernelspec": { 37 | "display_name": "Python 3", 38 | "language": "python", 39 | "name": "python3" 40 | }, 41 | "language_info": { 42 | "codemirror_mode": { 43 | "name": "ipython", 44 | "version": 3 45 | }, 46 | "file_extension": ".py", 47 | "mimetype": "text/x-python", 48 | "name": "python", 49 | "nbconvert_exporter": "python", 50 | "pygments_lexer": "ipython3", 51 | "version": "3.6.1" 52 | } 53 | }, 54 | "nbformat": 4, 55 | "nbformat_minor": 2 56 | } 57 | -------------------------------------------------------------------------------- /meshgrid.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "\n", 13 | "u = np.linspace(-2, 2, 6)\n", 14 | "v = np.linspace(-1, 1, 4)\n", 15 | "\n", 16 | "# Generate 2-D arrays from u and v: X, Y\n", 17 | "X,Y = np.meshgrid(u,v)" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 17, 23 | "metadata": {}, 24 | "outputs": [ 25 | { 26 | "name": "stdout", 27 | "output_type": "stream", 28 | "text": [ 29 | "[-2. -1.2 -0.4 0.4 1.2 2. ]\n", 30 | "[[-2. -1.2 -0.4 0.4 1.2 2. ]\n", 31 | " [-2. -1.2 -0.4 0.4 1.2 2. ]\n", 32 | " [-2. -1.2 -0.4 0.4 1.2 2. ]\n", 33 | " [-2. -1.2 -0.4 0.4 1.2 2. ]]\n", 34 | "\n", 35 | "[-1. -0.33333333 0.33333333 1. ]\n", 36 | "[[-1. -1. -1. -1. -1. -1. ]\n", 37 | " [-0.33333333 -0.33333333 -0.33333333 -0.33333333 -0.33333333 -0.33333333]\n", 38 | " [ 0.33333333 0.33333333 0.33333333 0.33333333 0.33333333 0.33333333]\n", 39 | " [ 1. 1. 1. 1. 1. 1. 
]]\n" 40 | ] 41 | } 42 | ], 43 | "source": [ 44 | "print(u)\n", 45 | "print(X)\n", 46 | "print(\"\")\n", 47 | "print(v)\n", 48 | "print(Y)" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 3, 54 | "metadata": {}, 55 | "outputs": [ 56 | { 57 | "data": { 58 | "text/plain": [ 59 | "6675.5" 60 | ] 61 | }, 62 | "execution_count": 3, 63 | "metadata": {}, 64 | "output_type": "execute_result" 65 | } 66 | ], 67 | "source": [ 68 | "a = np.array([7921,5184,8836,4761])\n", 69 | "a.mean()" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 4, 75 | "metadata": { 76 | "collapsed": true 77 | }, 78 | "outputs": [], 79 | "source": [ 80 | "r = a.max() - a.min()\n", 81 | "b = a - a.mean()" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 5, 87 | "metadata": { 88 | "collapsed": true 89 | }, 90 | "outputs": [], 91 | "source": [ 92 | "b = b/ r" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 6, 98 | "metadata": {}, 99 | "outputs": [ 100 | { 101 | "data": { 102 | "text/plain": [ 103 | "array([ 0.30564417, -0.36601227, 0.53018405, -0.46981595])" 104 | ] 105 | }, 106 | "execution_count": 6, 107 | "metadata": {}, 108 | "output_type": "execute_result" 109 | } 110 | ], 111 | "source": [ 112 | "b" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": { 119 | "collapsed": true 120 | }, 121 | "outputs": [], 122 | "source": [] 123 | } 124 | ], 125 | "metadata": { 126 | "kernelspec": { 127 | "display_name": "Python 3", 128 | "language": "python", 129 | "name": "python3" 130 | }, 131 | "language_info": { 132 | "codemirror_mode": { 133 | "name": "ipython", 134 | "version": 3 135 | }, 136 | "file_extension": ".py", 137 | "mimetype": "text/x-python", 138 | "name": "python", 139 | "nbconvert_exporter": "python", 140 | "pygments_lexer": "ipython3", 141 | "version": "3.6.1" 142 | } 143 | }, 144 | "nbformat": 4, 145 | "nbformat_minor": 2 146 | } 147 | 
-------------------------------------------------------------------------------- /Polynomial features.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 4, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "Size of train (2, 2)\n", 13 | "[[2 3]\n", 14 | " [2 4]]\n", 15 | "Size of train (2, 6)\n", 16 | "[[ 1. 2. 3. 4. 6. 9.]\n", 17 | " [ 1. 2. 4. 4. 8. 16.]]\n" 18 | ] 19 | }, 20 | { 21 | "data": { 22 | "text/html": [ 23 | "
\n", 24 | "\n", 37 | "\n", 38 | " \n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | "
1x0x1x0^2x0 x1x1^2
01.02.03.04.06.09.0
11.02.04.04.08.016.0
\n", 70 | "
" 71 | ], 72 | "text/plain": [ 73 | " 1 x0 x1 x0^2 x0 x1 x1^2\n", 74 | "0 1.0 2.0 3.0 4.0 6.0 9.0\n", 75 | "1 1.0 2.0 4.0 4.0 8.0 16.0" 76 | ] 77 | }, 78 | "execution_count": 4, 79 | "metadata": {}, 80 | "output_type": "execute_result" 81 | } 82 | ], 83 | "source": [ 84 | "import pandas as pd\n", 85 | "import numpy as np\n", 86 | "from sklearn.preprocessing import PolynomialFeatures\n", 87 | "pf = PolynomialFeatures(degree=2, include_bias=True)\n", 88 | "\n", 89 | "test = np.array([\n", 90 | " [2, 3],\n", 91 | " [2, 4]\n", 92 | "])\n", 93 | "\n", 94 | "pf.fit(test)\n", 95 | "print(\"Size of train\", test.shape)\n", 96 | "print(test)\n", 97 | "x = pf.transform(test)\n", 98 | "print(\"Size of train\", x.shape)\n", 99 | "print(x)\n", 100 | "\n", 101 | "df = pd.DataFrame(x)\n", 102 | "df.columns = pf.get_feature_names()\n", 103 | "df.head()" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": { 110 | "collapsed": true 111 | }, 112 | "outputs": [], 113 | "source": [] 114 | } 115 | ], 116 | "metadata": { 117 | "kernelspec": { 118 | "display_name": "Python 3", 119 | "language": "python", 120 | "name": "python3" 121 | }, 122 | "language_info": { 123 | "codemirror_mode": { 124 | "name": "ipython", 125 | "version": 3 126 | }, 127 | "file_extension": ".py", 128 | "mimetype": "text/x-python", 129 | "name": "python", 130 | "nbconvert_exporter": "python", 131 | "pygments_lexer": "ipython3", 132 | "version": "3.6.1" 133 | } 134 | }, 135 | "nbformat": 4, 136 | "nbformat_minor": 2 137 | } 138 | -------------------------------------------------------------------------------- /Memoization.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Memoization" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Sin memoización" 15 | ] 16 | }, 17 | { 18 | "cell_type": 
"code", 19 | "execution_count": 1, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "def fib(n):\n", 26 | " if n == 0 or n == 1:\n", 27 | " return 1\n", 28 | " return fib(n-1) + fib(n-2)" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 2, 34 | "metadata": {}, 35 | "outputs": [ 36 | { 37 | "name": "stdout", 38 | "output_type": "stream", 39 | "text": [ 40 | "14930352\n", 41 | "CPU times: user 5.62 s, sys: 30.4 ms, total: 5.65 s\n", 42 | "Wall time: 5.71 s\n" 43 | ] 44 | } 45 | ], 46 | "source": [ 47 | "%%time\n", 48 | "result = fib(35)\n", 49 | "print(result)" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "## Memoizado" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 3, 62 | "metadata": { 63 | "collapsed": true 64 | }, 65 | "outputs": [], 66 | "source": [ 67 | "# Casos base\n", 68 | "storage = dict([\n", 69 | " (0, 1), \n", 70 | " (1, 1)\n", 71 | "])\n", 72 | "\n", 73 | "def fib(n):\n", 74 | " if n in storage:\n", 75 | " return storage[n]\n", 76 | " storage[n] = fib(n-1) + fib(n-2)\n", 77 | " return storage[n]" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 4, 83 | "metadata": {}, 84 | "outputs": [ 85 | { 86 | "name": "stdout", 87 | "output_type": "stream", 88 | "text": [ 89 | "14930352\n", 90 | "CPU times: user 601 µs, sys: 328 µs, total: 929 µs\n", 91 | "Wall time: 712 µs\n" 92 | ] 93 | } 94 | ], 95 | "source": [ 96 | "%%time\n", 97 | "result = fib(35)\n", 98 | "print(result)" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 5, 104 | "metadata": {}, 105 | "outputs": [ 106 | { 107 | "name": "stdout", 108 | "output_type": "stream", 109 | "text": [ 110 | "20365011074\n", 111 | "CPU times: user 861 µs, sys: 825 µs, total: 1.69 ms\n", 112 | "Wall time: 1.56 ms\n" 113 | ] 114 | } 115 | ], 116 | "source": [ 117 | "%%time\n", 118 | "result = fib(50)\n", 119 | "print(result)" 120 | ] 
121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 6, 125 | "metadata": {}, 126 | "outputs": [ 127 | { 128 | "name": "stdout", 129 | "output_type": "stream", 130 | "text": [ 131 | "573147844013817084101\n", 132 | "CPU times: user 509 µs, sys: 402 µs, total: 911 µs\n", 133 | "Wall time: 617 µs\n" 134 | ] 135 | } 136 | ], 137 | "source": [ 138 | "%%time\n", 139 | "result = fib(100)\n", 140 | "print(result)" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": { 147 | "collapsed": true 148 | }, 149 | "outputs": [], 150 | "source": [] 151 | } 152 | ], 153 | "metadata": { 154 | "kernelspec": { 155 | "display_name": "Python 3", 156 | "language": "python", 157 | "name": "python3" 158 | }, 159 | "language_info": { 160 | "codemirror_mode": { 161 | "name": "ipython", 162 | "version": 3 163 | }, 164 | "file_extension": ".py", 165 | "mimetype": "text/x-python", 166 | "name": "python", 167 | "nbconvert_exporter": "python", 168 | "pygments_lexer": "ipython3", 169 | "version": "3.6.1" 170 | } 171 | }, 172 | "nbformat": 4, 173 | "nbformat_minor": 2 174 | } 175 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Created by https://www.gitignore.io/api/python,pycharm,jupyternotebook 3 | 4 | ### JupyterNotebook ### 5 | .ipynb_checkpoints 6 | */.ipynb_checkpoints/* 7 | 8 | # Remove previous ipynb_checkpoints 9 | # git rm -r .ipynb_checkpoints/ 10 | # 11 | ### PyCharm ### 12 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm 13 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 14 | 15 | # User-specific stuff: 16 | .idea/**/workspace.xml 17 | .idea/**/tasks.xml 18 | .idea/dictionaries 19 | 20 | # Sensitive or high-churn files: 21 | .idea/**/dataSources/ 22 | .idea/**/dataSources.ids 23 | 
.idea/**/dataSources.xml 24 | .idea/**/dataSources.local.xml 25 | .idea/**/sqlDataSources.xml 26 | .idea/**/dynamic.xml 27 | .idea/**/uiDesigner.xml 28 | 29 | # Gradle: 30 | .idea/**/gradle.xml 31 | .idea/**/libraries 32 | 33 | # CMake 34 | cmake-build-debug/ 35 | 36 | # Mongo Explorer plugin: 37 | .idea/**/mongoSettings.xml 38 | 39 | ## File-based project format: 40 | *.iws 41 | 42 | ## Plugin-specific files: 43 | 44 | # IntelliJ 45 | /out/ 46 | 47 | # mpeltonen/sbt-idea plugin 48 | .idea_modules/ 49 | 50 | # JIRA plugin 51 | atlassian-ide-plugin.xml 52 | 53 | # Cursive Clojure plugin 54 | .idea/replstate.xml 55 | 56 | # Ruby plugin and RubyMine 57 | /.rakeTasks 58 | 59 | # Crashlytics plugin (for Android Studio and IntelliJ) 60 | com_crashlytics_export_strings.xml 61 | crashlytics.properties 62 | crashlytics-build.properties 63 | fabric.properties 64 | 65 | ### PyCharm Patch ### 66 | # Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721 67 | 68 | # *.iml 69 | # modules.xml 70 | # .idea/misc.xml 71 | # *.ipr 72 | 73 | # Sonarlint plugin 74 | .idea/sonarlint 75 | 76 | ### Python ### 77 | # Byte-compiled / optimized / DLL files 78 | __pycache__/ 79 | *.py[cod] 80 | *$py.class 81 | 82 | # C extensions 83 | *.so 84 | 85 | # Distribution / packaging 86 | .Python 87 | build/ 88 | develop-eggs/ 89 | dist/ 90 | downloads/ 91 | eggs/ 92 | .eggs/ 93 | lib/ 94 | lib64/ 95 | parts/ 96 | sdist/ 97 | var/ 98 | wheels/ 99 | *.egg-info/ 100 | .installed.cfg 101 | *.egg 102 | 103 | # PyInstaller 104 | # Usually these files are written by a python script from a template 105 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
106 | *.manifest 107 | *.spec 108 | 109 | # Installer logs 110 | pip-log.txt 111 | pip-delete-this-directory.txt 112 | 113 | # Unit test / coverage reports 114 | htmlcov/ 115 | .tox/ 116 | .coverage 117 | .coverage.* 118 | .cache 119 | nosetests.xml 120 | coverage.xml 121 | *.cover 122 | .hypothesis/ 123 | 124 | # Translations 125 | *.mo 126 | *.pot 127 | 128 | # Django stuff: 129 | *.log 130 | local_settings.py 131 | 132 | # Flask stuff: 133 | instance/ 134 | .webassets-cache 135 | 136 | # Scrapy stuff: 137 | .scrapy 138 | 139 | # Sphinx documentation 140 | docs/_build/ 141 | 142 | # PyBuilder 143 | target/ 144 | 145 | # Jupyter Notebook 146 | 147 | # pyenv 148 | .python-version 149 | 150 | # celery beat schedule file 151 | celerybeat-schedule 152 | 153 | # SageMath parsed files 154 | *.sage.py 155 | 156 | # Environments 157 | .env 158 | .venv 159 | env/ 160 | venv/ 161 | ENV/ 162 | env.bak/ 163 | venv.bak/ 164 | 165 | # Spyder project settings 166 | .spyderproject 167 | .spyproject 168 | 169 | # Rope project settings 170 | .ropeproject 171 | 172 | # mkdocs documentation 173 | /site 174 | 175 | # mypy 176 | .mypy_cache/ 177 | ### macOS ### 178 | *.DS_Store 179 | .AppleDouble 180 | .LSOverride 181 | 182 | # Icon must end with two \r 183 | Icon 184 | 185 | # Thumbnails 186 | ._* 187 | 188 | # Files that might appear in the root of a volume 189 | .DocumentRevisions-V100 190 | .fseventsd 191 | .Spotlight-V100 192 | .TemporaryItems 193 | .Trashes 194 | .VolumeIcon.icns 195 | .com.apple.timemachine.donotpresent 196 | 197 | # Directories potentially created on remote AFP share 198 | .AppleDB 199 | .AppleDesktop 200 | Network Trash Folder 201 | Temporary Items 202 | .apdisk 203 | 204 | ### Windows ### 205 | # Windows thumbnail cache files 206 | Thumbs.db 207 | ehthumbs.db 208 | ehthumbs_vista.db 209 | 210 | # Folder config file 211 | Desktop.ini 212 | 213 | # Recycle Bin used on file shares 214 | $RECYCLE.BIN/ 215 | 216 | # Windows Installer files 217 | *.cab 218 | *.msi 
219 | *.msm 220 | *.msp 221 | 222 | # Windows shortcuts 223 | *.lnk 224 | 225 | # End of https://www.gitignore.io/api/macos,windows 226 | 227 | # Captions files 228 | youtube-captions/* 229 | casey-neistat-analisys/* 230 | mt-scraper/* 231 | partiallyd-scrape/* 232 | tloz-scrape/* 233 | /.metadata/ 234 | -------------------------------------------------------------------------------- /Nato/characters.csv: -------------------------------------------------------------------------------- 1 | Character,Code word,ICAO 2008 IPA convention[5],ICAO recording (1955)[12],Wikipedia IPA and respelling,ICAO 2008 respelling [5],ITU-R 2007 (WRC-07) respelling[7],IMO respelling[citation needed],FAA respelling[8][9],SIA[13] (France aeronautical),NATO & U.S. Army respelling[14] 2 | A,Alfa,ˈælfɑ,[ˈælfʌ],/ˈælfɑː/ AL-fah,AL FAH,AL FAH,AL FAH,ALFAH or AL-FAH,al fah,AL fah 3 | B,Bravo,ˈbrɑːˈvo,[brɑˈvoʊ],/ˌbrɑːˈvoʊ/ BRAH-VOH,BRAH VOH,BRAH VOH,BRAH VOH (1955: BRAH VOH),BRAHVOH or BRAH-VO,bra vo,BRAH voh 4 | C,Charlie,ˈtʃɑːli  or ˈʃɑːli,"[ˈtʃɑ˞li], [ˈʃɑ˞li]",/ˈtʃɑːrliː/ CHAR-lee or /ˈʃɑːrliː/ SHAR-lee,CHAR LEE or SHAR LEE,CHAR LEE or SHAR LEE,CHAR LEE,CHARLEE or CHAR-LEE,"tchah li, char li",CHAR lee 5 | D,Delta,ˈdeltɑ,[ˈdɛltʌ],/ˈdɛltɑː/ DEL-tah,DELL TAH,DELL TAH,DELL TAH,DELLTAH or DELL-TAH,del tah,DEL tah 6 | E,Echo,ˈeko,[ˈɛkoʊ],/ˈɛkoʊ/,ECK OH,ECK OH,ECK OH,ECKOH or ECK-OH,èk o,EKK oh 7 | F,Foxtrot,ˈfɔkstrɔt,[ˈfɑkstrɑt],/ˈfɒkstrɒt/ FOKS-trot,FOKS TROT,FOKS TROT,FOKS TROT,FOKSTROT or FOKS-TROT,fox trott,FOKS trot 8 | G,Golf,ɡʌlf [sic],[ˈɡʌl(f)],/ˈɡɒlf/ GOLF,GOLF,GOLF,GOLF,GOLF,golf,Golf 9 | H,Hotel,hoːˈtel,[hoʊˈtɛl],/hoʊˈtɛl/ hoh-TEL,HOH TELL,HOH TELL,HOH TELL,HOHTELL or HOH-TELL,ho tèll,HO tell 10 | I,India,ˈindiˑɑ,[ˈɪndi.ʌ],/ˈɪndiːɑː/ IN-dee-ah,IN DEE AH,IN DEE AH,IN DEE AH,INDEE AH or IN-DEE-AH,in di ah,IN dee ah 11 | J,Juliett,ˈdʒuːliˑˈet,[ˌdʒuliˈɛt],/ˈdʒuːliːɛt/ JEW-lee-et or /ˌdʒuːliːˈɛt/ JEW-lee-ET,JEW LEE ETT,JEW LEE ETT,JEW LEE ETT,JEWLEE ETT or JEW-LEE-ETT,djou li ètt,JEW lee ett 
12 | K,Kilo,ˈkiːlo,[ˈkiloʊ],/ˈkiːloʊ/ KEE-loh,KEY LOH,KEY LOH,KEY LOH,KEYLOH or KEY-LOH,ki lo,KEY loh 13 | L,Lima,ˈliːmɑ,[ˈlimʌ],/ˈliːmɑː/ LEE-mah,LEE MAH,LEE MAH,LEE MAH,LEEMAH or LEE-MAH,li mah,LEE mah 14 | M,Mike,mɑik,[ˈmʌɪk],/ˈmaɪk/ MYK,MIKE,MIKE,MIKE,MIKE,maïk,Mike 15 | N,November,noˈvembə,[noʊˈvɛmbɹ̩],/noʊˈvɛmbər/ noh-VEM-bər[15],NO VEM BER,NO VEM BER,NO VEM BER,NOVEMBER or NO-VEM-BER,no vèmm ber,NOH vem ber 16 | O,Oscar,ˈɔskɑ,[ˈɑskɹ̩],/ˈɒskɑː/ OS-kah,OSS CAH,OSS CAH,OSS CAH,OSS-SCAR or OSS-CAR,oss kar,OSS car 17 | P,Papa,pəˈpɑ,[pəˈpɑ],/pɑːˈpɑː/ pah-PAH,PAH PAH,PAH PAH,PAH PAH,PAHPAH or PAH-PAH,pah pah,PAH pah 18 | Q,Quebec,keˈbek,[kɛˈbɛk],/kɛˈbɛk/ ke-BEK,KEH BECK,KEH BECK,KEH BECK,KEHBECK or KWUH-BECK,ké bèk,keh BECK 19 | R,Romeo,ˈroːmiˑo,[ˈɹoʊmi.oʊ],/ˈroʊmiːoʊ/ ROH-mee-oh,ROW ME OH,ROW ME OH,ROW ME OH,ROWME OH or ROW-ME-OH,ro mi o,ROW me oh 20 | S,Sierra,siˈerɑ,[siˈɛɾʌ],/siːˈɛrɑː/ see-ERR-ah,SEE AIR RAH,SEE AIR RAH,SEE AIR RAH,SEEAIRAH or SEE-AIR-AH,si èr rah,see AIR ah 21 | T,Tango,ˈtænɡo,[ˈtæŋɡoʊ],/ˈtæŋɡoʊ/ TANG-goh,TANG OH,TANG GO,TANG GO,TANGGO or TANG-GO,tang go,TANG go 22 | U,Uniform,ˈjuːnifɔːm  or ˈuːnifɔrm,"[ˈjunɪ̈fɔ˞m], [ˈunɪ̈fɔ˞m]",/ˈjuːniːfɔːrm/ EW-nee-form or /ˈuːniːfɔːrm/ OO-nee-form,YOU NEE FORM or OO NEE FORM,YOU NEE FORM or OO NEE FORM,YOU NEE FORM or OO NEE FORM,YOUNEE FORM or YOU-NEE-FORM or OO-NEE-FORM,"you ni form, ou ni form",YOU nee form 23 | V,Victor,ˈviktɑ,[ˈvɪktəɹ],/ˈvɪktɑː/ VIK-tah,VIK TAH,VIK TAH,VIK TAH,VIKTAH or VIK-TAR,vik tar,VIK ter 24 | W,Whiskey,ˈwiski,[ˈwɪski],/ˈwɪskiː/ WIS-kee,WISS KEY,WISS KEY,WISS KEY,WISSKEY or WISS-KEY,ouiss ki,WISS key 25 | X,X-ray or Xray,ˈeksˈrei,[ˈɛksɹeɪ],/ˈɛksreɪ/ EKS-ray or /ˌɛksˈreɪ/ EKS-RAY,ECKS RAY,ECKS RAY,ECKS RAY,ECKSRAY [sic] or ECKS-RAY,èkss ré,EKS ray 26 | Y,Yankee,ˈjænki,[ˈjæŋki],/ˈjæŋkiː/ YANG-kee,YANG KEY,YANG KEY,YANG KEY,YANGKEY [sic] or YANG-KEY,yang ki,YANG kee 27 | Z,Zulu,ˈzuːluː,[ˈzulu],/ˈzuːluː/ ZOO-loo,ZOO LOO,ZOO LOO,ZOO LOO,ZOOLOO or ZOO-LOO,zou lou,ZOO luu 28 | 
0,Zero,,,/ˈziːroʊ/ ZEE-roh /ˌnɑːˌdɑːˌzeɪˈroʊ/ NAH-DAH-ZAY-ROH,ZE-RO,NAH-DAH-ZAY-ROH,NADAZERO,Zero,zi ro,ZE-RO 29 | 1,One,,,/ˈwʌn/ WUN /ˌuːˌnɑːˈwʌn/ OO-NAH-WUN,WUN,OO-NAH-WUN,UNAONE,One,ouann,WUN; Won (USMC)[16] 30 | 2,Two,,,/ˈtuː/ TOO /ˌbiːˌsoʊˈtuː/ BEE-SOH-TOO,TOO,BEES-SOH-TOO,BISSOTWO,Two,tou,TOO 31 | 3,Three,,,/ˈtriː/ TREE /ˌteɪˌrɑːˈtriː/ TAY-RAH-TREE,TREE,TAY-RAH-TREE,TERRATHREE,Three,tri,TREE 32 | 4,Four,,,/ˈfoʊ.ər/ FOH-ər /ˌkɑːrˌteɪˈfoʊ.ər/ KAR-TAY-FOH-ər,FOW-er,KAR-TAY-FOWER,KARTEFOUR,Four,fo eur,FOW-ER 33 | 5,Five,,,/ˈfaɪf/ FYF[17] /ˌpænˌtɑːˈfaɪv/ PAN-TAH-FYV,FIFE,PAN-TAH-FIVE,PANTAFIVE,Five,fa ïf,FIFE 34 | 6,Six,,,/ˈsɪks/ SIKS /ˌsɔːkˌsiːˈsɪks/ SOK-SEE-SIKS,SIX,SOK-SEE-SIX,SOXISIX,Six,siks,SIX 35 | 7,Seven,,,/ˈsɛvɛn/ SEV-en /ˌseɪˌteɪˈsɛvɛn/ SAY-TAY-SEV-en,SEV-en,SAY-TAY-SEVEN,SETTESEVEN,Seven,sèv n,SEV-EN 36 | 8,Eight,,,/ˈeɪt/ AYT /ˌɔːkˌtoʊˈeɪt/ OK-TOH-AYT,AIT,OK-TOH-AIT,OKTOEIGHT,Eight,eït,AIT 37 | 9,Nine,,,/ˈnaɪnər/ NY-nər[18] /ˌnɔːvˌeɪˈnaɪnər/ NOV-AY-NY-nər,NIN-er,NO-VAY-NINER,NOVENINE,Niner,naï neu,NIN-ER 38 | . (decimal point),Decimal point,,,/ˌdeɪˌsiːˈmæl/ DAY-SEE-MAL,DAY-SEE-MAL,,,POINT,dè si mal,DAY-SEE-MAL (ITU) 39 | 100,Hundred,,,/ˈhʌndrɛd/ HUN-dred,HUN-dred,,,,hun-dred, 40 | 1000,Thousand,,,/ˌtaʊˈsænd/ TOW-ZEND[19],TOU-SAND,,,,taou zend,TOU-SAND 41 | - (hyphen),Dash,,,/ˈdæʃ/ DASH,,,imo,faa,, 42 | . 
(full stop),Period,,,/ˈstɒp/ STOP,,,,,,STOP (ITU) 43 | -------------------------------------------------------------------------------- /partiallyd-scrape.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import os\n", 12 | "import requests\n", 13 | "import json\n", 14 | "import re\n", 15 | "import pandas as pd\n", 16 | "import matplotlib.pyplot as plt\n", 17 | "from bs4 import BeautifulSoup\n", 18 | "\n", 19 | "base_dir = \"partiallyd-scrape\"\n", 20 | "if not os.path.exists(base_dir):\n", 21 | " os.makedirs(base_dir)" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": { 28 | "collapsed": true 29 | }, 30 | "outputs": [], 31 | "source": [ 32 | "url = \"http://partiallyderivative.com/podcast/\"\n", 33 | "podcast_page = BeautifulSoup(requests.get(url).text, \"lxml\")" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": { 40 | "collapsed": true 41 | }, 42 | "outputs": [], 43 | "source": [ 44 | "raw_urls = []\n", 45 | "raw_titles = []\n", 46 | "raw_dates = []\n", 47 | "\n", 48 | "post_list = podcast_page.find('ul', {'class':'post-list'})\n", 49 | "for li in post_list.findAll('li'):\n", 50 | " link = li.find('a')\n", 51 | " span = li.find('span')\n", 52 | " raw_dates.append(span.text)\n", 53 | " raw_urls.append(link['href'])\n", 54 | " name = li.text[:-len(span.text)]\n", 55 | " raw_titles.append(name)" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "links = { }\n", 65 | "for raw_url in raw_urls:\n", 66 | " podcast_article = BeautifulSoup(requests.get(raw_url).text, \"lxml\")\n", 67 | " ol = podcast_article.find('ol')\n", 68 | " links[raw_url] = []\n", 69 | " if ol is None:\n", 70 | " 
continue\n", 71 | " for li in ol.findAll('li'):\n", 72 | " a = li.find('a')\n", 73 | " try:\n", 74 | " links[raw_url].append([a.text, a['href']])\n", 75 | " except:\n", 76 | " print(raw_url)" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "reg = re.compile('([0-9]{4})/([0-9]{2})/([0-9]{2})/([a-zA-Z0-9-_]+)')\n", 86 | "dates = []\n", 87 | "slug = []\n", 88 | "count = []\n", 89 | "\n", 90 | "for u in raw_urls:\n", 91 | " match = reg.search(u)\n", 92 | " if match:\n", 93 | " day = int(match.group(3))\n", 94 | " month = int(match.group(2))\n", 95 | " year = int(match.group(1))\n", 96 | " dt_str = \"%04d-%02d-%02d\" % (year,month,day)\n", 97 | " try:\n", 98 | " slug.append(match.group(4))\n", 99 | " dates.append(pd.to_datetime(dt_str))\n", 100 | " count.append(len(links[u]))\n", 101 | " except:\n", 102 | " print(dt_str)\n", 103 | "podcasts_df = pd.DataFrame({\n", 104 | " 'id': slug,\n", 105 | " 'date': dates,\n", 106 | " 'count': count\n", 107 | "})\n", 108 | "\n", 109 | "podcasts_df = podcasts_df.set_index('date')\n", 110 | "\n", 111 | "print(podcasts_df.head())\n", 112 | "podcasts_df.to_csv(\"partiallyd-scrape/podcasts.csv\")" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "podcasts_df.plot(figsize=(15,4))\n", 122 | "plt.show()" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "podcasts_w_links = podcasts_df[podcasts_df['count'] != 0]\n", 132 | "podcasts_w_links.plot(figsize=(15,4))\n", 133 | "plt.show()" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "podcasts_w_links.describe()" 143 | ] 144 | } 145 | ], 146 | "metadata": { 147 | "kernelspec": { 148 | "display_name": 
"Python 3", 149 | "language": "python", 150 | "name": "python3" 151 | }, 152 | "language_info": { 153 | "codemirror_mode": { 154 | "name": "ipython", 155 | "version": 3 156 | }, 157 | "file_extension": ".py", 158 | "mimetype": "text/x-python", 159 | "name": "python", 160 | "nbconvert_exporter": "python", 161 | "pygments_lexer": "ipython3", 162 | "version": "3.6.1" 163 | } 164 | }, 165 | "nbformat": 4, 166 | "nbformat_minor": 2 167 | } 168 | -------------------------------------------------------------------------------- /youtube-captions-2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import os\n", 12 | "from os.path import join\n", 13 | "import requests\n", 14 | "import json\n", 15 | "import urllib\n", 16 | "import pandas as pd\n", 17 | "from bs4 import BeautifulSoup\n", 18 | "from urllib.parse import urlencode\n", 19 | "from slugify import slugify\n", 20 | "import nltk\n", 21 | "from nltk.tokenize import word_tokenize, sent_tokenize\n", 22 | "from pytube import YouTube" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": { 29 | "collapsed": true 30 | }, 31 | "outputs": [], 32 | "source": [ 33 | "song = \"\"\"Ella existio solo en un sueño \n", 34 | "el es un poema que el poeta \n", 35 | "nunca escribió \n", 36 | "y en la eternidad los dos \n", 37 | "unieron sus almas para darle vida \n", 38 | "\"\"\"" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": { 45 | "collapsed": true 46 | }, 47 | "outputs": [], 48 | "source": [ 49 | "from nltk.corpus import stopwords\n", 50 | "from string import punctuation\n", 51 | "spanish_stopwords = set(list(punctuation)) " 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 
59 | "source": [ 60 | "# Cleaning song\n", 61 | "song_wo_stopwords = [word.lower() for word in word_tokenize(song) if word.lower() not in spanish_stopwords]\n", 62 | "print(song_wo_stopwords)" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": { 69 | "collapsed": true 70 | }, 71 | "outputs": [], 72 | "source": [ 73 | "# Load words:\n", 74 | "candidates = {}\n", 75 | "\n", 76 | "for w in song_wo_stopwords:\n", 77 | " candidates[w] = []\n", 78 | " \n", 79 | "directory = 'youtube-captions/captions'\n", 80 | "for file in os.listdir(directory):\n", 81 | " if file.endswith(\"json\"):\n", 82 | " with open(join(directory, file), 'r') as captions_file:\n", 83 | " video = json.load(captions_file)\n", 84 | " captions = video['captions_parsed']\n", 85 | " for caption in captions:\n", 86 | " if caption['content'] == None:\n", 87 | " continue\n", 88 | " tokenized = word_tokenize(caption['content'])\n", 89 | " for w1 in tokenized:\n", 90 | " if w1.lower() in candidates:\n", 91 | " caption['id'] = video['id']\n", 92 | " caption['count'] = len(tokenized)\n", 93 | " candidates[w1.lower()].append(caption)" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "complete_df = pd.read_csv(\"youtube-captions/complete.csv\", index_col=0,parse_dates=['published_at'])\n", 103 | "print(complete_df.info())\n", 104 | "complete_df.head()" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": { 111 | "collapsed": true 112 | }, 113 | "outputs": [], 114 | "source": [ 115 | "def start_to_time(s: float):\n", 116 | " seconds = float(s) / 60\n", 117 | " ceil_seconds = math.floor(seconds)\n", 118 | " minutes = ceil_seconds\n", 119 | " seconds = round((seconds - ceil_seconds) * 60)\n", 120 | " return str(minutes) +\"m\" + str(seconds)+ \"s\"" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 
126 | "metadata": { 127 | "scrolled": false 128 | }, 129 | "outputs": [], 130 | "source": [ 131 | "import math\n", 132 | "\n", 133 | "video_url = \"http://youtube.com/watch?v=%s&t=%s\"\n", 134 | "for candidate_key in candidates:\n", 135 | " print(candidate_key)\n", 136 | " if len(candidates[candidate_key]) == 0:\n", 137 | " continue\n", 138 | " candidates[candidate_key].sort(key=lambda x: x['count'], reverse=True)\n", 139 | " for c in candidates[candidate_key][:4]:\n", 140 | " tokenized = word_tokenize(c['content'])\n", 141 | " word_count = len(tokenized)\n", 142 | " duration, start = float(c['duration']), float(c['start'])\n", 143 | " word_duration = word_count / duration\n", 144 | " word_location = tokenized.index(candidate_key)\n", 145 | " tentative_word_start = (start + word_location * word_duration) - 1\n", 146 | " print(video_url % (c['id'], start_to_time(tentative_word_start)))\n", 147 | "# print(word_duration)\n", 148 | "# print(video_url % (c['id'], start_to_time(text['start'])))" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "metadata": { 155 | "collapsed": true 156 | }, 157 | "outputs": [], 158 | "source": [] 159 | } 160 | ], 161 | "metadata": { 162 | "kernelspec": { 163 | "display_name": "Python 3", 164 | "language": "python", 165 | "name": "python3" 166 | }, 167 | "language_info": { 168 | "codemirror_mode": { 169 | "name": "ipython", 170 | "version": 3 171 | }, 172 | "file_extension": ".py", 173 | "mimetype": "text/x-python", 174 | "name": "python", 175 | "nbconvert_exporter": "python", 176 | "pygments_lexer": "ipython3", 177 | "version": "3.6.1" 178 | } 179 | }, 180 | "nbformat": 4, 181 | "nbformat_minor": 2 182 | } 183 | -------------------------------------------------------------------------------- /python.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Python.\n", 8 | 
"\n", 9 | "Como muchos de ustedes ya sabrán, Python es un lenguaje de programación, que como muy pocos seguramente saben, tomó su nombre no de una serpiente, sino de un programa de comedia británico, pero en fin. Python fue publicado en 1991 por Guido van Rossum, inicialmente fue pensado como un simple lenguaje de scripting pero en la actualidad se ha infiltrado en el desarrollo web, la ciencia de datos, machine learning y ramas afines.\n", 10 | "\n", 11 | "## Filosofía. \n", 12 | "\n", 13 | "La filosofía detrás de Python podría estar resumida en un documento que fue creado en 1999 llamado *The Zen of Python*, ocho años después de su creación. Pueden consultar el documento en este enlace: https://peps.python.org/pep-0020/, pero les voy a decir algunos de estos principios que sí, suenan muy filosóficos: \n", 14 | "\n", 15 | " - Beautiful is better than ugly\n", 16 | " - Explicit is better than implicit\n", 17 | " - Simple is better than complex\n", 18 | " - Readability counts\n", 19 | " - There should be one—and preferably only one—obvious way to do it.\n", 20 | " - If the implementation is hard to explain, it's a bad idea.\n", 21 | "\n", 22 | "Lo cierto es que mientras que estos principios suenan bonitos, el escribir software todavía recae en los humanos, así que estos principios no se aplican muchas veces. Y, por ejemplo, puedes encontrar que en Python es normal que encuentres más de una manera de hacer las cosas. \n", 23 | "\n", 24 | "## Características\n", 25 | "\n", 26 | "**Es dinámicamente tipado**: Porque podemos hacer algo como esto: " 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 1, 32 | "metadata": { 33 | "collapsed": true 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "a = 1\n", 38 | "b = 'C'\n", 39 | "c = [0.1, 0.5]" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "Es decir, no es necesario especificar el tipo de dato de una variable antes de declararla. 
Y no existe un compilador, ni el intérprete, que esté comprobando esto antes de que el programa se esté ejecutando. \n", 47 | "\n", 48 | "También permite algo como esto:" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 2, 54 | "metadata": { 55 | "collapsed": true 56 | }, 57 | "outputs": [], 58 | "source": [ 59 | "a = 1\n", 60 | "a = 'C'\n", 61 | "a = [0.1, 0.5]" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": {}, 67 | "source": [ 68 | "Es decir, cambiar por completo el tipo de dato de una variable sin que nadie diga nada. Y créanme, esto puede ser motivo de muchas confusiones, pero una vez que te acostumbras, puede llegar a ser una herramienta muy útil. \n", 69 | "\n", 70 | "Sin embargo, también es considerado un lenguaje **fuertemente tipado** (cabe recalcar que puede existir esta combinación: dinámico y fuertemente tipado a la vez). Es considerado fuertemente tipado porque el lenguaje define un conjunto de reglas (de comportamientos) bajo las cuales los tipos de dato se pueden mezclar entre ellos, y romper esas reglas generará una excepción. 
Toma por ejemplo el siguiente código:" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 3, 76 | "metadata": {}, 77 | "outputs": [ 78 | { 79 | "ename": "TypeError", 80 | "evalue": "must be str, not int", 81 | "output_type": "error", 82 | "traceback": [ 83 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", 84 | "\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)", 85 | "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0ma3\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;34m\"a\"\u001b[0m \u001b[1;33m+\u001b[0m \u001b[1;36m3\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", 86 | "\u001b[1;31mTypeError\u001b[0m: must be str, not int" 87 | ] 88 | } 89 | ], 90 | "source": [ 91 | "a3 = \"a\" + 3 " 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "metadata": {}, 97 | "source": [ 98 | "por increíble que parezca, esto nos generaría un error puesto que los tipos de dato int y string no definen una forma de mezclarse, si quieres concatenar las cadenas tendrías que primero convertir el entero a cadena.\n", 99 | "\n", 100 | "**No existen los corchetes (o llaves)**: sino que los bloques de código se definen usando indentaciones (tabs o espacios, lo que importa es que seas consistente en el método de indentación que usas), es decir un bloque `if` se define de la siguiente manera: " 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 4, 106 | "metadata": {}, 107 | "outputs": [ 108 | { 109 | "name": "stdout", 110 | "output_type": "stream", 111 | "text": [ 112 | "b es C\n" 113 | ] 114 | } 115 | ], 116 | "source": [ 117 | "if b == 'C':\n", 118 | " print(\"b es C\")\n", 119 | "elif b == 'A':\n", 120 | " print(\"b es A\")" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "O un código un poco más elaborado se vería así: " 128 | ] 129 | }, 130 | { 131 | "cell_type": 
"code", 132 | "execution_count": null, 133 | "metadata": { 134 | "collapsed": true 135 | }, 136 | "outputs": [], 137 | "source": [ 138 | "def del_none(d):\n", 139 | " for key, value in list(d.items()):\n", 140 | " if value is None:\n", 141 | " del d[key]\n", 142 | " elif isinstance(value, str):\n", 143 | " d[key] = d[key].strip()\n", 144 | " elif isinstance(value, dict):\n", 145 | " del_none(value)\n", 146 | " return d" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": {}, 152 | "source": [ 153 | "Ah, seguramente lo notaste, pero Python tampoco requiere que uses un `;` para terminar cada instrucción, la idea es que exista una instrucción por cada línea.\n", 154 | "\n", 155 | "Como tal vez pudiste ver, es también un **lenguaje de alto nivel**: La idea es abstraer (esconder) la mayor cantidad de detalles de implementación. Es un lenguaje de alto nivel y en ocasiones es muy sencillo leer programas escritos en este lenguaje, y a mi parecer, en muchos casos como si estuvieras leyendo un libro escrito en inglés. \n", 156 | "\n", 157 | "Python es también **multiparadigma**, puedes organizar tu código en clases, o utilizarlo como un lenguaje funcional, o puedes simplemente crear un programa que se ejecute proceduralmente... o una combinación de todo esto. 
\n", 158 | "\n", 159 | "**Altamente extensible**: tiene soporte para descargar módulos o bibliotecas de repositorios de paquetes que permiten añadirle funcionalidad a tus programas, así que es normal que cuando descargues un proyecto tengas que descargar los paquetes asociados con instrucciones como las siguientes: \n", 160 | "\n", 161 | "```\n", 162 | "pip install package-name\n", 163 | "easy_install package-name\n", 164 | "``` \n", 165 | "\n", 166 | "Cuenta con una **consola interactiva** o REPL\n", 167 | "\n", 168 | "Es **multiplataforma** y no está fuertemente ligado a un sistema operativo ni a un entorno de desarrollo.\n", 169 | " \n", 170 | "## Desventajas \n", 171 | "\n", 172 | "- Considerado **lento**\n", 173 | "- A pesar de ser muy usado, hay áreas en las que no tiene mucho impacto, como el desarrollo para móviles \n", 174 | "- Consume mucha memoria y facilita la escritura de código que, aunque funciona, no está muy optimizado \n", 175 | "- Puede hacer que otros lenguajes sean difíciles de trabajar: uno se acostumbra muy rápido a las bondades de Python; a mí de pronto ya se me olvida poner puntos y coma en C# \n", 176 | "\n", 177 | "## Razones para aprender \n", 178 | "\n", 179 | " - Quieres desarrollar aplicaciones web\n", 180 | " - Te interesa automatizar tareas repetitivas\n", 181 | " - Quieres analizar datos\n", 182 | " - Es entretenido\n", 183 | " - Una herramienta más en tus habilidades \n", 184 | " \n", 185 | "## IDEs \n", 186 | "\n", 187 | " - [Thonny](http://thonny.org/)\n", 188 | " - [PyCharm](https://www.jetbrains.com/pycharm/)\n", 189 | " - [PyScripter](https://github.com/pyscripter/pyscripter)\n", 190 | " - [Visual Studio](https://www.visualstudio.com/es/vs/python/) y [Visual Studio Code](https://code.visualstudio.com/docs/languages/python) (con plugins)\n", 191 | " - [PyDev (Eclipse)](http://www.pydev.org/)" 192 | ] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "metadata": { 197 | "collapsed": true 198 | }, 199 | "source": [ 200 | 
"## Recursos para aprender\n", 201 | "\n", 202 | " - [LearnPython.org (en español)](https://www.learnpython.org/es/)\n", 203 | " - [\"Python instantáneo\"](http://rapto.arrakis.es/AprendaPython.html)\n", 204 | " - [Python in one pic](https://github.com/coodict/python3-in-one-pic/blob/master/notebooks/py3-in-one-pic.ipynb) and [Python in one pic (interactive)](http://coodict.github.io/python3-in-one-pic/)\n", 205 | " - [A Byte of Python](https://python.swaroopch.com/)\n", 206 | " - [Introduction to Programming with Python](https://mva.microsoft.com/en-US/training-courses/introduction-to-programming-with-python-8360)\n", 207 | " - [Automate the Boring Stuff with Python](http://automatetheboringstuff.com/)" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": { 214 | "collapsed": true 215 | }, 216 | "outputs": [], 217 | "source": [] 218 | } 219 | ], 220 | "metadata": { 221 | "kernelspec": { 222 | "display_name": "Python 3", 223 | "language": "python", 224 | "name": "python3" 225 | }, 226 | "language_info": { 227 | "codemirror_mode": { 228 | "name": "ipython", 229 | "version": 3 230 | }, 231 | "file_extension": ".py", 232 | "mimetype": "text/x-python", 233 | "name": "python", 234 | "nbconvert_exporter": "python", 235 | "pygments_lexer": "ipython3", 236 | "version": "3.6.2" 237 | } 238 | }, 239 | "nbformat": 4, 240 | "nbformat_minor": 2 241 | } 242 | -------------------------------------------------------------------------------- /casey-neistat-analisys.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import matplotlib.pyplot as plt\n", 12 | "import pandas as pd\n", 13 | "import requests\n", 14 | "import json\n", 15 | "import urllib\n", 16 | "import isodate\n", 17 | "import os\n", 18 | "from bs4 import 
BeautifulSoup\n", 19 | "from urllib.parse import urlencode\n", 20 | "from slugify import slugify\n", 21 | "from pytube import YouTube\n", 22 | "\n", 23 | "if not os.path.exists(\"casey-neistat-analisys\"):\n", 24 | " os.makedirs(\"casey-neistat-analisys\")" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "api_key = \"\" # Place your YT api key here\n", 34 | "assert api_key != \"\"\n", 35 | "channel_id = 'UCtinbF-Q-fVthA0qrFQTgXQ'\n", 36 | "\n", 37 | "playlists_parameters = {\n", 38 | " 'part': 'contentDetails',\n", 39 | " 'id': channel_id,\n", 40 | " 'key': api_key\n", 41 | "}\n", 42 | "\n", 43 | "categories_parameters = {\n", 44 | " 'part': 'snippet',\n", 45 | " 'regionCode': 'US',\n", 46 | " 'key': api_key\n", 47 | "}\n", 48 | "\n", 49 | "parameters = {\n", 50 | " 'key': api_key,\n", 51 | " 'part': 'snippet',\n", 52 | " 'type': 'video',\n", 53 | " 'channelId': channel_id,\n", 54 | " 'maxResults': 50,\n", 55 | " 'order': 'date'\n", 56 | "}\n", 57 | "max_pages = 100\n", 58 | "query_string = urlencode(parameters)" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": { 65 | "collapsed": true 66 | }, 67 | "outputs": [], 68 | "source": [ 69 | "get_categories_url = \"https://www.googleapis.com/youtube/v3/videoCategories?\" + urlencode(categories_parameters)\n", 70 | "r = requests.get(get_categories_url)\n", 71 | "result = json.loads(r.text)\n", 72 | "categoryId = []\n", 73 | "categoryNames = []\n", 74 | "for category in result['items']:\n", 75 | " categoryId.append(int(category['id']))\n", 76 | " categoryNames.append(category['snippet']['title'])\n", 77 | "categories_df = pd.DataFrame({'category': categoryId, 'name': categoryNames})\n", 78 | "categories_df.head()\n", 79 | "categories_df.to_csv(\"casey-neistat-analisys/categories_US.csv\", encoding='utf-8')" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 
| "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "get_playlists_url = \"https://www.googleapis.com/youtube/v3/channels?\" + urlencode(playlists_parameters)\n", 89 | "r = requests.get(get_playlists_url)\n", 90 | "result = json.loads(r.text)\n", 91 | "\n", 92 | "playlist_id = result['items'][0]['contentDetails']['relatedPlaylists']['uploads']\n", 93 | "print(playlist_id)" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "count = 0\n", 103 | "videos = []\n", 104 | "search_url = \"https://www.googleapis.com/youtube/v3/playlistItems?\"\n", 105 | "parameters['playlistId'] = playlist_id\n", 106 | "query_string = urlencode(parameters)\n", 107 | "pages = max_pages\n", 108 | "page_token = 'FIRST TIME!'\n", 109 | "while pages > 0 and len(page_token) > 0:\n", 110 | " qurl = search_url + query_string\n", 111 | " r = requests.get(search_url + query_string)\n", 112 | " result = json.loads(r.text)\n", 113 | " try:\n", 114 | " page_token = result[\"nextPageToken\"]\n", 115 | " except:\n", 116 | " page_token = ''\n", 117 | " parameters['pageToken'] = page_token\n", 118 | " pages = pages - 1\n", 119 | " videos.extend(result['items'])\n", 120 | " count += len(result['items'])\n", 121 | " query_string = urlencode(parameters)\n", 122 | "print(\"Done, found\", count)" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "# Conversion to dataframes\n", 132 | "ids = []\n", 133 | "pub = []\n", 134 | "titles = []\n", 135 | "for v in videos:\n", 136 | " videoId = v['snippet']['resourceId']['videoId']\n", 137 | " #print(json.dumps(v))\n", 138 | " publishedDate = v['snippet']['publishedAt']\n", 139 | " title = v['snippet']['title']\n", 140 | " ids.append(videoId)\n", 141 | " pub.append(publishedDate)\n", 142 | " titles.append(title)\n", 143 | "initial_df = pd.DataFrame({\n", 144 | " 'id': 
ids,\n", 145 | " 'published_at': pub,\n", 146 | " 'title': titles\n", 147 | "})\n", 148 | "initial_df['published_at'] = pd.to_datetime(initial_df['published_at'])\n", 149 | "initial_df.to_csv(\"casey-neistat-analisys/casey_initial.csv\", encoding='utf-8')\n", 150 | "print(initial_df.info())" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [ 159 | "ids = list(initial_df['id'].values)\n", 160 | "categories = []\n", 161 | "default_language = []\n", 162 | "durations = []\n", 163 | "license = []\n", 164 | "viewCounts = []\n", 165 | "likeCounts = []\n", 166 | "dislikeCounts = []\n", 167 | "favoriteCounts = []\n", 168 | "commentCounts = []\n", 169 | "a=True\n", 170 | "batch_size = 50\n", 171 | "i = 0\n", 172 | "video_details = \"https://www.googleapis.com/youtube/v3/videos?id=%s&part=snippet,statistics,contentDetails&key=%s\" \n", 173 | "while i < len(ids):\n", 174 | " ids_to_query = ','.join(ids[i:i+batch_size])\n", 175 | " q = video_details % (ids_to_query, api_key)\n", 176 | " r = requests.get(q)\n", 177 | " resultlist = json.loads(r.text)\n", 178 | " for result in resultlist['items']:\n", 179 | " snippet = result['snippet']\n", 180 | " contentDetails = result['contentDetails']\n", 181 | " statistics = result['statistics']\n", 182 | "\n", 183 | " categories.append(snippet['categoryId'])\n", 184 | " if 'defaultAudioLanguage' in snippet:\n", 185 | " default_language.append(snippet['defaultAudioLanguage'])\n", 186 | " else:\n", 187 | " default_language.append('-')\n", 188 | " durations.append(contentDetails['duration'])\n", 189 | " license.append(contentDetails['licensedContent'])\n", 190 | " viewCounts.append(statistics['viewCount'])\n", 191 | " favoriteCounts.append(statistics['favoriteCount'])\n", 192 | " likeCount = -1\n", 193 | " dislikeCount = -1\n", 194 | " commentCount = -1\n", 195 | " if 'likeCount' in statistics:\n", 196 | " likeCount = int(statistics['likeCount'])\n", 
197 | " dislikeCount = int(statistics['dislikeCount'])\n", 198 | " if 'commentCount' in statistics:\n", 199 | " commentCount = int(statistics['commentCount'])\n", 200 | " likeCounts.append(likeCount)\n", 201 | " dislikeCounts.append(dislikeCount)\n", 202 | " commentCounts.append(commentCount)\n", 203 | " \n", 204 | " i += batch_size\n", 205 | "\n", 206 | "details_df = pd.DataFrame({\n", 207 | " 'id': ids,\n", 208 | " 'category':categories,\n", 209 | " 'language': default_language,\n", 210 | " 'duration': durations,\n", 211 | " 'license': license,\n", 212 | " 'views': viewCounts,\n", 213 | " 'likes': likeCounts,\n", 214 | " 'dislikes': dislikeCounts,\n", 215 | " 'favs': favoriteCounts,\n", 216 | " 'comments': commentCounts\n", 217 | "})\n", 218 | "\n", 219 | "details_df.to_csv(\"casey-neistat-analisys/casey_detailed.csv\", encoding='utf-8')\n", 220 | "print(details_df.info())" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": null, 226 | "metadata": {}, 227 | "outputs": [], 228 | "source": [ 229 | "initial_df = pd.read_csv(\"casey-neistat-analisys/casey_initial.csv\", index_col=0, \n", 230 | " parse_dates=['published_at'], na_values=[-1, ''])\n", 231 | "details_df = pd.read_csv(\"casey-neistat-analisys/casey_detailed.csv\", index_col=0, na_values=[-1, ''])\n", 232 | "\n", 233 | "\n", 234 | "initial_df = initial_df.drop_duplicates()\n", 235 | "details_df = details_df.drop_duplicates()\n", 236 | "details_df.duration = details_df.duration.apply(lambda iso: isodate.parse_duration(iso).total_seconds())\n", 237 | "\n", 238 | "complete_df = pd.merge(left=initial_df, right=details_df, on='id')\n", 239 | "complete_df.fillna(-1)\n", 240 | "complete_df.set_index('published_at', inplace=True)\n", 241 | "\n", 242 | "print(complete_df.tail())\n", 243 | "complete_df.to_csv(\"casey-neistat-analisys/casey_complete.csv\", encoding='utf-8')" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "metadata": {}, 250 | 
"outputs": [], 251 | "source": [ 252 | "complete_df = pd.read_csv(\"casey-neistat-analisys/casey_complete.csv\", parse_dates=['published_at'], index_col=0)\n", 253 | "complete_df = complete_df.tz_localize('UTC').tz_convert('US/Pacific')\n", 254 | "complete_df.head(10)" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": null, 260 | "metadata": { 261 | "scrolled": false 262 | }, 263 | "outputs": [], 264 | "source": [ 265 | "# Days when he didn't upload a vlog:\n", 266 | "vlog_start,vlog_end = '2015-03-24', '2016-11-19'\n", 267 | "daily_vlog_count = complete_df.loc[vlog_start:vlog_end,['views']].resample('D').count()\n", 268 | "daily_vlog_count.columns = ['videos']\n", 269 | "print(daily_vlog_count[daily_vlog_count['videos'] == 0])\n", 270 | "print(daily_vlog_count['videos']['2015'])" 271 | ] 272 | } 273 | ], 274 | "metadata": { 275 | "kernelspec": { 276 | "display_name": "Python 3", 277 | "language": "python", 278 | "name": "python3" 279 | }, 280 | "language_info": { 281 | "codemirror_mode": { 282 | "name": "ipython", 283 | "version": 3 284 | }, 285 | "file_extension": ".py", 286 | "mimetype": "text/x-python", 287 | "name": "python", 288 | "nbconvert_exporter": "python", 289 | "pygments_lexer": "ipython3", 290 | "version": "3.6.1" 291 | } 292 | }, 293 | "nbformat": 4, 294 | "nbformat_minor": 2 295 | } 296 | -------------------------------------------------------------------------------- /tloz-scrape.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 76, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import os\n", 12 | "import requests\n", 13 | "import json\n", 14 | "import re\n", 15 | "import pandas as pd\n", 16 | "import matplotlib.pyplot as plt\n", 17 | "from os.path import join\n", 18 | "from slugify import slugify\n", 19 | "from bs4 import BeautifulSoup\n", 20 | "from bs4.element import 
NavigableString\n", 21 | "\n", 22 | "base_dir = \"tloz-scrape\"\n", 23 | "if not os.path.exists(base_dir):\n", 24 | " os.makedirs(base_dir)" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 79, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "base_url = \"http://zelda.wikia.com\"\n", 34 | "characters = base_url + \"/wiki/The_Legend_of_Zelda_recurring_characters\"\n", 35 | "recurring_characters = BeautifulSoup(requests.get(characters).text, \"lxml\")" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 80, 41 | "metadata": { 42 | "scrolled": false 43 | }, 44 | "outputs": [], 45 | "source": [ 46 | "mw_content_text = recurring_characters.find('div', {\"id\":\"mw-content-text\"})\n", 47 | "\n", 48 | "attributes = set()\n", 49 | "characters = []\n", 50 | "\n", 51 | "for character_li in mw_content_text.findAll('li'):\n", 52 | " a = character_li.find('a')\n", 53 | " character_page = BeautifulSoup(requests.get(base_url + a['href']).text, \"lxml\")\n", 54 | " aside = character_page.find('aside')\n", 55 | " if aside is None:\n", 56 | " continue\n", 57 | " pi_datas = aside.findAll('div', {'class':'pi-item'}, recursive=False)\n", 58 | " character = {\n", 59 | " 'name': a.text.strip()\n", 60 | " }\n", 61 | " for pi_data in pi_datas:\n", 62 | " if isinstance(pi_data, NavigableString):\n", 63 | " continue\n", 64 | " label = pi_data.find('h3').text.strip()\n", 65 | " value = pi_data.find('div', {\"class\":\"pi-data-value\"})\n", 66 | " attributes.add(label)\n", 67 | " if \"Appears in\" == label or \\\n", 68 | " \"Appereance(s)\" == label:\n", 69 | " i = value.findAll('i')\n", 70 | " character[slugify(label, separator=\"_\")] = [ap.text.strip() for ap in i]\n", 71 | " elif \"Title(s)\" == label or \\\n", 72 | " \"Kindred\" == label or \\\n", 73 | " \"Attack method\"== label or \\\n", 74 | " \"Effective weapon(s)\" == label or \\\n", 75 | " \"Spoils\" == label or \\\n", 76 | " \"Alternate form(s)\" == label or \\\n", 77 | " 
\"Alternate form of\" == label:\n", 78 | " character[slugify(label, separator=\"_\")] = str(value)\n", 79 | " elif \"Homeland\" == label or \\\n", 80 | " \"Race\" == label or \\\n", 81 | " \"Hometown\" == label or \\\n", 82 | " \"Location(s)\" == label or \\\n", 83 | " \"Affiliation(s)\" == label:\n", 84 | " character[slugify(label, separator=\"_\")] = []\n", 85 | " game_name = ''\n", 86 | " for element in value.descendants:\n", 87 | " if element.name == \"u\":\n", 88 | " game_name = element.text.strip()\n", 89 | " elif element.name == \"a\":\n", 90 | " if game_name == '': # Solo tiene una aparición\n", 91 | " game_name = 'ORIGINAL'\n", 92 | " character[slugify(label, separator=\"_\")].append({'game': game_name,\n", 93 | " 'value': element.text.strip()})\n", 94 | " else:\n", 95 | " character[slugify(label, separator=\"_\")] = value.text.strip()\n", 96 | " characters.append(character)\n", 97 | " \n", 98 | "with open(join(base_dir,'data.txt'), 'w') as outfile:\n", 99 | " json.dump(characters, outfile, indent=4)" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 71, 105 | "metadata": {}, 106 | "outputs": [ 107 | { 108 | "name": "stdout", 109 | "output_type": "stream", 110 | "text": [ 111 | "Different attributes: ['kindred', 'homeland', 'affiliation_s', 'attack_method', 'effective_weapon_s', 'location_s', 'appearances', 'hometown', 'race', 'age', 'spoils', 'gender', 'alternate_form_s', 'first_appearance', 'appears_in', 'title_s', 'alternate_form_of']\n" 112 | ] 113 | } 114 | ], 115 | "source": [ 116 | "with open(join(base_dir,'data.txt')) as json_data:\n", 117 | " characters = json.load(json_data)\n", 118 | "\n", 119 | "attributes = set()\n", 120 | "for c in characters:\n", 121 | " attributes.update(c.keys())\n", 122 | "attributes.remove('name')\n", 123 | "attributes = list(attributes)\n", 124 | "\n", 125 | "print(\"Different attributes:\", attributes)" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 72, 131 | 
"metadata": { 132 | "collapsed": true 133 | }, 134 | "outputs": [], 135 | "source": [ 136 | "fa_regex = re.compile('([\\w\\s\\'&]+)\\(([0-9]{4})\\)')\n", 137 | "different_games = set()\n", 138 | "name, first_game, first_year, gender = [], [], [], []\n", 139 | "\n", 140 | "for c in characters:\n", 141 | " match = fa_regex.search(c['first_appearance'])\n", 142 | " if match:\n", 143 | " different_games.add(match.group(1).strip())\n", 144 | " \n", 145 | " first_game.append(match.group(1).strip())\n", 146 | " first_year.append(int(match.group(2)))\n", 147 | " name.append(c['name'])\n", 148 | " \n", 149 | " if 'gender' in c:\n", 150 | " gender.append(c['gender'])\n", 151 | " else:\n", 152 | " gender.append('')\n", 153 | " \n", 154 | "characters_initial_df = pd.DataFrame({'name': name,\n", 155 | " 'gender': gender,\n", 156 | " 'first_game': first_game,\n", 157 | " 'first_year': first_year\n", 158 | " })" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 73, 164 | "metadata": {}, 165 | "outputs": [ 166 | { 167 | "data": { 168 | "text/html": [ 169 | "
\n", 170 | "\n", 183 | "\n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | "
first_gamefirst_yeargendername
0Ocarina of Time1998FemaleAnju
1The Wind Waker2003MaleAnkle
2Ocarina of Time1998FemaleAveil
3Ocarina of Time1998MaleBean Seller
4The Wind Waker2003MaleBeedle
\n", 231 | "
" 232 | ], 233 | "text/plain": [ 234 | " first_game first_year gender name\n", 235 | "0 Ocarina of Time 1998 Female Anju\n", 236 | "1 The Wind Waker 2003 Male Ankle\n", 237 | "2 Ocarina of Time 1998 Female Aveil\n", 238 | "3 Ocarina of Time 1998 Male Bean Seller\n", 239 | "4 The Wind Waker 2003 Male Beedle" 240 | ] 241 | }, 242 | "execution_count": 73, 243 | "metadata": {}, 244 | "output_type": "execute_result" 245 | } 246 | ], 247 | "source": [ 248 | "characters_initial_df.to_csv(join(base_dir,'characters_initial.csv'))\n", 249 | "characters_initial_df.head()" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": 74, 255 | "metadata": { 256 | "collapsed": true, 257 | "scrolled": false 258 | }, 259 | "outputs": [], 260 | "source": [ 261 | "game_regex = re.compile('^\\(([0-9a-zA-Z/\\'\\s&]+)\\)')\n", 262 | "\n", 263 | "attributes = ['kindred', 'title_s']\n", 264 | "\n", 265 | "for i in range(len(characters)):\n", 266 | " for attr in attributes:\n", 267 | " if attr in characters[i]:\n", 268 | " kindred = characters[i][attr][len('
'):-len(\"
\")]\n", 269 | " kindred_list = []\n", 270 | " game = 'ORIGINAL'\n", 271 | " for kind in kindred.split(\"
\"):\n", 272 | " soup_content = BeautifulSoup(kind, \"html.parser\").text.strip()\n", 273 | " match = game_regex.search(soup_content)\n", 274 | " if match:\n", 275 | " game = match.group(1).strip()\n", 276 | " else:\n", 277 | " kindred_list.append({'game':game, 'value':soup_content})\n", 278 | " characters[i][attr] = kindred_list\n", 279 | "\n", 280 | "for i in range(len(characters)): \n", 281 | " \n", 282 | "with open(join(base_dir,'data1.txt'), 'w') as outfile:\n", 283 | " json.dump(characters, outfile, indent=4)" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": null, 289 | "metadata": { 290 | "collapsed": true 291 | }, 292 | "outputs": [], 293 | "source": [] 294 | } 295 | ], 296 | "metadata": { 297 | "kernelspec": { 298 | "display_name": "Python 3", 299 | "language": "python", 300 | "name": "python3" 301 | }, 302 | "language_info": { 303 | "codemirror_mode": { 304 | "name": "ipython", 305 | "version": 3 306 | }, 307 | "file_extension": ".py", 308 | "mimetype": "text/x-python", 309 | "name": "python", 310 | "nbconvert_exporter": "python", 311 | "pygments_lexer": "ipython3", 312 | "version": "3.6.1" 313 | } 314 | }, 315 | "nbformat": 4, 316 | "nbformat_minor": 2 317 | } 318 | -------------------------------------------------------------------------------- /youtube-captions.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Before coding\n", 8 | "\n", 9 | "Create a new project\n", 10 | "\n", 11 | "https://console.developers.google.com/projectcreate\n", 12 | "\n", 13 | "Once you have created the project, enable access to the YouTube Data API\n", 14 | "\n", 15 | "https://console.developers.google.com/apis/library\n", 16 | "\n", 17 | "Once enabled, it is important that you get credentials for your project\n", 18 | "\n", 19 | 
"https://console.developers.google.com/apis/credentials/wizard?api=youtube.googleapis.com\n", 20 | "\n", 21 | "From the options select: \n", 22 | "\n", 23 | "| Option | Value |\n", 24 | "| ------------- | ------------- |\n", 25 | "| ¿Qué API estás usando? | **YouTube Data API v3** |\n", 26 | "| ¿Desde dónde llamarás a la API? | **Servidor Web** |\n", 27 | "| ¿A qué tipo de datos accederás? | **Datos públicos** | \n", 28 | "\n", 29 | "Having selected such values, press: **\"¿Qué credenciales necesito?\"** and you will be given an alphanumeric string that is your API key, place this value into the `api_key` variable:" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": { 36 | "collapsed": true 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "api_key = \"\" # Place your YT api key here\n", 41 | "assert api_key != \"\"" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "## Now, coding\n", 49 | "\n", 50 | "Import the necessary packages" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": { 57 | "collapsed": true 58 | }, 59 | "outputs": [], 60 | "source": [ 61 | "import requests\n", 62 | "import json\n", 63 | "import urllib\n", 64 | "import isodate\n", 65 | "import os\n", 66 | "import pandas as pd\n", 67 | "from bs4 import BeautifulSoup\n", 68 | "from urllib.parse import urlencode\n", 69 | "from slugify import slugify\n", 70 | "from pytube import YouTube" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": { 77 | "collapsed": true 78 | }, 79 | "outputs": [], 80 | "source": [ 81 | "search_url = \"https://www.googleapis.com/youtube/v3/search?\"\n", 82 | "caption_url = \"https://www.youtube.com/api/timedtext?\"" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": { 89 | "collapsed": true 90 | }, 91 | "outputs": [], 92 | "source": [ 93 | "language_preferences 
= ['es-MX','es']\n", 94 | "channels = {\n", 95 | " 'h3h3Productions' : 'UCDWIvJwLJsE4LG1Atne2blQ',\n", 96 | "}\n", 97 | "starting_channel = 'UCDWIvJwLJsE4LG1Atne2blQ'" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": { 104 | "collapsed": true 105 | }, 106 | "outputs": [], 107 | "source": [ 108 | "parameters = {\n", 109 | " 'key': api_key,\n", 110 | " 'part': 'snippet',\n", 111 | " 'type': 'video',\n", 112 | " 'channelId': starting_channel,\n", 113 | " 'maxResults': 50,\n", 114 | " 'order': 'date'\n", 115 | "}\n", 116 | "max_pages = 15\n", 117 | "query_string = urlencode(parameters)" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "videos = {}\n", 127 | "count = 0\n", 128 | "for channel in channels:\n", 129 | " print(\"Searching for\", channel)\n", 130 | " parameters['channelId'] = channels[channel]\n", 131 | " videos[channel] = []\n", 132 | " if 'pageToken' in parameters:\n", 133 | " del(parameters['pageToken'])\n", 134 | " query_string = urlencode(parameters)\n", 135 | " pages = max_pages\n", 136 | " page_token = 'FIRST TIME!'\n", 137 | " while pages > 0 and len(page_token) > 0:\n", 138 | " qurl = search_url + query_string\n", 139 | " print(qurl)\n", 140 | " r = requests.get(search_url + query_string)\n", 141 | " result = json.loads(r.text)\n", 142 | " try:\n", 143 | " page_token = result[\"nextPageToken\"]\n", 144 | " except:\n", 145 | " page_token = ''\n", 146 | " parameters['pageToken'] = page_token\n", 147 | " pages = pages - 1\n", 148 | " print(len(result['items']), page_token)\n", 149 | " videos[channel].extend(result['items'])\n", 150 | " count += len(result['items'])\n", 151 | " query_string = urlencode(parameters)" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [ 160 | "# Conversion to dataframes\n", 161 | "chn = 
[]\n", 162 | "ids = []\n", 163 | "pub = []\n", 164 | "titles = []\n", 165 | "for c in channels:\n", 166 | " for v in videos[c]:\n", 167 | " videoId = v['id']['videoId']\n", 168 | " publishedDate = v['snippet']['publishedAt']\n", 169 | " title = v['snippet']['title']\n", 170 | " chn.append(slugify(c))\n", 171 | " ids.append(videoId)\n", 172 | " pub.append(publishedDate)\n", 173 | " titles.append(title)\n", 174 | "initial_df = pd.DataFrame({\n", 175 | " 'channel':chn,\n", 176 | " 'id': ids,\n", 177 | " 'published_at': pub,\n", 178 | " 'title': titles\n", 179 | "})\n", 180 | "initial_df['published_at'] = pd.to_datetime(initial_df['published_at'])\n", 181 | "initial_df.to_csv(\"youtube-captions/h3h3_initial.csv\", encoding='utf-8')\n", 182 | "print(initial_df.info())" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [ 191 | "ids = list(initial_df['id'].values)\n", 192 | "categories = []\n", 193 | "default_language = []\n", 194 | "durations = []\n", 195 | "license = []\n", 196 | "viewCounts = []\n", 197 | "likeCounts = []\n", 198 | "dislikeCounts = []\n", 199 | "favoriteCounts = []\n", 200 | "commentCounts = []\n", 201 | "\n", 202 | "batch_size = 50\n", 203 | "i = 0\n", 204 | "video_details = \"https://www.googleapis.com/youtube/v3/videos?id=%s&part=snippet,statistics,contentDetails&key=%s\" \n", 205 | "while i < len(ids):\n", 206 | " ids_to_query = ','.join(ids[i:i+batch_size])\n", 207 | " q = video_details % (ids_to_query, api_key)\n", 208 | " r = requests.get(q)\n", 209 | " resultlist = json.loads(r.text)\n", 210 | " for result in resultlist['items']:\n", 211 | " snippet = result['snippet']\n", 212 | " contentDetails = result['contentDetails']\n", 213 | " statistics = result['statistics']\n", 214 | "\n", 215 | " categories.append(snippet['categoryId'])\n", 216 | " if 'defaultAudioLanguage' in snippet:\n", 217 | " default_language.append(snippet['defaultAudioLanguage'])\n", 
218 | " else:\n", 219 | " default_language.append('-')\n", 220 | " durations.append(contentDetails['duration'])\n", 221 | " license.append(contentDetails['licensedContent'])\n", 222 | " viewCounts.append(statistics['viewCount'])\n", 223 | " favoriteCounts.append(statistics['favoriteCount'])\n", 224 | " likeCount = -1\n", 225 | " dislikeCount = -1\n", 226 | " commentCount = -1\n", 227 | " if 'likeCount' in statistics:\n", 228 | " likeCount = int(statistics['likeCount'])\n", 229 | " dislikeCount = int(statistics['dislikeCount'])\n", 230 | " if 'commentCount' in statistics:\n", 231 | " commentCount = int(statistics['commentCount'])\n", 232 | " likeCounts.append(likeCount)\n", 233 | " dislikeCounts.append(dislikeCount)\n", 234 | " commentCounts.append(commentCount)\n", 235 | " \n", 236 | " i += batch_size\n", 237 | "\n", 238 | "details_df = pd.DataFrame({\n", 239 | " 'id': ids,\n", 240 | " 'category':categories,\n", 241 | " 'language': default_language,\n", 242 | " 'duration': durations,\n", 243 | " 'license': license,\n", 244 | " 'views': viewCounts,\n", 245 | " 'likes': likeCounts,\n", 246 | " 'dislikes': dislikeCounts,\n", 247 | " 'favs': favoriteCounts,\n", 248 | " 'comments': commentCounts\n", 249 | "})\n", 250 | "\n", 251 | "details_df.to_csv(\"youtube-captions/h3h3_details.csv\", encoding='utf-8')\n", 252 | "print(details_df.info())" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": null, 258 | "metadata": {}, 259 | "outputs": [], 260 | "source": [ 261 | "initial_df = pd.read_csv(\"youtube-captions/h3h3_initial.csv\", index_col=0, parse_dates=['published_at'], na_values=[-1, ''])\n", 262 | "details_df = pd.read_csv(\"youtube-captions/h3h3_details.csv\", index_col=0, na_values=[-1, ''])\n", 263 | "\n", 264 | "\n", 265 | "initial_df = initial_df.drop_duplicates()\n", 266 | "details_df = details_df.drop_duplicates()\n", 267 | "details_df.duration = details_df.duration.apply(lambda iso: isodate.parse_duration(iso).total_seconds())\n", 268 
| "\n", 269 | "#print(initial_df.info())\n", 270 | "#print(details_df.info())\n", 271 | "\n", 272 | "complete_df = pd.merge(left=initial_df, right=details_df, on='id')\n", 273 | "complete_df = complete_df.fillna(-1)  # fillna returns a new frame; keep the result\n", 274 | "complete_df.set_index('id', inplace=True)\n", 275 | "\n", 276 | "print(complete_df.describe())\n", 277 | "complete_df.to_csv(\"youtube-captions/h3h3_complete.csv\", encoding='utf-8')" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": null, 283 | "metadata": { 284 | "collapsed": true 285 | }, 286 | "outputs": [], 287 | "source": [ 288 | "vids_subs = []\n", 289 | "errors = []\n", 290 | "for index, row in complete_df.iterrows():\n", 291 | " videoId = index\n", 292 | " title = row['title']\n", 293 | " subtitles = ''\n", 294 | " i = 0\n", 295 | " try:\n", 296 | " yt = YouTube('https://www.youtube.com/watch?v=' + videoId)\n", 297 | " while len(subtitles) == 0 and i < len(language_preferences):\n", 298 | " lang = language_preferences[i]\n", 299 | " if yt.captions.get_by_language_code(lang) is not None:\n", 300 | " subtitles = yt.captions.get_by_language_code(lang).xml_captions\n", 301 | " i = i + 1\n", 302 | " except:\n", 303 | " print(\"Error\", videoId)\n", 304 | " errors.append(videoId)\n", 305 | " if len(subtitles) > 0:\n", 306 | " vids_subs.append({'id': videoId, 'title': title, 'captions': subtitles })\n", 307 | "print(\"Done!\")" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": null, 313 | "metadata": { 314 | "collapsed": true 315 | }, 316 | "outputs": [], 317 | "source": [ 318 | "print(language_preferences)\n", 319 | "## Getting subs & cleaning them\n", 320 | "for subs in vids_subs:\n", 321 | " soup = BeautifulSoup(subs['captions'], \"lxml\")\n", 322 | " texts = soup.find_all('text')\n", 323 | " sub_entries = []\n", 324 | " for text in texts:\n", 325 | " sub_entry = {\n", 326 | " 'duration': text.get('dur'),\n", 327 | " 'start': text.get('start'),\n", 328 | " 'content': BeautifulSoup(text.get_text(), 
\"lxml\").text\n", 329 | " }\n", 330 | " sub_entries.append(sub_entry)\n", 331 | " del(subs['captions'])\n", 332 | " subs['captions_parsed'] = sub_entries" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": null, 338 | "metadata": { 339 | "collapsed": true 340 | }, 341 | "outputs": [], 342 | "source": [ 343 | "from os.path import join\n", 344 | "## Now saving the good stuff\n", 345 | "directory = \"youtube-captions\\\\captions\"\n", 346 | "if not os.path.exists(directory):\n", 347 | " os.makedirs(directory)\n", 348 | "print(\"Saving to\", directory)\n", 349 | "for vid in vids_subs:\n", 350 | " file_path = join(directory, slugify(vid['title']) + '.json')\n", 351 | " with open(file_path, 'w') as outfile:\n", 352 | " json.dump(vid, outfile, indent=4)\n", 353 | "print(\"Done!\")" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": null, 359 | "metadata": { 360 | "collapsed": true 361 | }, 362 | "outputs": [], 363 | "source": [ 364 | "cleaning_subs = \"\"\"directories = ['amlo', 'presidencia', 'epn']\n", 365 | "for d in directories:\n", 366 | " directory = join('youtube-captions', d)\n", 367 | " for file in os.listdir(directory):\n", 368 | " if file.endswith(\"json\"):\n", 369 | " video = None\n", 370 | " file1 = join(directory, file)\n", 371 | " with open(file1, 'r') as captions_file:\n", 372 | " video = json.load(captions_file)\n", 373 | " captions = video['captions_parsed']\n", 374 | " for cap in captions:\n", 375 | " try:\n", 376 | " cap['content'] = BeautifulSoup(cap['content'], \"lxml\").get_text()\n", 377 | " except:\n", 378 | " print(\"Error\", file1)\n", 379 | " with open(join(directory, file), 'w') as captions_file:\n", 380 | " json.dump(video, captions_file)\n", 381 | "print(\"Done!\")\n", 382 | "\"\"\"" 383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "execution_count": null, 388 | "metadata": { 389 | "collapsed": true 390 | }, 391 | "outputs": [], 392 | "source": [] 393 | } 394 | ], 395 | 
"metadata": { 396 | "kernelspec": { 397 | "display_name": "Python 3", 398 | "language": "python", 399 | "name": "python3" 400 | }, 401 | "language_info": { 402 | "codemirror_mode": { 403 | "name": "ipython", 404 | "version": 3 405 | }, 406 | "file_extension": ".py", 407 | "mimetype": "text/x-python", 408 | "name": "python", 409 | "nbconvert_exporter": "python", 410 | "pygments_lexer": "ipython3", 411 | "version": "3.6.1" 412 | } 413 | }, 414 | "nbformat": 4, 415 | "nbformat_minor": 2 416 | } 417 | -------------------------------------------------------------------------------- /bokeh/x.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Bokeh Plot 7 | 8 | 9 | 10 | 11 | 14 | 25 | 26 | 27 | 28 |
29 |
30 |
31 | 32 | 47 | 48 | -------------------------------------------------------------------------------- /Intro.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Jupyter Notebook App \n", 8 | "\n", 9 | "Aplicación compuesta por dos elementos: un servidor, y un cliente y a través del protocolo HTTP... como una aplicación web tradicional. El cliente es esto que estás viendo en este momento y se accede a él a través de un navegador web.\n", 10 | "\n", 11 | "\n", 12 | "\n", 13 | "### El cliente \n", 14 | "\n", 15 | "Con la Notebook App podemos crear los interesantes *Notebook Documents*, o simplemente *Notebooks*, que son archivos que contienen código (como por ejemplo, código de Python), texto enriquecido (es decir, con cosas como negritas, cursivas, imágenes, links, html...) y otros elementos interactivos como gráficas o simulaciones.\n", 16 | "\n", 17 | "Como te podrás imaginar, los *Notebooks* son un lugar ideal para documentar el código, explicar lo que sucede y complementarlo con __matemáticas__ diagramas, gráficas y demás elementos." 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "#### Ejemplo: \n", 25 | "Podrías tener algo como esto (una explicación matemática seguida de su implementación en código):\n", 26 | "\n", 27 | "**An infinite-state model**. 
Consider a sequence of $n+1$ messages that arrive over a period of time of length $T$" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "messages = [1,2,3,5,6,7,9,17] # n+1 message arrival times\n", 37 | "gaps = [m2 - m1 for m1, m2 in zip(messages[:-1], messages[1:])]\n", 38 | "T = sum(gaps)\n", 39 | "print(T,\":\",gaps)" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "Los *Notebooks* forman parte del componente del cliente de la Notebook App.\n", 47 | "\n", 48 | "\n", 49 | "### El servidor \n", 50 | "\n", 51 | "El elemento del servidor en la aplicación es un servidor web que se encarga de ejecutar el código que nosotros introducimos a través del cliente. La forma de hacerlo es a través de los *kernels*, que son procesos que se ejecutan en el servidor en el que está corriendo la app, cada uno de los *Notebooks* tiene un *kernel* asociado el cual estará activo mientras que tu no te deshagas de él. \n", 52 | "\n", 53 | "Dependiendo de lo que estés tratando de ejecutar, cada *kernel* consumirá recursos de la computadora en donde se está ejecutando el servidor.\n", 54 | "\n", 55 | "#### Pero... why not both?\n", 56 | "\n", 57 | "\n", 58 | "\n", 59 | "Puede que te encuentres con que el cliente y el servidor se están ejecutando en la misma computadora, y que no necesitas de conexión a internet para ejecutarlo, esto es muy común y bastante útil puesto que tener el servidor a la mano nos permite usar directamente los archivos de nuestra PC, cosa que usando el navegador web no podríamos hacer. \n", 60 | "\n", 61 | "Una vez que has terminado de jugar *ahem*... trabajar con tu *Notebook* podrías subirlo ahora sí a un servidor más poderoso para que se ejecute más rápido, o si lo colocas en un lugar público (uh, tal vez esto no sea buena idea) esté accesible a través de internet." 
62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": { 67 | "collapsed": true 68 | }, 69 | "source": [ 70 | "## Instalación \n", 71 | "\n", 72 | "Si eres un novato como yo, la forma recomendada de obtener los *Notebooks* es a través de [Anaconda](https://www.anaconda.com/download), anda, ve a instalarlo si es que aún no lo has hecho, puedes dejar todo por default por el momento. \n", 73 | "`...` \n", 74 | "`...`\n", 75 | "\n", 76 | "Suficiente tiempo.\n", 77 | "\n", 78 | "Si todo salió bien, verás un ícono como este en tu Menú Inicio o en el Launchpad:\n", 79 | "\n", 80 | "\n", 81 | "\n", 82 | "Al abrir esta aplicación aparecerá un *Dashboard* con diferentes aplicaciones, de las cuales deberás seleccionar y ejecutar Jupyter Notebook:\n", 83 | "\n", 84 | " \n", 85 | "\n", 86 | "Si todo sale bien, se abrirá un navegador con lo que es conocido como el **Dashboard** de Jupyter, que en ciertos aspectos podrías ver como el Finder de Mac o el Explorador de Windows. Por ahora se abrirá en un folder de tu computadora, probablemente sea la carpeta de tu usuario o en tus documentos, pero esto es algo que puedes configurar más adelante.\n", 87 | "\n", 88 | "Vamos a crear un nuevo *Notebook*, así que deberás dar click en `Nuevo` o (`New`) y seleccionar Python 3 (o dos dependiendo de tu versión). Y cuando hayas terminado verás algo como esto:" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": { 95 | "collapsed": true 96 | }, 97 | "outputs": [], 98 | "source": [] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": { 103 | "collapsed": true 104 | }, 105 | "source": [ 106 | "Sí, un cuadro gris llamado celda (`cell`) en el que puedes escribir. Por ahora está en modo código, pero siempre podrás usar el selector de acá arriba para cambiar entre código y texto en Markdown (y también HTML). Seguramente ya tendrán tiempo para jugar con esto más adelante, pero por ahora les voy explicar lo básico de los *Notebooks*. 
\n", 107 | "## Las celdas\n", 108 | "\n", 109 | "Un *Notebook* se compone de celdas. Cada celda puede contener código o texto \"enriquecido\": Las celdas de texto aceptan tres tipos de formato: HTML, Markdown y un poco de LaTeX para fórmulas matemáticas. Así mismo, cada celda tiene cuatro modos diferentes: **reposo**, **edición**, **ejecución** y **ejecutadas**. Sí, inclusive las celdas de texto tienen que ser ejecutadas. Para ejecutar las celdas usualmente se usa la combinación de teclas SHIFT + ENTER.\n", 110 | "\n", 111 | "\n", 112 | "La siguiente celda contiene cada una de las tres cosas (puedes ver el \"código fuente\" de cada celda si le das doble click o si presionas Enter mientras está seleccionada):" 113 | ] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "metadata": {}, 118 | "source": [ 119 | "### Esto es Markdown:\n", 120 | "\n", 121 | "Emphasis, aka italics, with *asterisks* or _underscores_.\n", 122 | "\n", 123 | "Strong emphasis, aka bold, with **asterisks** or __underscores__.\n", 124 | "\n", 125 | "Combined emphasis with **asterisks and _underscores_**.\n", 126 | "\n", 127 | "Strikethrough uses two tildes. ~~Scratch this.~~ \n", 128 | "\n", 129 | "Puedes ver una guía rápida de lo que te ofrece Markdown [aquí](https://github.github.com/gfm/).\n", 130 | "\n", 131 | "

Esto es HTML:

\n", 132 | "\n", 133 | "Emphasis, aka italics, with <i></i>, ehhmhmhm... italics.\n", 134 | "\n", 135 | "Strong emphasis, aka bold, with <b></b>, ehhmhmhm... bold.\n", 136 | "\n", 137 | "Combined emphasis with <i></i> and <b></b>, ehhmhmhm... both.\n", 138 | "\n", 139 | "Strikethrough uses <del></del>, like this.\n", 140 | "\n", 141 | "\n", 142 | "Puedes tener cosas sencillas como esta: $a = 10$, o un poco más complicadas como esto: $y = \\frac{1}{{\\sqrt {2\\pi } }}e^{ - \\frac{{z^2 }}{2}} = .3989e^{ - .5z^2 }$ o, ¿quieres que una fórmula resalte realmente? la puedes poner así:\n", 143 | "\n", 144 | "$$y = \\frac{1}{{\\sqrt {2\\pi } }}e^{ - \\frac{{z^2 }}{2}} = .3989e^{ - .5z^2 }$$\n", 145 | "\n" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 5, 151 | "metadata": { 152 | "collapsed": true 153 | }, 154 | "outputs": [], 155 | "source": [ 156 | "# Y esto es código\n", 157 | "value = \"¡Buen día señor sol!\"\n", 158 | "a = 2\n", 159 | "b = 3" 160 | ] 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "metadata": {}, 165 | "source": [ 166 | "La verdad es que no hay mucho que decir sobre las celdas de texto, solo que son muy, muy útiles.\n", 167 | "\n", 168 | "### Celdas de código\n", 169 | "\n", 170 | "Puedes identificar las celdas de código porque tienen un `In [1]:` al lado, en donde el número es el orden en el que se ejecutó dicha celda. Cada vez que una celda está pendiente de concluir su ejecución aparecerá un `*` en donde debería estar el número. Esto te indicará que hay algo pendiente de terminar de ejecutarse. \n", 171 | "\n", 172 | "Cada *Notebook* está asociado con un *Kernel*, y todo lo que se define en una celda está disponible para todas las demás. Por ejemplo, en la celda anterior a esta declaramos tres variables: `a`, `b` y `value`. 
En la celda siguiente podemos hacer uso de ellas sin problema:" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": 6, 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "c = a + b\n", 182 | "print(value)\n", 183 | "print(c)" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": { 190 | "collapsed": true 191 | }, 192 | "outputs": [], 193 | "source": [] 194 | }, 195 | { 196 | "cell_type": "markdown", 197 | "metadata": {}, 198 | "source": [ 199 | "También, como ya te habrás dado cuenta, cuando imprimimos algo a \"consola\", los *Notebooks* lo presentan justo debajo de la celda en la que se llama a la función `print`.\n" 200 | ] 201 | }, 202 | { 203 | "cell_type": "markdown", 204 | "metadata": {}, 205 | "source": [ 206 | "También, como ya te habrás dado cuenta, cuando imprimimos algo a \"consola\", los *Notebooks* lo presentan justo debajo de la celda en la que se llama a la función `print`.\n" 207 | ] 208 | }, 209 | { 210 | "cell_type": "markdown", 211 | "metadata": {}, 212 | "source": [ 213 | "También, como ya te habrás dado cuenta, cuando imprimimos algo a \"consola\", los *Notebooks* lo presentan justo debajo de la celda en la que se llama a la función `print`.\n" 214 | ] 215 | }, 216 | { 217 | "cell_type": "markdown", 218 | "metadata": {}, 219 | "source": [ 220 | "### Toolbar\n", 221 | "\n", 222 | " \n", 223 | "\n", 224 | "Las primeras opciones (de izquierda a derecha) son bastante sencillas de entender.\n", 225 | "\n", 226 | " - **Guardar** _Notebook_ \n", 227 | " - **Agregar** nueva celda\n", 228 | " - **Cortar** celda\n", 229 | " - **Copiar** celda\n", 230 | " - **Pegar** celda\n", 231 | " - Mover celda **hacia arriba**\n", 232 | " - Mover celda **hacia abajo** \n", 233 | "\n", 234 | "Las siguientes tres requieren de un poco más de explicación \n", 235 | "\n", 236 | " - **Ejecutar** celda (y seleccionar la siguiente). 
Que es como presionar SHIFT + ENTER, así que la verdad es que creo que será muy raro que presiones este botón muy seguido. \n", 237 | " - **Interrumpir _kernel_**. Digamos que estás ejecutando una celda y te das cuenta de que hay un error en ella y que no vale la pena que se siga ejecutando. Este es el caso de uso perfecto para este botón. Interrumpir *kernel* únicamente detendrá la ejecución de las tareas pendientes, el resto del kernel (y todo lo que hayamos definido antes) seguirá vivo. \n", 238 | " - **Reiniciar _kernel_**. Hay ocasiones en las que tal vez necesitemos comenzar desde cero con el *notebook*, este es el botón que podemos usar para esto. Reiniciar el _kernel_ removerá de memoria todo lo que se ejecutó previamente, así que tendrás que volver a ejecutar todo de nuevo. \n", 239 | " \n", 240 | "Y por último \n", 241 | "\n", 242 | " - Cambiar el tipo de celda. \n", 243 | " - Abrir el catálogo de comandos. Un botón bastante útil para descubrir los comandos que tenemos disponibles." 
244 | ] 245 | }, 246 | { 247 | "cell_type": "markdown", 248 | "metadata": { 249 | "collapsed": true 250 | }, 251 | "source": [ 252 | "### Volviéndo al código \n", 253 | "\n", 254 | "Pero no creas que solamente es para códigos \"sencillos\", además de variables también podemos definir funciones y clases:" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 7, 260 | "metadata": { 261 | "collapsed": true 262 | }, 263 | "outputs": [], 264 | "source": [ 265 | "def suma(a, b):\n", 266 | " print(\"Sumando\", a, \"+\", b)\n", 267 | " return a + b" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 8, 273 | "metadata": {}, 274 | "outputs": [], 275 | "source": [ 276 | "resultado = suma(10, 34)\n", 277 | "print(resultado)" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": 9, 283 | "metadata": { 284 | "collapsed": true 285 | }, 286 | "outputs": [], 287 | "source": [ 288 | "class Pokemon:\n", 289 | " def __init__(self, nombre, level):\n", 290 | " self.nombre = nombre\n", 291 | " self.level = level" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": 10, 297 | "metadata": {}, 298 | "outputs": [], 299 | "source": [ 300 | "pokachu = Pokemon(\"Pokachu\", 10)\n", 301 | "print(pokachu.nombre, pokachu.level)" 302 | ] 303 | }, 304 | { 305 | "cell_type": "markdown", 306 | "metadata": {}, 307 | "source": [ 308 | "### Leyendo y escribiendo archivos \n", 309 | "\n", 310 | "Con los *Notebooks* podemos leer y escribir archivos, solamente recuerda: los archivos deben existir en el lado del servidor y no del cliente. En este caso, este *Notebook* tiene varios archivos en una ruta relativa `Intro/...`. Y en específico hay un archivo llamado `hello.txt`. 
\n", 311 | "\n", 312 | "Para leerlo basta con hacer lo siguiente:" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": 11, 318 | "metadata": {}, 319 | "outputs": [], 320 | "source": [ 321 | "with open('Intro/source/hello.txt', 'r') as hello:\n", 322 | " for l in hello.readlines():\n", 323 | " print(l)" 324 | ] 325 | }, 326 | { 327 | "cell_type": "markdown", 328 | "metadata": {}, 329 | "source": [ 330 | "Mientras que para escribir un archivo es suficiente hacer esto:" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": 12, 336 | "metadata": { 337 | "collapsed": true 338 | }, 339 | "outputs": [], 340 | "source": [ 341 | "with open('Intro/source/salida.txt', 'w') as salida:\n", 342 | " for i in range(10):\n", 343 | " salida.write((\"#\" * i) + \"\\n\")" 344 | ] 345 | }, 346 | { 347 | "cell_type": "markdown", 348 | "metadata": {}, 349 | "source": [ 350 | "### Importando código \n", 351 | "\n", 352 | "Además de todo, también podemos importar código, ya sea de algún módulo que venga ya dentro de Python, alguno que hayamos instalado con `pip` o directamente desde algún archivo de código fuente:" 353 | ] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "execution_count": 14, 358 | "metadata": { 359 | "collapsed": true 360 | }, 361 | "outputs": [], 362 | "source": [ 363 | "# Este viene con Python\n", 364 | "import os\n", 365 | "# Usa 'pip install requests' para instalar requests\n", 366 | "import json, requests\n", 367 | "# Esta es una clase dentro de un archivo local\n", 368 | "from Intro.source.location import Location" 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "execution_count": 15, 374 | "metadata": {}, 375 | "outputs": [], 376 | "source": [ 377 | "print(\"Contenido de Intro/source:\")\n", 378 | "for filename in os.listdir('Intro/source'):\n", 379 | " print(filename)" 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": 16, 385 | "metadata": { 386 | "collapsed": true 387 | }, 388 
| "outputs": [], 389 | "source": [ 390 | "resp = requests.get('http://pokeapi.co/api/v2/location/154')\n", 391 | "data = json.loads(resp.text)" 392 | ] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "execution_count": 17, 397 | "metadata": {}, 398 | "outputs": [], 399 | "source": [ 400 | "viridianCity = Location(data[\"names\"][0][\"name\"], data[\"region\"][\"name\"])\n", 401 | "print(viridianCity.name + \", \" + viridianCity.region)" 402 | ] 403 | }, 404 | { 405 | "cell_type": "markdown", 406 | "metadata": {}, 407 | "source": [ 408 | "Acá hay [una lista](https://github.com/jupyter/jupyter/wiki/A-gallery-of-interesting-Jupyter-Notebooks) muy completa con ejemplos de _Notebooks_, tan solo para que veas todo el poder que estos tienen. Te invito a que los sigas usando y te diviertas un buen rato con ellos.\n", 409 | "\n", 410 | "Y pues bien, por el momento es todo, si quieres saber más házmelo saber a [@io_exception](https://twitter.com/io_exception) o a [feregrino@thatcsharpguy.com](mailto:feregrino@thatcsharpguy.com)" 411 | ] 412 | }, 413 | { 414 | "cell_type": "code", 415 | "execution_count": null, 416 | "metadata": { 417 | "collapsed": true 418 | }, 419 | "outputs": [], 420 | "source": [] 421 | } 422 | ], 423 | "metadata": { 424 | "kernelspec": { 425 | "display_name": "Python 3", 426 | "language": "python", 427 | "name": "python3" 428 | }, 429 | "language_info": { 430 | "codemirror_mode": { 431 | "name": "ipython", 432 | "version": 3 433 | }, 434 | "file_extension": ".py", 435 | "mimetype": "text/x-python", 436 | "name": "python", 437 | "nbconvert_exporter": "python", 438 | "pygments_lexer": "ipython3", 439 | "version": "3.6.2" 440 | } 441 | }, 442 | "nbformat": 4, 443 | "nbformat_minor": 2 444 | } 445 | -------------------------------------------------------------------------------- /mt-scraper.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 
| "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "antonio-feregrino-bolanos\n" 13 | ] 14 | } 15 | ], 16 | "source": [ 17 | "import requests\n", 18 | "import re\n", 19 | "import pandas as pd\n", 20 | "import numpy as np\n", 21 | "import datetime\n", 22 | "import slugify\n", 23 | "import os\n", 24 | "from urllib.parse import urljoin\n", 25 | "from bs4 import BeautifulSoup\n", 26 | "\n", 27 | "print(slugify.slugify('Antonio Feregrino Bolaños'))\n", 28 | "\n", 29 | "if not os.path.exists(\"mt-scraper\"):\n", 30 | " os.makedirs(\"mt-scraper/defensiva\")\n", 31 | " os.makedirs(\"mt-scraper/ofensiva\")" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": { 38 | "collapsed": true 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "# Get base page\n", 43 | "base_url = \"http://www.mediotiempo.com/liga/futbol/ligamx/tabla-general/\"\n", 44 | "base_page = requests.get(base_url).text\n", 45 | "base_soup = BeautifulSoup(base_page, \"lxml\")" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": { 52 | "collapsed": true 53 | }, 54 | "outputs": [], 55 | "source": [ 56 | "tournament_container = base_soup.find(\"div\", { \"class\" : \"dropdown-container\" })\n", 57 | "ul = tournament_container.find('ul')\n", 58 | "tournaments = []\n", 59 | "for li in ul.findAll('li'):\n", 60 | " tournaments.append(li.get('value'))" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": { 67 | "collapsed": true 68 | }, 69 | "outputs": [], 70 | "source": [ 71 | "# Get tables\n", 72 | "c = { 'Team': 0, 'PTS':1, 'JJ':2, 'DG':3, 'JG':4,'JE':5, 'JP':6, 'GF': 7, 'GC': 8 }\n", 73 | "print(\"Tournaments\", len(tournaments))\n", 74 | "scraped = {}\n", 75 | "for tournament in tournaments:\n", 76 | " results = []\n", 77 | " url = urljoin(base_url, tournament)\n", 78 | " tournament_page = requests.get(url).text\n", 79 | " 
tournament_soup = BeautifulSoup(tournament_page, \"lxml\")\n", 80 | " tables = tournament_soup.findAll(\"div\", { \"class\" :'table-positions' })\n", 81 | " for table in tables:\n", 82 | " # need to find the table inside div.scroll:\n", 83 | " table = table.find('div', {'class':'scroll'}).find('table', {'class':'mt-table'})\n", 84 | " rows = table.tbody.findAll('tr')\n", 85 | " for row in rows:\n", 86 | " tds = row.findAll('td')\n", 87 | " team = tds[c['Team']].text.strip()\n", 88 | " pts = tds[c['PTS']].text.strip()\n", 89 | " jj = tds[c['JJ']].text.strip()\n", 90 | " dg = tds[c['DG']].text.strip()\n", 91 | " jg = tds[c['JG']].text.strip()\n", 92 | " je = tds[c['JE']].text.strip()\n", 93 | " jp = tds[c['JP']].text.strip()\n", 94 | " gf = tds[c['GF']].text.strip()\n", 95 | " gc = tds[c['GC']].text.strip()\n", 96 | " team_stat = {\n", 97 | " 'team':team,\n", 98 | " 'pts':pts,\n", 99 | " 'jj':jj,\n", 100 | " 'dg':dg,\n", 101 | " 'jg':jg,\n", 102 | " 'je':je,\n", 103 | " 'jp':jp,\n", 104 | " 'gf':gf,\n", 105 | " 'gc':gc\n", 106 | " }\n", 107 | " results.append(team_stat)\n", 108 | " scraped[tournament] = results" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": { 115 | "collapsed": true 116 | }, 117 | "outputs": [], 118 | "source": [ 119 | "torneo_largo = re.compile('(\\w+)-([0-9]{4})-+([0-9]{4})')\n", 120 | "torneo_corto = re.compile('(\\w+)-([0-9]{4})')" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "metadata": { 127 | "collapsed": true, 128 | "scrolled": false 129 | }, 130 | "outputs": [], 131 | "source": [ 132 | "dos = set(['invierno', 'apertura'])\n", 133 | "uno = set(['verano', 'clausura', 'bicentenario'])\n", 134 | "intermediate = []\n", 135 | "for torneo in scraped:\n", 136 | " match_torneo_largo = torneo_largo.search(torneo)\n", 137 | " match_torneo_corto = torneo_corto.search(torneo)\n", 138 | " name = ''\n", 139 | " if match_torneo_largo:\n", 140 | " t = 
match_torneo_largo.group(1)\n", 141 | " inicio = int(match_torneo_largo.group(2).upper())\n", 142 | " fin = int(match_torneo_largo.group(3).upper())\n", 143 | " if t == \"temporada\":\n", 144 | " name = \"Temporada de \" + str(inicio) + \" a \"+ str(fin)\n", 145 | " if t == \"liguilla\":\n", 146 | " name = \"Liguilla de \" + str(inicio) + \" a \"+ str(fin)\n", 147 | " elif match_torneo_corto:\n", 148 | " t = match_torneo_corto.group(1)\n", 149 | " c = int(match_torneo_corto.group(2).upper())\n", 150 | " if t in dos:\n", 151 | " name = \"Torneo corto \" + str(c) + \"-2\"\n", 152 | " if t in uno:\n", 153 | " name = \"Torneo corto \" + str(c) + \"-1\"\n", 154 | " for result in scraped[torneo]:\n", 155 | " intermediate.append([\n", 156 | " torneo,\n", 157 | " result['team'],\n", 158 | " result['pts'],\n", 159 | " result['jj'],\n", 160 | " result['dg'],\n", 161 | " result['jg'],\n", 162 | " result['je'],\n", 163 | " result['jp'],\n", 164 | " result['gf'],\n", 165 | " result['gc']\n", 166 | " ])\n", 167 | "tournament_df = pd.DataFrame(intermediate)\n", 168 | "tournament_df.columns = ['tournament', 'team', 'pts', 'jj', 'dg', 'jg','je', 'jp', 'gf', 'gc']\n", 169 | "tournament_df.set_index(['tournament', 'team'], inplace=True)\n", 170 | "print(tournament_df.tail())" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "metadata": { 177 | "collapsed": true 178 | }, 179 | "outputs": [], 180 | "source": [ 181 | "tournament_df.to_csv('mt-scraper/tournaments.csv')" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "metadata": { 188 | "collapsed": true 189 | }, 190 | "outputs": [], 191 | "source": [ 192 | "tournament_df = pd.read_csv('mt-scraper/tournaments.csv', index_col=[0, 1])\n", 193 | "print(tournament_df.head())" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": null, 199 | "metadata": { 200 | "collapsed": true 201 | }, 202 | "outputs": [], 203 | "source": [ 204 | 
"all_tournaments = tournament_df.index.get_level_values(0).unique().values\n", 205 | "url = \"http://www.mediotiempo.com/liga/futbol/ligamx/calendario/\"\n", 206 | "seasons = []\n", 207 | "seasonRounds = []\n", 208 | "rounds = []\n", 209 | "for season in all_tournaments:\n", 210 | " #print(\"Scraping season\", season)\n", 211 | " season_url = url + season + \"/\"  # use the loop variable, not the stale `tournament` left over from an earlier cell\n", 212 | " season_page = requests.get(season_url).text\n", 213 | " season_soup = BeautifulSoup(season_page, \"lxml\")\n", 214 | " seasonRound_ul = season_soup.find('ul', { \"name\": \"seasonRound\"})\n", 215 | " if seasonRound_ul is None: \n", 216 | " continue\n", 217 | " for li in seasonRound_ul.findAll('li'):\n", 218 | " seasonRound = li.get('value')\n", 219 | " seasonRound_url = season_url + seasonRound + \"/\"\n", 220 | " seasonRound_page = requests.get(seasonRound_url).text\n", 221 | " seasonRound_soup = BeautifulSoup(seasonRound_page, \"lxml\")\n", 222 | " round_ul = seasonRound_soup.find('ul', { \"name\": \"round\"})\n", 223 | " if round_ul is None: \n", 224 | " continue\n", 225 | " for li in round_ul.findAll('li'):\n", 226 | " _round = li.get('value')\n", 227 | " seasons.append(season)\n", 228 | " seasonRounds.append(seasonRound)\n", 229 | " rounds.append(_round)\n", 230 | " \n", 231 | "print(\"Found\",len(seasons), len(seasonRounds), len(rounds), \"rounds\")" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": null, 237 | "metadata": { 238 | "collapsed": true 239 | }, 240 | "outputs": [], 241 | "source": [ 242 | "url = \"http://www.mediotiempo.com/liga/futbol/ligamx/calendario/%s/%s/%s\"\n", 243 | "\n", 244 | "matches_seasons = []\n", 245 | "matches_seasonRounds = []\n", 246 | "matches_rounds = []\n", 247 | "matches_date = []\n", 248 | "matches_time = []\n", 249 | "matches_home_team = []\n", 250 | "matches_result = []\n", 251 | "matches_away_team = []\n", 252 | "matches_venue = []\n", 253 | "\n", 254 | "for season,seasonRound,_round in 
zip(seasons,seasonRounds,rounds):\n", 255 | " query_url = url % (season,seasonRound,_round)\n", 256 | "# print(query_url)\n", 257 | " scrape = requests.get(query_url).text\n", 258 | " scrape_soup = BeautifulSoup(scrape, \"lxml\")\n", 259 | " calendar_groups = scrape_soup.findAll('div', {\"class\":\"mt-calendar-group\"})\n", 260 | " for calendar_group in calendar_groups:\n", 261 | " date = calendar_group.find('div', {\"class\":\"calendar-date-wrapper\"}).text.strip()\n", 262 | " match_wrappers = calendar_group.findAll('div', {\"class\":\"mt-calendar-match\"},recursive=False)\n", 263 | " for match_wrapper in match_wrappers:\n", 264 | " try:\n", 265 | " divs = match_wrapper.findAll('div')\n", 266 | " time = divs[0].text.strip()\n", 267 | " _as = divs[1].findAll('a')\n", 268 | " home_team = _as[0].text.strip()\n", 269 | " result = _as[1].text.replace(\"\\n\", \" \").strip()\n", 270 | " away_team = _as[2].text.strip()\n", 271 | " venue_div = divs[1].find('div', {'class':'venue-wrapper'})\n", 272 | " if venue_div is None:\n", 273 | " venue = ''\n", 274 | " else:\n", 275 | " venue = venue_div.text.strip()\n", 276 | "\n", 277 | " matches_seasons.append(season)\n", 278 | " matches_seasonRounds.append(seasonRound)\n", 279 | " matches_rounds.append(_round)\n", 280 | " matches_date.append(date)\n", 281 | " matches_time.append(time)\n", 282 | " matches_home_team.append(home_team)\n", 283 | " matches_result.append(result)\n", 284 | " matches_away_team.append(away_team)\n", 285 | " matches_venue.append(venue)\n", 286 | " except:\n", 287 | " print(\"Error\", query_url)\n", 288 | "print(\"Done scraping\")" 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": null, 294 | "metadata": { 295 | "collapsed": true 296 | }, 297 | "outputs": [], 298 | "source": [ 299 | "matches_df = pd.DataFrame({\n", 300 | " 'season' : matches_seasons,\n", 301 | " 'season_round': matches_seasonRounds,\n", 302 | " 'round': matches_rounds,\n", 303 | " 'date': matches_date,\n", 304 | " 
'time': matches_time,\n", 305 | " 'home_team': matches_home_team,\n", 306 | " 'result': matches_result,\n", 307 | " 'away_team': matches_away_team,\n", 308 | " 'venue': matches_venue\n", 309 | "})\n", 310 | "\n", 311 | "print(matches_df.info())\n", 312 | "matches_df.to_csv('mt-scraper/matches_raw.csv')" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": null, 318 | "metadata": { 319 | "collapsed": true 320 | }, 321 | "outputs": [], 322 | "source": [ 323 | "matches_df = pd.read_csv('mt-scraper/matches_raw.csv', index_col=0)\n", 324 | "\n", 325 | "month_dict = {\n", 326 | " 'enero': 1,\n", 327 | " 'febrero': 2,\n", 328 | " 'marzo': 3,\n", 329 | " 'abril':4,\n", 330 | " 'mayo': 5,\n", 331 | " 'junio': 6,\n", 332 | " 'julio': 7,\n", 333 | " 'agosto': 8,\n", 334 | " 'septiembre': 9,\n", 335 | " 'octubre': 10,\n", 336 | " 'noviembre': 11,\n", 337 | " 'diciembre': 12\n", 338 | "}\n", 339 | "\n", 340 | "mt_date_re = re.compile('(\\w{3})\\s([0-9]+)\\sde\\s(\\w+),\\s([0-9]{4})\\s([0-9]{2}):([0-9]{2})')\n", 341 | "def parse_dates(text_date):\n", 342 | " match = mt_date_re.search(text_date)\n", 343 | " if match:\n", 344 | " day = int(match.group(2))\n", 345 | " month = month_dict[match.group(3)] \n", 346 | " year = int(match.group(4))\n", 347 | " hour =int(match.group(5))\n", 348 | " minute =int(match.group(6))\n", 349 | " dt_str = \"%04d-%02d-%02d %02d:%02d\" % (year,month,day,hour,minute)\n", 350 | " try:\n", 351 | " return pd.to_datetime(dt_str)\n", 352 | " except:\n", 353 | " print(dt_str)\n", 354 | "\n", 355 | "mt_score = re.compile('([0-9]+)\\s*-\\s*([0-9]+)')\n", 356 | "def get_scores(raw_score):\n", 357 | " match = mt_score.search(raw_score)\n", 358 | " if match:\n", 359 | " return int(match.group(1)), int(match.group(2))\n", 360 | " return np.nan, np.nan\n", 361 | " \n", 362 | " \n", 363 | "date_time = matches_df['date'] + \" \" + matches_df[\"time\"]\n", 364 | "\n", 365 | "#matches_df['match_datetime']\n", 366 | "matches_df['match_datetime'] 
= date_time.apply(parse_dates)\n", 367 | "matches_df['home_score'], matches_df['away_score'] = zip(*matches_df['result'].apply(get_scores))\n", 368 | "matches_df.tail()\n", 369 | "\n", 370 | "#del matches_df['date'], matches_df['time'], matches_df['result']\n", 371 | "\n", 372 | "#matches_df.info()\n", 373 | "matches_df.to_csv('mt-scraper/matches_processed.csv')\n", 374 | "no_date = matches_df[matches_df['match_datetime'].isnull()]\n", 375 | "no_date.head(11)" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": null, 381 | "metadata": { 382 | "collapsed": true 383 | }, 384 | "outputs": [], 385 | "source": [ 386 | "matches_processed_df = pd.read_csv('mt-scraper/matches_processed.csv', index_col=0, parse_dates=['match_datetime'])\n", 387 | "matches_processed_df.info()" 388 | ] 389 | }, 390 | { 391 | "cell_type": "code", 392 | "execution_count": null, 393 | "metadata": { 394 | "collapsed": true 395 | }, 396 | "outputs": [], 397 | "source": [ 398 | "no_date = matches_processed_df[matches_processed_df['match_datetime'].isnull()]\n", 399 | "no_date.head(11)" 400 | ] 401 | }, 402 | { 403 | "cell_type": "code", 404 | "execution_count": null, 405 | "metadata": { 406 | "collapsed": true 407 | }, 408 | "outputs": [], 409 | "source": [ 410 | "# Get details about each match (insane):\n", 411 | "_ = '''\n", 412 | "url = \"http://www.mediotiempo.com/partido/futbol/ligamx/%s/%s/ficha\"\n", 413 | "url_alt = \"http://www.mediotiempo.com/partido/futbol/liga-mx/%s/%s/ficha\"\n", 414 | "a = matches_processed_df[['home_team','away_team','match_datetime']].values\n", 415 | "for r in a[4300:4305]:\n", 416 | " s = r[0] + \" vs \" + r[1]\n", 417 | " _url = url % (slugify.slugify(s), pd.to_datetime(r[2]).strftime(\"%Y/%m/%d\"))\n", 418 | " rq = requests.get(_url)\n", 419 | " if rq.status_code != 200:\n", 420 | " _url = url_alt % (slugify.slugify(s), pd.to_datetime(r[2]).strftime(\"%Y/%m/%d\"))\n", 421 | " rq = requests.get(_url)\n", 422 | " if rq.status_code != 
200:\n", 423 | " rq = None\n", 424 | " _url = None\n", 425 | " \n", 426 | " if rq is not None:\n", 427 | " print(_url)\n", 428 | "'''" 429 | ] 430 | }, 431 | { 432 | "cell_type": "code", 433 | "execution_count": null, 434 | "metadata": { 435 | "collapsed": true 436 | }, 437 | "outputs": [], 438 | "source": [ 439 | "all_tournaments = tournament_df.index.get_level_values(0).unique().values" 440 | ] 441 | }, 442 | { 443 | "cell_type": "code", 444 | "execution_count": null, 445 | "metadata": { 446 | "collapsed": true 447 | }, 448 | "outputs": [], 449 | "source": [ 450 | "# Ofensiva\n", 451 | "of_url = \"http://www.mediotiempo.com/liga/futbol/ligamx/estadisticas/equipos/%s/ofensiva?tabla=mas-goleadores\"\n", 452 | "ofensiva_tables = {}\n", 453 | "for season in all_tournaments:\n", 454 | " url = of_url % season\n", 455 | " r = requests.get(url)\n", 456 | " scrape_soup = BeautifulSoup(r.text, \"lxml\")\n", 457 | " table = scrape_soup.find('div', {'class': 'table-containers'})\n", 458 | " if table is not None:\n", 459 | " table_body = table.find('div', {'class': 'scroll'}).find('tbody', {'class':'mt-table-body'})\n", 460 | " rows = table_body.findAll('tr')\n", 461 | " if len(rows) == 0:\n", 462 | " continue\n", 463 | " \n", 464 | " gf = []\n", 465 | " equipo = []\n", 466 | " tt = []\n", 467 | " tg = []\n", 468 | " prec = []\n", 469 | " g_c = []\n", 470 | " ll = []\n", 471 | " lla = []\n", 472 | " fdl = []\n", 473 | " \n", 474 | " for row in rows:\n", 475 | " all_tds = row.findAll('td')\n", 476 | " equipo.append(all_tds[0].text.strip())\n", 477 | " if len(all_tds) == 2:\n", 478 | " gf.append(int(all_tds[1].text.strip()))\n", 479 | " tt.append(np.nan)\n", 480 | " tg.append(np.nan)\n", 481 | " prec.append(np.nan)\n", 482 | " g_c.append(np.nan)\n", 483 | " ll.append(np.nan)\n", 484 | " lla.append(np.nan)\n", 485 | " fdl.append(np.nan)\n", 486 | " else:\n", 487 | " gf.append(int(all_tds[3].text.strip()))\n", 488 | " tt.append(int(all_tds[1].text.strip()))\n", 489 | " 
tg.append(int(all_tds[2].text.strip()))\n", 490 | " prec.append(all_tds[4].text.strip())\n", 491 | " g_c.append(float(all_tds[5].text.strip()))\n", 492 | " ll.append(int(all_tds[6].text.strip()))\n", 493 | " lla.append(int(all_tds[7].text.strip()))\n", 494 | " fdl.append(int(all_tds[8].text.strip()))\n", 495 | " ofensiva_tables[season] = pd.DataFrame({'Equipo': equipo, \n", 496 | " 'GF': gf,\n", 497 | " 'TT': tt,\n", 498 | " 'TG': tg,\n", 499 | " 'PREC': prec,\n", 500 | " 'G_C': g_c,\n", 501 | " 'LL': ll,\n", 502 | " 'LLA': lla,\n", 503 | " 'FDL': fdl})" 504 | ] 505 | }, 506 | { 507 | "cell_type": "code", 508 | "execution_count": null, 509 | "metadata": { 510 | "collapsed": true 511 | }, 512 | "outputs": [], 513 | "source": [ 514 | "# Defensiva\n", 515 | "of_url = \"http://www.mediotiempo.com/liga/futbol/ligamx/estadisticas/equipos/%s/defensiva?tabla=menos-goleados\"\n", 516 | "defensiva_tables = {}\n", 517 | "for season in all_tournaments:\n", 518 | " url = of_url % season\n", 519 | " r = requests.get(url)\n", 520 | " scrape_soup = BeautifulSoup(r.text, \"lxml\")\n", 521 | " table = scrape_soup.find('div', {'class': 'table-containers'})\n", 522 | " if table is not None:\n", 523 | " table_body = table.find('div', {'class': 'scroll'}).find('tbody', {'class':'mt-table-body'})\n", 524 | " rows = table_body.findAll('tr')\n", 525 | " if len(rows) == 0:\n", 526 | " continue\n", 527 | " \n", 528 | " gc = []\n", 529 | " equipo = []\n", 530 | " ttp = []\n", 531 | " tgp = []\n", 532 | " blq = []\n", 533 | " tblq = []\n", 534 | " cblq = []\n", 535 | " pblq = []\n", 536 | " _int = []\n", 537 | " \n", 538 | " for row in rows:\n", 539 | " all_tds = row.findAll('td')\n", 540 | " equipo.append(all_tds[0].text.strip())\n", 541 | " if len(all_tds) == 2:\n", 542 | " gc.append(int(all_tds[1].text.strip()))\n", 543 | " ttp.append(np.nan)\n", 544 | " tgp.append(np.nan)\n", 545 | " blq.append(np.nan)\n", 546 | " tblq.append(np.nan)\n", 547 | " cblq.append(np.nan)\n", 548 | " 
pblq.append(np.nan)\n", 549 | " _int.append(np.nan)\n", 550 | " else:\n", 551 | " gc.append(int(all_tds[3].text.strip()))\n", 552 | " ttp.append(int(all_tds[1].text.strip()))\n", 553 | " tgp.append(int(all_tds[2].text.strip()))\n", 554 | " blq.append(int(all_tds[4].text.strip()))\n", 555 | " tblq.append(int(all_tds[5].text.strip()))\n", 556 | " cblq.append(int(all_tds[6].text.strip()))\n", 557 | " pblq.append(int(all_tds[7].text.strip()))\n", 558 | " _int.append(int(all_tds[8].text.strip()))\n", 559 | " defensiva_tables[season] = pd.DataFrame({'Equipo': equipo, \n", 560 | " 'TTP': ttp,\n", 561 | " 'TGP': tgp,\n", 562 | " 'GC': gc,\n", 563 | " 'BLQ': blq,\n", 564 | " 'TBLQ': tblq,\n", 565 | " 'CBLQ': cblq,\n", 566 | " 'PBLQ': pblq,\n", 567 | " 'INT': _int})" 568 | ] 569 | }, 570 | { 571 | "cell_type": "code", 572 | "execution_count": null, 573 | "metadata": { 574 | "collapsed": true 575 | }, 576 | "outputs": [], 577 | "source": [ 578 | "for season in all_tournaments:\n", 579 | " if season in defensiva_tables and season in ofensiva_tables:\n", 580 | " defensiva_tables[season].to_csv('mt-scraper/defensiva/' + season +'.csv')\n", 581 | " ofensiva_tables[season].to_csv('mt-scraper/ofensiva/' + season +'.csv')" 582 | ] 583 | }, 584 | { 585 | "cell_type": "code", 586 | "execution_count": null, 587 | "metadata": { 588 | "collapsed": true 589 | }, 590 | "outputs": [], 591 | "source": [] 592 | } 593 | ], 594 | "metadata": { 595 | "kernelspec": { 596 | "display_name": "Python 3", 597 | "language": "python", 598 | "name": "python3" 599 | }, 600 | "language_info": { 601 | "codemirror_mode": { 602 | "name": "ipython", 603 | "version": 3 604 | }, 605 | "file_extension": ".py", 606 | "mimetype": "text/x-python", 607 | "name": "python", 608 | "nbconvert_exporter": "python", 609 | "pygments_lexer": "ipython3", 610 | "version": "3.6.1" 611 | } 612 | }, 613 | "nbformat": 4, 614 | "nbformat_minor": 2 615 | } 616 | 
-------------------------------------------------------------------------------- /Scalers.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 31, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler, StandardScaler, RobustScaler\n", 12 | "import numpy as np\n", 13 | "import pandas as pd" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 32, 19 | "metadata": {}, 20 | "outputs": [ 21 | { 22 | "data": { 23 | "text/html": [ 24 | "
\n", 25 | "\n", 38 | "\n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | "
hundredsneg_thousandsoutlierstensthousands
01.0-1000.000000-3.212321e+061.01.0
112.0-777.777778-7.777778e+022.0112.0
223.0-555.555556-5.555556e+023.0223.0
334.0-333.333333-3.333333e+024.0334.0
445.0-111.111111-1.111111e+025.0445.0
556.0111.1111111.111111e+026.0556.0
667.0333.3333333.333333e+027.0667.0
778.0555.5555565.555556e+028.0778.0
889.0777.7777787.777778e+029.0889.0
9100.01000.0000003.212321e+0610.01000.0
\n", 132 | "
" 133 | ], 134 | "text/plain": [ 135 | " hundreds neg_thousands outliers tens thousands\n", 136 | "0 1.0 -1000.000000 -3.212321e+06 1.0 1.0\n", 137 | "1 12.0 -777.777778 -7.777778e+02 2.0 112.0\n", 138 | "2 23.0 -555.555556 -5.555556e+02 3.0 223.0\n", 139 | "3 34.0 -333.333333 -3.333333e+02 4.0 334.0\n", 140 | "4 45.0 -111.111111 -1.111111e+02 5.0 445.0\n", 141 | "5 56.0 111.111111 1.111111e+02 6.0 556.0\n", 142 | "6 67.0 333.333333 3.333333e+02 7.0 667.0\n", 143 | "7 78.0 555.555556 5.555556e+02 8.0 778.0\n", 144 | "8 89.0 777.777778 7.777778e+02 9.0 889.0\n", 145 | "9 100.0 1000.000000 3.212321e+06 10.0 1000.0" 146 | ] 147 | }, 148 | "execution_count": 32, 149 | "metadata": {}, 150 | "output_type": "execute_result" 151 | } 152 | ], 153 | "source": [ 154 | "tens = np.linspace(1, 10, 10)\n", 155 | "hundreds = np.linspace(1, 100, 10)\n", 156 | "thousands = np.linspace(1, 1000, 10)\n", 157 | "neg_thousands = np.linspace(-1000, 1000, 10)\n", 158 | "outliers = neg_thousands.copy()\n", 159 | "outliers[0] = -3212321\n", 160 | "outliers[9] = 3212321\n", 161 | "\n", 162 | "data = pd.DataFrame({\n", 163 | " 'tens': tens,\n", 164 | " 'hundreds': hundreds,\n", 165 | " 'thousands': thousands,\n", 166 | " 'neg_thousands': neg_thousands,\n", 167 | " 'outliers': outliers\n", 168 | "})\n", 169 | "columns = data.columns\n", 170 | "data.head(10)" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": 33, 176 | "metadata": {}, 177 | "outputs": [ 178 | { 179 | "data": { 180 | "text/html": [ 181 | "
\n", 182 | "\n", 195 | "\n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | "
hundredsneg_thousandsoutlierstensthousands
00.0000000.0000000.0000000.0000000.000000
10.1111110.1111110.4998790.1111110.111111
20.2222220.2222220.4999140.2222220.222222
30.3333330.3333330.4999480.3333330.333333
40.4444440.4444440.4999830.4444440.444444
50.5555560.5555560.5000170.5555560.555556
60.6666670.6666670.5000520.6666670.666667
70.7777780.7777780.5000860.7777780.777778
80.8888890.8888890.5001210.8888890.888889
91.0000001.0000001.0000001.0000001.000000
\n", 289 | "
" 290 | ], 291 | "text/plain": [ 292 | " hundreds neg_thousands outliers tens thousands\n", 293 | "0 0.000000 0.000000 0.000000 0.000000 0.000000\n", 294 | "1 0.111111 0.111111 0.499879 0.111111 0.111111\n", 295 | "2 0.222222 0.222222 0.499914 0.222222 0.222222\n", 296 | "3 0.333333 0.333333 0.499948 0.333333 0.333333\n", 297 | "4 0.444444 0.444444 0.499983 0.444444 0.444444\n", 298 | "5 0.555556 0.555556 0.500017 0.555556 0.555556\n", 299 | "6 0.666667 0.666667 0.500052 0.666667 0.666667\n", 300 | "7 0.777778 0.777778 0.500086 0.777778 0.777778\n", 301 | "8 0.888889 0.888889 0.500121 0.888889 0.888889\n", 302 | "9 1.000000 1.000000 1.000000 1.000000 1.000000" 303 | ] 304 | }, 305 | "execution_count": 33, 306 | "metadata": {}, 307 | "output_type": "execute_result" 308 | } 309 | ], 310 | "source": [ 311 | "minmax = MinMaxScaler()\n", 312 | "minmax.fit(data)\n", 313 | "data_minmax_scaled = minmax.transform(data)\n", 314 | "pd.DataFrame(data_minmax_scaled, columns=columns)" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": 34, 320 | "metadata": {}, 321 | "outputs": [ 322 | { 323 | "data": { 324 | "text/html": [ 325 | "
\n", 326 | "\n", 339 | "\n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | "
hundredsneg_thousandsoutlierstensthousands
00.01-1.000000-1.0000000.10.001
10.12-0.777778-0.0002420.20.112
20.23-0.555556-0.0001730.30.223
30.34-0.333333-0.0001040.40.334
40.45-0.111111-0.0000350.50.445
50.560.1111110.0000350.60.556
60.670.3333330.0001040.70.667
70.780.5555560.0001730.80.778
80.890.7777780.0002420.90.889
91.001.0000001.0000001.01.000
\n", 433 | "
" 434 | ], 435 | "text/plain": [ 436 | " hundreds neg_thousands outliers tens thousands\n", 437 | "0 0.01 -1.000000 -1.000000 0.1 0.001\n", 438 | "1 0.12 -0.777778 -0.000242 0.2 0.112\n", 439 | "2 0.23 -0.555556 -0.000173 0.3 0.223\n", 440 | "3 0.34 -0.333333 -0.000104 0.4 0.334\n", 441 | "4 0.45 -0.111111 -0.000035 0.5 0.445\n", 442 | "5 0.56 0.111111 0.000035 0.6 0.556\n", 443 | "6 0.67 0.333333 0.000104 0.7 0.667\n", 444 | "7 0.78 0.555556 0.000173 0.8 0.778\n", 445 | "8 0.89 0.777778 0.000242 0.9 0.889\n", 446 | "9 1.00 1.000000 1.000000 1.0 1.000" 447 | ] 448 | }, 449 | "execution_count": 34, 450 | "metadata": {}, 451 | "output_type": "execute_result" 452 | } 453 | ], 454 | "source": [ 455 | "maxabs = MaxAbsScaler()\n", 456 | "maxabs.fit(data)\n", 457 | "data_maxabs_scaled = maxabs.transform(data)\n", 458 | "pd.DataFrame(data_maxabs_scaled, columns=columns)" 459 | ] 460 | }, 461 | { 462 | "cell_type": "code", 463 | "execution_count": 38, 464 | "metadata": { 465 | "scrolled": true 466 | }, 467 | "outputs": [ 468 | { 469 | "data": { 470 | "text/html": [ 471 | "
\n", 472 | "\n", 485 | "\n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | "
hundredsneg_thousandsoutlierstensthousands
0-1.000000-1.000000-3212.321000-1.000000-1.000000
1-0.777778-0.777778-0.777778-0.777778-0.777778
2-0.555556-0.555556-0.555556-0.555556-0.555556
3-0.333333-0.333333-0.333333-0.333333-0.333333
4-0.111111-0.111111-0.111111-0.111111-0.111111
50.1111110.1111110.1111110.1111110.111111
60.3333330.3333330.3333330.3333330.333333
70.5555560.5555560.5555560.5555560.555556
80.7777780.7777780.7777780.7777780.777778
91.0000001.0000003212.3210001.0000001.000000
\n", 579 | "
" 580 | ], 581 | "text/plain": [ 582 | " hundreds neg_thousands outliers tens thousands\n", 583 | "0 -1.000000 -1.000000 -3212.321000 -1.000000 -1.000000\n", 584 | "1 -0.777778 -0.777778 -0.777778 -0.777778 -0.777778\n", 585 | "2 -0.555556 -0.555556 -0.555556 -0.555556 -0.555556\n", 586 | "3 -0.333333 -0.333333 -0.333333 -0.333333 -0.333333\n", 587 | "4 -0.111111 -0.111111 -0.111111 -0.111111 -0.111111\n", 588 | "5 0.111111 0.111111 0.111111 0.111111 0.111111\n", 589 | "6 0.333333 0.333333 0.333333 0.333333 0.333333\n", 590 | "7 0.555556 0.555556 0.555556 0.555556 0.555556\n", 591 | "8 0.777778 0.777778 0.777778 0.777778 0.777778\n", 592 | "9 1.000000 1.000000 3212.321000 1.000000 1.000000" 593 | ] 594 | }, 595 | "execution_count": 38, 596 | "metadata": {}, 597 | "output_type": "execute_result" 598 | } 599 | ], 600 | "source": [ 601 | "robust = RobustScaler()\n", 602 | "robust.fit(data)\n", 603 | "data_robust_scaled = robust.transform(data)\n", 604 | "pd.DataFrame(data_robust_scaled, columns=columns)" 605 | ] 606 | }, 607 | { 608 | "cell_type": "code", 609 | "execution_count": 39, 610 | "metadata": {}, 611 | "outputs": [ 612 | { 613 | "data": { 614 | "text/html": [ 615 | "
\n", 616 | "\n", 629 | "\n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | "
hundredsneg_thousandsoutlierstensthousands
0-1.566699-1.566699-2.236068-1.566699-1.566699
1-1.218544-1.218544-0.000541-1.218544-1.218544
2-0.870388-0.870388-0.000387-0.870388-0.870388
3-0.522233-0.522233-0.000232-0.522233-0.522233
4-0.174078-0.174078-0.000077-0.174078-0.174078
50.1740780.1740780.0000770.1740780.174078
60.5222330.5222330.0002320.5222330.522233
70.8703880.8703880.0003870.8703880.870388
81.2185441.2185440.0005411.2185441.218544
91.5666991.5666992.2360681.5666991.566699
\n", 723 | "
" 724 | ], 725 | "text/plain": [ 726 | " hundreds neg_thousands outliers tens thousands\n", 727 | "0 -1.566699 -1.566699 -2.236068 -1.566699 -1.566699\n", 728 | "1 -1.218544 -1.218544 -0.000541 -1.218544 -1.218544\n", 729 | "2 -0.870388 -0.870388 -0.000387 -0.870388 -0.870388\n", 730 | "3 -0.522233 -0.522233 -0.000232 -0.522233 -0.522233\n", 731 | "4 -0.174078 -0.174078 -0.000077 -0.174078 -0.174078\n", 732 | "5 0.174078 0.174078 0.000077 0.174078 0.174078\n", 733 | "6 0.522233 0.522233 0.000232 0.522233 0.522233\n", 734 | "7 0.870388 0.870388 0.000387 0.870388 0.870388\n", 735 | "8 1.218544 1.218544 0.000541 1.218544 1.218544\n", 736 | "9 1.566699 1.566699 2.236068 1.566699 1.566699" 737 | ] 738 | }, 739 | "execution_count": 39, 740 | "metadata": {}, 741 | "output_type": "execute_result" 742 | } 743 | ], 744 | "source": [ 745 | "standard = StandardScaler()\n", 746 | "standard.fit(data)\n", 747 | "data_standard_scaled = standard.transform(data)\n", 748 | "pd.DataFrame(data_standard_scaled, columns=columns)" 749 | ] 750 | }, 751 | { 752 | "cell_type": "code", 753 | "execution_count": null, 754 | "metadata": { 755 | "collapsed": true 756 | }, 757 | "outputs": [], 758 | "source": [] 759 | } 760 | ], 761 | "metadata": { 762 | "kernelspec": { 763 | "display_name": "Python 3", 764 | "language": "python", 765 | "name": "python3" 766 | }, 767 | "language_info": { 768 | "codemirror_mode": { 769 | "name": "ipython", 770 | "version": 3 771 | }, 772 | "file_extension": ".py", 773 | "mimetype": "text/x-python", 774 | "name": "python", 775 | "nbconvert_exporter": "python", 776 | "pygments_lexer": "ipython3", 777 | "version": "3.6.1" 778 | } 779 | }, 780 | "nbformat": 4, 781 | "nbformat_minor": 2 782 | } 783 | -------------------------------------------------------------------------------- /bokeh.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": {}, 
7 | "outputs": [], 8 | "source": [ 9 | "from bokeh.plotting import figure, output_file, show\n", 10 | "from bokeh.io import output_notebook" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 4, 16 | "metadata": { 17 | "collapsed": true 18 | }, 19 | "outputs": [], 20 | "source": [ 21 | "x = [1,3,4,7]\n", 22 | "y = [6,4,6,1]" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 8, 28 | "metadata": { 29 | "collapsed": true 30 | }, 31 | "outputs": [], 32 | "source": [ 33 | "plot = figure(plot_width=400, plot_height=400, tools='pan,box_zoom')" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 9, 39 | "metadata": {}, 40 | "outputs": [ 41 | { 42 | "data": { 43 | "text/html": [ 44 | "
GlyphRenderer(
id = 'f8a3c423-e17d-4b1d-8f81-1d3840e4a9d0', …)
data_source = ColumnDataSource(id='06745886-9df8-4077-abc5-2578eb728cf0', ...),
glyph = Circle(id='1f83b727-488a-4abc-b9ca-19a01e960f6a', ...),
hover_glyph = None,
js_event_callbacks = {},
js_property_callbacks = {},
level = 'glyph',
muted = False,
muted_glyph = None,
name = None,
nonselection_glyph = Circle(id='3ceb6d9a-e1aa-458f-91ff-b51dfee9a77c', ...),
selection_glyph = None,
subscribed_events = [],
tags = [],
view = CDSView(id='df97428e-f895-490d-9594-340e90e9de57', ...),
visible = True,
x_range_name = 'default',
y_range_name = 'default')
\n", 45 | "\n" 60 | ], 61 | "text/plain": [ 62 | "GlyphRenderer(id='f8a3c423-e17d-4b1d-8f81-1d3840e4a9d0', ...)" 63 | ] 64 | }, 65 | "execution_count": 9, 66 | "metadata": {}, 67 | "output_type": "execute_result" 68 | } 69 | ], 70 | "source": [ 71 | "plot.circle(x, y)" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 11, 77 | "metadata": {}, 78 | "outputs": [ 79 | { 80 | "data": { 81 | "text/html": [ 82 | "\n", 83 | "
\n", 84 | " \n", 85 | " Loading BokehJS ...\n", 86 | "
" 87 | ] 88 | }, 89 | "metadata": {}, 90 | "output_type": "display_data" 91 | }, 92 | { 93 | "data": { 94 | "application/javascript": [ 95 | "\n", 96 | "(function(root) {\n", 97 | " function now() {\n", 98 | " return new Date();\n", 99 | " }\n", 100 | "\n", 101 | " var force = true;\n", 102 | "\n", 103 | " if (typeof (root._bokeh_onload_callbacks) === \"undefined\" || force === true) {\n", 104 | " root._bokeh_onload_callbacks = [];\n", 105 | " root._bokeh_is_loading = undefined;\n", 106 | " }\n", 107 | "\n", 108 | "\n", 109 | " \n", 110 | " if (typeof (root._bokeh_timeout) === \"undefined\" || force === true) {\n", 111 | " root._bokeh_timeout = Date.now() + 5000;\n", 112 | " root._bokeh_failed_load = false;\n", 113 | " }\n", 114 | "\n", 115 | " var NB_LOAD_WARNING = {'data': {'text/html':\n", 116 | " \"
\\n\"+\n", 117 | " \"

\\n\"+\n", 118 | " \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n", 119 | " \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n", 120 | " \"

\\n\"+\n", 121 | " \"
    \\n\"+\n", 122 | " \"
  • re-rerun `output_notebook()` to attempt to load from CDN again, or
  • \\n\"+\n", 123 | " \"
  • use INLINE resources instead, as so:
  • \\n\"+\n", 124 | " \"
\\n\"+\n", 125 | " \"\\n\"+\n", 126 | " \"from bokeh.resources import INLINE\\n\"+\n", 127 | " \"output_notebook(resources=INLINE)\\n\"+\n", 128 | " \"\\n\"+\n", 129 | " \"
\"}};\n", 130 | "\n", 131 | " function display_loaded() {\n", 132 | " if (root.Bokeh !== undefined) {\n", 133 | " var el = document.getElementById(\"b0b348ba-b56f-4ff7-b7e4-9ce1f4aabd8e\");\n", 134 | " if (el != null) {\n", 135 | " el.textContent = \"BokehJS \" + Bokeh.version + \" successfully loaded.\";\n", 136 | " }\n", 137 | " } else if (Date.now() < root._bokeh_timeout) {\n", 138 | " setTimeout(display_loaded, 100)\n", 139 | " }\n", 140 | " }\n", 141 | "\n", 142 | "\n", 143 | " function run_callbacks() {\n", 144 | " try {\n", 145 | " root._bokeh_onload_callbacks.forEach(function(callback) { callback() });\n", 146 | " }\n", 147 | " finally {\n", 148 | " delete root._bokeh_onload_callbacks\n", 149 | " }\n", 150 | " console.info(\"Bokeh: all callbacks have finished\");\n", 151 | " }\n", 152 | "\n", 153 | " function load_libs(js_urls, callback) {\n", 154 | " root._bokeh_onload_callbacks.push(callback);\n", 155 | " if (root._bokeh_is_loading > 0) {\n", 156 | " console.log(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n", 157 | " return null;\n", 158 | " }\n", 159 | " if (js_urls == null || js_urls.length === 0) {\n", 160 | " run_callbacks();\n", 161 | " return null;\n", 162 | " }\n", 163 | " console.log(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n", 164 | " root._bokeh_is_loading = js_urls.length;\n", 165 | " for (var i = 0; i < js_urls.length; i++) {\n", 166 | " var url = js_urls[i];\n", 167 | " var s = document.createElement('script');\n", 168 | " s.src = url;\n", 169 | " s.async = false;\n", 170 | " s.onreadystatechange = s.onload = function() {\n", 171 | " root._bokeh_is_loading--;\n", 172 | " if (root._bokeh_is_loading === 0) {\n", 173 | " console.log(\"Bokeh: all BokehJS libraries loaded\");\n", 174 | " run_callbacks()\n", 175 | " }\n", 176 | " };\n", 177 | " s.onerror = function() {\n", 178 | " console.warn(\"failed to load library \" + url);\n", 179 | " };\n", 180 | " console.log(\"Bokeh: injecting script 
tag for BokehJS library: \", url);\n", 181 | " document.getElementsByTagName(\"head\")[0].appendChild(s);\n", 182 | " }\n", 183 | " };var element = document.getElementById(\"b0b348ba-b56f-4ff7-b7e4-9ce1f4aabd8e\");\n", 184 | " if (element == null) {\n", 185 | " console.log(\"Bokeh: ERROR: autoload.js configured with elementid 'b0b348ba-b56f-4ff7-b7e4-9ce1f4aabd8e' but no matching script tag was found. \")\n", 186 | " return false;\n", 187 | " }\n", 188 | "\n", 189 | " var js_urls = [\"https://cdn.pydata.org/bokeh/release/bokeh-0.12.7.min.js\", \"https://cdn.pydata.org/bokeh/release/bokeh-widgets-0.12.7.min.js\", \"https://cdn.pydata.org/bokeh/release/bokeh-tables-0.12.7.min.js\", \"https://cdn.pydata.org/bokeh/release/bokeh-gl-0.12.7.min.js\"];\n", 190 | "\n", 191 | " var inline_js = [\n", 192 | " function(Bokeh) {\n", 193 | " Bokeh.set_log_level(\"info\");\n", 194 | " },\n", 195 | " \n", 196 | " function(Bokeh) {\n", 197 | " \n", 198 | " },\n", 199 | " \n", 200 | " function(Bokeh) {\n", 201 | " \n", 202 | " document.getElementById(\"b0b348ba-b56f-4ff7-b7e4-9ce1f4aabd8e\").textContent = \"BokehJS is loading...\";\n", 203 | " },\n", 204 | " function(Bokeh) {\n", 205 | " console.log(\"Bokeh: injecting CSS: https://cdn.pydata.org/bokeh/release/bokeh-0.12.7.min.css\");\n", 206 | " Bokeh.embed.inject_css(\"https://cdn.pydata.org/bokeh/release/bokeh-0.12.7.min.css\");\n", 207 | " console.log(\"Bokeh: injecting CSS: https://cdn.pydata.org/bokeh/release/bokeh-widgets-0.12.7.min.css\");\n", 208 | " Bokeh.embed.inject_css(\"https://cdn.pydata.org/bokeh/release/bokeh-widgets-0.12.7.min.css\");\n", 209 | " console.log(\"Bokeh: injecting CSS: https://cdn.pydata.org/bokeh/release/bokeh-tables-0.12.7.min.css\");\n", 210 | " Bokeh.embed.inject_css(\"https://cdn.pydata.org/bokeh/release/bokeh-tables-0.12.7.min.css\");\n", 211 | " }\n", 212 | " ];\n", 213 | "\n", 214 | " function run_inline_js() {\n", 215 | " \n", 216 | " if ((root.Bokeh !== undefined) || (force === true)) {\n", 217 
| " for (var i = 0; i < inline_js.length; i++) {\n", 218 | " inline_js[i].call(root, root.Bokeh);\n", 219 | " }if (force === true) {\n", 220 | " display_loaded();\n", 221 | " }} else if (Date.now() < root._bokeh_timeout) {\n", 222 | " setTimeout(run_inline_js, 100);\n", 223 | " } else if (!root._bokeh_failed_load) {\n", 224 | " console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n", 225 | " root._bokeh_failed_load = true;\n", 226 | " } else if (force !== true) {\n", 227 | " var cell = $(document.getElementById(\"b0b348ba-b56f-4ff7-b7e4-9ce1f4aabd8e\")).parents('.cell').data().cell;\n", 228 | " cell.output_area.append_execute_result(NB_LOAD_WARNING)\n", 229 | " }\n", 230 | "\n", 231 | " }\n", 232 | "\n", 233 | " if (root._bokeh_is_loading === 0) {\n", 234 | " console.log(\"Bokeh: BokehJS loaded, going straight to plotting\");\n", 235 | " run_inline_js();\n", 236 | " } else {\n", 237 | " load_libs(js_urls, function() {\n", 238 | " console.log(\"Bokeh: BokehJS plotting callback run at\", now());\n", 239 | " run_inline_js();\n", 240 | " });\n", 241 | " }\n", 242 | "}(window));" 243 | ] 244 | }, 245 | "metadata": {}, 246 | "output_type": "display_data" 247 | }, 248 | { 249 | "data": { 250 | "text/html": [ 251 | "\n", 252 | "\n", 253 | "
\n", 254 | "
\n", 255 | "
\n", 256 | "" 400 | ] 401 | }, 402 | "metadata": {}, 403 | "output_type": "display_data" 404 | } 405 | ], 406 | "source": [ 407 | "output_notebook()\n", 408 | "output_file('x.html')\n", 409 | "show(plot)" 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": null, 415 | "metadata": { 416 | "collapsed": true 417 | }, 418 | "outputs": [], 419 | "source": [] 420 | } 421 | ], 422 | "metadata": { 423 | "kernelspec": { 424 | "display_name": "Python 3", 425 | "language": "python", 426 | "name": "python3" 427 | }, 428 | "language_info": { 429 | "codemirror_mode": { 430 | "name": "ipython", 431 | "version": 3 432 | }, 433 | "file_extension": ".py", 434 | "mimetype": "text/x-python", 435 | "name": "python", 436 | "nbconvert_exporter": "python", 437 | "pygments_lexer": "ipython3", 438 | "version": "3.6.2" 439 | } 440 | }, 441 | "nbformat": 4, 442 | "nbformat_minor": 2 443 | } 444 | -------------------------------------------------------------------------------- /insta-api.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import requests\n", 12 | "import json" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 2, 18 | "metadata": { 19 | "collapsed": true 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "payload = { \"access_token\" : \"31183420.4d93899.54d8d5ac8c444e1eb82bfad2db04cc59\" }\n", 24 | "recent_media_url = \"https://api.instagram.com/v1/users/self/media/recent\"\n", 25 | "self_url = \"https://api.instagram.com/v1/users/self\"" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 4, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "self_r = requests.get(self_url, params=payload)\n", 35 | "self_response = json.loads(self_r.text)" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | 
"execution_count": 5, 41 | "metadata": {}, 42 | "outputs": [ 43 | { 44 | "name": "stdout", 45 | "output_type": "stream", 46 | "text": [ 47 | "31183420\n" 48 | ] 49 | } 50 | ], 51 | "source": [ 52 | "user_id=self_response[\"data\"][\"id\"]\n", 53 | "print(user_id)" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 26, 59 | "metadata": { 60 | "collapsed": true 61 | }, 62 | "outputs": [], 63 | "source": [ 64 | "payload[\"count\"] = 30\n", 65 | "recent_r = requests.get(recent_media_url, params=payload)\n", 66 | "recent_response = json.loads(recent_r.text)" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 27, 72 | "metadata": {}, 73 | "outputs": [ 74 | { 75 | "name": "stdout", 76 | "output_type": "stream", 77 | "text": [ 78 | "{'pagination': {}, 'data': [{'id': '1681981992014369647_31183420', 'user': {'id': '31183420', 'full_name': 'Antonio', 'profile_picture': 'https://scontent.cdninstagram.com/t51.2885-19/s150x150/24125006_161374611141852_1337836751690924032_n.jpg', 'username': 'fferegrino'}, 'images': {'thumbnail': {'width': 150, 'height': 150, 'url': 'https://scontent.cdninstagram.com/t51.2885-15/s150x150/e35/25025257_155335971856366_1103702618208731136_n.jpg'}, 'low_resolution': {'width': 320, 'height': 320, 'url': 'https://scontent.cdninstagram.com/t51.2885-15/s320x320/e35/25025257_155335971856366_1103702618208731136_n.jpg'}, 'standard_resolution': {'width': 640, 'height': 640, 'url': 'https://scontent.cdninstagram.com/t51.2885-15/s640x640/sh0.08/e35/25025257_155335971856366_1103702618208731136_n.jpg'}}, 'created_time': '1514727900', 'caption': {'id': '17914981123041327', 'text': 'altre giorno, altre chiesa chiusa', 'created_time': '1514727900', 'from': {'id': '31183420', 'full_name': 'Antonio', 'profile_picture': 'https://scontent.cdninstagram.com/t51.2885-19/s150x150/24125006_161374611141852_1337836751690924032_n.jpg', 'username': 'fferegrino'}}, 'user_has_liked': False, 'likes': {'count': 7}, 'tags': [], 'filter': 
'Normal', 'comments': {'count': 0}, 'type': 'image', 'link': 'https://www.instagram.com/p/BdXmbXQntNv04XrUNgY0-wVbSMwBK1Aa-Ja-FY0/', 'location': None, 'attribution': None, 'users_in_photo': []}, {'id': '1681736231435551385_31183420', 'user': {'id': '31183420', 'full_name': 'Antonio', 'profile_picture': 'https://scontent.cdninstagram.com/t51.2885-19/s150x150/24125006_161374611141852_1337836751690924032_n.jpg', 'username': 'fferegrino'}, 'images': {'thumbnail': {'width': 150, 'height': 150, 'url': 'https://scontent.cdninstagram.com/t51.2885-15/s150x150/e35/c12.0.1055.1055/26183027_1563413867109927_2500941975943905280_n.jpg'}, 'low_resolution': {'width': 320, 'height': 312, 'url': 'https://scontent.cdninstagram.com/t51.2885-15/s320x320/e35/26183027_1563413867109927_2500941975943905280_n.jpg'}, 'standard_resolution': {'width': 640, 'height': 625, 'url': 'https://scontent.cdninstagram.com/t51.2885-15/s640x640/sh0.08/e35/26183027_1563413867109927_2500941975943905280_n.jpg'}}, 'created_time': '1514698603', 'caption': None, 'user_has_liked': False, 'likes': {'count': 12}, 'tags': [], 'filter': 'Normal', 'comments': {'count': 1}, 'type': 'image', 'link': 'https://www.instagram.com/p/BdWujE4nsKZ0E0UtcwIc83vrowPD581WUm6CGw0/', 'location': None, 'attribution': None, 'users_in_photo': []}, {'id': '1680702603817386395_31183420', 'user': {'id': '31183420', 'full_name': 'Antonio', 'profile_picture': 'https://scontent.cdninstagram.com/t51.2885-19/s150x150/24125006_161374611141852_1337836751690924032_n.jpg', 'username': 'fferegrino'}, 'images': {'thumbnail': {'width': 150, 'height': 150, 'url': 'https://scontent.cdninstagram.com/t51.2885-15/s150x150/e35/25026020_1858431304447509_6023051016792965120_n.jpg'}, 'low_resolution': {'width': 320, 'height': 320, 'url': 'https://scontent.cdninstagram.com/t51.2885-15/s320x320/e35/25026020_1858431304447509_6023051016792965120_n.jpg'}, 'standard_resolution': {'width': 640, 'height': 640, 'url': 
'https://scontent.cdninstagram.com/t51.2885-15/s640x640/sh0.08/e35/25026020_1858431304447509_6023051016792965120_n.jpg'}}, 'created_time': '1514575385', 'caption': {'id': '17915023630026303', 'text': 'Uffff, saca el parkour.', 'created_time': '1514575385', 'from': {'id': '31183420', 'full_name': 'Antonio', 'profile_picture': 'https://scontent.cdninstagram.com/t51.2885-19/s150x150/24125006_161374611141852_1337836751690924032_n.jpg', 'username': 'fferegrino'}}, 'user_has_liked': False, 'likes': {'count': 22}, 'tags': [], 'filter': 'Normal', 'comments': {'count': 0}, 'type': 'image', 'link': 'https://www.instagram.com/p/BdTDh0JnAGbte9B-Ds6pgPFZ4UsOV4LYGLKZU80/', 'location': None, 'attribution': None, 'users_in_photo': []}, {'id': '1680439648563563854_31183420', 'user': {'id': '31183420', 'full_name': 'Antonio', 'profile_picture': 'https://scontent.cdninstagram.com/t51.2885-19/s150x150/24125006_161374611141852_1337836751690924032_n.jpg', 'username': 'fferegrino'}, 'images': {'thumbnail': {'width': 150, 'height': 150, 'url': 'https://scontent.cdninstagram.com/t51.2885-15/s150x150/e35/26151573_135144960497382_8535851668024066048_n.jpg'}, 'low_resolution': {'width': 320, 'height': 320, 'url': 'https://scontent.cdninstagram.com/t51.2885-15/s320x320/e35/26151573_135144960497382_8535851668024066048_n.jpg'}, 'standard_resolution': {'width': 640, 'height': 640, 'url': 'https://scontent.cdninstagram.com/t51.2885-15/s640x640/sh0.08/e35/26151573_135144960497382_8535851668024066048_n.jpg'}}, 'created_time': '1514544038', 'caption': {'id': '17890735372186348', 'text': 'Igualita que en el videojuego 😘', 'created_time': '1514544038', 'from': {'id': '31183420', 'full_name': 'Antonio', 'profile_picture': 'https://scontent.cdninstagram.com/t51.2885-19/s150x150/24125006_161374611141852_1337836751690924032_n.jpg', 'username': 'fferegrino'}}, 'user_has_liked': False, 'likes': {'count': 24}, 'tags': [], 'filter': 'Normal', 'comments': {'count': 1}, 'type': 'image', 'link': 
'https://www.instagram.com/p/BdSHvT_HQ1OJlXDxBojPyVvEXBfhd3zsomLrJs0/', 'location': None, 'attribution': None, 'users_in_photo': []}, {'id': '1679910026432338103_31183420', 'user': {'id': '31183420', 'full_name': 'Antonio', 'profile_picture': 'https://scontent.cdninstagram.com/t51.2885-19/s150x150/24125006_161374611141852_1337836751690924032_n.jpg', 'username': 'fferegrino'}, 'images': {'thumbnail': {'width': 150, 'height': 150, 'url': 'https://scontent.cdninstagram.com/t51.2885-15/s150x150/e35/26153439_831026207079998_5080606892787499008_n.jpg'}, 'low_resolution': {'width': 320, 'height': 320, 'url': 'https://scontent.cdninstagram.com/t51.2885-15/s320x320/e35/26153439_831026207079998_5080606892787499008_n.jpg'}, 'standard_resolution': {'width': 640, 'height': 640, 'url': 'https://scontent.cdninstagram.com/t51.2885-15/s640x640/sh0.08/e35/26153439_831026207079998_5080606892787499008_n.jpg'}}, 'created_time': '1514480902', 'caption': {'id': '17915541166002316', 'text': 'No se roben las monedas, no sean culeros.', 'created_time': '1514480902', 'from': {'id': '31183420', 'full_name': 'Antonio', 'profile_picture': 'https://scontent.cdninstagram.com/t51.2885-19/s150x150/24125006_161374611141852_1337836751690924032_n.jpg', 'username': 'fferegrino'}}, 'user_has_liked': False, 'likes': {'count': 14}, 'tags': [], 'filter': 'Normal', 'comments': {'count': 2}, 'type': 'image', 'link': 'https://www.instagram.com/p/BdQPUS8Hdi3-W70izTto69BTmnNX1laYWe9SR80/', 'location': None, 'attribution': None, 'users_in_photo': []}, {'id': '1679021698421887813_31183420', 'user': {'id': '31183420', 'full_name': 'Antonio', 'profile_picture': 'https://scontent.cdninstagram.com/t51.2885-19/s150x150/24125006_161374611141852_1337836751690924032_n.jpg', 'username': 'fferegrino'}, 'images': {'thumbnail': {'width': 150, 'height': 150, 'url': 'https://scontent.cdninstagram.com/t51.2885-15/s150x150/e35/26181522_387098571730770_3171841448404320256_n.jpg'}, 'low_resolution': {'width': 320, 'height': 320, 
'url': 'https://scontent.cdninstagram.com/t51.2885-15/s320x320/e35/26181522_387098571730770_3171841448404320256_n.jpg'}, 'standard_resolution': {'width': 640, 'height': 640, 'url': 'https://scontent.cdninstagram.com/t51.2885-15/s640x640/sh0.08/e35/26181522_387098571730770_3171841448404320256_n.jpg'}}, 'created_time': '1514375005', 'caption': {'id': '17890693324188171', 'text': 'Vámonos a...', 'created_time': '1514375005', 'from': {'id': '31183420', 'full_name': 'Antonio', 'profile_picture': 'https://scontent.cdninstagram.com/t51.2885-19/s150x150/24125006_161374611141852_1337836751690924032_n.jpg', 'username': 'fferegrino'}}, 'user_has_liked': False, 'likes': {'count': 13}, 'tags': [], 'filter': 'Normal', 'comments': {'count': 2}, 'type': 'image', 'link': 'https://www.instagram.com/p/BdNFVbAnBNFrU2_iSC8PNecdpovQGXA0uRRDWQ0/', 'location': None, 'attribution': None, 'users_in_photo': []}, {'id': '1674999869294427671_31183420', 'user': {'id': '31183420', 'full_name': 'Antonio', 'profile_picture': 'https://scontent.cdninstagram.com/t51.2885-19/s150x150/24125006_161374611141852_1337836751690924032_n.jpg', 'username': 'fferegrino'}, 'images': {'thumbnail': {'width': 150, 'height': 150, 'url': 'https://scontent.cdninstagram.com/t51.2885-15/s150x150/e35/25015721_1320395731399431_8685616632174739456_n.jpg'}, 'low_resolution': {'width': 320, 'height': 320, 'url': 'https://scontent.cdninstagram.com/t51.2885-15/s320x320/e35/25015721_1320395731399431_8685616632174739456_n.jpg'}, 'standard_resolution': {'width': 640, 'height': 640, 'url': 'https://scontent.cdninstagram.com/t51.2885-15/s640x640/sh0.08/e35/25015721_1320395731399431_8685616632174739456_n.jpg'}}, 'created_time': '1513895566', 'caption': None, 'user_has_liked': False, 'likes': {'count': 7}, 'tags': [], 'filter': 'Normal', 'comments': {'count': 2}, 'type': 'image', 'link': 'https://www.instagram.com/p/Bc-y4GwHr4Xqxtn3yv7Mn2zZdwsyIVF1RIggmo0/', 'location': None, 'attribution': None, 'users_in_photo': []}, {'id': 
'1674032295035421447_31183420', 'user': {'id': '31183420', 'full_name': 'Antonio', 'profile_picture': 'https://scontent.cdninstagram.com/t51.2885-19/s150x150/24125006_161374611141852_1337836751690924032_n.jpg', 'username': 'fferegrino'}, 'images': {'thumbnail': {'width': 150, 'height': 150, 'url': 'https://scontent.cdninstagram.com/t51.2885-15/s150x150/e35/25014979_329774084172531_4407919425441759232_n.jpg'}, 'low_resolution': {'width': 320, 'height': 320, 'url': 'https://scontent.cdninstagram.com/t51.2885-15/s320x320/e35/25014979_329774084172531_4407919425441759232_n.jpg'}, 'standard_resolution': {'width': 640, 'height': 640, 'url': 'https://scontent.cdninstagram.com/t51.2885-15/s640x640/sh0.08/e35/25014979_329774084172531_4407919425441759232_n.jpg'}}, 'created_time': '1513780222', 'caption': {'id': '17909104462066315', 'text': '谢谢你,潘颖和媛媛', 'created_time': '1513780222', 'from': {'id': '31183420', 'full_name': 'Antonio', 'profile_picture': 'https://scontent.cdninstagram.com/t51.2885-19/s150x150/24125006_161374611141852_1337836751690924032_n.jpg', 'username': 'fferegrino'}}, 'user_has_liked': False, 'likes': {'count': 7}, 'tags': [], 'filter': 'Normal', 'comments': {'count': 9}, 'type': 'image', 'link': 'https://www.instagram.com/p/Bc7W4DAHkMHjCBOgiyqgp67Yc3Jmveo2az6yFg0/', 'location': None, 'attribution': None, 'users_in_photo': []}, {'id': '1674010133448856437_31183420', 'user': {'id': '31183420', 'full_name': 'Antonio', 'profile_picture': 'https://scontent.cdninstagram.com/t51.2885-19/s150x150/24125006_161374611141852_1337836751690924032_n.jpg', 'username': 'fferegrino'}, 'images': {'thumbnail': {'width': 150, 'height': 150, 'url': 'https://scontent.cdninstagram.com/t51.2885-15/s150x150/e35/25011930_1271132223032509_3755382961101340672_n.jpg'}, 'low_resolution': {'width': 320, 'height': 320, 'url': 'https://scontent.cdninstagram.com/t51.2885-15/s320x320/e35/25011930_1271132223032509_3755382961101340672_n.jpg'}, 'standard_resolution': {'width': 640, 'height': 640, 
'url': 'https://scontent.cdninstagram.com/t51.2885-15/s640x640/sh0.08/e35/25011930_1271132223032509_3755382961101340672_n.jpg'}}, 'created_time': '1513777580', 'caption': {'id': '17896847521091120', 'text': 'A weekend in the city', 'created_time': '1513777580', 'from': {'id': '31183420', 'full_name': 'Antonio', 'profile_picture': 'https://scontent.cdninstagram.com/t51.2885-19/s150x150/24125006_161374611141852_1337836751690924032_n.jpg', 'username': 'fferegrino'}}, 'user_has_liked': False, 'likes': {'count': 12}, 'tags': [], 'filter': 'Normal', 'comments': {'count': 0}, 'type': 'image', 'link': 'https://www.instagram.com/p/Bc7R1jan5N1N9oOpGMX8KfL1zHAdq9ylJ-ImSo0/', 'location': None, 'attribution': None, 'users_in_photo': []}, {'id': '1672145732269708444_31183420', 'user': {'id': '31183420', 'full_name': 'Antonio', 'profile_picture': 'https://scontent.cdninstagram.com/t51.2885-19/s150x150/24125006_161374611141852_1337836751690924032_n.jpg', 'username': 'fferegrino'}, 'images': {'thumbnail': {'width': 150, 'height': 150, 'url': 'https://scontent.cdninstagram.com/t51.2885-15/s150x150/e35/25005863_2011006885843364_2919785679116304384_n.jpg'}, 'low_resolution': {'width': 320, 'height': 320, 'url': 'https://scontent.cdninstagram.com/t51.2885-15/s320x320/e35/25005863_2011006885843364_2919785679116304384_n.jpg'}, 'standard_resolution': {'width': 640, 'height': 640, 'url': 'https://scontent.cdninstagram.com/t51.2885-15/s640x640/sh0.08/e35/25005863_2011006885843364_2919785679116304384_n.jpg'}}, 'created_time': '1513555326', 'caption': {'id': '17912745739029924', 'text': 'Sunset... 
at 3:45 pm ಠ_ಠ', 'created_time': '1513555326', 'from': {'id': '31183420', 'full_name': 'Antonio', 'profile_picture': 'https://scontent.cdninstagram.com/t51.2885-19/s150x150/24125006_161374611141852_1337836751690924032_n.jpg', 'username': 'fferegrino'}}, 'user_has_liked': False, 'likes': {'count': 21}, 'tags': [], 'filter': 'Normal', 'comments': {'count': 0}, 'type': 'image', 'link': 'https://www.instagram.com/p/Bc0p68gnxScyzvS8js4B1Jhftmam-scSl1w5VQ0/', 'location': None, 'attribution': None, 'users_in_photo': []}, {'id': '1668390594446992112_31183420', 'user': {'id': '31183420', 'full_name': 'Antonio', 'profile_picture': 'https://scontent.cdninstagram.com/t51.2885-19/s150x150/24125006_161374611141852_1337836751690924032_n.jpg', 'username': 'fferegrino'}, 'images': {'thumbnail': {'width': 150, 'height': 150, 'url': 'https://scontent.cdninstagram.com/t51.2885-15/s150x150/e35/25021900_302093603634224_7936031481950896128_n.jpg'}, 'low_resolution': {'width': 320, 'height': 320, 'url': 'https://scontent.cdninstagram.com/t51.2885-15/s320x320/e35/25021900_302093603634224_7936031481950896128_n.jpg'}, 'standard_resolution': {'width': 640, 'height': 640, 'url': 'https://scontent.cdninstagram.com/t51.2885-15/s640x640/sh0.08/e35/25021900_302093603634224_7936031481950896128_n.jpg'}}, 'created_time': '1513107679', 'caption': {'id': '17899060315114792', 'text': 'So close yet so far...', 'created_time': '1513107679', 'from': {'id': '31183420', 'full_name': 'Antonio', 'profile_picture': 'https://scontent.cdninstagram.com/t51.2885-19/s150x150/24125006_161374611141852_1337836751690924032_n.jpg', 'username': 'fferegrino'}}, 'user_has_liked': False, 'likes': {'count': 31}, 'tags': [], 'filter': 'Normal', 'comments': {'count': 3}, 'type': 'image', 'link': 'https://www.instagram.com/p/BcnUGf4naLwmj2yi94jhwEFZn_7vILa7sMo7ZU0/', 'location': None, 'attribution': None, 'users_in_photo': []}, {'id': '1668070783884004995_31183420', 'user': {'id': '31183420', 'full_name': 'Antonio', 
'profile_picture': 'https://scontent.cdninstagram.com/t51.2885-19/s150x150/24125006_161374611141852_1337836751690924032_n.jpg', 'username': 'fferegrino'}, 'images': {'thumbnail': {'width': 150, 'height': 150, 'url': 'https://scontent.cdninstagram.com/t51.2885-15/s150x150/e35/24332123_522726954764035_3757694933406842880_n.jpg'}, 'low_resolution': {'width': 320, 'height': 320, 'url': 'https://scontent.cdninstagram.com/t51.2885-15/s320x320/e35/24332123_522726954764035_3757694933406842880_n.jpg'}, 'standard_resolution': {'width': 640, 'height': 640, 'url': 'https://scontent.cdninstagram.com/t51.2885-15/s640x640/sh0.08/e35/24332123_522726954764035_3757694933406842880_n.jpg'}}, 'created_time': '1513069554', 'caption': {'id': '17897912872124421', 'text': 'Me maman los barcos.', 'created_time': '1513069554', 'from': {'id': '31183420', 'full_name': 'Antonio', 'profile_picture': 'https://scontent.cdninstagram.com/t51.2885-19/s150x150/24125006_161374611141852_1337836751690924032_n.jpg', 'username': 'fferegrino'}}, 'user_has_liked': False, 'likes': {'count': 11}, 'tags': [], 'filter': 'Normal', 'comments': {'count': 0}, 'type': 'image', 'link': 'https://www.instagram.com/p/BcmLYpFnQ6De0R6KFsN_M7NZgon9zeipb7WRZs0/', 'location': None, 'attribution': None, 'users_in_photo': []}, {'id': '1666813262884001340_31183420', 'user': {'id': '31183420', 'full_name': 'Antonio', 'profile_picture': 'https://scontent.cdninstagram.com/t51.2885-19/s150x150/24125006_161374611141852_1337836751690924032_n.jpg', 'username': 'fferegrino'}, 'images': {'thumbnail': {'width': 150, 'height': 150, 'url': 'https://scontent.cdninstagram.com/t51.2885-15/s150x150/e35/25006487_920484838106751_4570086678890283008_n.jpg'}, 'low_resolution': {'width': 320, 'height': 320, 'url': 'https://scontent.cdninstagram.com/t51.2885-15/s320x320/e35/25006487_920484838106751_4570086678890283008_n.jpg'}, 'standard_resolution': {'width': 640, 'height': 640, 'url': 
'https://scontent.cdninstagram.com/t51.2885-15/s640x640/sh0.08/e35/25006487_920484838106751_4570086678890283008_n.jpg'}}, 'created_time': '1512919646', 'caption': {'id': '17895482461093328', 'text': 'The Mist.', 'created_time': '1512919646', 'from': {'id': '31183420', 'full_name': 'Antonio', 'profile_picture': 'https://scontent.cdninstagram.com/t51.2885-19/s150x150/24125006_161374611141852_1337836751690924032_n.jpg', 'username': 'fferegrino'}}, 'user_has_liked': False, 'likes': {'count': 23}, 'tags': [], 'filter': 'Normal', 'comments': {'count': 3}, 'type': 'image', 'link': 'https://www.instagram.com/p/BchtdTZHFY8HdN0iROARHzcc7QOBqo99YEdaTQ0/', 'location': None, 'attribution': None, 'users_in_photo': []}, {'id': '1665504933486422957_31183420', 'user': {'id': '31183420', 'full_name': 'Antonio', 'profile_picture': 'https://scontent.cdninstagram.com/t51.2885-19/s150x150/24125006_161374611141852_1337836751690924032_n.jpg', 'username': 'fferegrino'}, 'images': {'thumbnail': {'width': 150, 'height': 150, 'url': 'https://scontent.cdninstagram.com/t51.2885-15/s150x150/e15/25009038_306129216543111_9187458035721699328_n.jpg'}, 'low_resolution': {'width': 320, 'height': 320, 'url': 'https://scontent.cdninstagram.com/t51.2885-15/s320x320/e15/25009038_306129216543111_9187458035721699328_n.jpg'}, 'standard_resolution': {'width': 640, 'height': 640, 'url': 'https://scontent.cdninstagram.com/t51.2885-15/s640x640/e15/25009038_306129216543111_9187458035721699328_n.jpg'}}, 'created_time': '1512763689', 'caption': None, 'user_has_liked': False, 'likes': {'count': 15}, 'tags': [], 'filter': 'Normal', 'comments': {'count': 0}, 'type': 'video', 'link': 'https://www.instagram.com/p/BcdD-msHmut5nH7fDIrZRitj7SKwkDvs-fxtY80/', 'location': None, 'attribution': None, 'users_in_photo': [], 'videos': {'standard_resolution': {'width': 640, 'height': 640, 'url': 
'https://scontent.cdninstagram.com/vp/9910ba22d473534fcdedba4c5404f8af/5A4C5962/t50.2886-16/24995044_167376300536108_7108479399560740864_n.mp4', 'id': '17852753245214683'}, 'low_bandwidth': {'width': 480, 'height': 480, 'url': 'https://scontent.cdninstagram.com/vp/b056dad1cd9083db4bf75315e0d8e36e/5A4C3972/t50.2886-16/24725170_1592298994151375_8035081218867331072_n.mp4', 'id': '17885335735155490'}, 'low_resolution': {'width': 480, 'height': 480, 'url': 'https://scontent.cdninstagram.com/vp/b056dad1cd9083db4bf75315e0d8e36e/5A4C3972/t50.2886-16/24725170_1592298994151375_8035081218867331072_n.mp4', 'id': '17885335735155490'}}}, {'id': '1663373170467375776_31183420', 'user': {'id': '31183420', 'full_name': 'Antonio', 'profile_picture': 'https://scontent.cdninstagram.com/t51.2885-19/s150x150/24125006_161374611141852_1337836751690924032_n.jpg', 'username': 'fferegrino'}, 'images': {'thumbnail': {'width': 150, 'height': 150, 'url': 'https://scontent.cdninstagram.com/t51.2885-15/s150x150/e35/24327395_1304976632939488_3726275581189292032_n.jpg'}, 'low_resolution': {'width': 320, 'height': 320, 'url': 'https://scontent.cdninstagram.com/t51.2885-15/s320x320/e35/24327395_1304976632939488_3726275581189292032_n.jpg'}, 'standard_resolution': {'width': 640, 'height': 640, 'url': 'https://scontent.cdninstagram.com/t51.2885-15/s640x640/sh0.08/e35/24327395_1304976632939488_3726275581189292032_n.jpg'}}, 'created_time': '1512509555', 'caption': {'id': '17886434065190830', 'text': \"It's empty... 
*cries in mexican*\", 'created_time': '1512509555', 'from': {'id': '31183420', 'full_name': 'Antonio', 'profile_picture': 'https://scontent.cdninstagram.com/t51.2885-19/s150x150/24125006_161374611141852_1337836751690924032_n.jpg', 'username': 'fferegrino'}}, 'user_has_liked': False, 'likes': {'count': 18}, 'tags': [], 'filter': 'Normal', 'comments': {'count': 2}, 'type': 'image', 'link': 'https://www.instagram.com/p/BcVfRXqn4qgEeYznl41MovN9fYBBlZdWgEQsqo0/', 'location': None, 'attribution': None, 'users_in_photo': []}, {'id': '1661918870146857860_31183420', 'user': {'id': '31183420', 'full_name': 'Antonio', 'profile_picture': 'https://scontent.cdninstagram.com/t51.2885-19/s150x150/24125006_161374611141852_1337836751690924032_n.jpg', 'username': 'fferegrino'}, 'images': {'thumbnail': {'width': 150, 'height': 150, 'url': 'https://scontent.cdninstagram.com/t51.2885-15/s150x150/e35/24327585_275410129651873_2301266468156735488_n.jpg'}, 'low_resolution': {'width': 320, 'height': 320, 'url': 'https://scontent.cdninstagram.com/t51.2885-15/s320x320/e35/24327585_275410129651873_2301266468156735488_n.jpg'}, 'standard_resolution': {'width': 640, 'height': 640, 'url': 'https://scontent.cdninstagram.com/t51.2885-15/s640x640/sh0.08/e35/24327585_275410129651873_2301266468156735488_n.jpg'}}, 'created_time': '1512336189', 'caption': {'id': '17898830200101820', 'text': 'Tiene un bati-extintor xD', 'created_time': '1512336189', 'from': {'id': '31183420', 'full_name': 'Antonio', 'profile_picture': 'https://scontent.cdninstagram.com/t51.2885-19/s150x150/24125006_161374611141852_1337836751690924032_n.jpg', 'username': 'fferegrino'}}, 'user_has_liked': False, 'likes': {'count': 13}, 'tags': [], 'filter': 'Normal', 'comments': {'count': 0}, 'type': 'image', 'link': 'https://www.instagram.com/p/BcQUmg8nzuEsNRX3cKxWmXOuOmip551lmgB3dc0/', 'location': None, 'attribution': None, 'users_in_photo': []}, {'id': '1661155796838170867_31183420', 'user': {'id': '31183420', 'full_name': 'Antonio', 
'profile_picture': 'https://scontent.cdninstagram.com/t51.2885-19/s150x150/24125006_161374611141852_1337836751690924032_n.jpg', 'username': 'fferegrino'}, 'images': {'thumbnail': {'width': 150, 'height': 150, 'url': 'https://scontent.cdninstagram.com/t51.2885-15/s150x150/e35/24274343_539108086454845_5548137920025591808_n.jpg'}, 'low_resolution': {'width': 320, 'height': 320, 'url': 'https://scontent.cdninstagram.com/t51.2885-15/s320x320/e35/24274343_539108086454845_5548137920025591808_n.jpg'}, 'standard_resolution': {'width': 640, 'height': 640, 'url': 'https://scontent.cdninstagram.com/t51.2885-15/s640x640/sh0.08/e35/24274343_539108086454845_5548137920025591808_n.jpg'}}, 'created_time': '1512245224', 'caption': {'id': '17898059296115180', 'text': 'SeeWoo', 'created_time': '1512245224', 'from': {'id': '31183420', 'full_name': 'Antonio', 'profile_picture': 'https://scontent.cdninstagram.com/t51.2885-19/s150x150/24125006_161374611141852_1337836751690924032_n.jpg', 'username': 'fferegrino'}}, 'user_has_liked': False, 'likes': {'count': 12}, 'tags': [], 'filter': 'Normal', 'comments': {'count': 0}, 'type': 'image', 'link': 'https://www.instagram.com/p/BcNnGVjHsDz5mzqSBkxEDdU-rhBmTNufSj0zQU0/', 'location': None, 'attribution': None, 'users_in_photo': []}, {'id': '1661145077321086188_31183420', 'user': {'id': '31183420', 'full_name': 'Antonio', 'profile_picture': 'https://scontent.cdninstagram.com/t51.2885-19/s150x150/24125006_161374611141852_1337836751690924032_n.jpg', 'username': 'fferegrino'}, 'images': {'thumbnail': {'width': 150, 'height': 150, 'url': 'https://scontent.cdninstagram.com/t51.2885-15/s150x150/e35/24177831_1974250396181172_1652928519031750656_n.jpg'}, 'low_resolution': {'width': 320, 'height': 320, 'url': 'https://scontent.cdninstagram.com/t51.2885-15/s320x320/e35/24177831_1974250396181172_1652928519031750656_n.jpg'}, 'standard_resolution': {'width': 640, 'height': 640, 'url': 
'https://scontent.cdninstagram.com/t51.2885-15/s640x640/sh0.08/e35/24177831_1974250396181172_1652928519031750656_n.jpg'}}, 'created_time': '1512243946', 'caption': {'id': '17897764885097041', 'text': 'El bicolor', 'created_time': '1512243946', 'from': {'id': '31183420', 'full_name': 'Antonio', 'profile_picture': 'https://scontent.cdninstagram.com/t51.2885-19/s150x150/24125006_161374611141852_1337836751690924032_n.jpg', 'username': 'fferegrino'}}, 'user_has_liked': False, 'likes': {'count': 6}, 'tags': [], 'filter': 'Normal', 'comments': {'count': 0}, 'type': 'image', 'link': 'https://www.instagram.com/p/BcNkqWOHLTs-6oUzjqYc0gk1SxzBBYrVjbBR3c0/', 'location': None, 'attribution': None, 'users_in_photo': []}, {'id': '1653967032260771215_31183420', 'user': {'id': '31183420', 'full_name': 'Antonio', 'profile_picture': 'https://scontent.cdninstagram.com/t51.2885-19/s150x150/24125006_161374611141852_1337836751690924032_n.jpg', 'username': 'fferegrino'}, 'images': {'thumbnail': {'width': 150, 'height': 150, 'url': 'https://scontent.cdninstagram.com/t51.2885-15/s150x150/e35/23734649_144027666320958_3382375844696555520_n.jpg'}, 'low_resolution': {'width': 320, 'height': 320, 'url': 'https://scontent.cdninstagram.com/t51.2885-15/s320x320/e35/23734649_144027666320958_3382375844696555520_n.jpg'}, 'standard_resolution': {'width': 640, 'height': 640, 'url': 'https://scontent.cdninstagram.com/t51.2885-15/s640x640/sh0.08/e35/23734649_144027666320958_3382375844696555520_n.jpg'}}, 'created_time': '1511388256', 'caption': {'id': '17885803885141166', 'text': 'Y justo te das cuenta de que dejaste las memorias en México.', 'created_time': '1511388256', 'from': {'id': '31183420', 'full_name': 'Antonio', 'profile_picture': 'https://scontent.cdninstagram.com/t51.2885-19/s150x150/24125006_161374611141852_1337836751690924032_n.jpg', 'username': 'fferegrino'}}, 'user_has_liked': False, 'likes': {'count': 14}, 'tags': [], 'filter': 'Normal', 'comments': {'count': 0}, 'type': 'image', 'link': 
'https://www.instagram.com/p/Bb0EkC0HD2PEl57Em85QK3Kg_Dv5d6w3H2C2-g0/', 'location': None, 'attribution': None, 'users_in_photo': []}, {'id': '1651740249104875103_31183420', 'user': {'id': '31183420', 'full_name': 'Antonio', 'profile_picture': 'https://scontent.cdninstagram.com/t51.2885-19/s150x150/24125006_161374611141852_1337836751690924032_n.jpg', 'username': 'fferegrino'}, 'images': {'thumbnail': {'width': 150, 'height': 150, 'url': 'https://scontent.cdninstagram.com/t51.2885-15/s150x150/e35/23667806_167912247134192_8247260630583607296_n.jpg'}, 'low_resolution': {'width': 320, 'height': 320, 'url': 'https://scontent.cdninstagram.com/t51.2885-15/s320x320/e35/23667806_167912247134192_8247260630583607296_n.jpg'}, 'standard_resolution': {'width': 640, 'height': 640, 'url': 'https://scontent.cdninstagram.com/t51.2885-15/s640x640/sh0.08/e35/23667806_167912247134192_8247260630583607296_n.jpg'}}, 'created_time': '1511122803', 'caption': {'id': '17850978850205561', 'text': 'Saca la vaca', 'created_time': '1511122803', 'from': {'id': '31183420', 'full_name': 'Antonio', 'profile_picture': 'https://scontent.cdninstagram.com/t51.2885-19/s150x150/24125006_161374611141852_1337836751690924032_n.jpg', 'username': 'fferegrino'}}, 'user_has_liked': False, 'likes': {'count': 13}, 'tags': [], 'filter': 'Normal', 'comments': {'count': 0}, 'type': 'image', 'link': 'https://www.instagram.com/p/BbsKQFZHH5fp_QZZbmZuNjZuh9ADSLTxejGb5E0/', 'location': None, 'attribution': None, 'users_in_photo': []}], 'meta': {'code': 200}}\n" 79 | ] 80 | } 81 | ], 82 | "source": [ 83 | "print(recent_response)" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 28, 89 | "metadata": { 90 | "scrolled": true 91 | }, 92 | "outputs": [ 93 | { 94 | "name": "stdout", 95 | "output_type": "stream", 96 | "text": [ 97 | "Done\n" 98 | ] 99 | } 100 | ], 101 | "source": [ 102 | "imgs_temp = recent_response[\"data\"]\n", 103 | "images=[]\n", 104 | "\n", 105 | "while \"pagination\" in recent_response 
and 'next_url' in recent_response[\"pagination\"]:\n", 106 | " for img in imgs_temp:\n", 107 | " images.append(img)\n", 108 | " imgs_temp.clear()\n", 109 | " if \"pagination\" in recent_response and 'next_url' in recent_response[\"pagination\"]:\n", 110 | " next_url = recent_response[\"pagination\"]['next_url']\n", 111 | " print(next_url)\n", 112 | " recent_r = requests.get(next_url)\n", 113 | " recent_response = json.loads(recent_r.text)\n", 114 | " recent_response[\"data\"]\n", 115 | "print(\"Done\")" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 29, 121 | "metadata": {}, 122 | "outputs": [ 123 | { 124 | "name": "stdout", 125 | "output_type": "stream", 126 | "text": [ 127 | "0\n" 128 | ] 129 | } 130 | ], 131 | "source": [ 132 | "print(len(images))" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": { 139 | "collapsed": true 140 | }, 141 | "outputs": [], 142 | "source": [] 143 | } 144 | ], 145 | "metadata": { 146 | "kernelspec": { 147 | "display_name": "Python 3", 148 | "language": "python", 149 | "name": "python3" 150 | }, 151 | "language_info": { 152 | "codemirror_mode": { 153 | "name": "ipython", 154 | "version": 3 155 | }, 156 | "file_extension": ".py", 157 | "mimetype": "text/x-python", 158 | "name": "python", 159 | "nbconvert_exporter": "python", 160 | "pygments_lexer": "ipython3", 161 | "version": "3.6.1" 162 | } 163 | }, 164 | "nbformat": 4, 165 | "nbformat_minor": 2 166 | } 167 | --------------------------------------------------------------------------------