├── BACK2BASICS_NLP.pdf ├── BACK2BASICS_NLP_PYCONES2018.ipynb ├── BACK2BASICS_NLP_PYCONES2018_WebScrapping.ipynb ├── Back2basics_extended_pycones_claudiaguirao.pdf ├── Back2basics_talk_pycones_claudiaguirao.pdf ├── Frida.PNG ├── README.md ├── comparativa.PNG ├── doc_cluster.pkl ├── lda.jpg ├── logo.svg ├── sinopsis_nominados_goya.csv ├── tfidf.png ├── titulo_edicion.csv └── ward_clusters.png /BACK2BASICS_NLP.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intiveda/back2basicsNLP/10e953a246a83fb2f2fcf942c788bb93858aadbd/BACK2BASICS_NLP.pdf -------------------------------------------------------------------------------- /BACK2BASICS_NLP_PYCONES2018_WebScrapping.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Scraping sobre los Goya" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## web scraping" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "from bs4 import BeautifulSoup\n", 26 | "import urllib.request\n", 27 | "import pandas as pd\n", 28 | "import time\n", 29 | "from random import randint" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": { 36 | "collapsed": false 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "all_films = []\n", 41 | "for myedicion in reversed(range(1,33)):\n", 42 | " print(myedicion)\n", 43 | " time.sleep(1)\n", 44 | " myurl = 'https://www.premiosgoya.com/' + str(myedicion) + '-edicion/nominaciones/por-categoria/pelicula/'\n", 45 | " with urllib.request.urlopen(myurl) as url:\n", 46 | " s = url.read()\n", 47 | " soup = BeautifulSoup(s, \"lxml\")\n", 48 | " print(type(soup))\n", 49 | " snippet = soup.find_all(\"h2\", 
{\"class\": \"lista-de-peliculas__titulo\"})\n", 50 | " for h2 in snippet:\n", 51 | " peli = str(h2.a['href'])\n", 52 | " url_peli = \"https://www.premiosgoya.com\" + peli\n", 53 | " mydict = {'edicion': myedicion, 'url_peli': url_peli}\n", 54 | " all_films.append(mydict)" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": { 61 | "collapsed": false 62 | }, 63 | "outputs": [], 64 | "source": [ 65 | "print(len(all_films))" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": { 72 | "collapsed": false 73 | }, 74 | "outputs": [], 75 | "source": [ 76 | "seen = set()\n", 77 | "all_films_unique = []\n", 78 | "for d in all_films:\n", 79 | " t = tuple(d.items())\n", 80 | " if t not in seen:\n", 81 | " seen.add(t)\n", 82 | " all_films_unique.append(d)\n", 83 | "\n", 84 | "print(len(all_films))\n", 85 | "print(len(all_films_unique))" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": { 92 | "collapsed": false 93 | }, 94 | "outputs": [], 95 | "source": [ 96 | "info_pelis = []\n", 97 | "for peli in all_films_unique:\n", 98 | " time.sleep(randint(0, 5))\n", 99 | " with urllib.request.urlopen(peli['url_peli']) as url:\n", 100 | " s = url.read()\n", 101 | " soup = BeautifulSoup(s, \"lxml\")\n", 102 | " mytittles = soup.find_all('h1', {\"class\":\"pelicula__header__titulo\"})\n", 103 | " for t in mytittles:\n", 104 | " peli['titulo'] = t.text\n", 105 | " snipped = soup.find_all('div', {\"class\":\"pelicula__header__descripcion\"})\n", 106 | " for des in snipped:\n", 107 | " peli['descripcion'] = des.text\n", 108 | " info_pelis.append(peli)\n" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": { 115 | "collapsed": true 116 | }, 117 | "outputs": [], 118 | "source": [ 119 | "import pandas as pd\n", 120 | "info_pelis_pd = pd.DataFrame(info_pelis)\n", 121 | "info_pelis_pd.to_csv(\"info_pelis_pd.csv\")" 122 | ] 
123 | } 124 | ], 125 | "metadata": { 126 | "kernelspec": { 127 | "display_name": "Python 3", 128 | "language": "python", 129 | "name": "python3" 130 | }, 131 | "language_info": { 132 | "codemirror_mode": { 133 | "name": "ipython", 134 | "version": 3 135 | }, 136 | "file_extension": ".py", 137 | "mimetype": "text/x-python", 138 | "name": "python", 139 | "nbconvert_exporter": "python", 140 | "pygments_lexer": "ipython3", 141 | "version": "3.6.0" 142 | } 143 | }, 144 | "nbformat": 4, 145 | "nbformat_minor": 2 146 | } 147 | -------------------------------------------------------------------------------- /Back2basics_extended_pycones_claudiaguirao.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intiveda/back2basicsNLP/10e953a246a83fb2f2fcf942c788bb93858aadbd/Back2basics_extended_pycones_claudiaguirao.pdf -------------------------------------------------------------------------------- /Back2basics_talk_pycones_claudiaguirao.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intiveda/back2basicsNLP/10e953a246a83fb2f2fcf942c788bb93858aadbd/Back2basics_talk_pycones_claudiaguirao.pdf -------------------------------------------------------------------------------- /Frida.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intiveda/back2basicsNLP/10e953a246a83fb2f2fcf942c788bb93858aadbd/Frida.PNG -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # BacktoBasics NLP 2 | # 6 octubre 2018 PyconEs Málaga 3 | ### **Claudia Guirao Fernández** 4 | _Data Scientist @ Kernel Analytics_ 5 | 6 | No olvides revisar mis notas sobre NLP :) 7 | 8 | ## Contenido del Notebook 9 | ### 1. _Basics_ sobre string en Python 10 | ### 2. 
Conceptos básicos al trabajar con texto 11 | ### 3. Librerías más comunes para el tratamiento de texto 12 | ### 4. Let's play 13 | ### 5. Clustering (NLTK, SpaCy, Sklearn) 14 | ### 6. Topic modelling 15 | -------------------------------------------------------------------------------- /comparativa.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intiveda/back2basicsNLP/10e953a246a83fb2f2fcf942c788bb93858aadbd/comparativa.PNG -------------------------------------------------------------------------------- /doc_cluster.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intiveda/back2basicsNLP/10e953a246a83fb2f2fcf942c788bb93858aadbd/doc_cluster.pkl -------------------------------------------------------------------------------- /lda.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intiveda/back2basicsNLP/10e953a246a83fb2f2fcf942c788bb93858aadbd/lda.jpg -------------------------------------------------------------------------------- /logo.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | pycon_logo_final 5 | Created with Sketch. 
6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /sinopsis_nominados_goya.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intiveda/back2basicsNLP/10e953a246a83fb2f2fcf942c788bb93858aadbd/sinopsis_nominados_goya.csv -------------------------------------------------------------------------------- /tfidf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intiveda/back2basicsNLP/10e953a246a83fb2f2fcf942c788bb93858aadbd/tfidf.png -------------------------------------------------------------------------------- /titulo_edicion.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intiveda/back2basicsNLP/10e953a246a83fb2f2fcf942c788bb93858aadbd/titulo_edicion.csv -------------------------------------------------------------------------------- /ward_clusters.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intiveda/back2basicsNLP/10e953a246a83fb2f2fcf942c788bb93858aadbd/ward_clusters.png --------------------------------------------------------------------------------