├── COVID19.ipynb ├── Data Science Workshop 1.ipynb ├── Lab2.ipynb ├── Lab3.ipynb ├── Lab4.ipynb ├── MachineLearning1.ipynb ├── MachineLearning2.ipynb ├── MachineLearning3.ipynb ├── MachineLearning4.ipynb ├── Matma1.xlsx ├── Picture1.png ├── README.md ├── SKM Statystyki - Python 7.ipynb ├── SKM Statystyki - Python 8.ipynb ├── SKM Statystyki - Python 9 - Wizualizacja 2.ipynb ├── SKN Statystyki - Python 1.ipynb ├── SKN Statystyki - Python 2.ipynb ├── SKN Statystyki - Python 3.ipynb ├── SKN Statystyki - Python 4.ipynb ├── SKN Statystyki - 10 - Regresja.ipynb ├── SKN Statystyki - 11 - PJN.ipynb ├── SKN Statystyki - Python 11.ipynb ├── SKN Statystyki - Python 5.ipynb ├── SKN Statystyki - Python 6.ipynb ├── Workshop 2 - Beautiful Soup.ipynb ├── Workshop 3 - Paginacja.ipynb ├── Workshop 4 - Poprawność zachowania.ipynb ├── Wyklad 12 - Klasyfikacja tekstu.ipynb ├── adverts_22_04.csv ├── adverts_29_04.csv ├── dash_examples ├── 1.py ├── 2.py ├── 2a.py ├── 2b.py ├── 2c.py ├── 2t.py ├── 3.py └── 4.py ├── day.csv ├── img ├── PDSH.png ├── djpatel.jpg ├── github.png ├── growth.png ├── kernel.png ├── launch.png ├── over.png ├── pobrane.png ├── projection.png ├── rossum.jpg ├── se.jpg ├── tags.png ├── tortoise1.png ├── tortoise2.png ├── unbalanced.png └── ver.png ├── ml_map.png ├── odm.txt.gz ├── requirements.txt ├── simple_script.py ├── svm.png ├── wyklad11 └── odm.txt.gz ├── wyklad7 ├── myspider.py └── scrapy1.py ├── wyklad9 └── all.json └── xvi.png /Data Science Workshop 1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Getting Real Data from the Web" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "*Semester Project*: build a statistical model for predicting real estate prices based on property features, using data collected from ad site(s)" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "# Curriculum\n", 22 | "\n", 23 | "## Getting the data\n", 24 | "### - Setting up the environment\n", 25 | "### - Anatomy of a Spider\n", 26 | "### - Anatomy of a Web Page\n", 27 | "### - Scrapy - a scraping framework\n", 28 | "### - Beautiful Soup\n", 29 | "### - Managing the Crawling Frontier\n", 30 | "### - Crawling Ethical Aspects\n", 31 | "## Processing the data\n", 32 | "### - Descriptive Analytics\n", 33 | "### - Feature Engineering\n", 34 | "### - Price Determining Factors\n", 35 | "## Price Prediction\n", 36 | "### - Training the model\n", 37 | "### - Model performance" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "*Spider* - a program that:\n", 45 | "- visits web pages from a list called *the frontier*\n", 46 | "- parses them to extract information - data and links\n", 47 | "- manages the _crawling frontier_ (links to be visited)" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "https://packtpub.com/packt/offers/free-learning" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "https://docs.scrapy.org/en/latest/topics/spiders.html" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": {}, 67 | "source": [ 68 | "You start by generating the initial Requests to crawl the first URLs, and specify a callback function to be called with the response downloaded from those requests.\n", 69 | "\n", 70 | "In the callback function, you parse the
response (web page) and return either dicts with extracted data, Item objects, Request objects, or an iterable of these objects.\n", 71 | "\n", 72 | "In callback functions, you parse the page contents, typically using Selectors \n", 73 | "\n", 74 | "Finally, the items returned from the spider will be typically persisted to a database or written to a file\n" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 1, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "import scrapy\n", 84 | "from scrapy.crawler import CrawlerProcess\n", 85 | "\n", 86 | "class PythonEventsSpider(scrapy.Spider):\n", 87 | " name = 'pythoneventsspider'\n", 88 | "\n", 89 | " start_urls = ['https://www.python.org/events/python-events/',]\n", 90 | " found_events = []\n", 91 | "\n", 92 | " def parse(self, response):\n", 93 | " for event in response.xpath('//ul[contains(@class, \"list-recent-events\")]/li'):\n", 94 | " event_details = dict()\n", 95 | " event_details['name'] = event.xpath('h3[@class=\"event-title\"]/a/text()').extract_first()\n", 96 | " event_details['location'] = event.xpath('p/span[@class=\"event-location\"]/text()').extract_first()\n", 97 | " event_details['time'] = event.xpath('p/time/text()').extract_first()\n", 98 | " self.found_events.append(event_details)" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 2, 104 | "metadata": {}, 105 | "outputs": [ 106 | { 107 | "name": "stdout", 108 | "output_type": "stream", 109 | "text": [ 110 | "{'name': 'PyCon Odessa', 'location': 'BC «Solnechnyiy», 5, Sonyachna St., Odessa, Ukraine', 'time': '16 March'}\n", 111 | "{'name': 'IndyPy Automate Conf 2019', 'location': 'Indiana, USA', 'time': '22 March'}\n", 112 | "{'name': 'PyCon SK 2019', 'location': 'Bratislava, Slovakia', 'time': '22 March – 24 March '}\n", 113 | "{'name': 'Moscow Python Conf++ 2019', 'location': 'Moscow, Russia', 'time': '05 April'}\n", 114 | "{'name': 'DjangoCon Europe 2019', 'location': 'Copenhagen, Denmark', 'time': '10 April – 14 April '}\n", 115 | "{'name': 'PythonCamp 2019 - Cologne', 'location': 'GFU Cyrus AG, Am Grauen Stein 27,,51105 Köln, Germany', 'time': '13 April – 14 April '}\n", 116 | "{'name': 'PyCon APAC 2019', 'location': 'iACADEMY, Nexus Campus, Makati City, Philippines', 'time': '23 Feb. – 24 Feb. '}\n", 117 | "{'name': 'PyCascades 2019', 'location': 'Seattle, Washington, USA', 'time': '23 Feb. – 24 Feb. 
'}\n" 118 | ] 119 | } 120 | ], 121 | "source": [ 122 | "process = CrawlerProcess({ 'LOG_LEVEL': 'ERROR'})\n", 123 | "process.crawl(PythonEventsSpider)\n", 124 | "spider = next(iter(process.crawlers)).spider\n", 125 | "process.start()\n", 126 | "\n", 127 | "for event in spider.found_events: print(event)" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [] 136 | } 137 | ], 138 | "metadata": { 139 | "kernelspec": { 140 | "display_name": "Python 3", 141 | "language": "python", 142 | "name": "python3" 143 | }, 144 | "language_info": { 145 | "codemirror_mode": { 146 | "name": "ipython", 147 | "version": 3 148 | }, 149 | "file_extension": ".py", 150 | "mimetype": "text/x-python", 151 | "name": "python", 152 | "nbconvert_exporter": "python", 153 | "pygments_lexer": "ipython3", 154 | "version": "3.6.7" 155 | } 156 | }, 157 | "nbformat": 4, 158 | "nbformat_minor": 2 159 | } 160 | -------------------------------------------------------------------------------- /Lab2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Lab 2" 8 | ] 9 | }, 10 | { 11 | "cell_type": "raw", 12 | "metadata": {}, 13 | "source": [ 14 | "Stworz funkcję która zwróci drugi najmniejszy element listy\n", 15 | "\n", 16 | "druginajmniejszy([1, 2, 3, 4, 6])\n", 17 | "\n", 18 | "2" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "def druginajmniejszy(lista):\n", 28 | " if len(lista) < 2:\n", 29 | " return lista[0]\n", 30 | " if lista[0] > lista[1]:\n", 31 | " pierwszy, drugi = lista[1], lista[0]\n", 32 | " else:\n", 33 | " pierwszy, drugi = lista[0], lista[1]\n", 34 | " for x in lista[2:]:\n", 35 | " print (x, pierwszy, drugi)\n", 36 | " if x < drugi and x!=pierwszy:\n", 37 | " if x < pierwszy:\n", 38 | " pierwszy, drugi = x, pierwszy\n", 39 | " else:\n", 40 | " drugi = x\n", 41 | " return drugi" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "druginajmniejszy([2,1,3,4,5])" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "druginajmniejszy([1,2,2,1,3,4,5])" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "# Zadanie 2" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": {}, 72 | "source": [ 73 | "Stworz funkcję która usunie z listy najmniejszy element listy\n", 74 | "\n", 75 | "l = [4, 1, 2, 3]\n", 76 | "\n", 77 | "usun_min(l)\n", 78 | "\n", 79 | "l\n", 80 | "\n", 81 | "[4, 2, 3]" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "def usun_min(lista):\n", 91 | " do_usuniecia = set()\n", 92 | " for i in range(len(lista)):\n", 93 | " if lista[i]==min(lista):\n", 94 | " do_usuniecia.add(i)\n", 95 | " for i in range(len(lista)-1,-1,-1):\n", 96 | " if i in do_usuniecia:\n", 97 | " del lista[i]" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "l = [4, 1, 2, 3]\n", 107 | "usun_min(l)\n", 108 | "l" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | 
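A minimal cross-check for the two Lab 2 exercises above — a sketch, assuming (as the lab's own examples do) that repeated copies of a value count once; the `*_prosty` helper names are mine, not part of the lab:

```python
# Sketch: simpler reference implementations for the two exercises above.
def drugi_najmniejszy_prosty(lista):
    unikalne = sorted(set(lista))              # unique values, ascending
    return unikalne[1] if len(unikalne) > 1 else unikalne[0]

def usun_min_prosty(lista):
    m = min(lista)
    lista[:] = [x for x in lista if x != m]    # in place, drops every copy of the minimum

assert drugi_najmniejszy_prosty([2, 1, 3, 4, 5]) == 2
l = [4, 1, 1, 3]
usun_min_prosty(l)
assert l == [4, 3]
```

The lab's loop-based versions are still worth studying — they run in a single pass, while the sketch trades that for readability.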
"metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "l = [4, 1, 1, 3]\n", 118 | "usun_min(l)\n", 119 | "l" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": [ 128 | "# Frozenset i niezmienność" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "zbior = set()" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "zbior" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [ 155 | "zbior.add(\"ala\")" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "zbior" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": null, 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [ 173 | "zbior.add(\"ma\")" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": null, 179 | "metadata": {}, 180 | "outputs": [], 181 | "source": [ 182 | "zbior.add(\"kota\")" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [ 191 | "zbior.add(\"ala\")" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "metadata": {}, 198 | "outputs": [], 199 | "source": [ 200 | "zbior" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "zbior.add([1,2])" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": null, 215 | "metadata": {}, 216 | "outputs": [], 217 | "source": [ 218 | "zbior.add((1,2))" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": null, 224 | "metadata": {}, 225 | "outputs": [], 226 | "source": [ 227 | "zbior" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": null, 233 | "metadata": {}, 234 | "outputs": [], 235 | "source": [ 236 | "zbior.add(zbior)" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": null, 242 | "metadata": {}, 243 | "outputs": [], 244 | "source": [ 245 | "zbior.add(frozenset(zbior))" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": null, 251 | "metadata": {}, 252 | "outputs": [], 253 | "source": [ 254 | "zbior" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": null, 260 | "metadata": {}, 261 | "outputs": [], 262 | "source": [ 263 | "m = {}" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": null, 269 | "metadata": {}, 270 | "outputs": [], 271 | "source": [ 272 | "m[ [1,2] ] = 1" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": null, 278 | "metadata": {}, 279 | "outputs": [], 280 | "source": [ 281 | "m[ (1,2) ] = 1" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": null, 287 | "metadata": {}, 288 | "outputs": [], 289 | "source": [ 290 | "fs = frozenset([1, 2, 3, 4, 5])\n", 291 | "\n", 292 | "size = len(fs)\n", 293 | "print('frozenset size =', size)\n", 294 | "\n", 295 | "contains_item = 5 in fs\n", 296 | "print('fs contains 5 =', contains_item)\n", 297 | "\n", 298 | "not_contains_item = 6 not in fs\n", 299 | "print('fs 
not contains 6 =', not_contains_item)\n", 300 | "\n", 301 | "is_disjoint = fs.isdisjoint(frozenset([1, 2]))\n", 302 | "print(is_disjoint)\n", 303 | "\n", 304 | "is_disjoint = fs.isdisjoint(frozenset([10, 20]))\n", 305 | "print(is_disjoint)\n", 306 | "\n", 307 | "is_subset = fs.issubset(set([1, 2]))\n", 308 | "print(is_subset)\n", 309 | "\n", 310 | "is_subset = fs.issubset(set([1, 2, 3, 4, 5, 6, 7]))\n", 311 | "print(is_subset)\n", 312 | "\n", 313 | "is_superset = fs.issuperset(frozenset([1, 2]))\n", 314 | "print(is_superset)\n", 315 | "\n", 316 | "is_superset = fs.issuperset(frozenset([1, 2, 10]))\n", 317 | "print(is_superset)\n", 318 | "\n", 319 | "fs1 = fs.union(frozenset([1, 2, 10]), set([99, 98]))\n", 320 | "print(fs1)\n", 321 | "\n", 322 | "fs1 = fs.intersection(set((1, 2, 10, 20)))\n", 323 | "print(fs1)\n", 324 | "\n", 325 | "fs1 = fs.difference(frozenset([1, 2, 3]))\n", 326 | "print(fs1)\n", 327 | "\n", 328 | "fs1 = fs.symmetric_difference(frozenset([1, 2, 10, 20]))\n", 329 | "print(fs1)\n", 330 | "\n", 331 | "fs1 = fs.copy()\n", 332 | "print(fs1)" 333 | ] 334 | }, 335 | { 336 | "cell_type": "markdown", 337 | "metadata": {}, 338 | "source": [ 339 | "# Zadanie na dziś 1" 340 | ] 341 | }, 342 | { 343 | "cell_type": "markdown", 344 | "metadata": {}, 345 | "source": [ 346 | "Napisać `skrypt` który zliczy ilość wystąpień parametrów z linii komend" 347 | ] 348 | }, 349 | { 350 | "cell_type": "raw", 351 | "metadata": {}, 352 | "source": [ 353 | "PS C:\\work\\conda\\WarsztatPythonDataScience> python .\\simple_script.py asdf asdf as as as\n", 354 | "{'asdf': 2, 'as': 3}" 355 | ] 356 | }, 357 | { 358 | "cell_type": "markdown", 359 | "metadata": {}, 360 | "source": [ 361 | "# Zadanie na dziś 2" 362 | ] 363 | }, 364 | { 365 | "cell_type": "markdown", 366 | "metadata": {}, 367 | "source": [ 368 | "Napisać `skrypt` który zliczy ilość wystąpień znaków w parametrach z linii komend" 369 | ] 370 | }, 371 | { 372 | "cell_type": "raw", 373 | "metadata": {}, 374 | "source": [ 375 | "PS C:\\work\\conda\\WarsztatPythonDataScience> python .\\simple_script.py asdf asdf as as as\n", 376 | "{'a': 5, 's': 5, 'd': 2, 'f': 2, ' ': 4}" 377 | ] 378 | }, 379 | { 380 | "cell_type": "markdown", 381 | "metadata": {}, 382 | "source": [ 383 | "# Zadanie na dziś 3" 384 | ] 385 | }, 386 | { 387 | "cell_type": "markdown", 388 | "metadata": {}, 389 | "source": [ 390 | "Wypisać wszystkie permutacje danej listy\n", 391 | "1. z wykorzystaniem itertools\n", 392 | "2. bez wykorzystania itertools" 393 | ] 394 | }, 395 | { 396 | "cell_type": "raw", 397 | "metadata": {}, 398 | "source": [ 399 | "permutations([1,2,3])\n", 400 | "[(1, 2, 3), (1, 3, 2), (2, 1, 3), (2, 3, 1), (3, 1, 2), (3, 2, 1)] " 401 | ] 402 | }, 403 | { 404 | "cell_type": "markdown", 405 | "metadata": {}, 406 | "source": [ 407 | "# Zadanie na dziś 4" 408 | ] 409 | }, 410 | { 411 | "cell_type": "markdown", 412 | "metadata": {}, 413 | "source": [ 414 | "Stworzyć funkcję `generuj(lista,n)` która zwielokrotni listę `lista` podaną jako pierwszy argument `n` razy dokładając licznik do elementów. 
Przykład:" 415 | ] 416 | }, 417 | { 418 | "cell_type": "raw", 419 | "metadata": {}, 420 | "source": [ 421 | "generuj(['p', 'q'],5)\n", 422 | "\n", 423 | "['p1', 'q1', 'p2', 'q2', 'p3', 'q3', 'p4', 'q4', 'p5', 'q5']" 424 | ] 425 | }, 426 | { 427 | "cell_type": "markdown", 428 | "metadata": {}, 429 | "source": [ 430 | "# Zadanie na dziś 5" 431 | ] 432 | }, 433 | { 434 | "cell_type": "markdown", 435 | "metadata": {}, 436 | "source": [ 437 | "Stworzyć funkcję `f(n)` która zbuduje `DataFrame` zawierającą tabliczkę mnożenia do `n` " 438 | ] 439 | }, 440 | { 441 | "cell_type": "markdown", 442 | "metadata": {}, 443 | "source": [ 444 | "# Zadanie na dziś 6" 445 | ] 446 | }, 447 | { 448 | "cell_type": "markdown", 449 | "metadata": {}, 450 | "source": [ 451 | "Sprawdzić jakie operatory działają na `frozenset`. Wypisać nietrywialne przykłady ich działania" 452 | ] 453 | }, 454 | { 455 | "cell_type": "code", 456 | "execution_count": null, 457 | "metadata": {}, 458 | "outputs": [], 459 | "source": [ 460 | "a = frozenset([1])\n", 461 | "b = frozenset([2])\n", 462 | "a+b" 463 | ] 464 | }, 465 | { 466 | "cell_type": "code", 467 | "execution_count": null, 468 | "metadata": {}, 469 | "outputs": [], 470 | "source": [ 471 | "a-b" 472 | ] 473 | } 474 | ], 475 | "metadata": { 476 | "kernelspec": { 477 | "display_name": "Python 3", 478 | "language": "python", 479 | "name": "python3" 480 | }, 481 | "language_info": { 482 | "codemirror_mode": { 483 | "name": "ipython", 484 | "version": 3 485 | }, 486 | "file_extension": ".py", 487 | "mimetype": "text/x-python", 488 | "name": "python", 489 | "nbconvert_exporter": "python", 490 | "pygments_lexer": "ipython3", 491 | "version": "3.7.3" 492 | } 493 | }, 494 | "nbformat": 4, 495 | "nbformat_minor": 4 496 | } 497 | -------------------------------------------------------------------------------- /Lab3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Lab 3" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "# Zadanie na zaś 1" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "Napisać `skrypt` który zliczy ilość wystąpień parametrów z linii komend" 22 | ] 23 | }, 24 | { 25 | "cell_type": "raw", 26 | "metadata": {}, 27 | "source": [ 28 | "PS C:\\work\\conda\\WarsztatPythonDataScience> python .\\simple_script.py asdf asdf as as as\n", 29 | "{'asdf': 2, 'as': 3}" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "# Zadanie na zaś 3" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "Wypisać wszystkie permutacje danej listy\n", 44 | "1. z wykorzystaniem itertools\n", 45 | "2. 
bez wykorzystania itertools" 46 | ] 47 | }, 48 | { 49 | "cell_type": "raw", 50 | "metadata": {}, 51 | "source": [ 52 | "permutations([1,2,3])\n", 53 | "[(1, 2, 3), (1, 3, 2), (2, 1, 3), (2, 3, 1), (3, 1, 2), (3, 2, 1)] " 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "from itertools import permutations\n", 63 | "l = [1,2,3]\n", 64 | "list(permutations(l,len(l)))" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "def permutate(seq):\n", 74 | "\n", 75 | " if not seq:\n", 76 | " return [seq] \n", 77 | " else:\n", 78 | " temp = []\n", 79 | " for k in range(len(seq)):\n", 80 | " part = seq[:k] + seq[k+1:]\n", 81 | " for m in permutate(part):\n", 82 | " temp.append(seq[k:k+1] + m)\n", 83 | " return temp" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "permutate([1,2,3])" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "def permutate(seq):\n", 102 | "\n", 103 | " if not seq:\n", 104 | " return [seq] \n", 105 | " else:\n", 106 | " temp = []\n", 107 | " for k in range(len(seq)):\n", 108 | " part = seq[:k] + seq[k+1:]\n", 109 | " print (k, part)\n", 110 | " for m in permutate(part):\n", 111 | " temp.append(seq[k:k+1] + m)\n", 112 | " print (\"> \", m, seq[k:k+1], temp)\n", 113 | " return temp" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "permutate([1])" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "permutate([1,2])" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [ 140 | "permutate([1,2,3])" 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": {}, 146 | "source": [ 147 | "# Zadanie na zaś 5" 148 | ] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "metadata": {}, 153 | "source": [ 154 | "Stworzyć funkcję `f(n)` która zbuduje `DataFrame` zawierającą tabliczkę mnożenia do `n` " 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "import pandas as pd\n", 164 | "def f(n):\n", 165 | " n+=1\n", 166 | " return pd.DataFrame([ [ x*y for x in range(1,n) ] for y in range(1,n)], index=range(1,n), columns=range(1,n))\n", 167 | "\n", 168 | "f(10)" 169 | ] 170 | }, 171 | { 172 | "cell_type": "markdown", 173 | "metadata": {}, 174 | "source": [ 175 | "## Zadanie na dziś 1" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "metadata": {}, 182 | "outputs": [], 183 | "source": [ 184 | "data_url = 'http://bit.ly/2cLzoxH'\n", 185 | "gapminder = pd.read_csv(data_url)\n", 186 | "print(gapminder.head(3))" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [ 195 | "print(len(gapminder))" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "pd.pivot_table(gapminder, values='lifeExp', \n", 205 | " 
columns='continent')" 206 | ] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "metadata": {}, 211 | "source": [ 212 | "- podsumować 'lifeExp' wg.:\n", 213 | " - Kraju\n", 214 | " - Roku" 215 | ] 216 | }, 217 | { 218 | "cell_type": "markdown", 219 | "metadata": {}, 220 | "source": [ 221 | "## Zadanie na dziś 2" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": null, 227 | "metadata": {}, 228 | "outputs": [], 229 | "source": [ 230 | "df1 = gapminder[['continent', 'year','lifeExp']]\n", 231 | "\n", 232 | "pd.pivot_table(df1, values='lifeExp', \n", 233 | " index=['year'], \n", 234 | " columns='continent')" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": null, 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [ 243 | "pd.pivot_table(df1, values='lifeExp', \n", 244 | " index=['year'], \n", 245 | " columns='continent',\n", 246 | " aggfunc='min')" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": null, 252 | "metadata": {}, 253 | "outputs": [], 254 | "source": [ 255 | "Analogicznie, podać:\n", 256 | " - średnią długość życia w krajach\n", 257 | " - z uwględnieniem roku\n", 258 | " - bez względu na rok\n" 259 | ] 260 | } 261 | ], 262 | "metadata": { 263 | "kernelspec": { 264 | "display_name": "Python 3", 265 | "language": "python", 266 | "name": "python3" 267 | }, 268 | "language_info": { 269 | "codemirror_mode": { 270 | "name": "ipython", 271 | "version": 3 272 | }, 273 | "file_extension": ".py", 274 | "mimetype": "text/x-python", 275 | "name": "python", 276 | "nbconvert_exporter": "python", 277 | "pygments_lexer": "ipython3", 278 | "version": "3.7.3" 279 | } 280 | }, 281 | "nbformat": 4, 282 | "nbformat_minor": 4 283 | } 284 | -------------------------------------------------------------------------------- /Lab4.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Lab 4" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Zadanie na zaś 1" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": {}, 21 | "outputs": [ 22 | { 23 | "name": "stdout", 24 | "output_type": "stream", 25 | "text": [ 26 | " country year pop continent lifeExp gdpPercap\n", 27 | "0 Afghanistan 1952 8425333.0 Asia 28.801 779.445314\n", 28 | "1 Afghanistan 1957 9240934.0 Asia 30.332 820.853030\n", 29 | "2 Afghanistan 1962 10267083.0 Asia 31.997 853.100710\n" 30 | ] 31 | } 32 | ], 33 | "source": [ 34 | "import pandas as pd\n", 35 | "data_url = 'http://bit.ly/2cLzoxH'\n", 36 | "gapminder = pd.read_csv(data_url)\n", 37 | "print(gapminder.head(3))" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "print(len(gapminder))" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "pd.pivot_table(gapminder, values='lifeExp', \n", 56 | " columns='continent')" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "- podsumować 'lifeExp' wg.:\n", 64 | " - Kraju\n", 65 | " - Roku" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "pd.pivot_table(gapminder, values='lifeExp', \n", 75 | " columns='country')" 76 | ] 77 | }, 78 | { 79 | 
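A sketch of the same summaries done with `groupby`, which is often easier to read than `pivot_table` when there is a single grouping key — it assumes the `gapminder` frame loaded above; the `by_year`/`by_country` names are mine:

```python
# Sketch: groupby equivalents of the pivot_table summaries in the surrounding cells.
# Same numbers, different shape: pivot_table with only columns= yields a 1-row frame,
# groupby yields a Series indexed by the key.
import pandas as pd

gapminder = pd.read_csv('http://bit.ly/2cLzoxH')

# mean lifeExp per year   ~ pd.pivot_table(gapminder, values='lifeExp', columns='year')
by_year = gapminder.groupby('year')['lifeExp'].mean()

# mean lifeExp per country ~ pd.pivot_table(gapminder, values='lifeExp', columns='country')
by_country = gapminder.groupby('country')['lifeExp'].mean()

print(by_year.head())
print(by_country.head())
```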
"cell_type": "code", 80 | "execution_count": null, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "pd.pivot_table(gapminder, values='lifeExp', \n", 85 | " columns='year')" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "## Zadanie na zaś 2" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "df1 = gapminder[['continent', 'year','lifeExp']]\n", 102 | "\n", 103 | "pd.pivot_table(df1, values='lifeExp', \n", 104 | " index=['year'], \n", 105 | " columns='continent')" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "pd.pivot_table(df1, values='lifeExp', \n", 115 | " index=['year'], \n", 116 | " columns='continent',\n", 117 | " aggfunc=['mean', 'min'])" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": {}, 123 | "source": [ 124 | "Analogicznie, podać:\n", 125 | " - średnią długość życia w krajach\n", 126 | " - z uwględnieniem roku\n", 127 | " - bez względu na rok\n" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "df1 = gapminder[['country', 'year','lifeExp']]\n", 137 | "pd.pivot_table(df1, values='lifeExp', \n", 138 | " index='year',\n", 139 | " columns='country',\n", 140 | " aggfunc=['mean'])" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "df2 = pd.pivot_table(df1, values='lifeExp', \n", 150 | " \n", 151 | " columns='country',\n", 152 | " aggfunc=['mean'])\n", 153 | "df2" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [ 162 | "pd.melt(df2)" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": {}, 168 | "source": [ 169 | "# Matma 1" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "df = pd.read_excel('Matma1.xlsx', index_col=0)" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": null, 184 | "metadata": {}, 185 | "outputs": [], 186 | "source": [ 187 | "df.head()" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "metadata": {}, 194 | "outputs": [], 195 | "source": [ 196 | "df.reset_index(inplace=True)" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "metadata": {}, 203 | "outputs": [], 204 | "source": [ 205 | "df = df.dropna(axis=1, how='all', thresh=None, subset=None, inplace=False)" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": null, 211 | "metadata": {}, 212 | "outputs": [], 213 | "source": [ 214 | "students = df.iloc[:,0:24]" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": null, 220 | "metadata": {}, 221 | "outputs": [], 222 | "source": [ 223 | "students.head()" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "metadata": {}, 230 | "outputs": [], 231 | "source": [ 232 | "opinions1 = df.iloc[:,[21]]" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "metadata": {}, 239 | "outputs": [], 240 | "source": [ 241 | "opinions1" 242 | ] 243 | }, 244 | { 245 | 
"cell_type": "code", 246 | "execution_count": null, 247 | "metadata": {}, 248 | "outputs": [], 249 | "source": [ 250 | "opinions2 = df.iloc[:,36:]" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": null, 256 | "metadata": {}, 257 | "outputs": [], 258 | "source": [ 259 | "opinions2.head()" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": null, 265 | "metadata": {}, 266 | "outputs": [], 267 | "source": [ 268 | "opinions = pd.concat([opinions1, opinions2], axis=1)" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": null, 274 | "metadata": {}, 275 | "outputs": [], 276 | "source": [ 277 | "opinions" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": null, 283 | "metadata": {}, 284 | "outputs": [], 285 | "source": [ 286 | "naglowki = list(opinions.columns)\n", 287 | "naglowki" 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": null, 293 | "metadata": {}, 294 | "outputs": [], 295 | "source": [ 296 | "naglowki[0] = \"Wykladowca\"" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": null, 302 | "metadata": {}, 303 | "outputs": [], 304 | "source": [ 305 | "opinions.columns=naglowki\n", 306 | "opinions" 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": null, 312 | "metadata": {}, 313 | "outputs": [], 314 | "source": [ 315 | "long_opinions = pd.melt(opinions,id_vars=['Wykladowca'],var_name=\"oldcolumn\")" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": null, 321 | "metadata": {}, 322 | "outputs": [], 323 | "source": [ 324 | "long_opinions" 325 | ] 326 | }, 327 | { 328 | "cell_type": "code", 329 | "execution_count": null, 330 | "metadata": {}, 331 | "outputs": [], 332 | "source": [ 333 | "long_opinions.drop([\"oldcolumn\"], axis=1,inplace=True)" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": null, 339 | "metadata": {}, 340 | "outputs": [], 341 | "source": [ 342 | "long_opinions" 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": null, 348 | "metadata": {}, 349 | "outputs": [], 350 | "source": [ 351 | "long_opinions.dropna(how='any', axis=0, inplace=True)" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": null, 357 | "metadata": {}, 358 | "outputs": [], 359 | "source": [ 360 | "long_opinions" 361 | ] 362 | }, 363 | { 364 | "cell_type": "code", 365 | "execution_count": null, 366 | "metadata": {}, 367 | "outputs": [], 368 | "source": [ 369 | "long_opinions.groupby(\"Wykladowca\").count()" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": null, 375 | "metadata": {}, 376 | "outputs": [], 377 | "source": [ 378 | "long_opinions.groupby(\"value\").count().sort_values(by = 'Wykladowca', ascending=False)" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": null, 384 | "metadata": {}, 385 | "outputs": [], 386 | "source": [ 387 | "long_opinions.groupby(['Wykladowca','value']).count()" 388 | ] 389 | }, 390 | { 391 | "cell_type": "code", 392 | "execution_count": null, 393 | "metadata": {}, 394 | "outputs": [], 395 | "source": [ 396 | "long_opinions['cnt']=1" 397 | ] 398 | }, 399 | { 400 | "cell_type": "code", 401 | "execution_count": null, 402 | "metadata": {}, 403 | "outputs": [], 404 | "source": [ 405 | "long_opinions" 406 | ] 407 | }, 408 | { 409 | "cell_type": "code", 410 | "execution_count": null, 411 | "metadata": {}, 412 | "outputs": [], 413 | "source": [ 414 | 
"long_opinions.groupby(['Wykladowca','value']).count()" 415 | ] 416 | } 417 | ], 418 | "metadata": { 419 | "kernelspec": { 420 | "display_name": "Python 3", 421 | "language": "python", 422 | "name": "python3" 423 | }, 424 | "language_info": { 425 | "codemirror_mode": { 426 | "name": "ipython", 427 | "version": 3 428 | }, 429 | "file_extension": ".py", 430 | "mimetype": "text/x-python", 431 | "name": "python", 432 | "nbconvert_exporter": "python", 433 | "pygments_lexer": "ipython3", 434 | "version": "3.7.5" 435 | } 436 | }, 437 | "nbformat": 4, 438 | "nbformat_minor": 4 439 | } 440 | -------------------------------------------------------------------------------- /MachineLearning3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Machine Learning 3\n", 8 | "\n", 9 | "https://github.com/MichalKorzycki/WarsztatPythonDataScience\n", 10 | "\n", 11 | "Plik: `MachineLearning3.ipynb`" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "\n", 19 | "\n", 20 | "#### - Dobór technik scoringu\n", 21 | "#### - Dobór estymatorów\n", 22 | "#### - Grid Search\n", 23 | "#### - Dobór Hiperparametrów\n", 24 | "---\n" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "\n", 32 | "\n", 33 | "https://www.gumtree.pl/a-mieszkania-i-domy-sprzedam-i-kupie/praga-polnoc/mieszkanie-inwestycyjne-4+pok-przy-metrze-wilenska-targowa-70/1007172232370910500042709\n", 34 | "\n", 35 | "---" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "import pandas as pd\n", 45 | "from numpy import log2\n", 46 | "\n", 47 | "data = pd.read_csv('adverts_29_04.csv', sep=';')\n", 48 | "data[\"log\"] = data['Wielkość (m2)'].apply(lambda x: log2(x))\n", 49 | "data['cena_za_metr'] = data['Cena'] / data['Wielkość (m2)']\n", 50 | "data = data.dropna(subset=['cena_za_metr'])\n", 51 | "\n", 52 | "df = data.drop(['Cena', 'Data dodania'], axis=1)\n", 53 | "dum_df = pd.get_dummies(df, columns=['Lokalizacja', 'Na sprzedaż przez', 'Rodzaj nieruchomości', 'Liczba pokoi', 'Liczba łazienek', 'Parking'])\n", 54 | "dum_df" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "---" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": {}, 67 | "source": [ 68 | "## Co z opisem ?" 
69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "import gzip\n", 78 | "import sys\n", 79 | "import re\n", 80 | "import re\n", 81 | "\n", 82 | "splitter = re.compile(r'[^ąąćęńłóóśśżżź\\w]+')\n", 83 | "isnumber = re.compile(r'[0-9]')\n", 84 | "\n", 85 | "f = gzip.open('odm.txt.gz', 'rt', encoding='utf-8')\n", 86 | "dictionary = {}\n", 87 | "\n", 88 | "for x in f:\n", 89 | " t = x.strip().split(',')\n", 90 | " tt = [ x.strip().lower() for x in t]\n", 91 | " for w in tt[1:]: \n", 92 | " dictionary[w]=tt[0]\n", 93 | "\n", 94 | "def lematize(w):\n", 95 | " w = w.replace('ą','ą')\n", 96 | " w = w.replace('ó','ó')\n", 97 | " w = w.replace('ę','ę')\n", 98 | " w = w.replace('ż','ż')\n", 99 | " return dictionary.get(w,w)\n", 100 | "\n", 101 | "opis1 = dum_df['opis'][0]" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "raw_corpus=[]\n", 111 | "n=0\n", 112 | "\n", 113 | "for i in dum_df.iterrows():\n", 114 | " n+=1\n", 115 | " l = list(splitter.split(i[1][1]))\n", 116 | " raw_corpus.append(l)\n", 117 | "\n", 118 | " \n", 119 | "all_words = []\n", 120 | "for t in raw_corpus:\n", 121 | " all_words[0:0] = t\n", 122 | "\n", 123 | "words = {}\n", 124 | "for w in all_words:\n", 125 | " rec = words.get(w.lower(), {'upper':0, 'lower': 0})\n", 126 | " if w.lower()==w or w.upper()==w:\n", 127 | " rec['lower'] = rec['lower'] +1\n", 128 | " else: \n", 129 | " rec['upper'] = rec['upper'] +1\n", 130 | " words[w.lower()] = rec\n", 131 | "\n", 132 | "raw_stop_words = [ x for x in words.keys() if words[x]['upper']>=words[x]['lower']*8 ] \n" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "\n", 142 | "set_raw_stop_words = set(raw_stop_words)\n", 143 | "\n", 144 | "def preprocessing(opis):\n", 145 | " opis = str(opis)\n", 146 | " tokenized = splitter.split(opis)\n", 147 | " l = list(tokenized)\n", 148 | " l = [ x.lower() for x in l ]\n", 149 | " l = [ x for x in l if len(x) > 2]\n", 150 | " l = [ x for x in l if isnumber.search(x) is None ]\n", 151 | " l = [ x for x in l if x not in set_raw_stop_words ]\n", 152 | " l = [ lematize(x) for x in l ]\n", 153 | " l = [ x for x in l if len(x) > 2]\n", 154 | " return l" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "opis1" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "metadata": {}, 170 | "outputs": [], 171 | "source": [ 172 | "print(preprocessing(opis1))" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "dum_df[\"opis\"] = dum_df[\"opis\"].apply(lambda x: ' '.join(preprocessing(x)))" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "dum_df" 191 | ] 192 | }, 193 | { 194 | "cell_type": "markdown", 195 | "metadata": {}, 196 | "source": [ 197 | "---\n", 198 | "## Dobór technik scoringu\n", 199 | "\n", 200 | "![Walidacja krzyżowa](xvi.png)\n", 201 | "\n", 202 | "https://scikit-learn.org/stable/modules/cross_validation.html" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": null, 208 | "metadata": {}, 209 | "outputs": [], 
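A sketch of what the `cross_val_score` call in the next cell does internally — assumptions: `X_train`/`y_train` as built below, and `model.score` for a regressor, which is R² and also `cross_val_score`'s default metric:

```python
# Sketch: a manual k-fold loop mirroring cross_val_score(LinearRegression(), X, y, cv=5).
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression

def manual_cv(X, y, n_splits=5):
    scores = []
    for train_idx, test_idx in KFold(n_splits=n_splits).split(X):
        model = LinearRegression().fit(X.iloc[train_idx], y.iloc[train_idx])
        scores.append(model.score(X.iloc[test_idx], y.iloc[test_idx]))  # R^2 for regressors
    return scores
```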
210 | "source": [ 211 | "from sklearn.model_selection import cross_val_score\n", 212 | "import pandas as pd\n", 213 | "from sklearn.linear_model import LinearRegression\n", 214 | "from sklearn.model_selection import train_test_split\n", 215 | "from sklearn.model_selection import cross_val_score\n", 216 | "\n", 217 | "y = dum_df['cena_za_metr']\n", 218 | "X = dum_df.drop(['cena_za_metr', 'opis'], axis=1)\n", 219 | "\n", 220 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=17)\n", 221 | "scores = cross_val_score(LinearRegression(), X_train, y_train, cv=5)\n", 222 | "print(list(scores))\n", 223 | "print()\n", 224 | "print(\"Accuracy: %0.2f (+/- %0.2f)\" % (scores.mean(), scores.std() * 2))" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": null, 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "from sklearn.model_selection import cross_val_score\n", 234 | "import pandas as pd\n", 235 | "from sklearn.linear_model import LinearRegression\n", 236 | "from sklearn.metrics import mean_squared_error, make_scorer\n", 237 | "scoring='f1_macro'\n", 238 | "\n", 239 | "y = dum_df['cena_za_metr']\n", 240 | "X = dum_df.drop(['cena_za_metr', 'opis'], axis=1)\n", 241 | "\n", 242 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=17)\n", 243 | "scores = cross_val_score(LinearRegression(), X_train, y_train, scoring=make_scorer(mean_squared_error), cv=5)\n", 244 | "print(list(scores))\n", 245 | "print()\n", 246 | "print(\"Mean square error: %0.2f (+/- %0.2f)\" % (scores.mean(), scores.std() * 2))" 247 | ] 248 | }, 249 | { 250 | "cell_type": "markdown", 251 | "metadata": {}, 252 | "source": [ 253 | "---\n", 254 | "## Dobór estymatorów\n", 255 | "\n", 256 | "![Dobór estymatorów](ml_map.png)\n", 257 | "\n", 258 | "https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html\n", 259 | "\n", 260 | "![SVM](svm.png)\n", 261 | "\n", 262 | "https://prateekvjoshi.com/2012/08/24/support-vector-machines/" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": null, 268 | "metadata": {}, 269 | "outputs": [], 270 | "source": [ 271 | "from sklearn.base import BaseEstimator, TransformerMixin\n", 272 | "from sklearn.decomposition import TruncatedSVD\n", 273 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 274 | "from sklearn.pipeline import Pipeline\n", 275 | "from sklearn.linear_model import LinearRegression\n", 276 | "from sklearn.model_selection import train_test_split\n", 277 | "from sklearn.model_selection import cross_val_score\n", 278 | "\n", 279 | "class ItemSelector(BaseEstimator, TransformerMixin):\n", 280 | " def __init__(self, key):\n", 281 | " self.key = key\n", 282 | "\n", 283 | " def fit(self, x, y=None):\n", 284 | " return self\n", 285 | "\n", 286 | " def transform(self, data_dict):\n", 287 | " return data_dict[self.key]\n", 288 | "\n", 289 | "\n", 290 | "pipeline = Pipeline([\n", 291 | " ('selector', ItemSelector(key='opis')),\n", 292 | " ('tfidf', TfidfVectorizer()),\n", 293 | " ('best', TruncatedSVD(n_components=250)),\n", 294 | " ('linear', LinearRegression())\n", 295 | " ])\n", 296 | "\n", 297 | "y = dum_df['cena_za_metr']\n", 298 | "X = dum_df.drop(['cena_za_metr'], axis=1)\n", 299 | "\n", 300 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1)\n", 301 | "scores = cross_val_score(pipeline, X_train, y_train, cv=3)\n", 302 | "print(list(scores))\n", 303 | "print()\n", 304 | "print(\"Accuracy: 
%0.2f (+/- %0.2f)\" % (scores.mean(), scores.std() * 2))" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": null, 310 | "metadata": {}, 311 | "outputs": [], 312 | "source": [ 313 | "from sklearn.feature_extraction.text import TfidfTransformer\n", 314 | "from sklearn.feature_extraction.text import CountVectorizer\n", 315 | "\n", 316 | "vectorizer = TfidfVectorizer(min_df=2)\n", 317 | "X = vectorizer.fit_transform(X_train['opis'])\n", 318 | "vectorizer.get_feature_names()[:20]" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": null, 324 | "metadata": {}, 325 | "outputs": [], 326 | "source": [ 327 | "vectorizer.idf_" 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": null, 333 | "metadata": {}, 334 | "outputs": [], 335 | "source": [ 336 | "from operator import itemgetter\n", 337 | "\n", 338 | "sorted(zip(vectorizer.idf_, vectorizer.get_feature_names()), key=itemgetter(0), reverse=True)[:40]" 339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": null, 344 | "metadata": {}, 345 | "outputs": [], 346 | "source": [ 347 | "from sklearn.base import BaseEstimator, TransformerMixin\n", 348 | "from sklearn.decomposition import TruncatedSVD\n", 349 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 350 | "from sklearn.pipeline import Pipeline\n", 351 | "from sklearn.linear_model import LinearRegression\n", 352 | "from sklearn.svm import SVR\n", 353 | "from sklearn.model_selection import train_test_split\n", 354 | "from sklearn.model_selection import cross_val_score\n", 355 | "\n", 356 | "class ItemSelector(BaseEstimator, TransformerMixin):\n", 357 | " def __init__(self, key):\n", 358 | " self.key = key\n", 359 | "\n", 360 | " def fit(self, x, y=None):\n", 361 | " return self\n", 362 | "\n", 363 | " def transform(self, data_dict):\n", 364 | " return data_dict[self.key]\n", 365 | "\n", 366 | "\n", 367 | "pipeline = Pipeline([\n", 368 | " ('selector', ItemSelector(key='opis')),\n", 369 | " ('tfidf', TfidfVectorizer()),\n", 370 | " ('best', TruncatedSVD(n_components=250)),\n", 371 | " ('linear', SVR(kernel='linear', C=1000, gamma='auto'))\n", 372 | " ])\n", 373 | "\n", 374 | "y = dum_df['cena_za_metr']\n", 375 | "X = dum_df.drop(['cena_za_metr'], axis=1)\n", 376 | "\n", 377 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)\n", 378 | "scores = cross_val_score(pipeline, X_train, y_train, cv=5)\n", 379 | "print(list(scores))\n", 380 | "print()\n", 381 | "print(\"Accuracy: %0.2f (+/- %0.2f)\" % (scores.mean(), scores.std() * 2))" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": null, 387 | "metadata": {}, 388 | "outputs": [], 389 | "source": [ 390 | "from sklearn.base import BaseEstimator, TransformerMixin\n", 391 | "from sklearn.decomposition import TruncatedSVD\n", 392 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 393 | "from sklearn.pipeline import Pipeline\n", 394 | "from sklearn.linear_model import LinearRegression\n", 395 | "sklearn.svm.LinearSVR\n", 396 | "from sklearn.svm import SVR\n", 397 | "from sklearn.model_selection import train_test_split\n", 398 | "from sklearn.model_selection import cross_val_score\n", 399 | "\n", 400 | "class ItemSelector(BaseEstimator, TransformerMixin):\n", 401 | " def __init__(self, key):\n", 402 | " self.key = key\n", 403 | "\n", 404 | " def fit(self, x, y=None):\n", 405 | " return self\n", 406 | "\n", 407 | " def transform(self, data_dict):\n", 408 | " return 
data_dict[self.key]\n", 409 | "\n", 410 | "\n", 411 | "pipeline = Pipeline([\n", 412 | " ('selector', ItemSelector(key='opis')),\n", 413 | " ('tfidf', TfidfVectorizer()),\n", 414 | " ('best', TruncatedSVD(n_components=350)),\n", 415 | " ('linear', SVR(kernel='linear', C=1000, gamma='auto'))\n", 416 | " ])\n", 417 | "\n", 418 | "y = dum_df['cena_za_metr']\n", 419 | "X = dum_df.drop(['cena_za_metr'], axis=1)\n", 420 | "\n", 421 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)\n", 422 | "scores = cross_val_score(pipeline, X_train, y_train, cv=5)\n", 423 | "print(list(scores))\n", 424 | "print()\n", 425 | "print(\"Accuracy: %0.2f (+/- %0.2f)\" % (scores.mean(), scores.std() * 2))" 426 | ] 427 | }, 428 | { 429 | "cell_type": "code", 430 | "execution_count": null, 431 | "metadata": {}, 432 | "outputs": [], 433 | "source": [ 434 | "from sklearn.base import BaseEstimator, TransformerMixin\n", 435 | "from sklearn.decomposition import TruncatedSVD\n", 436 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 437 | "from sklearn.pipeline import Pipeline\n", 438 | "from sklearn.linear_model import LinearRegression\n", 439 | "from sklearn.svm import LinearSVR\n", 440 | "from sklearn.model_selection import train_test_split\n", 441 | "from sklearn.model_selection import cross_val_score\n", 442 | "\n", 443 | "class ItemSelector(BaseEstimator, TransformerMixin):\n", 444 | " def __init__(self, key):\n", 445 | " self.key = key\n", 446 | "\n", 447 | " def fit(self, x, y=None):\n", 448 | " return self\n", 449 | "\n", 450 | " def transform(self, data_dict):\n", 451 | " return data_dict[self.key]\n", 452 | "\n", 453 | "\n", 454 | "pipeline = Pipeline([\n", 455 | " ('selector', ItemSelector(key='opis')),\n", 456 | " ('tfidf', TfidfVectorizer()),\n", 457 | " ('best', TruncatedSVD(n_components=350)),\n", 458 | " ('linear', LinearSVR(C=1000))\n", 459 | " ])\n", 460 | "\n", 461 | "y = dum_df['cena_za_metr']\n", 462 | "X = dum_df.drop(['cena_za_metr'], axis=1)\n", 463 | "\n", 464 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)\n", 465 | "scores = cross_val_score(pipeline, X_train, y_train, cv=5)\n", 466 | "print(list(scores))\n", 467 | "print()\n", 468 | "print(\"Accuracy: %0.2f (+/- %0.2f)\" % (scores.mean(), scores.std() * 2))" 469 | ] 470 | }, 471 | { 472 | "cell_type": "code", 473 | "execution_count": null, 474 | "metadata": {}, 475 | "outputs": [], 476 | "source": [ 477 | "from sklearn.base import BaseEstimator, TransformerMixin\n", 478 | "from sklearn.decomposition import TruncatedSVD\n", 479 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 480 | "from sklearn.pipeline import Pipeline\n", 481 | "from sklearn.linear_model import LinearRegression\n", 482 | "from sklearn.svm import LinearSVR\n", 483 | "from sklearn.model_selection import train_test_split\n", 484 | "from sklearn.model_selection import cross_val_score\n", 485 | "from sklearn.model_selection import GridSearchCV\n", 486 | "from time import time\n", 487 | "from warnings import simplefilter\n", 488 | "simplefilter(action='ignore', category=FutureWarning)\n", 489 | "\n", 490 | "class ItemSelector(BaseEstimator, TransformerMixin):\n", 491 | " def __init__(self, key):\n", 492 | " self.key = key\n", 493 | "\n", 494 | " def fit(self, x, y=None):\n", 495 | " return self\n", 496 | "\n", 497 | " def transform(self, data_dict):\n", 498 | " return data_dict[self.key]\n", 499 | "\n", 500 | "parameters = parameters = {\n", 501 | " 
'best__n_components': (350,500,750,1000),\n", 502 | " 'linear__C': (100,1000,10000)\n", 503 | "}\n", 504 | "\n", 505 | "pipeline = Pipeline([\n", 506 | " ('selector', ItemSelector(key='opis')),\n", 507 | " ('tfidf', TfidfVectorizer()),\n", 508 | " ('best', TruncatedSVD()),\n", 509 | " ('linear', LinearSVR())\n", 510 | " ])\n", 511 | "\n", 512 | "grid_search = GridSearchCV(pipeline, parameters, verbose=1, cv=3)\n", 513 | "\n", 514 | "\n", 515 | "y = dum_df['cena_za_metr']\n", 516 | "X = dum_df.drop(['cena_za_metr'], axis=1)\n", 517 | "\n", 518 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=17)\n", 519 | "\n", 520 | "t0 = time()\n", 521 | "grid_search.fit(X_train, y_train)\n", 522 | "print(\"done in %0.3fs\" % (time() - t0))\n", 523 | "print(\"Best parameters set:\")\n", 524 | "best_parameters = grid_search.best_estimator_.get_params()\n", 525 | "for param_name in sorted(parameters.keys()):\n", 526 | " print(\"\\t%s: %r\" % (param_name, best_parameters[param_name]))" 527 | ] 528 | }, 529 | { 530 | "cell_type": "code", 531 | "execution_count": null, 532 | "metadata": {}, 533 | "outputs": [], 534 | "source": [ 535 | "from sklearn.base import BaseEstimator, TransformerMixin\n", 536 | "from sklearn.decomposition import TruncatedSVD\n", 537 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 538 | "from sklearn.pipeline import Pipeline\n", 539 | "from sklearn.linear_model import LinearRegression\n", 540 | "from sklearn.svm import SVR\n", 541 | "from sklearn.model_selection import train_test_split\n", 542 | "from sklearn.model_selection import cross_val_score\n", 543 | "from sklearn.model_selection import GridSearchCV\n", 544 | "from time import time\n", 545 | "from warnings import simplefilter\n", 546 | "simplefilter(action='ignore', category=FutureWarning)\n", 547 | "\n", 548 | "class ItemSelector(BaseEstimator, TransformerMixin):\n", 549 | " def __init__(self, key):\n", 550 | " self.key = key\n", 551 | "\n", 552 | " def fit(self, x, y=None):\n", 553 | " return self\n", 554 | "\n", 555 | " def transform(self, data_dict):\n", 556 | " return data_dict[self.key]\n", 557 | "\n", 558 | "parameters = parameters = {\n", 559 | " 'best__n_components': (750,1000),\n", 560 | " 'svr__C': (100,1000),\n", 561 | " 'svr__kernel':('linear', 'rbf')\n", 562 | "}\n", 563 | "\n", 564 | "pipeline = Pipeline([\n", 565 | " ('selector', ItemSelector(key='opis')),\n", 566 | " ('tfidf', TfidfVectorizer()),\n", 567 | " ('best', TruncatedSVD()),\n", 568 | " ('svr', SVR())\n", 569 | " ])\n", 570 | "\n", 571 | "grid_search = GridSearchCV(pipeline, parameters, verbose=1, cv=3)\n", 572 | "\n", 573 | "\n", 574 | "y = dum_df['cena_za_metr']\n", 575 | "X = dum_df.drop(['cena_za_metr'], axis=1)\n", 576 | "\n", 577 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=17)\n", 578 | "\n", 579 | "t0 = time()\n", 580 | "grid_search.fit(X_train, y_train)\n", 581 | "print(\"done in %0.3fs\" % (time() - t0))\n", 582 | "print(\"Best parameters set:\")\n", 583 | "best_parameters = grid_search.best_estimator_.get_params()\n", 584 | "for param_name in sorted(parameters.keys()):\n", 585 | " print(\"\\t%s: %r\" % (param_name, best_parameters[param_name]))" 586 | ] 587 | }, 588 | { 589 | "cell_type": "code", 590 | "execution_count": null, 591 | "metadata": {}, 592 | "outputs": [], 593 | "source": [ 594 | "from sklearn.base import BaseEstimator, TransformerMixin\n", 595 | "from sklearn.preprocessing import StandardScaler, Normalizer\n", 596 | "from 
sklearn.decomposition import TruncatedSVD\n", 597 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 598 | "from sklearn.pipeline import Pipeline\n", 599 | "from sklearn.linear_model import LinearRegression, Ridge\n", 600 | "from warnings import simplefilter\n", 601 | "simplefilter(action='ignore', category=DeprecationWarning)\n", 602 | "\n", 603 | "\n", 604 | "\n", 605 | "param_grid = dict( scale=['passthrough', StandardScaler(), Normalizer()]\n", 606 | ")\n", 607 | " \n", 608 | "print(param_grid)\n", 609 | "\n", 610 | "y = dum_df['cena_za_metr']\n", 611 | "X = dum_df.drop(['cena_za_metr', 'opis'], axis=1)\n", 612 | "\n", 613 | "pipe = Pipeline([\n", 614 | " ('scale', 'passthrough'),\n", 615 | " ('regression', Ridge())\n", 616 | "])\n", 617 | "\n", 618 | "grid_search = GridSearchCV(pipe, param_grid, verbose=1, cv=3)\n", 619 | "\n", 620 | "\n", 621 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=17)\n", 622 | "\n", 623 | "t0 = time()\n", 624 | "grid_search.fit(X_train, y_train)\n", 625 | "print(\"done in %0.3fs\" % (time() - t0))\n", 626 | "print(\"Best parameters set:\")\n", 627 | "print(grid_search.best_estimator_)\n", 628 | "print(grid_search.best_score_)" 629 | ] 630 | }, 631 | { 632 | "cell_type": "markdown", 633 | "metadata": {}, 634 | "source": [ 635 | "https://scikit-learn.org/stable/modules/generated/sklearn.compose.TransformedTargetRegressor.html" 636 | ] 637 | } 638 | ], 639 | "metadata": { 640 | "kernelspec": { 641 | "display_name": "Python 3", 642 | "language": "python", 643 | "name": "python3" 644 | }, 645 | "language_info": { 646 | "codemirror_mode": { 647 | "name": "ipython", 648 | "version": 3 649 | }, 650 | "file_extension": ".py", 651 | "mimetype": "text/x-python", 652 | "name": "python", 653 | "nbconvert_exporter": "python", 654 | "pygments_lexer": "ipython3", 655 | "version": "3.7.5" 656 | } 657 | }, 658 | "nbformat": 4, 659 | "nbformat_minor": 4 660 | } 661 | -------------------------------------------------------------------------------- /MachineLearning4.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Machine Learning 4" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "### - Feature Engineering\n", 15 | "### - Ostateczny kształt `Pipeline`\n", 16 | "### - Problemy z trenowaniem modelu\n", 17 | "### - Materiały do dalszej nauki\n", 18 | "### - Projekt do realizacji\n", 19 | "\n", 20 | "---\n", 21 | "## Feature Engineering" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "---\n", 29 | "# The features you use influence more than everything else the result. \n", 30 | "# No algorithm alone, to my knowledge, can supplement the information gain given by correct feature engineering.\n", 31 | "#
— Luca Massaron, Author, Kaggle master
" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": {}, 37 | "source": [ 38 | "---" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "# Coming up with features is difficult, time-consuming, requires expert knowledge.\n", 46 | "# \"_*Applied machine learning*_\" is basically feature engineering.\n", 47 | "#
— Andrew Ng
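One item from the techniques list below — "przejście na zmienne biegunowe" (polar/cyclic encoding of dates) — in a minimal sketch; the column names are mine:

```python
# Sketch: cyclic (sin/cos) encoding of month, so December (12) lands near January (1)
# in feature space instead of 11 units away.
import numpy as np
import pandas as pd

df = pd.DataFrame({'month': [1, 3, 6, 12]})
df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
print(df)
```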
\n" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "---\n", 55 | "## Techniki Inżynierii Wymiarów\n", 56 | "### Liczby\n", 57 | "- Binaryzacja\n", 58 | "- Kubełkowanie (stała szerokość lub kwantyle)\n", 59 | "- Skalowanie \n", 60 | " - wygładzanie (dodanie +1 - _*Wygładzanie Laplace'a*_)\n", 61 | " - min-max \n", 62 | " - logarytm \n", 63 | " - standaryzacja (skalowanie o wariancję)\n", 64 | " - NIE gubimy zer! (dane rzadkie w gęste przez np. odjęcie średniej)\n", 65 | " - zaawansowane skalowanie: TF-IDF\n", 66 | "\n", 67 | "### Agregacje\n", 68 | "- sumy, średnie, wariancje, dalsze momenty (np. per kategoria)\n", 69 | "- przejście z liczb bezwzględnych na względne (np. w kategorii)\n", 70 | "- przejście na z wartości na rank (kolejność) \n", 71 | "\n", 72 | "\n", 73 | "### Kategorie\n", 74 | "- dummy encoding\n", 75 | "- feature hashing\n", 76 | "- redukcja wymiarów\n", 77 | "\n", 78 | "### Kategorie porządkowe np. daty\n", 79 | "- rozbicie na elementy (dzień, miesiąc, rok, kwartał, dzień tygodnia, dzień miesiąca)\n", 80 | "- przejście na zmienne biegunowe\n", 81 | "\n", 82 | "![Zmienne biegunowe](Picture1.png)\n", 83 | "\n", 84 | "---\n", 85 | "## Ostateczny kształt `Pipeline`" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "import pandas as pd\n", 95 | "from numpy import log2\n", 96 | "\n", 97 | "data = pd.read_csv('adverts_29_04.csv', sep=';')\n", 98 | "\n", 99 | "data['cena_za_metr'] = data['Cena'] / data['Wielkość (m2)']\n", 100 | "data[\"log\"] = data['Wielkość (m2)'].apply(lambda x: log2(x))\n", 101 | "data['msc'] = data['Data dodania'].apply(lambda x: x[3:])\n", 102 | "\n", 103 | "data = data.dropna(subset=['cena_za_metr'])\n", 104 | "\n", 105 | "df = data.drop(['Cena', 'Data dodania'], axis=1)\n", 106 | "\n", 107 | "dum_df = pd.get_dummies(df, columns=['msc', 'Lokalizacja', 'Na sprzedaż przez', 'Rodzaj nieruchomości', 'Liczba pokoi', 'Liczba łazienek', 'Parking'])\n" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "import gzip\n", 117 | "import sys\n", 118 | "import re\n", 119 | "import re\n", 120 | "\n", 121 | "splitter = re.compile(r'[^ąąćęńłóóśśżżź\\w]+')\n", 122 | "isnumber = re.compile(r'[0-9]')\n", 123 | "\n", 124 | "f = gzip.open('odm.txt.gz', 'rt', encoding='utf-8')\n", 125 | "dictionary = {}\n", 126 | "set_dict= set()\n", 127 | "\n", 128 | "for x in f:\n", 129 | " t = x.strip().split(',')\n", 130 | " tt = [ x.strip().lower() for x in t]\n", 131 | " for w in tt:\n", 132 | " set_dict.add(w)\n", 133 | " dictionary[w]=tt[0]\n", 134 | "\n", 135 | "def lematize(w):\n", 136 | " w = w.replace('ą','ą')\n", 137 | " w = w.replace('ó','ó')\n", 138 | " w = w.replace('ę','ę')\n", 139 | " w = w.replace('ż','ż')\n", 140 | " return dictionary.get(w,w)\n", 141 | "\n", 142 | "opis1 = dum_df['opis'][0]\n", 143 | "\n", 144 | "\n", 145 | "\n", 146 | "raw_corpus=[]\n", 147 | "n=0\n", 148 | "\n", 149 | "for i in dum_df.iterrows():\n", 150 | " n+=1\n", 151 | " l = list(splitter.split(i[1][1]))\n", 152 | " raw_corpus.append(l)\n", 153 | "\n", 154 | " \n", 155 | "all_words = []\n", 156 | "for t in raw_corpus:\n", 157 | " all_words[0:0] = t\n", 158 | "\n", 159 | "words = {}\n", 160 | "for w in all_words:\n", 161 | " rec = words.get(w.lower(), {'upper':0, 'lower': 0})\n", 162 | " if w.lower()==w or w.upper()==w:\n", 163 | " rec['lower'] = rec['lower'] +1\n", 
164 | " else: \n", 165 | " rec['upper'] = rec['upper'] +1\n", 166 | " words[w.lower()] = rec\n", 167 | "\n", 168 | "raw_stop_words = [ x for x in words.keys() if words[x]['upper']>=words[x]['lower']*4 ] \n", 169 | "\n", 170 | "set_raw_stop_words = set(raw_stop_words)\n", 171 | "\n", 172 | "def preprocessing(opis, filter_raw=True, filter_dict=True):\n", 173 | " opis = str(opis)\n", 174 | " tokenized = splitter.split(opis)\n", 175 | " l = list(tokenized)\n", 176 | " l = [ x.lower() for x in l ]\n", 177 | " l = [ x for x in l if len(x) > 2]\n", 178 | " l = [ x for x in l if x.find('_') < 0]\n", 179 | " l = [ x for x in l if isnumber.search(x) is None ]\n", 180 | " if filter_raw: l = [ x for x in l if x not in set_raw_stop_words ]\n", 181 | " if filter_dict: l = [ x for x in l if x in set_dict ]\n", 182 | " l = [ lematize(x) for x in l ]\n", 183 | " l = [ x for x in l if len(x) > 2]\n", 184 | " return l" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "opis1" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": null, 199 | "metadata": {}, 200 | "outputs": [], 201 | "source": [ 202 | "print(preprocessing(opis1))" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": null, 208 | "metadata": {}, 209 | "outputs": [], 210 | "source": [ 211 | "print(preprocessing(opis1, filter_raw=False))" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": null, 217 | "metadata": {}, 218 | "outputs": [], 219 | "source": [ 220 | "print(preprocessing(opis1, filter_dict=False))" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": null, 226 | "metadata": {}, 227 | "outputs": [], 228 | "source": [ 229 | "print(preprocessing(opis1, filter_raw=False, filter_dict=False))" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": null, 235 | "metadata": {}, 236 | "outputs": [], 237 | "source": [ 238 | "dum_df[\"opisTT\"] = dum_df[\"opis\"].apply(lambda x: ' '.join(preprocessing(x,filter_raw=True, filter_dict=True)))\n", 239 | "dum_df[\"opisTF\"] = dum_df[\"opis\"].apply(lambda x: ' '.join(preprocessing(x,filter_raw=True, filter_dict=False)))\n", 240 | "dum_df[\"opisFT\"] = dum_df[\"opis\"].apply(lambda x: ' '.join(preprocessing(x,filter_raw=False, filter_dict=True)))\n", 241 | "dum_df[\"opisFF\"] = dum_df[\"opis\"].apply(lambda x: ' '.join(preprocessing(x,filter_raw=False, filter_dict=False)))" 242 | ] 243 | }, 244 | { 245 | "cell_type": "markdown", 246 | "metadata": {}, 247 | "source": [ 248 | "```python\n", 249 | "from sklearn.ensemble import GradientBoostingRegressor\n", 250 | "from sklearn.svm import SVR\n", 251 | "from sklearn.base import BaseEstimator, TransformerMixin\n", 252 | "from sklearn.decomposition import TruncatedSVD\n", 253 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 254 | "from sklearn.pipeline import Pipeline\n", 255 | "from sklearn.linear_model import LinearRegression\n", 256 | "from sklearn.model_selection import train_test_split\n", 257 | "from sklearn.model_selection import cross_val_score\n", 258 | "from sklearn.model_selection import GridSearchCV\n", 259 | "from time import time\n", 260 | "from sklearn.preprocessing import StandardScaler, Normalizer, RobustScaler\n", 261 | "from sklearn.compose import TransformedTargetRegressor\n", 262 | "from sklearn.pipeline import FeatureUnion\n", 263 | "\n", 264 | "class ItemSelector(BaseEstimator, TransformerMixin):\n", 265 | 
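 " # pomocniczy transformer: z DataFrame wybiera jedną kolumnę `key`\n",
 " # (fit niczego nie uczy), dzięki czemu gałąź 'description'\n",
 " # w FeatureUnion dostaje sam tekst opisu dla TfidfVectorizer\n",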
" def __init__(self, key=''):\n", 266 | " self.key = key\n", 267 | "\n", 268 | " def fit(self, x, y=None):\n", 269 | " return self\n", 270 | "\n", 271 | " def transform(self, data_dict):\n", 272 | " return data_dict[self.key]\n", 273 | "\n", 274 | "class ItemUnSelector(BaseEstimator, TransformerMixin):\n", 275 | " def __init__(self, keys=[]):\n", 276 | " self.keys = keys\n", 277 | "\n", 278 | " def fit(self, x, y=None):\n", 279 | " return self\n", 280 | "\n", 281 | " def transform(self, data_dict):\n", 282 | " return data_dict.drop(self.keys, axis=1)\n", 283 | "\n", 284 | "\n", 285 | "pipeline = Pipeline([\n", 286 | " ('union', \n", 287 | " FeatureUnion(\n", 288 | " transformer_list=[\n", 289 | " ('table', \n", 290 | " Pipeline([\n", 291 | " ('selector1', ItemUnSelector(keys=['opis', 'opisTT', 'opisTF', 'opisFT', 'opisFF'])),\n", 292 | " ('scaler1', 'passthrough')\n", 293 | " ])\n", 294 | " ),\n", 295 | " ('description', \n", 296 | " Pipeline([\n", 297 | " ('selector2', ItemSelector()),\n", 298 | " ('tfidf', TfidfVectorizer()),\n", 299 | " ('best', TruncatedSVD()),\n", 300 | " ('scaler2', 'passthrough')\n", 301 | " ])\n", 302 | " )\n", 303 | " ]\n", 304 | " ) \n", 305 | "\n", 306 | " ),\n", 307 | " ('regressor', \n", 308 | " TransformedTargetRegressor()\n", 309 | " )\n", 310 | "])\n", 311 | "\n", 312 | "parameters = parameters = {\n", 313 | " 'union__transformer_weights': [ { 'table': 3.0, 'description': 1.0}, { 'table': 2.0, 'description': 1.0}, { 'table': 1.0, 'description': 1.0}],\n", 314 | "\n", 315 | " 'union__description__best__n_components': (650, 700, 750),\n", 316 | " 'union__description__tfidf__min_df': (3, 4, 5),\n", 317 | " 'union__description__tfidf__binary': (True,False),\n", 318 | " 'union__description__selector2__key': ['opisTT', 'opisTF', 'opisFT', 'opisFF'] ,\n", 319 | " \n", 320 | " 'union__table__scaler1': ['passthrough', StandardScaler(), Normalizer(), RobustScaler()],\n", 321 | " 'union__description__scaler2': ['passthrough', StandardScaler(), Normalizer(), RobustScaler(with_centering=False)],\n", 322 | " \n", 323 | " 'regressor': [SVR(kernel='rbf', C=10000), SVR(kernel='linear', C=10000), GradientBoostingRegressor()] ,\n", 324 | "}\n", 325 | "\n", 326 | "grid_search = GridSearchCV(pipeline, parameters, verbose=1, cv=10, n_jobs=-1)\n", 327 | "\n", 328 | "\n", 329 | "y = dum_df['cena_za_metr']\n", 330 | "X = dum_df.drop(['cena_za_metr'], axis=1)\n", 331 | "\n", 332 | "t0 = time()\n", 333 | "grid_search.fit(X, y)\n", 334 | "print(\"done in %0.3fs\" % (time() - t0))\n", 335 | "\n", 336 | "print(\"Best parameters set:\")\n", 337 | "print(grid_search.cv_results_)\n", 338 | "print(grid_search.best_score_)\n", 339 | "print()\n", 340 | "best_parameters = grid_search.best_estimator_.get_params()\n", 341 | "for param_name in sorted(parameters.keys()):\n", 342 | " print(\"\\t%s: %r\" % (param_name, best_parameters[param_name]))\n", 343 | "```" 344 | ] 345 | }, 346 | { 347 | "cell_type": "markdown", 348 | "metadata": {}, 349 | "source": [ 350 | "---\n", 351 | "## Problemy z trenowaniem modelu\n", 352 | "\n", 353 | "\n", 354 | "### To ile tych prób mamy ?\n", 355 | "\n", 356 | "- 3 zestawy wag `union`\n", 357 | "- 3 zestawy wymiarów SVD\n", 358 | "- 6 zestawów parametrów TF-IDF\n", 359 | "- 4 zbiory danych tekstowych\n", 360 | "- 4 mechanizmy skalowania części `table`\n", 361 | "- 4 mechanizmy skalowania części `description`\n", 362 | "- 3 regresory\n", 363 | "- 10 walidacji krzyżowych" 364 | ] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "execution_count": null, 369 | 
"metadata": {}, 370 | "outputs": [], 371 | "source": [ 372 | "3 * 3 * 6 * 4 * 4 * 4 * 3 * 10" 373 | ] 374 | }, 375 | { 376 | "cell_type": "code", 377 | "execution_count": null, 378 | "metadata": {}, 379 | "outputs": [], 380 | "source": [ 381 | "from sklearn.ensemble import GradientBoostingRegressor\n", 382 | "from sklearn.svm import SVR\n", 383 | "from sklearn.base import BaseEstimator, TransformerMixin\n", 384 | "from sklearn.decomposition import TruncatedSVD\n", 385 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 386 | "from sklearn.pipeline import Pipeline\n", 387 | "from sklearn.linear_model import LinearRegression\n", 388 | "from sklearn.model_selection import train_test_split\n", 389 | "from sklearn.model_selection import cross_val_score\n", 390 | "from sklearn.model_selection import GridSearchCV\n", 391 | "from time import time\n", 392 | "from sklearn.preprocessing import StandardScaler, Normalizer, RobustScaler\n", 393 | "from sklearn.compose import TransformedTargetRegressor\n", 394 | "from sklearn.pipeline import FeatureUnion\n", 395 | "\n", 396 | "class ItemSelector(BaseEstimator, TransformerMixin):\n", 397 | " def __init__(self, key=''):\n", 398 | " self.key = key\n", 399 | "\n", 400 | " def fit(self, x, y=None):\n", 401 | " return self\n", 402 | "\n", 403 | " def transform(self, data_dict):\n", 404 | " return data_dict[self.key]\n", 405 | "\n", 406 | "class ItemUnSelector(BaseEstimator, TransformerMixin):\n", 407 | " def __init__(self, keys=[]):\n", 408 | " self.keys = keys\n", 409 | "\n", 410 | " def fit(self, x, y=None):\n", 411 | " return self\n", 412 | "\n", 413 | " def transform(self, data_dict):\n", 414 | " return data_dict.drop(self.keys, axis=1)\n", 415 | "\n", 416 | "\n", 417 | "pipeline = Pipeline([\n", 418 | " ('union', \n", 419 | " FeatureUnion(\n", 420 | " transformer_list=[\n", 421 | " ('table', \n", 422 | " Pipeline([\n", 423 | " ('selector1', ItemUnSelector(keys=['opis', 'opisTT', 'opisTF', 'opisFT', 'opisFF'])),\n", 424 | " ('scaler1', 'passthrough')\n", 425 | " ])\n", 426 | " ),\n", 427 | " ('description', \n", 428 | " Pipeline([\n", 429 | " ('selector2', ItemSelector()),\n", 430 | " ('tfidf', TfidfVectorizer()),\n", 431 | " ('best', TruncatedSVD()),\n", 432 | " ('scaler2', 'passthrough')\n", 433 | " ])\n", 434 | " )\n", 435 | " ]\n", 436 | " ) \n", 437 | "\n", 438 | " ),\n", 439 | " ('regressor', \n", 440 | " TransformedTargetRegressor()\n", 441 | " )\n", 442 | "])\n", 443 | "\n", 444 | "parameters = parameters = {\n", 445 | " 'union__transformer_weights': [ { 'table': 1.0, 'description': 1.0}],\n", 446 | "\n", 447 | " 'union__description__best__n_components': (700,),\n", 448 | " 'union__description__tfidf__min_df': (3,),\n", 449 | " 'union__description__tfidf__binary': (True,),\n", 450 | " 'union__description__selector2__key': [ 'opisFF'] ,\n", 451 | " \n", 452 | " 'union__table__scaler1': [ RobustScaler()],\n", 453 | " 'union__description__scaler2': [ RobustScaler(with_centering=False)],\n", 454 | " \n", 455 | " 'regressor': [ GradientBoostingRegressor()] ,\n", 456 | "}\n", 457 | "\n", 458 | "grid_search = GridSearchCV(pipeline, parameters, verbose=1, cv=10, n_jobs=-1)\n", 459 | "\n", 460 | "\n", 461 | "y = dum_df['cena_za_metr']\n", 462 | "X = dum_df.drop(['cena_za_metr'], axis=1)\n", 463 | "\n", 464 | "t0 = time()\n", 465 | "grid_search.fit(X, y)\n", 466 | "print(\"done in %0.3fs\" % (time() - t0))\n", 467 | "\n", 468 | "print(f'Best score: {grid_search.best_score_}')\n", 469 | "\n", 470 | "print(\"Best parameters set:\")\n", 471 | 
"print()\n", 472 | "best_parameters = grid_search.best_estimator_.get_params()\n", 473 | "for param_name in sorted(parameters.keys()):\n", 474 | " print(\"\\t%s: %r\" % (param_name, best_parameters[param_name]))" 475 | ] 476 | }, 477 | { 478 | "cell_type": "code", 479 | "execution_count": null, 480 | "metadata": {}, 481 | "outputs": [], 482 | "source": [ 483 | "secs = 3 * 3 * 6 * 4 * 4 * 4 * 3 * 30 " 484 | ] 485 | }, 486 | { 487 | "cell_type": "code", 488 | "execution_count": null, 489 | "metadata": {}, 490 | "outputs": [], 491 | "source": [ 492 | "secs/(3600*24)" 493 | ] 494 | }, 495 | { 496 | "cell_type": "markdown", 497 | "metadata": {}, 498 | "source": [ 499 | "---\n", 500 | "## Poprawa skuteczności\n", 501 | "### Więcej informacji\n", 502 | "- poziomo - więcej danych (skąd ? czy to nie zaburzy modelu ?)\n", 503 | "- pionowo - więcej wymiarów\n", 504 | " - więcej danych (zdjęcia ?)\n", 505 | " - więcej wymiarów - Feature Engineering\n", 506 | "---\n", 507 | "## Materiały do dalszej nauki\n", 508 | "- Udemy - https://www.udemy.com/course/introduction-to-data-science-using-python/\n", 509 | "- Udemy - https://www.udemy.com/course/python-scrapy-for-beginners/\n", 510 | "- edX - https://www.edx.org/course/introduction-to-python-for-data-science-2\n", 511 | "- Coursera - IBM https://www.coursera.org/learn/python-for-applied-data-science-ai\n", 512 | "- Coursera - Stanford Machine Learning https://www.coursera.org/learn/machine-learning\n", 513 | "\n", 514 | "### Tego jest dużo ...\n", 515 | "https://www.forbes.com/sites/bernardmarr/2020/02/24/the-9-best-free-online-data-science-courses-in-2020/\n", 516 | "https://www.dataquest.io/blog/free-books-learn-data-science/\n", 517 | "100+ - https://www.learndatasci.com/free-data-science-books/\n", 518 | "\n", 519 | "https://jakevdp.github.io/PythonDataScienceHandbook/" 520 | ] 521 | }, 522 | { 523 | "cell_type": "markdown", 524 | "metadata": {}, 525 | "source": [ 526 | "---\n", 527 | "# Temat Projektu\n", 528 | "\n", 529 | "- Pobierz dane (`Scrapy`, `requests` ...) - ok. 
1000 rekordów (im więcej, tym lepiej)\n", 530 | "- Przygotuj dane do analizy (`Beautiful Soup`, `lxml`) \n", 531 | "- Zbuduj `Pipeline`\n", 532 | "- Wytrenuj jak najlepszy model" 533 | ] 534 | }, 535 | { 536 | "cell_type": "markdown", 537 | "metadata": {}, 538 | "source": [ 539 | "---" 540 | ] 541 | } 542 | ], 543 | "metadata": { 544 | "kernelspec": { 545 | "display_name": "Python 3", 546 | "language": "python", 547 | "name": "python3" 548 | }, 549 | "language_info": { 550 | "codemirror_mode": { 551 | "name": "ipython", 552 | "version": 3 553 | }, 554 | "file_extension": ".py", 555 | "mimetype": "text/x-python", 556 | "name": "python", 557 | "nbconvert_exporter": "python", 558 | "pygments_lexer": "ipython3", 559 | "version": "3.7.5" 560 | } 561 | }, 562 | "nbformat": 4, 563 | "nbformat_minor": 4 564 | } 565 | -------------------------------------------------------------------------------- /Matma1.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MichalKorzycki/WarsztatPythonDataScience/8fd007d1f780bf09d2c8f151c62034c40ad0f471/Matma1.xlsx -------------------------------------------------------------------------------- /Picture1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MichalKorzycki/WarsztatPythonDataScience/8fd007d1f780bf09d2c8f151c62034c40ad0f471/Picture1.png -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Program Semestru 2 | 3 | ## 1. Język Python 1 4 | - ### podstawowe elementy składni 5 | - ### struktury danych 6 | - ### Środowisko pracy do pracy z danymi – anakonda, jupyter, biblioteki i moduły 7 | 8 | ## 2. Język Python 2 9 | - ### instrukcje sterujące 10 | - ### obsługa błędów 11 | - ### Korzystanie z pythona – notebooks, skrypty 12 | 13 | ## 3. Data Wrangling 1 14 | - ### Tidy Data – co to jest 15 | - ### Data wrangling, munging, tidying 16 | - ### Biblioteka Pandas – wprowadzenie. Czytanie danych 17 | 18 | ## 4. Data Wrangling 2 19 | - ### Data Wrangling w praktyce – podstawowe operacje 20 | - ### Biblioteka Pandas – wybieranie kolumn i „krojenie danych” 21 | 22 | ## 5. Data Wrangling 3 23 | - ### Czyszczenie danych 24 | - ### Pandas – agregacja, grupowanie 25 | 26 | ## 6. Wizualizacja danych 1 27 | - ### Matplotlib – wprowadzenie 28 | - ### Proste wykresy 29 | - ### Konfiguracja wykresu, sztuczki i kruczki 30 | 31 | ## 7. Wizualizacja danych 2 32 | - ### Seaborn – wprowadzenie 33 | - ### Różnice i podobieństwa do Matplotlib 34 | - ### Dash by Plotly - interfejsy webowe 35 | 36 | ## 8. Zewnętrzne źródła danych 37 | - ### Pojęcie API i korzystanie z nich. JSON 38 | - ### Samodzielne pobieranie danych 39 | 40 | ## 9. Scraping 41 | - ### Biblioteka Scrapy 42 | - ### Biblioteki Beautiful Soup, lxml 43 | - ### Ściąganie danych z sieci 44 | 45 | ## 10. Machine Learning 1 46 | - ### Klasyfikacja w ML 47 | - ### Biblioteka scikit 48 | 49 | ## 11. Machine Learning 2 50 | - ### Metryki skuteczności optymalizacja modeli 51 | - ### Trening klasyfikatorów w scikit 52 | 53 | ## 12. Machine Learning 3 54 | - ### Wybór optymalnego modelu 55 | - ### Badanie charakterystyk modeli 56 | - ### Grid search w scikit 57 | 58 | ## 13. Machine Learning 4 59 | - ### Regresja w ML 60 | - ### Regresja w Scikit 61 | 62 | ## 14. 
Wprowadzenie do maszynowego przetwarzania tekstu 63 | - ### Specyfika danych tekstowych 64 | - ### Podstawowe metryki dla danych tekstowych 65 | - ### Klasyfikacja dokumentów w Scikit. 66 | 67 | Aby odtworzyć środowisko wykładu można skorzystać z polecenia: 68 | `conda create --name <env> --file requirements.txt` -------------------------------------------------------------------------------- /SKM Statystyki - Python 7.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Wykład 7 - Web Scraping\n", 8 | "\n", 9 | "## Spiders\n", 10 | "## Generators\n", 11 | "## Selectors" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "---\n", 19 | "## Lista obecności\n", 20 | "## http://bit.ly/SKNSwyklad0403 " 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "---" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "## Zadanie 1\n", 35 | "\n", 36 | "Narysuj wykres o kształcie okręgu" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "%matplotlib inline\n", 46 | "import matplotlib.pyplot as plt\n", 47 | "\n", 48 | "circle=plt.Circle((0,0),2)\n", 49 | "ax=plt.gca()\n", 50 | "ax.add_patch(circle)\n", 51 | "\n", 52 | "plt.axis('scaled')\n", 53 | "plt.show()" 54 | ] 55 | }, 56 | { 57 | "cell_type": "raw", 58 | "metadata": {}, 59 | "source": [ 60 | "plt.gca(projection='polar')\n", 61 | "If the current axes doesn't exist, or isn't a polar one, the appropriate axes will be created and then returned." 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "import matplotlib.pyplot as plt\n", 71 | "import numpy as np\n", 72 | "\n", 73 | "x = np.linspace(-1, 1, 100)\n", 74 | "\n", 75 | "plt.plot(x,np.sqrt(1-x*x))\n", 76 | "plt.plot(x,-np.sqrt(1-x*x))\n", 77 | "\n", 78 | "plt.show()" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "import matplotlib.pyplot as plt\n", 88 | "import numpy as np\n", 89 | "\n", 90 | "x = np.linspace(-1, 1, 100)\n", 91 | "\n", 92 | "plt.plot(x,np.sqrt(1-x*x))\n", 93 | "plt.plot(x,-np.sqrt(1-x*x))\n", 94 | "plt.axis('scaled')\n", 95 | "plt.show()" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "import matplotlib.pyplot as plt\n", 105 | "import numpy as np\n", 106 | "\n", 107 | "# Plot circle of radius 3\n", 108 | "\n", 109 | "an = np.linspace(0, 2*np.pi, 100)\n", 110 | "\n", 111 | "plt.subplot(331)\n", 112 | "plt.plot(3*np.cos(an), 3*np.sin(an))\n", 113 | "plt.title('not equal, looks like ellipse', fontsize=10)\n", 114 | "\n", 115 | "plt.subplot(333)\n", 116 | "plt.plot(3*np.cos(an), 3*np.sin(an))\n", 117 | "plt.axis('equal')\n", 118 | "plt.title('equal, looks like circle', fontsize=10)\n", 119 | "\n", 120 | "plt.subplot(337)\n", 121 | "plt.plot(3*np.cos(an), 3*np.sin(an))\n", 122 | "plt.axis('equal')\n", 123 | "plt.axis([-3, 3, -3, 3])\n", 124 | "plt.title('looks like circle, even after changing limits', fontsize=10)\n", 125 | "\n", 126 | "plt.subplot(339)\n", 127 | "plt.plot(3*np.cos(an), 3*np.sin(an))\n", 128 | "plt.axis('equal')\n", 129 | "plt.axis([-3, 3, -3,
3])\n", 130 | "plt.plot([0, 3], [0, 3])\n", 131 | "plt.title('still equal after adding line', fontsize=10)\n", 132 | "\n", 133 | "plt.show()" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "import matplotlib.pyplot as plt\n", 143 | "import numpy as np\n", 144 | "\n", 145 | "an = np.linspace(0, 2*np.pi, 100)\n", 146 | "\n", 147 | "plt.plot(3*np.cos(an)*1, 3*np.sin(an)*1)\n", 148 | "plt.axis('equal');" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "import matplotlib.pyplot as plt\n", 158 | "import numpy as np\n", 159 | "\n", 160 | "an = np.linspace(0, 5*2*np.pi, 5*100)\n", 161 | "\n", 162 | "plt.plot(3*np.cos(an)*an, 3*np.sin(an)*an)\n", 163 | "plt.axis('equal');" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "metadata": {}, 170 | "outputs": [], 171 | "source": [ 172 | "import matplotlib.pyplot as plt\n", 173 | "import numpy as np\n", 174 | "\n", 175 | "an = np.linspace(0, 5*2*np.pi, 5*100)\n", 176 | "\n", 177 | "plt.polar(an, an);" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [ 186 | "import matplotlib.pyplot as plt\n", 187 | "import numpy as np\n", 188 | "\n", 189 | "an = np.linspace(0, 5*2*np.pi, 5*100)\n", 190 | "\n", 191 | "plt.polar(an, an, 'r+');" 192 | ] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "metadata": {}, 197 | "source": [ 198 | "https://matplotlib.org/3.1.3/tutorials/introductory/sample_plots.html" 199 | ] 200 | }, 201 | { 202 | "cell_type": "markdown", 203 | "metadata": {}, 204 | "source": [ 205 | "---" 206 | ] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "metadata": {}, 211 | "source": [ 212 | "## Web Scraping" 213 | ] 214 | }, 215 | { 216 | "cell_type": "raw", 217 | "metadata": {}, 218 | "source": [ 219 | "Skrypty znajdują się w katalogu wyklad7" 220 | ] 221 | }, 222 | { 223 | "cell_type": "raw", 224 | "metadata": {}, 225 | "source": [ 226 | "Po każdym skrypcie znajduje się polecenie do jego odpalenia w danym katalogu poprzedzone promptem \">\"" 227 | ] 228 | }, 229 | { 230 | "cell_type": "markdown", 231 | "metadata": {}, 232 | "source": [ 233 | "### Skrypt scrapy1.py" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "metadata": {}, 239 | "source": [ 240 | "```python\n", 241 | "import scrapy\n", 242 | "from scrapy.crawler import CrawlerProcess\n", 243 | "\n", 244 | "class PythonEventsSpider(scrapy.Spider):\n", 245 | " name = 'pythoneventsspider'\n", 246 | "\n", 247 | " start_urls = ['https://www.python.org/events/python-events/',]\n", 248 | " found_events = []\n", 249 | "\n", 250 | " def parse(self, response):\n", 251 | " for event in response.xpath('//ul[contains(@class, \"list-recent-events\")]/li'):\n", 252 | " event_details = dict()\n", 253 | " event_details['name'] = event.xpath('h3[@class=\"event-title\"]/a/text()').extract_first()\n", 254 | " event_details['location'] = event.xpath('p/span[@class=\"event-location\"]/text()').extract_first()\n", 255 | " event_details['time'] = event.xpath('p/time/text()').extract_first()\n", 256 | " self.found_events.append(event_details)\n", 257 | " \n", 258 | "print(\"Scrapy Example 1\")\n", 259 | "process = CrawlerProcess({ 'LOG_LEVEL': 'ERROR'})\n", 260 | "process.crawl(PythonEventsSpider)\n", 261 | "spider = next(iter(process.crawlers)).spider\n", 262 | 
"process.start()\n", 263 | "\n", 264 | "for event in spider.found_events: \n", 265 | " print(event)\n", 266 | " \n", 267 | "```" 268 | ] 269 | }, 270 | { 271 | "cell_type": "raw", 272 | "metadata": {}, 273 | "source": [ 274 | "> python scrapy1.py" 275 | ] 276 | }, 277 | { 278 | "cell_type": "markdown", 279 | "metadata": {}, 280 | "source": [ 281 | "```python\n", 282 | "import scrapy\n", 283 | "\n", 284 | "class BlogSpider(scrapy.Spider):\n", 285 | " name = 'blogspider'\n", 286 | " start_urls = ['https://blog.scrapinghub.com']\n", 287 | "\n", 288 | " def parse(self, response):\n", 289 | " for title in response.css('.post-header>h2'):\n", 290 | " yield {'title': title.css('a ::text').get()}\n", 291 | "\n", 292 | " for next_page in response.css('a.next-posts-link'):\n", 293 | " yield response.follow(next_page, self.parse)\n", 294 | "```" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "metadata": {}, 301 | "outputs": [], 302 | "source": [ 303 | "> scrapy runspider myspider.py" 304 | ] 305 | }, 306 | { 307 | "cell_type": "markdown", 308 | "metadata": {}, 309 | "source": [ 310 | "### *Generatory* są mechanizmem\n", 311 | "* tworzenia iteratorów\n", 312 | "* Zwraca dane przez *yield*\n", 313 | "* Każde wywołanie _next()_ zaczyna od miejsca gdzie skończył poprzedni krok\n", 314 | "* _next()_ tworzona jest automatycznie" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": null, 320 | "metadata": {}, 321 | "outputs": [], 322 | "source": [ 323 | "range(5, -1, -1)" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": null, 329 | "metadata": {}, 330 | "outputs": [], 331 | "source": [ 332 | "list(range(5, -1, -1))" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": null, 338 | "metadata": {}, 339 | "outputs": [], 340 | "source": [ 341 | "for i in range(5, -1, -1):\n", 342 | " print(i)" 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": null, 348 | "metadata": {}, 349 | "outputs": [], 350 | "source": [ 351 | "def reverse(data):\n", 352 | " for index in range(len(data)-1, -1, -1):\n", 353 | " #print(index)\n", 354 | " yield data[index]" 355 | ] 356 | }, 357 | { 358 | "cell_type": "code", 359 | "execution_count": null, 360 | "metadata": {}, 361 | "outputs": [], 362 | "source": [ 363 | "for c in reverse('Python'):\n", 364 | " print (c)" 365 | ] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "execution_count": null, 370 | "metadata": {}, 371 | "outputs": [], 372 | "source": [ 373 | "def reverse(data):\n", 374 | " for index in range(len(data)-1, -1, -1):\n", 375 | " print(index)\n", 376 | " yield data[index]" 377 | ] 378 | }, 379 | { 380 | "cell_type": "code", 381 | "execution_count": null, 382 | "metadata": {}, 383 | "outputs": [], 384 | "source": [ 385 | "for c in reverse('Python'):\n", 386 | " print (c)" 387 | ] 388 | }, 389 | { 390 | "cell_type": "raw", 391 | "metadata": {}, 392 | "source": [ 393 | "> scrapy runspider myspider.py -t csv -o outputfile.csv" 394 | ] 395 | }, 396 | { 397 | "cell_type": "markdown", 398 | "metadata": {}, 399 | "source": [ 400 | "## Selectors" 401 | ] 402 | }, 403 | { 404 | "cell_type": "markdown", 405 | "metadata": {}, 406 | "source": [ 407 | "- Parsel https://parsel.readthedocs.io/en/latest/\n", 408 | " - Using XPath https://www.w3schools.com/xml/xpath_intro.asp\n", 409 | " - Using CSS https://developer.mozilla.org/en-US/docs/Learn/CSS/Building_blocks/Selectors\n", 410 | "- Beautiful Soup 
https://www.crummy.com/software/BeautifulSoup/bs4/doc/" 411 | ] 412 | }, 413 | { 414 | "cell_type": "markdown", 415 | "metadata": {}, 416 | "source": [ 417 | "![HTML Tree](img/pobrane.png \"Html Tree\")" 418 | ] 419 | }, 420 | { 421 | "cell_type": "code", 422 | "execution_count": null, 423 | "metadata": {}, 424 | "outputs": [], 425 | "source": [ 426 | "from scrapy.selector import Selector\n", 427 | "\n", 428 | "body = '<html><body><span>good</span></body></html>'\n", 429 | "Selector(text=body).xpath('//span/text()').get()" 430 | ] 431 | }, 432 | { 433 | "cell_type": "code", 434 | "execution_count": null, 435 | "metadata": {}, 436 | "outputs": [], 437 | "source": [ 438 | "from parsel import Selector\n", 439 | "\n", 440 | "sel = Selector(text=u\"\"\"\n", 441 | " <html><body>\n", 442 | " <h1>Hello, Parsel!</h1>\n", 443 | " <ul>\n", 444 | " <li><a href=\"http://example.com\">Link 1</a></li>\n", 445 | " <li><a href=\"http://example.com\">Link 2</a></li>\n", 446 | " </ul>\n", 447 | " </body></html>\n", 448 | " \"\"\")" 449 | ] 450 | }, 451 | { 452 | "cell_type": "code", 453 | "execution_count": null, 454 | "metadata": {}, 455 | "outputs": [], 456 | "source": [ 457 | "sel.css('h1::text').get()\n" 458 | ] 459 | }, 460 | { 461 | "cell_type": "markdown", 462 | "metadata": {}, 463 | "source": [ 464 | "### Beautiful Soup" 465 | ] 466 | }, 467 | { 468 | "cell_type": "code", 469 | "execution_count": null, 470 | "metadata": {}, 471 | "outputs": [], 472 | "source": [ 473 | "html_doc = \"\"\"\n", 474 | "<html><head><title>The Dormouse's story</title></head>\n", 475 | "<body>\n", 476 | "<p class=\"title\"><b>The Dormouse's story</b></p>\n", 477 | "\n", 478 | "<p class=\"story\">Once upon a time there were three little sisters; and their names were\n", 479 | "<a href=\"http://example.com/elsie\" class=\"sister\" id=\"link1\">Elsie</a>,\n", 480 | "<a href=\"http://example.com/lacie\" class=\"sister\" id=\"link2\">Lacie</a> and\n", 481 | "<a href=\"http://example.com/tillie\" class=\"sister\" id=\"link3\">Tillie</a>;\n", 482 | "and they lived at the bottom of a well.</p>\n", 483 | "\n", 484 | "<p class=\"story\">...</p>
\n", 485 | "\"\"\"" 486 | ] 487 | }, 488 | { 489 | "cell_type": "code", 490 | "execution_count": null, 491 | "metadata": {}, 492 | "outputs": [], 493 | "source": [ 494 | "from bs4 import BeautifulSoup\n", 495 | "soup = BeautifulSoup(html_doc, 'html.parser')\n", 496 | "\n", 497 | "print(soup.prettify())" 498 | ] 499 | }, 500 | { 501 | "cell_type": "code", 502 | "execution_count": null, 503 | "metadata": {}, 504 | "outputs": [], 505 | "source": [ 506 | "soup.p" 507 | ] 508 | }, 509 | { 510 | "cell_type": "code", 511 | "execution_count": null, 512 | "metadata": {}, 513 | "outputs": [], 514 | "source": [ 515 | "soup.p['class']" 516 | ] 517 | }, 518 | { 519 | "cell_type": "code", 520 | "execution_count": null, 521 | "metadata": {}, 522 | "outputs": [], 523 | "source": [ 524 | "soup.a" 525 | ] 526 | }, 527 | { 528 | "cell_type": "code", 529 | "execution_count": null, 530 | "metadata": {}, 531 | "outputs": [], 532 | "source": [ 533 | "soup.find_all('a')" 534 | ] 535 | }, 536 | { 537 | "cell_type": "code", 538 | "execution_count": null, 539 | "metadata": {}, 540 | "outputs": [], 541 | "source": [ 542 | "soup.find(id=\"link3\")" 543 | ] 544 | }, 545 | { 546 | "cell_type": "code", 547 | "execution_count": null, 548 | "metadata": {}, 549 | "outputs": [], 550 | "source": [ 551 | "[ link.get('href') for link in soup.find_all('a')]" 552 | ] 553 | }, 554 | { 555 | "cell_type": "code", 556 | "execution_count": null, 557 | "metadata": {}, 558 | "outputs": [], 559 | "source": [ 560 | "soup.a" 561 | ] 562 | }, 563 | { 564 | "cell_type": "code", 565 | "execution_count": null, 566 | "metadata": {}, 567 | "outputs": [], 568 | "source": [ 569 | "soup.a.find_next_sibling(\"a\")" 570 | ] 571 | }, 572 | { 573 | "cell_type": "code", 574 | "execution_count": null, 575 | "metadata": {}, 576 | "outputs": [], 577 | "source": [ 578 | "soup.p" 579 | ] 580 | }, 581 | { 582 | "cell_type": "code", 583 | "execution_count": null, 584 | "metadata": {}, 585 | "outputs": [], 586 | "source": [ 587 | "soup.p.find_next_sibling(\"p\")" 588 | ] 589 | }, 590 | { 591 | "cell_type": "code", 592 | "execution_count": null, 593 | "metadata": {}, 594 | "outputs": [], 595 | "source": [ 596 | "pn=soup.p.find_next_sibling(\"p\")\n", 597 | "children = pn.children" 598 | ] 599 | }, 600 | { 601 | "cell_type": "code", 602 | "execution_count": null, 603 | "metadata": {}, 604 | "outputs": [], 605 | "source": [ 606 | "children" 607 | ] 608 | }, 609 | { 610 | "cell_type": "code", 611 | "execution_count": null, 612 | "metadata": {}, 613 | "outputs": [], 614 | "source": [ 615 | "lista = [ x for x in children ]\n", 616 | "lista" 617 | ] 618 | }, 619 | { 620 | "cell_type": "code", 621 | "execution_count": null, 622 | "metadata": {}, 623 | "outputs": [], 624 | "source": [ 625 | "lista[1].get('href')" 626 | ] 627 | }, 628 | { 629 | "cell_type": "code", 630 | "execution_count": null, 631 | "metadata": {}, 632 | "outputs": [], 633 | "source": [ 634 | "head_tag = soup.head\n", 635 | "head_tag" 636 | ] 637 | }, 638 | { 639 | "cell_type": "code", 640 | "execution_count": null, 641 | "metadata": {}, 642 | "outputs": [], 643 | "source": [ 644 | "for child in head_tag.children:\n", 645 | " print(child)" 646 | ] 647 | }, 648 | { 649 | "cell_type": "code", 650 | "execution_count": null, 651 | "metadata": {}, 652 | "outputs": [], 653 | "source": [ 654 | "for child in head_tag.descendants:\n", 655 | " print(child)" 656 | ] 657 | }, 658 | { 659 | "cell_type": "code", 660 | "execution_count": null, 661 | "metadata": {}, 662 | "outputs": [], 663 | "source": [ 664 | 
"last_a_tag = soup.find(\"a\", id=\"link3\")\n", 665 | "last_a_tag\n" 666 | ] 667 | }, 668 | { 669 | "cell_type": "code", 670 | "execution_count": null, 671 | "metadata": {}, 672 | "outputs": [], 673 | "source": [ 674 | "last_a_tag.next_sibling" 675 | ] 676 | }, 677 | { 678 | "cell_type": "code", 679 | "execution_count": null, 680 | "metadata": {}, 681 | "outputs": [], 682 | "source": [ 683 | "last_a_tag.next_element" 684 | ] 685 | }, 686 | { 687 | "cell_type": "code", 688 | "execution_count": null, 689 | "metadata": {}, 690 | "outputs": [], 691 | "source": [ 692 | "last_a_tag.parent" 693 | ] 694 | }, 695 | { 696 | "cell_type": "code", 697 | "execution_count": null, 698 | "metadata": {}, 699 | "outputs": [], 700 | "source": [ 701 | "def has_class_but_no_id(tag):\n", 702 | " return tag.has_attr('class') and not tag.has_attr('id')\n", 703 | "\n", 704 | "soup.find_all(has_class_but_no_id)" 705 | ] 706 | }, 707 | { 708 | "cell_type": "code", 709 | "execution_count": null, 710 | "metadata": {}, 711 | "outputs": [], 712 | "source": [ 713 | "soup.find_all(id='link2')" 714 | ] 715 | }, 716 | { 717 | "cell_type": "code", 718 | "execution_count": null, 719 | "metadata": {}, 720 | "outputs": [], 721 | "source": [ 722 | "soup.find_all(\"a\", class_=\"sister\")" 723 | ] 724 | }, 725 | { 726 | "cell_type": "code", 727 | "execution_count": null, 728 | "metadata": {}, 729 | "outputs": [], 730 | "source": [ 731 | "soup.find_all(\"a\")\n", 732 | "soup(\"a\")" 733 | ] 734 | }, 735 | { 736 | "cell_type": "markdown", 737 | "metadata": {}, 738 | "source": [ 739 | "---\n", 740 | "## Zadanie 1\n", 741 | "Wypisać tytuły ogłoszeń z:\n", 742 | "https://www.gumtree.pl/s-mieszkania-i-domy-sprzedam-i-kupie/v1c9073p1" 743 | ] 744 | }, 745 | { 746 | "cell_type": "markdown", 747 | "metadata": {}, 748 | "source": [ 749 | "---\n", 750 | "## Zadanie 2\n", 751 | "Wypisać adresy www ogłoszeń z:\n", 752 | "https://www.gumtree.pl/s-mieszkania-i-domy-sprzedam-i-kupie/v1c9073p1" 753 | ] 754 | }, 755 | { 756 | "cell_type": "markdown", 757 | "metadata": {}, 758 | "source": [ 759 | "---" 760 | ] 761 | } 762 | ], 763 | "metadata": { 764 | "kernelspec": { 765 | "display_name": "Python 3", 766 | "language": "python", 767 | "name": "python3" 768 | }, 769 | "language_info": { 770 | "codemirror_mode": { 771 | "name": "ipython", 772 | "version": 3 773 | }, 774 | "file_extension": ".py", 775 | "mimetype": "text/x-python", 776 | "name": "python", 777 | "nbconvert_exporter": "python", 778 | "pygments_lexer": "ipython3", 779 | "version": "3.7.5" 780 | } 781 | }, 782 | "nbformat": 4, 783 | "nbformat_minor": 4 784 | } 785 | -------------------------------------------------------------------------------- /SKN Statystyki - Python 1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Warsztaty Python w Data Science" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "***" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "# Python 1" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "***" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "# https://github.com/MichalKorzycki/WarsztatPythonDataScience" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "***" 43 | ] 44 | }, 45 | { 46 | "cell_type": 
"markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "# Python" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "Język Python jest:\n", 57 | "- dynamicznym, silnie typowanym językiem skryptowym\n", 58 | "- napędza takie sajty jak Youtube czy Instagram\n", 59 | "- są dwie \"konkurencyjne\" wersje języka - 2.7 i tzw. py3k (3.6, 3.7, 3.8)\n", 60 | "- na razie korzystamy z 3.7\n", 61 | "- może pracować jako skrypty (samodzielny program)\n", 62 | "- albo notebook (to co widzimy)\n", 63 | "- poważnym językiem programowania" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "![title](img/rossum.jpg)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "print (\"Hello World\")" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "import sys\n", 89 | "print (sys.version)" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "![title](img/growth.png)" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": {}, 102 | "source": [ 103 | "![title](img/projection.png)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "![title](img/tags.png)" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": {}, 116 | "source": [ 117 | "***" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": {}, 123 | "source": [ 124 | "# Cel Zajęc\n", 125 | "- ## Celem wykładu jest przekazanie wiedzy z podstaw przetwarzania danych w języku Python\n", 126 | "- ## Celem ćwiczeń jest zbudować system co automatycznie cenę mieszkania na podstawie automatycznie zebranych danych" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "metadata": {}, 132 | "source": [ 133 | "***" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "# Program Wykładu\n", 141 | "## 1. Język Python 1\n", 142 | "- ### podstawowe elementy składni\n", 143 | "- ### struktury danych\n", 144 | "- ### Środowisko pracy do pracy z danymi – anakonda, jupyter\n", 145 | "\n", 146 | "## 2. Język Python 2 \n", 147 | "- ### Instrukcje sterujące\n", 148 | "- ### Obsługa błędów\n", 149 | "- ### Biblioteki i moduły\n", 150 | "- ### Korzystanie z pythona – notebooks, skrypty\n", 151 | "\n", 152 | "## 3. Data Wrangling 1 \n", 153 | "- ### Tidy Data – co to jest\n", 154 | "- ### Data wrangling, munging, tidying\n", 155 | "- ### Biblioteka Pandas – wprowadzenie. Czytanie danych\n", 156 | "\n", 157 | "## 4. Data Wrangling 2 \n", 158 | "- ### Data Wrangling w praktyce – podstawowe operacje\n", 159 | "- ### Biblioteka Pandas – wybieranie kolumn i „krojenie danych”\n", 160 | "\n", 161 | "## 5. Data Wrangling 3 \n", 162 | "- ### Czyszczenie danych\n", 163 | "- ### Pandas – agregacja, grupowanie\n", 164 | "\n", 165 | "## 6. Wizualizacja danych 1 \n", 166 | "- ### Matplotlib – wprowadzenie\n", 167 | "- ### Proste wykresy\n", 168 | "- ### Konfiguracja wykresu, sztuczki i kruczki\n", 169 | " \n", 170 | "## 7. Wizualizacja danych 2 \n", 171 | "- ### Seaborn – wprowadzenie\n", 172 | "- ### Różnice i podobieństwa do Matplotlib\n", 173 | "- ### Dash by Plotly - interfejsy webowe\n", 174 | "\n", 175 | "## 8. Zewnętrzne źródła danych \n", 176 | "- ### Pojęcie API i korzystanie z nich. 
JSON\n", 177 | "- ### Samodzielne pobieranie danych\n", 178 | "\n", 179 | "## 9. Scraping \n", 180 | "- ### Biblioteka Scrapy\n", 181 | "- ### Biblioteki Beautiful Soup, lxml\n", 182 | "- ### Ściąganie danych z sieci\n", 183 | "\n", 184 | "## 10. Machine Learning 1 \n", 185 | "- ### Klasyfikacja w ML\n", 186 | "- ### Biblioteka scikit\n", 187 | "\n", 188 | "## 11. Machine Learning 2 \n", 189 | "- ### Metryki skuteczności optymalizacja modeli\n", 190 | "- ### Trening klasyfikatorów w scikit\n", 191 | "\n", 192 | "## 12. Machine Learning 3 \n", 193 | "- ### Wybór optymalnego modelu\n", 194 | "- ### Badanie charakterystyk modeli\n", 195 | "- ### Grid search w scikit\n", 196 | "\n", 197 | "## 13. Machine Learning 4 \n", 198 | "- ### Regresja w ML\n", 199 | "- ### Regresja w Scikit\n", 200 | "\n", 201 | "## 14. Wprowadzenie do maszynowego przetwarzania tekstu \n", 202 | "- ### Specyfika danych tekstowych\n", 203 | "- ### Postawowe metryki dla danych tekstowych\n", 204 | "- ### Klasyfikacja dokumentów w Scikit." 205 | ] 206 | }, 207 | { 208 | "cell_type": "markdown", 209 | "metadata": {}, 210 | "source": [ 211 | "***" 212 | ] 213 | }, 214 | { 215 | "cell_type": "markdown", 216 | "metadata": {}, 217 | "source": [ 218 | "# Narzędzia" 219 | ] 220 | }, 221 | { 222 | "cell_type": "markdown", 223 | "metadata": {}, 224 | "source": [ 225 | "# https://www.anaconda.com/download/" 226 | ] 227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "metadata": {}, 231 | "source": [ 232 | "![title](img/ver.png)" 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "metadata": {}, 238 | "source": [ 239 | "![title](img/launch.png)" 240 | ] 241 | }, 242 | { 243 | "cell_type": "markdown", 244 | "metadata": {}, 245 | "source": [ 246 | "# Składnia języka" 247 | ] 248 | }, 249 | { 250 | "cell_type": "markdown", 251 | "metadata": {}, 252 | "source": [ 253 | "## Zmienne i ich Typy" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": null, 259 | "metadata": {}, 260 | "outputs": [], 261 | "source": [ 262 | "s = \"Ala ma kota\"\n", 263 | "s" 264 | ] 265 | }, 266 | { 267 | "cell_type": "markdown", 268 | "metadata": {}, 269 | "source": [ 270 | "## Typy zmiennych\n", 271 | "### Podstawowe\n", 272 | "- liczby całkowite (int)\n", 273 | "- liczby zmiennoprzecinkowe (float)\n", 274 | "- Łańcuchy znaków (str)\n", 275 | "- Boolowskie (True i False)\n", 276 | "\n", 277 | "### Złożone\n", 278 | "- listy - (list)\n", 279 | "- krotki - (tuple)\n", 280 | "- słowniki - (dict)\n" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": null, 286 | "metadata": {}, 287 | "outputs": [], 288 | "source": [ 289 | "7+2" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": null, 295 | "metadata": {}, 296 | "outputs": [], 297 | "source": [ 298 | "7/2" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": null, 304 | "metadata": {}, 305 | "outputs": [], 306 | "source": [ 307 | "7//2" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": null, 313 | "metadata": {}, 314 | "outputs": [], 315 | "source": [ 316 | "7%2" 317 | ] 318 | }, 319 | { 320 | "cell_type": "markdown", 321 | "metadata": {}, 322 | "source": [ 323 | "## Python jest językiem dynamicznym\n" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": null, 329 | "metadata": {}, 330 | "outputs": [], 331 | "source": [ 332 | "s = 3\n", 333 | "s" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": null, 339 | "metadata": {}, 340 | 
"outputs": [], 341 | "source": [ 342 | "s = \"Ala ma \"" 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": null, 348 | "metadata": {}, 349 | "outputs": [], 350 | "source": [ 351 | "s = s + \"kota\"\n", 352 | "s" 353 | ] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "execution_count": null, 358 | "metadata": {}, 359 | "outputs": [], 360 | "source": [ 361 | "s = \"Ala ma \"\n", 362 | "n = 3" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": null, 368 | "metadata": {}, 369 | "outputs": [], 370 | "source": [ 371 | "s = \"Ala ma \" + n + \"koty\"\n", 372 | "s" 373 | ] 374 | }, 375 | { 376 | "cell_type": "markdown", 377 | "metadata": {}, 378 | "source": [ 379 | "## Python jest SILNIE typowany\n", 380 | "\n", 381 | "Ale zawsze można skorzystać z konwersji" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": null, 387 | "metadata": {}, 388 | "outputs": [], 389 | "source": [ 390 | "s = \"Ala ma \" + str(n) + \" koty \"\n", 391 | "s" 392 | ] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "execution_count": null, 397 | "metadata": {}, 398 | "outputs": [], 399 | "source": [ 400 | "s * 2" 401 | ] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "execution_count": null, 406 | "metadata": {}, 407 | "outputs": [], 408 | "source": [ 409 | "\"10\"" 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": null, 415 | "metadata": {}, 416 | "outputs": [], 417 | "source": [ 418 | "int(\"10\")" 419 | ] 420 | }, 421 | { 422 | "cell_type": "markdown", 423 | "metadata": {}, 424 | "source": [ 425 | "## Boolean" 426 | ] 427 | }, 428 | { 429 | "cell_type": "code", 430 | "execution_count": null, 431 | "metadata": {}, 432 | "outputs": [], 433 | "source": [ 434 | "n == 3" 435 | ] 436 | }, 437 | { 438 | "cell_type": "code", 439 | "execution_count": null, 440 | "metadata": {}, 441 | "outputs": [], 442 | "source": [ 443 | "n != 3" 444 | ] 445 | }, 446 | { 447 | "cell_type": "code", 448 | "execution_count": null, 449 | "metadata": {}, 450 | "outputs": [], 451 | "source": [ 452 | "not n == 3" 453 | ] 454 | }, 455 | { 456 | "cell_type": "code", 457 | "execution_count": null, 458 | "metadata": {}, 459 | "outputs": [], 460 | "source": [ 461 | "n == 3 or n != 3" 462 | ] 463 | }, 464 | { 465 | "cell_type": "code", 466 | "execution_count": null, 467 | "metadata": {}, 468 | "outputs": [], 469 | "source": [ 470 | "n == 3 and n != 3" 471 | ] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "execution_count": null, 476 | "metadata": {}, 477 | "outputs": [], 478 | "source": [ 479 | "if n == 3:\n", 480 | " print( \"Trzy\" )" 481 | ] 482 | }, 483 | { 484 | "cell_type": "code", 485 | "execution_count": null, 486 | "metadata": {}, 487 | "outputs": [], 488 | "source": [ 489 | "if n == 4:\n", 490 | " print (\"Cztery\")\n", 491 | "else:\n", 492 | " print(\"To nie cztery\")" 493 | ] 494 | }, 495 | { 496 | "cell_type": "markdown", 497 | "metadata": {}, 498 | "source": [ 499 | "## Listy" 500 | ] 501 | }, 502 | { 503 | "cell_type": "code", 504 | "execution_count": null, 505 | "metadata": {}, 506 | "outputs": [], 507 | "source": [ 508 | "a = [4,5,6,7]\n", 509 | "a" 510 | ] 511 | }, 512 | { 513 | "cell_type": "code", 514 | "execution_count": null, 515 | "metadata": {}, 516 | "outputs": [], 517 | "source": [ 518 | "a[1]" 519 | ] 520 | }, 521 | { 522 | "cell_type": "code", 523 | "execution_count": null, 524 | "metadata": {}, 525 | "outputs": [], 526 | "source": [ 527 | "a[0]" 528 | ] 529 | }, 530 | { 531 | "cell_type": "code", 532 | 
"execution_count": null, 533 | "metadata": {}, 534 | "outputs": [], 535 | "source": [ 536 | "a[0:2]" 537 | ] 538 | }, 539 | { 540 | "cell_type": "code", 541 | "execution_count": null, 542 | "metadata": {}, 543 | "outputs": [], 544 | "source": [ 545 | "a[-1]" 546 | ] 547 | }, 548 | { 549 | "cell_type": "code", 550 | "execution_count": null, 551 | "metadata": {}, 552 | "outputs": [], 553 | "source": [ 554 | "a[1:-1]" 555 | ] 556 | }, 557 | { 558 | "cell_type": "raw", 559 | "metadata": {}, 560 | "source": [ 561 | " 0 1 2 3\n", 562 | "\n", 563 | " | | | |\n", 564 | "\n", 565 | "[ 4 , 5 , 6 , 7 ]\n", 566 | " \n", 567 | " | | | |\n", 568 | "\n", 569 | " -4 -3 -2 -1" 570 | ] 571 | }, 572 | { 573 | "cell_type": "code", 574 | "execution_count": null, 575 | "metadata": {}, 576 | "outputs": [], 577 | "source": [ 578 | "a[-4]" 579 | ] 580 | }, 581 | { 582 | "cell_type": "code", 583 | "execution_count": null, 584 | "metadata": {}, 585 | "outputs": [], 586 | "source": [ 587 | "len(a)" 588 | ] 589 | }, 590 | { 591 | "cell_type": "code", 592 | "execution_count": null, 593 | "metadata": {}, 594 | "outputs": [], 595 | "source": [ 596 | "for i in a:\n", 597 | " print (i)" 598 | ] 599 | }, 600 | { 601 | "cell_type": "code", 602 | "execution_count": null, 603 | "metadata": {}, 604 | "outputs": [], 605 | "source": [ 606 | "for i in range(len(a)):\n", 607 | " print(i)" 608 | ] 609 | }, 610 | { 611 | "cell_type": "code", 612 | "execution_count": null, 613 | "metadata": {}, 614 | "outputs": [], 615 | "source": [ 616 | "for i in range(len(a)):\n", 617 | " print(a[i])" 618 | ] 619 | }, 620 | { 621 | "cell_type": "code", 622 | "execution_count": null, 623 | "metadata": {}, 624 | "outputs": [], 625 | "source": [ 626 | "a.append(8)\n", 627 | "a" 628 | ] 629 | }, 630 | { 631 | "cell_type": "code", 632 | "execution_count": null, 633 | "metadata": {}, 634 | "outputs": [], 635 | "source": [ 636 | "a + a" 637 | ] 638 | }, 639 | { 640 | "cell_type": "code", 641 | "execution_count": null, 642 | "metadata": {}, 643 | "outputs": [], 644 | "source": [ 645 | "a * 3" 646 | ] 647 | }, 648 | { 649 | "cell_type": "code", 650 | "execution_count": null, 651 | "metadata": {}, 652 | "outputs": [], 653 | "source": [ 654 | "s[0]" 655 | ] 656 | }, 657 | { 658 | "cell_type": "code", 659 | "execution_count": null, 660 | "metadata": {}, 661 | "outputs": [], 662 | "source": [ 663 | "\" \".join( [\"Ala\", \"ma\", \"kota\"] )" 664 | ] 665 | }, 666 | { 667 | "cell_type": "markdown", 668 | "metadata": {}, 669 | "source": [ 670 | "## Krotki (tuple)" 671 | ] 672 | }, 673 | { 674 | "cell_type": "code", 675 | "execution_count": null, 676 | "metadata": {}, 677 | "outputs": [], 678 | "source": [ 679 | "t = (1, 2, 3, 4)\n", 680 | "t" 681 | ] 682 | }, 683 | { 684 | "cell_type": "markdown", 685 | "metadata": {}, 686 | "source": [ 687 | "## Słowniki (dict)" 688 | ] 689 | }, 690 | { 691 | "cell_type": "code", 692 | "execution_count": null, 693 | "metadata": {}, 694 | "outputs": [], 695 | "source": [ 696 | "m = { 'a': 1, 'b': 2 }" 697 | ] 698 | }, 699 | { 700 | "cell_type": "code", 701 | "execution_count": null, 702 | "metadata": {}, 703 | "outputs": [], 704 | "source": [ 705 | "m.keys()" 706 | ] 707 | }, 708 | { 709 | "cell_type": "code", 710 | "execution_count": null, 711 | "metadata": {}, 712 | "outputs": [], 713 | "source": [ 714 | "m.values()" 715 | ] 716 | }, 717 | { 718 | "cell_type": "code", 719 | "execution_count": null, 720 | "metadata": {}, 721 | "outputs": [], 722 | "source": [ 723 | "m['a']" 724 | ] 725 | }, 726 | { 727 | "cell_type": "code", 728 
| "execution_count": null, 729 | "metadata": {}, 730 | "outputs": [], 731 | "source": [ 732 | "m['c']" 733 | ] 734 | }, 735 | { 736 | "cell_type": "code", 737 | "execution_count": null, 738 | "metadata": {}, 739 | "outputs": [], 740 | "source": [ 741 | "m.get('c', 0)" 742 | ] 743 | }, 744 | { 745 | "cell_type": "code", 746 | "execution_count": null, 747 | "metadata": {}, 748 | "outputs": [], 749 | "source": [ 750 | "m = dict( [(\"a\", 1), (\"b\", 2)] )" 751 | ] 752 | }, 753 | { 754 | "cell_type": "code", 755 | "execution_count": null, 756 | "metadata": {}, 757 | "outputs": [], 758 | "source": [ 759 | "m" 760 | ] 761 | }, 762 | { 763 | "cell_type": "code", 764 | "execution_count": null, 765 | "metadata": {}, 766 | "outputs": [], 767 | "source": [ 768 | "l = [ \"a\", \"b\", \"c\" ]\n", 769 | "l" 770 | ] 771 | }, 772 | { 773 | "cell_type": "code", 774 | "execution_count": null, 775 | "metadata": {}, 776 | "outputs": [], 777 | "source": [ 778 | "list(zip( range(len(l)), l))" 779 | ] 780 | }, 781 | { 782 | "cell_type": "code", 783 | "execution_count": null, 784 | "metadata": {}, 785 | "outputs": [], 786 | "source": [ 787 | "m = dict(zip( range(len(l)), l))\n", 788 | "m" 789 | ] 790 | }, 791 | { 792 | "cell_type": "code", 793 | "execution_count": null, 794 | "metadata": {}, 795 | "outputs": [], 796 | "source": [ 797 | "for k in m.keys():\n", 798 | " print (k, m[k])" 799 | ] 800 | }, 801 | { 802 | "cell_type": "code", 803 | "execution_count": null, 804 | "metadata": {}, 805 | "outputs": [], 806 | "source": [ 807 | "for k in m:\n", 808 | " print( k, m[k])" 809 | ] 810 | }, 811 | { 812 | "cell_type": "markdown", 813 | "metadata": {}, 814 | "source": [ 815 | "## Funkcje" 816 | ] 817 | }, 818 | { 819 | "cell_type": "code", 820 | "execution_count": null, 821 | "metadata": {}, 822 | "outputs": [], 823 | "source": [ 824 | "def dodaj_2(x):\n", 825 | " wynik = x + 2\n", 826 | " return wynik" 827 | ] 828 | }, 829 | { 830 | "cell_type": "code", 831 | "execution_count": null, 832 | "metadata": {}, 833 | "outputs": [], 834 | "source": [ 835 | "dodaj_2(5)" 836 | ] 837 | }, 838 | { 839 | "cell_type": "code", 840 | "execution_count": null, 841 | "metadata": {}, 842 | "outputs": [], 843 | "source": [ 844 | "def is_odd(x):\n", 845 | " print (\"*\" * x)\n", 846 | " return (x % 2) == 1" 847 | ] 848 | }, 849 | { 850 | "cell_type": "code", 851 | "execution_count": null, 852 | "metadata": {}, 853 | "outputs": [], 854 | "source": [ 855 | "is_odd(5)" 856 | ] 857 | }, 858 | { 859 | "cell_type": "code", 860 | "execution_count": null, 861 | "metadata": {}, 862 | "outputs": [], 863 | "source": [ 864 | "is_odd(8)" 865 | ] 866 | }, 867 | { 868 | "cell_type": "code", 869 | "execution_count": null, 870 | "metadata": {}, 871 | "outputs": [], 872 | "source": [ 873 | "def slownie(n):\n", 874 | " jednosci = { 0: \"zero\", 1: \"jeden\", 2: \"dwa\", 3: \"trzy\", 4: \"cztery\", 5: \"pięć\", 6: \"sześć\", 7: \"siedem\", 8: \"osiem\", 9: \"dziewięć\"}\n", 875 | " return jednosci[n]\n", 876 | "\n", 877 | "slownie(6)" 878 | ] 879 | }, 880 | { 881 | "cell_type": "markdown", 882 | "metadata": {}, 883 | "source": [ 884 | "## Referencje" 885 | ] 886 | }, 887 | { 888 | "cell_type": "code", 889 | "execution_count": null, 890 | "metadata": {}, 891 | "outputs": [], 892 | "source": [ 893 | "a = [81, 82, 83]\n", 894 | "b = [81, 82, 83]\n", 895 | "\n", 896 | "print ( a is b )\n", 897 | "\n", 898 | "print (a == b )" 899 | ] 900 | }, 901 | { 902 | "cell_type": "code", 903 | "execution_count": null, 904 | "metadata": {}, 905 | "outputs": [], 906 | 
"source": [ 907 | "a = b\n", 908 | "\n", 909 | "a is b" 910 | ] 911 | }, 912 | { 913 | "cell_type": "code", 914 | "execution_count": null, 915 | "metadata": {}, 916 | "outputs": [], 917 | "source": [ 918 | "a.append(84)\n", 919 | "\n", 920 | "b" 921 | ] 922 | }, 923 | { 924 | "cell_type": "code", 925 | "execution_count": null, 926 | "metadata": {}, 927 | "outputs": [], 928 | "source": [ 929 | "a = [81, 82, 83]\n", 930 | "b = list(a)\n", 931 | "\n", 932 | "a is b" 933 | ] 934 | }, 935 | { 936 | "cell_type": "code", 937 | "execution_count": null, 938 | "metadata": {}, 939 | "outputs": [], 940 | "source": [ 941 | "n1 = 3\n", 942 | "n2 = 3\n", 943 | "\n", 944 | "n1 is n2" 945 | ] 946 | }, 947 | { 948 | "cell_type": "markdown", 949 | "metadata": {}, 950 | "source": [ 951 | "***" 952 | ] 953 | }, 954 | { 955 | "cell_type": "markdown", 956 | "metadata": {}, 957 | "source": [ 958 | "## Zadanie 1" 959 | ] 960 | }, 961 | { 962 | "cell_type": "raw", 963 | "metadata": {}, 964 | "source": [ 965 | "Napisać funkcję co policzy ilość wystąpień na liście\n", 966 | "\n", 967 | "Przykład:\n", 968 | "zlicz( [1, 2, 3, 4, 1, 2, 1] )\n", 969 | "\n", 970 | "{ 1: 3, 2: 3, 3: 1, 4: 1 }" 971 | ] 972 | }, 973 | { 974 | "cell_type": "markdown", 975 | "metadata": {}, 976 | "source": [ 977 | "## Zadanie 2" 978 | ] 979 | }, 980 | { 981 | "cell_type": "raw", 982 | "metadata": {}, 983 | "source": [ 984 | "\n", 985 | "Napisać funkcję co wypisze choinkę o odpowiedniej wysokości\n", 986 | "\n", 987 | "Przykład:\n", 988 | "choinka(7)\n", 989 | "\n", 990 | " *\n", 991 | " ***\n", 992 | " *****\n", 993 | " *******\n", 994 | " *********\n", 995 | " ***********\n", 996 | " *************\n", 997 | " *\n", 998 | " ***\n", 999 | "\n", 1000 | "choinka(4)\n", 1001 | " \n", 1002 | " *\n", 1003 | " ***\n", 1004 | " *****\n", 1005 | " *******\n", 1006 | " *\n", 1007 | " ***\n", 1008 | "\n" 1009 | ] 1010 | }, 1011 | { 1012 | "cell_type": "markdown", 1013 | "metadata": {}, 1014 | "source": [ 1015 | "## Zadanie 3" 1016 | ] 1017 | }, 1018 | { 1019 | "cell_type": "code", 1020 | "execution_count": null, 1021 | "metadata": {}, 1022 | "outputs": [], 1023 | "source": [ 1024 | "\n", 1025 | "Napisać funkcję co wypiszę liczbę naturalną słownie dla liczb od 0 do 999999.\n", 1026 | "\n", 1027 | "slownie(1984)\n", 1028 | "'tysiąc dziewięćset osiemdziesiąt cztery'\n" 1029 | ] 1030 | }, 1031 | { 1032 | "cell_type": "markdown", 1033 | "metadata": {}, 1034 | "source": [ 1035 | "# Literatura" 1036 | ] 1037 | }, 1038 | { 1039 | "cell_type": "markdown", 1040 | "metadata": {}, 1041 | "source": [ 1042 | "### https://docs.python.org/2/" 1043 | ] 1044 | }, 1045 | { 1046 | "cell_type": "markdown", 1047 | "metadata": {}, 1048 | "source": [ 1049 | "### https://pl.wikibooks.org/wiki/Zanurkuj_w_Pythonie" 1050 | ] 1051 | }, 1052 | { 1053 | "cell_type": "markdown", 1054 | "metadata": {}, 1055 | "source": [ 1056 | "### http://diveintopython.org" 1057 | ] 1058 | }, 1059 | { 1060 | "cell_type": "markdown", 1061 | "metadata": {}, 1062 | "source": [ 1063 | "### http://getpython3.com/diveintopython3/" 1064 | ] 1065 | }, 1066 | { 1067 | "cell_type": "markdown", 1068 | "metadata": {}, 1069 | "source": [ 1070 | "### http://www.greenteapress.com/thinkpython/html/index.html" 1071 | ] 1072 | }, 1073 | { 1074 | "cell_type": "markdown", 1075 | "metadata": {}, 1076 | "source": [ 1077 | "### https://jakevdp.github.io/PythonDataScienceHandbook/" 1078 | ] 1079 | } 1080 | ], 1081 | "metadata": { 1082 | "kernelspec": { 1083 | "display_name": "Python 3", 1084 | "language": "python", 1085 | 
"name": "python3" 1086 | }, 1087 | "language_info": { 1088 | "codemirror_mode": { 1089 | "name": "ipython", 1090 | "version": 3 1091 | }, 1092 | "file_extension": ".py", 1093 | "mimetype": "text/x-python", 1094 | "name": "python", 1095 | "nbconvert_exporter": "python", 1096 | "pygments_lexer": "ipython3", 1097 | "version": "3.7.3" 1098 | }, 1099 | "widgets": { 1100 | "application/vnd.jupyter.widget-state+json": { 1101 | "state": {}, 1102 | "version_major": 2, 1103 | "version_minor": 0 1104 | } 1105 | } 1106 | }, 1107 | "nbformat": 4, 1108 | "nbformat_minor": 4 1109 | } 1110 | -------------------------------------------------------------------------------- /SKN Statystyki - Python 3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Warsztaty Python w Data Science" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "***" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Python 3" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "***" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "# https://github.com/MichalKorzycki/WarsztatPythonDataScience" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "***" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "## 3. Data Wrangling 1\n", 50 | "- Tidy Data – co to jest\n", 51 | "- Data wrangling, munging, tidying\n", 52 | "- Biblioteka Pandas – wprowadzenie. Czytanie danych" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "## Zadanie 1" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "Stworz funkcję która zwróci drugi najmniejszy element listy\n", 67 | "\n", 68 | "druginajmniejszy([1, 2, 3, 4, 6])\n", 69 | "\n", 70 | "2" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "def druginajmniejszy(lista):\n", 80 | " if len(lista) < 2:\n", 81 | " return lista[0]\n", 82 | " if lista[0] > lista[1]:\n", 83 | " pierwszy, drugi = lista[1], lista[0]\n", 84 | " else:\n", 85 | " pierwszy, drugi = lista[0], lista[1]\n", 86 | " for x in lista[2:]:\n", 87 | " #print x, pierwszy, drugi\n", 88 | " if x < drugi:\n", 89 | " if x < pierwszy:\n", 90 | " pierwszy, drugi = x, pierwszy\n", 91 | " else:\n", 92 | " drugi = x\n", 93 | " return drugi" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "druginajmniejszy([2,1,3,4,5])" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "## Zadanie 2" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": {}, 115 | "source": [ 116 | "Stworz funkcję która usunie z listy najmniejszy element listy\n", 117 | "\n", 118 | "l = [4, 1, 2, 3]\n", 119 | "\n", 120 | "usun_min(l)\n", 121 | "\n", 122 | "l\n", 123 | "\n", 124 | "[4, 2, 3]" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "def usun_min(lista):\n", 134 | " for i in range(len(lista)):\n", 135 | " if lista[i]==min(lista):\n", 136 | " del lista[i]" 
137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "l = [4, 1, 2, 3]\n", 146 | "usun_min(l)\n", 147 | "l" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": null, 153 | "metadata": {}, 154 | "outputs": [], 155 | "source": [ 156 | "def usun_min(lista):\n", 157 | " do_usuniecia = set()\n", 158 | " for i in range(len(lista)):\n", 159 | " if lista[i]==min(lista):\n", 160 | " do_usuniecia.add(i)\n", 161 | " for i in do_usuniecia:\n", 162 | " del lista[i]" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "l = [4, 1, 2, 3]\n", 172 | "usun_min(l)\n", 173 | "l" 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "metadata": {}, 179 | "source": [ 180 | "## Zadanie 3" 181 | ] 182 | }, 183 | { 184 | "cell_type": "markdown", 185 | "metadata": {}, 186 | "source": [ 187 | "Stworz funkcję która usunie \"wyplaszczy\" zagnieżdzone listy\n", 188 | "\n", 189 | "l = [4, 1, [ 2, [7] ], 3]\n", 190 | "\n", 191 | "wyplaszcz(l)\n", 192 | "\n", 193 | "l\n", 194 | "\n", 195 | "[4, 1, 2, 7, 3]" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "def wyplaszcz(lista):\n", 205 | " ret = []\n", 206 | " for x in lista:\n", 207 | " if type(x)==type([]):\n", 208 | " for y in wyplaszcz(x):\n", 209 | " ret.append(y)\n", 210 | " else:\n", 211 | " ret.append(x)\n", 212 | " return ret" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": null, 218 | "metadata": {}, 219 | "outputs": [], 220 | "source": [ 221 | "l = [4, 1, [ 2, [7] ], 3]\n", 222 | "l = wyplaszcz(l)\n", 223 | "l" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "metadata": {}, 230 | "outputs": [], 231 | "source": [ 232 | "def wyplaszcz(lista):\n", 233 | " kopia = list(lista)\n", 234 | " for i in range(len(lista)-1,-1,-1):\n", 235 | " del lista[i]\n", 236 | " for x in kopia:\n", 237 | " if type(x)==type([]):\n", 238 | " for y in wyplaszcz(x):\n", 239 | " lista.append(y)\n", 240 | " else:\n", 241 | " lista.append(x)\n", 242 | " return lista" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": null, 248 | "metadata": {}, 249 | "outputs": [], 250 | "source": [ 251 | "l = [4, 1, [ 2, [7] ], 3]\n", 252 | "wyplaszcz(l)\n", 253 | "l" 254 | ] 255 | }, 256 | { 257 | "cell_type": "markdown", 258 | "metadata": {}, 259 | "source": [ 260 | "## Zadanie 4" 261 | ] 262 | }, 263 | { 264 | "cell_type": "markdown", 265 | "metadata": {}, 266 | "source": [ 267 | "Wygenerować listę liczb z zakresu 1-1000 które nie są podzielne przez żadną cyfrę od 2 do 9." 
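One caution on the attempts in the next two cells: a non-empty list is truthy in Python, so `if [div for div in range(2,10) if num % div == 0]` keeps the numbers that *are* divisible by some digit, which is the opposite of the task. A minimal sketch with the condition negated, using `any` so no helper list is built:

```python
# Numbers from 1 to 1000 that are NOT divisible by any digit from 2 to 9.
wynik = [num for num in range(1, 1001)
         if not any(num % div == 0 for div in range(2, 10))]
print(wynik[:5])  # [1, 11, 13, 17, 19]
```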
268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": null, 273 | "metadata": {}, 274 | "outputs": [], 275 | "source": [ 276 | "[ (num,[div for div in range(2,10) if num%div == 0]) for num in range(1,20)]" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": null, 282 | "metadata": {}, 283 | "outputs": [], 284 | "source": [ 285 | "[num for num in range(1,1001) if [div for div in range(2,10) if num%div == 0]]" 286 | ] 287 | }, 288 | { 289 | "cell_type": "markdown", 290 | "metadata": {}, 291 | "source": [ 292 | "***" 293 | ] 294 | }, 295 | { 296 | "cell_type": "markdown", 297 | "metadata": {}, 298 | "source": [ 299 | "## Errata" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": null, 305 | "metadata": {}, 306 | "outputs": [], 307 | "source": [ 308 | "def zlicz(lista):\n", 309 | " ilosc_wystapien = {}\n", 310 | " return {x: ilosc_wystapien.get(x, 0) + 1 for x in lista }\n" 311 | ] 312 | }, 313 | { 314 | "cell_type": "raw", 315 | "metadata": {}, 316 | "source": [ 317 | "Nie działa :)" 318 | ] 319 | }, 320 | { 321 | "cell_type": "markdown", 322 | "metadata": {}, 323 | "source": [ 324 | "***" 325 | ] 326 | }, 327 | { 328 | "cell_type": "markdown", 329 | "metadata": {}, 330 | "source": [ 331 | "## Data Wrangling, Munging, Tidying " 332 | ] 333 | }, 334 | { 335 | "cell_type": "markdown", 336 | "metadata": {}, 337 | "source": [ 338 | "### Data Wrangling\n", 339 | "- __Discovering__ (eksploracja danych)\n", 340 | "- __Structuring__ (przygotowanie danych do konkretnego zadania)\n", 341 | "- __Cleaning__ (ujednolicenie danych, usuwanie danych niepotrzebnych itd.)\n", 342 | "- Enriching (łączenie danych)\n", 343 | "- Validating (sprawdzenie poprawności merytorycznej)\n", 344 | "- Publishing (publikacja wyników)" 345 | ] 346 | }, 347 | { 348 | "cell_type": "markdown", 349 | "metadata": {}, 350 | "source": [ 351 | "## Tidy Data" 352 | ] 353 | }, 354 | { 355 | "cell_type": "markdown", 356 | "metadata": {}, 357 | "source": [ 358 | "Wickham, Hadley - _\"Tidy Data\"_\n", 359 | "https://www.jstatsoft.org/index.php/jss/article/view/v059i10/v59i10.pdf" 360 | ] 361 | }, 362 | { 363 | "cell_type": "markdown", 364 | "metadata": {}, 365 | "source": [ 366 | "- __Each variable you measure should be in one column.__\n", 367 | "- __Each different observation of that variable should be in a different row.__\n", 368 | "- There should be one table for each \"kind\" of variable.\n", 369 | "- If you have multiple tables, they should include a column in the table that allows them to be linked." 
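To make the first two rules concrete, a minimal sketch of reshaping a wide table into tidy form with `pandas.melt` (the column names here are invented for illustration, not taken from the workshop data):

```python
import pandas as pd

# Wide ("messy") layout: the year variable is spread across column headers.
wide = pd.DataFrame({'team': ['Bears', 'Packers'],
                     '2010': [11, 15],
                     '2011': [8, 11]})

# Tidy layout: one variable per column, one observation per row.
tidy = wide.melt(id_vars='team', var_name='year', value_name='wins')
print(tidy)
```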
370 | ] 371 | }, 372 | { 373 | "cell_type": "markdown", 374 | "metadata": {}, 375 | "source": [ 376 | "## Pandas" 377 | ] 378 | }, 379 | { 380 | "cell_type": "markdown", 381 | "metadata": {}, 382 | "source": [ 383 | "https://pandas.pydata.org/Pandas_Cheat_Sheet.pdf" 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": null, 389 | "metadata": {}, 390 | "outputs": [], 391 | "source": [ 392 | "data = {'year': [2010, 2011, 2012, 2011, 2012, 2010, 2011, 2012],\n", 393 | " 'team': ['Bears', 'Bears', 'Bears', 'Packers', 'Packers', 'Lions',\n", 394 | " 'Lions', 'Lions'],\n", 395 | " 'wins': [11, 8, 10, 15, 11, 6, 10, 4],\n", 396 | " 'losses': [5, 8, 6, 1, 5, 10, 6, 12]}" 397 | ] 398 | }, 399 | { 400 | "cell_type": "code", 401 | "execution_count": null, 402 | "metadata": {}, 403 | "outputs": [], 404 | "source": [ 405 | "import pandas\n", 406 | "\n", 407 | "football = pandas.DataFrame(data)\n", 408 | "print (football)" 409 | ] 410 | }, 411 | { 412 | "cell_type": "code", 413 | "execution_count": null, 414 | "metadata": {}, 415 | "outputs": [], 416 | "source": [ 417 | "football" 418 | ] 419 | }, 420 | { 421 | "cell_type": "code", 422 | "execution_count": null, 423 | "metadata": {}, 424 | "outputs": [], 425 | "source": [ 426 | "football.describe()" 427 | ] 428 | }, 429 | { 430 | "cell_type": "code", 431 | "execution_count": null, 432 | "metadata": {}, 433 | "outputs": [], 434 | "source": [ 435 | "football.dtypes" 436 | ] 437 | }, 438 | { 439 | "cell_type": "code", 440 | "execution_count": null, 441 | "metadata": {}, 442 | "outputs": [], 443 | "source": [ 444 | "football.head()" 445 | ] 446 | }, 447 | { 448 | "cell_type": "code", 449 | "execution_count": null, 450 | "metadata": {}, 451 | "outputs": [], 452 | "source": [ 453 | "football.tail()" 454 | ] 455 | }, 456 | { 457 | "cell_type": "code", 458 | "execution_count": null, 459 | "metadata": {}, 460 | "outputs": [], 461 | "source": [ 462 | "football.sample(5)" 463 | ] 464 | }, 465 | { 466 | "cell_type": "code", 467 | "execution_count": null, 468 | "metadata": {}, 469 | "outputs": [], 470 | "source": [ 471 | "football['year']" 472 | ] 473 | }, 474 | { 475 | "cell_type": "code", 476 | "execution_count": null, 477 | "metadata": {}, 478 | "outputs": [], 479 | "source": [ 480 | "football.year" 481 | ] 482 | }, 483 | { 484 | "cell_type": "code", 485 | "execution_count": null, 486 | "metadata": {}, 487 | "outputs": [], 488 | "source": [ 489 | "football[['year', 'wins', 'losses']]" 490 | ] 491 | }, 492 | { 493 | "cell_type": "code", 494 | "execution_count": null, 495 | "metadata": {}, 496 | "outputs": [], 497 | "source": [ 498 | "import numpy as np\n", 499 | "import pandas as pd\n", 500 | "\n", 501 | "index = pd.date_range('1/1/2000', periods=8)\n", 502 | "df = pd.DataFrame(np.random.randn(8, 3), index=index, columns=['A', 'B', 'C'])\n", 503 | "df" 504 | ] 505 | }, 506 | { 507 | "cell_type": "code", 508 | "execution_count": null, 509 | "metadata": {}, 510 | "outputs": [], 511 | "source": [ 512 | "df.values" 513 | ] 514 | }, 515 | { 516 | "cell_type": "code", 517 | "execution_count": null, 518 | "metadata": {}, 519 | "outputs": [], 520 | "source": [ 521 | "df.index" 522 | ] 523 | }, 524 | { 525 | "cell_type": "code", 526 | "execution_count": null, 527 | "metadata": {}, 528 | "outputs": [], 529 | "source": [ 530 | "df.columns" 531 | ] 532 | }, 533 | { 534 | "cell_type": "code", 535 | "execution_count": null, 536 | "metadata": {}, 537 | "outputs": [], 538 | "source": [ 539 | "newcols = []\n", 540 | "for i in range(len(df.columns)):\n", 541 
| " newcols.append(df.columns[i].lower())\n", 542 | "df.columns = newcols" 543 | ] 544 | }, 545 | { 546 | "cell_type": "code", 547 | "execution_count": null, 548 | "metadata": {}, 549 | "outputs": [], 550 | "source": [ 551 | "df" 552 | ] 553 | }, 554 | { 555 | "cell_type": "markdown", 556 | "metadata": {}, 557 | "source": [ 558 | "## Row Selection\n", 559 | "1. Slicing\n", 560 | "2. Individual index (iloc / loc)\n", 561 | "3. Boolean indexing\n", 562 | "4. Combination" 563 | ] 564 | }, 565 | { 566 | "cell_type": "markdown", 567 | "metadata": {}, 568 | "source": [ 569 | "### Slicing" 570 | ] 571 | }, 572 | { 573 | "cell_type": "code", 574 | "execution_count": null, 575 | "metadata": {}, 576 | "outputs": [], 577 | "source": [ 578 | "df[1:3]" 579 | ] 580 | }, 581 | { 582 | "cell_type": "code", 583 | "execution_count": null, 584 | "metadata": {}, 585 | "outputs": [], 586 | "source": [ 587 | "football[3:5]" 588 | ] 589 | }, 590 | { 591 | "cell_type": "markdown", 592 | "metadata": {}, 593 | "source": [ 594 | "### Individual index" 595 | ] 596 | }, 597 | { 598 | "cell_type": "code", 599 | "execution_count": null, 600 | "metadata": {}, 601 | "outputs": [], 602 | "source": [ 603 | "football['wins']" 604 | ] 605 | }, 606 | { 607 | "cell_type": "markdown", 608 | "metadata": {}, 609 | "source": [ 610 | "### Iloc\n", 611 | "- An integer, e.g. 5.\n", 612 | "- A list or array of integers, e.g. [4, 3, 0].\n", 613 | "- A slice object with ints, e.g. 1:7.\n", 614 | "- A boolean array.\n", 615 | "- A function" 616 | ] 617 | }, 618 | { 619 | "cell_type": "code", 620 | "execution_count": null, 621 | "metadata": {}, 622 | "outputs": [], 623 | "source": [ 624 | "football.iloc[[0,3]]" 625 | ] 626 | }, 627 | { 628 | "cell_type": "markdown", 629 | "metadata": {}, 630 | "source": [ 631 | "### Loc\n", 632 | "- A single label\n", 633 | "- A list or array of labels, e.g. ['a', 'b', 'c'].\n", 634 | "- A slice object with labels, e.g. 
'a':'f' __(WARNING - both the start and the stop are included)__ \n", 635 | "- A boolean array\n", 636 | "- A callable function " 637 | ] 638 | }, 639 | { 640 | "cell_type": "code", 641 | "execution_count": null, 642 | "metadata": {}, 643 | "outputs": [], 644 | "source": [ 645 | "df.loc['2000-01-03']" 646 | ] 647 | }, 648 | { 649 | "cell_type": "code", 650 | "execution_count": null, 651 | "metadata": {}, 652 | "outputs": [], 653 | "source": [ 654 | "df.loc['2000-01-03': '2000-01-04'] " 655 | ] 656 | }, 657 | { 658 | "cell_type": "markdown", 659 | "metadata": {}, 660 | "source": [ 661 | "### Boolean indexing" 662 | ] 663 | }, 664 | { 665 | "cell_type": "code", 666 | "execution_count": null, 667 | "metadata": {}, 668 | "outputs": [], 669 | "source": [ 670 | "football[football.wins > 10]" 671 | ] 672 | }, 673 | { 674 | "cell_type": "markdown", 675 | "metadata": {}, 676 | "source": [ 677 | "### Combination" 678 | ] 679 | }, 680 | { 681 | "cell_type": "code", 682 | "execution_count": null, 683 | "metadata": {}, 684 | "outputs": [], 685 | "source": [ 686 | "football[(football.wins > 10) & (football.team == \"Packers\")]" 687 | ] 688 | }, 689 | { 690 | "cell_type": "markdown", 691 | "metadata": {}, 692 | "source": [ 693 | "***" 694 | ] 695 | }, 696 | { 697 | "cell_type": "markdown", 698 | "metadata": {}, 699 | "source": [ 700 | "## Nowe kolumny" 701 | ] 702 | }, 703 | { 704 | "cell_type": "code", 705 | "execution_count": null, 706 | "metadata": {}, 707 | "outputs": [], 708 | "source": [ 709 | "df.suma = df.a + df.b + df.c" 710 | ] 711 | }, 712 | { 713 | "cell_type": "code", 714 | "execution_count": null, 715 | "metadata": {}, 716 | "outputs": [], 717 | "source": [ 718 | "df['suma'] = df.a + df.b + df.c" 719 | ] 720 | }, 721 | { 722 | "cell_type": "code", 723 | "execution_count": null, 724 | "metadata": {}, 725 | "outputs": [], 726 | "source": [ 727 | "df.suma" 728 | ] 729 | }, 730 | { 731 | "cell_type": "markdown", 732 | "metadata": {}, 733 | "source": [ 734 | "### Zadanie 1" 735 | ] 736 | }, 737 | { 738 | "cell_type": "markdown", 739 | "metadata": {}, 740 | "source": [ 741 | "Podsumować dane dla zespołu `Packers` (stworzyć `DataFrame` z danymi tego zespołu)" 742 | ] 743 | }, 744 | { 745 | "cell_type": "markdown", 746 | "metadata": {}, 747 | "source": [ 748 | "### Zadanie 2" 749 | ] 750 | }, 751 | { 752 | "cell_type": "markdown", 753 | "metadata": {}, 754 | "source": [ 755 | "Dodać do DataFrame `football` kolumnę `games_played`" 756 | ] 757 | }, 758 | { 759 | "cell_type": "markdown", 760 | "metadata": {}, 761 | "source": [ 762 | "### Zadanie 3" 763 | ] 764 | }, 765 | { 766 | "cell_type": "markdown", 767 | "metadata": {}, 768 | "source": [ 769 | "Dodać do DataFrame `football` kolumnę `percentage_games_won`" 770 | ] 771 | }, 772 | { 773 | "cell_type": "markdown", 774 | "metadata": {}, 775 | "source": [ 776 | "### Zadanie 4" 777 | ] 778 | }, 779 | { 780 | "cell_type": "markdown", 781 | "metadata": {}, 782 | "source": [ 783 | "Stworzyć funkcję `get_subsets(l)` która wygeneruje wszystkie podzbiory elementów z `l` korzystając z `frozenset`\n", 784 | "\n", 785 | "\n", 786 | "Przykład:\n", 787 | " \n", 788 | "`get_subsets(['a', 'b', 'c', 'd'])`\n", 789 | "\n", 790 | "\n", 791 | "`{frozenset(),\n", 792 | " frozenset({'a', 'b', 'c'}),\n", 793 | " frozenset({'d'}),\n", 794 | " frozenset({'b'}),\n", 795 | " frozenset({'a'}),\n", 796 | " frozenset({'b', 'd'}),\n", 797 | " frozenset({'a', 'd'}),\n", 798 | " frozenset({'c'}),\n", 799 | " frozenset({'a', 'c', 'd'}),\n", 800 | " frozenset({'b', 'c'}),\n", 801 | " 
frozenset({'b', 'c', 'd'}),\n", 802 | " frozenset({'a', 'b'}),\n", 803 | " frozenset({'a', 'b', 'd'}),\n", 804 | " frozenset({'a', 'c'}),\n", 805 | " frozenset({'c', 'd'}),\n", 806 | " frozenset({'a', 'b', 'c', 'd'})}`" 807 | ] 808 | } 809 | ], 810 | "metadata": { 811 | "kernelspec": { 812 | "display_name": "Python 3", 813 | "language": "python", 814 | "name": "python3" 815 | }, 816 | "language_info": { 817 | "codemirror_mode": { 818 | "name": "ipython", 819 | "version": 3 820 | }, 821 | "file_extension": ".py", 822 | "mimetype": "text/x-python", 823 | "name": "python", 824 | "nbconvert_exporter": "python", 825 | "pygments_lexer": "ipython3", 826 | "version": "3.7.3" 827 | }, 828 | "widgets": { 829 | "application/vnd.jupyter.widget-state+json": { 830 | "state": {}, 831 | "version_major": 2, 832 | "version_minor": 0 833 | } 834 | } 835 | }, 836 | "nbformat": 4, 837 | "nbformat_minor": 4 838 | } 839 | -------------------------------------------------------------------------------- /SKN Statystyki - 10 - Regresja.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Wykład 10 - Nauczanie Maszynowe 1" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Regresja" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "---\n", 22 | "\n", 23 | "## Nauczanie Maszynowe (_Machine Learning_)\n", 24 | "\n", 25 | "- Z nadzorem (_supervised_)\n", 26 | "- Bez nadzoru (_unsupervised_)" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "\n", 34 | "\n", 35 | "## Nauczanie Maszynowe bez nadzoru\n", 36 | "- Klasteryzacja\n", 37 | "- Reguły asocjacyjne\n" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "\n", 45 | "## Nauczanie Maszynowe z nadzorem\n", 46 | "- Klasyfikacja \n", 47 | "- Regresja\n" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "---\n", 55 | "\n", 56 | "### Dla zmiennych tłumaczących `X` szukamy funkcji `f` która zwróci nam jak najlepiej przybliżone dane tłumaczone `y`\n", 57 | "\n", 58 | "$$ \n", 59 | "y \\approx f (X)\n", 60 | "$$\n", 61 | "\n", 62 | "---" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "%matplotlib inline\n", 72 | "import matplotlib.pyplot as plt\n", 73 | "import numpy as np\n", 74 | "plt.figure(figsize=(10,6))\n", 75 | "plt.style.use(\"dark_background\")\n", 76 | "\n", 77 | "x = np.linspace(-2, 2, 10)\n", 78 | "plt.scatter(x, x+0.5*np.abs(x));" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "x = np.linspace(-2, 2, 100)\n", 88 | "plt.plot(x, x+0.6*x*x)\n", 89 | "x = np.linspace(-2, 2, 10)\n", 90 | "plt.scatter(x, x+0.5*np.abs(x));" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "x = np.linspace(-2, 2, 100)\n", 100 | "plt.plot(x, x+0.6*x*x)\n", 101 | "plt.plot(x, 1.3*x)\n", 102 | "x = np.linspace(-2, 2, 10)\n", 103 | "plt.scatter(x, x+0.5*np.abs(x));" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "x = np.linspace(-2, 2, 100)\n", 113 | "plt.plot(x, 
x*0.4+0.1)\n", 114 | "plt.plot(x, 1.3*x)\n", 115 | "x = np.linspace(-2, 2, 10)\n", 116 | "plt.scatter(x, x+0.5*np.abs(x));" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": {}, 122 | "source": [ 123 | "---" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "import pandas as pd\n", 133 | "import numpy as np" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "data = pd.read_csv('adverts_22_04.csv', sep=';')\n", 143 | "data" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "data['cena_za_metr'] = data['Cena'] / data['Wielkość (m2)']\n", 153 | "data = data.dropna(subset=['cena_za_metr'])\n", 154 | "data = data.drop(['Cena', 'Data dodania'], axis=1)\n", 155 | "data" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "from sklearn.preprocessing import LabelEncoder\n", 165 | "labelencoder = LabelEncoder()\n", 166 | "\n", 167 | "label_encoded = data\n", 168 | "\n", 169 | "label_encoded['Lokalizacja_Cat'] = labelencoder.fit_transform(label_encoded['Lokalizacja'])\n", 170 | "label_encoded" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "metadata": {}, 177 | "outputs": [], 178 | "source": [ 179 | "data" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": null, 185 | "metadata": {}, 186 | "outputs": [], 187 | "source": [ 188 | "from sklearn.preprocessing import OneHotEncoder\n", 189 | "\n", 190 | "enc = OneHotEncoder(handle_unknown='ignore')\n", 191 | "\n", 192 | "enc_df = pd.DataFrame(enc.fit_transform(label_encoded[['Lokalizacja_Cat']]).toarray())\n", 193 | "\n", 194 | "one_hot_data = label_encoded.join(enc_df)\n", 195 | "one_hot_data" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "dum_df = pd.get_dummies(data, columns=['Lokalizacja'])\n", 205 | "dum_df" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": null, 211 | "metadata": {}, 212 | "outputs": [], 213 | "source": [ 214 | "data = pd.read_csv('adverts_22_04.csv', sep=';')\n", 215 | "data['cena_za_metr'] = data['Cena'] / data['Wielkość (m2)']\n", 216 | "data = data.dropna(subset=['cena_za_metr'])\n", 217 | "data = data.drop(['Cena', 'Data dodania'], axis=1)" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": null, 223 | "metadata": {}, 224 | "outputs": [], 225 | "source": [ 226 | "dum_df = pd.get_dummies(data, columns=['Lokalizacja', 'Na sprzedaż przez', 'Rodzaj nieruchomości', 'Liczba pokoi', 'Liczba łazienek', 'Parking'])\n", 227 | "dum_df" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": null, 233 | "metadata": {}, 234 | "outputs": [], 235 | "source": [ 236 | "dum_df.columns" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": null, 242 | "metadata": {}, 243 | "outputs": [], 244 | "source": [ 245 | "dum_df.corr()['cena_za_metr']" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": null, 251 | "metadata": {}, 252 | "outputs": [], 253 | "source": [ 254 | "import numpy as np\n", 255 | "import seaborn as sns\n", 256 | "import 
matplotlib.pyplot as plt\n", 257 | "import matplotlib.dates as mdates\n", 258 | "\n", 259 | "plt.figure(figsize=(20,15))\n", 260 | "plt.style.use(\"dark_background\")\n", 261 | "\n", 262 | "sns.heatmap(dum_df.corr(), cmap=\"seismic\", annot=True, vmin=-1, vmax=1);" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": null, 268 | "metadata": {}, 269 | "outputs": [], 270 | "source": [ 271 | "from sklearn.linear_model import LinearRegression\n", 272 | "\n", 273 | "y = dum_df['cena_za_metr']\n", 274 | "X = dum_df.drop(['cena_za_metr'], axis=1)\n", 275 | "\n", 276 | "reg = LinearRegression().fit(X, y)" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": null, 282 | "metadata": {}, 283 | "outputs": [], 284 | "source": [ 285 | "reg.score(X,y)" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": null, 291 | "metadata": {}, 292 | "outputs": [], 293 | "source": [ 294 | "reg.coef_" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "metadata": {}, 301 | "outputs": [], 302 | "source": [ 303 | "reg.intercept_" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": null, 309 | "metadata": {}, 310 | "outputs": [], 311 | "source": [] 312 | }, 313 | { 314 | "cell_type": "markdown", 315 | "metadata": {}, 316 | "source": [ 317 | " $$y = b + m_0 x_0 + m_1 x_1 + ... + m_n x_n$$" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": null, 323 | "metadata": {}, 324 | "outputs": [], 325 | "source": [ 326 | "list(zip(X.columns,reg.coef_))" 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": null, 332 | "metadata": {}, 333 | "outputs": [], 334 | "source": [ 335 | "X.iloc[0]" 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": null, 341 | "metadata": {}, 342 | "outputs": [], 343 | "source": [ 344 | "reg.predict(X.iloc[0:1])" 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": null, 350 | "metadata": {}, 351 | "outputs": [], 352 | "source": [] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": null, 357 | "metadata": {}, 358 | "outputs": [], 359 | "source": [] 360 | } 361 | ], 362 | "metadata": { 363 | "kernelspec": { 364 | "display_name": "Python 3", 365 | "language": "python", 366 | "name": "python3" 367 | }, 368 | "language_info": { 369 | "codemirror_mode": { 370 | "name": "ipython", 371 | "version": 3 372 | }, 373 | "file_extension": ".py", 374 | "mimetype": "text/x-python", 375 | "name": "python", 376 | "nbconvert_exporter": "python", 377 | "pygments_lexer": "ipython3", 378 | "version": "3.7.5" 379 | } 380 | }, 381 | "nbformat": 4, 382 | "nbformat_minor": 4 383 | } 384 | -------------------------------------------------------------------------------- /SKN Statystyki - Python 11.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Wyklad 11" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Machine Learning cd." 
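Before continuing, one caveat carried over from the regression notebook above: `reg.score(X, y)` reports R² on the same rows the model was fitted to, which is an optimistic estimate. A minimal sketch of a held-out evaluation, assuming the `dum_df` frame built there:

```python
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Assumes dum_df from the regression notebook above.
y = dum_df['cena_za_metr']
X = dum_df.drop(['cena_za_metr'], axis=1)

# Hold out a quarter of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42)

reg = LinearRegression().fit(X_train, y_train)
print(reg.score(X_train, y_train))  # R^2 on seen data (optimistic)
print(reg.score(X_test, y_test))    # R^2 on unseen data (honest estimate)
```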
15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "### Czytanie i Pisanie" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "### Plików Excela" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "import os\n", 38 | "os.getcwd()" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "import pandas as pd\n", 48 | "\n", 49 | "df = pd.DataFrame({'Data': [10, 20, 30, 20, 15, 30, 45]})\n", 50 | "writer = pd.ExcelWriter('pandas_simple.xlsx', engine='xlsxwriter')\n", 51 | "\n", 52 | "df.to_excel(writer, sheet_name='Sheet1')\n", 53 | "\n", 54 | "writer.save()" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "import pandas as pd\n", 64 | "\n", 65 | "\n", 66 | "path = ('Book1.xlsx')\n", 67 | "xl = pd.ExcelFile(path)" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "xl.sheet_names" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "df1 = xl.parse('Sheet1')" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "df1" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": {}, 100 | "source": [ 101 | "# Support Vector Machines" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": {}, 107 | "source": [ 108 | "## Usages" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": {}, 114 | "source": [ 115 | "- classification\n", 116 | "- regression\n", 117 | "- outlier detection" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": {}, 123 | "source": [ 124 | "## SVC Flavors in Scikit" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": {}, 130 | "source": [ 131 | "- SVC - many kernels to choose from. L2 error metric for regularization (strong penalization of big factors, sensitive to outliers)\n", 132 | "- NuSVC - slightly different implementation - worth checking against SVC\n", 133 | "- Linear SVC - only linear kernel, more settings, L1 error metric for regularization\n", 134 | "- OneClassSVM - good for detecting outliers" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "metadata": {}, 140 | "source": [ 141 | "## What are they good for" 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "metadata": {}, 147 | "source": [ 148 | "- Effective in high dimensional spaces.\n", 149 | "- Still effective in cases where number of dimensions is greater than the number of samples.\n", 150 | "- Uses a subset of training points in the decision function (called support vectors), so it is also memory efficient.\n", 151 | "- Versatile: different Kernel functions can be specified for the decision function. Common kernels are provided, but it is also possible to specify custom kernels." 
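A minimal sketch of the "subset of training points" property above: after fitting, only the margin-defining points are retained in the `support_vectors_` attribute (toy data, separate from the digits example that follows):

```python
from sklearn import svm

# Four linearly separable toy points in 2D.
X = [[0.0, 0.0], [0.2, 0.3], [1.0, 1.0], [0.8, 1.2]]
y = [0, 0, 1, 1]

clf = svm.SVC(kernel='linear', C=1.0)
clf.fit(X, y)

# Only the points that pin down the margin are stored.
print(clf.support_vectors_)
print(clf.predict([[0.9, 0.9]]))  # -> [1]
```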
152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": {}, 157 | "source": [ 158 | "## Complexity" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "Between\n", 166 | "\\begin{equation}\n", 167 | "O(n_{features} \\times n_{samples}^2)\n", 168 | "\\end{equation}\n", 169 | "and\n", 170 | "\\begin{equation}\n", 171 | "O(n_{features} \\times n_{samples}^3)\n", 172 | "\\end{equation}" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "from sklearn import datasets\n", 182 | "\n", 183 | "digits = datasets.load_digits()\n", 184 | "digits.data.shape" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "from sklearn import svm\n", 194 | "clf = svm.SVC(C=100., kernel=\"linear\")\n", 195 | "X = digits.data[:-1]\n", 196 | "y = digits.target[:-1]\n", 197 | "clf.fit(X, y) " 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": null, 203 | "metadata": {}, 204 | "outputs": [], 205 | "source": [ 206 | "clf.get_params()" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [ 215 | "clf.coef_" 216 | ] 217 | }, 218 | { 219 | "cell_type": "markdown", 220 | "metadata": {}, 221 | "source": [ 222 | "shape = [n_class * (n_class-1) / 2, n_features]" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": null, 228 | "metadata": {}, 229 | "outputs": [], 230 | "source": [ 231 | "clf.coef_.shape" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": null, 237 | "metadata": {}, 238 | "outputs": [], 239 | "source": [ 240 | "import numpy as np\n", 241 | "from sklearn.model_selection import train_test_split\n", 242 | "from sklearn import datasets\n", 243 | "from sklearn import svm\n", 244 | "\n", 245 | "iris = datasets.load_iris()\n", 246 | "\n", 247 | "iris.data.shape, iris.target.shape" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": null, 253 | "metadata": {}, 254 | "outputs": [], 255 | "source": [ 256 | "%matplotlib notebook\n", 257 | "\n", 258 | "import numpy as np\n", 259 | "import matplotlib.pyplot as plt\n", 260 | "from sklearn import svm, datasets\n", 261 | "\n", 262 | "# import some data to play with\n", 263 | "iris = datasets.load_iris()\n", 264 | "X = iris.data[:, :2] # we only take the first two features. We could\n", 265 | " # avoid this ugly slicing by using a two-dim dataset\n", 266 | "y = iris.target\n", 267 | "\n", 268 | "h = .02 # step size in the mesh\n", 269 | "\n", 270 | "# we create an instance of SVM and fit out data. 
We do not scale our\n", 271 | "# data since we want to plot the support vectors\n", 272 | "C = 1.0  # SVM regularization parameter\n", 273 | "svc = svm.SVC(kernel='linear', C=C).fit(X, y)\n", 274 | "rbf_svc = svm.SVC(kernel='rbf', gamma=0.7, C=C).fit(X, y)\n", 275 | "poly_svc = svm.SVC(kernel='poly', degree=3, C=C).fit(X, y)\n", 276 | "lin_svc = svm.LinearSVC(C=C).fit(X, y)\n", 277 | "\n", 278 | "# create a mesh to plot in\n", 279 | "x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1\n", 280 | "y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1\n", 281 | "xx, yy = np.meshgrid(np.arange(x_min, x_max, h),\n", 282 | "                     np.arange(y_min, y_max, h))\n", 283 | "\n", 284 | "# title for the plots\n", 285 | "titles = ['SVC with linear kernel',\n", 286 | "          'LinearSVC (linear kernel)',\n", 287 | "          'SVC with RBF kernel',\n", 288 | "          'SVC with polynomial (degree 3) kernel']\n", 289 | "\n", 290 | "\n", 291 | "for i, clf in enumerate((svc, lin_svc, rbf_svc, poly_svc)):\n", 292 | "    # Plot the decision boundary. For that, we will assign a color to each\n", 293 | "    # point in the mesh [x_min, x_max]x[y_min, y_max].\n", 294 | "    plt.subplot(2, 2, i + 1)\n", 295 | "    plt.subplots_adjust(wspace=0.8, hspace=0.8)\n", 296 | "\n", 297 | "    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])\n", 298 | "\n", 299 | "    # Put the result into a color plot\n", 300 | "    Z = Z.reshape(xx.shape)\n", 301 | "    plt.contourf(xx, yy, Z, cmap=plt.cm.Paired, alpha=0.6)\n", 302 | "\n", 303 | "    # Plot also the training points\n", 304 | "    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Paired)\n", 305 | "    plt.xlabel('Sepal length')\n", 306 | "    plt.ylabel('Sepal width')\n", 307 | "    plt.xlim(xx.min(), xx.max())\n", 308 | "    plt.ylim(yy.min(), yy.max())\n", 309 | "    plt.xticks(())\n", 310 | "    plt.yticks(())\n", 311 | "    plt.title(titles[i])\n", 312 | "\n", 313 | "plt.show()" 314 | ] 315 | }, 316 | { 317 | "cell_type": "markdown", 318 | "metadata": {}, 319 | "source": [ 320 | "## Kernels" 321 | ] 322 | }, 323 | { 324 | "cell_type": "markdown", 325 | "metadata": {}, 326 | "source": [ 327 | "- linear (use LinearSVC if not grid-searching the kernel)\n", 328 | "- RBF - default - Radial-basis function kernel (aka squared-exponential kernel).\n", 329 | "$$ k(x_i, x_j) = \exp\left(-\frac{1}{2}\, d\left(x_i / lengthscale,\; x_j / lengthscale\right)^2\right) $$\n", 330 | "- poly\n", 331 | "- sigmoid\n", 332 | "$$ K(X, Y) = \tanh(\gamma \langle X, Y \rangle + coef_0) $$\n", 333 | "- precomputed, callable" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": null, 339 | "metadata": {}, 340 | "outputs": [], 341 | "source": [ 342 | "%matplotlib inline\n", 343 | "import matplotlib.pyplot as plt\n", 344 | "import numpy as np\n", 345 | "\n", 346 | "x = np.linspace(-20, 20, 100)\n", 347 | "plt.title('Hyperbolic tangent (used by the sigmoid kernel)')\n", 348 | "plt.plot(x, np.tanh(x))\n", 349 | "print()" 350 | ] 351 | }, 352 | { 353 | "cell_type": "markdown", 354 | "metadata": {}, 355 | "source": [ 356 | "## Parameters" 357 | ] 358 | }, 359 | { 360 | "cell_type": "markdown", 361 | "metadata": {}, 362 | "source": [ 363 | "- C - Penalty parameter for error\n", 364 | "- kernel\n", 365 | "- gamma - kernel coefficient for rbf, poly, sigmoid \n", 366 | "    - if gamma='scale' is passed then it uses 1 / (n_features * X.std())\n", 367 | "- class_weight : {dict, ‘balanced’}" 368 | ] 369 | }, 370 | { 371 | "cell_type": "markdown", 372 | "metadata": {}, 373 | "source": [ 374 | "## C - Penalty & Regularization" 375 | ] 376 | }, 377 | { 378 | "cell_type": "markdown", 379 | "metadata": {}, 380 | "source": [
"![title](img\\over.png)" 382 | ] 383 | }, 384 | { 385 | "cell_type": "markdown", 386 | "metadata": {}, 387 | "source": [ 388 | "## Kernel" 389 | ] 390 | }, 391 | { 392 | "cell_type": "markdown", 393 | "metadata": {}, 394 | "source": [ 395 | "![title](img\\kernel.png)" 396 | ] 397 | }, 398 | { 399 | "cell_type": "markdown", 400 | "metadata": {}, 401 | "source": [ 402 | "## Weighting classes" 403 | ] 404 | }, 405 | { 406 | "cell_type": "markdown", 407 | "metadata": {}, 408 | "source": [ 409 | "![title](img\\unbalanced.png)" 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": null, 415 | "metadata": {}, 416 | "outputs": [], 417 | "source": [ 418 | "from __future__ import print_function\n", 419 | "\n", 420 | "from sklearn import datasets\n", 421 | "from sklearn.model_selection import train_test_split\n", 422 | "from sklearn.model_selection import GridSearchCV\n", 423 | "from sklearn.metrics import classification_report\n", 424 | "from sklearn.svm import SVC\n", 425 | "\n", 426 | "digits = datasets.load_digits()\n", 427 | "\n", 428 | "n_samples = len(digits.images)\n", 429 | "X = digits.images.reshape((n_samples, -1))\n", 430 | "y = digits.target\n", 431 | "\n", 432 | "# Split the dataset in two equal parts\n", 433 | "X_train, X_test, y_train, y_test = train_test_split(\n", 434 | " X, y, test_size=0.5, random_state=0)\n", 435 | "\n", 436 | "# Set the parameters by cross-validation\n", 437 | "tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],\n", 438 | " 'C': [1, 10, 100, 1000]},\n", 439 | " {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]\n", 440 | "\n", 441 | "scores = ['precision', 'recall', 'f1']\n", 442 | "\n", 443 | "for score in scores:\n", 444 | " print()\n", 445 | " print(\"# Tuning hyper-parameters for %s\" % score)\n", 446 | "\n", 447 | " clf = GridSearchCV(SVC(), tuned_parameters, cv=5,\n", 448 | " scoring='%s_macro' % score)\n", 449 | " clf.fit(X_train, y_train)\n", 450 | "\n", 451 | " print(\"Best parameters set found on development set:\")\n", 452 | " print(clf.best_params_)\n", 453 | " print()\n", 454 | " print(\"Grid scores on development set:\")\n", 455 | " means = clf.cv_results_['mean_test_score']\n", 456 | " stds = clf.cv_results_['std_test_score']\n", 457 | " for mean, std, params in zip(means, stds, clf.cv_results_['params']):\n", 458 | " print(\"%0.3f (+/-%0.03f) for %r\"\n", 459 | " % (mean, std * 2, params))\n", 460 | " \n" 461 | ] 462 | }, 463 | { 464 | "cell_type": "code", 465 | "execution_count": null, 466 | "metadata": {}, 467 | "outputs": [], 468 | "source": [ 469 | "from sklearn.metrics import f1_score, recall_score, precision_score, SCORERS" 470 | ] 471 | }, 472 | { 473 | "cell_type": "markdown", 474 | "metadata": {}, 475 | "source": [ 476 | "$$F_1 = \\frac{2 \\times precision \\times recall}{precision + recall} $$" 477 | ] 478 | }, 479 | { 480 | "cell_type": "code", 481 | "execution_count": null, 482 | "metadata": {}, 483 | "outputs": [], 484 | "source": [ 485 | "y_true = [0, 1, 0, 1, 0, 1, 0, 1]\n", 486 | "y_pred = [0, 1, 0, 0, 0, 0, 1, 1]\n", 487 | "\n", 488 | "precision_score(y_true, y_pred)" 489 | ] 490 | }, 491 | { 492 | "cell_type": "code", 493 | "execution_count": null, 494 | "metadata": {}, 495 | "outputs": [], 496 | "source": [ 497 | "recall_score(y_true, y_pred)\n" 498 | ] 499 | }, 500 | { 501 | "cell_type": "code", 502 | "execution_count": null, 503 | "metadata": {}, 504 | "outputs": [], 505 | "source": [ 506 | "f1_score(y_true, y_pred)" 507 | ] 508 | }, 509 | { 510 | "cell_type": "code", 511 | "execution_count": 
null, 512 | "metadata": {}, 513 | "outputs": [], 514 | "source": [ 515 | "SCORERS.keys()" 516 | ] 517 | } 518 | ], 519 | "metadata": { 520 | "kernelspec": { 521 | "display_name": "Python 2", 522 | "language": "python", 523 | "name": "python2" 524 | }, 525 | "language_info": { 526 | "codemirror_mode": { 527 | "name": "ipython", 528 | "version": 2 529 | }, 530 | "file_extension": ".py", 531 | "mimetype": "text/x-python", 532 | "name": "python", 533 | "nbconvert_exporter": "python", 534 | "pygments_lexer": "ipython2", 535 | "version": "2.7.14" 536 | } 537 | }, 538 | "nbformat": 4, 539 | "nbformat_minor": 2 540 | } 541 | -------------------------------------------------------------------------------- /Workshop 2 - Beautiful Soup.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "html_doc = \"\"\"\n", 10 | "The Dormouse's story\n", 11 | "\n", 12 | "
<p class=\"title\"><b>The Dormouse's story</b></p>\n", 13 | "\n", 14 | "<p class=\"story\">Once upon a time there were three little sisters; and their names were\n", 15 | "<a href=\"http://example.com/elsie\" class=\"sister\" id=\"link1\">Elsie</a>,\n", 16 | "<a href=\"http://example.com/lacie\" class=\"sister\" id=\"link2\">Lacie</a> and\n", 17 | "<a href=\"http://example.com/tillie\" class=\"sister\" id=\"link3\">Tillie</a>;\n", 18 | "and they lived at the bottom of a well.</p>\n", 19 | "\n", 20 | "<p class=\"story\">...</p>
\n", 21 | "\"\"\"" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "from bs4 import BeautifulSoup\n", 31 | "soup = BeautifulSoup(html_doc, 'html.parser')\n", 32 | "\n", 33 | "print(soup.prettify())" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "from bs4 import BeautifulSoup\n", 43 | "import lxml\n", 44 | "soup = BeautifulSoup(html_doc, 'lxml')\n" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "soup.p" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "soup.p['class']" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "soup.a" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "soup.find_all('a')" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "soup.find(id=\"link3\")" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "[ link.get('href') for link in soup.find_all('a')]" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "soup.a" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "soup.a.find_next_sibling(\"a\")" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [ 125 | "soup.p" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "soup.p.find_next_sibling(\"p\")" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "pn=soup.p.find_next_sibling(\"p\")\n", 144 | "children = pn.children" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "children" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [ 162 | "lista = [ x for x in children ]\n", 163 | "lista" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "metadata": {}, 170 | "outputs": [], 171 | "source": [ 172 | "lista[1].get('href')" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "head_tag = soup.head\n", 182 | "head_tag" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [ 191 | "for child in head_tag.children:\n", 192 | " print(child)" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [ 201 | "for child in head_tag.descendants:\n", 
202 | " print(child)" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": null, 208 | "metadata": {}, 209 | "outputs": [], 210 | "source": [ 211 | "last_a_tag = soup.find(\"a\", id=\"link3\")\n", 212 | "last_a_tag\n" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": null, 218 | "metadata": {}, 219 | "outputs": [], 220 | "source": [ 221 | "last_a_tag.next_sibling" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": null, 227 | "metadata": {}, 228 | "outputs": [], 229 | "source": [ 230 | "last_a_tag.next_element" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [ 239 | "last_a_tag.parent" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": null, 245 | "metadata": {}, 246 | "outputs": [], 247 | "source": [ 248 | "def has_class_but_no_id(tag):\n", 249 | " return tag.has_attr('class') and not tag.has_attr('id')\n", 250 | "\n", 251 | "soup.find_all(has_class_but_no_id)" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": null, 257 | "metadata": {}, 258 | "outputs": [], 259 | "source": [ 260 | "soup.find_all(id='link2')" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": null, 266 | "metadata": {}, 267 | "outputs": [], 268 | "source": [ 269 | "soup.find_all(\"a\", class_=\"sister\")" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": null, 275 | "metadata": {}, 276 | "outputs": [], 277 | "source": [ 278 | "soup.find_all(\"a\")\n", 279 | "soup(\"a\")" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": null, 285 | "metadata": {}, 286 | "outputs": [], 287 | "source": [] 288 | } 289 | ], 290 | "metadata": { 291 | "kernelspec": { 292 | "display_name": "Python 3", 293 | "language": "python", 294 | "name": "python3" 295 | }, 296 | "language_info": { 297 | "codemirror_mode": { 298 | "name": "ipython", 299 | "version": 3 300 | }, 301 | "file_extension": ".py", 302 | "mimetype": "text/x-python", 303 | "name": "python", 304 | "nbconvert_exporter": "python", 305 | "pygments_lexer": "ipython3", 306 | "version": "3.6.7" 307 | } 308 | }, 309 | "nbformat": 4, 310 | "nbformat_minor": 2 311 | } 312 | -------------------------------------------------------------------------------- /Workshop 3 - Paginacja.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Generatory, Yield" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "mylist = [0, 1, 4]\n", 17 | "for i in mylist:\n", 18 | " print(i)" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "mylist = [x*x for x in range(3)]\n", 28 | "for i in mylist:\n", 29 | " print(i)" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "mylist = (x*x for x in range(3))\n", 39 | "for i in mylist:\n", 40 | " print(i)" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "def createGenerator():\n", 50 | " mylist = range(3)\n", 51 | " for i in mylist:\n", 52 | " yield i*i\n", 53 | " \n", 54 | "for i in 
createGenerator():\n", 55 | " print(i)" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "# Fail2Ban" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "As you can see in my example, I have set up 300 maxretry and 300 for findtime, so, we need to have 300 GETs from the same IP in a time window of 300 seconds to have the originating IP blocked." 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "# Scraper" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "Scrapy doesn’t wait a fixed amount of time between requests, but uses a random interval between 0.5 * DOWNLOAD_DELAY and 1.5 * DOWNLOAD_DELAY." 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "import scrapy\n", 93 | "import scrapy.crawler as crawler\n", 94 | "from bs4 import BeautifulSoup\n", 95 | "\n", 96 | "from scrapy.crawler import CrawlerProcess\n", 97 | "\n", 98 | "class MySpider(scrapy.Spider):\n", 99 | " name = 'myspider'\n", 100 | " start_urls = [\n", 101 | " 'https://www.gumtree.pl/s-mieszkania-i-domy-sprzedam-i-kupie/mazowieckie/v1c9073l3200001p1'\n", 102 | " ]\n", 103 | " \n", 104 | " custom_settings = {\n", 105 | " 'DOWNLOAD_DELAY': '4.0',\n", 106 | " }\n", 107 | "\n", 108 | " top_url = 'https://www.gumtree.pl'\n", 109 | " def parse(self, response):\n", 110 | " self.logger.info('Got successful response from {}'.format(response.url))\n", 111 | " soup = BeautifulSoup(response.body, 'lxml')\n", 112 | " link_tabs = soup.findAll(\"div\", {\"class\": \"result-link\"})\n", 113 | " item_urls = []\n", 114 | " for tab in link_tabs:\n", 115 | " hrefs = tab.findAll(\"a\", {\"class\": \"href-link\"})\n", 116 | " for h in hrefs:\n", 117 | " item_urls.append(self.top_url + h[\"href\"])\n", 118 | " \n", 119 | " \n", 120 | " for item_url in item_urls:\n", 121 | " yield scrapy.Request(item_url, self.parse_item)\n", 122 | "\n", 123 | " def parse_item(self, response):\n", 124 | " self.logger.info('Got successful response from {}'.format(response.url))\n", 125 | " # \n", 126 | " #item = MyItem()\n", 127 | " # populate `item` fields\n", 128 | " # and extract item_details_url\n", 129 | " #yield scrapy.Request(item_details_url, self.parse_details, meta={'item': item})\n", 130 | "\n", 131 | "\n" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [ 140 | "process = CrawlerProcess({\n", 141 | " 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'\n", 142 | "})\n", 143 | "process.crawl(MySpider)\n", 144 | "process.start()" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "import scrapy\n", 154 | "import scrapy.crawler as crawler\n", 155 | "from bs4 import BeautifulSoup\n", 156 | "\n", 157 | "from scrapy.crawler import CrawlerProcess\n", 158 | "\n", 159 | "class MyPaginatingSpider(scrapy.Spider):\n", 160 | " name = 'mypaginatingspider'\n", 161 | " start_urls = [\n", 162 | " 'https://www.gumtree.pl/s-mieszkania-i-domy-sprzedam-i-kupie/mazowieckie/v1c9073l3200001p1'\n", 163 | " ]\n", 164 | " \n", 165 | " custom_settings = {\n", 166 | " 'DOWNLOAD_DELAY': '6.0',\n", 167 | " }\n", 168 | "\n", 169 | " top_url = 'https://www.gumtree.pl'\n", 170 | " def parse(self, response):\n", 171 | " 
self.logger.info('Got successful response from {}'.format(response.url))\n", 172 | " soup = BeautifulSoup(response.body, 'lxml')\n", 173 | " link_tabs = soup.findAll(\"div\", {\"class\": \"result-link\"})\n", 174 | " item_urls = []\n", 175 | " next_urls = []\n", 176 | " for tab in link_tabs:\n", 177 | " hrefs = tab.findAll(\"a\", {\"class\": \"href-link\"})\n", 178 | " for h in hrefs:\n", 179 | " item_urls.append(self.top_url + h[\"href\"])\n", 180 | " \n", 181 | " nexts = soup.findAll(\"a\", {\"class\": \"next\"})\n", 182 | " \n", 183 | " for h in nexts:\n", 184 | " for h in hrefs:\n", 185 | " next_urls.append(self.top_url + h[\"href\"])\n", 186 | " \n", 187 | " print (next_urls)\n", 188 | " \n", 189 | " for item_url in item_urls:\n", 190 | " yield scrapy.Request(item_url, self.parse_item)\n", 191 | "\n", 192 | " #for next_url in next_urls:\n", 193 | " # yield scrapy.Request(item_url, self.parse)\n", 194 | " \n", 195 | " \n", 196 | " def parse_item(self, response):\n", 197 | " self.logger.info('Got successful response from {}'.format(response.url))\n", 198 | "\n" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "metadata": {}, 205 | "outputs": [], 206 | "source": [ 207 | "process = CrawlerProcess({\n", 208 | " 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'\n", 209 | "})\n", 210 | "process.crawl(MyPaginatingSpider)\n", 211 | "process.start()" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": null, 217 | "metadata": {}, 218 | "outputs": [], 219 | "source": [] 220 | } 221 | ], 222 | "metadata": { 223 | "kernelspec": { 224 | "display_name": "Python 3", 225 | "language": "python", 226 | "name": "python3" 227 | }, 228 | "language_info": { 229 | "codemirror_mode": { 230 | "name": "ipython", 231 | "version": 3 232 | }, 233 | "file_extension": ".py", 234 | "mimetype": "text/x-python", 235 | "name": "python", 236 | "nbconvert_exporter": "python", 237 | "pygments_lexer": "ipython3", 238 | "version": "3.6.7" 239 | } 240 | }, 241 | "nbformat": 4, 242 | "nbformat_minor": 2 243 | } 244 | -------------------------------------------------------------------------------- /Workshop 4 - Poprawność zachowania.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# robots.txt" 8 | ] 9 | }, 10 | { 11 | "cell_type": "raw", 12 | "metadata": {}, 13 | "source": [ 14 | "User-agent: *\n", 15 | "\n", 16 | "#Sorting parameters\n", 17 | "Disallow: /*order=*\n", 18 | "Disallow: /*galleryView=*\n", 19 | "Disallow: /*sort=*\n", 20 | "\n", 21 | "#Other comments:\n", 22 | "Disallow: *?sr=8*\n", 23 | "Disallow: /f-pbfvb/*\n", 24 | "Disallow: *?be=*\n", 25 | "Disallow: *?geoTag=*\n", 26 | "Disallow: /api/*\n", 27 | "Disallow: /306629115/MX_*\n", 28 | "Disallow: /bolt-2dot0-frontend$*\n", 29 | "Disallow: /7162/Gumtree_*\n", 30 | "Disallow: /169054071/Gumtree_*\n", 31 | "Disallow: /306629115/iBazar_MX*\n", 32 | "Disallow: /306629115/AR_*\n", 33 | "\n", 34 | "#Sitemaps\n", 35 | "Sitemap: https://www.gumtree.pl/vip_index.xml\n", 36 | "Sitemap: https://www.gumtree.pl/sitemap_index.xml\n", 37 | "Sitemap: https://www.gumtree.pl/sitemap_loccat.xml\n", 38 | "Sitemap: https://www.gumtree.pl/sitemap_loccatatt.xml\n", 39 | " " 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "# Scraper" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": 
{}, 53 | "outputs": [], 54 | "source": [ 55 | "import scrapy\n", 56 | "import scrapy.crawler as crawler\n", 57 | "from bs4 import BeautifulSoup\n", 58 | "\n", 59 | "from scrapy.crawler import CrawlerProcess\n", 60 | "\n", 61 | "class GumtreeApartmentsSpider(scrapy.Spider):\n", 62 | " name = 'gumtreeapartmentsspider'\n", 63 | " start_urls = [\n", 64 | " 'https://www.gumtree.pl/s-mieszkania-i-domy-sprzedam-i-kupie/mazowieckie/page-'+str(i)+'/v1c9073l3200001p'+str(i) for i in range(2,4)\n", 65 | " ]\n", 66 | " start_urls.append(\n", 67 | " 'https://www.gumtree.pl/s-mieszkania-i-domy-sprzedam-i-kupie/mazowieckie/v1c9073l3200001p1'\n", 68 | " )\n", 69 | " found_apartments = []\n", 70 | " \n", 71 | " custom_settings = {\n", 72 | " 'DOWNLOAD_DELAY': '4.0', #obejście Fail2Ban\n", 73 | " 'ROBOTSTXT_OBEY' : True\n", 74 | " }\n", 75 | "\n", 76 | " top_url = 'https://www.gumtree.pl'\n", 77 | " def parse(self, response):\n", 78 | " self.logger.info('Got successful response from {}'.format(response.url))\n", 79 | " soup = BeautifulSoup(response.body, 'lxml')\n", 80 | " link_tabs = soup.findAll(\"div\", {\"class\": \"result-link\"})\n", 81 | " item_urls = []\n", 82 | " next_urls = []\n", 83 | " for tab in link_tabs:\n", 84 | " hrefs = tab.findAll(\"a\", {\"class\": \"href-link\"})\n", 85 | " for h in hrefs:\n", 86 | " item_urls.append(self.top_url + h[\"href\"]) #dopisuje 'https://www.gumtree.pl', bo otrzymany adres jest względny, przygotowuje listę wszystkich linków\n", 87 | " \n", 88 | " for item_url in item_urls:\n", 89 | " yield scrapy.Request(item_url, self.parse_item)\n", 90 | " \n", 91 | " def parse_item(self, response): #item_url - odwiedzanie strony, #self.parse_item - przetworzenie przy pomocy funkcji\n", 92 | " #Courtesy of Mr Sebastian Muraszewski\n", 93 | " found_apartments = []\n", 94 | " soup = BeautifulSoup(response.text, 'html.parser') \n", 95 | " apartments = soup.find('div', {'class': 'vip-header-and-details'})\n", 96 | " apartment_details = dict()\n", 97 | " apartment_details['Nazwa ogłoszenia'] = apartments.find('span', class_ = 'myAdTitle').text\n", 98 | " apartment_details['Cena'] = apartments.find('span', class_ = 'amount').text.replace(\"\\xa0\",\" \") #replace w celu usunięcia twardej spacji\n", 99 | " container = soup.find('ul', class_ = 'selMenu') #zebranie informacji z ramki do kontenera\n", 100 | " \n", 101 | " nazwy = container.findAll('span', class_ = 'name')\n", 102 | " szczegoly = container.findAll('span', class_ = 'value')\n", 103 | " apartment_details.update({name.text: value.text.strip() for name, value in zip(nazwy, szczegoly)}) #dodawanie elementów z ramki do listy i usunięcie whitespace'ów\n", 104 | " \n", 105 | " with open('plik.csv', 'a') as f:\n", 106 | " v = apartment_details.values()\n", 107 | " f.write('\\t'.join(v))\n" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "process = CrawlerProcess({\n", 117 | " 'USER_AGENT': 'Testing MyCrawler (michal.korzycki@gmail.com)'\n", 118 | "})\n", 119 | "process.crawl(GumtreeApartmentsSpider)\n", 120 | "process.start()" 121 | ] 122 | } 123 | ], 124 | "metadata": { 125 | "kernelspec": { 126 | "display_name": "Python 3", 127 | "language": "python", 128 | "name": "python3" 129 | }, 130 | "language_info": { 131 | "codemirror_mode": { 132 | "name": "ipython", 133 | "version": 3 134 | }, 135 | "file_extension": ".py", 136 | "mimetype": "text/x-python", 137 | "name": "python", 138 | "nbconvert_exporter": "python", 139 | 
"pygments_lexer": "ipython3", 140 | "version": "3.6.7" 141 | } 142 | }, 143 | "nbformat": 4, 144 | "nbformat_minor": 2 145 | } 146 | -------------------------------------------------------------------------------- /Wyklad 12 - Klasyfikacja tekstu.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Klasyfikacja tekstu" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "# encode document\n", 17 | "vector = vectorizer.transform([text[0]])\n", 18 | "# summarize encoded vector\n", 19 | "print(vector.shape)\n", 20 | "print(vector.toarray())" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "\n", 30 | "from sklearn.feature_extraction.text import HashingVectorizer\n", 31 | "# list of text documents\n", 32 | "text = [\"The quick brown fox jumped over the lazy dog.\"]\n", 33 | "# create the transform\n", 34 | "vectorizer = HashingVectorizer(n_features=20)\n", 35 | "# encode document\n", 36 | "vector = vectorizer.transform(text)\n", 37 | "# summarize encoded vector\n", 38 | "print(vector.shape)\n", 39 | "print(vector.toarray())" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | " categories = ['alt.atheism', 'soc.religion.christian','comp.graphics', 'sci.med']" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "from sklearn.datasets import fetch_20newsgroups\n", 58 | "twenty_train = fetch_20newsgroups(data_home='c:\\\\work', subset='train', categories=categories, shuffle=True, random_state=42, download_if_missing=False)" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "twenty_train.target_names" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "len(twenty_train.data)" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "print(\"\\n\".join(twenty_train.data[0].split(\"\\n\")[:15]))" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | " print(twenty_train.target_names[twenty_train.target[0]])" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "from sklearn.feature_extraction.text import CountVectorizer\n", 104 | "count_vect = CountVectorizer()\n", 105 | "X_train_counts = count_vect.fit_transform(twenty_train.data)\n", 106 | "X_train_counts.shape" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "from sklearn.feature_extraction.text import CountVectorizer\n", 116 | "# list of text documents\n", 117 | "text = [\"The quick brown fox jumped over the lazy dog.\"]\n", 118 | "# create the transform\n", 119 | "vectorizer = CountVectorizer()\n", 120 | "# tokenize and build vocab\n", 121 | "vectorizer.fit(text)\n", 122 | "# summarize\n", 123 | 
"print(vectorizer.vocabulary_)\n", 124 | "# encode document\n", 125 | "vector = vectorizer.transform(text)\n", 126 | "# summarize encoded vector\n", 127 | "print(vector.shape)\n", 128 | "print(type(vector))\n", 129 | "print(vector.toarray())" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 139 | "# list of text documents\n", 140 | "text = [\"The quick brown fox jumped over the lazy dog.\",\n", 141 | "\t\t\"The dog.\",\n", 142 | "\t\t\"The fox\"]\n", 143 | "# create the transform\n", 144 | "vectorizer = TfidfVectorizer()\n", 145 | "# tokenize and build vocab\n", 146 | "vectorizer.fit(text)\n", 147 | "# summarize\n", 148 | "\n", 149 | "print(vectorizer.idf_)" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "$$ tf-idf_{ij} = tf_{ij} \\times idf_i $$\n", 157 | "\n", 158 | "$$ tf_{ij} = \\frac{n_{ij}}{\\sum_{k} n_{kj}} n_{ij} - ilość~wystąpień~t_i~w~dokumencie~d_j $$ \n", 159 | "\n", 160 | "$$ idf_i = log \\frac{ |D| }{|{d : t_i \\in d }|} $$" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "metadata": {}, 167 | "outputs": [], 168 | "source": [ 169 | "from sklearn.feature_extraction.text import TfidfTransformer\n", 170 | "from sklearn.naive_bayes import MultinomialNB\n", 171 | "\n", 172 | "tfidf_transformer = TfidfTransformer(use_idf=True).fit(X_train_counts)\n", 173 | "X_train_tfidf = tfidf_transformer.transform(X_train_counts)\n", 174 | "X_train_tfidf.shape\n", 175 | "\n", 176 | "clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "metadata": {}, 183 | "outputs": [], 184 | "source": [ 185 | "docs_new = ['God is love', 'OpenGL on the GPU is fast']\n", 186 | "X_new_counts = count_vect.transform(docs_new)\n", 187 | "X_new_tfidf = tfidf_transformer.transform(X_new_counts)\n", 188 | "\n", 189 | "predicted = clf.predict(X_new_tfidf)\n", 190 | "\n", 191 | "for doc, category in zip(docs_new, predicted):\n", 192 | " print('%r => %s' % (doc, twenty_train.target_names[category]))\n" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [ 201 | "from sklearn.pipeline import Pipeline\n", 202 | "text_clf = Pipeline([\n", 203 | " ('vect', CountVectorizer()),\n", 204 | " ('tfidf', TfidfTransformer()),\n", 205 | " ('clf', MultinomialNB()),\n", 206 | " ])" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [ 215 | "text_clf.fit(twenty_train.data, twenty_train.target) " 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": null, 221 | "metadata": {}, 222 | "outputs": [], 223 | "source": [ 224 | "import numpy as np\n", 225 | "twenty_test = fetch_20newsgroups(data_home='c:\\\\work', subset='train', categories=categories, shuffle=True, random_state=42, download_if_missing=False)\n", 226 | "docs_test = twenty_test.data\n", 227 | "predicted = text_clf.predict(docs_test)\n", 228 | "print np.mean(predicted == twenty_test.target) " 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "from sklearn import metrics\n", 238 | 
"print(metrics.classification_report(twenty_test.target, predicted, target_names=twenty_test.target_names))" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": null, 244 | "metadata": {}, 245 | "outputs": [], 246 | "source": [ 247 | "from sklearn.linear_model import SGDClassifier\n", 248 | "text_clf = Pipeline([\n", 249 | "('vect', CountVectorizer()),\n", 250 | "('tfidf', TfidfTransformer()),\n", 251 | "('clf', SGDClassifier(loss='hinge', penalty='l2',\n", 252 | "alpha=1e-3, random_state=42,\n", 253 | "max_iter=5, tol=None)),\n", 254 | "])\n", 255 | "\n", 256 | "text_clf.fit(twenty_train.data, twenty_train.target) \n", 257 | "\n", 258 | "predicted = text_clf.predict(docs_test)\n", 259 | "np.mean(predicted == twenty_test.target) \n" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": null, 265 | "metadata": {}, 266 | "outputs": [], 267 | "source": [ 268 | "from sklearn import metrics\n", 269 | "print(metrics.classification_report(twenty_test.target, predicted, target_names=twenty_test.target_names))" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": null, 275 | "metadata": {}, 276 | "outputs": [], 277 | "source": [ 278 | "metrics.confusion_matrix(twenty_test.target, predicted)" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": null, 284 | "metadata": {}, 285 | "outputs": [], 286 | "source": [ 287 | "from sklearn.model_selection import GridSearchCV\n", 288 | "parameters = {\n", 289 | "'vect__ngram_range': [(1, 1), (1, 2)],\n", 290 | "'tfidf__use_idf': (True, False),\n", 291 | "'clf__alpha': (1e-2, 1e-3),\n", 292 | "}" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": null, 298 | "metadata": {}, 299 | "outputs": [], 300 | "source": [ 301 | "gs_clf = GridSearchCV(text_clf, parameters, cv=5, iid=False, n_jobs=-1)\n", 302 | "gs_clf.__dict__" 303 | ] 304 | }, 305 | { 306 | "cell_type": "markdown", 307 | "metadata": {}, 308 | "source": [ 309 | "![Intern](\"img/se.jpg\")" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": null, 315 | "metadata": {}, 316 | "outputs": [], 317 | "source": [] 318 | } 319 | ], 320 | "metadata": { 321 | "kernelspec": { 322 | "display_name": "Python 2", 323 | "language": "python", 324 | "name": "python2" 325 | }, 326 | "language_info": { 327 | "codemirror_mode": { 328 | "name": "ipython", 329 | "version": 2 330 | }, 331 | "file_extension": ".py", 332 | "mimetype": "text/x-python", 333 | "name": "python", 334 | "nbconvert_exporter": "python", 335 | "pygments_lexer": "ipython2", 336 | "version": "2.7.14" 337 | } 338 | }, 339 | "nbformat": 4, 340 | "nbformat_minor": 2 341 | } 342 | -------------------------------------------------------------------------------- /dash_examples/1.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import dash 3 | import dash_core_components as dcc 4 | import dash_html_components as html 5 | 6 | external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css'] 7 | 8 | app = dash.Dash(__name__, external_stylesheets=external_stylesheets) 9 | 10 | app.layout = html.Div(children=[ 11 | html.H1(children='Hello Dash'), 12 | 13 | html.Div(children=''' 14 | Dash: A web application framework for Python. 
15 | '''), 16 | 17 | dcc.Graph( 18 | id='example-graph', 19 | figure={ 20 | 'data': [ 21 | {'x': [1, 2, 3], 'y': [4, 1, 2], 'type': 'bar', 'name': 'SF'}, 22 | {'x': [1, 2, 3], 'y': [2, 4, 5], 'type': 'bar', 'name': u'Montréal'}, 23 | ], 24 | 'layout': { 25 | 'title': 'Dash Data Visualization' 26 | } 27 | } 28 | ) 29 | ]) 30 | 31 | if __name__ == '__main__': 32 | app.run_server(debug=True) 33 | -------------------------------------------------------------------------------- /dash_examples/2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import dash 3 | import dash_core_components as dcc 4 | import dash_html_components as html 5 | 6 | external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css'] 7 | 8 | app = dash.Dash(__name__, external_stylesheets=external_stylesheets) 9 | 10 | colors = { 11 | 'background': '#111111', 12 | 'text': '#7FDBFF' 13 | } 14 | 15 | app.layout = html.Div(style={'backgroundColor': colors['background']}, children=[ 16 | html.H1( 17 | children='Hello Dash', 18 | style={ 19 | 'textAlign': 'center', 20 | 'color': colors['text'] 21 | } 22 | ), 23 | 24 | html.Div(children='Dash: A web application framework for Python.', style={ 25 | 'textAlign': 'center', 26 | 'color': colors['text'] 27 | }), 28 | 29 | dcc.Graph( 30 | id='example-graph-2', 31 | figure={ 32 | 'data': [ 33 | {'x': [1, 2, 3], 'y': [4, 1, 2], 'type': 'bar', 'name': 'SF'}, 34 | {'x': [1, 2, 3], 'y': [2, 4, 5], 'type': 'bar', 'name': u'Montréal'}, 35 | ], 36 | 'layout': { 37 | 'plot_bgcolor': colors['background'], 38 | 'paper_bgcolor': colors['background'], 39 | 'font': { 40 | 'color': colors['text'] 41 | } 42 | } 43 | } 44 | ) 45 | ]) 46 | 47 | if __name__ == '__main__': 48 | app.run_server(debug=True) 49 | -------------------------------------------------------------------------------- /dash_examples/2a.py: -------------------------------------------------------------------------------- 1 | import dash 2 | import dash_core_components as dcc 3 | import dash_html_components as html 4 | import pandas as pd 5 | 6 | df = pd.read_csv( 7 | 'https://gist.githubusercontent.com/chriddyp/' 8 | 'c78bf172206ce24f77d6363a2d754b59/raw/' 9 | 'c353e8ef842413cae56ae3920b8fd78468aa4cb2/' 10 | 'usa-agricultural-exports-2011.csv') 11 | 12 | 13 | def generate_table(dataframe, max_rows=10): 14 | return html.Table( 15 | # Header 16 | [html.Tr([html.Th(col) for col in dataframe.columns])] + 17 | 18 | # Body 19 | [html.Tr([ 20 | html.Td(dataframe.iloc[i][col]) for col in dataframe.columns 21 | ]) for i in range(min(len(dataframe), max_rows))] 22 | ) 23 | 24 | 25 | external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css'] 26 | 27 | app = dash.Dash(__name__, external_stylesheets=external_stylesheets) 28 | 29 | app.layout = html.Div(children=[ 30 | html.H4(children='US Agriculture Exports (2011)'), 31 | generate_table(df) 32 | ]) 33 | 34 | if __name__ == '__main__': 35 | app.run_server(debug=True) 36 | -------------------------------------------------------------------------------- /dash_examples/2b.py: -------------------------------------------------------------------------------- 1 | import dash 2 | import dash_core_components as dcc 3 | import dash_html_components as html 4 | import pandas as pd 5 | import plotly.graph_objs as go 6 | 7 | external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css'] 8 | 9 | app = dash.Dash(__name__, external_stylesheets=external_stylesheets) 10 | 11 | df = pd.read_csv( 12 | 
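    # gdp-life-exp-2007.csv: one row per country with continent, GDP per capita and life expectancy, used for the scatter below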
'https://gist.githubusercontent.com/chriddyp/' + 13 | '5d1ea79569ed194d432e56108a04d188/raw/' + 14 | 'a9f9e8076b837d541398e999dcbac2b2826a81f8/'+ 15 | 'gdp-life-exp-2007.csv') 16 | 17 | 18 | app.layout = html.Div([ 19 | dcc.Graph( 20 | id='life-exp-vs-gdp', 21 | figure={ 22 | 'data': [ 23 | go.Scatter( 24 | x=df[df['continent'] == i]['gdp per capita'], 25 | y=df[df['continent'] == i]['life expectancy'], 26 | text=df[df['continent'] == i]['country'], 27 | mode='markers', 28 | opacity=0.7, 29 | marker={ 30 | 'size': 15, 31 | 'line': {'width': 0.5, 'color': 'white'} 32 | }, 33 | name=i 34 | ) for i in df.continent.unique() 35 | ], 36 | 'layout': go.Layout( 37 | xaxis={'type': 'log', 'title': 'GDP Per Capita'}, 38 | yaxis={'title': 'Life Expectancy'}, 39 | margin={'l': 40, 'b': 40, 't': 10, 'r': 10}, 40 | legend={'x': 0, 'y': 1}, 41 | hovermode='closest' 42 | ) 43 | } 44 | ) 45 | ]) 46 | 47 | if __name__ == '__main__': 48 | app.run_server(debug=True) 49 | -------------------------------------------------------------------------------- /dash_examples/2c.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import dash 3 | import dash_core_components as dcc 4 | import dash_html_components as html 5 | 6 | external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css'] 7 | 8 | app = dash.Dash(__name__, external_stylesheets=external_stylesheets) 9 | 10 | app.layout = html.Div([ 11 | html.Label('Dropdown'), 12 | dcc.Dropdown( 13 | options=[ 14 | {'label': 'New York City', 'value': 'NYC'}, 15 | {'label': u'Montréal', 'value': 'MTL'}, 16 | {'label': 'San Francisco', 'value': 'SF'} 17 | ], 18 | value='MTL' 19 | ), 20 | 21 | html.Label('Multi-Select Dropdown'), 22 | dcc.Dropdown( 23 | options=[ 24 | {'label': 'New York City', 'value': 'NYC'}, 25 | {'label': u'Montréal', 'value': 'MTL'}, 26 | {'label': 'San Francisco', 'value': 'SF'} 27 | ], 28 | value=['MTL', 'SF'], 29 | multi=True 30 | ), 31 | 32 | html.Label('Radio Items'), 33 | dcc.RadioItems( 34 | options=[ 35 | {'label': 'New York City', 'value': 'NYC'}, 36 | {'label': u'Montréal', 'value': 'MTL'}, 37 | {'label': 'San Francisco', 'value': 'SF'} 38 | ], 39 | value='MTL' 40 | ), 41 | 42 | html.Label('Checkboxes'), 43 | dcc.Checklist( 44 | options=[ 45 | {'label': 'New York City', 'value': 'NYC'}, 46 | {'label': u'Montréal', 'value': 'MTL'}, 47 | {'label': 'San Francisco', 'value': 'SF'} 48 | ], 49 | values=['MTL', 'SF'] 50 | ), 51 | 52 | html.Label('Text Input'), 53 | dcc.Input(value='MTL', type='text'), 54 | 55 | html.Label('Slider'), 56 | dcc.Slider( 57 | min=0, 58 | max=9, 59 | marks={i: 'Label {}'.format(i) if i == 1 else str(i) for i in range(1, 6)}, 60 | value=5, 61 | ), 62 | ], style={'columnCount': 2}) 63 | 64 | if __name__ == '__main__': 65 | app.run_server(debug=True) 66 | -------------------------------------------------------------------------------- /dash_examples/2t.py: -------------------------------------------------------------------------------- 1 | import dash 2 | import dash_table 3 | import pandas as pd 4 | 5 | df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/solar.csv') 6 | 7 | app = dash.Dash(__name__) 8 | 9 | app.layout = dash_table.DataTable( 10 | id='table', 11 | columns=[{"name": i, "id": i} for i in df.columns], 12 | data=df.to_dict("rows"), 13 | ) 14 | 15 | if __name__ == '__main__': 16 | app.run_server(debug=True) 17 | 18 | -------------------------------------------------------------------------------- /dash_examples/3.py: 
-------------------------------------------------------------------------------- 1 | import dash 2 | import dash_core_components as dcc 3 | import dash_html_components as html 4 | from dash.dependencies import Input, Output 5 | 6 | external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css'] 7 | 8 | app = dash.Dash(__name__, external_stylesheets=external_stylesheets) 9 | 10 | app.layout = html.Div([ 11 | dcc.Input(id='my-id', value='initial value', type='text'), 12 | html.Div(id='my-div') 13 | ]) 14 | 15 | 16 | @app.callback( 17 | Output(component_id='my-div', component_property='children'), 18 | [Input(component_id='my-id', component_property='value')] 19 | ) 20 | def update_output_div(input_value): 21 | return 'You\'ve entered "{}"'.format(input_value) 22 | 23 | 24 | if __name__ == '__main__': 25 | app.run_server(debug=True) 26 | -------------------------------------------------------------------------------- /dash_examples/4.py: -------------------------------------------------------------------------------- 1 | import dash 2 | import dash_core_components as dcc 3 | import dash_html_components as html 4 | import plotly.graph_objs as go 5 | import pandas as pd 6 | 7 | app = dash.Dash() 8 | 9 | df = pd.read_csv( 10 | 'https://raw.githubusercontent.com/' 11 | 'plotly/datasets/master/' 12 | '1962_2006_walmart_store_openings.csv') 13 | 14 | app.layout = html.Div([ 15 | html.H1('Walmart Store Openings'), 16 | html.Div(id='text-content'), 17 | dcc.Graph(id='map', figure={ 18 | 'data': [{ 19 | 'lat': df['LAT'], 20 | 'lon': df['LON'], 21 | 'marker': { 22 | 'color': df['YEAR'], 23 | 'size': 8, 24 | 'opacity': 0.6 25 | }, 26 | 'customdata': df['storenum'], 27 | 'type': 'scattermapbox' 28 | }], 29 | 'layout': { 30 | 'mapbox': { 31 | 'accesstoken': 'pk.eyJ1IjoiY2hyaWRkeXAiLCJhIjoiY2ozcGI1MTZ3MDBpcTJ3cXR4b3owdDQwaCJ9.8jpMunbKjdq1anXwU5gxIw' 32 | }, 33 | 'hovermode': 'closest', 34 | 'margin': {'l': 0, 'r': 0, 'b': 0, 't': 0} 35 | } 36 | }) 37 | ]) 38 | 39 | @app.callback( 40 | dash.dependencies.Output('text-content', 'children'), 41 | [dash.dependencies.Input('map', 'hoverData')]) 42 | def update_text(hoverData): 43 | s = df[df['storenum'] == hoverData['points'][0]['customdata']] 44 | return html.H3( 45 | 'The {}, {} {} opened in {}'.format( 46 | s.iloc[0]['STRCITY'], 47 | s.iloc[0]['STRSTATE'], 48 | s.iloc[0]['type_store'], 49 | s.iloc[0]['YEAR'] 50 | ) 51 | ) 52 | 53 | app.css.append_css({ 54 | 'external_url': 'https://codepen.io/chriddyp/pen/bWLwgP.css' 55 | }) 56 | 57 | if __name__ == '__main__': 58 | app.run_server(debug=True) 59 | -------------------------------------------------------------------------------- /img/PDSH.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MichalKorzycki/WarsztatPythonDataScience/8fd007d1f780bf09d2c8f151c62034c40ad0f471/img/PDSH.png -------------------------------------------------------------------------------- /img/djpatel.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MichalKorzycki/WarsztatPythonDataScience/8fd007d1f780bf09d2c8f151c62034c40ad0f471/img/djpatel.jpg -------------------------------------------------------------------------------- /img/github.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MichalKorzycki/WarsztatPythonDataScience/8fd007d1f780bf09d2c8f151c62034c40ad0f471/img/github.png 
-------------------------------------------------------------------------------- /img/growth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MichalKorzycki/WarsztatPythonDataScience/8fd007d1f780bf09d2c8f151c62034c40ad0f471/img/growth.png -------------------------------------------------------------------------------- /img/kernel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MichalKorzycki/WarsztatPythonDataScience/8fd007d1f780bf09d2c8f151c62034c40ad0f471/img/kernel.png -------------------------------------------------------------------------------- /img/launch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MichalKorzycki/WarsztatPythonDataScience/8fd007d1f780bf09d2c8f151c62034c40ad0f471/img/launch.png -------------------------------------------------------------------------------- /img/over.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MichalKorzycki/WarsztatPythonDataScience/8fd007d1f780bf09d2c8f151c62034c40ad0f471/img/over.png -------------------------------------------------------------------------------- /img/pobrane.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MichalKorzycki/WarsztatPythonDataScience/8fd007d1f780bf09d2c8f151c62034c40ad0f471/img/pobrane.png -------------------------------------------------------------------------------- /img/projection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MichalKorzycki/WarsztatPythonDataScience/8fd007d1f780bf09d2c8f151c62034c40ad0f471/img/projection.png -------------------------------------------------------------------------------- /img/rossum.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MichalKorzycki/WarsztatPythonDataScience/8fd007d1f780bf09d2c8f151c62034c40ad0f471/img/rossum.jpg -------------------------------------------------------------------------------- /img/se.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MichalKorzycki/WarsztatPythonDataScience/8fd007d1f780bf09d2c8f151c62034c40ad0f471/img/se.jpg -------------------------------------------------------------------------------- /img/tags.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MichalKorzycki/WarsztatPythonDataScience/8fd007d1f780bf09d2c8f151c62034c40ad0f471/img/tags.png -------------------------------------------------------------------------------- /img/tortoise1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MichalKorzycki/WarsztatPythonDataScience/8fd007d1f780bf09d2c8f151c62034c40ad0f471/img/tortoise1.png -------------------------------------------------------------------------------- /img/tortoise2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MichalKorzycki/WarsztatPythonDataScience/8fd007d1f780bf09d2c8f151c62034c40ad0f471/img/tortoise2.png -------------------------------------------------------------------------------- /img/unbalanced.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/MichalKorzycki/WarsztatPythonDataScience/8fd007d1f780bf09d2c8f151c62034c40ad0f471/img/unbalanced.png -------------------------------------------------------------------------------- /img/ver.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MichalKorzycki/WarsztatPythonDataScience/8fd007d1f780bf09d2c8f151c62034c40ad0f471/img/ver.png -------------------------------------------------------------------------------- /ml_map.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MichalKorzycki/WarsztatPythonDataScience/8fd007d1f780bf09d2c8f151c62034c40ad0f471/ml_map.png -------------------------------------------------------------------------------- /odm.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MichalKorzycki/WarsztatPythonDataScience/8fd007d1f780bf09d2c8f151c62034c40ad0f471/odm.txt.gz -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # This file may be used to create an environment using: 2 | # $ conda create --name --file 3 | # platform: win-64 4 | _anaconda_depends=2019.03=py37_0 5 | alabaster=0.7.12=py37_0 6 | anaconda=custom=py37_1 7 | anaconda-client=1.7.2=py37_0 8 | anaconda-project=0.8.4=py_0 9 | asn1crypto=1.2.0=py37_0 10 | astroid=2.3.2=py37_0 11 | astropy=3.2.3=py37he774522_0 12 | atomicwrites=1.3.0=py37_1 13 | attrs=19.3.0=py_0 14 | automat=0.8.0=py_0 15 | babel=2.7.0=py_0 16 | backcall=0.1.0=py37_0 17 | backports=1.0=py_2 18 | backports.os=0.1.1=py37_0 19 | backports.shutil_get_terminal_size=1.0.0=py37_2 20 | basemap=1.2.0=py37h4e5d7af_0 21 | basemap-data-hires=1.2.0=0 22 | bcrypt=3.1.7=py37he774522_0 23 | beautifulsoup4=4.8.2=py37_0 24 | bitarray=1.0.1=py37he774522_0 25 | bkcharts=0.2=py37_0 26 | blas=1.0=mkl 27 | bleach=3.1.0=py37_0 28 | blosc=1.16.3=h7bd577a_0 29 | bokeh=1.4.0=py37_0 30 | boto=2.49.0=py37_0 31 | bottleneck=1.2.1=py37h452e1ab_1 32 | bzip2=1.0.8=he774522_0 33 | ca-certificates=2020.1.1=0 34 | certifi=2019.11.28=py37_1 35 | cffi=1.13.1=py37h7a1dbc1_0 36 | chardet=3.0.4=py37_1003 37 | click=7.0=py37_0 38 | cloudpickle=1.2.2=py_0 39 | clyent=1.2.2=py37_1 40 | colorama=0.4.1=py37_0 41 | comtypes=1.1.7=py37_0 42 | console_shortcut=0.1.1=3 43 | constantly=15.1.0=py37h28b3542_0 44 | contextlib2=0.6.0=py_0 45 | cryptography=2.8=py37h7a1dbc1_0 46 | cssselect=1.1.0=py_0 47 | curl=7.65.3=h2a8f88b_0 48 | cycler=0.10.0=py37_0 49 | cython=0.29.13=py37ha925a31_0 50 | cytoolz=0.10.0=py37he774522_0 51 | dask=2.7.0=py_0 52 | dask-core=2.7.0=py_0 53 | decorator=4.4.1=py_0 54 | defusedxml=0.6.0=py_0 55 | distributed=2.7.0=py_0 56 | docutils=0.15.2=py37_0 57 | entrypoints=0.3=py37_0 58 | et_xmlfile=1.0.1=py37_0 59 | fastcache=1.1.0=py37he774522_0 60 | filelock=3.0.12=py_0 61 | flask=1.1.1=py_0 62 | freetype=2.9.1=ha9979f8_1 63 | fsspec=0.5.2=py_0 64 | geos=3.6.2=h9ef7328_2 65 | get_terminal_size=1.0.0=h38e98db_0 66 | gevent=1.4.0=py37he774522_0 67 | glob2=0.7=py_0 68 | greenlet=0.4.15=py37hfa6e2cd_0 69 | h5py=2.9.0=py37h5e291fa_0 70 | hdf5=1.10.4=h7ebc959_0 71 | heapdict=1.0.1=py_0 72 | html5lib=1.0.1=py37_0 73 | hyperlink=19.0.0=py_0 74 | icc_rt=2019.0.0=h0cc432a_1 75 | icu=58.2=ha66f8fd_1 76 | idna=2.8=py37_0 77 | 
imageio=2.6.1=py37_0 78 | imagesize=1.1.0=py37_0 79 | importlib_metadata=0.23=py37_0 80 | incremental=17.5.0=py37_0 81 | intel-openmp=2019.4=245 82 | ipykernel=5.1.3=py37h39e3cac_0 83 | ipython=7.12.0=py37h5ca1d4c_0 84 | ipython_genutils=0.2.0=py37_0 85 | ipywidgets=7.5.1=py_0 86 | isort=4.3.21=py37_0 87 | itsdangerous=1.1.0=py37_0 88 | jdcal=1.4.1=py_0 89 | jedi=0.15.1=py37_0 90 | jinja2=2.10.3=py_0 91 | joblib=0.14.0=py_0 92 | jpeg=9b=hb83a4c4_2 93 | json5=0.8.5=py_0 94 | jsonschema=3.1.1=py37_0 95 | jupyter=1.0.0=py37_7 96 | jupyter_client=5.3.4=py37_0 97 | jupyter_console=6.0.0=py37_0 98 | jupyter_core=4.6.1=py37_0 99 | jupyterlab=1.1.4=pyhf63ae98_0 100 | jupyterlab_server=1.0.6=py_0 101 | keyring=18.0.0=py37_0 102 | kiwisolver=1.1.0=py37ha925a31_0 103 | krb5=1.16.1=hc04afaa_7 104 | lazy-object-proxy=1.4.3=py37he774522_0 105 | libarchive=3.3.3=h0643e63_5 106 | libcurl=7.65.3=h2a8f88b_0 107 | libiconv=1.15=h1df5818_7 108 | liblief=0.9.0=ha925a31_2 109 | libpng=1.6.37=h2a8f88b_0 110 | libsodium=1.0.16=h9d3ae62_0 111 | libssh2=1.8.2=h7a1dbc1_0 112 | libtiff=4.1.0=h56a325e_0 113 | libxml2=2.9.9=h464c3ec_0 114 | libxslt=1.1.33=h579f668_0 115 | llvmlite=0.30.0=py37ha925a31_0 116 | locket=0.2.0=py37_1 117 | lxml=4.4.1=py37h1350720_0 118 | lz4-c=1.8.1.2=h2fa13f4_0 119 | lzo=2.10=h6df0209_2 120 | m2w64-gcc-libgfortran=5.3.0=6 121 | m2w64-gcc-libs=5.3.0=7 122 | m2w64-gcc-libs-core=5.3.0=7 123 | m2w64-gmp=6.1.0=2 124 | m2w64-libwinpthread-git=5.0.0.4634.697f757=2 125 | markupsafe=1.1.1=py37he774522_0 126 | matplotlib=3.1.3=py37_0 127 | matplotlib-base=3.1.3=py37h64f37c6_0 128 | mccabe=0.6.1=py37_1 129 | menuinst=1.4.16=py37he774522_0 130 | mistune=0.8.4=py37he774522_0 131 | mkl=2019.4=245 132 | mkl-service=2.3.0=py37hb782905_0 133 | mkl_fft=1.0.15=py37h14836fe_0 134 | mkl_random=1.1.0=py37h675688f_0 135 | mock=3.0.5=py37_0 136 | more-itertools=7.2.0=py37_0 137 | mpmath=1.1.0=py37_0 138 | msgpack-python=0.6.1=py37h74a9793_1 139 | msys2-conda-epoch=20160418=1 140 | multipledispatch=0.6.0=py37_0 141 | multitasking=0.0.9=pypi_0 142 | nbconvert=5.6.1=py37_0 143 | nbformat=4.4.0=py37_0 144 | networkx=2.4=py_0 145 | nltk=3.4.5=py37_0 146 | nose=1.3.7=py37_2 147 | notebook=6.0.2=py37_0 148 | numba=0.46.0=py37hf9181ef_0 149 | numexpr=2.7.0=py37hdce8814_0 150 | numpy=1.17.3=py37h4ceb530_0 151 | numpy-base=1.17.3=py37hc3f5095_0 152 | numpydoc=0.9.1=py_0 153 | olefile=0.46=py37_0 154 | openpyxl=3.0.0=py_0 155 | openssl=1.1.1e=he774522_0 156 | packaging=19.2=py_0 157 | pandas=0.25.2=py37ha925a31_0 158 | pandoc=2.2.3.2=0 159 | pandocfilters=1.4.2=py37_1 160 | parsel=1.5.2=py37_0 161 | parso=0.5.1=py_0 162 | partd=1.0.0=py_0 163 | path.py=12.0.1=py_0 164 | pathlib2=2.3.5=py37_0 165 | patsy=0.5.1=py37_0 166 | pep8=1.7.1=py37_0 167 | pickleshare=0.7.5=py37_0 168 | pillow=6.2.1=py37hdc69c19_0 169 | pip=19.3.1=py37_0 170 | pkginfo=1.5.0.1=py37_0 171 | pluggy=0.13.0=py37_0 172 | ply=3.11=py37_0 173 | powershell_shortcut=0.0.1=2 174 | proj4=5.2.0=ha925a31_1 175 | prometheus_client=0.7.1=py_0 176 | prompt_toolkit=2.0.10=py_0 177 | psutil=5.6.5=py37he774522_0 178 | py=1.8.0=py37_0 179 | py-lief=0.9.0=py37ha925a31_2 180 | pyasn1=0.4.8=py_0 181 | pyasn1-modules=0.2.7=py_0 182 | pycodestyle=2.5.0=py37_0 183 | pycosat=0.6.3=py37hfa6e2cd_0 184 | pycparser=2.19=py37_0 185 | pycrypto=2.6.1=py37hfa6e2cd_9 186 | pycurl=7.43.0.3=py37h7a1dbc1_0 187 | pydispatcher=2.0.5=py37_1 188 | pyflakes=2.1.1=py37_0 189 | pygments=2.4.2=py_0 190 | pyhamcrest=1.9.0=py37_2 191 | pylint=2.4.3=py37_0 192 | pyodbc=4.0.27=py37ha925a31_0 193 | 
pyopenssl=19.0.0=py37_0 194 | pyparsing=2.4.4=py_0 195 | pyproj=1.9.6=py37h6782396_0 196 | pyqt=5.9.2=py37h6538335_2 197 | pyreadline=2.1=py37_1 198 | pyrsistent=0.15.4=py37he774522_0 199 | pyshp=2.1.0=py_0 200 | pysocks=1.7.1=py37_0 201 | pytables=3.6.1=py37h1da0976_0 202 | pytest=5.2.2=py37_0 203 | pytest-arraydiff=0.3=py37h39e3cac_0 204 | pytest-astropy=0.5.0=py37_0 205 | pytest-doctestplus=0.4.0=py_0 206 | pytest-openfiles=0.4.0=py_0 207 | pytest-remotedata=0.3.2=py37_0 208 | pytest-runner=5.2=py_0 209 | python=3.7.5=h8c8aaf0_0 210 | python-dateutil=2.8.1=py_0 211 | python-libarchive-c=2.8=py37_13 212 | pytz=2019.3=py_0 213 | pywavelets=1.1.1=py37he774522_0 214 | pywin32=223=py37hfa6e2cd_1 215 | pywinpty=0.5.5=py37_1000 216 | pyyaml=5.1.2=py37he774522_0 217 | pyzmq=18.1.0=py37ha925a31_0 218 | qt=5.9.7=vc14h73c81de_0 219 | qtawesome=0.6.0=py_0 220 | qtconsole=4.5.5=py_0 221 | qtpy=1.9.0=py_0 222 | queuelib=1.5.0=py37_0 223 | requests=2.23.0=py37_0 224 | rope=0.14.0=py_0 225 | ruamel_yaml=0.15.46=py37hfa6e2cd_0 226 | scikit-image=0.15.0=py37ha925a31_0 227 | scikit-learn=0.21.3=py37h6288b17_0 228 | scipy=1.3.1=py37h29ff71c_0 229 | scrapy=1.6.0=py37_0 230 | seaborn=0.10.0=py_0 231 | send2trash=1.5.0=py37_0 232 | service_identity=18.1.0=py37h28b3542_0 233 | setuptools=41.6.0=py37_0 234 | simplegeneric=0.8.1=py37_2 235 | singledispatch=3.4.0.3=py37_0 236 | sip=4.19.8=py37h6538335_0 237 | six=1.13.0=py37_0 238 | snappy=1.1.7=h777316e_3 239 | snowballstemmer=2.0.0=py_0 240 | sortedcollections=1.1.2=py37_0 241 | sortedcontainers=2.1.0=py37_0 242 | soupsieve=1.9.3=py37_0 243 | sphinx=2.2.1=py_0 244 | sphinxcontrib=1.0=py37_1 245 | sphinxcontrib-applehelp=1.0.1=py_0 246 | sphinxcontrib-devhelp=1.0.1=py_0 247 | sphinxcontrib-htmlhelp=1.0.2=py_0 248 | sphinxcontrib-jsmath=1.0.1=py_0 249 | sphinxcontrib-qthelp=1.0.2=py_0 250 | sphinxcontrib-serializinghtml=1.1.3=py_0 251 | sphinxcontrib-websupport=1.1.2=py_0 252 | spyder=3.3.6=py37_0 253 | spyder-kernels=0.5.2=py37_0 254 | sqlalchemy=1.3.10=py37he774522_0 255 | sqlite=3.30.1=he774522_0 256 | statsmodels=0.10.1=py37h8c2d366_0 257 | sympy=1.4=py37_0 258 | tbb=2019.4=h74a9793_0 259 | tblib=1.5.0=py_0 260 | terminado=0.8.2=py37_0 261 | testpath=0.4.2=py37_0 262 | tk=8.6.8=hfa6e2cd_0 263 | toolz=0.10.0=py_0 264 | tornado=6.0.3=py37he774522_0 265 | tqdm=4.36.1=py_0 266 | traitlets=4.3.3=py37_0 267 | twisted=19.10.0=py37he774522_0 268 | unicodecsv=0.14.1=py37_0 269 | urllib3=1.24.2=py37_0 270 | vc=14.1=h0510ff6_4 271 | vs2015_runtime=14.16.27012=hf0eaf9b_0 272 | w3lib=1.21.0=py_0 273 | wcwidth=0.1.7=py37_0 274 | webencodings=0.5.1=py37_1 275 | werkzeug=0.16.0=py_0 276 | wheel=0.33.6=py37_0 277 | widgetsnbextension=3.5.1=py37_0 278 | win_inet_pton=1.1.0=py37_0 279 | win_unicode_console=0.5=py37_0 280 | wincertstore=0.2=py37_0 281 | winpty=0.4.3=4 282 | wrapt=1.11.2=py37he774522_0 283 | xlrd=1.2.0=py37_0 284 | xlsxwriter=1.2.2=py_0 285 | xlwings=0.16.0=py37_0 286 | xlwt=1.3.0=py37_0 287 | xz=5.2.4=h2fa13f4_4 288 | yaml=0.1.7=hc54c509_2 289 | yfinance=0.1.54=pypi_0 290 | zeromq=4.3.1=h33f27b4_3 291 | zict=1.0.0=py_0 292 | zipp=0.6.0=py_0 293 | zlib=1.2.11=h62dcd97_3 294 | zope=1.0=py37_1 295 | zope.interface=4.7.1=py37he774522_0 296 | zstd=1.3.7=h508b16e_0 297 | -------------------------------------------------------------------------------- /simple_script.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | if __name__ == '__main__': 4 | rez = {} 5 | for x in sys.argv[1:]: 6 | rez[x] = rez.get(x,0)+1 7 | 
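    # rez now maps each distinct command-line argument to its occurrence count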
print (rez) -------------------------------------------------------------------------------- /svm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MichalKorzycki/WarsztatPythonDataScience/8fd007d1f780bf09d2c8f151c62034c40ad0f471/svm.png -------------------------------------------------------------------------------- /wyklad11/odm.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MichalKorzycki/WarsztatPythonDataScience/8fd007d1f780bf09d2c8f151c62034c40ad0f471/wyklad11/odm.txt.gz -------------------------------------------------------------------------------- /wyklad7/myspider.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | 3 | class BlogSpider(scrapy.Spider): 4 | name = 'blogspider' 5 | start_urls = ['https://blog.scrapinghub.com'] 6 | 7 | def parse(self, response): 8 | for title in response.css('.post-header>h2'): 9 | yield {'title': title.css('a ::text').get()} 10 | 11 | for next_page in response.css('a.next-posts-link'): 12 | yield response.follow(next_page, self.parse) -------------------------------------------------------------------------------- /wyklad7/scrapy1.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | from scrapy.crawler import CrawlerProcess 3 | from scrapy.http import Request 4 | 5 | class PythonEventsSpider(scrapy.Spider): 6 | name = 'pythoneventsspider' 7 | 8 | start_urls = ['https://www.python.org/events/python-events/',] 9 | found_events = [] 10 | 11 | def parse(self, response): 12 | for event in response.xpath('//ul[contains(@class, "list-recent-events")]/li'): 13 | event_details = dict() 14 | event_details['name'] = event.xpath('h3[@class="event-title"]/a/text()').extract_first() 15 | event_details['location'] = event.xpath('p/span[@class="event-location"]/text()').extract_first() 16 | event_details['time'] = event.xpath('p/time/text()').extract_first() 17 | self.found_events.append(event_details) 18 | 19 | print("Scrapy Example 1") 20 | process = CrawlerProcess({ 'LOG_LEVEL': 'ERROR'}) 21 | process.crawl(PythonEventsSpider) 22 | spider = next(iter(process.crawlers)).spider 23 | process.start() 24 | 25 | for event in spider.found_events: 26 | print(event) -------------------------------------------------------------------------------- /xvi.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MichalKorzycki/WarsztatPythonDataScience/8fd007d1f780bf09d2c8f151c62034c40ad0f471/xvi.png --------------------------------------------------------------------------------