├── #1 Load a CSV in Python Pandas.ipynb ├── #2 Extract a sub-string in Python Pandas.ipynb ├── #3 Pivot tables in Python Pandas.ipynb ├── #4 Header cleanup (reset_index, droplevel, rename) in Python Pandas.ipynb ├── #5 Read Excel in Python Pandas.ipynb ├── #6 Merge two tables in Python Pandas.ipynb ├── #7 Filter and export in Python Pandas.ipynb ├── Analytics Big Beach Spring Break.ipynb ├── Cleanup-2-Analytics-API-and-Custom-Search-Crawl.ipynb ├── Cleanup-Analytics.ipynb ├── Cleanup.ipynb ├── README.md └── Read News XML Sitemap with Python + Pandas.ipynb /#1 Load a CSV in Python Pandas.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# Check: \n", 10 | "# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html\n", 11 | "#\n", 12 | "# Video about this notebook: \n", 13 | "# https://youtu.be/iq1qNmsBlXg " 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 38, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "import pandas as pd" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 39, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "dfTestCSV = pd.DataFrame(pd.read_csv('./dfanalyticstestdata1.csv', header=2, sep=\"\\t\", index_col=0))" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 40, 37 | "metadata": {}, 38 | "outputs": [ 39 | { 40 | "data": { 41 | "text/html": [ 42 | "
\n", 43 | "\n", 56 | "\n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 
212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " 
\n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | "
ga:pagePathga:pageviews
0/auto/-Oh-slug-slug-why-are-you-slug-id31218.html3
1/auto/-onboarding-story-id26545.html1
2/auto/abgefahren/Test-Zattoo-Sandro-Überzeile-...10
3/auto/autosalon/autosalon-story-ferrari-id3146...8
4/auto/Über-999-Stufen-und-durch-99-Haarnadel-K...2
5/auto/ueber-999-stufen-und-durch-99-haarnadel-...4
6/digital/Superlange-Überzeile-eines-Artikels-a...2
7/digital/thomas-probiert-was-labeltest-id31705...3
8/incoming/-CAS-id31002.html1
9/incoming/-kathy-Slug-id31000.html3
10/incoming/-Kathy-slug-id31004.html5
11/incoming/-Zattoo-test-id30914.html1
12/incoming/59-characters-59-characters-id30984....7
13/incoming/75-characters-title-75-characters-ti...2
14/incoming/8-Vermisste-in-Bondo-8-Vermisste-in-...1
15/incoming/A-test-4-A-test-4-id31112.html1
16/incoming/Abstimmungskampf-um-die-Rentenreform...2
17/incoming/ade-testing-id31460.html1
18/incoming/da-id31653.html11
19/incoming/das-ist-ein-test-artikel-iframe-id31...1
20/incoming/Detailhandel-verliert-1-Prozent-Umsa...1
21/incoming/e2e-test-catchword-e2e-id31001.html11
22/incoming/e2e-test-catchword-e2e-id31001.html'1
23/incoming/e2e-test-catchword-e2e-testsite-plea...96
24/incoming/form-story-form-story-id31887.html3
25/incoming/fux-ueber-sex-f-id6260.html1
26/incoming/iframe-article-super-iframe-article-...2
27/incoming/instagram-test-embeded-main-element-...1
28/incoming/ise-embedded-content-test-id31878.html1
29/incoming/ise-publish-date-test-ise-test-artic...6
.........
164/sport/fussball/wm/fussball-experte-ilja-kaenz...2
165/sport/fussball/wm/gehts-gegen-die-schweiz-alb...12
166/sport/fussball/wm/nach-geplatztem-wm-final-tr...1
167/sport/fussball/wm/wm-2018-schweden-schweiz-id...1
168/sport/mundgeruch-bei-der-royal-wedding-meghan...10
169/sport/presse-muss-demuetiger-sein-wm-2018-hat...12
170/sport/rad/dadsa-id31271.html6
171/sport/rad/Giro-Zoff-vor-Zeitfahren-Dumoulin-«...7
172/sport/reiten/Was-für-ein-Exploit-Monkey-Video...2
173/sport/schwingen/Knatsch-im-Reich-der-Bösen-Ka...5
174/sport/so-lernten-sich-lara-und-valon-kennen-l...1
175/sport/sport-tagesticker-id5539965.html14
176/sport/tennis/asdasd-id31325.html9
177/sport/tennis/glanzloser-sieg-gegen-qualifikan...1
178/sport/tennis/Regen-verschiebt-Duell-mit-Donal...1
179/sport/tennis/Timea-Bacsinszky-im-Interview-Ic...4
180/sport/tennis/timea-bacsinszky-im-interview-me...43
181/sport/tennis/timea-bacsinszky-im-interview-me...1
182/sport/tennis/timea-bacsinszky-im-interview-me...18
183/sport/tennis/timea-bacsinszky-im-interview-me...1
184/sport/testing-those-teasers-super-title-testi...38
185/sport/this-is-a-super-title-this-is-a-meta-ti...1
186/sport/thomas-markle-laesst-es-sich-nach-herz-...4
187/sport/Touristiker-jubeln-Trumps-Besuch-ist-20...1
188/sport/Tunisia-Gamiz-vs-Kennel-long-title-lore...2
189/sport/videoformate/dino/-id2483959.html1
190/video/viral/ein-mathe-model-und-eine-wunder-w...1
191/video/viral/ein-mathe-model-und-eine-wunder-w...3
192/vr/ein-tauziehen-um-die-trepa-mine-in-mitrovi...1
193/wirtschaft/wirtschaft-test-story-id31609.html5
\n", 372 | "

194 rows × 2 columns

\n", 373 | "
" 374 | ], 375 | "text/plain": [ 376 | " ga:pagePath ga:pageviews\n", 377 | "0 /auto/-Oh-slug-slug-why-are-you-slug-id31218.html 3\n", 378 | "1 /auto/-onboarding-story-id26545.html 1\n", 379 | "2 /auto/abgefahren/Test-Zattoo-Sandro-Überzeile-... 10\n", 380 | "3 /auto/autosalon/autosalon-story-ferrari-id3146... 8\n", 381 | "4 /auto/Über-999-Stufen-und-durch-99-Haarnadel-K... 2\n", 382 | "5 /auto/ueber-999-stufen-und-durch-99-haarnadel-... 4\n", 383 | "6 /digital/Superlange-Überzeile-eines-Artikels-a... 2\n", 384 | "7 /digital/thomas-probiert-was-labeltest-id31705... 3\n", 385 | "8 /incoming/-CAS-id31002.html 1\n", 386 | "9 /incoming/-kathy-Slug-id31000.html 3\n", 387 | "10 /incoming/-Kathy-slug-id31004.html 5\n", 388 | "11 /incoming/-Zattoo-test-id30914.html 1\n", 389 | "12 /incoming/59-characters-59-characters-id30984.... 7\n", 390 | "13 /incoming/75-characters-title-75-characters-ti... 2\n", 391 | "14 /incoming/8-Vermisste-in-Bondo-8-Vermisste-in-... 1\n", 392 | "15 /incoming/A-test-4-A-test-4-id31112.html 1\n", 393 | "16 /incoming/Abstimmungskampf-um-die-Rentenreform... 2\n", 394 | "17 /incoming/ade-testing-id31460.html 1\n", 395 | "18 /incoming/da-id31653.html 11\n", 396 | "19 /incoming/das-ist-ein-test-artikel-iframe-id31... 1\n", 397 | "20 /incoming/Detailhandel-verliert-1-Prozent-Umsa... 1\n", 398 | "21 /incoming/e2e-test-catchword-e2e-id31001.html 11\n", 399 | "22 /incoming/e2e-test-catchword-e2e-id31001.html' 1\n", 400 | "23 /incoming/e2e-test-catchword-e2e-testsite-plea... 96\n", 401 | "24 /incoming/form-story-form-story-id31887.html 3\n", 402 | "25 /incoming/fux-ueber-sex-f-id6260.html 1\n", 403 | "26 /incoming/iframe-article-super-iframe-article-... 2\n", 404 | "27 /incoming/instagram-test-embeded-main-element-... 1\n", 405 | "28 /incoming/ise-embedded-content-test-id31878.html 1\n", 406 | "29 /incoming/ise-publish-date-test-ise-test-artic... 6\n", 407 | ".. ... ...\n", 408 | "164 /sport/fussball/wm/fussball-experte-ilja-kaenz... 
2\n", 409 | "165 /sport/fussball/wm/gehts-gegen-die-schweiz-alb... 12\n", 410 | "166 /sport/fussball/wm/nach-geplatztem-wm-final-tr... 1\n", 411 | "167 /sport/fussball/wm/wm-2018-schweden-schweiz-id... 1\n", 412 | "168 /sport/mundgeruch-bei-der-royal-wedding-meghan... 10\n", 413 | "169 /sport/presse-muss-demuetiger-sein-wm-2018-hat... 12\n", 414 | "170 /sport/rad/dadsa-id31271.html 6\n", 415 | "171 /sport/rad/Giro-Zoff-vor-Zeitfahren-Dumoulin-«... 7\n", 416 | "172 /sport/reiten/Was-für-ein-Exploit-Monkey-Video... 2\n", 417 | "173 /sport/schwingen/Knatsch-im-Reich-der-Bösen-Ka... 5\n", 418 | "174 /sport/so-lernten-sich-lara-und-valon-kennen-l... 1\n", 419 | "175 /sport/sport-tagesticker-id5539965.html1 4\n", 420 | "176 /sport/tennis/asdasd-id31325.html 9\n", 421 | "177 /sport/tennis/glanzloser-sieg-gegen-qualifikan... 1\n", 422 | "178 /sport/tennis/Regen-verschiebt-Duell-mit-Donal... 1\n", 423 | "179 /sport/tennis/Timea-Bacsinszky-im-Interview-Ic... 4\n", 424 | "180 /sport/tennis/timea-bacsinszky-im-interview-me... 43\n", 425 | "181 /sport/tennis/timea-bacsinszky-im-interview-me... 1\n", 426 | "182 /sport/tennis/timea-bacsinszky-im-interview-me... 18\n", 427 | "183 /sport/tennis/timea-bacsinszky-im-interview-me... 1\n", 428 | "184 /sport/testing-those-teasers-super-title-testi... 38\n", 429 | "185 /sport/this-is-a-super-title-this-is-a-meta-ti... 1\n", 430 | "186 /sport/thomas-markle-laesst-es-sich-nach-herz-... 4\n", 431 | "187 /sport/Touristiker-jubeln-Trumps-Besuch-ist-20... 1\n", 432 | "188 /sport/Tunisia-Gamiz-vs-Kennel-long-title-lore... 2\n", 433 | "189 /sport/videoformate/dino/-id2483959.html 1\n", 434 | "190 /video/viral/ein-mathe-model-und-eine-wunder-w... 1\n", 435 | "191 /video/viral/ein-mathe-model-und-eine-wunder-w... 3\n", 436 | "192 /vr/ein-tauziehen-um-die-trepa-mine-in-mitrovi... 
1\n", 437 | "193 /wirtschaft/wirtschaft-test-story-id31609.html 5\n", 438 | "\n", 439 | "[194 rows x 2 columns]" 440 | ] 441 | }, 442 | "execution_count": 40, 443 | "metadata": {}, 444 | "output_type": "execute_result" 445 | } 446 | ], 447 | "source": [ 448 | "dfTestCSV" 449 | ] 450 | }, 451 | { 452 | "cell_type": "code", 453 | "execution_count": null, 454 | "metadata": {}, 455 | "outputs": [], 456 | "source": [] 457 | } 458 | ], 459 | "metadata": { 460 | "kernelspec": { 461 | "display_name": "Python 3", 462 | "language": "python", 463 | "name": "python3" 464 | }, 465 | "language_info": { 466 | "codemirror_mode": { 467 | "name": "ipython", 468 | "version": 3 469 | }, 470 | "file_extension": ".py", 471 | "mimetype": "text/x-python", 472 | "name": "python", 473 | "nbconvert_exporter": "python", 474 | "pygments_lexer": "ipython3", 475 | "version": "3.7.2" 476 | } 477 | }, 478 | "nbformat": 4, 479 | "nbformat_minor": 2 480 | } 481 | -------------------------------------------------------------------------------- /#2 Extract a sub-string in Python Pandas.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# Check documentation: \n", 10 | "# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.str.extract.html\n", 11 | "#\n", 12 | "# Video about this notebook: \n", 13 | "# https://www.youtube.com/watch?v=xhNSDQ1dbBA" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 58, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "import pandas as pd" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 59, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "dfTestCSV = pd.DataFrame(pd.read_csv('./dfanalyticstestdata1.csv', header=2, sep=\"\\t\", index_col=0))" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | 
"execution_count": 60, 37 | "metadata": {}, 38 | "outputs": [ 39 | { 40 | "data": { 41 | "text/html": [ 42 | "
\n", 43 | "\n", 56 | "\n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | "
ga:pagePathga:pageviewsID_GA
0/auto/-Oh-slug-slug-why-are-you-slug-id31218.html331218
1/auto/-onboarding-story-id26545.html126545
2/auto/abgefahren/Test-Zattoo-Sandro-Überzeile-...1030770
3/auto/autosalon/autosalon-story-ferrari-id3146...831463
4/auto/Über-999-Stufen-und-durch-99-Haarnadel-K...231288
\n", 98 | "
" 99 | ], 100 | "text/plain": [ 101 | " ga:pagePath ga:pageviews ID_GA\n", 102 | "0 /auto/-Oh-slug-slug-why-are-you-slug-id31218.html 3 31218\n", 103 | "1 /auto/-onboarding-story-id26545.html 1 26545\n", 104 | "2 /auto/abgefahren/Test-Zattoo-Sandro-Überzeile-... 10 30770\n", 105 | "3 /auto/autosalon/autosalon-story-ferrari-id3146... 8 31463\n", 106 | "4 /auto/Über-999-Stufen-und-durch-99-Haarnadel-K... 2 31288" 107 | ] 108 | }, 109 | "execution_count": 60, 110 | "metadata": {}, 111 | "output_type": "execute_result" 112 | } 113 | ], 114 | "source": [ 115 | "dfTestCSV['ID_GA'] = dfTestCSV['ga:pagePath'].str.extract('(?:.*id)([0-9]+)(?:.html)')\n", 116 | "dfTestCSV.head(5)" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "metadata": {}, 137 | "outputs": [], 138 | "source": [] 139 | } 140 | ], 141 | "metadata": { 142 | "kernelspec": { 143 | "display_name": "Python 3", 144 | "language": "python", 145 | "name": "python3" 146 | }, 147 | "language_info": { 148 | "codemirror_mode": { 149 | "name": "ipython", 150 | "version": 3 151 | }, 152 | "file_extension": ".py", 153 | "mimetype": "text/x-python", 154 | "name": "python", 155 | "nbconvert_exporter": "python", 156 | "pygments_lexer": "ipython3", 157 | "version": "3.7.2" 158 | } 159 | }, 160 | "nbformat": 4, 161 | "nbformat_minor": 2 162 | } 163 | -------------------------------------------------------------------------------- /#3 Pivot tables in Python Pandas.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# Check documentation: \n", 10 
| "# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.pivot.html\n", 11 | "#\n", 12 | "# Video about this notebook: \n", 13 | "# https://youtu.be/E5v-kUi_NUI" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 96, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "import pandas as pd" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 97, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "dfTestCSV = pd.DataFrame(pd.read_csv('./dfanalyticstestdata1.csv', header=2, sep=\"\\t\", index_col=0))" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 98, 37 | "metadata": {}, 38 | "outputs": [ 39 | { 40 | "data": { 41 | "text/html": [ 42 | "
\n", 43 | "\n", 56 | "\n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | "
ga:pagePathga:pageviewsID_GA
66/incoming/Tochter-Alexis-Olympia-Das-Geheimnis...710061
135/people-tv/international/tochter-alexis-olympi...110061
91/news/-id1510256.html11510256
151/services/datenschutzbestimmungen-id151553.html5151553
83/life/besten-seite-sind-meine-ferien-meta-titl...11742
51/incoming/schreiben-sie-uns-id19186.html12819186
109/news/politik/Gegen-Gleichstellung-Homosexuell...219969
154/services/marktplatz/ticino-weekend-id2241980....12241980
20/incoming/Detailhandel-verliert-1-Prozent-Umsa...122672
189/sport/videoformate/dino/-id2483959.html12483959
\n", 128 | "
" 129 | ], 130 | "text/plain": [ 131 | " ga:pagePath ga:pageviews ID_GA\n", 132 | "66 /incoming/Tochter-Alexis-Olympia-Das-Geheimnis... 7 10061\n", 133 | "135 /people-tv/international/tochter-alexis-olympi... 1 10061\n", 134 | "91 /news/-id1510256.html 1 1510256\n", 135 | "151 /services/datenschutzbestimmungen-id151553.html 5 151553\n", 136 | "83 /life/besten-seite-sind-meine-ferien-meta-titl... 1 1742\n", 137 | "51 /incoming/schreiben-sie-uns-id19186.html1 28 19186\n", 138 | "109 /news/politik/Gegen-Gleichstellung-Homosexuell... 2 19969\n", 139 | "154 /services/marktplatz/ticino-weekend-id2241980.... 1 2241980\n", 140 | "20 /incoming/Detailhandel-verliert-1-Prozent-Umsa... 1 22672\n", 141 | "189 /sport/videoformate/dino/-id2483959.html 1 2483959" 142 | ] 143 | }, 144 | "execution_count": 98, 145 | "metadata": {}, 146 | "output_type": "execute_result" 147 | } 148 | ], 149 | "source": [ 150 | "dfTestCSV['ID_GA'] = dfTestCSV['ga:pagePath'].str.extract('(?:.*id)([0-9]+)(?:.html)')\n", 151 | "dfTestCSV = dfTestCSV.sort_values(by='ID_GA')\n", 152 | "dfTestCSV.head(10)" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 99, 158 | "metadata": {}, 159 | "outputs": [ 160 | { 161 | "data": { 162 | "text/plain": [ 163 | "194" 164 | ] 165 | }, 166 | "execution_count": 99, 167 | "metadata": {}, 168 | "output_type": "execute_result" 169 | } 170 | ], 171 | "source": [ 172 | "len(dfTestCSV)\n" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": 100, 178 | "metadata": {}, 179 | "outputs": [ 180 | { 181 | "data": { 182 | "text/html": [ 183 | "
\n", 184 | "\n", 201 | "\n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | "
countsum
ga:pageviewsga:pageviews
ID_GA
1006128
151025611
15155315
174211
19186128
1996912
224198011
2267211
248395911
2654511
\n", 272 | "
" 273 | ], 274 | "text/plain": [ 275 | " count sum\n", 276 | " ga:pageviews ga:pageviews\n", 277 | "ID_GA \n", 278 | "10061 2 8\n", 279 | "1510256 1 1\n", 280 | "151553 1 5\n", 281 | "1742 1 1\n", 282 | "19186 1 28\n", 283 | "19969 1 2\n", 284 | "2241980 1 1\n", 285 | "22672 1 1\n", 286 | "2483959 1 1\n", 287 | "26545 1 1" 288 | ] 289 | }, 290 | "execution_count": 100, 291 | "metadata": {}, 292 | "output_type": "execute_result" 293 | } 294 | ], 295 | "source": [ 296 | "dfTestCSV = pd.pivot_table(dfTestCSV,index='ID_GA', values='ga:pageviews', aggfunc=['count', 'sum'])\n", 297 | "dfTestCSV = dfTestCSV.sort_values(by='ID_GA')\n", 298 | "dfTestCSV.head(10)" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": 94, 304 | "metadata": {}, 305 | "outputs": [ 306 | { 307 | "data": { 308 | "text/plain": [ 309 | "147" 310 | ] 311 | }, 312 | "execution_count": 94, 313 | "metadata": {}, 314 | "output_type": "execute_result" 315 | } 316 | ], 317 | "source": [ 318 | "len(dfTestCSV)\n" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": null, 324 | "metadata": {}, 325 | "outputs": [], 326 | "source": [] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": null, 331 | "metadata": {}, 332 | "outputs": [], 333 | "source": [] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": null, 338 | "metadata": {}, 339 | "outputs": [], 340 | "source": [] 341 | } 342 | ], 343 | "metadata": { 344 | "kernelspec": { 345 | "display_name": "Python 3", 346 | "language": "python", 347 | "name": "python3" 348 | }, 349 | "language_info": { 350 | "codemirror_mode": { 351 | "name": "ipython", 352 | "version": 3 353 | }, 354 | "file_extension": ".py", 355 | "mimetype": "text/x-python", 356 | "name": "python", 357 | "nbconvert_exporter": "python", 358 | "pygments_lexer": "ipython3", 359 | "version": "3.7.2" 360 | } 361 | }, 362 | "nbformat": 4, 363 | "nbformat_minor": 2 364 | } 365 | 
-------------------------------------------------------------------------------- /#4 Header cleanup (reset_index, droplevel, rename) in Python Pandas.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# Check documentation: \n", 10 | "# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.reset_index.html\n", 11 | "# https://pandas-docs.github.io/pandas-docs-travis/reference/api/pandas.DataFrame.droplevel.html\n", 12 | "# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.rename.html\n", 13 | "#\n", 14 | "# Video about this notebook: \n", 15 | "# https://youtu.be/AvJ1YrdSKzQ" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 51, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import pandas as pd" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 52, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "dfTestCSV = pd.DataFrame(pd.read_csv('./dfanalyticstestdata1.csv', header=2, sep=\"\\t\", index_col=0))" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 53, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "dfTestCSV['ID_GA'] = dfTestCSV['ga:pagePath'].str.extract('(?:.*id)([0-9]+)(?:.html)')" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 54, 48 | "metadata": {}, 49 | "outputs": [ 50 | { 51 | "data": { 52 | "text/html": [ 53 | "
\n", 54 | "\n", 71 | "\n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | "
countsum
ga:pageviewsga:pageviews
ID_GA
1006128
151025611
15155315
174211
19186128
1996912
224198011
2267211
248395911
2654511
\n", 142 | "
" 143 | ], 144 | "text/plain": [ 145 | " count sum\n", 146 | " ga:pageviews ga:pageviews\n", 147 | "ID_GA \n", 148 | "10061 2 8\n", 149 | "1510256 1 1\n", 150 | "151553 1 5\n", 151 | "1742 1 1\n", 152 | "19186 1 28\n", 153 | "19969 1 2\n", 154 | "2241980 1 1\n", 155 | "22672 1 1\n", 156 | "2483959 1 1\n", 157 | "26545 1 1" 158 | ] 159 | }, 160 | "execution_count": 54, 161 | "metadata": {}, 162 | "output_type": "execute_result" 163 | } 164 | ], 165 | "source": [ 166 | "dfTestCSV = pd.pivot_table(dfTestCSV,index='ID_GA', values='ga:pageviews', aggfunc=['count', 'sum'])\n", 167 | "dfTestCSV.head(10)" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": 55, 173 | "metadata": {}, 174 | "outputs": [ 175 | { 176 | "data": { 177 | "text/html": [ 178 | "
\n", 179 | "\n", 192 | "\n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | "
ID_GAcountsum
ga:pageviewsga:pageviews
01006128
1151025611
215155315
3174211
419186128
51996912
6224198011
72267211
8248395911
92654511
\n", 270 | "
" 271 | ], 272 | "text/plain": [ 273 | " ID_GA count sum\n", 274 | " ga:pageviews ga:pageviews\n", 275 | "0 10061 2 8\n", 276 | "1 1510256 1 1\n", 277 | "2 151553 1 5\n", 278 | "3 1742 1 1\n", 279 | "4 19186 1 28\n", 280 | "5 19969 1 2\n", 281 | "6 2241980 1 1\n", 282 | "7 22672 1 1\n", 283 | "8 2483959 1 1\n", 284 | "9 26545 1 1" 285 | ] 286 | }, 287 | "execution_count": 55, 288 | "metadata": {}, 289 | "output_type": "execute_result" 290 | } 291 | ], 292 | "source": [ 293 | "dfTestCSV.reset_index(inplace=True)\n", 294 | "dfTestCSV.head(10)" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": 56, 300 | "metadata": {}, 301 | "outputs": [ 302 | { 303 | "name": "stdout", 304 | "output_type": "stream", 305 | "text": [ 306 | "('ID_GA', '')\n", 307 | "('count', 'ga:pageviews')\n", 308 | "('sum', 'ga:pageviews')\n" 309 | ] 310 | } 311 | ], 312 | "source": [ 313 | "for col in dfTestCSV.columns: \n", 314 | " print(col)" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": 57, 320 | "metadata": {}, 321 | "outputs": [ 322 | { 323 | "data": { 324 | "text/html": [ 325 | "
\n", 326 | "\n", 339 | "\n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | "
ID_GAcountsum
01006128
1151025611
215155315
3174211
419186128
51996912
6224198011
72267211
8248395911
92654511
\n", 411 | "
" 412 | ], 413 | "text/plain": [ 414 | " ID_GA count sum\n", 415 | "0 10061 2 8\n", 416 | "1 1510256 1 1\n", 417 | "2 151553 1 5\n", 418 | "3 1742 1 1\n", 419 | "4 19186 1 28\n", 420 | "5 19969 1 2\n", 421 | "6 2241980 1 1\n", 422 | "7 22672 1 1\n", 423 | "8 2483959 1 1\n", 424 | "9 26545 1 1" 425 | ] 426 | }, 427 | "execution_count": 57, 428 | "metadata": {}, 429 | "output_type": "execute_result" 430 | } 431 | ], 432 | "source": [ 433 | "dfTestCSV.columns = dfTestCSV.columns.droplevel(1)\n", 434 | "dfTestCSV.head(10)" 435 | ] 436 | }, 437 | { 438 | "cell_type": "code", 439 | "execution_count": 59, 440 | "metadata": {}, 441 | "outputs": [ 442 | { 443 | "data": { 444 | "text/html": [ 445 | "
\n", 446 | "\n", 459 | "\n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " 
\n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | 
" \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | "
ID_GAid countpageviews per id
01006128
1151025611
215155315
3174211
419186128
51996912
6224198011
72267211
8248395911
92654511
102682811
113029122
123051911
133052712
143056811
153061511
163062512
173070212
183071513
193075811
2030770110
2130787160
223079028
233084212
243085912
253091411
263091911
273094715
283095011
293095211
............
1173216813
1183217712
1193217911
12032188112
121472572511
122504987811
123542149213
124553996514
125563678124
126569723011
127570310211
128618113
129624312
130625511
131625611
132626011
133627112
134632311
135632811
136644851915
137675972811
138692300511
139697411
140766260112
141774885511
142775857414
143783115512
144786011
145786312
146798742611
\n", 837 | "

147 rows × 3 columns

\n", 838 | "
" 839 | ], 840 | "text/plain": [ 841 | " ID_GA id count pageviews per id\n", 842 | "0 10061 2 8\n", 843 | "1 1510256 1 1\n", 844 | "2 151553 1 5\n", 845 | "3 1742 1 1\n", 846 | "4 19186 1 28\n", 847 | "5 19969 1 2\n", 848 | "6 2241980 1 1\n", 849 | "7 22672 1 1\n", 850 | "8 2483959 1 1\n", 851 | "9 26545 1 1\n", 852 | "10 26828 1 1\n", 853 | "11 30291 2 2\n", 854 | "12 30519 1 1\n", 855 | "13 30527 1 2\n", 856 | "14 30568 1 1\n", 857 | "15 30615 1 1\n", 858 | "16 30625 1 2\n", 859 | "17 30702 1 2\n", 860 | "18 30715 1 3\n", 861 | "19 30758 1 1\n", 862 | "20 30770 1 10\n", 863 | "21 30787 1 60\n", 864 | "22 30790 2 8\n", 865 | "23 30842 1 2\n", 866 | "24 30859 1 2\n", 867 | "25 30914 1 1\n", 868 | "26 30919 1 1\n", 869 | "27 30947 1 5\n", 870 | "28 30950 1 1\n", 871 | "29 30952 1 1\n", 872 | ".. ... ... ...\n", 873 | "117 32168 1 3\n", 874 | "118 32177 1 2\n", 875 | "119 32179 1 1\n", 876 | "120 32188 1 12\n", 877 | "121 4725725 1 1\n", 878 | "122 5049878 1 1\n", 879 | "123 5421492 1 3\n", 880 | "124 5539965 1 4\n", 881 | "125 5636781 2 4\n", 882 | "126 5697230 1 1\n", 883 | "127 5703102 1 1\n", 884 | "128 6181 1 3\n", 885 | "129 6243 1 2\n", 886 | "130 6255 1 1\n", 887 | "131 6256 1 1\n", 888 | "132 6260 1 1\n", 889 | "133 6271 1 2\n", 890 | "134 6323 1 1\n", 891 | "135 6328 1 1\n", 892 | "136 6448519 1 5\n", 893 | "137 6759728 1 1\n", 894 | "138 6923005 1 1\n", 895 | "139 6974 1 1\n", 896 | "140 7662601 1 2\n", 897 | "141 7748855 1 1\n", 898 | "142 7758574 1 4\n", 899 | "143 7831155 1 2\n", 900 | "144 7860 1 1\n", 901 | "145 7863 1 2\n", 902 | "146 7987426 1 1\n", 903 | "\n", 904 | "[147 rows x 3 columns]" 905 | ] 906 | }, 907 | "execution_count": 59, 908 | "metadata": {}, 909 | "output_type": "execute_result" 910 | } 911 | ], 912 | "source": [ 913 | "dfTestCSV = dfTestCSV.rename(columns={\"count\": \"id count\", \"sum\": \"pageviews per id\"})\n", 914 | "dfTestCSV" 915 | ] 916 | }, 917 | { 918 | "cell_type": "code", 919 | "execution_count": null, 920 | 
"metadata": {}, 921 | "outputs": [], 922 | "source": [] 923 | }, 924 | { 925 | "cell_type": "code", 926 | "execution_count": null, 927 | "metadata": {}, 928 | "outputs": [], 929 | "source": [] 930 | } 931 | ], 932 | "metadata": { 933 | "kernelspec": { 934 | "display_name": "Python 3", 935 | "language": "python", 936 | "name": "python3" 937 | }, 938 | "language_info": { 939 | "codemirror_mode": { 940 | "name": "ipython", 941 | "version": 3 942 | }, 943 | "file_extension": ".py", 944 | "mimetype": "text/x-python", 945 | "name": "python", 946 | "nbconvert_exporter": "python", 947 | "pygments_lexer": "ipython3", 948 | "version": "3.7.2" 949 | } 950 | }, 951 | "nbformat": 4, 952 | "nbformat_minor": 2 953 | } 954 | -------------------------------------------------------------------------------- /#5 Read Excel in Python Pandas.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# Check documentation: \n", 10 | "# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_excel.html\n", 11 | "#\n", 12 | "# Video about this notebook: \n", 13 | "# https://youtu.be/--zFZjSlfK8" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 10, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "import pandas as pd" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 11, 28 | "metadata": {}, 29 | "outputs": [ 30 | { 31 | "data": { 32 | "text/html": [ 33 | "
\n", 34 | "\n", 47 | "\n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " 
\n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | "
AddressContentStatus CodeStatusContains: '\"article-text-box\"'Contains: '\"video-player\"'Contains: '\"article-body'Contains: '\\(sda\\)'Contains: '\"scribble\"'Contains: 'basic-image--free-desktop'Contains: '<iframe'Contains: 'layout-item--flex-desktop-4'Contains: '\\(sda\\/'
0https://www.blick.ch/people-tv/international/r...text/html; charset=utf-8200Connection Refused000000000
1https://www.blick.ch/sport/eishockey/overtime-...text/html; charset=utf-8200Connection Timeout000000000
2https://www.blick.ch/news/ausland/neben-berueh...text/html; charset=utf-8200Connection Timeout000000000
3https://www.blick.ch/news/grausiger-fund-in-an...text/html; charset=utf-8200Connection Timeout000000000
4https://www.blick.ch/news/mahamaya-bern-die-we...text/html; charset=utf-8200Connection Timeout000000000
5https://www.blick.ch/people-tv/musik/liechtens...text/html; charset=utf-8200Connection Timeout000000000
6https://www.blick.ch/people-tv/tv/bachelorette...text/html; charset=utf-8200Connection Timeout000000000
7https://www.blick.ch/news/das-ist-die-swiss-bo...text/html; charset=utf-8200Connection Timeout000000000
8https://www.blick.ch/news/schweiz/vergangenhei...text/html; charset=utf-8200Connection Timeout000000000
9https://www.blick.ch/news/wirtschaft/das-sind-...text/html; charset=utf-8200Connection Timeout000000000
\n", 229 | "
" 230 | ], 231 | "text/plain": [ 232 | " Address \\\n", 233 | "0 https://www.blick.ch/people-tv/international/r... \n", 234 | "1 https://www.blick.ch/sport/eishockey/overtime-... \n", 235 | "2 https://www.blick.ch/news/ausland/neben-berueh... \n", 236 | "3 https://www.blick.ch/news/grausiger-fund-in-an... \n", 237 | "4 https://www.blick.ch/news/mahamaya-bern-die-we... \n", 238 | "5 https://www.blick.ch/people-tv/musik/liechtens... \n", 239 | "6 https://www.blick.ch/people-tv/tv/bachelorette... \n", 240 | "7 https://www.blick.ch/news/das-ist-die-swiss-bo... \n", 241 | "8 https://www.blick.ch/news/schweiz/vergangenhei... \n", 242 | "9 https://www.blick.ch/news/wirtschaft/das-sind-... \n", 243 | "\n", 244 | " Content Status Code Status \\\n", 245 | "0 text/html; charset=utf-8 200 Connection Refused \n", 246 | "1 text/html; charset=utf-8 200 Connection Timeout \n", 247 | "2 text/html; charset=utf-8 200 Connection Timeout \n", 248 | "3 text/html; charset=utf-8 200 Connection Timeout \n", 249 | "4 text/html; charset=utf-8 200 Connection Timeout \n", 250 | "5 text/html; charset=utf-8 200 Connection Timeout \n", 251 | "6 text/html; charset=utf-8 200 Connection Timeout \n", 252 | "7 text/html; charset=utf-8 200 Connection Timeout \n", 253 | "8 text/html; charset=utf-8 200 Connection Timeout \n", 254 | "9 text/html; charset=utf-8 200 Connection Timeout \n", 255 | "\n", 256 | " Contains: '\"article-text-box\"' Contains: '\"video-player\"' \\\n", 257 | "0 0 0 \n", 258 | "1 0 0 \n", 259 | "2 0 0 \n", 260 | "3 0 0 \n", 261 | "4 0 0 \n", 262 | "5 0 0 \n", 263 | "6 0 0 \n", 264 | "7 0 0 \n", 265 | "8 0 0 \n", 266 | "9 0 0 \n", 267 | "\n", 268 | " Contains: '\"article-body' Contains: '\\(sda\\)' Contains: '\"scribble\"' \\\n", 269 | "0 0 0 0 \n", 270 | "1 0 0 0 \n", 271 | "2 0 0 0 \n", 272 | "3 0 0 0 \n", 273 | "4 0 0 0 \n", 274 | "5 0 0 0 \n", 275 | "6 0 0 0 \n", 276 | "7 0 0 0 \n", 277 | "8 0 0 0 \n", 278 | "9 0 0 0 \n", 279 | "\n", 280 | " Contains: 'basic-image--free-desktop' 
Contains: '\n", 324 | "\n", 337 | "\n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " 
\n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | "
AddressContentStatus CodeStatusarticle-text-boxvideo-playerarticle-bodysdascribblehas-imageiframenormal-article-layoutsda2
0https://www.blick.ch/people-tv/international/r...text/html; charset=utf-8200Connection Refused000000000
1https://www.blick.ch/sport/eishockey/overtime-...text/html; charset=utf-8200Connection Timeout000000000
2https://www.blick.ch/news/ausland/neben-berueh...text/html; charset=utf-8200Connection Timeout000000000
3https://www.blick.ch/news/grausiger-fund-in-an...text/html; charset=utf-8200Connection Timeout000000000
4https://www.blick.ch/news/mahamaya-bern-die-we...text/html; charset=utf-8200Connection Timeout000000000
5https://www.blick.ch/people-tv/musik/liechtens...text/html; charset=utf-8200Connection Timeout000000000
6https://www.blick.ch/people-tv/tv/bachelorette...text/html; charset=utf-8200Connection Timeout000000000
7https://www.blick.ch/news/das-ist-die-swiss-bo...text/html; charset=utf-8200Connection Timeout000000000
8https://www.blick.ch/news/schweiz/vergangenhei...text/html; charset=utf-8200Connection Timeout000000000
9https://www.blick.ch/news/wirtschaft/das-sind-...text/html; charset=utf-8200Connection Timeout000000000
\n", 519 | "" 520 | ], 521 | "text/plain": [ 522 | " Address \\\n", 523 | "0 https://www.blick.ch/people-tv/international/r... \n", 524 | "1 https://www.blick.ch/sport/eishockey/overtime-... \n", 525 | "2 https://www.blick.ch/news/ausland/neben-berueh... \n", 526 | "3 https://www.blick.ch/news/grausiger-fund-in-an... \n", 527 | "4 https://www.blick.ch/news/mahamaya-bern-die-we... \n", 528 | "5 https://www.blick.ch/people-tv/musik/liechtens... \n", 529 | "6 https://www.blick.ch/people-tv/tv/bachelorette... \n", 530 | "7 https://www.blick.ch/news/das-ist-die-swiss-bo... \n", 531 | "8 https://www.blick.ch/news/schweiz/vergangenhei... \n", 532 | "9 https://www.blick.ch/news/wirtschaft/das-sind-... \n", 533 | "\n", 534 | " Content Status Code Status \\\n", 535 | "0 text/html; charset=utf-8 200 Connection Refused \n", 536 | "1 text/html; charset=utf-8 200 Connection Timeout \n", 537 | "2 text/html; charset=utf-8 200 Connection Timeout \n", 538 | "3 text/html; charset=utf-8 200 Connection Timeout \n", 539 | "4 text/html; charset=utf-8 200 Connection Timeout \n", 540 | "5 text/html; charset=utf-8 200 Connection Timeout \n", 541 | "6 text/html; charset=utf-8 200 Connection Timeout \n", 542 | "7 text/html; charset=utf-8 200 Connection Timeout \n", 543 | "8 text/html; charset=utf-8 200 Connection Timeout \n", 544 | "9 text/html; charset=utf-8 200 Connection Timeout \n", 545 | "\n", 546 | " article-text-box video-player article-body sda scribble has-image \\\n", 547 | "0 0 0 0 0 0 0 \n", 548 | "1 0 0 0 0 0 0 \n", 549 | "2 0 0 0 0 0 0 \n", 550 | "3 0 0 0 0 0 0 \n", 551 | "4 0 0 0 0 0 0 \n", 552 | "5 0 0 0 0 0 0 \n", 553 | "6 0 0 0 0 0 0 \n", 554 | "7 0 0 0 0 0 0 \n", 555 | "8 0 0 0 0 0 0 \n", 556 | "9 0 0 0 0 0 0 \n", 557 | "\n", 558 | " iframe normal-article-layout sda2 \n", 559 | "0 0 0 0 \n", 560 | "1 0 0 0 \n", 561 | "2 0 0 0 \n", 562 | "3 0 0 0 \n", 563 | "4 0 0 0 \n", 564 | "5 0 0 0 \n", 565 | "6 0 0 0 \n", 566 | "7 0 0 0 \n", 567 | "8 0 0 0 \n", 568 | "9 0 0 0 " 569 | ] 
570 | }, 571 | "execution_count": 12, 572 | "metadata": {}, 573 | "output_type": "execute_result" 574 | } 575 | ], 576 | "source": [ 577 | "#Rename columns\n", 578 | "dfCrawled = dfCrawled.rename(columns={\"Contains: \\'\\\"article-text-box\\\"\\'\": \"article-text-box\", \n", 579 | " \"Contains: \\'\\\"video-player\\\"\\'\":\"video-player\", \n", 580 | " \"Contains: \\'\\\"article-body\\'\":\"article-body\",\n", 581 | " \"Contains: \\'\\(sda\\)\\'\":\"sda\",\n", 582 | " \"Contains: \\'\\\"scribble\\\"\\'\":\"scribble\",\n", 583 | " \"Contains: \\'basic-image--free-desktop\\'\":\"has-image\", \n", 584 | " \"Contains: \\'\n", 600 | "\n", 613 | "\n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 
| " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | "
Addressarticle-text-boxvideo-playerarticle-bodysdascribblehas-imageiframenormal-article-layoutsda2ID_Crawl
0https://www.blick.ch/people-tv/international/r...0000000004270807
1https://www.blick.ch/sport/eishockey/overtime-...000000000106244
2https://www.blick.ch/news/ausland/neben-berueh...0000000003896239
3https://www.blick.ch/news/grausiger-fund-in-an...0000000002024473
4https://www.blick.ch/news/mahamaya-bern-die-we...000000000110094
5https://www.blick.ch/people-tv/musik/liechtens...0000000002131248
6https://www.blick.ch/people-tv/tv/bachelorette...0000000003790667
7https://www.blick.ch/news/das-ist-die-swiss-bo...0000000002238005
8https://www.blick.ch/news/schweiz/vergangenhei...0000000005550392
9https://www.blick.ch/news/wirtschaft/das-sind-...0000000004955860
\n", 773 | "" 774 | ], 775 | "text/plain": [ 776 | " Address article-text-box \\\n", 777 | "0 https://www.blick.ch/people-tv/international/r... 0 \n", 778 | "1 https://www.blick.ch/sport/eishockey/overtime-... 0 \n", 779 | "2 https://www.blick.ch/news/ausland/neben-berueh... 0 \n", 780 | "3 https://www.blick.ch/news/grausiger-fund-in-an... 0 \n", 781 | "4 https://www.blick.ch/news/mahamaya-bern-die-we... 0 \n", 782 | "5 https://www.blick.ch/people-tv/musik/liechtens... 0 \n", 783 | "6 https://www.blick.ch/people-tv/tv/bachelorette... 0 \n", 784 | "7 https://www.blick.ch/news/das-ist-die-swiss-bo... 0 \n", 785 | "8 https://www.blick.ch/news/schweiz/vergangenhei... 0 \n", 786 | "9 https://www.blick.ch/news/wirtschaft/das-sind-... 0 \n", 787 | "\n", 788 | " video-player article-body sda scribble has-image iframe \\\n", 789 | "0 0 0 0 0 0 0 \n", 790 | "1 0 0 0 0 0 0 \n", 791 | "2 0 0 0 0 0 0 \n", 792 | "3 0 0 0 0 0 0 \n", 793 | "4 0 0 0 0 0 0 \n", 794 | "5 0 0 0 0 0 0 \n", 795 | "6 0 0 0 0 0 0 \n", 796 | "7 0 0 0 0 0 0 \n", 797 | "8 0 0 0 0 0 0 \n", 798 | "9 0 0 0 0 0 0 \n", 799 | "\n", 800 | " normal-article-layout sda2 ID_Crawl \n", 801 | "0 0 0 4270807 \n", 802 | "1 0 0 106244 \n", 803 | "2 0 0 3896239 \n", 804 | "3 0 0 2024473 \n", 805 | "4 0 0 110094 \n", 806 | "5 0 0 2131248 \n", 807 | "6 0 0 3790667 \n", 808 | "7 0 0 2238005 \n", 809 | "8 0 0 5550392 \n", 810 | "9 0 0 4955860 " 811 | ] 812 | }, 813 | "execution_count": 13, 814 | "metadata": {}, 815 | "output_type": "execute_result" 816 | } 817 | ], 818 | "source": [ 819 | "dfCrawled['ID_Crawl'] = dfCrawled['Address'].str.extract('(?:.*id)([0-9]+)(?:.html)', expand=False)\n", 820 | "dfCrawled = dfCrawled.drop(\"Status Code\",1)\n", 821 | "dfCrawled = dfCrawled.drop(\"Status\",1)\n", 822 | "dfCrawled = dfCrawled.drop(\"Content\",1)\n", 823 | "dfCrawled.head(10)" 824 | ] 825 | }, 826 | { 827 | "cell_type": "code", 828 | "execution_count": null, 829 | "metadata": {}, 830 | "outputs": [], 831 | "source": [] 832 | }, 
833 | { 834 | "cell_type": "code", 835 | "execution_count": null, 836 | "metadata": {}, 837 | "outputs": [], 838 | "source": [] 839 | }, 840 | { 841 | "cell_type": "code", 842 | "execution_count": null, 843 | "metadata": {}, 844 | "outputs": [], 845 | "source": [] 846 | }, 847 | { 848 | "cell_type": "code", 849 | "execution_count": null, 850 | "metadata": {}, 851 | "outputs": [], 852 | "source": [] 853 | }, 854 | { 855 | "cell_type": "code", 856 | "execution_count": null, 857 | "metadata": {}, 858 | "outputs": [], 859 | "source": [] 860 | }, 861 | { 862 | "cell_type": "code", 863 | "execution_count": null, 864 | "metadata": {}, 865 | "outputs": [], 866 | "source": [] 867 | }, 868 | { 869 | "cell_type": "code", 870 | "execution_count": null, 871 | "metadata": {}, 872 | "outputs": [], 873 | "source": [] 874 | } 875 | ], 876 | "metadata": { 877 | "kernelspec": { 878 | "display_name": "Python 3", 879 | "language": "python", 880 | "name": "python3" 881 | }, 882 | "language_info": { 883 | "codemirror_mode": { 884 | "name": "ipython", 885 | "version": 3 886 | }, 887 | "file_extension": ".py", 888 | "mimetype": "text/x-python", 889 | "name": "python", 890 | "nbconvert_exporter": "python", 891 | "pygments_lexer": "ipython3", 892 | "version": "3.7.2" 893 | } 894 | }, 895 | "nbformat": 4, 896 | "nbformat_minor": 2 897 | } 898 | -------------------------------------------------------------------------------- /#6 Merge two tables in Python Pandas.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# Check documentation: \n", 10 | "# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.merge.html\n", 11 | "#\n", 12 | "# Video about this notebook: \n", 13 | "# https://youtu.be/l9S6sWCykDo" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 24, 19 | "metadata": 
{}, 20 | "outputs": [], 21 | "source": [ 22 | "import pandas as pd" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 25, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "dfTestCSV = pd.DataFrame(pd.read_csv('./dfanalyticstestdata1.csv', header=2, sep=\"\\t\", index_col=0))" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 26, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "dfTestCSV['ID_GA'] = dfTestCSV['ga:pagePath'].str.extract('(?:.*id)([0-9]+)(?:.html)')" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 27, 46 | "metadata": {}, 47 | "outputs": [ 48 | { 49 | "data": { 50 | "text/html": [ 51 | "
\n", 52 | "\n", 65 | "\n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | "
ID_GAid countpageviews per id
01006128
1151025611
215155315
3174211
419186128
51996912
6224198011
72267211
8248395911
92654511
\n", 137 | "
" 138 | ], 139 | "text/plain": [ 140 | " ID_GA id count pageviews per id\n", 141 | "0 10061 2 8\n", 142 | "1 1510256 1 1\n", 143 | "2 151553 1 5\n", 144 | "3 1742 1 1\n", 145 | "4 19186 1 28\n", 146 | "5 19969 1 2\n", 147 | "6 2241980 1 1\n", 148 | "7 22672 1 1\n", 149 | "8 2483959 1 1\n", 150 | "9 26545 1 1" 151 | ] 152 | }, 153 | "execution_count": 27, 154 | "metadata": {}, 155 | "output_type": "execute_result" 156 | } 157 | ], 158 | "source": [ 159 | "dfTestCSV = pd.pivot_table(dfTestCSV,index='ID_GA', values='ga:pageviews', aggfunc=['count', 'sum'])\n", 160 | "dfTestCSV.reset_index(inplace=True)\n", 161 | "dfTestCSV.columns = dfTestCSV.columns.droplevel(1)\n", 162 | "dfTestCSV = dfTestCSV.rename(columns={\"count\": \"id count\", \"sum\": \"pageviews per id\"})\n", 163 | "dfTestCSV.head(10)" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 30, 169 | "metadata": {}, 170 | "outputs": [ 171 | { 172 | "data": { 173 | "text/html": [ 174 | "
\n", 175 | "\n", 188 | "\n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " 
\n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | "
Addressarticle-text-boxvideo-playerarticle-bodysdascribblehas-imageiframenormal-article-layoutsda2ID_Crawl
0https://www.blick.ch/people-tv/international/r...0000000004270807
1https://www.blick.ch/sport/eishockey/overtime-...000000000106244
2https://www.blick.ch/news/ausland/neben-berueh...0000000003896239
3https://www.blick.ch/news/grausiger-fund-in-an...0000000002024473
4https://www.blick.ch/news/mahamaya-bern-die-we...000000000110094
5https://www.blick.ch/people-tv/musik/liechtens...0000000002131248
6https://www.blick.ch/people-tv/tv/bachelorette...0000000003790667
7https://www.blick.ch/news/das-ist-die-swiss-bo...0000000002238005
8https://www.blick.ch/news/schweiz/vergangenhei...0000000005550392
9https://www.blick.ch/news/wirtschaft/das-sind-...0000000004955860
\n", 348 | "
" 349 | ], 350 | "text/plain": [ 351 | " Address article-text-box \\\n", 352 | "0 https://www.blick.ch/people-tv/international/r... 0 \n", 353 | "1 https://www.blick.ch/sport/eishockey/overtime-... 0 \n", 354 | "2 https://www.blick.ch/news/ausland/neben-berueh... 0 \n", 355 | "3 https://www.blick.ch/news/grausiger-fund-in-an... 0 \n", 356 | "4 https://www.blick.ch/news/mahamaya-bern-die-we... 0 \n", 357 | "5 https://www.blick.ch/people-tv/musik/liechtens... 0 \n", 358 | "6 https://www.blick.ch/people-tv/tv/bachelorette... 0 \n", 359 | "7 https://www.blick.ch/news/das-ist-die-swiss-bo... 0 \n", 360 | "8 https://www.blick.ch/news/schweiz/vergangenhei... 0 \n", 361 | "9 https://www.blick.ch/news/wirtschaft/das-sind-... 0 \n", 362 | "\n", 363 | " video-player article-body sda scribble has-image iframe \\\n", 364 | "0 0 0 0 0 0 0 \n", 365 | "1 0 0 0 0 0 0 \n", 366 | "2 0 0 0 0 0 0 \n", 367 | "3 0 0 0 0 0 0 \n", 368 | "4 0 0 0 0 0 0 \n", 369 | "5 0 0 0 0 0 0 \n", 370 | "6 0 0 0 0 0 0 \n", 371 | "7 0 0 0 0 0 0 \n", 372 | "8 0 0 0 0 0 0 \n", 373 | "9 0 0 0 0 0 0 \n", 374 | "\n", 375 | " normal-article-layout sda2 ID_Crawl \n", 376 | "0 0 0 4270807 \n", 377 | "1 0 0 106244 \n", 378 | "2 0 0 3896239 \n", 379 | "3 0 0 2024473 \n", 380 | "4 0 0 110094 \n", 381 | "5 0 0 2131248 \n", 382 | "6 0 0 3790667 \n", 383 | "7 0 0 2238005 \n", 384 | "8 0 0 5550392 \n", 385 | "9 0 0 4955860 " 386 | ] 387 | }, 388 | "execution_count": 30, 389 | "metadata": {}, 390 | "output_type": "execute_result" 391 | } 392 | ], 393 | "source": [ 394 | "dfCrawled = pd.DataFrame(pd.read_excel('./custom_search_all-test.xlsx', header=0, sheet_name='1 - All'))\n", 395 | "dfCrawled = dfCrawled.rename(columns={\"Contains: \\'\\\"article-text-box\\\"\\'\": \"article-text-box\", \n", 396 | " \"Contains: \\'\\\"video-player\\\"\\'\":\"video-player\", \n", 397 | " \"Contains: \\'\\\"article-body\\'\":\"article-body\",\n", 398 | " \"Contains: \\'\\(sda\\)\\'\":\"sda\",\n", 399 | " \"Contains: 
\\'\\\"scribble\\\"\\'\":\"scribble\",\n", 400 | " \"Contains: \\'basic-image--free-desktop\\'\":\"has-image\", \n", 401 | " \"Contains: \\'\n", 421 | "\n", 434 | "\n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " 
\n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | "
ID_GAid countpageviews per idAddressarticle-text-boxvideo-playerarticle-bodysdascribblehas-imageiframenormal-article-layoutsda2
34310001.03.0https://www.blick.ch/people-tv/international/e...0.00.01.00.02.01.01.01.02.0
36310021.01.0https://www.blick.ch/news/ausland/afghanistan-...0.00.01.03.02.00.01.01.00.0
37310041.05.0https://www.blick.ch/news/zwei-schweizer-in-ve...0.00.01.00.02.00.01.01.00.0
51312752.06.0https://www.blick.ch/news/schweiz/zuerich/knas...0.00.01.00.02.00.01.01.00.0
52312771.012.0https://www.blick.ch/news/ausland/steinmeier-p...0.00.01.00.02.00.01.01.00.0
65313881.03.0https://www.blick.ch/news/schweiz/zentralschwe...0.00.05.00.02.00.01.01.00.0
68314251.01.0https://www.blick.ch/people-tv/international/h...0.00.01.00.02.00.01.01.00.0
69314261.01.0https://www.blick.ch/sport/eishockey/nla/serve...1.00.01.00.02.00.01.01.00.0
70314271.01.0https://www.blick.ch/digital/games/news/colin-...0.00.03.00.02.05.01.01.00.0
71314281.01.0https://www.blick.ch/digital/games/larry-magna...0.00.01.00.02.00.01.01.00.0
\n", 616 | "" 617 | ], 618 | "text/plain": [ 619 | " ID_GA id count pageviews per id \\\n", 620 | "34 31000 1.0 3.0 \n", 621 | "36 31002 1.0 1.0 \n", 622 | "37 31004 1.0 5.0 \n", 623 | "51 31275 2.0 6.0 \n", 624 | "52 31277 1.0 12.0 \n", 625 | "65 31388 1.0 3.0 \n", 626 | "68 31425 1.0 1.0 \n", 627 | "69 31426 1.0 1.0 \n", 628 | "70 31427 1.0 1.0 \n", 629 | "71 31428 1.0 1.0 \n", 630 | "\n", 631 | " Address article-text-box \\\n", 632 | "34 https://www.blick.ch/people-tv/international/e... 0.0 \n", 633 | "36 https://www.blick.ch/news/ausland/afghanistan-... 0.0 \n", 634 | "37 https://www.blick.ch/news/zwei-schweizer-in-ve... 0.0 \n", 635 | "51 https://www.blick.ch/news/schweiz/zuerich/knas... 0.0 \n", 636 | "52 https://www.blick.ch/news/ausland/steinmeier-p... 0.0 \n", 637 | "65 https://www.blick.ch/news/schweiz/zentralschwe... 0.0 \n", 638 | "68 https://www.blick.ch/people-tv/international/h... 0.0 \n", 639 | "69 https://www.blick.ch/sport/eishockey/nla/serve... 1.0 \n", 640 | "70 https://www.blick.ch/digital/games/news/colin-... 0.0 \n", 641 | "71 https://www.blick.ch/digital/games/larry-magna... 
0.0 \n", 642 | "\n", 643 | " video-player article-body sda scribble has-image iframe \\\n", 644 | "34 0.0 1.0 0.0 2.0 1.0 1.0 \n", 645 | "36 0.0 1.0 3.0 2.0 0.0 1.0 \n", 646 | "37 0.0 1.0 0.0 2.0 0.0 1.0 \n", 647 | "51 0.0 1.0 0.0 2.0 0.0 1.0 \n", 648 | "52 0.0 1.0 0.0 2.0 0.0 1.0 \n", 649 | "65 0.0 5.0 0.0 2.0 0.0 1.0 \n", 650 | "68 0.0 1.0 0.0 2.0 0.0 1.0 \n", 651 | "69 0.0 1.0 0.0 2.0 0.0 1.0 \n", 652 | "70 0.0 3.0 0.0 2.0 5.0 1.0 \n", 653 | "71 0.0 1.0 0.0 2.0 0.0 1.0 \n", 654 | "\n", 655 | " normal-article-layout sda2 \n", 656 | "34 1.0 2.0 \n", 657 | "36 1.0 0.0 \n", 658 | "37 1.0 0.0 \n", 659 | "51 1.0 0.0 \n", 660 | "52 1.0 0.0 \n", 661 | "65 1.0 0.0 \n", 662 | "68 1.0 0.0 \n", 663 | "69 1.0 0.0 \n", 664 | "70 1.0 0.0 \n", 665 | "71 1.0 0.0 " 666 | ] 667 | }, 668 | "execution_count": 29, 669 | "metadata": {}, 670 | "output_type": "execute_result" 671 | } 672 | ], 673 | "source": [ 674 | "dfMerged = pd.merge(dfTestCSV, dfCrawled, left_on='ID_GA', right_on='ID_Crawl', how='outer')\n", 675 | "dfMerged = dfMerged[dfMerged['ID_Crawl'].notnull()]\n", 676 | "dfMerged = dfMerged[dfMerged['ID_GA'].notnull()]\n", 677 | "dfMerged = dfMerged.drop(\"ID_Crawl\",1)\n", 678 | "dfMerged.head(10)" 679 | ] 680 | }, 681 | { 682 | "cell_type": "code", 683 | "execution_count": null, 684 | "metadata": {}, 685 | "outputs": [], 686 | "source": [] 687 | }, 688 | { 689 | "cell_type": "code", 690 | "execution_count": null, 691 | "metadata": {}, 692 | "outputs": [], 693 | "source": [] 694 | }, 695 | { 696 | "cell_type": "code", 697 | "execution_count": null, 698 | "metadata": {}, 699 | "outputs": [], 700 | "source": [] 701 | }, 702 | { 703 | "cell_type": "code", 704 | "execution_count": null, 705 | "metadata": {}, 706 | "outputs": [], 707 | "source": [] 708 | }, 709 | { 710 | "cell_type": "code", 711 | "execution_count": null, 712 | "metadata": {}, 713 | "outputs": [], 714 | "source": [] 715 | } 716 | ], 717 | "metadata": { 718 | "kernelspec": { 719 | "display_name": "Python 3", 720 | 
"language": "python", 721 | "name": "python3" 722 | }, 723 | "language_info": { 724 | "codemirror_mode": { 725 | "name": "ipython", 726 | "version": 3 727 | }, 728 | "file_extension": ".py", 729 | "mimetype": "text/x-python", 730 | "name": "python", 731 | "nbconvert_exporter": "python", 732 | "pygments_lexer": "ipython3", 733 | "version": "3.7.2" 734 | } 735 | }, 736 | "nbformat": 4, 737 | "nbformat_minor": 2 738 | } 739 | -------------------------------------------------------------------------------- /#7 Filter and export in Python Pandas.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# Video about this notebook: \n", 10 | "# https://youtu.be/54BwLbGWluc" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "import pandas as pd" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 3, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "dfTestCSV = pd.DataFrame(pd.read_csv('./dfanalyticstestdata1.csv', header=2, sep=\"\\t\", index_col=0))" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 4, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "dfTestCSV['ID_GA'] = dfTestCSV['ga:pagePath'].str.extract('(?:.*id)([0-9]+)(?:.html)')" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 5, 43 | "metadata": {}, 44 | "outputs": [ 45 | { 46 | "data": { 47 | "text/html": [ 48 | "
\n", 49 | "\n", 62 | "\n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | "
ID_GAid countpageviews per id
01006128
1151025611
215155315
3174211
419186128
51996912
6224198011
72267211
8248395911
92654511
\n", 134 | "
" 135 | ], 136 | "text/plain": [ 137 | " ID_GA id count pageviews per id\n", 138 | "0 10061 2 8\n", 139 | "1 1510256 1 1\n", 140 | "2 151553 1 5\n", 141 | "3 1742 1 1\n", 142 | "4 19186 1 28\n", 143 | "5 19969 1 2\n", 144 | "6 2241980 1 1\n", 145 | "7 22672 1 1\n", 146 | "8 2483959 1 1\n", 147 | "9 26545 1 1" 148 | ] 149 | }, 150 | "execution_count": 5, 151 | "metadata": {}, 152 | "output_type": "execute_result" 153 | } 154 | ], 155 | "source": [ 156 | "dfTestCSV = pd.pivot_table(dfTestCSV,index='ID_GA', values='ga:pageviews', aggfunc=['count', 'sum'])\n", 157 | "dfTestCSV.reset_index(inplace=True)\n", 158 | "dfTestCSV.columns = dfTestCSV.columns.droplevel(1)\n", 159 | "dfTestCSV = dfTestCSV.rename(columns={\"count\": \"id count\", \"sum\": \"pageviews per id\"})\n", 160 | "dfTestCSV.head(10)" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 6, 166 | "metadata": {}, 167 | "outputs": [ 168 | { 169 | "data": { 170 | "text/html": [ 171 | "
\n", 172 | "\n", 185 | "\n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " 
\n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | "
Addressarticle-text-boxvideo-playerarticle-bodysdascribblehas-imageiframenormal-article-layoutsda2ID_Crawl
0https://www.blick.ch/people-tv/international/r...0000000004270807
1https://www.blick.ch/sport/eishockey/overtime-...000000000106244
2https://www.blick.ch/news/ausland/neben-berueh...0000000003896239
3https://www.blick.ch/news/grausiger-fund-in-an...0000000002024473
4https://www.blick.ch/news/mahamaya-bern-die-we...000000000110094
5https://www.blick.ch/people-tv/musik/liechtens...0000000002131248
6https://www.blick.ch/people-tv/tv/bachelorette...0000000003790667
7https://www.blick.ch/news/das-ist-die-swiss-bo...0000000002238005
8https://www.blick.ch/news/schweiz/vergangenhei...0000000005550392
9https://www.blick.ch/news/wirtschaft/das-sind-...0000000004955860
\n", 345 | "
" 346 | ], 347 | "text/plain": [ 348 | " Address article-text-box \\\n", 349 | "0 https://www.blick.ch/people-tv/international/r... 0 \n", 350 | "1 https://www.blick.ch/sport/eishockey/overtime-... 0 \n", 351 | "2 https://www.blick.ch/news/ausland/neben-berueh... 0 \n", 352 | "3 https://www.blick.ch/news/grausiger-fund-in-an... 0 \n", 353 | "4 https://www.blick.ch/news/mahamaya-bern-die-we... 0 \n", 354 | "5 https://www.blick.ch/people-tv/musik/liechtens... 0 \n", 355 | "6 https://www.blick.ch/people-tv/tv/bachelorette... 0 \n", 356 | "7 https://www.blick.ch/news/das-ist-die-swiss-bo... 0 \n", 357 | "8 https://www.blick.ch/news/schweiz/vergangenhei... 0 \n", 358 | "9 https://www.blick.ch/news/wirtschaft/das-sind-... 0 \n", 359 | "\n", 360 | " video-player article-body sda scribble has-image iframe \\\n", 361 | "0 0 0 0 0 0 0 \n", 362 | "1 0 0 0 0 0 0 \n", 363 | "2 0 0 0 0 0 0 \n", 364 | "3 0 0 0 0 0 0 \n", 365 | "4 0 0 0 0 0 0 \n", 366 | "5 0 0 0 0 0 0 \n", 367 | "6 0 0 0 0 0 0 \n", 368 | "7 0 0 0 0 0 0 \n", 369 | "8 0 0 0 0 0 0 \n", 370 | "9 0 0 0 0 0 0 \n", 371 | "\n", 372 | " normal-article-layout sda2 ID_Crawl \n", 373 | "0 0 0 4270807 \n", 374 | "1 0 0 106244 \n", 375 | "2 0 0 3896239 \n", 376 | "3 0 0 2024473 \n", 377 | "4 0 0 110094 \n", 378 | "5 0 0 2131248 \n", 379 | "6 0 0 3790667 \n", 380 | "7 0 0 2238005 \n", 381 | "8 0 0 5550392 \n", 382 | "9 0 0 4955860 " 383 | ] 384 | }, 385 | "execution_count": 6, 386 | "metadata": {}, 387 | "output_type": "execute_result" 388 | } 389 | ], 390 | "source": [ 391 | "dfCrawled = pd.DataFrame(pd.read_excel('./custom_search_all-test.xlsx', header=0, sheet_name='1 - All'))\n", 392 | "dfCrawled = dfCrawled.rename(columns={\"Contains: \\'\\\"article-text-box\\\"\\'\": \"article-text-box\", \n", 393 | " \"Contains: \\'\\\"video-player\\\"\\'\":\"video-player\", \n", 394 | " \"Contains: \\'\\\"article-body\\'\":\"article-body\",\n", 395 | " \"Contains: \\'\\(sda\\)\\'\":\"sda\",\n", 396 | " \"Contains: 
\\'\\\"scribble\\\"\\'\":\"scribble\",\n", 397 | " \"Contains: \\'basic-image--free-desktop\\'\":\"has-image\", \n", 398 | " \"Contains: \\'\n", 418 | "\n", 431 | "\n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " 
\n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | "
ID_GAid countpageviews per idAddressarticle-text-boxvideo-playerarticle-bodysdascribblehas-imageiframenormal-article-layoutsda2
34310001.03.0https://www.blick.ch/people-tv/international/e...0.00.01.00.02.01.01.01.02.0
36310021.01.0https://www.blick.ch/news/ausland/afghanistan-...0.00.01.03.02.00.01.01.00.0
37310041.05.0https://www.blick.ch/news/zwei-schweizer-in-ve...0.00.01.00.02.00.01.01.00.0
51312752.06.0https://www.blick.ch/news/schweiz/zuerich/knas...0.00.01.00.02.00.01.01.00.0
52312771.012.0https://www.blick.ch/news/ausland/steinmeier-p...0.00.01.00.02.00.01.01.00.0
65313881.03.0https://www.blick.ch/news/schweiz/zentralschwe...0.00.05.00.02.00.01.01.00.0
68314251.01.0https://www.blick.ch/people-tv/international/h...0.00.01.00.02.00.01.01.00.0
69314261.01.0https://www.blick.ch/sport/eishockey/nla/serve...1.00.01.00.02.00.01.01.00.0
70314271.01.0https://www.blick.ch/digital/games/news/colin-...0.00.03.00.02.05.01.01.00.0
71314281.01.0https://www.blick.ch/digital/games/larry-magna...0.00.01.00.02.00.01.01.00.0
\n", 613 | "" 614 | ], 615 | "text/plain": [ 616 | " ID_GA id count pageviews per id \\\n", 617 | "34 31000 1.0 3.0 \n", 618 | "36 31002 1.0 1.0 \n", 619 | "37 31004 1.0 5.0 \n", 620 | "51 31275 2.0 6.0 \n", 621 | "52 31277 1.0 12.0 \n", 622 | "65 31388 1.0 3.0 \n", 623 | "68 31425 1.0 1.0 \n", 624 | "69 31426 1.0 1.0 \n", 625 | "70 31427 1.0 1.0 \n", 626 | "71 31428 1.0 1.0 \n", 627 | "\n", 628 | " Address article-text-box \\\n", 629 | "34 https://www.blick.ch/people-tv/international/e... 0.0 \n", 630 | "36 https://www.blick.ch/news/ausland/afghanistan-... 0.0 \n", 631 | "37 https://www.blick.ch/news/zwei-schweizer-in-ve... 0.0 \n", 632 | "51 https://www.blick.ch/news/schweiz/zuerich/knas... 0.0 \n", 633 | "52 https://www.blick.ch/news/ausland/steinmeier-p... 0.0 \n", 634 | "65 https://www.blick.ch/news/schweiz/zentralschwe... 0.0 \n", 635 | "68 https://www.blick.ch/people-tv/international/h... 0.0 \n", 636 | "69 https://www.blick.ch/sport/eishockey/nla/serve... 1.0 \n", 637 | "70 https://www.blick.ch/digital/games/news/colin-... 0.0 \n", 638 | "71 https://www.blick.ch/digital/games/larry-magna... 
0.0 \n", 639 | "\n", 640 | " video-player article-body sda scribble has-image iframe \\\n", 641 | "34 0.0 1.0 0.0 2.0 1.0 1.0 \n", 642 | "36 0.0 1.0 3.0 2.0 0.0 1.0 \n", 643 | "37 0.0 1.0 0.0 2.0 0.0 1.0 \n", 644 | "51 0.0 1.0 0.0 2.0 0.0 1.0 \n", 645 | "52 0.0 1.0 0.0 2.0 0.0 1.0 \n", 646 | "65 0.0 5.0 0.0 2.0 0.0 1.0 \n", 647 | "68 0.0 1.0 0.0 2.0 0.0 1.0 \n", 648 | "69 0.0 1.0 0.0 2.0 0.0 1.0 \n", 649 | "70 0.0 3.0 0.0 2.0 5.0 1.0 \n", 650 | "71 0.0 1.0 0.0 2.0 0.0 1.0 \n", 651 | "\n", 652 | " normal-article-layout sda2 \n", 653 | "34 1.0 2.0 \n", 654 | "36 1.0 0.0 \n", 655 | "37 1.0 0.0 \n", 656 | "51 1.0 0.0 \n", 657 | "52 1.0 0.0 \n", 658 | "65 1.0 0.0 \n", 659 | "68 1.0 0.0 \n", 660 | "69 1.0 0.0 \n", 661 | "70 1.0 0.0 \n", 662 | "71 1.0 0.0 " 663 | ] 664 | }, 665 | "execution_count": 7, 666 | "metadata": {}, 667 | "output_type": "execute_result" 668 | } 669 | ], 670 | "source": [ 671 | "dfMerged = pd.merge(dfTestCSV, dfCrawled, left_on='ID_GA', right_on='ID_Crawl', how='outer')\n", 672 | "dfMerged = dfMerged[dfMerged['ID_Crawl'].notnull()]\n", 673 | "dfMerged = dfMerged[dfMerged['ID_GA'].notnull()]\n", 674 | "dfMerged = dfMerged.drop(\"ID_Crawl\",1)\n", 675 | "dfMerged.head(10)" 676 | ] 677 | }, 678 | { 679 | "cell_type": "code", 680 | "execution_count": 14, 681 | "metadata": {}, 682 | "outputs": [], 683 | "source": [ 684 | "dfFilter = dfMerged" 685 | ] 686 | }, 687 | { 688 | "cell_type": "code", 689 | "execution_count": 15, 690 | "metadata": {}, 691 | "outputs": [ 692 | { 693 | "data": { 694 | "text/html": [ 695 | "
\n", 696 | "\n", 709 | "\n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | " \n", 860 | " \n", 861 | " 
\n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | " \n", 866 | " \n", 867 | " \n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | " \n", 874 | " \n", 875 | " \n", 876 | " \n", 877 | " \n", 878 | " \n", 879 | " \n", 880 | " \n", 881 | " \n", 882 | " \n", 883 | " \n", 884 | " \n", 885 | " \n", 886 | " \n", 887 | " \n", 888 | " \n", 889 | " \n", 890 | " \n", 891 | " \n", 892 | " \n", 893 | " \n", 894 | " \n", 895 | " \n", 896 | " \n", 897 | " \n", 898 | " \n", 899 | " \n", 900 | " \n", 901 | " \n", 902 | " \n", 903 | " \n", 904 | " \n", 905 | " \n", 906 | " \n", 907 | " \n", 908 | " \n", 909 | " \n", 910 | " \n", 911 | " \n", 912 | " \n", 913 | " \n", 914 | " \n", 915 | " \n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | "
ID_GAid countpageviews per idAddressarticle-text-boxvideo-playerarticle-bodysdascribblehas-imageiframenormal-article-layoutsda2
34310001.03.0https://www.blick.ch/people-tv/international/e...0.00.01.00.02.01.01.01.02.0
36310021.01.0https://www.blick.ch/news/ausland/afghanistan-...0.00.01.03.02.00.01.01.00.0
37310041.05.0https://www.blick.ch/news/zwei-schweizer-in-ve...0.00.01.00.02.00.01.01.00.0
51312752.06.0https://www.blick.ch/news/schweiz/zuerich/knas...0.00.01.00.02.00.01.01.00.0
52312771.012.0https://www.blick.ch/news/ausland/steinmeier-p...0.00.01.00.02.00.01.01.00.0
65313881.03.0https://www.blick.ch/news/schweiz/zentralschwe...0.00.05.00.02.00.01.01.00.0
68314251.01.0https://www.blick.ch/people-tv/international/h...0.00.01.00.02.00.01.01.00.0
71314281.01.0https://www.blick.ch/digital/games/larry-magna...0.00.01.00.02.00.01.01.00.0
72314501.02.0https://www.blick.ch/news/schweiz/zuerich/scho...0.00.01.00.02.00.01.01.00.0
74314631.08.0https://www.blick.ch/news/schweiz/zuerich/zum-...0.00.01.00.02.00.01.01.00.0
13262601.01.0https://www.blick.ch/news/schweiz/zentralschwe...0.00.01.00.02.00.01.01.00.0
13563281.01.0https://www.blick.ch/news/beijing-fashion-week...0.00.01.00.02.00.01.01.00.0
\n", 923 | "
" 924 | ], 925 | "text/plain": [ 926 | " ID_GA id count pageviews per id \\\n", 927 | "34 31000 1.0 3.0 \n", 928 | "36 31002 1.0 1.0 \n", 929 | "37 31004 1.0 5.0 \n", 930 | "51 31275 2.0 6.0 \n", 931 | "52 31277 1.0 12.0 \n", 932 | "65 31388 1.0 3.0 \n", 933 | "68 31425 1.0 1.0 \n", 934 | "71 31428 1.0 1.0 \n", 935 | "72 31450 1.0 2.0 \n", 936 | "74 31463 1.0 8.0 \n", 937 | "132 6260 1.0 1.0 \n", 938 | "135 6328 1.0 1.0 \n", 939 | "\n", 940 | " Address article-text-box \\\n", 941 | "34 https://www.blick.ch/people-tv/international/e... 0.0 \n", 942 | "36 https://www.blick.ch/news/ausland/afghanistan-... 0.0 \n", 943 | "37 https://www.blick.ch/news/zwei-schweizer-in-ve... 0.0 \n", 944 | "51 https://www.blick.ch/news/schweiz/zuerich/knas... 0.0 \n", 945 | "52 https://www.blick.ch/news/ausland/steinmeier-p... 0.0 \n", 946 | "65 https://www.blick.ch/news/schweiz/zentralschwe... 0.0 \n", 947 | "68 https://www.blick.ch/people-tv/international/h... 0.0 \n", 948 | "71 https://www.blick.ch/digital/games/larry-magna... 0.0 \n", 949 | "72 https://www.blick.ch/news/schweiz/zuerich/scho... 0.0 \n", 950 | "74 https://www.blick.ch/news/schweiz/zuerich/zum-... 0.0 \n", 951 | "132 https://www.blick.ch/news/schweiz/zentralschwe... 0.0 \n", 952 | "135 https://www.blick.ch/news/beijing-fashion-week... 
0.0 \n", 953 | "\n", 954 | " video-player article-body sda scribble has-image iframe \\\n", 955 | "34 0.0 1.0 0.0 2.0 1.0 1.0 \n", 956 | "36 0.0 1.0 3.0 2.0 0.0 1.0 \n", 957 | "37 0.0 1.0 0.0 2.0 0.0 1.0 \n", 958 | "51 0.0 1.0 0.0 2.0 0.0 1.0 \n", 959 | "52 0.0 1.0 0.0 2.0 0.0 1.0 \n", 960 | "65 0.0 5.0 0.0 2.0 0.0 1.0 \n", 961 | "68 0.0 1.0 0.0 2.0 0.0 1.0 \n", 962 | "71 0.0 1.0 0.0 2.0 0.0 1.0 \n", 963 | "72 0.0 1.0 0.0 2.0 0.0 1.0 \n", 964 | "74 0.0 1.0 0.0 2.0 0.0 1.0 \n", 965 | "132 0.0 1.0 0.0 2.0 0.0 1.0 \n", 966 | "135 0.0 1.0 0.0 2.0 0.0 1.0 \n", 967 | "\n", 968 | " normal-article-layout sda2 \n", 969 | "34 1.0 2.0 \n", 970 | "36 1.0 0.0 \n", 971 | "37 1.0 0.0 \n", 972 | "51 1.0 0.0 \n", 973 | "52 1.0 0.0 \n", 974 | "65 1.0 0.0 \n", 975 | "68 1.0 0.0 \n", 976 | "71 1.0 0.0 \n", 977 | "72 1.0 0.0 \n", 978 | "74 1.0 0.0 \n", 979 | "132 1.0 0.0 \n", 980 | "135 1.0 0.0 " 981 | ] 982 | }, 983 | "execution_count": 15, 984 | "metadata": {}, 985 | "output_type": "execute_result" 986 | } 987 | ], 988 | "source": [ 989 | "dfFilter = dfFilter[(dfFilter['article-text-box'] == 0) &\n", 990 | " (dfFilter['has-image'] < 4)\n", 991 | " ]\n", 992 | "dfFilter" 993 | ] 994 | }, 995 | { 996 | "cell_type": "code", 997 | "execution_count": 16, 998 | "metadata": {}, 999 | "outputs": [ 1000 | { 1001 | "data": { 1002 | "text/html": [ 1003 | "
\n", 1004 | "\n", 1017 | "\n", 1018 | " \n", 1019 | " \n", 1020 | " \n", 1021 | " \n", 1022 | " \n", 1023 | " \n", 1024 | " \n", 1025 | " \n", 1026 | " \n", 1027 | " \n", 1028 | " \n", 1029 | " \n", 1030 | " \n", 1031 | " \n", 1032 | " \n", 1033 | " \n", 1034 | " \n", 1035 | " \n", 1036 | " \n", 1037 | " \n", 1038 | " \n", 1039 | " \n", 1040 | " \n", 1041 | " \n", 1042 | " \n", 1043 | " \n", 1044 | " \n", 1045 | " \n", 1046 | " \n", 1047 | " \n", 1048 | " \n", 1049 | " \n", 1050 | " \n", 1051 | " \n", 1052 | " \n", 1053 | " \n", 1054 | " \n", 1055 | " \n", 1056 | " \n", 1057 | " \n", 1058 | " \n", 1059 | " \n", 1060 | " \n", 1061 | " \n", 1062 | " \n", 1063 | " \n", 1064 | " \n", 1065 | " \n", 1066 | " \n", 1067 | " \n", 1068 | " \n", 1069 | " \n", 1070 | " \n", 1071 | " \n", 1072 | " \n", 1073 | " \n", 1074 | " \n", 1075 | " \n", 1076 | " \n", 1077 | " \n", 1078 | " \n", 1079 | " \n", 1080 | " \n", 1081 | " \n", 1082 | " \n", 1083 | " \n", 1084 | " \n", 1085 | " \n", 1086 | " \n", 1087 | " \n", 1088 | " \n", 1089 | " \n", 1090 | " \n", 1091 | " \n", 1092 | " \n", 1093 | " \n", 1094 | " \n", 1095 | " \n", 1096 | " \n", 1097 | " \n", 1098 | " \n", 1099 | " \n", 1100 | " \n", 1101 | " \n", 1102 | " \n", 1103 | " \n", 1104 | " \n", 1105 | " \n", 1106 | " \n", 1107 | " \n", 1108 | " \n", 1109 | " \n", 1110 | " \n", 1111 | " \n", 1112 | " \n", 1113 | " \n", 1114 | " \n", 1115 | " \n", 1116 | " \n", 1117 | " \n", 1118 | " \n", 1119 | " \n", 1120 | " \n", 1121 | " \n", 1122 | " \n", 1123 | " \n", 1124 | " \n", 1125 | " \n", 1126 | " \n", 1127 | " \n", 1128 | " \n", 1129 | " \n", 1130 | " \n", 1131 | " \n", 1132 | " \n", 1133 | " \n", 1134 | " \n", 1135 | " \n", 1136 | " \n", 1137 | " \n", 1138 | " \n", 1139 | " \n", 1140 | " \n", 1141 | " \n", 1142 | " \n", 1143 | " \n", 1144 | " \n", 1145 | " \n", 1146 | " \n", 1147 | " \n", 1148 | " \n", 1149 | " \n", 1150 | " \n", 1151 | " \n", 1152 | " \n", 1153 | " \n", 1154 | " \n", 1155 | " \n", 1156 | " \n", 1157 | " \n", 1158 | " 
\n", 1159 | " \n", 1160 | " \n", 1161 | " \n", 1162 | " \n", 1163 | " \n", 1164 | " \n", 1165 | " \n", 1166 | " \n", 1167 | " \n", 1168 | " \n", 1169 | " \n", 1170 | " \n", 1171 | " \n", 1172 | " \n", 1173 | " \n", 1174 | " \n", 1175 | " \n", 1176 | " \n", 1177 | " \n", 1178 | " \n", 1179 | " \n", 1180 | " \n", 1181 | " \n", 1182 | "
ID_GAid countpageviews per idAddressarticle-text-boxvideo-playerarticle-bodysdascribblehas-imageiframenormal-article-layoutsda2
36310021.01.0https://www.blick.ch/news/ausland/afghanistan-...0.00.01.03.02.00.01.01.00.0
37310041.05.0https://www.blick.ch/news/zwei-schweizer-in-ve...0.00.01.00.02.00.01.01.00.0
51312752.06.0https://www.blick.ch/news/schweiz/zuerich/knas...0.00.01.00.02.00.01.01.00.0
52312771.012.0https://www.blick.ch/news/ausland/steinmeier-p...0.00.01.00.02.00.01.01.00.0
65313881.03.0https://www.blick.ch/news/schweiz/zentralschwe...0.00.05.00.02.00.01.01.00.0
72314501.02.0https://www.blick.ch/news/schweiz/zuerich/scho...0.00.01.00.02.00.01.01.00.0
74314631.08.0https://www.blick.ch/news/schweiz/zuerich/zum-...0.00.01.00.02.00.01.01.00.0
13262601.01.0https://www.blick.ch/news/schweiz/zentralschwe...0.00.01.00.02.00.01.01.00.0
13563281.01.0https://www.blick.ch/news/beijing-fashion-week...0.00.01.00.02.00.01.01.00.0
\n", 1183 | "
" 1184 | ], 1185 | "text/plain": [ 1186 | " ID_GA id count pageviews per id \\\n", 1187 | "36 31002 1.0 1.0 \n", 1188 | "37 31004 1.0 5.0 \n", 1189 | "51 31275 2.0 6.0 \n", 1190 | "52 31277 1.0 12.0 \n", 1191 | "65 31388 1.0 3.0 \n", 1192 | "72 31450 1.0 2.0 \n", 1193 | "74 31463 1.0 8.0 \n", 1194 | "132 6260 1.0 1.0 \n", 1195 | "135 6328 1.0 1.0 \n", 1196 | "\n", 1197 | " Address article-text-box \\\n", 1198 | "36 https://www.blick.ch/news/ausland/afghanistan-... 0.0 \n", 1199 | "37 https://www.blick.ch/news/zwei-schweizer-in-ve... 0.0 \n", 1200 | "51 https://www.blick.ch/news/schweiz/zuerich/knas... 0.0 \n", 1201 | "52 https://www.blick.ch/news/ausland/steinmeier-p... 0.0 \n", 1202 | "65 https://www.blick.ch/news/schweiz/zentralschwe... 0.0 \n", 1203 | "72 https://www.blick.ch/news/schweiz/zuerich/scho... 0.0 \n", 1204 | "74 https://www.blick.ch/news/schweiz/zuerich/zum-... 0.0 \n", 1205 | "132 https://www.blick.ch/news/schweiz/zentralschwe... 0.0 \n", 1206 | "135 https://www.blick.ch/news/beijing-fashion-week... 
0.0 \n", 1207 | "\n", 1208 | " video-player article-body sda scribble has-image iframe \\\n", 1209 | "36 0.0 1.0 3.0 2.0 0.0 1.0 \n", 1210 | "37 0.0 1.0 0.0 2.0 0.0 1.0 \n", 1211 | "51 0.0 1.0 0.0 2.0 0.0 1.0 \n", 1212 | "52 0.0 1.0 0.0 2.0 0.0 1.0 \n", 1213 | "65 0.0 5.0 0.0 2.0 0.0 1.0 \n", 1214 | "72 0.0 1.0 0.0 2.0 0.0 1.0 \n", 1215 | "74 0.0 1.0 0.0 2.0 0.0 1.0 \n", 1216 | "132 0.0 1.0 0.0 2.0 0.0 1.0 \n", 1217 | "135 0.0 1.0 0.0 2.0 0.0 1.0 \n", 1218 | "\n", 1219 | " normal-article-layout sda2 \n", 1220 | "36 1.0 0.0 \n", 1221 | "37 1.0 0.0 \n", 1222 | "51 1.0 0.0 \n", 1223 | "52 1.0 0.0 \n", 1224 | "65 1.0 0.0 \n", 1225 | "72 1.0 0.0 \n", 1226 | "74 1.0 0.0 \n", 1227 | "132 1.0 0.0 \n", 1228 | "135 1.0 0.0 " 1229 | ] 1230 | }, 1231 | "execution_count": 16, 1232 | "metadata": {}, 1233 | "output_type": "execute_result" 1234 | } 1235 | ], 1236 | "source": [ 1237 | "dfFilter = dfFilter[dfFilter[\"Address\"].str.contains(\"ch/news/|ch/wetter/\")]\n", 1238 | "dfFilter" 1239 | ] 1240 | }, 1241 | { 1242 | "cell_type": "code", 1243 | "execution_count": null, 1244 | "metadata": {}, 1245 | "outputs": [], 1246 | "source": [ 1247 | "dfFilter.to_csv('dfFilter-Test.csv', sep='\\t')\n" 1248 | ] 1249 | }, 1250 | { 1251 | "cell_type": "code", 1252 | "execution_count": null, 1253 | "metadata": {}, 1254 | "outputs": [], 1255 | "source": [] 1256 | } 1257 | ], 1258 | "metadata": { 1259 | "kernelspec": { 1260 | "display_name": "Python 3", 1261 | "language": "python", 1262 | "name": "python3" 1263 | }, 1264 | "language_info": { 1265 | "codemirror_mode": { 1266 | "name": "ipython", 1267 | "version": 3 1268 | }, 1269 | "file_extension": ".py", 1270 | "mimetype": "text/x-python", 1271 | "name": "python", 1272 | "nbconvert_exporter": "python", 1273 | "pygments_lexer": "ipython3", 1274 | "version": "3.7.2" 1275 | } 1276 | }, 1277 | "nbformat": 4, 1278 | "nbformat_minor": 2 1279 | } 1280 | -------------------------------------------------------------------------------- /Analytics Big 
# Notebook "Analytics Big Beach Spring Break.ipynb":
# export Google Analytics Reporting API v4 rows into a pandas DataFrame.
import pandas as pd

SCOPES = ['https://www.googleapis.com/auth/analytics.readonly']
KEY_FILE_LOCATION = './Ga Test-1a2cc84a6ab1.json'
VIEW_ID = '107822800'


def initialize_analyticsreporting():
    """Return an Analytics Reporting API v4 service object.

    Credentials are read from the service-account key file at
    ``KEY_FILE_LOCATION`` and requested for ``SCOPES``.
    """
    # Imported here rather than at module level so that the report-processing
    # helpers below can be imported without the Google client libraries.
    from apiclient.discovery import build
    from oauth2client.service_account import ServiceAccountCredentials

    credentials = ServiceAccountCredentials.from_json_keyfile_name(
        KEY_FILE_LOCATION, SCOPES)
    return build('analyticsreporting', 'v4', credentials=credentials)


def get_report(analytics, pageTokenVar):
    """Request one page of the report and return the decoded response.

    The request asks for ``ga:pageviews`` per ``ga:pagePath`` on view
    ``VIEW_ID`` for the range ``3daysAgo`` .. ``yesterday``, with up to
    10000 rows per page.

    Args:
        analytics: Service object exposing ``reports().batchGet(body=...)``.
        pageTokenVar: Page token to send as ``pageToken``.

    Returns:
        Whatever ``batchGet(...).execute()`` returns for this page.
    """
    return analytics.reports().batchGet(
        body={
            'reportRequests': [
                {
                    'viewId': VIEW_ID,
                    'dateRanges': [{'startDate': '3daysAgo', 'endDate': 'yesterday'}],
                    'metrics': [{'expression': 'ga:pageviews'}],
                    'dimensions': [{'name': 'ga:pagePath'}],
                    'pageSize': 10000,
                    'pageToken': pageTokenVar,
                    'samplingLevel': 'LARGE'
                }]
        }
    ).execute()


def _parse_metric(value):
    """Convert a metric string to ``int`` when possible, else to ``float``."""
    try:
        return int(value)
    except ValueError:
        # Decimal strings such as "12.5" are not valid int literals.
        return float(value)


def handle_report(analytics, pagetoken, rows):
    """Fetch every report page from ``pagetoken`` on and flatten the rows.

    Pages are requested through ``get_report`` until a response carries no
    ``nextPageToken``. After each page the running row count is printed.

    Args:
        analytics: Service object passed through to ``get_report``.
        pagetoken: Token of the first page to request.
        rows: Raw rows collected so far; the argument itself is not modified.

    Returns:
        A list with one dict per row, mapping each dimension header to its
        dimension value and each metric name to its numeric value.

    Raises:
        ValueError: If a metric value is neither an int nor a float string.
    """
    rows = list(rows)  # work on a copy so the caller's list stays untouched

    # Iterate instead of recursing once per page.
    while True:
        report = get_report(analytics, pagetoken).get("reports")[0]
        column_header = report.get('columnHeader', {})
        rows.extend(report.get('data', {}).get('rows', []))
        print("len(rows): " + str(len(rows)))

        pagetoken = report.get('nextPageToken')
        if pagetoken is None:
            break

    # Headers are taken from the last page fetched.
    dimension_headers = column_header.get('dimensions', [])
    metric_headers = column_header.get('metricHeader', {}).get('metricHeaderEntries', [])

    nicerows = []
    for row in rows:
        dic = dict(zip(dimension_headers, row.get('dimensions', [])))
        # One entry per date range; a later range overwrites an earlier one
        # because both are stored under the same metric name.
        for date_range in row.get('metrics', []):
            for metric, value in zip(metric_headers, date_range.get('values', [])):
                dic[metric.get('name')] = _parse_metric(value)
        nicerows.append(dic)
    return nicerows


def main():
    """Download the report and publish it as the global ``dfanalytics``.

    Returns:
        The DataFrame that was stored in ``dfanalytics``.
    """
    global dfanalytics
    # Reset first so a failed request does not leave an earlier frame in place.
    dfanalytics = []

    analytics = initialize_analyticsreporting()
    dfanalytics = pd.DataFrame(handle_report(analytics, '0', []))
    return dfanalytics


if __name__ == '__main__':
    main()
\n", 124 | "\n", 137 | "\n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | "
ga:pagePathga:pageviews
0/16
1/?s=Corona+1
2/en/2
3/en/program/1
4/kontakt/2
5/line-up/7
6/pakete/3
7/partyboot/2
8/tickets/6
\n", 193 | "
" 194 | ], 195 | "text/plain": [ 196 | " ga:pagePath ga:pageviews\n", 197 | "0 / 16\n", 198 | "1 /?s=Corona+ 1\n", 199 | "2 /en/ 2\n", 200 | "3 /en/program/ 1\n", 201 | "4 /kontakt/ 2\n", 202 | "5 /line-up/ 7\n", 203 | "6 /pakete/ 3\n", 204 | "7 /partyboot/ 2\n", 205 | "8 /tickets/ 6" 206 | ] 207 | }, 208 | "execution_count": 16, 209 | "metadata": {}, 210 | "output_type": "execute_result" 211 | } 212 | ], 213 | "source": [ 214 | "dfanalytics" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": null, 220 | "metadata": {}, 221 | "outputs": [], 222 | "source": [] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": null, 227 | "metadata": {}, 228 | "outputs": [], 229 | "source": [] 230 | } 231 | ], 232 | "metadata": { 233 | "kernelspec": { 234 | "display_name": "Python 3", 235 | "language": "python", 236 | "name": "python3" 237 | }, 238 | "language_info": { 239 | "codemirror_mode": { 240 | "name": "ipython", 241 | "version": 3 242 | }, 243 | "file_extension": ".py", 244 | "mimetype": "text/x-python", 245 | "name": "python", 246 | "nbconvert_exporter": "python", 247 | "pygments_lexer": "ipython3", 248 | "version": "3.7.2" 249 | } 250 | }, 251 | "nbformat": 4, 252 | "nbformat_minor": 2 253 | } 254 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Python-Pandas-SEO-Videos 2 | -------------------------------------------------------------------------------- /Read News XML Sitemap with Python + Pandas.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 36, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "data": { 10 | "text/html": [ 11 | "
\n", 12 | "\n", 25 | "\n", 26 | " \n", 27 | " \n", 28 | " \n", 29 | " \n", 30 | " \n", 31 | " \n", 32 | " \n", 33 | " \n", 34 | " \n", 35 | " \n", 36 | " \n", 37 | " \n", 38 | " \n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | 
" \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 
| " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | "
locnews:titlenews:publication_date
0https://www.nzz.ch/feuilleton/in-zeiten-des-co...Das virale Feuilleton-Tagebuch macht keine Ost...2020-04-11T17:29:16+02:00
1https://www.nzz.ch/international/coronavirus-i...Bundespräsident ruft zu Geduld und Disziplin a...2020-04-11T17:23:24+02:00
2https://www.nzz.ch/zuerich/coronavirus-in-zuer...Die Schliessung eines Spargelstands im Zürcher...2020-04-11T16:25:00+02:00
3https://www.nzz.ch/sport/verfahren-gegen-ex-fi...Verfahren gegen Ex-Fifa-Präsident Sepp Blatter...2020-04-11T15:25:36+02:00
4https://www.nzz.ch/schweiz/coronavirus-in-der-...Zahl der Neuinfizierten geht weiterhin zurück,...2020-04-11T15:18:27+02:00
5https://www.nzz.ch/panorama/die-wichtigsten-gr...2700 Menschen sind bisher in Deutschland an Co...2020-04-11T13:12:33+02:00
6https://www.nzz.ch/international/coronavirus-w...Die USA verzeichnen mehr als 2000 Todesfälle i...2020-04-11T13:05:51+02:00
7https://www.nzz.ch/panorama/radioaktiv-belaste...Waldbrände bei Tschernobyl weiter nicht unter ...2020-04-11T12:39:31+02:00
8https://www.nzz.ch/zuerich/zuerich-philosophis...Philosophie per Telefon während der Corona-Kri...2020-04-11T12:00:00+02:00
9https://www.nzz.ch/panorama/die-wichtigsten-gr...Die Schweiz zählt 1000 Tote, in Genf wächst pr...2020-04-11T11:31:00+02:00
10https://www.nzz.ch/panorama/passant-findet-in-...Passant findet in Köln Kriegsbombe am Rhein – ...2020-04-11T11:26:07+02:00
11https://www.nzz.ch/zuerich/coronavirus-in-zuer...Die Zürcher Regierung ist in der Krise mächtig...2020-04-11T10:30:00+02:00
12https://www.nzz.ch/briefing/feiertagsbriefing-...Feiertagsbriefing2020-04-11T08:03:19+02:00
13https://www.nzz.ch/briefing/feiertagsbriefing-...Feiertagsbriefing2020-04-11T07:56:12+02:00
14https://www.nzz.ch/panorama/umweltschuetzer-be...Umweltschützer befürchten erneut schwere Waldb...2020-04-11T07:44:32+02:00
15https://www.nzz.ch/international/pentagon-korr...Die USA versprechen bis zu zehn Millionen Doll...2020-04-11T07:13:32+02:00
16https://www.nzz.ch/wirtschaft/coronavirus-bill...Bill Gates – der Mann, der die Pandemie kommen...2020-04-11T07:00:00+02:00
17https://www.nzz.ch/sport/korruption-bei-der-wm...Korruption bei der WM-Doppelvergabe 2018 und 2...2020-04-11T06:30:00+02:00
18https://www.nzz.ch/schweiz/coronavirus-gebaut-...Gebaut wird überall ungefähr gleich. Warum als...2020-04-11T06:00:00+02:00
19https://www.nzz.ch/reisen/reisen-im-kopf-zu-be...Tränen auf Aurora2020-04-11T06:00:00+02:00
20https://www.nzz.ch/reisen/lagerkoller-in-den-e...«Und dann sitzt du fest im Basislager, wartest...2020-04-11T06:00:00+02:00
21https://www.nzz.ch/meinung/coronavirus-in-deut...Die Politik ist nun endgültig durchpädagogisie...2020-04-11T06:00:00+02:00
22https://www.nzz.ch/wirtschaft/trump-putin-und-...Trump, Putin und der saudische Kronprinz bilde...2020-04-11T05:36:13+02:00
23https://www.nzz.ch/wirtschaft/coronavirus-schw...Die Schweizer Banken bleiben krisenresistent2020-04-11T05:30:00+02:00
24https://www.nzz.ch/sport/das-muessen-sie-frau-...«Das müssen Sie Frau Wang fragen» – Sky Sun we...2020-04-11T05:30:00+02:00
25https://www.nzz.ch/finanzen/aktienmaerkte-so-e...So entwickeln sich Aktienmärkte nach wasserfal...2020-04-11T05:30:00+02:00
26https://www.nzz.ch/wissenschaft/coronavirus-fu...Für die meisten Allergiker bedeutet das neue C...2020-04-11T05:30:00+02:00
27https://www.nzz.ch/feuilleton/bei-t-c-boyle-is...Die Natur ist am Ende immer die Stärkere –T. C...2020-04-11T05:30:00+02:00
28https://www.nzz.ch/feuilleton/wann-ist-jesus-a...Ohne Ostern gäbe es vielleicht keinen Computer...2020-04-11T05:30:00+02:00
29https://www.nzz.ch/feuilleton/die-corona-krise...Die Corona-Krise offenbart auch eine Krise der...2020-04-11T05:30:00+02:00
............
82https://www.nzz.ch/panorama/saengerin-rihanna-...Sängerin Rihanna und Twitter-Chef Dorsey spend...2020-04-10T07:15:40+02:00
83https://www.nzz.ch/international/uno-sicherhei...Uno-Sicherheitsrat trifft sich virtuell: Guter...2020-04-10T06:37:43+02:00
84https://www.nzz.ch/panorama/massensterben-von-...Massensterben von Zugvögeln in Griechenland du...2020-04-10T06:00:24+02:00
85https://www.nzz.ch/zuerich/coronavirus-in-zuer...Ostern in der Isolation: 11 Ideen, damit einem...2020-04-10T06:00:00+02:00
86https://www.nzz.ch/international/pakistan-in-d...Ziaullah Khan produziert in Pakistan Waffen fü...2020-04-10T06:00:00+02:00
87https://www.nzz.ch/international/soldaten-toet...Terroreinsatz im Tschad: Soldaten töten rund 1...2020-04-10T05:49:27+02:00
88https://www.nzz.ch/fotografie/die-corona-krise...Die Corona-Krise in Bildern: vier Fotografen, ...2020-04-10T05:30:00+02:00
89https://www.nzz.ch/international/coronavirus-u...Auffällig viele Schwarze unter den Covid-19-Op...2020-04-10T05:30:00+02:00
90https://www.nzz.ch/gesellschaft/corona-krise-j...Er ist der Albtraum jeder Schwiegermutter2020-04-10T05:30:00+02:00
91https://www.nzz.ch/wissenschaft/coronavirus-wa...Die verseuchte Herzdame: Wie Viren von einem K...2020-04-10T05:30:00+02:00
92https://www.nzz.ch/feuilleton/die-landschaft-v...Die Landschaft, von der Passstrasse her gesehe...2020-04-10T05:30:00+02:00
93https://www.nzz.ch/feuilleton/trotz-coronaviru...In Berlin dürfen Bücher noch verkauft werden, ...2020-04-10T05:30:00+02:00
94https://www.nzz.ch/feuilleton/postcorona-studi...Post-Corona-Studien: Das Virus wäre nichts ohn...2020-04-10T05:30:00+02:00
95https://www.nzz.ch/technologie/wie-findet-die-...Wie findet die Swisscom unsere Standortdaten h...2020-04-10T05:30:00+02:00
96https://www.nzz.ch/panorama/coronavirus-norweg...Wegen der Corona-Krise dürfen die Norweger nic...2020-04-10T05:30:00+02:00
97https://www.nzz.ch/international/suedkoreaner-...Südkoreaner tauschen Daten gegen Bewegungsfrei...2020-04-10T05:30:00+02:00
98https://www.nzz.ch/gesellschaft/corona-krise-z...Einer der reichsten Spanier lässt nun Wegwerfk...2020-04-10T05:30:00+02:00
99https://www.nzz.ch/feuilleton/verschon-uns-got...«Verschon uns, Gott, mit Strafen»: Vielleicht ...2020-04-10T05:30:00+02:00
100https://www.nzz.ch/sport/jakub-blaszczykowski-...Jakub Blaszczykowski erfreut die Fussball-Roma...2020-04-10T05:30:00+02:00
101https://www.nzz.ch/feuilleton/corona-und-kunst...«Den Frühling können sie nicht absagen»2020-04-10T05:30:00+02:00
102https://www.nzz.ch/feuilleton/coronavirus-zuku...Das Leben im ewigen Idyll ist zu Ende2020-04-10T05:30:00+02:00
103https://www.nzz.ch/panorama/us-aktivistin-fuer...US-Aktivistin für Frauenrechte und Homo-Ehe mi...2020-04-10T05:01:53+02:00
104https://www.nzz.ch/wissenschaft/coronavirus-di...Diese neuartigen Impfstoffe sollen das Coronav...2020-04-10T05:00:00+02:00
105https://www.nzz.ch/zuerich/coronavirus-in-zuer...Den Startups trocknet in der Corona-Krise die ...2020-04-10T05:00:00+02:00
106https://www.nzz.ch/panorama/nach-kanu-unfall-d...Nach Kanu-Unfall der Kennedy-Enkelin: Taucher ...2020-04-10T03:54:05+02:00
107https://www.nzz.ch/wirtschaft/eu-finanzministe...Die EU-Finanzminister einigen sich auf ein 540...2020-04-10T01:57:21+02:00
108https://www.nzz.ch/fotografie/bilder-des-tages...Bilder des Tages2020-04-10T00:11:23+02:00
109https://www.nzz.ch/wirtschaft/noch-einmal-66-m...Weitere 6,6 Millionen Arbeitslose in den USA -...2020-04-09T23:43:41+02:00
110https://www.nzz.ch/video/nzz-format/natur-pur-...Natur pur: Uruguays Rinder ohne Antibiotika un...2020-04-09T23:00:00+02:00
111https://www.nzz.ch/meinung/zentralbanken-finan...Zentralbanken finanzieren Staatsausgaben: Das ...2020-04-09T20:24:24+02:00
\n", 403 | "

112 rows × 3 columns

\n", 404 | "
" 405 | ], 406 | "text/plain": [ 407 | " loc \\\n", 408 | "0 https://www.nzz.ch/feuilleton/in-zeiten-des-co... \n", 409 | "1 https://www.nzz.ch/international/coronavirus-i... \n", 410 | "2 https://www.nzz.ch/zuerich/coronavirus-in-zuer... \n", 411 | "3 https://www.nzz.ch/sport/verfahren-gegen-ex-fi... \n", 412 | "4 https://www.nzz.ch/schweiz/coronavirus-in-der-... \n", 413 | "5 https://www.nzz.ch/panorama/die-wichtigsten-gr... \n", 414 | "6 https://www.nzz.ch/international/coronavirus-w... \n", 415 | "7 https://www.nzz.ch/panorama/radioaktiv-belaste... \n", 416 | "8 https://www.nzz.ch/zuerich/zuerich-philosophis... \n", 417 | "9 https://www.nzz.ch/panorama/die-wichtigsten-gr... \n", 418 | "10 https://www.nzz.ch/panorama/passant-findet-in-... \n", 419 | "11 https://www.nzz.ch/zuerich/coronavirus-in-zuer... \n", 420 | "12 https://www.nzz.ch/briefing/feiertagsbriefing-... \n", 421 | "13 https://www.nzz.ch/briefing/feiertagsbriefing-... \n", 422 | "14 https://www.nzz.ch/panorama/umweltschuetzer-be... \n", 423 | "15 https://www.nzz.ch/international/pentagon-korr... \n", 424 | "16 https://www.nzz.ch/wirtschaft/coronavirus-bill... \n", 425 | "17 https://www.nzz.ch/sport/korruption-bei-der-wm... \n", 426 | "18 https://www.nzz.ch/schweiz/coronavirus-gebaut-... \n", 427 | "19 https://www.nzz.ch/reisen/reisen-im-kopf-zu-be... \n", 428 | "20 https://www.nzz.ch/reisen/lagerkoller-in-den-e... \n", 429 | "21 https://www.nzz.ch/meinung/coronavirus-in-deut... \n", 430 | "22 https://www.nzz.ch/wirtschaft/trump-putin-und-... \n", 431 | "23 https://www.nzz.ch/wirtschaft/coronavirus-schw... \n", 432 | "24 https://www.nzz.ch/sport/das-muessen-sie-frau-... \n", 433 | "25 https://www.nzz.ch/finanzen/aktienmaerkte-so-e... \n", 434 | "26 https://www.nzz.ch/wissenschaft/coronavirus-fu... \n", 435 | "27 https://www.nzz.ch/feuilleton/bei-t-c-boyle-is... \n", 436 | "28 https://www.nzz.ch/feuilleton/wann-ist-jesus-a... \n", 437 | "29 https://www.nzz.ch/feuilleton/die-corona-krise... 
\n", 438 | ".. ... \n", 439 | "82 https://www.nzz.ch/panorama/saengerin-rihanna-... \n", 440 | "83 https://www.nzz.ch/international/uno-sicherhei... \n", 441 | "84 https://www.nzz.ch/panorama/massensterben-von-... \n", 442 | "85 https://www.nzz.ch/zuerich/coronavirus-in-zuer... \n", 443 | "86 https://www.nzz.ch/international/pakistan-in-d... \n", 444 | "87 https://www.nzz.ch/international/soldaten-toet... \n", 445 | "88 https://www.nzz.ch/fotografie/die-corona-krise... \n", 446 | "89 https://www.nzz.ch/international/coronavirus-u... \n", 447 | "90 https://www.nzz.ch/gesellschaft/corona-krise-j... \n", 448 | "91 https://www.nzz.ch/wissenschaft/coronavirus-wa... \n", 449 | "92 https://www.nzz.ch/feuilleton/die-landschaft-v... \n", 450 | "93 https://www.nzz.ch/feuilleton/trotz-coronaviru... \n", 451 | "94 https://www.nzz.ch/feuilleton/postcorona-studi... \n", 452 | "95 https://www.nzz.ch/technologie/wie-findet-die-... \n", 453 | "96 https://www.nzz.ch/panorama/coronavirus-norweg... \n", 454 | "97 https://www.nzz.ch/international/suedkoreaner-... \n", 455 | "98 https://www.nzz.ch/gesellschaft/corona-krise-z... \n", 456 | "99 https://www.nzz.ch/feuilleton/verschon-uns-got... \n", 457 | "100 https://www.nzz.ch/sport/jakub-blaszczykowski-... \n", 458 | "101 https://www.nzz.ch/feuilleton/corona-und-kunst... \n", 459 | "102 https://www.nzz.ch/feuilleton/coronavirus-zuku... \n", 460 | "103 https://www.nzz.ch/panorama/us-aktivistin-fuer... \n", 461 | "104 https://www.nzz.ch/wissenschaft/coronavirus-di... \n", 462 | "105 https://www.nzz.ch/zuerich/coronavirus-in-zuer... \n", 463 | "106 https://www.nzz.ch/panorama/nach-kanu-unfall-d... \n", 464 | "107 https://www.nzz.ch/wirtschaft/eu-finanzministe... \n", 465 | "108 https://www.nzz.ch/fotografie/bilder-des-tages... \n", 466 | "109 https://www.nzz.ch/wirtschaft/noch-einmal-66-m... \n", 467 | "110 https://www.nzz.ch/video/nzz-format/natur-pur-... \n", 468 | "111 https://www.nzz.ch/meinung/zentralbanken-finan... 
\n", 469 | "\n", 470 | " news:title \\\n", 471 | "0 Das virale Feuilleton-Tagebuch macht keine Ost... \n", 472 | "1 Bundespräsident ruft zu Geduld und Disziplin a... \n", 473 | "2 Die Schliessung eines Spargelstands im Zürcher... \n", 474 | "3 Verfahren gegen Ex-Fifa-Präsident Sepp Blatter... \n", 475 | "4 Zahl der Neuinfizierten geht weiterhin zurück,... \n", 476 | "5 2700 Menschen sind bisher in Deutschland an Co... \n", 477 | "6 Die USA verzeichnen mehr als 2000 Todesfälle i... \n", 478 | "7 Waldbrände bei Tschernobyl weiter nicht unter ... \n", 479 | "8 Philosophie per Telefon während der Corona-Kri... \n", 480 | "9 Die Schweiz zählt 1000 Tote, in Genf wächst pr... \n", 481 | "10 Passant findet in Köln Kriegsbombe am Rhein – ... \n", 482 | "11 Die Zürcher Regierung ist in der Krise mächtig... \n", 483 | "12 Feiertagsbriefing \n", 484 | "13 Feiertagsbriefing \n", 485 | "14 Umweltschützer befürchten erneut schwere Waldb... \n", 486 | "15 Die USA versprechen bis zu zehn Millionen Doll... \n", 487 | "16 Bill Gates – der Mann, der die Pandemie kommen... \n", 488 | "17 Korruption bei der WM-Doppelvergabe 2018 und 2... \n", 489 | "18 Gebaut wird überall ungefähr gleich. Warum als... \n", 490 | "19 Tränen auf Aurora \n", 491 | "20 «Und dann sitzt du fest im Basislager, wartest... \n", 492 | "21 Die Politik ist nun endgültig durchpädagogisie... \n", 493 | "22 Trump, Putin und der saudische Kronprinz bilde... \n", 494 | "23 Die Schweizer Banken bleiben krisenresistent \n", 495 | "24 «Das müssen Sie Frau Wang fragen» – Sky Sun we... \n", 496 | "25 So entwickeln sich Aktienmärkte nach wasserfal... \n", 497 | "26 Für die meisten Allergiker bedeutet das neue C... \n", 498 | "27 Die Natur ist am Ende immer die Stärkere –T. C... \n", 499 | "28 Ohne Ostern gäbe es vielleicht keinen Computer... \n", 500 | "29 Die Corona-Krise offenbart auch eine Krise der... \n", 501 | ".. ... \n", 502 | "82 Sängerin Rihanna und Twitter-Chef Dorsey spend... 
\n", 503 | "83 Uno-Sicherheitsrat trifft sich virtuell: Guter... \n", 504 | "84 Massensterben von Zugvögeln in Griechenland du... \n", 505 | "85 Ostern in der Isolation: 11 Ideen, damit einem... \n", 506 | "86 Ziaullah Khan produziert in Pakistan Waffen fü... \n", 507 | "87 Terroreinsatz im Tschad: Soldaten töten rund 1... \n", 508 | "88 Die Corona-Krise in Bildern: vier Fotografen, ... \n", 509 | "89 Auffällig viele Schwarze unter den Covid-19-Op... \n", 510 | "90 Er ist der Albtraum jeder Schwiegermutter \n", 511 | "91 Die verseuchte Herzdame: Wie Viren von einem K... \n", 512 | "92 Die Landschaft, von der Passstrasse her gesehe... \n", 513 | "93 In Berlin dürfen Bücher noch verkauft werden, ... \n", 514 | "94 Post-Corona-Studien: Das Virus wäre nichts ohn... \n", 515 | "95 Wie findet die Swisscom unsere Standortdaten h... \n", 516 | "96 Wegen der Corona-Krise dürfen die Norweger nic... \n", 517 | "97 Südkoreaner tauschen Daten gegen Bewegungsfrei... \n", 518 | "98 Einer der reichsten Spanier lässt nun Wegwerfk... \n", 519 | "99 «Verschon uns, Gott, mit Strafen»: Vielleicht ... \n", 520 | "100 Jakub Blaszczykowski erfreut die Fussball-Roma... \n", 521 | "101 «Den Frühling können sie nicht absagen» \n", 522 | "102 Das Leben im ewigen Idyll ist zu Ende \n", 523 | "103 US-Aktivistin für Frauenrechte und Homo-Ehe mi... \n", 524 | "104 Diese neuartigen Impfstoffe sollen das Coronav... \n", 525 | "105 Den Startups trocknet in der Corona-Krise die ... \n", 526 | "106 Nach Kanu-Unfall der Kennedy-Enkelin: Taucher ... \n", 527 | "107 Die EU-Finanzminister einigen sich auf ein 540... \n", 528 | "108 Bilder des Tages \n", 529 | "109 Weitere 6,6 Millionen Arbeitslose in den USA -... \n", 530 | "110 Natur pur: Uruguays Rinder ohne Antibiotika un... \n", 531 | "111 Zentralbanken finanzieren Staatsausgaben: Das ... 
\n", 532 | "\n", 533 | " news:publication_date \n", 534 | "0 2020-04-11T17:29:16+02:00 \n", 535 | "1 2020-04-11T17:23:24+02:00 \n", 536 | "2 2020-04-11T16:25:00+02:00 \n", 537 | "3 2020-04-11T15:25:36+02:00 \n", 538 | "4 2020-04-11T15:18:27+02:00 \n", 539 | "5 2020-04-11T13:12:33+02:00 \n", 540 | "6 2020-04-11T13:05:51+02:00 \n", 541 | "7 2020-04-11T12:39:31+02:00 \n", 542 | "8 2020-04-11T12:00:00+02:00 \n", 543 | "9 2020-04-11T11:31:00+02:00 \n", 544 | "10 2020-04-11T11:26:07+02:00 \n", 545 | "11 2020-04-11T10:30:00+02:00 \n", 546 | "12 2020-04-11T08:03:19+02:00 \n", 547 | "13 2020-04-11T07:56:12+02:00 \n", 548 | "14 2020-04-11T07:44:32+02:00 \n", 549 | "15 2020-04-11T07:13:32+02:00 \n", 550 | "16 2020-04-11T07:00:00+02:00 \n", 551 | "17 2020-04-11T06:30:00+02:00 \n", 552 | "18 2020-04-11T06:00:00+02:00 \n", 553 | "19 2020-04-11T06:00:00+02:00 \n", 554 | "20 2020-04-11T06:00:00+02:00 \n", 555 | "21 2020-04-11T06:00:00+02:00 \n", 556 | "22 2020-04-11T05:36:13+02:00 \n", 557 | "23 2020-04-11T05:30:00+02:00 \n", 558 | "24 2020-04-11T05:30:00+02:00 \n", 559 | "25 2020-04-11T05:30:00+02:00 \n", 560 | "26 2020-04-11T05:30:00+02:00 \n", 561 | "27 2020-04-11T05:30:00+02:00 \n", 562 | "28 2020-04-11T05:30:00+02:00 \n", 563 | "29 2020-04-11T05:30:00+02:00 \n", 564 | ".. ... 
# Notebook "Read News XML Sitemap with Python + Pandas.ipynb":
# load a news XML sitemap into a DataFrame with one row per <url> entry.
import requests
import pandas as pd
import xmltodict

# No trailing whitespace in the URL: the original literal ended in a tab.
url = "https://www.nzz.ch/sitemap/news0.xml"
res = requests.get(url, timeout=30)  # do not hang forever on a stalled server
res.raise_for_status()  # fail here instead of parsing an HTTP error page
raw = xmltodict.parse(res.text)

# xmltodict yields a single mapping instead of a list when the sitemap
# contains only one <url> element; normalise so the loop below always works.
entries = raw["urlset"]["url"]
if isinstance(entries, dict):
    entries = [entries]

data = [
    [r["loc"], r["news:news"]["news:title"], r["news:news"]["news:publication_date"]]
    for r in entries
]
df = pd.DataFrame(data, columns=["loc", "news:title", "news:publication_date"])
df
"execution_count": null, 621 | "metadata": {}, 622 | "outputs": [], 623 | "source": [] 624 | }, 625 | { 626 | "cell_type": "code", 627 | "execution_count": null, 628 | "metadata": {}, 629 | "outputs": [], 630 | "source": [] 631 | } 632 | ], 633 | "metadata": { 634 | "kernelspec": { 635 | "display_name": "Python 3", 636 | "language": "python", 637 | "name": "python3" 638 | }, 639 | "language_info": { 640 | "codemirror_mode": { 641 | "name": "ipython", 642 | "version": 3 643 | }, 644 | "file_extension": ".py", 645 | "mimetype": "text/x-python", 646 | "name": "python", 647 | "nbconvert_exporter": "python", 648 | "pygments_lexer": "ipython3", 649 | "version": "3.7.2" 650 | } 651 | }, 652 | "nbformat": 4, 653 | "nbformat_minor": 2 654 | } 655 | --------------------------------------------------------------------------------