├── Anlage.xlsx ├── Anlage_wo_pwd.pdf ├── Anlage_wo_pwd.tsv ├── CNAME ├── Fixed_Anlage_wo_pwd.csv ├── Fixed_Anlage_wo_pwd_Encoded.csv ├── Quick_Exploration.ipynb ├── README.md ├── fix_postcode_city_mix.ipynb ├── liegenschaften.png └── tabula-Anlage_wo_pwd.csv /Anlage.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bundesAPI/bundesanstalt-immobilienaufgaben-liegenschaften/3794a07ab888ccee5cbe46a91ae11d2bb4dce990/Anlage.xlsx -------------------------------------------------------------------------------- /Anlage_wo_pwd.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bundesAPI/bundesanstalt-immobilienaufgaben-liegenschaften/3794a07ab888ccee5cbe46a91ae11d2bb4dce990/Anlage_wo_pwd.pdf -------------------------------------------------------------------------------- /CNAME: -------------------------------------------------------------------------------- 1 | liegenschaften.bima.bund.dev -------------------------------------------------------------------------------- /Quick_Exploration.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pandas as pd" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "outputs": [], 18 | "source": [ 19 | "data_types ={\n", 20 | " \"id\" : int,\n", 21 | " \"postcode\" : pd.Int64Dtype()\t,\n", 22 | " \"city\": str,\n", 23 | " \"street\" : str,\n", 24 | " \"area\" : \"category\",\n", 25 | " \"user\" : \"category\",\n", 26 | " \"size\": float\n", 27 | "}" 28 | ], 29 | "metadata": { 30 | "collapsed": false, 31 | "pycharm": { 32 | "name": "#%%\n" 33 | } 34 | } 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 3, 39 | "outputs": [], 40 | "source": [ 41 | "df 
= pd.read_csv(\"Fixed_Anlage_wo_pwd.csv\",dtype=data_types)" 42 | ], 43 | "metadata": { 44 | "collapsed": false, 45 | "pycharm": { 46 | "name": "#%%\n" 47 | } 48 | } 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 4, 53 | "outputs": [ 54 | { 55 | "data": { 56 | "text/plain": " id postcode city \\\n29350 30025 80637 München, Landeshauptstadt \n135888 112387 59425 Unna \n201335 -1 Unbekannt \n106899 93278 54292 Trier \n223542 170218 97762 Hammelburg \n199098 154103 56828 Alflen \n159823 -1 Unbekannt \n51347 52528 79540 Lörrach, Stadt \n29944 30633 89075 Ulm, Universitätsstadt \n102283 -1 Unbekannt \n\n street area \\\n29350 Hedwig - Dransfeld - Allee 35 Wohnen \n135888 Kamener Str. 91 - 93 Dienstliegenschaften Bundeswehr \n201335 Dienstliegenschaften Bundeswehr \n106899 Thebaeerstr. 55 Wohnen \n223542 Rommelstr. 31 Unbekannt \n199098 N.N. Dienstliegenschaften Bundeswehr \n159823 Dienstliegenschaften Bundeswehr \n51347 Maienbühlweg 9 Wohnen \n29944 Werastraße 12 Wohnen \n102283 Unbekannt Unbekannt \n\n user size \n29350 Unbekannt 32.70 \n135888 Unbekannt 0.00 \n201335 BMVg.-Unterbringung NaN \n106899 Unbekannt 9.92 \n223542 Unbekannt 609.10 \n199098 BMVg.-Unterbringung 30.00 \n159823 Unbekannt NaN \n51347 Unbekannt 103.90 \n29944 Unbekannt 44.20 \n102283 Unbekannt NaN ", 57 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
idpostcodecitystreetareausersize
293503002580637München, LandeshauptstadtHedwig - Dransfeld - Allee 35WohnenUnbekannt32.70
13588811238759425UnnaKamener Str. 91 - 93 DienstliegenschaftenBundeswehrUnbekannt0.00
201335-1<NA>UnbekanntDienstliegenschaftenBundeswehrBMVg.-UnterbringungNaN
1068999327854292TrierThebaeerstr. 55WohnenUnbekannt9.92
22354217021897762HammelburgRommelstr. 31UnbekanntUnbekannt609.10
19909815410356828AlflenN.N. DienstliegenschaftenBundeswehrBMVg.-Unterbringung30.00
159823-1<NA>UnbekanntDienstliegenschaftenBundeswehrUnbekanntNaN
513475252879540Lörrach, StadtMaienbühlweg 9WohnenUnbekannt103.90
299443063389075Ulm, UniversitätsstadtWerastraße 12WohnenUnbekannt44.20
102283-1<NA>UnbekanntUnbekanntUnbekanntUnbekanntNaN
\n
" 58 | }, 59 | "execution_count": 4, 60 | "metadata": {}, 61 | "output_type": "execute_result" 62 | } 63 | ], 64 | "source": [ 65 | "df.sample(10)" 66 | ], 67 | "metadata": { 68 | "collapsed": false, 69 | "pycharm": { 70 | "name": "#%%\n" 71 | } 72 | } 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "source": [ 77 | "# Which \"user\" has the most properties?" 78 | ], 79 | "metadata": { 80 | "collapsed": false 81 | } 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 5, 86 | "outputs": [ 87 | { 88 | "data": { 89 | "text/plain": " id postcode city street area size\nuser \nUnbekannt 155391 3444 4518 26901 7 27759\nBMVg.-Unterbringung 14702 584 518 952 3 9221\nBundesanstalt THW 1364 592 575 657 4 974\nGeneralzolldirektion 1275 378 334 667 4 844\nBundespolizei 898 198 179 263 2 729\n... ... ... ... ... ... ...\nSonst. Bewilligungen 1 0 1 1 1 0\nEnergie u. Nachhalti 1 1 1 1 1 1\nBilat. Zusammenarb. 1 1 1 1 1 1\nBSG 1 0 1 1 1 0\nGoethe-Institut 1 0 1 1 1 0\n\n[103 rows x 6 columns]", 90 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
idpostcodecitystreetareasize
user
Unbekannt1553913444451826901727759
BMVg.-Unterbringung1470258451895239221
Bundesanstalt THW13645925756574974
Generalzolldirektion12753783346674844
Bundespolizei8981981792632729
.....................
Sonst. Bewilligungen101110
Energie u. Nachhalti111111
Bilat. Zusammenarb.111111
BSG101110
Goethe-Institut101110
\n

103 rows × 6 columns

\n
" 91 | }, 92 | "execution_count": 5, 93 | "metadata": {}, 94 | "output_type": "execute_result" 95 | } 96 | ], 97 | "source": [ 98 | "df.groupby(\"user\").nunique().sort_values(by=\"id\",ascending=False)" 99 | ], 100 | "metadata": { 101 | "collapsed": false, 102 | "pycharm": { 103 | "name": "#%%\n" 104 | } 105 | } 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "source": [ 110 | "# The biggest real estate categories" 111 | ], 112 | "metadata": { 113 | "collapsed": false 114 | } 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 6, 119 | "outputs": [ 120 | { 121 | "data": { 122 | "text/plain": " id postcode city street user size\narea \nWohnen 52772 768 748 11678 4 3630\nUnbekannt 35061 591 528 1000 100 11146\nGewerbe/sonst. Nutzung 33121 1575 1882 9024 7 7465\n Bundeswehr 28498 587 525 976 7 9293\nDienstliegenschaften 18938 1725 1498 4111 100 9474\nBundesforst 7158 1377 1835 1080 2 5180\nVerwaltungsaufgaben 594 241 241 240 2 370", 123 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
idpostcodecitystreetusersize
area
Wohnen527727687481167843630
Unbekannt35061591528100010011146
Gewerbe/sonst. Nutzung3312115751882902477465
Bundeswehr2849858752597679293
Dienstliegenschaften189381725149841111009474
Bundesforst715813771835108025180
Verwaltungsaufgaben5942412412402370
\n
" 124 | }, 125 | "execution_count": 6, 126 | "metadata": {}, 127 | "output_type": "execute_result" 128 | } 129 | ], 130 | "source": [ 131 | "df.groupby(\"area\").nunique().sort_values(by=\"id\",ascending=False)" 132 | ], 133 | "metadata": { 134 | "collapsed": false, 135 | "pycharm": { 136 | "name": "#%%\n" 137 | } 138 | } 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "source": [ 143 | "# Managed property per category; sorted by size" 144 | ], 145 | "metadata": { 146 | "collapsed": false 147 | } 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": 7, 152 | "outputs": [ 153 | { 154 | "data": { 155 | "text/plain": " size\narea \nBundesforst 1.250452e+09\nUnbekannt 1.064645e+09\n Bundeswehr 1.053773e+09\nGewerbe/sonst. Nutzung 9.878014e+07\nDienstliegenschaften 6.608605e+07\nVerwaltungsaufgaben 1.499094e+07\nWohnen 3.345593e+06", 156 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
size
area
Bundesforst1.250452e+09
Unbekannt1.064645e+09
Bundeswehr1.053773e+09
Gewerbe/sonst. Nutzung9.878014e+07
Dienstliegenschaften6.608605e+07
Verwaltungsaufgaben1.499094e+07
Wohnen3.345593e+06
\n
" 157 | }, 158 | "execution_count": 7, 159 | "metadata": {}, 160 | "output_type": "execute_result" 161 | } 162 | ], 163 | "source": [ 164 | "# Aggregate only the numeric size column: newer pandas no longer silently drops non-numeric columns in groupby(...).sum()\n", "df.groupby(\"area\")[[\"size\"]].sum().sort_values(by=\"size\", ascending=False)" 165 | ], 166 | "metadata": { 167 | "collapsed": false, 168 | "pycharm": { 169 | "name": "#%%\n" 170 | } 171 | } 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "source": [ 176 | "# Managed property by user; sorted by size" 177 | ], 178 | "metadata": { 179 | "collapsed": false 180 | } 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 8, 185 | "outputs": [ 186 | { 187 | "data": { 188 | "text/plain": " size\nuser \nUnbekannt 2.517642e+09\nBMVg.-Unterbringung 1.017131e+09\nBundespolizei 5.587492e+06\nJulius-Kühn-Institut 2.889755e+06\nMax-Rubner-Institut 1.443736e+06\n... ...\nsonst. Bewil.(BMELV) 1.405000e+02\nBI f. Sportwissensch 1.145200e+02\nSonst. Bewilligungen 0.000000e+00\nGoethe-Institut 0.000000e+00\nBSG 0.000000e+00\n\n[103 rows x 1 columns]", 189 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
size
user
Unbekannt2.517642e+09
BMVg.-Unterbringung1.017131e+09
Bundespolizei5.587492e+06
Julius-Kühn-Institut2.889755e+06
Max-Rubner-Institut1.443736e+06
......
sonst. Bewil.(BMELV)1.405000e+02
BI f. Sportwissensch1.145200e+02
Sonst. Bewilligungen0.000000e+00
Goethe-Institut0.000000e+00
BSG0.000000e+00
\n

103 rows × 1 columns

\n
" 190 | }, 191 | "execution_count": 8, 192 | "metadata": {}, 193 | "output_type": "execute_result" 194 | } 195 | ], 196 | "source": [ 197 | "# Aggregate only the numeric size column: newer pandas no longer silently drops non-numeric columns in groupby(...).sum()\n", "df.groupby(\"user\")[[\"size\"]].sum().sort_values(by=\"size\", ascending=False)" 198 | ], 199 | "metadata": { 200 | "collapsed": false, 201 | "pycharm": { 202 | "name": "#%%\n" 203 | } 204 | } 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "source": [ 209 | "# 10 Biggest properties" 210 | ], 211 | "metadata": { 212 | "collapsed": false 213 | } 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": 9, 218 | "outputs": [ 219 | { 220 | "data": { 221 | "text/plain": " id postcode city \\\n172504 136529 29683 Osterheide \n212709 163077 39638 Gardelegen \n213560 163637 2957 Weißkeißel \n212707 163076 39638 Gardelegen \n172503 136528 29683 Osterheide \n199387 154294 17358 Torgelow \n212538 162965 39291 Möckern \n116043 99304 16831 Rheinsberg OT Flecken Zechlin \n172498 136523 29633 Munster \n211506 162284 6925 Annaburg \n\n street area user \\\n172504 N.N. Unbekannt Unbekannt \n212709 Salchauer Chaussee 1 Unbekannt Unbekannt \n213560 Muskauer Forst Dienstliegenschaften Bundeswehr BMVg.-Unterbringung \n212707 Salchauer Chaussee 1 Unbekannt Unbekannt \n172503 N.N. Dienstliegenschaften Bundeswehr BMVg.-Unterbringung \n199387 Pasewalker Chaussee 7 Unbekannt Unbekannt \n212538 Bergstr. Dienstliegenschaften Bundeswehr BMVg.-Unterbringung \n116043 SCHONORT Bundesforst Unbekannt \n172498 Zw. B 209 u. Pl.-Randstr. Brockhs Unbekannt Unbekannt \n211506 Annaburger Heide Unbekannt Unbekannt \n\n size \n172504 143505160.0 \n212709 121003889.0 \n213560 113392861.0 \n212707 112234400.0 \n172503 108444570.0 \n199387 93386240.0 \n212538 91173377.0 \n116043 82487930.0 \n172498 75715466.0 \n211506 70751500.0 ", 222 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
idpostcodecitystreetareausersize
17250413652929683OsterheideN.N.UnbekanntUnbekannt143505160.0
21270916307739638GardelegenSalchauer Chaussee 1UnbekanntUnbekannt121003889.0
2135601636372957WeißkeißelMuskauer Forst DienstliegenschaftenBundeswehrBMVg.-Unterbringung113392861.0
21270716307639638GardelegenSalchauer Chaussee 1UnbekanntUnbekannt112234400.0
17250313652829683OsterheideN.N. DienstliegenschaftenBundeswehrBMVg.-Unterbringung108444570.0
19938715429417358TorgelowPasewalker Chaussee 7UnbekanntUnbekannt93386240.0
21253816296539291MöckernBergstr. DienstliegenschaftenBundeswehrBMVg.-Unterbringung91173377.0
1160439930416831Rheinsberg OT Flecken ZechlinSCHONORTBundesforstUnbekannt82487930.0
17249813652329633MunsterZw. B 209 u. Pl.-Randstr. BrockhsUnbekanntUnbekannt75715466.0
2115061622846925AnnaburgAnnaburger HeideUnbekanntUnbekannt70751500.0
\n
" 223 | }, 224 | "execution_count": 9, 225 | "metadata": {}, 226 | "output_type": "execute_result" 227 | } 228 | ], 229 | "source": [ 230 | "df.sort_values(by=\"size\",ascending=False).head(10)" 231 | ], 232 | "metadata": { 233 | "collapsed": false, 234 | "pycharm": { 235 | "name": "#%%\n" 236 | } 237 | } 238 | }, 239 | { 240 | "cell_type": "markdown", 241 | "source": [ 242 | "# 10 Smallest properties (with a size bigger than 0)" 243 | ], 244 | "metadata": { 245 | "collapsed": false 246 | } 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": 10, 251 | "outputs": [ 252 | { 253 | "data": { 254 | "text/plain": " id postcode city street \\\n88976 81461 14167 Berlin Jänickestraße 1 \n22642 23163 26441 Jever Auf der Dreesche u.a. \n22330 22843 26441 Jever Auf der Dreesche u.a. \n89872 82051 12435 Berlin Puschkinallee 52 \n47210 48295 6120 Halle (Saale) Vogelsang/Am Heiderand \n47209 48294 6120 Halle (Saale) Vogelsang/Am Heiderand \n47208 48293 6120 Halle (Saale) Vogelsang/Am Heiderand \n47207 48292 6120 Halle (Saale) Vogelsang/Am Heiderand \n47206 48291 6120 Halle (Saale) Vogelsang/Am Heiderand \n87465 80466 6120 Halle (Saale) Vogelsang/Am Heiderand \n\n area user size \n88976 Wohnen Unbekannt 0.001 \n22642 Wohnen Unbekannt 0.001 \n22330 Wohnen Unbekannt 0.001 \n89872 Dienstliegenschaften Unbekannt 0.010 \n47210 Wohnen Unbekannt 0.010 \n47209 Wohnen Unbekannt 0.010 \n47208 Wohnen Unbekannt 0.010 \n47207 Wohnen Unbekannt 0.010 \n47206 Wohnen Unbekannt 0.010 \n87465 Wohnen Unbekannt 0.010 ", 255 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
idpostcodecitystreetareausersize
889768146114167BerlinJänickestraße 1WohnenUnbekannt0.001
226422316326441JeverAuf der Dreesche u.a.WohnenUnbekannt0.001
223302284326441JeverAuf der Dreesche u.a.WohnenUnbekannt0.001
898728205112435BerlinPuschkinallee 52DienstliegenschaftenUnbekannt0.010
47210482956120Halle (Saale)Vogelsang/Am HeiderandWohnenUnbekannt0.010
47209482946120Halle (Saale)Vogelsang/Am HeiderandWohnenUnbekannt0.010
47208482936120Halle (Saale)Vogelsang/Am HeiderandWohnenUnbekannt0.010
47207482926120Halle (Saale)Vogelsang/Am HeiderandWohnenUnbekannt0.010
47206482916120Halle (Saale)Vogelsang/Am HeiderandWohnenUnbekannt0.010
87465804666120Halle (Saale)Vogelsang/Am HeiderandWohnenUnbekannt0.010
\n
" 256 | }, 257 | "execution_count": 10, 258 | "metadata": {}, 259 | "output_type": "execute_result" 260 | } 261 | ], 262 | "source": [ 263 | "df[df[\"size\"]>0].sort_values(by=\"size\",ascending=True).head(10)\n", 264 | "### Weird results" 265 | ], 266 | "metadata": { 267 | "collapsed": false, 268 | "pycharm": { 269 | "name": "#%%\n" 270 | } 271 | } 272 | }, 273 | { 274 | "cell_type": "markdown", 275 | "source": [ 276 | "# Distribution of the property size" 277 | ], 278 | "metadata": { 279 | "collapsed": false 280 | } 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": 11, 285 | "outputs": [ 286 | { 287 | "data": { 288 | "text/plain": "count 1.761400e+05\nmean 2.016619e+04\nstd 9.011719e+05\nmin 0.000000e+00\n25% 1.040000e+01\n50% 6.530000e+01\n75% 2.500000e+02\nmax 1.435052e+08\nName: size, dtype: float64" 289 | }, 290 | "execution_count": 11, 291 | "metadata": {}, 292 | "output_type": "execute_result" 293 | } 294 | ], 295 | "source": [ 296 | "df[\"size\"].describe()" 297 | ], 298 | "metadata": { 299 | "collapsed": false, 300 | "pycharm": { 301 | "name": "#%%\n" 302 | } 303 | } 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": 12, 308 | "outputs": [ 309 | { 310 | "data": { 311 | "text/plain": "" 312 | }, 313 | "execution_count": 12, 314 | "metadata": {}, 315 | "output_type": "execute_result" 316 | }, 317 | { 318 | "data": { 319 | "text/plain": "
", 320 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAEFCAYAAAAYKqc0AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAAsTAAALEwEAmpwYAAATGklEQVR4nO3df4wcd3nH8feD0wD1pQFhekJOqA2Xhrh2KfhKGlRVdxQ156SGgtLWJmob5GKlUlD/QGqcFpW0VUX4wxJKCI0Oalmt0pxSisgvQ6ggV4IIJbgFbBOFusEll1Q2wfSqSyMhw9M/bgPL+dY3+3vny/slRXhmZ7/zmY3nyfDs7HciM5EkleUFww4gSeo9i7skFcjiLkkFsrhLUoEs7pJUoPOGHQBgw4YNuWnTpo7e++yzz7J+/freBuqTumStS06oT1Zz9l5dsvYz5+HDh5/JzJev+mJmDv2f7du3Z6ceeuihjt87aHXJWpecmfXJas7eq0vWfuYEvpwt6upQ2zIRsTMiZhcXF4cZQ5KKM9Tinpn3ZebeCy+8cJgxJKk4fqEqSQWyuEtSgSzuklQgi7skFci7ZSSpQN4tI0kFGolfqHbjyFOLXLfvgbPWn7jl6iGkkaTRYM9dkgpkcZekAlncJalAFndJKpDFXZIK1PPiHhFTEfFwRNwREVO9Hl+StLZKxT0iDkTEqYg4umL9TEQ8HhHHI2JfY3UCS8CLgIXexpUkVVH1yv0gMNO8IiLWAbcDO4AtwO6I2AI8nJk7gBuBv+hdVElSVbH8MI8KG0ZsAu7PzK2N5SuAmzPzysbyTQCZ+f7G8vnAP2TmNS3G2wvsBRgfH98+NzfX0QGcOr3IyefOXr9t4+j96nVpaYmxsbFhx1hTXXJCfbKas/fqkrWfOaenpw9n5uRqr3XzC9WNwJNNywvA5RHxduBK4CXAh1q9OTNngVmAycnJnJqa6ijEbXfew/4jZx/GiWs7G6+f5ufn6fQ4B6kuOaE+Wc3Ze3XJOqyc3RT3WGVdZubHgY9XGiBiJ7BzYmKiixiSpJW6uVtmAbi4afki4Ol2BnDiMEnqj26K+6PAJRGxudFf3wXc284ATvkrSf1R9VbIu4BHgEsjYiEi9mTmGeAG4EHgMeDuzDzWzs69cpek/qjUc8/M3S3WHwIOdbpze+6S1B8+rEOSCuTcMpJUIJ+hKkkFsi0jSQWyLSNJBbItI0kFsi0jSQWyLSNJBbItI0kFsi0jSQWyLSNJBbK4S1KBLO6SVCC/UJWkAvmFqiQVyLaMJBXI4i5JBbK4S1KBLO6SVCCLuyQVyFshJalA3gopSQWyLSNJBbK4S1KBLO6SVCCLuyQVyOIuSQWyuEtSgfpS3CNifUQcjojf7Mf4kqRzq1TcI+JARJyKiKMr1s9ExOMRcTwi9jW9dCNwdy+DSpKqq3rlfhCYaV4REeuA24EdwBZgd0RsiYg3A18HTvYwpySpDZGZ1TaM2ATcn5lbG8tXADdn5pWN5Zsam44B61ku+M8Bb8vMH6wy3l5gL8D4+Pj2ubm5jg7g1OlFTj539vptG0fvV69LS0uMjY0NO8aa6pIT6pPVnL1Xl6z9zDk9PX04MydXe+28LsbdCDzZtLwAXJ6ZNwBExHXAM6sVdoDMnAVmASYnJ3NqaqqjELfdeQ/7j5x9GCeu7Wy8fpqfn6fT4xykuuSE+mQ1Z+/VJeuwcnZT3GOVdT/8vwGZeXDNASJ2AjsnJia6iCFJWqmbu2UWgIubli8Cnm5nACcOk6T+6Ka4PwpcEhGbI+J8YBdwbzsDOOWvJPVH1Vsh7wIeAS6NiIWI2JOZZ4AbgAeBx4C7M/NYOzv3yl2S+qNSzz0zd7dYfwg41OnO7blLUn/4sA5JKpBzy0hSgXyGqiQVyLaMJBXItowkFci2jCQVyLaMJBXItowkFci2jCQVyLaMJBXItowkFcjiLkkFsrhLUoH8QlWSCuQXqpJUINsyk
lQgi7skFcjiLkkFsrhLUoEs7pJUIG+FlKQCeSukJBXItowkFcjiLkkFsrhLUoEs7pJUIIu7JBXI4i5JBep5cY+IyyLijoj4WET8Ua/HlyStrVJxj4gDEXEqIo6uWD8TEY9HxPGI2AeQmY9l5vXA7wCTvY8sSVpL1Sv3g8BM84qIWAfcDuwAtgC7I2JL47W3AJ8HPtOzpJKkyiIzq20YsQm4PzO3NpavAG7OzCsbyzcBZOb7m97zQGZe3WK8vcBegPHx8e1zc3MdHcCp04ucfO7s9ds2jt6vXpeWlhgbGxt2jDXVJSfUJ6s5e68uWfuZc3p6+nBmrtohOa+LcTcCTzYtLwCXR8QU8HbghcChVm/OzFlgFmBycjKnpqY6CnHbnfew/8jZh3Hi2s7G66f5+Xk6Pc5BqktOqE9Wc/ZeXbIOK2c3xT1WWZeZOQ/MVxogYiewc2JioosYkqSVurlbZgG4uGn5IuDpdgZw4jBJ6o9uivujwCURsTkizgd2Afe2M4BT/kpSf1S9FfIu4BHg0ohYiIg9mXkGuAF4EHgMuDszj7Wzc6/cJak/KvXcM3N3i/WHOMeXpmux5y5J/eHDOiSpQD5mT5IK5JW7JBXIWSElqUC2ZSSpQLZlJKlAtmUkqUAWd0kqkD13SSqQPXdJKpBtGUkqkMVdkgpkcZekAvmFqiQVyC9UJalAtmUkqUAWd0kqkMVdkgpkcZekAnm3jCQVyLtlJKlAtmUkqUAWd0kqkMVdkgpkcZekAlncJalAFndJKlBfintE/FZEfCQi7omI3+jHPiRJrVUu7hFxICJORcTRFetnIuLxiDgeEfsAMvMTmfku4Drgd3uaWJK0pnau3A8CM80rImIdcDuwA9gC7I6ILU2bvLfxuiRpgCoX98z8HHB6xeo3AMcz84nM/B4wB7w1ln0A+GRm/lvv4kqSqojMrL5xxCbg/szc2li+BpjJzD9sLP8ecDnwDeAPgEeBr2TmHauMtRfYCzA+Pr59bm6uowM4dXqRk8+dvX7bxtGb0mBpaYmxsbFhx1hTXXJCfbKas/fqkrWfOaenpw9n5uRqr53X5dixyrrMzFuBW8/1xsycjYj/BnZecMEF26empjoKcNud97D/yNmHceLazsbrp/n5eTo9zkGqS06oT1Zz9l5dsg4rZ7d3yywAFzctXwQ8XfXNThwmSf3RbXF/FLgkIjZHxPnALuDeqm92yl9J6o/KbZmIuAuYAjZExALwvsz824i4AXgQWAccyMxjVcfMzPuA+yYnJ9/VXuy1bdr3wKrrT9xyda93JUkjp3Jxz8zdLdYfAg51svOI2AnsnJiY6OTtkqQWfFiHJBXIuWUkqUA+Q1WSCmRbRpIKZFtGkgpkW0aSCmRbRpIKZFtGkgpkcZekAtlzl6QC2XOXpALZlpGkAlncJalA9twlqUD23CWpQLZlJKlAFndJKpDFXZIKZHGXpAJZ3CWpQN4KKUkF8lZISSqQbRlJKpDFXZIKZHGXpAKdN+wAg7Zp3wMtXztxy9UDTCJJ/eOVuyQVyOIuSQXqeVsmIl4F/BlwYWZe0+vx+6lVy8Z2jaS6qXTlHhEHIuJURBxdsX4mIh6PiOMRsQ8gM5/IzD39CCtJqqZqW+YgMNO8IiLWAbcDO4AtwO6I2NLTdJKkjkRmVtswYhNwf2ZubSxfAdycmVc2lm8CyMz3N5Y/dq62TETsBfYCjI+Pb5+bm+voAE6dXuTkcx29tbJtG3vzC9qlpSXGxsZ6MlY/1SUn1CerOXuvLln7mXN6evpwZk6u9lo3PfeNwJNNywvA5RHxMuCvgddFxE3PF/uVMnMWmAWYnJzMqampjkLcduc97D/S3zs6T1w71ZNx5ufn6fQ4B6kuOaE+Wc3Ze3XJOqyc3VTFWGVdZuZ3gOsrDRCxE9g5MTHRRQxJ0krd3Aq5AFzctHwR8HQ7AzhxmCT1RzdX7o8Cl0TEZuApYBfwjnYGqMuV+7l+1boab52UNGxVb4W8C3gEu
DQiFiJiT2aeAW4AHgQeA+7OzGPt7Nwrd0nqj0pX7pm5u8X6Q8ChTndelyt3SaobH9YhSQVybhlJKpDPUJWkAtmWkaQC2ZaRpALZlpGkAtmWkaQC2ZaRpAIN9QHZP2k/Yjry1CLXrTKVgdMVSOo12zKSVCDbMpJUIIu7JBXI4i5JBfIL1RHQar74Xn3R2u/xJY0ev1CVpALZlpGkAlncJalAFndJKpDFXZIKZHGXpAJ5K2QftLr18D3bejOOtzCq7vy73X/eCilJBbItI0kFsrhLUoEs7pJUIIu7JBXI4i5JBbK4S1KBen6fe0SsBz4MfA+Yz8w7e70PSdK5Vbpyj4gDEXEqIo6uWD8TEY9HxPGI2NdY/XbgY5n5LuAtPc4rSaqgalvmIDDTvCIi1gG3AzuALcDuiNgCXAQ82djs+72JKUlqR2RmtQ0jNgH3Z+bWxvIVwM2ZeWVj+abGpgvAdzPz/oiYy8xdLcbbC+wFGB8f3z43N9fRAZw6vcjJ5zp668CNv5i+Zt22cfVf+h55arGtcfqdE1pnbaXVMWy+cB1jY2OVt2/3MzpXznbes7S0xDcXV7/W6dVn0Uo74y8tLa36eXaap5ef90q9/Ez76fnPtBfHvNL09PThzJxc7bVueu4b+dEVOiwX9cuBW4EPRcTVwH2t3pyZs8AswOTkZE5NTXUU4rY772H/kaFOkVPZe7ad6WvWE9dOrbr+uhbzeLTS75zQOmsrrY7h4Mx6Vvu702r7dj+jc+Vs5z3z8/Ps//yzbe+jnf220s748/Pzq36enebp5ee9Ui8/0356/jPtxTG3o5szOFZZl5n5LPDOSgMUOnGYJA1bN7dCLgAXNy1fBDzdzgBOHCZJ/dFNcX8UuCQiNkfE+cAu4N52BoiInRExu7jYXg9RknRuVW+FvAt4BLg0IhYiYk9mngFuAB4EHgPuzsxj7ezcK3dJ6o9KPffM3N1i/SHgUKc7t+cuSf3hwzokqUBDLe723CWpP7xyl6QCVf6Fal9DRHwb+K8O374BeKaHcfqpLlnrkhPqk9WcvVeXrP3M+XOZ+fLVXhiJ4t6NiPhyq5/fjpq6ZK1LTqhPVnP2Xl2yDiun87lLUoEs7pJUoBKK++ywA7ShLlnrkhPqk9WcvVeXrEPJWfueuyTpbCVcuUuSVrC4S1KBalPcWzyvtfn1iIhbG69/LSJeP6I5r23k+1pEfCEiXjuMnI0s58zatN0vR8T3I+KaQeZr2v+aOSNiKiK+EhHHIuJfBp2xKcda//4vjIj7IuKrjayVnn3Q44yrPhO56fWROJcaWdbKOhLn01o5m7Yb3LmUmSP/D7AO+E/gVcD5wFeBLSu2uQr4JMsPEfkV4F9HNOcbgZc2/rxjGDmrZm3a7rMsTxB3zSjmBF4CfB14ZWP5Z0f1MwX+FPhA488vB04D5w84568BrweOtnh96OdSG1lH5Xw6Z86mvx8DO5fqcuX+BuB4Zj6Rmd8D5oC3rtjmrcDf5bIvAi+JiFeMWs7M/EJmfrex+EWWH3IyDFU+U4B3A/8EnBpkuCZVcr4D+HhmfgsgM0c5awIXREQAYywX9zODDJmZn2vst5VROJeAtbOOyvlU4TOFAZ9LdSnuqz2vdWMH2/Rbuxn2sHyFNAxrZo2IjcDbgDsGmGulKp/pzwMvjYj5iDgcEb8/sHQ/rkrWDwGXsfzUsiPAH2fmDwYTr7JROJc6Mczz6ZyGcS7V48nSLZ7X2sE2/VY5Q0RMs/yX8Vf7mqi1Klk/CNyYmd9fvtAciio5zwO2A78OvBh4JCK+mJnf6He4FapkvRL4CvAm4NXAP0fEw5n5v33O1o5ROJfaMgLn01o+yIDPpboU9yrPa+36ma49UClDRPwi8FFgR2Z+Z0DZVqqSdRKYa/xl3ABcFRFnMvMTA0m4rOq/+2dy+eHsz0bE54DXAoMu7lWyvhO4JZebsMcj4pvAa4AvDSZiJaNwLlU2IufTWgZ/Lg3jy4cOvqw4D3gC2MyPv
qj6hRXbXM2Pfwn0pRHN+UrgOPDGUf9MV2x/kOF8oVrlM70M+Exj258GjgJbRzTr3wA3N/48DjwFbBhC1k20/pJy6OdSG1lH4nxaK+eK7QZyLtXiyj0zz0TE889rXQccyMxjEXF94/U7WP4G+iqW/0X/H8tXSKOY88+BlwEfbvxX/EwOYca4ilmHrkrOzHwsIj4FfA34AfDRzDznLWnDygr8FXAwIo6wXDxvzMyBTlvbeCbyFLAhIhaA9wE/1ZRx6OfS8ypkHYnzqULOgXP6AUkqUF3ulpEktcHiLkkFsrhLUoEs7pJUIIu7JA1Y1YnGGtu+MiIeioh/b0yQdlWVfVjcJWnwDgIzFbd9L3B3Zr4O2AV8uMqbLO6SNGC5ykRjEfHqiPhUY36khyPiNc9vDvxM488XUvHXwrX4EZMk/QSYBa7PzP+IiMtZvkJ/E3Az8OmIeDewHnhzlcEs7pI0ZBExxvLc9P/YNLHYCxv/uxs4mJn7I+IK4O8jYmuuMZuoxV2Shu8FwP9k5i+t8toeGv35zHwkIl7E8uRj55wX3p67JA1ZLk/5/M2I+G344aMOn39k4LdYns6aiLgMeBHw7bXGdG4ZSRqw5onGgJMsTzT2WZZnDX0Fy5OOzWXmX0bEFuAjLD+5K4E/ycxPr7kPi7sklce2jCQVyOIuSQWyuEtSgSzuklQgi7skFcjiLkkFsrhLUoH+H26tnbv3TkZxAAAAAElFTkSuQmCC\n" 321 | }, 322 | "metadata": { 323 | "needs_background": "light" 324 | }, 325 | "output_type": "display_data" 326 | } 327 | ], 328 | "source": [ 329 | "df[\"size\"].hist(bins=50,log=True)" 330 | ], 331 | "metadata": { 332 | "collapsed": false, 333 | "pycharm": { 334 | "name": "#%%\n" 335 | } 336 | } 337 | }, 338 | { 339 | "cell_type": "markdown", 340 | "source": [ 341 | "## Soo many small ones... let's zoom in" 342 | ], 343 | "metadata": { 344 | "collapsed": false 345 | } 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": 13, 350 | "outputs": [ 351 | { 352 | "data": { 353 | "text/plain": "" 354 | }, 355 | "execution_count": 13, 356 | "metadata": {}, 357 | "output_type": "execute_result" 358 | }, 359 | { 360 | "data": { 361 | "text/plain": "
", 362 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAEFCAYAAAAYKqc0AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAAsTAAALEwEAmpwYAAATYUlEQVR4nO3db4xc11nH8e9D2pTijdpCylI5oU7ZyODGlDarhoSq2kWIrBPcQFXAxkIETK0gjECKUB2BUHiBWl5EoKSpqqVEUSXjVQil+VOXgGhWbWlaEldJbWNS3NSQjauY1u2WjSIVl4cXO06muzO7987M3ZmcfD+SFd9/Z35z9/jJ3TN3zo3MRJJUlu8bdgBJ0uBZ3CWpQBZ3SSqQxV2SCmRxl6QCvWLYAQAuvvji3LJlS0/HPvfcc2zatGmwgQZkVLOZq75RzWau+kY1Wy+5jhw58vXMfH3HjZk5tD/ATmB2YmIie/Xwww/3fGzTRjWbueob1Wzmqm9Us/WSC3gsu9TXoQ7LZOYDmbnvNa95zTBjSFJxHHOXpAJZ3CWpQBZ3SSqQxV2SCmRxl6QCDbW4R8TOiJhdXFwcZgxJKo63QkpSgUbiG6r9OPrMIjce+MSq9ac+cP0Q0kjSaHDMXZIKZHGXpAJZ3CWpQBZ3SSqQxV2SCjTw4h4RUxHxmYj4cERMDbp9SdL6KhX3iLgrIs5ExLEV62ci4smIOBkRB1qrE1gCvh9YGGxcSVIVVa/c7wZm2ldExAXAncAOYBuwOyK2AZ/JzB3A+4A/HVxUSVJVsfwwjwo7RmwBHszMK1rLVwO3Zua1reVbADLz/a3lC4G/ycz3dGlvH7APYHx8/Mq5ubme3sCZs4s8+/zq9ds3D/9br0tLS4yNjQ07xirmqm9Us5mrvlHN1kuu6enpI5k52WlbP99Q3Qw83ba8AFwVEe8GrgVeC3yw28GZOQvMAkxOTubU1FRPIe44eB+3HV39Nk7t6a29QZqfn6fX99Ukc9U3qtnMVd+oZht0rn6Ke3RYl5n5MeBjlRqI2AnsnJiY6COGJGmlfu6WWQAubVu+BDhdpwEnDpOkZvRT3B8FLo+Iy1rj67uA++s04JS/ktSMqrdCHgIeAbZGxEJE7M3Mc8B+4CHgBHBPZh6v8+JeuUtSMyqNuWfm7i7rDwOHe31xx9wlqRk+rEOSCuTcMpJUIJ+hKkkFclhGkgrksIwkFchhGUkqkMMyklQgh2UkqUAOy0hSgRyWkaQCOSwjSQWyuEtSgSzuklQgP1CVpAL5gaokFchhGUkqkMVdkgpkcZekAlncJalAFndJKpC3QkpSgbwVUpIK5LCMJBXI4i5JBbK4S1KBLO6SVCCLuyQVyOIuSQVqpLhHxKaIOBIRv9BE+5KktVUq7hFxV0SciYhjK9bPRMSTEXEyIg60bXofcM8gg0qSqqt65X43MNO+IiIuAO4EdgDbgN0RsS0ifg74N+DZAeaUJNUQmVltx4gtwIOZeUVr+Wrg1sy8trV8S2vXMWATywX/eeCXMvP/OrS3D9gHMD4+fuXc3FxPb+DM2UWefX71+u2bh/+t16WlJcbGxoYdYxVz1Teq2cxV36hm6yXX9PT0kcyc7LTtFX1k2Qw83ba8AFyVmfsBIuJG4OudCjtAZs4CswCTk5M5NTXVU4g7Dt7HbUdXv41Te3prb5Dm5+fp9X01yVz1jWo2c9U3qtkGnauf4h4d1r3wa0Bm3r1uAxE7gZ0TExN9xJAkrdTP3TILwKVty5cAp+s04MRhktSMfor7o8DlEXFZRFwI7ALur9OAU/5KUjOq3gp5CHgE2BoRCxGxNzPPAfuBh4ATwD2ZebzOi3vlLknNqDTmnpm7u6w/DBzu9cUdc5ekZviwDkkqkI/Zk6QCeeUuSQVyVkhJKpDDMpJUIIdlJKlADstIUoEs7pJUIMfcJalAjrlLUoEclpGkAlncJalAFndJKpAfqEpSgfxAVZIK5
LCMJBXI4i5JBbK4S1KBLO6SVCDvlpGkAnm3jCQVyGEZSSqQxV2SCmRxl6QCWdwlqUAWd0kqkMVdkgpkcZekAg28uEfET0TEhyPi3oj4nUG3L0laX6XiHhF3RcSZiDi2Yv1MRDwZEScj4gBAZp7IzJuAXwEmBx9ZkrSeqlfudwMz7Ssi4gLgTmAHsA3YHRHbWtveBXwW+OeBJZUkVRaZWW3HiC3Ag5l5RWv5auDWzLy2tXwLQGa+v+2YT2Tm9V3a2wfsAxgfH79ybm6upzdw5uwizz6/ev32zcOf0mBpaYmxsbFhx1jFXPWNajZz1Teq2XrJNT09fSQzO46QvKKPLJuBp9uWF4CrImIKeDfwKuBwt4MzcxaYBZicnMypqameQtxx8D5uO7r6bZza01t7gzQ/P0+v76tJ5qpvVLOZq75RzTboXP0U9+iwLjNzHpiv1EDETmDnxMREHzEkSSv1c7fMAnBp2/IlwOn+4kiSBqGf4v4ocHlEXBYRFwK7gPvrNOCUv5LUjKq3Qh4CHgG2RsRCROzNzHPAfuAh4ARwT2Yeby6qJKmqSmPumbm7y/rDrPGh6Xocc5ekZvgkJkkqkM9QlaQCeeUuSQVyVkhJKpDDMpJUIIdlJKlADstIUoEs7pJUIMfcJalAjrlLUoEclpGkAlncJalA/TysY6RtOfCJjutPfaDjU/8kqSh+oCpJBfIDVUkqkGPuklQgi7skFcjiLkkFsrhLUoG8W0aSCuTdMpJUIIdlJKlAFndJKpDFXZIKZHGXpAJZ3CWpQMXOCtlNt9kiwRkjJZWjkSv3iPjFiPiriLgvIn6+ideQJHVXubhHxF0RcSYijq1YPxMRT0bEyYg4AJCZH8/M9wI3Ar860MSSpHXVuXK/G5hpXxERFwB3AjuAbcDuiNjWtssft7ZLkjZQ5eKemZ8Gzq5Y/XbgZGY+lZnfAeaAG2LZnwOfzMwvDi6uJKmKyMzqO0dsAR7MzCtay+8BZjLzt1vLvw5cBXwZ+A3gUeDxzPxwh7b2AfsAxsfHr5ybm+vpDZw5u8izz/d06CrbNw92GoSlpSXGxsYG2uYgmKu+Uc1mrvpGNVsvuaanp49k5mSnbf3eLRMd1mVm3g7cvtaBmTkbEV8Ddl500UVXTk1N9RTgjoP3cdvRwdz0c2pPbxm6mZ+fp9f31SRz1Teq2cxV36hmG3Sufu+WWQAubVu+BDhd9WAnDpOkZvRb3B8FLo+IyyLiQmAXcH/Vg53yV5KaUedWyEPAI8DWiFiIiL2ZeQ7YDzwEnADuyczjVdv0yl2SmlF5sDozd3dZfxg43MuLR8ROYOfExEQvh0uSuhjq9AOZ+QDwwOTk5HuHmeO8blMTOC2BpJeal93cMr2w6Et6qfEZqpJUIJ+hKkkFcj53SSqQwzKSVCCHZSSpQA7LSFKBLO6SVCDH3CWpQI65S1KBHJaRpAJZ3CWpQEOdW+alPiukc85IGlWOuUtSgZwVsgHnr+hv3n6OG9uu7r2il7RRHHOXpAJ55b6BHKOXtFG8cpekAlncJalATj8gSQXyVkhJKpDDMpJUIIu7JBXI4i5JBbK4S1KB/BLTCPDLTZIGzSt3SSrQwIt7RLwpIv46Iu4ddNuSpGoqFfeIuCsizkTEsRXrZyLiyYg4GREHADLzqczc20RYSVI1Va/c7wZm2ldExAXAncAOYBuwOyK2DTSdJKknkZnVdozYAjyYmVe0lq8Gbs3Ma1vLtwBk5vtby/dm5nvWaG8fsA9gfHz8yrm5uZ7ewJmzizz7fE+HNm781TSSbfvmet/oPfrM907vcD5X3XaatrS0xNjY2LBjdDSq2cxV36hm6yXX9PT0kcyc7LStn7tlNgNPty0vAFdFxA8Bfwa8NSJuOV/sV8rMWWAWYHJyMqempnoKccfB+7jt6Gje9HPz9nONZDu1Z6rW/jeuuBvnfK667TRtfn6eXvtB00Y1m7nqG9Vsg87VT+WJDusyM78B3FSpgZf4M1QlaVT1c
7fMAnBp2/IlwOk6DThxmCQ1o58r90eByyPiMuAZYBfwa3Ua8Mq9N37pSdJ6qt4KeQh4BNgaEQsRsTczzwH7gYeAE8A9mXm8zot75S5Jzah05Z6Zu7usPwwc7vXFvXIfLn8DkMrlwzokqUDOLSNJBfIZqpJUIIdlJKlADstIUoGG+r1975YZrG53vwyTd+SoE/tF8xyWkaQCOSwjSQVyWEZF8td+vdw5LCNJBXJYRpIKZHGXpAJZ3CWpQH6gqlXWul/+pf6BpB+0rs9zVAY/UJWkAjksI0kFsrhLUoEs7pJUIIu7JBXI4i5JBfJWSA3EoKYbrttO07fnnc9z8/Zz3LgiW7fXrnsrYdO3Hnpr48uTt0JKUoEclpGkAlncJalAFndJKpDFXZIKZHGXpAJZ3CWpQAO/zz0iNgEfAr4DzGfmwUG/hiRpbZWu3CPirog4ExHHVqyfiYgnI+JkRBxorX43cG9mvhd414DzSpIqqDosczcw074iIi4A7gR2ANuA3RGxDbgEeLq123cHE1OSVEdkZrUdI7YAD2bmFa3lq4FbM/Pa1vItrV0XgG9m5oMRMZeZu7q0tw/YBzA+Pn7l3NxcT2/gzNlFnn2+p0MbN/5qRjLbKOTavnn1t5KXlpb46uJwrgc65QE4+swiMJhztt5rVN2/3dLSEmNjY2vu0639uqrkOW+9XIPK1M1aWaucs/X08jNb75heck1PTx/JzMlO2/oZc9/Mi1fosFzUrwJuBz4YEdcDD3Q7ODNngVmAycnJnJqa6inEHQfv47ajQ50ip6ubt58byWyjkOvUnqlV6+bn57nts89tfBg65wFemE9mEOdsvdeoun+7+fl51vu30639uqrkOW+9XIPK1M1aWaucs/X08jNb75hB5GrXT2+NDusyM58DfrNSA04cJkmN6OdWyAXg0rblS4DTdRpw4jBJakY/xf1R4PKIuCwiLgR2AffXaSAidkbE7OJis+NvkvRyU/VWyEPAI8DWiFiIiL2ZeQ7YDzwEnADuyczjdV7cK3dJakalMffM3N1l/WHgcK8v7pi7JDXDh3VIUoGGWtwdc5ekZnjlLkkFqvwN1UZDRPw38J89Hn4x8PUBxhmkUc1mrvpGNZu56hvVbL3kemNmvr7ThpEo7v2IiMe6ff122EY1m7nqG9Vs5qpvVLMNOpfzuUtSgSzuklSgEor77LADrGFUs5mrvlHNZq76RjXbQHO95MfcJUmrlXDlLklaweIuSQUa6eLe5Rmt7dsjIm5vbf9SRLyt6rEN59rTyvOliPhcRLylbdupiDgaEY9HxGMbnGsqIhZbr/14RPxJ1WM3INsftuU6FhHfjYgfbG1r8px1fD5w2/Zh9bH1cg2rj62Xa5h9bL1sw+pjl0bEwxFxIiKOR8Tvd9hn8P0sM0fyD3AB8BXgTcCFwBPAthX7XAd8kuUHh/w08IWqxzac6xrgda2/7zifq7V8Crh4SOdriuVHJdY+tulsK/bfCXyq6XPWavudwNuAY122b3gfq5hrw/tYxVxD6WNVsg2xj70BeFvr7xcBX96IWjbKV+5vB05m5lOZ+R1gDrhhxT43AB/NZZ8HXhsRb6h4bGO5MvNzmfnN1uLnWX6QSdP6ec9Nnq9e2t8NHBrg63eVmZ8Gzq6xyzD62Lq5htTHqpyvbpruY3WzbWQf+1pmfrH19/9heYr0zSt2G3g/G+Xi3ukZrStPSLd9qhzbZK52e1n+P/J5CfxjRByJ5YeED0rVXFdHxBMR8cmIeHPNY5vORkT8ADAD/F3b6qbOWRXD6GN1bVQfq2oYfayyYfaxiNgCvBX4wopNA+9no/f05hd1fEZrxX2qHNurym1HxDTL//De0bb6ZzLzdET8MPBPEfHvrSuOjcj1RZbnoliKiOuAjwOXVzy26Wzn7QT+JTPbr8CaOmdVDKOPVbbBfayKYfWxOobSxyJijOX/ofxBZn575eYOh/TVz0b5yr3KM1q77dP38137zEVE/CTwEeCGz
PzG+fWZebr13zPA37P8a9eG5MrMb2fmUuvvh4FXRsTFVY5tOlubXaz4dbnBc1bFMPpYJUPoY+saYh+rY8P7WES8kuXCfjAzP9Zhl8H3syY+QBjEH5Z/q3gKuIwXP0h484p9rud7P4T416rHNpzrR4GTwDUr1m8CLmr7++eAmQ3M9SO8+MW1twP/1Tp3jZ2vOj8P4DUsj5lu2ohz1vYaW+j+AeGG97GKuTa8j1XMNZQ+ViXbsPpY6/1/FPjLNfYZeD8b6Ilt4Ad1HcufLH8F+KPWupuAm9pO2p2t7UeBybWO3cBcHwG+CTze+vNYa/2bWj+cJ4DjQ8i1v/W6T7D8Idw1ax27kdlayzcCcyuOa/qcHQK+Bvwvy1dJe0ekj62Xa1h9bL1cw+xja2YbYh97B8tDKV9q+3ld13Q/c/oBSSrQKI+5S5J6ZHGXpAJZ3CWpQBZ3SSqQxV2SNth6k5yt2Pcv2iY8+3JEfKvSa3i3jCRtrIh4J7DE8nwyV9Q47veAt2bmb623r1fukrTBssMkZxHxYxHxD635bT4TET/e4dDKE56N8twykvRyMsvyl5r+IyKuAj4E/Oz5jRHxRpa/qfqpKo1Z3CVpyFqTil0D/G3EC3OFvWrFbruAezPzu1XatLhL0vB9H/CtzPypNfbZBfxunQYlSUOUy1MAfzUifhleeOzeW85vj4itwOuAR6q2aXGXpA0WEYdYLtRbI2IhIvYCe4C9EXF+ArP2Jy7tZnnCs8q3N3orpCQVyCt3SSqQxV2SCmRxl6QCWdwlqUAWd0kqkMVdkgpkcZekAv0/3f4xN+yWz2gAAAAASUVORK5CYII=\n" 363 | }, 364 | "metadata": { 365 | "needs_background": "light" 366 | }, 367 | "output_type": "display_data" 368 | } 369 | ], 370 | "source": [ 371 | "df[df[\"size\"]<0.2e8][\"size\"].hist(bins=50,log=True)" 372 | ], 373 | "metadata": { 374 | "collapsed": false, 375 | "pycharm": { 376 | "name": "#%%\n" 377 | } 378 | } 379 | }, 380 | { 381 | "cell_type": "markdown", 382 | "source": [ 383 | "### Lots of small properties" 384 | ], 385 | "metadata": { 386 | "collapsed": false 387 | } 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": 14, 392 | "outputs": [ 393 | { 394 | "data": { 395 | "text/plain": "" 396 | }, 397 | "execution_count": 14, 398 | "metadata": {}, 399 | "output_type": "execute_result" 400 | }, 401 | { 402 | "data": { 403 | "text/plain": "
", 404 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAD4CAYAAAAXUaZHAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAAsTAAALEwEAmpwYAAAM3klEQVR4nO3dbYxc91XH8e/BJqG1xdLiYhU7Yo1ipQ2xoHTUB0BoTYvqNHWDUIUctaiRQiwkIgKKBI76Bl6RFxSBSqhkleACUVZRWxGvY1FQYNU3VYndIuzENTWtS9YJcaKIBUcRbdTDi7lBi72z2Xnau3Pm+5Es7/3Pzp1zZmZ/e+d/HzYyE0lSLd/XdgGSpNEz3CWpIMNdkgoy3CWpIMNdkgra2nYBADt27MjZ2dmB7vvyyy+zbdu20Ra0ydnzdLDn6TBMz6dPn34xM9+y2m2bItxnZ2c5derUQPddXFxkbm5utAVtcvY8Hex5OgzTc0R8u9dtrU7LRMTBiDi6vLzcZhmSVE6r4Z6ZC5l5eGZmps0yJKkcd6hKUkGGuyQVZLhLUkGGuyQVZLhLUkGGuyQVtClOYhrGmUvL3Hnk8WvGLz5wWwvVSNLm4ElMklSQJzFJUkHOuUtSQYa7JBVkuEtSQYa7JBVkuEtSQYa7JBVkuEtSQYa7JBVkuEtSQYa7JBXktWUkqSCvLSNJBTktI0kFGe6SVJDhLkkFGe6SVJDhLkkFGe6SVJDhLkkFGe6SVJDhLkkFGe6SVJDhLkkFGe6SVJDhLkkFGe6SVJDXc5ekgryeuyQV5LSMJBVkuEtSQYa7JBVkuEtSQYa7JBVkuEtSQYa7JBVkuEtSQYa7JBVkuEtSQYa7JBVkuEtSQYa7JBVkuEtSQYa7JBVkuEtSQYa7JBVkuEtSQYa7JBVkuEtSQYa7JBVkuEtSQYa7JBU0lnCPiG0RcToiPjSO9UuS1raucI+IhyLickScvWr8QEScj4gLEXFkxU2/Czw6ykIlSeu33i33Y8CBlQMRsQV4ELgVuBm4IyJujoj3A08Dz4+wTklSH7au55sy80sRMXvV8LuAC5n5TYCImAduB7YD2+gG/isRcTIzvze6kiVJrycyc33f2A33E5l5S7P8EeBAZv5as/yrwLsz855m+U7gxcw80WN9h4HDADt37nzn/Pz8QA1cfmmZ51+5dnzfrpmB1jcJrly5wvbt29suY0PZ83Sw5/7s37//dGZ2VrttXVvuPcQqY//3myIzj61158w8ChwF6HQ6OTc3N1ARn3r4MT555to2Ln50sPVNgsXFRQZ9viaVPU8Hex6dYY6WWQJuWLG8G3h2uHIkSaMwTLg/CeyNiD0RcR1wCDg+mrIkScNY76GQjwBfBm6KiKWIuCszXwXuAb4InAMezcyn+nnwiDgYEUeXl5f7rVuStIb1Hi1zR4/xk8DJQR88MxeAhU6nc/eg65AkXWuYHarq0+yRx1cdv/jAbRtciaTqvLaMJBVkuEtSQa2GuztUJWk8Wg33zFzIzMMzM3XPJpWkNjgtI0kFGe6SVJDhLkkFuUNVkgpyh6okFeS0jCQVZLhLUkGGuyQVZLhLUkGGuyQV5KGQklSQh0JKUkFOy0hSQYa7JBVkuEtSQYa7JBVkuEtSQVvbfPCIOAgcvPHGG9ssY+RmjzzedgmSppyHQkpSQU7LSFJBhrskFWS4S1JBhrskFWS4S1JBhrskFWS4S1JBXs9dkgryJCZJKshpGUkqqNVry2gwZy4tc+cq16+5+MBtLVQjaTNyy12SCjLcJakgw12SCjLcJakgw12SCjLcJakgw12SCjLcJakgry0jSQV5bRlJKshpGUkqyHCXpIIMd0kqyHCXpIIMd0kqyHCXpIL8Yx2SNg3/EM3oGO6b2Owqb3KA+/ZtcCGSJo7TMpJUkOEuSQU5LSNNqV7Tfs5v12C4S/p/DP0anJaRpIIMd0kqyOu5S1JBrc65Z+YCsNDpd
O5usw5pM/KEHg3DaRlJKsijZYbQ66iCttajyeSZyBoHt9wlqSDDXZIKMtwlqSDDXZIKMtwlqSDDXZIK8lBIqbgKh9p6MbP+Ge6ShrLWLw/Dtz2Gu1REhS10jY5z7pJUkFvukqZexTl9w12aME6/aD0Md2kN7izUpDLcJa3LKD8xeCXM8TPcJalPkzBH79EyklSQ4S5JBTktI2liTcL0SFvccpekgtxylzaIx6dPnkl+zQz3KeBHV2n6GO6SxmaSt3wn3cjDPSLeDtwL7ACeyMxPj/oxpFEzhFTNunaoRsRDEXE5Is5eNX4gIs5HxIWIOAKQmecy89eBXwE6oy9ZkvR61rvlfgz4U+AvXxuIiC3Ag8AvAkvAkxFxPDOfjogPA0ea+6gQ5+9fn58C2uf7FCIz1/eNEbPAicy8pVl+L/B7mfmBZvl+gMz8gxX3eTwzV302I+IwcBhg586d75yfnx+ogcsvLfP8K9eO79s1M9D6+nHm0vLYH2M1O99AXz33qnOQ52iU6+rHlStX2L59+7q/v98623ot19Lrda5s3D2P+/Uf5Oeg3/f2Svv37z+dmavOkAwz574LeGbF8hLw7oiYA34ZuB442evOmXkUOArQ6XRybm5uoCI+9fBjfPLMtW1c/Ohg6+vHnS1tod2379W+eu5V5yDP0SjX1Y/FxUVWe4/03kpe/a3d73PUpl6vc2Xj7nncr/8gPwe93tvDGuZZjFXGMjMXgcUh1itJGtIw4b4E3LBieTfw7HDlaFI5xyltLsNcfuBJYG9E7ImI64BDwPF+VhARByPi6PLy5pvvlKRJtt5DIR8BvgzcFBFLEXFXZr4K3AN8ETgHPJqZT/Xz4Jm5kJmHZ2bGv/NTkqbJuqZlMvOOHuMnWWOnqSRtJuM+THUz/VnG6doVr6nnMeiaFl7yV5IKajXc3aEqSePR6rRMZi4AC51O5+4269DG6/fQyTOXljfliUbSZuW0jCQV5A5VSdoAvT6tHjuwbSyPZ7ivg0dYSJo07lCVpIJaDXfPUJWk8XCHqiQVZLhLUkGGuyQVZLhLUkGGuyQV5KGQklSQ15ZZwZOV2tfrNbhv3wYXIk04z1DVWPkLU2qHc+6SVJDhLkkFGe6SVJBz7rqG8+TS5PNQSEkqyEMhp5hb6FJdzrlLUkGGuyQVZLhLUkGGuyQVZLhLUkGGuyQVZLhLUkGexCRJBU3dSUyVT9yp3Juk/jgtI0kFGe6SVJDhLkkFGe6SVJDhLkkFGe6SVJDhLkkFlf0zex7zLWmaueUuSQUZ7pJUkNeWkaSCWg33zFzIzMMzMzNtliFJ5TgtI0kFGe6SVJDhLkkFGe6SVJDhLkkFRWa2XQMR8QLw7QHvvgN4cYTlTAJ7ng72PB2G6fnHMvMtq92wKcJ9GBFxKjM7bdexkex5OtjzdBhXz07LSFJBhrskFVQh3I+2XUAL7Hk62PN0GEvPEz/nLkm6VoUtd0nSVQx3SSpoosM9Ig5ExPmIuBARR9quZxwi4oaI+MeIOBcRT0XEvc34myPi7yPiG83/b2q71lGKiC0R8bWIONEsV+/3hyLicxHx9ea1fu8U9PzbzXv6bEQ8EhE/UK3niHgoIi5HxNkVYz17jIj7mzw7HxEfGOaxJzbcI2IL8CBwK3AzcEdE3NxuVWPxKnBfZr4deA/wG02fR4AnMnMv8ESzXMm9wLkVy9X7/RPgbzPzbcBP0u29bM8RsQv4TaCTmbcAW4BD1Ov5GHDgqrFVe2x+rg8BP9Hc58+anBvIxIY78C7gQmZ+MzO/A8wDt7dc08hl5nOZ+dXm6/+m+0O/i26vn22+7bPAL7VS4BhExG7gNuAzK4Yr9/uDwM8Dfw6Qmd/JzP+kcM+NrcAbImIr8EbgWYr1nJlfAl66arhXj7cD85n5P5n5LeAC3ZwbyCSH+y7gmRXLS81YWRExC7wD+AqwMzOfg+4vAOBHWixt1P4Y+B3geyvGKvf748ALwF80U1GfiYhtFO45My8Bf
wj8O/AcsJyZf0fhnlfo1eNIM22Swz1WGSt7XGdEbAc+D/xWZv5X2/WMS0R8CLicmafbrmUDbQV+Gvh0Zr4DeJnJn45YUzPPfDuwB/hRYFtEfKzdqlo30kyb5HBfAm5Ysbyb7se6ciLi++kG+8OZ+YVm+PmIeGtz+1uBy23VN2I/C3w4Ii7SnWr7hYj4a+r2C9338lJmfqVZ/hzdsK/c8/uBb2XmC5n5XeALwM9Qu+fX9OpxpJk2yeH+JLA3IvZExHV0d0Qcb7mmkYuIoDsXey4z/2jFTceBjzdffxx4bKNrG4fMvD8zd2fmLN3X9B8y82MU7RcgM/8DeCYibmqG3gc8TeGe6U7HvCci3ti8x99Hd39S5Z5f06vH48ChiLg+IvYAe4F/GvhRMnNi/wEfBP4V+DfgE23XM6Yef47uR7N/Af65+fdB4Ifp7mn/RvP/m9uudQy9zwEnmq9L9wv8FHCqeZ3/BnjTFPT8+8DXgbPAXwHXV+sZeITuPoXv0t0yv2utHoFPNHl2Hrh1mMf28gOSVNAkT8tIknow3CWpIMNdkgoy3CWpIMNdkgoy3CWpIMNdkgr6XydBxweyMDivAAAAAElFTkSuQmCC\n" 405 | }, 406 | "metadata": { 407 | "needs_background": "light" 408 | }, 409 | "output_type": "display_data" 410 | } 411 | ], 412 | "source": [ 413 | "df[df[\"size\"]<100][\"size\"].hist(bins=50,log=True)" 414 | ], 415 | "metadata": { 416 | "collapsed": false, 417 | "pycharm": { 418 | "name": "#%%\n" 419 | } 420 | } 421 | }, 422 | { 423 | "cell_type": "code", 424 | "execution_count": 15, 425 | "outputs": [], 426 | "source": [ 427 | "# I need coffee" 428 | ], 429 | "metadata": { 430 | "collapsed": false, 431 | "pycharm": { 432 | "name": "#%%\n" 433 | } 434 | } 435 | }, 436 | { 437 | "cell_type": "code", 438 | "execution_count": 15, 439 | "outputs": [], 440 | "source": [], 441 | "metadata": { 442 | "collapsed": false, 443 | "pycharm": { 444 | "name": "#%%\n" 445 | } 446 | } 447 | } 448 | ], 449 | "metadata": { 450 | "kernelspec": { 451 | "display_name": "Python 3", 452 | "language": "python", 453 | "name": "python3" 454 | }, 455 | "language_info": { 456 | "codemirror_mode": { 457 | "name": "ipython", 458 | "version": 2 459 | }, 460 | "file_extension": ".py", 461 | "mimetype": "text/x-python", 462 | "name": "python", 463 | "nbconvert_exporter": "python", 464 | "pygments_lexer": "ipython2", 465 | "version": "2.7.6" 466 | } 467 | }, 468 | "nbformat": 4, 469 | "nbformat_minor": 0 470 | } -------------------------------------------------------------------------------- 
/README.md: -------------------------------------------------------------------------------- 1 | # bundesanstalt-immobilienaufgaben-liegenschaften 2 | 3 | | File | Info | Comment | 4 | |---------------------------------|------------------------------------------------|------------------------------------------------------------------------------------------------------| 5 | | Anlage_wo_pwd.pdf | raw file | | 6 | | Anlage_wo_pwd.tsv | Same as Anlage_wo_pwd.pdf but extracted as tsv | [with pdf2json](https://github.com/bundesAPI/bundesanstalt-immobilienaufgaben-liegenschaften/pull/1) | 7 | | Fixed_Anlage_wo_pwd.csv | cleaned up data garbage | with [fix_postcode_city_mix.ipynb](fix_postcode_city_mix.ipynb) | 8 | | Fixed_Anlage_wo_pwd_Encoded.csv | geo-encoded data | with osm
Exact_Pos_Failed means that City+Zip+Street was not found, but City+Zip was | 9 | 10 |

11 | 12 |

-------------------------------------------------------------------------------- /fix_postcode_city_mix.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pandas as pd" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "outputs": [], 18 | "source": [ 19 | "data_types_broken ={\n", 20 | " \"id\" : pd.Int64Dtype(),\n", 21 | " \"postcode\" : str\t,\n", 22 | " \"city\": str,\n", 23 | " \"street\" : str,\n", 24 | " \"area\" : \"category\",\n", 25 | " \"user\" : \"category\",\n", 26 | " \"size\": float\n", 27 | "}" 28 | ], 29 | "metadata": { 30 | "collapsed": false, 31 | "pycharm": { 32 | "name": "#%%\n" 33 | } 34 | } 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 3, 39 | "outputs": [], 40 | "source": [ 41 | "df = pd.read_csv(\"Anlage_wo_pwd.tsv\", delimiter=\"\\t\" , decimal=\",\", dtype=data_types_broken)" 42 | ], 43 | "metadata": { 44 | "collapsed": false, 45 | "pycharm": { 46 | "name": "#%%\n" 47 | } 48 | } 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "source": [ 53 | "### 1. 
Find broken postcodes" 54 | ], 55 | "metadata": { 56 | "collapsed": false 57 | } 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 4, 62 | "outputs": [ 63 | { 64 | "data": { 65 | "text/plain": " id postcode \\\n4888 5001 14552 Gemeinde \n4891 5004 15827Blankenfelde \n17773 18182 15713 Königs \n65685 66105 72525 Gutsbezirk \n66789 66833 01825 Bad \n69421 68569 01825 Bad \n69426 68572 01825 Bad \n69434 68577 01825 Bad \n69441 68582 01825 Bad \n69959 68924 01825 Bad \n70825 69494 72525 Gutsbezirk \n87401 80422 15713Königs \n88197 80947 15713Königs \n88232 80969 15713Königs \n110638 95741 15713Königs \n110773 95830 14621 Schönwalde-Glien/ \n117855 100500 15713 Königs \n117901 100530 14621 Schönwalde-Glien/ \n119605 101654 16321 Ahrensfelde-Blumberg, \n119646 101680 15713 Königs \n\n city street \\\n4888 Michendorf/ OT Wilhelmshorst An den Bergen 121 \n4891 - Mahlow OT Blankenfelde Trebbiner Damm 28 \n17773 Wusterhausen OT Wernsdorf Am Sandberg 5 \n65685 Münsingen, gemfr. Gebiet Egelstein 9 \n66789 Gottleuba-Berggießhübel OT Börnersdo Börnersdorf Nr. 1A \n69421 Gottleuba-Berggießhübel OT Börnersdo Börnersdorf Nr. 1A \n69426 Gottleuba-Berggießhübel OT Börnersdo Börnersdorf Nr. 1A \n69434 Gottleuba-Berggießhübel OT Börnersdo Börnersdorf Nr. 1A \n69441 Gottleuba-Berggießhübel OT Börnersdo Börnersdorf Nr. 1A \n69959 Gottleuba-Berggießhübel OT Börnersdo Börnersdorf Nr. 1A \n70825 Münsingen, gemfr. Gebiet Egelstein 9 \n87401 Wusterhausen OT Niederlehme Wernsdorfer Straße 99b \n88197 Wusterhausen OT Niederlehme Wernsdorfer Straße 99b \n88232 Wusterhausen OT Niederlehme Wernsdorfer Straße 99b \n110638 Wusterhausen OT Niederlehme Seestraße 61 \n110773 OT Schönwalde Siedlung Birkenallee 9 \n117855 Wusterhausen OT Wernsdorf Buchfinkenweg 12a,b \n117901 OT Schönwalde Siedlung Birkenallee 7 \n119605 OT Lindenberg Krimhildstraße 26 \n119646 Wusterhausen / Niederlehme Wernsdorfer Str. 92 a \n\n area user size \n4888 Gewerbe/sonst. 
Nutzung NaN 1118.00 \n4891 Gewerbe/sonst. Nutzung NaN 1385.00 \n17773 Gewerbe/sonst. Nutzung NaN 440.00 \n65685 Dienstliegenschaften NaN 98.30 \n66789 Dienstliegenschaften NaN 0.00 \n69421 Dienstliegenschaften Bundespolizei 18.00 \n69426 Dienstliegenschaften NaN 200.00 \n69434 Dienstliegenschaften Bundespolizei 620.00 \n69441 Dienstliegenschaften Bundespolizei 836.73 \n69959 Dienstliegenschaften NaN 27.50 \n70825 Dienstliegenschaften NaN 163.21 \n87401 Gewerbe/sonst. Nutzung 0 2096.00 \n88197 Gewerbe/sonst. Nutzung NaN 15.00 \n88232 Gewerbe/sonst. Nutzung 0 50.00 \n110638 Gewerbe/sonst. Nutzung NaN 45.00 \n110773 Gewerbe/sonst. Nutzung NaN 917.00 \n117855 Gewerbe/sonst. Nutzung 0 523.00 \n117901 Gewerbe/sonst. Nutzung NaN 919.00 \n119605 Gewerbe/sonst. Nutzung NaN 702.00 \n119646 Gewerbe/sonst. Nutzung NaN 1582.00 ", 66 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
idpostcodecitystreetareausersize
4888500114552 GemeindeMichendorf/ OT WilhelmshorstAn den Bergen 121Gewerbe/sonst. NutzungNaN1118.00
4891500415827Blankenfelde- Mahlow OT BlankenfeldeTrebbiner Damm 28Gewerbe/sonst. NutzungNaN1385.00
177731818215713 KönigsWusterhausen OT WernsdorfAm Sandberg 5Gewerbe/sonst. NutzungNaN440.00
656856610572525 GutsbezirkMünsingen, gemfr. GebietEgelstein 9DienstliegenschaftenNaN98.30
667896683301825 BadGottleuba-Berggießhübel OT BörnersdoBörnersdorf Nr. 1ADienstliegenschaftenNaN0.00
694216856901825 BadGottleuba-Berggießhübel OT BörnersdoBörnersdorf Nr. 1ADienstliegenschaftenBundespolizei18.00
694266857201825 BadGottleuba-Berggießhübel OT BörnersdoBörnersdorf Nr. 1ADienstliegenschaftenNaN200.00
694346857701825 BadGottleuba-Berggießhübel OT BörnersdoBörnersdorf Nr. 1ADienstliegenschaftenBundespolizei620.00
694416858201825 BadGottleuba-Berggießhübel OT BörnersdoBörnersdorf Nr. 1ADienstliegenschaftenBundespolizei836.73
699596892401825 BadGottleuba-Berggießhübel OT BörnersdoBörnersdorf Nr. 1ADienstliegenschaftenNaN27.50
708256949472525 GutsbezirkMünsingen, gemfr. GebietEgelstein 9DienstliegenschaftenNaN163.21
874018042215713KönigsWusterhausen OT NiederlehmeWernsdorfer Straße 99bGewerbe/sonst. Nutzung02096.00
881978094715713KönigsWusterhausen OT NiederlehmeWernsdorfer Straße 99bGewerbe/sonst. NutzungNaN15.00
882328096915713KönigsWusterhausen OT NiederlehmeWernsdorfer Straße 99bGewerbe/sonst. Nutzung050.00
1106389574115713KönigsWusterhausen OT NiederlehmeSeestraße 61Gewerbe/sonst. NutzungNaN45.00
1107739583014621 Schönwalde-Glien/OT Schönwalde SiedlungBirkenallee 9Gewerbe/sonst. NutzungNaN917.00
11785510050015713 KönigsWusterhausen OT WernsdorfBuchfinkenweg 12a,bGewerbe/sonst. Nutzung0523.00
11790110053014621 Schönwalde-Glien/OT Schönwalde SiedlungBirkenallee 7Gewerbe/sonst. NutzungNaN919.00
11960510165416321 Ahrensfelde-Blumberg,OT LindenbergKrimhildstraße 26Gewerbe/sonst. NutzungNaN702.00
11964610168015713 KönigsWusterhausen / NiederlehmeWernsdorfer Str. 92 aGewerbe/sonst. NutzungNaN1582.00
\n
" 67 | }, 68 | "execution_count": 4, 69 | "metadata": {}, 70 | "output_type": "execute_result" 71 | } 72 | ], 73 | "source": [ 74 | "df_broken_postcode = df[df[\"postcode\"].fillna(-1).str.strip().str.len() > 5]\n", 75 | "df_broken_postcode" 76 | ], 77 | "metadata": { 78 | "collapsed": false, 79 | "pycharm": { 80 | "name": "#%%\n" 81 | } 82 | } 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "source": [ 87 | "### 2. Fix broken postcodes" 88 | ], 89 | "metadata": { 90 | "collapsed": false 91 | } 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 5, 96 | "outputs": [ 97 | { 98 | "data": { 99 | "text/plain": " postcode_fixed city_0\n4888 14552 Gemeinde\n4891 15827 Blankenfelde\n17773 15713 Königs\n65685 72525 Gutsbezirk\n66789 01825 Bad \n69421 01825 Bad \n69426 01825 Bad \n69434 01825 Bad \n69441 01825 Bad \n69959 01825 Bad \n70825 72525 Gutsbezirk\n87401 15713 Königs\n88197 15713 Königs\n88232 15713 Königs\n110638 15713 Königs\n110773 14621 Schönwalde-Glien/\n117855 15713 Königs\n117901 14621 Schönwalde-Glien/\n119605 16321 Ahrensfelde-Blumberg,\n119646 15713 Königs", 100 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
postcode_fixedcity_0
488814552Gemeinde
489115827Blankenfelde
1777315713Königs
6568572525Gutsbezirk
6678901825Bad
6942101825Bad
6942601825Bad
6943401825Bad
6944101825Bad
6995901825Bad
7082572525Gutsbezirk
8740115713Königs
8819715713Königs
8823215713Königs
11063815713Königs
11077314621Schönwalde-Glien/
11785515713Königs
11790114621Schönwalde-Glien/
11960516321Ahrensfelde-Blumberg,
11964615713Königs
\n
" 101 | }, 102 | "execution_count": 5, 103 | "metadata": {}, 104 | "output_type": "execute_result" 105 | } 106 | ], 107 | "source": [ 108 | "df_postcode_split = df_broken_postcode['postcode'].str.extract('(?P\\d+)(?P\\D+)', expand=True)\n", 109 | "df_postcode_split" 110 | ], 111 | "metadata": { 112 | "collapsed": false, 113 | "pycharm": { 114 | "name": "#%%\n" 115 | } 116 | } 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 6, 121 | "outputs": [ 122 | { 123 | "data": { 124 | "text/plain": " id street area user \\\n4888 5001 An den Bergen 121 Gewerbe/sonst. Nutzung NaN \n4891 5004 Trebbiner Damm 28 Gewerbe/sonst. Nutzung NaN \n17773 18182 Am Sandberg 5 Gewerbe/sonst. Nutzung NaN \n65685 66105 Egelstein 9 Dienstliegenschaften NaN \n66789 66833 Börnersdorf Nr. 1A Dienstliegenschaften NaN \n69421 68569 Börnersdorf Nr. 1A Dienstliegenschaften Bundespolizei \n69426 68572 Börnersdorf Nr. 1A Dienstliegenschaften NaN \n69434 68577 Börnersdorf Nr. 1A Dienstliegenschaften Bundespolizei \n69441 68582 Börnersdorf Nr. 1A Dienstliegenschaften Bundespolizei \n69959 68924 Börnersdorf Nr. 1A Dienstliegenschaften NaN \n70825 69494 Egelstein 9 Dienstliegenschaften NaN \n87401 80422 Wernsdorfer Straße 99b Gewerbe/sonst. Nutzung 0 \n88197 80947 Wernsdorfer Straße 99b Gewerbe/sonst. Nutzung NaN \n88232 80969 Wernsdorfer Straße 99b Gewerbe/sonst. Nutzung 0 \n110638 95741 Seestraße 61 Gewerbe/sonst. Nutzung NaN \n110773 95830 Birkenallee 9 Gewerbe/sonst. Nutzung NaN \n117855 100500 Buchfinkenweg 12a,b Gewerbe/sonst. Nutzung 0 \n117901 100530 Birkenallee 7 Gewerbe/sonst. Nutzung NaN \n119605 101654 Krimhildstraße 26 Gewerbe/sonst. Nutzung NaN \n119646 101680 Wernsdorfer Str. 92 a Gewerbe/sonst. Nutzung NaN \n\n size postcode city \n4888 1118.00 14552 Gemeinde Michendorf/ OT Wilhelmshorst \n4891 1385.00 15827 Blankenfelde - Mahlow OT Blankenfelde \n17773 440.00 15713 Königs Wusterhausen OT Wernsdorf \n65685 98.30 72525 Gutsbezirk Münsingen, gemfr. 
Gebiet \n66789 0.00 01825 Bad Gottleuba-Berggießhübel OT Börnersdo \n69421 18.00 01825 Bad Gottleuba-Berggießhübel OT Börnersdo \n69426 200.00 01825 Bad Gottleuba-Berggießhübel OT Börnersdo \n69434 620.00 01825 Bad Gottleuba-Berggießhübel OT Börnersdo \n69441 836.73 01825 Bad Gottleuba-Berggießhübel OT Börnersdo \n69959 27.50 01825 Bad Gottleuba-Berggießhübel OT Börnersdo \n70825 163.21 72525 Gutsbezirk Münsingen, gemfr. Gebiet \n87401 2096.00 15713 Königs Wusterhausen OT Niederlehme \n88197 15.00 15713 Königs Wusterhausen OT Niederlehme \n88232 50.00 15713 Königs Wusterhausen OT Niederlehme \n110638 45.00 15713 Königs Wusterhausen OT Niederlehme \n110773 917.00 14621 Schönwalde-Glien/ OT Schönwalde Siedlung \n117855 523.00 15713 Königs Wusterhausen OT Wernsdorf \n117901 919.00 14621 Schönwalde-Glien/ OT Schönwalde Siedlung \n119605 702.00 16321 Ahrensfelde-Blumberg, OT Lindenberg \n119646 1582.00 15713 Königs Wusterhausen / Niederlehme ", 125 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
idstreetareausersizepostcodecity
48885001An den Bergen 121Gewerbe/sonst. NutzungNaN1118.0014552Gemeinde Michendorf/ OT Wilhelmshorst
48915004Trebbiner Damm 28Gewerbe/sonst. NutzungNaN1385.0015827Blankenfelde - Mahlow OT Blankenfelde
1777318182Am Sandberg 5Gewerbe/sonst. NutzungNaN440.0015713Königs Wusterhausen OT Wernsdorf
6568566105Egelstein 9DienstliegenschaftenNaN98.3072525Gutsbezirk Münsingen, gemfr. Gebiet
6678966833Börnersdorf Nr. 1ADienstliegenschaftenNaN0.0001825Bad Gottleuba-Berggießhübel OT Börnersdo
6942168569Börnersdorf Nr. 1ADienstliegenschaftenBundespolizei18.0001825Bad Gottleuba-Berggießhübel OT Börnersdo
6942668572Börnersdorf Nr. 1ADienstliegenschaftenNaN200.0001825Bad Gottleuba-Berggießhübel OT Börnersdo
6943468577Börnersdorf Nr. 1ADienstliegenschaftenBundespolizei620.0001825Bad Gottleuba-Berggießhübel OT Börnersdo
6944168582Börnersdorf Nr. 1ADienstliegenschaftenBundespolizei836.7301825Bad Gottleuba-Berggießhübel OT Börnersdo
6995968924Börnersdorf Nr. 1ADienstliegenschaftenNaN27.5001825Bad Gottleuba-Berggießhübel OT Börnersdo
7082569494Egelstein 9DienstliegenschaftenNaN163.2172525Gutsbezirk Münsingen, gemfr. Gebiet
8740180422Wernsdorfer Straße 99bGewerbe/sonst. Nutzung02096.0015713Königs Wusterhausen OT Niederlehme
8819780947Wernsdorfer Straße 99bGewerbe/sonst. NutzungNaN15.0015713Königs Wusterhausen OT Niederlehme
8823280969Wernsdorfer Straße 99bGewerbe/sonst. Nutzung050.0015713Königs Wusterhausen OT Niederlehme
11063895741Seestraße 61Gewerbe/sonst. NutzungNaN45.0015713Königs Wusterhausen OT Niederlehme
11077395830Birkenallee 9Gewerbe/sonst. NutzungNaN917.0014621Schönwalde-Glien/ OT Schönwalde Siedlung
117855100500Buchfinkenweg 12a,bGewerbe/sonst. Nutzung0523.0015713Königs Wusterhausen OT Wernsdorf
117901100530Birkenallee 7Gewerbe/sonst. NutzungNaN919.0014621Schönwalde-Glien/ OT Schönwalde Siedlung
119605101654Krimhildstraße 26Gewerbe/sonst. NutzungNaN702.0016321Ahrensfelde-Blumberg, OT Lindenberg
119646101680Wernsdorfer Str. 92 aGewerbe/sonst. NutzungNaN1582.0015713Königs Wusterhausen / Niederlehme
\n
" 126 | }, 127 | "execution_count": 6, 128 | "metadata": {}, 129 | "output_type": "execute_result" 130 | } 131 | ], 132 | "source": [ 133 | "df_tmp_postcode= df_broken_postcode.join(df_postcode_split).drop(columns=[\"postcode\"]).rename(columns={'postcode_fixed':'postcode'})\n", 134 | "df_tmp_postcode[\"city_fixed\"] = df_tmp_postcode[\"city_0\"] + df_tmp_postcode[\"city\"]\n", 135 | "df_tmp_postcode = df_tmp_postcode.drop(columns=[\"city\",\"city_0\"]).rename(columns={'city_fixed':\"city\"})\n", 136 | "df_tmp_postcode" 137 | ], 138 | "metadata": { 139 | "collapsed": false, 140 | "pycharm": { 141 | "name": "#%%\n" 142 | } 143 | } 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 7, 148 | "outputs": [ 149 | { 150 | "name": "stderr", 151 | "output_type": "stream", 152 | "text": [ 153 | "/opt/homebrew/Caskroom/miniforge/base/envs/py38/lib/python3.8/site-packages/pandas/core/frame.py:7511: SettingWithCopyWarning: \n", 154 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 155 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 156 | "\n", 157 | "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", 158 | " self[col] = expressions.where(mask, this, that)\n" 159 | ] 160 | } 161 | ], 162 | "source": [ 163 | "df_broken_postcode.update(df_tmp_postcode)" 164 | ], 165 | "metadata": { 166 | "collapsed": false, 167 | "pycharm": { 168 | "name": "#%%\n" 169 | } 170 | } 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 8, 175 | "outputs": [ 176 | { 177 | "data": { 178 | "text/plain": " id postcode city \\\n4888 5001 14552 Gemeinde Michendorf/ OT Wilhelmshorst \n4891 5004 15827 Blankenfelde - Mahlow OT Blankenfelde \n17773 18182 15713 Königs Wusterhausen OT Wernsdorf \n65685 66105 72525 Gutsbezirk Münsingen, gemfr. 
Gebiet \n66789 66833 01825 Bad Gottleuba-Berggießhübel OT Börnersdo \n69421 68569 01825 Bad Gottleuba-Berggießhübel OT Börnersdo \n69426 68572 01825 Bad Gottleuba-Berggießhübel OT Börnersdo \n69434 68577 01825 Bad Gottleuba-Berggießhübel OT Börnersdo \n69441 68582 01825 Bad Gottleuba-Berggießhübel OT Börnersdo \n69959 68924 01825 Bad Gottleuba-Berggießhübel OT Börnersdo \n70825 69494 72525 Gutsbezirk Münsingen, gemfr. Gebiet \n87401 80422 15713 Königs Wusterhausen OT Niederlehme \n88197 80947 15713 Königs Wusterhausen OT Niederlehme \n88232 80969 15713 Königs Wusterhausen OT Niederlehme \n110638 95741 15713 Königs Wusterhausen OT Niederlehme \n110773 95830 14621 Schönwalde-Glien/ OT Schönwalde Siedlung \n117855 100500 15713 Königs Wusterhausen OT Wernsdorf \n117901 100530 14621 Schönwalde-Glien/ OT Schönwalde Siedlung \n119605 101654 16321 Ahrensfelde-Blumberg, OT Lindenberg \n119646 101680 15713 Königs Wusterhausen / Niederlehme \n\n street area user size \n4888 An den Bergen 121 Gewerbe/sonst. Nutzung NaN 1118.00 \n4891 Trebbiner Damm 28 Gewerbe/sonst. Nutzung NaN 1385.00 \n17773 Am Sandberg 5 Gewerbe/sonst. Nutzung NaN 440.00 \n65685 Egelstein 9 Dienstliegenschaften NaN 98.30 \n66789 Börnersdorf Nr. 1A Dienstliegenschaften NaN 0.00 \n69421 Börnersdorf Nr. 1A Dienstliegenschaften Bundespolizei 18.00 \n69426 Börnersdorf Nr. 1A Dienstliegenschaften NaN 200.00 \n69434 Börnersdorf Nr. 1A Dienstliegenschaften Bundespolizei 620.00 \n69441 Börnersdorf Nr. 1A Dienstliegenschaften Bundespolizei 836.73 \n69959 Börnersdorf Nr. 1A Dienstliegenschaften NaN 27.50 \n70825 Egelstein 9 Dienstliegenschaften NaN 163.21 \n87401 Wernsdorfer Straße 99b Gewerbe/sonst. Nutzung 0 2096.00 \n88197 Wernsdorfer Straße 99b Gewerbe/sonst. Nutzung NaN 15.00 \n88232 Wernsdorfer Straße 99b Gewerbe/sonst. Nutzung 0 50.00 \n110638 Seestraße 61 Gewerbe/sonst. Nutzung NaN 45.00 \n110773 Birkenallee 9 Gewerbe/sonst. Nutzung NaN 917.00 \n117855 Buchfinkenweg 12a,b Gewerbe/sonst. 
Nutzung 0 523.00 \n117901 Birkenallee 7 Gewerbe/sonst. Nutzung NaN 919.00 \n119605 Krimhildstraße 26 Gewerbe/sonst. Nutzung NaN 702.00 \n119646 Wernsdorfer Str. 92 a Gewerbe/sonst. Nutzung NaN 1582.00 ", 179 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
idpostcodecitystreetareausersize
4888500114552Gemeinde Michendorf/ OT WilhelmshorstAn den Bergen 121Gewerbe/sonst. NutzungNaN1118.00
4891500415827Blankenfelde - Mahlow OT BlankenfeldeTrebbiner Damm 28Gewerbe/sonst. NutzungNaN1385.00
177731818215713Königs Wusterhausen OT WernsdorfAm Sandberg 5Gewerbe/sonst. NutzungNaN440.00
656856610572525Gutsbezirk Münsingen, gemfr. GebietEgelstein 9DienstliegenschaftenNaN98.30
667896683301825Bad Gottleuba-Berggießhübel OT BörnersdoBörnersdorf Nr. 1ADienstliegenschaftenNaN0.00
694216856901825Bad Gottleuba-Berggießhübel OT BörnersdoBörnersdorf Nr. 1ADienstliegenschaftenBundespolizei18.00
694266857201825Bad Gottleuba-Berggießhübel OT BörnersdoBörnersdorf Nr. 1ADienstliegenschaftenNaN200.00
694346857701825Bad Gottleuba-Berggießhübel OT BörnersdoBörnersdorf Nr. 1ADienstliegenschaftenBundespolizei620.00
694416858201825Bad Gottleuba-Berggießhübel OT BörnersdoBörnersdorf Nr. 1ADienstliegenschaftenBundespolizei836.73
699596892401825Bad Gottleuba-Berggießhübel OT BörnersdoBörnersdorf Nr. 1ADienstliegenschaftenNaN27.50
708256949472525Gutsbezirk Münsingen, gemfr. GebietEgelstein 9DienstliegenschaftenNaN163.21
874018042215713Königs Wusterhausen OT NiederlehmeWernsdorfer Straße 99bGewerbe/sonst. Nutzung02096.00
881978094715713Königs Wusterhausen OT NiederlehmeWernsdorfer Straße 99bGewerbe/sonst. NutzungNaN15.00
882328096915713Königs Wusterhausen OT NiederlehmeWernsdorfer Straße 99bGewerbe/sonst. Nutzung050.00
1106389574115713Königs Wusterhausen OT NiederlehmeSeestraße 61Gewerbe/sonst. NutzungNaN45.00
1107739583014621Schönwalde-Glien/ OT Schönwalde SiedlungBirkenallee 9Gewerbe/sonst. NutzungNaN917.00
11785510050015713Königs Wusterhausen OT WernsdorfBuchfinkenweg 12a,bGewerbe/sonst. Nutzung0523.00
11790110053014621Schönwalde-Glien/ OT Schönwalde SiedlungBirkenallee 7Gewerbe/sonst. NutzungNaN919.00
11960510165416321Ahrensfelde-Blumberg, OT LindenbergKrimhildstraße 26Gewerbe/sonst. NutzungNaN702.00
11964610168015713Königs Wusterhausen / NiederlehmeWernsdorfer Str. 92 aGewerbe/sonst. NutzungNaN1582.00
\n
" 180 | }, 181 | "execution_count": 8, 182 | "metadata": {}, 183 | "output_type": "execute_result" 184 | } 185 | ], 186 | "source": [ 187 | "df_broken_postcode\n", 188 | "# Looks good! ... let's update the original dataframe" 189 | ], 190 | "metadata": { 191 | "collapsed": false, 192 | "pycharm": { 193 | "name": "#%%\n" 194 | } 195 | } 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": 9, 200 | "outputs": [], 201 | "source": [ 202 | "df.update(df_broken_postcode)" 203 | ], 204 | "metadata": { 205 | "collapsed": false, 206 | "pycharm": { 207 | "name": "#%%\n" 208 | } 209 | } 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": 10, 214 | "outputs": [], 215 | "source": [ 216 | "# Let's make sure we really got all the postcodes\n", 217 | "assert df[df[\"postcode\"].fillna(-1).str.strip().str.len() > 5].size == 0" 218 | ], 219 | "metadata": { 220 | "collapsed": false, 221 | "pycharm": { 222 | "name": "#%%\n" 223 | } 224 | } 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": 11, 229 | "outputs": [], 230 | "source": [ 231 | "df[\"postcode\"] = df[\"postcode\"].str.strip()\n", 232 | "df[\"postcode\"] = pd.to_numeric(df[\"postcode\"]).astype(pd.Int64Dtype()) # Don't ask :D" 233 | ], 234 | "metadata": { 235 | "collapsed": false, 236 | "pycharm": { 237 | "name": "#%%\n" 238 | } 239 | } 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "source": [ 244 | "### 3. Fix IDs (some don't exist)\n", 245 | "#### replace all NaN with '-1' so we can use \"int\" datatype" 246 | ], 247 | "metadata": { 248 | "collapsed": false 249 | } 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": 12, 254 | "outputs": [], 255 | "source": [ 256 | "df[\"id\"] = df[\"id\"].fillna(-1).astype(int)" 257 | ], 258 | "metadata": { 259 | "collapsed": false, 260 | "pycharm": { 261 | "name": "#%%\n" 262 | } 263 | } 264 | }, 265 | { 266 | "cell_type": "markdown", 267 | "source": [ 268 | "### 4. 
Replace all str NaN with \"Unbekannt\" (Unknown)" 269 | ], 270 | "metadata": { 271 | "collapsed": false 272 | } 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": 13, 277 | "outputs": [ 278 | { 279 | "data": { 280 | "text/plain": " id postcode city \\\n0 1 24799 Owschlag \n1 2 59494 Soest \n2 3 81549 München Cincinnati/Wikinger/Leif/Penn/Lincoln/... \n3 4 81549 München \n4 5 81549 München \n... ... ... ... \n234740 180184 98527 Suhl, Stadt \n234741 180185 16278 Angermünde, Stadt \n234742 180186 15234 Frankfurt (Oder) \n234743 180187 45219 Essen \n234744 180188 8525 Plauen \n\n street area user \\\n0 Unbekannt Bundesforst Unbekannt \n1 Unbekannt Bundesforst Unbekannt \n2 Unbekannt Wohnen Unbekannt \n3 Minnewitstr. 12 (350 A8) Wohnen Unbekannt \n4 Cincinnatistr. 59 Wgh. 3-6 Wohnen Unbekannt \n... ... ... ... \n234740 Friedrich-König-Straße Gewerbe/sonst. Nutzung Unbekannt \n234741 Rudolf-Breitscheid-Straße Gewerbe/sonst. Nutzung Unbekannt \n234742 Traubenweg 18 Gewerbe/sonst. Nutzung Unbekannt \n234743 Ringstraße 199 Gewerbe/sonst. Nutzung Unbekannt \n234744 Alte Jößnitzer Straße 30 Gewerbe/sonst. Nutzung Unbekannt \n\n size \n0 1922.00 \n1 45000.00 \n2 0.00 \n3 72.00 \n4 77.45 \n... ... \n234740 986.00 \n234741 2212.00 \n234742 2716.00 \n234743 3892.00 \n234744 16097.99 \n\n[234745 rows x 7 columns]", 281 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
idpostcodecitystreetareausersize
0124799OwschlagUnbekanntBundesforstUnbekannt1922.00
1259494SoestUnbekanntBundesforstUnbekannt45000.00
2381549München Cincinnati/Wikinger/Leif/Penn/Lincoln/...UnbekanntWohnenUnbekannt0.00
3481549MünchenMinnewitstr. 12 (350 A8)WohnenUnbekannt72.00
4581549MünchenCincinnatistr. 59 Wgh. 3-6WohnenUnbekannt77.45
........................
23474018018498527Suhl, StadtFriedrich-König-StraßeGewerbe/sonst. NutzungUnbekannt986.00
23474118018516278Angermünde, StadtRudolf-Breitscheid-StraßeGewerbe/sonst. NutzungUnbekannt2212.00
23474218018615234Frankfurt (Oder)Traubenweg 18Gewerbe/sonst. NutzungUnbekannt2716.00
23474318018745219EssenRingstraße 199Gewerbe/sonst. NutzungUnbekannt3892.00
2347441801888525PlauenAlte Jößnitzer Straße 30Gewerbe/sonst. NutzungUnbekannt16097.99
\n

234745 rows × 7 columns

\n
" 282 | }, 283 | "execution_count": 13, 284 | "metadata": {}, 285 | "output_type": "execute_result" 286 | } 287 | ], 288 | "source": [ 289 | "str_columns = df.select_dtypes(include=[object]).columns.tolist()\n", 290 | "df[str_columns] = df[str_columns].fillna(value=\"Unbekannt\") # assigned back because fillna(inplace=True) on df[str_columns] would only fill a temporary copy\n", 291 | "df" 292 | ], 293 | "metadata": { 294 | "collapsed": false, 295 | "pycharm": { 296 | "name": "#%%\n" 297 | } 298 | } 299 | }, 300 | { 301 | "cell_type": "markdown", 302 | "source": [ 303 | "#### 5. Replace 0-User with \"Unbekannt\"" 304 | ], 305 | "metadata": { 306 | "collapsed": false 307 | } 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": 14, 312 | "outputs": [], 313 | "source": [ 314 | "df[\"user\"] = df[\"user\"].replace(\"0\",\"Unbekannt\") #0 = Unknown" 315 | ], 316 | "metadata": { 317 | "collapsed": false, 318 | "pycharm": { 319 | "name": "#%%\n" 320 | } 321 | } 322 | }, 323 | { 324 | "cell_type": "markdown", 325 | "source": [ 326 | "#### 6. Save Results" 327 | ], 328 | "metadata": { 329 | "collapsed": false 330 | } 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": 15, 335 | "outputs": [], 336 | "source": [ 337 | "df.to_csv(\"Fixed_Anlage_wo_pwd.csv\",index=False)" 338 | ], 339 | "metadata": { 340 | "collapsed": false, 341 | "pycharm": { 342 | "name": "#%%\n" 343 | } 344 | } 345 | } 346 | ], 347 | "metadata": { 348 | "kernelspec": { 349 | "display_name": "Python 3", 350 | "language": "python", 351 | "name": "python3" 352 | }, 353 | "language_info": { 354 | "codemirror_mode": { 355 | "name": "ipython", 356 | "version": 2 357 | }, 358 | "file_extension": ".py", 359 | "mimetype": "text/x-python", 360 | "name": "python", 361 | "nbconvert_exporter": "python", 362 | "pygments_lexer": "ipython2", 363 | "version": "2.7.6" 364 | } 365 | }, 366 | "nbformat": 4, 367 | "nbformat_minor": 0 368 | } -------------------------------------------------------------------------------- /liegenschaften.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/bundesAPI/bundesanstalt-immobilienaufgaben-liegenschaften/3794a07ab888ccee5cbe46a91ae11d2bb4dce990/liegenschaften.png --------------------------------------------------------------------------------