├── .gitignore ├── CHANGELOG.md ├── LICENSE ├── MANIFEST.in ├── README.rst ├── docs ├── Makefile ├── _static │ └── .gitignore ├── about.rst ├── api.rst ├── conf.py ├── developing_scrapers.rst ├── index.rst ├── make.bat └── using_scrapers.rst ├── pytest.ini ├── requirements.txt ├── setup.cfg ├── setup.py ├── statscraper ├── BaseScraperList.py ├── BaseScraperObject.py ├── DimensionValue.py ├── ValueList.py ├── __init__.py ├── base_scraper.py ├── compat.py ├── datatypes.py ├── datatypes │ ├── LICENSE │ ├── README.md │ ├── datatypes.csv │ └── values │ │ ├── currencies.csv │ │ ├── genders.csv │ │ ├── marital_statuses.csv │ │ ├── periods │ │ ├── academic-terms │ │ │ └── semesters.csv │ │ ├── months.csv │ │ └── quarters.csv │ │ ├── regions │ │ ├── eu.csv │ │ └── sweden │ │ │ ├── counties.csv │ │ │ └── municipalities.csv │ │ └── road_types.csv ├── exceptions.py └── scrapers │ ├── CranesScraper.py │ ├── PXWebScraper.py │ ├── SCBScraper.py │ ├── SMHIScraper.py │ ├── StatistikcentralenScraper.py │ ├── VantetiderScraper.py │ ├── VehicleScraper.py │ ├── __init__.py │ ├── uka_scraper.py │ └── work_injury_scraper.py ├── tests ├── scrapertests │ ├── test_injury_scraper.py │ ├── test_pxweb_scraper.py │ ├── test_smhi_scraper.py │ ├── test_vantetider_scraper.py │ └── test_vehicle_scraper.py ├── test-datatypes.py ├── test-scb.py ├── test_base_scraper.py ├── test_dialects.py └── test_resultset.py └── version.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | /__init__.py 6 | *pyc 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | env/ 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .coverage 43 | .coverage.* 44 | .cache 45 | nosetests.xml 46 | coverage.xml 47 | *,cover 48 | .hypothesis/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | 58 | # Flask stuff: 59 | instance/ 60 | .webassets-cache 61 | 62 | # Scrapy stuff: 63 | .scrapy 64 | 65 | # Sphinx documentation 66 | docs/_build/ 67 | 68 | # PyBuilder 69 | target/ 70 | 71 | # IPython Notebook 72 | .ipynb_checkpoints 73 | 74 | # pyenv 75 | .python-version 76 | 77 | # celery beat schedule file 78 | celerybeat-schedule 79 | 80 | # dotenv 81 | .env 82 | 83 | # virtualenv 84 | venv/ 85 | ENV/ 86 | 87 | # Spyder project settings 88 | .spyderproject 89 | 90 | # Rope project settings 91 | .ropeproject 92 | 93 | # OS X 94 | .DS_Store 95 | geckodriver.log -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | - 2.0.2 2 | 3 | - Remove debug prints from SMHI scraper 4 | - Upgrade BeautifulSoup to work with Python 3.10+ 5 | 6 | - 2.0.1 7 | 8 | - Use https endpoint in SCB Scraper. 9 | 10 | - 2.0.0 11 | 12 | - Python 2 support deprecated. We will slowly phase out support.
13 | - Fix a bug with `DimensionValue.translate()` in Python 3. 14 | - Make `translate()` raise errors when it cannot translate. 15 | - The municipality of Gotland is now known as 'Region Gotland' (was: Gotlands kommun). 16 | - Added some useful built-in filters to the SCB scraper, to get results by e.g. municipality. 17 | - Upstream fix for typo in datatype region:sweden/municipality Vännäs kommun 18 | - The SCB scraper now raises an exception when an error message is returned 19 | - Fix a Python 3 bug in the SMHI scraper 20 | 21 | - 1.0.7 22 | 23 | - Bara kommun added to Swedish municipalities 24 | - Remove logic from SCBScraper that is already handled by BaseScraper 25 | 26 | - 1.0.6 27 | 28 | - Added dialect:skatteverket (two/four digit county/municipality codes) 29 | - Added data type for road category 30 | - Make SCB scraper treat a “Region” as, well, a region 31 | 32 | - 1.0.5 33 | - Added station key to SMHI scraper 34 | 35 | - 1.0.4 36 | - Added SMHI scraper 37 | 38 | - 1.0.3 39 | - Re-add demo scrapers that accidentally got left out in the first release 40 | 41 | - 1.0.0 42 | - First release 43 | 44 | - 1.0.0.dev2 45 | 46 | - Implement translation 47 | - Add Dataset.fetch_next() as generator for results 48 | 49 | - 1.0.0.dev1 50 | 51 | - Semantic versioning starts here 52 | - Implement datatypes and dialects 53 | 54 | - 0.0.2 55 | 56 | - Added some demo scrapers 57 | - The cursor is now moved when accessing datasets 58 | - Renamed methods for moving cursor: move_up(), move_to() 59 | - Added tests 60 | - Added datatypes subtree 61 | 62 | - 0.0.1 63 | - First version 64 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Journalism++ 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.rst 2 | recursive-include statscraper/datatypes * 3 | recursive-include statscraper/scrapers *.py 4 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | Statscraper is a base library for building web scrapers for statistical data, with a helper ontology for (primarily Swedish) statistical data.
A set of ready-to-use scrapers is included. 2 | 3 | For users 4 | ========= 5 | 6 | You can use Statscraper as a foundation for your next scraper, or try out any of the included scrapers. With Statscraper comes a unified interface for scraping, and some useful helper methods for scraper authors. 7 | 8 | Full documentation: ReadTheDocs_ 9 | 10 | For updates and discussion: Facebook_ 11 | 12 | By `Journalism++ Stockholm `_, and Robin Linderborg. 13 | 14 | Installing 15 | ---------- 16 | 17 | .. code:: bash 18 | 19 | pip install statscraper 20 | 21 | Using a scraper 22 | --------------- 23 | Scrapers act like “cursors” that move around a hierarchy of datasets and collections of datasets. Collections and datasets are referred to as “items”. 24 | 25 | :: 26 | 27 | ┏━ Collection ━━━ Collection ━┳━ Dataset 28 | ROOT ━╋━ Collection ━┳━ Dataset ┣━ Dataset 29 | ┗━ Collection ┣━ Dataset ┗━ Dataset 30 | ┗━ Dataset 31 | 32 | ╰─────────────────────────┬───────────────────────╯ 33 | items 34 | 35 | Here's a simple example, with a scraper that returns only a single dataset: The number of cranes spotted at Hornborgarsjön each day as scraped from `Länsstyrelsen i Västra Götalands län <http://web05.lansstyrelsen.se/transtat_O/transtat.asp>`_. 36 | 37 | .. code:: python 38 | 39 | >>> from statscraper.scrapers import Cranes 40 | 41 | >>> scraper = Cranes() 42 | >>> scraper.items # List available datasets 43 | [<Dataset: Number of cranes>] 44 | 45 | >>> dataset = scraper["Number of cranes"] 46 | >>> dataset.dimensions 47 | [<Dimension: date (Day of the month)>, <Dimension: month>, <Dimension: year>] 48 | 49 | >>> row = dataset.data[0] # first row in this dataset 50 | >>> row 51 | <Result: 7 (value)> 52 | >>> row.dict 53 | {'value': '7', u'date': u'7', u'month': u'march', u'year': u'2015'} 54 | 55 | >>> df = dataset.data.pandas # get this dataset as a Pandas dataframe 56 | 57 | Building a scraper 58 | ------------------ 59 | Scrapers are built by extending a base scraper, or a derivative of it. You need to provide a method for listing datasets or collections of datasets, and for fetching data. 60 | 61 | Statscraper is built for statistical data, meaning that it's most useful when the data you are scraping/fetching can be organized with a numerical value in each row: 62 | 63 | ======== ====== ======= 64 | city year value 65 | ======== ====== ======= 66 | Voi 2009 45483 67 | Kabarnet 2006 10191 68 | Taveta 2009 67505 69 | ======== ====== ======= 70 | 71 | A scraper can override these methods: 72 | 73 | * `_fetch_itemslist(item)` to yield collections or datasets at the current cursor position 74 | * `_fetch_data(dataset)` to yield rows from the currently selected dataset 75 | * `_fetch_dimensions(dataset)` to yield dimensions available for the currently selected dataset 76 | * `_fetch_allowed_values(dimension)` to yield allowed values for a dimension 77 | 78 | A number of hooks are available for more advanced scrapers. These are called by adding the `on` decorator to a method: 79 | 80 | .. code:: python 81 | 82 | @BaseScraper.on("up") 83 | def my_method(self): 84 | # Do something when the user moves up one level 85 | 86 | For developers 87 | ============== 88 | These instructions are for developers working on the BaseScraper. See above for instructions for developing a scraper using the BaseScraper. 89 | 90 | Downloading 91 | ----------- 92 | 93 | .. code:: bash 94 | 95 | git clone https://github.com/jplusplus/statscraper 96 | python setup.py install 97 | 98 | This repo includes `statscraper-datatypes` as a subtree. To update this, do: 99 | 100 | .. 
code:: bash 101 | 102 | git subtree pull --prefix statscraper/datatypes git@github.com:jplusplus/statscraper-datatypes.git master --squash 103 | 104 | 105 | Tests 106 | ----- 107 | 108 | Since 2.0.0 we are using pytest. To run an individual test: 109 | 110 | .. code:: bash 111 | 112 | python3 -m pytest tests/test-datatypes.py 113 | 114 | 115 | Changelog 116 | --------- 117 | The changelog has been moved to `CHANGELOG.md `_. 118 | 119 | .. _Facebook: https://www.facebook.com/groups/skrejperpark 120 | .. _ReadTheDocs: http://statscraper.readthedocs.io 121 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = statscraper 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /docs/_static/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jplusplus/statscraper/c75ed0474967c96c86f8def1223e55aebb80f631/docs/_static/.gitignore -------------------------------------------------------------------------------- /docs/about.rst: -------------------------------------------------------------------------------- 1 | ================= 2 | About Statscraper 3 | ================= 4 | 5 | Statscraper is a base library for building web scrapers for statistical data, with a helper ontology for (primarily Swedish) statistical data. A set of ready-to-use scrapers are included. With Statscraper comes a unified interface for scraping, and some useful helper methods for scraper authors. 6 | 7 | Statscraper is developed by Jens Finnäs and Leo Wallentin from Journalism++, and Robin Linderborg from SVT Nyheter. 8 | 9 | The first stable version was released in August 2017. Statscraper is sponsored by Internetfonden/Stiftelsen för internetinfrastruktur and Journalism++ Stockholm. 10 | -------------------------------------------------------------------------------- /docs/api.rst: -------------------------------------------------------------------------------- 1 | ================= 2 | API Documentation 3 | ================= 4 | 5 | Documentation of statscraper's public API. 6 | 7 | 8 | Main Interface 9 | -------------- 10 | 11 | .. autoclass:: statscraper.BaseScraper 12 | :members: 13 | .. autoclass:: statscraper.BaseScraperList 14 | :members: get 15 | .. autoclass:: statscraper.BaseScraperObject 16 | .. autoclass:: statscraper.Collection 17 | .. autoclass:: statscraper.Dataset 18 | .. autoclass:: statscraper.Dimension 19 | .. autoclass:: statscraper.DimensionList 20 | .. autoclass:: statscraper.DimensionValue 21 | .. autoclass:: statscraper.Item 22 | .. autoclass:: statscraper.Result 23 | .. autoclass:: statscraper.ResultSet 24 | .. autoclass:: statscraper.ValueList 25 | 26 | 27 | Exceptions 28 | -------------- 29 | 30 | .. 
autoclass:: statscraper.exceptions.DatasetNotInView 31 | .. autoclass:: statscraper.exceptions.InvalidData 32 | .. autoclass:: statscraper.exceptions.InvalidID 33 | .. autoclass:: statscraper.exceptions.NoSuchDatatype 34 | .. autoclass:: statscraper.exceptions.NoSuchItem -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import os 5 | import sys 6 | sys.path.insert(0, os.path.abspath('..')) 7 | 8 | from version import long_version, short_version, name, copyright, authors, short_desc 9 | 10 | # 11 | # statscraper documentation build configuration file, created by 12 | # sphinx-quickstart on Sun Mar 12 19:20:49 2017. 13 | # 14 | # This file is execfile()d with the current directory set to its 15 | # containing dir. 16 | # 17 | # Note that not all possible configuration values are present in this 18 | # autogenerated file. 19 | # 20 | # All configuration values have a default; values that are commented out 21 | # serve to show the default. 22 | 23 | # If extensions (or modules to document with autodoc) are in another directory, 24 | # add these directories to sys.path here. If the directory is relative to the 25 | # documentation root, use os.path.abspath to make it absolute, like shown here. 26 | # 27 | # import os 28 | # import sys 29 | # sys.path.insert(0, os.path.abspath('.')) 30 | 31 | 32 | # -- General configuration ------------------------------------------------ 33 | 34 | # If your documentation needs a minimal Sphinx version, state it here. 35 | # 36 | # needs_sphinx = '1.0' 37 | 38 | # Add any Sphinx extension module names here, as strings. They can be 39 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 40 | # ones. 41 | extensions = ['sphinx.ext.autodoc'] 42 | 43 | # Add any paths that contain templates here, relative to this directory. 44 | templates_path = ['_templates'] 45 | 46 | # The suffix(es) of source filenames. 47 | # You can specify multiple suffix as a list of string: 48 | # 49 | # source_suffix = ['.rst', '.md'] 50 | source_suffix = '.rst' 51 | 52 | # The master toctree document. 53 | master_doc = 'index' 54 | 55 | # General information about the project. 56 | project = name 57 | copyright = copyright 58 | author = authors 59 | 60 | # The version info for the project you're documenting, acts as replacement for 61 | # |version| and |release|, also used in various other places throughout the 62 | # built documents. 63 | # 64 | # The short X.Y version. 65 | version = short_version 66 | # The full version, including alpha/beta/rc tags. 67 | release = long_version 68 | 69 | # The language for content autogenerated by Sphinx. Refer to documentation 70 | # for a list of supported languages. 71 | # 72 | # This is also used if you do content translation via gettext catalogs. 73 | # Usually you set "language" from the command line for these cases. 74 | language = None 75 | 76 | # List of patterns, relative to source directory, that match files and 77 | # directories to ignore when looking for source files. 78 | # This patterns also effect to html_static_path and html_extra_path 79 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 80 | 81 | # The name of the Pygments (syntax highlighting) style to use. 82 | pygments_style = 'friendly' 83 | 84 | # If true, `todo` and `todoList` produce output, else they produce nothing. 
85 | todo_include_todos = False 86 | 87 | 88 | # -- Options for HTML output ---------------------------------------------- 89 | 90 | # The theme to use for HTML and HTML Help pages. See the documentation for 91 | # a list of builtin themes. 92 | # 93 | html_theme = 'alabaster' 94 | 95 | # Theme options are theme-specific and customize the look and feel of a theme 96 | # further. For a list of options available for each theme, see the 97 | # documentation. 98 | # 99 | # html_theme_options = {} 100 | 101 | # Add any paths that contain custom static files (such as style sheets) here, 102 | # relative to this directory. They are copied after the builtin static files, 103 | # so a file named "default.css" will overwrite the builtin "default.css". 104 | html_static_path = ['_static'] 105 | 106 | html_sidebars = { 107 | '**': ['globaltoc.html', 'sourcelink.html', 'searchbox.html'], 108 | } 109 | 110 | 111 | # -- Options for HTMLHelp output ------------------------------------------ 112 | 113 | # Output file base name for HTML help builder. 114 | htmlhelp_basename = 'statscraperdoc' 115 | 116 | 117 | # -- Options for LaTeX output --------------------------------------------- 118 | 119 | latex_elements = { 120 | # The paper size ('letterpaper' or 'a4paper'). 121 | # 122 | # 'papersize': 'letterpaper', 123 | 124 | # The font size ('10pt', '11pt' or '12pt'). 125 | # 126 | # 'pointsize': '10pt', 127 | 128 | # Additional stuff for the LaTeX preamble. 129 | # 130 | # 'preamble': '', 131 | 132 | # Latex figure (float) alignment 133 | # 134 | # 'figure_align': 'htbp', 135 | } 136 | 137 | # Grouping the document tree into LaTeX files. List of tuples 138 | # (source start file, target name, title, 139 | # author, documentclass [howto, manual, or own class]). 140 | latex_documents = [ 141 | (master_doc, 'statscraper.tex', 'statscraper Documentation', 142 | authors, 'manual'), 143 | ] 144 | 145 | 146 | # -- Options for manual page output --------------------------------------- 147 | 148 | # One entry per manual page. List of tuples 149 | # (source start file, name, description, authors, manual section). 150 | man_pages = [ 151 | (master_doc, name, 'statscraper Documentation', 152 | [author], 1) 153 | ] 154 | 155 | 156 | # -- Options for Texinfo output ------------------------------------------- 157 | 158 | # Grouping the document tree into Texinfo files. List of tuples 159 | # (source start file, target name, title, author, 160 | # dir menu entry, description, category) 161 | texinfo_documents = [ 162 | (master_doc, name, 'statscraper Documentation', 163 | author, name, short_desc, 'Miscellaneous'), 164 | ] 165 | -------------------------------------------------------------------------------- /docs/developing_scrapers.rst: -------------------------------------------------------------------------------- 1 | =================== 2 | Developing scrapers 3 | =================== 4 | 5 | The scraper can navigate though an hierarchy of collections and datasets. Collections and datasets are refered to as “items”. 6 | 7 | :: 8 | 9 | ┏━ Collection ━━━ Collection ━┳━ Dataset 10 | ROOT ━╋━ Collection ━┳━ Dataset ┣━ Dataset 11 | ┗━ Collection ┣━ Dataset ┗━ Dataset 12 | ┗━ Dataset 13 | 14 | ╰─────────────────────────┬───────────────────────╯ 15 | items 16 | 17 | 18 | Scrapers are built by extending the BaseScraper class, or a subclass of it. Every scraper must override the methods :code:`_fetch_itemslist` and :code:`_fetch_data`: 19 | 20 | * :code:`_fetch_itemslist(self, item)` must yield items at the current position. 
21 | * :code:`_fetch_data(self, dataset, query)` must yield rows from a dataset. 22 | 23 | Other methods that a scraper can choose to override are: 24 | 25 | * :code:`_fetch_dimensions(self, dataset)` should yield dimensions available on a dataset. 26 | * :code:`_fetch_allowed_values(self, dimension)` should yield allowed values for a dimension. 27 | 28 | A number of hooks are available for more advanced scrapers. These are called by adding the :code:`on` decorator to a method: 29 | 30 | .. code:: python 31 | 32 | @BaseScraper.on("up") 33 | def my_method(self): 34 | # Do something when the cursor moves up one level 35 | 36 | Check out the `statscraper/scrapers <https://github.com/jplusplus/statscraper/tree/master/statscraper/scrapers>`_ directory for some scraper examples. 37 | 38 | Below is the full code for the CranesScraper scraper used in the chapter `Using Scrapers `_: 39 | 40 | .. code:: python 41 | 42 | # encoding: utf-8 43 | """ A scraper to fetch daily cranes sightings at Hornborgasjön 44 | from http://web05.lansstyrelsen.se/transtat_O/transtat.asp 45 | This is intended to be a minimal example of a scraper 46 | using Beautiful Soup. 47 | """ 48 | import requests 49 | from bs4 import BeautifulSoup 50 | from statscraper import BaseScraper, Dataset, Dimension, Result 51 | 52 | 53 | class Cranes(BaseScraper): 54 | 55 | def _fetch_itemslist(self, item): 56 | """ There is only one dataset. """ 57 | yield Dataset("Number of cranes") 58 | 59 | def _fetch_dimensions(self, dataset): 60 | """ Declaring available dimensions like this is not mandatory, 61 | but nice, especially if they differ from dataset to dataset. 62 | 63 | If you are using a built-in datatype, you can specify the dialect 64 | you are expecting, to have values normalized. This scraper will 65 | look for Swedish month names (e.g. 'Januari'), but return them 66 | according to the Statscraper standard ('january'). 67 | """ 68 | yield Dimension(u"date", label="Day of the month") 69 | yield Dimension(u"month", datatype="month", dialect="swedish") 70 | yield Dimension(u"year", datatype="year") 71 | 72 | def _fetch_data(self, dataset, query=None): 73 | html = requests.get("http://web05.lansstyrelsen.se/transtat_O/transtat.asp").text 74 | soup = BeautifulSoup(html, 'html.parser') 75 | table = soup.find("table", "line").find_all("table")[2].findNext("table") 76 | rows = table.find_all("tr") 77 | column_headers = rows.pop(0).find_all("td", recursive=False) 78 | years = [x.text for x in column_headers[2:]] 79 | for row in rows: 80 | cells = row.find_all("td") 81 | date = cells.pop(0).text 82 | month = cells.pop(0).text 83 | i = 0 84 | for value in cells: 85 | # Each column from here is a year. 86 | if value.text: 87 | yield Result(value.text.encode("utf-8"), { 88 | "date": date, 89 | "month": month, 90 | "year": years[i], 91 | }) 92 | i += 1 93 | 94 | ----- 95 | Hooks 96 | ----- 97 | Some scrapers might need to execute certain tasks as the user moves around the items tree. There are a number of hooks that can be used to run code in response to an event. A scraper class method is attached to a hook by using the :code:`BaseScraper.on` decorator, with the name of the hook as the only argument. Here is an example of a hook in a Selenium-based scraper, used to refresh the browser each time the end user navigates to the top-most collection. 98 | 99 | .. 
code:: python 100 | 101 | @BaseScraper.on("top") 102 | def refresh_browser(self): 103 | """ Refresh browser, to reset all forms """ 104 | self.browser.refresh() 105 | 106 | Available hooks are: 107 | 108 | * init: Called when initiating the class 109 | * up: Called when trying to go up one level (even if the scraper failed moving up) 110 | * top: Called when moving to top level 111 | * select: Called when trying to move to a specific Collection or Dataset. The target item will be provided as an argument to the function. 112 | 113 | 114 | 115 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | =========================================== 2 | Statscraper: Standardizing Swedish scrapers 3 | =========================================== 4 | 5 | **Statscraper** provides a common set of guidelines, base classes and standards for writing scrapers for Swedish agencies' websites. Scrapers that comply with these standards provide a unified abstraction layer to the end-user, in terms of both usage and data output. 6 | 7 | .. toctree:: 8 | :maxdepth: 2 9 | :caption: Contents: 10 | 11 | about 12 | using_scrapers 13 | developing_scrapers 14 | api 15 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | set SPHINXPROJ=statscraper 13 | 14 | if "%1" == "" goto help 15 | 16 | %SPHINXBUILD% >NUL 2>NUL 17 | if errorlevel 9009 ( 18 | echo. 19 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 20 | echo.installed, then set the SPHINXBUILD environment variable to point 21 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 22 | echo.may add the Sphinx directory to PATH. 23 | echo. 24 | echo.If you don't have Sphinx installed, grab it from 25 | echo.http://sphinx-doc.org/ 26 | exit /b 1 27 | ) 28 | 29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 30 | goto end 31 | 32 | :help 33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 34 | 35 | :end 36 | popd 37 | -------------------------------------------------------------------------------- /docs/using_scrapers.rst: -------------------------------------------------------------------------------- 1 | ============== 2 | Using scrapers 3 | ============== 4 | 5 | Every scraper built on Statscraper shares the same interface towards the user. Here's sample code using one of the included demo scrapers, to fetch the number of cranes spotted at Hornborgarsjön each day from `Länsstyrelsen i Västra Götalands län <http://web05.lansstyrelsen.se/transtat_O/transtat.asp>`_: 6 | 7 | .. 
code:: python 8 | 9 | >>> from statscraper.scrapers import Cranes 10 | 11 | >>> scraper = Cranes() 12 | >>> scraper.items # List available datasets 13 | [<Dataset: Number of cranes>] 14 | 15 | >>> dataset = scraper["Number of cranes"] 16 | >>> dataset.dimensions 17 | [<Dimension: date (Day of the month)>, <Dimension: month>, <Dimension: year>] 18 | 19 | >>> row = dataset.data[0] # first row in this dataset 20 | >>> row 21 | <Result: 7 (value)> 22 | >>> row.dict 23 | {'value': '7', u'date': u'7', u'month': u'march', u'year': u'2015'} 24 | >>> row.int 25 | 7 26 | >>> row.tuple 27 | ('7', {u'date': u'7', u'month': u'march', u'year': u'2015'}) 28 | 29 | >>> df = dataset.data.pandas # get this dataset as a Pandas dataframe 30 | 31 | 32 | Exploring sites 33 | --------------- 34 | Scrapers act like “cursors” that move around a hierarchy of datasets and collections of datasets. Collections and datasets are referred to as “items”. 35 | 36 | :: 37 | 38 | ┏━ Collection ━━━ Collection ━┳━ Dataset 39 | ROOT ━╋━ Collection ━┳━ Dataset ┣━ Dataset 40 | ┗━ Collection ┣━ Dataset ┗━ Dataset 41 | ┗━ Dataset 42 | 43 | ╰─────────────────────────┬───────────────────────╯ 44 | items 45 | 46 | The cursor is moved around the item tree as needed when you access properties or data, but you can also move manually around the items, if you want to be in full control. Some scrapers, e.g. those that need to fill out and post forms, or handle session data, might require that you move the cursor around manually. For most simple scrapers, e.g. those accessing an API, this should not be necessary. 47 | 48 | Moving the cursor manually: 49 | 50 | .. code:: python 51 | 52 | >>> from statscraper.scrapers import PXWeb 53 | 54 | >>> scraper = PXWeb(base_url="http://pxnet2.stat.fi/pxweb/api/v1/sv/StatFin/") 55 | >>> scraper.items 56 | [, , ...] 57 | 58 | >>> scraper.move_to("vrm").move_to("synt").move_to("080_synt_tau_203.px") 59 | >>> scraper.current_item 60 | 61 | 62 | >>> scraper.move_up() 63 | >>> scraper.current_item 64 | 65 | >>> scraper.move_to("010_synt_tau_101.px") 66 | >>> scraper.current_item 67 | 68 | 69 | >>> scraper.move_to_top() 70 | >>> scraper.move_to(0) # Moving by index works too 71 | 72 | 73 | The datasets above could also be accessed like this: 74 | 75 | .. code:: python 76 | 77 | >>> from statscraper.scrapers import PXWeb 78 | 79 | >>> scraper = PXWeb(base_url="http://pxnet2.stat.fi/pxweb/api/v1/sv/StatFin/") 80 | 81 | >>> collection = scraper["vrm"]["synt"] 82 | >>> collection 83 | 84 | 85 | >>> dataset_1 = collection["080_synt_tau_203.px"] 86 | >>> dataset_2 = collection["010_synt_tau_101.px"] 87 | 88 | At any given point, :code:`scraper["foo"]` is shorthand for :code:`scraper.current_item.items["foo"]`. 89 | 90 | If you want to loop through every available dataset a scraper can offer, there is a :code:`Scraper.descendants` property that will recursively move to every item in the tree. Here is an example that will find all datasets in the SCB API that have monthly data: 91 | 92 | .. code:: python 93 | 94 | >>> from statscraper.scrapers import SCB 95 | 96 | >>> scraper = SCB() 97 | >>> for dataset in scraper.descendants: 98 | >>> if dataset.dimensions["Tid"].label == u"månad": 99 | >>> print("Ahoy! Dataset %s has monthly data!" % dataset) 100 | 101 | Exploring datasets 102 | ------------------ 103 | 104 | Much like itemslists (:code:`Collection.items`), datasets are only fetched when you are inspecting or interacting with them. 105 | 106 | The actual data is stored in a property called data: 107 | 108 | .. 
code:: python 109 | 110 | >>> from statscraper.scrapers import Cranes 111 | 112 | >>> scraper = Cranes() 113 | >>> dataset = scraper.items[0] 114 | >>> for row in dataset.data: 115 | >>> print("%s cranes were spotted on %s" % (row.value, row["date"])) 116 | 117 | The data property will hold a list of result objects. The list can be converted to a few other formats, e.g. a pandas dataframe: 118 | 119 | .. code:: python 120 | 121 | >>> from statscraper.scrapers import Cranes 122 | 123 | >>> scraper = Cranes() 124 | >>> dataset = scraper.items[0] 125 | >>> df = dataset.data.pandas # convert to pandas dataframe 126 | 127 | If you want to query a site or database for some subset of the available data, you can use the :code:`fetch()` method on the dataset (or on the scraper, to fetch data from the current position, if any): 128 | 129 | .. code:: python 130 | 131 | >>> dataset = scraper.items[0] 132 | >>> data = dataset.fetch(query={'year': "2017"}) 133 | 134 | or 135 | 136 | .. code:: python 137 | 138 | >>> scraper.move_to(0) 139 | >>> data = scraper.fetch(query={'year': "2017"}) 140 | 141 | Available dimensions can be inspected through the .dimensions property: 142 | 143 | .. code:: python 144 | 145 | >>> dataset.dimensions 146 | [, ] 147 | 148 | Note however that a scraper does not necessarily need to provide dimensions. If :code:`Dataset.dimensions` is None, it could simply mean that the scraper itself is not sure what to expect from the data. 149 | 150 | Dialects 151 | -------- 152 | 153 | Scraper authors can use the included :code:`Datatypes` module to have a standardised ontology for common statistical dimensions. If a dimension uses a built-in datatype, it can be translated to a different dialect. For instance, Swedish municipalities come in the following dialects: 154 | 155 | - :code:`short`: :code:`"Ale"` 156 | - :code:`numerical`: :code:`"1440"` 157 | - :code:`wikidata`: :code:`"Q498470"` 158 | - :code:`brå`: :code:`"8617"` 159 | - :code:`scb`: :code:`"1440 Ale kommun"` 160 | 161 | By default, Statscraper prefers human-readable representations, and municipality values are internally stored like this: :code:`u"Borås kommun"`. The philosophy here is that human-readable ids speed up debugging and make it easy to spot errors during scraping and analysis. Yes, we do use Unicode for ids. It's 2017 after all. 162 | 163 | .. code:: python 164 | 165 | >>> from statscraper.scrapers import Cranes 166 | 167 | >>> scraper = Cranes() 168 | >>> data = scraper.items[0].data 169 | >>> row = data[0] 170 | >>> row["month"] 171 | <DimensionValue: march (month)> 172 | >>> row["month"].translate("swedish") 173 | u'mars' 174 | 175 | For available datatypes, domains, values and dialects, see the `statscraper-datatypes repo <https://github.com/jplusplus/statscraper-datatypes>`_. 
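You can also inspect a datatype and its dialects directly, through the :code:`Datatype` class that statscraper exports. A minimal sketch — the datatype id :code:`municipality` and the exact output shown are assumptions here; see datatypes.csv in the statscraper-datatypes repo for the authoritative ids, values and dialects:

.. code:: python

    >>> from statscraper import Datatype

    >>> datatype = Datatype("municipality")  # id assumed; datatype ids live in datatypes.csv
    >>> datatype.dialects  # output is illustrative
    ['short', 'numerical', 'wikidata', 'brå', 'scb', 'skatteverket']
    >>> datatype.allowed_values["Ale kommun"].dialects["numerical"]
    ['1440']

Each allowed value carries a dict mapping dialect names to lists of representations, which is what :code:`DimensionValue.translate()` uses under the hood.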
176 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | testpaths = tests 3 | norecursedirs = scrapertests 4 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # Core dependencies 2 | pandas==0.25.3 3 | requests==2.28.2 4 | six==1.11.0 5 | Sphinx==1.6.7 6 | pytest==5.3.5 7 | 8 | # Scraper dependencies 9 | beautifulsoup4==4.11.1 10 | selenium==3.9.0 11 | xlrd==1.0.0 12 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | universal = 0 3 | 4 | [metadata] 5 | license_file = LICENSE 6 | 7 | [flake8] 8 | max-line-length = 90 9 | 10 | [options] 11 | python_requires = >=3.6 12 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | from version import version, name, authors, email, short_desc 3 | 4 | 5 | def readme(): 6 | """Import README for use as long_description.""" 7 | with open("README.rst") as f: 8 | return f.read() 9 | 10 | 11 | setup( 12 | name=name, 13 | version=version, 14 | description=short_desc, 15 | long_description=readme(), 16 | url="https://github.com/jplusplus/statscraper", 17 | author=authors, 18 | author_email=email, 19 | license="MIT", 20 | packages=["statscraper"], 21 | zip_safe=False, 22 | install_requires=[ 23 | "pandas", 24 | "six", 25 | "requests", 26 | ], 27 | include_package_data=True, 28 | download_url="https://github.com/jplusplus/skrejperpark/archive/%s.tar.gz" 29 | % version, 30 | ) 31 | -------------------------------------------------------------------------------- /statscraper/BaseScraperList.py: -------------------------------------------------------------------------------- 1 | import six 2 | from .compat import unicode 3 | from .exceptions import NoSuchItem 4 | 5 | 6 | class BaseScraperList(list): 7 | """ Lists of dimensions, values, etc. all inherit this class 8 | for some common convenience methods, such as get_by_label() 9 | """ 10 | 11 | _CONTAINS = object 12 | 13 | def get(self, key): 14 | """Provide alias for bracket notation.""" 15 | return self[key] 16 | 17 | def get_by_label(self, label): 18 | """ Return the first item with a specific label, 19 | or None. 
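E.g. dataset.dimensions.get_by_label("Day of the month") returns the date dimension of the bundled Cranes demo scraper.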
20 | """ 21 | return next((x for x in self if x.label == label), None) 22 | 23 | def __getitem__(self, key): 24 | """ Make it possible to get item by id or value identity.""" 25 | if isinstance(key, six.string_types): 26 | if isinstance(key, unicode): 27 | def f(x): 28 | return (x.id == key) 29 | else: 30 | def f(x): 31 | return (x.id == unicode(key, encoding="utf-8")) 32 | elif isinstance(key, self._CONTAINS): 33 | def f(x): 34 | return (x is key) 35 | else: 36 | return list.__getitem__(self, key) 37 | 38 | try: 39 | return next(iter(filter(f, self))) 40 | except StopIteration: 41 | # No such item 42 | raise NoSuchItem("No such %s: %s" % (self._CONTAINS.__name__, key)) 43 | 44 | def __contains__(self, item): 45 | """ Make the 'in' keyword check for value/id """ 46 | if isinstance(item, six.string_types): 47 | return bool(len(list(filter(lambda x: x.value == item, self)))) 48 | else: 49 | return super(BaseScraperList, self).__contains__(item) -------------------------------------------------------------------------------- /statscraper/BaseScraperObject.py: -------------------------------------------------------------------------------- 1 | import six 2 | 3 | 4 | class BaseScraperObject(object): 5 | """ Objects like items, dimensions, values etc all inherit 6 | this class. BaseScraperObjects are typicalliy stored in a 7 | BaseScraperList. 8 | """ 9 | 10 | def get(self, key): 11 | """Provide alias for bracket notation.""" 12 | return self[key] 13 | 14 | @property 15 | def value(self): 16 | """ This is the value used for testing membership, 17 | comparison, etc. Overloaded for classes that store 18 | a value separate from the id, e.g. DimensionValue, 19 | that might have something like {id: 'year', value: 2017} 20 | """ 21 | if hasattr(self, '_value'): 22 | return self._value 23 | else: 24 | return self.id 25 | 26 | @value.setter 27 | def value(self, value): 28 | """ This is the value used for testing membership, 29 | comparison, etc. Overloaded for classes that store 30 | a value separate from the id, e.g. 
DimensionValue, 31 | that might have something like {id: 'year', value: 2017} 32 | """ 33 | self._value = value 34 | 35 | def __eq__(self, other): 36 | """ Enable equality check by string """ 37 | if self is other: 38 | return True 39 | elif isinstance(other, six.string_types): 40 | return (self.value == other) 41 | else: 42 | return super(BaseScraperObject, self) == other 43 | 44 | def __nonzero__(self): 45 | """ Make nonzero check value """ 46 | return bool(self.value) 47 | 48 | def __len__(self): 49 | """ Make len check value """ 50 | return len(self.value) 51 | 52 | def __int__(self): 53 | """ Make int return value """ 54 | return int(self.value) 55 | 56 | def __str__(self): 57 | if isinstance(self.value, six.string_types): 58 | try: 59 | if six.PY2: 60 | return self.value.encode("utf-8") 61 | else: 62 | return self.value 63 | except (UnicodeEncodeError, UnicodeDecodeError): 64 | return self.value 65 | else: 66 | return str(self.value) 67 | 68 | def __repr__(self): 69 | if self.label is None: 70 | label = self.id 71 | else: 72 | label = self.label.encode("utf-8") if six.PY2 else self.label # encode only on Python 2; bytes would garble the repr on Python 3 73 | if str(self) != str(label): 74 | return '<%s: %s (%s)>' % (type(self).__name__, 75 | str(self), 76 | label) 77 | else: 78 | return '<%s: %s>' % (type(self).__name__, 79 | str(self)) 80 | -------------------------------------------------------------------------------- /statscraper/DimensionValue.py: -------------------------------------------------------------------------------- 1 | """This file contains a class representing a value in a dataset.""" 2 | from .BaseScraperObject import BaseScraperObject 3 | 4 | 5 | class DimensionValue(BaseScraperObject): 6 | """The value for a dimension inside a Resultset.""" 7 | 8 | def __init__(self, value, dimension, label=None): 9 | """Value can be any type. dimension is a Dimension() object.""" 10 | self.value = value 11 | self._dimension = dimension 12 | self._label = label 13 | self._id = dimension.id 14 | 15 | @property 16 | def id(self): 17 | return self._id 18 | 19 | @id.setter 20 | def id(self, value): 21 | self._id = value 22 | 23 | @property 24 | def label(self): 25 | return self._label 26 | 27 | @label.setter 28 | def label(self, value): 29 | self._label = value 30 | 31 | @property 32 | def dimension(self): 33 | return self._dimension 34 | 35 | @dimension.setter 36 | def dimension(self, value): 37 | self._dimension = value 38 | 39 | def translate(self, dialect): 40 | """Translate this value to a different dialect.""" 41 | if self.dimension.datatype is None: 42 | raise Exception(f"""\ 43 | A value must belong to a dimension of a specific datatype to be translated. \ 44 | {self.dimension} does not have a datatype.""") 45 | dt = self.dimension.datatype 46 | if self.value not in dt.allowed_values: 47 | raise Exception(f"""\ 48 | {self.value} is not an allowed value for this datatype, and cannot be translated.""") 49 | 50 | translations = dt.allowed_values[self.value] 51 | translation = ",".join([x.replace(",", "\\,") 52 | for x in translations.dialects[dialect]]) 53 | return translation 54 | -------------------------------------------------------------------------------- /statscraper/ValueList.py: -------------------------------------------------------------------------------- 1 | import six 2 | from .compat import unicode 3 | from .BaseScraperList import BaseScraperList 4 | from .DimensionValue import DimensionValue 5 | from .exceptions import NoSuchItem 6 | 7 | class ValueList(BaseScraperList): 8 | """A list of dimension values. 9 | 10 | allowed_values uses this class, to allow checking membership. 
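E.g. "march" in some_dimension.allowed_values compares the string against each DimensionValue's value.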
11 | """ 12 | 13 | def __getitem__(self, key): 14 | """Make it possible to get value by value or value identity.""" 15 | if isinstance(key, six.string_types): 16 | if isinstance(key, unicode): 17 | def f(x): 18 | return (x.value == key) 19 | else: 20 | def f(x): 21 | return (x.value == unicode(key, encoding="utf-8")) 22 | elif isinstance(key, DimensionValue): 23 | def f(x): 24 | return (x is key) 25 | else: 26 | return list.__getitem__(self, key) 27 | try: 28 | val = next(iter(filter(f, self))) 29 | return val 30 | except IndexError: 31 | # No such id 32 | raise NoSuchItem("No such value") 33 | 34 | def __contains__(self, item): 35 | """ in should look for value, not id. """ 36 | if isinstance(item, six.string_types): 37 | return bool(len(list(filter(lambda x: x.value == item, self)))) 38 | else: 39 | return super(ValueList, self).__contains__(item) 40 | -------------------------------------------------------------------------------- /statscraper/__init__.py: -------------------------------------------------------------------------------- 1 | # Exceptions 2 | from .exceptions import * 3 | 4 | # Classes 5 | from .DimensionValue import DimensionValue 6 | from .BaseScraperList import BaseScraperList 7 | from .BaseScraperObject import BaseScraperObject 8 | from .ValueList import ValueList 9 | from .datatypes import Datatype 10 | from .base_scraper import (BaseScraper, Item, Collection, Dataset, Result, 11 | ResultSet, ItemList, Dimension, DimensionList) 12 | 13 | # Contants 14 | from .base_scraper import ROOT, TYPE_DATASET, TYPE_COLLECTION 15 | -------------------------------------------------------------------------------- /statscraper/base_scraper.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | u""" 4 | This file contains the base class for scrapers. The scraper can navigate 5 | though an hierarchy of collections and datasets. Collections and datasets 6 | are refered to as “items”. 7 | 8 | ┏━ Collection ━━━ Collection ━┳━ Dataset 9 | ROOT ━╋━ Collection ━┳━ Dataset ┣━ Dataset 10 | ┗━ Collection ┣━ Dataset ┗━ Dataset 11 | ┗━ Dataset 12 | 13 | ╰───────────────────────┬─────────────────────╯ 14 | items 15 | 16 | A scraper can override three methods: 17 | * _fetch_itemslist(item) yields items at the current position 18 | * _fetch_dimensions(dataset) yields dimensions available on a dataset 19 | * _fetch_data(dataset) syield rows from a dataset 20 | 21 | A number of hooks are avaiable for more advanced scrapers. 
These are called 22 | by adding the on decorator on a method: 23 | 24 | @BaseScraper.on("up") 25 | def my_method(self): 26 | # Do something when the cusor moves up one level 27 | 28 | """ 29 | import six 30 | from hashlib import md5 31 | from json import dumps 32 | import pandas as pd 33 | from collections import deque 34 | from copy import copy 35 | from .exceptions import NoSuchItem, InvalidID 36 | from .datatypes import Datatype 37 | from .BaseScraperObject import BaseScraperObject 38 | from .BaseScraperList import BaseScraperList 39 | from .DimensionValue import DimensionValue 40 | from .ValueList import ValueList 41 | 42 | if six.PY3: 43 | unicode = str 44 | 45 | try: 46 | from itertools import ifilter as filter 47 | except ImportError: 48 | pass 49 | 50 | TYPE_DATASET = "Dataset" 51 | TYPE_COLLECTION = "Collection" 52 | ROOT = "" # Special id for root position 53 | VALUE_KEY = "value" # key/column holding the value of a result or dimension 54 | """ Constants for item types and id's """ 55 | 56 | 57 | class ResultSet(list): 58 | """The result of a dataset query. 59 | 60 | This is essentially a list of Result objects. 61 | """ 62 | 63 | _pandas = None 64 | dataset = None 65 | 66 | @property 67 | def list_of_dicts(self): 68 | """Return a list of dictionaries, with the key "value" for values.""" 69 | return [dict(x) for x in self] 70 | 71 | @property 72 | def pandas(self): 73 | """Return a Pandas dataframe.""" 74 | if self._pandas is None: 75 | self._pandas = pd.DataFrame().from_records(self.list_of_dicts) 76 | return self._pandas 77 | 78 | def translate(self, dialect): 79 | """Return a copy of this ResultSet in a different dialect.""" 80 | new_resultset = copy(self) 81 | new_resultset.dialect = dialect 82 | 83 | for result in new_resultset: 84 | for dimensionvalue in result.dimensionvalues: 85 | dimensionvalue.value = dimensionvalue.translate(dialect) 86 | return new_resultset 87 | 88 | def append(self, val): 89 | """Connect any new results to the resultset. 90 | 91 | This is where all the heavy lifting is done for creating results: 92 | - We add a datatype here, so that each result can handle 93 | validation etc independently. This is so that scraper authors 94 | don't need to worry about creating and passing around datatype objects. 95 | - As the scraper author yields result objects, we append them to 96 | a resultset. 97 | - This is also where we normalize dialects. 98 | """ 99 | val.resultset = self 100 | val.dataset = self.dataset 101 | 102 | # Check result dimensions against available dimensions for this dataset 103 | if val.dataset: 104 | dataset_dimensions = self.dataset.dimensions 105 | for k, v in val.raw_dimensions.items(): 106 | if k not in dataset_dimensions: 107 | d = Dimension(k) 108 | else: 109 | d = dataset_dimensions[k] 110 | 111 | # Normalize if we have a datatype and a foreign dialect 112 | normalized_value = unicode(v) 113 | if d.dialect and d.datatype: 114 | if d.dialect in d.datatype.dialects: 115 | for av in d.allowed_values: 116 | # Not all allowed_value have all dialects 117 | if unicode(v) in av.dialects.get(d.dialect, []): 118 | normalized_value = av.value 119 | # Use first match 120 | # We do not support multiple matches 121 | # This is by design. 
122 | break 123 | 124 | # Create DimensionValue object 125 | if isinstance(v, DimensionValue): 126 | dim = v 127 | v.value = normalized_value 128 | else: 129 | if k in dataset_dimensions: 130 | dim = DimensionValue(normalized_value, d) 131 | else: 132 | dim = DimensionValue(normalized_value, Dimension()) 133 | 134 | val.dimensionvalues.append(dim) 135 | 136 | # Add last list of dimension values to the ResultSet 137 | # They will usually be the same for each result 138 | self.dimensionvalues = val.dimensionvalues 139 | 140 | super(ResultSet, self).append(val) 141 | 142 | 143 | class DimensionList(BaseScraperList): 144 | """A one dimensional list of dimensions.""" 145 | 146 | pass 147 | 148 | 149 | class Result(BaseScraperObject): 150 | u"""A “row” in a result. 151 | 152 | A result contains a numerical value, 153 | and optionally a set of dimensions with values. 154 | """ 155 | 156 | def __init__(self, value, dimensions={}): 157 | """Value is supposed, but not strictly required to be numerical.""" 158 | self.value = value 159 | self.label = VALUE_KEY 160 | self.raw_dimensions = dimensions 161 | self.dimensionvalues = DimensionList() 162 | 163 | def __getitem__(self, key): 164 | """ Make it possible to get dimensions by name. """ 165 | if isinstance(key, six.string_types): 166 | return self.dimensionvalues[key] 167 | else: 168 | return list.__getitem__(self, key) 169 | 170 | def __iter__(self): 171 | """ dict representation is like: 172 | {value: 123, dimension_1: "foo", dimension_2: "bar"} 173 | """ 174 | yield (VALUE_KEY, self.value) 175 | for dv in self.dimensionvalues: 176 | yield (dv.id, 177 | dv.value) 178 | 179 | @property 180 | def dict(self): 181 | return dict(self) 182 | 183 | @property 184 | def int(self): 185 | return int(self) 186 | 187 | @property 188 | def str(self): 189 | return str(int(self)) 190 | 191 | @property 192 | def tuple(self): 193 | """ Tuple conversion to (value, dimensions), e.g.: 194 | (123, {dimension_1: "foo", dimension_2: "bar"}) 195 | """ 196 | return (self.value, {dv.id: dv.value for dv in self.dimensionvalues}) 197 | 198 | 199 | class Dimension(BaseScraperObject): 200 | """A dimension in a dataset.""" 201 | 202 | def __init__(self, id_=None, label=None, 203 | allowed_values=None, datatype=None, 204 | dialect=None, domain=None): 205 | """A single dimension. 206 | 207 | If allowed_values are specified, they will override any 208 | allowed values for the datatype 209 | """ 210 | if id_ is None: 211 | id_ = "default" 212 | if id_ == VALUE_KEY: 213 | raise InvalidID("'%s' is not a valid Dimension id." % VALUE_KEY) 214 | self.id = id_ 215 | self._allowed_values = None 216 | self.datatype = None 217 | if label is None: 218 | self.label = id_ 219 | else: 220 | self.label = label 221 | if datatype: 222 | self.datatype = Datatype(datatype) 223 | self._allowed_values = self.datatype.allowed_values 224 | self.dialect = dialect 225 | if allowed_values: 226 | # Override allowed values from datatype, if any 227 | # 228 | # If allowed values is given as a list of values, create 229 | # value objects using an empty dimension. 
230 | self._allowed_values = ValueList() 231 | for val in allowed_values: 232 | if isinstance(val, DimensionValue): 233 | self._allowed_values.append(val) 234 | else: 235 | self._allowed_values.append(DimensionValue(val, 236 | Dimension()) 237 | ) 238 | 239 | @property 240 | def allowed_values(self): 241 | """Return a list of allowed values.""" 242 | if self._allowed_values is None: 243 | self._allowed_values = ValueList() 244 | for val in self.scraper._fetch_allowed_values(self): 245 | if isinstance(val, DimensionValue): 246 | self._allowed_values.append(val) 247 | else: 248 | self._allowed_values.append(DimensionValue(val, 249 | Dimension())) 250 | return self._allowed_values 251 | 252 | 253 | class ItemList(BaseScraperList): 254 | """A one dimensional list of items. 255 | 256 | Has some convenience getters and setters for scrapers 257 | """ 258 | 259 | @property 260 | def type(self): 261 | """Check if this is a list of Collections or Datasets.""" 262 | try: 263 | return self[0].type 264 | except IndexError: 265 | return None 266 | 267 | def empty(self): 268 | """Empty this list (delete all contents).""" 269 | del self[:] 270 | return self 271 | 272 | def append(self, val): 273 | """Connect any new items to the scraper.""" 274 | val.scraper = self.scraper 275 | val._collection_path = copy(self.collection._collection_path) 276 | val._collection_path.append(val) 277 | super(ItemList, self).append(val) 278 | 279 | 280 | class Item(BaseScraperObject): 281 | """Common base class for collections and datasets.""" 282 | 283 | # These are populated when added to an itemlist 284 | parent = None # Parent item 285 | _items = None # ItemList with children 286 | _collection_path = None # All ancestors 287 | 288 | def __init__(self, id_, label=None, blob=None): 289 | """Use blob to store any custom data.""" 290 | self.id = id_ 291 | self.blob = blob 292 | if label is None: 293 | self.label = id_ 294 | else: 295 | self.label = label 296 | self._collection_path = deque([self]) # Will be overwritten when attached to an ItemList 297 | 298 | def _move_here(self): 299 | """Move the cursor to this item.""" 300 | cu = self.scraper.current_item 301 | # Already here? 302 | if self is cu: 303 | return 304 | # A child? 305 | if cu.items and self in cu.items: 306 | self.scraper.move_to(self) 307 | return 308 | # A parent? 309 | if self is cu.parent: 310 | self.scraper.move_up() 311 | # A sibling? 312 | if self.parent and self in self.parent.items: 313 | self.scraper.move_up() 314 | self.scraper.move_to(self) 315 | return 316 | # Last resort: Move to top and all the way down again 317 | self.scraper.move_to_top() 318 | for step in self.path: 319 | self.scraper.move_to(step) 320 | 321 | @property 322 | def path(self): 323 | """All named collections above, including the current, but not root.""" 324 | steps = list(self._collection_path) 325 | steps.pop(0) 326 | return steps 327 | 328 | @property 329 | def type(self): 330 | """Check if this is a Collection or Dataset.""" 331 | try: 332 | if isinstance(self, Collection): 333 | return TYPE_COLLECTION 334 | else: 335 | return TYPE_DATASET 336 | except IndexError: 337 | return None 338 | 339 | 340 | class Collection(Item): 341 | """A collection can contain other collections or datasets. 342 | 343 | Collections are the non-leaf nodes of a scraper's item tree.
344 | 345 | Basic Usage:: 346 | 347 | >>> from statscraper import Collection 348 | >>> c = Collection() 349 | 350 | """ 351 | 352 | @property 353 | def is_root(self): 354 | """Check if root element.""" 355 | if self.id == ROOT: 356 | return True 357 | else: 358 | return None 359 | 360 | @property 361 | def items(self): 362 | """ItemList of children.""" 363 | if self.scraper.current_item is not self: 364 | self._move_here() 365 | 366 | if self._items is None: 367 | self._items = ItemList() 368 | self._items.scraper = self.scraper 369 | self._items.collection = self 370 | for i in self.scraper._fetch_itemslist(self): 371 | i.parent = self 372 | if i.type == TYPE_DATASET and i.dialect is None: 373 | i.dialect = self.scraper.dialect 374 | self._items.append(i) 375 | return self._items 376 | 377 | def __getitem__(self, key): 378 | """Provide bracket notation. 379 | 380 | collection["abc"] is shorthand for collection.items["abc"] 381 | """ 382 | if self.scraper.current_item is not self: 383 | self._move_here() 384 | try: 385 | return self.items[key] 386 | except IndexError: 387 | # No such id 388 | raise NoSuchItem("No such item in Collection") 389 | 390 | def get(self, key): 391 | """Provide alias for bracket notation.""" 392 | return self[key] 393 | 394 | 395 | class Dataset(Item): 396 | """A dataset. Can be empty.""" 397 | 398 | _data = None # We store one ResultSet for each unique query 399 | _dimensions = None 400 | dialect = None 401 | query = None 402 | 403 | def __init__(self, id_, label=None, blob=None): 404 | super(Dataset, self).__init__(id_, label, blob) 405 | self._data = {} 406 | 407 | @property 408 | def items(self): 409 | """A dataset has no children.""" 410 | return None 411 | 412 | @property 413 | def _hash(self): 414 | """Return a hash for the current query. 415 | 416 | This hash is _not_ a unique representation of the dataset! 417 | """ 418 | dump = dumps(self.query, sort_keys=True) 419 | if isinstance(dump, str): 420 | dump = dump.encode('utf-8') 421 | return md5(dump).hexdigest() 422 | 423 | def fetch_next(self, query=None, **kwargs): 424 | """Generator to yield data one row at a time. 425 | Yields a Result, not the entire ResultSet. The containing ResultSet 426 | can be accessed through `Result.resultset`, but be careful not to 427 | manipulate the ResultSet until it is populated (when this generator 428 | is empty), or you may see unexpected results. 
429 | """ 430 | if query: 431 | self.query = query 432 | 433 | hash_ = self._hash 434 | if hash_ in self._data: 435 | for result in self._data[hash_]: 436 | yield result 437 | 438 | if self.scraper.current_item is not self: 439 | self._move_here() 440 | 441 | self._data[hash_] = ResultSet() 442 | self._data[hash_].dialect = self.dialect 443 | self._data[hash_].dataset = self 444 | for result in self.scraper._fetch_data(self, 445 | query=self.query, 446 | **kwargs): 447 | self._data[hash_].append(result) 448 | yield result 449 | 450 | def fetch(self, query=None, **kwargs): 451 | """Ask scraper to return data for the current dataset.""" 452 | if query: 453 | self.query = query 454 | 455 | hash_ = self._hash 456 | if hash_ in self._data: 457 | return self._data[hash_] 458 | 459 | if self.scraper.current_item is not self: 460 | self._move_here() 461 | 462 | rs = ResultSet() 463 | rs.dialect = self.dialect 464 | rs.dataset = self 465 | for result in self.scraper._fetch_data(self, 466 | query=self.query, 467 | **kwargs): 468 | rs.append(result) 469 | self._data[hash_] = rs 470 | return self._data[hash_] 471 | 472 | @property 473 | def data(self): 474 | """Data as a property, given current query.""" 475 | return self.fetch(query=self.query) 476 | 477 | @property 478 | def dimensions(self): 479 | """Available dimensions, if defined.""" 480 | # First of all: Select this dataset 481 | if self.scraper.current_item is not self: 482 | self._move_here() 483 | 484 | if self._dimensions is None: 485 | self._dimensions = DimensionList() 486 | for d in self.scraper._fetch_dimensions(self): 487 | d.dataset = self 488 | d.scraper = self.scraper 489 | self._dimensions.append(d) 490 | return self._dimensions 491 | 492 | @property 493 | def shape(self): 494 | """Compute the shape of the dataset as (rows, cols).""" 495 | if not self.data: 496 | return (0, 0) 497 | return (len(self.data), len(self.dimensions)) 498 | 499 | 500 | class BaseScraper(Collection): 501 | """The base class for scrapers.""" 502 | 503 | # Hooks 504 | _hooks = { 505 | 'init': [], # Called when initiating the class 506 | 'up': [], # Called when trying to go up one level 507 | 'top': [], # Called when moving to top level 508 | 'select': [], # Called when trying to move to a Collection or Dataset 509 | } 510 | 511 | dialect = None 512 | 513 | @classmethod 514 | def on(cls, hook): 515 | """Hook decorator.""" 516 | def decorator(function_): 517 | cls._hooks[hook].append(function_) 518 | return function_ 519 | return decorator 520 | 521 | def __repr__(self): 522 | return u'' % self.__class__.__name__ 523 | 524 | def __init__(self, *args, **kwargs): 525 | """Initiate with a ROOT collection on top.""" 526 | self.current_item = Collection(ROOT) 527 | self.current_item.scraper = self 528 | self.root = self.current_item 529 | 530 | for f in self._hooks["init"]: 531 | f(self, *args, **kwargs) 532 | 533 | def __getitem__(self, key): 534 | """ Make scraper[a] shorthand for scraper.items[a] 535 | """ 536 | return self.items[key] 537 | 538 | @property 539 | def items(self): 540 | """ItemList of collections or datasets at the current position. 
521 |     def __repr__(self):
522 |         return u'<Scraper: %s>' % self.__class__.__name__
523 | 
524 |     def __init__(self, *args, **kwargs):
525 |         """Initiate with a ROOT collection on top."""
526 |         self.current_item = Collection(ROOT)
527 |         self.current_item.scraper = self
528 |         self.root = self.current_item
529 | 
530 |         for f in self._hooks["init"]:
531 |             f(self, *args, **kwargs)
532 | 
533 |     def __getitem__(self, key):
534 |         """Make scraper[a] shorthand for scraper.items[a].
535 |         """
536 |         return self.items[key]
537 | 
538 |     @property
539 |     def items(self):
540 |         """ItemList of collections or datasets at the current position.
541 | 
542 |         None will be returned in case of no further levels
543 |         """
544 |         return self.current_item.items
545 | 
546 |     def fetch(self, query=None, **kwargs):
547 |         """Let the current item fetch its data."""
548 |         return self.current_item.fetch(query, **kwargs)
549 | 
550 |     @property
551 |     def parent(self):
552 |         """Return the item above the current one, if any."""
553 |         return self.current_item.parent
554 | 
555 |     @property
556 |     def path(self):
557 |         """All named collections above, including the current, but not root."""
558 |         return self.current_item.path
559 | 
560 |     def move_to_top(self):
561 |         """Move to the root item."""
562 |         self.current_item = self.root
563 |         for f in self._hooks["top"]:
564 |             f(self)
565 |         return self
566 | 
567 |     def move_up(self):
568 |         """Move up one level in the hierarchy, unless already on top."""
569 |         if self.current_item.parent is not None:
570 |             self.current_item = self.current_item.parent
571 | 
572 |         for f in self._hooks["up"]:
573 |             f(self)
574 |         if self.current_item is self.root:
575 |             for f in self._hooks["top"]:
576 |                 f(self)
577 |         return self
578 | 
579 |     def move_to(self, id_):
580 |         """Select a child item by id (str), reference or index."""
581 |         if self.items:
582 |             try:
583 |                 self.current_item = self.items[id_]
584 |             except (StopIteration, IndexError, NoSuchItem):
585 |                 raise NoSuchItem
586 |         for f in self._hooks["select"]:
587 |             f(self, id_)
588 |         return self
589 | 
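    # Cursor navigation sketch (the ids are hypothetical):
    #
    #     scraper.move_to("Labour market").move_to("Employment")
    #     scraper.move_up()      # back to "Labour market"
    #     scraper.move_to_top()  # back to the ROOT collection
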
590 |     def _fetch_itemslist(self, item):
591 |         """Must be overridden by scraper authors, to yield items.
592 | 
593 |         Should yield items (Collections or Datasets) at the
594 |         current cursor position. E.g. something like this:
595 | 
596 |         list = get_items(self.current_item)
597 |         for item in list:
598 |             if item.type == "Collection":
599 |                 yield Collection(item.id)
600 |             else:
601 |                 yield Dataset(item.id)
602 |         """
603 |         raise Exception("This scraper has no method for fetching list items!")
604 | 
605 |     def _fetch_dimensions(self, dataset):
606 |         """Should be overridden by scraper authors, to yield dimensions."""
607 |         from warnings import warn
608 |         warn("This scraper has no method for fetching dimensions.",
609 |              RuntimeWarning)
610 |         return
611 |         yield
612 |         # raise Exception("This scraper has no method for fetching dimensions!")
613 | 
614 |     def _fetch_allowed_values(self, dimension):
615 |         """Can be overridden by scraper authors, to yield allowed values."""
616 |         # Fall back to a single None if no allowed values are defined
617 |         values = self.allowed_values if self.allowed_values is not None else [None]
618 |         for allowed_value in values:
619 |             yield allowed_value
620 | 
621 |     def _fetch_data(self, dataset, query=None):
622 |         """Must be overridden by scraper authors, to yield dataset rows."""
623 |         raise Exception("This scraper has no method for fetching data!")
624 | 
625 |     @property
626 |     def descendants(self):
627 |         """Recursively return every dataset below the current item."""
628 |         for i in self.current_item.items:
629 |             self.move_to(i)
630 |             if i.type == TYPE_COLLECTION:
631 |                 for c in self.descendants:
632 |                     yield c
633 |             else:
634 |                 yield i
635 |             self.move_up()
636 | 
637 |     @property
638 |     def children(self):
639 |         """Former, misleading name for descendants."""
640 |         from warnings import warn
641 |         warn("Deprecated. Use Scraper.descendants.", DeprecationWarning)
642 |         for descendant in self.descendants:
643 |             yield descendant
644 | 
645 | 
646 | # Solve any circular dependencies here:
647 | 
648 | DimensionList._CONTAINS = Dimension
649 | ValueList._CONTAINS = DimensionValue
650 | ItemList._CONTAINS = Item
651 | 
--------------------------------------------------------------------------------
/statscraper/compat.py:
--------------------------------------------------------------------------------
 1 | import six
 2 | 
 3 | if six.PY3:
 4 |     from io import BytesIO as StringIO
 5 |     from json import JSONDecodeError
 6 |     unicode = str
 7 | elif six.PY2:
 8 |     from StringIO import StringIO
 9 |     unicode = unicode
10 |     JSONDecodeError = ValueError
11 | 
--------------------------------------------------------------------------------
/statscraper/datatypes.py:
--------------------------------------------------------------------------------
 1 | """Contains code for parsing datatypes from the statscraper-datatypes repo."""
 2 | from glob import iglob
 3 | from itertools import chain
 4 | from csv import DictReader
 5 | from csv import reader as CsvReader
 6 | from .exceptions import NoSuchDatatype
 7 | from .DimensionValue import DimensionValue
 8 | from .ValueList import ValueList
 9 | import os
10 | 
11 | DIR_PATH = os.path.dirname(os.path.realpath(__file__))
12 | DATATYPES_FILE = os.path.join(DIR_PATH, "datatypes", "datatypes.csv")
13 | VALUE_DELIMITOR = ','
14 | 
15 | 
16 | class Datatype(object):
17 |     """Represent a datatype, initiated by id."""
18 | 
19 |     def __init__(self, id):
20 |         """Id is a datatype from datatypes.csv."""
21 |         self.id = id
22 |         self.allowed_values = ValueList()
23 | 
24 |         data = None
25 |         with open(DATATYPES_FILE, 'r') as csvfile:
26 |             reader = DictReader(csvfile)
27 |             for row in reader:
28 |                 if row["id"] == id:
29 |                     data = row
30 |                     break
31 |         if data is None:
32 |             raise NoSuchDatatype(id)
33 |         self.value_type = data["value_type"]
34 |         self.description = data["description"]
35 |         domain = data["allowed_values"]
36 |         if domain:
37 |             for file_ in self._get_csv_files(domain):
38 |                 with open(file_, 'r') as csvfile:
39 |                     reader = DictReader(csvfile)
40 |                     dialect_names = [x
41 |                                      for x in reader.fieldnames
42 |                                      if x.startswith("dialect:")]
43 |                     self.dialects = [d[8:] for d in dialect_names]
44 |                     for row in reader:
45 |                         value = DimensionValue(row["id"],
46 |                                                self,
47 |                                                label=row["label"])
48 |                         dialects = {x: None for x in self.dialects}
49 | 
50 |                         for d in dialect_names:
51 |                             # parse this cell as a csv row
52 |                             csvreader = CsvReader([row[d]],
53 |                                                   delimiter=VALUE_DELIMITOR,
54 |                                                   skipinitialspace=True,
55 |                                                   strict=True)
56 |                             values = next(csvreader)
57 |                             dialects[d[8:]] = values
58 |                         value.dialects = dialects
59 |                         self.allowed_values.append(value)
60 | 
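    # A hedged usage sketch; the expected values below simply reflect the
    # CSV files bundled with this package (see values/genders.csv):
    #
    #     >>> dt = Datatype("gender")
    #     >>> dt.value_type
    #     'str'
    #     >>> len(dt.allowed_values)
    #     4
    #     >>> dt.dialects
    #     ['wikidata', 'scb', 'vcard', 'foaf']
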
61 |     def _get_csv_files(self, domain):
62 |         domain = os.path.join(*domain.split("/"))
63 | 
64 |         # We are fetching both by filename and dir name
65 |         # so that regions/kenya will match anything in
66 |         # `datatypes/values/regions/kenya/*.csv`
67 |         # and/or `datatypes/values/regions/kenya.csv`
68 |         #
69 |         # There is probably an easier way to do this
70 |         # FIXME the below function fetches /foo/bar/regions/kenya as well, but we probably want ^regions/kenya
71 |         value_path_1 = os.path.join(DIR_PATH, "datatypes", "values", domain)
72 |         value_path_2 = os.path.join(DIR_PATH, "datatypes", "values")
73 |         files_1 = chain.from_iterable(iglob(os.path.join(root, '*.csv'))
74 |                                       for root, dirs, files in os.walk(value_path_1))
75 |         files_2 = chain.from_iterable(iglob(os.path.join(root, domain + '.csv'))
76 |                                       for root, dirs, files in os.walk(value_path_2))
77 |         for f in chain(files_1, files_2):
78 |             yield f
79 | 
80 |     def __str__(self):
81 |         return str(self.id)
82 | 
83 |     def __repr__(self):
84 |         return '<Datatype: %s>' % str(self)
85 | 
--------------------------------------------------------------------------------
/statscraper/datatypes/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2017 Journalism++
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/statscraper/datatypes/README.md:
--------------------------------------------------------------------------------
 1 | # Statscraper Datatypes
 2 | This repo contains data types (e.g. ”Swedish municipality”) with value types (e.g. ”string”), allowed values, lookup tables for alternative names, and definitions. It is used by the Statscraper repo, as a semi-standardized ontology for scrapers.
 3 | 
 4 | All datatypes are listed in `/datatypes.csv`. Allowed values are in the `/values` folder, organized in further subfolders by domain.
 5 | 
 6 | ## Data types (datatypes.csv)
 7 | `datatypes.csv` contains for each datatype:
 8 | - `id`: A unique id. We use human-readable ids.
 9 | - `description`: Should include a definition
10 | - `value_type`: `int`, `float`, `str`, `date` or `bool`
11 | - `allowed_values`: See below
12 | 
13 | ### Value types
14 | Each data type can have one of the following value types:
15 | 
16 | * `int` – a value that can be parsed as an integer
17 | * `float` – a value that can be parsed as a floating-point number
18 | * `str` – a value that can be parsed as a string. Empty strings are considered null.
19 | * `date` – an ISO 8601 date, e.g. `2016-07-05`, `2016-07-05T13:00:00`, `2016-W27`, or `1981-04`.
20 | * `bool` – 1 for True and 0 for False. Blank means null.
21 | 
22 | ## Allowed values
23 | 
24 | Some data types, and some metadata fields, have a predefined set of allowed values (such as “regions”). In some domains, allowed values may be organized in categories (such as “Swedish municipalities”, “Swedish counties”).
25 | 
26 | Allowed values are specified in CSV files under the `values` directory, optionally structured in subfolders by domain, e.g. `regions/sweden/municipalities.csv`. They are referenced like this: `regions/sweden/municipalities`, and `regions`. If there is a `regions/` folder, there cannot be a `regions.csv` in the same directory.
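
For example (an illustrative reading of the files in this repo): `marital_status` in `datatypes.csv` references `marital_statuses`, which resolves to `values/marital_statuses.csv`:

```csv
id,label,description,dialect:scb
unmarried,unmarried,"Currently unmarried, including those widowed or divorced","OG,SK,ÄNKL"
married,married,Currently married,G
```

A single `dialect:` cell may hold several comma-separated alternative ids, as in the SCB dialect of `unmarried` above; the dialect columns are described below.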
27 | 
28 | The allowed-values CSV files contain:
29 | 
30 | * `id`: A unique id. We use human-readable ids, e.g. "Stockholms kommun", not "0180"
31 | * `label`: An optional label
32 | * `dialect:*`: Columns prefixed with `dialect:` contain corresponding ids, e.g. names used by major statistical providers, Wikidata ids, etc.
33 | 
--------------------------------------------------------------------------------
/statscraper/datatypes/datatypes.csv:
--------------------------------------------------------------------------------
 1 | "id","description","value_type","allowed_values"
 2 | "int","This datatype holds integer values","int",
 3 | "float","This datatype holds floating point values","float",
 4 | "bool","This datatype holds boolean values","bool",
 5 | "str","This datatype holds string values. Empty strings are considered null.","str",
 6 | "date","An ISO 8601 date/time stamp","date",
 7 | "quarter","A quarter of a gregorian calendar year","str","periods/quarters"
 8 | "gender","A subject's gender, in most data by legal definition","str","genders"
 9 | "marital_status","A subject's marital status","str","marital_statuses"
10 | "year","A (proleptic) gregorian year","int",
11 | "month","A month in a (proleptic) gregorian year","str","periods/months"
12 | "academic_term","An academic term, e.g. a semester, trimester, etc","str","periods/academic-terms"
13 | "region","An administrative territorial region.","str","regions"
14 | "week","An ISO 8601 week number, week starting on Monday, first week of the year has at least four days","int",
15 | "currency","Currencies, following ISO 4217 where possible","str","currencies"
16 | "road_type","A road type and/or network","str","road_types"
17 | "road_number","A numeric road number in a road system, e.g. 4 for the international e-road “E4”","int",
18 | 
--------------------------------------------------------------------------------
/statscraper/datatypes/values/currencies.csv:
--------------------------------------------------------------------------------
 1 | "id","label","description","dialect:sv","dialect:num","dialect:wikidata"
 2 | "AED","United Arab Emirates dirham","United Arab Emirates dirham","Emiratisk dirham",784,"Q200294"
 3 | "AFN","Afghan afghani","Afghan afghani","Afghani",971,"Q199471"
 4 | "ALL","Albanian lek","Albanian lek","Lek",8,"Q130498"
 5 | "AMD","Armenian dram","Armenian dram","Dram",51,"Q130498"
 6 | "ANG","Netherlands Antillean guilder","Netherlands Antillean guilder","Antillergulden",532,"Q200337"
 7 | "AOA","Angolan kwanza","Angolan kwanza","Kwanza",973,"Q199578"
 8 | "ARS","Argentine peso","Argentine peso","Argentinsk peso",32,"Q259502"
 9 | "AUD","Australian dollar","Australian dollar","Australisk dollar",36,"Q232270"
10 | "AWG","Aruban florin","Aruban florin","Arubansk florin",533,"Q483725"
11 | "AZN","Azerbaijani manat","Azerbaijani manat","Azerbajdzjansk manat",944,"Q179620"
12 | "BAM","Bosnia and Herzegovina convertible mark","Bosnia and Herzegovina convertible mark","Konvertibilna marka",977,"Q179620"
13 | "BBD","Barbados dollar","Barbados dollar","Barbadisk dollar",52,"Q194351"
14 | "BDT","Bangladeshi taka","Bangladeshi taka","Taka",50,"Q172540"
15 | "BGN","Bulgarian lev","Bulgarian lev","Lev",975,"Q201871"
16 | "BHD","Bahraini dinar","Bahraini dinar","Bahrainsk dinar",48,"Q238007"
17 | "BIF","Burundian franc","Burundian franc","Burundisk franc",108,"Q210478"
18 | "BMD","Bermudian dollar","Bermudian dollar","Bermudisk dollar",60,"Q206319"
19 | 
"BND","Brunei dollar","Brunei dollar","Bruneisk dollar",96,"Q206319" 20 | "BOB","Boliviano","Boliviano","Boliviano",68,"Q173117" 21 | "BRL","Brazilian real","Brazilian real","Real",986,"Q194339" 22 | "BSD","Bahamian dollar","Bahamian dollar","Bahamansk dollar",44,"Q201799" 23 | "BTN","Bhutanese ngultrum","Bhutanese ngultrum","Ngultrum",64,"Q201799" 24 | "BWP","Botswana pula","Botswana pula","Pula",72,"Q186794" 25 | "BYR","Belarusian ruble","Former Belarusian ruble","Vitrysk rubel",974,"Q275112" 26 | "BZD","Belize dollar","Belize dollar","Belizisk dollar",84,"Q1104069" 27 | "CAD","Canadian dollar","Canadian dollar","Kanadensisk dollar",124,"Q4734" 28 | "CDF","Congolese franc","Congolese franc","Kongolesisk franc",976,"Q25344" 29 | "CHF","Swiss franc","Swiss franc","Schweizisk franc",756,"Q200050" 30 | "CLP","Chilean peso","Chilean peso","Chilensk peso",152,"Q1378945" 31 | "CNY","Chinese yuan","Chinese yuan","Renminbi",156,"Q244819" 32 | "COP","Colombian peso","Colombian peso","Colombiansk peso",170,"Q244819" 33 | "CRC","Costa Rican colon","Costa Rican colon","Costaricansk colón",188,"Q201505" 34 | "CUP","Cuban peso","Cuban peso","Kubansk peso",192,"Q201505" 35 | "CVE","Cape Verde escudo","Cape Verde escudo","Kapverdisk escudo",132,"Q131016" 36 | "CZK","Czech koruna","Czech koruna","Tjeckisk krona",203,"Q4594" 37 | "DJF","Djiboutian franc","Djiboutian franc","Djiboutisk franc",262,"Q25417" 38 | "DKK","Danish krone","Danish krone","Dansk krona",208,"Q242922" 39 | "DOP","Dominican peso","Dominican peso","Dominikansk peso",214,"Q199674" 40 | "DZD","Algerian dinar","Algerian dinar","Algerisk dinar",12,"Q199462" 41 | "EGP","Egyptian pound","Egyptian pound","Egyptiskt pund",818,"Q171503" 42 | "ERN","Eritrean nakfa","Eritrean nakfa","Nakfa",232,"Q206243" 43 | "ETB","Ethiopian birr","Ethiopian birr","Birr",230,"Q206243" 44 | "EUR","Euro","Euro","Euro",978,"Q4916" 45 | "FJD","Fiji dollar","Fiji dollar","Fijidollar",242,"Q330044" 46 | "FKP","Falkland Islands pound","Falkland Islands pound","Falklandspund",238,"Q25224" 47 | "GBP","Pound sterling","Pound sterling","Brittiskt pund",826,"Q4608" 48 | "GEL","Georgian lari","Georgian lari","Georgiska lari",981,"Q4608" 49 | "GHS","Ghanaian cedi","Ghanaian cedi","Ghana Cedi",936,"Q41429" 50 | "GIP","Gibraltar pound","Gibraltar pound","Gibraltarpund",292,"Q202885" 51 | "GMD","Gambian dalasi","Gambian dalasi","Dalasi",270,"Q213311" 52 | "GNF","Guinean franc","Guinean franc","Guinesisk franc",324,"Q207396" 53 | "GTQ","Guatemalan quetzal","Guatemalan quetzal","Quetzal",320,"Q213005" 54 | "GYD","Guyanese dollar","Guyanese dollar","Guyansk dollar",328,"Q31015" 55 | "HKD","Hong Kong dollar","Hong Kong dollar","Hongkongdollar",344,"Q4719" 56 | "HNL","Honduran lempira","Honduran lempira","Lempira",340,"Q26360" 57 | "HRK","Croatian kuna","Croatian kuna","Kroatisk kuna",191,"Q203955" 58 | "HTG","Haitian gourde","Haitian gourde","Gourde",332,"Q47190" 59 | "HUF","Hungarian forint","Hungarian forint","Forint",348,"Q41588" 60 | "IDR","Indonesian rupiah","Indonesian rupiah","Rupiah",360,"Q131309" 61 | "ILS","Israeli new shekel","Israeli new shekel","Shekel",376,"Q80524" 62 | "INR","Indian rupee","Indian rupee","Indisk rupie",356,"Q193094" 63 | "IQD","Iraqi dinar","Iraqi dinar","Irakisk dinar",368,"Q188608" 64 | "IRR","Iranian rial","Iranian rial","Iransk rial",364,"Q188608" 65 | "ISK","Icelandic króna","Icelandic króna","Isländsk krona",352,"Q209792" 66 | "JMD","Jamaican dollar","Jamaican dollar","Jamaicansk dollar",388,"Q203722" 67 | "JOD","Jordanian dinar","Jordanian 
dinar","Jordansk dinar",400,"Q8146" 68 | "JPY","Japanese yen","Japanese yen","Yen",392,"Q202882" 69 | "KES","Kenyan shilling","Kenyan shilling","Kenyansk shilling",404,"Q35881" 70 | "KGS","Kyrgyzstani som","Kyrgyzstani som","Kirgizistansk som",417, 71 | "KHR","Cambodian riel","Cambodian riel","Riel",116, 72 | "KMF","Comoro franc","Comoro franc","Komoransk franc",174,"Q106720" 73 | "KPW","North Korean won","North Korean won","Nordkoreansk won",408,"Q202040" 74 | "KRW","South Korean won","South Korean won","Sydkoreansk won",410,"Q193098" 75 | "KWD","Kuwaiti dinar","Kuwaiti dinar","Kuwaitisk dinar",414,"Q319885" 76 | "KYD","Cayman Islands dollar","Cayman Islands dollar","Caymansk dollar",136,"Q173751" 77 | "KZT","Kazakhstani tenge","Kazakhstani tenge","Tenge",398,"Q200055" 78 | "LAK","Lao kip","Lao kip","Kip",418,"Q201880" 79 | "LBP","Lebanese pound","Lebanese pound","Libanesiskt pund",422,"Q4596" 80 | "LKR","Sri Lankan rupee","Sri Lankan rupee","Lankesisk rupie",144,"Q242988" 81 | "LRD","Liberian dollar","Liberian dollar","Liberiansk dollar",430,"Q208039" 82 | "LSL","Lesotho loti","Lesotho loti","Loti",426,"Q190699" 83 | "LYD","Libyan dinar","Libyan dinar","Libysk dinar",434,"Q200192" 84 | "MAD","Moroccan dirham","Moroccan dirham","Marockansk dirham",504,"Q181129" 85 | "MDL","Moldovan leu","Moldovan leu","Moldavisk leu",498,"Q4584" 86 | "MGA","Malagasy ariary","Malagasy ariary","Ariary",969,"Q177875" 87 | "MKD","Macedonian denar","Macedonian denar","Makedonisk denar",807, 88 | "MMK","Myanmar kyat","Myanmar kyat","Kyat",104,"Q183435" 89 | "MNT","Mongolian tögrög","Mongolian tögrög","Tögrög",496,"Q241214" 90 | "MOP","Macanese pataca","Macanese pataca","Pataca",446,"Q207024" 91 | "MRO","Mauritanian ouguiya","Mauritanian ouguiya","Ouguiya",478,"Q212967" 92 | "MUR","Mauritian rupee","Mauritian rupee","Mauritisk rupie",480,"Q206600" 93 | "MVR","Maldivian rufiyaa","Maldivian rufiyaa","Rufiyah",462,"Q211694" 94 | "MWK","Malawian kwacha","Malawian kwacha","malawisk kwacha",454,"Q4730" 95 | "MXN","Mexican peso","Mexican peso","Mexikansk peso",484,"Q163712" 96 | "MYR","Malaysian ringgit","Malaysian ringgit","Ringgit",458,"Q200753" 97 | "MZN","Mozambican metical","Mozambican metical","Metical",943,"Q202462" 98 | "NAD","Namibian dollar","Namibian dollar","Namibisk dollar",516,"Q203567" 99 | "NGN","Nigerian naira","Nigerian naira","Naira",566,"Q207312" 100 | "NIO","Nicaraguan córdoba","Nicaraguan córdoba","Córdoba",558,"Q132643" 101 | "NOK","Norwegian krone","Norwegian krone","Norsk krona",578,"Q202895" 102 | "NPR","Nepalese rupee","Nepalese rupee","Nepalesisk rupie",524,"Q1472704" 103 | "NZD","New Zealand dollar","New Zealand dollar","Nyzeeländsk dollar",554,"Q272290" 104 | "OMR","Omani rial","Omani rial","Omansk rial",512,"Q210472" 105 | "PAB","Panamanian balboa","Panamanian balboa","Balboa",590,"Q210472" 106 | "PEN","Peruvian Sol","Peruvian Sol","Nuevo sol",604,"Q200759" 107 | "PGK","Papua New Guinean kina","Papua New Guinean kina","Kina",598,"Q17193" 108 | "PHP","Philippine peso","Philippine peso","Filippinsk peso",608,"Q188289" 109 | "PKR","Pakistani rupee","Pakistani rupee","Pakistansk rupie",586,"Q188289" 110 | "PLN","Polish złoty","Polish złoty","Złoty",985,"Q207514" 111 | "PYG","Paraguayan guaraní","Paraguayan guaraní","Guarani",600,"Q206386" 112 | "QAR","Qatari riyal","Qatari riyal","Qatarisk rial",634,"Q206386" 113 | "RON","Romanian leu","Romanian leu","Rumänsk leu",946,"Q172524" 114 | "RSD","Serbian dinar","Serbian dinar","Serbisk dinar",941,"Q41044" 115 | "RUB","Russian ruble","Russian 
ruble","Rysk rubel",643,"Q4741" 116 | "RWF","Rwandan franc","Rwandan franc","Rwandisk franc",646,"Q199857" 117 | "SAR","Saudi riyal","Saudi riyal","Saudiarabisk rial",682,"Q4597" 118 | "SBD","Solomon Islands dollar","Solomon Islands dollar","Salomondollar",90,"Q4597" 119 | "SCR","Seychelles rupee","Seychelles rupee","Seychellisk rupie",690,"Q271206" 120 | "SDG","Sudanese pound","Sudanese pound","Sudanesiskt pund",938,"Q271206" 121 | "SEK","Swedish krona/kronor","Swedish krona/kronor","Svensk krona",752,"Q190951" 122 | "SGD","Singapore dollar","Singapore dollar","Singaporiansk dollar",702,"Q374453" 123 | "SHP","Saint Helena pound","Saint Helena pound","Sankthelenskt pund",654,"Q4587" 124 | "SLL","Sierra Leonean leone","Sierra Leonean leone","Leone",694,"Q4603" 125 | "SOS","Somali shilling","Somali shilling","Somalisk shilling",706,"Q202036" 126 | "SRD","Surinamese dollar","Surinamese dollar","Surinamesisk dollar",968,"Q244366" 127 | "SSP","South Sudanese pound","South Sudanese pound","Sydsudanesiskt pund",728,"Q193712" 128 | "STD","São Tomé and Príncipe dobra","São Tomé and Príncipe dobra","Dobra",678,"Q240468" 129 | "SYP","Syrian pound","Syrian pound","Syriskt pund",760,"Q4823" 130 | "SZL","Swazi lilangeni","Swazi lilangeni","Lilangeni",748,"Q177882" 131 | "THB","Thai baht","Thai baht","Baht",764,"Q199886" 132 | "TJS","Tajikistani somoni","Tajikistani somoni","Somoni",972,"Q199886" 133 | "TMM","Turkmenistani manat","Former Turkmenistani manat","Turkmenistansk manat",795,"Q486637" 134 | "TND","Tunisian dinar","Tunisian dinar","Tunisisk dinar",788,"Q4613" 135 | "TOP","Tongan paʻanga","Tongan paʻanga (often rendered as pa’anga)","Pa'anga",776,"Q172872" 136 | "TRY","Turkish lira","Turkish lira","Turkisk lira",949,"Q242890" 137 | "TTD","Trinidad and Tobago dollar","Trinidad and Tobago dollar","Trinidaddollar",780,"Q208526" 138 | "TWD","New Taiwan dollar","New Taiwan dollar","Taiwanesisk dollar",901,"Q4589" 139 | "TZS","Tanzanian shilling","Tanzanian shilling","Tanzanisk shilling",834,"Q4589" 140 | "UAH","Ukrainian hryvnia","Ukrainian hryvnia","Hryvnja",980,"Q4598" 141 | "UGX","Ugandan shilling","Ugandan shilling","Ugandisk shilling",800,"Q4917" 142 | "USD","United States dollar","United States dollar","Amerikansk dollar",840,"Q209272" 143 | "UYU","Uruguayan peso","Uruguayan peso","Uruguayansk peso",858,"Q209272" 144 | "UZS","Uzbekistan som","Uzbekistan som","Uzbekistansk som",860,"Q203757" 145 | "VEF","Venezuelan bolívar","Venezuelan bolívar","Bolívar",937,"Q203757" 146 | "VND","Vietnamese đồng","Vietnamese đồng","Dong",704,"Q192090" 147 | "VUV","Vanuatu vatu","Vanuatu vatu","Vatu",548,"Q207523" 148 | "WST","Samoan tālā","Samoan tālā","Tala",882,"Q4588" 149 | "XAF","CFA franc BEAC","Central African CFA franc","CFA-franc (BEAC)",950,"Q847739" 150 | "XCD","East Caribbean dollar","East Caribbean dollar","Östkaribisk dollar",951,"Q26365" 151 | "XOF","CFA franc BCEAO","West African CFA franc","CFA-franc (BCEAO)",952,"Q861690" 152 | "XPF","CFP franc","Pacifique CFP franc","CFP-franc",953,"Q240512" 153 | "YER","Yemeni rial","Yemeni rial","Jemenitisk rial",886,"Q181907" 154 | "ZAR","South African rand","South African rand","Rand",710,"Q73408" 155 | "ZMK","Zambian kwacha","Zambian kwacha","Zambisk kwacha",894,"Q73408" 156 | "ZWD","Zimbabwean dollar ","Zimbabwean dollar ","Zimbabwisk dollar",716,"Q182803" 157 | -------------------------------------------------------------------------------- /statscraper/datatypes/values/genders.csv: 
--------------------------------------------------------------------------------
 1 | id,label,description,dialect:wikidata,dialect:scb,dialect:vcard,dialect:foaf
 2 | male,male,sex or gender is male,Q6581097,1,Male,male
 3 | female,female,sex or gender is female,Q6581072,2,Female,female
 4 | other,other,"sex or gender is not male or female, e.g. intersex, or in some jurisdictions, a third legal gender",Q1097630,,Other,
 5 | unknown,unknown,"there is a sex or gender (i.e. not null or none), but we do not know it",Q19798648,,Unknown,
 6 | 
--------------------------------------------------------------------------------
/statscraper/datatypes/values/marital_statuses.csv:
--------------------------------------------------------------------------------
 1 | id,label,description,dialect:scb
 2 | unmarried,unmarried,"Currently unmarried, including those widowed or divorced","OG,SK,ÄNKL"
 3 | married,married,Currently married,G
 4 | 
--------------------------------------------------------------------------------
/statscraper/datatypes/values/periods/academic-terms/semesters.csv:
--------------------------------------------------------------------------------
 1 | "id","label","description","dialect:swedish","dialect:numeric"
 2 | "semester_1","First semester","The first semester of the academic year, in a two-term system. This semester will often, but not always, start in August, September, or October in the Northern Hemisphere, and in February or March in the Southern Hemisphere.","HT",1
 3 | "semester_2","Second semester","The second semester of the academic year, in a two-term system. This semester will often, but not always, start in January or February in the Northern Hemisphere, and in July, August, September, or October in the Southern Hemisphere.","VT",2
 4 | 
--------------------------------------------------------------------------------
/statscraper/datatypes/values/periods/months.csv:
--------------------------------------------------------------------------------
 1 | id,label,description,dialect:wikidata,dialect:swedish,dialect:numeric,dialect:gnd
 2 | january,January,First month of the gregorian year. Assume local timezone.,Q108,januari,1,4334971-7
 3 | february,February,Second month of the gregorian year. Assume local timezone.,Q109,februari,2,4405048-3
 4 | march,March,Third month of the gregorian year. Assume local timezone.,Q110,mars,3,4334968-7
 5 | april,April,Fourth month of the gregorian year. Assume local timezone.,Q118,april,4,4334965-1
 6 | may,May,Fifth month of the gregorian year. Assume local timezone.,Q119,maj,5,4168620-2
 7 | june,June,Sixth month of the gregorian year. Assume local timezone.,Q120,juni,6,4405052-5
 8 | july,July,Seventh month of the gregorian year. Assume local timezone.,Q121,juli,7,4389806-3
 9 | august,August,Eighth month of the gregorian year. Assume local timezone.,Q122,augusti,8,4389837-3
10 | september,September,Ninth month of the gregorian year. Assume local timezone.,Q123,september,9,4389807-5
11 | october,October,Tenth month of the gregorian year. Assume local timezone.,Q124,oktober,10,4389801-4
12 | november,November,Eleventh month of the gregorian year. Assume local timezone.,Q125,november,11,4389811-7
13 | december,December,Twelfth month of the gregorian year. Assume local timezone.,Q126,december,12,4299252-7
14 | 
--------------------------------------------------------------------------------
/statscraper/datatypes/values/periods/quarters.csv:
--------------------------------------------------------------------------------
 1 | id,label,description,dialect:numeric
 2 | q1,First quarter,"January 1 – March 31 inclusive, in local timezone. 90 or 91 days.",1
 3 | q2,Second quarter,"April 1 – June 30 inclusive, in local timezone. 91 days.",2
 4 | q3,Third quarter,"July 1 – September 30 inclusive, in local timezone. 92 days.",3
 5 | q4,Fourth quarter,"October 1 – December 31 inclusive, in local timezone. 92 days.",4
 6 | 
--------------------------------------------------------------------------------
/statscraper/datatypes/values/regions/eu.csv:
--------------------------------------------------------------------------------
 1 | id,label,description,dialect:wikidata
 2 | eu,EU,"The EU. This refers to the union as a whole. For the set of member states, use one of the below",Q458
 3 | eu-12,EU 12,"EU members after the establishment of the union in 1993: Belgium (BE), Denmark (DK), France (FR), Germany (DE), Greece (EL), Ireland (IE), Italy (IT), Luxembourg (LU), Netherlands (NL), Portugal (PT), Spain (ES) and United Kingdom (UK)",Q17627986
 4 | eu-15,EU 15,"EU members after the 1995 enlargement: EU 12, plus Austria (AT), Finland (FI) and Sweden (SE)",Q4590816
 5 | eu-21,EU 21,"EU 15, plus Czech Republic, Hungary, Poland, Slovak Republic, before they were EU members.",
 6 | eu-25,EU 25,"EU members after the 2004 enlargement: EU 15, plus Cyprus (CY), the Czech Republic (CZ), Estonia (EE), Hungary (HU), Latvia (LV), Lithuania (LT), Malta (MT), Poland (PL), Slovakia (SK) and Slovenia (SI)",Q19933476
 7 | eu-27,EU 27,"EU members after the 2007 enlargement: EU 25, plus Bulgaria and Romania.",Q29440613
 8 | eu-28,EU 28,"EU members after the 2013 enlargement: EU 27, plus Croatia",Q16681601
 9 | 
--------------------------------------------------------------------------------
/statscraper/datatypes/values/regions/sweden/counties.csv:
--------------------------------------------------------------------------------
 1 | "id","label","description","dialect:short","dialect:skatteverket","dialect:numerical","dialect:wikidata","dialect:arbetsmiljoverket"
 2 | "Blekinge län","Blekinge län",,"Blekinge","10",10,"Q102377","Blekinge Län"
 3 | "Dalarnas län","Dalarnas län",,"Dalarna","20",20,"Q103732","Dalarnas Län"
 4 | "Gävleborgs län","Gävleborgs län",,"Gävleborg","21",21,"Q103699","Gävleborgs Län"
 5 | "Hallands län","Hallands län",,"Halland","13",13,"Q103691","Hallands Län"
 6 | "Jämtlands län","Jämtlands län",,"Jämtland","23",23,"Q103679","Jämtlands Län"
 7 | "Jönköpings län","Jönköpings län",,"Jönköping","06",6,"Q103672","Jönköpings Län"
 8 | "Kalmar län","Kalmar län",,"Kalmar","08",8,"Q103707","Kalmar Län"
 9 | "Kronobergs län","Kronobergs län",,"Kronoberg","07",7,"Q104746","Kronobergs Län"
10 | "Norrbottens län","Norrbottens län",,"Norrbotten","25",25,"Q103686","Norrbottens Län"
11 | "Örebro län","Örebro län",,"Örebro","18",18,"Q104257","Örebro Län"
12 | "Östergötlands län","Östergötlands län",,"Östergötland","05",5,"Q104940","Östergötlands Län"
13 | "Skåne län","Skåne län",,"Skåne","12",12,"Q103659","Skåne Län"
14 | "Södermanlands län","Södermanlands län",,"Södermanland","04",4,"Q106915","Södermanlands Län"
15 | "Stockholms län","Stockholms län",,"Stockholm","01",1,"Q104231","Stockholms Län"
16 | "Uppsala län","Uppsala län",,"Uppsala","03",3,"Q104926","Uppsala Län"
17 | "Värmlands län","Värmlands län",,"Värmland","17",17,"Q106789","Värmlands Län"
län",,"Värmland","17",17,"Q106789","Värmlands Län" 18 | "Västerbottens län","Västerbottens län",,"Västerbotten","24",24,"Q104877","Västerbottens Län" 19 | "Västernorrlands län","Västernorrlands län",,"Västernorrland","22",22,"Q104891","Västernorrlands Län" 20 | "Västmanlands län","Västmanlands län",,"Västmanland","19",19,"Q105075","Västmanlands Län" 21 | "Västra Götalands län","Västra Götalands län","Since 1998","Västra Götaland","14",14,"Q103093","Västra Götalands Län" 22 | "Skaraborgs län","Skaraborgs län","Since 1998 part of Västra Götalands län","Skaraborg","16",16,"Q922842", 23 | "Älvsborgs län","Älvsborgs län","Since 1998 part of Västra Götalands län","Älvsborg","15",15,"Q254990", 24 | "Göteborgs och Bohus län","Göteborgs och Bohus län","Since 1998 part of Västra Götalands län","Göteborgs och Bohus län","14",14,"Q579801", 25 | "Gotlands län","Gotlands län","For some purposes identical to `regions/sweden/municipalities/Gotlands kommun`, as the adinistrative entity Region Gotland since 2011.","Gotland","09",9,"Q103738","Gotlands Län" 26 | -------------------------------------------------------------------------------- /statscraper/datatypes/values/regions/sweden/municipalities.csv: -------------------------------------------------------------------------------- 1 | id,label,description,dialect:short,dialect:skatteverket,dialect:numerical,dialect:wikidata,dialect:brå,dialect:scb 2 | Ale kommun,Ale kommun,Ale municipality,Ale,1440,1440,Q498470,8617,1440 Ale kommun 3 | Alingsås kommun,Alingsås kommun,Alingsås municipality,Alingsås,1489,1489,Q503162,8618,1489 Alingsås kommun 4 | Älmhults kommun,Älmhults kommun,Älmhult municipality,Älmhult,0765,765,Q254799,8428,0765 Älmhults kommun 5 | Älvdalens kommun,Älvdalens kommun,Älvdalen municipality,Älvdalen,2039,2039,Q123855,8363,2039 Älvdalens kommun 6 | Alvesta kommun,Alvesta kommun,Alvesta municipality,Alvesta,0764,764,Q182007,8421,0764 Alvesta kommun 7 | Älvkarleby kommun,Älvkarleby kommun,Älvkarleby municipality,Älvkarleby,0319,319,Q59858,8561,0319 Älvkarleby kommun 8 | Älvsbyns kommun,Älvsbyns kommun,Älvsbyn municipality,Älvsbyn,2560,2560,Q255025,8438,2560 Älvsbyns kommun 9 | Åmåls kommun,Åmåls kommun,Åmål municipality,Åmål,1492,1492,Q271079,8695,1492 Åmåls kommun 10 | Aneby kommun,Aneby kommun,Aneby municipality,Aneby,0604,604,Q503167,8394,0604 Aneby kommun 11 | Ånge kommun,Ånge kommun,Ånge municipality,Ånge,2260,2260,Q271107,8602,2260 Ånge kommun 12 | Ängelholms kommun,Ängelholms kommun,Ängelholm municipality,Ängelholm,1292,1292,Q255206,8491,1292 Ängelholms kommun 13 | Arboga kommun,Arboga kommun,Arboga municipality,Arboga,1984,1984,Q431271,8605,1984 Arboga kommun 14 | Åre kommun,Åre kommun,Åre municipality,Åre,2321,2321,Q271153,8391,2321 Åre kommun 15 | Årjängs kommun,Årjängs kommun,Årjäng municipality,Årjäng,1765,1765,Q2777887,8579,1765 Årjängs kommun 16 | Arjeplogs kommun,Arjeplogs kommun,Arjeplog municipality,Arjeplog,2506,2506,Q493855,8431,2506 Arjeplogs kommun 17 | Arvidsjaurs kommun,Arvidsjaurs kommun,Arvidsjaur municipality,Arvidsjaur,2505,2505,Q499404,8430,2505 Arvidsjaurs kommun 18 | Arvika kommun,Arvika kommun,Arvika municipality,Arvika,1784,1784,Q511365,8564,1784 Arvika kommun 19 | Åsele kommun,Åsele kommun,Åsele municipality,Åsele,2463,2463,Q271274,8595,2463 Åsele kommun 20 | Askersunds kommun,Askersunds kommun,Askersund municipality,Askersund,1882,1882,Q509501,8698,1882 Askersunds kommun 21 | Åstorps kommun,Åstorps kommun,Åstorp municipality,Åstorp,1277,1277,Q201813,8490,1277 Åstorps kommun 22 | Åtvidabergs kommun,Åtvidabergs 
23 | Avesta kommun,Avesta kommun,Avesta municipality,Avesta,2084,2084,Q500071,8349,2084 Avesta kommun
24 | Bara kommun,Bara kommun,"Bara municipality, a former municipality of Sweden still appearing in some online datasets. Avoid using the short name form, as it may cause confusion.",Bara,1229,1229,Q10426242,,1229 Bara kommun
25 | Båstads kommun,Båstads kommun,Båstad municipality,Båstad,1278,1278,Q499464,8448,1278 Båstads kommun
26 | Bengtsfors kommun,Bengtsfors kommun,Bengtsfors municipality,Bengtsfors,1460,1460,Q267462,8619,1460 Bengtsfors kommun
27 | Bergs kommun,Bergs kommun,Berg municipality,Berg,2326,2326,Q845930,8385,2326 Bergs kommun
28 | Bjurholms kommun,Bjurholms kommun,Bjurholm municipality,Bjurholm,2403,2403,Q376003,8581,2403 Bjurholms kommun
29 | Bjuvs kommun,Bjuvs kommun,Bjuv municipality,Bjuv,1260,1260,Q511310,8445,1260 Bjuvs kommun
30 | Bodens kommun,Bodens kommun,Boden municipality,Boden,2582,2582,Q509476,8441,2582 Bodens kommun
31 | Bollebygds kommun,Bollebygds kommun,Bollebygd municipality,Bollebygd,1443,1443,Q503102,8620,1443 Bollebygds kommun
32 | Bollnäs kommun,Bollnäs kommun,Bollnäs municipality,Bollnäs,2183,2183,Q62469,8367,2183 Bollnäs kommun
33 | Borås kommun,Borås kommun,Borås municipality,Borås,1490,1490,Q894327,8621,1490 Borås kommun
34 | Borgholms kommun,Borgholms kommun,Borgholm municipality,Borgholm,0885,885,Q509546,8408,0885 Borgholms kommun
35 | Borlänge kommun,Borlänge kommun,Borlänge municipality,Borlänge,2081,2081,Q503209,8350,2081 Borlänge kommun
36 | Botkyrka kommun,Botkyrka kommun,Botkyrka municipality,Botkyrka,0127,127,Q113718,8495,0127 Botkyrka kommun
37 | Boxholms kommun,Boxholms kommun,Boxholm municipality,Boxholm,0560,560,Q509514,8711,0560 Boxholms kommun
38 | Bräcke kommun,Bräcke kommun,Bräcke municipality,Bräcke,2305,2305,Q504261,8386,2305 Bräcke kommun
39 | Bromölla kommun,Bromölla kommun,Bromölla municipality,Bromölla,1272,1272,Q514830,8446,1272 Bromölla kommun
40 | Burlövs kommun,Burlövs kommun,Burlöv municipality,Burlöv,1231,1231,Q186662,8447,1231 Burlövs kommun
41 | Dals-Eds kommun,Dals-Eds kommun,Dals-Ed municipality,Dals-Ed,1438,1438,Q503132,8622,1438 Dals-Eds kommun
42 | Danderyds kommun,Danderyds kommun,Danderyd municipality,Danderyd,0162,162,Q113679,8496,0162 Danderyds kommun
43 | Degerfors kommun,Degerfors kommun,Degerfors municipality,Degerfors,1862,1862,Q509469,8699,1862 Degerfors kommun
44 | Dorotea kommun,Dorotea kommun,Dorotea municipality,Dorotea,2425,2425,Q132334,8582,2425 Dorotea kommun
45 | Eda kommun,Eda kommun,Eda municipality,Eda,1730,1730,Q498447,8565,1730 Eda kommun
46 | Ekerö kommun,Ekerö kommun,Ekerö municipality,Ekerö,0125,125,Q492614,8497,0125 Ekerö kommun
47 | Eksjö kommun,Eksjö kommun,Eksjö municipality,Eksjö,0686,686,Q512035,8395,0686 Eksjö kommun
48 | Emmaboda kommun,Emmaboda kommun,Emmaboda municipality,Emmaboda,0862,862,Q509452,8409,0862 Emmaboda kommun
49 | Enköpings kommun,Enköpings kommun,Enköping municipality,Enköping,0381,381,Q509568,8555,0381 Enköpings kommun
50 | Eskilstuna kommun,Eskilstuna kommun,Eskilstuna municipality,Eskilstuna,0484,484,Q503144,8545,0484 Eskilstuna kommun
51 | Eslövs kommun,Eslövs kommun,Eslöv municipality,Eslöv,1285,1285,Q1130264,8449,1285 Eslövs kommun
52 | Essunga kommun,Essunga kommun,Essunga municipality,Essunga,1445,1445,Q503127,8623,1445 Essunga kommun
53 | Fagersta kommun,Fagersta kommun,Fagersta municipality,Fagersta,1982,1982,Q47018,8606,1982 Fagersta kommun
54 | Falkenbergs
kommun,Falkenbergs kommun,Falkenberg municipality,Falkenberg,1382,1382,Q508168,8378,1382 Falkenbergs kommun 55 | Falköpings kommun,Falköpings kommun,Falköping municipality,Falköping,1499,1499,Q503214,8624,1499 Falköpings kommun 56 | Falu kommun,Falu kommun,Falun municipality,Falun,2080,2080,Q501545,8351,2080 Falu kommun 57 | Färgelanda kommun,Färgelanda kommun,Färgelanda municipality,Färgelanda,1439,1439,Q499441,8625,1439 Färgelanda kommun 58 | Filipstads kommun,Filipstads kommun,Filipstad municipality,Filipstad,1782,1782,Q503204,8566,1782 Filipstads kommun 59 | Finspångs kommun,Finspångs kommun,Finspång municipality,Finspång,0562,562,Q503180,8712,0562 Finspångs kommun 60 | Flens kommun,Flens kommun,Flen municipality,Flen,0482,482,Q27982,8546,0482 Flens kommun 61 | Forshaga kommun,Forshaga kommun,Forshaga municipality,Forshaga,1763,1763,Q499385,8567,1763 Forshaga kommun 62 | Gagnefs kommun,Gagnefs kommun,Gagnef municipality,Gagnef,2026,2026,Q2642771,8352,2026 Gagnefs kommun 63 | Gällivare kommun,Gällivare kommun,Gällivare municipality,Gällivare,2523,2523,Q493815,8437,2523 Gällivare kommun 64 | Gävle kommun,Gävle kommun,Gävle municipality,Gävle,2180,2180,Q510010,8368,2180 Gävle kommun 65 | Gislaveds kommun,Gislaveds kommun,Gislaved municipality,Gislaved,0662,662,Q505259,8396,0662 Gislaveds kommun 66 | Gnesta kommun,Gnesta kommun,Gnesta municipality,Gnesta,0461,461,Q239332,8547,0461 Gnesta kommun 67 | Gnosjö kommun,Gnosjö kommun,Gnosjö municipality,Gnosjö,0617,617,Q509609,8397,0617 Gnosjö kommun 68 | Göteborgs kommun,Göteborgs kommun,Göteborg municipality,Göteborg,1480,1480,Q52502,8628,1480 Göteborgs kommun 69 | Götene kommun,Götene kommun,Götene municipality,Götene,1471,1471,Q511281,8660,1471 Götene kommun 70 | Region Gotland,Region Gotland,"Region Gotland, formally ”Gotland municipality”",Gotland,0980,980,Q374794,8365,0980 Region Gotland 71 | Grästorps kommun,Grästorps kommun,Grästorp municipality,Grästorp,1444,1444,Q503148,8626,1444 Grästorps kommun 72 | Grums kommun,Grums kommun,Grums municipality,Grums,1764,1764,Q503122,8568,1764 Grums kommun 73 | Gullspångs kommun,Gullspångs kommun,Gullspång municipality,Gullspång,1447,1447,Q503193,8627,1447 Gullspångs kommun 74 | Habo kommun,Habo kommun,Habo municipality,Habo,0643,643,Q503198,8398,0643 Habo kommun 75 | Håbo kommun,Håbo kommun,Håbo municipality,Håbo,0305,305,Q511253,8557,0305 Håbo kommun 76 | Hagfors kommun,Hagfors kommun,Hagfors municipality,Hagfors,1783,1783,Q511407,8569,1783 Hagfors kommun 77 | Hällefors kommun,Hällefors kommun,Hällefors municipality,Hällefors,1863,1863,Q220881,8701,1863 Hällefors kommun 78 | Hallsbergs kommun,Hallsbergs kommun,Hallsberg municipality,Hallsberg,1861,1861,Q508180,8700,1861 Hallsbergs kommun 79 | Hallstahammars kommun,Hallstahammars kommun,Hallstahammar municipality,Hallstahammar,1961,1961,Q47019,8607,1961 Hallstahammars kommun 80 | Halmstads kommun,Halmstads kommun,Halmstad municipality,Halmstad,1380,1380,Q504692,8379,1380 Halmstads kommun 81 | Hammarö kommun,Hammarö kommun,Hammarö municipality,Hammarö,1761,1761,Q499359,8570,1761 Hammarö kommun 82 | Haninge kommun,Haninge kommun,Haninge municipality,Haninge,0136,136,Q113692,8498,0136 Haninge kommun 83 | Haparanda kommun,Haparanda kommun,Haparanda municipality,Haparanda,2583,2583,Q510310,8442,2583 Haparanda kommun 84 | Härjedalens kommun,Härjedalens kommun,Härjedalen municipality,Härjedalen,2361,2361,Q513421,8387,2361 Härjedalens kommun 85 | Härnösands kommun,Härnösands kommun,Härnösand municipality,Härnösand,2280,2280,Q209634,8597,2280 Härnösands 
kommun 86 | Härryda kommun,Härryda kommun,Härryda municipality,Härryda,1401,1401,Q500125,8663,1401 Härryda kommun 87 | Hässleholms kommun,Hässleholms kommun,Hässleholm municipality,Hässleholm,1293,1293,Q508125,8451,1293 Hässleholms kommun 88 | Heby kommun,Heby kommun,Heby municipality,Heby,0331,331,Q516308,8556,0331 Heby kommun 89 | Heby kommun före 2007,Heby kommun före 2007,"Heby before 2007, when they changed county. Most of the time you would probably want to use Heby kommun even before 2007",Heby före 2007,1917,1917,Q516308,8556,1917 Heby kommun 90 | Hedemora kommun,Hedemora kommun,Hedemora municipality,Hedemora,2083,2083,Q507684,8353,2083 Hedemora kommun 91 | Helsingborgs kommun,Helsingborgs kommun,Helsingborg municipality,Helsingborg,1283,1283,Q487648,8450,1283 Helsingborgs kommun 92 | Herrljunga kommun,Herrljunga kommun,Herrljunga municipality,Herrljunga,1466,1466,Q503111,8661,1466 Herrljunga kommun 93 | Hjo kommun,Hjo kommun,Hjo municipality,Hjo,1497,1497,Q428749,8662,1497 Hjo kommun 94 | Hofors kommun,Hofors kommun,Hofors municipality,Hofors,2104,2104,Q62464,8369,2104 Hofors kommun 95 | Höganäs kommun,Höganäs kommun,Höganäs municipality,Höganäs,1284,1284,Q505013,8452,1284 Höganäs kommun 96 | Högsby kommun,Högsby kommun,Högsby municipality,Högsby,0821,821,Q510233,8411,0821 Högsby kommun 97 | Höörs kommun,Höörs kommun,Höör municipality,Höör,1267,1267,Q266401,8454,1267 Höörs kommun 98 | Hörby kommun,Hörby kommun,Hörby municipality,Hörby,1266,1266,Q504619,8453,1266 Hörby kommun 99 | Huddinge kommun,Huddinge kommun,Huddinge municipality,Huddinge,0126,126,Q492575,8499,0126 Huddinge kommun 100 | Hudiksvalls kommun,Hudiksvalls kommun,Hudiksvall municipality,Hudiksvall,2184,2184,Q29963,8370,2184 Hudiksvalls kommun 101 | Hultsfreds kommun,Hultsfreds kommun,Hultsfred municipality,Hultsfred,0860,860,Q512002,8410,0860 Hultsfreds kommun 102 | Hylte kommun,Hylte kommun,Hylte municipality,Hylte,1315,1315,Q498477,8380,1315 Hylte kommun 103 | Järfälla kommun,Järfälla kommun,Järfälla municipality,Järfälla,0123,123,Q301259,8500,0123 Järfälla kommun 104 | Jokkmokks kommun,Jokkmokks kommun,Jokkmokk municipality,Jokkmokk,2510,2510,Q512048,8432,2510 Jokkmokks kommun 105 | Jönköpings kommun,Jönköpings kommun,Jönköping municipality,Jönköping,0680,680,Q504689,8399,0680 Jönköpings kommun 106 | Kalix kommun,Kalix kommun,Kalix municipality,Kalix,2514,2514,Q117091,8434,2514 Kalix kommun 107 | Kalmar kommun,Kalmar kommun,Kalmar municipality,Kalmar,0880,880,Q508153,8412,0880 Kalmar kommun 108 | Karlsborgs kommun,Karlsborgs kommun,Karlsborg municipality,Karlsborg,1446,1446,Q499435,8664,1446 Karlsborgs kommun 109 | Karlshamns kommun,Karlshamns kommun,Karlshamn municipality,Karlshamn,1082,1082,Q510223,8343,1082 Karlshamns kommun 110 | Karlskoga kommun,Karlskoga kommun,Karlskoga municipality,Karlskoga,1883,1883,Q509634,8702,1883 Karlskoga kommun 111 | Karlskrona kommun,Karlskrona kommun,Karlskrona municipality,Karlskrona,1080,1080,Q1128384,8344,1080 Karlskrona kommun 112 | Karlstads kommun,Karlstads kommun,Karlstad municipality,Karlstad,1780,1780,Q498453,8571,1780 Karlstads kommun 113 | Katrineholms kommun,Katrineholms kommun,Katrineholm municipality,Katrineholm,0483,483,Q508140,8548,0483 Katrineholms kommun 114 | Kävlinge kommun,Kävlinge kommun,Kävlinge municipality,Kävlinge,1261,1261,Q513370,8457,1261 Kävlinge kommun 115 | Kils kommun,Kils kommun,Kil municipality,Kil,1715,1715,Q499393,8572,1715 Kils kommun 116 | Kinda kommun,Kinda kommun,Kinda municipality,Kinda,0513,513,Q515299,8713,0513 Kinda kommun 117 | Kiruna 
kommun,Kiruna kommun,Kiruna municipality,Kiruna,2584,2584,Q499474,8443,2584 Kiruna kommun 118 | Klippans kommun,Klippans kommun,Klippan municipality,Klippan,1276,1276,Q504614,8455,1276 Klippans kommun 119 | Knivsta kommun,Knivsta kommun,Knivsta municipality,Knivsta,0330,330,Q504465,8558,0330 Knivsta kommun 120 | Köpings kommun,Köpings kommun,Köping municipality,Köping,1983,1983,Q42009,8610,1983 Köpings kommun 121 | Kramfors kommun,Kramfors kommun,Kramfors municipality,Kramfors,2282,2282,Q514815,8598,2282 Kramfors kommun 122 | Kristianstads kommun,Kristianstads kommun,Kristianstad municipality,Kristianstad,1290,1290,Q498857,8456,1290 Kristianstads kommun 123 | Kristinehamns kommun,Kristinehamns kommun,Kristinehamn municipality,Kristinehamn,1781,1781,Q510364,8573,1781 Kristinehamns kommun 124 | Krokoms kommun,Krokoms kommun,Krokom municipality,Krokom,2309,2309,Q514707,8388,2309 Krokoms kommun 125 | Kumla kommun,Kumla kommun,Kumla municipality,Kumla,1881,1881,Q504988,8703,1881 Kumla kommun 126 | Kungälvs kommun,Kungälvs kommun,Kungälv municipality,Kungälv,1482,1482,Q511394,8665,1482 Kungälvs kommun 127 | Kungsbacka kommun,Kungsbacka kommun,Kungsbacka municipality,Kungsbacka,1384,1384,Q499380,8381,1384 Kungsbacka kommun 128 | Kungsörs kommun,Kungsörs kommun,Kungsör municipality,Kungsör,1960,1960,Q47169,8609,1960 Kungsörs kommun 129 | Laholms kommun,Laholms kommun,Laholm municipality,Laholm,1381,1381,Q487502,8382,1381 Laholms kommun 130 | Landskrona kommun,Landskrona kommun,Landskrona municipality,Landskrona,1282,1282,Q502298,8458,1282 Landskrona kommun 131 | Laxå kommun,Laxå kommun,Laxå municipality,Laxå,1860,1860,Q515326,8704,1860 Laxå kommun 132 | Lekebergs kommun,Lekebergs kommun,Lekeberg municipality,Lekeberg,1814,1814,Q515282,8705,1814 Lekebergs kommun 133 | Leksands kommun,Leksands kommun,Leksand municipality,Leksand,2029,2029,Q509651,8354,2029 Leksands kommun 134 | Lerums kommun,Lerums kommun,Lerum municipality,Lerum,1441,1441,Q503188,8666,1441 Lerums kommun 135 | Lessebo kommun,Lessebo kommun,Lessebo municipality,Lessebo,0761,761,Q509488,8422,0761 Lessebo kommun 136 | Lidingö kommun,Lidingö kommun,Lidingö municipality,Lidingö,0186,186,Q3120654,8501,0186 Lidingö kommun 137 | Lidköpings kommun,Lidköpings kommun,Lidköping municipality,Lidköping,1494,1494,Q515358,8667,1494 Lidköpings kommun 138 | Lilla Edets kommun,Lilla Edets kommun,Lilla Edet municipality,Lilla Edet,1462,1462,Q511241,8668,1462 Lilla Edets kommun 139 | Lindesbergs kommun,Lindesbergs kommun,Lindesberg municipality,Lindesberg,1885,1885,Q514858,8706,1885 Lindesbergs kommun 140 | Linköpings kommun,Linköpings kommun,Linköping municipality,Linköping,0580,580,Q499410,8714,0580 Linköpings kommun 141 | Ljungby kommun,Ljungby kommun,Ljungby municipality,Ljungby,0781,781,Q504235,8423,0781 Ljungby kommun 142 | Ljusdals kommun,Ljusdals kommun,Ljusdal municipality,Ljusdal,2161,2161,Q515235,8371,2161 Ljusdals kommun 143 | Ljusnarsbergs kommun,Ljusnarsbergs kommun,Ljusnarsberg municipality,Ljusnarsberg,1864,1864,Q514739,8707,1864 Ljusnarsbergs kommun 144 | Lomma kommun,Lomma kommun,Lomma municipality,Lomma,1262,1262,Q427991,8459,1262 Lomma kommun 145 | Ludvika kommun,Ludvika kommun,Ludvika municipality,Ludvika,2085,2085,Q503184,8355,2085 Ludvika kommun 146 | Luleå kommun,Luleå kommun,Luleå municipality,Luleå,2580,2580,Q177019,8439,2580 Luleå kommun 147 | Lunds kommun,Lunds kommun,Lund municipality,Lund,1281,1281,Q505018,8460,1281 Lunds kommun 148 | Lycksele kommun,Lycksele kommun,Lycksele municipality,Lycksele,2481,2481,Q948462,8583,2481 
Lycksele kommun 149 | Lysekils kommun,Lysekils kommun,Lysekil municipality,Lysekil,1484,1484,Q503173,8669,1484 Lysekils kommun 150 | Malå kommun,Malå kommun,Malå municipality,Malå,2418,2418,Q501540,8584,2418 Malå kommun 151 | Malmö kommun,Malmö kommun,Malmö municipality,Malmö,1280,1280,Q503361,8461,1280 Malmö kommun 152 | Malung-Sälens kommun,Malung-Sälens kommun,"Malung-Sälen municipality, formerly known as Malungs kommun",Malung-Sälen,2023,2023,Q504266,8356,2023 Malung-Sälens kommun 153 | Mariestads kommun,Mariestads kommun,Mariestad municipality,Mariestad,1493,1493,Q427422,8670,1493 Mariestads kommun 154 | Markaryds kommun,Markaryds kommun,Markaryd municipality,Markaryd,0767,767,Q240574,8671,0767 Markaryds kommun 155 | Marks kommun,Marks kommun,Mark municipality,Mark,1463,1463,Q500153,8424,1463 Marks kommun 156 | Melleruds kommun,Melleruds kommun,Mellerud municipality,Mellerud,1461,1461,Q501438,8672,1461 Melleruds kommun 157 | Mjölby kommun,Mjölby kommun,Mjölby municipality,Mjölby,0586,586,Q267030,8715,0586 Mjölby kommun 158 | Mölndals kommun,Mölndals kommun,Mölndal municipality,Mölndal,1481,1481,Q511270,8674,1481 Mölndals kommun 159 | Mönsterås kommun,Mönsterås kommun,Mönsterås municipality,Mönsterås,0861,861,Q515250,8413,0861 Mönsterås kommun 160 | Mora kommun,Mora kommun,Mora municipality,Mora,2062,2062,Q504239,8357,2062 Mora kommun 161 | Mörbylånga kommun,Mörbylånga kommun,Mörbylånga municipality,Mörbylånga,0840,840,Q514756,8414,0840 Mörbylånga kommun 162 | Motala kommun,Motala kommun,Motala municipality,Motala,0583,583,Q508108,8716,0583 Motala kommun 163 | Mullsjö kommun,Mullsjö kommun,Mullsjö municipality,Mullsjö,0642,642,Q505076,8400,0642 Mullsjö kommun 164 | Munkedals kommun,Munkedals kommun,Munkedal municipality,Munkedal,1430,1430,Q389040,8673,1430 Munkedals kommun 165 | Munkfors kommun,Munkfors kommun,Munkfors municipality,Munkfors,1762,1762,Q501494,8574,1762 Munkfors kommun 166 | Nacka kommun,Nacka kommun,Nacka municipality,Nacka,0182,182,Q946647,8502,0182 Nacka kommun 167 | Nässjö kommun,Nässjö kommun,Nässjö municipality,Nässjö,0682,682,Q505096,8401,0682 Nässjö kommun 168 | Nora kommun,Nora kommun,Nora municipality,Nora,1884,1884,Q285894,8708,1884 Nora kommun 169 | Norbergs kommun,Norbergs kommun,Norberg municipality,Norberg,1962,1962,Q37404,8611,1962 Norbergs kommun 170 | Nordanstigs kommun,Nordanstigs kommun,Nordanstig municipality,Nordanstig,2132,2132,Q514805,8372,2132 Nordanstigs kommun 171 | Nordmalings kommun,Nordmalings kommun,Nordmaling municipality,Nordmaling,2401,2401,Q514722,8585,2401 Nordmalings kommun 172 | Norrköpings kommun,Norrköpings kommun,Norrköping municipality,Norrköping,0581,581,Q504676,8717,0581 Norrköpings kommun 173 | Norrtälje kommun,Norrtälje kommun,Norrtälje municipality,Norrtälje,0188,188,Q214048,8503,0188 Norrtälje kommun 174 | Norsjö kommun,Norsjö kommun,Norsjö municipality,Norsjö,2417,2417,Q507644,8586,2417 Norsjö kommun 175 | Nybro kommun,Nybro kommun,Nybro municipality,Nybro,0881,881,Q515318,8415,0881 Nybro kommun 176 | Nyköpings kommun,Nyköpings kommun,Nyköping municipality,Nyköping,0480,480,Q500267,8549,0480 Nyköpings kommun 177 | Nykvarns kommun,Nykvarns kommun,Nykvarn municipality,Nykvarn,0140,140,Q499460,8504,0140 Nykvarns kommun 178 | Nynäshamns kommun,Nynäshamns kommun,Nynäshamn municipality,Nynäshamn,0192,192,Q505090,8505,0192 Nynäshamns kommun 179 | Ockelbo kommun,Ockelbo kommun,Ockelbo municipality,Ockelbo,2101,2101,Q505109,8373,2101 Ockelbo kommun 180 | Öckerö kommun,Öckerö kommun,Öckerö 
municipality,Öckerö,1407,1407,Q293928,8696,1407 Öckerö kommun 181 | Ödeshögs kommun,Ödeshögs kommun,Ödeshög municipality,Ödeshög,0509,509,Q293970,8723,0509 Ödeshögs kommun 182 | Olofströms kommun,Olofströms kommun,Olofström municipality,Olofström,1060,1060,Q504257,8345,1060 Olofströms kommun 183 | Örebro kommun,Örebro kommun,Örebro municipality,Örebro,1880,1880,Q297718,8709,1880 Örebro kommun 184 | Örkelljunga kommun,Örkelljunga kommun,Örkelljunga municipality,Örkelljunga,1257,1257,Q297936,8492,1257 Örkelljunga kommun 185 | Örnsköldsviks kommun,Örnsköldsviks kommun,Örnsköldsvik municipality,Örnsköldsvik,2284,2284,Q298003,8603,2284 Örnsköldsviks kommun 186 | Orsa kommun,Orsa kommun,Orsa municipality,Orsa,2034,2034,Q504630,8358,2034 Orsa kommun 187 | Orusts kommun,Orusts kommun,Orust municipality,Orust,1421,1421,Q338752,8675,1421 Orusts kommun 188 | Osby kommun,Osby kommun,Osby municipality,Osby,1273,1273,Q504594,8478,1273 Osby kommun 189 | Oskarshamns kommun,Oskarshamns kommun,Oskarshamn municipality,Oskarshamn,0882,882,Q505006,8416,0882 Oskarshamns kommun 190 | Österåkers kommun,Österåkers kommun,Österåker municipality,Österåker,0117,117,Q117728,8543,0117 Österåkers kommun 191 | Östersunds kommun,Östersunds kommun,Östersund municipality,Östersund,2380,2380,Q306789,8392,2380 Östersunds kommun 192 | Östhammars kommun,Östhammars kommun,Östhammar municipality,Östhammar,0382,382,Q59093,8562,0382 Östhammars kommun 193 | Östra Göinge kommun,Östra Göinge kommun,Östra Göinge municipality,Östra Göinge,1256,1256,Q307370,8493,1256 Östra Göinge kommun 194 | Ovanåkers kommun,Ovanåkers kommun,Ovanåker municipality,Ovanåker,2121,2121,Q505085,8374,2121 Ovanåkers kommun 195 | Överkalix kommun,Överkalix kommun,Överkalix municipality,Överkalix,2513,2513,Q307569,8433,2513 Överkalix kommun 196 | Övertorneå kommun,Övertorneå kommun,Övertorneå municipality,Övertorneå,2518,2518,Q307603,8435,2518 Övertorneå kommun 197 | Oxelösunds kommun,Oxelösunds kommun,Oxelösund municipality,Oxelösund,0481,481,Q505246,8550,0481 Oxelösunds kommun 198 | Pajala kommun,Pajala kommun,Pajala municipality,Pajala,2521,2521,Q186230,8436,2521 Pajala kommun 199 | Partille kommun,Partille kommun,Partille municipality,Partille,1402,1402,Q125222,8676,1402 Partille kommun 200 | Perstorps kommun,Perstorps kommun,Perstorp municipality,Perstorp,1275,1275,Q504249,8479,1275 Perstorps kommun 201 | Piteå kommun,Piteå kommun,Piteå municipality,Piteå,2581,2581,Q507656,8440,2581 Piteå kommun 202 | Ragunda kommun,Ragunda kommun,Ragunda municipality,Ragunda,2303,2303,Q515342,8389,2303 Ragunda kommun 203 | Rättviks kommun,Rättviks kommun,Rättvik municipality,Rättvik,2031,2031,Q504244,8359,2031 Rättviks kommun 204 | Robertsfors kommun,Robertsfors kommun,Robertsfors municipality,Robertsfors,2409,2409,Q507670,8587,2409 Robertsfors kommun 205 | Ronneby kommun,Ronneby kommun,Ronneby municipality,Ronneby,1081,1081,Q515373,8346,1081 Ronneby kommun 206 | Säffle kommun,Säffle kommun,Säffle municipality,Säffle,1785,1785,Q511326,8577,1785 Säffle kommun 207 | Sala kommun,Sala kommun,Sala municipality,Sala,1981,1981,Q37399,8612,1981 Sala kommun 208 | Salems kommun,Salems kommun,Salem municipality,Salem,0128,128,Q1255130,8506,0128 Salems kommun 209 | Sandvikens kommun,Sandvikens kommun,Sandviken municipality,Sandviken,2181,2181,Q149539,8375,2181 Sandvikens kommun 210 | Säters kommun,Säters kommun,Säter municipality,Säter,2082,2082,Q1345439,8361,2082 Säters kommun 211 | Sävsjö kommun,Sävsjö kommun,Sävsjö municipality,Sävsjö,0684,684,Q505239,8402,0684 Sävsjö kommun 212 | 
Sigtuna kommun,Sigtuna kommun,Sigtuna municipality,Sigtuna,0191,191,Q216915,8507,0191 Sigtuna kommun 213 | Simrishamns kommun,Simrishamns kommun,Simrishamn municipality,Simrishamn,1291,1291,Q504626,8480,1291 Simrishamns kommun 214 | Sjöbo kommun,Sjöbo kommun,Sjöbo municipality,Sjöbo,1265,1265,Q504601,8481,1265 Sjöbo kommun 215 | Skara kommun,Skara kommun,Skara municipality,Skara,1495,1495,Q499421,8677,1495 Skara kommun 216 | Skellefteå kommun,Skellefteå kommun,Skellefteå municipality,Skellefteå,2482,2482,Q430780,8588,2482 Skellefteå kommun 217 | Skinnskattebergs kommun,Skinnskattebergs kommun,Skinnskatteberg municipality,Skinnskatteberg,1904,1904,Q37462,8613,1904 Skinnskattebergs kommun 218 | Skövde kommun,Skövde kommun,Skövde municipality,Skövde,1496,1496,Q501452,8678,1496 Skövde kommun 219 | Skurups kommun,Skurups kommun,Skurup municipality,Skurup,1264,1264,Q515266,8482,1264 Skurups kommun 220 | Smedjebackens kommun,Smedjebackens kommun,Smedjebacken municipality,Smedjebacken,2061,2061,Q505046,8360,2061 Smedjebackens kommun 221 | Söderhamns kommun,Söderhamns kommun,Söderhamn municipality,Söderhamn,2182,2182,Q145835,8376,2182 Söderhamns kommun 222 | Söderköpings kommun,Söderköpings kommun,Söderköping municipality,Söderköping,0582,582,Q515680,8718,0582 Söderköpings kommun 223 | Södertälje kommun,Södertälje kommun,Södertälje municipality,Södertälje,0181,181,Q516336,8535,0181 Södertälje kommun 224 | Sollefteå kommun,Sollefteå kommun,Sollefteå municipality,Sollefteå,2283,2283,Q221990,8599,2283 Sollefteå kommun 225 | Sollentuna kommun,Sollentuna kommun,Sollentuna municipality,Sollentuna,0163,163,Q503746,8508,0163 Sollentuna kommun 226 | Solna kommun,Solna kommun,Solna municipality,Solna,0184,184,Q109010,8509,0184 Solna kommun 227 | Sölvesborgs kommun,Sölvesborgs kommun,Sölvesborg municipality,Sölvesborg,1083,1083,Q515409,8347,1083 Sölvesborgs kommun 228 | Sorsele kommun,Sorsele kommun,Sorsele municipality,Sorsele,2422,2422,Q501463,8589,2422 Sorsele kommun 229 | Sotenäs kommun,Sotenäs kommun,Sotenäs municipality,Sotenäs,1427,1427,Q501420,8679,1427 Sotenäs kommun 230 | Staffanstorps kommun,Staffanstorps kommun,Staffanstorp municipality,Staffanstorp,1230,1230,Q504609,8483,1230 Staffanstorps kommun 231 | Stenungsunds kommun,Stenungsunds kommun,Stenungsund municipality,Stenungsund,1415,1415,Q511438,8680,1415 Stenungsunds kommun 232 | Stockholms kommun,Stockholms kommun,Stockholm municipality,Stockholm,0180,180,Q506250,8510,0180 Stockholms kommun 233 | Storfors kommun,Storfors kommun,Storfors municipality,Storfors,1760,1760,Q505936,8575,1760 Storfors kommun 234 | Storumans kommun,Storumans kommun,Storuman municipality,Storuman,2421,2421,Q499415,8590,2421 Storumans kommun 235 | Strängnäs kommun,Strängnäs kommun,Strängnäs municipality,Strängnäs,0486,486,Q501532,8551,0486 Strängnäs kommun 236 | Strömstads kommun,Strömstads kommun,Strömstad municipality,Strömstad,1486,1486,Q501424,8681,1486 Strömstads kommun 237 | Strömsunds kommun,Strömsunds kommun,Strömsund municipality,Strömsund,2313,2313,Q514770,8390,2313 Strömsunds kommun 238 | Sundbybergs kommun,Sundbybergs kommun,Sundbyberg municipality,Sundbyberg,0183,183,Q972564,8534,0183 Sundbybergs kommun 239 | Sundsvalls kommun,Sundsvalls kommun,Sundsvall municipality,Sundsvall,2281,2281,Q504994,8600,2281 Sundsvalls kommun 240 | Sunne kommun,Sunne kommun,Sunne municipality,Sunne,1766,1766,Q501205,8576,1766 Sunne kommun 241 | Surahammars kommun,Surahammars kommun,Surahammar municipality,Surahammar,1907,1907,Q34078,8614,1907 Surahammars kommun 242 | Svalövs 
kommun,Svalövs kommun,Svalöv municipality,Svalöv,1214,1214,Q504227,8484,1214 Svalövs kommun 243 | Svedala kommun,Svedala kommun,Svedala municipality,Svedala,1263,1263,Q515706,8485,1263 Svedala kommun 244 | Svenljunga kommun,Svenljunga kommun,Svenljunga municipality,Svenljunga,1465,1465,Q501487,8682,1465 Svenljunga kommun 245 | Täby kommun,Täby kommun,Täby municipality,Täby,0160,160,Q493066,8537,0160 Täby kommun 246 | Tanums kommun,Tanums kommun,Tanum municipality,Tanum,1435,1435,Q511228,8683,1435 Tanums kommun 247 | Tibro kommun,Tibro kommun,Tibro municipality,Tibro,1472,1472,Q501432,8684,1472 Tibro kommun 248 | Tidaholms kommun,Tidaholms kommun,Tidaholm municipality,Tidaholm,1498,1498,Q501459,8685,1498 Tidaholms kommun 249 | Tierps kommun,Tierps kommun,Tierp municipality,Tierp,0360,360,Q510198,8559,0360 Tierps kommun 250 | Timrå kommun,Timrå kommun,Timrå municipality,Timrå,2262,2262,Q504983,8601,2262 Timrå kommun 251 | Tingsryds kommun,Tingsryds kommun,Tingsryd municipality,Tingsryd,0763,763,Q515491,8425,0763 Tingsryds kommun 252 | Tjörns kommun,Tjörns kommun,Tjörn municipality,Tjörn,1419,1419,Q501448,8686,1419 Tjörns kommun 253 | Tomelilla kommun,Tomelilla kommun,Tomelilla municipality,Tomelilla,1270,1270,Q515519,8486,1270 Tomelilla kommun 254 | Töreboda kommun,Töreboda kommun,Töreboda municipality,Töreboda,1473,1473,Q501470,8689,1473 Töreboda kommun 255 | Torsås kommun,Torsås kommun,Torsås municipality,Torsås,0834,834,Q515551,8417,0834 Torsås kommun 256 | Torsby kommun,Torsby kommun,Torsby municipality,Torsby,1737,1737,Q510135,8578,1737 Torsby kommun 257 | Tranås kommun,Tranås kommun,Tranås municipality,Tranås,0687,687,Q505071,8403,0687 Tranås kommun 258 | Tranemo kommun,Tranemo kommun,Tranemo municipality,Tranemo,1452,1452,Q501479,8687,1452 Tranemo kommun 259 | Trelleborgs kommun,Trelleborgs kommun,Trelleborg municipality,Trelleborg,1287,1287,Q504219,8487,1287 Trelleborgs kommun 260 | Trollhättans kommun,Trollhättans kommun,Trollhättan municipality,Trollhättan,1488,1488,Q28532,8688,1488 Trollhättans kommun 261 | Trosa kommun,Trosa kommun,Trosa municipality,Trosa,0488,488,Q505064,8552,0488 Trosa kommun 262 | Tyresö kommun,Tyresö kommun,Tyresö municipality,Tyresö,0138,138,Q113730,8536,0138 Tyresö kommun 263 | Uddevalla kommun,Uddevalla kommun,Uddevalla municipality,Uddevalla,1485,1485,Q501442,8690,1485 Uddevalla kommun 264 | Ulricehamns kommun,Ulricehamns kommun,Ulricehamn municipality,Ulricehamn,1491,1491,Q382808,8691,1491 Ulricehamns kommun 265 | Umeå kommun,Umeå kommun,Umeå municipality,Umeå,2480,2480,Q507709,8591,2480 Umeå kommun 266 | Upplands Väsby kommun,Upplands Väsby kommun,Upplands Väsby municipality,Upplands Väsby,0114,114,Q499425,8539,0114 Upplands Väsby kommun 267 | Upplands-Bro kommun,Upplands-Bro kommun,Upplands-Bro municipality,Upplands-Bro,0139,139,Q113673,8538,0139 Upplands-Bro kommun 268 | Uppsala kommun,Uppsala kommun,Uppsala municipality,Uppsala,0380,380,Q59091,8560,0380 Uppsala kommun 269 | Uppsala kommun före 2003,Uppsala kommun före 2003,Uppsala municipality before Knivsta broke away in 2003. 
Most of the time you would probably want to use Uppsala kommun even before 2003,Uppsala,0380,380,Q59091,8560,0380 Uppsala kommun före 2003 270 | Uppvidinge kommun,Uppvidinge kommun,Uppvidinge municipality,Uppvidinge,0760,760,Q515505,8426,0760 Uppvidinge kommun 271 | Vadstena kommun,Vadstena kommun,Vadstena municipality,Vadstena,0584,584,Q515969,8719,0584 Vadstena kommun 272 | Vaggeryds kommun,Vaggeryds kommun,Vaggeryd municipality,Vaggeryd,0665,665,Q605329,8404,0665 Vaggeryds kommun 273 | Valdemarsviks kommun,Valdemarsviks kommun,Valdemarsvik municipality,Valdemarsvik,0563,563,Q509997,8720,0563 Valdemarsviks kommun 274 | Vallentuna kommun,Vallentuna kommun,Vallentuna municipality,Vallentuna,0115,115,Q501526,8540,0115 Vallentuna kommun 275 | Vänersborgs kommun,Vänersborgs kommun,Vänersborg municipality,Vänersborg,1487,1487,Q511426,8693,1487 Vänersborgs kommun 276 | Vännäs kommun,Vännäs kommun,Vännäs municipality,Vännäs,2460,2460,Q500210,8594,2460 Vännäs kommun 277 | Vansbro kommun,Vansbro kommun,Vansbro municipality,Vansbro,2021,2021,Q501551,8362,2021 Vansbro kommun 278 | Vara kommun,Vara kommun,Vara municipality,Vara,1470,1470,Q501428,8692,1470 Vara kommun 279 | Varbergs kommun,Varbergs kommun,Varberg municipality,Varberg,1383,1383,Q179180,8383,1383 Varbergs kommun 280 | Vårgårda kommun,Vårgårda kommun,Vårgårda municipality,Vårgårda,1442,1442,Q511297,8694,1442 Vårgårda kommun 281 | Värmdö kommun,Värmdö kommun,Värmdö municipality,Värmdö,0120,120,Q493841,8542,0120 Värmdö kommun 282 | Värnamo kommun,Värnamo kommun,Värnamo municipality,Värnamo,0683,683,Q280562,8406,0683 Värnamo kommun 283 | Västerås kommun,Västerås kommun,Västerås municipality,Västerås,1980,1980,Q34550,8615,1980 Västerås kommun 284 | Västerviks kommun,Västerviks kommun,Västervik municipality,Västervik,0883,883,Q515477,8419,0883 Västerviks kommun 285 | Vaxholms kommun,Vaxholms kommun,Vaxholm municipality,Vaxholm,0187,187,Q500090,8541,0187 Vaxholms kommun 286 | Växjö kommun,Växjö kommun,Växjö municipality,Växjö,0780,780,Q500217,8427,0780 Växjö kommun 287 | Vellinge kommun,Vellinge kommun,Vellinge municipality,Vellinge,1233,1233,Q511338,8488,1233 Vellinge kommun 288 | Vetlanda kommun,Vetlanda kommun,Vetlanda municipality,Vetlanda,0685,685,Q505052,8405,0685 Vetlanda kommun 289 | Vilhelmina kommun,Vilhelmina kommun,Vilhelmina municipality,Vilhelmina,2462,2462,Q515861,8592,2462 Vilhelmina kommun 290 | Vimmerby kommun,Vimmerby kommun,Vimmerby municipality,Vimmerby,0884,884,Q505057,8418,0884 Vimmerby kommun 291 | Vindelns kommun,Vindelns kommun,Vindeln municipality,Vindeln,2404,2404,Q504505,8593,2404 Vindelns kommun 292 | Vingåkers kommun,Vingåkers kommun,Vingåker municipality,Vingåker,0428,428,Q249378,8553,0428 Vingåkers kommun 293 | Ydre kommun,Ydre kommun,Ydre municipality,Ydre,0512,512,Q515699,8721,0512 Ydre kommun 294 | Ystads kommun,Ystads kommun,Ystad municipality,Ystad,1286,1286,Q505102,8489,1286 Ystads kommun 295 | -------------------------------------------------------------------------------- /statscraper/datatypes/values/road_types.csv: -------------------------------------------------------------------------------- 1 | "id","label","description","dialect:wikidata","dialect:sv" 2 | "e-road","E-road","A road in the international E-road network","Q106123","e" 3 | "national_road","National road","A primary road class in many countries. 
Often crossing a large part of the country ","Q1716124","rv"
4 | "county_road","County road","A secondary road class in some countries.",,"lv"
5 | "street","Street","A public thoroughfare in a built environment, not part of a national road system","Q79007",
6 | "road","Other road","A generic road type","Q34442",
-------------------------------------------------------------------------------- /statscraper/exceptions.py: --------------------------------------------------------------------------------
1 | class InvalidID(Exception):
2 |     """This string is not allowed as an id at this point.
3 |     Note: Inherits from Exception instead of StandardError
4 |     for Python3.x compatibility reasons."""
5 |
6 |     pass
7 |
8 |
9 | class NoSuchItem(IndexError):
10 |     """No such Collection or Dataset."""
11 |
12 |     pass
13 |
14 |
15 | class DatasetNotInView(IndexError):
16 |     """Tried to operate on a dataset that is not visible.
17 |
18 |     This can be raised by a scraper if the cursor needs to
19 |     move before inspecting an item.
20 |     """
21 |
22 |     pass
23 |
24 |
25 | class InvalidData(Exception):
26 |     """The scraper encountered some invalid data."""
27 |
28 |     pass
29 |
30 |
31 | class NoSuchDatatype(Exception):
32 |     """No datatype with that id."""
33 |
34 |     pass
35 |
-------------------------------------------------------------------------------- /statscraper/scrapers/CranesScraper.py: --------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | """ A scraper to fetch daily crane sightings at Hornborgasjön
3 | from http://web05.lansstyrelsen.se/transtat_O/transtat.asp
4 | This is intended to be a minimal example of a scraper
5 | using Beautiful Soup.
6 | """
7 | import requests
8 | from bs4 import BeautifulSoup
9 | from statscraper import BaseScraper, Dataset, Dimension, Result
10 |
11 |
12 | class Cranes(BaseScraper):
13 |
14 |     def _fetch_itemslist(self, item):
15 |         """ There is only one dataset. """
16 |         yield Dataset("Number of cranes")
17 |
18 |     def _fetch_dimensions(self, dataset):
19 |         """ Declaring available dimensions like this is not mandatory,
20 |         but nice, especially if they differ from dataset to dataset.
21 |
22 |         If you are using a built-in datatype, you can specify the dialect
23 |         you are expecting, to have values normalized. This scraper will
24 |         look for Swedish month names (e.g. 'Januari'), but return them
25 |         according to the Statscraper standard ('january').
26 |         """
27 |         yield Dimension(u"date", label="Day of the month")
28 |         yield Dimension(u"month", datatype="month", dialect="swedish")
29 |         yield Dimension(u"year", datatype="year")
30 |
31 |     def _fetch_data(self, dataset, query=None):
32 |         html = requests.get("http://web05.lansstyrelsen.se/transtat_O/transtat.asp").text
33 |         soup = BeautifulSoup(html, 'html.parser')
34 |         table = soup.find("table", "line").find_all("table")[2].findNext("table")
35 |         rows = table.find_all("tr")
36 |         column_headers = rows.pop(0).find_all("td", recursive=False)
37 |         years = [x.text for x in column_headers[2:]]
38 |         for row in rows:
39 |             cells = row.find_all("td")
40 |             date = cells.pop(0).text
41 |             month = cells.pop(0).text
42 |             i = 0
43 |             for value in cells:
44 |                 # Each column from here is a year.
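                # (Illustrative note, not part of the original file: for the
                # cell under the "2017" header on the row for 1 March, the
                # code below would yield roughly
                #     Result("12", {"date": "1", "month": "Mars", "year": "2017"})
                # where the count "12" is a hypothetical value; the month
                # dimension's swedish dialect then normalizes "Mars" to "march".)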
45 |                 if value.text:
46 |                     yield Result(value.text, {
47 |                         "date": date,
48 |                         "month": month,
49 |                         "year": years[i],
50 |                     })
51 |                 i += 1
52 |
-------------------------------------------------------------------------------- /statscraper/scrapers/PXWebScraper.py: --------------------------------------------------------------------------------
1 | """A wrapper around the PX-Web API.
2 |
3 | As implementations and versions vary, this is best used as a base class,
4 | for more specific scrapers to extend.
5 |
6 | If used directly, an API endpoint must be set:
7 |     scraper = PXWeb(base_url="http://api.example.com/")
8 |     # ...or:
9 |     scraper = PXWeb()
10 |     scraper.base_url = "http://api.example.com/"
11 | """
12 |
13 | import requests
14 | from statscraper import (BaseScraper, Collection, Result,
15 |                          Dataset, Dimension, InvalidData)
16 | from statscraper.compat import JSONDecodeError
17 |
18 |
19 | class PXWeb(BaseScraper):
20 |     """Scraper."""
21 |
22 |     base_url = None  # API endpoint
23 |
24 |     @BaseScraper.on("init")
25 |     def _get_args(self, *args, **kwargs):
26 |         """Store `base_url`, if given on init.
27 |
28 |         This is convenient when the PXWeb scraper is used directly by an end user.
29 |         """
30 |         if "base_url" in kwargs and kwargs["base_url"]:
31 |             self.base_url = kwargs["base_url"]
32 |
33 |     def _api_path(self, item):
34 |         """Get the API path for the current cursor position."""
35 |         if self.base_url is None:
36 |             raise NotImplementedError("base_url not set")
37 |         path = "/".join([x.blob["id"] for x in item.path])
38 |         return "/".join([self.base_url, path])
39 |
40 |     def _fetch_itemslist(self, item):
41 |         data = requests.get(self._api_path(item)).json()
42 |
43 |         for d in data:
44 |             if d["type"] == "l":
45 |                 yield Collection(d["id"], label=d["text"], blob=d)
46 |             else:
47 |                 yield Dataset(d["id"], label=d["text"], blob=d)
48 |
49 |     def _fetch_dimensions(self, dataset):
50 |         data = requests.get(self._api_path(dataset)).json()
51 |         try:
52 |             for d in data["variables"]:
53 |                 yield Dimension(d["code"],
54 |                                 label=d["text"],
55 |                                 allowed_values=d["values"])
56 |
57 |         except KeyError:
58 |             yield None
59 |
60 |     def _fetch_data(self, dataset, query):
61 |         if query is None:
62 |             query = {}
63 |         body = {
64 |             'query': [{
65 |                 'code': key,
66 |                 'selection': {
67 |                     'filter': filtertype,
68 |                     # value can be a list or a single value
69 |                     'values': value if isinstance(value, list) else [value]
70 |                 }
71 |             } for key, (filtertype, value) in query.items()],
72 |             'response': {
73 |                 'format': "json"
74 |             }
75 |         }
76 |         try:
77 |             raw = requests.post(self._api_path(dataset), json=body)
78 |             if raw.headers["content-type"] == "text/html":
79 |                 # This is an error message
80 |                 raise InvalidData(f"""Error message from PX Web:
81 |
82 | {raw.content}
83 |
84 | Check your query for spelling errors, or try reducing the size.
85 | """)
86 |             data = raw.json()
87 |         except JSONDecodeError:
88 |             raise InvalidData("""No valid response from PX Web.
89 | Check your query for spelling errors, or try reducing the size.
90 | This error is frequently due to a too large result being requested.""")
91 |
92 |         # All available dimensions are not always returned.
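        # (Hedged usage sketch, not part of the original file: each query
        # entry is consumed above as a (filtertype, values) tuple, so a
        # direct call could look like
        #     scraper.fetch({"Region": ("item", ["0180", "0114"])})
        # where "item" and both region codes are hypothetical examples.)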
93 | # What is returned depends on the query 94 | raw_return_dimension = data["columns"] 95 | # Filter out dimensions only 96 | raw_return_dimension = [x for x in raw_return_dimension if x["type"] != "c"] 97 | 98 | for row in data[u"data"]: 99 | for value in row[u"values"]: 100 | dimensions = {} 101 | # 'key' contains one value for each dimension, 102 | # always preserving order. 103 | for d, v in zip(raw_return_dimension, row[u"key"]): 104 | dimensions[d["code"]] = v 105 | 106 | yield Result(value, dimensions=dimensions) 107 | -------------------------------------------------------------------------------- /statscraper/scrapers/SCBScraper.py: -------------------------------------------------------------------------------- 1 | """A wrapper around the SCB API.""" 2 | from .PXWebScraper import PXWeb, Dimension 3 | 4 | 5 | class SCB(PXWeb): 6 | """The SCB API uses PXWeb. We just hardcode the url.""" 7 | 8 | base_url = 'https://api.scb.se/OV0104/v1/doris/sv/ssd' 9 | COUNTIES = [ 10 | "01", "03", "04", "05", "06", "07", "08", "09", "10", "12", "13", 11 | "14", "17", "18", "19", "20", "21", "22", "23", "24", "25" 12 | ] 13 | MUNICIPALITIES = [ 14 | "0114", "0115", "0117", "0120", "0123", "0125", "0126", "0127", "0128", 15 | "0136", "0138", "0139", "0140", "0160", "0162", "0163", "0180", "0181", 16 | "0182", "0183", "0184", "0186", "0187", "0188", "0191", "0192", "0305", 17 | "0319", "0330", "0331", "0360", "0380", "0381", "0382", "0428", "0461", 18 | "0480", "0481", "0482", "0483", "0484", "0486", "0488", "0509", "0512", 19 | "0513", "0560", "0561", "0562", "0563", "0580", "0581", "0582", "0583", 20 | "0584", "0586", "0604", "0617", "0642", "0643", "0662", "0665", "0680", 21 | "0682", "0683", "0684", "0685", "0686", "0687", "0760", "0761", "0763", 22 | "0764", "0765", "0767", "0780", "0781", "0821", "0834", "0840", "0860", 23 | "0861", "0862", "0880", "0881", "0882", "0883", "0884", "0885", "0980", 24 | "1060", "1080", "1081", "1082", "1083", "1214", "1230", "1231", "1233", 25 | "1256", "1257", "1260", "1261", "1262", "1263", "1264", "1265", "1266", 26 | "1267", "1270", "1272", "1273", "1275", "1276", "1277", "1278", "1280", 27 | "1281", "1282", "1283", "1284", "1285", "1286", "1287", "1290", "1291", 28 | "1292", "1293", "1315", "1380", "1381", "1382", "1383", "1384", "1401", 29 | "1402", "1407", "1415", "1419", "1421", "1427", "1430", "1435", "1438", 30 | "1439", "1440", "1441", "1442", "1443", "1444", "1445", "1446", "1447", 31 | "1452", "1460", "1461", "1462", "1463", "1465", "1466", "1470", "1471", 32 | "1472", "1473", "1480", "1481", "1482", "1484", "1485", "1486", "1487", 33 | "1488", "1489", "1490", "1491", "1492", "1493", "1494", "1495", "1496", 34 | "1497", "1498", "1499", "1715", "1730", "1737", "1760", "1761", "1762", 35 | "1763", "1764", "1765", "1766", "1780", "1781", "1782", "1783", "1784", 36 | "1785", "1814", "1860", "1861", "1862", "1863", "1864", "1880", "1881", 37 | "1882", "1883", "1884", "1885", "1904", "1907", "1960", "1961", "1962", 38 | "1980", "1981", "1982", "1983", "1984", "2021", "2023", "2026", "2029", 39 | "2031", "2034", "2039", "2061", "2062", "2080", "2081", "2082", "2083", 40 | "2084", "2085", "2101", "2104", "2121", "2132", "2161", "2180", "2181", 41 | "2182", "2183", "2184", "2260", "2262", "2280", "2281", "2282", "2283", 42 | "2284", "2303", "2305", "2309", "2313", "2321", "2326", "2361", "2380", 43 | "2401", "2403", "2404", "2409", "2417", "2418", "2421", "2422", "2425", 44 | "2460", "2462", "2463", "2480", "2481", "2482", "2505", "2506", "2510", 45 | 
"2513", "2514", "2518", "2521", "2523", "2560", "2580", "2581", "2582", 46 | "2583", "2584" 47 | ] 48 | 49 | def _fetch_dimensions(self, dataset): 50 | """Yield all dimensions. 51 | 52 | We override this method just to set the correct datatype 53 | and dialect for regions. 54 | """ 55 | for dimension in super()._fetch_dimensions(dataset): 56 | if dimension.id == "Region": 57 | yield Dimension(dimension.id, 58 | datatype="region", 59 | dialect="skatteverket", 60 | label=dimension.label) 61 | else: 62 | yield dimension 63 | 64 | def _fetch_data(self, dataset, query={}, by=None): 65 | """Allow adding municipalities or counties to a query.""" 66 | if by == "municipality": 67 | query["Region"] = ("vs:RegionKommun07EjAggr", self.MUNICIPALITIES) 68 | elif by == "county": 69 | query["Region"] = ("vs:RegionLän07EjAggr", self.COUNTIES) 70 | return super()._fetch_data(dataset, query) 71 | -------------------------------------------------------------------------------- /statscraper/scrapers/SMHIScraper.py: -------------------------------------------------------------------------------- 1 | try: 2 | import StringIO 3 | except ImportError: 4 | import io as StringIO 5 | 6 | import requests 7 | import csv 8 | from datetime import datetime 9 | from bs4 import BeautifulSoup 10 | 11 | from statscraper import BaseScraper, Collection, Dimension, Dataset, Result, DimensionValue 12 | 13 | VERSION = "1.0" 14 | # LEVELS = ["api","parameter"] 15 | PERIODS = [ 16 | "corrected-archive", 17 | "latest-hour", 18 | "latest-day", 19 | "latest-months", 20 | ] 21 | 22 | 23 | class SMHI(BaseScraper): 24 | base_url = "http://opendata.smhi.se/apidocs/" 25 | 26 | def _fetch_itemslist(self, current_item): 27 | """ Get a all available apis 28 | """ 29 | if current_item.is_root: 30 | html = requests.get(self.base_url).text 31 | soup = BeautifulSoup(html, 'html.parser') 32 | for item_html in soup.select(".row .col-md-6"): 33 | try: 34 | label = item_html.select_one("h2").text 35 | except Exception: 36 | continue 37 | yield API(label, blob=item_html) 38 | else: 39 | # parameter = current_item.parent 40 | # data = requests.get(parameter.url) 41 | for resource in current_item.json["resource"]: 42 | label = u"{}, {}".format(resource["title"], resource["summary"]) 43 | yield SMHIDataset(label, blob=resource) 44 | 45 | def _fetch_dimensions(self, parameter): 46 | yield StationDimension("station") 47 | # Hack: This redundant of the station dimension, but 48 | # necessary to be able to include both station name 49 | # (=readabilty) and key in resultset. 50 | # It would be better if the ResultSet object could 51 | # handle both label and key print. 
52 |         yield Dimension("station_key")
53 |         yield Dimension("period", allowed_values=PERIODS)
54 |         yield Dimension("parameter")
55 |
56 |         example_data = parameter._get_example_csv()
57 |         for dim in example_data.columns:
58 |             yield Dimension(dim)
59 |
60 |     def _fetch_allowed_values(self, dimension):
61 |         if dimension.id == "station":
62 |             for station in dimension.dataset.json["station"]:
63 |                 yield Station(
64 |                     station["key"],
65 |                     dimension,
66 |                     label=station["name"],
67 |                     blob=station
68 |                 )
69 |         else:
70 |             yield None
71 |
72 |     def _fetch_data(self, dataset, query={}, include_inactive_stations=False):
73 |         """ Yield dataset rows.
74 |         """
75 |         parameter = dataset
76 |         station_dim = dataset.dimensions["station"]
77 |         all_stations = station_dim.allowed_values
78 |         # Step 1: Prepare query
79 |         if "station" not in query:
80 |             if include_inactive_stations:
81 |                 # Get all stations
82 |                 query["station"] = list(all_stations)
83 |             else:
84 |                 # Get only active stations
85 |                 query["station"] = list(station_dim.active_stations())
86 |         else:
87 |             if not isinstance(query["station"], list):
88 |                 query["station"] = [query["station"]]
89 |             # Make sure that the queried stations actually exist
90 |             query["station"] = [all_stations.get_by_label(x) for x in query["station"]]
91 |
92 |         if "period" not in query:
93 |             # TODO: I'd prefer to do dataset.get("period").allowed_values here
94 |             query["period"] = PERIODS
95 |
96 |         elif not isinstance(query["period"], list):
97 |             query["period"] = [query["period"]]
98 |
99 |         for period in query["period"]:
100 |             if period not in PERIODS:
101 |                 msg = u"{} is not an allowed period".format(period)
102 |                 raise Exception(msg)
103 |
104 |         # Step 2: Get data
105 |         for station in query["station"]:
106 |             for period in query["period"]:
107 |                 url = dataset.url.replace(
108 |                     ".json",
109 |                     f"/station/{station.key}/period/{period}/data.csv"
110 |                 )
111 |                 r = requests.get(url)
112 |
113 |                 if r.status_code == 200:
114 |                     raw_data = DataCsv().from_string(r.content).to_dictlist()
115 |
116 |                     # TODO: This is a very hard-coded parse function.
117 |                     # Expects fixed start row and number of cols.
118 |                     for row in raw_data:
119 |                         value_col = parameter.id.split(",")[0]
120 |                         value = float(row[value_col])
121 |
122 |                         row["parameter"] = parameter.id
123 |                         row["station"] = station.label
124 |                         row["station_key"] = station.key
125 |                         row["period"] = period
126 |
127 |                         row.pop(value_col, None)
128 |
129 |                         datapoint = Result(value, row)
130 |
131 |                         yield datapoint
132 |
133 |                 elif r.status_code == 404:
134 |                     print("Warning: no data at {}".format(url))
135 |                 else:
136 |                     raise Exception("Connection error for {}".format(url))
137 |
138 |
139 | class API(Collection):
140 |     """
141 |     """
142 |     level = "api"
143 |
144 |     @property
145 |     def key(self):
146 |         return self.blob.select_one("a").get("href").replace("/index.html", "")
147 |
148 |     @property
149 |     def url(self):
150 |         return "http://opendata-download-{}.smhi.se/api/version/{}.json"\
151 |             .format(self.key, VERSION)
152 |
153 |     @property
154 |     def json(self):
155 |         return self._get_json_blob()
156 |
157 |     def _get_json_blob(self):
158 |         # Update blob
159 |         error_msg = "Scraper does not support parsing of '{}' yet.".format(self.id)
160 |         try:
161 |             r = requests.get(self.url)
162 |         except Exception:
163 |             # Catch e.g. "opendata-download-grid.smhi.se"
"opendata-download-grid.smhi.se" 164 | raise NotImplementedError(error_msg) 165 | if r.status_code == 404: 166 | raise NotImplementedError(error_msg) 167 | 168 | return r.json() 169 | 170 | 171 | class StationDimension(Dimension): 172 | 173 | def active_stations(self): 174 | """ Get a list of all active stations 175 | """ 176 | return (x for x in self.allowed_values if x.is_active) 177 | 178 | 179 | class Station(DimensionValue): 180 | def __init__(self, value, dimension, label=None, blob=None): 181 | super(Station, self).__init__(value, dimension, label=label) 182 | 183 | self.key = value 184 | self.summary = blob["summary"] 185 | self.updated = datetime.fromtimestamp(blob["updated"]/1000) 186 | self.blob = blob 187 | 188 | # Was there an update in the last 100 days? 189 | self.is_active = (datetime.now() - self.updated).days < 100 190 | 191 | def __repr__(self): 192 | if self.is_active: 193 | status = "active" 194 | else: 195 | status = "inactive" 196 | return ""\ 197 | .format(self.label.encode("utf-8"), status) 198 | 199 | 200 | class SMHIDataset(Dataset): 201 | @property 202 | def key(self): 203 | return self.blob["key"] 204 | 205 | @property 206 | def url(self): 207 | api = self.parent 208 | return "http://opendata-download-{}.smhi.se/api/version/{}/parameter/{}.json"\ 209 | .format(api.key, VERSION, self.key) 210 | 211 | @property 212 | def json(self): 213 | if not hasattr(self, "_json"): 214 | self._json = requests.get(self.url).json() 215 | return self._json 216 | 217 | def get_stations_list(self): 218 | """ Get a dict list of all stations with properties such as 219 | latitude and longitude 220 | """ 221 | stations = self.dimensions["station"].allowed_values 222 | return self._format_station_list(stations) 223 | 224 | def get_active_stations_list(self): 225 | """ Get a dict list of all stations with properties such as 226 | latitude and longitude 227 | """ 228 | stations = self.dimensions["station"].active_stations() 229 | return self._format_station_list(stations) 230 | 231 | def _get_example_csv(self): 232 | """For dimension parsing 233 | """ 234 | station_key = self.json["station"][0]["key"] 235 | period = "corrected-archive" 236 | url = self.url.replace( 237 | ".json", 238 | f"/station/{station_key}/period/{period}/data.csv" 239 | ) 240 | 241 | r = requests.get(url) 242 | if r.status_code == 200: 243 | return DataCsv().from_string(r.content) 244 | else: 245 | raise Exception("Error connecting to api") 246 | 247 | def _format_station_list(self, stations): 248 | data = [] 249 | for station in stations: 250 | json_data = station.blob 251 | # Inlude all props but link 252 | json_data.pop('link', None) 253 | data.append(station.blob) 254 | 255 | return data 256 | 257 | 258 | class DataCsv(object): 259 | columns = [] 260 | data = [] 261 | 262 | def from_file(self, file_path): 263 | with open(file_path) as f: 264 | self._parse(f) 265 | 266 | return self 267 | 268 | def from_string(self, csv_content): 269 | if isinstance(csv_content, bytes): 270 | csv_content = csv_content.decode("utf-8") 271 | f = StringIO.StringIO(csv_content) 272 | self._parse(f) 273 | 274 | return self 275 | 276 | def to_dictlist(self): 277 | return [ 278 | dict(zip(self.columns, row)) 279 | for row in self.data 280 | ] 281 | 282 | def _parse(self, f): 283 | rows = list(csv.reader(f, delimiter=';')) 284 | tables = [] 285 | table = [] 286 | for i, row in enumerate(rows): 287 | is_last = i == len(rows) - 1 288 | 289 | # Check if new table 290 | if is_empty(row): 291 | if len(table) > 0: 292 | tables.append(table) 
293 |                     table = []
294 |                 continue
295 |
296 |             is_header = len(table) == 0
297 |             if is_header:
298 |                 n_cols = table_width(row)
299 |
300 |             table.append(row[:n_cols])
301 |
302 |             if is_last:
303 |                 tables.append(table)
304 |
305 |         data_table = tables[-1]
306 |         self.columns = data_table[0]
307 |         try:
308 |             self.data = data_table[1:]
309 |         except IndexError:
310 |             self.data = []
311 |
312 |
313 | def is_empty(row):
314 |     """ Check if a csv row (represented as a list
315 |     of values) is empty.
316 |
317 |     [] => True
318 |     ["","","foo"] => True
319 |     ["foo","bar"] => False
320 |     """
321 |     if len(row) == 0:
322 |         return True
323 |     if row[0] == "":
324 |         return True
325 |     return False
326 |
327 |
328 | def table_width(row):
329 |     """ Get number of cols in row
330 |     ["col1", "col2","","","other_col"] => 2
331 |     """
332 |
333 |     for i, val in enumerate(row):
334 |         if val == "":
335 |             break
336 |     return i
337 |
-------------------------------------------------------------------------------- /statscraper/scrapers/StatistikcentralenScraper.py: --------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | """ A wrapper around the Statistikcentralen/Tilastokeskus API,
3 | demonstrating how to extend a scraper in the scraper park.
4 |
5 | The user can select 'fi' or 'sv' as their preferred language like this:
6 |
7 |     scraper = Statistikcentralen("fi")
8 |     # ...or:
9 |     scraper = Statistikcentralen()
10 |     scraper.lang = "fi"
11 | """
12 | from .PXWebScraper import PXWeb
13 |
14 |
15 | class Statistikcentralen(PXWeb):
16 |
17 |     lang = "sv"
18 |     _available_languages = ["sv", "fi"]
19 |
20 |     @property
21 |     def base_url(self):
22 |         return 'http://pxnet2.stat.fi/pxweb/api/v1/%s/StatFin/' % self.lang
23 |
24 |     @PXWeb.on("init")
25 |     def _get_lang(self, *args, **kwargs):
26 |         """ Let users select language
27 |         """
28 |         if "lang" in kwargs:
29 |             if kwargs["lang"] in self._available_languages:
30 |                 self.lang = kwargs["lang"]
31 |
-------------------------------------------------------------------------------- /statscraper/scrapers/VantetiderScraper.py: --------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 | import requests_cache
4 | from requests.exceptions import RequestException
5 | from itertools import product
6 | import re
7 |
8 | requests_cache.install_cache()
9 |
10 | from statscraper.base_scraper import (BaseScraper, Collection,
11 |                                       Dataset, Dimension, Result)
12 |
13 | BASE_URL = u"http://www.vantetider.se/Kontaktkort/"
14 |
15 | class VantetiderScraper(BaseScraper):
16 |
17 |     def _fetch_itemslist(self, current_item):
18 |         # Get start page
19 |         html = self._get_html(BASE_URL + "Sveriges")
20 |         soup = BeautifulSoup(html, 'html.parser')
21 |         # Get links to datasets
22 |         links = soup.find_all("ul", {"class":"main-nav page-width"})[0]\
23 |             .find_all("li")[1]\
24 |             .find_all("a")\
25 |             [2:]  # First two are not relevant
26 |
27 |         ids = [x.get("href").split("/Sveriges/")[-1].replace("/","")
28 |                for x in links]
29 |         labels = [x.text for x in links]
30 |
31 |         for id_, label in zip(ids, labels):
32 |             # Get html of dataset page
33 |             yield VantetiderDataset(id_, label=label)
34 |
35 |     def _fetch_dimensions(self, dataset):
36 |         dimensions = {}
37 |         dataset_id = dataset.id
38 |         try:
39 |             form = [x for x in dataset.soup.find_all("form")
40 |                     if "/Kontaktkort/" in x.get("action")][0]
41 |         except IndexError:
42 |             # http://www.vantetider.se/Kontaktkort/Sveriges/Aterbesok
43 |             # does not have a form element
44 |             form = dataset.soup.find("div", {"class": "container_12 filter_section specialised_operation"})
45 |         self._form = form
46 |
47 |         # 1. Get select elements (dropdowns)
48 |         select_elems = form.find_all("select")
49 |         for elem in select_elems:
50 |             elem_id = elem.get("name")
51 |             dim_id = elem_id.replace("select_","")
52 |
53 |             dim = VantetiderDimension(dim_id)
54 |             dim.elem_id = elem_id
55 |             dim.elem = elem
56 |             yield dim
57 |
58 |         # 2. Get checkboxes (gender, ownership)
59 |         checkbox_elems = [x for x in form.find_all("input", {"type": "checkbox"})]
60 |         checkbox_labels = [x.text for x in form.find_all("label", {"class": "checkbox"})]
61 |         for elem, label in zip(checkbox_elems, checkbox_labels):
62 |             elem_id = elem.get("name")
63 |             dim_id = elem_id.replace("checkbox_","")
64 |
65 |             dim = VantetiderDimension(dim_id)
66 |             dim.elem_id = elem_id
67 |             dim.elem = elem
68 |             yield dim
69 |
70 |
71 |         # 3. Get radio buttons
72 |         radio_elems = [x for x in form.find_all("input", {"type": "radio"})]
73 |         elem_ids = get_unique([x.get("name") for x in radio_elems])
74 |
75 |         for elem_id in elem_ids:
76 |             elems = [x for x in radio_elems if x.get("name") == elem_id]
77 |             dim_id = elem_id  # radio button names carry no prefix to strip
78 |
79 |             dim = VantetiderDimension(dim_id)
80 |             dim.elem_id = elem_id
81 |             dim.elem = elems
82 |             yield dim
83 |
84 |
85 |         # 4. Add measure and measure key
86 |         yield VantetiderDimension("measure", label="Nyckeltal")
87 |
88 |
89 |     def _fetch_data(self, dataset, query):
90 |         only_region = list(query.keys()) == ["region"]
91 |         NO_QUERY_DIMS = ["measure"]
92 |         #
93 |         NOT_IMPLEMENTED_DIMS = ["unit", "services"]
94 |
95 |         for dim_id in NOT_IMPLEMENTED_DIMS:
96 |             if dim_id in query.keys():
97 |                 msg = "Querying by {} is not implemented.".format(dim_id)
98 |                 raise NotImplementedError(msg)
99 |
100 |         form_keys = [x.elem_id for x in dataset.dimensions if x.id not in NO_QUERY_DIMS]
101 |
102 |         queries = []
103 |
104 |         # Create payload for post request
105 |         # Get a list of values to query by
106 |         query_values = []
107 |         for dim in dataset.dimensions:
108 |             if dim.id in NO_QUERY_DIMS:
109 |                 continue
110 |
111 |             # Pass default value if dimension is not in query
112 |             if dim.id not in query:
113 |                 value = [dim.default_value]
114 |
115 |             else:
116 |                 # Translate passed values to ids
117 |                 value = query[dim.id]
118 |                 if not isinstance(value, list):
119 |                     value = [value]
120 |
121 |             if value is None:
122 |                 raise ValueError()
123 |             query_values.append(value)
124 |
125 |         queries = list(product(*query_values))
126 |
127 |         self.log.info(u"Making a total of {} queries".format(len(queries)))
128 |
129 |         data = []
130 |
131 |         for _query in queries:
132 |             payload = dict(zip(form_keys, _query))
133 |             url = dataset.get_url(payload["select_region"])
134 |
135 |             for row in dataset._parse_result_page(url, payload, only_region=only_region):
136 |                 yield row
137 |
138 |
139 |     # HELPER METHODS
140 |     def _get_html(self, url):
141 |         """ Get html from url
142 |         """
143 |         self.log.info(u"/GET {}".format(url))
144 |         r = requests.get(url)
145 |         if hasattr(r, 'from_cache'):
146 |             if r.from_cache:
147 |                 self.log.info("(from cache)")
148 |
149 |         if r.status_code != 200:
150 |             throw_request_err(r)
151 |
152 |         return r.content
153 |
154 |     def _post_html(self, url, payload):
155 |         self.log.info(u"/POST {} with {}".format(url, payload))
156 |         r = requests.post(url, payload)
157 |         if r.status_code != 200:
158 |             throw_request_err(r)
159 |
160 |         return r.content
161 |
162 |     def _get_json(self, url):
163 |         """ Get json from url
164 |         """
165 |         self.log.info(u"/GET " + url)
166 |         r = requests.get(url)
167 |         if hasattr(r, 'from_cache'):
168 |             if r.from_cache:
169 |                 self.log.info("(from cache)")
170 |         if r.status_code != 200:
171 |             throw_request_err(r)
172 |
173 |         return r.json()
174 |
175 |
176 |
177 |     @property
178 |     def log(self):
179 |         if not hasattr(self, "_logger"):
180 |             self._logger = PrintLogger()
181 |         return self._logger
182 |
183 |
184 | class VantetiderDataset(Dataset):
185 |
186 |     def get_url(self, region="Sverige"):
187 |         region_slug = self._get_region_slug(region)
188 |         return BASE_URL + region_slug + "/" + self.id
189 |
190 |     @property
191 |     def html(self):
192 |         if not hasattr(self, "_html"):
193 |             url = self.get_url()
194 |             self._html = self.scraper._get_html(url)
195 |         return self._html
196 |
197 |     @property
198 |     def soup(self):
199 |         return BeautifulSoup(self.html, 'html.parser')
200 |
201 |     @property
202 |     def regions(self):
203 |         """ Get a list of all regions
204 |         """
205 |         regions = []
206 |         elem = self.dimensions["region"].elem
207 |         for option_elem in elem.find_all("option"):
208 |             region = option_elem.text.strip()
209 |             regions.append(region)
210 |
211 |         return regions
212 |
213 |
214 |     def _get_region_slug(self, id_or_label):
215 |         """ Get the regional slug to be used in url
216 |             "Norrbotten" => "Norrbottens"
217 |
218 |             :param id_or_label: Id or label of region
219 |         """
220 |         #region = self.dimensions["region"].get(id_or_label)
221 |         region = id_or_label
222 |         slug = region\
223 |             .replace(u" ","-")\
224 |             .replace(u"ö","o")\
225 |             .replace(u"Ö","O")\
226 |             .replace(u"ä","a")\
227 |             .replace(u"å","a") + "s"
228 |
229 |         EXCEPTIONS = {
230 |             "Jamtland-Harjedalens": "Jamtlands",
231 |             "Rikets": "Sveriges",
232 |         }
233 |         if slug in EXCEPTIONS:
234 |             slug = EXCEPTIONS[slug]
235 |
236 |         return slug
237 |
238 |     def _parse_result_page(self, url, payload, only_region=False):
239 |         """ Get data from a result page
240 |             :param url: url to query
241 |             :param payload: payload to pass
242 |             :return: a dictlist with data
243 |         """
244 |         data = []
245 |         try:
246 |
247 |             if only_region:
248 |                 html = self.scraper._get_html(url)
249 |             else:
250 |                 html = self.scraper._post_html(url, payload=payload)
251 |
252 |         except RequestException500:
253 |
254 |             self.scraper.log.warning(u"Status code 500 on {} with {}".format(url, payload))
255 |             return None
256 |
257 |
258 |         current_selection = self._get_current_selection(html)
259 |
260 |         table = Datatable(html)
261 |         data = []
262 |         for row in table.data:
263 |             region_or_unit_id, region_or_unit_label = row["region_or_unit"]
264 |             if region_or_unit_label in self.regions:
265 |                 row["region"] = region_or_unit_label
266 |                 row["unit"] = None
267 |             else:
268 |                 row["region"] = None
269 |                 row["unit"] = region_or_unit_label
270 |
271 |             value = row["value"]
272 |
273 |             row.pop("value", None)
274 |             row.pop("region_or_unit", None)
275 |
276 |             for dim in self.dimensions:
277 |                 if dim.id not in row:
278 |                     row[dim.id] = current_selection[dim.id][1]  # gets label
279 |
280 |
281 |
282 |             data.append(Result(value, row))
283 |         return data
284 |
285 |     def _get_current_selection(self, html):
286 |         if isinstance(html, str):
287 |             html = BeautifulSoup(html, "html.parser")
288 |         current_selection = {}
289 |         for dim in self.dimensions:
290 |             if dim.id in ["measure"]:
291 |                 continue
292 |
293 |             elem = html.select("[name={}]".format(dim.elem_id))
294 |
295 |             if len(elem) > 1 or len(elem) == 0:
296 |                 msg = u"Expected exactly one form element named {}".format(dim.elem_id)
297 |                 raise Exception(msg)
298 |             else:
299 |                 elem = elem[0]
300 |
301 |             if dim.elem_type == "select":
302 |                 try:
303 |                     option_elem = elem.select_one("[selected]")
304 |                     selected_id = get_option_value(option_elem)
305 |                     selected_label = get_option_text(option_elem)
306 |                 except AttributeError:
307 |                     option_elem = elem.select_one("option")
308 |                     selected_id = get_option_value(option_elem)
309 |                     selected_label = get_option_text(option_elem)
310 |
311 |                 selected_cat = selected_id
312 |             elif dim.elem_type == "radio":
313 |                 # Radio buttons are not handled here yet
314 |                 raise NotImplementedError()
315 |             elif dim.elem_type == "checkbox":
316 |                 selected_cat = elem.has_attr("checked")
317 |                 selected_label = selected_cat
318 |
319 |             current_selection[dim.id] = (selected_cat, selected_label)
320 |
321 |         return current_selection
322 |
323 | class VantetiderDimension(Dimension):
324 |     """A dimension in a Vantetider dataset."""
325 |
326 |     @property
327 |     def elem_type(self):
328 |         """ :returns: "select"|"radio"|"checkbox"
329 |         """
330 |         if not hasattr(self, "_elem_type"):
331 |             self._elem_type = get_elem_type(self.elem)
332 |         return self._elem_type
333 |
334 |
335 |     @property
336 |     def default_value(self):
337 |         """ The default category when making a query
338 |         """
339 |         if not hasattr(self, "_default_value"):
340 |             if self.elem_type == "select":
341 |                 try:
342 |                     # Get option marked "selected"
343 |                     def_value = get_option_value(self.elem.select_one("[selected]"))
344 |                 except AttributeError:
345 |                     # ...or if that one doesn't exist get the first option
346 |                     def_value = get_option_value(self.elem.select_one("option"))
347 |
348 |             elif self.elem_type == "checkbox":
349 |                 def_value = self.elem.get("value")
350 |
351 |             elif self.elem_type == "radio":
352 |                 def_value = [x for x in self.elem if x.has_attr("checked")][0].get("value")
353 |
354 |             self._default_value = def_value
355 |
356 |             assert def_value is not None
357 |
358 |         return self._default_value
359 |
360 | class PrintLogger():
361 |     """ Empty "fake" logger
362 |     """
363 |
364 |     def log(self, msg, *args, **kwargs):
365 |         print(msg)
366 |
367 |     def debug(self, msg, *args, **kwargs):
368 |         print(msg)
369 |
370 |     def info(self, msg, *args, **kwargs):
371 |         print(msg)
372 |
373 |     def warning(self, msg, *args, **kwargs):
374 |         print(msg)
375 |
376 |     def error(self, msg, *args, **kwargs):
377 |         print(msg)
378 |
379 |     def critical(self, msg, *args, **kwargs):
380 |         print(msg)
381 |
382 |
383 | # UTILS
384 | class Datatable(object):
385 |     def __init__(self, html):
386 |         self.soup = BeautifulSoup(html, 'html.parser')
387 |         self.data = self._parse_values()
388 |         self._measures = None
389 |         # Assumption: the data table is the last table on the page
390 |
391 |
392 |     @property
393 |     def has_tabs(self):
394 |         """ Does the table have tabs?
395 |             Like http://www.vantetider.se/Kontaktkort/Sveriges/VantatKortareAn60Dagar/
396 |         """
397 |         return len(self.soup.select(".table_switch")) > 0
398 |
399 |     @property
400 |     def has_horizontal_scroll(self):
401 |         """ Does the table have horizontal scroll?
402 |             Like http://www.vantetider.se/Kontaktkort/Sveriges/VantatKortareAn60Dagar/
403 |         """
404 |         return len(self.soup.select(".DTFC_ScrollWrapper")) > 0
405 |
406 |     @property
407 |     def has_vertical_scroll(self):
408 |         """ Does the table have vertical scroll?
409 |             Like http://www.vantetider.se/Kontaktkort/Sveriges/PrimarvardTelefon/
410 |         """
411 |         return bool(self.soup.select_one("#DataTables_Table_0_wrapper"))
412 |
413 |
414 |
415 |     @property
416 |     def measures(self):
417 |         """ Get a list of the measures of this datatable.
418 |             Measures can be "Antal Besök inom 7 dagar",
19 |             "Måluppfyllelse vårdgarantin", etc.
420 |         """
421 |         if self._measures is None:
422 |             self._measures = get_unique([x["measure"] for x in self.data])
423 |
424 |         return self._measures
425 |
426 |     def _parse_values(self):
427 |         """ Get values
428 |         """
429 |         data = []
430 |         if self.has_tabs:
431 |             def _parse_tab_text(tab):
432 |                 # Annoying html in tabs
433 |                 if tab.select_one(".visible_normal"):
434 |                     return tab.select_one(".visible_normal").text
435 |                 else:
436 |                     return tab.text
437 |
438 |             sub_table_ids = [_parse_tab_text(x) for x in self.soup.select(".table_switch li")]
439 |             sub_tables = self.soup.select(".dataTables_wrapper")
440 |             assert len(sub_tables) == len(sub_table_ids)
441 |             assert len(sub_tables) > 0
442 |
443 |             for measure, table in zip(sub_table_ids, sub_tables):
444 |                 if self.has_horizontal_scroll:
445 |                     _data = self._parse_horizontal_scroll_table(table)
446 |                     for region, col, value in _data:
447 |                         data.append({
448 |                             "region_or_unit": region,
449 |                             "select_period": col,  # Hardcode warning!
450 |                             "measure": measure, "value": value,
451 |                         })
452 |
453 |         else:
454 |             if self.has_horizontal_scroll:
455 |                 raise NotImplementedError()
456 |
457 |             if self.has_vertical_scroll:
458 |                 table = self.soup.select_one("#DataTables_Table_0_wrapper")
459 |                 _data = self._parse_vertical_scroll_table(table)
460 |             else:
461 |                 table = self.soup.select(".chart.table.scrolling")[-1]
462 |                 _data = self._parse_regular_table(table)
463 |
464 |             for region, measure, value in _data:
465 |                 data.append({
466 |                     "region_or_unit": region,
467 |                     "measure": measure,
468 |                     "value": value
469 |                 })
470 |
471 |         return data
472 |
473 |     def _parse_horizontal_scroll_table(self, table_html):
474 |         """ Get list of dicts from horizontally scrollable table
475 |         """
476 |         row_labels = [parse_text(x.text) for x in table_html.select(".DTFC_LeftBodyWrapper tbody tr")]
477 |         row_label_ids = [None] * len(row_labels)
478 |         cols = [parse_text(x.text) for x in table_html.select(".dataTables_scrollHead th")]
479 |         value_rows = table_html.select(".dataTables_scrollBody tbody tr")
480 |
481 |         values = []
482 |         for row_i, value_row in enumerate(value_rows):
483 |             row_values = [parse_value(x.text) for x in value_row.select("td")]
484 |             values.append(row_values)
485 |
486 |         sheet = Sheet(zip(row_label_ids, row_labels), cols, values)
487 |
488 |         return sheet.long_format
489 |
490 |     def _parse_vertical_scroll_table(self, table_html):
491 |         value_rows = table_html.select("tbody tr")
492 |         row_labels = [parse_text(x.select_one("td").text) for x in value_rows]
493 |         row_label_ids = [None] * len(row_labels)
494 |         if table_html.select_one("td .clickable"):
495 |             row_label_ids = [parse_landsting(x.select_one("td .clickable").get("onclick")) for x in value_rows]
496 |
497 |         cols = [parse_text(x.text) for x in table_html.select(".dataTables_scrollHead th")][1:]
498 |         values = []
499 |         for row in value_rows:
500 |             row_values = [parse_value(x.text) for x in row.select("td")[1:]]
501 |             values.append(row_values)
502 |
503 |         sheet = Sheet(zip(row_label_ids, row_labels), cols, values)
504 |
505 |         return sheet.long_format
506 |
507 |     def _parse_regular_table(self, table_html):
508 |         value_rows = table_html.select("tbody tr")
509 |         row_labels = [parse_text(x.select_one("td").text) for x in value_rows]
510 |         row_label_ids = [None] * len(row_labels)
511 |         if table_html.select_one("td .clickable"):
512 |             row_label_ids = [parse_landsting(x.select_one("td .clickable").get("onclick")) for x in value_rows]
513 |         cols = [parse_text(x.text) for x in table_html.select("th")][1:]
514 |         values = []
515 |         for row in value_rows:
516 |             row_values = [parse_value(x.text) for x in row.select("td")[1:]]
517 |             values.append(row_values)
518 |
519 |         sheet = Sheet(zip(row_label_ids, row_labels), cols, values)
520 |
521 |         return sheet.long_format
522 |
523 |
524 |
525 | class Sheet(object):
526 |     """ Represents a two-dimensional sheet/table with data
527 |     """
528 |     def __init__(self, rows, cols, values):
529 |         """
530 |         :param rows: a list with row values
531 |         :param cols: a list with column headers
532 |         :param values: a list of lists with row values
533 |         """
534 |         self.values_by_row = values
535 |         self.values = flatten(values)
536 |
537 |         if len(rows) * len(cols) != len(self.values):
538 |             raise ValueError(
539 |                 "Error initing sheet: {} rows x {} cols don't add up; got {} values."
540 |                 .format(len(rows), len(cols), len(self.values)))
541 |
542 |         assert len(rows) == len(values)
543 |         assert len(cols) == len(values[0])
544 |
545 |         self.row_index = rows
546 |         self.col_index = cols
547 |
548 |     @property
549 |     def as_dictlist(self):
550 |         """ Returns a dictlist with values
551 |             [
552 |                 {
553 |                     "row": "row_a",
554 |                     "col": "col_a",
555 |                     "value": 1,
556 |                 }
557 |             ]
558 |         """
559 |         data = []
560 |         for row_i, row in enumerate(self.row_index):
561 |             for col_i, col in enumerate(self.col_index):
562 |                 value = self.values_by_row[row_i][col_i]
563 |                 data.append({
564 |                     "row": row,
565 |                     "col": col,
566 |                     "value": value,
567 |                 })
568 |         return data
569 |
570 |     @property
571 |     def long_format(self):
572 |         return zip(
573 |             repeat(self.row_index, len(self.col_index)),
574 |             self.col_index * len(self.row_index),
575 |             self.values
576 |         )
577 |
578 | def get_unique(l):
579 |     """ Get unique values from list.
580 |     Placed outside the class because `list` conflicts with our internal
581 |     method with the same name.
582 |     """
583 |     return list(set(l))
584 |
585 | def get_elem_type(elem):
586 |     """ Get elem type of soup selection
587 |         :param elem: a soup element
588 |     """
589 |     elem_type = None
590 |     if isinstance(elem, list):
591 |         if elem[0].get("type") == "radio":
592 |             elem_type = "radio"
593 |         else:
594 |             raise ValueError(u"Unknown element type: {}".format(elem))
595 |
596 |     elif elem.name == "select":
597 |         elem_type = "select"
598 |
599 |     elif elem.name == "input":
600 |         elem_type = elem.get("type")
601 |
602 |     else:
603 |         raise ValueError(u"Unknown element type: {}".format(elem))
604 |
605 |     # To be removed
606 |     assert elem_type is not None
607 |
608 |     return elem_type
609 |
610 | def get_option_value(elem):
611 |     """ Get the value attribute, or if it doesn't exist the text
612 |     content.
613 |     <option value="foo">bar</option> => "foo"
614 |     <option>bar</option> => "bar"
615 |     :param elem: a soup element
616 |     """
617 |     value = elem.get("value")
618 |     if value is None:
619 |         value = elem.text.strip()
620 |     if value is None or value == "":
621 |         msg = u"Error parsing value from {}.".format(elem)
622 |         raise ValueError(msg)
623 |
624 |     return value
625 |
626 | def get_option_text(elem):
627 |     """ Get the text of an option.
628 |     <option value="foo">bar</option> => "bar"
629 |     <option>bar</option> => "bar"
630 |     :param elem: a soup element
631 |     """
632 |     return elem.text.strip()
633 |
634 |
635 | def parse_value(val):
636 |     """ Parse values from html
637 |     """
638 |     val = val.replace("%", " ")\
639 |         .replace(" ","")\
640 |         .replace(",", ".")\
641 |         .replace("st","").strip()
642 |
643 |     missing = ["Ejdeltagit", "N/A"]
644 |     if val in missing:
645 |         return val
646 |     elif val == "":
647 |         return None
648 |
649 |     return float(val)
650 |
651 | def parse_text(val):
652 |     """ Format strings fetched from html
653 |     """
654 |     return val.replace("\n", " ").strip()
655 |
656 | def parse_landsting(val):
657 |     """ Get region/unit id from "handle_click_event_landsting(this, 1)"
658 |     """
659 |     try:
660 |         return re.search(r"\(this, (\d+)", val).group(1)
661 |     except AttributeError:
662 |         return None
663 |
664 | def is_string(val):
665 |     return isinstance(val, str)
666 |
667 | def flatten(l):
668 |     """Flatten list of lists
669 |     """
670 |     return [item for sublist in l for item in sublist]
671 |
672 | def repeat(l, n):
673 |     """ Repeat all items in list n times
674 |         repeat([1,2,3], 2) => [1,1,2,2,3,3]
675 |         http://stackoverflow.com/questions/24225072/repeating-elements-of-a-list-n-times
676 |     """
677 |     return [x for x in l for i in range(n)]
678 |
679 | def is_int(s):
680 |     try:
681 |         int(s)
682 |         return True
683 |     except ValueError:
684 |         return False
685 |
686 | from collections import Counter  # needed below; missing from the module header
687 | def guess_measure_unit(values):
688 |     last_words = [x.split(" ")[-1] for x in values]
689 |     counts = Counter(last_words).most_common()
690 |     max_share = float(counts[0][1] / float(len(values)))
691 |     if max_share <= 0.5:
692 |         raise ValueError(u"Not sure how to interpret the measure unit in: {}".format(values))
693 |
694 |     return counts[0][0]
695 |
696 | class RequestException404(RequestException):
697 |     pass
698 |
699 | class RequestException500(RequestException):
700 |     pass
701 |
-------------------------------------------------------------------------------- /statscraper/scrapers/VehicleScraper.py: --------------------------------------------------------------------------------
1 | # encoding: utf-8
2 |
3 | import pandas as pd
4 | import json
5 | from statscraper import BaseScraper, Dataset, Dimension, Result
6 |
7 | MONTHS = ['januari', 'februari', 'mars', 'april', 'maj', 'juni',
8 |           'juli', 'augusti', 'september', 'oktober', 'november', 'december']
9 |
10 |
11 | class Vehicles(BaseScraper):
12 |     """Vehicle statistics from Transportstyrelsen.
13 |
14 |     :return: :class:`Vehicles <Vehicles>` object
15 |     :rtype: statscraper.BaseScraper
16 |
17 |     Usage::
18 |
19 |         >>> from statscraper.scrapers import Vehicles
20 |         >>> scraper = Vehicles()
21 |         >>> scraper.items
22 |         # [<Dataset: Vehicles>]
23 |     """
24 |
25 |     BASE_URL = ('https://www.transportstyrelsen.se/globalassets/'
26 |                 'global/press/statistik/fordonsstatistik/{year}/'
27 |                 'fordonsstatistik-{month}-{year}.xlsx')
28 |
29 |     def _clean_data(self, df, year, month):
30 |         df = df.dropna(how='all', axis=1)
31 |         df = df.dropna(how='all', axis=0)
32 |         df = df.drop('Totalsumma', axis=1)
33 |         df = df.rename(columns={'Unnamed: 1': 'vehicle_type'})
34 |         df = df[df['vehicle_type'] != 'Totalsumma']
35 |         df.loc[:, 'year'] = year
36 |         df.loc[:, 'month'] = month
37 |         df = pd.melt(df,
38 |                      id_vars=['vehicle_type', 'month', 'year'],
39 |                      value_vars=['AVREGISTRERAD', 'AVSTÄLLD', 'ITRAFIK'],
40 |                      var_name='status')
41 |         return df
42 |
43 |     def _fetch_itemslist(self, item):
44 |         """There's one dataset spread out in many files."""
45 |         yield Dataset('Vehicles')
46 |
47 |     def _fetch_dimensions(self, dataset):
48 |         yield Dimension('year', datatype='year')
49 |         yield Dimension('month')  # TODO: Convert to datatype month
50 |         yield Dimension('vehicle_type')
51 |         yield Dimension('status')
52 |
53 |     def _fetch_data(self, dataset, query=None):
54 |         files = [(y, m) for y in query['years'] for m in query['months']]
55 |         frames = []
56 |
57 |         # Download and clean every monthly Excel file
58 |         for file in files:
59 |             year, month = file
60 |             url = self.BASE_URL.format(year=year, month=MONTHS[month])
61 |             frame = self._clean_data(pd.read_excel(url), year, month)
62 |             frames.append(frame)
63 |
64 |         # Yield individual rows of type Result from the dataframe
65 |         raw_data = pd.concat(frames)
66 |         for i, row in raw_data.iterrows():
67 |             val = row.pop('value')
68 |             yield Result(val, json.loads(row.to_json()))
69 |
-------------------------------------------------------------------------------- /statscraper/scrapers/__init__.py: --------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | """ Expose scraper classes here.
3 | """
4 |
5 | from .SCBScraper import SCB
6 | from .PXWebScraper import PXWeb
7 | from .CranesScraper import Cranes
8 | from .VehicleScraper import Vehicles
9 | from .SMHIScraper import SMHI
10 |
-------------------------------------------------------------------------------- /statscraper/scrapers/uka_scraper.py: --------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | u""" A scraper to fetch Swedish university application statistics from
3 | the Swedish Higher Education Authority (Universitetskanslerämbetet, UKÄ),
4 | at http://statistik.uka.se
5 | """
6 | from statscraper import BaseScraper, Dataset, Dimension, Result, Collection
7 | import requests
8 | from bs4 import BeautifulSoup
9 |
10 |
11 | class UKA(BaseScraper):
12 |
13 |     def _fetch_itemslist(self, item):
14 |         """ We only offer regional application stats.
15 |         Other collections are differently structured.
16 |         """
17 |         if item.is_root:
18 |             yield Collection("regional",
19 |                              label="New students by area and school.")
20 |         else:
21 |             yield Dataset("municipality",
22 |                           label="Students by municipality, school, semester.")
23 |
24 |     def _fetch_dimensions(self, dataset):
25 |         """ Iterate through semesters, counties and municipalities.
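        (Illustrative reading of _fetch_data below, stated as an assumption:
        the source database numbers terms so that index 5 is VT 1993 and
        index 6 is HT 1993, i.e. two terms per calendar year from 1993 on.)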
26 | """ 27 | yield Dimension(u"school") 28 | yield Dimension(u"year", 29 | datatype="year") 30 | yield Dimension(u"semester", 31 | datatype="academic_term", 32 | dialect="swedish") # HT/VT 33 | yield Dimension(u"municipality", 34 | datatype="year", 35 | domain="sweden/municipalities") 36 | 37 | def _fetch_data(self, dataset, query): 38 | url = "http://statistik.uka.se/4.5d85793915901d205f935d0f.12.5d85793915901d205f965eab.portlet?action=resultat&view=resultTable&frageTyp=3&frageNr=240&tid=%s&grupp1=%s&grupp2=%s" 39 | thenmap_url = "http://api.thenmap.net/v1/se-7/data/%s?data_props=name|kommunkod" 40 | # 6 is 1993, the first year in the db 41 | if query is None: 42 | query = {} 43 | if "from" not in query: 44 | query['from'] = 1993 45 | if "semesters" not in query: 46 | query['semesters'] = (2016 - query["from"]) * 2 47 | start = (query["from"] - 1993) * 2 + 5 48 | terms = range(start, 49 | start + query["semesters"] + 2) 50 | for t in terms: 51 | # Get all municipalities, and their codes, from this year 52 | year = ((t - 5) / 2) + 1993 53 | semester = ["HT", "VT"][t % 2] 54 | municipalities = requests.get(thenmap_url % year).json() 55 | for id_, municipality_ in municipalities["data"].items(): 56 | municipality = municipality_.pop() 57 | code = municipality["kommunkod"].zfill(4) 58 | c, m = code[:2], code[2:] 59 | html = requests.get(url % (t, c, m)).text 60 | soup = BeautifulSoup(html, 'html.parser') 61 | table = soup.find("table") 62 | # The first rows are headers, the last are empty 63 | rows = table.find_all("tr")[5:-2] 64 | for row in rows: 65 | cells = row.find_all("td") 66 | 67 | yield Result(cells[2].text.strip(), { 68 | "municipality": municipality["name"], 69 | "school": cells[0].text.strip(), 70 | "semester": semester, 71 | "year": year, 72 | }) 73 | -------------------------------------------------------------------------------- /statscraper/scrapers/work_injury_scraper.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | """ A scraper to fetch Swedish work injury stats from 3 | http://webbstat.av.se 4 | 5 | This is an example of a scraper using Selenium. 6 | TODO: Move some useful functionality to a SeleciumFirefoxScraper 7 | 8 | To change download location: 9 | export STATSCRAPER_TEMPDIR="/path/to/temp/dir" 10 | 11 | """ 12 | from selenium import webdriver 13 | from selenium.webdriver.common.keys import Keys 14 | from selenium.webdriver.common.action_chains import ActionChains 15 | from selenium.webdriver.support.wait import WebDriverWait 16 | from statscraper import BaseScraper, Collection, Dataset, Result, Dimension 17 | import os 18 | from glob import iglob 19 | from time import sleep 20 | from uuid import uuid4 21 | from xlrd import open_workbook 22 | from selenium.webdriver.support import expected_conditions as EC 23 | from selenium.webdriver.common.by import By 24 | 25 | DEFAULT_TEMPDIR = "./tmp" 26 | TEMPDIR_ENVVAR = "STATSCRAPER_TEMPDIR" 27 | PAGELOAD_TIMEOUT = 90 # seconds 28 | 29 | 30 | class WorkInjuries(BaseScraper): 31 | 32 | tempdir = "./tmp" 33 | 34 | @BaseScraper.on("init") 35 | def initiate_browser(self): 36 | 37 | # Create a unique tempdir for downloaded files 38 | tempdir = os.getenv(TEMPDIR_ENVVAR, DEFAULT_TEMPDIR) 39 | tempsubdir = uuid4().hex 40 | # TODO: Remove this directory when finished! 
41 |         self.tempdir = os.path.join(tempdir, tempsubdir)
42 |         try:
43 |             # Try and create directory before checking if it exists,
44 |             # to avoid race condition
45 |             os.makedirs(self.tempdir)
46 |         except OSError:
47 |             if not os.path.isdir(self.tempdir):
48 |                 raise
49 |
50 |         profile = webdriver.FirefoxProfile()
51 |         # Set download location, avoid download dialogues if possible
52 |         # Different settings needed for different Firefox versions
53 |         # This will be a long list...
54 |         profile.set_preference('browser.download.folderList', 2)
55 |         profile.set_preference('browser.download.manager.showWhenStarting', False)
56 |         profile.set_preference('browser.download.manager.closeWhenDone', True)
57 |         profile.set_preference('browser.download.dir', self.tempdir)
58 |         profile.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/octet-stream;application/vnd.ms-excel")
59 |         profile.set_preference("browser.helperApps.alwaysAsk.force", False)
60 |         profile.set_preference("browser.download.manager.useWindow", False)
61 |
62 |         self.browser = webdriver.Firefox(profile)
63 |
64 |         self.browser.get('http://webbstat.av.se')
65 |         detailed_cls = "Document_TX_GOTOTAB_Avancerad"
66 |         """ The button for expanded detailed options. This
67 |         also happens to be a good indicator as to whether
68 |         all content is loaded.
69 |         """
70 |
71 |         # Wait for a content element, and 3 extra seconds just in case
72 |         WebDriverWait(self.browser, PAGELOAD_TIMEOUT)\
73 |             .until(EC.presence_of_element_located((By.CLASS_NAME,
74 |                                                    detailed_cls)))
75 |         self.browser.implicitly_wait(3)
76 |
77 |         self.browser\
78 |             .find_element_by_class_name(detailed_cls)\
79 |             .find_element_by_tag_name("td")\
80 |             .click()
81 |         # Wait for a content element, and 3 extra seconds just in case
82 |         WebDriverWait(self.browser, PAGELOAD_TIMEOUT)\
83 |             .until(EC.presence_of_element_located((By.CLASS_NAME,
84 |                                                    detailed_cls)))
85 |         self.browser.implicitly_wait(3)
86 |
87 |     @BaseScraper.on("select")
88 |     def switch_dataset(self, id_):
89 |         (c, r, p) = self.current_item.blob
90 |
91 |         # Select collection
92 |         xpath = "//div[@title='%s']" % c
93 |         # `c` can be either "Arbetsolycka" or "Arbetssjukdom"
94 |         button = self.browser.find_element_by_xpath(xpath)
95 |         button.click()
96 |
97 |         # select Kommun or Län
98 |         xpath = '//div[@class="QvContent"]/div[@class="QvGrid"]//div[@title="Visa tabell per:"]'
99 |         self.browser\
100 |             .find_element_by_xpath(xpath)\
101 |             .click()
102 |         region = "Kommun" if r == "kommun" else "Län"
103 |         xpath = "//div[@class='QvListbox']//div[@title='%s']" % region
104 |         self.browser\
105 |             .find_element_by_xpath(xpath)\
106 |             .click()
107 |
108 |         # select Månad or År
109 |         xpath = '//div[@class="QvContent"]/div[@class="QvGrid"]//div[@title="Tidsenhet:"]'
110 |         self.browser\
111 |             .find_element_by_xpath(xpath)\
112 |             .click()
113 |         period = "Månad" if p == u"månad" else "År och månad"
114 |         xpath = "//div[@class='QvListbox']//div[@title='%s']" % period
115 |         self.browser\
116 |             .find_element_by_xpath(xpath)\
117 |             .click()
118 |
119 |     def _fetch_dimensions(self, dataset):
120 |         """ Declaring available dimensions like this is not mandatory,
121 |         but nice, especially if they differ from dataset to dataset.
122 |
123 |         If you are using a built-in datatype, you can specify the dialect
124 |         you are expecting, to have values normalized. This scraper will
125 |         look for Swedish month names (e.g. 'Januari'), but return them
126 |         according to the Statscraper standard ('january').
127 | """ 128 | yield Dimension(u"region", 129 | label="municipality or county", 130 | datatype="region", 131 | dialect="arbetsmiljoverket") 132 | yield Dimension(u"period", 133 | label="Year or month") 134 | 135 | def _fetch_itemslist(self, item): 136 | """ We define two collection: 137 | - Number of work injuries ("Arbetsolycka") 138 | - Number of workrelated diseases ("Arbetssjukdom") 139 | Each contains four datasets: 140 | - Per municipality and year 141 | - Per county and year 142 | - Per municipality and month 143 | - Per municipality and year 144 | """ 145 | if item.is_root: 146 | for c in ["Arbetsolycka", "Arbetssjukdom"]: 147 | yield Collection(c, blob=(c, None, None)) 148 | else: 149 | c = item.id 150 | for r in [u"kommun", u"län"]: 151 | for p in [u"år", u"månad"]: 152 | yield Dataset(u"%s-%s-%s" % (c, r, p), 153 | blob=(c, r, p), 154 | label=u"%s, antal per %s och %s" % (c, r, p)) 155 | 156 | def _fetch_data(self, dataset, query=None): 157 | (c, r, p) = dataset.blob 158 | 159 | self.browser\ 160 | .find_element_by_xpath("//div[@title='Skicka till Excel']")\ 161 | .click() 162 | # Press enter trice in case of any prompts 163 | actions = ActionChains(self.browser) 164 | actions.send_keys(Keys.RETURN) 165 | actions.send_keys(Keys.RETURN) 166 | actions.send_keys(Keys.RETURN) 167 | actions.perform() 168 | # Wait for download 169 | i = 0 170 | while not os.listdir(self.tempdir): 171 | sleep(1) 172 | i += 1 173 | if i > PAGELOAD_TIMEOUT: 174 | # TODO: Use a suitable basescraper exception 175 | raise Exception("Download timed out") 176 | sleep(20) # TODO: We need to check that the file is complete. 177 | # Something like this: 178 | # https://stackoverflow.com/questions/35891393/how-to-get-file-download-complete-status-using-selenium-web-driver-c-sharp#35892347 179 | 180 | # WARNING: Assuming the latest downloaded xls to be our file. 181 | # This is obviously not 100 % water proof. 
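# A sketch of the completeness check mentioned above (assuming
# Firefox, which keeps a "*.part" file next to an unfinished
# download):
#
#     def download_is_complete(directory):
#         files = os.listdir(directory)
#         return bool(files) and not any(f.endswith(".part") for f in files)
#
# Polling that predicate instead of the fixed sleep(20) above would
# be both faster and more robust.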
182 | latest_download = max(iglob(os.path.join(self.tempdir, "*.xls")), 183 | key=os.path.getctime) 184 | workbook = open_workbook(latest_download) 185 | sheet = workbook.sheet_by_index(0) 186 | periods = sheet.row_values(0)[2:-1] 187 | periods = [int(x) for x in periods] 188 | for n in range(1, sheet.nrows): 189 | row = sheet.row_values(n) 190 | region = row.pop(0) 191 | row.pop(0) # empty due to merged cells 192 | if region == "Total": 193 | break 194 | # enumerate() keeps each value paired with its period column 195 | for i, col in enumerate(row[:-1]): 196 | yield Result( 197 | int(col), 198 | { 199 | "region": region, 200 | "period": periods[i], 201 | } 202 | ) 203 | -------------------------------------------------------------------------------- /tests/scrapertests/test_injury_scraper.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | from unittest import TestCase 4 | from statscraper.scrapers.work_injury_scraper import WorkInjuries 5 | 6 | 7 | class TestInjuries(TestCase): 8 | 9 | def setup_method(self, test_method): 10 | self.scraper = WorkInjuries() 11 | 12 | def test_can_fetch(self): 13 | collection = self.scraper[1] # "Arbetssjukdom" 14 | dataset = collection[3] 15 | data = dataset.data 16 | self.assertTrue(len(data)) 17 | -------------------------------------------------------------------------------- /tests/scrapertests/test_pxweb_scraper.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | from unittest import TestCase 4 | 5 | from statscraper.scrapers import PXWeb 6 | 7 | 8 | class TestPXWeb(TestCase): 9 | 10 | def test_init_scraper(self): 11 | """Extending the PXWebScraper.""" 12 | pxscraper = PXWeb(base_url="http://pxnet2.stat.fi/pxweb/api/v1/sv/StatFin/") 13 | self.assertTrue(len(pxscraper.items)) 14 | 15 | def test_navigating_tree(self): 16 | """Navigate the tree.""" 17 | scraper = PXWeb(base_url="http://pxnet2.stat.fi/pxweb/api/v1/sv/StatFin/") 18 | scraper.move_to("tym")\ 19 | .move_to(u"tyonv")\ 20 | .move_to(u"statfin_pxt_tym_tyonv_001.px") 21 | data = scraper.fetch() 22 | self.assertTrue(len(data)) 23 | -------------------------------------------------------------------------------- /tests/scrapertests/test_smhi_scraper.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | from unittest import TestCase 3 | import pandas as pd 4 | from statscraper.scrapers.SMHIScraper import SMHI, Collection, API, SMHIDataset, Station 5 | 6 | 7 | class TestSMHI(TestCase): 8 | 9 | def setUp(self): 10 | """Setting up scraper.""" 11 | 12 | def test_fetch_api(self): 13 | scraper = SMHI() 14 | apis = scraper.items 15 | self.assertTrue(len(apis) > 0) 16 | api = apis[0] 17 | self.assertIsInstance(api, API) 18 | 19 | self.assertFalse(api.label is None) 20 | self.assertFalse(api.url is None) 21 | self.assertTrue(isinstance(api.json, dict)) 22 | 23 | 24 | 25 | def test_fetch_dataset(self): 26 | u"""Moving to an “API”.""" 27 | scraper = SMHI() 28 | api = scraper.get("Meteorological Observations") 29 | for dataset in api: 30 | self.assertFalse(dataset.label is None) 31 | self.assertFalse(dataset.url is None) 32 | # Make sure it's a dataset 33 | self.assertTrue(isinstance(dataset, SMHIDataset)) 34 | # Get dimensions 35 | self.assertGreater(len(dataset.dimensions), 0) 36 | 37 | def test_fetch_allowed_values(self): 38 | scraper = SMHI() 39 | api = scraper.get("Meteorological Observations") 40 | dataset = api.items[0] 41 | stations = dataset.dimensions["station"].allowed_values 42 |
active_stations = list(dataset.dimensions["station"].active_stations()) 43 | self.assertTrue(len(stations) > 0) 44 | self.assertTrue(len(active_stations) > 0) 45 | 46 | station = dataset.dimensions["station"].allowed_values.get_by_label(u"Växjö A") 47 | self.assertTrue(isinstance(station, Station)) 48 | 49 | self.assertFalse(station.label is None) 50 | 51 | periods = dataset.dimensions["period"].allowed_values 52 | self.assertEqual(len(periods), 4) 53 | 54 | 55 | def test_query(self): 56 | scraper = SMHI() 57 | api = scraper.get("Meteorological Observations") 58 | dataset = api.items[0] 59 | data = dataset.fetch({"station": u"Växjö A", "period": "latest-months"}) 60 | self.assertTrue(len(data) > 0) 61 | 62 | 63 | def test_get_stations_list(self): 64 | scraper = SMHI() 65 | api = scraper.get("Meteorological Observations") 66 | dataset = api.items[0] 67 | stations = dataset.get_stations_list() 68 | self.assertTrue(len(stations) > 0) 69 | for station in stations: 70 | self.assertTrue("longitude" in station) 71 | 72 | active_stations = dataset.get_active_stations_list() 73 | 74 | self.assertTrue(len(active_stations) > 0) 75 | self.assertTrue(len(stations) > len(active_stations)) 76 | 77 | def test_iterate_queries(self): 78 | # Make the same query to multiple datasets 79 | scraper = SMHI() 80 | api = scraper.get("Meteorological Observations") 81 | datasets = [ 82 | u"Nederbördsmängd, summa, 1 gång per månad", 83 | u"Lufttemperatur, medel, 1 gång per månad", 84 | ] 85 | dfs = [] 86 | for dataset_name in datasets: 87 | query = { 88 | "period": ["corrected-archive"], 89 | "station": "Abisko" 90 | } 91 | 92 | res = api.get(dataset_name).fetch(query) 93 | dfs.append(res.pandas) 94 | 95 | # Merge the two resultsets into one dataframe 96 | df = pd.concat(dfs) 97 | 98 | # Make sure that both parameters (datasets) are in 99 | # the final dataframe 100 | parameters = df["parameter"].unique() 101 | self.assertTrue(len(parameters) == 2) 102 | for parameter in parameters: 103 | self.assertTrue(parameter in datasets) 104 | -------------------------------------------------------------------------------- /tests/scrapertests/test_vantetider_scraper.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | from unittest import TestCase 3 | 4 | from statscraper.scrapers.VantetiderScraper import VantetiderScraper 5 | 6 | 7 | class TestVantetider(TestCase): 8 | 9 | def setUp(self): 10 | self.scraper = VantetiderScraper() 11 | 12 | def test_has_items(self): 13 | """The scraper should find items.""" 14 | self.assertTrue(len(self.scraper.items)) 15 | 16 | 17 | def test_fetch_dataset(self): 18 | u"""Fetch a dataset by id.""" 19 | dataset = self.scraper.get("PrimarvardBesok") 20 | 21 | #self.assertTrue(isinstance(dataset, Dataset)) 22 | self.assertEqual(len(self.scraper.items), 9) 23 | 24 | 25 | def test_fetch_dimensions(self): 26 | u"""Each dataset should have dimensions.""" 27 | for dataset in self.scraper.items: 28 | self.assertGreater(len(dataset.dimensions), 0) 29 | 30 | def test_basic_query(self): 31 | dataset = self.scraper.get("PrimarvardTelefon") 32 | res = dataset.fetch({"region": ["Blekinge"]}) 33 | df = res.pandas 34 | self.assertGreater(df.shape[0], 0) 35 | 36 | def test_multi_period_query(self): 37 | dataset = self.scraper.get("PrimarvardBesok") 38 | res = dataset.fetch({ 39 | "region": ["Stockholm"], 40 | "year": ["2017", "2016"] 41 | }) 42 | df = res.pandas 43 | self.assertGreater(df.shape[0], 0) 44 | --------------------------------------------------------------------------------
/tests/scrapertests/test_vehicle_scraper.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | from unittest import TestCase 4 | from statscraper.scrapers import Vehicles 5 | 6 | 7 | class TestVehicles(TestCase): 8 | 9 | def setUp(self): 10 | self.scraper = Vehicles() 11 | 12 | def test_has_items(self): 13 | self.assertTrue(len(self.scraper.items)) 14 | 15 | def test_has_datasets(self): 16 | datasets = self.scraper.items 17 | self.assertTrue(len(datasets)) 18 | 19 | def test_can_fetch(self): 20 | dataset = self.scraper.items[0] 21 | data = dataset.fetch(query={'years': [2017], 'months': [0, 1]}) 22 | self.assertTrue(len(data)) -------------------------------------------------------------------------------- /tests/test-datatypes.py: -------------------------------------------------------------------------------- 1 | """Test datatypes.""" 2 | from statscraper.datatypes import Datatype 3 | from statscraper import Dimension, DimensionValue 4 | 5 | 6 | def test_allowed_values(): 7 | """Datatypes should have allowed values.""" 8 | dt = Datatype("region") 9 | assert("Ale kommun" in dt.allowed_values) 10 | 11 | 12 | def test_b(): 13 | """Dimension values should be translatable.""" 14 | d = Dimension("municipality", datatype="region", domain="sweden/municipalities") 15 | dv = DimensionValue("Ale kommun", d) 16 | assert(dv.translate("numerical") == "1440") 17 | -------------------------------------------------------------------------------- /tests/test-scb.py: -------------------------------------------------------------------------------- 1 | """Test SCB/PXWeb scraper.""" 2 | from statscraper.scrapers import SCB 3 | from statscraper.exceptions import InvalidData 4 | import pytest 5 | 6 | 7 | def test_get_data(): 8 | """We should be able to access a dataset by path.""" 9 | scraper = SCB() 10 | scraper.move_to("HE").move_to("HE0110").move_to("HE0110F").move_to("Tab1DispInkN") 11 | data = scraper.fetch({ 12 | "ContentsCode": ("item", "000002VY"), 13 | "InkomstTyp": ("item", "FastInkIn"), 14 | }, by="municipality") 15 | 16 | assert "Region" in data.dataset.dimensions 17 | assert "InkomstTyp" in data.dataset.dimensions 18 | 19 | df = data.pandas 20 | assert "value" in df.columns 21 | assert "Region" in df.columns 22 | assert "InkomstTyp" in df.columns 23 | 24 | 25 | def test_values(): 26 | """Make sure values are numerical.""" 27 | scraper = SCB() 28 | scraper.move_to("HE").move_to("HE0110").move_to("HE0110F").move_to("Tab1DispInkN") 29 | data = scraper.fetch({ 30 | "ContentsCode": ("item", "000002VY"), 31 | "InkomstTyp": ("item", "FastInkIn"), 32 | }, by="municipality") 33 | assert data[0].value.isnumeric() 34 | 35 | 36 | def test_invalid_query(): 37 | """We should raise an error on invalid queries.""" 38 | scraper = SCB() 39 | scraper.move_to("HE").move_to("HE0110").move_to("HE0110F").move_to("Tab1DispInkN") 40 | with pytest.raises(InvalidData): 41 | scraper.fetch({ 42 | "foo": ("bar", "buzz"), 43 | }, by="municipality") 44 | -------------------------------------------------------------------------------- /tests/test_base_scraper.py: -------------------------------------------------------------------------------- 1 | """Tests for scraper base class.""" 2 | from unittest import TestCase 3 | from statscraper import (BaseScraper, Dataset, Dimension, Result, 4 | DimensionValue, Collection, ROOT, NoSuchItem) 5 | 6 | 7 | class Scraper(BaseScraper): 8 | """A scraper with hardcoded yields.""" 9 | 10 | def _fetch_itemslist(self, item): 11 | yield
Dataset("Dataset_1") 12 | yield Dataset("Dataset_2") 13 | yield Dataset("Dataset_3") 14 | 15 | def _fetch_dimensions(self, dataset): 16 | yield Dimension("date") 17 | 18 | # Assign a label to one of the allowed values 19 | mun = Dimension("municipality", allowed_values=[ 20 | "Umeå kommun", 21 | "Robertsfors kommun"]) 22 | mun.allowed_values["Robertsfors kommun"].label = "Robertsfors kommun" 23 | yield mun 24 | 25 | yield Dimension("gender") 26 | 27 | def _fetch_allowed_values(self, dimension): 28 | if dimension.id == "gender": 29 | yield DimensionValue("male", dimension, label="Men") 30 | yield DimensionValue("female", dimension, label="Women") 31 | 32 | def _fetch_data(self, dataset, query=None): 33 | if dataset.id == "Dataset_1": 34 | yield Result(127, { 35 | "date": "2017-08-10", 36 | "municipality": "Robertsfors kommun", 37 | }) 38 | elif dataset.id == "Dataset_2": 39 | yield Result(12, { 40 | "date": "2017-02-06", 41 | "municipality": "Umeå kommun", 42 | }) 43 | yield Result(130, { 44 | "date": "2017-02-07", 45 | "municipality": "Robertsfors kommun", 46 | }) 47 | 48 | 49 | class NestedScraper(Scraper): 50 | """A scraper with hardcoded yields. 51 | 52 | ROOT - Collection_1 - Dataset_1 53 | - Collection_2 - [Dataset_2, Dataset_3] 54 | """ 55 | 56 | def _fetch_itemslist(self, item): 57 | if item.id == ROOT: 58 | yield Collection("Collection_1") 59 | yield Collection("Collection_2") 60 | elif item.id == "Collection_1": 61 | yield Dataset("Dataset_1") 62 | elif item.id == "Collection_2": 63 | yield Dataset("Dataset_2") 64 | yield Dataset("Dataset_3") 65 | else: 66 | raise Exception("This can not possibly happen.") 67 | 68 | 69 | class CallbackScraper(Scraper): 70 | """A scraper with callbacks.""" 71 | 72 | @BaseScraper.on("init") 73 | def initiation_code(self): 74 | self.initiated = True 75 | 76 | def _fetch_itemslist(self, item): 77 | yield Dataset("Dataset_1") 78 | yield Dataset("Dataset_2") 79 | 80 | 81 | class TestBaseScraper(TestCase): 82 | """Testing base functionality.""" 83 | 84 | def test_init(self): 85 | """Extending the basescraper.""" 86 | scraper = Scraper() 87 | self.assertTrue(scraper.current_item.id == ROOT) 88 | 89 | def test_inspect_item(self): 90 | """Fetching items from an itemlist.""" 91 | scraper = Scraper() 92 | self.assertTrue(scraper.items[0] == scraper.items["Dataset_1"]) 93 | 94 | def test_move_to_item(self): 95 | """Moving the cursor up and down the tree.""" 96 | scraper = Scraper() 97 | scraper.move_to("Dataset_1") 98 | self.assertTrue(isinstance(scraper.current_item, Dataset)) 99 | self.assertTrue(scraper.current_item.id == "Dataset_1") 100 | 101 | scraper.move_up() 102 | scraper.move_to(1) 103 | self.assertTrue(isinstance(scraper.current_item, Dataset)) 104 | self.assertTrue(scraper.current_item.id == "Dataset_2") 105 | 106 | scraper.move_up() 107 | scraper.move_to(scraper.items[2]) 108 | self.assertTrue(isinstance(scraper.current_item, Dataset)) 109 | self.assertTrue(scraper.current_item.id == "Dataset_3") 110 | 111 | def test_chained_move_to(self): 112 | """Use chaining to move.""" 113 | scraper = Scraper() 114 | scraper.move_to("Dataset_1").move_up().move_to("Dataset_2") 115 | self.assertTrue(scraper.current_item.id == "Dataset_2") 116 | 117 | def test_stop_at_root(self): 118 | """Trying to move up from the root should do nothing.""" 119 | scraper = Scraper() 120 | scraper.move_up().move_up().move_up().move_up() 121 | self.assertTrue(scraper.current_item.is_root) 122 | 123 | def test_itemslist_contains(self): 124 | """Make sure 'in' keyword works with 
ItemList.""" 125 | scraper = Scraper() 126 | self.assertTrue("Dataset_1" in scraper.items) 127 | self.assertTrue(scraper.items[0] in scraper.items) 128 | 129 | def test_select_missing_item(self): 130 | """Select an Item by ID that doesn't exist.""" 131 | scraper = Scraper() 132 | with self.assertRaises(NoSuchItem): 133 | scraper.move_to("non_existing_item") 134 | 135 | def test_item_knows_parent(self): 136 | """Make sure an item knows who its parent is.""" 137 | scraper = Scraper() 138 | parent = scraper.current_item 139 | dataset = scraper["Dataset_1"] 140 | scraper.move_to("Dataset_1") 141 | self.assertTrue(scraper.parent.id == dataset.parent.id == 142 | scraper.current_item.parent.id == parent.id) 143 | 144 | def test_fetch_dataset(self): 145 | """Query a dataset for some data.""" 146 | scraper = Scraper() 147 | dataset = scraper[0] 148 | self.assertEqual(dataset.data[0]["municipality"], "Robertsfors kommun") 149 | 150 | def test_unselected_visible_dataset(self): 151 | """Query a dataset not selected, but visible.""" 152 | scraper = Scraper() 153 | dataset = scraper["Dataset_1"] 154 | scraper.move_to("Dataset_2") 155 | self.assertEqual(dataset.data[0]["municipality"], "Robertsfors kommun") 156 | 157 | def test_cached_data(self): 158 | """Query a dataset not selected but cached.""" 159 | scraper = Scraper() 160 | data_1 = scraper["Dataset_1"].data 161 | scraper.move_up().move_to("Dataset_2") 162 | self.assertEqual(data_1[0]["municipality"], "Robertsfors kommun") 163 | 164 | def test_get_dimension(self): 165 | """Get dimensions for a dataset.""" 166 | scraper = Scraper() 167 | dataset = scraper[0] 168 | self.assertTrue(len(dataset.dimensions)) 169 | self.assertTrue(isinstance(dataset.dimensions[0], Dimension)) 170 | 171 | dim = dataset.dimensions["municipality"] 172 | self.assertTrue(isinstance(dim, Dimension)) 173 | 174 | dim = dataset.dimensions.get("municipality") 175 | self.assertTrue(isinstance(dim, Dimension)) 176 | 177 | def test_select_allowed_values(self): 178 | """List allowed values from dimension.""" 179 | scraper = Scraper() 180 | dataset = scraper[0] 181 | 182 | municipality = dataset.dimensions["municipality"] 183 | self.assertTrue("Robertsfors kommun" in municipality.allowed_values) 184 | 185 | allowed_value = municipality.allowed_values["Robertsfors kommun"] 186 | self.assertEqual(allowed_value, "Robertsfors kommun") 187 | 188 | # We also want to be able to fetch allowed values by label 189 | allowed_value_by_label = municipality.allowed_values.get_by_label("Robertsfors kommun") 190 | self.assertEqual(allowed_value, allowed_value_by_label) 191 | 192 | gender = dataset.dimensions["gender"] 193 | self.assertEqual(len(gender.allowed_values), 2) 194 | 195 | # Get an allowed value by key 196 | female = gender.allowed_values["female"] 197 | 198 | # Get an allowed value by label 199 | female_by_label = gender.allowed_values.get_by_label("Women") 200 | 201 | # The two methods above should fetch the same item 202 | self.assertEqual(female, female_by_label) 203 | self.assertEqual(female.id, "gender") 204 | self.assertEqual(female.value, "female") 205 | self.assertEqual(female.label, "Women") 206 | 207 | def test_move_deep_manually(self): 208 | """Use the NestedScraper to move more than one step.""" 209 | scraper = NestedScraper() 210 | scraper.move_to("Collection_1") 211 | self.assertTrue("Dataset_1" in scraper.items) 212 | 213 | scraper.move_to("Dataset_1") 214 | self.assertEqual("Dataset_1", scraper.current_item) 215 | self.assertTrue(len(scraper.current_item.data)) 216 | 217 | 
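# (move_to_top() takes the cursor all the way back to ROOT before descending again)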
scraper.move_to_top().move_to("Collection_2") 218 | self.assertTrue("Dataset_2" in scraper.items) 219 | self.assertTrue("Dataset_3" in scraper.items) 220 | 221 | scraper.move_up().move_to("Collection_1") 222 | self.assertTrue("Dataset_1" in scraper.items) 223 | 224 | def test_move_deep_automatically(self): 225 | """Use the NestedScraper to move more than one step, 226 | and make sure the cursor follows along as needed.""" 227 | scraper = NestedScraper() 228 | 229 | collection_2 = scraper.items["Collection_2"] 230 | self.assertTrue(len(collection_2.items)) 231 | 232 | scraper.move_to_top() 233 | dataset_1 = scraper["Collection_1"]["Dataset_1"] 234 | self.assertTrue(len(dataset_1.data)) 235 | 236 | dataset_2 = collection_2["Dataset_2"] 237 | self.assertTrue(len(dataset_2.data)) 238 | 239 | self.assertTrue(len(dataset_1.data)) 240 | 241 | def test_callbacks(self): 242 | """Extending the basescraper.""" 243 | scraper = CallbackScraper() 244 | self.assertTrue(scraper.initiated) 245 | -------------------------------------------------------------------------------- /tests/test_dialects.py: -------------------------------------------------------------------------------- 1 | """Tests related to the concept of certain datatypes having values with dialects.""" 2 | from unittest import TestCase 3 | from statscraper import (BaseScraper, Dataset, Result, Dimension, DimensionValue) 4 | 5 | 6 | class Scraper(BaseScraper): 7 | """A scraper with hardcoded yields.""" 8 | 9 | def _fetch_itemslist(self, item): 10 | yield Dataset("Dataset_1") 11 | 12 | def _fetch_dimensions(self, dataset): 13 | yield Dimension(u"municipality", datatype="region") 14 | 15 | def _fetch_data(self, dataset, query=None): 16 | yield Result(127, { 17 | "municipality": "Robertsfors kommun", 18 | }) 19 | yield Result(17, { 20 | "municipality": "Region Gotland", 21 | }) 22 | 23 | 24 | class TestDialects(TestCase): 25 | """Test translated values.""" 26 | 27 | def test_translations(self): 28 | """Test standalone translation.""" 29 | municipalities = Dimension("municipality", 30 | datatype="region", domain="sweden/municipalities") 31 | municipality = DimensionValue("Stockholms kommun", municipalities) 32 | assert municipality.translate("numerical") == "180" 33 | 34 | def test_dialects(self): 35 | """Test translation inside a scraper.""" 36 | scraper = Scraper() 37 | data1 = scraper.items[0].data 38 | self.assertEqual(str(data1[0]["municipality"]), "Robertsfors kommun") 39 | 40 | data2 = data1.translate("scb") 41 | self.assertEqual(str(data2[0]["municipality"]), "2409 Robertsfors kommun") 42 | -------------------------------------------------------------------------------- /tests/test_resultset.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from statscraper import Result, ResultSet 4 | from pandas.api import types as ptypes 5 | 6 | 7 | class TestResultSet(TestCase): 8 | 9 | def test_pandas_export(self): 10 | """Get results as pandas dataframe.""" 11 | result = ResultSet() 12 | result.append(Result(45483, {'city': "Voi"})) 13 | df = result.pandas 14 | self.assertTrue(ptypes.is_numeric_dtype(df.value)) 15 | -------------------------------------------------------------------------------- /version.py: -------------------------------------------------------------------------------- 1 | from datetime import date 2 | 3 | name = "statscraper" 4 | 5 | short_version = "2.0.2" 6 | long_version = short_version 7 | 8 | short_desc = """\ 9 | A base class for building web 
scrapers for statistical data.\ 10 | """ 11 | authors = u"Jens Finnäs and Leo Wallentin, J++; Robin Linderborg" 12 | year = date.today().year 13 | copyright = "%s, %s" % (year, authors) 14 | email = "stockholm@jplusplus.org" 15 | 16 | version = long_version 17 | --------------------------------------------------------------------------------